From e6918187568dbd01842d8d1d2c808ce16a894239 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 21 Apr 2024 13:54:28 +0200 Subject: Adding upstream version 18.2.2. Signed-off-by: Daniel Baumann --- src/rocksdb/utilities/agg_merge/agg_merge.cc | 238 + src/rocksdb/utilities/agg_merge/agg_merge.h | 49 + src/rocksdb/utilities/agg_merge/agg_merge_test.cc | 135 + src/rocksdb/utilities/agg_merge/test_agg_merge.cc | 104 + src/rocksdb/utilities/agg_merge/test_agg_merge.h | 47 + src/rocksdb/utilities/backup/backup_engine.cc | 3181 ++++++++++ src/rocksdb/utilities/backup/backup_engine_impl.h | 36 + src/rocksdb/utilities/backup/backup_engine_test.cc | 4219 +++++++++++++ .../utilities/blob_db/blob_compaction_filter.cc | 490 ++ .../utilities/blob_db/blob_compaction_filter.h | 204 + src/rocksdb/utilities/blob_db/blob_db.cc | 114 + src/rocksdb/utilities/blob_db/blob_db.h | 266 + src/rocksdb/utilities/blob_db/blob_db_gc_stats.h | 56 + src/rocksdb/utilities/blob_db/blob_db_impl.cc | 2177 +++++++ src/rocksdb/utilities/blob_db/blob_db_impl.h | 503 ++ .../utilities/blob_db/blob_db_impl_filesnapshot.cc | 113 + src/rocksdb/utilities/blob_db/blob_db_iterator.h | 150 + src/rocksdb/utilities/blob_db/blob_db_listener.h | 71 + src/rocksdb/utilities/blob_db/blob_db_test.cc | 2407 +++++++ src/rocksdb/utilities/blob_db/blob_dump_tool.cc | 282 + src/rocksdb/utilities/blob_db/blob_dump_tool.h | 58 + src/rocksdb/utilities/blob_db/blob_file.cc | 318 + src/rocksdb/utilities/blob_db/blob_file.h | 246 + src/rocksdb/utilities/cache_dump_load.cc | 69 + src/rocksdb/utilities/cache_dump_load_impl.cc | 393 ++ src/rocksdb/utilities/cache_dump_load_impl.h | 359 ++ .../cassandra/cassandra_compaction_filter.cc | 110 + .../cassandra/cassandra_compaction_filter.h | 57 + .../utilities/cassandra/cassandra_format_test.cc | 377 ++ .../cassandra/cassandra_functional_test.cc | 446 ++ .../utilities/cassandra/cassandra_options.h | 43 + .../cassandra/cassandra_row_merge_test.cc | 98 + .../cassandra/cassandra_serialize_test.cc | 164 + src/rocksdb/utilities/cassandra/format.cc | 367 ++ src/rocksdb/utilities/cassandra/format.h | 183 + src/rocksdb/utilities/cassandra/merge_operator.cc | 82 + src/rocksdb/utilities/cassandra/merge_operator.h | 44 + src/rocksdb/utilities/cassandra/serialize.h | 81 + src/rocksdb/utilities/cassandra/test_utils.cc | 69 + src/rocksdb/utilities/cassandra/test_utils.h | 42 + .../utilities/checkpoint/checkpoint_impl.cc | 469 ++ src/rocksdb/utilities/checkpoint/checkpoint_impl.h | 66 + .../utilities/checkpoint/checkpoint_test.cc | 974 +++ src/rocksdb/utilities/compaction_filters.cc | 56 + .../layered_compaction_filter_base.h | 41 + .../remove_emptyvalue_compactionfilter.cc | 26 + .../remove_emptyvalue_compactionfilter.h | 28 + .../utilities/convenience/info_log_finder.cc | 26 + src/rocksdb/utilities/counted_fs.cc | 379 ++ src/rocksdb/utilities/counted_fs.h | 158 + src/rocksdb/utilities/debug.cc | 120 + src/rocksdb/utilities/env_mirror.cc | 275 + src/rocksdb/utilities/env_mirror_test.cc | 226 + src/rocksdb/utilities/env_timed.cc | 187 + src/rocksdb/utilities/env_timed.h | 97 + src/rocksdb/utilities/env_timed_test.cc | 44 + src/rocksdb/utilities/fault_injection_env.cc | 555 ++ src/rocksdb/utilities/fault_injection_env.h | 258 + src/rocksdb/utilities/fault_injection_fs.cc | 1032 +++ src/rocksdb/utilities/fault_injection_fs.h | 584 ++ .../utilities/fault_injection_secondary_cache.cc | 131 + .../utilities/fault_injection_secondary_cache.h | 108 + .../utilities/leveldb_options/leveldb_options.cc | 57 + 
src/rocksdb/utilities/memory/memory_test.cc | 279 + src/rocksdb/utilities/memory/memory_util.cc | 52 + src/rocksdb/utilities/memory_allocators.h | 104 + src/rocksdb/utilities/merge_operators.cc | 120 + src/rocksdb/utilities/merge_operators.h | 36 + src/rocksdb/utilities/merge_operators/bytesxor.cc | 57 + src/rocksdb/utilities/merge_operators/bytesxor.h | 40 + src/rocksdb/utilities/merge_operators/max.cc | 80 + src/rocksdb/utilities/merge_operators/put.cc | 92 + src/rocksdb/utilities/merge_operators/sortlist.cc | 95 + src/rocksdb/utilities/merge_operators/sortlist.h | 42 + .../merge_operators/string_append/stringappend.cc | 78 + .../merge_operators/string_append/stringappend.h | 32 + .../merge_operators/string_append/stringappend2.cc | 132 + .../merge_operators/string_append/stringappend2.h | 52 + .../string_append/stringappend_test.cc | 640 ++ src/rocksdb/utilities/merge_operators/uint64add.cc | 75 + src/rocksdb/utilities/object_registry.cc | 383 ++ src/rocksdb/utilities/object_registry_test.cc | 872 +++ .../option_change_migration.cc | 186 + .../option_change_migration_test.cc | 550 ++ src/rocksdb/utilities/options/options_util.cc | 159 + src/rocksdb/utilities/options/options_util_test.cc | 779 +++ .../utilities/persistent_cache/block_cache_tier.cc | 422 ++ .../utilities/persistent_cache/block_cache_tier.h | 156 + .../persistent_cache/block_cache_tier_file.cc | 610 ++ .../persistent_cache/block_cache_tier_file.h | 293 + .../block_cache_tier_file_buffer.h | 127 + .../persistent_cache/block_cache_tier_metadata.cc | 86 + .../persistent_cache/block_cache_tier_metadata.h | 124 + .../utilities/persistent_cache/hash_table.h | 239 + .../utilities/persistent_cache/hash_table_bench.cc | 310 + .../persistent_cache/hash_table_evictable.h | 168 + .../utilities/persistent_cache/hash_table_test.cc | 163 + src/rocksdb/utilities/persistent_cache/lrulist.h | 174 + .../persistent_cache/persistent_cache_bench.cc | 359 ++ .../persistent_cache/persistent_cache_test.cc | 462 ++ .../persistent_cache/persistent_cache_test.h | 286 + .../persistent_cache/persistent_cache_tier.cc | 167 + .../persistent_cache/persistent_cache_tier.h | 342 + .../persistent_cache/persistent_cache_util.h | 67 + .../persistent_cache/volatile_tier_impl.cc | 140 + .../persistent_cache/volatile_tier_impl.h | 141 + .../utilities/simulator_cache/cache_simulator.cc | 288 + .../utilities/simulator_cache/cache_simulator.h | 231 + .../simulator_cache/cache_simulator_test.cc | 497 ++ src/rocksdb/utilities/simulator_cache/sim_cache.cc | 364 ++ .../utilities/simulator_cache/sim_cache_test.cc | 226 + .../compact_on_deletion_collector.cc | 227 + .../compact_on_deletion_collector.h | 70 + .../compact_on_deletion_collector_test.cc | 245 + .../utilities/trace/file_trace_reader_writer.cc | 133 + .../utilities/trace/file_trace_reader_writer.h | 48 + src/rocksdb/utilities/trace/replayer_impl.cc | 316 + src/rocksdb/utilities/trace/replayer_impl.h | 86 + .../utilities/transactions/lock/lock_manager.cc | 29 + .../utilities/transactions/lock/lock_manager.h | 82 + .../utilities/transactions/lock/lock_tracker.h | 209 + .../transactions/lock/point/point_lock_manager.cc | 721 +++ .../transactions/lock/point/point_lock_manager.h | 224 + .../lock/point/point_lock_manager_test.cc | 181 + .../lock/point/point_lock_manager_test.h | 324 + .../transactions/lock/point/point_lock_tracker.cc | 257 + .../transactions/lock/point/point_lock_tracker.h | 99 + .../transactions/lock/range/range_lock_manager.h | 36 + .../transactions/lock/range/range_locking_test.cc | 459 ++ 
.../lock/range/range_tree/lib/COPYING.AGPLv3 | 661 ++ .../lock/range/range_tree/lib/COPYING.APACHEv2 | 174 + .../lock/range/range_tree/lib/COPYING.GPLv2 | 339 + .../transactions/lock/range/range_tree/lib/README | 13 + .../transactions/lock/range/range_tree/lib/db.h | 76 + .../lock/range/range_tree/lib/ft/comparator.h | 138 + .../lock/range/range_tree/lib/ft/ft-status.h | 102 + .../range_tree/lib/locktree/concurrent_tree.cc | 139 + .../range_tree/lib/locktree/concurrent_tree.h | 174 + .../lock/range/range_tree/lib/locktree/keyrange.cc | 222 + .../lock/range/range_tree/lib/locktree/keyrange.h | 141 + .../range/range_tree/lib/locktree/lock_request.cc | 527 ++ .../range/range_tree/lib/locktree/lock_request.h | 255 + .../lock/range/range_tree/lib/locktree/locktree.cc | 1023 +++ .../lock/range/range_tree/lib/locktree/locktree.h | 580 ++ .../lock/range/range_tree/lib/locktree/manager.cc | 527 ++ .../range/range_tree/lib/locktree/range_buffer.cc | 265 + .../range/range_tree/lib/locktree/range_buffer.h | 178 + .../lock/range/range_tree/lib/locktree/treenode.cc | 520 ++ .../lock/range/range_tree/lib/locktree/treenode.h | 302 + .../range/range_tree/lib/locktree/txnid_set.cc | 120 + .../lock/range/range_tree/lib/locktree/txnid_set.h | 92 + .../lock/range/range_tree/lib/locktree/wfg.cc | 213 + .../lock/range/range_tree/lib/locktree/wfg.h | 124 + .../lock/range/range_tree/lib/portability/memory.h | 215 + .../range_tree/lib/portability/toku_assert_subst.h | 39 + .../range/range_tree/lib/portability/toku_atomic.h | 130 + .../lib/portability/toku_external_pthread.h | 83 + .../lib/portability/toku_instrumentation.h | 286 + .../range_tree/lib/portability/toku_portability.h | 87 + .../range_tree/lib/portability/toku_pthread.h | 520 ++ .../range_tree/lib/portability/toku_race_tools.h | 179 + .../range/range_tree/lib/portability/toku_time.h | 193 + .../range/range_tree/lib/portability/txn_subst.h | 27 + .../lock/range/range_tree/lib/standalone_port.cc | 132 + .../lock/range/range_tree/lib/util/dbt.cc | 153 + .../lock/range/range_tree/lib/util/dbt.h | 98 + .../range/range_tree/lib/util/growable_array.h | 144 + .../lock/range/range_tree/lib/util/memarena.cc | 201 + .../lock/range/range_tree/lib/util/memarena.h | 141 + .../lock/range/range_tree/lib/util/omt.h | 794 +++ .../lock/range/range_tree/lib/util/omt_impl.h | 1295 ++++ .../range_tree/lib/util/partitioned_counter.h | 165 + .../lock/range/range_tree/lib/util/status.h | 76 + .../range/range_tree/range_tree_lock_manager.cc | 503 ++ .../range/range_tree/range_tree_lock_manager.h | 137 + .../range/range_tree/range_tree_lock_tracker.cc | 156 + .../range/range_tree/range_tree_lock_tracker.h | 146 + .../transactions/optimistic_transaction.cc | 196 + .../transactions/optimistic_transaction.h | 101 + .../transactions/optimistic_transaction_db_impl.cc | 111 + .../transactions/optimistic_transaction_db_impl.h | 88 + .../transactions/optimistic_transaction_test.cc | 1491 +++++ .../transactions/pessimistic_transaction.cc | 1175 ++++ .../transactions/pessimistic_transaction.h | 313 + .../transactions/pessimistic_transaction_db.cc | 782 +++ .../transactions/pessimistic_transaction_db.h | 318 + .../utilities/transactions/snapshot_checker.cc | 53 + .../transactions/timestamped_snapshot_test.cc | 466 ++ .../utilities/transactions/transaction_base.cc | 731 +++ .../utilities/transactions/transaction_base.h | 384 ++ .../transactions/transaction_db_mutex_impl.cc | 135 + .../transactions/transaction_db_mutex_impl.h | 26 + .../utilities/transactions/transaction_test.cc | 6550 
++++++++++++++++++++ .../utilities/transactions/transaction_test.h | 578 ++ .../utilities/transactions/transaction_util.cc | 206 + .../utilities/transactions/transaction_util.h | 85 + .../write_committed_transaction_ts_test.cc | 588 ++ .../write_prepared_transaction_test.cc | 4078 ++++++++++++ .../utilities/transactions/write_prepared_txn.cc | 512 ++ .../utilities/transactions/write_prepared_txn.h | 119 + .../transactions/write_prepared_txn_db.cc | 1030 +++ .../utilities/transactions/write_prepared_txn_db.h | 1125 ++++ .../write_unprepared_transaction_test.cc | 790 +++ .../utilities/transactions/write_unprepared_txn.cc | 1053 ++++ .../utilities/transactions/write_unprepared_txn.h | 341 + .../transactions/write_unprepared_txn_db.cc | 473 ++ .../transactions/write_unprepared_txn_db.h | 108 + src/rocksdb/utilities/ttl/db_ttl_impl.cc | 609 ++ src/rocksdb/utilities/ttl/db_ttl_impl.h | 245 + src/rocksdb/utilities/ttl/ttl_test.cc | 912 +++ src/rocksdb/utilities/util_merge_operators_test.cc | 100 + src/rocksdb/utilities/wal_filter.cc | 23 + .../write_batch_with_index.cc | 695 +++ .../write_batch_with_index_internal.cc | 735 +++ .../write_batch_with_index_internal.h | 344 + .../write_batch_with_index_test.cc | 2419 ++++++++ 216 files changed, 82361 insertions(+) create mode 100644 src/rocksdb/utilities/agg_merge/agg_merge.cc create mode 100644 src/rocksdb/utilities/agg_merge/agg_merge.h create mode 100644 src/rocksdb/utilities/agg_merge/agg_merge_test.cc create mode 100644 src/rocksdb/utilities/agg_merge/test_agg_merge.cc create mode 100644 src/rocksdb/utilities/agg_merge/test_agg_merge.h create mode 100644 src/rocksdb/utilities/backup/backup_engine.cc create mode 100644 src/rocksdb/utilities/backup/backup_engine_impl.h create mode 100644 src/rocksdb/utilities/backup/backup_engine_test.cc create mode 100644 src/rocksdb/utilities/blob_db/blob_compaction_filter.cc create mode 100644 src/rocksdb/utilities/blob_db/blob_compaction_filter.h create mode 100644 src/rocksdb/utilities/blob_db/blob_db.cc create mode 100644 src/rocksdb/utilities/blob_db/blob_db.h create mode 100644 src/rocksdb/utilities/blob_db/blob_db_gc_stats.h create mode 100644 src/rocksdb/utilities/blob_db/blob_db_impl.cc create mode 100644 src/rocksdb/utilities/blob_db/blob_db_impl.h create mode 100644 src/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc create mode 100644 src/rocksdb/utilities/blob_db/blob_db_iterator.h create mode 100644 src/rocksdb/utilities/blob_db/blob_db_listener.h create mode 100644 src/rocksdb/utilities/blob_db/blob_db_test.cc create mode 100644 src/rocksdb/utilities/blob_db/blob_dump_tool.cc create mode 100644 src/rocksdb/utilities/blob_db/blob_dump_tool.h create mode 100644 src/rocksdb/utilities/blob_db/blob_file.cc create mode 100644 src/rocksdb/utilities/blob_db/blob_file.h create mode 100644 src/rocksdb/utilities/cache_dump_load.cc create mode 100644 src/rocksdb/utilities/cache_dump_load_impl.cc create mode 100644 src/rocksdb/utilities/cache_dump_load_impl.h create mode 100644 src/rocksdb/utilities/cassandra/cassandra_compaction_filter.cc create mode 100644 src/rocksdb/utilities/cassandra/cassandra_compaction_filter.h create mode 100644 src/rocksdb/utilities/cassandra/cassandra_format_test.cc create mode 100644 src/rocksdb/utilities/cassandra/cassandra_functional_test.cc create mode 100644 src/rocksdb/utilities/cassandra/cassandra_options.h create mode 100644 src/rocksdb/utilities/cassandra/cassandra_row_merge_test.cc create mode 100644 src/rocksdb/utilities/cassandra/cassandra_serialize_test.cc create 
mode 100644 src/rocksdb/utilities/cassandra/format.cc create mode 100644 src/rocksdb/utilities/cassandra/format.h create mode 100644 src/rocksdb/utilities/cassandra/merge_operator.cc create mode 100644 src/rocksdb/utilities/cassandra/merge_operator.h create mode 100644 src/rocksdb/utilities/cassandra/serialize.h create mode 100644 src/rocksdb/utilities/cassandra/test_utils.cc create mode 100644 src/rocksdb/utilities/cassandra/test_utils.h create mode 100644 src/rocksdb/utilities/checkpoint/checkpoint_impl.cc create mode 100644 src/rocksdb/utilities/checkpoint/checkpoint_impl.h create mode 100644 src/rocksdb/utilities/checkpoint/checkpoint_test.cc create mode 100644 src/rocksdb/utilities/compaction_filters.cc create mode 100644 src/rocksdb/utilities/compaction_filters/layered_compaction_filter_base.h create mode 100644 src/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc create mode 100644 src/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h create mode 100644 src/rocksdb/utilities/convenience/info_log_finder.cc create mode 100644 src/rocksdb/utilities/counted_fs.cc create mode 100644 src/rocksdb/utilities/counted_fs.h create mode 100644 src/rocksdb/utilities/debug.cc create mode 100644 src/rocksdb/utilities/env_mirror.cc create mode 100644 src/rocksdb/utilities/env_mirror_test.cc create mode 100644 src/rocksdb/utilities/env_timed.cc create mode 100644 src/rocksdb/utilities/env_timed.h create mode 100644 src/rocksdb/utilities/env_timed_test.cc create mode 100644 src/rocksdb/utilities/fault_injection_env.cc create mode 100644 src/rocksdb/utilities/fault_injection_env.h create mode 100644 src/rocksdb/utilities/fault_injection_fs.cc create mode 100644 src/rocksdb/utilities/fault_injection_fs.h create mode 100644 src/rocksdb/utilities/fault_injection_secondary_cache.cc create mode 100644 src/rocksdb/utilities/fault_injection_secondary_cache.h create mode 100644 src/rocksdb/utilities/leveldb_options/leveldb_options.cc create mode 100644 src/rocksdb/utilities/memory/memory_test.cc create mode 100644 src/rocksdb/utilities/memory/memory_util.cc create mode 100644 src/rocksdb/utilities/memory_allocators.h create mode 100644 src/rocksdb/utilities/merge_operators.cc create mode 100644 src/rocksdb/utilities/merge_operators.h create mode 100644 src/rocksdb/utilities/merge_operators/bytesxor.cc create mode 100644 src/rocksdb/utilities/merge_operators/bytesxor.h create mode 100644 src/rocksdb/utilities/merge_operators/max.cc create mode 100644 src/rocksdb/utilities/merge_operators/put.cc create mode 100644 src/rocksdb/utilities/merge_operators/sortlist.cc create mode 100644 src/rocksdb/utilities/merge_operators/sortlist.h create mode 100644 src/rocksdb/utilities/merge_operators/string_append/stringappend.cc create mode 100644 src/rocksdb/utilities/merge_operators/string_append/stringappend.h create mode 100644 src/rocksdb/utilities/merge_operators/string_append/stringappend2.cc create mode 100644 src/rocksdb/utilities/merge_operators/string_append/stringappend2.h create mode 100644 src/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc create mode 100644 src/rocksdb/utilities/merge_operators/uint64add.cc create mode 100644 src/rocksdb/utilities/object_registry.cc create mode 100644 src/rocksdb/utilities/object_registry_test.cc create mode 100644 src/rocksdb/utilities/option_change_migration/option_change_migration.cc create mode 100644 src/rocksdb/utilities/option_change_migration/option_change_migration_test.cc create mode 100644 
src/rocksdb/utilities/options/options_util.cc create mode 100644 src/rocksdb/utilities/options/options_util_test.cc create mode 100644 src/rocksdb/utilities/persistent_cache/block_cache_tier.cc create mode 100644 src/rocksdb/utilities/persistent_cache/block_cache_tier.h create mode 100644 src/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc create mode 100644 src/rocksdb/utilities/persistent_cache/block_cache_tier_file.h create mode 100644 src/rocksdb/utilities/persistent_cache/block_cache_tier_file_buffer.h create mode 100644 src/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.cc create mode 100644 src/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.h create mode 100644 src/rocksdb/utilities/persistent_cache/hash_table.h create mode 100644 src/rocksdb/utilities/persistent_cache/hash_table_bench.cc create mode 100644 src/rocksdb/utilities/persistent_cache/hash_table_evictable.h create mode 100644 src/rocksdb/utilities/persistent_cache/hash_table_test.cc create mode 100644 src/rocksdb/utilities/persistent_cache/lrulist.h create mode 100644 src/rocksdb/utilities/persistent_cache/persistent_cache_bench.cc create mode 100644 src/rocksdb/utilities/persistent_cache/persistent_cache_test.cc create mode 100644 src/rocksdb/utilities/persistent_cache/persistent_cache_test.h create mode 100644 src/rocksdb/utilities/persistent_cache/persistent_cache_tier.cc create mode 100644 src/rocksdb/utilities/persistent_cache/persistent_cache_tier.h create mode 100644 src/rocksdb/utilities/persistent_cache/persistent_cache_util.h create mode 100644 src/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc create mode 100644 src/rocksdb/utilities/persistent_cache/volatile_tier_impl.h create mode 100644 src/rocksdb/utilities/simulator_cache/cache_simulator.cc create mode 100644 src/rocksdb/utilities/simulator_cache/cache_simulator.h create mode 100644 src/rocksdb/utilities/simulator_cache/cache_simulator_test.cc create mode 100644 src/rocksdb/utilities/simulator_cache/sim_cache.cc create mode 100644 src/rocksdb/utilities/simulator_cache/sim_cache_test.cc create mode 100644 src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc create mode 100644 src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.h create mode 100644 src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc create mode 100644 src/rocksdb/utilities/trace/file_trace_reader_writer.cc create mode 100644 src/rocksdb/utilities/trace/file_trace_reader_writer.h create mode 100644 src/rocksdb/utilities/trace/replayer_impl.cc create mode 100644 src/rocksdb/utilities/trace/replayer_impl.h create mode 100644 src/rocksdb/utilities/transactions/lock/lock_manager.cc create mode 100644 src/rocksdb/utilities/transactions/lock/lock_manager.h create mode 100644 src/rocksdb/utilities/transactions/lock/lock_tracker.h create mode 100644 src/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc create mode 100644 src/rocksdb/utilities/transactions/lock/point/point_lock_manager.h create mode 100644 src/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc create mode 100644 src/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h create mode 100644 src/rocksdb/utilities/transactions/lock/point/point_lock_tracker.cc create mode 100644 src/rocksdb/utilities/transactions/lock/point/point_lock_tracker.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_lock_manager.h create mode 100644 
src/rocksdb/utilities/transactions/lock/range/range_locking_test.cc create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.AGPLv3 create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.APACHEv2 create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.GPLv2 create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/README create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/db.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/comparator.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/ft-status.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/memory.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_assert_subst.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_atomic.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_external_pthread.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_instrumentation.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_portability.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_pthread.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_race_tools.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/txn_subst.h create mode 100644 
src/rocksdb/utilities/transactions/lock/range/range_tree/lib/standalone_port.cc create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/growable_array.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.cc create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt_impl.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/partitioned_counter.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/status.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h create mode 100644 src/rocksdb/utilities/transactions/optimistic_transaction.cc create mode 100644 src/rocksdb/utilities/transactions/optimistic_transaction.h create mode 100644 src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc create mode 100644 src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h create mode 100644 src/rocksdb/utilities/transactions/optimistic_transaction_test.cc create mode 100644 src/rocksdb/utilities/transactions/pessimistic_transaction.cc create mode 100644 src/rocksdb/utilities/transactions/pessimistic_transaction.h create mode 100644 src/rocksdb/utilities/transactions/pessimistic_transaction_db.cc create mode 100644 src/rocksdb/utilities/transactions/pessimistic_transaction_db.h create mode 100644 src/rocksdb/utilities/transactions/snapshot_checker.cc create mode 100644 src/rocksdb/utilities/transactions/timestamped_snapshot_test.cc create mode 100644 src/rocksdb/utilities/transactions/transaction_base.cc create mode 100644 src/rocksdb/utilities/transactions/transaction_base.h create mode 100644 src/rocksdb/utilities/transactions/transaction_db_mutex_impl.cc create mode 100644 src/rocksdb/utilities/transactions/transaction_db_mutex_impl.h create mode 100644 src/rocksdb/utilities/transactions/transaction_test.cc create mode 100644 src/rocksdb/utilities/transactions/transaction_test.h create mode 100644 src/rocksdb/utilities/transactions/transaction_util.cc create mode 100644 src/rocksdb/utilities/transactions/transaction_util.h create mode 100644 src/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc create mode 100644 src/rocksdb/utilities/transactions/write_prepared_transaction_test.cc create mode 100644 src/rocksdb/utilities/transactions/write_prepared_txn.cc create mode 100644 src/rocksdb/utilities/transactions/write_prepared_txn.h create mode 100644 src/rocksdb/utilities/transactions/write_prepared_txn_db.cc create mode 100644 src/rocksdb/utilities/transactions/write_prepared_txn_db.h create mode 100644 src/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc create mode 100644 src/rocksdb/utilities/transactions/write_unprepared_txn.cc create mode 100644 
src/rocksdb/utilities/transactions/write_unprepared_txn.h create mode 100644 src/rocksdb/utilities/transactions/write_unprepared_txn_db.cc create mode 100644 src/rocksdb/utilities/transactions/write_unprepared_txn_db.h create mode 100644 src/rocksdb/utilities/ttl/db_ttl_impl.cc create mode 100644 src/rocksdb/utilities/ttl/db_ttl_impl.h create mode 100644 src/rocksdb/utilities/ttl/ttl_test.cc create mode 100644 src/rocksdb/utilities/util_merge_operators_test.cc create mode 100644 src/rocksdb/utilities/wal_filter.cc create mode 100644 src/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc create mode 100644 src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc create mode 100644 src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h create mode 100644 src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc (limited to 'src/rocksdb/utilities') diff --git a/src/rocksdb/utilities/agg_merge/agg_merge.cc b/src/rocksdb/utilities/agg_merge/agg_merge.cc new file mode 100644 index 000000000..a7eab1f12 --- /dev/null +++ b/src/rocksdb/utilities/agg_merge/agg_merge.cc @@ -0,0 +1,238 @@ +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "utilities/agg_merge/agg_merge.h" + +#include + +#include +#include +#include +#include +#include + +#include "port/lang.h" +#include "port/likely.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" +#include "rocksdb/utilities/agg_merge.h" +#include "rocksdb/utilities/options_type.h" +#include "util/coding.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { +static std::unordered_map> func_map; +const std::string kUnnamedFuncName = ""; +const std::string kErrorFuncName = "kErrorFuncName"; + +Status AddAggregator(const std::string& function_name, + std::unique_ptr&& agg) { + if (function_name == kErrorFuncName) { + return Status::InvalidArgument( + "Cannot register function name kErrorFuncName"); + } + func_map.emplace(function_name, std::move(agg)); + return Status::OK(); +} + +AggMergeOperator::AggMergeOperator() {} + +std::string EncodeAggFuncAndPayloadNoCheck(const Slice& function_name, + const Slice& value) { + std::string result; + PutLengthPrefixedSlice(&result, function_name); + result += value.ToString(); + return result; +} + +Status EncodeAggFuncAndPayload(const Slice& function_name, const Slice& payload, + std::string& output) { + if (function_name == kErrorFuncName) { + return Status::InvalidArgument("Cannot use error function name"); + } + if (function_name != kUnnamedFuncName && + func_map.find(function_name.ToString()) == func_map.end()) { + return Status::InvalidArgument("Function name not registered"); + } + output = EncodeAggFuncAndPayloadNoCheck(function_name, payload); + return Status::OK(); +} + +bool ExtractAggFuncAndValue(const Slice& op, Slice& func, Slice& value) { + value = op; + return GetLengthPrefixedSlice(&value, &func); +} + +bool ExtractList(const Slice& encoded_list, std::vector& decoded_list) { + decoded_list.clear(); + Slice list_slice = encoded_list; + Slice item; + while (GetLengthPrefixedSlice(&list_slice, &item)) { + decoded_list.push_back(item); + } + return list_slice.empty(); +} + +class AggMergeOperator::Accumulator { + public: + bool Add(const Slice& op, bool is_partial_aggregation) { + 
+    if (ignore_operands_) {
+      return true;
+    }
+    Slice my_func;
+    Slice my_value;
+    bool ret = ExtractAggFuncAndValue(op, my_func, my_value);
+    if (!ret) {
+      ignore_operands_ = true;
+      return true;
+    }
+
+    // Determine whether we need to do partial merge.
+    if (is_partial_aggregation && !my_func.empty()) {
+      auto f = func_map.find(my_func.ToString());
+      if (f == func_map.end() || !f->second->DoPartialAggregate()) {
+        return false;
+      }
+    }
+
+    if (!func_valid_) {
+      if (my_func != kUnnamedFuncName) {
+        func_ = my_func;
+        func_valid_ = true;
+      }
+    } else if (func_ != my_func) {
+      // User switched aggregation function. Need to aggregate the older
+      // one first.
+
+      // Previous aggregation can't be done in partial merge
+      if (is_partial_aggregation) {
+        func_valid_ = false;
+        ignore_operands_ = true;
+        return false;
+      }
+
+      // We could consider stashing an iterator into the hash of aggregators
+      // to avoid repeated lookups when the aggregator doesn't change.
+      auto f = func_map.find(func_.ToString());
+      if (f == func_map.end() || !f->second->Aggregate(values_, scratch_)) {
+        func_valid_ = false;
+        ignore_operands_ = true;
+        return true;
+      }
+      std::swap(scratch_, aggregated_);
+      values_.clear();
+      values_.push_back(aggregated_);
+      func_ = my_func;
+    }
+    values_.push_back(my_value);
+    return true;
+  }
+
+  // Return false if aggregation fails.
+  // One possible reason is that the aggregation function fails or is not
+  // registered.
+  bool GetResult(std::string& result) {
+    if (!func_valid_) {
+      return false;
+    }
+    auto f = func_map.find(func_.ToString());
+    if (f == func_map.end()) {
+      return false;
+    }
+    if (!f->second->Aggregate(values_, scratch_)) {
+      return false;
+    }
+    result = EncodeAggFuncAndPayloadNoCheck(func_, scratch_);
+    return true;
+  }
+
+  void Clear() {
+    func_.clear();
+    values_.clear();
+    aggregated_.clear();
+    scratch_.clear();
+    ignore_operands_ = false;
+    func_valid_ = false;
+  }
+
+ private:
+  Slice func_;
+  std::vector values_;
+  std::string aggregated_;
+  std::string scratch_;
+  bool ignore_operands_ = false;
+  bool func_valid_ = false;
+};
+
+// Creating and using a new Accumulator might invoke multiple malloc and is
+// expensive if it needs to be done when processing each merge operation.
+// AggMergeOperator's merge operators can be invoked concurrently by multiple
+// threads so we cannot simply create one Aggregator and reuse.
+// We use thread local instances instead.
+AggMergeOperator::Accumulator& AggMergeOperator::GetTLSAccumulator() {
+  static thread_local Accumulator tls_acc;
+  tls_acc.Clear();
+  return tls_acc;
+}
+
+void AggMergeOperator::PackAllMergeOperands(const MergeOperationInput& merge_in,
+                                            MergeOperationOutput& merge_out) {
+  merge_out.new_value = "";
+  PutLengthPrefixedSlice(&merge_out.new_value, kErrorFuncName);
+  if (merge_in.existing_value != nullptr) {
+    PutLengthPrefixedSlice(&merge_out.new_value, *merge_in.existing_value);
+  }
+  for (const Slice& op : merge_in.operand_list) {
+    PutLengthPrefixedSlice(&merge_out.new_value, op);
+  }
+}
+
+bool AggMergeOperator::FullMergeV2(const MergeOperationInput& merge_in,
+                                   MergeOperationOutput* merge_out) const {
+  Accumulator& agg = GetTLSAccumulator();
+  if (merge_in.existing_value != nullptr) {
+    agg.Add(*merge_in.existing_value, /*is_partial_aggregation=*/false);
+  }
+  for (const Slice& e : merge_in.operand_list) {
+    agg.Add(e, /*is_partial_aggregation=*/false);
+  }
+
+  bool succ = agg.GetResult(merge_out->new_value);
+  if (!succ) {
+    // If aggregation can't happen, pack all merge operands. In contrast to
+    // merge operator, we don't want to fail the DB. If users insert a wrong
+    // format or call an unregistered aggregation function, we still hope
+    // the DB can continue functioning with other keys.
+    PackAllMergeOperands(merge_in, *merge_out);
+  }
+  agg.Clear();
+  return true;
+}
+
+bool AggMergeOperator::PartialMergeMulti(const Slice& /*key*/,
+                                         const std::deque& operand_list,
+                                         std::string* new_value,
+                                         Logger* /*logger*/) const {
+  Accumulator& agg = GetTLSAccumulator();
+  bool do_aggregation = true;
+  for (const Slice& item : operand_list) {
+    do_aggregation = agg.Add(item, /*is_partial_aggregation=*/true);
+    if (!do_aggregation) {
+      break;
+    }
+  }
+  if (do_aggregation) {
+    do_aggregation = agg.GetResult(*new_value);
+  }
+  agg.Clear();
+  return do_aggregation;
+}
+
+std::shared_ptr GetAggMergeOperator() {
+  STATIC_AVOID_DESTRUCTION(std::shared_ptr, instance)
+  (std::make_shared());
+  assert(instance);
+  return instance;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/agg_merge/agg_merge.h b/src/rocksdb/utilities/agg_merge/agg_merge.h
new file mode 100644
index 000000000..00e58de08
--- /dev/null
+++ b/src/rocksdb/utilities/agg_merge/agg_merge.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include
+#include
+#include
+#include
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/agg_merge.h"
+#include "utilities/cassandra/cassandra_options.h"
+
+namespace ROCKSDB_NAMESPACE {
+class AggMergeOperator : public MergeOperator {
+ public:
+  explicit AggMergeOperator();
+
+  bool FullMergeV2(const MergeOperationInput& merge_in,
+                   MergeOperationOutput* merge_out) const override;
+
+  bool PartialMergeMulti(const Slice& key,
+                         const std::deque& operand_list,
+                         std::string* new_value, Logger* logger) const override;
+
+  const char* Name() const override { return kClassName(); }
+  static const char* kClassName() { return "AggMergeOperator.v1"; }
+
+  bool AllowSingleOperand() const override { return true; }
+
+  bool ShouldMerge(const std::vector&) const override { return false; }
+
+ private:
+  class Accumulator;
+
+  // Pack all merge operands into one value. This is called when aggregation
+  // fails. The existing values are preserved and returned so that users can
+  // debug the problem.
+  static void PackAllMergeOperands(const MergeOperationInput& merge_in,
+                                   MergeOperationOutput& merge_out);
+  static Accumulator& GetTLSAccumulator();
+};
+
+extern std::string EncodeAggFuncAndPayloadNoCheck(const Slice& function_name,
+                                                  const Slice& value);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/agg_merge/agg_merge_test.cc b/src/rocksdb/utilities/agg_merge/agg_merge_test.cc
new file mode 100644
index 000000000..a65441cd0
--- /dev/null
+++ b/src/rocksdb/utilities/agg_merge/agg_merge_test.cc
@@ -0,0 +1,135 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+ +#include "rocksdb/utilities/agg_merge.h" + +#include + +#include + +#include "db/db_test_util.h" +#include "rocksdb/options.h" +#include "test_util/testharness.h" +#include "utilities/agg_merge/agg_merge.h" +#include "utilities/agg_merge/test_agg_merge.h" + +namespace ROCKSDB_NAMESPACE { + +class AggMergeTest : public DBTestBase { + public: + AggMergeTest() : DBTestBase("agg_merge_db_test", /*env_do_fsync=*/true) {} +}; + +TEST_F(AggMergeTest, TestUsingMergeOperator) { + ASSERT_OK(AddAggregator("sum", std::make_unique())); + ASSERT_OK(AddAggregator("last3", std::make_unique())); + ASSERT_OK(AddAggregator("mul", std::make_unique())); + + Options options = CurrentOptions(); + options.merge_operator = GetAggMergeOperator(); + Reopen(options); + std::string v = EncodeHelper::EncodeFuncAndInt("sum", 10); + ASSERT_OK(Merge("foo", v)); + v = EncodeHelper::EncodeFuncAndInt("sum", 20); + ASSERT_OK(Merge("foo", v)); + v = EncodeHelper::EncodeFuncAndInt("sum", 15); + ASSERT_OK(Merge("foo", v)); + + v = EncodeHelper::EncodeFuncAndList("last3", {"a", "b"}); + ASSERT_OK(Merge("bar", v)); + v = EncodeHelper::EncodeFuncAndList("last3", {"c", "d", "e"}); + ASSERT_OK(Merge("bar", v)); + ASSERT_OK(Flush()); + v = EncodeHelper::EncodeFuncAndList("last3", {"f"}); + ASSERT_OK(Merge("bar", v)); + + // Test Put() without aggregation type. + v = EncodeHelper::EncodeFuncAndInt(kUnnamedFuncName, 30); + ASSERT_OK(Put("foo2", v)); + v = EncodeHelper::EncodeFuncAndInt("sum", 10); + ASSERT_OK(Merge("foo2", v)); + v = EncodeHelper::EncodeFuncAndInt("sum", 20); + ASSERT_OK(Merge("foo2", v)); + + EXPECT_EQ(EncodeHelper::EncodeFuncAndInt("sum", 45), Get("foo")); + EXPECT_EQ(EncodeHelper::EncodeFuncAndList("last3", {"f", "c", "d"}), + Get("bar")); + EXPECT_EQ(EncodeHelper::EncodeFuncAndInt("sum", 60), Get("foo2")); + + // Test changing aggregation type + v = EncodeHelper::EncodeFuncAndInt("mul", 10); + ASSERT_OK(Put("bar2", v)); + v = EncodeHelper::EncodeFuncAndInt("mul", 20); + ASSERT_OK(Merge("bar2", v)); + v = EncodeHelper::EncodeFuncAndInt("sum", 30); + ASSERT_OK(Merge("bar2", v)); + v = EncodeHelper::EncodeFuncAndInt("sum", 40); + ASSERT_OK(Merge("bar2", v)); + EXPECT_EQ(EncodeHelper::EncodeFuncAndInt("sum", 10 * 20 + 30 + 40), + Get("bar2")); + + // Changing aggregation type with partial merge + v = EncodeHelper::EncodeFuncAndInt("mul", 10); + ASSERT_OK(Merge("foo3", v)); + ASSERT_OK(Flush()); + v = EncodeHelper::EncodeFuncAndInt("mul", 10); + ASSERT_OK(Merge("foo3", v)); + v = EncodeHelper::EncodeFuncAndInt("mul", 10); + ASSERT_OK(Merge("foo3", v)); + v = EncodeHelper::EncodeFuncAndInt("sum", 10); + ASSERT_OK(Merge("foo3", v)); + ASSERT_OK(Flush()); + EXPECT_EQ(EncodeHelper::EncodeFuncAndInt("sum", 10 * 10 * 10 + 10), + Get("foo3")); + + // Merge after full merge + v = EncodeHelper::EncodeFuncAndInt("sum", 1); + ASSERT_OK(Merge("foo4", v)); + v = EncodeHelper::EncodeFuncAndInt("sum", 2); + ASSERT_OK(Merge("foo4", v)); + ASSERT_OK(Flush()); + v = EncodeHelper::EncodeFuncAndInt("sum", 3); + ASSERT_OK(Merge("foo4", v)); + v = EncodeHelper::EncodeFuncAndInt("sum", 4); + ASSERT_OK(Merge("foo4", v)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + v = EncodeHelper::EncodeFuncAndInt("sum", 5); + ASSERT_OK(Merge("foo4", v)); + EXPECT_EQ(EncodeHelper::EncodeFuncAndInt("sum", 15), Get("foo4")); + + // Test unregistered function name + v = EncodeAggFuncAndPayloadNoCheck("non_existing", "1"); + ASSERT_OK(Merge("bar3", v)); + std::string v1; + v1 = 
EncodeAggFuncAndPayloadNoCheck("non_existing", "invalid"); + ; + ASSERT_OK(Merge("bar3", v1)); + EXPECT_EQ(EncodeAggFuncAndPayloadNoCheck(kErrorFuncName, + EncodeHelper::EncodeList({v, v1})), + Get("bar3")); + + // invalidate input + ASSERT_OK(EncodeAggFuncAndPayload("sum", "invalid", v)); + ASSERT_OK(Merge("bar4", v)); + v1 = EncodeHelper::EncodeFuncAndInt("sum", 20); + ASSERT_OK(Merge("bar4", v1)); + std::string aggregated_value = Get("bar4"); + Slice func, payload; + ASSERT_TRUE(ExtractAggFuncAndValue(aggregated_value, func, payload)); + EXPECT_EQ(kErrorFuncName, func); + std::vector decoded_list; + ASSERT_TRUE(ExtractList(payload, decoded_list)); + ASSERT_EQ(2, decoded_list.size()); + ASSERT_EQ(v, decoded_list[0]); + ASSERT_EQ(v1, decoded_list[1]); +} +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/utilities/agg_merge/test_agg_merge.cc b/src/rocksdb/utilities/agg_merge/test_agg_merge.cc new file mode 100644 index 000000000..06e5b5697 --- /dev/null +++ b/src/rocksdb/utilities/agg_merge/test_agg_merge.cc @@ -0,0 +1,104 @@ +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "test_agg_merge.h" + +#include + +#include +#include + +#include "util/coding.h" +#include "utilities/agg_merge/agg_merge.h" + +namespace ROCKSDB_NAMESPACE { + +std::string EncodeHelper::EncodeFuncAndInt(const Slice& function_name, + int64_t value) { + std::string encoded_value; + PutVarsignedint64(&encoded_value, value); + std::string ret; + Status s = EncodeAggFuncAndPayload(function_name, encoded_value, ret); + assert(s.ok()); + return ret; +} + +std::string EncodeHelper::EncodeInt(int64_t value) { + std::string encoded_value; + PutVarsignedint64(&encoded_value, value); + return encoded_value; +} + +std::string EncodeHelper::EncodeFuncAndList(const Slice& function_name, + const std::vector& list) { + std::string ret; + Status s = EncodeAggFuncAndPayload(function_name, EncodeList(list), ret); + assert(s.ok()); + return ret; +} + +std::string EncodeHelper::EncodeList(const std::vector& list) { + std::string result; + for (const Slice& entity : list) { + PutLengthPrefixedSlice(&result, entity); + } + return result; +} + +bool SumAggregator::Aggregate(const std::vector& item_list, + std::string& result) const { + int64_t sum = 0; + for (const Slice& item : item_list) { + int64_t ivalue; + Slice v = item; + if (!GetVarsignedint64(&v, &ivalue) || !v.empty()) { + return false; + } + sum += ivalue; + } + result = EncodeHelper::EncodeInt(sum); + return true; +} + +bool MultipleAggregator::Aggregate(const std::vector& item_list, + std::string& result) const { + int64_t mresult = 1; + for (const Slice& item : item_list) { + int64_t ivalue; + Slice v = item; + if (!GetVarsignedint64(&v, &ivalue) || !v.empty()) { + return false; + } + mresult *= ivalue; + } + result = EncodeHelper::EncodeInt(mresult); + return true; +} + +bool Last3Aggregator::Aggregate(const std::vector& item_list, + std::string& result) const { + std::vector last3; + last3.reserve(3); + for (auto it = item_list.rbegin(); it != item_list.rend(); ++it) { + Slice input = *it; + Slice entity; + bool ret; + while ((ret = GetLengthPrefixedSlice(&input, &entity)) == true) { + 
last3.push_back(entity); + if (last3.size() >= 3) { + break; + } + } + if (last3.size() >= 3) { + break; + } + if (!ret) { + continue; + } + } + result = EncodeHelper::EncodeList(last3); + return true; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/agg_merge/test_agg_merge.h b/src/rocksdb/utilities/agg_merge/test_agg_merge.h new file mode 100644 index 000000000..5bdf8b9cc --- /dev/null +++ b/src/rocksdb/utilities/agg_merge/test_agg_merge.h @@ -0,0 +1,47 @@ +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#include +#include +#include +#include + +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" +#include "rocksdb/utilities/agg_merge.h" +#include "utilities/cassandra/cassandra_options.h" + +namespace ROCKSDB_NAMESPACE { +class SumAggregator : public Aggregator { + public: + ~SumAggregator() override {} + bool Aggregate(const std::vector&, std::string& result) const override; + bool DoPartialAggregate() const override { return true; } +}; + +class MultipleAggregator : public Aggregator { + public: + ~MultipleAggregator() override {} + bool Aggregate(const std::vector&, std::string& result) const override; + bool DoPartialAggregate() const override { return true; } +}; + +class Last3Aggregator : public Aggregator { + public: + ~Last3Aggregator() override {} + bool Aggregate(const std::vector&, std::string& result) const override; +}; + +class EncodeHelper { + public: + static std::string EncodeFuncAndInt(const Slice& function_name, + int64_t value); + static std::string EncodeInt(int64_t value); + static std::string EncodeList(const std::vector& list); + static std::string EncodeFuncAndList(const Slice& function_name, + const std::vector& list); +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/backup/backup_engine.cc b/src/rocksdb/utilities/backup/backup_engine.cc new file mode 100644 index 000000000..81b4a6629 --- /dev/null +++ b/src/rocksdb/utilities/backup/backup_engine.cc @@ -0,0 +1,3181 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "env/composite_env_wrapper.h" +#include "env/fs_readonly.h" +#include "env/fs_remap.h" +#include "file/filename.h" +#include "file/line_file_reader.h" +#include "file/sequence_file_reader.h" +#include "file/writable_file_writer.h" +#include "logging/logging.h" +#include "monitoring/iostats_context_imp.h" +#include "options/options_helper.h" +#include "port/port.h" +#include "rocksdb/advanced_options.h" +#include "rocksdb/env.h" +#include "rocksdb/rate_limiter.h" +#include "rocksdb/statistics.h" +#include "rocksdb/transaction_log.h" +#include "table/sst_file_dumper.h" +#include "test_util/sync_point.h" +#include "util/cast_util.h" +#include "util/channel.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/math.h" +#include "util/rate_limiter.h" +#include "util/string_util.h" +#include "utilities/backup/backup_engine_impl.h" +#include "utilities/checkpoint/checkpoint_impl.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { +using ShareFilesNaming = BackupEngineOptions::ShareFilesNaming; + +constexpr BackupID kLatestBackupIDMarker = static_cast(-2); + +inline uint32_t ChecksumHexToInt32(const std::string& checksum_hex) { + std::string checksum_str; + Slice(checksum_hex).DecodeHex(&checksum_str); + return EndianSwapValue(DecodeFixed32(checksum_str.c_str())); +} +inline std::string ChecksumStrToHex(const std::string& checksum_str) { + return Slice(checksum_str).ToString(true); +} +inline std::string ChecksumInt32ToHex(const uint32_t& checksum_value) { + std::string checksum_str; + PutFixed32(&checksum_str, EndianSwapValue(checksum_value)); + return ChecksumStrToHex(checksum_str); +} + +const std::string kPrivateDirName = "private"; +const std::string kMetaDirName = "meta"; +const std::string kSharedDirName = "shared"; +const std::string kSharedChecksumDirName = "shared_checksum"; +const std::string kPrivateDirSlash = kPrivateDirName + "/"; +const std::string kMetaDirSlash = kMetaDirName + "/"; +const std::string kSharedDirSlash = kSharedDirName + "/"; +const std::string kSharedChecksumDirSlash = kSharedChecksumDirName + "/"; + +} // namespace + +void BackupStatistics::IncrementNumberSuccessBackup() { + number_success_backup++; +} +void BackupStatistics::IncrementNumberFailBackup() { number_fail_backup++; } + +uint32_t BackupStatistics::GetNumberSuccessBackup() const { + return number_success_backup; +} +uint32_t BackupStatistics::GetNumberFailBackup() const { + return number_fail_backup; +} + +std::string BackupStatistics::ToString() const { + char result[50]; + snprintf(result, sizeof(result), "# success backup: %u, # fail backup: %u", + GetNumberSuccessBackup(), GetNumberFailBackup()); + return result; +} + +void BackupEngineOptions::Dump(Logger* logger) const { + ROCKS_LOG_INFO(logger, " Options.backup_dir: %s", + backup_dir.c_str()); + ROCKS_LOG_INFO(logger, " Options.backup_env: %p", backup_env); + ROCKS_LOG_INFO(logger, " Options.share_table_files: %d", + static_cast(share_table_files)); + ROCKS_LOG_INFO(logger, " Options.info_log: %p", info_log); + ROCKS_LOG_INFO(logger, " Options.sync: %d", + static_cast(sync)); + ROCKS_LOG_INFO(logger, " Options.destroy_old_data: %d", + static_cast(destroy_old_data)); + ROCKS_LOG_INFO(logger, " Options.backup_log_files: %d", + static_cast(backup_log_files)); + ROCKS_LOG_INFO(logger, " Options.backup_rate_limit: %" PRIu64, + backup_rate_limit); + 
ROCKS_LOG_INFO(logger, " Options.restore_rate_limit: %" PRIu64, + restore_rate_limit); + ROCKS_LOG_INFO(logger, "Options.max_background_operations: %d", + max_background_operations); +} + +namespace { +// -------- BackupEngineImpl class --------- +class BackupEngineImpl { + public: + BackupEngineImpl(const BackupEngineOptions& options, Env* db_env, + bool read_only = false); + ~BackupEngineImpl(); + + IOStatus CreateNewBackupWithMetadata(const CreateBackupOptions& options, + DB* db, const std::string& app_metadata, + BackupID* new_backup_id_ptr); + + IOStatus PurgeOldBackups(uint32_t num_backups_to_keep); + + IOStatus DeleteBackup(BackupID backup_id); + + void StopBackup() { stop_backup_.store(true, std::memory_order_release); } + + IOStatus GarbageCollect(); + + // The returned BackupInfos are in chronological order, which means the + // latest backup comes last. + void GetBackupInfo(std::vector* backup_info, + bool include_file_details) const; + + Status GetBackupInfo(BackupID backup_id, BackupInfo* backup_info, + bool include_file_details = false) const; + + void GetCorruptedBackups(std::vector* corrupt_backup_ids) const; + + IOStatus RestoreDBFromBackup(const RestoreOptions& options, + BackupID backup_id, const std::string& db_dir, + const std::string& wal_dir) const; + + IOStatus RestoreDBFromLatestBackup(const RestoreOptions& options, + const std::string& db_dir, + const std::string& wal_dir) const { + // Note: don't read latest_valid_backup_id_ outside of lock + return RestoreDBFromBackup(options, kLatestBackupIDMarker, db_dir, wal_dir); + } + + IOStatus VerifyBackup(BackupID backup_id, + bool verify_with_checksum = false) const; + + IOStatus Initialize(); + + ShareFilesNaming GetNamingNoFlags() const { + return options_.share_files_with_checksum_naming & + BackupEngineOptions::kMaskNoNamingFlags; + } + ShareFilesNaming GetNamingFlags() const { + return options_.share_files_with_checksum_naming & + BackupEngineOptions::kMaskNamingFlags; + } + + void TEST_SetDefaultRateLimitersClock( + const std::shared_ptr& backup_rate_limiter_clock, + const std::shared_ptr& restore_rate_limiter_clock) { + if (backup_rate_limiter_clock) { + static_cast(options_.backup_rate_limiter.get()) + ->TEST_SetClock(backup_rate_limiter_clock); + } + + if (restore_rate_limiter_clock) { + static_cast(options_.restore_rate_limiter.get()) + ->TEST_SetClock(restore_rate_limiter_clock); + } + } + + private: + void DeleteChildren(const std::string& dir, + uint32_t file_type_filter = 0) const; + IOStatus DeleteBackupNoGC(BackupID backup_id); + + // Extends the "result" map with pathname->size mappings for the contents of + // "dir" in "env". Pathnames are prefixed with "dir". + IOStatus ReadChildFileCurrentSizes( + const std::string& dir, const std::shared_ptr&, + std::unordered_map* result) const; + + struct FileInfo { + FileInfo(const std::string& fname, uint64_t sz, const std::string& checksum, + const std::string& id, const std::string& sid, Temperature _temp) + : refs(0), + filename(fname), + size(sz), + checksum_hex(checksum), + db_id(id), + db_session_id(sid), + temp(_temp) {} + + FileInfo(const FileInfo&) = delete; + FileInfo& operator=(const FileInfo&) = delete; + + int refs; + const std::string filename; + const uint64_t size; + // crc32c checksum as hex. 
empty == unknown / unavailable
+    std::string checksum_hex;
+    // DB identities
+    // db_id is obtained for potential usage in the future but not used
+    // currently
+    const std::string db_id;
+    // db_session_id appears in the backup SST filename if the table naming
+    // option is kUseDbSessionId
+    const std::string db_session_id;
+    Temperature temp;
+
+    std::string GetDbFileName() {
+      std::string rv;
+      // extract the filename part
+      size_t slash = filename.find_last_of('/');
+      // file will either be shared/, shared_checksum/,
+      // shared_checksum/, shared_checksum/,
+      // or private//
+      assert(slash != std::string::npos);
+      rv = filename.substr(slash + 1);
+
+      // if the file was in shared_checksum, extract the real file name
+      // in this case the file is __.,
+      // _., or __.
+      if (filename.substr(0, slash) == kSharedChecksumDirName) {
+        rv = GetFileFromChecksumFile(rv);
+      }
+      return rv;
+    }
+  };
+
+  // TODO: deprecate this function once we migrate all BackupEngine's rate
+  // limiting to lower-level ones (i.e., ones in file access wrapper level like
+  // `WritableFileWriter`)
+  static void LoopRateLimitRequestHelper(const size_t total_bytes_to_request,
+                                         RateLimiter* rate_limiter,
+                                         const Env::IOPriority pri,
+                                         Statistics* stats,
+                                         const RateLimiter::OpType op_type);
+
+  static inline std::string WithoutTrailingSlash(const std::string& path) {
+    if (path.empty() || path.back() != '/') {
+      return path;
+    } else {
+      return path.substr(0, path.size() - 1);
+    }
+  }
+
+  static inline std::string WithTrailingSlash(const std::string& path) {
+    if (path.empty() || path.back() != '/') {
+      return path + '/';
+    } else {
+      return path;
+    }
+  }
+
+  // A filesystem wrapper that makes shared backup files appear to be in the
+  // private backup directory (dst_dir), so that the private backup dir can
+  // be opened as a read-only DB.
+ class RemapSharedFileSystem : public RemapFileSystem { + public: + RemapSharedFileSystem(const std::shared_ptr& base, + const std::string& dst_dir, + const std::string& src_base_dir, + const std::vector>& files) + : RemapFileSystem(base), + dst_dir_(WithoutTrailingSlash(dst_dir)), + dst_dir_slash_(WithTrailingSlash(dst_dir)), + src_base_dir_(WithTrailingSlash(src_base_dir)) { + for (auto& info : files) { + if (!StartsWith(info->filename, kPrivateDirSlash)) { + assert(StartsWith(info->filename, kSharedDirSlash) || + StartsWith(info->filename, kSharedChecksumDirSlash)); + remaps_[info->GetDbFileName()] = info; + } + } + } + + const char* Name() const override { + return "BackupEngineImpl::RemapSharedFileSystem"; + } + + // Sometimes a directory listing is required in opening a DB + IOStatus GetChildren(const std::string& dir, const IOOptions& options, + std::vector* result, + IODebugContext* dbg) override { + IOStatus s = RemapFileSystem::GetChildren(dir, options, result, dbg); + if (s.ok() && (dir == dst_dir_ || dir == dst_dir_slash_)) { + // Assume remapped files exist + for (auto& r : remaps_) { + result->push_back(r.first); + } + } + return s; + } + + // Sometimes a directory listing is required in opening a DB + IOStatus GetChildrenFileAttributes(const std::string& dir, + const IOOptions& options, + std::vector* result, + IODebugContext* dbg) override { + IOStatus s = + RemapFileSystem::GetChildrenFileAttributes(dir, options, result, dbg); + if (s.ok() && (dir == dst_dir_ || dir == dst_dir_slash_)) { + // Assume remapped files exist with recorded size + for (auto& r : remaps_) { + result->emplace_back(); // clean up with C++20 + FileAttributes& attr = result->back(); + attr.name = r.first; + attr.size_bytes = r.second->size; + } + } + return s; + } + + protected: + // When a file in dst_dir is requested, see if we need to remap to shared + // file path. + std::pair EncodePath( + const std::string& path) override { + if (path.empty() || path[0] != '/') { + return {IOStatus::InvalidArgument(path, "Not an absolute path"), ""}; + } + std::pair rv{IOStatus(), path}; + if (StartsWith(path, dst_dir_slash_)) { + std::string relative = path.substr(dst_dir_slash_.size()); + auto it = remaps_.find(relative); + if (it != remaps_.end()) { + rv.second = src_base_dir_ + it->second->filename; + } + } + return rv; + } + + private: + // Absolute path to a directory that some extra files will be mapped into. + const std::string dst_dir_; + // Includes a trailing slash. + const std::string dst_dir_slash_; + // Absolute path to a directory containing some files to be mapped into + // dst_dir_. Includes a trailing slash. + const std::string src_base_dir_; + // If remaps_[x] exists, attempt to read dst_dir_ / x should instead read + // src_base_dir_ / remaps_[x]->filename. FileInfo is used to maximize + // sharing with other backup data in memory. 
+    std::unordered_map<std::string, std::shared_ptr<FileInfo>> remaps_;
+  };
+
+  class BackupMeta {
+   public:
+    BackupMeta(
+        const std::string& meta_filename, const std::string& meta_tmp_filename,
+        std::unordered_map<std::string, std::shared_ptr<FileInfo>>* file_infos,
+        Env* env, const std::shared_ptr<FileSystem>& fs)
+        : timestamp_(0),
+          sequence_number_(0),
+          size_(0),
+          meta_filename_(meta_filename),
+          meta_tmp_filename_(meta_tmp_filename),
+          file_infos_(file_infos),
+          env_(env),
+          fs_(fs) {}
+
+    BackupMeta(const BackupMeta&) = delete;
+    BackupMeta& operator=(const BackupMeta&) = delete;
+
+    ~BackupMeta() {}
+
+    void RecordTimestamp() {
+      // Best effort
+      Status s = env_->GetCurrentTime(&timestamp_);
+      if (!s.ok()) {
+        timestamp_ = /* something clearly fabricated */ 1;
+      }
+    }
+    int64_t GetTimestamp() const { return timestamp_; }
+    uint64_t GetSize() const { return size_; }
+    uint32_t GetNumberFiles() const {
+      return static_cast<uint32_t>(files_.size());
+    }
+    void SetSequenceNumber(uint64_t sequence_number) {
+      sequence_number_ = sequence_number;
+    }
+    uint64_t GetSequenceNumber() const { return sequence_number_; }
+
+    const std::string& GetAppMetadata() const { return app_metadata_; }
+
+    void SetAppMetadata(const std::string& app_metadata) {
+      app_metadata_ = app_metadata;
+    }
+
+    IOStatus AddFile(std::shared_ptr<FileInfo> file_info);
+
+    IOStatus Delete(bool delete_meta = true);
+
+    bool Empty() const { return files_.empty(); }
+
+    std::shared_ptr<FileInfo> GetFile(const std::string& filename) const {
+      auto it = file_infos_->find(filename);
+      if (it == file_infos_->end()) {
+        return nullptr;
+      }
+      return it->second;
+    }
+
+    const std::vector<std::shared_ptr<FileInfo>>& GetFiles() const {
+      return files_;
+    }
+
+    // @param abs_path_to_size Pre-fetched file sizes (bytes).
+    IOStatus LoadFromFile(
+        const std::string& backup_dir,
+        const std::unordered_map<std::string, uint64_t>& abs_path_to_size,
+        RateLimiter* rate_limiter, Logger* info_log,
+        std::unordered_set<std::string>* reported_ignored_fields);
+    IOStatus StoreToFile(
+        bool sync, int schema_version,
+        const TEST_BackupMetaSchemaOptions* schema_test_options);
+
+    std::string GetInfoString() {
+      std::ostringstream ss;
+      ss << "Timestamp: " << timestamp_ << std::endl;
+      char human_size[16];
+      AppendHumanBytes(size_, human_size, sizeof(human_size));
+      ss << "Size: " << human_size << std::endl;
+      ss << "Files:" << std::endl;
+      for (const auto& file : files_) {
+        AppendHumanBytes(file->size, human_size, sizeof(human_size));
+        ss << file->filename << ", size " << human_size << ", refs "
+           << file->refs << std::endl;
+      }
+      return ss.str();
+    }
+
+    const std::shared_ptr<Env>& GetEnvForOpen() const {
+      if (!env_for_open_) {
+        // Lazy initialize
+        // Find directories
+        std::string dst_dir = meta_filename_;
+        auto i = dst_dir.rfind(kMetaDirSlash);
+        assert(i != std::string::npos);
+        std::string src_base_dir = dst_dir.substr(0, i);
+        dst_dir.replace(i, kMetaDirSlash.size(), kPrivateDirSlash);
+        // Make the RemapSharedFileSystem
+        std::shared_ptr<FileSystem> remap_fs =
+            std::make_shared<RemapSharedFileSystem>(fs_, dst_dir, src_base_dir,
+                                                    files_);
+        // Make it read-only for safety
+        remap_fs = std::make_shared<ReadOnlyFileSystem>(remap_fs);
+        // Make an Env wrapper
+        env_for_open_ = std::make_shared<CompositeEnvWrapper>(env_, remap_fs);
+      }
+      return env_for_open_;
+    }
+
+   private:
+    int64_t timestamp_;
+    // sequence number is only approximate, should not be used
+    // by clients
+    uint64_t sequence_number_;
+    uint64_t size_;
+    std::string app_metadata_;
+    std::string const meta_filename_;
+    std::string const meta_tmp_filename_;
+    // files with relative paths (without "/" prefix!!)
+ std::vector> files_; + std::unordered_map>* file_infos_; + Env* env_; + mutable std::shared_ptr env_for_open_; + std::shared_ptr fs_; + IOOptions iooptions_ = IOOptions(); + }; // BackupMeta + + void SetBackupInfoFromBackupMeta(BackupID id, const BackupMeta& meta, + BackupInfo* backup_info, + bool include_file_details) const; + + inline std::string GetAbsolutePath( + const std::string& relative_path = "") const { + assert(relative_path.size() == 0 || relative_path[0] != '/'); + return options_.backup_dir + "/" + relative_path; + } + inline std::string GetPrivateFileRel(BackupID backup_id, bool tmp = false, + const std::string& file = "") const { + assert(file.size() == 0 || file[0] != '/'); + return kPrivateDirSlash + std::to_string(backup_id) + (tmp ? ".tmp" : "") + + "/" + file; + } + inline std::string GetSharedFileRel(const std::string& file = "", + bool tmp = false) const { + assert(file.size() == 0 || file[0] != '/'); + return kSharedDirSlash + std::string(tmp ? "." : "") + file + + (tmp ? ".tmp" : ""); + } + inline std::string GetSharedFileWithChecksumRel(const std::string& file = "", + bool tmp = false) const { + assert(file.size() == 0 || file[0] != '/'); + return kSharedChecksumDirSlash + std::string(tmp ? "." : "") + file + + (tmp ? ".tmp" : ""); + } + inline bool UseLegacyNaming(const std::string& sid) const { + return GetNamingNoFlags() == + BackupEngineOptions::kLegacyCrc32cAndFileSize || + sid.empty(); + } + inline std::string GetSharedFileWithChecksum( + const std::string& file, const std::string& checksum_hex, + const uint64_t file_size, const std::string& db_session_id) const { + assert(file.size() == 0 || file[0] != '/'); + std::string file_copy = file; + if (UseLegacyNaming(db_session_id)) { + assert(!checksum_hex.empty()); + file_copy.insert(file_copy.find_last_of('.'), + "_" + std::to_string(ChecksumHexToInt32(checksum_hex)) + + "_" + std::to_string(file_size)); + } else { + file_copy.insert(file_copy.find_last_of('.'), "_s" + db_session_id); + if (GetNamingFlags() & BackupEngineOptions::kFlagIncludeFileSize) { + file_copy.insert(file_copy.find_last_of('.'), + "_" + std::to_string(file_size)); + } + } + return file_copy; + } + static inline std::string GetFileFromChecksumFile(const std::string& file) { + assert(file.size() == 0 || file[0] != '/'); + std::string file_copy = file; + size_t first_underscore = file_copy.find_first_of('_'); + return file_copy.erase(first_underscore, + file_copy.find_last_of('.') - first_underscore); + } + inline std::string GetBackupMetaFile(BackupID backup_id, bool tmp) const { + return GetAbsolutePath(kMetaDirName) + "/" + (tmp ? "." : "") + + std::to_string(backup_id) + (tmp ? ".tmp" : ""); + } + + // If size_limit == 0, there is no size limit, copy everything. + // + // Exactly one of src and contents must be non-empty. + // + // @param src If non-empty, the file is copied from this pathname. + // @param contents If non-empty, the file will be created with these contents. 
+  // @param src_temperature Pass in expected temperature of src, return back
+  // temperature reported by FileSystem
+  IOStatus CopyOrCreateFile(const std::string& src, const std::string& dst,
+                            const std::string& contents, uint64_t size_limit,
+                            Env* src_env, Env* dst_env,
+                            const EnvOptions& src_env_options, bool sync,
+                            RateLimiter* rate_limiter,
+                            std::function<void()> progress_callback,
+                            Temperature* src_temperature,
+                            Temperature dst_temperature,
+                            uint64_t* bytes_toward_next_callback,
+                            uint64_t* size, std::string* checksum_hex);
+
+  IOStatus ReadFileAndComputeChecksum(const std::string& src,
+                                      const std::shared_ptr<FileSystem>& src_fs,
+                                      const EnvOptions& src_env_options,
+                                      uint64_t size_limit,
+                                      std::string* checksum_hex,
+                                      const Temperature src_temperature) const;
+
+  // Obtain db_id and db_session_id from the table properties of file_path
+  Status GetFileDbIdentities(Env* src_env, const EnvOptions& src_env_options,
+                             const std::string& file_path,
+                             Temperature file_temp, RateLimiter* rate_limiter,
+                             std::string* db_id, std::string* db_session_id);
+
+  struct CopyOrCreateResult {
+    ~CopyOrCreateResult() {
+      // The Status needs to be ignored here for two reasons.
+      // First, if the BackupEngineImpl shuts down with jobs outstanding, then
+      // it is possible that the Status in the future/promise is never read,
+      // resulting in an unchecked Status. Second, if there are items in the
+      // channel when the BackupEngineImpl is shut down, these will also have
+      // a Status that has not been checked.
+      // TODO: Fix those issues so that the Status is always checked.
+      io_status.PermitUncheckedError();
+    }
+    uint64_t size;
+    std::string checksum_hex;
+    std::string db_id;
+    std::string db_session_id;
+    IOStatus io_status;
+    Temperature expected_src_temperature = Temperature::kUnknown;
+    Temperature current_src_temperature = Temperature::kUnknown;
+  };
+
+  // Exactly one of src_path and contents must be non-empty. If src_path is
+  // non-empty, the file is copied from this pathname. Otherwise, if contents
+  // is non-empty, the file will be created at dst_path with these contents.
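+  // For example (illustrative): backing up an existing table file fills in
+  // src_path and leaves contents empty, while a small file whose bytes are
+  // produced by the checkpoint itself (such as CURRENT) is written by filling
+  // in contents and leaving src_path empty.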
+ struct CopyOrCreateWorkItem { + std::string src_path; + std::string dst_path; + Temperature src_temperature; + Temperature dst_temperature; + std::string contents; + Env* src_env; + Env* dst_env; + EnvOptions src_env_options; + bool sync; + RateLimiter* rate_limiter; + uint64_t size_limit; + Statistics* stats; + std::promise result; + std::function progress_callback; + std::string src_checksum_func_name; + std::string src_checksum_hex; + std::string db_id; + std::string db_session_id; + + CopyOrCreateWorkItem() + : src_path(""), + dst_path(""), + src_temperature(Temperature::kUnknown), + dst_temperature(Temperature::kUnknown), + contents(""), + src_env(nullptr), + dst_env(nullptr), + src_env_options(), + sync(false), + rate_limiter(nullptr), + size_limit(0), + stats(nullptr), + src_checksum_func_name(kUnknownFileChecksumFuncName), + src_checksum_hex(""), + db_id(""), + db_session_id("") {} + + CopyOrCreateWorkItem(const CopyOrCreateWorkItem&) = delete; + CopyOrCreateWorkItem& operator=(const CopyOrCreateWorkItem&) = delete; + + CopyOrCreateWorkItem(CopyOrCreateWorkItem&& o) noexcept { + *this = std::move(o); + } + + CopyOrCreateWorkItem& operator=(CopyOrCreateWorkItem&& o) noexcept { + src_path = std::move(o.src_path); + dst_path = std::move(o.dst_path); + src_temperature = std::move(o.src_temperature); + dst_temperature = std::move(o.dst_temperature); + contents = std::move(o.contents); + src_env = o.src_env; + dst_env = o.dst_env; + src_env_options = std::move(o.src_env_options); + sync = o.sync; + rate_limiter = o.rate_limiter; + size_limit = o.size_limit; + stats = o.stats; + result = std::move(o.result); + progress_callback = std::move(o.progress_callback); + src_checksum_func_name = std::move(o.src_checksum_func_name); + src_checksum_hex = std::move(o.src_checksum_hex); + db_id = std::move(o.db_id); + db_session_id = std::move(o.db_session_id); + src_temperature = o.src_temperature; + return *this; + } + + CopyOrCreateWorkItem( + std::string _src_path, std::string _dst_path, + const Temperature _src_temperature, const Temperature _dst_temperature, + std::string _contents, Env* _src_env, Env* _dst_env, + EnvOptions _src_env_options, bool _sync, RateLimiter* _rate_limiter, + uint64_t _size_limit, Statistics* _stats, + std::function _progress_callback = []() {}, + const std::string& _src_checksum_func_name = + kUnknownFileChecksumFuncName, + const std::string& _src_checksum_hex = "", + const std::string& _db_id = "", const std::string& _db_session_id = "") + : src_path(std::move(_src_path)), + dst_path(std::move(_dst_path)), + src_temperature(_src_temperature), + dst_temperature(_dst_temperature), + contents(std::move(_contents)), + src_env(_src_env), + dst_env(_dst_env), + src_env_options(std::move(_src_env_options)), + sync(_sync), + rate_limiter(_rate_limiter), + size_limit(_size_limit), + stats(_stats), + progress_callback(_progress_callback), + src_checksum_func_name(_src_checksum_func_name), + src_checksum_hex(_src_checksum_hex), + db_id(_db_id), + db_session_id(_db_session_id) {} + }; + + struct BackupAfterCopyOrCreateWorkItem { + std::future result; + bool shared; + bool needed_to_copy; + Env* backup_env; + std::string dst_path_tmp; + std::string dst_path; + std::string dst_relative; + BackupAfterCopyOrCreateWorkItem() + : shared(false), + needed_to_copy(false), + backup_env(nullptr), + dst_path_tmp(""), + dst_path(""), + dst_relative("") {} + + BackupAfterCopyOrCreateWorkItem( + BackupAfterCopyOrCreateWorkItem&& o) noexcept { + *this = std::move(o); + } + + 
BackupAfterCopyOrCreateWorkItem& operator=( + BackupAfterCopyOrCreateWorkItem&& o) noexcept { + result = std::move(o.result); + shared = o.shared; + needed_to_copy = o.needed_to_copy; + backup_env = o.backup_env; + dst_path_tmp = std::move(o.dst_path_tmp); + dst_path = std::move(o.dst_path); + dst_relative = std::move(o.dst_relative); + return *this; + } + + BackupAfterCopyOrCreateWorkItem(std::future&& _result, + bool _shared, bool _needed_to_copy, + Env* _backup_env, std::string _dst_path_tmp, + std::string _dst_path, + std::string _dst_relative) + : result(std::move(_result)), + shared(_shared), + needed_to_copy(_needed_to_copy), + backup_env(_backup_env), + dst_path_tmp(std::move(_dst_path_tmp)), + dst_path(std::move(_dst_path)), + dst_relative(std::move(_dst_relative)) {} + }; + + struct RestoreAfterCopyOrCreateWorkItem { + std::future result; + std::string from_file; + std::string to_file; + std::string checksum_hex; + RestoreAfterCopyOrCreateWorkItem() : checksum_hex("") {} + RestoreAfterCopyOrCreateWorkItem(std::future&& _result, + const std::string& _from_file, + const std::string& _to_file, + const std::string& _checksum_hex) + : result(std::move(_result)), + from_file(_from_file), + to_file(_to_file), + checksum_hex(_checksum_hex) {} + RestoreAfterCopyOrCreateWorkItem( + RestoreAfterCopyOrCreateWorkItem&& o) noexcept { + *this = std::move(o); + } + + RestoreAfterCopyOrCreateWorkItem& operator=( + RestoreAfterCopyOrCreateWorkItem&& o) noexcept { + result = std::move(o.result); + checksum_hex = std::move(o.checksum_hex); + return *this; + } + }; + + bool initialized_; + std::mutex byte_report_mutex_; + mutable channel files_to_copy_or_create_; + std::vector threads_; + std::atomic threads_cpu_priority_; + + // Certain operations like PurgeOldBackups and DeleteBackup will trigger + // automatic GarbageCollect (true) unless we've already done one in this + // session and have not failed to delete backup files since then (false). + bool might_need_garbage_collect_ = true; + + // Adds a file to the backup work queue to be copied or created if it doesn't + // already exist. + // + // Exactly one of src_dir and contents must be non-empty. + // + // @param src_dir If non-empty, the file in this directory named fname will be + // copied. + // @param fname Name of destination file and, in case of copy, source file. + // @param contents If non-empty, the file will be created with these contents. 
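+  // For example (illustrative): with the default sharing options a table file
+  // named /000123.sst is scheduled to land under shared_checksum/ with a
+  // session-id-based name, while non-shareable files such as the MANIFEST go
+  // under private/<backup_id>/.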
+ IOStatus AddBackupFileWorkItem( + std::unordered_set& live_dst_paths, + std::vector& backup_items_to_finish, + BackupID backup_id, bool shared, const std::string& src_dir, + const std::string& fname, // starts with "/" + const EnvOptions& src_env_options, RateLimiter* rate_limiter, + FileType file_type, uint64_t size_bytes, Statistics* stats, + uint64_t size_limit = 0, bool shared_checksum = false, + std::function progress_callback = []() {}, + const std::string& contents = std::string(), + const std::string& src_checksum_func_name = kUnknownFileChecksumFuncName, + const std::string& src_checksum_str = kUnknownFileChecksum, + const Temperature src_temperature = Temperature::kUnknown); + + // backup state data + BackupID latest_backup_id_; + BackupID latest_valid_backup_id_; + std::map> backups_; + std::map>> + corrupt_backups_; + std::unordered_map> + backuped_file_infos_; + std::atomic stop_backup_; + + // options data + BackupEngineOptions options_; + Env* db_env_; + Env* backup_env_; + + // directories + std::unique_ptr backup_directory_; + std::unique_ptr shared_directory_; + std::unique_ptr meta_directory_; + std::unique_ptr private_directory_; + + static const size_t kDefaultCopyFileBufferSize = 5 * 1024 * 1024LL; // 5MB + bool read_only_; + BackupStatistics backup_statistics_; + std::unordered_set reported_ignored_fields_; + static const size_t kMaxAppMetaSize = 1024 * 1024; // 1MB + std::shared_ptr db_fs_; + std::shared_ptr backup_fs_; + IOOptions io_options_ = IOOptions(); + + public: + std::unique_ptr schema_test_options_; +}; + +// -------- BackupEngineImplThreadSafe class --------- +// This locking layer for thread safety in the public API is layered on +// top to prevent accidental recursive locking with RWMutex, which is UB. +// Note: BackupEngineReadOnlyBase inherited twice, but has no fields +class BackupEngineImplThreadSafe : public BackupEngine, + public BackupEngineReadOnly { + public: + BackupEngineImplThreadSafe(const BackupEngineOptions& options, Env* db_env, + bool read_only = false) + : impl_(options, db_env, read_only) {} + ~BackupEngineImplThreadSafe() override {} + + using BackupEngine::CreateNewBackupWithMetadata; + IOStatus CreateNewBackupWithMetadata(const CreateBackupOptions& options, + DB* db, const std::string& app_metadata, + BackupID* new_backup_id) override { + WriteLock lock(&mutex_); + return impl_.CreateNewBackupWithMetadata(options, db, app_metadata, + new_backup_id); + } + + IOStatus PurgeOldBackups(uint32_t num_backups_to_keep) override { + WriteLock lock(&mutex_); + return impl_.PurgeOldBackups(num_backups_to_keep); + } + + IOStatus DeleteBackup(BackupID backup_id) override { + WriteLock lock(&mutex_); + return impl_.DeleteBackup(backup_id); + } + + void StopBackup() override { + // No locking needed + impl_.StopBackup(); + } + + IOStatus GarbageCollect() override { + WriteLock lock(&mutex_); + return impl_.GarbageCollect(); + } + + Status GetLatestBackupInfo(BackupInfo* backup_info, + bool include_file_details = false) const override { + ReadLock lock(&mutex_); + return impl_.GetBackupInfo(kLatestBackupIDMarker, backup_info, + include_file_details); + } + + Status GetBackupInfo(BackupID backup_id, BackupInfo* backup_info, + bool include_file_details = false) const override { + ReadLock lock(&mutex_); + return impl_.GetBackupInfo(backup_id, backup_info, include_file_details); + } + + void GetBackupInfo(std::vector* backup_info, + bool include_file_details) const override { + ReadLock lock(&mutex_); + impl_.GetBackupInfo(backup_info, 
include_file_details); + } + + void GetCorruptedBackups( + std::vector* corrupt_backup_ids) const override { + ReadLock lock(&mutex_); + impl_.GetCorruptedBackups(corrupt_backup_ids); + } + + using BackupEngine::RestoreDBFromBackup; + IOStatus RestoreDBFromBackup(const RestoreOptions& options, + BackupID backup_id, const std::string& db_dir, + const std::string& wal_dir) const override { + ReadLock lock(&mutex_); + return impl_.RestoreDBFromBackup(options, backup_id, db_dir, wal_dir); + } + + using BackupEngine::RestoreDBFromLatestBackup; + IOStatus RestoreDBFromLatestBackup( + const RestoreOptions& options, const std::string& db_dir, + const std::string& wal_dir) const override { + // Defer to above function, which locks + return RestoreDBFromBackup(options, kLatestBackupIDMarker, db_dir, wal_dir); + } + + IOStatus VerifyBackup(BackupID backup_id, + bool verify_with_checksum = false) const override { + ReadLock lock(&mutex_); + return impl_.VerifyBackup(backup_id, verify_with_checksum); + } + + // Not public API but needed + IOStatus Initialize() { + // No locking needed + return impl_.Initialize(); + } + + // Not public API but used in testing + void TEST_SetBackupMetaSchemaOptions( + const TEST_BackupMetaSchemaOptions& options) { + impl_.schema_test_options_.reset(new TEST_BackupMetaSchemaOptions(options)); + } + + // Not public API but used in testing + void TEST_SetDefaultRateLimitersClock( + const std::shared_ptr& backup_rate_limiter_clock = nullptr, + const std::shared_ptr& restore_rate_limiter_clock = + nullptr) { + impl_.TEST_SetDefaultRateLimitersClock(backup_rate_limiter_clock, + restore_rate_limiter_clock); + } + + private: + mutable port::RWMutex mutex_; + BackupEngineImpl impl_; +}; +} // namespace + +IOStatus BackupEngine::Open(const BackupEngineOptions& options, Env* env, + BackupEngine** backup_engine_ptr) { + std::unique_ptr backup_engine( + new BackupEngineImplThreadSafe(options, env)); + auto s = backup_engine->Initialize(); + if (!s.ok()) { + *backup_engine_ptr = nullptr; + return s; + } + *backup_engine_ptr = backup_engine.release(); + return IOStatus::OK(); +} + +namespace { +BackupEngineImpl::BackupEngineImpl(const BackupEngineOptions& options, + Env* db_env, bool read_only) + : initialized_(false), + threads_cpu_priority_(), + latest_backup_id_(0), + latest_valid_backup_id_(0), + stop_backup_(false), + options_(options), + db_env_(db_env), + backup_env_(options.backup_env != nullptr ? 
options.backup_env : db_env_), + read_only_(read_only) { + if (options_.backup_rate_limiter == nullptr && + options_.backup_rate_limit > 0) { + options_.backup_rate_limiter.reset( + NewGenericRateLimiter(options_.backup_rate_limit)); + } + if (options_.restore_rate_limiter == nullptr && + options_.restore_rate_limit > 0) { + options_.restore_rate_limiter.reset( + NewGenericRateLimiter(options_.restore_rate_limit)); + } + db_fs_ = db_env_->GetFileSystem(); + backup_fs_ = backup_env_->GetFileSystem(); +} + +BackupEngineImpl::~BackupEngineImpl() { + files_to_copy_or_create_.sendEof(); + for (auto& t : threads_) { + t.join(); + } + LogFlush(options_.info_log); + for (const auto& it : corrupt_backups_) { + it.second.first.PermitUncheckedError(); + } +} + +IOStatus BackupEngineImpl::Initialize() { + assert(!initialized_); + initialized_ = true; + if (read_only_) { + ROCKS_LOG_INFO(options_.info_log, "Starting read_only backup engine"); + } + options_.Dump(options_.info_log); + + auto meta_path = GetAbsolutePath(kMetaDirName); + + if (!read_only_) { + // we might need to clean up from previous crash or I/O errors + might_need_garbage_collect_ = true; + + if (options_.max_valid_backups_to_open != + std::numeric_limits::max()) { + options_.max_valid_backups_to_open = std::numeric_limits::max(); + ROCKS_LOG_WARN( + options_.info_log, + "`max_valid_backups_to_open` is not set to the default value. " + "Ignoring its value since BackupEngine is not read-only."); + } + + // gather the list of directories that we need to create + std::vector*>> + directories; + directories.emplace_back(GetAbsolutePath(), &backup_directory_); + if (options_.share_table_files) { + if (options_.share_files_with_checksum) { + directories.emplace_back( + GetAbsolutePath(GetSharedFileWithChecksumRel()), + &shared_directory_); + } else { + directories.emplace_back(GetAbsolutePath(GetSharedFileRel()), + &shared_directory_); + } + } + directories.emplace_back(GetAbsolutePath(kPrivateDirName), + &private_directory_); + directories.emplace_back(meta_path, &meta_directory_); + // create all the dirs we need + for (const auto& d : directories) { + IOStatus io_s = + backup_fs_->CreateDirIfMissing(d.first, io_options_, nullptr); + if (io_s.ok()) { + io_s = + backup_fs_->NewDirectory(d.first, io_options_, d.second, nullptr); + } + if (!io_s.ok()) { + return io_s; + } + } + } + + std::vector backup_meta_files; + { + IOStatus io_s = backup_fs_->GetChildren(meta_path, io_options_, + &backup_meta_files, nullptr); + if (io_s.IsNotFound()) { + return IOStatus::NotFound(meta_path + " is missing"); + } else if (!io_s.ok()) { + return io_s; + } + } + // create backups_ structure + for (auto& file : backup_meta_files) { + ROCKS_LOG_INFO(options_.info_log, "Detected backup %s", file.c_str()); + BackupID backup_id = 0; + sscanf(file.c_str(), "%u", &backup_id); + if (backup_id == 0 || file != std::to_string(backup_id)) { + // Invalid file name, will be deleted with auto-GC when user + // initiates an append or write operation. (Behave as read-only until + // then.) 
+ ROCKS_LOG_INFO(options_.info_log, "Skipping unrecognized meta file %s", + file.c_str()); + continue; + } + assert(backups_.find(backup_id) == backups_.end()); + // Insert all the (backup_id, BackupMeta) that will be loaded later + // The loading performed later will check whether there are corrupt backups + // and move the corrupt backups to corrupt_backups_ + backups_.insert(std::make_pair( + backup_id, std::unique_ptr(new BackupMeta( + GetBackupMetaFile(backup_id, false /* tmp */), + GetBackupMetaFile(backup_id, true /* tmp */), + &backuped_file_infos_, backup_env_, backup_fs_)))); + } + + latest_backup_id_ = 0; + latest_valid_backup_id_ = 0; + if (options_.destroy_old_data) { // Destroy old data + assert(!read_only_); + ROCKS_LOG_INFO( + options_.info_log, + "Backup Engine started with destroy_old_data == true, deleting all " + "backups"); + IOStatus io_s = PurgeOldBackups(0); + if (io_s.ok()) { + io_s = GarbageCollect(); + } + if (!io_s.ok()) { + return io_s; + } + } else { // Load data from storage + // abs_path_to_size: maps absolute paths of files in backup directory to + // their corresponding sizes + std::unordered_map abs_path_to_size; + // Insert files and their sizes in backup sub-directories (shared and + // shared_checksum) to abs_path_to_size + for (const auto& rel_dir : + {GetSharedFileRel(), GetSharedFileWithChecksumRel()}) { + const auto abs_dir = GetAbsolutePath(rel_dir); + IOStatus io_s = + ReadChildFileCurrentSizes(abs_dir, backup_fs_, &abs_path_to_size); + if (!io_s.ok()) { + // I/O error likely impacting all backups + return io_s; + } + } + // load the backups if any, until valid_backups_to_open of the latest + // non-corrupted backups have been successfully opened. + int valid_backups_to_open = options_.max_valid_backups_to_open; + for (auto backup_iter = backups_.rbegin(); backup_iter != backups_.rend(); + ++backup_iter) { + assert(latest_backup_id_ == 0 || latest_backup_id_ > backup_iter->first); + if (latest_backup_id_ == 0) { + latest_backup_id_ = backup_iter->first; + } + if (valid_backups_to_open == 0) { + break; + } + + // Insert files and their sizes in backup sub-directories + // (private/backup_id) to abs_path_to_size + IOStatus io_s = ReadChildFileCurrentSizes( + GetAbsolutePath(GetPrivateFileRel(backup_iter->first)), backup_fs_, + &abs_path_to_size); + if (io_s.ok()) { + io_s = backup_iter->second->LoadFromFile( + options_.backup_dir, abs_path_to_size, + options_.backup_rate_limiter.get(), options_.info_log, + &reported_ignored_fields_); + } + if (io_s.IsCorruption() || io_s.IsNotSupported()) { + ROCKS_LOG_INFO(options_.info_log, "Backup %u corrupted -- %s", + backup_iter->first, io_s.ToString().c_str()); + corrupt_backups_.insert(std::make_pair( + backup_iter->first, + std::make_pair(io_s, std::move(backup_iter->second)))); + } else if (!io_s.ok()) { + // Distinguish corruption errors from errors in the backup Env. + // Errors in the backup Env (i.e., this code path) will cause Open() to + // fail, whereas corruption errors would not cause Open() failures. 
+ return io_s; + } else { + ROCKS_LOG_INFO(options_.info_log, "Loading backup %" PRIu32 " OK:\n%s", + backup_iter->first, + backup_iter->second->GetInfoString().c_str()); + assert(latest_valid_backup_id_ == 0 || + latest_valid_backup_id_ > backup_iter->first); + if (latest_valid_backup_id_ == 0) { + latest_valid_backup_id_ = backup_iter->first; + } + --valid_backups_to_open; + } + } + + for (const auto& corrupt : corrupt_backups_) { + backups_.erase(backups_.find(corrupt.first)); + } + // erase the backups before max_valid_backups_to_open + int num_unopened_backups; + if (options_.max_valid_backups_to_open == 0) { + num_unopened_backups = 0; + } else { + num_unopened_backups = + std::max(0, static_cast(backups_.size()) - + options_.max_valid_backups_to_open); + } + for (int i = 0; i < num_unopened_backups; ++i) { + assert(backups_.begin()->second->Empty()); + backups_.erase(backups_.begin()); + } + } + + ROCKS_LOG_INFO(options_.info_log, "Latest backup is %u", latest_backup_id_); + ROCKS_LOG_INFO(options_.info_log, "Latest valid backup is %u", + latest_valid_backup_id_); + + // set up threads perform copies from files_to_copy_or_create_ in the + // background + threads_cpu_priority_ = CpuPriority::kNormal; + threads_.reserve(options_.max_background_operations); + for (int t = 0; t < options_.max_background_operations; t++) { + threads_.emplace_back([this]() { +#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 12) + pthread_setname_np(pthread_self(), "backup_engine"); +#endif +#endif + CpuPriority current_priority = CpuPriority::kNormal; + CopyOrCreateWorkItem work_item; + uint64_t bytes_toward_next_callback = 0; + while (files_to_copy_or_create_.read(work_item)) { + CpuPriority priority = threads_cpu_priority_; + if (current_priority != priority) { + TEST_SYNC_POINT_CALLBACK( + "BackupEngineImpl::Initialize:SetCpuPriority", &priority); + port::SetCpuPriority(0, priority); + current_priority = priority; + } + // `bytes_read` and `bytes_written` stats are enabled based on + // compile-time support and cannot be dynamically toggled. So we do not + // need to worry about `PerfLevel` here, unlike many other + // `IOStatsContext` / `PerfContext` stats. + uint64_t prev_bytes_read = IOSTATS(bytes_read); + uint64_t prev_bytes_written = IOSTATS(bytes_written); + + CopyOrCreateResult result; + Temperature temp = work_item.src_temperature; + result.io_status = CopyOrCreateFile( + work_item.src_path, work_item.dst_path, work_item.contents, + work_item.size_limit, work_item.src_env, work_item.dst_env, + work_item.src_env_options, work_item.sync, work_item.rate_limiter, + work_item.progress_callback, &temp, work_item.dst_temperature, + &bytes_toward_next_callback, &result.size, &result.checksum_hex); + + RecordTick(work_item.stats, BACKUP_READ_BYTES, + IOSTATS(bytes_read) - prev_bytes_read); + RecordTick(work_item.stats, BACKUP_WRITE_BYTES, + IOSTATS(bytes_written) - prev_bytes_written); + + result.db_id = work_item.db_id; + result.db_session_id = work_item.db_session_id; + result.expected_src_temperature = work_item.src_temperature; + result.current_src_temperature = temp; + if (result.io_status.ok() && !work_item.src_checksum_hex.empty()) { + // unknown checksum function name implies no db table file checksum in + // db manifest; work_item.src_checksum_hex not empty means + // backup engine has calculated its crc32c checksum for the table + // file; therefore, we are able to compare the checksums. 
+ if (work_item.src_checksum_func_name == + kUnknownFileChecksumFuncName || + work_item.src_checksum_func_name == kDbFileChecksumFuncName) { + if (work_item.src_checksum_hex != result.checksum_hex) { + std::string checksum_info( + "Expected checksum is " + work_item.src_checksum_hex + + " while computed checksum is " + result.checksum_hex); + result.io_status = IOStatus::Corruption( + "Checksum mismatch after copying to " + work_item.dst_path + + ": " + checksum_info); + } + } else { + // FIXME(peterd): dead code? + std::string checksum_function_info( + "Existing checksum function is " + + work_item.src_checksum_func_name + + " while provided checksum function is " + + kBackupFileChecksumFuncName); + ROCKS_LOG_INFO( + options_.info_log, + "Unable to verify checksum after copying to %s: %s\n", + work_item.dst_path.c_str(), checksum_function_info.c_str()); + } + } + work_item.result.set_value(std::move(result)); + } + }); + } + ROCKS_LOG_INFO(options_.info_log, "Initialized BackupEngine"); + return IOStatus::OK(); +} + +IOStatus BackupEngineImpl::CreateNewBackupWithMetadata( + const CreateBackupOptions& options, DB* db, const std::string& app_metadata, + BackupID* new_backup_id_ptr) { + assert(initialized_); + assert(!read_only_); + if (app_metadata.size() > kMaxAppMetaSize) { + return IOStatus::InvalidArgument("App metadata too large"); + } + + if (options.decrease_background_thread_cpu_priority) { + if (options.background_thread_cpu_priority < threads_cpu_priority_) { + threads_cpu_priority_.store(options.background_thread_cpu_priority); + } + } + + BackupID new_backup_id = latest_backup_id_ + 1; + + // `bytes_read` and `bytes_written` stats are enabled based on compile-time + // support and cannot be dynamically toggled. So we do not need to worry about + // `PerfLevel` here, unlike many other `IOStatsContext` / `PerfContext` stats. + uint64_t prev_bytes_read = IOSTATS(bytes_read); + uint64_t prev_bytes_written = IOSTATS(bytes_written); + + assert(backups_.find(new_backup_id) == backups_.end()); + + auto private_dir = GetAbsolutePath(GetPrivateFileRel(new_backup_id)); + IOStatus io_s = backup_fs_->FileExists(private_dir, io_options_, nullptr); + if (io_s.ok()) { + // maybe last backup failed and left partial state behind, clean it up. + // need to do this before updating backups_ such that a private dir + // named after new_backup_id will be cleaned up. + // (If an incomplete new backup is followed by an incomplete delete + // of the latest full backup, then there could be more than one next + // id with a private dir, the last thing to be deleted in delete + // backup, but all will be cleaned up with a GarbageCollect.) 
+ io_s = GarbageCollect(); + } else if (io_s.IsNotFound()) { + // normal case, the new backup's private dir doesn't exist yet + io_s = IOStatus::OK(); + } + + auto ret = backups_.insert(std::make_pair( + new_backup_id, std::unique_ptr(new BackupMeta( + GetBackupMetaFile(new_backup_id, false /* tmp */), + GetBackupMetaFile(new_backup_id, true /* tmp */), + &backuped_file_infos_, backup_env_, backup_fs_)))); + assert(ret.second == true); + auto& new_backup = ret.first->second; + new_backup->RecordTimestamp(); + new_backup->SetAppMetadata(app_metadata); + + auto start_backup = backup_env_->NowMicros(); + + ROCKS_LOG_INFO(options_.info_log, + "Started the backup process -- creating backup %u", + new_backup_id); + + if (options_.share_table_files && !options_.share_files_with_checksum) { + ROCKS_LOG_WARN(options_.info_log, + "BackupEngineOptions::share_files_with_checksum=false is " + "DEPRECATED and could lead to data loss."); + } + + if (io_s.ok()) { + io_s = backup_fs_->CreateDir(private_dir, io_options_, nullptr); + } + + // A set into which we will insert the dst_paths that are calculated for live + // files and live WAL files. + // This is used to check whether a live files shares a dst_path with another + // live file. + std::unordered_set live_dst_paths; + + std::vector backup_items_to_finish; + // Add a CopyOrCreateWorkItem to the channel for each live file + Status disabled = db->DisableFileDeletions(); + DBOptions db_options = db->GetDBOptions(); + Statistics* stats = db_options.statistics.get(); + if (io_s.ok()) { + CheckpointImpl checkpoint(db); + uint64_t sequence_number = 0; + FileChecksumGenFactory* db_checksum_factory = + db_options.file_checksum_gen_factory.get(); + const std::string kFileChecksumGenFactoryName = + "FileChecksumGenCrc32cFactory"; + bool compare_checksum = + db_checksum_factory != nullptr && + db_checksum_factory->Name() == kFileChecksumGenFactoryName + ? true + : false; + EnvOptions src_raw_env_options(db_options); + RateLimiter* rate_limiter = options_.backup_rate_limiter.get(); + io_s = status_to_io_status(checkpoint.CreateCustomCheckpoint( + [&](const std::string& /*src_dirname*/, const std::string& /*fname*/, + FileType) { + // custom checkpoint will switch to calling copy_file_cb after it sees + // NotSupported returned from link_file_cb. 
+ return IOStatus::NotSupported(); + } /* link_file_cb */, + [&](const std::string& src_dirname, const std::string& fname, + uint64_t size_limit_bytes, FileType type, + const std::string& checksum_func_name, + const std::string& checksum_val, + const Temperature src_temperature) { + if (type == kWalFile && !options_.backup_log_files) { + return IOStatus::OK(); + } + Log(options_.info_log, "add file for backup %s", fname.c_str()); + uint64_t size_bytes = 0; + IOStatus io_st; + if (type == kTableFile || type == kBlobFile) { + io_st = db_fs_->GetFileSize(src_dirname + "/" + fname, io_options_, + &size_bytes, nullptr); + if (!io_st.ok()) { + Log(options_.info_log, "GetFileSize is failed: %s", + io_st.ToString().c_str()); + return io_st; + } + } + EnvOptions src_env_options; + switch (type) { + case kWalFile: + src_env_options = + db_env_->OptimizeForLogRead(src_raw_env_options); + break; + case kTableFile: + src_env_options = db_env_->OptimizeForCompactionTableRead( + src_raw_env_options, ImmutableDBOptions(db_options)); + break; + case kDescriptorFile: + src_env_options = + db_env_->OptimizeForManifestRead(src_raw_env_options); + break; + case kBlobFile: + src_env_options = db_env_->OptimizeForBlobFileRead( + src_raw_env_options, ImmutableDBOptions(db_options)); + break; + default: + // Other backed up files (like options file) are not read by live + // DB, so don't need to worry about avoiding mixing buffered and + // direct I/O. Just use plain defaults. + src_env_options = src_raw_env_options; + break; + } + io_st = AddBackupFileWorkItem( + live_dst_paths, backup_items_to_finish, new_backup_id, + options_.share_table_files && + (type == kTableFile || type == kBlobFile), + src_dirname, fname, src_env_options, rate_limiter, type, + size_bytes, db_options.statistics.get(), size_limit_bytes, + options_.share_files_with_checksum && + (type == kTableFile || type == kBlobFile), + options.progress_callback, "" /* contents */, checksum_func_name, + checksum_val, src_temperature); + return io_st; + } /* copy_file_cb */, + [&](const std::string& fname, const std::string& contents, + FileType type) { + Log(options_.info_log, "add file for backup %s", fname.c_str()); + return AddBackupFileWorkItem( + live_dst_paths, backup_items_to_finish, new_backup_id, + false /* shared */, "" /* src_dir */, fname, + EnvOptions() /* src_env_options */, rate_limiter, type, + contents.size(), db_options.statistics.get(), 0 /* size_limit */, + false /* shared_checksum */, options.progress_callback, contents); + } /* create_file_cb */, + &sequence_number, + options.flush_before_backup ? 
0 : std::numeric_limits::max(), + compare_checksum)); + if (io_s.ok()) { + new_backup->SetSequenceNumber(sequence_number); + } + } + ROCKS_LOG_INFO(options_.info_log, "add files for backup done, wait finish."); + IOStatus item_io_status; + for (auto& item : backup_items_to_finish) { + item.result.wait(); + auto result = item.result.get(); + item_io_status = result.io_status; + Temperature temp = result.expected_src_temperature; + if (result.current_src_temperature != Temperature::kUnknown && + (temp == Temperature::kUnknown || + options_.current_temperatures_override_manifest)) { + temp = result.current_src_temperature; + } + if (item_io_status.ok() && item.shared && item.needed_to_copy) { + item_io_status = item.backup_env->GetFileSystem()->RenameFile( + item.dst_path_tmp, item.dst_path, io_options_, nullptr); + } + if (item_io_status.ok()) { + item_io_status = new_backup.get()->AddFile(std::make_shared( + item.dst_relative, result.size, result.checksum_hex, result.db_id, + result.db_session_id, temp)); + } + if (!item_io_status.ok()) { + io_s = item_io_status; + } + } + + // we copied all the files, enable file deletions + if (disabled.ok()) { // If we successfully disabled file deletions + db->EnableFileDeletions(false).PermitUncheckedError(); + } + auto backup_time = backup_env_->NowMicros() - start_backup; + + if (io_s.ok()) { + // persist the backup metadata on the disk + io_s = new_backup->StoreToFile(options_.sync, options_.schema_version, + schema_test_options_.get()); + } + if (io_s.ok() && options_.sync) { + std::unique_ptr backup_private_directory; + backup_fs_ + ->NewDirectory(GetAbsolutePath(GetPrivateFileRel(new_backup_id, false)), + io_options_, &backup_private_directory, nullptr) + .PermitUncheckedError(); + if (backup_private_directory != nullptr) { + io_s = backup_private_directory->FsyncWithDirOptions(io_options_, nullptr, + DirFsyncOptions()); + } + if (io_s.ok() && private_directory_ != nullptr) { + io_s = private_directory_->FsyncWithDirOptions(io_options_, nullptr, + DirFsyncOptions()); + } + if (io_s.ok() && meta_directory_ != nullptr) { + io_s = meta_directory_->FsyncWithDirOptions(io_options_, nullptr, + DirFsyncOptions()); + } + if (io_s.ok() && shared_directory_ != nullptr) { + io_s = shared_directory_->FsyncWithDirOptions(io_options_, nullptr, + DirFsyncOptions()); + } + if (io_s.ok() && backup_directory_ != nullptr) { + io_s = backup_directory_->FsyncWithDirOptions(io_options_, nullptr, + DirFsyncOptions()); + } + } + + if (io_s.ok()) { + backup_statistics_.IncrementNumberSuccessBackup(); + // here we know that we succeeded and installed the new backup + latest_backup_id_ = new_backup_id; + latest_valid_backup_id_ = new_backup_id; + if (new_backup_id_ptr) { + *new_backup_id_ptr = new_backup_id; + } + ROCKS_LOG_INFO(options_.info_log, "Backup DONE. 
All is good"); + + // backup_speed is in byte/second + double backup_speed = new_backup->GetSize() / (1.048576 * backup_time); + ROCKS_LOG_INFO(options_.info_log, "Backup number of files: %u", + new_backup->GetNumberFiles()); + char human_size[16]; + AppendHumanBytes(new_backup->GetSize(), human_size, sizeof(human_size)); + ROCKS_LOG_INFO(options_.info_log, "Backup size: %s", human_size); + ROCKS_LOG_INFO(options_.info_log, "Backup time: %" PRIu64 " microseconds", + backup_time); + ROCKS_LOG_INFO(options_.info_log, "Backup speed: %.3f MB/s", backup_speed); + ROCKS_LOG_INFO(options_.info_log, "Backup Statistics %s", + backup_statistics_.ToString().c_str()); + } else { + backup_statistics_.IncrementNumberFailBackup(); + // clean all the files we might have created + ROCKS_LOG_INFO(options_.info_log, "Backup failed -- %s", + io_s.ToString().c_str()); + ROCKS_LOG_INFO(options_.info_log, "Backup Statistics %s\n", + backup_statistics_.ToString().c_str()); + // delete files that we might have already written + might_need_garbage_collect_ = true; + DeleteBackup(new_backup_id).PermitUncheckedError(); + } + + RecordTick(stats, BACKUP_READ_BYTES, IOSTATS(bytes_read) - prev_bytes_read); + RecordTick(stats, BACKUP_WRITE_BYTES, + IOSTATS(bytes_written) - prev_bytes_written); + return io_s; +} + +IOStatus BackupEngineImpl::PurgeOldBackups(uint32_t num_backups_to_keep) { + assert(initialized_); + assert(!read_only_); + + // Best effort deletion even with errors + IOStatus overall_status = IOStatus::OK(); + + ROCKS_LOG_INFO(options_.info_log, "Purging old backups, keeping %u", + num_backups_to_keep); + std::vector to_delete; + auto itr = backups_.begin(); + while ((backups_.size() - to_delete.size()) > num_backups_to_keep) { + to_delete.push_back(itr->first); + itr++; + } + for (auto backup_id : to_delete) { + // Do not GC until end + IOStatus io_s = DeleteBackupNoGC(backup_id); + if (!io_s.ok()) { + overall_status = io_s; + } + } + // Clean up after any incomplete backup deletion, potentially from + // earlier session. + if (might_need_garbage_collect_) { + IOStatus io_s = GarbageCollect(); + if (!io_s.ok() && overall_status.ok()) { + overall_status = io_s; + } + } + return overall_status; +} + +IOStatus BackupEngineImpl::DeleteBackup(BackupID backup_id) { + IOStatus s1 = DeleteBackupNoGC(backup_id); + IOStatus s2 = IOStatus::OK(); + + // Clean up after any incomplete backup deletion, potentially from + // earlier session. + if (might_need_garbage_collect_) { + s2 = GarbageCollect(); + } + + if (!s1.ok()) { + // Any failure in the primary objective trumps any failure in the + // secondary objective. + s2.PermitUncheckedError(); + return s1; + } else { + return s2; + } +} + +// Does not auto-GarbageCollect nor lock +IOStatus BackupEngineImpl::DeleteBackupNoGC(BackupID backup_id) { + assert(initialized_); + assert(!read_only_); + + ROCKS_LOG_INFO(options_.info_log, "Deleting backup %u", backup_id); + auto backup = backups_.find(backup_id); + if (backup != backups_.end()) { + IOStatus io_s = backup->second->Delete(); + if (!io_s.ok()) { + return io_s; + } + backups_.erase(backup); + } else { + auto corrupt = corrupt_backups_.find(backup_id); + if (corrupt == corrupt_backups_.end()) { + return IOStatus::NotFound("Backup not found"); + } + IOStatus io_s = corrupt->second.second->Delete(); + if (!io_s.ok()) { + return io_s; + } + corrupt->second.first.PermitUncheckedError(); + corrupt_backups_.erase(corrupt); + } + + // After removing meta file, best effort deletion even with errors. 
+ // (Don't delete other files if we can't delete the meta file right + // now.) + std::vector to_delete; + for (auto& itr : backuped_file_infos_) { + if (itr.second->refs == 0) { + IOStatus io_s = backup_fs_->DeleteFile(GetAbsolutePath(itr.first), + io_options_, nullptr); + ROCKS_LOG_INFO(options_.info_log, "Deleting %s -- %s", itr.first.c_str(), + io_s.ToString().c_str()); + to_delete.push_back(itr.first); + if (!io_s.ok()) { + // Trying again later might work + might_need_garbage_collect_ = true; + } + } + } + for (auto& td : to_delete) { + backuped_file_infos_.erase(td); + } + + // take care of private dirs -- GarbageCollect() will take care of them + // if they are not empty + std::string private_dir = GetPrivateFileRel(backup_id); + IOStatus io_s = + backup_fs_->DeleteDir(GetAbsolutePath(private_dir), io_options_, nullptr); + ROCKS_LOG_INFO(options_.info_log, "Deleting private dir %s -- %s", + private_dir.c_str(), io_s.ToString().c_str()); + if (!io_s.ok()) { + // Full gc or trying again later might work + might_need_garbage_collect_ = true; + } + return IOStatus::OK(); +} + +void BackupEngineImpl::SetBackupInfoFromBackupMeta( + BackupID id, const BackupMeta& meta, BackupInfo* backup_info, + bool include_file_details) const { + *backup_info = BackupInfo(id, meta.GetTimestamp(), meta.GetSize(), + meta.GetNumberFiles(), meta.GetAppMetadata()); + std::string dir = + options_.backup_dir + "/" + kPrivateDirSlash + std::to_string(id); + if (include_file_details) { + auto& file_details = backup_info->file_details; + file_details.reserve(meta.GetFiles().size()); + for (auto& file_ptr : meta.GetFiles()) { + BackupFileInfo& finfo = *file_details.emplace(file_details.end()); + finfo.relative_filename = file_ptr->filename; + finfo.size = file_ptr->size; + finfo.directory = dir; + uint64_t number; + FileType type; + bool ok = ParseFileName(file_ptr->filename, &number, &type); + if (ok) { + finfo.file_number = number; + finfo.file_type = type; + } + // TODO: temperature, file_checksum, file_checksum_func_name + } + backup_info->name_for_open = GetAbsolutePath(GetPrivateFileRel(id)); + backup_info->name_for_open.pop_back(); // remove trailing '/' + backup_info->env_for_open = meta.GetEnvForOpen(); + } +} + +Status BackupEngineImpl::GetBackupInfo(BackupID backup_id, + BackupInfo* backup_info, + bool include_file_details) const { + assert(initialized_); + if (backup_id == kLatestBackupIDMarker) { + // Note: Read latest_valid_backup_id_ inside of lock + backup_id = latest_valid_backup_id_; + } + auto corrupt_itr = corrupt_backups_.find(backup_id); + if (corrupt_itr != corrupt_backups_.end()) { + return Status::Corruption(corrupt_itr->second.first.ToString()); + } + auto backup_itr = backups_.find(backup_id); + if (backup_itr == backups_.end()) { + return Status::NotFound("Backup not found"); + } + auto& backup = backup_itr->second; + if (backup->Empty()) { + return Status::NotFound("Backup not found"); + } + + SetBackupInfoFromBackupMeta(backup_id, *backup, backup_info, + include_file_details); + return Status::OK(); +} + +void BackupEngineImpl::GetBackupInfo(std::vector* backup_info, + bool include_file_details) const { + assert(initialized_); + backup_info->resize(backups_.size()); + size_t i = 0; + for (auto& backup : backups_) { + const BackupMeta& meta = *backup.second; + if (!meta.Empty()) { + SetBackupInfoFromBackupMeta(backup.first, meta, &backup_info->at(i++), + include_file_details); + } + } +} + +void BackupEngineImpl::GetCorruptedBackups( + std::vector* corrupt_backup_ids) const { + 
assert(initialized_); + corrupt_backup_ids->reserve(corrupt_backups_.size()); + for (auto& backup : corrupt_backups_) { + corrupt_backup_ids->push_back(backup.first); + } +} + +IOStatus BackupEngineImpl::RestoreDBFromBackup( + const RestoreOptions& options, BackupID backup_id, + const std::string& db_dir, const std::string& wal_dir) const { + assert(initialized_); + if (backup_id == kLatestBackupIDMarker) { + // Note: Read latest_valid_backup_id_ inside of lock + backup_id = latest_valid_backup_id_; + } + auto corrupt_itr = corrupt_backups_.find(backup_id); + if (corrupt_itr != corrupt_backups_.end()) { + return corrupt_itr->second.first; + } + auto backup_itr = backups_.find(backup_id); + if (backup_itr == backups_.end()) { + return IOStatus::NotFound("Backup not found"); + } + auto& backup = backup_itr->second; + if (backup->Empty()) { + return IOStatus::NotFound("Backup not found"); + } + + ROCKS_LOG_INFO(options_.info_log, "Restoring backup id %u\n", backup_id); + ROCKS_LOG_INFO(options_.info_log, "keep_log_files: %d\n", + static_cast(options.keep_log_files)); + + // just in case. Ignore errors + db_fs_->CreateDirIfMissing(db_dir, io_options_, nullptr) + .PermitUncheckedError(); + db_fs_->CreateDirIfMissing(wal_dir, io_options_, nullptr) + .PermitUncheckedError(); + + if (options.keep_log_files) { + // delete files in db_dir, but keep all the log files + DeleteChildren(db_dir, 1 << kWalFile); + // move all the files from archive dir to wal_dir + std::string archive_dir = ArchivalDirectory(wal_dir); + std::vector archive_files; + db_fs_->GetChildren(archive_dir, io_options_, &archive_files, nullptr) + .PermitUncheckedError(); // ignore errors + for (const auto& f : archive_files) { + uint64_t number; + FileType type; + bool ok = ParseFileName(f, &number, &type); + if (ok && type == kWalFile) { + ROCKS_LOG_INFO(options_.info_log, + "Moving log file from archive/ to wal_dir: %s", + f.c_str()); + IOStatus io_s = db_fs_->RenameFile( + archive_dir + "/" + f, wal_dir + "/" + f, io_options_, nullptr); + if (!io_s.ok()) { + // if we can't move log file from archive_dir to wal_dir, + // we should fail, since it might mean data loss + return io_s; + } + } + } + } else { + DeleteChildren(wal_dir); + DeleteChildren(ArchivalDirectory(wal_dir)); + DeleteChildren(db_dir); + } + + IOStatus io_s; + std::vector restore_items_to_finish; + std::string temporary_current_file; + std::string final_current_file; + std::unique_ptr db_dir_for_fsync; + std::unique_ptr wal_dir_for_fsync; + + for (const auto& file_info : backup->GetFiles()) { + const std::string& file = file_info->filename; + // 1. get DB filename + std::string dst = file_info->GetDbFileName(); + + // 2. find the filetype + uint64_t number; + FileType type; + bool ok = ParseFileName(dst, &number, &type); + if (!ok) { + return IOStatus::Corruption("Backup corrupted: Fail to parse filename " + + dst); + } + // 3. Construct the final path + // kWalFile lives in wal_dir and all the rest live in db_dir + if (type == kWalFile) { + dst = wal_dir + "/" + dst; + if (options_.sync && !wal_dir_for_fsync) { + io_s = db_fs_->NewDirectory(wal_dir, io_options_, &wal_dir_for_fsync, + nullptr); + if (!io_s.ok()) { + return io_s; + } + } + } else { + dst = db_dir + "/" + dst; + if (options_.sync && !db_dir_for_fsync) { + io_s = db_fs_->NewDirectory(db_dir, io_options_, &db_dir_for_fsync, + nullptr); + if (!io_s.ok()) { + return io_s; + } + } + } + // For atomicity, initially restore CURRENT file to a temporary name. 
+ // This is useful even without options_.sync e.g. in case the restore + // process is interrupted. + if (type == kCurrentFile) { + final_current_file = dst; + dst = temporary_current_file = dst + ".tmp"; + } + + ROCKS_LOG_INFO(options_.info_log, "Restoring %s to %s\n", file.c_str(), + dst.c_str()); + CopyOrCreateWorkItem copy_or_create_work_item( + GetAbsolutePath(file), dst, Temperature::kUnknown /* src_temp */, + file_info->temp, "" /* contents */, backup_env_, db_env_, + EnvOptions() /* src_env_options */, options_.sync, + options_.restore_rate_limiter.get(), file_info->size, + nullptr /* stats */); + RestoreAfterCopyOrCreateWorkItem after_copy_or_create_work_item( + copy_or_create_work_item.result.get_future(), file, dst, + file_info->checksum_hex); + files_to_copy_or_create_.write(std::move(copy_or_create_work_item)); + restore_items_to_finish.push_back( + std::move(after_copy_or_create_work_item)); + } + IOStatus item_io_status; + for (auto& item : restore_items_to_finish) { + item.result.wait(); + auto result = item.result.get(); + item_io_status = result.io_status; + // Note: It is possible that both of the following bad-status cases occur + // during copying. But, we only return one status. + if (!item_io_status.ok()) { + io_s = item_io_status; + break; + } else if (!item.checksum_hex.empty() && + item.checksum_hex != result.checksum_hex) { + io_s = IOStatus::Corruption( + "While restoring " + item.from_file + " -> " + item.to_file + + ": expected checksum is " + item.checksum_hex + + " while computed checksum is " + result.checksum_hex); + break; + } + } + + // When enabled, the first FsyncWithDirOptions is to ensure all files are + // fully persisted before renaming CURRENT.tmp + if (io_s.ok() && db_dir_for_fsync) { + ROCKS_LOG_INFO(options_.info_log, "Restore: fsync\n"); + io_s = db_dir_for_fsync->FsyncWithDirOptions(io_options_, nullptr, + DirFsyncOptions()); + } + + if (io_s.ok() && wal_dir_for_fsync) { + io_s = wal_dir_for_fsync->FsyncWithDirOptions(io_options_, nullptr, + DirFsyncOptions()); + } + + if (io_s.ok() && !temporary_current_file.empty()) { + ROCKS_LOG_INFO(options_.info_log, "Restore: atomic rename CURRENT.tmp\n"); + assert(!final_current_file.empty()); + io_s = db_fs_->RenameFile(temporary_current_file, final_current_file, + io_options_, nullptr); + } + + if (io_s.ok() && db_dir_for_fsync && !temporary_current_file.empty()) { + // Second FsyncWithDirOptions is to ensure the final atomic rename of DB + // restore is fully persisted even if power goes out right after restore + // operation returns success + assert(db_dir_for_fsync); + io_s = db_dir_for_fsync->FsyncWithDirOptions( + io_options_, nullptr, DirFsyncOptions(final_current_file)); + } + + ROCKS_LOG_INFO(options_.info_log, "Restoring done -- %s\n", + io_s.ToString().c_str()); + return io_s; +} + +IOStatus BackupEngineImpl::VerifyBackup(BackupID backup_id, + bool verify_with_checksum) const { + assert(initialized_); + // Check if backup_id is corrupted, or valid and registered + auto corrupt_itr = corrupt_backups_.find(backup_id); + if (corrupt_itr != corrupt_backups_.end()) { + return corrupt_itr->second.first; + } + + auto backup_itr = backups_.find(backup_id); + if (backup_itr == backups_.end()) { + return IOStatus::NotFound(); + } + + auto& backup = backup_itr->second; + if (backup->Empty()) { + return IOStatus::NotFound(); + } + + ROCKS_LOG_INFO(options_.info_log, "Verifying backup id %u\n", backup_id); + + // Find all existing backup files belong to backup_id + std::unordered_map 
curr_abs_path_to_size; + for (const auto& rel_dir : {GetPrivateFileRel(backup_id), GetSharedFileRel(), + GetSharedFileWithChecksumRel()}) { + const auto abs_dir = GetAbsolutePath(rel_dir); + // Shared directories allowed to be missing in some cases. Expected but + // missing files will be reported a few lines down. + ReadChildFileCurrentSizes(abs_dir, backup_fs_, &curr_abs_path_to_size) + .PermitUncheckedError(); + } + + // For all files registered in backup + for (const auto& file_info : backup->GetFiles()) { + const auto abs_path = GetAbsolutePath(file_info->filename); + // check existence of the file + if (curr_abs_path_to_size.find(abs_path) == curr_abs_path_to_size.end()) { + return IOStatus::NotFound("File missing: " + abs_path); + } + // verify file size + if (file_info->size != curr_abs_path_to_size[abs_path]) { + std::string size_info("Expected file size is " + + std::to_string(file_info->size) + + " while found file size is " + + std::to_string(curr_abs_path_to_size[abs_path])); + return IOStatus::Corruption("File corrupted: File size mismatch for " + + abs_path + ": " + size_info); + } + if (verify_with_checksum && !file_info->checksum_hex.empty()) { + // verify file checksum + std::string checksum_hex; + ROCKS_LOG_INFO(options_.info_log, "Verifying %s checksum...\n", + abs_path.c_str()); + IOStatus io_s = ReadFileAndComputeChecksum( + abs_path, backup_fs_, EnvOptions(), 0 /* size_limit */, &checksum_hex, + Temperature::kUnknown); + if (!io_s.ok()) { + return io_s; + } else if (file_info->checksum_hex != checksum_hex) { + std::string checksum_info( + "Expected checksum is " + file_info->checksum_hex + + " while computed checksum is " + checksum_hex); + return IOStatus::Corruption("File corrupted: Checksum mismatch for " + + abs_path + ": " + checksum_info); + } + } + } + return IOStatus::OK(); +} + +IOStatus BackupEngineImpl::CopyOrCreateFile( + const std::string& src, const std::string& dst, const std::string& contents, + uint64_t size_limit, Env* src_env, Env* dst_env, + const EnvOptions& src_env_options, bool sync, RateLimiter* rate_limiter, + std::function progress_callback, Temperature* src_temperature, + Temperature dst_temperature, uint64_t* bytes_toward_next_callback, + uint64_t* size, std::string* checksum_hex) { + assert(src.empty() != contents.empty()); + IOStatus io_s; + std::unique_ptr dst_file; + std::unique_ptr src_file; + FileOptions dst_file_options; + dst_file_options.use_mmap_writes = false; + dst_file_options.temperature = dst_temperature; + // TODO:(gzh) maybe use direct reads/writes here if possible + if (size != nullptr) { + *size = 0; + } + uint32_t checksum_value = 0; + + // Check if size limit is set. if not, set it to very big number + if (size_limit == 0) { + size_limit = std::numeric_limits::max(); + } + + io_s = dst_env->GetFileSystem()->NewWritableFile(dst, dst_file_options, + &dst_file, nullptr); + if (io_s.ok() && !src.empty()) { + auto src_file_options = FileOptions(src_env_options); + src_file_options.temperature = *src_temperature; + io_s = src_env->GetFileSystem()->NewSequentialFile(src, src_file_options, + &src_file, nullptr); + } + if (io_s.IsPathNotFound() && *src_temperature != Temperature::kUnknown) { + // Retry without temperature hint in case the FileSystem is strict with + // non-kUnknown temperature option + io_s = src_env->GetFileSystem()->NewSequentialFile( + src, FileOptions(src_env_options), &src_file, nullptr); + } + if (!io_s.ok()) { + return io_s; + } + + size_t buf_size = + rate_limiter ? 
static_cast(rate_limiter->GetSingleBurstBytes()) + : kDefaultCopyFileBufferSize; + + std::unique_ptr dest_writer( + new WritableFileWriter(std::move(dst_file), dst, dst_file_options)); + std::unique_ptr src_reader; + std::unique_ptr buf; + if (!src.empty()) { + // Return back current temperature in FileSystem + *src_temperature = src_file->GetTemperature(); + + src_reader.reset(new SequentialFileReader( + std::move(src_file), src, nullptr /* io_tracer */, {}, rate_limiter)); + buf.reset(new char[buf_size]); + } + + Slice data; + do { + if (stop_backup_.load(std::memory_order_acquire)) { + return status_to_io_status(Status::Incomplete("Backup stopped")); + } + if (!src.empty()) { + size_t buffer_to_read = + (buf_size < size_limit) ? buf_size : static_cast(size_limit); + io_s = src_reader->Read(buffer_to_read, &data, buf.get(), + Env::IO_LOW /* rate_limiter_priority */); + *bytes_toward_next_callback += data.size(); + } else { + data = contents; + } + size_limit -= data.size(); + TEST_SYNC_POINT_CALLBACK( + "BackupEngineImpl::CopyOrCreateFile:CorruptionDuringBackup", + (src.length() > 4 && src.rfind(".sst") == src.length() - 4) ? &data + : nullptr); + + if (!io_s.ok()) { + return io_s; + } + + if (size != nullptr) { + *size += data.size(); + } + if (checksum_hex != nullptr) { + checksum_value = crc32c::Extend(checksum_value, data.data(), data.size()); + } + io_s = dest_writer->Append(data); + + if (rate_limiter != nullptr) { + if (!src.empty()) { + rate_limiter->Request(data.size(), Env::IO_LOW, nullptr /* stats */, + RateLimiter::OpType::kWrite); + } else { + LoopRateLimitRequestHelper(data.size(), rate_limiter, Env::IO_LOW, + nullptr /* stats */, + RateLimiter::OpType::kWrite); + } + } + while (*bytes_toward_next_callback >= + options_.callback_trigger_interval_size) { + *bytes_toward_next_callback -= options_.callback_trigger_interval_size; + std::lock_guard lock(byte_report_mutex_); + progress_callback(); + } + } while (io_s.ok() && contents.empty() && data.size() > 0 && size_limit > 0); + + // Convert uint32_t checksum to hex checksum + if (checksum_hex != nullptr) { + checksum_hex->assign(ChecksumInt32ToHex(checksum_value)); + } + + if (io_s.ok() && sync) { + io_s = dest_writer->Sync(false); + } + if (io_s.ok()) { + io_s = dest_writer->Close(); + } + return io_s; +} + +// fname will always start with "/" +IOStatus BackupEngineImpl::AddBackupFileWorkItem( + std::unordered_set& live_dst_paths, + std::vector& backup_items_to_finish, + BackupID backup_id, bool shared, const std::string& src_dir, + const std::string& fname, const EnvOptions& src_env_options, + RateLimiter* rate_limiter, FileType file_type, uint64_t size_bytes, + Statistics* stats, uint64_t size_limit, bool shared_checksum, + std::function progress_callback, const std::string& contents, + const std::string& src_checksum_func_name, + const std::string& src_checksum_str, const Temperature src_temperature) { + assert(contents.empty() != src_dir.empty()); + + std::string src_path = src_dir + "/" + fname; + std::string dst_relative; + std::string dst_relative_tmp; + std::string db_id; + std::string db_session_id; + // crc32c checksum in hex. empty == unavailable / unknown + std::string checksum_hex; + + // Whenever a default checksum function name is passed in, we will compares + // the corresponding checksum values after copying. Note that only table and + // blob files may have a known checksum function name passed in. 
+ // + // If no default checksum function name is passed in and db session id is not + // available, we will calculate the checksum *before* copying in two cases + // (we always calcuate checksums when copying or creating for any file types): + // a) share_files_with_checksum is true and file type is table; + // b) share_table_files is true and the file exists already. + // + // Step 0: Check if default checksum function name is passed in + if (kDbFileChecksumFuncName == src_checksum_func_name) { + if (src_checksum_str == kUnknownFileChecksum) { + return status_to_io_status( + Status::Aborted("Unknown checksum value for " + fname)); + } + checksum_hex = ChecksumStrToHex(src_checksum_str); + } + + // Step 1: Prepare the relative path to destination + if (shared && shared_checksum) { + if (GetNamingNoFlags() != BackupEngineOptions::kLegacyCrc32cAndFileSize && + file_type != kBlobFile) { + // Prepare db_session_id to add to the file name + // Ignore the returned status + // In the failed cases, db_id and db_session_id will be empty + GetFileDbIdentities(db_env_, src_env_options, src_path, src_temperature, + rate_limiter, &db_id, &db_session_id) + .PermitUncheckedError(); + } + // Calculate checksum if checksum and db session id are not available. + // If db session id is available, we will not calculate the checksum + // since the session id should suffice to avoid file name collision in + // the shared_checksum directory. + if (checksum_hex.empty() && db_session_id.empty()) { + IOStatus io_s = ReadFileAndComputeChecksum( + src_path, db_fs_, src_env_options, size_limit, &checksum_hex, + src_temperature); + if (!io_s.ok()) { + return io_s; + } + } + if (size_bytes == std::numeric_limits::max()) { + return IOStatus::NotFound("File missing: " + src_path); + } + // dst_relative depends on the following conditions: + // 1) the naming scheme is kUseDbSessionId, + // 2) db_session_id is not empty, + // 3) checksum is available in the DB manifest. + // If 1,2,3) are satisfied, then dst_relative will be of the form: + // shared_checksum/__.sst + // If 1,2) are satisfied, then dst_relative will be of the form: + // shared_checksum/_.sst + // Otherwise, dst_relative is of the form + // shared_checksum/__.sst + // + // For blob files, db_session_id is not supported with the blob file format. + // It uses original/legacy naming scheme. + // dst_relative will be of the form: + // shared_checksum/__.blob + dst_relative = GetSharedFileWithChecksum(fname, checksum_hex, size_bytes, + db_session_id); + dst_relative_tmp = GetSharedFileWithChecksumRel(dst_relative, true); + dst_relative = GetSharedFileWithChecksumRel(dst_relative, false); + } else if (shared) { + dst_relative_tmp = GetSharedFileRel(fname, true); + dst_relative = GetSharedFileRel(fname, false); + } else { + dst_relative = GetPrivateFileRel(backup_id, false, fname); + } + + // We copy into `temp_dest_path` and, once finished, rename it to + // `final_dest_path`. This allows files to atomically appear at + // `final_dest_path`. We can copy directly to the final path when atomicity + // is unnecessary, like for files in private backup directories. 
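The comment above describes the write-to-temporary-then-rename pattern the backup engine relies on so that shared backup files (and, during restore, the CURRENT file) appear atomically. A minimal standalone sketch of the same pattern, using plain C++/POSIX calls rather than RocksDB's FileSystem interface; the function name and error handling here are illustrative only:

#include <cstdio>   // std::rename
#include <fstream>
#include <string>

// Writes `contents` to `<final_path>.tmp`, then renames it into place.
// On POSIX, rename() within a single filesystem replaces final_path
// atomically: readers observe either the old file or the complete new one,
// never a partially written file.
bool PublishAtomically(const std::string& final_path,
                       const std::string& contents) {
  const std::string tmp_path = final_path + ".tmp";
  {
    std::ofstream out(tmp_path, std::ios::binary | std::ios::trunc);
    if (!out.write(contents.data(),
                   static_cast<std::streamsize>(contents.size()))) {
      return false;
    }
    // A durable version would also fsync the file and its directory here
    // before renaming, which is what the engine's FsyncWithDirOptions calls
    // are for.
  }
  return std::rename(tmp_path.c_str(), final_path.c_str()) == 0;
}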
+ const std::string* copy_dest_path; + std::string temp_dest_path; + std::string final_dest_path = GetAbsolutePath(dst_relative); + if (!dst_relative_tmp.empty()) { + temp_dest_path = GetAbsolutePath(dst_relative_tmp); + copy_dest_path = &temp_dest_path; + } else { + copy_dest_path = &final_dest_path; + } + + // Step 2: Determine whether to copy or not + // if it's shared, we also need to check if it exists -- if it does, no need + // to copy it again. + bool need_to_copy = true; + // true if final_dest_path is the same path as another live file + const bool same_path = + live_dst_paths.find(final_dest_path) != live_dst_paths.end(); + + bool file_exists = false; + if (shared && !same_path) { + // Should be in shared directory but not a live path, check existence in + // shared directory + IOStatus exist = + backup_fs_->FileExists(final_dest_path, io_options_, nullptr); + if (exist.ok()) { + file_exists = true; + } else if (exist.IsNotFound()) { + file_exists = false; + } else { + return exist; + } + } + + if (!contents.empty()) { + need_to_copy = false; + } else if (shared && (same_path || file_exists)) { + need_to_copy = false; + auto find_result = backuped_file_infos_.find(dst_relative); + if (find_result == backuped_file_infos_.end() && !same_path) { + // file exists but not referenced + ROCKS_LOG_INFO( + options_.info_log, + "%s already present, but not referenced by any backup. We will " + "overwrite the file.", + fname.c_str()); + need_to_copy = true; + // Defer any failure reporting to when we try to write the file + backup_fs_->DeleteFile(final_dest_path, io_options_, nullptr) + .PermitUncheckedError(); + } else { + // file exists and referenced + if (checksum_hex.empty()) { + // same_path should not happen for a standard DB, so OK to + // read file contents to check for checksum mismatch between + // two files from same DB getting same name. + // For compatibility with future meta file that might not have + // crc32c checksum available, consider it might be empty, but + // we don't currently generate meta file without crc32c checksum. + // Therefore we have to read & compute it if we don't have it. + if (!same_path && !find_result->second->checksum_hex.empty()) { + assert(find_result != backuped_file_infos_.end()); + // Note: to save I/O on incremental backups, we copy prior known + // checksum of the file instead of reading entire file contents + // to recompute it. + checksum_hex = find_result->second->checksum_hex; + // Regarding corruption detection, consider: + // (a) the DB file is corrupt (since previous backup) and the backup + // file is OK: we failed to detect, but the backup is safe. DB can + // be repaired/restored once its corruption is detected. + // (b) the backup file is corrupt (since previous backup) and the + // db file is OK: we failed to detect, but the backup is corrupt. + // CreateNewBackup should support fast incremental backups and + // there's no way to support that without reading all the files. + // We might add an option for extra checks on incremental backup, + // but until then, use VerifyBackups to check existing backup data. + // (c) file name collision with legitimately different content. + // This is almost inconceivable with a well-generated DB session + // ID, but even in that case, we double check the file sizes in + // BackupMeta::AddFile. 
+ } else { + IOStatus io_s = ReadFileAndComputeChecksum( + src_path, db_fs_, src_env_options, size_limit, &checksum_hex, + src_temperature); + if (!io_s.ok()) { + return io_s; + } + } + } + if (!db_session_id.empty()) { + ROCKS_LOG_INFO(options_.info_log, + "%s already present, with checksum %s, size %" PRIu64 + " and DB session identity %s", + fname.c_str(), checksum_hex.c_str(), size_bytes, + db_session_id.c_str()); + } else { + ROCKS_LOG_INFO(options_.info_log, + "%s already present, with checksum %s and size %" PRIu64, + fname.c_str(), checksum_hex.c_str(), size_bytes); + } + } + } + live_dst_paths.insert(final_dest_path); + + // Step 3: Add work item + if (!contents.empty() || need_to_copy) { + ROCKS_LOG_INFO(options_.info_log, "Copying %s to %s", fname.c_str(), + copy_dest_path->c_str()); + CopyOrCreateWorkItem copy_or_create_work_item( + src_dir.empty() ? "" : src_path, *copy_dest_path, src_temperature, + Temperature::kUnknown /*dst_temp*/, contents, db_env_, backup_env_, + src_env_options, options_.sync, rate_limiter, size_limit, stats, + progress_callback, src_checksum_func_name, checksum_hex, db_id, + db_session_id); + BackupAfterCopyOrCreateWorkItem after_copy_or_create_work_item( + copy_or_create_work_item.result.get_future(), shared, need_to_copy, + backup_env_, temp_dest_path, final_dest_path, dst_relative); + files_to_copy_or_create_.write(std::move(copy_or_create_work_item)); + backup_items_to_finish.push_back(std::move(after_copy_or_create_work_item)); + } else { + std::promise promise_result; + BackupAfterCopyOrCreateWorkItem after_copy_or_create_work_item( + promise_result.get_future(), shared, need_to_copy, backup_env_, + temp_dest_path, final_dest_path, dst_relative); + backup_items_to_finish.push_back(std::move(after_copy_or_create_work_item)); + CopyOrCreateResult result; + result.io_status = IOStatus::OK(); + result.size = size_bytes; + result.checksum_hex = std::move(checksum_hex); + result.db_id = std::move(db_id); + result.db_session_id = std::move(db_session_id); + promise_result.set_value(std::move(result)); + } + return IOStatus::OK(); +} + +IOStatus BackupEngineImpl::ReadFileAndComputeChecksum( + const std::string& src, const std::shared_ptr& src_fs, + const EnvOptions& src_env_options, uint64_t size_limit, + std::string* checksum_hex, const Temperature src_temperature) const { + if (checksum_hex == nullptr) { + return status_to_io_status(Status::Aborted("Checksum pointer is null")); + } + uint32_t checksum_value = 0; + if (size_limit == 0) { + size_limit = std::numeric_limits::max(); + } + + std::unique_ptr src_reader; + auto file_options = FileOptions(src_env_options); + file_options.temperature = src_temperature; + RateLimiter* rate_limiter = options_.backup_rate_limiter.get(); + IOStatus io_s = SequentialFileReader::Create( + src_fs, src, file_options, &src_reader, nullptr /* dbg */, rate_limiter); + if (io_s.IsPathNotFound() && src_temperature != Temperature::kUnknown) { + // Retry without temperature hint in case the FileSystem is strict with + // non-kUnknown temperature option + file_options.temperature = Temperature::kUnknown; + io_s = SequentialFileReader::Create(src_fs, src, file_options, &src_reader, + nullptr /* dbg */, rate_limiter); + } + if (!io_s.ok()) { + return io_s; + } + + size_t buf_size = kDefaultCopyFileBufferSize; + std::unique_ptr buf(new char[buf_size]); + Slice data; + + do { + if (stop_backup_.load(std::memory_order_acquire)) { + return status_to_io_status(Status::Incomplete("Backup stopped")); + } + size_t buffer_to_read = + 
(buf_size < size_limit) ? buf_size : static_cast(size_limit); + io_s = src_reader->Read(buffer_to_read, &data, buf.get(), + Env::IO_LOW /* rate_limiter_priority */); + if (!io_s.ok()) { + return io_s; + } + + size_limit -= data.size(); + checksum_value = crc32c::Extend(checksum_value, data.data(), data.size()); + } while (data.size() > 0 && size_limit > 0); + + checksum_hex->assign(ChecksumInt32ToHex(checksum_value)); + + return io_s; +} + +Status BackupEngineImpl::GetFileDbIdentities( + Env* src_env, const EnvOptions& src_env_options, + const std::string& file_path, Temperature file_temp, + RateLimiter* rate_limiter, std::string* db_id, std::string* db_session_id) { + assert(db_id != nullptr || db_session_id != nullptr); + + Options options; + options.env = src_env; + SstFileDumper sst_reader(options, file_path, file_temp, + 2 * 1024 * 1024 + /* readahead_size */, + false /* verify_checksum */, false /* output_hex */, + false /* decode_blob_index */, src_env_options, + true /* silent */); + + const TableProperties* table_properties = nullptr; + std::shared_ptr tp; + Status s = sst_reader.getStatus(); + + if (s.ok()) { + // Try to get table properties from the table reader of sst_reader + if (!sst_reader.ReadTableProperties(&tp).ok()) { + // Try to use table properites from the initialization of sst_reader + table_properties = sst_reader.GetInitTableProperties(); + } else { + table_properties = tp.get(); + if (table_properties != nullptr && rate_limiter != nullptr) { + // sizeof(*table_properties) is a sufficent but far-from-exact + // approximation of read bytes due to metaindex block, std::string + // properties and varint compression + LoopRateLimitRequestHelper(sizeof(*table_properties), rate_limiter, + Env::IO_LOW, nullptr /* stats */, + RateLimiter::OpType::kRead); + } + } + } else { + ROCKS_LOG_INFO(options_.info_log, "Failed to read %s: %s", + file_path.c_str(), s.ToString().c_str()); + return s; + } + + if (table_properties != nullptr) { + if (db_id != nullptr) { + db_id->assign(table_properties->db_id); + } + if (db_session_id != nullptr) { + db_session_id->assign(table_properties->db_session_id); + if (db_session_id->empty()) { + s = Status::NotFound("DB session identity not found in " + file_path); + ROCKS_LOG_INFO(options_.info_log, "%s", s.ToString().c_str()); + return s; + } + } + return Status::OK(); + } else { + s = Status::Corruption("Table properties missing in " + file_path); + ROCKS_LOG_INFO(options_.info_log, "%s", s.ToString().c_str()); + return s; + } +} + +void BackupEngineImpl::LoopRateLimitRequestHelper( + const size_t total_bytes_to_request, RateLimiter* rate_limiter, + const Env::IOPriority pri, Statistics* stats, + const RateLimiter::OpType op_type) { + assert(rate_limiter != nullptr); + size_t remaining_bytes = total_bytes_to_request; + size_t request_bytes = 0; + while (remaining_bytes > 0) { + request_bytes = + std::min(static_cast(rate_limiter->GetSingleBurstBytes()), + remaining_bytes); + rate_limiter->Request(request_bytes, pri, stats, op_type); + remaining_bytes -= request_bytes; + } +} + +void BackupEngineImpl::DeleteChildren(const std::string& dir, + uint32_t file_type_filter) const { + std::vector children; + db_fs_->GetChildren(dir, io_options_, &children, nullptr) + .PermitUncheckedError(); // ignore errors + + for (const auto& f : children) { + uint64_t number; + FileType type; + bool ok = ParseFileName(f, &number, &type); + if (ok && (file_type_filter & (1 << type))) { + // don't delete this file + continue; + } + db_fs_->DeleteFile(dir + "/" + 
f, io_options_, nullptr) + .PermitUncheckedError(); // ignore errors + } +} + +IOStatus BackupEngineImpl::ReadChildFileCurrentSizes( + const std::string& dir, const std::shared_ptr& fs, + std::unordered_map* result) const { + assert(result != nullptr); + std::vector files_attrs; + IOStatus io_status = fs->FileExists(dir, io_options_, nullptr); + if (io_status.ok()) { + io_status = + fs->GetChildrenFileAttributes(dir, io_options_, &files_attrs, nullptr); + } else if (io_status.IsNotFound()) { + // Insert no entries can be considered success + io_status = IOStatus::OK(); + } + const bool slash_needed = dir.empty() || dir.back() != '/'; + for (const auto& file_attrs : files_attrs) { + result->emplace(dir + (slash_needed ? "/" : "") + file_attrs.name, + file_attrs.size_bytes); + } + return io_status; +} + +IOStatus BackupEngineImpl::GarbageCollect() { + assert(!read_only_); + + // We will make a best effort to remove all garbage even in the presence + // of inconsistencies or I/O failures that inhibit finding garbage. + IOStatus overall_status = IOStatus::OK(); + // If all goes well, we don't need another auto-GC this session + might_need_garbage_collect_ = false; + + ROCKS_LOG_INFO(options_.info_log, "Starting garbage collection"); + + // delete obsolete shared files + for (bool with_checksum : {false, true}) { + std::vector shared_children; + { + std::string shared_path; + if (with_checksum) { + shared_path = GetAbsolutePath(GetSharedFileWithChecksumRel()); + } else { + shared_path = GetAbsolutePath(GetSharedFileRel()); + } + IOStatus io_s = backup_fs_->FileExists(shared_path, io_options_, nullptr); + if (io_s.ok()) { + io_s = backup_fs_->GetChildren(shared_path, io_options_, + &shared_children, nullptr); + } else if (io_s.IsNotFound()) { + io_s = IOStatus::OK(); + } + if (!io_s.ok()) { + overall_status = io_s; + // Trying again later might work + might_need_garbage_collect_ = true; + } + } + for (auto& child : shared_children) { + std::string rel_fname; + if (with_checksum) { + rel_fname = GetSharedFileWithChecksumRel(child); + } else { + rel_fname = GetSharedFileRel(child); + } + auto child_itr = backuped_file_infos_.find(rel_fname); + // if it's not refcounted, delete it + if (child_itr == backuped_file_infos_.end() || + child_itr->second->refs == 0) { + // this might be a directory, but DeleteFile will just fail in that + // case, so we're good + IOStatus io_s = backup_fs_->DeleteFile(GetAbsolutePath(rel_fname), + io_options_, nullptr); + ROCKS_LOG_INFO(options_.info_log, "Deleting %s -- %s", + rel_fname.c_str(), io_s.ToString().c_str()); + backuped_file_infos_.erase(rel_fname); + if (!io_s.ok()) { + // Trying again later might work + might_need_garbage_collect_ = true; + } + } + } + } + + // delete obsolete private files + std::vector private_children; + { + IOStatus io_s = + backup_fs_->GetChildren(GetAbsolutePath(kPrivateDirName), io_options_, + &private_children, nullptr); + if (!io_s.ok()) { + overall_status = io_s; + // Trying again later might work + might_need_garbage_collect_ = true; + } + } + for (auto& child : private_children) { + BackupID backup_id = 0; + bool tmp_dir = child.find(".tmp") != std::string::npos; + sscanf(child.c_str(), "%u", &backup_id); + if (!tmp_dir && // if it's tmp_dir, delete it + (backup_id == 0 || backups_.find(backup_id) != backups_.end())) { + // it's either not a number or it's still alive. 
continue + continue; + } + // here we have to delete the dir and all its children + std::string full_private_path = + GetAbsolutePath(GetPrivateFileRel(backup_id)); + std::vector subchildren; + if (backup_fs_ + ->GetChildren(full_private_path, io_options_, &subchildren, nullptr) + .ok()) { + for (auto& subchild : subchildren) { + IOStatus io_s = backup_fs_->DeleteFile(full_private_path + subchild, + io_options_, nullptr); + ROCKS_LOG_INFO(options_.info_log, "Deleting %s -- %s", + (full_private_path + subchild).c_str(), + io_s.ToString().c_str()); + if (!io_s.ok()) { + // Trying again later might work + might_need_garbage_collect_ = true; + } + } + } + // finally delete the private dir + IOStatus io_s = + backup_fs_->DeleteDir(full_private_path, io_options_, nullptr); + ROCKS_LOG_INFO(options_.info_log, "Deleting dir %s -- %s", + full_private_path.c_str(), io_s.ToString().c_str()); + if (!io_s.ok()) { + // Trying again later might work + might_need_garbage_collect_ = true; + } + } + + assert(overall_status.ok() || might_need_garbage_collect_); + return overall_status; +} + +// ------- BackupMeta class -------- + +IOStatus BackupEngineImpl::BackupMeta::AddFile( + std::shared_ptr file_info) { + auto itr = file_infos_->find(file_info->filename); + if (itr == file_infos_->end()) { + auto ret = file_infos_->insert({file_info->filename, file_info}); + if (ret.second) { + itr = ret.first; + itr->second->refs = 1; + } else { + // if this happens, something is seriously wrong + return IOStatus::Corruption("In memory metadata insertion error"); + } + } else { + // Compare sizes, because we scanned that off the filesystem on both + // ends. This is like a check in VerifyBackup. + if (itr->second->size != file_info->size) { + std::string msg = "Size mismatch for existing backup file: "; + msg.append(file_info->filename); + msg.append(" Size in backup is " + std::to_string(itr->second->size) + + " while size in DB is " + std::to_string(file_info->size)); + msg.append( + " If this DB file checks as not corrupt, try deleting old" + " backups or backing up to a different backup directory."); + return IOStatus::Corruption(msg); + } + if (file_info->checksum_hex.empty()) { + // No checksum available to check + } else if (itr->second->checksum_hex.empty()) { + // Remember checksum if newly acquired + itr->second->checksum_hex = file_info->checksum_hex; + } else if (itr->second->checksum_hex != file_info->checksum_hex) { + // Note: to save I/O, these will be equal trivially on already backed + // up files that don't have the checksum in their name. And it should + // never fail for files that do have checksum in their name. + + // Should never reach here, but produce an appropriate corruption + // message in case we do in a release build. 
+ assert(false); + std::string msg = "Checksum mismatch for existing backup file: "; + msg.append(file_info->filename); + msg.append(" Expected checksum is " + itr->second->checksum_hex + + " while computed checksum is " + file_info->checksum_hex); + msg.append( + " If this DB file checks as not corrupt, try deleting old" + " backups or backing up to a different backup directory."); + return IOStatus::Corruption(msg); + } + ++itr->second->refs; // increase refcount if already present + } + + size_ += file_info->size; + files_.push_back(itr->second); + + return IOStatus::OK(); +} + +IOStatus BackupEngineImpl::BackupMeta::Delete(bool delete_meta) { + IOStatus io_s; + for (const auto& file : files_) { + --file->refs; // decrease refcount + } + files_.clear(); + // delete meta file + if (delete_meta) { + io_s = fs_->FileExists(meta_filename_, iooptions_, nullptr); + if (io_s.ok()) { + io_s = fs_->DeleteFile(meta_filename_, iooptions_, nullptr); + } else if (io_s.IsNotFound()) { + io_s = IOStatus::OK(); // nothing to delete + } + } + timestamp_ = 0; + return io_s; +} + +// Constants for backup meta file schema (see LoadFromFile) +const std::string kSchemaVersionPrefix{"schema_version "}; +const std::string kFooterMarker{"// FOOTER"}; + +const std::string kAppMetaDataFieldName{"metadata"}; + +// WART: The checksums are crc32c but named "crc32" +const std::string kFileCrc32cFieldName{"crc32"}; +const std::string kFileSizeFieldName{"size"}; +const std::string kTemperatureFieldName{"temp"}; + +// Marks a (future) field that should cause failure if not recognized. +// Other fields are assumed to be ignorable. For example, in the future +// we might add +// ni::file_name_escape uri_percent +// to indicate all file names have had spaces and special characters +// escaped using a URI percent encoding. +const std::string kNonIgnorableFieldPrefix{"ni::"}; + +// Each backup meta file is of the format (schema version 1): +//---------------------------------------------------------- +// +// +// metadata (optional) +// +// crc32 +// crc32 +// ... +//---------------------------------------------------------- +// +// For schema version 2.x (not in public APIs, but +// forward-compatibility started): +//---------------------------------------------------------- +// schema_version +// +// +// [ ] +// ... +// +// ( )* +// ( )* +// ... +// [// FOOTER] +// [ ] +// ... +//---------------------------------------------------------- +// where +// ::= [0-9]+([.][0-9]+) +// ::= [A-Za-z_][A-Za-z_0-9.]+ +// is anything but newline +// is anything but space and newline +// Although "// FOOTER" wouldn't strictly be required as a delimiter +// given the number of files is included, it is there for parsing +// sanity in case of corruption. It is only required if followed +// by footer fields, such as a checksum of the meta file (so far). +// Unrecognized fields are ignored, to support schema evolution on +// non-critical features with forward compatibility. Update schema +// major version for breaking changes. Schema minor versions are indicated +// only for diagnostic/debugging purposes. 
+// +// Fields in schema version 2.0: +// * Top-level meta fields: +// * Only "metadata" as in schema version 1 +// * File meta fields: +// * "crc32" - a crc32c checksum as in schema version 1 +// * "size" - the size of the file (new) +// * Footer meta fields: +// * None yet (future use for meta file checksum anticipated) +// +IOStatus BackupEngineImpl::BackupMeta::LoadFromFile( + const std::string& backup_dir, + const std::unordered_map& abs_path_to_size, + RateLimiter* rate_limiter, Logger* info_log, + std::unordered_set* reported_ignored_fields) { + assert(reported_ignored_fields); + assert(Empty()); + + std::unique_ptr backup_meta_reader; + { + IOStatus io_s = LineFileReader::Create(fs_, meta_filename_, FileOptions(), + &backup_meta_reader, + nullptr /* dbg */, rate_limiter); + if (!io_s.ok()) { + return io_s; + } + } + + // If we don't read an explicit schema_version, that implies version 1, + // which is what we call the original backup meta schema. + int schema_major_version = 1; + + // Failures handled at the end + std::string line; + if (backup_meta_reader->ReadLine(&line, + Env::IO_LOW /* rate_limiter_priority */)) { + if (StartsWith(line, kSchemaVersionPrefix)) { + std::string ver = line.substr(kSchemaVersionPrefix.size()); + if (ver == "2" || StartsWith(ver, "2.")) { + schema_major_version = 2; + } else { + return IOStatus::NotSupported( + "Unsupported/unrecognized schema version: " + ver); + } + line.clear(); + } else if (line.empty()) { + return IOStatus::Corruption("Unexpected empty line"); + } + } + if (!line.empty()) { + timestamp_ = std::strtoull(line.c_str(), nullptr, /*base*/ 10); + } else if (backup_meta_reader->ReadLine( + &line, Env::IO_LOW /* rate_limiter_priority */)) { + timestamp_ = std::strtoull(line.c_str(), nullptr, /*base*/ 10); + } + if (backup_meta_reader->ReadLine(&line, + Env::IO_LOW /* rate_limiter_priority */)) { + sequence_number_ = std::strtoull(line.c_str(), nullptr, /*base*/ 10); + } + uint32_t num_files = UINT32_MAX; + while (backup_meta_reader->ReadLine( + &line, Env::IO_LOW /* rate_limiter_priority */)) { + if (line.empty()) { + return IOStatus::Corruption("Unexpected empty line"); + } + // Number -> number of files -> exit loop reading optional meta fields + if (line[0] >= '0' && line[0] <= '9') { + num_files = static_cast(strtoul(line.c_str(), nullptr, 10)); + break; + } + // else, must be a meta field assignment + auto space_pos = line.find_first_of(' '); + if (space_pos == std::string::npos) { + return IOStatus::Corruption("Expected number of files or meta field"); + } + std::string field_name = line.substr(0, space_pos); + std::string field_data = line.substr(space_pos + 1); + if (field_name == kAppMetaDataFieldName) { + // app metadata present + bool decode_success = Slice(field_data).DecodeHex(&app_metadata_); + if (!decode_success) { + return IOStatus::Corruption( + "Failed to decode stored hex encoded app metadata"); + } + } else if (schema_major_version < 2) { + return IOStatus::Corruption("Expected number of files or \"" + + kAppMetaDataFieldName + "\" field"); + } else if (StartsWith(field_name, kNonIgnorableFieldPrefix)) { + return IOStatus::NotSupported("Unrecognized non-ignorable meta field " + + field_name + " (from future version?)"); + } else { + // Warn the first time we see any particular unrecognized meta field + if (reported_ignored_fields->insert("meta:" + field_name).second) { + ROCKS_LOG_WARN(info_log, "Ignoring unrecognized backup meta field %s", + field_name.c_str()); + } + } + } + std::vector> files; + bool 
footer_present = false; + while (backup_meta_reader->ReadLine( + &line, Env::IO_LOW /* rate_limiter_priority */)) { + std::vector components = StringSplit(line, ' '); + + if (components.size() < 1) { + return IOStatus::Corruption("Empty line instead of file entry."); + } + if (schema_major_version >= 2 && components.size() == 2 && + line == kFooterMarker) { + footer_present = true; + break; + } + + const std::string& filename = components[0]; + + uint64_t actual_size; + const std::shared_ptr file_info = GetFile(filename); + if (file_info) { + actual_size = file_info->size; + } else { + std::string abs_path = backup_dir + "/" + filename; + auto e = abs_path_to_size.find(abs_path); + if (e == abs_path_to_size.end()) { + return IOStatus::Corruption( + "Pathname in meta file not found on disk: " + abs_path); + } + actual_size = e->second; + } + + if (schema_major_version >= 2) { + if (components.size() % 2 != 1) { + return IOStatus::Corruption( + "Bad number of line components for file entry."); + } + } else { + // Check restricted original schema + if (components.size() < 3) { + return IOStatus::Corruption("File checksum is missing for " + filename + + " in " + meta_filename_); + } + if (components[1] != kFileCrc32cFieldName) { + return IOStatus::Corruption("Unknown checksum type for " + filename + + " in " + meta_filename_); + } + if (components.size() > 3) { + return IOStatus::Corruption("Extra data for entry " + filename + + " in " + meta_filename_); + } + } + + std::string checksum_hex; + Temperature temp = Temperature::kUnknown; + for (unsigned i = 1; i < components.size(); i += 2) { + const std::string& field_name = components[i]; + const std::string& field_data = components[i + 1]; + + if (field_name == kFileCrc32cFieldName) { + uint32_t checksum_value = + static_cast(strtoul(field_data.c_str(), nullptr, 10)); + if (field_data != std::to_string(checksum_value)) { + return IOStatus::Corruption("Invalid checksum value for " + filename + + " in " + meta_filename_); + } + checksum_hex = ChecksumInt32ToHex(checksum_value); + } else if (field_name == kFileSizeFieldName) { + uint64_t ex_size = + std::strtoull(field_data.c_str(), nullptr, /*base*/ 10); + if (ex_size != actual_size) { + return IOStatus::Corruption( + "For file " + filename + " expected size " + + std::to_string(ex_size) + " but found size" + + std::to_string(actual_size)); + } + } else if (field_name == kTemperatureFieldName) { + auto iter = temperature_string_map.find(field_data); + if (iter != temperature_string_map.end()) { + temp = iter->second; + } else { + // Could report corruption, but in case of new temperatures added + // in future, letting those map to kUnknown which should generally + // be safe. 
+ temp = Temperature::kUnknown; + } + } else if (StartsWith(field_name, kNonIgnorableFieldPrefix)) { + return IOStatus::NotSupported("Unrecognized non-ignorable file field " + + field_name + " (from future version?)"); + } else { + // Warn the first time we see any particular unrecognized file field + if (reported_ignored_fields->insert("file:" + field_name).second) { + ROCKS_LOG_WARN(info_log, "Ignoring unrecognized backup file field %s", + field_name.c_str()); + } + } + } + + files.emplace_back(new FileInfo(filename, actual_size, checksum_hex, + /*id*/ "", /*sid*/ "", temp)); + } + + if (footer_present) { + assert(schema_major_version >= 2); + while (backup_meta_reader->ReadLine( + &line, Env::IO_LOW /* rate_limiter_priority */)) { + if (line.empty()) { + return IOStatus::Corruption("Unexpected empty line"); + } + auto space_pos = line.find_first_of(' '); + if (space_pos == std::string::npos) { + return IOStatus::Corruption("Expected footer field"); + } + std::string field_name = line.substr(0, space_pos); + std::string field_data = line.substr(space_pos + 1); + if (StartsWith(field_name, kNonIgnorableFieldPrefix)) { + return IOStatus::NotSupported("Unrecognized non-ignorable field " + + field_name + " (from future version?)"); + } else if (reported_ignored_fields->insert("footer:" + field_name) + .second) { + // Warn the first time we see any particular unrecognized footer field + ROCKS_LOG_WARN(info_log, + "Ignoring unrecognized backup meta footer field %s", + field_name.c_str()); + } + } + } + + { + IOStatus io_s = backup_meta_reader->GetStatus(); + if (!io_s.ok()) { + return io_s; + } + } + + if (num_files != files.size()) { + return IOStatus::Corruption( + "Inconsistent number of files or missing/incomplete header in " + + meta_filename_); + } + + files_.reserve(files.size()); + for (const auto& file_info : files) { + IOStatus io_s = AddFile(file_info); + if (!io_s.ok()) { + return io_s; + } + } + + return IOStatus::OK(); +} + +const std::vector minor_version_strings{ + "", // invalid major version 0 + "", // implicit major version 1 + "2.0", +}; + +IOStatus BackupEngineImpl::BackupMeta::StoreToFile( + bool sync, int schema_version, + const TEST_BackupMetaSchemaOptions* schema_test_options) { + if (schema_version < 1) { + return IOStatus::InvalidArgument( + "BackupEngineOptions::schema_version must be >= 1"); + } + if (schema_version > static_cast(minor_version_strings.size() - 1)) { + return IOStatus::NotSupported( + "Only BackupEngineOptions::schema_version <= " + + std::to_string(minor_version_strings.size() - 1) + " is supported"); + } + std::string ver = minor_version_strings[schema_version]; + + // Need schema_version >= 2 for TEST_BackupMetaSchemaOptions + assert(schema_version >= 2 || schema_test_options == nullptr); + + IOStatus io_s; + std::unique_ptr backup_meta_file; + FileOptions file_options; + file_options.use_mmap_writes = false; + file_options.use_direct_writes = false; + io_s = fs_->NewWritableFile(meta_tmp_filename_, file_options, + &backup_meta_file, nullptr); + if (!io_s.ok()) { + return io_s; + } + + std::ostringstream buf; + if (schema_test_options) { + // override for testing + ver = schema_test_options->version; + } + if (!ver.empty()) { + assert(schema_version >= 2); + buf << kSchemaVersionPrefix << ver << "\n"; + } + buf << static_cast(timestamp_) << "\n"; + buf << sequence_number_ << "\n"; + + if (!app_metadata_.empty()) { + std::string hex_encoded_metadata = + Slice(app_metadata_).ToString(/* hex */ true); + buf << kAppMetaDataFieldName << " " << 
hex_encoded_metadata << "\n"; + } + if (schema_test_options) { + for (auto& e : schema_test_options->meta_fields) { + buf << e.first << " " << e.second << "\n"; + } + } + buf << files_.size() << "\n"; + + for (const auto& file : files_) { + buf << file->filename; + if (schema_test_options == nullptr || + schema_test_options->crc32c_checksums) { + // use crc32c for now, switch to something else if needed + buf << " " << kFileCrc32cFieldName << " " + << ChecksumHexToInt32(file->checksum_hex); + } + if (schema_version >= 2 && file->temp != Temperature::kUnknown) { + buf << " " << kTemperatureFieldName << " " + << temperature_to_string[file->temp]; + } + if (schema_test_options && schema_test_options->file_sizes) { + buf << " " << kFileSizeFieldName << " " << std::to_string(file->size); + } + if (schema_test_options) { + for (auto& e : schema_test_options->file_fields) { + buf << " " << e.first << " " << e.second; + } + } + buf << "\n"; + } + + if (schema_test_options && !schema_test_options->footer_fields.empty()) { + buf << kFooterMarker << "\n"; + for (auto& e : schema_test_options->footer_fields) { + buf << e.first << " " << e.second << "\n"; + } + } + + io_s = backup_meta_file->Append(Slice(buf.str()), iooptions_, nullptr); + IOSTATS_ADD(bytes_written, buf.str().size()); + if (io_s.ok() && sync) { + io_s = backup_meta_file->Sync(iooptions_, nullptr); + } + if (io_s.ok()) { + io_s = backup_meta_file->Close(iooptions_, nullptr); + } + if (io_s.ok()) { + io_s = fs_->RenameFile(meta_tmp_filename_, meta_filename_, iooptions_, + nullptr); + } + return io_s; +} +} // namespace + +IOStatus BackupEngineReadOnly::Open(const BackupEngineOptions& options, + Env* env, + BackupEngineReadOnly** backup_engine_ptr) { + if (options.destroy_old_data) { + return IOStatus::InvalidArgument( + "Can't destroy old data with ReadOnly BackupEngine"); + } + std::unique_ptr backup_engine( + new BackupEngineImplThreadSafe(options, env, true /*read_only*/)); + auto s = backup_engine->Initialize(); + if (!s.ok()) { + *backup_engine_ptr = nullptr; + return s; + } + *backup_engine_ptr = backup_engine.release(); + return IOStatus::OK(); +} + +void TEST_SetBackupMetaSchemaOptions( + BackupEngine* engine, const TEST_BackupMetaSchemaOptions& options) { + BackupEngineImplThreadSafe* impl = + static_cast_with_check(engine); + impl->TEST_SetBackupMetaSchemaOptions(options); +} + +void TEST_SetDefaultRateLimitersClock( + BackupEngine* engine, + const std::shared_ptr& backup_rate_limiter_clock, + const std::shared_ptr& restore_rate_limiter_clock) { + BackupEngineImplThreadSafe* impl = + static_cast_with_check(engine); + impl->TEST_SetDefaultRateLimitersClock(backup_rate_limiter_clock, + restore_rate_limiter_clock); +} +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/backup/backup_engine_impl.h b/src/rocksdb/utilities/backup/backup_engine_impl.h new file mode 100644 index 000000000..398f47f27 --- /dev/null +++ b/src/rocksdb/utilities/backup/backup_engine_impl.h @@ -0,0 +1,36 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
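As a concrete illustration of the schema documented above and serialized by BackupMeta::StoreToFile, a schema-version-1 meta file would look roughly like the following, with all names and values hypothetical: a timestamp line, a sequence-number line, an optional hex-encoded "metadata" line, the file count, then one "<filename> crc32 <decimal value>" line per file.

1713700000
42
metadata 6D79617070
2
shared_checksum/000010_3094093938_16384.sst crc32 3094093938
private/5/MANIFEST-000008 crc32 1184723444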
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/backup_engine.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct TEST_BackupMetaSchemaOptions {
+  std::string version = "2";
+  bool crc32c_checksums = false;
+  bool file_sizes = true;
+  std::map<std::string, std::string> meta_fields;
+  std::map<std::string, std::string> file_fields;
+  std::map<std::string, std::string> footer_fields;
+};
+
+// Modifies the BackupEngine(Impl) to write backup meta files using the
+// unpublished schema version 2, for the life of this object (not backup_dir).
+// TEST_BackupMetaSchemaOptions offers some customization for testing.
+void TEST_SetBackupMetaSchemaOptions(
+    BackupEngine* engine, const TEST_BackupMetaSchemaOptions& options);
+
+// Modifies the BackupEngine(Impl) to use specified clocks for backup and
+// restore rate limiters created by default if not specified by users for
+// test speedup.
+void TEST_SetDefaultRateLimitersClock(
+    BackupEngine* engine,
+    const std::shared_ptr<SystemClock>& backup_rate_limiter_clock = nullptr,
+    const std::shared_ptr<SystemClock>& restore_rate_limiter_clock = nullptr);
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/backup/backup_engine_test.cc b/src/rocksdb/utilities/backup/backup_engine_test.cc
new file mode 100644
index 000000000..d1f74f769
--- /dev/null
+++ b/src/rocksdb/utilities/backup/backup_engine_test.cc
@@ -0,0 +1,4219 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
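The two TEST_ hooks declared in the header above are driven from tests like the one that follows. A rough usage sketch, assuming an already-open DB* and a writable backup directory; the function name, option values, and error handling are illustrative, not taken from the test below:

#include <cassert>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/utilities/backup_engine.h"
#include "utilities/backup/backup_engine_impl.h"

using namespace ROCKSDB_NAMESPACE;

void WriteSchemaV2Backup(DB* db, const std::string& backup_dir) {
  BackupEngine* engine = nullptr;
  BackupEngineOptions engine_options(backup_dir);
  IOStatus io_s = BackupEngine::Open(engine_options, db->GetEnv(), &engine);
  assert(io_s.ok());

  // Switch this engine instance to the unpublished schema version 2 and
  // include per-file sizes in the meta files it writes.
  TEST_BackupMetaSchemaOptions schema_options;
  schema_options.file_sizes = true;
  TEST_SetBackupMetaSchemaOptions(engine, schema_options);

  io_s = engine->CreateNewBackup(db);
  assert(io_s.ok());
  delete engine;
}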
+ +#if !defined(ROCKSDB_LITE) && !defined(OS_WIN) + +#include "rocksdb/utilities/backup_engine.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" +#include "env/composite_env_wrapper.h" +#include "env/env_chroot.h" +#include "file/filename.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/advanced_options.h" +#include "rocksdb/env.h" +#include "rocksdb/file_checksum.h" +#include "rocksdb/rate_limiter.h" +#include "rocksdb/statistics.h" +#include "rocksdb/transaction_log.h" +#include "rocksdb/types.h" +#include "rocksdb/utilities/options_util.h" +#include "rocksdb/utilities/stackable_db.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/cast_util.h" +#include "util/mutexlock.h" +#include "util/random.h" +#include "util/rate_limiter.h" +#include "util/stderr_logger.h" +#include "util/string_util.h" +#include "utilities/backup/backup_engine_impl.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { +using ShareFilesNaming = BackupEngineOptions::ShareFilesNaming; +const auto kLegacyCrc32cAndFileSize = + BackupEngineOptions::kLegacyCrc32cAndFileSize; +const auto kUseDbSessionId = BackupEngineOptions::kUseDbSessionId; +const auto kFlagIncludeFileSize = BackupEngineOptions::kFlagIncludeFileSize; +const auto kNamingDefault = kUseDbSessionId | kFlagIncludeFileSize; + +class DummyDB : public StackableDB { + public: + /* implicit */ + DummyDB(const Options& options, const std::string& dbname) + : StackableDB(nullptr), + options_(options), + dbname_(dbname), + deletions_enabled_(true), + sequence_number_(0) {} + + SequenceNumber GetLatestSequenceNumber() const override { + return ++sequence_number_; + } + + const std::string& GetName() const override { return dbname_; } + + Env* GetEnv() const override { return options_.env; } + + using DB::GetOptions; + Options GetOptions(ColumnFamilyHandle* /*column_family*/) const override { + return options_; + } + + DBOptions GetDBOptions() const override { return DBOptions(options_); } + + Status EnableFileDeletions(bool /*force*/) override { + EXPECT_TRUE(!deletions_enabled_); + deletions_enabled_ = true; + return Status::OK(); + } + + Status DisableFileDeletions() override { + EXPECT_TRUE(deletions_enabled_); + deletions_enabled_ = false; + return Status::OK(); + } + + ColumnFamilyHandle* DefaultColumnFamily() const override { return nullptr; } + + Status GetLiveFilesStorageInfo( + const LiveFilesStorageInfoOptions& opts, + std::vector* files) override { + uint64_t number; + FileType type; + files->clear(); + for (auto& f : live_files_) { + bool success = ParseFileName(f, &number, &type); + if (!success) { + return Status::InvalidArgument("Bad file name: " + f); + } + files->emplace_back(); + LiveFileStorageInfo& info = files->back(); + info.relative_filename = f; + info.directory = dbname_; + info.file_number = number; + info.file_type = type; + if (type == kDescriptorFile) { + info.size = 100; // See TestFs::GetChildrenFileAttributes below + info.trim_to_size = true; + } else if (type == kCurrentFile) { + info.size = 0; + info.trim_to_size = true; + } else { + info.size = 200; // See TestFs::GetChildrenFileAttributes below + } + if (opts.include_checksum_info) { + info.file_checksum = kUnknownFileChecksum; + info.file_checksum_func_name = kUnknownFileChecksumFuncName; + } + } + return Status::OK(); + } + + // To avoid FlushWAL called on stacked db 
which is nullptr + Status FlushWAL(bool /*sync*/) override { return Status::OK(); } + + std::vector live_files_; + + private: + Options options_; + std::string dbname_; + bool deletions_enabled_; + mutable SequenceNumber sequence_number_; +}; // DummyDB + +class TestFs : public FileSystemWrapper { + public: + explicit TestFs(const std::shared_ptr& t) + : FileSystemWrapper(t) {} + const char* Name() const override { return "TestFs"; } + + class DummySequentialFile : public FSSequentialFile { + public: + explicit DummySequentialFile(bool fail_reads) + : FSSequentialFile(), rnd_(5), fail_reads_(fail_reads) {} + IOStatus Read(size_t n, const IOOptions&, Slice* result, char* scratch, + IODebugContext*) override { + if (fail_reads_) { + return IOStatus::IOError(); + } + size_t read_size = (n > size_left) ? size_left : n; + for (size_t i = 0; i < read_size; ++i) { + scratch[i] = rnd_.Next() & 255; + } + *result = Slice(scratch, read_size); + size_left -= read_size; + return IOStatus::OK(); + } + + IOStatus Skip(uint64_t n) override { + size_left = (n > size_left) ? size_left - n : 0; + return IOStatus::OK(); + } + + private: + size_t size_left = 200; + Random rnd_; + bool fail_reads_; + }; + + IOStatus NewSequentialFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* dbg) override { + MutexLock l(&mutex_); + if (dummy_sequential_file_) { + r->reset( + new TestFs::DummySequentialFile(dummy_sequential_file_fail_reads_)); + return IOStatus::OK(); + } else { + IOStatus s = FileSystemWrapper::NewSequentialFile(f, file_opts, r, dbg); + if (s.ok()) { + if ((*r)->use_direct_io()) { + ++num_direct_seq_readers_; + } + ++num_seq_readers_; + } + return s; + } + } + + IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* dbg) override { + MutexLock l(&mutex_); + written_files_.push_back(f); + if (limit_written_files_ == 0) { + return IOStatus::NotSupported("Limit on written files reached"); + } + limit_written_files_--; + IOStatus s = FileSystemWrapper::NewWritableFile(f, file_opts, r, dbg); + if (s.ok()) { + if ((*r)->use_direct_io()) { + ++num_direct_writers_; + } + ++num_writers_; + } + return s; + } + + IOStatus NewRandomAccessFile(const std::string& f, + const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* dbg) override { + MutexLock l(&mutex_); + IOStatus s = FileSystemWrapper::NewRandomAccessFile(f, file_opts, r, dbg); + if (s.ok()) { + if ((*r)->use_direct_io()) { + ++num_direct_rand_readers_; + } + ++num_rand_readers_; + } + return s; + } + + IOStatus DeleteFile(const std::string& f, const IOOptions& options, + IODebugContext* dbg) override { + MutexLock l(&mutex_); + if (fail_delete_files_) { + return IOStatus::IOError(); + } + EXPECT_GT(limit_delete_files_, 0U); + limit_delete_files_--; + return FileSystemWrapper::DeleteFile(f, options, dbg); + } + + IOStatus DeleteDir(const std::string& d, const IOOptions& options, + IODebugContext* dbg) override { + MutexLock l(&mutex_); + if (fail_delete_files_) { + return IOStatus::IOError(); + } + return FileSystemWrapper::DeleteDir(d, options, dbg); + } + + void AssertWrittenFiles(std::vector& should_have_written) { + MutexLock l(&mutex_); + std::sort(should_have_written.begin(), should_have_written.end()); + std::sort(written_files_.begin(), written_files_.end()); + + ASSERT_EQ(should_have_written, written_files_); + } + + void ClearWrittenFiles() { + MutexLock l(&mutex_); + written_files_.clear(); + } + + void 
SetLimitWrittenFiles(uint64_t limit) { + MutexLock l(&mutex_); + limit_written_files_ = limit; + } + + void SetLimitDeleteFiles(uint64_t limit) { + MutexLock l(&mutex_); + limit_delete_files_ = limit; + } + + void SetDeleteFileFailure(bool fail) { + MutexLock l(&mutex_); + fail_delete_files_ = fail; + } + + void SetDummySequentialFile(bool dummy_sequential_file) { + MutexLock l(&mutex_); + dummy_sequential_file_ = dummy_sequential_file; + } + void SetDummySequentialFileFailReads(bool dummy_sequential_file_fail_reads) { + MutexLock l(&mutex_); + dummy_sequential_file_fail_reads_ = dummy_sequential_file_fail_reads; + } + + void SetGetChildrenFailure(bool fail) { get_children_failure_ = fail; } + IOStatus GetChildren(const std::string& dir, const IOOptions& io_opts, + std::vector* r, + IODebugContext* dbg) override { + if (get_children_failure_) { + return IOStatus::IOError("SimulatedFailure"); + } + return FileSystemWrapper::GetChildren(dir, io_opts, r, dbg); + } + + // Some test cases do not actually create the test files (e.g., see + // DummyDB::live_files_) - for those cases, we mock those files' attributes + // so CreateNewBackup() can get their attributes. + void SetFilenamesForMockedAttrs(const std::vector& filenames) { + filenames_for_mocked_attrs_ = filenames; + } + IOStatus GetChildrenFileAttributes(const std::string& dir, + const IOOptions& options, + std::vector* result, + IODebugContext* dbg) override { + if (filenames_for_mocked_attrs_.size() > 0) { + for (const auto& filename : filenames_for_mocked_attrs_) { + uint64_t size_bytes = 200; // Match TestFs + if (filename.find("MANIFEST") == 0) { + size_bytes = 100; // Match DummyDB::GetLiveFiles + } + result->push_back({dir + "/" + filename, size_bytes}); + } + return IOStatus::OK(); + } + return FileSystemWrapper::GetChildrenFileAttributes(dir, options, result, + dbg); + } + + IOStatus GetFileSize(const std::string& f, const IOOptions& options, + uint64_t* s, IODebugContext* dbg) override { + if (filenames_for_mocked_attrs_.size() > 0) { + auto fname = f.substr(f.find_last_of('/') + 1); + auto filename_iter = std::find(filenames_for_mocked_attrs_.begin(), + filenames_for_mocked_attrs_.end(), fname); + if (filename_iter != filenames_for_mocked_attrs_.end()) { + *s = 200; // Match TestFs + if (fname.find("MANIFEST") == 0) { + *s = 100; // Match DummyDB::GetLiveFiles + } + return IOStatus::OK(); + } + return IOStatus::NotFound(fname); + } + return FileSystemWrapper::GetFileSize(f, options, s, dbg); + } + + void SetCreateDirIfMissingFailure(bool fail) { + create_dir_if_missing_failure_ = fail; + } + IOStatus CreateDirIfMissing(const std::string& d, const IOOptions& options, + IODebugContext* dbg) override { + if (create_dir_if_missing_failure_) { + return IOStatus::IOError("SimulatedFailure"); + } + return FileSystemWrapper::CreateDirIfMissing(d, options, dbg); + } + + void SetNewDirectoryFailure(bool fail) { new_directory_failure_ = fail; } + IOStatus NewDirectory(const std::string& name, const IOOptions& io_opts, + std::unique_ptr* result, + IODebugContext* dbg) override { + if (new_directory_failure_) { + return IOStatus::IOError("SimulatedFailure"); + } + return FileSystemWrapper::NewDirectory(name, io_opts, result, dbg); + } + + void ClearFileOpenCounters() { + MutexLock l(&mutex_); + num_rand_readers_ = 0; + num_direct_rand_readers_ = 0; + num_seq_readers_ = 0; + num_direct_seq_readers_ = 0; + num_writers_ = 0; + num_direct_writers_ = 0; + } + + int num_rand_readers() { return num_rand_readers_; } + int num_direct_rand_readers() 
{ return num_direct_rand_readers_; } + int num_seq_readers() { return num_seq_readers_; } + int num_direct_seq_readers() { return num_direct_seq_readers_; } + int num_writers() { return num_writers_; } + // FIXME(?): unused + int num_direct_writers() { return num_direct_writers_; } + + private: + port::Mutex mutex_; + bool dummy_sequential_file_ = false; + bool dummy_sequential_file_fail_reads_ = false; + std::vector written_files_; + std::vector filenames_for_mocked_attrs_; + uint64_t limit_written_files_ = 1000000; + uint64_t limit_delete_files_ = 1000000; + bool fail_delete_files_ = false; + + bool get_children_failure_ = false; + bool create_dir_if_missing_failure_ = false; + bool new_directory_failure_ = false; + + // Keeps track of how many files of each type were successfully opened, and + // out of those, how many were opened with direct I/O. + std::atomic num_rand_readers_{}; + std::atomic num_direct_rand_readers_{}; + std::atomic num_seq_readers_{}; + std::atomic num_direct_seq_readers_{}; + std::atomic num_writers_{}; + std::atomic num_direct_writers_{}; +}; // TestFs + +class FileManager : public EnvWrapper { + public: + explicit FileManager(Env* t) : EnvWrapper(t), rnd_(5) {} + const char* Name() const override { return "FileManager"; } + + Status GetRandomFileInDir(const std::string& dir, std::string* fname, + uint64_t* fsize) { + std::vector children; + auto s = GetChildrenFileAttributes(dir, &children); + if (!s.ok()) { + return s; + } else if (children.size() <= 2) { // . and .. + return Status::NotFound("Empty directory: " + dir); + } + assert(fname != nullptr); + while (true) { + int i = rnd_.Next() % children.size(); + fname->assign(dir + "/" + children[i].name); + *fsize = children[i].size_bytes; + return Status::OK(); + } + // should never get here + assert(false); + return Status::NotFound(""); + } + + Status DeleteRandomFileInDir(const std::string& dir) { + std::vector children; + Status s = GetChildren(dir, &children); + if (!s.ok()) { + return s; + } + while (true) { + int i = rnd_.Next() % children.size(); + return DeleteFile(dir + "/" + children[i]); + } + // should never get here + assert(false); + return Status::NotFound(""); + } + + Status AppendToRandomFileInDir(const std::string& dir, + const std::string& data) { + std::vector children; + Status s = GetChildren(dir, &children); + if (!s.ok()) { + return s; + } + while (true) { + int i = rnd_.Next() % children.size(); + return WriteToFile(dir + "/" + children[i], data); + } + // should never get here + assert(false); + return Status::NotFound(""); + } + + Status CorruptFile(const std::string& fname, uint64_t bytes_to_corrupt) { + std::string file_contents; + Status s = ReadFileToString(this, fname, &file_contents); + if (!s.ok()) { + return s; + } + s = DeleteFile(fname); + if (!s.ok()) { + return s; + } + + for (uint64_t i = 0; i < bytes_to_corrupt; ++i) { + std::string tmp = rnd_.RandomString(1); + file_contents[rnd_.Next() % file_contents.size()] = tmp[0]; + } + return WriteToFile(fname, file_contents); + } + + Status CorruptFileStart(const std::string& fname) { + std::string to_xor = "blah"; + std::string file_contents; + Status s = ReadFileToString(this, fname, &file_contents); + if (!s.ok()) { + return s; + } + s = DeleteFile(fname); + if (!s.ok()) { + return s; + } + for (size_t i = 0; i < to_xor.size(); ++i) { + file_contents[i] ^= to_xor[i]; + } + return WriteToFile(fname, file_contents); + } + + Status CorruptChecksum(const std::string& fname, bool appear_valid) { + std::string metadata; + Status s 
= ReadFileToString(this, fname, &metadata); + if (!s.ok()) { + return s; + } + s = DeleteFile(fname); + if (!s.ok()) { + return s; + } + + auto pos = metadata.find("private"); + if (pos == std::string::npos) { + return Status::Corruption("private file is expected"); + } + pos = metadata.find(" crc32 ", pos + 6); + if (pos == std::string::npos) { + return Status::Corruption("checksum not found"); + } + + if (metadata.size() < pos + 7) { + return Status::Corruption("bad CRC32 checksum value"); + } + + if (appear_valid) { + if (metadata[pos + 8] == '\n') { + // single digit value, safe to insert one more digit + metadata.insert(pos + 8, 1, '0'); + } else { + metadata.erase(pos + 8, 1); + } + } else { + metadata[pos + 7] = 'a'; + } + + return WriteToFile(fname, metadata); + } + + Status WriteToFile(const std::string& fname, const std::string& data) { + std::unique_ptr file; + EnvOptions env_options; + env_options.use_mmap_writes = false; + Status s = EnvWrapper::NewWritableFile(fname, &file, env_options); + if (!s.ok()) { + return s; + } + return file->Append(Slice(data)); + } + + private: + Random rnd_; +}; // FileManager + +// utility functions +namespace { + +enum FillDBFlushAction { + kFlushMost, + kFlushAll, + kAutoFlushOnly, +}; + +// Many tests in this file expect FillDB to write at least one sst file, +// so the default behavior (if not kAutoFlushOnly) of FillDB is to force +// a flush. But to ensure coverage of the WAL file case, we also (by default) +// do one Put after the Flush (kFlushMost). +size_t FillDB(DB* db, int from, int to, + FillDBFlushAction flush_action = kFlushMost) { + size_t bytes_written = 0; + for (int i = from; i < to; ++i) { + std::string key = "testkey" + std::to_string(i); + std::string value = "testvalue" + std::to_string(i); + bytes_written += key.size() + value.size(); + + EXPECT_OK(db->Put(WriteOptions(), Slice(key), Slice(value))); + + if (flush_action == kFlushMost && i == to - 2) { + EXPECT_OK(db->Flush(FlushOptions())); + } + } + if (flush_action == kFlushAll) { + EXPECT_OK(db->Flush(FlushOptions())); + } + return bytes_written; +} + +void AssertExists(DB* db, int from, int to) { + for (int i = from; i < to; ++i) { + std::string key = "testkey" + std::to_string(i); + std::string value; + Status s = db->Get(ReadOptions(), Slice(key), &value); + ASSERT_EQ(value, "testvalue" + std::to_string(i)); + } +} + +void AssertEmpty(DB* db, int from, int to) { + for (int i = from; i < to; ++i) { + std::string key = "testkey" + std::to_string(i); + std::string value = "testvalue" + std::to_string(i); + + Status s = db->Get(ReadOptions(), Slice(key), &value); + ASSERT_TRUE(s.IsNotFound()); + } +} +} // namespace + +class BackupEngineTest : public testing::Test { + public: + enum ShareOption { + kNoShare, + kShareNoChecksum, + kShareWithChecksum, + }; + + const std::vector kAllShareOptions = {kNoShare, kShareNoChecksum, + kShareWithChecksum}; + + BackupEngineTest() { + // set up files + std::string db_chroot = test::PerThreadDBPath("db_for_backup"); + std::string backup_chroot = test::PerThreadDBPath("db_backups"); + EXPECT_OK(Env::Default()->CreateDirIfMissing(db_chroot)); + EXPECT_OK(Env::Default()->CreateDirIfMissing(backup_chroot)); + dbname_ = "/tempdb"; + backupdir_ = "/tempbk"; + latest_backup_ = backupdir_ + "/LATEST_BACKUP"; + + // set up FileSystem & Envs + db_chroot_fs_ = NewChrootFileSystem(FileSystem::Default(), db_chroot); + backup_chroot_fs_ = + NewChrootFileSystem(FileSystem::Default(), backup_chroot); + test_db_fs_ = std::make_shared(db_chroot_fs_); + 
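+  // Note: both the DB side and the backup side get their own TestFs wrapper
+  // over a chrooted FileSystem, so write limits and injected failures can be
+  // controlled independently for each side.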
test_backup_fs_ = std::make_shared(backup_chroot_fs_); + SetEnvsFromFileSystems(); + + // set up db options + options_.create_if_missing = true; + options_.paranoid_checks = true; + options_.write_buffer_size = 1 << 17; // 128KB + options_.wal_dir = dbname_; + options_.enable_blob_files = true; + + // The sync option is not easily testable in unit tests, but should be + // smoke tested across all the other backup tests. However, it is + // certainly not worth doubling the runtime of backup tests for it. + // Thus, we can enable sync for one of our alternate testing + // configurations. + constexpr bool kUseSync = +#ifdef ROCKSDB_MODIFY_NPHASH + true; +#else + false; +#endif // ROCKSDB_MODIFY_NPHASH + + // set up backup db options + engine_options_.reset(new BackupEngineOptions( + backupdir_, test_backup_env_.get(), /*share_table_files*/ true, + logger_.get(), kUseSync)); + + // most tests will use multi-threaded backups + engine_options_->max_background_operations = 7; + + // delete old files in db + DestroyDBWithoutCheck(dbname_, options_); + + // delete old LATEST_BACKUP file, which some tests create for compatibility + // testing. + backup_chroot_env_->DeleteFile(latest_backup_).PermitUncheckedError(); + } + + void SetEnvsFromFileSystems() { + db_chroot_env_.reset( + new CompositeEnvWrapper(Env::Default(), db_chroot_fs_)); + backup_chroot_env_.reset( + new CompositeEnvWrapper(Env::Default(), backup_chroot_fs_)); + test_db_env_.reset(new CompositeEnvWrapper(Env::Default(), test_db_fs_)); + options_.env = test_db_env_.get(); + test_backup_env_.reset( + new CompositeEnvWrapper(Env::Default(), test_backup_fs_)); + if (engine_options_) { + engine_options_->backup_env = test_backup_env_.get(); + } + file_manager_.reset(new FileManager(backup_chroot_env_.get())); + db_file_manager_.reset(new FileManager(db_chroot_env_.get())); + + // Create logger + DBOptions logger_options; + logger_options.env = db_chroot_env_.get(); + ASSERT_OK(CreateLoggerFromOptions(dbname_, logger_options, &logger_)); + } + + DB* OpenDB() { + DB* db; + EXPECT_OK(DB::Open(options_, dbname_, &db)); + return db; + } + + void CloseAndReopenDB(bool read_only = false) { + // Close DB + db_.reset(); + + // Open DB + test_db_fs_->SetLimitWrittenFiles(1000000); + DB* db; + if (read_only) { + ASSERT_OK(DB::OpenForReadOnly(options_, dbname_, &db)); + } else { + ASSERT_OK(DB::Open(options_, dbname_, &db)); + } + db_.reset(db); + } + + void InitializeDBAndBackupEngine(bool dummy = false) { + // reset all the db env defaults + test_db_fs_->SetLimitWrittenFiles(1000000); + test_db_fs_->SetDummySequentialFile(dummy); + + DB* db; + if (dummy) { + dummy_db_ = new DummyDB(options_, dbname_); + db = dummy_db_; + } else { + ASSERT_OK(DB::Open(options_, dbname_, &db)); + } + db_.reset(db); + } + + virtual void OpenDBAndBackupEngine( + bool destroy_old_data = false, bool dummy = false, + ShareOption shared_option = kShareNoChecksum) { + InitializeDBAndBackupEngine(dummy); + // reset backup env defaults + test_backup_fs_->SetLimitWrittenFiles(1000000); + engine_options_->destroy_old_data = destroy_old_data; + engine_options_->share_table_files = shared_option != kNoShare; + engine_options_->share_files_with_checksum = + shared_option == kShareWithChecksum; + OpenBackupEngine(destroy_old_data); + } + + void CloseDBAndBackupEngine() { + db_.reset(); + backup_engine_.reset(); + } + + void OpenBackupEngine(bool destroy_old_data = false) { + engine_options_->destroy_old_data = destroy_old_data; + engine_options_->info_log = logger_.get(); + 
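+    // BackupEngine::Open loads any existing backup metadata found under
+    // backupdir_ (or clears it when destroy_old_data is set), which is what
+    // lets the tests close and reopen the engine between steps.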
BackupEngine* backup_engine;
+    ASSERT_OK(BackupEngine::Open(test_db_env_.get(), *engine_options_,
+                                 &backup_engine));
+    backup_engine_.reset(backup_engine);
+  }
+
+  void CloseBackupEngine() { backup_engine_.reset(nullptr); }
+
+  // cross-cutting test of GetBackupInfo
+  void AssertBackupInfoConsistency() {
+    std::vector<BackupInfo> backup_info;
+    backup_engine_->GetBackupInfo(&backup_info, /*with file details*/ true);
+    std::map<std::string, uint64_t> file_sizes;
+
+    // Find the files that are supposed to be there
+    for (auto& backup : backup_info) {
+      uint64_t sum_for_backup = 0;
+      for (auto& file : backup.file_details) {
+        auto e = file_sizes.find(file.relative_filename);
+        if (e == file_sizes.end()) {
+          // fprintf(stderr, "Adding %s -> %u\n",
+          //         file.relative_filename.c_str(), (unsigned)file.size);
+          file_sizes[file.relative_filename] = file.size;
+        } else {
+          ASSERT_EQ(file_sizes[file.relative_filename], file.size);
+        }
+        sum_for_backup += file.size;
+      }
+      ASSERT_EQ(backup.size, sum_for_backup);
+    }
+
+    std::vector<BackupID> corrupt_backup_ids;
+    backup_engine_->GetCorruptedBackups(&corrupt_backup_ids);
+    bool has_corrupt = corrupt_backup_ids.size() > 0;
+
+    // Compare with what's in backup dir
+    std::vector<std::string> child_dirs;
+    ASSERT_OK(
+        test_backup_env_->GetChildren(backupdir_ + "/private", &child_dirs));
+    for (auto& dir : child_dirs) {
+      dir = "private/" + dir;
+    }
+    child_dirs.push_back("shared");           // might not exist
+    child_dirs.push_back("shared_checksum");  // might not exist
+    for (auto& dir : child_dirs) {
+      std::vector<std::string> children;
+      test_backup_env_->GetChildren(backupdir_ + "/" + dir, &children)
+          .PermitUncheckedError();
+      // fprintf(stderr, "ls %s\n", (backupdir_ + "/" + dir).c_str());
+      for (auto& file : children) {
+        uint64_t size;
+        size = UINT64_MAX;  // appease clang-analyze
+        std::string rel_file = dir + "/" + file;
+        // fprintf(stderr, "stat %s\n", (backupdir_ + "/" + rel_file).c_str());
+        ASSERT_OK(
+            test_backup_env_->GetFileSize(backupdir_ + "/" + rel_file, &size));
+        auto e = file_sizes.find(rel_file);
+        if (e == file_sizes.end()) {
+          // The only case in which we should find files not reported
+          ASSERT_TRUE(has_corrupt);
+        } else {
+          ASSERT_EQ(e->second, size);
+          file_sizes.erase(e);
+        }
+      }
+    }
+
+    // Everything should have been matched
+    ASSERT_EQ(file_sizes.size(), 0);
+  }
+
+  // restores backup backup_id and asserts the existence of
+  // [start_exist, end_exist> and not-existence of
+  // [end_exist, end>
+  //
+  // if backup_id == 0, it means restore from latest
+  // if end == 0, don't check AssertEmpty
+  void AssertBackupConsistency(BackupID backup_id, uint32_t start_exist,
+                               uint32_t end_exist, uint32_t end = 0,
+                               bool keep_log_files = false) {
+    RestoreOptions restore_options(keep_log_files);
+    bool opened_backup_engine = false;
+    if (backup_engine_.get() == nullptr) {
+      opened_backup_engine = true;
+      OpenBackupEngine();
+    }
+    AssertBackupInfoConsistency();
+
+    // Now perform restore
+    if (backup_id > 0) {
+      ASSERT_OK(backup_engine_->RestoreDBFromBackup(backup_id, dbname_, dbname_,
+                                                    restore_options));
+    } else {
+      ASSERT_OK(backup_engine_->RestoreDBFromLatestBackup(dbname_, dbname_,
+                                                          restore_options));
+    }
+    DB* db = OpenDB();
+    // Check DB contents
+    AssertExists(db, start_exist, end_exist);
+    if (end != 0) {
+      AssertEmpty(db, end_exist, end);
+    }
+    delete db;
+    if (opened_backup_engine) {
+      CloseBackupEngine();
+    }
+  }
+
+  void DeleteLogFiles() {
+    std::vector<std::string> delete_logs;
+    ASSERT_OK(db_chroot_env_->GetChildren(dbname_, &delete_logs));
+    for (auto f : delete_logs) {
+      uint64_t number;
+      FileType type;
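+      // ParseFileName classifies a DB file by its name; only WAL (kWalFile)
+      // files are deleted below, leaving SST and manifest files in place.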
+      bool ok = ParseFileName(f, &number, &type);
+      if (ok && type == kWalFile) {
+        ASSERT_OK(db_chroot_env_->DeleteFile(dbname_ + "/" + f));
+      }
+    }
+  }
+
+  Status GetDataFilesInDB(const FileType& file_type,
+                          std::vector<FileAttributes>* files) {
+    std::vector<std::string> live;
+    uint64_t ignore_manifest_size;
+    Status s = db_->GetLiveFiles(live, &ignore_manifest_size, /*flush*/ false);
+    if (!s.ok()) {
+      return s;
+    }
+    std::vector<FileAttributes> children;
+    s = test_db_env_->GetChildrenFileAttributes(dbname_, &children);
+    for (const auto& child : children) {
+      FileType type;
+      uint64_t number = 0;
+      if (ParseFileName(child.name, &number, &type) && type == file_type &&
+          std::find(live.begin(), live.end(), "/" + child.name) != live.end()) {
+        files->push_back(child);
+      }
+    }
+    return s;
+  }
+
+  Status GetRandomDataFileInDB(const FileType& file_type,
+                               std::string* fname_out,
+                               uint64_t* fsize_out = nullptr) {
+    Random rnd(6);  // NB: hardly "random"
+    std::vector<FileAttributes> files;
+    Status s = GetDataFilesInDB(file_type, &files);
+    if (!s.ok()) {
+      return s;
+    }
+    if (files.empty()) {
+      return Status::NotFound("");
+    }
+    size_t i = rnd.Uniform(static_cast<int>(files.size()));
+    *fname_out = dbname_ + "/" + files[i].name;
+    if (fsize_out) {
+      *fsize_out = files[i].size_bytes;
+    }
+    return Status::OK();
+  }
+
+  Status CorruptRandomDataFileInDB(const FileType& file_type) {
+    std::string fname;
+    uint64_t fsize = 0;
+    Status s = GetRandomDataFileInDB(file_type, &fname, &fsize);
+    if (!s.ok()) {
+      return s;
+    }
+
+    std::string file_contents;
+    s = ReadFileToString(test_db_env_.get(), fname, &file_contents);
+    if (!s.ok()) {
+      return s;
+    }
+    s = test_db_env_->DeleteFile(fname);
+    if (!s.ok()) {
+      return s;
+    }
+
+    file_contents[0] = (file_contents[0] + 257) % 256;
+    return WriteStringToFile(test_db_env_.get(), file_contents, fname);
+  }
+
+  void AssertDirectoryFilesMatchRegex(const std::string& dir,
+                                      const TestRegex& pattern,
+                                      const std::string& file_type,
+                                      int minimum_count) {
+    std::vector<FileAttributes> children;
+    ASSERT_OK(file_manager_->GetChildrenFileAttributes(dir, &children));
+    int found_count = 0;
+    for (const auto& child : children) {
+      if (EndsWith(child.name, file_type)) {
+        ASSERT_MATCHES_REGEX(child.name, pattern);
+        ++found_count;
+      }
+    }
+    ASSERT_GE(found_count, minimum_count);
+  }
+
+  void AssertDirectoryFilesSizeIndicators(const std::string& dir,
+                                          int minimum_count) {
+    std::vector<FileAttributes> children;
+    ASSERT_OK(file_manager_->GetChildrenFileAttributes(dir, &children));
+    int found_count = 0;
+    for (const auto& child : children) {
+      auto last_underscore = child.name.find_last_of('_');
+      auto last_dot = child.name.find_last_of('.');
+      ASSERT_NE(child.name, child.name.substr(0, last_underscore));
+      ASSERT_NE(child.name, child.name.substr(0, last_dot));
+      ASSERT_LT(last_underscore, last_dot);
+      std::string s = child.name.substr(last_underscore + 1,
+                                        last_dot - (last_underscore + 1));
+      ASSERT_EQ(s, std::to_string(child.size_bytes));
+      ++found_count;
+    }
+    ASSERT_GE(found_count, minimum_count);
+  }
+
+  // files
+  std::string dbname_;
+  std::string backupdir_;
+  std::string latest_backup_;
+
+  // logger_ must be above backup_engine_ such that the engine's destructor,
+  // which uses a raw pointer to the logger, executes first.
+  std::shared_ptr<Logger> logger_;
+
+  // FileSystems
+  std::shared_ptr<FileSystem> db_chroot_fs_;
+  std::shared_ptr<FileSystem> backup_chroot_fs_;
+  std::shared_ptr<TestFs> test_db_fs_;
+  std::shared_ptr<TestFs> test_backup_fs_;
+
+  // Env wrappers
+  std::unique_ptr<Env> db_chroot_env_;
+  std::unique_ptr<Env> backup_chroot_env_;
+  std::unique_ptr<Env> test_db_env_;
+  std::unique_ptr<Env> test_backup_env_;
+  std::unique_ptr<FileManager> file_manager_;
+  std::unique_ptr<FileManager> db_file_manager_;
+
+  // all the dbs!
+  DummyDB* dummy_db_;  // owned as db_ when present
+  std::unique_ptr<DB> db_;
+  std::unique_ptr<BackupEngine> backup_engine_;
+
+  // options
+  Options options_;
+
+ protected:
+  void DestroyDBWithoutCheck(const std::string& dbname,
+                             const Options& options) {
+    // DestroyDB may fail because the db might not exist for some tests
+    DestroyDB(dbname, options).PermitUncheckedError();
+  }
+
+  std::unique_ptr<BackupEngineOptions> engine_options_;
+};  // BackupEngineTest
+
+void AppendPath(const std::string& path, std::vector<std::string>& v) {
+  for (auto& f : v) {
+    f = path + f;
+  }
+}
+
+class BackupEngineTestWithParam : public BackupEngineTest,
+                                  public testing::WithParamInterface<bool> {
+ public:
+  BackupEngineTestWithParam() {
+    engine_options_->share_files_with_checksum = GetParam();
+  }
+  void OpenDBAndBackupEngine(
+      bool destroy_old_data = false, bool dummy = false,
+      ShareOption shared_option = kShareNoChecksum) override {
+    BackupEngineTest::InitializeDBAndBackupEngine(dummy);
+    // reset backup env defaults
+    test_backup_fs_->SetLimitWrittenFiles(1000000);
+    engine_options_->destroy_old_data = destroy_old_data;
+    engine_options_->share_table_files = shared_option != kNoShare;
+    // NOTE: keep share_files_with_checksum setting from constructor
+    OpenBackupEngine(destroy_old_data);
+  }
+};
+
+TEST_F(BackupEngineTest, FileCollision) {
+  const int keys_iteration = 100;
+  for (const auto& sopt : kAllShareOptions) {
+    OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */, sopt);
+    FillDB(db_.get(), 0, keys_iteration);
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+    FillDB(db_.get(), keys_iteration, keys_iteration * 2);
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+    CloseDBAndBackupEngine();
+
+    // If the db directory has been cleaned up, it is sensitive to file
+    // collision.
+    DestroyDBWithoutCheck(dbname_, options_);
+
+    // open fresh DB, but old backups present
+    OpenDBAndBackupEngine(false /* destroy_old_data */, false /* dummy */,
+                          sopt);
+    FillDB(db_.get(), 0, keys_iteration);
+    ASSERT_OK(db_->Flush(FlushOptions()));  // like backup would do
+    FillDB(db_.get(), keys_iteration, keys_iteration * 2);
+    if (sopt != kShareNoChecksum) {
+      ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+    } else {
+      // The new table files created in FillDB() will clash with the old
+      // backup, and sharing tables with no checksum will hit the file
+      // collision problem.
+      ASSERT_NOK(backup_engine_->CreateNewBackup(db_.get()));
+      ASSERT_OK(backup_engine_->PurgeOldBackups(0));
+      ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+    }
+    CloseDBAndBackupEngine();
+
+    // delete old data
+    DestroyDBWithoutCheck(dbname_, options_);
+  }
+}
+
+// This test verifies that the VerifyBackup method correctly identifies
+// invalid backups
+TEST_P(BackupEngineTestWithParam, VerifyBackup) {
+  const int keys_iteration = 5000;
+  OpenDBAndBackupEngine(true);
+  // create five backups
+  for (int i = 0; i < 5; ++i) {
+    FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+  }
+  CloseDBAndBackupEngine();
+
+  OpenDBAndBackupEngine();
+  // ---------- case 1. - valid backup -----------
+  ASSERT_TRUE(backup_engine_->VerifyBackup(1).ok());
+
+  // ---------- case 2. - delete a file -----------
+  ASSERT_OK(file_manager_->DeleteRandomFileInDir(backupdir_ + "/private/1"));
+  ASSERT_TRUE(backup_engine_->VerifyBackup(1).IsNotFound());
+
+  // ---------- case 3. - corrupt a file -----------
+  std::string append_data = "Corrupting a random file";
+  ASSERT_OK(file_manager_->AppendToRandomFileInDir(backupdir_ + "/private/2",
+                                                   append_data));
+  ASSERT_TRUE(backup_engine_->VerifyBackup(2).IsCorruption());
+
+  // ---------- case 4. - invalid backup -----------
+  ASSERT_TRUE(backup_engine_->VerifyBackup(6).IsNotFound());
+  CloseDBAndBackupEngine();
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+// open DB, write, close DB, backup, restore, repeat
+TEST_P(BackupEngineTestWithParam, OfflineIntegrationTest) {
+  // has to be a big number, so that it triggers the memtable flush
+  const int keys_iteration = 5000;
+  const int max_key = keys_iteration * 4 + 10;
+  // first iter -- flush before backup
+  // second iter -- don't flush before backup
+  for (int iter = 0; iter < 2; ++iter) {
+    // delete old data
+    DestroyDBWithoutCheck(dbname_, options_);
+    bool destroy_data = true;
+
+    // every iteration --
+    // 1. insert new data in the DB
+    // 2. backup the DB
+    // 3. destroy the db
+    // 4.
restore the db, check everything is still there + for (int i = 0; i < 5; ++i) { + // in last iteration, put smaller amount of data, + int fill_up_to = std::min(keys_iteration * (i + 1), max_key); + // ---- insert new data and back up ---- + OpenDBAndBackupEngine(destroy_data); + destroy_data = false; + // kAutoFlushOnly to preserve legacy test behavior (consider updating) + FillDB(db_.get(), keys_iteration * i, fill_up_to, kAutoFlushOnly); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), iter == 0)) + << "iter: " << iter << ", idx: " << i; + CloseDBAndBackupEngine(); + DestroyDBWithoutCheck(dbname_, options_); + + // ---- make sure it's empty ---- + DB* db = OpenDB(); + AssertEmpty(db, 0, fill_up_to); + delete db; + + // ---- restore the DB ---- + OpenBackupEngine(); + if (i >= 3) { // test purge old backups + // when i == 4, purge to only 1 backup + // when i == 3, purge to 2 backups + ASSERT_OK(backup_engine_->PurgeOldBackups(5 - i)); + } + // ---- make sure the data is there --- + AssertBackupConsistency(0, 0, fill_up_to, max_key); + CloseBackupEngine(); + } + } +} + +// open DB, write, backup, write, backup, close, restore +TEST_P(BackupEngineTestWithParam, OnlineIntegrationTest) { + // has to be a big number, so that it triggers the memtable flush + const int keys_iteration = 5000; + const int max_key = keys_iteration * 4 + 10; + Random rnd(7); + // delete old data + DestroyDBWithoutCheck(dbname_, options_); + + // TODO: Implement & test db_paths support in backup (not supported in + // restore) + // options_.db_paths.emplace_back(dbname_, 500 * 1024); + // options_.db_paths.emplace_back(dbname_ + "_2", 1024 * 1024 * 1024); + + OpenDBAndBackupEngine(true); + // write some data, backup, repeat + for (int i = 0; i < 5; ++i) { + if (i == 4) { + // delete backup number 2, online delete! 
+ ASSERT_OK(backup_engine_->DeleteBackup(2)); + } + // in last iteration, put smaller amount of data, + // so that backups can share sst files + int fill_up_to = std::min(keys_iteration * (i + 1), max_key); + // kAutoFlushOnly to preserve legacy test behavior (consider updating) + FillDB(db_.get(), keys_iteration * i, fill_up_to, kAutoFlushOnly); + // we should get consistent results with flush_before_backup + // set to both true and false + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), !!(rnd.Next() % 2))); + } + // close and destroy + CloseDBAndBackupEngine(); + DestroyDBWithoutCheck(dbname_, options_); + + // ---- make sure it's empty ---- + DB* db = OpenDB(); + AssertEmpty(db, 0, max_key); + delete db; + + // ---- restore every backup and verify all the data is there ---- + OpenBackupEngine(); + for (int i = 1; i <= 5; ++i) { + if (i == 2) { + // we deleted backup 2 + Status s = backup_engine_->RestoreDBFromBackup(2, dbname_, dbname_); + ASSERT_TRUE(!s.ok()); + } else { + int fill_up_to = std::min(keys_iteration * i, max_key); + AssertBackupConsistency(i, 0, fill_up_to, max_key); + } + } + + // delete some backups -- this should leave only backups 3 and 5 alive + ASSERT_OK(backup_engine_->DeleteBackup(4)); + ASSERT_OK(backup_engine_->PurgeOldBackups(2)); + + std::vector backup_info; + backup_engine_->GetBackupInfo(&backup_info); + ASSERT_EQ(2UL, backup_info.size()); + + // check backup 3 + AssertBackupConsistency(3, 0, 3 * keys_iteration, max_key); + // check backup 5 + AssertBackupConsistency(5, 0, max_key); + + CloseBackupEngine(); +} +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +INSTANTIATE_TEST_CASE_P(BackupEngineTestWithParam, BackupEngineTestWithParam, + ::testing::Bool()); + +// this will make sure that backup does not copy the same file twice +TEST_F(BackupEngineTest, NoDoubleCopy_And_AutoGC) { + OpenDBAndBackupEngine(true, true); + + // should write 5 DB files + one meta file + test_backup_fs_->SetLimitWrittenFiles(7); + test_backup_fs_->ClearWrittenFiles(); + test_db_fs_->SetLimitWrittenFiles(0); + dummy_db_->live_files_ = {"00010.sst", "00011.sst", "CURRENT", "MANIFEST-01", + "00011.log"}; + test_db_fs_->SetFilenamesForMockedAttrs(dummy_db_->live_files_); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), false)); + std::vector should_have_written = { + "/shared/.00010.sst.tmp", "/shared/.00011.sst.tmp", "/private/1/CURRENT", + "/private/1/MANIFEST-01", "/private/1/00011.log", "/meta/.1.tmp"}; + AppendPath(backupdir_, should_have_written); + test_backup_fs_->AssertWrittenFiles(should_have_written); + + char db_number = '1'; + + for (std::string other_sst : {"00015.sst", "00017.sst", "00019.sst"}) { + // should write 4 new DB files + one meta file + // should not write/copy 00010.sst, since it's already there! + test_backup_fs_->SetLimitWrittenFiles(6); + test_backup_fs_->ClearWrittenFiles(); + + dummy_db_->live_files_ = {"00010.sst", other_sst, "CURRENT", "MANIFEST-01", + "00011.log"}; + test_db_fs_->SetFilenamesForMockedAttrs(dummy_db_->live_files_); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), false)); + // should not open 00010.sst - it's already there + + ++db_number; + std::string private_dir = std::string("/private/") + db_number; + should_have_written = { + "/shared/." 
+ other_sst + ".tmp", private_dir + "/CURRENT", + private_dir + "/MANIFEST-01", private_dir + "/00011.log", + std::string("/meta/.") + db_number + ".tmp"}; + AppendPath(backupdir_, should_have_written); + test_backup_fs_->AssertWrittenFiles(should_have_written); + } + + ASSERT_OK(backup_engine_->DeleteBackup(1)); + ASSERT_OK(test_backup_env_->FileExists(backupdir_ + "/shared/00010.sst")); + + // 00011.sst was only in backup 1, should be deleted + ASSERT_EQ(Status::NotFound(), + test_backup_env_->FileExists(backupdir_ + "/shared/00011.sst")); + ASSERT_OK(test_backup_env_->FileExists(backupdir_ + "/shared/00015.sst")); + + // MANIFEST file size should be only 100 + uint64_t size = 0; + ASSERT_OK(test_backup_env_->GetFileSize(backupdir_ + "/private/2/MANIFEST-01", + &size)); + ASSERT_EQ(100UL, size); + ASSERT_OK( + test_backup_env_->GetFileSize(backupdir_ + "/shared/00015.sst", &size)); + ASSERT_EQ(200UL, size); + + CloseBackupEngine(); + + // + // Now simulate incomplete delete by removing just meta + // + ASSERT_OK(test_backup_env_->DeleteFile(backupdir_ + "/meta/2")); + + OpenBackupEngine(); + + // 1 appears to be removed, so + // 2 non-corrupt and 0 corrupt seen + std::vector backup_info; + std::vector corrupt_backup_ids; + backup_engine_->GetBackupInfo(&backup_info); + backup_engine_->GetCorruptedBackups(&corrupt_backup_ids); + ASSERT_EQ(2UL, backup_info.size()); + ASSERT_EQ(0UL, corrupt_backup_ids.size()); + + // Keep the two we see, but this should suffice to purge unreferenced + // shared files from incomplete delete. + ASSERT_OK(backup_engine_->PurgeOldBackups(2)); + + // Make sure dangling sst file has been removed (somewhere along this + // process). GarbageCollect should not be needed. + ASSERT_EQ(Status::NotFound(), + test_backup_env_->FileExists(backupdir_ + "/shared/00015.sst")); + ASSERT_OK(test_backup_env_->FileExists(backupdir_ + "/shared/00017.sst")); + ASSERT_OK(test_backup_env_->FileExists(backupdir_ + "/shared/00019.sst")); + + // Now actually purge a good one + ASSERT_OK(backup_engine_->PurgeOldBackups(1)); + + ASSERT_EQ(Status::NotFound(), + test_backup_env_->FileExists(backupdir_ + "/shared/00017.sst")); + ASSERT_OK(test_backup_env_->FileExists(backupdir_ + "/shared/00019.sst")); + + CloseDBAndBackupEngine(); +} + +// test various kind of corruptions that may happen: +// 1. Not able to write a file for backup - that backup should fail, +// everything else should work +// 2. Corrupted backup meta file or missing backuped file - we should +// not be able to open that backup, but all other backups should be +// fine +// 3. Corrupted checksum value - if the checksum is not a valid uint32_t, +// db open should fail, otherwise, it aborts during the restore process. +TEST_F(BackupEngineTest, CorruptionsTest) { + const int keys_iteration = 5000; + Random rnd(6); + Status s; + + OpenDBAndBackupEngine(true); + // create five backups + for (int i = 0; i < 5; ++i) { + FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1)); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), !!(rnd.Next() % 2))); + } + + // ---------- case 1. 
- fail a write -----------
+  // try creating backup 6, but fail a write
+  FillDB(db_.get(), keys_iteration * 5, keys_iteration * 6);
+  test_backup_fs_->SetLimitWrittenFiles(2);
+  // should fail
+  s = backup_engine_->CreateNewBackup(db_.get(), !!(rnd.Next() % 2));
+  ASSERT_NOK(s);
+  test_backup_fs_->SetLimitWrittenFiles(1000000);
+  // latest backup should have all the keys
+  CloseDBAndBackupEngine();
+  AssertBackupConsistency(0, 0, keys_iteration * 5, keys_iteration * 6);
+
+  // --------- case 2. corrupted backup meta or missing backed-up file ----
+  ASSERT_OK(file_manager_->CorruptFile(backupdir_ + "/meta/5", 3));
+  // since 5 meta is now corrupted, latest backup should be 4
+  AssertBackupConsistency(0, 0, keys_iteration * 4, keys_iteration * 5);
+  OpenBackupEngine();
+  s = backup_engine_->RestoreDBFromBackup(5, dbname_, dbname_);
+  ASSERT_NOK(s);
+  CloseBackupEngine();
+  ASSERT_OK(file_manager_->DeleteRandomFileInDir(backupdir_ + "/private/4"));
+  // 4 is corrupted, 3 is the latest backup now
+  AssertBackupConsistency(0, 0, keys_iteration * 3, keys_iteration * 5);
+  OpenBackupEngine();
+  s = backup_engine_->RestoreDBFromBackup(4, dbname_, dbname_);
+  CloseBackupEngine();
+  ASSERT_NOK(s);
+
+  // --------- case 3. corrupted checksum value ----
+  ASSERT_OK(file_manager_->CorruptChecksum(backupdir_ + "/meta/3", false));
+  // the checksum of backup 3 is an invalid value, this can be detected at
+  // db open time, and it reverts to the previous backup automatically
+  AssertBackupConsistency(0, 0, keys_iteration * 2, keys_iteration * 5);
+  // the checksum of backup 2 appears to be valid, which can cause a checksum
+  // mismatch and abort the restore process
+  ASSERT_OK(file_manager_->CorruptChecksum(backupdir_ + "/meta/2", true));
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/2"));
+  OpenBackupEngine();
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/2"));
+  s = backup_engine_->RestoreDBFromBackup(2, dbname_, dbname_);
+  ASSERT_NOK(s);
+
+  // make sure that no corrupt backups have actually been deleted!
+ ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/1")); + ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/2")); + ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/3")); + ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/4")); + ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/5")); + ASSERT_OK(file_manager_->FileExists(backupdir_ + "/private/1")); + ASSERT_OK(file_manager_->FileExists(backupdir_ + "/private/2")); + ASSERT_OK(file_manager_->FileExists(backupdir_ + "/private/3")); + ASSERT_OK(file_manager_->FileExists(backupdir_ + "/private/4")); + ASSERT_OK(file_manager_->FileExists(backupdir_ + "/private/5")); + + // delete the corrupt backups and then make sure they're actually deleted + ASSERT_OK(backup_engine_->DeleteBackup(5)); + ASSERT_OK(backup_engine_->DeleteBackup(4)); + ASSERT_OK(backup_engine_->DeleteBackup(3)); + ASSERT_OK(backup_engine_->DeleteBackup(2)); + // Should not be needed anymore with auto-GC on DeleteBackup + //(void)backup_engine_->GarbageCollect(); + ASSERT_EQ(Status::NotFound(), + file_manager_->FileExists(backupdir_ + "/meta/5")); + ASSERT_EQ(Status::NotFound(), + file_manager_->FileExists(backupdir_ + "/private/5")); + ASSERT_EQ(Status::NotFound(), + file_manager_->FileExists(backupdir_ + "/meta/4")); + ASSERT_EQ(Status::NotFound(), + file_manager_->FileExists(backupdir_ + "/private/4")); + ASSERT_EQ(Status::NotFound(), + file_manager_->FileExists(backupdir_ + "/meta/3")); + ASSERT_EQ(Status::NotFound(), + file_manager_->FileExists(backupdir_ + "/private/3")); + ASSERT_EQ(Status::NotFound(), + file_manager_->FileExists(backupdir_ + "/meta/2")); + ASSERT_EQ(Status::NotFound(), + file_manager_->FileExists(backupdir_ + "/private/2")); + CloseBackupEngine(); + AssertBackupConsistency(0, 0, keys_iteration * 1, keys_iteration * 5); + + // new backup should be 2! 
+  OpenDBAndBackupEngine();
+  FillDB(db_.get(), keys_iteration * 1, keys_iteration * 2);
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), !!(rnd.Next() % 2)));
+  CloseDBAndBackupEngine();
+  AssertBackupConsistency(2, 0, keys_iteration * 2, keys_iteration * 5);
+}
+
+// Corrupt a file but maintain its size
+TEST_F(BackupEngineTest, CorruptFileMaintainSize) {
+  const int keys_iteration = 5000;
+  OpenDBAndBackupEngine(true);
+  // create a backup
+  FillDB(db_.get(), 0, keys_iteration);
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+  CloseDBAndBackupEngine();
+
+  OpenDBAndBackupEngine();
+  // verify with file size
+  ASSERT_OK(backup_engine_->VerifyBackup(1, false));
+  // verify with file checksum
+  ASSERT_OK(backup_engine_->VerifyBackup(1, true));
+
+  std::string file_to_corrupt;
+  uint64_t file_size = 0;
+  // under normal circumstances, there should be at least one nonempty file
+  while (file_size == 0) {
+    // get a random file in /private/1
+    assert(file_manager_
+               ->GetRandomFileInDir(backupdir_ + "/private/1", &file_to_corrupt,
+                                    &file_size)
+               .ok());
+    // corrupt the file by replacing its content with file_size random bytes
+    ASSERT_OK(file_manager_->CorruptFile(file_to_corrupt, file_size));
+  }
+  // file sizes match
+  ASSERT_OK(backup_engine_->VerifyBackup(1, false));
+  // file checksums mismatch
+  ASSERT_NOK(backup_engine_->VerifyBackup(1, true));
+  // sanity check, use default second argument
+  ASSERT_OK(backup_engine_->VerifyBackup(1));
+  CloseDBAndBackupEngine();
+
+  // an extra challenge
+  // set share_files_with_checksum to true and do two more backups
+  // corrupt all the table files in shared_checksum but maintain their sizes
+  OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */,
+                        kShareWithChecksum);
+  // create two backups
+  for (int i = 1; i < 3; ++i) {
+    FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+  }
+  CloseDBAndBackupEngine();
+
+  OpenDBAndBackupEngine();
+  std::vector<FileAttributes> children;
+  const std::string dir = backupdir_ + "/shared_checksum";
+  ASSERT_OK(file_manager_->GetChildrenFileAttributes(dir, &children));
+  for (const auto& child : children) {
+    if (child.size_bytes == 0) {
+      continue;
+    }
+    // corrupt the file by replacing its content with file_size random bytes
+    ASSERT_OK(
+        file_manager_->CorruptFile(dir + "/" + child.name, child.size_bytes));
+  }
+  // file sizes match
+  ASSERT_OK(backup_engine_->VerifyBackup(1, false));
+  ASSERT_OK(backup_engine_->VerifyBackup(2, false));
+  // file checksums mismatch
+  ASSERT_NOK(backup_engine_->VerifyBackup(1, true));
+  ASSERT_NOK(backup_engine_->VerifyBackup(2, true));
+  CloseDBAndBackupEngine();
+}
+
+// Corrupt a blob file but maintain its size
+TEST_P(BackupEngineTestWithParam, CorruptBlobFileMaintainSize) {
+  const int keys_iteration = 5000;
+  OpenDBAndBackupEngine(true);
+  // create a backup
+  FillDB(db_.get(), 0, keys_iteration);
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+  CloseDBAndBackupEngine();
+
+  OpenDBAndBackupEngine();
+  // verify with file size
+  ASSERT_OK(backup_engine_->VerifyBackup(1, false));
+  // verify with file checksum
+  ASSERT_OK(backup_engine_->VerifyBackup(1, true));
+
+  std::string file_to_corrupt;
+  std::vector<FileAttributes> children;
+
+  std::string dir = backupdir_;
+  if (engine_options_->share_files_with_checksum) {
+    dir += "/shared_checksum";
+  } else {
+    dir += "/shared";
+  }
+
+  ASSERT_OK(file_manager_->GetChildrenFileAttributes(dir, &children));
+
+  for (const auto& child :
children) {
+    if (EndsWith(child.name, ".blob") && child.size_bytes != 0) {
+      // corrupt the blob files by replacing their content with file_size
+      // random bytes
+      ASSERT_OK(
+          file_manager_->CorruptFile(dir + "/" + child.name, child.size_bytes));
+    }
+  }
+
+  // file sizes match
+  ASSERT_OK(backup_engine_->VerifyBackup(1, false));
+  // file checksums mismatch
+  ASSERT_NOK(backup_engine_->VerifyBackup(1, true));
+  // sanity check, use default second argument
+  ASSERT_OK(backup_engine_->VerifyBackup(1));
+  CloseDBAndBackupEngine();
+}
+
+// Test if BackupEngine will fail to create new backup if some table has been
+// corrupted and the table file checksum is stored in the DB manifest
+TEST_F(BackupEngineTest, TableFileCorruptedBeforeBackup) {
+  const int keys_iteration = 50000;
+
+  OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */,
+                        kNoShare);
+  FillDB(db_.get(), 0, keys_iteration);
+  CloseAndReopenDB(/*read_only*/ true);
+  // corrupt a random table file in the DB directory
+  ASSERT_OK(CorruptRandomDataFileInDB(kTableFile));
+  // file_checksum_gen_factory is null, and thus table checksum is not
+  // verified for creating a new backup; no corruption is detected
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+  CloseDBAndBackupEngine();
+
+  // delete old files in db
+  DestroyDBWithoutCheck(dbname_, options_);
+
+  // Enable table file checksum in DB manifest
+  options_.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+  OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */,
+                        kNoShare);
+  FillDB(db_.get(), 0, keys_iteration);
+  CloseAndReopenDB(/*read_only*/ true);
+  // corrupt a random table file in the DB directory
+  ASSERT_OK(CorruptRandomDataFileInDB(kTableFile));
+  // table file checksum is enabled so we should be able to detect any
+  // corruption
+  ASSERT_NOK(backup_engine_->CreateNewBackup(db_.get()));
+  CloseDBAndBackupEngine();
+}
+
+// Test if BackupEngine will fail to create new backup if some blob files have
+// been corrupted and the blob file checksum is stored in the DB manifest
+TEST_F(BackupEngineTest, BlobFileCorruptedBeforeBackup) {
+  const int keys_iteration = 50000;
+
+  OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */,
+                        kNoShare);
+  FillDB(db_.get(), 0, keys_iteration);
+  CloseAndReopenDB(/*read_only*/ true);
+  // corrupt a random blob file in the DB directory
+  ASSERT_OK(CorruptRandomDataFileInDB(kBlobFile));
+  // file_checksum_gen_factory is null, and thus blob checksum is not
+  // verified for creating a new backup; no corruption is detected
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+  CloseDBAndBackupEngine();
+
+  // delete old files in db
+  DestroyDBWithoutCheck(dbname_, options_);
+
+  // Enable file checksum in DB manifest
+  options_.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+  OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */,
+                        kNoShare);
+  FillDB(db_.get(), 0, keys_iteration);
+  CloseAndReopenDB(/*read_only*/ true);
+  // corrupt a random blob file in the DB directory
+  ASSERT_OK(CorruptRandomDataFileInDB(kBlobFile));
+
+  // file checksum is enabled so we should be able to detect any
+  // corruption
+  ASSERT_NOK(backup_engine_->CreateNewBackup(db_.get()));
+  CloseDBAndBackupEngine();
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+// Test if BackupEngine will fail to create new backup if some table has been
+// corrupted and the table file checksum is stored in the DB manifest for the
+// case when backup
table files will be stored in a shared directory +TEST_P(BackupEngineTestWithParam, TableFileCorruptedBeforeBackup) { + const int keys_iteration = 50000; + + OpenDBAndBackupEngine(true /* destroy_old_data */); + FillDB(db_.get(), 0, keys_iteration); + CloseAndReopenDB(/*read_only*/ true); + // corrupt a random table file in the DB directory + ASSERT_OK(CorruptRandomDataFileInDB(kTableFile)); + // cannot detect corruption since DB manifest has no table checksums + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get())); + CloseDBAndBackupEngine(); + + // delete old files in db + DestroyDBWithoutCheck(dbname_, options_); + + // Enable table checksums in DB manifest + options_.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory(); + OpenDBAndBackupEngine(true /* destroy_old_data */); + FillDB(db_.get(), 0, keys_iteration); + CloseAndReopenDB(/*read_only*/ true); + // corrupt a random table file in the DB directory + ASSERT_OK(CorruptRandomDataFileInDB(kTableFile)); + // corruption is detected + ASSERT_NOK(backup_engine_->CreateNewBackup(db_.get())); + CloseDBAndBackupEngine(); +} + +// Test if BackupEngine will fail to create new backup if some blob files have +// been corrupted and the blob file checksum is stored in the DB manifest for +// the case when backup blob files will be stored in a shared directory +TEST_P(BackupEngineTestWithParam, BlobFileCorruptedBeforeBackup) { + const int keys_iteration = 50000; + OpenDBAndBackupEngine(true /* destroy_old_data */); + FillDB(db_.get(), 0, keys_iteration); + CloseAndReopenDB(/*read_only*/ true); + // corrupt a random blob file in the DB directory + ASSERT_OK(CorruptRandomDataFileInDB(kBlobFile)); + // cannot detect corruption since DB manifest has no blob file checksums + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get())); + CloseDBAndBackupEngine(); + + // delete old files in db + DestroyDBWithoutCheck(dbname_, options_); + + // Enable blob file checksums in DB manifest + options_.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory(); + OpenDBAndBackupEngine(true /* destroy_old_data */); + FillDB(db_.get(), 0, keys_iteration); + CloseAndReopenDB(/*read_only*/ true); + // corrupt a random blob file in the DB directory + ASSERT_OK(CorruptRandomDataFileInDB(kBlobFile)); + // corruption is detected + ASSERT_NOK(backup_engine_->CreateNewBackup(db_.get())); + CloseDBAndBackupEngine(); +} +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +TEST_F(BackupEngineTest, TableFileWithoutDbChecksumCorruptedDuringBackup) { + const int keys_iteration = 50000; + engine_options_->share_files_with_checksum_naming = kLegacyCrc32cAndFileSize; + // When share_files_with_checksum is on, we calculate checksums of table + // files before and after copying. So we can test whether a corruption has + // happened during the file is copied to backup directory. 
+  OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */,
+                        kShareWithChecksum);
+
+  FillDB(db_.get(), 0, keys_iteration);
+  std::atomic<bool> corrupted{false};
+  // corrupt files when copying to the backup directory
+  SyncPoint::GetInstance()->SetCallBack(
+      "BackupEngineImpl::CopyOrCreateFile:CorruptionDuringBackup",
+      [&](void* data) {
+        if (data != nullptr) {
+          Slice* d = reinterpret_cast<Slice*>(data);
+          if (!d->empty()) {
+            d->remove_suffix(1);
+            corrupted = true;
+          }
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  Status s = backup_engine_->CreateNewBackup(db_.get());
+  if (corrupted) {
+    ASSERT_NOK(s);
+  } else {
+    // should not reach this path in normal cases
+    ASSERT_OK(s);
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  CloseDBAndBackupEngine();
+  // delete old files in db
+  DestroyDBWithoutCheck(dbname_, options_);
+}
+
+TEST_F(BackupEngineTest, TableFileWithDbChecksumCorruptedDuringBackup) {
+  const int keys_iteration = 50000;
+  options_.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+  for (auto& sopt : kAllShareOptions) {
+    // Since the default DB table file checksum is on, we obtain checksums of
+    // table files from the DB manifest before copying and verify them against
+    // the ones calculated during copying.
+    // Therefore, we can test whether a corruption has happened while the file
+    // is being copied to the backup directory.
+    OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */, sopt);
+
+    FillDB(db_.get(), 0, keys_iteration);
+
+    // corrupt files when copying to the backup directory
+    SyncPoint::GetInstance()->SetCallBack(
+        "BackupEngineImpl::CopyOrCreateFile:CorruptionDuringBackup",
+        [&](void* data) {
+          if (data != nullptr) {
+            Slice* d = reinterpret_cast<Slice*>(data);
+            if (!d->empty()) {
+              d->remove_suffix(1);
+            }
+          }
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+    // The only case that we can't detect a corruption is when the file
+    // being backed up is empty. But as keys_iteration is large, such
+    // a case shouldn't have happened and we should be able to detect
+    // the corruption.
+    ASSERT_NOK(backup_engine_->CreateNewBackup(db_.get()));
+
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+
+    CloseDBAndBackupEngine();
+    // delete old files in db
+    DestroyDBWithoutCheck(dbname_, options_);
+  }
+}
+
+TEST_F(BackupEngineTest, InterruptCreationTest) {
+  // Interrupt backup creation by failing new writes and failing cleanup of the
+  // partial state. Then verify a subsequent backup can still succeed.
+ const int keys_iteration = 5000; + Random rnd(6); + + OpenDBAndBackupEngine(true /* destroy_old_data */); + FillDB(db_.get(), 0, keys_iteration); + test_backup_fs_->SetLimitWrittenFiles(2); + test_backup_fs_->SetDeleteFileFailure(true); + // should fail creation + ASSERT_NOK(backup_engine_->CreateNewBackup(db_.get(), !!(rnd.Next() % 2))); + CloseDBAndBackupEngine(); + // should also fail cleanup so the tmp directory stays behind + ASSERT_OK(backup_chroot_env_->FileExists(backupdir_ + "/private/1/")); + + OpenDBAndBackupEngine(false /* destroy_old_data */); + test_backup_fs_->SetLimitWrittenFiles(1000000); + test_backup_fs_->SetDeleteFileFailure(false); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), !!(rnd.Next() % 2))); + // latest backup should have all the keys + CloseDBAndBackupEngine(); + AssertBackupConsistency(0, 0, keys_iteration); +} + +TEST_F(BackupEngineTest, FlushCompactDuringBackupCheckpoint) { + const int keys_iteration = 5000; + options_.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory(); + for (const auto& sopt : kAllShareOptions) { + OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */, sopt); + FillDB(db_.get(), 0, keys_iteration); + // That FillDB leaves a mix of flushed and unflushed data + SyncPoint::GetInstance()->LoadDependency( + {{"CheckpointImpl::CreateCustomCheckpoint:AfterGetLive1", + "BackupEngineTest::FlushCompactDuringBackupCheckpoint:Before"}, + {"BackupEngineTest::FlushCompactDuringBackupCheckpoint:After", + "CheckpointImpl::CreateCustomCheckpoint:AfterGetLive2"}}); + SyncPoint::GetInstance()->EnableProcessing(); + ROCKSDB_NAMESPACE::port::Thread flush_thread{[this]() { + TEST_SYNC_POINT( + "BackupEngineTest::FlushCompactDuringBackupCheckpoint:Before"); + FillDB(db_.get(), keys_iteration, 2 * keys_iteration); + ASSERT_OK(db_->Flush(FlushOptions())); + DBImpl* dbi = static_cast(db_.get()); + ASSERT_OK(dbi->TEST_WaitForFlushMemTable()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbi->TEST_WaitForCompact()); + TEST_SYNC_POINT( + "BackupEngineTest::FlushCompactDuringBackupCheckpoint:After"); + }}; + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get())); + flush_thread.join(); + CloseDBAndBackupEngine(); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + /* FIXME(peterd): reinstate with option for checksum in file names + if (sopt == kShareWithChecksum) { + // Ensure we actually got DB manifest checksums by inspecting + // shared_checksum file names for hex checksum component + TestRegex expected("[^_]+_[0-9A-F]{8}_[^_]+.sst"); + std::vector children; + const std::string dir = backupdir_ + "/shared_checksum"; + ASSERT_OK(file_manager_->GetChildrenFileAttributes(dir, &children)); + for (const auto& child : children) { + if (child.size_bytes == 0) { + continue; + } + EXPECT_MATCHES_REGEX(child.name, expected); + } + } + */ + AssertBackupConsistency(0, 0, keys_iteration); + } +} + +inline std::string OptionsPath(std::string ret, int backupID) { + ret += "/private/"; + ret += std::to_string(backupID); + ret += "/"; + return ret; +} + +// Backup the LATEST options file to +// "/private//OPTIONS" + +TEST_F(BackupEngineTest, BackupOptions) { + OpenDBAndBackupEngine(true); + for (int i = 1; i < 5; i++) { + std::string name; + std::vector filenames; + // Must reset() before reset(OpenDB()) again. 
+    // Calling OpenDB() while *db_ still exists would cause a LOCK issue
+    db_.reset();
+    db_.reset(OpenDB());
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+    ASSERT_OK(ROCKSDB_NAMESPACE::GetLatestOptionsFileName(db_->GetName(),
+                                                          options_.env, &name));
+    ASSERT_OK(file_manager_->FileExists(OptionsPath(backupdir_, i) + name));
+    ASSERT_OK(backup_chroot_env_->GetChildren(OptionsPath(backupdir_, i),
+                                              &filenames));
+    for (auto fn : filenames) {
+      if (fn.compare(0, 7, "OPTIONS") == 0) {
+        ASSERT_EQ(name, fn);
+      }
+    }
+  }
+
+  CloseDBAndBackupEngine();
+}
+
+TEST_F(BackupEngineTest, SetOptionsBackupRaceCondition) {
+  OpenDBAndBackupEngine(true);
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"CheckpointImpl::CreateCheckpoint:SavedLiveFiles1",
+        "BackupEngineTest::SetOptionsBackupRaceCondition:BeforeSetOptions"},
+       {"BackupEngineTest::SetOptionsBackupRaceCondition:AfterSetOptions",
+        "CheckpointImpl::CreateCheckpoint:SavedLiveFiles2"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+  ROCKSDB_NAMESPACE::port::Thread setoptions_thread{[this]() {
+    TEST_SYNC_POINT(
+        "BackupEngineTest::SetOptionsBackupRaceCondition:BeforeSetOptions");
+    DBImpl* dbi = static_cast<DBImpl*>(db_.get());
+    // Change arbitrary option to trigger OPTIONS file deletion
+    ASSERT_OK(dbi->SetOptions(dbi->DefaultColumnFamily(),
+                              {{"paranoid_file_checks", "false"}}));
+    ASSERT_OK(dbi->SetOptions(dbi->DefaultColumnFamily(),
+                              {{"paranoid_file_checks", "true"}}));
+    ASSERT_OK(dbi->SetOptions(dbi->DefaultColumnFamily(),
+                              {{"paranoid_file_checks", "false"}}));
+    TEST_SYNC_POINT(
+        "BackupEngineTest::SetOptionsBackupRaceCondition:AfterSetOptions");
+  }};
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+  setoptions_thread.join();
+  CloseDBAndBackupEngine();
+}
+
+// This test verifies we don't delete the latest backup when the read-only
+// option is set
+TEST_F(BackupEngineTest, NoDeleteWithReadOnly) {
+  const int keys_iteration = 5000;
+  Random rnd(6);
+
+  OpenDBAndBackupEngine(true);
+  // create five backups
+  for (int i = 0; i < 5; ++i) {
+    FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), !!(rnd.Next() % 2)));
+  }
+  CloseDBAndBackupEngine();
+  ASSERT_OK(file_manager_->WriteToFile(latest_backup_, "4"));
+
+  engine_options_->destroy_old_data = false;
+  BackupEngineReadOnly* read_only_backup_engine;
+  ASSERT_OK(BackupEngineReadOnly::Open(
+      backup_chroot_env_.get(), *engine_options_, &read_only_backup_engine));
+
+  // assert that data from backup 5 is still here (even though LATEST_BACKUP
+  // says 4 is latest)
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/5"));
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/private/5"));
+
+  // Behavior change: We now ignore LATEST_BACKUP contents. This means that
+  // we should have 5 backups, even if LATEST_BACKUP says 4.
+ std::vector backup_info; + read_only_backup_engine->GetBackupInfo(&backup_info); + ASSERT_EQ(5UL, backup_info.size()); + delete read_only_backup_engine; +} + +TEST_F(BackupEngineTest, FailOverwritingBackups) { + options_.write_buffer_size = 1024 * 1024 * 1024; // 1GB + options_.disable_auto_compactions = true; + + // create backups 1, 2, 3, 4, 5 + OpenDBAndBackupEngine(true); + for (int i = 0; i < 5; ++i) { + CloseDBAndBackupEngine(); + DeleteLogFiles(); + OpenDBAndBackupEngine(false); + FillDB(db_.get(), 100 * i, 100 * (i + 1), kFlushAll); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get())); + } + CloseDBAndBackupEngine(); + + // restore 3 + OpenBackupEngine(); + ASSERT_OK(backup_engine_->RestoreDBFromBackup(3, dbname_, dbname_)); + CloseBackupEngine(); + + OpenDBAndBackupEngine(false); + // More data, bigger SST + FillDB(db_.get(), 1000, 1300, kFlushAll); + Status s = backup_engine_->CreateNewBackup(db_.get()); + // the new backup fails because new table files + // clash with old table files from backups 4 and 5 + // (since write_buffer_size is huge, we can be sure that + // each backup will generate only one sst file and that + // a file generated here would have the same name as an + // sst file generated by backup 4, and will be bigger) + ASSERT_TRUE(s.IsCorruption()); + ASSERT_OK(backup_engine_->DeleteBackup(4)); + ASSERT_OK(backup_engine_->DeleteBackup(5)); + // now, the backup can succeed + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get())); + CloseDBAndBackupEngine(); +} + +TEST_F(BackupEngineTest, NoShareTableFiles) { + const int keys_iteration = 5000; + OpenDBAndBackupEngine(true, false, kNoShare); + for (int i = 0; i < 5; ++i) { + FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1)); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), !!(i % 2))); + } + CloseDBAndBackupEngine(); + + for (int i = 0; i < 5; ++i) { + AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1), + keys_iteration * 6); + } +} + +// Verify that you can backup and restore with share_files_with_checksum on +TEST_F(BackupEngineTest, ShareTableFilesWithChecksums) { + const int keys_iteration = 5000; + OpenDBAndBackupEngine(true, false, kShareWithChecksum); + for (int i = 0; i < 5; ++i) { + FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1)); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), !!(i % 2))); + } + CloseDBAndBackupEngine(); + + for (int i = 0; i < 5; ++i) { + AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1), + keys_iteration * 6); + } +} + +// Verify that you can backup and restore using share_files_with_checksum set to +// false and then transition this option to true +TEST_F(BackupEngineTest, ShareTableFilesWithChecksumsTransition) { + const int keys_iteration = 5000; + // set share_files_with_checksum to false + OpenDBAndBackupEngine(true, false, kShareNoChecksum); + for (int i = 0; i < 5; ++i) { + FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1)); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true)); + } + CloseDBAndBackupEngine(); + + for (int i = 0; i < 5; ++i) { + AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1), + keys_iteration * 6); + } + + // set share_files_with_checksum to true and do some more backups + OpenDBAndBackupEngine(false /* destroy_old_data */, false, + kShareWithChecksum); + for (int i = 5; i < 10; ++i) { + FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1)); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true)); + } + CloseDBAndBackupEngine(); + + // Verify first 
(about to delete) + AssertBackupConsistency(1, 0, keys_iteration, keys_iteration * 11); + + // For an extra challenge, make sure that GarbageCollect / DeleteBackup + // is OK even if we open without share_table_files + OpenDBAndBackupEngine(false /* destroy_old_data */, false, kNoShare); + ASSERT_OK(backup_engine_->DeleteBackup(1)); + ASSERT_OK(backup_engine_->GarbageCollect()); + CloseDBAndBackupEngine(); + + // Verify rest (not deleted) + for (int i = 1; i < 10; ++i) { + AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1), + keys_iteration * 11); + } +} + +// Verify backup and restore with various naming options, check names +TEST_F(BackupEngineTest, ShareTableFilesWithChecksumsNewNaming) { + ASSERT_TRUE(engine_options_->share_files_with_checksum_naming == + kNamingDefault); + + const int keys_iteration = 5000; + + OpenDBAndBackupEngine(true, false, kShareWithChecksum); + FillDB(db_.get(), 0, keys_iteration); + CloseDBAndBackupEngine(); + + static const std::map option_to_expected = { + {kLegacyCrc32cAndFileSize, "[0-9]+_[0-9]+_[0-9]+[.]sst"}, + // kFlagIncludeFileSize redundant here + {kLegacyCrc32cAndFileSize | kFlagIncludeFileSize, + "[0-9]+_[0-9]+_[0-9]+[.]sst"}, + {kUseDbSessionId, "[0-9]+_s[0-9A-Z]{20}[.]sst"}, + {kUseDbSessionId | kFlagIncludeFileSize, + "[0-9]+_s[0-9A-Z]{20}_[0-9]+[.]sst"}, + }; + + const TestRegex blobfile_pattern = "[0-9]+_[0-9]+_[0-9]+[.]blob"; + + for (const auto& pair : option_to_expected) { + CloseAndReopenDB(); + engine_options_->share_files_with_checksum_naming = pair.first; + OpenBackupEngine(true /*destroy_old_data*/); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get())); + CloseDBAndBackupEngine(); + AssertBackupConsistency(1, 0, keys_iteration, keys_iteration * 2); + AssertDirectoryFilesMatchRegex(backupdir_ + "/shared_checksum", pair.second, + ".sst", 1 /* minimum_count */); + if (std::string::npos != pair.second.GetPattern().find("_[0-9]+[.]sst")) { + AssertDirectoryFilesSizeIndicators(backupdir_ + "/shared_checksum", + 1 /* minimum_count */); + } + + AssertDirectoryFilesMatchRegex(backupdir_ + "/shared_checksum", + blobfile_pattern, ".blob", + 1 /* minimum_count */); + } +} + +// Mimic SST file generated by pre-6.12 releases and verify that +// old names are always used regardless of naming option. +TEST_F(BackupEngineTest, ShareTableFilesWithChecksumsOldFileNaming) { + const int keys_iteration = 5000; + + // Pre-6.12 release did not include db id and db session id properties. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "PropertyBlockBuilder::AddTableProperty:Start", [&](void* props_vs) { + auto props = static_cast(props_vs); + props->db_id = ""; + props->db_session_id = ""; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Corrupting the table properties corrupts the unique id. + // Ignore the unique id recorded in the manifest. 
+ options_.verify_sst_unique_id_in_manifest = false; + + OpenDBAndBackupEngine(true, false, kShareWithChecksum); + FillDB(db_.get(), 0, keys_iteration); + CloseDBAndBackupEngine(); + + // Old names should always be used on old files + const TestRegex sstfile_pattern("[0-9]+_[0-9]+_[0-9]+[.]sst"); + + const TestRegex blobfile_pattern = "[0-9]+_[0-9]+_[0-9]+[.]blob"; + + for (ShareFilesNaming option : {kNamingDefault, kUseDbSessionId}) { + CloseAndReopenDB(); + engine_options_->share_files_with_checksum_naming = option; + OpenBackupEngine(true /*destroy_old_data*/); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get())); + CloseDBAndBackupEngine(); + AssertBackupConsistency(1, 0, keys_iteration, keys_iteration * 2); + AssertDirectoryFilesMatchRegex(backupdir_ + "/shared_checksum", + sstfile_pattern, ".sst", + 1 /* minimum_count */); + AssertDirectoryFilesMatchRegex(backupdir_ + "/shared_checksum", + blobfile_pattern, ".blob", + 1 /* minimum_count */); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +// Test how naming options interact with detecting DB corruption +// between incremental backups +TEST_F(BackupEngineTest, TableFileCorruptionBeforeIncremental) { + const auto share_no_checksum = static_cast(0); + + for (bool corrupt_before_first_backup : {false, true}) { + for (ShareFilesNaming option : + {share_no_checksum, kLegacyCrc32cAndFileSize, kNamingDefault}) { + auto share = + option == share_no_checksum ? kShareNoChecksum : kShareWithChecksum; + if (option != share_no_checksum) { + engine_options_->share_files_with_checksum_naming = option; + } + OpenDBAndBackupEngine(true, false, share); + DBImpl* dbi = static_cast(db_.get()); + // A small SST file + ASSERT_OK(dbi->Put(WriteOptions(), "x", "y")); + ASSERT_OK(dbi->Flush(FlushOptions())); + // And a bigger one + ASSERT_OK(dbi->Put(WriteOptions(), "y", Random(42).RandomString(500))); + ASSERT_OK(dbi->Flush(FlushOptions())); + ASSERT_OK(dbi->TEST_WaitForFlushMemTable()); + CloseAndReopenDB(/*read_only*/ true); + + std::vector table_files; + ASSERT_OK(GetDataFilesInDB(kTableFile, &table_files)); + ASSERT_EQ(table_files.size(), 2); + std::string tf0 = dbname_ + "/" + table_files[0].name; + std::string tf1 = dbname_ + "/" + table_files[1].name; + + CloseDBAndBackupEngine(); + + if (corrupt_before_first_backup) { + // This corrupts a data block, which does not cause DB open + // failure, only failure on accessing the block. + ASSERT_OK(db_file_manager_->CorruptFileStart(tf0)); + } + + OpenDBAndBackupEngine(false, false, share); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get())); + CloseDBAndBackupEngine(); + + // if corrupt_before_first_backup, this undoes the initial corruption + ASSERT_OK(db_file_manager_->CorruptFileStart(tf0)); + + OpenDBAndBackupEngine(false, false, share); + Status s = backup_engine_->CreateNewBackup(db_.get()); + + // Even though none of the naming options catch the inconsistency + // between the first and second time backing up fname, in the case + // of kUseDbSessionId (kNamingDefault), this is an intentional + // trade-off to avoid full scan of files from the DB that are + // already backed up. If we did the scan, kUseDbSessionId could catch + // the corruption. kLegacyCrc32cAndFileSize does the scan (to + // compute checksum for name) without catching the corruption, + // because the corruption means the names don't merge. 
+ EXPECT_OK(s); + + // VerifyBackup doesn't check DB integrity or table file internal + // checksums + EXPECT_OK(backup_engine_->VerifyBackup(1, true)); + EXPECT_OK(backup_engine_->VerifyBackup(2, true)); + + db_.reset(); + ASSERT_OK(backup_engine_->RestoreDBFromBackup(2, dbname_, dbname_)); + { + DB* db = OpenDB(); + s = db->VerifyChecksum(); + delete db; + } + if (option != kLegacyCrc32cAndFileSize && !corrupt_before_first_backup) { + // Second backup is OK because it used (uncorrupt) file from first + // backup instead of (corrupt) file from DB. + // This is arguably a good trade-off vs. treating the file as distinct + // from the old version, because a file should be more likely to be + // corrupt as it ages. Although the backed-up file might also corrupt + // with age, the alternative approach (checksum in file name computed + // from current DB file contents) wouldn't detect that case at backup + // time either. Although you would have both copies of the file with + // the alternative approach, that would only last until the older + // backup is deleted. + ASSERT_OK(s); + } else if (option == kLegacyCrc32cAndFileSize && + corrupt_before_first_backup) { + // Second backup is OK because it saved the updated (uncorrupt) + // file from DB, instead of the sharing with first backup. + // Recall: if corrupt_before_first_backup, [second CorruptFileStart] + // undoes the initial corruption. + // This is arguably a bad trade-off vs. sharing the old version of the + // file because a file should be more likely to corrupt as it ages. + // (Not likely that the previously backed-up version was already + // corrupt and the new version is non-corrupt. This approach doesn't + // help if backed-up version is corrupted after taking the backup.) + ASSERT_OK(s); + } else { + // Something is legitimately corrupted, but we can't be sure what + // with information available (TODO? unless one passes block checksum + // test and other doesn't. Probably better to use end-to-end full file + // checksum anyway.) + ASSERT_TRUE(s.IsCorruption()); + } + + CloseDBAndBackupEngine(); + DestroyDBWithoutCheck(dbname_, options_); + } + } +} + +// Test how naming options interact with detecting file size corruption +// between incremental backups +TEST_F(BackupEngineTest, FileSizeForIncremental) { + const auto share_no_checksum = static_cast(0); + // TODO: enable blob files once Integrated BlobDB supports DB session id. + options_.enable_blob_files = false; + + for (ShareFilesNaming option : {share_no_checksum, kLegacyCrc32cAndFileSize, + kNamingDefault, kUseDbSessionId}) { + auto share = + option == share_no_checksum ? kShareNoChecksum : kShareWithChecksum; + if (option != share_no_checksum) { + engine_options_->share_files_with_checksum_naming = option; + } + OpenDBAndBackupEngine(true, false, share); + + std::vector children; + const std::string shared_dir = + backupdir_ + + (option == share_no_checksum ? "/shared" : "/shared_checksum"); + + // A single small SST file + ASSERT_OK(db_->Put(WriteOptions(), "x", "y")); + + // First, test that we always detect file size corruption on the shared + // backup side on incremental. (Since sizes aren't really part of backup + // meta file, this works by querying the filesystem for the sizes.) 
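+    // (The corruption injected below overwrites the shared backup SST with a
+    // tiny file, so the size reported by the backup filesystem can no longer
+    // match the size expected for that shared name, which the next
+    // incremental backup is expected to notice.)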
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true /*flush*/)); + CloseDBAndBackupEngine(); + + // Corrupt backup SST file + ASSERT_OK(file_manager_->GetChildrenFileAttributes(shared_dir, &children)); + ASSERT_EQ(children.size(), 1U); // one sst + for (const auto& child : children) { + if (child.name.size() > 4 && child.size_bytes > 0) { + ASSERT_OK( + file_manager_->WriteToFile(shared_dir + "/" + child.name, "asdf")); + break; + } + } + + OpenDBAndBackupEngine(false, false, share); + Status s = backup_engine_->CreateNewBackup(db_.get()); + EXPECT_TRUE(s.IsCorruption()); + + ASSERT_OK(backup_engine_->PurgeOldBackups(0)); + CloseDBAndBackupEngine(); + + // Second, test that a hypothetical db session id collision would likely + // not suffice to corrupt a backup, because there's a good chance of + // file size difference (in this test, guaranteed) so either no name + // collision or detected collision. + + // Create backup 1 + OpenDBAndBackupEngine(false, false, share); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get())); + + // Even though we have "the same" DB state as backup 1, we need + // to restore to recreate the same conditions as later restore. + db_.reset(); + DestroyDBWithoutCheck(dbname_, options_); + ASSERT_OK(backup_engine_->RestoreDBFromBackup(1, dbname_, dbname_)); + CloseDBAndBackupEngine(); + + // Forge session id + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::SetDbSessionId", [](void* sid_void_star) { + std::string* sid = static_cast(sid_void_star); + *sid = "01234567890123456789"; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Create another SST file + OpenDBAndBackupEngine(false, false, share); + ASSERT_OK(db_->Put(WriteOptions(), "y", "x")); + + // Create backup 2 + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true /*flush*/)); + + // Restore backup 1 (again) + db_.reset(); + DestroyDBWithoutCheck(dbname_, options_); + ASSERT_OK(backup_engine_->RestoreDBFromBackup(1, dbname_, dbname_)); + CloseDBAndBackupEngine(); + + // Create another SST file with same number and db session id, only bigger + OpenDBAndBackupEngine(false, false, share); + ASSERT_OK(db_->Put(WriteOptions(), "y", Random(42).RandomString(500))); + + // Count backup SSTs files. + children.clear(); + ASSERT_OK(file_manager_->GetChildrenFileAttributes(shared_dir, &children)); + ASSERT_EQ(children.size(), 2U); // two sst files + + // Try create backup 3 + s = backup_engine_->CreateNewBackup(db_.get(), true /*flush*/); + + // Re-count backup SSTs + children.clear(); + ASSERT_OK(file_manager_->GetChildrenFileAttributes(shared_dir, &children)); + + if (option == kUseDbSessionId) { + // Acceptable to call it corruption if size is not in name and + // db session id collision is practically impossible. + EXPECT_TRUE(s.IsCorruption()); + EXPECT_EQ(children.size(), 2U); // no SST file added + } else if (option == share_no_checksum) { + // Good to call it corruption if both backups cannot be + // accommodated. + EXPECT_TRUE(s.IsCorruption()); + EXPECT_EQ(children.size(), 2U); // no SST file added + } else { + // Since opening a DB seems sufficient for detecting size corruption + // on the DB side, this should be a good thing, ... + EXPECT_OK(s); + // ... as long as we did actually treat it as a distinct SST file. 
+ EXPECT_EQ(children.size(), 3U); // Another SST added + } + CloseDBAndBackupEngine(); + DestroyDBWithoutCheck(dbname_, options_); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + } +} + +// Verify backup and restore with share_files_with_checksum off and then +// transition this option to on and share_files_with_checksum_naming to be +// based on kUseDbSessionId +TEST_F(BackupEngineTest, ShareTableFilesWithChecksumsNewNamingTransition) { + const int keys_iteration = 5000; + // We may set share_files_with_checksum_naming to kLegacyCrc32cAndFileSize + // here but even if we don't, it should have no effect when + // share_files_with_checksum is false + ASSERT_TRUE(engine_options_->share_files_with_checksum_naming == + kNamingDefault); + // set share_files_with_checksum to false + OpenDBAndBackupEngine(true, false, kShareNoChecksum); + int j = 3; + for (int i = 0; i < j; ++i) { + FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1)); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true)); + } + CloseDBAndBackupEngine(); + + for (int i = 0; i < j; ++i) { + AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1), + keys_iteration * (j + 1)); + } + + // set share_files_with_checksum to true and do some more backups + // and use session id in the name of SST file backup + ASSERT_TRUE(engine_options_->share_files_with_checksum_naming == + kNamingDefault); + OpenDBAndBackupEngine(false /* destroy_old_data */, false, + kShareWithChecksum); + FillDB(db_.get(), keys_iteration * j, keys_iteration * (j + 1)); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true)); + CloseDBAndBackupEngine(); + // Use checksum in the name as well + ++j; + options_.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory(); + OpenDBAndBackupEngine(false /* destroy_old_data */, false, + kShareWithChecksum); + FillDB(db_.get(), keys_iteration * j, keys_iteration * (j + 1)); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true)); + CloseDBAndBackupEngine(); + + // Verify first (about to delete) + AssertBackupConsistency(1, 0, keys_iteration, keys_iteration * (j + 1)); + + // For an extra challenge, make sure that GarbageCollect / DeleteBackup + // is OK even if we open without share_table_files but with + // share_files_with_checksum_naming based on kUseDbSessionId + ASSERT_TRUE(engine_options_->share_files_with_checksum_naming == + kNamingDefault); + OpenDBAndBackupEngine(false /* destroy_old_data */, false, kNoShare); + ASSERT_OK(backup_engine_->DeleteBackup(1)); + ASSERT_OK(backup_engine_->GarbageCollect()); + CloseDBAndBackupEngine(); + + // Verify second (about to delete) + AssertBackupConsistency(2, 0, keys_iteration * 2, keys_iteration * (j + 1)); + + // Use checksum and file size for backup table file names and open without + // share_table_files + // Again, make sure that GarbageCollect / DeleteBackup is OK + engine_options_->share_files_with_checksum_naming = kLegacyCrc32cAndFileSize; + OpenDBAndBackupEngine(false /* destroy_old_data */, false, kNoShare); + ASSERT_OK(backup_engine_->DeleteBackup(2)); + ASSERT_OK(backup_engine_->GarbageCollect()); + CloseDBAndBackupEngine(); + + // Verify rest (not deleted) + for (int i = 2; i < j; ++i) { + AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1), + keys_iteration * (j + 1)); + } +} + +// Verify backup and restore with share_files_with_checksum on and transition +// from kLegacyCrc32cAndFileSize to kUseDbSessionId +TEST_F(BackupEngineTest, 
ShareTableFilesWithChecksumsNewNamingUpgrade) { + engine_options_->share_files_with_checksum_naming = kLegacyCrc32cAndFileSize; + const int keys_iteration = 5000; + // set share_files_with_checksum to true + OpenDBAndBackupEngine(true, false, kShareWithChecksum); + int j = 3; + for (int i = 0; i < j; ++i) { + FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1)); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true)); + } + CloseDBAndBackupEngine(); + + for (int i = 0; i < j; ++i) { + AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1), + keys_iteration * (j + 1)); + } + + engine_options_->share_files_with_checksum_naming = kUseDbSessionId; + OpenDBAndBackupEngine(false /* destroy_old_data */, false, + kShareWithChecksum); + FillDB(db_.get(), keys_iteration * j, keys_iteration * (j + 1)); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true)); + CloseDBAndBackupEngine(); + + ++j; + options_.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory(); + OpenDBAndBackupEngine(false /* destroy_old_data */, false, + kShareWithChecksum); + FillDB(db_.get(), keys_iteration * j, keys_iteration * (j + 1)); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true)); + CloseDBAndBackupEngine(); + + // Verify first (about to delete) + AssertBackupConsistency(1, 0, keys_iteration, keys_iteration * (j + 1)); + + // For an extra challenge, make sure that GarbageCollect / DeleteBackup + // is OK even if we open without share_table_files + OpenDBAndBackupEngine(false /* destroy_old_data */, false, kNoShare); + ASSERT_OK(backup_engine_->DeleteBackup(1)); + ASSERT_OK(backup_engine_->GarbageCollect()); + CloseDBAndBackupEngine(); + + // Verify second (about to delete) + AssertBackupConsistency(2, 0, keys_iteration * 2, keys_iteration * (j + 1)); + + // Use checksum and file size for backup table file names and open without + // share_table_files + // Again, make sure that GarbageCollect / DeleteBackup is OK + engine_options_->share_files_with_checksum_naming = kLegacyCrc32cAndFileSize; + OpenDBAndBackupEngine(false /* destroy_old_data */, false, kNoShare); + ASSERT_OK(backup_engine_->DeleteBackup(2)); + ASSERT_OK(backup_engine_->GarbageCollect()); + CloseDBAndBackupEngine(); + + // Verify rest (not deleted) + for (int i = 2; i < j; ++i) { + AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1), + keys_iteration * (j + 1)); + } +} + +// This test simulates cleaning up after aborted or incomplete creation +// of a new backup. +TEST_F(BackupEngineTest, DeleteTmpFiles) { + for (int cleanup_fn : {1, 2, 3, 4}) { + for (ShareOption shared_option : kAllShareOptions) { + OpenDBAndBackupEngine(false /* destroy_old_data */, false /* dummy */, + shared_option); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get())); + BackupID next_id = 1; + BackupID oldest_id = std::numeric_limits::max(); + { + std::vector backup_info; + backup_engine_->GetBackupInfo(&backup_info); + for (const auto& bi : backup_info) { + next_id = std::max(next_id, bi.backup_id + 1); + oldest_id = std::min(oldest_id, bi.backup_id); + } + } + CloseDBAndBackupEngine(); + + // An aborted or incomplete new backup will always be in the next + // id (maybe more) + std::string next_private = "private/" + std::to_string(next_id); + + // NOTE: both shared and shared_checksum should be cleaned up + // regardless of how the backup engine is opened. 
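+      // Plant leftover .tmp files and a stray "next private" directory, then
+      // check that each cleanup path exercised below -- GarbageCollect,
+      // DeleteBackup, PurgeOldBackups, or a fresh CreateNewBackup -- removes
+      // them.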
+ std::vector tmp_files_and_dirs; + for (const auto& dir_and_file : { + std::make_pair(std::string("shared"), + std::string(".00006.sst.tmp")), + std::make_pair(std::string("shared_checksum"), + std::string(".00007.sst.tmp")), + std::make_pair(next_private, std::string("00003.sst")), + }) { + std::string dir = backupdir_ + "/" + dir_and_file.first; + ASSERT_OK(file_manager_->CreateDirIfMissing(dir)); + ASSERT_OK(file_manager_->FileExists(dir)); + + std::string file = dir + "/" + dir_and_file.second; + ASSERT_OK(file_manager_->WriteToFile(file, "tmp")); + ASSERT_OK(file_manager_->FileExists(file)); + + tmp_files_and_dirs.push_back(file); + } + if (cleanup_fn != /*CreateNewBackup*/ 4) { + // This exists after CreateNewBackup because it's deleted then + // re-created. + tmp_files_and_dirs.push_back(backupdir_ + "/" + next_private); + } + + OpenDBAndBackupEngine(false /* destroy_old_data */, false /* dummy */, + shared_option); + // Need to call one of these explicitly to delete tmp files + switch (cleanup_fn) { + case 1: + ASSERT_OK(backup_engine_->GarbageCollect()); + break; + case 2: + ASSERT_OK(backup_engine_->DeleteBackup(oldest_id)); + break; + case 3: + ASSERT_OK(backup_engine_->PurgeOldBackups(1)); + break; + case 4: + // Does a garbage collect if it sees that next private dir exists + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get())); + break; + default: + assert(false); + } + CloseDBAndBackupEngine(); + for (std::string file_or_dir : tmp_files_and_dirs) { + if (file_manager_->FileExists(file_or_dir) != Status::NotFound()) { + FAIL() << file_or_dir << " was expected to be deleted." << cleanup_fn; + } + } + } + } +} + +TEST_F(BackupEngineTest, KeepLogFiles) { + engine_options_->backup_log_files = false; + // basically infinite + options_.WAL_ttl_seconds = 24 * 60 * 60; + OpenDBAndBackupEngine(true); + FillDB(db_.get(), 0, 100, kFlushAll); + FillDB(db_.get(), 100, 200, kFlushAll); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), false)); + FillDB(db_.get(), 200, 300, kFlushAll); + FillDB(db_.get(), 300, 400, kFlushAll); + FillDB(db_.get(), 400, 500, kFlushAll); + CloseDBAndBackupEngine(); + + // all data should be there if we call with keep_log_files = true + AssertBackupConsistency(0, 0, 500, 600, true); +} + +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +class BackupEngineRateLimitingTestWithParam + : public BackupEngineTest, + public testing::WithParamInterface< + std::tuple /* limits */>> { + public: + BackupEngineRateLimitingTestWithParam() {} +}; + +uint64_t const MB = 1024 * 1024; + +INSTANTIATE_TEST_CASE_P( + RateLimiting, BackupEngineRateLimitingTestWithParam, + ::testing::Values(std::make_tuple(false, 0, std::make_pair(1 * MB, 5 * MB)), + std::make_tuple(false, 0, std::make_pair(2 * MB, 3 * MB)), + std::make_tuple(false, 1, std::make_pair(1 * MB, 5 * MB)), + std::make_tuple(false, 1, std::make_pair(2 * MB, 3 * MB)), + std::make_tuple(true, 0, std::make_pair(1 * MB, 5 * MB)), + std::make_tuple(true, 0, std::make_pair(2 * MB, 3 * MB)), + std::make_tuple(true, 1, std::make_pair(1 * MB, 5 * MB)), + std::make_tuple(true, 1, + std::make_pair(2 * MB, 3 * MB)))); + +TEST_P(BackupEngineRateLimitingTestWithParam, RateLimiting) { + size_t const kMicrosPerSec = 1000 * 1000LL; + const bool custom_rate_limiter = std::get<0>(GetParam()); + // iter 0 -- single threaded + // iter 1 -- multi threaded + const int iter = std::get<1>(GetParam()); + const std::pair limit = std::get<2>(GetParam()); + std::unique_ptr special_env( + new 
SpecialEnv(db_chroot_env_.get(), /*time_elapse_only_sleep*/ true)); + // destroy old data + Options options; + options.env = special_env.get(); + DestroyDBWithoutCheck(dbname_, options); + + if (custom_rate_limiter) { + std::shared_ptr backup_rate_limiter = + std::make_shared( + limit.first, 100 * 1000 /* refill_period_us */, 10 /* fairness */, + RateLimiter::Mode::kWritesOnly /* mode */, + special_env->GetSystemClock(), false /* auto_tuned */); + std::shared_ptr restore_rate_limiter = + std::make_shared( + limit.second, 100 * 1000 /* refill_period_us */, 10 /* fairness */, + RateLimiter::Mode::kWritesOnly /* mode */, + special_env->GetSystemClock(), false /* auto_tuned */); + engine_options_->backup_rate_limiter = backup_rate_limiter; + engine_options_->restore_rate_limiter = restore_rate_limiter; + } else { + engine_options_->backup_rate_limit = limit.first; + engine_options_->restore_rate_limit = limit.second; + } + + engine_options_->max_background_operations = (iter == 0) ? 1 : 10; + options_.compression = kNoCompression; + + // Rate limiter uses `CondVar::TimedWait()`, which does not have access to the + // `Env` to advance its time according to the fake wait duration. The + // workaround is to install a callback that advance the `Env`'s mock time. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "GenericRateLimiter::Request:PostTimedWait", [&](void* arg) { + int64_t time_waited_us = *static_cast(arg); + special_env->SleepForMicroseconds(static_cast(time_waited_us)); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + OpenDBAndBackupEngine(true); + TEST_SetDefaultRateLimitersClock(backup_engine_.get(), + special_env->GetSystemClock()); + + size_t bytes_written = FillDB(db_.get(), 0, 10000); + + auto start_backup = special_env->NowMicros(); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), false)); + auto backup_time = special_env->NowMicros() - start_backup; + CloseDBAndBackupEngine(); + auto rate_limited_backup_time = (bytes_written * kMicrosPerSec) / limit.first; + ASSERT_GT(backup_time, 0.8 * rate_limited_backup_time); + + OpenBackupEngine(); + TEST_SetDefaultRateLimitersClock( + backup_engine_.get(), + special_env->GetSystemClock() /* backup_rate_limiter_clock */, + special_env->GetSystemClock() /* restore_rate_limiter_clock */); + + auto start_restore = special_env->NowMicros(); + ASSERT_OK(backup_engine_->RestoreDBFromLatestBackup(dbname_, dbname_)); + auto restore_time = special_env->NowMicros() - start_restore; + CloseBackupEngine(); + auto rate_limited_restore_time = + (bytes_written * kMicrosPerSec) / limit.second; + ASSERT_GT(restore_time, 0.8 * rate_limited_restore_time); + + AssertBackupConsistency(0, 0, 10000, 10100); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "GenericRateLimiter::Request:PostTimedWait"); +} + +TEST_P(BackupEngineRateLimitingTestWithParam, RateLimitingVerifyBackup) { + const std::size_t kMicrosPerSec = 1000 * 1000LL; + const bool custom_rate_limiter = std::get<0>(GetParam()); + const std::uint64_t backup_rate_limiter_limit = std::get<2>(GetParam()).first; + const bool is_single_threaded = std::get<1>(GetParam()) == 0 ? 
true : false; + std::unique_ptr special_env( + new SpecialEnv(db_chroot_env_.get(), /*time_elapse_only_sleep*/ true)); + + if (custom_rate_limiter) { + std::shared_ptr backup_rate_limiter = + std::make_shared( + backup_rate_limiter_limit, 100 * 1000 /* refill_period_us */, + 10 /* fairness */, RateLimiter::Mode::kAllIo /* mode */, + special_env->GetSystemClock(), false /* auto_tuned */); + engine_options_->backup_rate_limiter = backup_rate_limiter; + } else { + engine_options_->backup_rate_limit = backup_rate_limiter_limit; + } + + engine_options_->max_background_operations = is_single_threaded ? 1 : 10; + + Options options; + options.env = special_env.get(); + DestroyDBWithoutCheck(dbname_, options); + // Rate limiter uses `CondVar::TimedWait()`, which does not have access to the + // `Env` to advance its time according to the fake wait duration. The + // workaround is to install a callback that advance the `Env`'s mock time. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "GenericRateLimiter::Request:PostTimedWait", [&](void* arg) { + int64_t time_waited_us = *static_cast(arg); + special_env->SleepForMicroseconds(static_cast(time_waited_us)); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + OpenDBAndBackupEngine(true /* destroy_old_data */); + TEST_SetDefaultRateLimitersClock(backup_engine_.get(), + special_env->GetSystemClock(), nullptr); + FillDB(db_.get(), 0, 10000); + + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), + false /* flush_before_backup */)); + + std::vector backup_infos; + BackupInfo backup_info; + backup_engine_->GetBackupInfo(&backup_infos); + ASSERT_EQ(1, backup_infos.size()); + const int backup_id = 1; + ASSERT_EQ(backup_id, backup_infos[0].backup_id); + ASSERT_OK(backup_engine_->GetBackupInfo(backup_id, &backup_info, + true /* include_file_details */)); + + std::uint64_t bytes_read_during_verify_backup = 0; + for (BackupFileInfo backup_file_info : backup_info.file_details) { + bytes_read_during_verify_backup += backup_file_info.size; + } + auto start_verify_backup = special_env->NowMicros(); + ASSERT_OK( + backup_engine_->VerifyBackup(backup_id, true /* verify_with_checksum */)); + auto verify_backup_time = special_env->NowMicros() - start_verify_backup; + auto rate_limited_verify_backup_time = + (bytes_read_during_verify_backup * kMicrosPerSec) / + backup_rate_limiter_limit; + if (custom_rate_limiter) { + EXPECT_GE(verify_backup_time, 0.8 * rate_limited_verify_backup_time); + } + + CloseDBAndBackupEngine(); + AssertBackupConsistency(backup_id, 0, 10000, 10010); + DestroyDBWithoutCheck(dbname_, options); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "GenericRateLimiter::Request:PostTimedWait"); +} + +TEST_P(BackupEngineRateLimitingTestWithParam, RateLimitingChargeReadInBackup) { + bool is_single_threaded = std::get<1>(GetParam()) == 0 ? true : false; + engine_options_->max_background_operations = is_single_threaded ? 
1 : 10; + + const std::uint64_t backup_rate_limiter_limit = std::get<2>(GetParam()).first; + std::shared_ptr backup_rate_limiter(NewGenericRateLimiter( + backup_rate_limiter_limit, 100 * 1000 /* refill_period_us */, + 10 /* fairness */, RateLimiter::Mode::kWritesOnly /* mode */)); + engine_options_->backup_rate_limiter = backup_rate_limiter; + + DestroyDBWithoutCheck(dbname_, Options()); + OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */, + kShareWithChecksum /* shared_option */); + FillDB(db_.get(), 0, 10); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), + false /* flush_before_backup */)); + std::int64_t total_bytes_through_with_no_read_charged = + backup_rate_limiter->GetTotalBytesThrough(); + CloseBackupEngine(); + + backup_rate_limiter.reset(NewGenericRateLimiter( + backup_rate_limiter_limit, 100 * 1000 /* refill_period_us */, + 10 /* fairness */, RateLimiter::Mode::kAllIo /* mode */)); + engine_options_->backup_rate_limiter = backup_rate_limiter; + + OpenBackupEngine(true); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), + false /* flush_before_backup */)); + std::int64_t total_bytes_through_with_read_charged = + backup_rate_limiter->GetTotalBytesThrough(); + EXPECT_GT(total_bytes_through_with_read_charged, + total_bytes_through_with_no_read_charged); + CloseDBAndBackupEngine(); + AssertBackupConsistency(1, 0, 10, 20); + DestroyDBWithoutCheck(dbname_, Options()); +} + +TEST_P(BackupEngineRateLimitingTestWithParam, RateLimitingChargeReadInRestore) { + bool is_single_threaded = std::get<1>(GetParam()) == 0 ? true : false; + engine_options_->max_background_operations = is_single_threaded ? 1 : 10; + + const std::uint64_t restore_rate_limiter_limit = + std::get<2>(GetParam()).second; + std::shared_ptr restore_rate_limiter(NewGenericRateLimiter( + restore_rate_limiter_limit, 100 * 1000 /* refill_period_us */, + 10 /* fairness */, RateLimiter::Mode::kWritesOnly /* mode */)); + engine_options_->restore_rate_limiter = restore_rate_limiter; + + DestroyDBWithoutCheck(dbname_, Options()); + OpenDBAndBackupEngine(true /* destroy_old_data */); + FillDB(db_.get(), 0, 10); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), + false /* flush_before_backup */)); + CloseDBAndBackupEngine(); + DestroyDBWithoutCheck(dbname_, Options()); + + OpenBackupEngine(false /* destroy_old_data */); + ASSERT_OK(backup_engine_->RestoreDBFromLatestBackup(dbname_, dbname_)); + std::int64_t total_bytes_through_with_no_read_charged = + restore_rate_limiter->GetTotalBytesThrough(); + CloseBackupEngine(); + DestroyDBWithoutCheck(dbname_, Options()); + + restore_rate_limiter.reset(NewGenericRateLimiter( + restore_rate_limiter_limit, 100 * 1000 /* refill_period_us */, + 10 /* fairness */, RateLimiter::Mode::kAllIo /* mode */)); + engine_options_->restore_rate_limiter = restore_rate_limiter; + + OpenBackupEngine(false /* destroy_old_data */); + ASSERT_OK(backup_engine_->RestoreDBFromLatestBackup(dbname_, dbname_)); + std::int64_t total_bytes_through_with_read_charged = + restore_rate_limiter->GetTotalBytesThrough(); + EXPECT_EQ(total_bytes_through_with_read_charged, + total_bytes_through_with_no_read_charged * 2); + CloseBackupEngine(); + AssertBackupConsistency(1, 0, 10, 20); + DestroyDBWithoutCheck(dbname_, Options()); +} + +TEST_P(BackupEngineRateLimitingTestWithParam, + RateLimitingChargeReadInInitialize) { + bool is_single_threaded = std::get<1>(GetParam()) == 0 ? true : false; + engine_options_->max_background_operations = is_single_threaded ? 
1 : 10; + + const std::uint64_t backup_rate_limiter_limit = std::get<2>(GetParam()).first; + std::shared_ptr backup_rate_limiter(NewGenericRateLimiter( + backup_rate_limiter_limit, 100 * 1000 /* refill_period_us */, + 10 /* fairness */, RateLimiter::Mode::kAllIo /* mode */)); + engine_options_->backup_rate_limiter = backup_rate_limiter; + + DestroyDBWithoutCheck(dbname_, Options()); + OpenDBAndBackupEngine(true /* destroy_old_data */); + FillDB(db_.get(), 0, 10); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), + false /* flush_before_backup */)); + CloseDBAndBackupEngine(); + AssertBackupConsistency(1, 0, 10, 20); + + std::int64_t total_bytes_through_before_initialize = + engine_options_->backup_rate_limiter->GetTotalBytesThrough(); + OpenDBAndBackupEngine(false /* destroy_old_data */); + // We charge read in BackupEngineImpl::BackupMeta::LoadFromFile, + // which is called in BackupEngineImpl::Initialize() during + // OpenBackupEngine(false) + EXPECT_GT(engine_options_->backup_rate_limiter->GetTotalBytesThrough(), + total_bytes_through_before_initialize); + CloseDBAndBackupEngine(); + DestroyDBWithoutCheck(dbname_, Options()); +} + +class BackupEngineRateLimitingTestWithParam2 + : public BackupEngineTest, + public testing::WithParamInterface< + std::tuple /* limits */>> { + public: + BackupEngineRateLimitingTestWithParam2() {} +}; + +INSTANTIATE_TEST_CASE_P( + LowRefillBytesPerPeriod, BackupEngineRateLimitingTestWithParam2, + ::testing::Values(std::make_tuple(std::make_pair(1, 1)))); +// To verify we don't request over-sized bytes relative to +// refill_bytes_per_period_ in each RateLimiter::Request() called in +// BackupEngine through verifying we don't trigger assertion +// failure on over-sized request in GenericRateLimiter in debug builds +TEST_P(BackupEngineRateLimitingTestWithParam2, + RateLimitingWithLowRefillBytesPerPeriod) { + SpecialEnv special_env(Env::Default(), /*time_elapse_only_sleep*/ true); + + engine_options_->max_background_operations = 1; + const uint64_t backup_rate_limiter_limit = std::get<0>(GetParam()).first; + std::shared_ptr backup_rate_limiter( + std::make_shared( + backup_rate_limiter_limit, 1000 * 1000 /* refill_period_us */, + 10 /* fairness */, RateLimiter::Mode::kAllIo /* mode */, + special_env.GetSystemClock(), false /* auto_tuned */)); + + engine_options_->backup_rate_limiter = backup_rate_limiter; + + const uint64_t restore_rate_limiter_limit = std::get<0>(GetParam()).second; + std::shared_ptr restore_rate_limiter( + std::make_shared( + restore_rate_limiter_limit, 1000 * 1000 /* refill_period_us */, + 10 /* fairness */, RateLimiter::Mode::kAllIo /* mode */, + special_env.GetSystemClock(), false /* auto_tuned */)); + + engine_options_->restore_rate_limiter = restore_rate_limiter; + + // Rate limiter uses `CondVar::TimedWait()`, which does not have access to the + // `Env` to advance its time according to the fake wait duration. The + // workaround is to install a callback that advance the `Env`'s mock time. 
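+  // (The "GenericRateLimiter::Request:PostTimedWait" sync point passes the
+  // simulated wait in microseconds as an int64_t*, and the callback simply
+  // advances SpecialEnv's mock clock by that amount, so elapsed-time
+  // measurements reflect the rate limiting without any real sleeping. The
+  // earlier rate-limiting tests in this file use the same workaround.)
+  // For reference, a production setup would typically just pick one of:
+  //   engine_options.backup_rate_limit = 4 << 20;  // bytes per second
+  //   engine_options.backup_rate_limiter.reset(NewGenericRateLimiter(4 << 20));
+  // (Rough sketch only; "engine_options" stands for whatever
+  // BackupEngineOptions instance is passed to BackupEngine::Open.)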
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "GenericRateLimiter::Request:PostTimedWait", [&](void* arg) { + int64_t time_waited_us = *static_cast(arg); + special_env.SleepForMicroseconds(static_cast(time_waited_us)); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + DestroyDBWithoutCheck(dbname_, Options()); + OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */, + kShareWithChecksum /* shared_option */); + + FillDB(db_.get(), 0, 100); + int64_t total_bytes_through_before_backup = + engine_options_->backup_rate_limiter->GetTotalBytesThrough(); + EXPECT_OK(backup_engine_->CreateNewBackup(db_.get(), + false /* flush_before_backup */)); + int64_t total_bytes_through_after_backup = + engine_options_->backup_rate_limiter->GetTotalBytesThrough(); + ASSERT_GT(total_bytes_through_after_backup, + total_bytes_through_before_backup); + + std::vector backup_infos; + BackupInfo backup_info; + backup_engine_->GetBackupInfo(&backup_infos); + ASSERT_EQ(1, backup_infos.size()); + const int backup_id = 1; + ASSERT_EQ(backup_id, backup_infos[0].backup_id); + ASSERT_OK(backup_engine_->GetBackupInfo(backup_id, &backup_info, + true /* include_file_details */)); + int64_t total_bytes_through_before_verify_backup = + engine_options_->backup_rate_limiter->GetTotalBytesThrough(); + EXPECT_OK( + backup_engine_->VerifyBackup(backup_id, true /* verify_with_checksum */)); + int64_t total_bytes_through_after_verify_backup = + engine_options_->backup_rate_limiter->GetTotalBytesThrough(); + ASSERT_GT(total_bytes_through_after_verify_backup, + total_bytes_through_before_verify_backup); + + CloseDBAndBackupEngine(); + AssertBackupConsistency(backup_id, 0, 100, 101); + + int64_t total_bytes_through_before_initialize = + engine_options_->backup_rate_limiter->GetTotalBytesThrough(); + OpenDBAndBackupEngine(false /* destroy_old_data */); + // We charge read in BackupEngineImpl::BackupMeta::LoadFromFile, + // which is called in BackupEngineImpl::Initialize() during + // OpenBackupEngine(false) + int64_t total_bytes_through_after_initialize = + engine_options_->backup_rate_limiter->GetTotalBytesThrough(); + ASSERT_GT(total_bytes_through_after_initialize, + total_bytes_through_before_initialize); + CloseDBAndBackupEngine(); + + DestroyDBWithoutCheck(dbname_, Options()); + OpenBackupEngine(false /* destroy_old_data */); + int64_t total_bytes_through_before_restore = + engine_options_->restore_rate_limiter->GetTotalBytesThrough(); + EXPECT_OK(backup_engine_->RestoreDBFromLatestBackup(dbname_, dbname_)); + int64_t total_bytes_through_after_restore = + engine_options_->restore_rate_limiter->GetTotalBytesThrough(); + ASSERT_GT(total_bytes_through_after_restore, + total_bytes_through_before_restore); + CloseBackupEngine(); + + DestroyDBWithoutCheck(dbname_, Options()); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "GenericRateLimiter::Request:PostTimedWait"); +} + +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +TEST_F(BackupEngineTest, ReadOnlyBackupEngine) { + DestroyDBWithoutCheck(dbname_, options_); + OpenDBAndBackupEngine(true); + FillDB(db_.get(), 0, 100); + // Also test read-only DB with CreateNewBackup and flush=true (no flush) + CloseAndReopenDB(/*read_only*/ true); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), /*flush*/ true)); + CloseAndReopenDB(/*read_only*/ false); + FillDB(db_.get(), 100, 200); + 
ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), /*flush*/ true)); + CloseDBAndBackupEngine(); + DestroyDBWithoutCheck(dbname_, options_); + + engine_options_->destroy_old_data = false; + test_backup_fs_->ClearWrittenFiles(); + test_backup_fs_->SetLimitDeleteFiles(0); + BackupEngineReadOnly* read_only_backup_engine; + ASSERT_OK(BackupEngineReadOnly::Open(db_chroot_env_.get(), *engine_options_, + &read_only_backup_engine)); + std::vector backup_info; + read_only_backup_engine->GetBackupInfo(&backup_info); + ASSERT_EQ(backup_info.size(), 2U); + + RestoreOptions restore_options(false); + ASSERT_OK(read_only_backup_engine->RestoreDBFromLatestBackup( + dbname_, dbname_, restore_options)); + delete read_only_backup_engine; + std::vector should_have_written; + test_backup_fs_->AssertWrittenFiles(should_have_written); + + DB* db = OpenDB(); + AssertExists(db, 0, 200); + delete db; +} + +TEST_F(BackupEngineTest, OpenBackupAsReadOnlyDB) { + DestroyDBWithoutCheck(dbname_, options_); + options_.write_dbid_to_manifest = false; + + OpenDBAndBackupEngine(true); + FillDB(db_.get(), 0, 100); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), /*flush*/ false)); + + options_.write_dbid_to_manifest = true; // exercises some read-only DB code + CloseAndReopenDB(); + + FillDB(db_.get(), 100, 200); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), /*flush*/ false)); + db_.reset(); // CloseDB + DestroyDBWithoutCheck(dbname_, options_); + BackupInfo backup_info; + // First, check that we get empty fields without include_file_details + ASSERT_OK(backup_engine_->GetBackupInfo(/*id*/ 1U, &backup_info, + /*with file details*/ false)); + ASSERT_EQ(backup_info.name_for_open, ""); + ASSERT_FALSE(backup_info.env_for_open); + + // Now for the real test + backup_info = BackupInfo(); + ASSERT_OK(backup_engine_->GetBackupInfo(/*id*/ 1U, &backup_info, + /*with file details*/ true)); + + // Caution: DBOptions only holds a raw pointer to Env, so something else + // must keep it alive. + // Case 1: Keeping BackupEngine open suffices to keep Env alive + DB* db = nullptr; + Options opts = options_; + // Ensure some key defaults are set + opts.wal_dir = ""; + opts.create_if_missing = false; + opts.info_log.reset(); + + opts.env = backup_info.env_for_open.get(); + std::string name = backup_info.name_for_open; + backup_info = BackupInfo(); + ASSERT_OK(DB::OpenForReadOnly(opts, name, &db)); + + AssertExists(db, 0, 100); + AssertEmpty(db, 100, 200); + + delete db; + db = nullptr; + + // Case 2: Keeping BackupInfo alive rather than BackupEngine also suffices + ASSERT_OK(backup_engine_->GetBackupInfo(/*id*/ 2U, &backup_info, + /*with file details*/ true)); + CloseBackupEngine(); + opts.create_if_missing = true; // check also OK (though pointless) + opts.env = backup_info.env_for_open.get(); + name = backup_info.name_for_open; + // Note: keeping backup_info alive + ASSERT_OK(DB::OpenForReadOnly(opts, name, &db)); + + AssertExists(db, 0, 200); + delete db; + db = nullptr; + + // Now try opening read-write and make sure it fails, for safety. 
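+  // (backup_info.env_for_open is meant to give a strictly read-only view of
+  // the backup's files; a read-write DB::Open needs to create and write files
+  // such as LOCK and a new MANIFEST, which that Env rejects, hence the
+  // expected IOError below. This is presumably what keeps the backup itself
+  // from being modified through this convenience path.)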
+ ASSERT_TRUE(DB::Open(opts, name, &db).IsIOError()); +} + +TEST_F(BackupEngineTest, ProgressCallbackDuringBackup) { + DestroyDBWithoutCheck(dbname_, options_); + // Too big for this small DB + engine_options_->callback_trigger_interval_size = 100000; + OpenDBAndBackupEngine(true); + FillDB(db_.get(), 0, 100); + bool is_callback_invoked = false; + ASSERT_OK(backup_engine_->CreateNewBackup( + db_.get(), true, + [&is_callback_invoked]() { is_callback_invoked = true; })); + ASSERT_FALSE(is_callback_invoked); + CloseBackupEngine(); + + // Easily small enough for this small DB + engine_options_->callback_trigger_interval_size = 1000; + OpenBackupEngine(); + ASSERT_OK(backup_engine_->CreateNewBackup( + db_.get(), true, + [&is_callback_invoked]() { is_callback_invoked = true; })); + ASSERT_TRUE(is_callback_invoked); + CloseDBAndBackupEngine(); + DestroyDBWithoutCheck(dbname_, options_); +} + +TEST_F(BackupEngineTest, GarbageCollectionBeforeBackup) { + DestroyDBWithoutCheck(dbname_, options_); + OpenDBAndBackupEngine(true); + + ASSERT_OK(backup_chroot_env_->CreateDirIfMissing(backupdir_ + "/shared")); + std::string file_five = backupdir_ + "/shared/000009.sst"; + std::string file_five_contents = "I'm not really a sst file"; + // this depends on the fact that 00009.sst is the first file created by the DB + ASSERT_OK(file_manager_->WriteToFile(file_five, file_five_contents)); + + FillDB(db_.get(), 0, 100); + // backup overwrites file 000009.sst + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true)); + + std::string new_file_five_contents; + ASSERT_OK(ReadFileToString(backup_chroot_env_.get(), file_five, + &new_file_five_contents)); + // file 000009.sst was overwritten + ASSERT_TRUE(new_file_five_contents != file_five_contents); + + CloseDBAndBackupEngine(); + + AssertBackupConsistency(0, 0, 100); +} + +// Test that we properly propagate Env failures +TEST_F(BackupEngineTest, EnvFailures) { + BackupEngine* backup_engine; + + // get children failure + { + test_backup_fs_->SetGetChildrenFailure(true); + ASSERT_NOK(BackupEngine::Open(test_db_env_.get(), *engine_options_, + &backup_engine)); + test_backup_fs_->SetGetChildrenFailure(false); + } + + // created dir failure + { + test_backup_fs_->SetCreateDirIfMissingFailure(true); + ASSERT_NOK(BackupEngine::Open(test_db_env_.get(), *engine_options_, + &backup_engine)); + test_backup_fs_->SetCreateDirIfMissingFailure(false); + } + + // new directory failure + { + test_backup_fs_->SetNewDirectoryFailure(true); + ASSERT_NOK(BackupEngine::Open(test_db_env_.get(), *engine_options_, + &backup_engine)); + test_backup_fs_->SetNewDirectoryFailure(false); + } + + // Read from meta-file failure + { + DestroyDBWithoutCheck(dbname_, options_); + OpenDBAndBackupEngine(true); + FillDB(db_.get(), 0, 100); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true)); + CloseDBAndBackupEngine(); + test_backup_fs_->SetDummySequentialFile(true); + test_backup_fs_->SetDummySequentialFileFailReads(true); + engine_options_->destroy_old_data = false; + ASSERT_NOK(BackupEngine::Open(test_db_env_.get(), *engine_options_, + &backup_engine)); + test_backup_fs_->SetDummySequentialFile(false); + test_backup_fs_->SetDummySequentialFileFailReads(false); + } + + // no failure + { + ASSERT_OK(BackupEngine::Open(test_db_env_.get(), *engine_options_, + &backup_engine)); + delete backup_engine; + } +} + +// Verify manifest can roll while a backup is being created with the old +// manifest. 
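+// The interleaving is forced with SyncPoint dependencies: the backup's
+// checkpoint first records its set of live files, only then is a concurrent
+// flush allowed to write (and roll over) the manifest, and the backup then
+// finishes against the old manifest. The test afterwards rolls the manifest
+// once more to check that stale manifests are still purged without a full
+// scan.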
+TEST_F(BackupEngineTest, ChangeManifestDuringBackupCreation) { + DestroyDBWithoutCheck(dbname_, options_); + options_.max_manifest_file_size = 0; // always rollover manifest for file add + OpenDBAndBackupEngine(true); + FillDB(db_.get(), 0, 100, kAutoFlushOnly); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"CheckpointImpl::CreateCheckpoint:SavedLiveFiles1", + "VersionSet::LogAndApply:WriteManifest"}, + {"VersionSet::LogAndApply:WriteManifestDone", + "CheckpointImpl::CreateCheckpoint:SavedLiveFiles2"}, + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::port::Thread flush_thread{ + [this]() { ASSERT_OK(db_->Flush(FlushOptions())); }}; + + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), false)); + + flush_thread.join(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + // The last manifest roll would've already been cleaned up by the full scan + // that happens when CreateNewBackup invokes EnableFileDeletions. We need to + // trigger another roll to verify non-full scan purges stale manifests. + DBImpl* db_impl = static_cast_with_check(db_.get()); + std::string prev_manifest_path = + DescriptorFileName(dbname_, db_impl->TEST_Current_Manifest_FileNo()); + FillDB(db_.get(), 0, 100, kAutoFlushOnly); + ASSERT_OK(db_chroot_env_->FileExists(prev_manifest_path)); + ASSERT_OK(db_->Flush(FlushOptions())); + // Even though manual flush completed above, the background thread may not + // have finished its cleanup work. `TEST_WaitForBackgroundWork()` will wait + // until all the background thread's work has completed, including cleanup. + ASSERT_OK(db_impl->TEST_WaitForBackgroundWork()); + ASSERT_TRUE(db_chroot_env_->FileExists(prev_manifest_path).IsNotFound()); + + CloseDBAndBackupEngine(); + DestroyDBWithoutCheck(dbname_, options_); + AssertBackupConsistency(0, 0, 100); +} + +// see https://github.com/facebook/rocksdb/issues/921 +TEST_F(BackupEngineTest, Issue921Test) { + BackupEngine* backup_engine; + engine_options_->share_table_files = false; + ASSERT_OK( + backup_chroot_env_->CreateDirIfMissing(engine_options_->backup_dir)); + engine_options_->backup_dir += "/new_dir"; + ASSERT_OK(BackupEngine::Open(backup_chroot_env_.get(), *engine_options_, + &backup_engine)); + + delete backup_engine; +} + +TEST_F(BackupEngineTest, BackupWithMetadata) { + const int keys_iteration = 5000; + OpenDBAndBackupEngine(true); + // create five backups + for (int i = 0; i < 5; ++i) { + const std::string metadata = std::to_string(i); + FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1)); + // Here also test CreateNewBackupWithMetadata with CreateBackupOptions + // and outputting saved BackupID. 
+ CreateBackupOptions opts; + opts.flush_before_backup = true; + BackupID new_id = 0; + ASSERT_OK(backup_engine_->CreateNewBackupWithMetadata(opts, db_.get(), + metadata, &new_id)); + ASSERT_EQ(new_id, static_cast(i + 1)); + } + CloseDBAndBackupEngine(); + + OpenDBAndBackupEngine(); + { // Verify in bulk BackupInfo + std::vector backup_infos; + backup_engine_->GetBackupInfo(&backup_infos); + ASSERT_EQ(5, backup_infos.size()); + for (int i = 0; i < 5; i++) { + ASSERT_EQ(std::to_string(i), backup_infos[i].app_metadata); + } + } + // Also verify in individual BackupInfo + for (int i = 0; i < 5; i++) { + BackupInfo backup_info; + ASSERT_OK(backup_engine_->GetBackupInfo(static_cast(i + 1), + &backup_info)); + ASSERT_EQ(std::to_string(i), backup_info.app_metadata); + } + CloseDBAndBackupEngine(); + DestroyDBWithoutCheck(dbname_, options_); +} + +TEST_F(BackupEngineTest, BinaryMetadata) { + OpenDBAndBackupEngine(true); + std::string binaryMetadata = "abc\ndef"; + binaryMetadata.push_back('\0'); + binaryMetadata.append("ghi"); + ASSERT_OK( + backup_engine_->CreateNewBackupWithMetadata(db_.get(), binaryMetadata)); + CloseDBAndBackupEngine(); + + OpenDBAndBackupEngine(); + std::vector backup_infos; + backup_engine_->GetBackupInfo(&backup_infos); + ASSERT_EQ(1, backup_infos.size()); + ASSERT_EQ(binaryMetadata, backup_infos[0].app_metadata); + CloseDBAndBackupEngine(); + DestroyDBWithoutCheck(dbname_, options_); +} + +TEST_F(BackupEngineTest, MetadataTooLarge) { + OpenDBAndBackupEngine(true); + std::string largeMetadata(1024 * 1024 + 1, 0); + ASSERT_NOK( + backup_engine_->CreateNewBackupWithMetadata(db_.get(), largeMetadata)); + CloseDBAndBackupEngine(); + DestroyDBWithoutCheck(dbname_, options_); +} + +TEST_F(BackupEngineTest, MetaSchemaVersion2_SizeCorruption) { + engine_options_->schema_version = 1; + OpenDBAndBackupEngine(/*destroy_old_data*/ true); + + // Backup 1: no future schema, no sizes, with checksums + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get())); + + CloseDBAndBackupEngine(); + engine_options_->schema_version = 2; + OpenDBAndBackupEngine(/*destroy_old_data*/ false); + + // Backup 2: no checksums, no sizes + TEST_BackupMetaSchemaOptions test_opts; + test_opts.crc32c_checksums = false; + test_opts.file_sizes = false; + TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get())); + + // Backup 3: no checksums, with sizes + test_opts.file_sizes = true; + TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get())); + + // Backup 4: with checksums and sizes + test_opts.crc32c_checksums = true; + TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get())); + + CloseDBAndBackupEngine(); + + // Corrupt all the CURRENT files with the wrong size + const std::string private_dir = backupdir_ + "/private"; + + for (int id = 1; id <= 3; ++id) { + ASSERT_OK(file_manager_->WriteToFile( + private_dir + "/" + std::to_string(id) + "/CURRENT", "x")); + } + // Except corrupt Backup 4 with same size CURRENT file + { + uint64_t size = 0; + ASSERT_OK(test_backup_env_->GetFileSize(private_dir + "/4/CURRENT", &size)); + ASSERT_OK(file_manager_->WriteToFile(private_dir + "/4/CURRENT", + std::string(size, 'x'))); + } + + OpenBackupEngine(); + + // Only the one with sizes in metadata will be immediately detected + // as corrupt + std::vector corrupted; + backup_engine_->GetCorruptedBackups(&corrupted); + 
ASSERT_EQ(corrupted.size(), 1); + ASSERT_EQ(corrupted[0], 3); + + // Size corruption detected on Restore with checksum + ASSERT_TRUE(backup_engine_->RestoreDBFromBackup(1 /*id*/, dbname_, dbname_) + .IsCorruption()); + + // Size corruption not detected without checksums nor sizes + ASSERT_OK(backup_engine_->RestoreDBFromBackup(2 /*id*/, dbname_, dbname_)); + + // Non-size corruption detected on Restore with checksum + ASSERT_TRUE(backup_engine_->RestoreDBFromBackup(4 /*id*/, dbname_, dbname_) + .IsCorruption()); + + CloseBackupEngine(); +} + +TEST_F(BackupEngineTest, MetaSchemaVersion2_NotSupported) { + engine_options_->schema_version = 2; + TEST_BackupMetaSchemaOptions test_opts; + std::string app_metadata = "abc\ndef"; + + OpenDBAndBackupEngine(true); + // Start with supported + TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts); + ASSERT_OK( + backup_engine_->CreateNewBackupWithMetadata(db_.get(), app_metadata)); + + // Because we are injecting badness with a TEST API, the badness is only + // detected on attempt to restore. + // Not supported versions + test_opts.version = "3"; + TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts); + ASSERT_OK( + backup_engine_->CreateNewBackupWithMetadata(db_.get(), app_metadata)); + test_opts.version = "23.45.67"; + TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts); + ASSERT_OK( + backup_engine_->CreateNewBackupWithMetadata(db_.get(), app_metadata)); + test_opts.version = "2"; + + // Non-ignorable fields + test_opts.meta_fields["ni::blah"] = "123"; + TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts); + ASSERT_OK( + backup_engine_->CreateNewBackupWithMetadata(db_.get(), app_metadata)); + test_opts.meta_fields.clear(); + + test_opts.file_fields["ni::123"] = "xyz"; + TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts); + ASSERT_OK( + backup_engine_->CreateNewBackupWithMetadata(db_.get(), app_metadata)); + test_opts.file_fields.clear(); + + test_opts.footer_fields["ni::123"] = "xyz"; + TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts); + ASSERT_OK( + backup_engine_->CreateNewBackupWithMetadata(db_.get(), app_metadata)); + test_opts.footer_fields.clear(); + CloseDBAndBackupEngine(); + + OpenBackupEngine(); + std::vector corrupted; + backup_engine_->GetCorruptedBackups(&corrupted); + ASSERT_EQ(corrupted.size(), 5); + + ASSERT_OK(backup_engine_->RestoreDBFromLatestBackup(dbname_, dbname_)); + CloseBackupEngine(); +} + +TEST_F(BackupEngineTest, MetaSchemaVersion2_Restore) { + engine_options_->schema_version = 2; + TEST_BackupMetaSchemaOptions test_opts; + const int keys_iteration = 5000; + + OpenDBAndBackupEngine(true, false, kShareWithChecksum); + FillDB(db_.get(), 0, keys_iteration); + // Start with minimum metadata to ensure it works without it being filled + // based on shared files also in other backups with the metadata. 
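+  // (Concretely, "minimum metadata" here is a schema-version-2 meta file with
+  // the optional per-file crc32c checksums and sizes omitted; the TEST_
+  // schema options below add those fields back one backup at a time, and
+  // every backup written along the way must still restore.)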
+ test_opts.crc32c_checksums = false; + test_opts.file_sizes = false; + TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true)); + CloseDBAndBackupEngine(); + + AssertBackupConsistency(1 /* id */, 0, keys_iteration, keys_iteration * 2); + + OpenDBAndBackupEngine(false /* destroy_old_data */, false, + kShareWithChecksum); + test_opts.file_sizes = true; + TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true)); + CloseDBAndBackupEngine(); + + for (int id = 1; id <= 2; ++id) { + AssertBackupConsistency(id, 0, keys_iteration, keys_iteration * 2); + } + + OpenDBAndBackupEngine(false /* destroy_old_data */, false, + kShareWithChecksum); + test_opts.crc32c_checksums = true; + TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true)); + CloseDBAndBackupEngine(); + + for (int id = 1; id <= 3; ++id) { + AssertBackupConsistency(id, 0, keys_iteration, keys_iteration * 2); + } + + OpenDBAndBackupEngine(false /* destroy_old_data */, false, + kShareWithChecksum); + // No TEST_EnableWriteFutureSchemaVersion2 + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true)); + CloseDBAndBackupEngine(); + + for (int id = 1; id <= 4; ++id) { + AssertBackupConsistency(id, 0, keys_iteration, keys_iteration * 2); + } + + OpenDBAndBackupEngine(false /* destroy_old_data */, false, + kShareWithChecksum); + // Minor version updates should be forward-compatible + test_opts.version = "2.5.70"; + test_opts.meta_fields["asdf.3456"] = "-42"; + test_opts.meta_fields["__QRST"] = " 1 $ %%& "; + test_opts.file_fields["z94._"] = "^\\"; + test_opts.file_fields["_7yyyyyyyyy"] = "111111111111"; + test_opts.footer_fields["Qwzn.tz89"] = "ASDF!!@# ##=\t "; + test_opts.footer_fields["yes"] = "no!"; + TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true)); + CloseDBAndBackupEngine(); + + for (int id = 1; id <= 5; ++id) { + AssertBackupConsistency(id, 0, keys_iteration, keys_iteration * 2); + } +} + +TEST_F(BackupEngineTest, Concurrency) { + // Check that we can simultaneously: + // * Run several read operations in different threads on a single + // BackupEngine object, and + // * With another BackupEngine object on the same + // backup_dir, run the same read operations in another thread, and + // * With yet another BackupEngine object on the same + // backup_dir, create two new backups in parallel threads. + // + // Because of the challenges of integrating this into db_stress, + // this is a non-deterministic mini-stress test here instead. + + // To check for a race condition in handling buffer size based on byte + // burst limit, we need a (generous) rate limiter + std::shared_ptr limiter{NewGenericRateLimiter(1000000000)}; + engine_options_->backup_rate_limiter = limiter; + engine_options_->restore_rate_limiter = limiter; + + OpenDBAndBackupEngine(true, false, kShareWithChecksum); + + static constexpr int keys_iteration = 5000; + FillDB(db_.get(), 0, keys_iteration); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get())); + + FillDB(db_.get(), keys_iteration, 2 * keys_iteration); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get())); + + static constexpr int max_factor = 3; + FillDB(db_.get(), 2 * keys_iteration, max_factor * keys_iteration); + // will create another backup soon... 
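+  // (Roadmap for the rest of this test: each reader thread below checks
+  // backup metadata, restores either a specific backup or the latest one into
+  // its own directory, and spawns a helper thread that polls DB::Open on that
+  // directory to confirm the restore looks atomic; meanwhile append threads
+  // using a separate BackupEngine instance on the same backup_dir create new
+  // backups concurrently.)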
+ + Options db_opts = options_; + db_opts.wal_dir = ""; + db_opts.create_if_missing = false; + BackupEngineOptions be_opts = *engine_options_; + be_opts.destroy_old_data = false; + + std::mt19937 rng{std::random_device()()}; + + std::array read_threads; + std::array restore_verify_threads; + for (uint32_t i = 0; i < read_threads.size(); ++i) { + uint32_t sleep_micros = rng() % 100000; + read_threads[i] = std::thread([this, i, sleep_micros, &db_opts, &be_opts, + &restore_verify_threads, &limiter] { + test_db_env_->SleepForMicroseconds(sleep_micros); + + // Whether to also re-open the BackupEngine, potentially seeing + // additional backups + bool reopen = i == 3; + // Whether we are going to restore "latest" + bool latest = i > 1; + + BackupEngine* my_be; + if (reopen) { + ASSERT_OK(BackupEngine::Open(test_db_env_.get(), be_opts, &my_be)); + } else { + my_be = backup_engine_.get(); + } + + // Verify metadata (we don't receive updates from concurrently + // creating a new backup) + std::vector infos; + my_be->GetBackupInfo(&infos); + const uint32_t count = static_cast(infos.size()); + infos.clear(); + if (reopen) { + ASSERT_GE(count, 2U); + ASSERT_LE(count, 4U); + fprintf(stderr, "Reopen saw %u backups\n", count); + } else { + ASSERT_EQ(count, 2U); + } + std::vector ids; + my_be->GetCorruptedBackups(&ids); + ASSERT_EQ(ids.size(), 0U); + + // (Eventually, see below) Restore one of the backups, or "latest" + std::string restore_db_dir = dbname_ + "/restore" + std::to_string(i); + DestroyDir(test_db_env_.get(), restore_db_dir).PermitUncheckedError(); + BackupID to_restore; + if (latest) { + to_restore = count; + } else { + to_restore = i + 1; + } + + // Open restored DB to verify its contents, but test atomic restore + // by doing it async and ensuring we either get OK or InvalidArgument + restore_verify_threads[i] = + std::thread([this, &db_opts, restore_db_dir, to_restore] { + DB* restored; + Status s; + for (;;) { + s = DB::Open(db_opts, restore_db_dir, &restored); + if (s.IsInvalidArgument()) { + // Restore hasn't finished + test_db_env_->SleepForMicroseconds(1000); + continue; + } else { + // We should only get InvalidArgument if restore is + // incomplete, or OK if complete + ASSERT_OK(s); + break; + } + } + int factor = std::min(static_cast(to_restore), max_factor); + AssertExists(restored, 0, factor * keys_iteration); + AssertEmpty(restored, factor * keys_iteration, + (factor + 1) * keys_iteration); + delete restored; + }); + + // (Ok now) Restore one of the backups, or "latest" + if (latest) { + ASSERT_OK( + my_be->RestoreDBFromLatestBackup(restore_db_dir, restore_db_dir)); + } else { + ASSERT_OK(my_be->VerifyBackup(to_restore, true)); + ASSERT_OK(my_be->RestoreDBFromBackup(to_restore, restore_db_dir, + restore_db_dir)); + } + + // Test for race condition in reconfiguring limiter + // FIXME: this could set to a different value in all threads, except + // GenericRateLimiter::SetBytesPerSecond has a write-write race + // reported by TSAN + if (i == 0) { + limiter->SetBytesPerSecond(2000000000); + } + + // Re-verify metadata (we don't receive updates from concurrently + // creating a new backup) + my_be->GetBackupInfo(&infos); + ASSERT_EQ(infos.size(), count); + my_be->GetCorruptedBackups(&ids); + ASSERT_EQ(ids.size(), 0); + // fprintf(stderr, "Finished read thread\n"); + + if (reopen) { + delete my_be; + } + }); + } + + BackupEngine* alt_be; + ASSERT_OK(BackupEngine::Open(test_db_env_.get(), be_opts, &alt_be)); + + std::array append_threads; + for (unsigned i = 0; i < 
append_threads.size(); ++i) { + uint32_t sleep_micros = rng() % 100000; + append_threads[i] = std::thread([this, sleep_micros, alt_be] { + test_db_env_->SleepForMicroseconds(sleep_micros); + // WART: CreateNewBackup doesn't tell you the BackupID it just created, + // which is ugly for multithreaded setting. + // TODO: add delete backup also when that is added + ASSERT_OK(alt_be->CreateNewBackup(db_.get())); + // fprintf(stderr, "Finished append thread\n"); + }); + } + + for (auto& t : append_threads) { + t.join(); + } + // Verify metadata + std::vector infos; + alt_be->GetBackupInfo(&infos); + ASSERT_EQ(infos.size(), 2 + append_threads.size()); + + for (auto& t : read_threads) { + t.join(); + } + + delete alt_be; + + for (auto& t : restore_verify_threads) { + t.join(); + } + + CloseDBAndBackupEngine(); +} + +TEST_F(BackupEngineTest, LimitBackupsOpened) { + // Verify the specified max backups are opened, including skipping over + // corrupted backups. + // + // Setup: + // - backups 1, 2, and 4 are valid + // - backup 3 is corrupt + // - max_valid_backups_to_open == 2 + // + // Expectation: the engine opens backups 4 and 2 since those are latest two + // non-corrupt backups. + const int kNumKeys = 5000; + OpenDBAndBackupEngine(true); + for (int i = 1; i <= 4; ++i) { + FillDB(db_.get(), kNumKeys * i, kNumKeys * (i + 1)); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true)); + if (i == 3) { + ASSERT_OK(file_manager_->CorruptFile(backupdir_ + "/meta/3", 3)); + } + } + CloseDBAndBackupEngine(); + + engine_options_->max_valid_backups_to_open = 2; + engine_options_->destroy_old_data = false; + BackupEngineReadOnly* read_only_backup_engine; + ASSERT_OK(BackupEngineReadOnly::Open( + backup_chroot_env_.get(), *engine_options_, &read_only_backup_engine)); + + std::vector backup_infos; + read_only_backup_engine->GetBackupInfo(&backup_infos); + ASSERT_EQ(2, backup_infos.size()); + ASSERT_EQ(2, backup_infos[0].backup_id); + ASSERT_EQ(4, backup_infos[1].backup_id); + delete read_only_backup_engine; +} + +TEST_F(BackupEngineTest, IgnoreLimitBackupsOpenedWhenNotReadOnly) { + // Verify the specified max_valid_backups_to_open is ignored if the engine + // is not read-only. + // + // Setup: + // - backups 1, 2, and 4 are valid + // - backup 3 is corrupt + // - max_valid_backups_to_open == 2 + // + // Expectation: the engine opens backups 4, 2, and 1 since those are latest + // non-corrupt backups, by ignoring max_valid_backups_to_open == 2. + const int kNumKeys = 5000; + OpenDBAndBackupEngine(true); + for (int i = 1; i <= 4; ++i) { + FillDB(db_.get(), kNumKeys * i, kNumKeys * (i + 1)); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true)); + if (i == 3) { + ASSERT_OK(file_manager_->CorruptFile(backupdir_ + "/meta/3", 3)); + } + } + CloseDBAndBackupEngine(); + + engine_options_->max_valid_backups_to_open = 2; + OpenDBAndBackupEngine(); + std::vector backup_infos; + backup_engine_->GetBackupInfo(&backup_infos); + ASSERT_EQ(3, backup_infos.size()); + ASSERT_EQ(1, backup_infos[0].backup_id); + ASSERT_EQ(2, backup_infos[1].backup_id); + ASSERT_EQ(4, backup_infos[2].backup_id); + CloseDBAndBackupEngine(); + DestroyDBWithoutCheck(dbname_, options_); +} + +TEST_F(BackupEngineTest, CreateWhenLatestBackupCorrupted) { + // we should pick an ID greater than corrupted backups' IDs so creation can + // succeed even when latest backup is corrupted. 
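+  // (The checks below also exercise per-ID GetBackupInfo: the corrupted
+  // backup 1 reports Corruption, IDs that were never created report NotFound,
+  // and only the newly created backup 2 reports OK and shows up in the bulk
+  // listing.)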
+ const int kNumKeys = 5000; + OpenDBAndBackupEngine(true /* destroy_old_data */); + BackupInfo backup_info; + ASSERT_TRUE(backup_engine_->GetLatestBackupInfo(&backup_info).IsNotFound()); + FillDB(db_.get(), 0 /* from */, kNumKeys); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), + true /* flush_before_backup */)); + ASSERT_OK(file_manager_->CorruptFile(backupdir_ + "/meta/1", + 3 /* bytes_to_corrupt */)); + CloseDBAndBackupEngine(); + + OpenDBAndBackupEngine(); + ASSERT_TRUE(backup_engine_->GetLatestBackupInfo(&backup_info).IsNotFound()); + + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), + true /* flush_before_backup */)); + + ASSERT_TRUE(backup_engine_->GetLatestBackupInfo(&backup_info).ok()); + ASSERT_EQ(2, backup_info.backup_id); + + std::vector backup_infos; + backup_engine_->GetBackupInfo(&backup_infos); + ASSERT_EQ(1, backup_infos.size()); + ASSERT_EQ(2, backup_infos[0].backup_id); + + // Verify individual GetBackupInfo by ID + ASSERT_TRUE(backup_engine_->GetBackupInfo(0U, &backup_info).IsNotFound()); + ASSERT_TRUE(backup_engine_->GetBackupInfo(1U, &backup_info).IsCorruption()); + ASSERT_TRUE(backup_engine_->GetBackupInfo(2U, &backup_info).ok()); + ASSERT_TRUE(backup_engine_->GetBackupInfo(3U, &backup_info).IsNotFound()); + ASSERT_TRUE( + backup_engine_->GetBackupInfo(999999U, &backup_info).IsNotFound()); +} + +TEST_F(BackupEngineTest, WriteOnlyEngineNoSharedFileDeletion) { + // Verifies a write-only BackupEngine does not delete files belonging to valid + // backups when GarbageCollect, PurgeOldBackups, or DeleteBackup are called. + const int kNumKeys = 5000; + for (int i = 0; i < 3; ++i) { + OpenDBAndBackupEngine(i == 0 /* destroy_old_data */); + FillDB(db_.get(), i * kNumKeys, (i + 1) * kNumKeys); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true)); + CloseDBAndBackupEngine(); + + engine_options_->max_valid_backups_to_open = 0; + OpenDBAndBackupEngine(); + switch (i) { + case 0: + ASSERT_OK(backup_engine_->GarbageCollect()); + break; + case 1: + ASSERT_OK(backup_engine_->PurgeOldBackups(1 /* num_backups_to_keep */)); + break; + case 2: + ASSERT_OK(backup_engine_->DeleteBackup(2 /* backup_id */)); + break; + default: + assert(false); + } + CloseDBAndBackupEngine(); + + engine_options_->max_valid_backups_to_open = + std::numeric_limits::max(); + AssertBackupConsistency(i + 1, 0, (i + 1) * kNumKeys); + } +} + +TEST_P(BackupEngineTestWithParam, BackupUsingDirectIO) { + // Tests direct I/O on the backup engine's reads and writes on the DB env and + // backup env + // We use ChrootEnv underneath so the below line checks for direct I/O support + // in the chroot directory, not the true filesystem root. + if (!test::IsDirectIOSupported(test_db_env_.get(), "/")) { + ROCKSDB_GTEST_SKIP("Test requires Direct I/O Support"); + return; + } + const int kNumKeysPerBackup = 100; + const int kNumBackups = 3; + options_.use_direct_reads = true; + OpenDBAndBackupEngine(true /* destroy_old_data */); + for (int i = 0; i < kNumBackups; ++i) { + FillDB(db_.get(), i * kNumKeysPerBackup /* from */, + (i + 1) * kNumKeysPerBackup /* to */, kFlushAll); + + // Clear the file open counters and then do a bunch of backup engine ops. + // For all ops, files should be opened in direct mode. 
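(Aside) The direct I/O behavior checked in this loop comes entirely from DB options; a minimal sketch of how an application would request it when opening its own DB follows, after which the loop body continues with clearing the counters. The path is made up and only use_direct_reads is taken from this test:

#include "rocksdb/db.h"

// Sketch only: open a DB whose file reads bypass the OS page cache.
void OpenWithDirectReads() {
  ROCKSDB_NAMESPACE::Options opts;
  opts.create_if_missing = true;
  opts.use_direct_reads = true;  // same flag this test sets on options_
  ROCKSDB_NAMESPACE::DB* db = nullptr;
  ROCKSDB_NAMESPACE::Status s =
      ROCKSDB_NAMESPACE::DB::Open(opts, "/tmp/direct_io_db", &db);
  if (s.ok()) {
    delete db;
  }
}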
+ test_backup_fs_->ClearFileOpenCounters(); + test_db_fs_->ClearFileOpenCounters(); + CloseBackupEngine(); + OpenBackupEngine(); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), + false /* flush_before_backup */)); + ASSERT_OK(backup_engine_->VerifyBackup(i + 1)); + CloseBackupEngine(); + OpenBackupEngine(); + std::vector backup_infos; + backup_engine_->GetBackupInfo(&backup_infos); + ASSERT_EQ(static_cast(i + 1), backup_infos.size()); + + // Verify backup engine always opened files with direct I/O + ASSERT_EQ(0, test_db_fs_->num_writers()); + ASSERT_GE(test_db_fs_->num_direct_rand_readers(), 0); + ASSERT_GT(test_db_fs_->num_direct_seq_readers(), 0); + // Currently the DB doesn't support reading WALs or manifest with direct + // I/O, so subtract two. + ASSERT_EQ(test_db_fs_->num_seq_readers() - 2, + test_db_fs_->num_direct_seq_readers()); + ASSERT_EQ(test_db_fs_->num_rand_readers(), + test_db_fs_->num_direct_rand_readers()); + } + CloseDBAndBackupEngine(); + + for (int i = 0; i < kNumBackups; ++i) { + AssertBackupConsistency(i + 1 /* backup_id */, + i * kNumKeysPerBackup /* start_exist */, + (i + 1) * kNumKeysPerBackup /* end_exist */, + (i + 2) * kNumKeysPerBackup /* end */); + } +} + +TEST_F(BackupEngineTest, BackgroundThreadCpuPriority) { + std::atomic priority(CpuPriority::kNormal); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackupEngineImpl::Initialize:SetCpuPriority", [&](void* new_priority) { + priority.store(*reinterpret_cast(new_priority)); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // 1 thread is easier to test, otherwise, we may not be sure which thread + // actually does the work during CreateNewBackup. + engine_options_->max_background_operations = 1; + OpenDBAndBackupEngine(true); + + { + FillDB(db_.get(), 0, 100); + + // by default, cpu priority is not changed. + CreateBackupOptions options; + ASSERT_OK(backup_engine_->CreateNewBackup(options, db_.get())); + + ASSERT_EQ(priority, CpuPriority::kNormal); + } + + { + FillDB(db_.get(), 101, 200); + + // decrease cpu priority from normal to low. + CreateBackupOptions options; + options.decrease_background_thread_cpu_priority = true; + options.background_thread_cpu_priority = CpuPriority::kLow; + ASSERT_OK(backup_engine_->CreateNewBackup(options, db_.get())); + + ASSERT_EQ(priority, CpuPriority::kLow); + } + + { + FillDB(db_.get(), 201, 300); + + // try to upgrade cpu priority back to normal, + // the priority should still low. + CreateBackupOptions options; + options.decrease_background_thread_cpu_priority = true; + options.background_thread_cpu_priority = CpuPriority::kNormal; + ASSERT_OK(backup_engine_->CreateNewBackup(options, db_.get())); + + ASSERT_EQ(priority, CpuPriority::kLow); + } + + { + FillDB(db_.get(), 301, 400); + + // decrease cpu priority from low to idle. + CreateBackupOptions options; + options.decrease_background_thread_cpu_priority = true; + options.background_thread_cpu_priority = CpuPriority::kIdle; + ASSERT_OK(backup_engine_->CreateNewBackup(options, db_.get())); + + ASSERT_EQ(priority, CpuPriority::kIdle); + } + + { + FillDB(db_.get(), 301, 400); + + // reset priority to later verify that it's not updated by SetCpuPriority. + priority = CpuPriority::kNormal; + + // setting the same cpu priority won't call SetCpuPriority. 
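(Aside) Outside this test, lowering the CPU priority of backup worker threads is a two-field request on CreateBackupOptions. A minimal usage sketch, assuming an already-open BackupEngine* and DB* (the final block of the test continues right after):

#include "rocksdb/db.h"
#include "rocksdb/utilities/backup_engine.h"

// Sketch: create a backup whose background work runs at low CPU priority.
ROCKSDB_NAMESPACE::Status BackupAtLowCpuPriority(
    ROCKSDB_NAMESPACE::BackupEngine* engine, ROCKSDB_NAMESPACE::DB* db) {
  ROCKSDB_NAMESPACE::CreateBackupOptions cbo;
  cbo.decrease_background_thread_cpu_priority = true;
  cbo.background_thread_cpu_priority = ROCKSDB_NAMESPACE::CpuPriority::kLow;
  return engine->CreateNewBackup(cbo, db);
}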
+ CreateBackupOptions options; + options.decrease_background_thread_cpu_priority = true; + options.background_thread_cpu_priority = CpuPriority::kIdle; + + // Also check output backup_id with CreateNewBackup + BackupID new_id = 0; + ASSERT_OK(backup_engine_->CreateNewBackup(options, db_.get(), &new_id)); + ASSERT_EQ(new_id, 5U); + + ASSERT_EQ(priority, CpuPriority::kNormal); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + CloseDBAndBackupEngine(); + DestroyDBWithoutCheck(dbname_, options_); +} + +// Populates `*total_size` with the size of all files under `backup_dir`. +// We don't go through `BackupEngine` currently because it's hard to figure out +// the metadata file size. +Status GetSizeOfBackupFiles(FileSystem* backup_fs, + const std::string& backup_dir, size_t* total_size) { + *total_size = 0; + std::vector dir_stack = {backup_dir}; + Status s; + while (s.ok() && !dir_stack.empty()) { + std::string dir = std::move(dir_stack.back()); + dir_stack.pop_back(); + std::vector children; + s = backup_fs->GetChildren(dir, IOOptions(), &children, nullptr /* dbg */); + for (size_t i = 0; s.ok() && i < children.size(); ++i) { + std::string path = dir + "/" + children[i]; + bool is_dir; + s = backup_fs->IsDirectory(path, IOOptions(), &is_dir, nullptr /* dbg */); + uint64_t file_size = 0; + if (s.ok()) { + if (is_dir) { + dir_stack.emplace_back(std::move(path)); + } else { + s = backup_fs->GetFileSize(path, IOOptions(), &file_size, + nullptr /* dbg */); + } + } + if (s.ok()) { + *total_size += file_size; + } + } + } + return s; +} + +TEST_F(BackupEngineTest, IOStats) { + // Tests the `BACKUP_READ_BYTES` and `BACKUP_WRITE_BYTES` ticker stats have + // the expected values according to the files in the backups. + + // These ticker stats are expected to be populated regardless of `PerfLevel` + // in user thread + SetPerfLevel(kDisable); + + options_.statistics = CreateDBStatistics(); + OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */, + kShareWithChecksum); + + FillDB(db_.get(), 0 /* from */, 100 /* to */, kFlushMost); + + ASSERT_EQ(0, options_.statistics->getTickerCount(BACKUP_READ_BYTES)); + ASSERT_EQ(0, options_.statistics->getTickerCount(BACKUP_WRITE_BYTES)); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), + false /* flush_before_backup */)); + + size_t orig_backup_files_size; + ASSERT_OK(GetSizeOfBackupFiles(test_backup_env_->GetFileSystem().get(), + backupdir_, &orig_backup_files_size)); + size_t expected_bytes_written = orig_backup_files_size; + ASSERT_EQ(expected_bytes_written, + options_.statistics->getTickerCount(BACKUP_WRITE_BYTES)); + // Bytes read is more difficult to pin down since there are reads for many + // purposes other than creating file, like `GetSortedWalFiles()` to find first + // sequence number, or `CreateNewBackup()` thread to find SST file session ID. + // So we loosely require there are at least as many reads as needed for + // copying, but not as many as twice that. 
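(Aside) The tickers asserted on below are ordinary statistics counters; an application reads them the same way from the Statistics object attached to its Options. A minimal sketch (the assertions of this test continue right after):

#include <cstdint>
#include <utility>
#include "rocksdb/statistics.h"

// Sketch: fetch the bytes read and written by backup operations so far.
std::pair<uint64_t, uint64_t> BackupIOCounters(
    ROCKSDB_NAMESPACE::Statistics* stats) {
  return {stats->getTickerCount(ROCKSDB_NAMESPACE::BACKUP_READ_BYTES),
          stats->getTickerCount(ROCKSDB_NAMESPACE::BACKUP_WRITE_BYTES)};
}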
+ ASSERT_GE(options_.statistics->getTickerCount(BACKUP_READ_BYTES), + expected_bytes_written); + ASSERT_LT(expected_bytes_written, + 2 * options_.statistics->getTickerCount(BACKUP_READ_BYTES)); + + FillDB(db_.get(), 100 /* from */, 200 /* to */, kFlushMost); + + ASSERT_OK(options_.statistics->Reset()); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), + false /* flush_before_backup */)); + size_t final_backup_files_size; + ASSERT_OK(GetSizeOfBackupFiles(test_backup_env_->GetFileSystem().get(), + backupdir_, &final_backup_files_size)); + expected_bytes_written = final_backup_files_size - orig_backup_files_size; + ASSERT_EQ(expected_bytes_written, + options_.statistics->getTickerCount(BACKUP_WRITE_BYTES)); + // See above for why these bounds were chosen. + ASSERT_GE(options_.statistics->getTickerCount(BACKUP_READ_BYTES), + expected_bytes_written); + ASSERT_LT(expected_bytes_written, + 2 * options_.statistics->getTickerCount(BACKUP_READ_BYTES)); +} + +TEST_F(BackupEngineTest, FileTemperatures) { + CloseDBAndBackupEngine(); + + // Required for recording+restoring temperatures + engine_options_->schema_version = 2; + + // More file IO instrumentation + auto my_db_fs = std::make_shared(db_chroot_fs_); + test_db_fs_ = std::make_shared(my_db_fs); + SetEnvsFromFileSystems(); + + // Use temperatures + options_.bottommost_temperature = Temperature::kWarm; + options_.level0_file_num_compaction_trigger = 2; + // set dynamic_level to true so the compaction would compact the data to the + // last level directly which will have the last_level_temperature + options_.level_compaction_dynamic_level_bytes = true; + + OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */, + kShareWithChecksum); + + // generate a bottommost file (combined from 2) and a non-bottommost file + DBImpl* dbi = static_cast_with_check(db_.get()); + ASSERT_OK(db_->Put(WriteOptions(), "a", "val")); + ASSERT_OK(db_->Put(WriteOptions(), "c", "val")); + ASSERT_OK(db_->Flush(FlushOptions())); + ASSERT_OK(db_->Put(WriteOptions(), "b", "val")); + ASSERT_OK(db_->Put(WriteOptions(), "d", "val")); + ASSERT_OK(db_->Flush(FlushOptions())); + ASSERT_OK(dbi->TEST_WaitForCompact()); + ASSERT_OK(db_->Put(WriteOptions(), "e", "val")); + ASSERT_OK(db_->Flush(FlushOptions())); + + // Get temperatures from manifest + std::map manifest_temps; + std::map manifest_temp_counts; + { + std::vector infos; + ASSERT_OK( + db_->GetLiveFilesStorageInfo(LiveFilesStorageInfoOptions(), &infos)); + for (auto info : infos) { + if (info.file_type == kTableFile) { + manifest_temps.emplace(info.file_number, info.temperature); + manifest_temp_counts[info.temperature]++; + } + } + } + + // Verify expected manifest temperatures + ASSERT_EQ(manifest_temp_counts.size(), 2); + ASSERT_EQ(manifest_temp_counts[Temperature::kWarm], 1); + ASSERT_EQ(manifest_temp_counts[Temperature::kUnknown], 1); + + // Verify manifest temperatures match FS temperatures + std::map current_temps; + my_db_fs->CopyCurrentSstFileTemperatures(¤t_temps); + for (const auto& manifest_temp : manifest_temps) { + ASSERT_EQ(current_temps[manifest_temp.first], manifest_temp.second); + } + + // Try a few different things + for (int i = 1; i <= 5; ++i) { + // Expected temperatures after restore are based on manifest temperatures + std::map expected_temps = manifest_temps; + + if (i >= 2) { + // For iterations 2 & 3, override current temperature of one file + // and vary which temperature is authoritative (current or manifest). 
+ // For iterations 4 & 5, override current temperature of both files + // but make sure an current temperate always takes precedence over + // unknown regardless of current_temperatures_override_manifest setting. + bool use_current = ((i % 2) == 1); + engine_options_->current_temperatures_override_manifest = use_current; + CloseBackupEngine(); + OpenBackupEngine(); + for (const auto& manifest_temp : manifest_temps) { + if (i <= 3) { + if (manifest_temp.second == Temperature::kWarm) { + my_db_fs->OverrideSstFileTemperature(manifest_temp.first, + Temperature::kCold); + if (use_current) { + expected_temps[manifest_temp.first] = Temperature::kCold; + } + } + } else { + assert(i <= 5); + if (manifest_temp.second == Temperature::kWarm) { + my_db_fs->OverrideSstFileTemperature(manifest_temp.first, + Temperature::kUnknown); + } else { + ASSERT_EQ(manifest_temp.second, Temperature::kUnknown); + my_db_fs->OverrideSstFileTemperature(manifest_temp.first, + Temperature::kHot); + // regardless of use_current + expected_temps[manifest_temp.first] = Temperature::kHot; + } + } + } + } + + // Sample requested temperatures in opening files for backup + my_db_fs->PopRequestedSstFileTemperatures(); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get())); + + // Verify requested temperatures against manifest temperatures (before + // retry with kUnknown if needed, and before backup finds out current + // temperatures in FileSystem) + std::vector> requested_temps; + my_db_fs->PopRequestedSstFileTemperatures(&requested_temps); + std::set distinct_requests; + for (const auto& requested_temp : requested_temps) { + // Matching manifest temperatures, except allow retry request with + // kUnknown + auto manifest_temp = manifest_temps.at(requested_temp.first); + if (manifest_temp == Temperature::kUnknown || + requested_temp.second != Temperature::kUnknown) { + ASSERT_EQ(manifest_temp, requested_temp.second); + } + distinct_requests.insert(requested_temp.first); + } + // Two distinct requests + ASSERT_EQ(distinct_requests.size(), 2); + + // Verify against backup info file details API + BackupInfo info; + ASSERT_OK(backup_engine_->GetLatestBackupInfo( + &info, /*include_file_details*/ true)); + ASSERT_GT(info.file_details.size(), 2); + for (auto& e : info.file_details) { + ASSERT_EQ(expected_temps[e.file_number], e.temperature); + } + + // Restore backup to another virtual (tiered) dir + const std::string restore_dir = "/restore" + std::to_string(i); + ASSERT_OK(backup_engine_->RestoreDBFromLatestBackup( + RestoreOptions(), restore_dir, restore_dir)); + + // Verify restored FS temperatures match expectation + // (FileTemperatureTestFS doesn't distinguish directories when reporting + // current temperatures, just whatever SST was written or overridden last + // with that file number.) 
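(Aside) The temperature handling exercised by this test is opt-in on the backup side. A minimal sketch of the BackupEngineOptions fields involved, with a made-up backup path (the verification of restored temperatures continues right after):

#include "rocksdb/utilities/backup_engine.h"

// Sketch: a backup engine configuration that records SST file temperatures
// and prefers the filesystem-reported temperature over the manifest's.
ROCKSDB_NAMESPACE::BackupEngineOptions TemperatureAwareBackupOptions() {
  ROCKSDB_NAMESPACE::BackupEngineOptions opts("/backups/tiered");
  opts.schema_version = 2;  // this test uses schema 2 to persist temperatures
  opts.current_temperatures_override_manifest = true;
  return opts;
}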
+ my_db_fs->CopyCurrentSstFileTemperatures(¤t_temps); + for (const auto& expected_temp : expected_temps) { + ASSERT_EQ(current_temps[expected_temp.first], expected_temp.second); + } + + // Delete backup to force next backup to copy files + ASSERT_OK(backup_engine_->PurgeOldBackups(0)); + } +} + +} // namespace + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED as BackupEngine is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // !defined(ROCKSDB_LITE) && !defined(OS_WIN) diff --git a/src/rocksdb/utilities/blob_db/blob_compaction_filter.cc b/src/rocksdb/utilities/blob_db/blob_compaction_filter.cc new file mode 100644 index 000000000..86907e979 --- /dev/null +++ b/src/rocksdb/utilities/blob_db/blob_compaction_filter.cc @@ -0,0 +1,490 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "utilities/blob_db/blob_compaction_filter.h" + +#include + +#include "db/dbformat.h" +#include "logging/logging.h" +#include "rocksdb/system_clock.h" +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { +namespace blob_db { + +BlobIndexCompactionFilterBase::~BlobIndexCompactionFilterBase() { + if (blob_file_) { + CloseAndRegisterNewBlobFile(); + } + RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EXPIRED_COUNT, expired_count_); + RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EXPIRED_SIZE, expired_size_); + RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EVICTED_COUNT, evicted_count_); + RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EVICTED_SIZE, evicted_size_); +} + +CompactionFilter::Decision BlobIndexCompactionFilterBase::FilterV2( + int level, const Slice& key, ValueType value_type, const Slice& value, + std::string* new_value, std::string* skip_until) const { + const CompactionFilter* ucf = user_comp_filter(); + if (value_type != kBlobIndex) { + if (ucf == nullptr) { + return Decision::kKeep; + } + // Apply user compaction filter for inlined data. + CompactionFilter::Decision decision = + ucf->FilterV2(level, key, value_type, value, new_value, skip_until); + if (decision == Decision::kChangeValue) { + return HandleValueChange(key, new_value); + } + return decision; + } + BlobIndex blob_index; + Status s = blob_index.DecodeFrom(value); + if (!s.ok()) { + // Unable to decode blob index. Keeping the value. + return Decision::kKeep; + } + if (blob_index.HasTTL() && blob_index.expiration() <= current_time_) { + // Expired + expired_count_++; + expired_size_ += key.size() + value.size(); + return Decision::kRemove; + } + if (!blob_index.IsInlined() && + blob_index.file_number() < context_.next_file_number && + context_.current_blob_files.count(blob_index.file_number()) == 0) { + // Corresponding blob file gone (most likely, evicted by FIFO eviction). + evicted_count_++; + evicted_size_ += key.size() + value.size(); + return Decision::kRemove; + } + if (context_.fifo_eviction_seq > 0 && blob_index.HasTTL() && + blob_index.expiration() < context_.evict_expiration_up_to) { + // Hack: Internal key is passed to BlobIndexCompactionFilter for it to + // get sequence number. 
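For context on the "hack" mentioned above: a RocksDB internal key is the user key followed by an 8-byte trailer packing the sequence number and value type, which is why the filter can recover ikey.sequence below. A simplified, self-contained sketch of that packing (the real helpers live in db/dbformat.h; these names are illustrative):

#include <cstdint>

// Low byte of the trailer holds the value type; the upper 56 bits hold the
// sequence number.
uint64_t PackSequenceAndTypeSketch(uint64_t sequence, uint8_t value_type) {
  return (sequence << 8) | value_type;
}
uint64_t SequenceFromTrailerSketch(uint64_t trailer) { return trailer >> 8; }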
+ ParsedInternalKey ikey; + if (!ParseInternalKey( + key, &ikey, + context_.blob_db_impl->db_options_.allow_data_in_errors) + .ok()) { + assert(false); + return Decision::kKeep; + } + // Remove keys that could have been remove by last FIFO eviction. + // If get error while parsing key, ignore and continue. + if (ikey.sequence < context_.fifo_eviction_seq) { + evicted_count_++; + evicted_size_ += key.size() + value.size(); + return Decision::kRemove; + } + } + // Apply user compaction filter for all non-TTL blob data. + if (ucf != nullptr && !blob_index.HasTTL()) { + // Hack: Internal key is passed to BlobIndexCompactionFilter for it to + // get sequence number. + ParsedInternalKey ikey; + if (!ParseInternalKey( + key, &ikey, + context_.blob_db_impl->db_options_.allow_data_in_errors) + .ok()) { + assert(false); + return Decision::kKeep; + } + // Read value from blob file. + PinnableSlice blob; + CompressionType compression_type = kNoCompression; + constexpr bool need_decompress = true; + if (!ReadBlobFromOldFile(ikey.user_key, blob_index, &blob, need_decompress, + &compression_type)) { + return Decision::kIOError; + } + CompactionFilter::Decision decision = ucf->FilterV2( + level, ikey.user_key, kValue, blob, new_value, skip_until); + if (decision == Decision::kChangeValue) { + return HandleValueChange(ikey.user_key, new_value); + } + return decision; + } + return Decision::kKeep; +} + +CompactionFilter::Decision BlobIndexCompactionFilterBase::HandleValueChange( + const Slice& key, std::string* new_value) const { + BlobDBImpl* const blob_db_impl = context_.blob_db_impl; + assert(blob_db_impl); + + if (new_value->size() < blob_db_impl->bdb_options_.min_blob_size) { + // Keep new_value inlined. + return Decision::kChangeValue; + } + if (!OpenNewBlobFileIfNeeded()) { + return Decision::kIOError; + } + Slice new_blob_value(*new_value); + std::string compression_output; + if (blob_db_impl->bdb_options_.compression != kNoCompression) { + new_blob_value = + blob_db_impl->GetCompressedSlice(new_blob_value, &compression_output); + } + uint64_t new_blob_file_number = 0; + uint64_t new_blob_offset = 0; + if (!WriteBlobToNewFile(key, new_blob_value, &new_blob_file_number, + &new_blob_offset)) { + return Decision::kIOError; + } + if (!CloseAndRegisterNewBlobFileIfNeeded()) { + return Decision::kIOError; + } + BlobIndex::EncodeBlob(new_value, new_blob_file_number, new_blob_offset, + new_blob_value.size(), + blob_db_impl->bdb_options_.compression); + return Decision::kChangeBlobIndex; +} + +BlobIndexCompactionFilterGC::~BlobIndexCompactionFilterGC() { + assert(context().blob_db_impl); + + ROCKS_LOG_INFO(context().blob_db_impl->db_options_.info_log, + "GC pass finished %s: encountered %" PRIu64 " blobs (%" PRIu64 + " bytes), relocated %" PRIu64 " blobs (%" PRIu64 + " bytes), created %" PRIu64 " new blob file(s)", + !gc_stats_.HasError() ? 
"successfully" : "with failure", + gc_stats_.AllBlobs(), gc_stats_.AllBytes(), + gc_stats_.RelocatedBlobs(), gc_stats_.RelocatedBytes(), + gc_stats_.NewFiles()); + + RecordTick(statistics(), BLOB_DB_GC_NUM_KEYS_RELOCATED, + gc_stats_.RelocatedBlobs()); + RecordTick(statistics(), BLOB_DB_GC_BYTES_RELOCATED, + gc_stats_.RelocatedBytes()); + RecordTick(statistics(), BLOB_DB_GC_NUM_NEW_FILES, gc_stats_.NewFiles()); + RecordTick(statistics(), BLOB_DB_GC_FAILURES, gc_stats_.HasError()); +} + +bool BlobIndexCompactionFilterBase::IsBlobFileOpened() const { + if (blob_file_) { + assert(writer_); + return true; + } + return false; +} + +bool BlobIndexCompactionFilterBase::OpenNewBlobFileIfNeeded() const { + if (IsBlobFileOpened()) { + return true; + } + + BlobDBImpl* const blob_db_impl = context_.blob_db_impl; + assert(blob_db_impl); + + const Status s = blob_db_impl->CreateBlobFileAndWriter( + /* has_ttl */ false, ExpirationRange(), "compaction/GC", &blob_file_, + &writer_); + if (!s.ok()) { + ROCKS_LOG_ERROR( + blob_db_impl->db_options_.info_log, + "Error opening new blob file during compaction/GC, status: %s", + s.ToString().c_str()); + blob_file_.reset(); + writer_.reset(); + return false; + } + + assert(blob_file_); + assert(writer_); + + return true; +} + +bool BlobIndexCompactionFilterBase::ReadBlobFromOldFile( + const Slice& key, const BlobIndex& blob_index, PinnableSlice* blob, + bool need_decompress, CompressionType* compression_type) const { + BlobDBImpl* const blob_db_impl = context_.blob_db_impl; + assert(blob_db_impl); + + Status s = blob_db_impl->GetRawBlobFromFile( + key, blob_index.file_number(), blob_index.offset(), blob_index.size(), + blob, compression_type); + + if (!s.ok()) { + ROCKS_LOG_ERROR( + blob_db_impl->db_options_.info_log, + "Error reading blob during compaction/GC, key: %s (%s), status: %s", + key.ToString(/* output_hex */ true).c_str(), + blob_index.DebugString(/* output_hex */ true).c_str(), + s.ToString().c_str()); + + return false; + } + + if (need_decompress && *compression_type != kNoCompression) { + s = blob_db_impl->DecompressSlice(*blob, *compression_type, blob); + if (!s.ok()) { + ROCKS_LOG_ERROR( + blob_db_impl->db_options_.info_log, + "Uncompression error during blob read from file: %" PRIu64 + " blob_offset: %" PRIu64 " blob_size: %" PRIu64 + " key: %s status: '%s'", + blob_index.file_number(), blob_index.offset(), blob_index.size(), + key.ToString(/* output_hex */ true).c_str(), s.ToString().c_str()); + + return false; + } + } + + return true; +} + +bool BlobIndexCompactionFilterBase::WriteBlobToNewFile( + const Slice& key, const Slice& blob, uint64_t* new_blob_file_number, + uint64_t* new_blob_offset) const { + TEST_SYNC_POINT("BlobIndexCompactionFilterBase::WriteBlobToNewFile"); + assert(new_blob_file_number); + assert(new_blob_offset); + + assert(blob_file_); + *new_blob_file_number = blob_file_->BlobFileNumber(); + + assert(writer_); + uint64_t new_key_offset = 0; + const Status s = writer_->AddRecord(key, blob, kNoExpiration, &new_key_offset, + new_blob_offset); + + if (!s.ok()) { + const BlobDBImpl* const blob_db_impl = context_.blob_db_impl; + assert(blob_db_impl); + + ROCKS_LOG_ERROR(blob_db_impl->db_options_.info_log, + "Error writing blob to new file %s during compaction/GC, " + "key: %s, status: %s", + blob_file_->PathName().c_str(), + key.ToString(/* output_hex */ true).c_str(), + s.ToString().c_str()); + return false; + } + + const uint64_t new_size = + BlobLogRecord::kHeaderSize + key.size() + blob.size(); + 
blob_file_->BlobRecordAdded(new_size); + + BlobDBImpl* const blob_db_impl = context_.blob_db_impl; + assert(blob_db_impl); + + blob_db_impl->total_blob_size_ += new_size; + + return true; +} + +bool BlobIndexCompactionFilterBase::CloseAndRegisterNewBlobFileIfNeeded() + const { + const BlobDBImpl* const blob_db_impl = context_.blob_db_impl; + assert(blob_db_impl); + + assert(blob_file_); + if (blob_file_->GetFileSize() < blob_db_impl->bdb_options_.blob_file_size) { + return true; + } + + return CloseAndRegisterNewBlobFile(); +} + +bool BlobIndexCompactionFilterBase::CloseAndRegisterNewBlobFile() const { + BlobDBImpl* const blob_db_impl = context_.blob_db_impl; + assert(blob_db_impl); + assert(blob_file_); + + Status s; + + { + WriteLock wl(&blob_db_impl->mutex_); + + s = blob_db_impl->CloseBlobFile(blob_file_); + + // Note: we delay registering the new blob file until it's closed to + // prevent FIFO eviction from processing it during compaction/GC. + blob_db_impl->RegisterBlobFile(blob_file_); + } + + assert(blob_file_->Immutable()); + + if (!s.ok()) { + ROCKS_LOG_ERROR( + blob_db_impl->db_options_.info_log, + "Error closing new blob file %s during compaction/GC, status: %s", + blob_file_->PathName().c_str(), s.ToString().c_str()); + } + + blob_file_.reset(); + return s.ok(); +} + +CompactionFilter::BlobDecision BlobIndexCompactionFilterGC::PrepareBlobOutput( + const Slice& key, const Slice& existing_value, + std::string* new_value) const { + assert(new_value); + + const BlobDBImpl* const blob_db_impl = context().blob_db_impl; + (void)blob_db_impl; + + assert(blob_db_impl); + assert(blob_db_impl->bdb_options_.enable_garbage_collection); + + BlobIndex blob_index; + const Status s = blob_index.DecodeFrom(existing_value); + if (!s.ok()) { + gc_stats_.SetError(); + return BlobDecision::kCorruption; + } + + if (blob_index.IsInlined()) { + gc_stats_.AddBlob(blob_index.value().size()); + + return BlobDecision::kKeep; + } + + gc_stats_.AddBlob(blob_index.size()); + + if (blob_index.HasTTL()) { + return BlobDecision::kKeep; + } + + if (blob_index.file_number() >= context_gc_.cutoff_file_number) { + return BlobDecision::kKeep; + } + + // Note: each compaction generates its own blob files, which, depending on the + // workload, might result in many small blob files. The total number of files + // is bounded though (determined by the number of compactions and the blob + // file size option). + if (!OpenNewBlobFileIfNeeded()) { + gc_stats_.SetError(); + return BlobDecision::kIOError; + } + + PinnableSlice blob; + CompressionType compression_type = kNoCompression; + std::string compression_output; + if (!ReadBlobFromOldFile(key, blob_index, &blob, false, &compression_type)) { + gc_stats_.SetError(); + return BlobDecision::kIOError; + } + + // If the compression_type is changed, re-compress it with the new compression + // type. 
+ if (compression_type != blob_db_impl->bdb_options_.compression) { + if (compression_type != kNoCompression) { + const Status status = + blob_db_impl->DecompressSlice(blob, compression_type, &blob); + if (!status.ok()) { + gc_stats_.SetError(); + return BlobDecision::kCorruption; + } + } + if (blob_db_impl->bdb_options_.compression != kNoCompression) { + blob_db_impl->GetCompressedSlice(blob, &compression_output); + blob = PinnableSlice(&compression_output); + blob.PinSelf(); + } + } + + uint64_t new_blob_file_number = 0; + uint64_t new_blob_offset = 0; + if (!WriteBlobToNewFile(key, blob, &new_blob_file_number, &new_blob_offset)) { + gc_stats_.SetError(); + return BlobDecision::kIOError; + } + + if (!CloseAndRegisterNewBlobFileIfNeeded()) { + gc_stats_.SetError(); + return BlobDecision::kIOError; + } + + BlobIndex::EncodeBlob(new_value, new_blob_file_number, new_blob_offset, + blob.size(), compression_type); + + gc_stats_.AddRelocatedBlob(blob_index.size()); + + return BlobDecision::kChangeValue; +} + +bool BlobIndexCompactionFilterGC::OpenNewBlobFileIfNeeded() const { + if (IsBlobFileOpened()) { + return true; + } + bool result = BlobIndexCompactionFilterBase::OpenNewBlobFileIfNeeded(); + if (result) { + gc_stats_.AddNewFile(); + } + return result; +} + +std::unique_ptr +BlobIndexCompactionFilterFactoryBase::CreateUserCompactionFilterFromFactory( + const CompactionFilter::Context& context) const { + std::unique_ptr user_comp_filter_from_factory; + if (user_comp_filter_factory_) { + user_comp_filter_from_factory = + user_comp_filter_factory_->CreateCompactionFilter(context); + } + return user_comp_filter_from_factory; +} + +std::unique_ptr +BlobIndexCompactionFilterFactory::CreateCompactionFilter( + const CompactionFilter::Context& _context) { + assert(clock()); + + int64_t current_time = 0; + Status s = clock()->GetCurrentTime(¤t_time); + if (!s.ok()) { + return nullptr; + } + assert(current_time >= 0); + + assert(blob_db_impl()); + + BlobCompactionContext context; + blob_db_impl()->GetCompactionContext(&context); + + std::unique_ptr user_comp_filter_from_factory = + CreateUserCompactionFilterFromFactory(_context); + + return std::unique_ptr(new BlobIndexCompactionFilter( + std::move(context), user_comp_filter(), + std::move(user_comp_filter_from_factory), current_time, statistics())); +} + +std::unique_ptr +BlobIndexCompactionFilterFactoryGC::CreateCompactionFilter( + const CompactionFilter::Context& _context) { + assert(clock()); + + int64_t current_time = 0; + Status s = clock()->GetCurrentTime(¤t_time); + if (!s.ok()) { + return nullptr; + } + assert(current_time >= 0); + + assert(blob_db_impl()); + + BlobCompactionContext context; + BlobCompactionContextGC context_gc; + blob_db_impl()->GetCompactionContext(&context, &context_gc); + + std::unique_ptr user_comp_filter_from_factory = + CreateUserCompactionFilterFromFactory(_context); + + return std::unique_ptr(new BlobIndexCompactionFilterGC( + std::move(context), std::move(context_gc), user_comp_filter(), + std::move(user_comp_filter_from_factory), current_time, statistics())); +} + +} // namespace blob_db +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/blob_db/blob_compaction_filter.h b/src/rocksdb/utilities/blob_db/blob_compaction_filter.h new file mode 100644 index 000000000..1493cfc1a --- /dev/null +++ b/src/rocksdb/utilities/blob_db/blob_compaction_filter.h @@ -0,0 +1,204 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once +#ifndef ROCKSDB_LITE + +#include + +#include "db/blob/blob_index.h" +#include "monitoring/statistics.h" +#include "rocksdb/compaction_filter.h" +#include "utilities/blob_db/blob_db_gc_stats.h" +#include "utilities/blob_db/blob_db_impl.h" +#include "utilities/compaction_filters/layered_compaction_filter_base.h" + +namespace ROCKSDB_NAMESPACE { +class SystemClock; +namespace blob_db { + +struct BlobCompactionContext { + BlobDBImpl* blob_db_impl = nullptr; + uint64_t next_file_number = 0; + std::unordered_set current_blob_files; + SequenceNumber fifo_eviction_seq = 0; + uint64_t evict_expiration_up_to = 0; +}; + +struct BlobCompactionContextGC { + uint64_t cutoff_file_number = 0; +}; + +// Compaction filter that deletes expired blob indexes from the base DB. +// Comes into two varieties, one for the non-GC case and one for the GC case. +class BlobIndexCompactionFilterBase : public LayeredCompactionFilterBase { + public: + BlobIndexCompactionFilterBase( + BlobCompactionContext&& _context, + const CompactionFilter* _user_comp_filter, + std::unique_ptr _user_comp_filter_from_factory, + uint64_t current_time, Statistics* stats) + : LayeredCompactionFilterBase(_user_comp_filter, + std::move(_user_comp_filter_from_factory)), + context_(std::move(_context)), + current_time_(current_time), + statistics_(stats) {} + + ~BlobIndexCompactionFilterBase() override; + + // Filter expired blob indexes regardless of snapshots. + bool IgnoreSnapshots() const override { return true; } + + Decision FilterV2(int level, const Slice& key, ValueType value_type, + const Slice& value, std::string* new_value, + std::string* skip_until) const override; + + bool IsStackedBlobDbInternalCompactionFilter() const override { return true; } + + protected: + bool IsBlobFileOpened() const; + virtual bool OpenNewBlobFileIfNeeded() const; + bool ReadBlobFromOldFile(const Slice& key, const BlobIndex& blob_index, + PinnableSlice* blob, bool need_decompress, + CompressionType* compression_type) const; + bool WriteBlobToNewFile(const Slice& key, const Slice& blob, + uint64_t* new_blob_file_number, + uint64_t* new_blob_offset) const; + bool CloseAndRegisterNewBlobFileIfNeeded() const; + bool CloseAndRegisterNewBlobFile() const; + + Statistics* statistics() const { return statistics_; } + const BlobCompactionContext& context() const { return context_; } + + private: + Decision HandleValueChange(const Slice& key, std::string* new_value) const; + + private: + BlobCompactionContext context_; + const uint64_t current_time_; + Statistics* statistics_; + + mutable std::shared_ptr blob_file_; + mutable std::shared_ptr writer_; + + // It is safe to not using std::atomic since the compaction filter, created + // from a compaction filter factroy, will not be called from multiple threads. 
+ mutable uint64_t expired_count_ = 0; + mutable uint64_t expired_size_ = 0; + mutable uint64_t evicted_count_ = 0; + mutable uint64_t evicted_size_ = 0; +}; + +class BlobIndexCompactionFilter : public BlobIndexCompactionFilterBase { + public: + BlobIndexCompactionFilter( + BlobCompactionContext&& _context, + const CompactionFilter* _user_comp_filter, + std::unique_ptr _user_comp_filter_from_factory, + uint64_t current_time, Statistics* stats) + : BlobIndexCompactionFilterBase(std::move(_context), _user_comp_filter, + std::move(_user_comp_filter_from_factory), + current_time, stats) {} + + const char* Name() const override { return "BlobIndexCompactionFilter"; } +}; + +class BlobIndexCompactionFilterGC : public BlobIndexCompactionFilterBase { + public: + BlobIndexCompactionFilterGC( + BlobCompactionContext&& _context, BlobCompactionContextGC&& context_gc, + const CompactionFilter* _user_comp_filter, + std::unique_ptr _user_comp_filter_from_factory, + uint64_t current_time, Statistics* stats) + : BlobIndexCompactionFilterBase(std::move(_context), _user_comp_filter, + std::move(_user_comp_filter_from_factory), + current_time, stats), + context_gc_(std::move(context_gc)) {} + + ~BlobIndexCompactionFilterGC() override; + + const char* Name() const override { return "BlobIndexCompactionFilterGC"; } + + BlobDecision PrepareBlobOutput(const Slice& key, const Slice& existing_value, + std::string* new_value) const override; + + private: + bool OpenNewBlobFileIfNeeded() const override; + + private: + BlobCompactionContextGC context_gc_; + mutable BlobDBGarbageCollectionStats gc_stats_; +}; + +// Compaction filter factory; similarly to the filters above, it comes +// in two flavors, one that creates filters that support GC, and one +// that creates non-GC filters. 
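As general background for the factory classes declared below: a CompactionFilterFactory is installed through ColumnFamilyOptions::compaction_filter_factory and is asked to create a fresh filter for each (sub)compaction. A minimal, hypothetical user-side example of that pattern (MyFilterFactory is not one of the classes in this file):

#include <memory>
#include "rocksdb/compaction_filter.h"
#include "rocksdb/options.h"

class MyFilterFactory : public ROCKSDB_NAMESPACE::CompactionFilterFactory {
 public:
  std::unique_ptr<ROCKSDB_NAMESPACE::CompactionFilter> CreateCompactionFilter(
      const ROCKSDB_NAMESPACE::CompactionFilter::Context& /*context*/) override {
    return nullptr;  // returning nullptr means "do not filter" for this compaction
  }
  const char* Name() const override { return "MyFilterFactory"; }
};

// Installation (sketch):
//   ROCKSDB_NAMESPACE::ColumnFamilyOptions cf_opts;
//   cf_opts.compaction_filter_factory = std::make_shared<MyFilterFactory>();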
+class BlobIndexCompactionFilterFactoryBase : public CompactionFilterFactory { + public: + BlobIndexCompactionFilterFactoryBase(BlobDBImpl* _blob_db_impl, + SystemClock* _clock, + const ColumnFamilyOptions& _cf_options, + Statistics* _statistics) + : blob_db_impl_(_blob_db_impl), + clock_(_clock), + statistics_(_statistics), + user_comp_filter_(_cf_options.compaction_filter), + user_comp_filter_factory_(_cf_options.compaction_filter_factory) {} + + protected: + std::unique_ptr CreateUserCompactionFilterFromFactory( + const CompactionFilter::Context& context) const; + + BlobDBImpl* blob_db_impl() const { return blob_db_impl_; } + SystemClock* clock() const { return clock_; } + Statistics* statistics() const { return statistics_; } + const CompactionFilter* user_comp_filter() const { return user_comp_filter_; } + + private: + BlobDBImpl* blob_db_impl_; + SystemClock* clock_; + Statistics* statistics_; + const CompactionFilter* user_comp_filter_; + std::shared_ptr user_comp_filter_factory_; +}; + +class BlobIndexCompactionFilterFactory + : public BlobIndexCompactionFilterFactoryBase { + public: + BlobIndexCompactionFilterFactory(BlobDBImpl* _blob_db_impl, + SystemClock* _clock, + const ColumnFamilyOptions& _cf_options, + Statistics* _statistics) + : BlobIndexCompactionFilterFactoryBase(_blob_db_impl, _clock, _cf_options, + _statistics) {} + + const char* Name() const override { + return "BlobIndexCompactionFilterFactory"; + } + + std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) override; +}; + +class BlobIndexCompactionFilterFactoryGC + : public BlobIndexCompactionFilterFactoryBase { + public: + BlobIndexCompactionFilterFactoryGC(BlobDBImpl* _blob_db_impl, + SystemClock* _clock, + const ColumnFamilyOptions& _cf_options, + Statistics* _statistics) + : BlobIndexCompactionFilterFactoryBase(_blob_db_impl, _clock, _cf_options, + _statistics) {} + + const char* Name() const override { + return "BlobIndexCompactionFilterFactoryGC"; + } + + std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) override; +}; + +} // namespace blob_db +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/blob_db/blob_db.cc b/src/rocksdb/utilities/blob_db/blob_db.cc new file mode 100644 index 000000000..cbd02e68e --- /dev/null +++ b/src/rocksdb/utilities/blob_db/blob_db.cc @@ -0,0 +1,114 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// +#ifndef ROCKSDB_LITE + +#include "utilities/blob_db/blob_db.h" + +#include + +#include "logging/logging.h" +#include "utilities/blob_db/blob_db_impl.h" + +namespace ROCKSDB_NAMESPACE { +namespace blob_db { + +Status BlobDB::Open(const Options& options, const BlobDBOptions& bdb_options, + const std::string& dbname, BlobDB** blob_db) { + *blob_db = nullptr; + DBOptions db_options(options); + ColumnFamilyOptions cf_options(options); + std::vector column_families; + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + std::vector handles; + Status s = BlobDB::Open(db_options, bdb_options, dbname, column_families, + &handles, blob_db); + if (s.ok()) { + assert(handles.size() == 1); + // i can delete the handle since DBImpl is always holding a reference to + // default column family + delete handles[0]; + } + return s; +} + +Status BlobDB::Open(const DBOptions& db_options, + const BlobDBOptions& bdb_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, + BlobDB** blob_db) { + assert(handles); + + if (column_families.size() != 1 || + column_families[0].name != kDefaultColumnFamilyName) { + return Status::NotSupported( + "Blob DB doesn't support non-default column family."); + } + + BlobDBImpl* blob_db_impl = new BlobDBImpl(dbname, bdb_options, db_options, + column_families[0].options); + Status s = blob_db_impl->Open(handles); + if (s.ok()) { + *blob_db = static_cast(blob_db_impl); + } else { + if (!handles->empty()) { + for (ColumnFamilyHandle* cfh : *handles) { + blob_db_impl->DestroyColumnFamilyHandle(cfh); + } + + handles->clear(); + } + + delete blob_db_impl; + *blob_db = nullptr; + } + return s; +} + +BlobDB::BlobDB() : StackableDB(nullptr) {} + +void BlobDBOptions::Dump(Logger* log) const { + ROCKS_LOG_HEADER( + log, " BlobDBOptions.blob_dir: %s", + blob_dir.c_str()); + ROCKS_LOG_HEADER( + log, " BlobDBOptions.path_relative: %d", + path_relative); + ROCKS_LOG_HEADER( + log, " BlobDBOptions.is_fifo: %d", + is_fifo); + ROCKS_LOG_HEADER( + log, " BlobDBOptions.max_db_size: %" PRIu64, + max_db_size); + ROCKS_LOG_HEADER( + log, " BlobDBOptions.ttl_range_secs: %" PRIu64, + ttl_range_secs); + ROCKS_LOG_HEADER( + log, " BlobDBOptions.min_blob_size: %" PRIu64, + min_blob_size); + ROCKS_LOG_HEADER( + log, " BlobDBOptions.bytes_per_sync: %" PRIu64, + bytes_per_sync); + ROCKS_LOG_HEADER( + log, " BlobDBOptions.blob_file_size: %" PRIu64, + blob_file_size); + ROCKS_LOG_HEADER( + log, " BlobDBOptions.compression: %d", + static_cast(compression)); + ROCKS_LOG_HEADER( + log, " BlobDBOptions.enable_garbage_collection: %d", + enable_garbage_collection); + ROCKS_LOG_HEADER( + log, " BlobDBOptions.garbage_collection_cutoff: %f", + garbage_collection_cutoff); + ROCKS_LOG_HEADER( + log, " BlobDBOptions.disable_background_tasks: %d", + disable_background_tasks); +} + +} // namespace blob_db +} // namespace ROCKSDB_NAMESPACE +#endif diff --git a/src/rocksdb/utilities/blob_db/blob_db.h b/src/rocksdb/utilities/blob_db/blob_db.h new file mode 100644 index 000000000..e9d92486f --- /dev/null +++ b/src/rocksdb/utilities/blob_db/blob_db.h @@ -0,0 +1,266 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/status.h" +#include "rocksdb/utilities/stackable_db.h" + +namespace ROCKSDB_NAMESPACE { + +namespace blob_db { + +// A wrapped database which puts values of KV pairs in a separate log +// and store location to the log in the underlying DB. +// +// The factory needs to be moved to include/rocksdb/utilities to allow +// users to use blob DB. + +constexpr uint64_t kNoExpiration = std::numeric_limits::max(); + +struct BlobDBOptions { + // Name of the directory under the base DB where blobs will be stored. Using + // a directory where the base DB stores its SST files is not supported. + // Default is "blob_dir" + std::string blob_dir = "blob_dir"; + + // whether the blob_dir path is relative or absolute. + bool path_relative = true; + + // When max_db_size is reached, evict blob files to free up space + // instead of returnning NoSpace error on write. Blob files will be + // evicted from oldest to newest, based on file creation time. + bool is_fifo = false; + + // Maximum size of the database (including SST files and blob files). + // + // Default: 0 (no limits) + uint64_t max_db_size = 0; + + // a new bucket is opened, for ttl_range. So if ttl_range is 600seconds + // (10 minutes), and the first bucket starts at 1471542000 + // then the blob buckets will be + // first bucket is 1471542000 - 1471542600 + // second bucket is 1471542600 - 1471543200 + // and so on + uint64_t ttl_range_secs = 3600; + + // The smallest value to store in blob log. Values smaller than this threshold + // will be inlined in base DB together with the key. + uint64_t min_blob_size = 0; + + // Allows OS to incrementally sync blob files to disk for every + // bytes_per_sync bytes written. Users shouldn't rely on it for + // persistency guarantee. + uint64_t bytes_per_sync = 512 * 1024; + + // the target size of each blob file. File will become immutable + // after it exceeds that size + uint64_t blob_file_size = 256 * 1024 * 1024; + + // what compression to use for Blob's + CompressionType compression = kNoCompression; + + // If enabled, BlobDB cleans up stale blobs in non-TTL files during compaction + // by rewriting the remaining live blobs to new files. + bool enable_garbage_collection = false; + + // The cutoff in terms of blob file age for garbage collection. Blobs in + // the oldest N non-TTL blob files will be rewritten when encountered during + // compaction, where N = garbage_collection_cutoff * number_of_non_TTL_files. + double garbage_collection_cutoff = 0.25; + + // Disable all background job. Used for test only. 
+ bool disable_background_tasks = false; + + void Dump(Logger* log) const; +}; + +class BlobDB : public StackableDB { + public: + using ROCKSDB_NAMESPACE::StackableDB::Put; + virtual Status Put(const WriteOptions& options, const Slice& key, + const Slice& value) override = 0; + virtual Status Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override { + if (column_family->GetID() != DefaultColumnFamily()->GetID()) { + return Status::NotSupported( + "Blob DB doesn't support non-default column family."); + } + return Put(options, key, value); + } + + using ROCKSDB_NAMESPACE::StackableDB::Delete; + virtual Status Delete(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key) override { + if (column_family->GetID() != DefaultColumnFamily()->GetID()) { + return Status::NotSupported( + "Blob DB doesn't support non-default column family."); + } + assert(db_ != nullptr); + return db_->Delete(options, column_family, key); + } + + virtual Status PutWithTTL(const WriteOptions& options, const Slice& key, + const Slice& value, uint64_t ttl) = 0; + virtual Status PutWithTTL(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, uint64_t ttl) { + if (column_family->GetID() != DefaultColumnFamily()->GetID()) { + return Status::NotSupported( + "Blob DB doesn't support non-default column family."); + } + return PutWithTTL(options, key, value, ttl); + } + + // Put with expiration. Key with expiration time equal to + // std::numeric_limits::max() means the key don't expire. + virtual Status PutUntil(const WriteOptions& options, const Slice& key, + const Slice& value, uint64_t expiration) = 0; + virtual Status PutUntil(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, uint64_t expiration) { + if (column_family->GetID() != DefaultColumnFamily()->GetID()) { + return Status::NotSupported( + "Blob DB doesn't support non-default column family."); + } + return PutUntil(options, key, value, expiration); + } + + using ROCKSDB_NAMESPACE::StackableDB::Get; + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) override = 0; + + // Get value and expiration. 
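Tying the TTL write API above to the expiration-aware Get() overloads declared just below, typical usage looks like the following sketch (assumes an already-open BlobDB*; keys, values, and timestamps are made up):

// Sketch only; the types come from this header.
void TtlRoundTripSketch(ROCKSDB_NAMESPACE::blob_db::BlobDB* blob_db) {
  ROCKSDB_NAMESPACE::WriteOptions wo;
  // Relative TTL in seconds...
  ROCKSDB_NAMESPACE::Status s =
      blob_db->PutWithTTL(wo, "session:1", "payload", /*ttl=*/600);
  // ...or an absolute expiration timestamp (seconds since the epoch).
  s = blob_db->PutUntil(wo, "session:2", "payload", /*expiration=*/1900000000);

  ROCKSDB_NAMESPACE::PinnableSlice value;
  uint64_t expiration = 0;
  s = blob_db->Get(ROCKSDB_NAMESPACE::ReadOptions(), "session:1", &value,
                   &expiration);
  // For keys written without a TTL, expiration is expected to be kNoExpiration.
  (void)s;
}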
+ virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value, uint64_t* expiration) = 0; + virtual Status Get(const ReadOptions& options, const Slice& key, + PinnableSlice* value, uint64_t* expiration) { + return Get(options, DefaultColumnFamily(), key, value, expiration); + } + + using ROCKSDB_NAMESPACE::StackableDB::MultiGet; + virtual std::vector MultiGet( + const ReadOptions& options, const std::vector& keys, + std::vector* values) override = 0; + virtual std::vector MultiGet( + const ReadOptions& options, + const std::vector& column_families, + const std::vector& keys, + std::vector* values) override { + for (auto column_family : column_families) { + if (column_family->GetID() != DefaultColumnFamily()->GetID()) { + return std::vector( + column_families.size(), + Status::NotSupported( + "Blob DB doesn't support non-default column family.")); + } + } + return MultiGet(options, keys, values); + } + virtual void MultiGet(const ReadOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const size_t num_keys, const Slice* /*keys*/, + PinnableSlice* /*values*/, Status* statuses, + const bool /*sorted_input*/ = false) override { + for (size_t i = 0; i < num_keys; ++i) { + statuses[i] = + Status::NotSupported("Blob DB doesn't support batched MultiGet"); + } + } + + using ROCKSDB_NAMESPACE::StackableDB::SingleDelete; + virtual Status SingleDelete(const WriteOptions& /*wopts*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { + return Status::NotSupported("Not supported operation in blob db."); + } + + using ROCKSDB_NAMESPACE::StackableDB::Merge; + virtual Status Merge(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, const Slice& /*value*/) override { + return Status::NotSupported("Not supported operation in blob db."); + } + + virtual Status Write(const WriteOptions& opts, + WriteBatch* updates) override = 0; + + using ROCKSDB_NAMESPACE::StackableDB::NewIterator; + virtual Iterator* NewIterator(const ReadOptions& options) override = 0; + virtual Iterator* NewIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family) override { + if (column_family->GetID() != DefaultColumnFamily()->GetID()) { + // Blob DB doesn't support non-default column family. + return nullptr; + } + return NewIterator(options); + } + + Status CompactFiles( + const CompactionOptions& compact_options, + const std::vector& input_file_names, const int output_level, + const int output_path_id = -1, + std::vector* const output_file_names = nullptr, + CompactionJobInfo* compaction_job_info = nullptr) override = 0; + Status CompactFiles( + const CompactionOptions& compact_options, + ColumnFamilyHandle* column_family, + const std::vector& input_file_names, const int output_level, + const int output_path_id = -1, + std::vector* const output_file_names = nullptr, + CompactionJobInfo* compaction_job_info = nullptr) override { + if (column_family->GetID() != DefaultColumnFamily()->GetID()) { + return Status::NotSupported( + "Blob DB doesn't support non-default column family."); + } + + return CompactFiles(compact_options, input_file_names, output_level, + output_path_id, output_file_names, compaction_job_info); + } + + using ROCKSDB_NAMESPACE::StackableDB::Close; + virtual Status Close() override = 0; + + // Opening blob db. 
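A minimal sketch of using the first Open() overload declared just below (the directory name and option values are illustrative, not recommendations):

// Sketch: open a stacked BlobDB on the default column family, write one key,
// and close it again. "/tmp/blob_db_example" is a made-up path.
void OpenBlobDbSketch() {
  ROCKSDB_NAMESPACE::Options options;
  options.create_if_missing = true;
  ROCKSDB_NAMESPACE::blob_db::BlobDBOptions bdb_options;
  bdb_options.min_blob_size = 1024;          // inline values smaller than 1 KiB
  bdb_options.blob_file_size = 64ULL << 20;  // roll blob files at 64 MiB
  ROCKSDB_NAMESPACE::blob_db::BlobDB* blob_db = nullptr;
  ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::blob_db::BlobDB::Open(
      options, bdb_options, "/tmp/blob_db_example", &blob_db);
  if (s.ok()) {
    s = blob_db->Put(ROCKSDB_NAMESPACE::WriteOptions(), "key", "value");
    delete blob_db;  // also closes and releases the underlying base DB
  }
}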
+ static Status Open(const Options& options, const BlobDBOptions& bdb_options, + const std::string& dbname, BlobDB** blob_db); + + static Status Open(const DBOptions& db_options, + const BlobDBOptions& bdb_options, + const std::string& dbname, + const std::vector& column_families, + std::vector* handles, + BlobDB** blob_db); + + virtual BlobDBOptions GetBlobDBOptions() const = 0; + + virtual Status SyncBlobFiles() = 0; + + virtual ~BlobDB() {} + + protected: + explicit BlobDB(); +}; + +// Destroy the content of the database. +Status DestroyBlobDB(const std::string& dbname, const Options& options, + const BlobDBOptions& bdb_options); + +} // namespace blob_db +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/blob_db/blob_db_gc_stats.h b/src/rocksdb/utilities/blob_db/blob_db_gc_stats.h new file mode 100644 index 000000000..fea6b0032 --- /dev/null +++ b/src/rocksdb/utilities/blob_db/blob_db_gc_stats.h @@ -0,0 +1,56 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once + +#include + +#include "rocksdb/rocksdb_namespace.h" + +#ifndef ROCKSDB_LITE + +namespace ROCKSDB_NAMESPACE { + +namespace blob_db { + +/** + * Statistics related to a single garbage collection pass (i.e. a single + * (sub)compaction). + */ +class BlobDBGarbageCollectionStats { + public: + uint64_t AllBlobs() const { return all_blobs_; } + uint64_t AllBytes() const { return all_bytes_; } + uint64_t RelocatedBlobs() const { return relocated_blobs_; } + uint64_t RelocatedBytes() const { return relocated_bytes_; } + uint64_t NewFiles() const { return new_files_; } + bool HasError() const { return error_; } + + void AddBlob(uint64_t size) { + ++all_blobs_; + all_bytes_ += size; + } + + void AddRelocatedBlob(uint64_t size) { + ++relocated_blobs_; + relocated_bytes_ += size; + } + + void AddNewFile() { ++new_files_; } + + void SetError() { error_ = true; } + + private: + uint64_t all_blobs_ = 0; + uint64_t all_bytes_ = 0; + uint64_t relocated_blobs_ = 0; + uint64_t relocated_bytes_ = 0; + uint64_t new_files_ = 0; + bool error_ = false; +}; + +} // namespace blob_db +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/blob_db/blob_db_impl.cc b/src/rocksdb/utilities/blob_db/blob_db_impl.cc new file mode 100644 index 000000000..87e294c5c --- /dev/null +++ b/src/rocksdb/utilities/blob_db/blob_db_impl.cc @@ -0,0 +1,2177 @@ + +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+#ifndef ROCKSDB_LITE + +#include "utilities/blob_db/blob_db_impl.h" + +#include +#include +#include +#include +#include + +#include "db/blob/blob_index.h" +#include "db/db_impl/db_impl.h" +#include "db/write_batch_internal.h" +#include "file/file_util.h" +#include "file/filename.h" +#include "file/random_access_file_reader.h" +#include "file/sst_file_manager_impl.h" +#include "file/writable_file_writer.h" +#include "logging/logging.h" +#include "monitoring/instrumented_mutex.h" +#include "monitoring/statistics.h" +#include "rocksdb/convenience.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/utilities/stackable_db.h" +#include "rocksdb/utilities/transaction.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_builder.h" +#include "table/block_based/block_builder.h" +#include "table/meta_blocks.h" +#include "test_util/sync_point.h" +#include "util/cast_util.h" +#include "util/crc32c.h" +#include "util/mutexlock.h" +#include "util/random.h" +#include "util/stop_watch.h" +#include "util/timer_queue.h" +#include "utilities/blob_db/blob_compaction_filter.h" +#include "utilities/blob_db/blob_db_iterator.h" +#include "utilities/blob_db/blob_db_listener.h" + +namespace { +int kBlockBasedTableVersionFormat = 2; +} // end namespace + +namespace ROCKSDB_NAMESPACE { +namespace blob_db { + +bool BlobFileComparator::operator()( + const std::shared_ptr& lhs, + const std::shared_ptr& rhs) const { + return lhs->BlobFileNumber() > rhs->BlobFileNumber(); +} + +bool BlobFileComparatorTTL::operator()( + const std::shared_ptr& lhs, + const std::shared_ptr& rhs) const { + assert(lhs->HasTTL() && rhs->HasTTL()); + if (lhs->expiration_range_.first < rhs->expiration_range_.first) { + return true; + } + if (lhs->expiration_range_.first > rhs->expiration_range_.first) { + return false; + } + return lhs->BlobFileNumber() < rhs->BlobFileNumber(); +} + +BlobDBImpl::BlobDBImpl(const std::string& dbname, + const BlobDBOptions& blob_db_options, + const DBOptions& db_options, + const ColumnFamilyOptions& cf_options) + : BlobDB(), + dbname_(dbname), + db_impl_(nullptr), + env_(db_options.env), + bdb_options_(blob_db_options), + db_options_(db_options), + cf_options_(cf_options), + file_options_(db_options), + statistics_(db_options_.statistics.get()), + next_file_number_(1), + flush_sequence_(0), + closed_(true), + open_file_count_(0), + total_blob_size_(0), + live_sst_size_(0), + fifo_eviction_seq_(0), + evict_expiration_up_to_(0), + debug_level_(0) { + clock_ = env_->GetSystemClock().get(); + blob_dir_ = (bdb_options_.path_relative) + ? dbname + "/" + bdb_options_.blob_dir + : bdb_options_.blob_dir; + file_options_.bytes_per_sync = blob_db_options.bytes_per_sync; +} + +BlobDBImpl::~BlobDBImpl() { + tqueue_.shutdown(); + // CancelAllBackgroundWork(db_, true); + Status s __attribute__((__unused__)) = Close(); + assert(s.ok()); +} + +Status BlobDBImpl::Close() { + if (closed_) { + return Status::OK(); + } + closed_ = true; + + // Close base DB before BlobDBImpl destructs to stop event listener and + // compaction filter call. + Status s = db_->Close(); + // delete db_ anyway even if close failed. + delete db_; + // Reset pointers to avoid StackableDB delete the pointer again. 
+ db_ = nullptr; + db_impl_ = nullptr; + if (!s.ok()) { + return s; + } + + s = SyncBlobFiles(); + return s; +} + +BlobDBOptions BlobDBImpl::GetBlobDBOptions() const { return bdb_options_; } + +Status BlobDBImpl::Open(std::vector* handles) { + assert(handles != nullptr); + assert(db_ == nullptr); + + if (blob_dir_.empty()) { + return Status::NotSupported("No blob directory in options"); + } + + if (bdb_options_.garbage_collection_cutoff < 0.0 || + bdb_options_.garbage_collection_cutoff > 1.0) { + return Status::InvalidArgument( + "Garbage collection cutoff must be in the interval [0.0, 1.0]"); + } + + // Temporarily disable compactions in the base DB during open; save the user + // defined value beforehand so we can restore it once BlobDB is initialized. + // Note: this is only needed if garbage collection is enabled. + const bool disable_auto_compactions = cf_options_.disable_auto_compactions; + + if (bdb_options_.enable_garbage_collection) { + cf_options_.disable_auto_compactions = true; + } + + Status s; + + // Create info log. + if (db_options_.info_log == nullptr) { + s = CreateLoggerFromOptions(dbname_, db_options_, &db_options_.info_log); + if (!s.ok()) { + return s; + } + } + + ROCKS_LOG_INFO(db_options_.info_log, "Opening BlobDB..."); + + if ((cf_options_.compaction_filter != nullptr || + cf_options_.compaction_filter_factory != nullptr)) { + ROCKS_LOG_INFO(db_options_.info_log, + "BlobDB only support compaction filter on non-TTL values."); + } + + // Open blob directory. + s = env_->CreateDirIfMissing(blob_dir_); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Failed to create blob_dir %s, status: %s", + blob_dir_.c_str(), s.ToString().c_str()); + } + s = env_->GetFileSystem()->NewDirectory(blob_dir_, IOOptions(), &dir_ent_, + nullptr); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Failed to open blob_dir %s, status: %s", blob_dir_.c_str(), + s.ToString().c_str()); + return s; + } + + // Open blob files. + s = OpenAllBlobFiles(); + if (!s.ok()) { + return s; + } + + // Update options + if (bdb_options_.enable_garbage_collection) { + db_options_.listeners.push_back(std::make_shared(this)); + cf_options_.compaction_filter_factory = + std::make_shared( + this, clock_, cf_options_, statistics_); + } else { + db_options_.listeners.push_back(std::make_shared(this)); + cf_options_.compaction_filter_factory = + std::make_shared( + this, clock_, cf_options_, statistics_); + } + + // Reset user compaction filter after building into compaction factory. + cf_options_.compaction_filter = nullptr; + + // Open base db. + ColumnFamilyDescriptor cf_descriptor(kDefaultColumnFamilyName, cf_options_); + s = DB::Open(db_options_, dbname_, {cf_descriptor}, handles, &db_); + if (!s.ok()) { + return s; + } + db_impl_ = static_cast_with_check(db_->GetRootDB()); + + // Sanitize the blob_dir provided. Using a directory where the + // base DB stores its files for the default CF is not supported. 
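+  // The loop below resolves the default column family's cf_paths and compares
+  // each of them with blob_dir_ via Env::AreFilesSame(); if any of them refer
+  // to the same directory, Open() fails with Status::NotSupported instead of
+  // letting BlobDB and the base DB manage files in the same directory.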
+ const ColumnFamilyData* const cfd = + static_cast(DefaultColumnFamily())->cfd(); + assert(cfd); + + const ImmutableCFOptions* const ioptions = cfd->ioptions(); + assert(ioptions); + + assert(env_); + + for (const auto& cf_path : ioptions->cf_paths) { + bool blob_dir_same_as_cf_dir = false; + s = env_->AreFilesSame(blob_dir_, cf_path.path, &blob_dir_same_as_cf_dir); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Error while sanitizing blob_dir %s, status: %s", + blob_dir_.c_str(), s.ToString().c_str()); + return s; + } + + if (blob_dir_same_as_cf_dir) { + return Status::NotSupported( + "Using the base DB's storage directories for BlobDB files is not " + "supported."); + } + } + + // Initialize SST file <-> oldest blob file mapping if garbage collection + // is enabled. + if (bdb_options_.enable_garbage_collection) { + std::vector live_files; + db_->GetLiveFilesMetaData(&live_files); + + InitializeBlobFileToSstMapping(live_files); + + MarkUnreferencedBlobFilesObsoleteDuringOpen(); + + if (!disable_auto_compactions) { + s = db_->EnableAutoCompaction(*handles); + if (!s.ok()) { + ROCKS_LOG_ERROR( + db_options_.info_log, + "Failed to enable automatic compactions during open, status: %s", + s.ToString().c_str()); + return s; + } + } + } + + // Add trash files in blob dir to file delete scheduler. + SstFileManagerImpl* sfm = static_cast( + db_impl_->immutable_db_options().sst_file_manager.get()); + DeleteScheduler::CleanupDirectory(env_, sfm, blob_dir_); + + UpdateLiveSSTSize(); + + // Start background jobs. + if (!bdb_options_.disable_background_tasks) { + StartBackgroundTasks(); + } + + ROCKS_LOG_INFO(db_options_.info_log, "BlobDB pointer %p", this); + bdb_options_.Dump(db_options_.info_log.get()); + closed_ = false; + return s; +} + +void BlobDBImpl::StartBackgroundTasks() { + // store a call to a member function and object + tqueue_.add( + kReclaimOpenFilesPeriodMillisecs, + std::bind(&BlobDBImpl::ReclaimOpenFiles, this, std::placeholders::_1)); + tqueue_.add( + kDeleteObsoleteFilesPeriodMillisecs, + std::bind(&BlobDBImpl::DeleteObsoleteFiles, this, std::placeholders::_1)); + tqueue_.add(kSanityCheckPeriodMillisecs, + std::bind(&BlobDBImpl::SanityCheck, this, std::placeholders::_1)); + tqueue_.add( + kEvictExpiredFilesPeriodMillisecs, + std::bind(&BlobDBImpl::EvictExpiredFiles, this, std::placeholders::_1)); +} + +Status BlobDBImpl::GetAllBlobFiles(std::set* file_numbers) { + assert(file_numbers != nullptr); + std::vector all_files; + Status s = env_->GetChildren(blob_dir_, &all_files); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Failed to get list of blob files, status: %s", + s.ToString().c_str()); + return s; + } + + for (const auto& file_name : all_files) { + uint64_t file_number; + FileType type; + bool success = ParseFileName(file_name, &file_number, &type); + if (success && type == kBlobFile) { + file_numbers->insert(file_number); + } else { + ROCKS_LOG_WARN(db_options_.info_log, + "Skipping file in blob directory: %s", file_name.c_str()); + } + } + + return s; +} + +Status BlobDBImpl::OpenAllBlobFiles() { + std::set file_numbers; + Status s = GetAllBlobFiles(&file_numbers); + if (!s.ok()) { + return s; + } + + if (!file_numbers.empty()) { + next_file_number_.store(*file_numbers.rbegin() + 1); + } + + std::ostringstream blob_file_oss; + std::ostringstream live_imm_oss; + std::ostringstream obsolete_file_oss; + + for (auto& file_number : file_numbers) { + std::shared_ptr blob_file = std::make_shared( + this, blob_dir_, file_number, 
db_options_.info_log.get()); + blob_file->MarkImmutable(/* sequence */ 0); + + // Read file header and footer + Status read_metadata_status = + blob_file->ReadMetadata(env_->GetFileSystem(), file_options_); + if (read_metadata_status.IsCorruption()) { + // Remove incomplete file. + if (!obsolete_files_.empty()) { + obsolete_file_oss << ", "; + } + obsolete_file_oss << file_number; + + ObsoleteBlobFile(blob_file, 0 /*obsolete_seq*/, false /*update_size*/); + continue; + } else if (!read_metadata_status.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Unable to read metadata of blob file %" PRIu64 + ", status: '%s'", + file_number, read_metadata_status.ToString().c_str()); + return read_metadata_status; + } + + total_blob_size_ += blob_file->GetFileSize(); + + if (!blob_files_.empty()) { + blob_file_oss << ", "; + } + blob_file_oss << file_number; + + blob_files_[file_number] = blob_file; + + if (!blob_file->HasTTL()) { + if (!live_imm_non_ttl_blob_files_.empty()) { + live_imm_oss << ", "; + } + live_imm_oss << file_number; + + live_imm_non_ttl_blob_files_[file_number] = blob_file; + } + } + + ROCKS_LOG_INFO(db_options_.info_log, + "Found %" ROCKSDB_PRIszt " blob files: %s", blob_files_.size(), + blob_file_oss.str().c_str()); + ROCKS_LOG_INFO( + db_options_.info_log, "Found %" ROCKSDB_PRIszt " non-TTL blob files: %s", + live_imm_non_ttl_blob_files_.size(), live_imm_oss.str().c_str()); + ROCKS_LOG_INFO(db_options_.info_log, + "Found %" ROCKSDB_PRIszt + " incomplete or corrupted blob files: %s", + obsolete_files_.size(), obsolete_file_oss.str().c_str()); + return s; +} + +template +void BlobDBImpl::LinkSstToBlobFileImpl(uint64_t sst_file_number, + uint64_t blob_file_number, + Linker linker) { + assert(bdb_options_.enable_garbage_collection); + assert(blob_file_number != kInvalidBlobFileNumber); + + auto it = blob_files_.find(blob_file_number); + if (it == blob_files_.end()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Blob file %" PRIu64 + " not found while trying to link " + "SST file %" PRIu64, + blob_file_number, sst_file_number); + return; + } + + BlobFile* const blob_file = it->second.get(); + assert(blob_file); + + linker(blob_file, sst_file_number); + + ROCKS_LOG_INFO(db_options_.info_log, + "Blob file %" PRIu64 " linked to SST file %" PRIu64, + blob_file_number, sst_file_number); +} + +void BlobDBImpl::LinkSstToBlobFile(uint64_t sst_file_number, + uint64_t blob_file_number) { + auto linker = [](BlobFile* blob_file, uint64_t sst_file) { + WriteLock file_lock(&blob_file->mutex_); + blob_file->LinkSstFile(sst_file); + }; + + LinkSstToBlobFileImpl(sst_file_number, blob_file_number, linker); +} + +void BlobDBImpl::LinkSstToBlobFileNoLock(uint64_t sst_file_number, + uint64_t blob_file_number) { + auto linker = [](BlobFile* blob_file, uint64_t sst_file) { + blob_file->LinkSstFile(sst_file); + }; + + LinkSstToBlobFileImpl(sst_file_number, blob_file_number, linker); +} + +void BlobDBImpl::UnlinkSstFromBlobFile(uint64_t sst_file_number, + uint64_t blob_file_number) { + assert(bdb_options_.enable_garbage_collection); + assert(blob_file_number != kInvalidBlobFileNumber); + + auto it = blob_files_.find(blob_file_number); + if (it == blob_files_.end()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Blob file %" PRIu64 + " not found while trying to unlink " + "SST file %" PRIu64, + blob_file_number, sst_file_number); + return; + } + + BlobFile* const blob_file = it->second.get(); + assert(blob_file); + + { + WriteLock file_lock(&blob_file->mutex_); + blob_file->UnlinkSstFile(sst_file_number); + } + + 
ROCKS_LOG_INFO(db_options_.info_log, + "Blob file %" PRIu64 " unlinked from SST file %" PRIu64, + blob_file_number, sst_file_number); +} + +void BlobDBImpl::InitializeBlobFileToSstMapping( + const std::vector& live_files) { + assert(bdb_options_.enable_garbage_collection); + + for (const auto& live_file : live_files) { + const uint64_t sst_file_number = live_file.file_number; + const uint64_t blob_file_number = live_file.oldest_blob_file_number; + + if (blob_file_number == kInvalidBlobFileNumber) { + continue; + } + + LinkSstToBlobFileNoLock(sst_file_number, blob_file_number); + } +} + +void BlobDBImpl::ProcessFlushJobInfo(const FlushJobInfo& info) { + assert(bdb_options_.enable_garbage_collection); + + WriteLock lock(&mutex_); + + if (info.oldest_blob_file_number != kInvalidBlobFileNumber) { + LinkSstToBlobFile(info.file_number, info.oldest_blob_file_number); + } + + assert(flush_sequence_ < info.largest_seqno); + flush_sequence_ = info.largest_seqno; + + MarkUnreferencedBlobFilesObsolete(); +} + +void BlobDBImpl::ProcessCompactionJobInfo(const CompactionJobInfo& info) { + assert(bdb_options_.enable_garbage_collection); + + if (!info.status.ok()) { + return; + } + + // Note: the same SST file may appear in both the input and the output + // file list in case of a trivial move. We walk through the two lists + // below in a fashion that's similar to merge sort to detect this. + + auto cmp = [](const CompactionFileInfo& lhs, const CompactionFileInfo& rhs) { + return lhs.file_number < rhs.file_number; + }; + + auto inputs = info.input_file_infos; + auto iit = inputs.begin(); + const auto iit_end = inputs.end(); + + std::sort(iit, iit_end, cmp); + + auto outputs = info.output_file_infos; + auto oit = outputs.begin(); + const auto oit_end = outputs.end(); + + std::sort(oit, oit_end, cmp); + + WriteLock lock(&mutex_); + + while (iit != iit_end && oit != oit_end) { + const auto& input = *iit; + const auto& output = *oit; + + if (input.file_number == output.file_number) { + ++iit; + ++oit; + } else if (input.file_number < output.file_number) { + if (input.oldest_blob_file_number != kInvalidBlobFileNumber) { + UnlinkSstFromBlobFile(input.file_number, input.oldest_blob_file_number); + } + + ++iit; + } else { + assert(output.file_number < input.file_number); + + if (output.oldest_blob_file_number != kInvalidBlobFileNumber) { + LinkSstToBlobFile(output.file_number, output.oldest_blob_file_number); + } + + ++oit; + } + } + + while (iit != iit_end) { + const auto& input = *iit; + + if (input.oldest_blob_file_number != kInvalidBlobFileNumber) { + UnlinkSstFromBlobFile(input.file_number, input.oldest_blob_file_number); + } + + ++iit; + } + + while (oit != oit_end) { + const auto& output = *oit; + + if (output.oldest_blob_file_number != kInvalidBlobFileNumber) { + LinkSstToBlobFile(output.file_number, output.oldest_blob_file_number); + } + + ++oit; + } + + MarkUnreferencedBlobFilesObsolete(); +} + +bool BlobDBImpl::MarkBlobFileObsoleteIfNeeded( + const std::shared_ptr& blob_file, SequenceNumber obsolete_seq) { + assert(blob_file); + assert(!blob_file->HasTTL()); + assert(blob_file->Immutable()); + assert(bdb_options_.enable_garbage_collection); + + // Note: FIFO eviction could have marked this file obsolete already. + if (blob_file->Obsolete()) { + return true; + } + + // We cannot mark this file (or any higher-numbered files for that matter) + // obsolete if it is referenced by any memtables or SSTs. We keep track of + // the SSTs explicitly. 
To account for memtables, we keep track of the highest + // sequence number received in flush notifications, and we do not mark the + // blob file obsolete if there are still unflushed memtables from before + // the time the blob file was closed. + if (blob_file->GetImmutableSequence() > flush_sequence_ || + !blob_file->GetLinkedSstFiles().empty()) { + return false; + } + + ROCKS_LOG_INFO(db_options_.info_log, + "Blob file %" PRIu64 " is no longer needed, marking obsolete", + blob_file->BlobFileNumber()); + + ObsoleteBlobFile(blob_file, obsolete_seq, /* update_size */ true); + return true; +} + +template +void BlobDBImpl::MarkUnreferencedBlobFilesObsoleteImpl(Functor mark_if_needed) { + assert(bdb_options_.enable_garbage_collection); + + // Iterate through all live immutable non-TTL blob files, and mark them + // obsolete assuming no SST files or memtables rely on the blobs in them. + // Note: we need to stop as soon as we find a blob file that has any + // linked SSTs (or one potentially referenced by memtables). + + uint64_t obsoleted_files = 0; + + auto it = live_imm_non_ttl_blob_files_.begin(); + while (it != live_imm_non_ttl_blob_files_.end()) { + const auto& blob_file = it->second; + assert(blob_file); + assert(blob_file->BlobFileNumber() == it->first); + assert(!blob_file->HasTTL()); + assert(blob_file->Immutable()); + + // Small optimization: Obsolete() does an atomic read, so we can do + // this check without taking a lock on the blob file's mutex. + if (blob_file->Obsolete()) { + it = live_imm_non_ttl_blob_files_.erase(it); + continue; + } + + if (!mark_if_needed(blob_file)) { + break; + } + + it = live_imm_non_ttl_blob_files_.erase(it); + + ++obsoleted_files; + } + + if (obsoleted_files > 0) { + ROCKS_LOG_INFO(db_options_.info_log, + "%" PRIu64 " blob file(s) marked obsolete by GC", + obsoleted_files); + RecordTick(statistics_, BLOB_DB_GC_NUM_FILES, obsoleted_files); + } +} + +void BlobDBImpl::MarkUnreferencedBlobFilesObsolete() { + const SequenceNumber obsolete_seq = GetLatestSequenceNumber(); + + MarkUnreferencedBlobFilesObsoleteImpl( + [this, obsolete_seq](const std::shared_ptr& blob_file) { + WriteLock file_lock(&blob_file->mutex_); + return MarkBlobFileObsoleteIfNeeded(blob_file, obsolete_seq); + }); +} + +void BlobDBImpl::MarkUnreferencedBlobFilesObsoleteDuringOpen() { + MarkUnreferencedBlobFilesObsoleteImpl( + [this](const std::shared_ptr& blob_file) { + return MarkBlobFileObsoleteIfNeeded(blob_file, /* obsolete_seq */ 0); + }); +} + +void BlobDBImpl::CloseRandomAccessLocked( + const std::shared_ptr& bfile) { + bfile->CloseRandomAccessLocked(); + open_file_count_--; +} + +Status BlobDBImpl::GetBlobFileReader( + const std::shared_ptr& blob_file, + std::shared_ptr* reader) { + assert(reader != nullptr); + bool fresh_open = false; + Status s = blob_file->GetReader(env_, file_options_, reader, &fresh_open); + if (s.ok() && fresh_open) { + assert(*reader != nullptr); + open_file_count_++; + } + return s; +} + +std::shared_ptr BlobDBImpl::NewBlobFile( + bool has_ttl, const ExpirationRange& expiration_range, + const std::string& reason) { + assert(has_ttl == (expiration_range.first || expiration_range.second)); + + uint64_t file_num = next_file_number_++; + + const uint32_t column_family_id = + static_cast(DefaultColumnFamily())->GetID(); + auto blob_file = std::make_shared( + this, blob_dir_, file_num, db_options_.info_log.get(), column_family_id, + bdb_options_.compression, has_ttl, expiration_range); + + ROCKS_LOG_DEBUG(db_options_.info_log, "New blob file created: %s 
reason='%s'", + blob_file->PathName().c_str(), reason.c_str()); + LogFlush(db_options_.info_log); + + return blob_file; +} + +void BlobDBImpl::RegisterBlobFile(std::shared_ptr blob_file) { + const uint64_t blob_file_number = blob_file->BlobFileNumber(); + + auto it = blob_files_.lower_bound(blob_file_number); + assert(it == blob_files_.end() || it->first != blob_file_number); + + blob_files_.insert(it, + std::map>::value_type( + blob_file_number, std::move(blob_file))); +} + +Status BlobDBImpl::CreateWriterLocked(const std::shared_ptr& bfile) { + std::string fpath(bfile->PathName()); + std::unique_ptr wfile; + const auto& fs = env_->GetFileSystem(); + + Status s = fs->ReopenWritableFile(fpath, file_options_, &wfile, nullptr); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Failed to open blob file for write: %s status: '%s'" + " exists: '%s'", + fpath.c_str(), s.ToString().c_str(), + fs->FileExists(fpath, file_options_.io_options, nullptr) + .ToString() + .c_str()); + return s; + } + + std::unique_ptr fwriter; + fwriter.reset(new WritableFileWriter(std::move(wfile), fpath, file_options_)); + + uint64_t boffset = bfile->GetFileSize(); + if (debug_level_ >= 2 && boffset) { + ROCKS_LOG_DEBUG(db_options_.info_log, + "Open blob file: %s with offset: %" PRIu64, fpath.c_str(), + boffset); + } + + BlobLogWriter::ElemType et = BlobLogWriter::kEtNone; + if (bfile->file_size_ == BlobLogHeader::kSize) { + et = BlobLogWriter::kEtFileHdr; + } else if (bfile->file_size_ > BlobLogHeader::kSize) { + et = BlobLogWriter::kEtRecord; + } else if (bfile->file_size_) { + ROCKS_LOG_WARN(db_options_.info_log, + "Open blob file: %s with wrong size: %" PRIu64, + fpath.c_str(), boffset); + return Status::Corruption("Invalid blob file size"); + } + + constexpr bool do_flush = true; + + bfile->log_writer_ = std::make_shared( + std::move(fwriter), clock_, statistics_, bfile->file_number_, + db_options_.use_fsync, do_flush, boffset); + bfile->log_writer_->last_elem_type_ = et; + + return s; +} + +std::shared_ptr BlobDBImpl::FindBlobFileLocked( + uint64_t expiration) const { + if (open_ttl_files_.empty()) { + return nullptr; + } + + std::shared_ptr tmp = std::make_shared(); + tmp->SetHasTTL(true); + tmp->expiration_range_ = std::make_pair(expiration, 0); + tmp->file_number_ = std::numeric_limits::max(); + + auto citr = open_ttl_files_.equal_range(tmp); + if (citr.first == open_ttl_files_.end()) { + assert(citr.second == open_ttl_files_.end()); + + std::shared_ptr check = *(open_ttl_files_.rbegin()); + return (check->expiration_range_.second <= expiration) ? nullptr : check; + } + + if (citr.first != citr.second) { + return *(citr.first); + } + + auto finditr = citr.second; + if (finditr != open_ttl_files_.begin()) { + --finditr; + } + + bool b2 = (*finditr)->expiration_range_.second <= expiration; + bool b1 = (*finditr)->expiration_range_.first > expiration; + + return (b1 || b2) ? 
nullptr : (*finditr); +} + +Status BlobDBImpl::CheckOrCreateWriterLocked( + const std::shared_ptr& blob_file, + std::shared_ptr* writer) { + assert(writer != nullptr); + *writer = blob_file->GetWriter(); + if (*writer != nullptr) { + return Status::OK(); + } + Status s = CreateWriterLocked(blob_file); + if (s.ok()) { + *writer = blob_file->GetWriter(); + } + return s; +} + +Status BlobDBImpl::CreateBlobFileAndWriter( + bool has_ttl, const ExpirationRange& expiration_range, + const std::string& reason, std::shared_ptr* blob_file, + std::shared_ptr* writer) { + TEST_SYNC_POINT("BlobDBImpl::CreateBlobFileAndWriter"); + assert(has_ttl == (expiration_range.first || expiration_range.second)); + assert(blob_file); + assert(writer); + + *blob_file = NewBlobFile(has_ttl, expiration_range, reason); + assert(*blob_file); + + // file not visible, hence no lock + Status s = CheckOrCreateWriterLocked(*blob_file, writer); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Failed to get writer for blob file: %s, error: %s", + (*blob_file)->PathName().c_str(), s.ToString().c_str()); + return s; + } + + assert(*writer); + + s = (*writer)->WriteHeader((*blob_file)->header_); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Failed to write header to new blob file: %s" + " status: '%s'", + (*blob_file)->PathName().c_str(), s.ToString().c_str()); + return s; + } + + (*blob_file)->SetFileSize(BlobLogHeader::kSize); + total_blob_size_ += BlobLogHeader::kSize; + + return s; +} + +Status BlobDBImpl::SelectBlobFile(std::shared_ptr* blob_file) { + assert(blob_file); + + { + ReadLock rl(&mutex_); + + if (open_non_ttl_file_) { + assert(!open_non_ttl_file_->Immutable()); + *blob_file = open_non_ttl_file_; + return Status::OK(); + } + } + + // Check again + WriteLock wl(&mutex_); + + if (open_non_ttl_file_) { + assert(!open_non_ttl_file_->Immutable()); + *blob_file = open_non_ttl_file_; + return Status::OK(); + } + + std::shared_ptr writer; + const Status s = CreateBlobFileAndWriter( + /* has_ttl */ false, ExpirationRange(), + /* reason */ "SelectBlobFile", blob_file, &writer); + if (!s.ok()) { + return s; + } + + RegisterBlobFile(*blob_file); + open_non_ttl_file_ = *blob_file; + + return s; +} + +Status BlobDBImpl::SelectBlobFileTTL(uint64_t expiration, + std::shared_ptr* blob_file) { + assert(blob_file); + assert(expiration != kNoExpiration); + + { + ReadLock rl(&mutex_); + + *blob_file = FindBlobFileLocked(expiration); + if (*blob_file != nullptr) { + assert(!(*blob_file)->Immutable()); + return Status::OK(); + } + } + + // Check again + WriteLock wl(&mutex_); + + *blob_file = FindBlobFileLocked(expiration); + if (*blob_file != nullptr) { + assert(!(*blob_file)->Immutable()); + return Status::OK(); + } + + const uint64_t exp_low = + (expiration / bdb_options_.ttl_range_secs) * bdb_options_.ttl_range_secs; + const uint64_t exp_high = exp_low + bdb_options_.ttl_range_secs; + const ExpirationRange expiration_range(exp_low, exp_high); + + std::ostringstream oss; + oss << "SelectBlobFileTTL range: [" << exp_low << ',' << exp_high << ')'; + + std::shared_ptr writer; + const Status s = + CreateBlobFileAndWriter(/* has_ttl */ true, expiration_range, + /* reason */ oss.str(), blob_file, &writer); + if (!s.ok()) { + return s; + } + + RegisterBlobFile(*blob_file); + open_ttl_files_.insert(*blob_file); + + return s; +} + +class BlobDBImpl::BlobInserter : public WriteBatch::Handler { + private: + const WriteOptions& options_; + BlobDBImpl* blob_db_impl_; + uint32_t default_cf_id_; + WriteBatch batch_; + + 
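+  // BlobInserter replays a user WriteBatch through WriteBatch::Handler and
+  // rebuilds it into batch_: Put()s are routed through
+  // BlobDBImpl::PutBlobValue() so large values end up in blob files and are
+  // replaced by blob index entries, Delete()/DeleteRange() are copied through
+  // unchanged, and SingleDelete()/Merge() are rejected. Only the default
+  // column family is accepted.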
public: + BlobInserter(const WriteOptions& options, BlobDBImpl* blob_db_impl, + uint32_t default_cf_id) + : options_(options), + blob_db_impl_(blob_db_impl), + default_cf_id_(default_cf_id) {} + + WriteBatch* batch() { return &batch_; } + + Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + if (column_family_id != default_cf_id_) { + return Status::NotSupported( + "Blob DB doesn't support non-default column family."); + } + Status s = blob_db_impl_->PutBlobValue(options_, key, value, kNoExpiration, + &batch_); + return s; + } + + Status DeleteCF(uint32_t column_family_id, const Slice& key) override { + if (column_family_id != default_cf_id_) { + return Status::NotSupported( + "Blob DB doesn't support non-default column family."); + } + Status s = WriteBatchInternal::Delete(&batch_, column_family_id, key); + return s; + } + + virtual Status DeleteRange(uint32_t column_family_id, const Slice& begin_key, + const Slice& end_key) { + if (column_family_id != default_cf_id_) { + return Status::NotSupported( + "Blob DB doesn't support non-default column family."); + } + Status s = WriteBatchInternal::DeleteRange(&batch_, column_family_id, + begin_key, end_key); + return s; + } + + Status SingleDeleteCF(uint32_t /*column_family_id*/, + const Slice& /*key*/) override { + return Status::NotSupported("Not supported operation in blob db."); + } + + Status MergeCF(uint32_t /*column_family_id*/, const Slice& /*key*/, + const Slice& /*value*/) override { + return Status::NotSupported("Not supported operation in blob db."); + } + + void LogData(const Slice& blob) override { batch_.PutLogData(blob); } +}; + +Status BlobDBImpl::Write(const WriteOptions& options, WriteBatch* updates) { + StopWatch write_sw(clock_, statistics_, BLOB_DB_WRITE_MICROS); + RecordTick(statistics_, BLOB_DB_NUM_WRITE); + uint32_t default_cf_id = + static_cast_with_check(DefaultColumnFamily()) + ->GetID(); + Status s; + BlobInserter blob_inserter(options, this, default_cf_id); + { + // Release write_mutex_ before DB write to avoid race condition with + // flush begin listener, which also require write_mutex_ to sync + // blob files. + MutexLock l(&write_mutex_); + s = updates->Iterate(&blob_inserter); + } + if (!s.ok()) { + return s; + } + return db_->Write(options, blob_inserter.batch()); +} + +Status BlobDBImpl::Put(const WriteOptions& options, const Slice& key, + const Slice& value) { + return PutUntil(options, key, value, kNoExpiration); +} + +Status BlobDBImpl::PutWithTTL(const WriteOptions& options, const Slice& key, + const Slice& value, uint64_t ttl) { + uint64_t now = EpochNow(); + uint64_t expiration = kNoExpiration - now > ttl ? now + ttl : kNoExpiration; + return PutUntil(options, key, value, expiration); +} + +Status BlobDBImpl::PutUntil(const WriteOptions& options, const Slice& key, + const Slice& value, uint64_t expiration) { + StopWatch write_sw(clock_, statistics_, BLOB_DB_WRITE_MICROS); + RecordTick(statistics_, BLOB_DB_NUM_PUT); + Status s; + WriteBatch batch; + { + // Release write_mutex_ before DB write to avoid race condition with + // flush begin listener, which also require write_mutex_ to sync + // blob files. 
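+    // Only PutBlobValue() runs under write_mutex_: it appends the blob and
+    // stages the index entry in the local batch; the base DB write happens
+    // below, after the lock has been released.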
+ MutexLock l(&write_mutex_); + s = PutBlobValue(options, key, value, expiration, &batch); + } + if (s.ok()) { + s = db_->Write(options, &batch); + } + return s; +} + +Status BlobDBImpl::PutBlobValue(const WriteOptions& /*options*/, + const Slice& key, const Slice& value, + uint64_t expiration, WriteBatch* batch) { + write_mutex_.AssertHeld(); + Status s; + std::string index_entry; + uint32_t column_family_id = + static_cast_with_check(DefaultColumnFamily()) + ->GetID(); + if (value.size() < bdb_options_.min_blob_size) { + if (expiration == kNoExpiration) { + // Put as normal value + s = batch->Put(key, value); + RecordTick(statistics_, BLOB_DB_WRITE_INLINED); + } else { + // Inlined with TTL + BlobIndex::EncodeInlinedTTL(&index_entry, expiration, value); + s = WriteBatchInternal::PutBlobIndex(batch, column_family_id, key, + index_entry); + RecordTick(statistics_, BLOB_DB_WRITE_INLINED_TTL); + } + } else { + std::string compression_output; + Slice value_compressed = GetCompressedSlice(value, &compression_output); + + std::string headerbuf; + BlobLogWriter::ConstructBlobHeader(&headerbuf, key, value_compressed, + expiration); + + // Check DB size limit before selecting blob file to + // Since CheckSizeAndEvictBlobFiles() can close blob files, it needs to be + // done before calling SelectBlobFile(). + s = CheckSizeAndEvictBlobFiles(headerbuf.size() + key.size() + + value_compressed.size()); + if (!s.ok()) { + return s; + } + + std::shared_ptr blob_file; + if (expiration != kNoExpiration) { + s = SelectBlobFileTTL(expiration, &blob_file); + } else { + s = SelectBlobFile(&blob_file); + } + if (s.ok()) { + assert(blob_file != nullptr); + assert(blob_file->GetCompressionType() == bdb_options_.compression); + s = AppendBlob(blob_file, headerbuf, key, value_compressed, expiration, + &index_entry); + } + if (s.ok()) { + if (expiration != kNoExpiration) { + WriteLock file_lock(&blob_file->mutex_); + blob_file->ExtendExpirationRange(expiration); + } + s = CloseBlobFileIfNeeded(blob_file); + } + if (s.ok()) { + s = WriteBatchInternal::PutBlobIndex(batch, column_family_id, key, + index_entry); + } + if (s.ok()) { + if (expiration == kNoExpiration) { + RecordTick(statistics_, BLOB_DB_WRITE_BLOB); + } else { + RecordTick(statistics_, BLOB_DB_WRITE_BLOB_TTL); + } + } else { + ROCKS_LOG_ERROR( + db_options_.info_log, + "Failed to append blob to FILE: %s: KEY: %s VALSZ: %" ROCKSDB_PRIszt + " status: '%s' blob_file: '%s'", + blob_file->PathName().c_str(), key.ToString().c_str(), value.size(), + s.ToString().c_str(), blob_file->DumpState().c_str()); + } + } + + RecordTick(statistics_, BLOB_DB_NUM_KEYS_WRITTEN); + RecordTick(statistics_, BLOB_DB_BYTES_WRITTEN, key.size() + value.size()); + RecordInHistogram(statistics_, BLOB_DB_KEY_SIZE, key.size()); + RecordInHistogram(statistics_, BLOB_DB_VALUE_SIZE, value.size()); + + return s; +} + +Slice BlobDBImpl::GetCompressedSlice(const Slice& raw, + std::string* compression_output) const { + if (bdb_options_.compression == kNoCompression) { + return raw; + } + StopWatch compression_sw(clock_, statistics_, BLOB_DB_COMPRESSION_MICROS); + CompressionType type = bdb_options_.compression; + CompressionOptions opts; + CompressionContext context(type); + CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), type, + 0 /* sample_for_compression */); + CompressBlock(raw, info, &type, kBlockBasedTableVersionFormat, false, + compression_output, nullptr, nullptr); + return *compression_output; +} + +Status BlobDBImpl::DecompressSlice(const Slice& compressed_value, 
+ CompressionType compression_type, + PinnableSlice* value_output) const { + assert(compression_type != kNoCompression); + + BlockContents contents; + auto cfh = static_cast(DefaultColumnFamily()); + + { + StopWatch decompression_sw(clock_, statistics_, + BLOB_DB_DECOMPRESSION_MICROS); + UncompressionContext context(compression_type); + UncompressionInfo info(context, UncompressionDict::GetEmptyDict(), + compression_type); + Status s = UncompressBlockData( + info, compressed_value.data(), compressed_value.size(), &contents, + kBlockBasedTableVersionFormat, *(cfh->cfd()->ioptions())); + if (!s.ok()) { + return Status::Corruption("Unable to decompress blob."); + } + } + + value_output->PinSelf(contents.data); + + return Status::OK(); +} + +Status BlobDBImpl::CompactFiles( + const CompactionOptions& compact_options, + const std::vector& input_file_names, const int output_level, + const int output_path_id, std::vector* const output_file_names, + CompactionJobInfo* compaction_job_info) { + // Note: we need CompactionJobInfo to be able to track updates to the + // blob file <-> SST mappings, so we provide one if the user hasn't, + // assuming that GC is enabled. + CompactionJobInfo info{}; + if (bdb_options_.enable_garbage_collection && !compaction_job_info) { + compaction_job_info = &info; + } + + const Status s = + db_->CompactFiles(compact_options, input_file_names, output_level, + output_path_id, output_file_names, compaction_job_info); + if (!s.ok()) { + return s; + } + + if (bdb_options_.enable_garbage_collection) { + assert(compaction_job_info); + ProcessCompactionJobInfo(*compaction_job_info); + } + + return s; +} + +void BlobDBImpl::GetCompactionContextCommon(BlobCompactionContext* context) { + assert(context); + + context->blob_db_impl = this; + context->next_file_number = next_file_number_.load(); + context->current_blob_files.clear(); + for (auto& p : blob_files_) { + context->current_blob_files.insert(p.first); + } + context->fifo_eviction_seq = fifo_eviction_seq_; + context->evict_expiration_up_to = evict_expiration_up_to_; +} + +void BlobDBImpl::GetCompactionContext(BlobCompactionContext* context) { + assert(context); + + ReadLock l(&mutex_); + GetCompactionContextCommon(context); +} + +void BlobDBImpl::GetCompactionContext(BlobCompactionContext* context, + BlobCompactionContextGC* context_gc) { + assert(context); + assert(context_gc); + + ReadLock l(&mutex_); + GetCompactionContextCommon(context); + + if (!live_imm_non_ttl_blob_files_.empty()) { + auto it = live_imm_non_ttl_blob_files_.begin(); + std::advance(it, bdb_options_.garbage_collection_cutoff * + live_imm_non_ttl_blob_files_.size()); + context_gc->cutoff_file_number = it != live_imm_non_ttl_blob_files_.end() + ? it->first + : std::numeric_limits::max(); + } +} + +void BlobDBImpl::UpdateLiveSSTSize() { + uint64_t live_sst_size = 0; + bool ok = GetIntProperty(DB::Properties::kLiveSstFilesSize, &live_sst_size); + if (ok) { + live_sst_size_.store(live_sst_size); + ROCKS_LOG_INFO(db_options_.info_log, + "Updated total SST file size: %" PRIu64 " bytes.", + live_sst_size); + } else { + ROCKS_LOG_ERROR( + db_options_.info_log, + "Failed to update total SST file size after flush or compaction."); + } + { + // Trigger FIFO eviction if needed. + MutexLock l(&write_mutex_); + Status s = CheckSizeAndEvictBlobFiles(0, true /*force*/); + if (s.IsNoSpace()) { + ROCKS_LOG_WARN(db_options_.info_log, + "DB grow out-of-space after SST size updated. 
Current live" + " SST size: %" PRIu64 + " , current blob files size: %" PRIu64 ".", + live_sst_size_.load(), total_blob_size_.load()); + } + } +} + +Status BlobDBImpl::CheckSizeAndEvictBlobFiles(uint64_t blob_size, + bool force_evict) { + write_mutex_.AssertHeld(); + + uint64_t live_sst_size = live_sst_size_.load(); + if (bdb_options_.max_db_size == 0 || + live_sst_size + total_blob_size_.load() + blob_size <= + bdb_options_.max_db_size) { + return Status::OK(); + } + + if (bdb_options_.is_fifo == false || + (!force_evict && live_sst_size + blob_size > bdb_options_.max_db_size)) { + // FIFO eviction is disabled, or no space to insert new blob even we evict + // all blob files. + return Status::NoSpace( + "Write failed, as writing it would exceed max_db_size limit."); + } + + std::vector> candidate_files; + CopyBlobFiles(&candidate_files); + std::sort(candidate_files.begin(), candidate_files.end(), + BlobFileComparator()); + fifo_eviction_seq_ = GetLatestSequenceNumber(); + + WriteLock l(&mutex_); + + while (!candidate_files.empty() && + live_sst_size + total_blob_size_.load() + blob_size > + bdb_options_.max_db_size) { + std::shared_ptr blob_file = candidate_files.back(); + candidate_files.pop_back(); + WriteLock file_lock(&blob_file->mutex_); + if (blob_file->Obsolete()) { + // File already obsoleted by someone else. + assert(blob_file->Immutable()); + continue; + } + // FIFO eviction can evict open blob files. + if (!blob_file->Immutable()) { + Status s = CloseBlobFile(blob_file); + if (!s.ok()) { + return s; + } + } + assert(blob_file->Immutable()); + auto expiration_range = blob_file->GetExpirationRange(); + ROCKS_LOG_INFO(db_options_.info_log, + "Evict oldest blob file since DB out of space. Current " + "live SST file size: %" PRIu64 ", total blob size: %" PRIu64 + ", max db size: %" PRIu64 ", evicted blob file #%" PRIu64 + ".", + live_sst_size, total_blob_size_.load(), + bdb_options_.max_db_size, blob_file->BlobFileNumber()); + ObsoleteBlobFile(blob_file, fifo_eviction_seq_, true /*update_size*/); + evict_expiration_up_to_ = expiration_range.first; + RecordTick(statistics_, BLOB_DB_FIFO_NUM_FILES_EVICTED); + RecordTick(statistics_, BLOB_DB_FIFO_NUM_KEYS_EVICTED, + blob_file->BlobCount()); + RecordTick(statistics_, BLOB_DB_FIFO_BYTES_EVICTED, + blob_file->GetFileSize()); + TEST_SYNC_POINT("BlobDBImpl::EvictOldestBlobFile:Evicted"); + } + if (live_sst_size + total_blob_size_.load() + blob_size > + bdb_options_.max_db_size) { + return Status::NoSpace( + "Write failed, as writing it would exceed max_db_size limit."); + } + return Status::OK(); +} + +Status BlobDBImpl::AppendBlob(const std::shared_ptr& bfile, + const std::string& headerbuf, const Slice& key, + const Slice& value, uint64_t expiration, + std::string* index_entry) { + Status s; + uint64_t blob_offset = 0; + uint64_t key_offset = 0; + { + WriteLock lockbfile_w(&bfile->mutex_); + std::shared_ptr writer; + s = CheckOrCreateWriterLocked(bfile, &writer); + if (!s.ok()) { + return s; + } + + // write the blob to the blob log. 
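+    // EmitPhysicalRecord appends the prebuilt record header, the key and the
+    // (possibly compressed) value to the blob log and reports the file offsets
+    // of the key and of the value; blob_offset is what gets encoded into the
+    // blob index further down.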
+ s = writer->EmitPhysicalRecord(headerbuf, key, value, &key_offset, + &blob_offset); + } + + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Invalid status in AppendBlob: %s status: '%s'", + bfile->PathName().c_str(), s.ToString().c_str()); + return s; + } + + uint64_t size_put = headerbuf.size() + key.size() + value.size(); + bfile->BlobRecordAdded(size_put); + total_blob_size_ += size_put; + + if (expiration == kNoExpiration) { + BlobIndex::EncodeBlob(index_entry, bfile->BlobFileNumber(), blob_offset, + value.size(), bdb_options_.compression); + } else { + BlobIndex::EncodeBlobTTL(index_entry, expiration, bfile->BlobFileNumber(), + blob_offset, value.size(), + bdb_options_.compression); + } + + return s; +} + +std::vector BlobDBImpl::MultiGet(const ReadOptions& read_options, + const std::vector& keys, + std::vector* values) { + StopWatch multiget_sw(clock_, statistics_, BLOB_DB_MULTIGET_MICROS); + RecordTick(statistics_, BLOB_DB_NUM_MULTIGET); + // Get a snapshot to avoid blob file get deleted between we + // fetch and index entry and reading from the file. + ReadOptions ro(read_options); + bool snapshot_created = SetSnapshotIfNeeded(&ro); + + std::vector statuses; + statuses.reserve(keys.size()); + values->clear(); + values->reserve(keys.size()); + PinnableSlice value; + for (size_t i = 0; i < keys.size(); i++) { + statuses.push_back(Get(ro, DefaultColumnFamily(), keys[i], &value)); + values->push_back(value.ToString()); + value.Reset(); + } + if (snapshot_created) { + db_->ReleaseSnapshot(ro.snapshot); + } + return statuses; +} + +bool BlobDBImpl::SetSnapshotIfNeeded(ReadOptions* read_options) { + assert(read_options != nullptr); + if (read_options->snapshot != nullptr) { + return false; + } + read_options->snapshot = db_->GetSnapshot(); + return true; +} + +Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry, + PinnableSlice* value, uint64_t* expiration) { + assert(value); + + BlobIndex blob_index; + Status s = blob_index.DecodeFrom(index_entry); + if (!s.ok()) { + return s; + } + + if (blob_index.HasTTL() && blob_index.expiration() <= EpochNow()) { + return Status::NotFound("Key expired"); + } + + if (expiration != nullptr) { + if (blob_index.HasTTL()) { + *expiration = blob_index.expiration(); + } else { + *expiration = kNoExpiration; + } + } + + if (blob_index.IsInlined()) { + // TODO(yiwu): If index_entry is a PinnableSlice, we can also pin the same + // memory buffer to avoid extra copy. 
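+    // Inlined entries carry the value bytes directly in the index, so no blob
+    // file read is needed; PinSelf() copies them into the caller's slice
+    // (hence the TODO above).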
+ value->PinSelf(blob_index.value()); + return Status::OK(); + } + + CompressionType compression_type = kNoCompression; + s = GetRawBlobFromFile(key, blob_index.file_number(), blob_index.offset(), + blob_index.size(), value, &compression_type); + if (!s.ok()) { + return s; + } + + if (compression_type != kNoCompression) { + s = DecompressSlice(*value, compression_type, value); + if (!s.ok()) { + if (debug_level_ >= 2) { + ROCKS_LOG_ERROR( + db_options_.info_log, + "Uncompression error during blob read from file: %" PRIu64 + " blob_offset: %" PRIu64 " blob_size: %" PRIu64 + " key: %s status: '%s'", + blob_index.file_number(), blob_index.offset(), blob_index.size(), + key.ToString(/* output_hex */ true).c_str(), s.ToString().c_str()); + } + return s; + } + } + + return Status::OK(); +} + +Status BlobDBImpl::GetRawBlobFromFile(const Slice& key, uint64_t file_number, + uint64_t offset, uint64_t size, + PinnableSlice* value, + CompressionType* compression_type) { + assert(value); + assert(compression_type); + assert(*compression_type == kNoCompression); + + if (!size) { + value->PinSelf(""); + return Status::OK(); + } + + // offset has to have certain min, as we will read CRC + // later from the Blob Header, which needs to be also a + // valid offset. + if (offset < + (BlobLogHeader::kSize + BlobLogRecord::kHeaderSize + key.size())) { + if (debug_level_ >= 2) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Invalid blob index file_number: %" PRIu64 + " blob_offset: %" PRIu64 " blob_size: %" PRIu64 + " key: %s", + file_number, offset, size, + key.ToString(/* output_hex */ true).c_str()); + } + + return Status::NotFound("Invalid blob offset"); + } + + std::shared_ptr blob_file; + + { + ReadLock rl(&mutex_); + auto it = blob_files_.find(file_number); + + // file was deleted + if (it == blob_files_.end()) { + return Status::NotFound("Blob Not Found as blob file missing"); + } + + blob_file = it->second; + } + + *compression_type = blob_file->GetCompressionType(); + + // takes locks when called + std::shared_ptr reader; + Status s = GetBlobFileReader(blob_file, &reader); + if (!s.ok()) { + return s; + } + + assert(offset >= key.size() + sizeof(uint32_t)); + const uint64_t record_offset = offset - key.size() - sizeof(uint32_t); + const uint64_t record_size = sizeof(uint32_t) + key.size() + size; + + // Allocate the buffer. This is safe in C++11 + std::string buf; + AlignedBuf aligned_buf; + + // A partial blob record contain checksum, key and value. + Slice blob_record; + + { + StopWatch read_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS); + // TODO: rate limit old blob DB file reads. 
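+    // Record layout read here: 4-byte CRC32C, then the key, then the value.
+    // record_offset/record_size were derived above by backing up from the
+    // indexed value offset by key.size() + sizeof(uint32_t); the checksum
+    // (computed over key + value and masked) is verified below.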
+ if (reader->use_direct_io()) { + s = reader->Read(IOOptions(), record_offset, + static_cast(record_size), &blob_record, nullptr, + &aligned_buf, Env::IO_TOTAL /* rate_limiter_priority */); + } else { + buf.reserve(static_cast(record_size)); + s = reader->Read(IOOptions(), record_offset, + static_cast(record_size), &blob_record, &buf[0], + nullptr, Env::IO_TOTAL /* rate_limiter_priority */); + } + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, blob_record.size()); + } + + if (!s.ok()) { + ROCKS_LOG_DEBUG( + db_options_.info_log, + "Failed to read blob from blob file %" PRIu64 ", blob_offset: %" PRIu64 + ", blob_size: %" PRIu64 ", key_size: %" ROCKSDB_PRIszt ", status: '%s'", + file_number, offset, size, key.size(), s.ToString().c_str()); + return s; + } + + if (blob_record.size() != record_size) { + ROCKS_LOG_DEBUG( + db_options_.info_log, + "Failed to read blob from blob file %" PRIu64 ", blob_offset: %" PRIu64 + ", blob_size: %" PRIu64 ", key_size: %" ROCKSDB_PRIszt + ", read %" ROCKSDB_PRIszt " bytes, expected %" PRIu64 " bytes", + file_number, offset, size, key.size(), blob_record.size(), record_size); + + return Status::Corruption("Failed to retrieve blob from blob index."); + } + + Slice crc_slice(blob_record.data(), sizeof(uint32_t)); + Slice blob_value(blob_record.data() + sizeof(uint32_t) + key.size(), + static_cast(size)); + + uint32_t crc_exp = 0; + if (!GetFixed32(&crc_slice, &crc_exp)) { + ROCKS_LOG_DEBUG( + db_options_.info_log, + "Unable to decode CRC from blob file %" PRIu64 ", blob_offset: %" PRIu64 + ", blob_size: %" PRIu64 ", key size: %" ROCKSDB_PRIszt ", status: '%s'", + file_number, offset, size, key.size(), s.ToString().c_str()); + return Status::Corruption("Unable to decode checksum."); + } + + uint32_t crc = crc32c::Value(blob_record.data() + sizeof(uint32_t), + blob_record.size() - sizeof(uint32_t)); + crc = crc32c::Mask(crc); // Adjust for storage + if (crc != crc_exp) { + if (debug_level_ >= 2) { + ROCKS_LOG_ERROR( + db_options_.info_log, + "Blob crc mismatch file: %" PRIu64 " blob_offset: %" PRIu64 + " blob_size: %" PRIu64 " key: %s status: '%s'", + file_number, offset, size, + key.ToString(/* output_hex */ true).c_str(), s.ToString().c_str()); + } + + return Status::Corruption("Corruption. Blob CRC mismatch"); + } + + value->PinSelf(blob_value); + + return Status::OK(); +} + +Status BlobDBImpl::Get(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) { + return Get(read_options, column_family, key, value, + static_cast(nullptr) /*expiration*/); +} + +Status BlobDBImpl::Get(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value, uint64_t* expiration) { + StopWatch get_sw(clock_, statistics_, BLOB_DB_GET_MICROS); + RecordTick(statistics_, BLOB_DB_NUM_GET); + return GetImpl(read_options, column_family, key, value, expiration); +} + +Status BlobDBImpl::GetImpl(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value, uint64_t* expiration) { + if (column_family->GetID() != DefaultColumnFamily()->GetID()) { + return Status::NotSupported( + "Blob DB doesn't support non-default column family."); + } + // Get a snapshot to avoid blob file get deleted between we + // fetch and index entry and reading from the file. + // TODO(yiwu): For Get() retry if file not found would be a simpler strategy. 
+ ReadOptions ro(read_options); + bool snapshot_created = SetSnapshotIfNeeded(&ro); + + PinnableSlice index_entry; + Status s; + bool is_blob_index = false; + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = column_family; + get_impl_options.value = &index_entry; + get_impl_options.is_blob_index = &is_blob_index; + s = db_impl_->GetImpl(ro, key, get_impl_options); + if (expiration != nullptr) { + *expiration = kNoExpiration; + } + RecordTick(statistics_, BLOB_DB_NUM_KEYS_READ); + if (s.ok()) { + if (is_blob_index) { + s = GetBlobValue(key, index_entry, value, expiration); + } else { + // The index entry is the value itself in this case. + value->PinSelf(index_entry); + } + RecordTick(statistics_, BLOB_DB_BYTES_READ, value->size()); + } + if (snapshot_created) { + db_->ReleaseSnapshot(ro.snapshot); + } + return s; +} + +std::pair BlobDBImpl::SanityCheck(bool aborted) { + if (aborted) { + return std::make_pair(false, -1); + } + + ReadLock rl(&mutex_); + + ROCKS_LOG_INFO(db_options_.info_log, "Starting Sanity Check"); + ROCKS_LOG_INFO(db_options_.info_log, "Number of files %" ROCKSDB_PRIszt, + blob_files_.size()); + ROCKS_LOG_INFO(db_options_.info_log, "Number of open files %" ROCKSDB_PRIszt, + open_ttl_files_.size()); + + for (const auto& blob_file : open_ttl_files_) { + (void)blob_file; + assert(!blob_file->Immutable()); + } + + for (const auto& pair : live_imm_non_ttl_blob_files_) { + const auto& blob_file = pair.second; + (void)blob_file; + assert(!blob_file->HasTTL()); + assert(blob_file->Immutable()); + } + + uint64_t now = EpochNow(); + + for (auto blob_file_pair : blob_files_) { + auto blob_file = blob_file_pair.second; + std::ostringstream buf; + + buf << "Blob file " << blob_file->BlobFileNumber() << ", size " + << blob_file->GetFileSize() << ", blob count " << blob_file->BlobCount() + << ", immutable " << blob_file->Immutable(); + + if (blob_file->HasTTL()) { + ExpirationRange expiration_range; + { + ReadLock file_lock(&blob_file->mutex_); + expiration_range = blob_file->GetExpirationRange(); + } + buf << ", expiration range (" << expiration_range.first << ", " + << expiration_range.second << ")"; + + if (!blob_file->Obsolete()) { + buf << ", expire in " << (expiration_range.second - now) << "seconds"; + } + } + if (blob_file->Obsolete()) { + buf << ", obsolete at " << blob_file->GetObsoleteSequence(); + } + buf << "."; + ROCKS_LOG_INFO(db_options_.info_log, "%s", buf.str().c_str()); + } + + // reschedule + return std::make_pair(true, -1); +} + +Status BlobDBImpl::CloseBlobFile(std::shared_ptr bfile) { + TEST_SYNC_POINT("BlobDBImpl::CloseBlobFile"); + assert(bfile); + assert(!bfile->Immutable()); + assert(!bfile->Obsolete()); + + if (bfile->HasTTL() || bfile == open_non_ttl_file_) { + write_mutex_.AssertHeld(); + } + + ROCKS_LOG_INFO(db_options_.info_log, + "Closing blob file %" PRIu64 ". 
Path: %s", + bfile->BlobFileNumber(), bfile->PathName().c_str()); + + const SequenceNumber sequence = GetLatestSequenceNumber(); + + const Status s = bfile->WriteFooterAndCloseLocked(sequence); + + if (s.ok()) { + total_blob_size_ += BlobLogFooter::kSize; + } else { + bfile->MarkImmutable(sequence); + + ROCKS_LOG_ERROR(db_options_.info_log, + "Failed to close blob file %" PRIu64 "with error: %s", + bfile->BlobFileNumber(), s.ToString().c_str()); + } + + if (bfile->HasTTL()) { + size_t erased __attribute__((__unused__)); + erased = open_ttl_files_.erase(bfile); + } else { + if (bfile == open_non_ttl_file_) { + open_non_ttl_file_ = nullptr; + } + + const uint64_t blob_file_number = bfile->BlobFileNumber(); + auto it = live_imm_non_ttl_blob_files_.lower_bound(blob_file_number); + assert(it == live_imm_non_ttl_blob_files_.end() || + it->first != blob_file_number); + live_imm_non_ttl_blob_files_.insert( + it, std::map>::value_type( + blob_file_number, bfile)); + } + + return s; +} + +Status BlobDBImpl::CloseBlobFileIfNeeded(std::shared_ptr& bfile) { + write_mutex_.AssertHeld(); + + // atomic read + if (bfile->GetFileSize() < bdb_options_.blob_file_size) { + return Status::OK(); + } + + WriteLock lock(&mutex_); + WriteLock file_lock(&bfile->mutex_); + + assert(!bfile->Obsolete() || bfile->Immutable()); + if (bfile->Immutable()) { + return Status::OK(); + } + + return CloseBlobFile(bfile); +} + +void BlobDBImpl::ObsoleteBlobFile(std::shared_ptr blob_file, + SequenceNumber obsolete_seq, + bool update_size) { + assert(blob_file->Immutable()); + assert(!blob_file->Obsolete()); + + // Should hold write lock of mutex_ or during DB open. + blob_file->MarkObsolete(obsolete_seq); + obsolete_files_.push_back(blob_file); + assert(total_blob_size_.load() >= blob_file->GetFileSize()); + if (update_size) { + total_blob_size_ -= blob_file->GetFileSize(); + } +} + +bool BlobDBImpl::VisibleToActiveSnapshot( + const std::shared_ptr& bfile) { + assert(bfile->Obsolete()); + + // We check whether the oldest snapshot is no less than the last sequence + // by the time the blob file become obsolete. If so, the blob file is not + // visible to all existing snapshots. + // + // If we keep track of the earliest sequence of the keys in the blob file, + // we could instead check if there's a snapshot falls in range + // [earliest_sequence, obsolete_sequence). But doing so will make the + // implementation more complicated. + SequenceNumber obsolete_sequence = bfile->GetObsoleteSequence(); + SequenceNumber oldest_snapshot = kMaxSequenceNumber; + { + // Need to lock DBImpl mutex before access snapshot list. 
+ InstrumentedMutexLock l(db_impl_->mutex()); + auto& snapshots = db_impl_->snapshots(); + if (!snapshots.empty()) { + oldest_snapshot = snapshots.oldest()->GetSequenceNumber(); + } + } + bool visible = oldest_snapshot < obsolete_sequence; + if (visible) { + ROCKS_LOG_INFO(db_options_.info_log, + "Obsolete blob file %" PRIu64 " (obsolete at %" PRIu64 + ") visible to oldest snapshot %" PRIu64 ".", + bfile->BlobFileNumber(), obsolete_sequence, oldest_snapshot); + } + return visible; +} + +std::pair BlobDBImpl::EvictExpiredFiles(bool aborted) { + if (aborted) { + return std::make_pair(false, -1); + } + + TEST_SYNC_POINT("BlobDBImpl::EvictExpiredFiles:0"); + TEST_SYNC_POINT("BlobDBImpl::EvictExpiredFiles:1"); + + std::vector> process_files; + uint64_t now = EpochNow(); + { + ReadLock rl(&mutex_); + for (auto p : blob_files_) { + auto& blob_file = p.second; + ReadLock file_lock(&blob_file->mutex_); + if (blob_file->HasTTL() && !blob_file->Obsolete() && + blob_file->GetExpirationRange().second <= now) { + process_files.push_back(blob_file); + } + } + } + + TEST_SYNC_POINT("BlobDBImpl::EvictExpiredFiles:2"); + TEST_SYNC_POINT("BlobDBImpl::EvictExpiredFiles:3"); + TEST_SYNC_POINT_CALLBACK("BlobDBImpl::EvictExpiredFiles:cb", nullptr); + + SequenceNumber seq = GetLatestSequenceNumber(); + { + MutexLock l(&write_mutex_); + WriteLock lock(&mutex_); + for (auto& blob_file : process_files) { + WriteLock file_lock(&blob_file->mutex_); + + // Need to double check if the file is obsolete. + if (blob_file->Obsolete()) { + assert(blob_file->Immutable()); + continue; + } + + if (!blob_file->Immutable()) { + CloseBlobFile(blob_file); + } + + assert(blob_file->Immutable()); + + ObsoleteBlobFile(blob_file, seq, true /*update_size*/); + } + } + + return std::make_pair(true, -1); +} + +Status BlobDBImpl::SyncBlobFiles() { + MutexLock l(&write_mutex_); + + std::vector> process_files; + { + ReadLock rl(&mutex_); + for (auto fitr : open_ttl_files_) { + process_files.push_back(fitr); + } + if (open_non_ttl_file_ != nullptr) { + process_files.push_back(open_non_ttl_file_); + } + } + + Status s; + for (auto& blob_file : process_files) { + s = blob_file->Fsync(); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Failed to sync blob file %" PRIu64 ", status: %s", + blob_file->BlobFileNumber(), s.ToString().c_str()); + return s; + } + } + + s = dir_ent_->FsyncWithDirOptions(IOOptions(), nullptr, DirFsyncOptions()); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Failed to sync blob directory, status: %s", + s.ToString().c_str()); + } + return s; +} + +std::pair BlobDBImpl::ReclaimOpenFiles(bool aborted) { + if (aborted) return std::make_pair(false, -1); + + if (open_file_count_.load() < kOpenFilesTrigger) { + return std::make_pair(true, -1); + } + + // in the future, we should sort by last_access_ + // instead of closing every file + ReadLock rl(&mutex_); + for (auto const& ent : blob_files_) { + auto bfile = ent.second; + if (bfile->last_access_.load() == -1) continue; + + WriteLock lockbfile_w(&bfile->mutex_); + CloseRandomAccessLocked(bfile); + } + + return std::make_pair(true, -1); +} + +std::pair BlobDBImpl::DeleteObsoleteFiles(bool aborted) { + if (aborted) { + return std::make_pair(false, -1); + } + + MutexLock delete_file_lock(&delete_file_mutex_); + if (disable_file_deletions_ > 0) { + return std::make_pair(true, -1); + } + + std::list> tobsolete; + { + WriteLock wl(&mutex_); + if (obsolete_files_.empty()) { + return std::make_pair(true, -1); + } + tobsolete.swap(obsolete_files_); + } + + 
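+  // Work on the local tobsolete list so that mutex_ is not held while files
+  // are being unlinked. Files that are still visible to a live snapshot, or
+  // whose deletion fails, remain in tobsolete and are pushed back onto
+  // obsolete_files_ at the end of this function.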
bool file_deleted = false; + for (auto iter = tobsolete.begin(); iter != tobsolete.end();) { + auto bfile = *iter; + { + ReadLock lockbfile_r(&bfile->mutex_); + if (VisibleToActiveSnapshot(bfile)) { + ROCKS_LOG_INFO(db_options_.info_log, + "Could not delete file due to snapshot failure %s", + bfile->PathName().c_str()); + ++iter; + continue; + } + } + ROCKS_LOG_INFO(db_options_.info_log, + "Will delete file due to snapshot success %s", + bfile->PathName().c_str()); + + { + WriteLock wl(&mutex_); + blob_files_.erase(bfile->BlobFileNumber()); + } + + Status s = DeleteDBFile(&(db_impl_->immutable_db_options()), + bfile->PathName(), blob_dir_, true, + /*force_fg=*/false); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "File failed to be deleted as obsolete %s", + bfile->PathName().c_str()); + ++iter; + continue; + } + + file_deleted = true; + ROCKS_LOG_INFO(db_options_.info_log, + "File deleted as obsolete from blob dir %s", + bfile->PathName().c_str()); + + iter = tobsolete.erase(iter); + } + + // directory change. Fsync + if (file_deleted) { + Status s = dir_ent_->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kFileDeleted)); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, "Failed to sync dir %s: %s", + blob_dir_.c_str(), s.ToString().c_str()); + } + } + + // put files back into obsolete if for some reason, delete failed + if (!tobsolete.empty()) { + WriteLock wl(&mutex_); + for (auto bfile : tobsolete) { + blob_files_.insert(std::make_pair(bfile->BlobFileNumber(), bfile)); + obsolete_files_.push_front(bfile); + } + } + + return std::make_pair(!aborted, -1); +} + +void BlobDBImpl::CopyBlobFiles( + std::vector>* bfiles_copy) { + ReadLock rl(&mutex_); + for (auto const& p : blob_files_) { + bfiles_copy->push_back(p.second); + } +} + +Iterator* BlobDBImpl::NewIterator(const ReadOptions& read_options) { + auto* cfd = + static_cast_with_check(DefaultColumnFamily()) + ->cfd(); + // Get a snapshot to avoid blob file get deleted between we + // fetch and index entry and reading from the file. + ManagedSnapshot* own_snapshot = nullptr; + const Snapshot* snapshot = read_options.snapshot; + if (snapshot == nullptr) { + own_snapshot = new ManagedSnapshot(db_); + snapshot = own_snapshot->snapshot(); + } + auto* iter = db_impl_->NewIteratorImpl( + read_options, cfd, snapshot->GetSequenceNumber(), + nullptr /*read_callback*/, true /*expose_blob_index*/); + return new BlobDBIterator(own_snapshot, iter, this, clock_, statistics_); +} + +Status DestroyBlobDB(const std::string& dbname, const Options& options, + const BlobDBOptions& bdb_options) { + const ImmutableDBOptions soptions(SanitizeOptions(dbname, options)); + Env* env = soptions.env; + + Status status; + std::string blobdir; + blobdir = (bdb_options.path_relative) ? dbname + "/" + bdb_options.blob_dir + : bdb_options.blob_dir; + + std::vector filenames; + if (env->GetChildren(blobdir, &filenames).ok()) { + for (const auto& f : filenames) { + uint64_t number; + FileType type; + if (ParseFileName(f, &number, &type) && type == kBlobFile) { + Status del = DeleteDBFile(&soptions, blobdir + "/" + f, blobdir, true, + /*force_fg=*/false); + if (status.ok() && !del.ok()) { + status = del; + } + } + } + // TODO: What to do if we cannot delete the directory? 
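+  // Directory removal is best-effort: every blob file was already deleted
+  // individually above, and the first per-file failure (if any) is what is
+  // reported to the caller, combined with the DestroyDB() result below.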
+ env->DeleteDir(blobdir).PermitUncheckedError(); + } + Status destroy = DestroyDB(dbname, options); + if (status.ok() && !destroy.ok()) { + status = destroy; + } + + return status; +} + +#ifndef NDEBUG +Status BlobDBImpl::TEST_GetBlobValue(const Slice& key, const Slice& index_entry, + PinnableSlice* value) { + return GetBlobValue(key, index_entry, value); +} + +void BlobDBImpl::TEST_AddDummyBlobFile(uint64_t blob_file_number, + SequenceNumber immutable_sequence) { + auto blob_file = std::make_shared(this, blob_dir_, blob_file_number, + db_options_.info_log.get()); + blob_file->MarkImmutable(immutable_sequence); + + blob_files_[blob_file_number] = blob_file; + live_imm_non_ttl_blob_files_[blob_file_number] = blob_file; +} + +std::vector> BlobDBImpl::TEST_GetBlobFiles() const { + ReadLock l(&mutex_); + std::vector> blob_files; + for (auto& p : blob_files_) { + blob_files.emplace_back(p.second); + } + return blob_files; +} + +std::vector> BlobDBImpl::TEST_GetLiveImmNonTTLFiles() + const { + ReadLock l(&mutex_); + std::vector> live_imm_non_ttl_files; + for (const auto& pair : live_imm_non_ttl_blob_files_) { + live_imm_non_ttl_files.emplace_back(pair.second); + } + return live_imm_non_ttl_files; +} + +std::vector> BlobDBImpl::TEST_GetObsoleteFiles() + const { + ReadLock l(&mutex_); + std::vector> obsolete_files; + for (auto& bfile : obsolete_files_) { + obsolete_files.emplace_back(bfile); + } + return obsolete_files; +} + +void BlobDBImpl::TEST_DeleteObsoleteFiles() { + DeleteObsoleteFiles(false /*abort*/); +} + +Status BlobDBImpl::TEST_CloseBlobFile(std::shared_ptr& bfile) { + MutexLock l(&write_mutex_); + WriteLock lock(&mutex_); + WriteLock file_lock(&bfile->mutex_); + + return CloseBlobFile(bfile); +} + +void BlobDBImpl::TEST_ObsoleteBlobFile(std::shared_ptr& blob_file, + SequenceNumber obsolete_seq, + bool update_size) { + return ObsoleteBlobFile(blob_file, obsolete_seq, update_size); +} + +void BlobDBImpl::TEST_EvictExpiredFiles() { + EvictExpiredFiles(false /*abort*/); +} + +uint64_t BlobDBImpl::TEST_live_sst_size() { return live_sst_size_.load(); } + +void BlobDBImpl::TEST_InitializeBlobFileToSstMapping( + const std::vector& live_files) { + InitializeBlobFileToSstMapping(live_files); +} + +void BlobDBImpl::TEST_ProcessFlushJobInfo(const FlushJobInfo& info) { + ProcessFlushJobInfo(info); +} + +void BlobDBImpl::TEST_ProcessCompactionJobInfo(const CompactionJobInfo& info) { + ProcessCompactionJobInfo(info); +} + +#endif // !NDEBUG + +} // namespace blob_db +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/blob_db/blob_db_impl.h b/src/rocksdb/utilities/blob_db/blob_db_impl.h new file mode 100644 index 000000000..0b4dbf5e5 --- /dev/null +++ b/src/rocksdb/utilities/blob_db/blob_db_impl.h @@ -0,0 +1,503 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
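+
+// Illustrative usage sketch for the BlobDB declared here (non-authoritative;
+// the public interface lives in utilities/blob_db/blob_db.h, and the path
+// "/tmp/blobdb_example" below is only a placeholder):
+//
+//   blob_db::BlobDBOptions bdb_options;
+//   bdb_options.min_blob_size = 0;        // store every value in a blob file
+//   Options options;
+//   options.create_if_missing = true;
+//   blob_db::BlobDB* db = nullptr;
+//   Status s = blob_db::BlobDB::Open(options, bdb_options,
+//                                    "/tmp/blobdb_example", &db);
+//   if (s.ok()) {
+//     s = db->PutWithTTL(WriteOptions(), "key", "value", /*ttl=*/60);
+//     delete db;
+//   }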
+ +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "db/blob/blob_log_format.h" +#include "db/blob/blob_log_writer.h" +#include "db/db_iter.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/db.h" +#include "rocksdb/file_system.h" +#include "rocksdb/listener.h" +#include "rocksdb/options.h" +#include "rocksdb/statistics.h" +#include "rocksdb/wal_filter.h" +#include "util/mutexlock.h" +#include "util/timer_queue.h" +#include "utilities/blob_db/blob_db.h" +#include "utilities/blob_db/blob_file.h" + +namespace ROCKSDB_NAMESPACE { + +class DBImpl; +class ColumnFamilyHandle; +class ColumnFamilyData; +class SystemClock; + +struct FlushJobInfo; + +namespace blob_db { + +struct BlobCompactionContext; +struct BlobCompactionContextGC; +class BlobDBImpl; +class BlobFile; + +// Comparator to sort "TTL" aware Blob files based on the lower value of +// TTL range. +struct BlobFileComparatorTTL { + bool operator()(const std::shared_ptr& lhs, + const std::shared_ptr& rhs) const; +}; + +struct BlobFileComparator { + bool operator()(const std::shared_ptr& lhs, + const std::shared_ptr& rhs) const; +}; + +/** + * The implementation class for BlobDB. It manages the blob logs, which + * are sequentially written files. Blob logs can be of the TTL or non-TTL + * varieties; the former are cleaned up when they expire, while the latter + * are (optionally) garbage collected. + */ +class BlobDBImpl : public BlobDB { + friend class BlobFile; + friend class BlobDBIterator; + friend class BlobDBListener; + friend class BlobDBListenerGC; + friend class BlobIndexCompactionFilterBase; + friend class BlobIndexCompactionFilterGC; + + public: + // deletions check period + static constexpr uint32_t kDeleteCheckPeriodMillisecs = 2 * 1000; + + // sanity check task + static constexpr uint32_t kSanityCheckPeriodMillisecs = 20 * 60 * 1000; + + // how many random access open files can we tolerate + static constexpr uint32_t kOpenFilesTrigger = 100; + + // how often to schedule reclaim open files. + static constexpr uint32_t kReclaimOpenFilesPeriodMillisecs = 1 * 1000; + + // how often to schedule delete obs files periods + static constexpr uint32_t kDeleteObsoleteFilesPeriodMillisecs = 10 * 1000; + + // how often to schedule expired files eviction. 
+ static constexpr uint32_t kEvictExpiredFilesPeriodMillisecs = 10 * 1000; + + // when should oldest file be evicted: + // on reaching 90% of blob_dir_size + static constexpr double kEvictOldestFileAtSize = 0.9; + + using BlobDB::Put; + Status Put(const WriteOptions& options, const Slice& key, + const Slice& value) override; + + using BlobDB::Get; + Status Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value) override; + + Status Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value, + uint64_t* expiration) override; + + using BlobDB::NewIterator; + virtual Iterator* NewIterator(const ReadOptions& read_options) override; + + using BlobDB::NewIterators; + virtual Status NewIterators( + const ReadOptions& /*read_options*/, + const std::vector& /*column_families*/, + std::vector* /*iterators*/) override { + return Status::NotSupported("Not implemented"); + } + + using BlobDB::MultiGet; + virtual std::vector MultiGet( + const ReadOptions& read_options, const std::vector& keys, + std::vector* values) override; + + using BlobDB::Write; + virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; + + virtual Status Close() override; + + using BlobDB::PutWithTTL; + Status PutWithTTL(const WriteOptions& options, const Slice& key, + const Slice& value, uint64_t ttl) override; + + using BlobDB::PutUntil; + Status PutUntil(const WriteOptions& options, const Slice& key, + const Slice& value, uint64_t expiration) override; + + using BlobDB::CompactFiles; + Status CompactFiles( + const CompactionOptions& compact_options, + const std::vector& input_file_names, const int output_level, + const int output_path_id = -1, + std::vector* const output_file_names = nullptr, + CompactionJobInfo* compaction_job_info = nullptr) override; + + BlobDBOptions GetBlobDBOptions() const override; + + BlobDBImpl(const std::string& dbname, const BlobDBOptions& bdb_options, + const DBOptions& db_options, + const ColumnFamilyOptions& cf_options); + + virtual Status DisableFileDeletions() override; + + virtual Status EnableFileDeletions(bool force) override; + + virtual Status GetLiveFiles(std::vector&, + uint64_t* manifest_file_size, + bool flush_memtable = true) override; + virtual void GetLiveFilesMetaData(std::vector*) override; + + ~BlobDBImpl(); + + Status Open(std::vector* handles); + + Status SyncBlobFiles() override; + + // Common part of the two GetCompactionContext methods below. 
+ // REQUIRES: read lock on mutex_ + void GetCompactionContextCommon(BlobCompactionContext* context); + + void GetCompactionContext(BlobCompactionContext* context); + void GetCompactionContext(BlobCompactionContext* context, + BlobCompactionContextGC* context_gc); + +#ifndef NDEBUG + Status TEST_GetBlobValue(const Slice& key, const Slice& index_entry, + PinnableSlice* value); + + void TEST_AddDummyBlobFile(uint64_t blob_file_number, + SequenceNumber immutable_sequence); + + std::vector> TEST_GetBlobFiles() const; + + std::vector> TEST_GetLiveImmNonTTLFiles() const; + + std::vector> TEST_GetObsoleteFiles() const; + + Status TEST_CloseBlobFile(std::shared_ptr& bfile); + + void TEST_ObsoleteBlobFile(std::shared_ptr& blob_file, + SequenceNumber obsolete_seq = 0, + bool update_size = true); + + void TEST_EvictExpiredFiles(); + + void TEST_DeleteObsoleteFiles(); + + uint64_t TEST_live_sst_size(); + + const std::string& TEST_blob_dir() const { return blob_dir_; } + + void TEST_InitializeBlobFileToSstMapping( + const std::vector& live_files); + + void TEST_ProcessFlushJobInfo(const FlushJobInfo& info); + + void TEST_ProcessCompactionJobInfo(const CompactionJobInfo& info); + +#endif // !NDEBUG + + private: + class BlobInserter; + + // Create a snapshot if there isn't one in read options. + // Return true if a snapshot is created. + bool SetSnapshotIfNeeded(ReadOptions* read_options); + + Status GetImpl(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value, uint64_t* expiration = nullptr); + + Status GetBlobValue(const Slice& key, const Slice& index_entry, + PinnableSlice* value, uint64_t* expiration = nullptr); + + Status GetRawBlobFromFile(const Slice& key, uint64_t file_number, + uint64_t offset, uint64_t size, + PinnableSlice* value, + CompressionType* compression_type); + + Slice GetCompressedSlice(const Slice& raw, + std::string* compression_output) const; + + Status DecompressSlice(const Slice& compressed_value, + CompressionType compression_type, + PinnableSlice* value_output) const; + + // Close a file by appending a footer, and removes file from open files list. + // REQUIRES: lock held on write_mutex_, write lock held on both the db mutex_ + // and the blob file's mutex_. If called on a blob file which is visible only + // to a single thread (like in the case of new files written during + // compaction/GC), the locks on write_mutex_ and the blob file's mutex_ can be + // avoided. + Status CloseBlobFile(std::shared_ptr bfile); + + // Close a file if its size exceeds blob_file_size + // REQUIRES: lock held on write_mutex_. + Status CloseBlobFileIfNeeded(std::shared_ptr& bfile); + + // Mark file as obsolete and move the file to obsolete file list. + // + // REQUIRED: hold write lock of mutex_ or during DB open. + void ObsoleteBlobFile(std::shared_ptr blob_file, + SequenceNumber obsolete_seq, bool update_size); + + Status PutBlobValue(const WriteOptions& options, const Slice& key, + const Slice& value, uint64_t expiration, + WriteBatch* batch); + + Status AppendBlob(const std::shared_ptr& bfile, + const std::string& headerbuf, const Slice& key, + const Slice& value, uint64_t expiration, + std::string* index_entry); + + // Create a new blob file and associated writer. + Status CreateBlobFileAndWriter(bool has_ttl, + const ExpirationRange& expiration_range, + const std::string& reason, + std::shared_ptr* blob_file, + std::shared_ptr* writer); + + // Get the open non-TTL blob log file, or create a new one if no such file + // exists. 
+ Status SelectBlobFile(std::shared_ptr* blob_file); + + // Get the open TTL blob log file for a certain expiration, or create a new + // one if no such file exists. + Status SelectBlobFileTTL(uint64_t expiration, + std::shared_ptr* blob_file); + + std::shared_ptr FindBlobFileLocked(uint64_t expiration) const; + + // periodic sanity check. Bunch of checks + std::pair SanityCheck(bool aborted); + + // Delete files that have been marked obsolete (either because of TTL + // or GC). Check whether any snapshots exist which refer to the same. + std::pair DeleteObsoleteFiles(bool aborted); + + // periodically check if open blob files and their TTL's has expired + // if expired, close the sequential writer and make the file immutable + std::pair EvictExpiredFiles(bool aborted); + + // if the number of open files, approaches ULIMIT's this + // task will close random readers, which are kept around for + // efficiency + std::pair ReclaimOpenFiles(bool aborted); + + std::pair RemoveTimerQ(TimerQueue* tq, bool aborted); + + // Adds the background tasks to the timer queue + void StartBackgroundTasks(); + + // add a new Blob File + std::shared_ptr NewBlobFile(bool has_ttl, + const ExpirationRange& expiration_range, + const std::string& reason); + + // Register a new blob file. + // REQUIRES: write lock on mutex_. + void RegisterBlobFile(std::shared_ptr blob_file); + + // collect all the blob log files from the blob directory + Status GetAllBlobFiles(std::set* file_numbers); + + // Open all blob files found in blob_dir. + Status OpenAllBlobFiles(); + + // Link an SST to a blob file. Comes in locking and non-locking varieties + // (the latter is used during Open). + template + void LinkSstToBlobFileImpl(uint64_t sst_file_number, + uint64_t blob_file_number, Linker linker); + + void LinkSstToBlobFile(uint64_t sst_file_number, uint64_t blob_file_number); + + void LinkSstToBlobFileNoLock(uint64_t sst_file_number, + uint64_t blob_file_number); + + // Unlink an SST from a blob file. + void UnlinkSstFromBlobFile(uint64_t sst_file_number, + uint64_t blob_file_number); + + // Initialize the mapping between blob files and SSTs during Open. + void InitializeBlobFileToSstMapping( + const std::vector& live_files); + + // Update the mapping between blob files and SSTs after a flush and mark + // any unneeded blob files obsolete. + void ProcessFlushJobInfo(const FlushJobInfo& info); + + // Update the mapping between blob files and SSTs after a compaction and + // mark any unneeded blob files obsolete. + void ProcessCompactionJobInfo(const CompactionJobInfo& info); + + // Mark an immutable non-TTL blob file obsolete assuming it has no more SSTs + // linked to it, and all memtables from before the blob file became immutable + // have been flushed. Note: should only be called if the condition holds for + // all lower-numbered non-TTL blob files as well. + bool MarkBlobFileObsoleteIfNeeded(const std::shared_ptr& blob_file, + SequenceNumber obsolete_seq); + + // Mark all immutable non-TTL blob files that aren't needed by any SSTs as + // obsolete. Comes in two varieties; the version used during Open need not + // worry about locking or snapshots. + template + void MarkUnreferencedBlobFilesObsoleteImpl(Functor mark_if_needed); + + void MarkUnreferencedBlobFilesObsolete(); + void MarkUnreferencedBlobFilesObsoleteDuringOpen(); + + void UpdateLiveSSTSize(); + + Status GetBlobFileReader(const std::shared_ptr& blob_file, + std::shared_ptr* reader); + + // hold write mutex on file and call. 
+ // Close the above Random Access reader + void CloseRandomAccessLocked(const std::shared_ptr& bfile); + + // hold write mutex on file and call + // creates a sequential (append) writer for this blobfile + Status CreateWriterLocked(const std::shared_ptr& bfile); + + // returns a BlobLogWriter object for the file. If writer is not + // already present, creates one. Needs Write Mutex to be held + Status CheckOrCreateWriterLocked(const std::shared_ptr& blob_file, + std::shared_ptr* writer); + + // checks if there is no snapshot which is referencing the + // blobs + bool VisibleToActiveSnapshot(const std::shared_ptr& file); + bool FileDeleteOk_SnapshotCheckLocked(const std::shared_ptr& bfile); + + void CopyBlobFiles(std::vector>* bfiles_copy); + + uint64_t EpochNow() { return clock_->NowMicros() / 1000000; } + + // Check if inserting a new blob will make DB grow out of space. + // If is_fifo = true, FIFO eviction will be triggered to make room for the + // new blob. If force_evict = true, FIFO eviction will evict blob files + // even eviction will not make enough room for the new blob. + Status CheckSizeAndEvictBlobFiles(uint64_t blob_size, + bool force_evict = false); + + // name of the database directory + std::string dbname_; + + // the base DB + DBImpl* db_impl_; + Env* env_; + SystemClock* clock_; + // the options that govern the behavior of Blob Storage + BlobDBOptions bdb_options_; + DBOptions db_options_; + ColumnFamilyOptions cf_options_; + FileOptions file_options_; + + // Raw pointer of statistic. db_options_ has a std::shared_ptr to hold + // ownership. + Statistics* statistics_; + + // by default this is "blob_dir" under dbname_ + // but can be configured + std::string blob_dir_; + + // pointer to directory + std::unique_ptr dir_ent_; + + // Read Write Mutex, which protects all the data structures + // HEAVILY TRAFFICKED + mutable port::RWMutex mutex_; + + // Writers has to hold write_mutex_ before writing. + mutable port::Mutex write_mutex_; + + // counter for blob file number + std::atomic next_file_number_; + + // entire metadata of all the BLOB files memory + std::map> blob_files_; + + // All live immutable non-TTL blob files. + std::map> live_imm_non_ttl_blob_files_; + + // The largest sequence number that has been flushed. + SequenceNumber flush_sequence_; + + // opened non-TTL blob file. + std::shared_ptr open_non_ttl_file_; + + // all the blob files which are currently being appended to based + // on variety of incoming TTL's + std::set, BlobFileComparatorTTL> open_ttl_files_; + + // Flag to check whether Close() has been called on this DB + bool closed_; + + // timer based queue to execute tasks + TimerQueue tqueue_; + + // number of files opened for random access/GET + // counter is used to monitor and close excess RA files. + std::atomic open_file_count_; + + // Total size of all live blob files (i.e. exclude obsolete files). + std::atomic total_blob_size_; + + // total size of SST files. + std::atomic live_sst_size_; + + // Latest FIFO eviction timestamp + // + // REQUIRES: access with metex_ lock held. + uint64_t fifo_eviction_seq_; + + // The expiration up to which latest FIFO eviction evicts. + // + // REQUIRES: access with metex_ lock held. + uint64_t evict_expiration_up_to_; + + std::list> obsolete_files_; + + // DeleteObsoleteFiles, DiableFileDeletions and EnableFileDeletions block + // on the mutex to avoid contention. + // + // While DeleteObsoleteFiles hold both mutex_ and delete_file_mutex_, note + // the difference. 
mutex_ only needs to be held when access the + // data-structure, and delete_file_mutex_ needs to be held the whole time + // during DeleteObsoleteFiles to avoid being run simultaneously with + // DisableFileDeletions. + // + // If both of mutex_ and delete_file_mutex_ needs to be held, it is adviced + // to hold delete_file_mutex_ first to avoid deadlock. + mutable port::Mutex delete_file_mutex_; + + // Each call of DisableFileDeletions will increase disable_file_deletion_ + // by 1. EnableFileDeletions will either decrease the count by 1 or reset + // it to zeor, depending on the force flag. + // + // REQUIRES: access with delete_file_mutex_ held. + int disable_file_deletions_ = 0; + + uint32_t debug_level_; +}; + +} // namespace blob_db +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc b/src/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc new file mode 100644 index 000000000..87e3f33cc --- /dev/null +++ b/src/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc @@ -0,0 +1,113 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "file/filename.h" +#include "logging/logging.h" +#include "util/cast_util.h" +#include "util/mutexlock.h" +#include "utilities/blob_db/blob_db_impl.h" + +// BlobDBImpl methods to get snapshot of files, e.g. for replication. + +namespace ROCKSDB_NAMESPACE { +namespace blob_db { + +Status BlobDBImpl::DisableFileDeletions() { + // Disable base DB file deletions. + Status s = db_impl_->DisableFileDeletions(); + if (!s.ok()) { + return s; + } + + int count = 0; + { + // Hold delete_file_mutex_ to make sure no DeleteObsoleteFiles job + // is running. + MutexLock l(&delete_file_mutex_); + count = ++disable_file_deletions_; + } + + ROCKS_LOG_INFO(db_options_.info_log, + "Disabled blob file deletions. count: %d", count); + return Status::OK(); +} + +Status BlobDBImpl::EnableFileDeletions(bool force) { + // Enable base DB file deletions. + Status s = db_impl_->EnableFileDeletions(force); + if (!s.ok()) { + return s; + } + + int count = 0; + { + MutexLock l(&delete_file_mutex_); + if (force) { + disable_file_deletions_ = 0; + } else if (disable_file_deletions_ > 0) { + count = --disable_file_deletions_; + } + assert(count >= 0); + } + + ROCKS_LOG_INFO(db_options_.info_log, "Enabled blob file deletions. count: %d", + count); + // Consider trigger DeleteobsoleteFiles once after re-enabled, if we are to + // make DeleteobsoleteFiles re-run interval configuration. + return Status::OK(); +} + +Status BlobDBImpl::GetLiveFiles(std::vector& ret, + uint64_t* manifest_file_size, + bool flush_memtable) { + if (!bdb_options_.path_relative) { + return Status::NotSupported( + "Not able to get relative blob file path from absolute blob_dir."); + } + // Hold a lock in the beginning to avoid updates to base DB during the call + ReadLock rl(&mutex_); + Status s = db_->GetLiveFiles(ret, manifest_file_size, flush_memtable); + if (!s.ok()) { + return s; + } + ret.reserve(ret.size() + blob_files_.size()); + for (auto bfile_pair : blob_files_) { + auto blob_file = bfile_pair.second; + // Path should be relative to db_name, but begin with slash. 
+ ret.emplace_back( + BlobFileName("", bdb_options_.blob_dir, blob_file->BlobFileNumber())); + } + return Status::OK(); +} + +void BlobDBImpl::GetLiveFilesMetaData(std::vector* metadata) { + // Path should be relative to db_name. + assert(bdb_options_.path_relative); + // Hold a lock in the beginning to avoid updates to base DB during the call + ReadLock rl(&mutex_); + db_->GetLiveFilesMetaData(metadata); + for (auto bfile_pair : blob_files_) { + auto blob_file = bfile_pair.second; + LiveFileMetaData filemetadata; + filemetadata.size = blob_file->GetFileSize(); + const uint64_t file_number = blob_file->BlobFileNumber(); + // Path should be relative to db_name, but begin with slash. + filemetadata.name = BlobFileName("", bdb_options_.blob_dir, file_number); + filemetadata.file_number = file_number; + if (blob_file->HasTTL()) { + filemetadata.oldest_ancester_time = blob_file->GetExpirationRange().first; + } + auto cfh = + static_cast_with_check(DefaultColumnFamily()); + filemetadata.column_family_name = cfh->GetName(); + metadata->emplace_back(filemetadata); + } +} + +} // namespace blob_db +} // namespace ROCKSDB_NAMESPACE +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/utilities/blob_db/blob_db_iterator.h b/src/rocksdb/utilities/blob_db/blob_db_iterator.h new file mode 100644 index 000000000..fd2b2f8f5 --- /dev/null +++ b/src/rocksdb/utilities/blob_db/blob_db_iterator.h @@ -0,0 +1,150 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include "db/arena_wrapped_db_iter.h" +#include "rocksdb/iterator.h" +#include "util/stop_watch.h" +#include "utilities/blob_db/blob_db_impl.h" + +namespace ROCKSDB_NAMESPACE { +class Statistics; +class SystemClock; + +namespace blob_db { + +using ROCKSDB_NAMESPACE::ManagedSnapshot; + +class BlobDBIterator : public Iterator { + public: + BlobDBIterator(ManagedSnapshot* snapshot, ArenaWrappedDBIter* iter, + BlobDBImpl* blob_db, SystemClock* clock, + Statistics* statistics) + : snapshot_(snapshot), + iter_(iter), + blob_db_(blob_db), + clock_(clock), + statistics_(statistics) {} + + virtual ~BlobDBIterator() = default; + + bool Valid() const override { + if (!iter_->Valid()) { + return false; + } + return status_.ok(); + } + + Status status() const override { + if (!iter_->status().ok()) { + return iter_->status(); + } + return status_; + } + + void SeekToFirst() override { + StopWatch seek_sw(clock_, statistics_, BLOB_DB_SEEK_MICROS); + RecordTick(statistics_, BLOB_DB_NUM_SEEK); + iter_->SeekToFirst(); + while (UpdateBlobValue()) { + iter_->Next(); + } + } + + void SeekToLast() override { + StopWatch seek_sw(clock_, statistics_, BLOB_DB_SEEK_MICROS); + RecordTick(statistics_, BLOB_DB_NUM_SEEK); + iter_->SeekToLast(); + while (UpdateBlobValue()) { + iter_->Prev(); + } + } + + void Seek(const Slice& target) override { + StopWatch seek_sw(clock_, statistics_, BLOB_DB_SEEK_MICROS); + RecordTick(statistics_, BLOB_DB_NUM_SEEK); + iter_->Seek(target); + while (UpdateBlobValue()) { + iter_->Next(); + } + } + + void SeekForPrev(const Slice& target) override { + StopWatch seek_sw(clock_, statistics_, BLOB_DB_SEEK_MICROS); + RecordTick(statistics_, BLOB_DB_NUM_SEEK); + iter_->SeekForPrev(target); + while (UpdateBlobValue()) { + iter_->Prev(); + } + } + + void Next() override { + assert(Valid()); + StopWatch 
next_sw(clock_, statistics_, BLOB_DB_NEXT_MICROS); + RecordTick(statistics_, BLOB_DB_NUM_NEXT); + iter_->Next(); + while (UpdateBlobValue()) { + iter_->Next(); + } + } + + void Prev() override { + assert(Valid()); + StopWatch prev_sw(clock_, statistics_, BLOB_DB_PREV_MICROS); + RecordTick(statistics_, BLOB_DB_NUM_PREV); + iter_->Prev(); + while (UpdateBlobValue()) { + iter_->Prev(); + } + } + + Slice key() const override { + assert(Valid()); + return iter_->key(); + } + + Slice value() const override { + assert(Valid()); + if (!iter_->IsBlob()) { + return iter_->value(); + } + return value_; + } + + // Iterator::Refresh() not supported. + + private: + // Return true if caller should continue to next value. + bool UpdateBlobValue() { + value_.Reset(); + status_ = Status::OK(); + if (iter_->Valid() && iter_->status().ok() && iter_->IsBlob()) { + Status s = blob_db_->GetBlobValue(iter_->key(), iter_->value(), &value_); + if (s.IsNotFound()) { + return true; + } else { + if (!s.ok()) { + status_ = s; + } + return false; + } + } else { + return false; + } + } + + std::unique_ptr snapshot_; + std::unique_ptr iter_; + BlobDBImpl* blob_db_; + SystemClock* clock_; + Statistics* statistics_; + Status status_; + PinnableSlice value_; +}; +} // namespace blob_db +} // namespace ROCKSDB_NAMESPACE +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/utilities/blob_db/blob_db_listener.h b/src/rocksdb/utilities/blob_db/blob_db_listener.h new file mode 100644 index 000000000..d17d29853 --- /dev/null +++ b/src/rocksdb/utilities/blob_db/blob_db_listener.h @@ -0,0 +1,71 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
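+
+// Event listeners installed by BlobDBImpl on the base DB. BlobDBListener
+// syncs the open blob files when a flush begins and refreshes the live SST
+// size accounting after flushes and compactions. BlobDBListenerGC
+// additionally forwards FlushJobInfo/CompactionJobInfo to BlobDBImpl so the
+// blob-file-to-SST mapping stays current and unreferenced non-TTL blob files
+// can be marked obsolete.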
+ +#pragma once + +#ifndef ROCKSDB_LITE + +#include + +#include "rocksdb/listener.h" +#include "util/mutexlock.h" +#include "utilities/blob_db/blob_db_impl.h" + +namespace ROCKSDB_NAMESPACE { +namespace blob_db { + +class BlobDBListener : public EventListener { + public: + explicit BlobDBListener(BlobDBImpl* blob_db_impl) + : blob_db_impl_(blob_db_impl) {} + + void OnFlushBegin(DB* /*db*/, const FlushJobInfo& /*info*/) override { + assert(blob_db_impl_ != nullptr); + blob_db_impl_->SyncBlobFiles(); + } + + void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& /*info*/) override { + assert(blob_db_impl_ != nullptr); + blob_db_impl_->UpdateLiveSSTSize(); + } + + void OnCompactionCompleted(DB* /*db*/, + const CompactionJobInfo& /*info*/) override { + assert(blob_db_impl_ != nullptr); + blob_db_impl_->UpdateLiveSSTSize(); + } + + const char* Name() const override { return kClassName(); } + static const char* kClassName() { return "BlobDBListener"; } + + protected: + BlobDBImpl* blob_db_impl_; +}; + +class BlobDBListenerGC : public BlobDBListener { + public: + explicit BlobDBListenerGC(BlobDBImpl* blob_db_impl) + : BlobDBListener(blob_db_impl) {} + + const char* Name() const override { return kClassName(); } + static const char* kClassName() { return "BlobDBListenerGC"; } + void OnFlushCompleted(DB* db, const FlushJobInfo& info) override { + BlobDBListener::OnFlushCompleted(db, info); + + assert(blob_db_impl_); + blob_db_impl_->ProcessFlushJobInfo(info); + } + + void OnCompactionCompleted(DB* db, const CompactionJobInfo& info) override { + BlobDBListener::OnCompactionCompleted(db, info); + + assert(blob_db_impl_); + blob_db_impl_->ProcessCompactionJobInfo(info); + } +}; + +} // namespace blob_db +} // namespace ROCKSDB_NAMESPACE +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/utilities/blob_db/blob_db_test.cc b/src/rocksdb/utilities/blob_db/blob_db_test.cc new file mode 100644 index 000000000..e392962b2 --- /dev/null +++ b/src/rocksdb/utilities/blob_db/blob_db_test.cc @@ -0,0 +1,2407 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
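+
+// Tests for the BlobDB utility. The BlobDBTest fixture below manages opening,
+// reopening and destroying a BlobDB instance, provides Put*/Verify* helpers,
+// and wires in a mocked SystemClock plus a fault-injection Env so that
+// TTL/expiration and I/O-error behavior can be exercised deterministically.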
+ +#ifndef ROCKSDB_LITE + +#include "utilities/blob_db/blob_db.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "db/blob/blob_index.h" +#include "db/db_test_util.h" +#include "env/composite_env_wrapper.h" +#include "file/file_util.h" +#include "file/sst_file_manager_impl.h" +#include "port/port.h" +#include "rocksdb/utilities/debug.h" +#include "test_util/mock_time_env.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "util/random.h" +#include "util/string_util.h" +#include "utilities/blob_db/blob_db_impl.h" +#include "utilities/fault_injection_env.h" + +namespace ROCKSDB_NAMESPACE { +namespace blob_db { + +class BlobDBTest : public testing::Test { + public: + const int kMaxBlobSize = 1 << 14; + + struct BlobIndexVersion { + BlobIndexVersion() = default; + BlobIndexVersion(std::string _user_key, uint64_t _file_number, + uint64_t _expiration, SequenceNumber _sequence, + ValueType _type) + : user_key(std::move(_user_key)), + file_number(_file_number), + expiration(_expiration), + sequence(_sequence), + type(_type) {} + + std::string user_key; + uint64_t file_number = kInvalidBlobFileNumber; + uint64_t expiration = kNoExpiration; + SequenceNumber sequence = 0; + ValueType type = kTypeValue; + }; + + BlobDBTest() + : dbname_(test::PerThreadDBPath("blob_db_test")), blob_db_(nullptr) { + mock_clock_ = std::make_shared(SystemClock::Default()); + mock_env_.reset(new CompositeEnvWrapper(Env::Default(), mock_clock_)); + fault_injection_env_.reset(new FaultInjectionTestEnv(Env::Default())); + + Status s = DestroyBlobDB(dbname_, Options(), BlobDBOptions()); + assert(s.ok()); + } + + ~BlobDBTest() override { + SyncPoint::GetInstance()->ClearAllCallBacks(); + Destroy(); + } + + Status TryOpen(BlobDBOptions bdb_options = BlobDBOptions(), + Options options = Options()) { + options.create_if_missing = true; + if (options.env == mock_env_.get()) { + // Need to disable stats dumping and persisting which also use + // RepeatableThread, which uses InstrumentedCondVar::TimedWaitInternal. + // With mocked time, this can hang on some platforms (MacOS) + // because (a) on some platforms, pthread_cond_timedwait does not appear + // to release the lock for other threads to operate if the deadline time + // is already passed, and (b) TimedWait calls are currently a bad + // abstraction because the deadline parameter is usually computed from + // Env time, but is interpreted in real clock time. 
+ options.stats_dump_period_sec = 0; + options.stats_persist_period_sec = 0; + } + return BlobDB::Open(options, bdb_options, dbname_, &blob_db_); + } + + void Open(BlobDBOptions bdb_options = BlobDBOptions(), + Options options = Options()) { + ASSERT_OK(TryOpen(bdb_options, options)); + } + + void Reopen(BlobDBOptions bdb_options = BlobDBOptions(), + Options options = Options()) { + assert(blob_db_ != nullptr); + delete blob_db_; + blob_db_ = nullptr; + Open(bdb_options, options); + } + + void Close() { + assert(blob_db_ != nullptr); + delete blob_db_; + blob_db_ = nullptr; + } + + void Destroy() { + if (blob_db_) { + Options options = blob_db_->GetOptions(); + BlobDBOptions bdb_options = blob_db_->GetBlobDBOptions(); + delete blob_db_; + blob_db_ = nullptr; + ASSERT_OK(DestroyBlobDB(dbname_, options, bdb_options)); + } + } + + BlobDBImpl *blob_db_impl() { + return reinterpret_cast(blob_db_); + } + + Status Put(const Slice &key, const Slice &value, + std::map *data = nullptr) { + Status s = blob_db_->Put(WriteOptions(), key, value); + if (data != nullptr) { + (*data)[key.ToString()] = value.ToString(); + } + return s; + } + + void Delete(const std::string &key, + std::map *data = nullptr) { + ASSERT_OK(blob_db_->Delete(WriteOptions(), key)); + if (data != nullptr) { + data->erase(key); + } + } + + Status PutWithTTL(const Slice &key, const Slice &value, uint64_t ttl, + std::map *data = nullptr) { + Status s = blob_db_->PutWithTTL(WriteOptions(), key, value, ttl); + if (data != nullptr) { + (*data)[key.ToString()] = value.ToString(); + } + return s; + } + + Status PutUntil(const Slice &key, const Slice &value, uint64_t expiration) { + return blob_db_->PutUntil(WriteOptions(), key, value, expiration); + } + + void PutRandomWithTTL(const std::string &key, uint64_t ttl, Random *rnd, + std::map *data = nullptr) { + int len = rnd->Next() % kMaxBlobSize + 1; + std::string value = rnd->HumanReadableString(len); + ASSERT_OK( + blob_db_->PutWithTTL(WriteOptions(), Slice(key), Slice(value), ttl)); + if (data != nullptr) { + (*data)[key] = value; + } + } + + void PutRandomUntil(const std::string &key, uint64_t expiration, Random *rnd, + std::map *data = nullptr) { + int len = rnd->Next() % kMaxBlobSize + 1; + std::string value = rnd->HumanReadableString(len); + ASSERT_OK(blob_db_->PutUntil(WriteOptions(), Slice(key), Slice(value), + expiration)); + if (data != nullptr) { + (*data)[key] = value; + } + } + + void PutRandom(const std::string &key, Random *rnd, + std::map *data = nullptr) { + PutRandom(blob_db_, key, rnd, data); + } + + void PutRandom(DB *db, const std::string &key, Random *rnd, + std::map *data = nullptr) { + int len = rnd->Next() % kMaxBlobSize + 1; + std::string value = rnd->HumanReadableString(len); + ASSERT_OK(db->Put(WriteOptions(), Slice(key), Slice(value))); + if (data != nullptr) { + (*data)[key] = value; + } + } + + void PutRandomToWriteBatch( + const std::string &key, Random *rnd, WriteBatch *batch, + std::map *data = nullptr) { + int len = rnd->Next() % kMaxBlobSize + 1; + std::string value = rnd->HumanReadableString(len); + ASSERT_OK(batch->Put(key, value)); + if (data != nullptr) { + (*data)[key] = value; + } + } + + // Verify blob db contain expected data and nothing more. 
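+ // Each entry in `data` is read back through Get() (both the PinnableSlice
+ // and std::string overloads), and an iterator scan then confirms the DB
+ // contains exactly these key/value pairs and nothing else.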
+ void VerifyDB(const std::map &data) { + VerifyDB(blob_db_, data); + } + + void VerifyDB(DB *db, const std::map &data) { + // Verify normal Get + auto *cfh = db->DefaultColumnFamily(); + for (auto &p : data) { + PinnableSlice value_slice; + ASSERT_OK(db->Get(ReadOptions(), cfh, p.first, &value_slice)); + ASSERT_EQ(p.second, value_slice.ToString()); + std::string value; + ASSERT_OK(db->Get(ReadOptions(), cfh, p.first, &value)); + ASSERT_EQ(p.second, value); + } + + // Verify iterators + Iterator *iter = db->NewIterator(ReadOptions()); + iter->SeekToFirst(); + for (auto &p : data) { + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(p.first, iter->key().ToString()); + ASSERT_EQ(p.second, iter->value().ToString()); + iter->Next(); + } + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + delete iter; + } + + void VerifyBaseDB( + const std::map &expected_versions) { + auto *bdb_impl = static_cast(blob_db_); + DB *db = blob_db_->GetRootDB(); + const size_t kMaxKeys = 10000; + std::vector versions; + ASSERT_OK(GetAllKeyVersions(db, "", "", kMaxKeys, &versions)); + ASSERT_EQ(expected_versions.size(), versions.size()); + size_t i = 0; + for (auto &key_version : expected_versions) { + const KeyVersion &expected_version = key_version.second; + ASSERT_EQ(expected_version.user_key, versions[i].user_key); + ASSERT_EQ(expected_version.sequence, versions[i].sequence); + ASSERT_EQ(expected_version.type, versions[i].type); + if (versions[i].type == kTypeValue) { + ASSERT_EQ(expected_version.value, versions[i].value); + } else { + ASSERT_EQ(kTypeBlobIndex, versions[i].type); + PinnableSlice value; + ASSERT_OK(bdb_impl->TEST_GetBlobValue(versions[i].user_key, + versions[i].value, &value)); + ASSERT_EQ(expected_version.value, value.ToString()); + } + i++; + } + } + + void VerifyBaseDBBlobIndex( + const std::map &expected_versions) { + const size_t kMaxKeys = 10000; + std::vector versions; + ASSERT_OK( + GetAllKeyVersions(blob_db_->GetRootDB(), "", "", kMaxKeys, &versions)); + ASSERT_EQ(versions.size(), expected_versions.size()); + + size_t i = 0; + for (const auto &expected_pair : expected_versions) { + const BlobIndexVersion &expected_version = expected_pair.second; + + ASSERT_EQ(versions[i].user_key, expected_version.user_key); + ASSERT_EQ(versions[i].sequence, expected_version.sequence); + ASSERT_EQ(versions[i].type, expected_version.type); + if (versions[i].type != kTypeBlobIndex) { + ASSERT_EQ(kInvalidBlobFileNumber, expected_version.file_number); + ASSERT_EQ(kNoExpiration, expected_version.expiration); + + ++i; + continue; + } + + BlobIndex blob_index; + ASSERT_OK(blob_index.DecodeFrom(versions[i].value)); + + const uint64_t file_number = !blob_index.IsInlined() + ? blob_index.file_number() + : kInvalidBlobFileNumber; + ASSERT_EQ(file_number, expected_version.file_number); + + const uint64_t expiration = + blob_index.HasTTL() ? 
blob_index.expiration() : kNoExpiration; + ASSERT_EQ(expiration, expected_version.expiration); + + ++i; + } + } + + void InsertBlobs() { + WriteOptions wo; + std::string value; + + Random rnd(301); + for (size_t i = 0; i < 100000; i++) { + uint64_t ttl = rnd.Next() % 86400; + PutRandomWithTTL("key" + std::to_string(i % 500), ttl, &rnd, nullptr); + } + + for (size_t i = 0; i < 10; i++) { + Delete("key" + std::to_string(i % 500)); + } + } + + const std::string dbname_; + std::shared_ptr mock_clock_; + std::unique_ptr mock_env_; + std::unique_ptr fault_injection_env_; + BlobDB *blob_db_; +}; // class BlobDBTest + +TEST_F(BlobDBTest, Put) { + Random rnd(301); + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; + bdb_options.disable_background_tasks = true; + Open(bdb_options); + std::map data; + for (size_t i = 0; i < 100; i++) { + PutRandom("key" + std::to_string(i), &rnd, &data); + } + VerifyDB(data); +} + +TEST_F(BlobDBTest, PutWithTTL) { + Random rnd(301); + Options options; + options.env = mock_env_.get(); + BlobDBOptions bdb_options; + bdb_options.ttl_range_secs = 1000; + bdb_options.min_blob_size = 0; + bdb_options.blob_file_size = 256 * 1000 * 1000; + bdb_options.disable_background_tasks = true; + Open(bdb_options, options); + std::map data; + mock_clock_->SetCurrentTime(50); + for (size_t i = 0; i < 100; i++) { + uint64_t ttl = rnd.Next() % 100; + PutRandomWithTTL("key" + std::to_string(i), ttl, &rnd, + (ttl <= 50 ? nullptr : &data)); + } + mock_clock_->SetCurrentTime(100); + auto *bdb_impl = static_cast(blob_db_); + auto blob_files = bdb_impl->TEST_GetBlobFiles(); + ASSERT_EQ(1, blob_files.size()); + ASSERT_TRUE(blob_files[0]->HasTTL()); + ASSERT_OK(bdb_impl->TEST_CloseBlobFile(blob_files[0])); + VerifyDB(data); +} + +TEST_F(BlobDBTest, PutUntil) { + Random rnd(301); + Options options; + options.env = mock_env_.get(); + BlobDBOptions bdb_options; + bdb_options.ttl_range_secs = 1000; + bdb_options.min_blob_size = 0; + bdb_options.blob_file_size = 256 * 1000 * 1000; + bdb_options.disable_background_tasks = true; + Open(bdb_options, options); + std::map data; + mock_clock_->SetCurrentTime(50); + for (size_t i = 0; i < 100; i++) { + uint64_t expiration = rnd.Next() % 100 + 50; + PutRandomUntil("key" + std::to_string(i), expiration, &rnd, + (expiration <= 100 ? 
nullptr : &data)); + } + mock_clock_->SetCurrentTime(100); + auto *bdb_impl = static_cast(blob_db_); + auto blob_files = bdb_impl->TEST_GetBlobFiles(); + ASSERT_EQ(1, blob_files.size()); + ASSERT_TRUE(blob_files[0]->HasTTL()); + ASSERT_OK(bdb_impl->TEST_CloseBlobFile(blob_files[0])); + VerifyDB(data); +} + +TEST_F(BlobDBTest, StackableDBGet) { + Random rnd(301); + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; + bdb_options.disable_background_tasks = true; + Open(bdb_options); + std::map data; + for (size_t i = 0; i < 100; i++) { + PutRandom("key" + std::to_string(i), &rnd, &data); + } + for (size_t i = 0; i < 100; i++) { + StackableDB *db = blob_db_; + ColumnFamilyHandle *column_family = db->DefaultColumnFamily(); + std::string key = "key" + std::to_string(i); + PinnableSlice pinnable_value; + ASSERT_OK(db->Get(ReadOptions(), column_family, key, &pinnable_value)); + std::string string_value; + ASSERT_OK(db->Get(ReadOptions(), column_family, key, &string_value)); + ASSERT_EQ(string_value, pinnable_value.ToString()); + ASSERT_EQ(string_value, data[key]); + } +} + +TEST_F(BlobDBTest, GetExpiration) { + Options options; + options.env = mock_env_.get(); + BlobDBOptions bdb_options; + bdb_options.disable_background_tasks = true; + mock_clock_->SetCurrentTime(100); + Open(bdb_options, options); + ASSERT_OK(Put("key1", "value1")); + ASSERT_OK(PutWithTTL("key2", "value2", 200)); + PinnableSlice value; + uint64_t expiration; + ASSERT_OK(blob_db_->Get(ReadOptions(), "key1", &value, &expiration)); + ASSERT_EQ("value1", value.ToString()); + ASSERT_EQ(kNoExpiration, expiration); + ASSERT_OK(blob_db_->Get(ReadOptions(), "key2", &value, &expiration)); + ASSERT_EQ("value2", value.ToString()); + ASSERT_EQ(300 /* = 100 + 200 */, expiration); +} + +TEST_F(BlobDBTest, GetIOError) { + Options options; + options.env = fault_injection_env_.get(); + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; // Make sure value write to blob file + bdb_options.disable_background_tasks = true; + Open(bdb_options, options); + ColumnFamilyHandle *column_family = blob_db_->DefaultColumnFamily(); + PinnableSlice value; + ASSERT_OK(Put("foo", "bar")); + fault_injection_env_->SetFilesystemActive(false, Status::IOError()); + Status s = blob_db_->Get(ReadOptions(), column_family, "foo", &value); + ASSERT_TRUE(s.IsIOError()); + // Reactivate file system to allow test to close DB. 
+ fault_injection_env_->SetFilesystemActive(true); +} + +TEST_F(BlobDBTest, PutIOError) { + Options options; + options.env = fault_injection_env_.get(); + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; // Make sure value write to blob file + bdb_options.disable_background_tasks = true; + Open(bdb_options, options); + fault_injection_env_->SetFilesystemActive(false, Status::IOError()); + ASSERT_TRUE(Put("foo", "v1").IsIOError()); + fault_injection_env_->SetFilesystemActive(true, Status::IOError()); + ASSERT_OK(Put("bar", "v1")); +} + +TEST_F(BlobDBTest, WriteBatch) { + Random rnd(301); + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; + bdb_options.disable_background_tasks = true; + Open(bdb_options); + std::map data; + for (size_t i = 0; i < 100; i++) { + WriteBatch batch; + for (size_t j = 0; j < 10; j++) { + PutRandomToWriteBatch("key" + std::to_string(j * 100 + i), &rnd, &batch, + &data); + } + + ASSERT_OK(blob_db_->Write(WriteOptions(), &batch)); + } + VerifyDB(data); +} + +TEST_F(BlobDBTest, Delete) { + Random rnd(301); + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; + bdb_options.disable_background_tasks = true; + Open(bdb_options); + std::map data; + for (size_t i = 0; i < 100; i++) { + PutRandom("key" + std::to_string(i), &rnd, &data); + } + for (size_t i = 0; i < 100; i += 5) { + Delete("key" + std::to_string(i), &data); + } + VerifyDB(data); +} + +TEST_F(BlobDBTest, DeleteBatch) { + Random rnd(301); + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; + bdb_options.disable_background_tasks = true; + Open(bdb_options); + for (size_t i = 0; i < 100; i++) { + PutRandom("key" + std::to_string(i), &rnd); + } + WriteBatch batch; + for (size_t i = 0; i < 100; i++) { + ASSERT_OK(batch.Delete("key" + std::to_string(i))); + } + ASSERT_OK(blob_db_->Write(WriteOptions(), &batch)); + // DB should be empty. 
+ VerifyDB({}); +} + +TEST_F(BlobDBTest, Override) { + Random rnd(301); + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; + bdb_options.disable_background_tasks = true; + Open(bdb_options); + std::map data; + for (int i = 0; i < 10000; i++) { + PutRandom("key" + std::to_string(i), &rnd, nullptr); + } + // override all the keys + for (int i = 0; i < 10000; i++) { + PutRandom("key" + std::to_string(i), &rnd, &data); + } + VerifyDB(data); +} + +#ifdef SNAPPY +TEST_F(BlobDBTest, Compression) { + Random rnd(301); + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; + bdb_options.disable_background_tasks = true; + bdb_options.compression = CompressionType::kSnappyCompression; + Open(bdb_options); + std::map data; + for (size_t i = 0; i < 100; i++) { + PutRandom("put-key" + std::to_string(i), &rnd, &data); + } + for (int i = 0; i < 100; i++) { + WriteBatch batch; + for (size_t j = 0; j < 10; j++) { + PutRandomToWriteBatch("write-batch-key" + std::to_string(j * 100 + i), + &rnd, &batch, &data); + } + ASSERT_OK(blob_db_->Write(WriteOptions(), &batch)); + } + VerifyDB(data); +} + +TEST_F(BlobDBTest, DecompressAfterReopen) { + Random rnd(301); + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; + bdb_options.disable_background_tasks = true; + bdb_options.compression = CompressionType::kSnappyCompression; + Open(bdb_options); + std::map data; + for (size_t i = 0; i < 100; i++) { + PutRandom("put-key" + std::to_string(i), &rnd, &data); + } + VerifyDB(data); + bdb_options.compression = CompressionType::kNoCompression; + Reopen(bdb_options); + VerifyDB(data); +} + +TEST_F(BlobDBTest, EnableDisableCompressionGC) { + Random rnd(301); + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; + bdb_options.garbage_collection_cutoff = 1.0; + bdb_options.disable_background_tasks = true; + bdb_options.compression = kSnappyCompression; + Open(bdb_options); + std::map data; + size_t data_idx = 0; + for (; data_idx < 100; data_idx++) { + PutRandom("put-key" + std::to_string(data_idx), &rnd, &data); + } + VerifyDB(data); + auto blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(1, blob_files.size()); + ASSERT_EQ(kSnappyCompression, blob_files[0]->GetCompressionType()); + + // disable compression + bdb_options.compression = kNoCompression; + Reopen(bdb_options); + + // Add more data with new compression type + for (; data_idx < 200; data_idx++) { + PutRandom("put-key" + std::to_string(data_idx), &rnd, &data); + } + VerifyDB(data); + + blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(2, blob_files.size()); + ASSERT_EQ(kNoCompression, blob_files[1]->GetCompressionType()); + + // Enable GC. If we do it earlier the snapshot release triggered compaction + // may compact files and trigger GC before we can verify there are two files. 
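+ // (garbage_collection_cutoff was set to 1.0 above, so the intent is that
+ // every linked blob file becomes a GC candidate during that compaction.)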
+ bdb_options.enable_garbage_collection = true; + Reopen(bdb_options); + + // Trigger compaction + ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + blob_db_impl()->TEST_DeleteObsoleteFiles(); + VerifyDB(data); + + blob_files = blob_db_impl()->TEST_GetBlobFiles(); + for (auto bfile : blob_files) { + ASSERT_EQ(kNoCompression, bfile->GetCompressionType()); + } + + // enabling the compression again + bdb_options.compression = kSnappyCompression; + Reopen(bdb_options); + + // Add more data with new compression type + for (; data_idx < 300; data_idx++) { + PutRandom("put-key" + std::to_string(data_idx), &rnd, &data); + } + VerifyDB(data); + + // Trigger compaction + ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + blob_db_impl()->TEST_DeleteObsoleteFiles(); + VerifyDB(data); + + blob_files = blob_db_impl()->TEST_GetBlobFiles(); + for (auto bfile : blob_files) { + ASSERT_EQ(kSnappyCompression, bfile->GetCompressionType()); + } +} + +#ifdef LZ4 +// Test switch compression types and run GC, it needs both Snappy and LZ4 +// support. +TEST_F(BlobDBTest, ChangeCompressionGC) { + Random rnd(301); + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; + bdb_options.garbage_collection_cutoff = 1.0; + bdb_options.disable_background_tasks = true; + bdb_options.compression = kLZ4Compression; + Open(bdb_options); + std::map data; + size_t data_idx = 0; + for (; data_idx < 100; data_idx++) { + PutRandom("put-key" + std::to_string(data_idx), &rnd, &data); + } + VerifyDB(data); + auto blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(1, blob_files.size()); + ASSERT_EQ(kLZ4Compression, blob_files[0]->GetCompressionType()); + + // Change compression type + bdb_options.compression = kSnappyCompression; + Reopen(bdb_options); + + // Add more data with Snappy compression type + for (; data_idx < 200; data_idx++) { + PutRandom("put-key" + std::to_string(data_idx), &rnd, &data); + } + VerifyDB(data); + + // Verify blob file compression type + blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(2, blob_files.size()); + ASSERT_EQ(kSnappyCompression, blob_files[1]->GetCompressionType()); + + // Enable GC. If we do it earlier the snapshot release triggered compaction + // may compact files and trigger GC before we can verify there are two files. 
+ bdb_options.enable_garbage_collection = true; + Reopen(bdb_options); + + ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + VerifyDB(data); + + blob_db_impl()->TEST_DeleteObsoleteFiles(); + blob_files = blob_db_impl()->TEST_GetBlobFiles(); + for (auto bfile : blob_files) { + ASSERT_EQ(kSnappyCompression, bfile->GetCompressionType()); + } + + // Disable compression + bdb_options.compression = kNoCompression; + Reopen(bdb_options); + for (; data_idx < 300; data_idx++) { + PutRandom("put-key" + std::to_string(data_idx), &rnd, &data); + } + VerifyDB(data); + + ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + VerifyDB(data); + + blob_db_impl()->TEST_DeleteObsoleteFiles(); + blob_files = blob_db_impl()->TEST_GetBlobFiles(); + for (auto bfile : blob_files) { + ASSERT_EQ(kNoCompression, bfile->GetCompressionType()); + } + + // switching different compression types to generate mixed compression types + bdb_options.compression = kSnappyCompression; + Reopen(bdb_options); + for (; data_idx < 400; data_idx++) { + PutRandom("put-key" + std::to_string(data_idx), &rnd, &data); + } + VerifyDB(data); + + bdb_options.compression = kLZ4Compression; + Reopen(bdb_options); + for (; data_idx < 500; data_idx++) { + PutRandom("put-key" + std::to_string(data_idx), &rnd, &data); + } + VerifyDB(data); + + ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + VerifyDB(data); + + blob_db_impl()->TEST_DeleteObsoleteFiles(); + blob_files = blob_db_impl()->TEST_GetBlobFiles(); + for (auto bfile : blob_files) { + ASSERT_EQ(kLZ4Compression, bfile->GetCompressionType()); + } +} +#endif // LZ4 +#endif // SNAPPY + +TEST_F(BlobDBTest, MultipleWriters) { + Open(BlobDBOptions()); + + std::vector workers; + std::vector> data_set(10); + for (uint32_t i = 0; i < 10; i++) + workers.push_back(port::Thread( + [&](uint32_t id) { + Random rnd(301 + id); + for (int j = 0; j < 100; j++) { + std::string key = + "key" + std::to_string(id) + "_" + std::to_string(j); + if (id < 5) { + PutRandom(key, &rnd, &data_set[id]); + } else { + WriteBatch batch; + PutRandomToWriteBatch(key, &rnd, &batch, &data_set[id]); + ASSERT_OK(blob_db_->Write(WriteOptions(), &batch)); + } + } + }, + i)); + std::map data; + for (size_t i = 0; i < 10; i++) { + workers[i].join(); + data.insert(data_set[i].begin(), data_set[i].end()); + } + VerifyDB(data); +} + +TEST_F(BlobDBTest, SstFileManager) { + // run the same test for Get(), MultiGet() and Iterator each. + std::shared_ptr sst_file_manager( + NewSstFileManager(mock_env_.get())); + sst_file_manager->SetDeleteRateBytesPerSecond(1); + SstFileManagerImpl *sfm = + static_cast(sst_file_manager.get()); + + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; + bdb_options.enable_garbage_collection = true; + bdb_options.garbage_collection_cutoff = 1.0; + Options db_options; + + int files_scheduled_to_delete = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::ScheduleFileDeletion", [&](void *arg) { + assert(arg); + const std::string *const file_path = + static_cast(arg); + if (file_path->find(".blob") != std::string::npos) { + ++files_scheduled_to_delete; + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + db_options.sst_file_manager = sst_file_manager; + + Open(bdb_options, db_options); + + // Create one obselete file and clean it. 
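+ // A single value is written, its blob file is closed, and a manual full
+ // compaction with GC enabled leaves that file unreferenced so
+ // TEST_DeleteObsoleteFiles() can drop it; the SyncPoint callback above
+ // counts the ScheduleFileDeletion calls made for *.blob files.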
+ ASSERT_OK(blob_db_->Put(WriteOptions(), "foo", "bar")); + auto blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(1, blob_files.size()); + std::shared_ptr bfile = blob_files[0]; + ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(bfile)); + ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + blob_db_impl()->TEST_DeleteObsoleteFiles(); + + // Even if SSTFileManager is not set, DB is creating a dummy one. + ASSERT_EQ(1, files_scheduled_to_delete); + Destroy(); + // Make sure that DestroyBlobDB() also goes through delete scheduler. + ASSERT_EQ(2, files_scheduled_to_delete); + SyncPoint::GetInstance()->DisableProcessing(); + sfm->WaitForEmptyTrash(); +} + +TEST_F(BlobDBTest, SstFileManagerRestart) { + int files_scheduled_to_delete = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::ScheduleFileDeletion", [&](void *arg) { + assert(arg); + const std::string *const file_path = + static_cast(arg); + if (file_path->find(".blob") != std::string::npos) { + ++files_scheduled_to_delete; + } + }); + + // run the same test for Get(), MultiGet() and Iterator each. + std::shared_ptr sst_file_manager( + NewSstFileManager(mock_env_.get())); + sst_file_manager->SetDeleteRateBytesPerSecond(1); + SstFileManagerImpl *sfm = + static_cast(sst_file_manager.get()); + + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; + Options db_options; + + SyncPoint::GetInstance()->EnableProcessing(); + db_options.sst_file_manager = sst_file_manager; + + Open(bdb_options, db_options); + std::string blob_dir = blob_db_impl()->TEST_blob_dir(); + ASSERT_OK(blob_db_->Put(WriteOptions(), "foo", "bar")); + Close(); + + // Create 3 dummy trash files under the blob_dir + const auto &fs = db_options.env->GetFileSystem(); + ASSERT_OK(CreateFile(fs, blob_dir + "/000666.blob.trash", "", false)); + ASSERT_OK(CreateFile(fs, blob_dir + "/000888.blob.trash", "", true)); + ASSERT_OK(CreateFile(fs, blob_dir + "/something_not_match.trash", "", false)); + + // Make sure that reopening the DB rescan the existing trash files + Open(bdb_options, db_options); + ASSERT_EQ(files_scheduled_to_delete, 2); + + sfm->WaitForEmptyTrash(); + + // There should be exact one file under the blob dir now. 
+ std::vector all_files; + ASSERT_OK(db_options.env->GetChildren(blob_dir, &all_files)); + int nfiles = 0; + for (const auto &f : all_files) { + assert(!f.empty()); + if (f[0] == '.') { + continue; + } + nfiles++; + } + ASSERT_EQ(nfiles, 1); + + SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(BlobDBTest, SnapshotAndGarbageCollection) { + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; + bdb_options.enable_garbage_collection = true; + bdb_options.garbage_collection_cutoff = 1.0; + bdb_options.disable_background_tasks = true; + + Options options; + options.disable_auto_compactions = true; + + // i = when to take snapshot + for (int i = 0; i < 4; i++) { + Destroy(); + Open(bdb_options, options); + + const Snapshot *snapshot = nullptr; + + // First file + ASSERT_OK(Put("key1", "value")); + if (i == 0) { + snapshot = blob_db_->GetSnapshot(); + } + + auto blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(1, blob_files.size()); + ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[0])); + + // Second file + ASSERT_OK(Put("key2", "value")); + if (i == 1) { + snapshot = blob_db_->GetSnapshot(); + } + + blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(2, blob_files.size()); + auto bfile = blob_files[1]; + ASSERT_FALSE(bfile->Immutable()); + ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(bfile)); + + // Third file + ASSERT_OK(Put("key3", "value")); + if (i == 2) { + snapshot = blob_db_->GetSnapshot(); + } + + ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_TRUE(bfile->Obsolete()); + ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), + bfile->GetObsoleteSequence()); + + Delete("key2"); + if (i == 3) { + snapshot = blob_db_->GetSnapshot(); + } + + ASSERT_EQ(4, blob_db_impl()->TEST_GetBlobFiles().size()); + blob_db_impl()->TEST_DeleteObsoleteFiles(); + + if (i >= 2) { + // The snapshot shouldn't see data in bfile + ASSERT_EQ(2, blob_db_impl()->TEST_GetBlobFiles().size()); + blob_db_->ReleaseSnapshot(snapshot); + } else { + // The snapshot will see data in bfile, so the file shouldn't be deleted + ASSERT_EQ(4, blob_db_impl()->TEST_GetBlobFiles().size()); + blob_db_->ReleaseSnapshot(snapshot); + blob_db_impl()->TEST_DeleteObsoleteFiles(); + ASSERT_EQ(2, blob_db_impl()->TEST_GetBlobFiles().size()); + } + } +} + +TEST_F(BlobDBTest, ColumnFamilyNotSupported) { + Options options; + options.env = mock_env_.get(); + mock_clock_->SetCurrentTime(0); + Open(BlobDBOptions(), options); + ColumnFamilyHandle *default_handle = blob_db_->DefaultColumnFamily(); + ColumnFamilyHandle *handle = nullptr; + std::string value; + std::vector values; + // The call simply pass through to base db. It should succeed. 
+ ASSERT_OK( + blob_db_->CreateColumnFamily(ColumnFamilyOptions(), "foo", &handle)); + ASSERT_TRUE(blob_db_->Put(WriteOptions(), handle, "k", "v").IsNotSupported()); + ASSERT_TRUE(blob_db_->PutWithTTL(WriteOptions(), handle, "k", "v", 60) + .IsNotSupported()); + ASSERT_TRUE(blob_db_->PutUntil(WriteOptions(), handle, "k", "v", 100) + .IsNotSupported()); + WriteBatch batch; + ASSERT_OK(batch.Put("k1", "v1")); + ASSERT_OK(batch.Put(handle, "k2", "v2")); + ASSERT_TRUE(blob_db_->Write(WriteOptions(), &batch).IsNotSupported()); + ASSERT_TRUE(blob_db_->Get(ReadOptions(), "k1", &value).IsNotFound()); + ASSERT_TRUE( + blob_db_->Get(ReadOptions(), handle, "k", &value).IsNotSupported()); + auto statuses = blob_db_->MultiGet(ReadOptions(), {default_handle, handle}, + {"k1", "k2"}, &values); + ASSERT_EQ(2, statuses.size()); + ASSERT_TRUE(statuses[0].IsNotSupported()); + ASSERT_TRUE(statuses[1].IsNotSupported()); + ASSERT_EQ(nullptr, blob_db_->NewIterator(ReadOptions(), handle)); + delete handle; +} + +TEST_F(BlobDBTest, GetLiveFilesMetaData) { + Random rnd(301); + + BlobDBOptions bdb_options; + bdb_options.blob_dir = "blob_dir"; + bdb_options.path_relative = true; + bdb_options.ttl_range_secs = 10; + bdb_options.min_blob_size = 0; + bdb_options.disable_background_tasks = true; + + Options options; + options.env = mock_env_.get(); + + Open(bdb_options, options); + + std::map data; + for (size_t i = 0; i < 100; i++) { + PutRandom("key" + std::to_string(i), &rnd, &data); + } + + constexpr uint64_t expiration = 1000ULL; + PutRandomUntil("key100", expiration, &rnd, &data); + + std::vector metadata; + blob_db_->GetLiveFilesMetaData(&metadata); + + ASSERT_EQ(2U, metadata.size()); + // Path should be relative to db_name, but begin with slash. + const std::string filename1("/blob_dir/000001.blob"); + ASSERT_EQ(filename1, metadata[0].name); + ASSERT_EQ(1, metadata[0].file_number); + ASSERT_EQ(0, metadata[0].oldest_ancester_time); + ASSERT_EQ(kDefaultColumnFamilyName, metadata[0].column_family_name); + + const std::string filename2("/blob_dir/000002.blob"); + ASSERT_EQ(filename2, metadata[1].name); + ASSERT_EQ(2, metadata[1].file_number); + ASSERT_EQ(expiration, metadata[1].oldest_ancester_time); + ASSERT_EQ(kDefaultColumnFamilyName, metadata[1].column_family_name); + + std::vector livefile; + uint64_t mfs; + ASSERT_OK(blob_db_->GetLiveFiles(livefile, &mfs, false)); + ASSERT_EQ(5U, livefile.size()); + ASSERT_EQ(filename1, livefile[3]); + ASSERT_EQ(filename2, livefile[4]); + VerifyDB(data); +} + +TEST_F(BlobDBTest, MigrateFromPlainRocksDB) { + constexpr size_t kNumKey = 20; + constexpr size_t kNumIteration = 10; + Random rnd(301); + std::map data; + std::vector is_blob(kNumKey, false); + + // Write to plain rocksdb. + Options options; + options.create_if_missing = true; + DB *db = nullptr; + ASSERT_OK(DB::Open(options, dbname_, &db)); + for (size_t i = 0; i < kNumIteration; i++) { + auto key_index = rnd.Next() % kNumKey; + std::string key = "key" + std::to_string(key_index); + PutRandom(db, key, &rnd, &data); + } + VerifyDB(db, data); + delete db; + db = nullptr; + + // Open as blob db. Verify it can read existing data. + Open(); + VerifyDB(blob_db_, data); + for (size_t i = 0; i < kNumIteration; i++) { + auto key_index = rnd.Next() % kNumKey; + std::string key = "key" + std::to_string(key_index); + is_blob[key_index] = true; + PutRandom(blob_db_, key, &rnd, &data); + } + VerifyDB(blob_db_, data); + delete blob_db_; + blob_db_ = nullptr; + + // Verify plain db return error for keys written by blob db. 
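+  // Keys written through BlobDB are stored in the base DB as blob indexes
+  // (kTypeBlobIndex) pointing into blob files; a plain RocksDB instance does
+  // not understand that value type, so Get() on such a key is expected to
+  // surface Status::Corruption rather than a value.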
+  ASSERT_OK(DB::Open(options, dbname_, &db));
+  std::string value;
+  for (size_t i = 0; i < kNumKey; i++) {
+    std::string key = "key" + std::to_string(i);
+    Status s = db->Get(ReadOptions(), key, &value);
+    if (data.count(key) == 0) {
+      ASSERT_TRUE(s.IsNotFound());
+    } else if (is_blob[i]) {
+      ASSERT_TRUE(s.IsCorruption());
+    } else {
+      ASSERT_OK(s);
+      ASSERT_EQ(data[key], value);
+    }
+  }
+  delete db;
+}
+
+// Test to verify that a NoSpace IOError Status is returned on reaching
+// the max_db_size limit.
+TEST_F(BlobDBTest, OutOfSpace) {
+  // Use mock env to stop wall clock.
+  Options options;
+  options.env = mock_env_.get();
+  BlobDBOptions bdb_options;
+  bdb_options.max_db_size = 200;
+  bdb_options.is_fifo = false;
+  bdb_options.disable_background_tasks = true;
+  Open(bdb_options);
+
+  // Each stored blob has an overhead of about 42 bytes currently.
+  // So a small key + a 100 byte blob should take up ~150 bytes in the db.
+  std::string value(100, 'v');
+  ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), "key1", value, 60));
+
+  // Putting another blob should fail as adding it would exceed the
+  // max_db_size limit.
+  Status s = blob_db_->PutWithTTL(WriteOptions(), "key2", value, 60);
+  ASSERT_TRUE(s.IsIOError());
+  ASSERT_TRUE(s.IsNoSpace());
+}
+
+TEST_F(BlobDBTest, FIFOEviction) {
+  BlobDBOptions bdb_options;
+  bdb_options.max_db_size = 200;
+  bdb_options.blob_file_size = 100;
+  bdb_options.is_fifo = true;
+  bdb_options.disable_background_tasks = true;
+  Open(bdb_options);
+
+  std::atomic<int> evict_count{0};
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlobDBImpl::EvictOldestBlobFile:Evicted",
+      [&](void *) { evict_count++; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Each stored blob has an overhead of 32 bytes currently.
+  // So a 100 byte blob should take up 132 bytes.
+  std::string value(100, 'v');
+  ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), "key1", value, 10));
+  VerifyDB({{"key1", value}});
+
+  ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size());
+
+  // Adding another 100 byte blob would take the total size to 264 bytes
+  // (2*132), which exceeds max_db_size and triggers FIFO eviction.
+  ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), "key2", value, 60));
+  ASSERT_EQ(1, evict_count);
+  // key1 will exist until the corresponding file is deleted.
+  VerifyDB({{"key1", value}, {"key2", value}});
+
+  // Adding another 100 byte blob without TTL.
+  ASSERT_OK(blob_db_->Put(WriteOptions(), "key3", value));
+  ASSERT_EQ(2, evict_count);
+  // key1 and key2 will exist until the corresponding files are deleted.
+  VerifyDB({{"key1", value}, {"key2", value}, {"key3", value}});
+
+  // The fourth blob file, without TTL.
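+  // By the same arithmetic as above (~132 bytes per single-blob file against
+  // a 200 byte max_db_size), every additional put should evict the oldest
+  // remaining file, so after key4 the first three files are obsolete while
+  // their keys stay readable until the files are actually deleted.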
+ ASSERT_OK(blob_db_->Put(WriteOptions(), "key4", value)); + ASSERT_EQ(3, evict_count); + VerifyDB( + {{"key1", value}, {"key2", value}, {"key3", value}, {"key4", value}}); + + auto blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(4, blob_files.size()); + ASSERT_TRUE(blob_files[0]->Obsolete()); + ASSERT_TRUE(blob_files[1]->Obsolete()); + ASSERT_TRUE(blob_files[2]->Obsolete()); + ASSERT_FALSE(blob_files[3]->Obsolete()); + auto obsolete_files = blob_db_impl()->TEST_GetObsoleteFiles(); + ASSERT_EQ(3, obsolete_files.size()); + ASSERT_EQ(blob_files[0], obsolete_files[0]); + ASSERT_EQ(blob_files[1], obsolete_files[1]); + ASSERT_EQ(blob_files[2], obsolete_files[2]); + + blob_db_impl()->TEST_DeleteObsoleteFiles(); + obsolete_files = blob_db_impl()->TEST_GetObsoleteFiles(); + ASSERT_TRUE(obsolete_files.empty()); + VerifyDB({{"key4", value}}); +} + +TEST_F(BlobDBTest, FIFOEviction_NoOldestFileToEvict) { + Options options; + BlobDBOptions bdb_options; + bdb_options.max_db_size = 1000; + bdb_options.blob_file_size = 5000; + bdb_options.is_fifo = true; + bdb_options.disable_background_tasks = true; + Open(bdb_options); + + std::atomic evict_count{0}; + SyncPoint::GetInstance()->SetCallBack( + "BlobDBImpl::EvictOldestBlobFile:Evicted", + [&](void *) { evict_count++; }); + SyncPoint::GetInstance()->EnableProcessing(); + + std::string value(2000, 'v'); + ASSERT_TRUE(Put("foo", std::string(2000, 'v')).IsNoSpace()); + ASSERT_EQ(0, evict_count); +} + +TEST_F(BlobDBTest, FIFOEviction_NoEnoughBlobFilesToEvict) { + BlobDBOptions bdb_options; + bdb_options.is_fifo = true; + bdb_options.min_blob_size = 100; + bdb_options.disable_background_tasks = true; + Options options; + // Use mock env to stop wall clock. + options.env = mock_env_.get(); + options.disable_auto_compactions = true; + auto statistics = CreateDBStatistics(); + options.statistics = statistics; + Open(bdb_options, options); + + ASSERT_EQ(0, blob_db_impl()->TEST_live_sst_size()); + std::string small_value(50, 'v'); + std::map data; + // Insert some data into LSM tree to make sure FIFO eviction take SST + // file size into account. + for (int i = 0; i < 1000; i++) { + ASSERT_OK(Put("key" + std::to_string(i), small_value, &data)); + } + ASSERT_OK(blob_db_->Flush(FlushOptions())); + uint64_t live_sst_size = 0; + ASSERT_TRUE(blob_db_->GetIntProperty(DB::Properties::kTotalSstFilesSize, + &live_sst_size)); + ASSERT_TRUE(live_sst_size > 0); + ASSERT_EQ(live_sst_size, blob_db_impl()->TEST_live_sst_size()); + + bdb_options.max_db_size = live_sst_size + 2000; + Reopen(bdb_options, options); + ASSERT_EQ(live_sst_size, blob_db_impl()->TEST_live_sst_size()); + + std::string value_1k(1000, 'v'); + ASSERT_OK(PutWithTTL("large_key1", value_1k, 60, &data)); + ASSERT_EQ(0, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED)); + VerifyDB(data); + // large_key2 evicts large_key1 + ASSERT_OK(PutWithTTL("large_key2", value_1k, 60, &data)); + ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED)); + blob_db_impl()->TEST_DeleteObsoleteFiles(); + data.erase("large_key1"); + VerifyDB(data); + // large_key3 get no enough space even after evicting large_key2, so it + // instead return no space error. + std::string value_2k(2000, 'v'); + ASSERT_TRUE(PutWithTTL("large_key3", value_2k, 60).IsNoSpace()); + ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED)); + // Verify large_key2 still exists. + VerifyDB(data); +} + +// Test flush or compaction will trigger FIFO eviction since they update +// total SST file size. 
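+// FIFO eviction budgets blob files against max_db_size net of the live SST
+// size, so a flush or compaction that grows the SSTs can by itself push the
+// DB over the limit and evict the oldest blob file, as exercised below.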
+TEST_F(BlobDBTest, FIFOEviction_TriggerOnSSTSizeChange) { + BlobDBOptions bdb_options; + bdb_options.max_db_size = 1000; + bdb_options.is_fifo = true; + bdb_options.min_blob_size = 100; + bdb_options.disable_background_tasks = true; + Options options; + // Use mock env to stop wall clock. + options.env = mock_env_.get(); + auto statistics = CreateDBStatistics(); + options.statistics = statistics; + options.compression = kNoCompression; + Open(bdb_options, options); + + std::string value(800, 'v'); + ASSERT_OK(PutWithTTL("large_key", value, 60)); + ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size()); + ASSERT_EQ(0, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED)); + VerifyDB({{"large_key", value}}); + + // Insert some small keys and flush to bring DB out of space. + std::map data; + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put("key" + std::to_string(i), "v", &data)); + } + ASSERT_OK(blob_db_->Flush(FlushOptions())); + + // Verify large_key is deleted by FIFO eviction. + blob_db_impl()->TEST_DeleteObsoleteFiles(); + ASSERT_EQ(0, blob_db_impl()->TEST_GetBlobFiles().size()); + ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED)); + VerifyDB(data); +} + +TEST_F(BlobDBTest, InlineSmallValues) { + constexpr uint64_t kMaxExpiration = 1000; + Random rnd(301); + BlobDBOptions bdb_options; + bdb_options.ttl_range_secs = kMaxExpiration; + bdb_options.min_blob_size = 100; + bdb_options.blob_file_size = 256 * 1000 * 1000; + bdb_options.disable_background_tasks = true; + Options options; + options.env = mock_env_.get(); + mock_clock_->SetCurrentTime(0); + Open(bdb_options, options); + std::map data; + std::map versions; + for (size_t i = 0; i < 1000; i++) { + bool is_small_value = rnd.Next() % 2; + bool has_ttl = rnd.Next() % 2; + uint64_t expiration = rnd.Next() % kMaxExpiration; + int len = is_small_value ? 50 : 200; + std::string key = "key" + std::to_string(i); + std::string value = rnd.HumanReadableString(len); + std::string blob_index; + data[key] = value; + SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1; + if (!has_ttl) { + ASSERT_OK(blob_db_->Put(WriteOptions(), key, value)); + } else { + ASSERT_OK(blob_db_->PutUntil(WriteOptions(), key, value, expiration)); + } + ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence); + versions[key] = + KeyVersion(key, value, sequence, + (is_small_value && !has_ttl) ? 
kTypeValue : kTypeBlobIndex); + } + VerifyDB(data); + VerifyBaseDB(versions); + auto *bdb_impl = static_cast(blob_db_); + auto blob_files = bdb_impl->TEST_GetBlobFiles(); + ASSERT_EQ(2, blob_files.size()); + std::shared_ptr non_ttl_file; + std::shared_ptr ttl_file; + if (blob_files[0]->HasTTL()) { + ttl_file = blob_files[0]; + non_ttl_file = blob_files[1]; + } else { + non_ttl_file = blob_files[0]; + ttl_file = blob_files[1]; + } + ASSERT_FALSE(non_ttl_file->HasTTL()); + ASSERT_TRUE(ttl_file->HasTTL()); +} + +TEST_F(BlobDBTest, UserCompactionFilter) { + class CustomerFilter : public CompactionFilter { + public: + bool Filter(int /*level*/, const Slice & /*key*/, const Slice &value, + std::string *new_value, bool *value_changed) const override { + *value_changed = false; + // changing value size to test value transitions between inlined data + // and stored-in-blob data + if (value.size() % 4 == 1) { + *new_value = value.ToString(); + // double size by duplicating value + *new_value += *new_value; + *value_changed = true; + return false; + } else if (value.size() % 3 == 1) { + *new_value = value.ToString(); + // trancate value size by half + *new_value = new_value->substr(0, new_value->size() / 2); + *value_changed = true; + return false; + } else if (value.size() % 2 == 1) { + return true; + } + return false; + } + bool IgnoreSnapshots() const override { return true; } + const char *Name() const override { return "CustomerFilter"; } + }; + class CustomerFilterFactory : public CompactionFilterFactory { + const char *Name() const override { return "CustomerFilterFactory"; } + std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context & /*context*/) override { + return std::unique_ptr(new CustomerFilter()); + } + }; + + constexpr size_t kNumPuts = 1 << 10; + // Generate both inlined and blob value + constexpr uint64_t kMinValueSize = 1 << 6; + constexpr uint64_t kMaxValueSize = 1 << 8; + constexpr uint64_t kMinBlobSize = 1 << 7; + static_assert(kMinValueSize < kMinBlobSize, ""); + static_assert(kMaxValueSize > kMinBlobSize, ""); + + BlobDBOptions bdb_options; + bdb_options.min_blob_size = kMinBlobSize; + bdb_options.blob_file_size = kMaxValueSize * 10; + bdb_options.disable_background_tasks = true; + if (Snappy_Supported()) { + bdb_options.compression = CompressionType::kSnappyCompression; + } + // case_num == 0: Test user defined compaction filter + // case_num == 1: Test user defined compaction filter factory + for (int case_num = 0; case_num < 2; case_num++) { + Options options; + if (case_num == 0) { + options.compaction_filter = new CustomerFilter(); + } else { + options.compaction_filter_factory.reset(new CustomerFilterFactory()); + } + options.disable_auto_compactions = true; + options.env = mock_env_.get(); + options.statistics = CreateDBStatistics(); + Open(bdb_options, options); + + std::map data; + std::map data_after_compact; + Random rnd(301); + uint64_t value_size = kMinValueSize; + int drop_record = 0; + for (size_t i = 0; i < kNumPuts; ++i) { + std::ostringstream oss; + oss << "key" << std::setw(4) << std::setfill('0') << i; + + const std::string key(oss.str()); + const std::string value = rnd.HumanReadableString((int)value_size); + const SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1; + + ASSERT_OK(Put(key, value)); + ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence); + + data[key] = value; + if (value.length() % 4 == 1) { + data_after_compact[key] = value + value; + } else if (value.length() % 3 == 1) { + data_after_compact[key] = 
value.substr(0, value.size() / 2); + } else if (value.length() % 2 == 1) { + ++drop_record; + } else { + data_after_compact[key] = value; + } + + if (++value_size > kMaxValueSize) { + value_size = kMinValueSize; + } + } + // Verify full data set + VerifyDB(data); + // Applying compaction filter for records + ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + // Verify data after compaction, only value with even length left. + VerifyDB(data_after_compact); + ASSERT_EQ(drop_record, + options.statistics->getTickerCount(COMPACTION_KEY_DROP_USER)); + delete options.compaction_filter; + Destroy(); + } +} + +// Test user comapction filter when there is IO error on blob data. +TEST_F(BlobDBTest, UserCompactionFilter_BlobIOError) { + class CustomerFilter : public CompactionFilter { + public: + bool Filter(int /*level*/, const Slice & /*key*/, const Slice &value, + std::string *new_value, bool *value_changed) const override { + *new_value = value.ToString() + "_new"; + *value_changed = true; + return false; + } + bool IgnoreSnapshots() const override { return true; } + const char *Name() const override { return "CustomerFilter"; } + }; + + constexpr size_t kNumPuts = 100; + constexpr int kValueSize = 100; + + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; + bdb_options.blob_file_size = kValueSize * 10; + bdb_options.disable_background_tasks = true; + bdb_options.compression = CompressionType::kNoCompression; + + std::vector io_failure_cases = { + "BlobDBImpl::CreateBlobFileAndWriter", + "BlobIndexCompactionFilterBase::WriteBlobToNewFile", + "BlobDBImpl::CloseBlobFile"}; + + for (size_t case_num = 0; case_num < io_failure_cases.size(); case_num++) { + Options options; + options.compaction_filter = new CustomerFilter(); + options.disable_auto_compactions = true; + options.env = fault_injection_env_.get(); + options.statistics = CreateDBStatistics(); + Open(bdb_options, options); + + std::map data; + Random rnd(301); + for (size_t i = 0; i < kNumPuts; ++i) { + std::ostringstream oss; + oss << "key" << std::setw(4) << std::setfill('0') << i; + + const std::string key(oss.str()); + const std::string value = rnd.HumanReadableString(kValueSize); + const SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1; + + ASSERT_OK(Put(key, value)); + ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence); + data[key] = value; + } + + // Verify full data set + VerifyDB(data); + + SyncPoint::GetInstance()->SetCallBack( + io_failure_cases[case_num], [&](void * /*arg*/) { + fault_injection_env_->SetFilesystemActive(false, Status::IOError()); + }); + SyncPoint::GetInstance()->EnableProcessing(); + auto s = blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_TRUE(s.IsIOError()); + + // Reactivate file system to allow test to verify and close DB. + fault_injection_env_->SetFilesystemActive(true); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Verify full data set after compaction failure + VerifyDB(data); + + delete options.compaction_filter; + Destroy(); + } +} + +// Test comapction filter should remove any expired blob index. 
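+// The expiration is encoded in the blob index itself (see
+// BlobIndex::EncodeInlinedTTL used below), so the compaction filter can
+// compare it against the current time and drop expired entries on compaction.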
+TEST_F(BlobDBTest, FilterExpiredBlobIndex) { + constexpr size_t kNumKeys = 100; + constexpr size_t kNumPuts = 1000; + constexpr uint64_t kMaxExpiration = 1000; + constexpr uint64_t kCompactTime = 500; + constexpr uint64_t kMinBlobSize = 100; + Random rnd(301); + mock_clock_->SetCurrentTime(0); + BlobDBOptions bdb_options; + bdb_options.min_blob_size = kMinBlobSize; + bdb_options.disable_background_tasks = true; + Options options; + options.env = mock_env_.get(); + Open(bdb_options, options); + + std::map data; + std::map data_after_compact; + for (size_t i = 0; i < kNumPuts; i++) { + bool is_small_value = rnd.Next() % 2; + bool has_ttl = rnd.Next() % 2; + uint64_t expiration = rnd.Next() % kMaxExpiration; + int len = is_small_value ? 10 : 200; + std::string key = "key" + std::to_string(rnd.Next() % kNumKeys); + std::string value = rnd.HumanReadableString(len); + if (!has_ttl) { + if (is_small_value) { + std::string blob_entry; + BlobIndex::EncodeInlinedTTL(&blob_entry, expiration, value); + // Fake blob index with TTL. See what it will do. + ASSERT_GT(kMinBlobSize, blob_entry.size()); + value = blob_entry; + } + ASSERT_OK(Put(key, value)); + data_after_compact[key] = value; + } else { + ASSERT_OK(PutUntil(key, value, expiration)); + if (expiration <= kCompactTime) { + data_after_compact.erase(key); + } else { + data_after_compact[key] = value; + } + } + data[key] = value; + } + VerifyDB(data); + + mock_clock_->SetCurrentTime(kCompactTime); + // Take a snapshot before compaction. Make sure expired blob indexes is + // filtered regardless of snapshot. + const Snapshot *snapshot = blob_db_->GetSnapshot(); + // Issue manual compaction to trigger compaction filter. + ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + blob_db_->ReleaseSnapshot(snapshot); + // Verify expired blob index are filtered. + std::vector versions; + const size_t kMaxKeys = 10000; + ASSERT_OK(GetAllKeyVersions(blob_db_, "", "", kMaxKeys, &versions)); + ASSERT_EQ(data_after_compact.size(), versions.size()); + for (auto &version : versions) { + ASSERT_TRUE(data_after_compact.count(version.user_key) > 0); + } + VerifyDB(data_after_compact); +} + +// Test compaction filter should remove any blob index where corresponding +// blob file has been removed. 
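+// In other words, the filter resolves the blob file number stored in each
+// index against the set of live blob files and drops the key once that file
+// is gone; the test simulates this by obsoleting and deleting the files by
+// hand before compacting.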
+TEST_F(BlobDBTest, FilterFileNotAvailable) { + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; + bdb_options.disable_background_tasks = true; + Options options; + options.disable_auto_compactions = true; + Open(bdb_options, options); + + ASSERT_OK(Put("foo", "v1")); + auto blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(1, blob_files.size()); + ASSERT_EQ(1, blob_files[0]->BlobFileNumber()); + ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[0])); + + ASSERT_OK(Put("bar", "v2")); + blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(2, blob_files.size()); + ASSERT_EQ(2, blob_files[1]->BlobFileNumber()); + ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[1])); + + const size_t kMaxKeys = 10000; + + DB *base_db = blob_db_->GetRootDB(); + std::vector versions; + ASSERT_OK(GetAllKeyVersions(base_db, "", "", kMaxKeys, &versions)); + ASSERT_EQ(2, versions.size()); + ASSERT_EQ("bar", versions[0].user_key); + ASSERT_EQ("foo", versions[1].user_key); + VerifyDB({{"bar", "v2"}, {"foo", "v1"}}); + + ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(GetAllKeyVersions(base_db, "", "", kMaxKeys, &versions)); + ASSERT_EQ(2, versions.size()); + ASSERT_EQ("bar", versions[0].user_key); + ASSERT_EQ("foo", versions[1].user_key); + VerifyDB({{"bar", "v2"}, {"foo", "v1"}}); + + // Remove the first blob file and compact. foo should be remove from base db. + blob_db_impl()->TEST_ObsoleteBlobFile(blob_files[0]); + blob_db_impl()->TEST_DeleteObsoleteFiles(); + ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(GetAllKeyVersions(base_db, "", "", kMaxKeys, &versions)); + ASSERT_EQ(1, versions.size()); + ASSERT_EQ("bar", versions[0].user_key); + VerifyDB({{"bar", "v2"}}); + + // Remove the second blob file and compact. bar should be remove from base db. + blob_db_impl()->TEST_ObsoleteBlobFile(blob_files[1]); + blob_db_impl()->TEST_DeleteObsoleteFiles(); + ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(GetAllKeyVersions(base_db, "", "", kMaxKeys, &versions)); + ASSERT_EQ(0, versions.size()); + VerifyDB({}); +} + +// Test compaction filter should filter any inlined TTL keys that would have +// been dropped by last FIFO eviction if they are store out-of-line. +TEST_F(BlobDBTest, FilterForFIFOEviction) { + Random rnd(215); + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 100; + bdb_options.ttl_range_secs = 60; + bdb_options.max_db_size = 0; + bdb_options.disable_background_tasks = true; + Options options; + // Use mock env to stop wall clock. + mock_clock_->SetCurrentTime(0); + options.env = mock_env_.get(); + auto statistics = CreateDBStatistics(); + options.statistics = statistics; + options.disable_auto_compactions = true; + Open(bdb_options, options); + + std::map data; + std::map data_after_compact; + // Insert some small values that will be inlined. 
+ for (int i = 0; i < 1000; i++) { + std::string key = "key" + std::to_string(i); + std::string value = rnd.HumanReadableString(50); + uint64_t ttl = rnd.Next() % 120 + 1; + ASSERT_OK(PutWithTTL(key, value, ttl, &data)); + if (ttl >= 60) { + data_after_compact[key] = value; + } + } + uint64_t num_keys_to_evict = data.size() - data_after_compact.size(); + ASSERT_OK(blob_db_->Flush(FlushOptions())); + uint64_t live_sst_size = blob_db_impl()->TEST_live_sst_size(); + ASSERT_GT(live_sst_size, 0); + VerifyDB(data); + + bdb_options.max_db_size = live_sst_size + 30000; + bdb_options.is_fifo = true; + Reopen(bdb_options, options); + VerifyDB(data); + + // Put two large values, each on a different blob file. + std::string large_value(10000, 'v'); + ASSERT_OK(PutWithTTL("large_key1", large_value, 90)); + ASSERT_OK(PutWithTTL("large_key2", large_value, 150)); + ASSERT_EQ(2, blob_db_impl()->TEST_GetBlobFiles().size()); + ASSERT_EQ(0, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED)); + data["large_key1"] = large_value; + data["large_key2"] = large_value; + VerifyDB(data); + + // Put a third large value which will bring the DB out of space. + // FIFO eviction will evict the file of large_key1. + ASSERT_OK(PutWithTTL("large_key3", large_value, 150)); + ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED)); + ASSERT_EQ(2, blob_db_impl()->TEST_GetBlobFiles().size()); + blob_db_impl()->TEST_DeleteObsoleteFiles(); + ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size()); + data.erase("large_key1"); + data["large_key3"] = large_value; + VerifyDB(data); + + // Putting some more small values. These values shouldn't be evicted by + // compaction filter since they are inserted after FIFO eviction. + ASSERT_OK(PutWithTTL("foo", "v", 30, &data_after_compact)); + ASSERT_OK(PutWithTTL("bar", "v", 30, &data_after_compact)); + + // FIFO eviction doesn't trigger again since there enough room for the flush. + ASSERT_OK(blob_db_->Flush(FlushOptions())); + ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED)); + + // Manual compact and check if compaction filter evict those keys with + // expiration < 60. + ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + // All keys with expiration < 60, plus large_key1 is filtered by + // compaction filter. 
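+  // num_keys_to_evict counts the inlined keys written with ttl < 60; the
+  // extra +1 is large_key1, whose blob file was FIFO-evicted earlier, so its
+  // dangling blob index is dropped by the compaction filter as well.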
+ ASSERT_EQ(num_keys_to_evict + 1, + statistics->getTickerCount(BLOB_DB_BLOB_INDEX_EVICTED_COUNT)); + ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED)); + ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size()); + data_after_compact["large_key2"] = large_value; + data_after_compact["large_key3"] = large_value; + VerifyDB(data_after_compact); +} + +TEST_F(BlobDBTest, GarbageCollection) { + constexpr size_t kNumPuts = 1 << 10; + + constexpr uint64_t kExpiration = 1000; + constexpr uint64_t kCompactTime = 500; + + constexpr uint64_t kKeySize = 7; // "key" + 4 digits + + constexpr uint64_t kSmallValueSize = 1 << 6; + constexpr uint64_t kLargeValueSize = 1 << 8; + constexpr uint64_t kMinBlobSize = 1 << 7; + static_assert(kSmallValueSize < kMinBlobSize, ""); + static_assert(kLargeValueSize > kMinBlobSize, ""); + + constexpr size_t kBlobsPerFile = 8; + constexpr size_t kNumBlobFiles = kNumPuts / kBlobsPerFile; + constexpr uint64_t kBlobFileSize = + BlobLogHeader::kSize + + (BlobLogRecord::kHeaderSize + kKeySize + kLargeValueSize) * kBlobsPerFile; + + BlobDBOptions bdb_options; + bdb_options.min_blob_size = kMinBlobSize; + bdb_options.blob_file_size = kBlobFileSize; + bdb_options.enable_garbage_collection = true; + bdb_options.garbage_collection_cutoff = 0.25; + bdb_options.disable_background_tasks = true; + + Options options; + options.env = mock_env_.get(); + options.statistics = CreateDBStatistics(); + + Open(bdb_options, options); + + std::map data; + std::map blob_value_versions; + std::map blob_index_versions; + + Random rnd(301); + + // Add a bunch of large non-TTL values. These will be written to non-TTL + // blob files and will be subject to GC. + for (size_t i = 0; i < kNumPuts; ++i) { + std::ostringstream oss; + oss << "key" << std::setw(4) << std::setfill('0') << i; + + const std::string key(oss.str()); + const std::string value = rnd.HumanReadableString(kLargeValueSize); + const SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1; + + ASSERT_OK(Put(key, value)); + ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence); + + data[key] = value; + blob_value_versions[key] = KeyVersion(key, value, sequence, kTypeBlobIndex); + blob_index_versions[key] = + BlobIndexVersion(key, /* file_number */ (i >> 3) + 1, kNoExpiration, + sequence, kTypeBlobIndex); + } + + // Add some small and/or TTL values that will be ignored during GC. + // First, add a large TTL value will be written to its own TTL blob file. + { + const std::string key("key2000"); + const std::string value = rnd.HumanReadableString(kLargeValueSize); + const SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1; + + ASSERT_OK(PutUntil(key, value, kExpiration)); + ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence); + + data[key] = value; + blob_value_versions[key] = KeyVersion(key, value, sequence, kTypeBlobIndex); + blob_index_versions[key] = + BlobIndexVersion(key, /* file_number */ kNumBlobFiles + 1, kExpiration, + sequence, kTypeBlobIndex); + } + + // Now add a small TTL value (which will be inlined). 
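+  // Since kSmallValueSize < min_blob_size, this value stays inlined in the
+  // blob index (kTypeBlobIndex with no backing blob file), unlike the small
+  // non-TTL value added next, which is written as a plain kTypeValue.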
+ { + const std::string key("key3000"); + const std::string value = rnd.HumanReadableString(kSmallValueSize); + const SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1; + + ASSERT_OK(PutUntil(key, value, kExpiration)); + ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence); + + data[key] = value; + blob_value_versions[key] = KeyVersion(key, value, sequence, kTypeBlobIndex); + blob_index_versions[key] = BlobIndexVersion( + key, kInvalidBlobFileNumber, kExpiration, sequence, kTypeBlobIndex); + } + + // Finally, add a small non-TTL value (which will be stored as a regular + // value). + { + const std::string key("key4000"); + const std::string value = rnd.HumanReadableString(kSmallValueSize); + const SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1; + + ASSERT_OK(Put(key, value)); + ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence); + + data[key] = value; + blob_value_versions[key] = KeyVersion(key, value, sequence, kTypeValue); + blob_index_versions[key] = BlobIndexVersion( + key, kInvalidBlobFileNumber, kNoExpiration, sequence, kTypeValue); + } + + VerifyDB(data); + VerifyBaseDB(blob_value_versions); + VerifyBaseDBBlobIndex(blob_index_versions); + + // At this point, we should have 128 immutable non-TTL files with file numbers + // 1..128. + { + auto live_imm_files = blob_db_impl()->TEST_GetLiveImmNonTTLFiles(); + ASSERT_EQ(live_imm_files.size(), kNumBlobFiles); + for (size_t i = 0; i < kNumBlobFiles; ++i) { + ASSERT_EQ(live_imm_files[i]->BlobFileNumber(), i + 1); + ASSERT_EQ(live_imm_files[i]->GetFileSize(), + kBlobFileSize + BlobLogFooter::kSize); + } + } + + mock_clock_->SetCurrentTime(kCompactTime); + + ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // We expect the data to remain the same and the blobs from the oldest N files + // to be moved to new files. Sequence numbers get zeroed out during the + // compaction. + VerifyDB(data); + + for (auto &pair : blob_value_versions) { + KeyVersion &version = pair.second; + version.sequence = 0; + } + + VerifyBaseDB(blob_value_versions); + + const uint64_t cutoff = static_cast( + bdb_options.garbage_collection_cutoff * kNumBlobFiles); + for (auto &pair : blob_index_versions) { + BlobIndexVersion &version = pair.second; + + version.sequence = 0; + + if (version.file_number == kInvalidBlobFileNumber) { + continue; + } + + if (version.file_number > cutoff) { + continue; + } + + version.file_number += kNumBlobFiles + 1; + } + + VerifyBaseDBBlobIndex(blob_index_versions); + + const Statistics *const statistics = options.statistics.get(); + assert(statistics); + + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_NUM_FILES), cutoff); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_NUM_NEW_FILES), cutoff); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_FAILURES), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_NUM_KEYS_RELOCATED), + cutoff * kBlobsPerFile); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_BYTES_RELOCATED), + cutoff * kBlobsPerFile * kLargeValueSize); + + // At this point, we should have 128 immutable non-TTL files with file numbers + // 33..128 and 130..161. (129 was taken by the TTL blob file.) 
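+  // Spelled out: cutoff = 0.25 * 128 = 32 files were garbage collected, so
+  // the original files 1..32 are gone and their blobs now live in the 32
+  // replacement files numbered 130..161.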
+ { + auto live_imm_files = blob_db_impl()->TEST_GetLiveImmNonTTLFiles(); + ASSERT_EQ(live_imm_files.size(), kNumBlobFiles); + for (size_t i = 0; i < kNumBlobFiles; ++i) { + uint64_t expected_file_number = i + cutoff + 1; + if (expected_file_number > kNumBlobFiles) { + ++expected_file_number; + } + + ASSERT_EQ(live_imm_files[i]->BlobFileNumber(), expected_file_number); + ASSERT_EQ(live_imm_files[i]->GetFileSize(), + kBlobFileSize + BlobLogFooter::kSize); + } + } +} + +TEST_F(BlobDBTest, GarbageCollectionFailure) { + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; + bdb_options.enable_garbage_collection = true; + bdb_options.garbage_collection_cutoff = 1.0; + bdb_options.disable_background_tasks = true; + + Options db_options; + db_options.statistics = CreateDBStatistics(); + + Open(bdb_options, db_options); + + // Write a couple of valid blobs. + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("dead", "beef")); + + // Write a fake blob reference into the base DB that points to a non-existing + // blob file. + std::string blob_index; + BlobIndex::EncodeBlob(&blob_index, /* file_number */ 1000, /* offset */ 1234, + /* size */ 5678, kNoCompression); + + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex( + &batch, blob_db_->DefaultColumnFamily()->GetID(), "key", blob_index)); + ASSERT_OK(blob_db_->GetRootDB()->Write(WriteOptions(), &batch)); + + auto blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(blob_files.size(), 1); + auto blob_file = blob_files[0]; + ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_file)); + + ASSERT_TRUE(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr) + .IsIOError()); + + const Statistics *const statistics = db_options.statistics.get(); + assert(statistics); + + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_NUM_FILES), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_NUM_NEW_FILES), 1); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_FAILURES), 1); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_NUM_KEYS_RELOCATED), 2); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_BYTES_RELOCATED), 7); +} + +// File should be evicted after expiration. +TEST_F(BlobDBTest, EvictExpiredFile) { + BlobDBOptions bdb_options; + bdb_options.ttl_range_secs = 100; + bdb_options.min_blob_size = 0; + bdb_options.disable_background_tasks = true; + Options options; + options.env = mock_env_.get(); + Open(bdb_options, options); + mock_clock_->SetCurrentTime(50); + std::map data; + ASSERT_OK(PutWithTTL("foo", "bar", 100, &data)); + auto blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(1, blob_files.size()); + auto blob_file = blob_files[0]; + ASSERT_FALSE(blob_file->Immutable()); + ASSERT_FALSE(blob_file->Obsolete()); + VerifyDB(data); + mock_clock_->SetCurrentTime(250); + // The key should expired now. + blob_db_impl()->TEST_EvictExpiredFiles(); + ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size()); + ASSERT_EQ(1, blob_db_impl()->TEST_GetObsoleteFiles().size()); + ASSERT_TRUE(blob_file->Immutable()); + ASSERT_TRUE(blob_file->Obsolete()); + blob_db_impl()->TEST_DeleteObsoleteFiles(); + ASSERT_EQ(0, blob_db_impl()->TEST_GetBlobFiles().size()); + ASSERT_EQ(0, blob_db_impl()->TEST_GetObsoleteFiles().size()); + // Make sure we don't return garbage value after blob file being evicted, + // but the blob index still exists in the LSM tree. 
+ std::string val = ""; + ASSERT_TRUE(blob_db_->Get(ReadOptions(), "foo", &val).IsNotFound()); + ASSERT_EQ("", val); +} + +TEST_F(BlobDBTest, DisableFileDeletions) { + BlobDBOptions bdb_options; + bdb_options.disable_background_tasks = true; + Open(bdb_options); + std::map data; + for (bool force : {true, false}) { + ASSERT_OK(Put("foo", "v", &data)); + auto blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(1, blob_files.size()); + auto blob_file = blob_files[0]; + ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_file)); + blob_db_impl()->TEST_ObsoleteBlobFile(blob_file); + ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size()); + ASSERT_EQ(1, blob_db_impl()->TEST_GetObsoleteFiles().size()); + // Call DisableFileDeletions twice. + ASSERT_OK(blob_db_->DisableFileDeletions()); + ASSERT_OK(blob_db_->DisableFileDeletions()); + // File deletions should be disabled. + blob_db_impl()->TEST_DeleteObsoleteFiles(); + ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size()); + ASSERT_EQ(1, blob_db_impl()->TEST_GetObsoleteFiles().size()); + VerifyDB(data); + // Enable file deletions once. If force=true, file deletion is enabled. + // Otherwise it needs to enable it for a second time. + ASSERT_OK(blob_db_->EnableFileDeletions(force)); + blob_db_impl()->TEST_DeleteObsoleteFiles(); + if (!force) { + ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size()); + ASSERT_EQ(1, blob_db_impl()->TEST_GetObsoleteFiles().size()); + VerifyDB(data); + // Call EnableFileDeletions a second time. + ASSERT_OK(blob_db_->EnableFileDeletions(false)); + blob_db_impl()->TEST_DeleteObsoleteFiles(); + } + // Regardless of value of `force`, file should be deleted by now. + ASSERT_EQ(0, blob_db_impl()->TEST_GetBlobFiles().size()); + ASSERT_EQ(0, blob_db_impl()->TEST_GetObsoleteFiles().size()); + VerifyDB({}); + } +} + +TEST_F(BlobDBTest, MaintainBlobFileToSstMapping) { + BlobDBOptions bdb_options; + bdb_options.enable_garbage_collection = true; + bdb_options.disable_background_tasks = true; + Open(bdb_options); + + // Register some dummy blob files. + blob_db_impl()->TEST_AddDummyBlobFile(1, /* immutable_sequence */ 200); + blob_db_impl()->TEST_AddDummyBlobFile(2, /* immutable_sequence */ 300); + blob_db_impl()->TEST_AddDummyBlobFile(3, /* immutable_sequence */ 400); + blob_db_impl()->TEST_AddDummyBlobFile(4, /* immutable_sequence */ 500); + blob_db_impl()->TEST_AddDummyBlobFile(5, /* immutable_sequence */ 600); + + // Initialize the blob <-> SST file mapping. First, add some SST files with + // blob file references, then some without. + std::vector live_files; + + for (uint64_t i = 1; i <= 10; ++i) { + LiveFileMetaData live_file; + live_file.file_number = i; + live_file.oldest_blob_file_number = ((i - 1) % 5) + 1; + + live_files.emplace_back(live_file); + } + + for (uint64_t i = 11; i <= 20; ++i) { + LiveFileMetaData live_file; + live_file.file_number = i; + + live_files.emplace_back(live_file); + } + + blob_db_impl()->TEST_InitializeBlobFileToSstMapping(live_files); + + // Check that the blob <-> SST mappings have been correctly initialized. 
+ auto blob_files = blob_db_impl()->TEST_GetBlobFiles(); + + ASSERT_EQ(blob_files.size(), 5); + + { + auto live_imm_files = blob_db_impl()->TEST_GetLiveImmNonTTLFiles(); + ASSERT_EQ(live_imm_files.size(), 5); + for (size_t i = 0; i < 5; ++i) { + ASSERT_EQ(live_imm_files[i]->BlobFileNumber(), i + 1); + } + + ASSERT_TRUE(blob_db_impl()->TEST_GetObsoleteFiles().empty()); + } + + { + const std::vector> expected_sst_files{ + {1, 6}, {2, 7}, {3, 8}, {4, 9}, {5, 10}}; + const std::vector expected_obsolete{false, false, false, false, + false}; + for (size_t i = 0; i < 5; ++i) { + const auto &blob_file = blob_files[i]; + ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]); + ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]); + } + + auto live_imm_files = blob_db_impl()->TEST_GetLiveImmNonTTLFiles(); + ASSERT_EQ(live_imm_files.size(), 5); + for (size_t i = 0; i < 5; ++i) { + ASSERT_EQ(live_imm_files[i]->BlobFileNumber(), i + 1); + } + + ASSERT_TRUE(blob_db_impl()->TEST_GetObsoleteFiles().empty()); + } + + // Simulate a flush where the SST does not reference any blob files. + { + FlushJobInfo info{}; + info.file_number = 21; + info.smallest_seqno = 1; + info.largest_seqno = 100; + + blob_db_impl()->TEST_ProcessFlushJobInfo(info); + + const std::vector> expected_sst_files{ + {1, 6}, {2, 7}, {3, 8}, {4, 9}, {5, 10}}; + const std::vector expected_obsolete{false, false, false, false, + false}; + for (size_t i = 0; i < 5; ++i) { + const auto &blob_file = blob_files[i]; + ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]); + ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]); + } + + auto live_imm_files = blob_db_impl()->TEST_GetLiveImmNonTTLFiles(); + ASSERT_EQ(live_imm_files.size(), 5); + for (size_t i = 0; i < 5; ++i) { + ASSERT_EQ(live_imm_files[i]->BlobFileNumber(), i + 1); + } + + ASSERT_TRUE(blob_db_impl()->TEST_GetObsoleteFiles().empty()); + } + + // Simulate a flush where the SST references a blob file. + { + FlushJobInfo info{}; + info.file_number = 22; + info.oldest_blob_file_number = 5; + info.smallest_seqno = 101; + info.largest_seqno = 200; + + blob_db_impl()->TEST_ProcessFlushJobInfo(info); + + const std::vector> expected_sst_files{ + {1, 6}, {2, 7}, {3, 8}, {4, 9}, {5, 10, 22}}; + const std::vector expected_obsolete{false, false, false, false, + false}; + for (size_t i = 0; i < 5; ++i) { + const auto &blob_file = blob_files[i]; + ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]); + ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]); + } + + auto live_imm_files = blob_db_impl()->TEST_GetLiveImmNonTTLFiles(); + ASSERT_EQ(live_imm_files.size(), 5); + for (size_t i = 0; i < 5; ++i) { + ASSERT_EQ(live_imm_files[i]->BlobFileNumber(), i + 1); + } + + ASSERT_TRUE(blob_db_impl()->TEST_GetObsoleteFiles().empty()); + } + + // Simulate a compaction. Some inputs and outputs have blob file references, + // some don't. There is also a trivial move (which means the SST appears on + // both the input and the output list). Blob file 1 loses all its linked SSTs, + // and since it got marked immutable at sequence number 200 which has already + // been flushed, it can be marked obsolete. 
+ { + CompactionJobInfo info{}; + info.input_file_infos.emplace_back(CompactionFileInfo{1, 1, 1}); + info.input_file_infos.emplace_back(CompactionFileInfo{1, 2, 2}); + info.input_file_infos.emplace_back(CompactionFileInfo{1, 6, 1}); + info.input_file_infos.emplace_back( + CompactionFileInfo{1, 11, kInvalidBlobFileNumber}); + info.input_file_infos.emplace_back(CompactionFileInfo{1, 22, 5}); + info.output_file_infos.emplace_back(CompactionFileInfo{2, 22, 5}); + info.output_file_infos.emplace_back(CompactionFileInfo{2, 23, 3}); + info.output_file_infos.emplace_back( + CompactionFileInfo{2, 24, kInvalidBlobFileNumber}); + + blob_db_impl()->TEST_ProcessCompactionJobInfo(info); + + const std::vector> expected_sst_files{ + {}, {7}, {3, 8, 23}, {4, 9}, {5, 10, 22}}; + const std::vector expected_obsolete{true, false, false, false, false}; + for (size_t i = 0; i < 5; ++i) { + const auto &blob_file = blob_files[i]; + ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]); + ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]); + } + + auto live_imm_files = blob_db_impl()->TEST_GetLiveImmNonTTLFiles(); + ASSERT_EQ(live_imm_files.size(), 4); + for (size_t i = 0; i < 4; ++i) { + ASSERT_EQ(live_imm_files[i]->BlobFileNumber(), i + 2); + } + + auto obsolete_files = blob_db_impl()->TEST_GetObsoleteFiles(); + ASSERT_EQ(obsolete_files.size(), 1); + ASSERT_EQ(obsolete_files[0]->BlobFileNumber(), 1); + } + + // Simulate a failed compaction. No mappings should be updated. + { + CompactionJobInfo info{}; + info.input_file_infos.emplace_back(CompactionFileInfo{1, 7, 2}); + info.input_file_infos.emplace_back(CompactionFileInfo{2, 22, 5}); + info.output_file_infos.emplace_back(CompactionFileInfo{2, 25, 3}); + info.status = Status::Corruption(); + + blob_db_impl()->TEST_ProcessCompactionJobInfo(info); + + const std::vector> expected_sst_files{ + {}, {7}, {3, 8, 23}, {4, 9}, {5, 10, 22}}; + const std::vector expected_obsolete{true, false, false, false, false}; + for (size_t i = 0; i < 5; ++i) { + const auto &blob_file = blob_files[i]; + ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]); + ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]); + } + + auto live_imm_files = blob_db_impl()->TEST_GetLiveImmNonTTLFiles(); + ASSERT_EQ(live_imm_files.size(), 4); + for (size_t i = 0; i < 4; ++i) { + ASSERT_EQ(live_imm_files[i]->BlobFileNumber(), i + 2); + } + + auto obsolete_files = blob_db_impl()->TEST_GetObsoleteFiles(); + ASSERT_EQ(obsolete_files.size(), 1); + ASSERT_EQ(obsolete_files[0]->BlobFileNumber(), 1); + } + + // Simulate another compaction. Blob file 2 loses all its linked SSTs + // but since it got marked immutable at sequence number 300 which hasn't + // been flushed yet, it cannot be marked obsolete at this point. 
+ { + CompactionJobInfo info{}; + info.input_file_infos.emplace_back(CompactionFileInfo{1, 7, 2}); + info.input_file_infos.emplace_back(CompactionFileInfo{2, 22, 5}); + info.output_file_infos.emplace_back(CompactionFileInfo{2, 25, 3}); + + blob_db_impl()->TEST_ProcessCompactionJobInfo(info); + + const std::vector> expected_sst_files{ + {}, {}, {3, 8, 23, 25}, {4, 9}, {5, 10}}; + const std::vector expected_obsolete{true, false, false, false, false}; + for (size_t i = 0; i < 5; ++i) { + const auto &blob_file = blob_files[i]; + ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]); + ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]); + } + + auto live_imm_files = blob_db_impl()->TEST_GetLiveImmNonTTLFiles(); + ASSERT_EQ(live_imm_files.size(), 4); + for (size_t i = 0; i < 4; ++i) { + ASSERT_EQ(live_imm_files[i]->BlobFileNumber(), i + 2); + } + + auto obsolete_files = blob_db_impl()->TEST_GetObsoleteFiles(); + ASSERT_EQ(obsolete_files.size(), 1); + ASSERT_EQ(obsolete_files[0]->BlobFileNumber(), 1); + } + + // Simulate a flush with largest sequence number 300. This will make it + // possible to mark blob file 2 obsolete. + { + FlushJobInfo info{}; + info.file_number = 26; + info.smallest_seqno = 201; + info.largest_seqno = 300; + + blob_db_impl()->TEST_ProcessFlushJobInfo(info); + + const std::vector> expected_sst_files{ + {}, {}, {3, 8, 23, 25}, {4, 9}, {5, 10}}; + const std::vector expected_obsolete{true, true, false, false, false}; + for (size_t i = 0; i < 5; ++i) { + const auto &blob_file = blob_files[i]; + ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]); + ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]); + } + + auto live_imm_files = blob_db_impl()->TEST_GetLiveImmNonTTLFiles(); + ASSERT_EQ(live_imm_files.size(), 3); + for (size_t i = 0; i < 3; ++i) { + ASSERT_EQ(live_imm_files[i]->BlobFileNumber(), i + 3); + } + + auto obsolete_files = blob_db_impl()->TEST_GetObsoleteFiles(); + ASSERT_EQ(obsolete_files.size(), 2); + ASSERT_EQ(obsolete_files[0]->BlobFileNumber(), 1); + ASSERT_EQ(obsolete_files[1]->BlobFileNumber(), 2); + } +} + +TEST_F(BlobDBTest, ShutdownWait) { + BlobDBOptions bdb_options; + bdb_options.ttl_range_secs = 100; + bdb_options.min_blob_size = 0; + bdb_options.disable_background_tasks = false; + Options options; + options.env = mock_env_.get(); + + SyncPoint::GetInstance()->LoadDependency({ + {"BlobDBImpl::EvictExpiredFiles:0", "BlobDBTest.ShutdownWait:0"}, + {"BlobDBTest.ShutdownWait:1", "BlobDBImpl::EvictExpiredFiles:1"}, + {"BlobDBImpl::EvictExpiredFiles:2", "BlobDBTest.ShutdownWait:2"}, + {"BlobDBTest.ShutdownWait:3", "BlobDBImpl::EvictExpiredFiles:3"}, + }); + // Force all tasks to be scheduled immediately. + SyncPoint::GetInstance()->SetCallBack( + "TimeQueue::Add:item.end", [&](void *arg) { + std::chrono::steady_clock::time_point *tp = + static_cast(arg); + *tp = + std::chrono::steady_clock::now() - std::chrono::milliseconds(10000); + }); + + SyncPoint::GetInstance()->SetCallBack( + "BlobDBImpl::EvictExpiredFiles:cb", [&](void * /*arg*/) { + // Sleep 3 ms to increase the chance of data race. + // We've synced up the code so that EvictExpiredFiles() + // is called concurrently with ~BlobDBImpl(). + // ~BlobDBImpl() is supposed to wait for all background + // task to shutdown before doing anything else. In order + // to use the same test to reproduce a bug of the waiting + // logic, we wait a little bit here, so that TSAN can + // catch the data race. + // We should improve the test if we find a better way. 
+ Env::Default()->SleepForMicroseconds(3000); + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + Open(bdb_options, options); + mock_clock_->SetCurrentTime(50); + std::map data; + ASSERT_OK(PutWithTTL("foo", "bar", 100, &data)); + auto blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(1, blob_files.size()); + auto blob_file = blob_files[0]; + ASSERT_FALSE(blob_file->Immutable()); + ASSERT_FALSE(blob_file->Obsolete()); + VerifyDB(data); + + TEST_SYNC_POINT("BlobDBTest.ShutdownWait:0"); + mock_clock_->SetCurrentTime(250); + // The key should expired now. + TEST_SYNC_POINT("BlobDBTest.ShutdownWait:1"); + + TEST_SYNC_POINT("BlobDBTest.ShutdownWait:2"); + TEST_SYNC_POINT("BlobDBTest.ShutdownWait:3"); + Close(); + + SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(BlobDBTest, SyncBlobFileBeforeClose) { + Options options; + options.statistics = CreateDBStatistics(); + + BlobDBOptions blob_options; + blob_options.min_blob_size = 0; + blob_options.bytes_per_sync = 1 << 20; + blob_options.disable_background_tasks = true; + + Open(blob_options, options); + + ASSERT_OK(Put("foo", "bar")); + + auto blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(blob_files.size(), 1); + + ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[0])); + ASSERT_EQ(options.statistics->getTickerCount(BLOB_DB_BLOB_FILE_SYNCED), 1); +} + +TEST_F(BlobDBTest, SyncBlobFileBeforeCloseIOError) { + Options options; + options.env = fault_injection_env_.get(); + + BlobDBOptions blob_options; + blob_options.min_blob_size = 0; + blob_options.bytes_per_sync = 1 << 20; + blob_options.disable_background_tasks = true; + + Open(blob_options, options); + + ASSERT_OK(Put("foo", "bar")); + + auto blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(blob_files.size(), 1); + + SyncPoint::GetInstance()->SetCallBack( + "BlobLogWriter::Sync", [this](void * /* arg */) { + fault_injection_env_->SetFilesystemActive(false, Status::IOError()); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + const Status s = blob_db_impl()->TEST_CloseBlobFile(blob_files[0]); + + fault_injection_env_->SetFilesystemActive(true); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_TRUE(s.IsIOError()); +} + +} // namespace blob_db +} // namespace ROCKSDB_NAMESPACE + +// A black-box test for the ttl wrapper around rocksdb +int main(int argc, char **argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED as BlobDB is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/utilities/blob_db/blob_dump_tool.cc b/src/rocksdb/utilities/blob_db/blob_dump_tool.cc new file mode 100644 index 000000000..1e0632990 --- /dev/null +++ b/src/rocksdb/utilities/blob_db/blob_dump_tool.cc @@ -0,0 +1,282 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+#ifndef ROCKSDB_LITE + +#include "utilities/blob_db/blob_dump_tool.h" + +#include + +#include +#include +#include +#include + +#include "file/random_access_file_reader.h" +#include "file/readahead_raf.h" +#include "port/port.h" +#include "rocksdb/convenience.h" +#include "rocksdb/file_system.h" +#include "table/format.h" +#include "util/coding.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +namespace blob_db { + +BlobDumpTool::BlobDumpTool() + : reader_(nullptr), buffer_(nullptr), buffer_size_(0) {} + +Status BlobDumpTool::Run(const std::string& filename, DisplayType show_key, + DisplayType show_blob, + DisplayType show_uncompressed_blob, + bool show_summary) { + constexpr size_t kReadaheadSize = 2 * 1024 * 1024; + Status s; + const auto fs = FileSystem::Default(); + IOOptions io_opts; + s = fs->FileExists(filename, io_opts, nullptr); + if (!s.ok()) { + return s; + } + uint64_t file_size = 0; + s = fs->GetFileSize(filename, io_opts, &file_size, nullptr); + if (!s.ok()) { + return s; + } + std::unique_ptr file; + s = fs->NewRandomAccessFile(filename, FileOptions(), &file, nullptr); + if (!s.ok()) { + return s; + } + file = NewReadaheadRandomAccessFile(std::move(file), kReadaheadSize); + if (file_size == 0) { + return Status::Corruption("File is empty."); + } + reader_.reset(new RandomAccessFileReader(std::move(file), filename)); + uint64_t offset = 0; + uint64_t footer_offset = 0; + CompressionType compression = kNoCompression; + s = DumpBlobLogHeader(&offset, &compression); + if (!s.ok()) { + return s; + } + s = DumpBlobLogFooter(file_size, &footer_offset); + if (!s.ok()) { + return s; + } + uint64_t total_records = 0; + uint64_t total_key_size = 0; + uint64_t total_blob_size = 0; + uint64_t total_uncompressed_blob_size = 0; + if (show_key != DisplayType::kNone || show_summary) { + while (offset < footer_offset) { + s = DumpRecord(show_key, show_blob, show_uncompressed_blob, show_summary, + compression, &offset, &total_records, &total_key_size, + &total_blob_size, &total_uncompressed_blob_size); + if (!s.ok()) { + break; + } + } + } + if (show_summary) { + fprintf(stdout, "Summary:\n"); + fprintf(stdout, " total records: %" PRIu64 "\n", total_records); + fprintf(stdout, " total key size: %" PRIu64 "\n", total_key_size); + fprintf(stdout, " total blob size: %" PRIu64 "\n", total_blob_size); + if (compression != kNoCompression) { + fprintf(stdout, " total raw blob size: %" PRIu64 "\n", + total_uncompressed_blob_size); + } + } + return s; +} + +Status BlobDumpTool::Read(uint64_t offset, size_t size, Slice* result) { + if (buffer_size_ < size) { + if (buffer_size_ == 0) { + buffer_size_ = 4096; + } + while (buffer_size_ < size) { + buffer_size_ *= 2; + } + buffer_.reset(new char[buffer_size_]); + } + Status s = reader_->Read(IOOptions(), offset, size, result, buffer_.get(), + nullptr, Env::IO_TOTAL /* rate_limiter_priority */); + if (!s.ok()) { + return s; + } + if (result->size() != size) { + return Status::Corruption("Reach the end of the file unexpectedly."); + } + return s; +} + +Status BlobDumpTool::DumpBlobLogHeader(uint64_t* offset, + CompressionType* compression) { + Slice slice; + Status s = Read(0, BlobLogHeader::kSize, &slice); + if (!s.ok()) { + return s; + } + BlobLogHeader header; + s = header.DecodeFrom(slice); + if (!s.ok()) { + return s; + } + fprintf(stdout, "Blob log header:\n"); + fprintf(stdout, " Version : %" PRIu32 "\n", header.version); + fprintf(stdout, " Column Family ID : %" PRIu32 "\n", + header.column_family_id); + std::string compression_str; 
+ if (!GetStringFromCompressionType(&compression_str, header.compression) + .ok()) { + compression_str = "Unrecongnized compression type (" + + std::to_string((int)header.compression) + ")"; + } + fprintf(stdout, " Compression : %s\n", compression_str.c_str()); + fprintf(stdout, " Expiration range : %s\n", + GetString(header.expiration_range).c_str()); + *offset = BlobLogHeader::kSize; + *compression = header.compression; + return s; +} + +Status BlobDumpTool::DumpBlobLogFooter(uint64_t file_size, + uint64_t* footer_offset) { + auto no_footer = [&]() { + *footer_offset = file_size; + fprintf(stdout, "No blob log footer.\n"); + return Status::OK(); + }; + if (file_size < BlobLogHeader::kSize + BlobLogFooter::kSize) { + return no_footer(); + } + Slice slice; + *footer_offset = file_size - BlobLogFooter::kSize; + Status s = Read(*footer_offset, BlobLogFooter::kSize, &slice); + if (!s.ok()) { + return s; + } + BlobLogFooter footer; + s = footer.DecodeFrom(slice); + if (!s.ok()) { + return no_footer(); + } + fprintf(stdout, "Blob log footer:\n"); + fprintf(stdout, " Blob count : %" PRIu64 "\n", footer.blob_count); + fprintf(stdout, " Expiration Range : %s\n", + GetString(footer.expiration_range).c_str()); + return s; +} + +Status BlobDumpTool::DumpRecord(DisplayType show_key, DisplayType show_blob, + DisplayType show_uncompressed_blob, + bool show_summary, CompressionType compression, + uint64_t* offset, uint64_t* total_records, + uint64_t* total_key_size, + uint64_t* total_blob_size, + uint64_t* total_uncompressed_blob_size) { + if (show_key != DisplayType::kNone) { + fprintf(stdout, "Read record with offset 0x%" PRIx64 " (%" PRIu64 "):\n", + *offset, *offset); + } + Slice slice; + Status s = Read(*offset, BlobLogRecord::kHeaderSize, &slice); + if (!s.ok()) { + return s; + } + BlobLogRecord record; + s = record.DecodeHeaderFrom(slice); + if (!s.ok()) { + return s; + } + uint64_t key_size = record.key_size; + uint64_t value_size = record.value_size; + if (show_key != DisplayType::kNone) { + fprintf(stdout, " key size : %" PRIu64 "\n", key_size); + fprintf(stdout, " value size : %" PRIu64 "\n", value_size); + fprintf(stdout, " expiration : %" PRIu64 "\n", record.expiration); + } + *offset += BlobLogRecord::kHeaderSize; + s = Read(*offset, static_cast(key_size + value_size), &slice); + if (!s.ok()) { + return s; + } + // Decompress value + std::string uncompressed_value; + if (compression != kNoCompression && + (show_uncompressed_blob != DisplayType::kNone || show_summary)) { + BlockContents contents; + UncompressionContext context(compression); + UncompressionInfo info(context, UncompressionDict::GetEmptyDict(), + compression); + s = UncompressBlockData( + info, slice.data() + key_size, static_cast(value_size), + &contents, 2 /*compress_format_version*/, ImmutableOptions(Options())); + if (!s.ok()) { + return s; + } + uncompressed_value = contents.data.ToString(); + } + if (show_key != DisplayType::kNone) { + fprintf(stdout, " key : "); + DumpSlice(Slice(slice.data(), static_cast(key_size)), show_key); + if (show_blob != DisplayType::kNone) { + fprintf(stdout, " blob : "); + DumpSlice(Slice(slice.data() + static_cast(key_size), + static_cast(value_size)), + show_blob); + } + if (show_uncompressed_blob != DisplayType::kNone) { + fprintf(stdout, " raw blob : "); + DumpSlice(Slice(uncompressed_value), show_uncompressed_blob); + } + } + *offset += key_size + value_size; + *total_records += 1; + *total_key_size += key_size; + *total_blob_size += value_size; + *total_uncompressed_blob_size += 
uncompressed_value.size(); + return s; +} + +void BlobDumpTool::DumpSlice(const Slice s, DisplayType type) { + if (type == DisplayType::kRaw) { + fprintf(stdout, "%s\n", s.ToString().c_str()); + } else if (type == DisplayType::kHex) { + fprintf(stdout, "%s\n", s.ToString(true /*hex*/).c_str()); + } else if (type == DisplayType::kDetail) { + char buf[100]; + for (size_t i = 0; i < s.size(); i += 16) { + memset(buf, 0, sizeof(buf)); + for (size_t j = 0; j < 16 && i + j < s.size(); j++) { + unsigned char c = s[i + j]; + snprintf(buf + j * 3 + 15, 2, "%x", c >> 4); + snprintf(buf + j * 3 + 16, 2, "%x", c & 0xf); + snprintf(buf + j + 65, 2, "%c", (0x20 <= c && c <= 0x7e) ? c : '.'); + } + for (size_t p = 0; p + 1 < sizeof(buf); p++) { + if (buf[p] == 0) { + buf[p] = ' '; + } + } + fprintf(stdout, "%s\n", i == 0 ? buf + 15 : buf); + } + } +} + +template +std::string BlobDumpTool::GetString(std::pair p) { + if (p.first == 0 && p.second == 0) { + return "nil"; + } + return "(" + std::to_string(p.first) + ", " + std::to_string(p.second) + ")"; +} + +} // namespace blob_db +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/blob_db/blob_dump_tool.h b/src/rocksdb/utilities/blob_db/blob_dump_tool.h new file mode 100644 index 000000000..bece564e1 --- /dev/null +++ b/src/rocksdb/utilities/blob_db/blob_dump_tool.h @@ -0,0 +1,58 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once +#ifndef ROCKSDB_LITE + +#include +#include +#include + +#include "db/blob/blob_log_format.h" +#include "file/random_access_file_reader.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +namespace blob_db { + +class BlobDumpTool { + public: + enum class DisplayType { + kNone, + kRaw, + kHex, + kDetail, + }; + + BlobDumpTool(); + + Status Run(const std::string& filename, DisplayType show_key, + DisplayType show_blob, DisplayType show_uncompressed_blob, + bool show_summary); + + private: + std::unique_ptr reader_; + std::unique_ptr buffer_; + size_t buffer_size_; + + Status Read(uint64_t offset, size_t size, Slice* result); + Status DumpBlobLogHeader(uint64_t* offset, CompressionType* compression); + Status DumpBlobLogFooter(uint64_t file_size, uint64_t* footer_offset); + Status DumpRecord(DisplayType show_key, DisplayType show_blob, + DisplayType show_uncompressed_blob, bool show_summary, + CompressionType compression, uint64_t* offset, + uint64_t* total_records, uint64_t* total_key_size, + uint64_t* total_blob_size, + uint64_t* total_uncompressed_blob_size); + void DumpSlice(const Slice s, DisplayType type); + + template + std::string GetString(std::pair p); +}; + +} // namespace blob_db +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/blob_db/blob_file.cc b/src/rocksdb/utilities/blob_db/blob_file.cc new file mode 100644 index 000000000..c68e557c6 --- /dev/null +++ b/src/rocksdb/utilities/blob_db/blob_file.cc @@ -0,0 +1,318 @@ + +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
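+//
+// BlobFile is the in-memory bookkeeping object for a single blob log file:
+// it tracks the file's header/footer, expiration range, blob count, and the
+// lazily opened random-access reader used by BlobDBImpl for GET calls.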
+#ifndef ROCKSDB_LITE +#include "utilities/blob_db/blob_file.h" + +#include + +#include +#include +#include + +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" +#include "db/dbformat.h" +#include "file/filename.h" +#include "file/readahead_raf.h" +#include "logging/logging.h" +#include "utilities/blob_db/blob_db_impl.h" + +namespace ROCKSDB_NAMESPACE { + +namespace blob_db { + +BlobFile::BlobFile(const BlobDBImpl* p, const std::string& bdir, uint64_t fn, + Logger* info_log) + : parent_(p), path_to_dir_(bdir), file_number_(fn), info_log_(info_log) {} + +BlobFile::BlobFile(const BlobDBImpl* p, const std::string& bdir, uint64_t fn, + Logger* info_log, uint32_t column_family_id, + CompressionType compression, bool has_ttl, + const ExpirationRange& expiration_range) + : parent_(p), + path_to_dir_(bdir), + file_number_(fn), + info_log_(info_log), + column_family_id_(column_family_id), + compression_(compression), + has_ttl_(has_ttl), + expiration_range_(expiration_range), + header_(column_family_id, compression, has_ttl, expiration_range), + header_valid_(true) {} + +BlobFile::~BlobFile() { + if (obsolete_) { + std::string pn(PathName()); + Status s = Env::Default()->DeleteFile(PathName()); + if (!s.ok()) { + // ROCKS_LOG_INFO(db_options_.info_log, + // "File could not be deleted %s", pn.c_str()); + } + } +} + +uint32_t BlobFile::GetColumnFamilyId() const { return column_family_id_; } + +std::string BlobFile::PathName() const { + return BlobFileName(path_to_dir_, file_number_); +} + +std::string BlobFile::DumpState() const { + char str[1000]; + snprintf( + str, sizeof(str), + "path: %s fn: %" PRIu64 " blob_count: %" PRIu64 " file_size: %" PRIu64 + " closed: %d obsolete: %d expiration_range: (%" PRIu64 ", %" PRIu64 + "), writer: %d reader: %d", + path_to_dir_.c_str(), file_number_, blob_count_.load(), file_size_.load(), + closed_.load(), obsolete_.load(), expiration_range_.first, + expiration_range_.second, (!!log_writer_), (!!ra_file_reader_)); + return str; +} + +void BlobFile::MarkObsolete(SequenceNumber sequence) { + assert(Immutable()); + obsolete_sequence_ = sequence; + obsolete_.store(true); +} + +Status BlobFile::WriteFooterAndCloseLocked(SequenceNumber sequence) { + BlobLogFooter footer; + footer.blob_count = blob_count_; + if (HasTTL()) { + footer.expiration_range = expiration_range_; + } + + // this will close the file and reset the Writable File Pointer. + Status s = log_writer_->AppendFooter(footer, /* checksum_method */ nullptr, + /* checksum_value */ nullptr); + if (s.ok()) { + closed_ = true; + immutable_sequence_ = sequence; + file_size_ += BlobLogFooter::kSize; + } + // delete the sequential writer + log_writer_.reset(); + return s; +} + +Status BlobFile::ReadFooter(BlobLogFooter* bf) { + if (file_size_ < (BlobLogHeader::kSize + BlobLogFooter::kSize)) { + return Status::IOError("File does not have footer", PathName()); + } + + uint64_t footer_offset = file_size_ - BlobLogFooter::kSize; + // assume that ra_file_reader_ is valid before we enter this + assert(ra_file_reader_); + + Slice result; + std::string buf; + AlignedBuf aligned_buf; + Status s; + // TODO: rate limit reading footers from blob files. 
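+  // Direct I/O reads must land in an aligned buffer; buffered reads can use
+  // an ordinary scratch buffer.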
+ if (ra_file_reader_->use_direct_io()) { + s = ra_file_reader_->Read(IOOptions(), footer_offset, BlobLogFooter::kSize, + &result, nullptr, &aligned_buf, + Env::IO_TOTAL /* rate_limiter_priority */); + } else { + buf.reserve(BlobLogFooter::kSize + 10); + s = ra_file_reader_->Read(IOOptions(), footer_offset, BlobLogFooter::kSize, + &result, &buf[0], nullptr, + Env::IO_TOTAL /* rate_limiter_priority */); + } + if (!s.ok()) return s; + if (result.size() != BlobLogFooter::kSize) { + // should not happen + return Status::IOError("EOF reached before footer"); + } + + s = bf->DecodeFrom(result); + return s; +} + +Status BlobFile::SetFromFooterLocked(const BlobLogFooter& footer) { + blob_count_ = footer.blob_count; + expiration_range_ = footer.expiration_range; + closed_ = true; + return Status::OK(); +} + +Status BlobFile::Fsync() { + Status s; + if (log_writer_.get()) { + s = log_writer_->Sync(); + } + return s; +} + +void BlobFile::CloseRandomAccessLocked() { + ra_file_reader_.reset(); + last_access_ = -1; +} + +Status BlobFile::GetReader(Env* env, const FileOptions& file_options, + std::shared_ptr* reader, + bool* fresh_open) { + assert(reader != nullptr); + assert(fresh_open != nullptr); + *fresh_open = false; + int64_t current_time = 0; + if (env->GetCurrentTime(¤t_time).ok()) { + last_access_.store(current_time); + } + Status s; + + { + ReadLock lockbfile_r(&mutex_); + if (ra_file_reader_) { + *reader = ra_file_reader_; + return s; + } + } + + WriteLock lockbfile_w(&mutex_); + // Double check. + if (ra_file_reader_) { + *reader = ra_file_reader_; + return s; + } + + std::unique_ptr rfile; + s = env->GetFileSystem()->NewRandomAccessFile(PathName(), file_options, + &rfile, nullptr); + if (!s.ok()) { + ROCKS_LOG_ERROR(info_log_, + "Failed to open blob file for random-read: %s status: '%s'" + " exists: '%s'", + PathName().c_str(), s.ToString().c_str(), + env->FileExists(PathName()).ToString().c_str()); + return s; + } + + ra_file_reader_ = + std::make_shared(std::move(rfile), PathName()); + *reader = ra_file_reader_; + *fresh_open = true; + return s; +} + +Status BlobFile::ReadMetadata(const std::shared_ptr& fs, + const FileOptions& file_options) { + assert(Immutable()); + // Get file size. + uint64_t file_size = 0; + Status s = + fs->GetFileSize(PathName(), file_options.io_options, &file_size, nullptr); + if (s.ok()) { + file_size_ = file_size; + } else { + ROCKS_LOG_ERROR(info_log_, + "Failed to get size of blob file %" PRIu64 ", status: %s", + file_number_, s.ToString().c_str()); + return s; + } + if (file_size < BlobLogHeader::kSize) { + ROCKS_LOG_ERROR( + info_log_, "Incomplete blob file blob file %" PRIu64 ", size: %" PRIu64, + file_number_, file_size); + return Status::Corruption("Incomplete blob file header."); + } + + // Create file reader. + std::unique_ptr file_reader; + s = RandomAccessFileReader::Create(fs, PathName(), file_options, &file_reader, + nullptr); + if (!s.ok()) { + ROCKS_LOG_ERROR(info_log_, + "Failed to open blob file %" PRIu64 ", status: %s", + file_number_, s.ToString().c_str()); + return s; + } + + // Read file header. + std::string header_buf; + AlignedBuf aligned_buf; + Slice header_slice; + // TODO: rate limit reading headers from blob files. 
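+  // Same pattern as ReadFooter(): aligned buffer for direct I/O, local
+  // scratch string otherwise.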
+ if (file_reader->use_direct_io()) { + s = file_reader->Read(IOOptions(), 0, BlobLogHeader::kSize, &header_slice, + nullptr, &aligned_buf, + Env::IO_TOTAL /* rate_limiter_priority */); + } else { + header_buf.reserve(BlobLogHeader::kSize); + s = file_reader->Read(IOOptions(), 0, BlobLogHeader::kSize, &header_slice, + &header_buf[0], nullptr, + Env::IO_TOTAL /* rate_limiter_priority */); + } + if (!s.ok()) { + ROCKS_LOG_ERROR( + info_log_, "Failed to read header of blob file %" PRIu64 ", status: %s", + file_number_, s.ToString().c_str()); + return s; + } + BlobLogHeader header; + s = header.DecodeFrom(header_slice); + if (!s.ok()) { + ROCKS_LOG_ERROR(info_log_, + "Failed to decode header of blob file %" PRIu64 + ", status: %s", + file_number_, s.ToString().c_str()); + return s; + } + column_family_id_ = header.column_family_id; + compression_ = header.compression; + has_ttl_ = header.has_ttl; + if (has_ttl_) { + expiration_range_ = header.expiration_range; + } + header_valid_ = true; + + // Read file footer. + if (file_size_ < BlobLogHeader::kSize + BlobLogFooter::kSize) { + // OK not to have footer. + assert(!footer_valid_); + return Status::OK(); + } + std::string footer_buf; + Slice footer_slice; + // TODO: rate limit reading footers from blob files. + if (file_reader->use_direct_io()) { + s = file_reader->Read(IOOptions(), file_size - BlobLogFooter::kSize, + BlobLogFooter::kSize, &footer_slice, nullptr, + &aligned_buf, + Env::IO_TOTAL /* rate_limiter_priority */); + } else { + footer_buf.reserve(BlobLogFooter::kSize); + s = file_reader->Read(IOOptions(), file_size - BlobLogFooter::kSize, + BlobLogFooter::kSize, &footer_slice, &footer_buf[0], + nullptr, Env::IO_TOTAL /* rate_limiter_priority */); + } + if (!s.ok()) { + ROCKS_LOG_ERROR( + info_log_, "Failed to read footer of blob file %" PRIu64 ", status: %s", + file_number_, s.ToString().c_str()); + return s; + } + BlobLogFooter footer; + s = footer.DecodeFrom(footer_slice); + if (!s.ok()) { + // OK not to have footer. + assert(!footer_valid_); + return Status::OK(); + } + blob_count_ = footer.blob_count; + if (has_ttl_) { + assert(header.expiration_range.first <= footer.expiration_range.first); + assert(header.expiration_range.second >= footer.expiration_range.second); + expiration_range_ = footer.expiration_range; + } + footer_valid_ = true; + return Status::OK(); +} + +} // namespace blob_db +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/blob_db/blob_file.h b/src/rocksdb/utilities/blob_db/blob_file.h new file mode 100644 index 000000000..6f3f2bea7 --- /dev/null +++ b/src/rocksdb/utilities/blob_db/blob_file.h @@ -0,0 +1,246 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
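+//
+// Declares BlobFile, the per-file metadata holder used by BlobDBImpl. A file
+// moves through three states: open for appends, immutable (footer written),
+// and obsolete (safe to delete once no snapshot can still reference it).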
+#pragma once +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include + +#include "db/blob/blob_log_format.h" +#include "db/blob/blob_log_writer.h" +#include "file/random_access_file_reader.h" +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/options.h" + +namespace ROCKSDB_NAMESPACE { +namespace blob_db { + +class BlobDBImpl; + +class BlobFile { + friend class BlobDBImpl; + friend struct BlobFileComparator; + friend struct BlobFileComparatorTTL; + friend class BlobIndexCompactionFilterBase; + friend class BlobIndexCompactionFilterGC; + + private: + // access to parent + const BlobDBImpl* parent_{nullptr}; + + // path to blob directory + std::string path_to_dir_; + + // the id of the file. + // the above 2 are created during file creation and never changed + // after that + uint64_t file_number_{0}; + + // The file numbers of the SST files whose oldest blob file reference + // points to this blob file. + std::unordered_set linked_sst_files_; + + // Info log. + Logger* info_log_{nullptr}; + + // Column family id. + uint32_t column_family_id_{std::numeric_limits::max()}; + + // Compression type of blobs in the file + CompressionType compression_{kNoCompression}; + + // If true, the keys in this file all has TTL. Otherwise all keys don't + // have TTL. + bool has_ttl_{false}; + + // TTL range of blobs in the file. + ExpirationRange expiration_range_; + + // number of blobs in the file + std::atomic blob_count_{0}; + + // size of the file + std::atomic file_size_{0}; + + BlobLogHeader header_; + + // closed_ = true implies the file is no more mutable + // no more blobs will be appended and the footer has been written out + std::atomic closed_{false}; + + // The latest sequence number when the file was closed/made immutable. + SequenceNumber immutable_sequence_{0}; + + // Whether the file was marked obsolete (due to either TTL or GC). + // obsolete_ still needs to do iterator/snapshot checks + std::atomic obsolete_{false}; + + // The last sequence number by the time the file marked as obsolete. + // Data in this file is visible to a snapshot taken before the sequence. + SequenceNumber obsolete_sequence_{0}; + + // Sequential/Append writer for blobs + std::shared_ptr log_writer_; + + // random access file reader for GET calls + std::shared_ptr ra_file_reader_; + + // This Read-Write mutex is per file specific and protects + // all the datastructures + mutable port::RWMutex mutex_; + + // time when the random access reader was last created. + std::atomic last_access_{-1}; + + bool header_valid_{false}; + + bool footer_valid_{false}; + + public: + BlobFile() = default; + + BlobFile(const BlobDBImpl* parent, const std::string& bdir, uint64_t fnum, + Logger* info_log); + + BlobFile(const BlobDBImpl* parent, const std::string& bdir, uint64_t fnum, + Logger* info_log, uint32_t column_family_id, + CompressionType compression, bool has_ttl, + const ExpirationRange& expiration_range); + + ~BlobFile(); + + uint32_t GetColumnFamilyId() const; + + // Returns log file's absolute pathname. + std::string PathName() const; + + // Primary identifier for blob file. + // once the file is created, this never changes + uint64_t BlobFileNumber() const { return file_number_; } + + // Get the set of SST files whose oldest blob file reference points to + // this file. + const std::unordered_set& GetLinkedSstFiles() const { + return linked_sst_files_; + } + + // Link an SST file whose oldest blob file reference points to this file. 
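+  // REQUIRES: the SST file is not already linked (asserted below).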
+ void LinkSstFile(uint64_t sst_file_number) { + assert(linked_sst_files_.find(sst_file_number) == linked_sst_files_.end()); + linked_sst_files_.insert(sst_file_number); + } + + // Unlink an SST file whose oldest blob file reference points to this file. + void UnlinkSstFile(uint64_t sst_file_number) { + auto it = linked_sst_files_.find(sst_file_number); + assert(it != linked_sst_files_.end()); + linked_sst_files_.erase(it); + } + + // the following functions are atomic, and don't need + // read lock + uint64_t BlobCount() const { + return blob_count_.load(std::memory_order_acquire); + } + + std::string DumpState() const; + + // if the file is not taking any more appends. + bool Immutable() const { return closed_.load(); } + + // Mark the file as immutable. + // REQUIRES: write lock held, or access from single thread (on DB open). + void MarkImmutable(SequenceNumber sequence) { + closed_ = true; + immutable_sequence_ = sequence; + } + + SequenceNumber GetImmutableSequence() const { + assert(Immutable()); + return immutable_sequence_; + } + + // Whether the file was marked obsolete (due to either TTL or GC). + bool Obsolete() const { + assert(Immutable() || !obsolete_.load()); + return obsolete_.load(); + } + + // Mark file as obsolete (due to either TTL or GC). The file is not visible to + // snapshots with sequence greater or equal to the given sequence. + void MarkObsolete(SequenceNumber sequence); + + SequenceNumber GetObsoleteSequence() const { + assert(Obsolete()); + return obsolete_sequence_; + } + + Status Fsync(); + + uint64_t GetFileSize() const { + return file_size_.load(std::memory_order_acquire); + } + + // All Get functions which are not atomic, will need ReadLock on the mutex + + const ExpirationRange& GetExpirationRange() const { + return expiration_range_; + } + + void ExtendExpirationRange(uint64_t expiration) { + expiration_range_.first = std::min(expiration_range_.first, expiration); + expiration_range_.second = std::max(expiration_range_.second, expiration); + } + + bool HasTTL() const { return has_ttl_; } + + void SetHasTTL(bool has_ttl) { has_ttl_ = has_ttl; } + + CompressionType GetCompressionType() const { return compression_; } + + std::shared_ptr GetWriter() const { return log_writer_; } + + // Read blob file header and footer. Return corruption if file header is + // malform or incomplete. If footer is malform or incomplete, set + // footer_valid_ to false and return Status::OK. 
+ Status ReadMetadata(const std::shared_ptr& fs, + const FileOptions& file_options); + + Status GetReader(Env* env, const FileOptions& file_options, + std::shared_ptr* reader, + bool* fresh_open); + + private: + Status ReadFooter(BlobLogFooter* footer); + + Status WriteFooterAndCloseLocked(SequenceNumber sequence); + + void CloseRandomAccessLocked(); + + // this is used, when you are reading only the footer of a + // previously closed file + Status SetFromFooterLocked(const BlobLogFooter& footer); + + void set_expiration_range(const ExpirationRange& expiration_range) { + expiration_range_ = expiration_range; + } + + // The following functions are atomic, and don't need locks + void SetFileSize(uint64_t fs) { file_size_ = fs; } + + void SetBlobCount(uint64_t bc) { blob_count_ = bc; } + + void BlobRecordAdded(uint64_t record_size) { + ++blob_count_; + file_size_ += record_size; + } +}; +} // namespace blob_db +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/cache_dump_load.cc b/src/rocksdb/utilities/cache_dump_load.cc new file mode 100644 index 000000000..9a7c76798 --- /dev/null +++ b/src/rocksdb/utilities/cache_dump_load.cc @@ -0,0 +1,69 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "rocksdb/utilities/cache_dump_load.h" + +#include "file/writable_file_writer.h" +#include "port/lang.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "table/format.h" +#include "util/crc32c.h" +#include "utilities/cache_dump_load_impl.h" + +namespace ROCKSDB_NAMESPACE { + +IOStatus NewToFileCacheDumpWriter(const std::shared_ptr& fs, + const FileOptions& file_opts, + const std::string& file_name, + std::unique_ptr* writer) { + std::unique_ptr file_writer; + IOStatus io_s = WritableFileWriter::Create(fs, file_name, file_opts, + &file_writer, nullptr); + if (!io_s.ok()) { + return io_s; + } + writer->reset(new ToFileCacheDumpWriter(std::move(file_writer))); + return io_s; +} + +IOStatus NewFromFileCacheDumpReader(const std::shared_ptr& fs, + const FileOptions& file_opts, + const std::string& file_name, + std::unique_ptr* reader) { + std::unique_ptr file_reader; + IOStatus io_s = RandomAccessFileReader::Create(fs, file_name, file_opts, + &file_reader, nullptr); + if (!io_s.ok()) { + return io_s; + } + reader->reset(new FromFileCacheDumpReader(std::move(file_reader))); + return io_s; +} + +Status NewDefaultCacheDumper(const CacheDumpOptions& dump_options, + const std::shared_ptr& cache, + std::unique_ptr&& writer, + std::unique_ptr* cache_dumper) { + cache_dumper->reset( + new CacheDumperImpl(dump_options, cache, std::move(writer))); + return Status::OK(); +} + +Status NewDefaultCacheDumpedLoader( + const CacheDumpOptions& dump_options, + const BlockBasedTableOptions& toptions, + const std::shared_ptr& secondary_cache, + std::unique_ptr&& reader, + std::unique_ptr* cache_dump_loader) { + cache_dump_loader->reset(new CacheDumpedLoaderImpl( + dump_options, toptions, secondary_cache, std::move(reader))); + return Status::OK(); +} + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/cache_dump_load_impl.cc b/src/rocksdb/utilities/cache_dump_load_impl.cc new file mode 100644 index 000000000..2b9f2a29d --- /dev/null +++ 
b/src/rocksdb/utilities/cache_dump_load_impl.cc @@ -0,0 +1,393 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "cache/cache_key.h" +#include "table/block_based/block_based_table_reader.h" +#ifndef ROCKSDB_LITE + +#include "cache/cache_entry_roles.h" +#include "file/writable_file_writer.h" +#include "port/lang.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/utilities/ldb_cmd.h" +#include "table/format.h" +#include "util/crc32c.h" +#include "utilities/cache_dump_load_impl.h" + +namespace ROCKSDB_NAMESPACE { + +// Set the dump filter with a list of DBs. Block cache may be shared by multipe +// DBs and we may only want to dump out the blocks belonging to certain DB(s). +// Therefore, a filter is need to decide if the key of the block satisfy the +// requirement. +Status CacheDumperImpl::SetDumpFilter(std::vector db_list) { + Status s = Status::OK(); + for (size_t i = 0; i < db_list.size(); i++) { + assert(i < db_list.size()); + TablePropertiesCollection ptc; + assert(db_list[i] != nullptr); + s = db_list[i]->GetPropertiesOfAllTables(&ptc); + if (!s.ok()) { + return s; + } + for (auto id = ptc.begin(); id != ptc.end(); id++) { + OffsetableCacheKey base; + // We only want to save cache entries that are portable to another + // DB::Open, so only save entries with stable keys. + bool is_stable; + BlockBasedTable::SetupBaseCacheKey(id->second.get(), + /*cur_db_session_id*/ "", + /*cur_file_num*/ 0, &base, &is_stable); + if (is_stable) { + Slice prefix_slice = base.CommonPrefixSlice(); + assert(prefix_slice.size() == OffsetableCacheKey::kCommonPrefixSize); + prefix_filter_.insert(prefix_slice.ToString()); + } + } + } + return s; +} + +// This is the main function to dump out the cache block entries to the writer. +// The writer may create a file or write to other systems. Currently, we will +// iterate the whole block cache, get the blocks, and write them to the writer +IOStatus CacheDumperImpl::DumpCacheEntriesToWriter() { + // Prepare stage, check the parameters. + if (cache_ == nullptr) { + return IOStatus::InvalidArgument("Cache is null"); + } + if (writer_ == nullptr) { + return IOStatus::InvalidArgument("CacheDumpWriter is null"); + } + // Set the system clock + if (options_.clock == nullptr) { + return IOStatus::InvalidArgument("System clock is null"); + } + clock_ = options_.clock; + // We copy the Cache Deleter Role Map as its member. + role_map_ = CopyCacheDeleterRoleMap(); + // Set the sequence number + sequence_num_ = 0; + + // Dump stage, first, we write the hader + IOStatus io_s = WriteHeader(); + if (!io_s.ok()) { + return io_s; + } + + // Then, we iterate the block cache and dump out the blocks that are not + // filtered out. 
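+  // Note: ApplyToAllEntries offers no consistency guarantee with respect to
+  // concurrent inserts and evictions, so the dump is a best-effort snapshot
+  // of the cache contents.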
+ cache_->ApplyToAllEntries(DumpOneBlockCallBack(), {}); + + // Finally, write the footer + io_s = WriteFooter(); + if (!io_s.ok()) { + return io_s; + } + io_s = writer_->Close(); + return io_s; +} + +// Check if we need to filter out the block based on its key +bool CacheDumperImpl::ShouldFilterOut(const Slice& key) { + if (key.size() < OffsetableCacheKey::kCommonPrefixSize) { + return /*filter out*/ true; + } + Slice key_prefix(key.data(), OffsetableCacheKey::kCommonPrefixSize); + std::string prefix = key_prefix.ToString(); + // Filter out if not found + return prefix_filter_.find(prefix) == prefix_filter_.end(); +} + +// This is the callback function which will be applied to +// Cache::ApplyToAllEntries. In this callback function, we will get the block +// type, decide if the block needs to be dumped based on the filter, and write +// the block through the provided writer. +std::function +CacheDumperImpl::DumpOneBlockCallBack() { + return [&](const Slice& key, void* value, size_t /*charge*/, + Cache::DeleterFn deleter) { + // Step 1: get the type of the block from role_map_ + auto e = role_map_.find(deleter); + CacheEntryRole role; + CacheDumpUnitType type = CacheDumpUnitType::kBlockTypeMax; + if (e == role_map_.end()) { + role = CacheEntryRole::kMisc; + } else { + role = e->second; + } + bool filter_out = false; + + // Step 2: based on the key prefix, check if the block should be filter out. + if (ShouldFilterOut(key)) { + filter_out = true; + } + + // Step 3: based on the block type, get the block raw pointer and length. + const char* block_start = nullptr; + size_t block_len = 0; + switch (role) { + case CacheEntryRole::kDataBlock: + type = CacheDumpUnitType::kData; + block_start = (static_cast(value))->data(); + block_len = (static_cast(value))->size(); + break; + case CacheEntryRole::kFilterBlock: + type = CacheDumpUnitType::kFilter; + block_start = (static_cast(value)) + ->GetBlockContentsData() + .data(); + block_len = (static_cast(value)) + ->GetBlockContentsData() + .size(); + break; + case CacheEntryRole::kFilterMetaBlock: + type = CacheDumpUnitType::kFilterMetaBlock; + block_start = (static_cast(value))->data(); + block_len = (static_cast(value))->size(); + break; + case CacheEntryRole::kIndexBlock: + type = CacheDumpUnitType::kIndex; + block_start = (static_cast(value))->data(); + block_len = (static_cast(value))->size(); + break; + case CacheEntryRole::kDeprecatedFilterBlock: + // Obsolete + filter_out = true; + break; + case CacheEntryRole::kMisc: + filter_out = true; + break; + case CacheEntryRole::kOtherBlock: + filter_out = true; + break; + case CacheEntryRole::kWriteBuffer: + filter_out = true; + break; + default: + filter_out = true; + } + + // Step 4: if the block should not be filter out, write the block to the + // CacheDumpWriter + if (!filter_out && block_start != nullptr) { + WriteBlock(type, key, Slice(block_start, block_len)) + .PermitUncheckedError(); + } + }; +} + +// Write the block to the writer. It takes the timestamp of the +// block being copied from block cache, block type, key, block pointer, +// block size and block checksum as the input. When writing the dumper raw +// block, we first create the dump unit and encoude it to a string. Then, +// we calculate the checksum of the whole dump unit string and store it in +// the dump unit metadata. +// First, we write the metadata first, which is a fixed size string. Then, we +// Append the dump unit string to the writer. 
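+// With the default ToFileCacheDumpWriter, each block therefore appears in
+// the dump file as:
+//   [4B meta size][16B DumpUnitMeta][4B unit size][serialized DumpUnit]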
+IOStatus CacheDumperImpl::WriteBlock(CacheDumpUnitType type, const Slice& key, + const Slice& value) { + uint64_t timestamp = clock_->NowMicros(); + uint32_t value_checksum = crc32c::Value(value.data(), value.size()); + + // First, serialize the block information in a string + DumpUnit dump_unit; + dump_unit.timestamp = timestamp; + dump_unit.key = key; + dump_unit.type = type; + dump_unit.value_len = value.size(); + dump_unit.value = const_cast(value.data()); + dump_unit.value_checksum = value_checksum; + std::string encoded_data; + CacheDumperHelper::EncodeDumpUnit(dump_unit, &encoded_data); + + // Second, create the metadata, which contains a sequence number, the dump + // unit string checksum and the string size. The sequence number monotonically + // increases from 0. + DumpUnitMeta unit_meta; + unit_meta.sequence_num = sequence_num_; + sequence_num_++; + unit_meta.dump_unit_checksum = + crc32c::Value(encoded_data.data(), encoded_data.size()); + unit_meta.dump_unit_size = encoded_data.size(); + std::string encoded_meta; + CacheDumperHelper::EncodeDumpUnitMeta(unit_meta, &encoded_meta); + + // We write the metadata first. + assert(writer_ != nullptr); + IOStatus io_s = writer_->WriteMetadata(encoded_meta); + if (!io_s.ok()) { + return io_s; + } + // followed by the dump unit. + return writer_->WritePacket(encoded_data); +} + +// Before we write any block, we write the header first to store the cache dump +// format version, rocksdb version, and brief intro. +IOStatus CacheDumperImpl::WriteHeader() { + std::string header_key = "header"; + std::ostringstream s; + s << kTraceMagic << "\t" + << "Cache dump format version: " << kCacheDumpMajorVersion << "." + << kCacheDumpMinorVersion << "\t" + << "RocksDB Version: " << kMajorVersion << "." << kMinorVersion << "\t" + << "Format: dump_unit_metadata , dump_unit cache_value\n"; + std::string header_value(s.str()); + CacheDumpUnitType type = CacheDumpUnitType::kHeader; + return WriteBlock(type, header_key, header_value); +} + +// Write the footer after all the blocks are stored to indicate the ending. +IOStatus CacheDumperImpl::WriteFooter() { + std::string footer_key = "footer"; + std::string footer_value("cache dump completed"); + CacheDumpUnitType type = CacheDumpUnitType::kFooter; + return WriteBlock(type, footer_key, footer_value); +} + +// This is the main function to restore the cache entries to secondary cache. +// First, we check if all the arguments are valid. Then, we read the block +// sequentially from the reader and insert them to the secondary cache. +IOStatus CacheDumpedLoaderImpl::RestoreCacheEntriesToSecondaryCache() { + // TODO: remove this line when options are used in the loader + (void)options_; + // Step 1: we check if all the arguments are valid + if (secondary_cache_ == nullptr) { + return IOStatus::InvalidArgument("Secondary Cache is null"); + } + if (reader_ == nullptr) { + return IOStatus::InvalidArgument("CacheDumpReader is null"); + } + // we copy the Cache Deleter Role Map as its member. + role_map_ = CopyCacheDeleterRoleMap(); + + // Step 2: read the header + // TODO: we need to check the cache dump format version and RocksDB version + // after the header is read out. + IOStatus io_s; + DumpUnit dump_unit; + std::string data; + io_s = ReadHeader(&data, &dump_unit); + if (!io_s.ok()) { + return io_s; + } + + // Step 3: read out the rest of the blocks from the reader. The loop will stop + // either I/O status is not ok or we reach to the the end. 
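+  // Each iteration reads one (metadata, dump unit) pair, verifies its
+  // checksum, and re-inserts the payload into the secondary cache under its
+  // original key.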
+ while (io_s.ok()) { + dump_unit.reset(); + data.clear(); + // read the content and store in the dump_unit + io_s = ReadCacheBlock(&data, &dump_unit); + if (!io_s.ok()) { + break; + } + if (dump_unit.type == CacheDumpUnitType::kFooter) { + break; + } + // Create the uncompressed_block based on the information in the dump_unit + // (There is no block trailer here compatible with block-based SST file.) + Slice content = + Slice(static_cast(dump_unit.value), dump_unit.value_len); + Status s = secondary_cache_->InsertSaved(dump_unit.key, content); + if (!s.ok()) { + io_s = status_to_io_status(std::move(s)); + } + } + if (dump_unit.type == CacheDumpUnitType::kFooter) { + return IOStatus::OK(); + } else { + return io_s; + } +} + +// Read and copy the dump unit metadata to std::string data, decode and create +// the unit metadata based on the string +IOStatus CacheDumpedLoaderImpl::ReadDumpUnitMeta(std::string* data, + DumpUnitMeta* unit_meta) { + assert(reader_ != nullptr); + assert(data != nullptr); + assert(unit_meta != nullptr); + IOStatus io_s = reader_->ReadMetadata(data); + if (!io_s.ok()) { + return io_s; + } + return status_to_io_status( + CacheDumperHelper::DecodeDumpUnitMeta(*data, unit_meta)); +} + +// Read and copy the dump unit to std::string data, decode and create the unit +// based on the string +IOStatus CacheDumpedLoaderImpl::ReadDumpUnit(size_t len, std::string* data, + DumpUnit* unit) { + assert(reader_ != nullptr); + assert(data != nullptr); + assert(unit != nullptr); + IOStatus io_s = reader_->ReadPacket(data); + if (!io_s.ok()) { + return io_s; + } + if (data->size() != len) { + return IOStatus::Corruption( + "The data being read out does not match the size stored in metadata!"); + } + Slice block; + return status_to_io_status(CacheDumperHelper::DecodeDumpUnit(*data, unit)); +} + +// Read the header +IOStatus CacheDumpedLoaderImpl::ReadHeader(std::string* data, + DumpUnit* dump_unit) { + DumpUnitMeta header_meta; + header_meta.reset(); + std::string meta_string; + IOStatus io_s = ReadDumpUnitMeta(&meta_string, &header_meta); + if (!io_s.ok()) { + return io_s; + } + + io_s = ReadDumpUnit(header_meta.dump_unit_size, data, dump_unit); + if (!io_s.ok()) { + return io_s; + } + uint32_t unit_checksum = crc32c::Value(data->data(), data->size()); + if (unit_checksum != header_meta.dump_unit_checksum) { + return IOStatus::Corruption("Read header unit corrupted!"); + } + return io_s; +} + +// Read the blocks after header is read out +IOStatus CacheDumpedLoaderImpl::ReadCacheBlock(std::string* data, + DumpUnit* dump_unit) { + // According to the write process, we read the dump_unit_metadata first + DumpUnitMeta unit_meta; + unit_meta.reset(); + std::string unit_string; + IOStatus io_s = ReadDumpUnitMeta(&unit_string, &unit_meta); + if (!io_s.ok()) { + return io_s; + } + + // Based on the information in the dump_unit_metadata, we read the dump_unit + // and verify if its content is correct. + io_s = ReadDumpUnit(unit_meta.dump_unit_size, data, dump_unit); + if (!io_s.ok()) { + return io_s; + } + uint32_t unit_checksum = crc32c::Value(data->data(), data->size()); + if (unit_checksum != unit_meta.dump_unit_checksum) { + return IOStatus::Corruption( + "Checksum does not match! 
Read dumped unit corrupted!"); + } + return io_s; +} + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/cache_dump_load_impl.h b/src/rocksdb/utilities/cache_dump_load_impl.h new file mode 100644 index 000000000..9ca1ff45a --- /dev/null +++ b/src/rocksdb/utilities/cache_dump_load_impl.h @@ -0,0 +1,359 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include + +#include "file/random_access_file_reader.h" +#include "file/writable_file_writer.h" +#include "rocksdb/utilities/cache_dump_load.h" +#include "table/block_based/block.h" +#include "table/block_based/block_like_traits.h" +#include "table/block_based/block_type.h" +#include "table/block_based/cachable_entry.h" +#include "table/block_based/parsed_full_filter_block.h" +#include "table/block_based/reader_common.h" + +namespace ROCKSDB_NAMESPACE { + +// the read buffer size of for the default CacheDumpReader +const unsigned int kDumpReaderBufferSize = 1024; // 1KB +static const unsigned int kSizePrefixLen = 4; + +enum CacheDumpUnitType : unsigned char { + kHeader = 1, + kFooter = 2, + kData = 3, + kFilter = 4, + kProperties = 5, + kCompressionDictionary = 6, + kRangeDeletion = 7, + kHashIndexPrefixes = 8, + kHashIndexMetadata = 9, + kMetaIndex = 10, + kIndex = 11, + kDeprecatedFilterBlock = 12, // OBSOLETE / DEPRECATED + kFilterMetaBlock = 13, + kBlockTypeMax, +}; + +// The metadata of a dump unit. After it is serilized, its size is fixed 16 +// bytes. +struct DumpUnitMeta { + // sequence number is a monotonically increasing number to indicate the order + // of the blocks being written. Header is 0. + uint32_t sequence_num; + // The Crc32c checksum of its dump unit. + uint32_t dump_unit_checksum; + // The dump unit size after the dump unit is serilized to a string. + uint64_t dump_unit_size; + + void reset() { + sequence_num = 0; + dump_unit_checksum = 0; + dump_unit_size = 0; + } +}; + +// The data structure to hold a block and its information. +struct DumpUnit { + // The timestamp when the block is identified, copied, and dumped from block + // cache + uint64_t timestamp; + // The type of the block + CacheDumpUnitType type; + // The key of this block when the block is referenced by this Cache + Slice key; + // The block size + size_t value_len; + // The Crc32c checksum of the block + uint32_t value_checksum; + // Pointer to the block. Note that, in the dump process, it points to a memory + // buffer copied from cache block. The buffer is freed when we process the + // next block. In the load process, we use an std::string to store the + // serialized dump_unit read from the reader. So it points to the memory + // address of the begin of the block in this string. 
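+  // In both cases the DumpUnit does not own the pointed-to memory.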
+ void* value; + + DumpUnit() { reset(); } + + void reset() { + timestamp = 0; + type = CacheDumpUnitType::kBlockTypeMax; + key.clear(); + value_len = 0; + value_checksum = 0; + value = nullptr; + } +}; + +// The default implementation of the Cache Dumper +class CacheDumperImpl : public CacheDumper { + public: + CacheDumperImpl(const CacheDumpOptions& dump_options, + const std::shared_ptr& cache, + std::unique_ptr&& writer) + : options_(dump_options), cache_(cache), writer_(std::move(writer)) {} + ~CacheDumperImpl() { writer_.reset(); } + Status SetDumpFilter(std::vector db_list) override; + IOStatus DumpCacheEntriesToWriter() override; + + private: + IOStatus WriteBlock(CacheDumpUnitType type, const Slice& key, + const Slice& value); + IOStatus WriteHeader(); + IOStatus WriteFooter(); + bool ShouldFilterOut(const Slice& key); + std::function + DumpOneBlockCallBack(); + + CacheDumpOptions options_; + std::shared_ptr cache_; + std::unique_ptr writer_; + UnorderedMap role_map_; + SystemClock* clock_; + uint32_t sequence_num_; + // The cache key prefix filter. Currently, we use db_session_id as the prefix, + // so using std::set to store the prefixes as filter is enough. Further + // improvement can be applied like BloomFilter or others to speedup the + // filtering. + std::set prefix_filter_; +}; + +// The default implementation of CacheDumpedLoader +class CacheDumpedLoaderImpl : public CacheDumpedLoader { + public: + CacheDumpedLoaderImpl(const CacheDumpOptions& dump_options, + const BlockBasedTableOptions& /*toptions*/, + const std::shared_ptr& secondary_cache, + std::unique_ptr&& reader) + : options_(dump_options), + secondary_cache_(secondary_cache), + reader_(std::move(reader)) {} + ~CacheDumpedLoaderImpl() {} + IOStatus RestoreCacheEntriesToSecondaryCache() override; + + private: + IOStatus ReadDumpUnitMeta(std::string* data, DumpUnitMeta* unit_meta); + IOStatus ReadDumpUnit(size_t len, std::string* data, DumpUnit* unit); + IOStatus ReadHeader(std::string* data, DumpUnit* dump_unit); + IOStatus ReadCacheBlock(std::string* data, DumpUnit* dump_unit); + + CacheDumpOptions options_; + std::shared_ptr secondary_cache_; + std::unique_ptr reader_; + UnorderedMap role_map_; +}; + +// The default implementation of CacheDumpWriter. We write the blocks to a file +// sequentially. +class ToFileCacheDumpWriter : public CacheDumpWriter { + public: + explicit ToFileCacheDumpWriter( + std::unique_ptr&& file_writer) + : file_writer_(std::move(file_writer)) {} + + ~ToFileCacheDumpWriter() { Close().PermitUncheckedError(); } + + // Write the serialized metadata to the file + virtual IOStatus WriteMetadata(const Slice& metadata) override { + assert(file_writer_ != nullptr); + std::string prefix; + PutFixed32(&prefix, static_cast(metadata.size())); + IOStatus io_s = file_writer_->Append(Slice(prefix)); + if (!io_s.ok()) { + return io_s; + } + io_s = file_writer_->Append(metadata); + return io_s; + } + + // Write the serialized data to the file + virtual IOStatus WritePacket(const Slice& data) override { + assert(file_writer_ != nullptr); + std::string prefix; + PutFixed32(&prefix, static_cast(data.size())); + IOStatus io_s = file_writer_->Append(Slice(prefix)); + if (!io_s.ok()) { + return io_s; + } + io_s = file_writer_->Append(data); + return io_s; + } + + // Reset the writer + virtual IOStatus Close() override { + file_writer_.reset(); + return IOStatus::OK(); + } + + private: + std::unique_ptr file_writer_; +}; + +// The default implementation of CacheDumpReader. 
It is implemented based on +// RandomAccessFileReader. Note that, we keep an internal variable to remember +// the current offset. +class FromFileCacheDumpReader : public CacheDumpReader { + public: + explicit FromFileCacheDumpReader( + std::unique_ptr&& reader) + : file_reader_(std::move(reader)), + offset_(0), + buffer_(new char[kDumpReaderBufferSize]) {} + + ~FromFileCacheDumpReader() { delete[] buffer_; } + + virtual IOStatus ReadMetadata(std::string* metadata) override { + uint32_t metadata_len = 0; + IOStatus io_s = ReadSizePrefix(&metadata_len); + if (!io_s.ok()) { + return io_s; + } + return Read(metadata_len, metadata); + } + + virtual IOStatus ReadPacket(std::string* data) override { + uint32_t data_len = 0; + IOStatus io_s = ReadSizePrefix(&data_len); + if (!io_s.ok()) { + return io_s; + } + return Read(data_len, data); + } + + private: + IOStatus ReadSizePrefix(uint32_t* len) { + std::string prefix; + IOStatus io_s = Read(kSizePrefixLen, &prefix); + if (!io_s.ok()) { + return io_s; + } + Slice encoded_slice(prefix); + if (!GetFixed32(&encoded_slice, len)) { + return IOStatus::Corruption("Decode size prefix string failed"); + } + return IOStatus::OK(); + } + + IOStatus Read(size_t len, std::string* data) { + assert(file_reader_ != nullptr); + IOStatus io_s; + + unsigned int bytes_to_read = static_cast(len); + unsigned int to_read = bytes_to_read > kDumpReaderBufferSize + ? kDumpReaderBufferSize + : bytes_to_read; + + while (to_read > 0) { + io_s = file_reader_->Read(IOOptions(), offset_, to_read, &result_, + buffer_, nullptr, + Env::IO_TOTAL /* rate_limiter_priority */); + if (!io_s.ok()) { + return io_s; + } + if (result_.size() < to_read) { + return IOStatus::Corruption("Corrupted cache dump file."); + } + data->append(result_.data(), result_.size()); + + offset_ += to_read; + bytes_to_read -= to_read; + to_read = bytes_to_read > kDumpReaderBufferSize ? kDumpReaderBufferSize + : bytes_to_read; + } + return io_s; + } + std::unique_ptr file_reader_; + Slice result_; + size_t offset_; + char* buffer_; +}; + +// The cache dump and load helper class +class CacheDumperHelper { + public: + // serialize the dump_unit_meta to a string, it is fixed 16 bytes size. + static void EncodeDumpUnitMeta(const DumpUnitMeta& meta, std::string* data) { + assert(data); + PutFixed32(data, static_cast(meta.sequence_num)); + PutFixed32(data, static_cast(meta.dump_unit_checksum)); + PutFixed64(data, meta.dump_unit_size); + } + + // Serialize the dump_unit to a string. 
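+  // Layout: [8B timestamp][1B type][varint-prefixed key][4B value_len]
+  //         [4B value_checksum][varint-prefixed value].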
+ static void EncodeDumpUnit(const DumpUnit& dump_unit, std::string* data) { + assert(data); + PutFixed64(data, dump_unit.timestamp); + data->push_back(dump_unit.type); + PutLengthPrefixedSlice(data, dump_unit.key); + PutFixed32(data, static_cast(dump_unit.value_len)); + PutFixed32(data, dump_unit.value_checksum); + PutLengthPrefixedSlice(data, + Slice((char*)dump_unit.value, dump_unit.value_len)); + } + + // Deserialize the dump_unit_meta from a string + static Status DecodeDumpUnitMeta(const std::string& encoded_data, + DumpUnitMeta* unit_meta) { + assert(unit_meta != nullptr); + Slice encoded_slice = Slice(encoded_data); + if (!GetFixed32(&encoded_slice, &(unit_meta->sequence_num))) { + return Status::Incomplete("Decode dumped unit meta sequence_num failed"); + } + if (!GetFixed32(&encoded_slice, &(unit_meta->dump_unit_checksum))) { + return Status::Incomplete( + "Decode dumped unit meta dump_unit_checksum failed"); + } + if (!GetFixed64(&encoded_slice, &(unit_meta->dump_unit_size))) { + return Status::Incomplete( + "Decode dumped unit meta dump_unit_size failed"); + } + return Status::OK(); + } + + // Deserialize the dump_unit from a string. + static Status DecodeDumpUnit(const std::string& encoded_data, + DumpUnit* dump_unit) { + assert(dump_unit != nullptr); + Slice encoded_slice = Slice(encoded_data); + + // Decode timestamp + if (!GetFixed64(&encoded_slice, &dump_unit->timestamp)) { + return Status::Incomplete("Decode dumped unit string failed"); + } + // Decode the block type + dump_unit->type = static_cast(encoded_slice[0]); + encoded_slice.remove_prefix(1); + // Decode the key + if (!GetLengthPrefixedSlice(&encoded_slice, &(dump_unit->key))) { + return Status::Incomplete("Decode dumped unit string failed"); + } + // Decode the value size + uint32_t value_len; + if (!GetFixed32(&encoded_slice, &value_len)) { + return Status::Incomplete("Decode dumped unit string failed"); + } + dump_unit->value_len = static_cast(value_len); + // Decode the value checksum + if (!GetFixed32(&encoded_slice, &(dump_unit->value_checksum))) { + return Status::Incomplete("Decode dumped unit string failed"); + } + // Decode the block content and copy to the memory space whose pointer + // will be managed by the cache finally. + Slice block; + if (!GetLengthPrefixedSlice(&encoded_slice, &block)) { + return Status::Incomplete("Decode dumped unit string failed"); + } + dump_unit->value = (void*)block.data(); + assert(block.size() == dump_unit->value_len); + return Status::OK(); + } +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/cassandra/cassandra_compaction_filter.cc b/src/rocksdb/utilities/cassandra/cassandra_compaction_filter.cc new file mode 100644 index 000000000..4e48d63aa --- /dev/null +++ b/src/rocksdb/utilities/cassandra/cassandra_compaction_filter.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
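+//
+// Typical wiring (a sketch, not part of this file): the filter factory is
+// meant to be installed together with CassandraValueMergeOperator on the
+// same column family, along the lines of
+//
+//   Options options;
+//   options.merge_operator.reset(
+//       new cassandra::CassandraValueMergeOperator(
+//           /*gc_grace_period_in_seconds=*/86400));
+//   options.compaction_filter_factory.reset(
+//       new cassandra::CassandraCompactionFilterFactory(
+//           /*purge_ttl_on_expiration=*/false,
+//           /*gc_grace_period_in_seconds=*/86400));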
+ +#include "utilities/cassandra/cassandra_compaction_filter.h" + +#include + +#include "rocksdb/slice.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" +#include "utilities/cassandra/format.h" +#include "utilities/cassandra/merge_operator.h" + +namespace ROCKSDB_NAMESPACE { +namespace cassandra { +static std::unordered_map + cassandra_filter_type_info = { +#ifndef ROCKSDB_LITE + {"purge_ttl_on_expiration", + {offsetof(struct CassandraOptions, purge_ttl_on_expiration), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"gc_grace_period_in_seconds", + {offsetof(struct CassandraOptions, gc_grace_period_in_seconds), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +#endif // ROCKSDB_LITE +}; + +CassandraCompactionFilter::CassandraCompactionFilter( + bool purge_ttl_on_expiration, int32_t gc_grace_period_in_seconds) + : options_(gc_grace_period_in_seconds, 0, purge_ttl_on_expiration) { + RegisterOptions(&options_, &cassandra_filter_type_info); +} + +CompactionFilter::Decision CassandraCompactionFilter::FilterV2( + int /*level*/, const Slice& /*key*/, ValueType value_type, + const Slice& existing_value, std::string* new_value, + std::string* /*skip_until*/) const { + bool value_changed = false; + RowValue row_value = + RowValue::Deserialize(existing_value.data(), existing_value.size()); + RowValue compacted = + options_.purge_ttl_on_expiration + ? row_value.RemoveExpiredColumns(&value_changed) + : row_value.ConvertExpiredColumnsToTombstones(&value_changed); + + if (value_type == ValueType::kValue) { + compacted = compacted.RemoveTombstones(options_.gc_grace_period_in_seconds); + } + + if (compacted.Empty()) { + return Decision::kRemove; + } + + if (value_changed) { + compacted.Serialize(new_value); + return Decision::kChangeValue; + } + + return Decision::kKeep; +} + +CassandraCompactionFilterFactory::CassandraCompactionFilterFactory( + bool purge_ttl_on_expiration, int32_t gc_grace_period_in_seconds) + : options_(gc_grace_period_in_seconds, 0, purge_ttl_on_expiration) { + RegisterOptions(&options_, &cassandra_filter_type_info); +} + +std::unique_ptr +CassandraCompactionFilterFactory::CreateCompactionFilter( + const CompactionFilter::Context&) { + std::unique_ptr result(new CassandraCompactionFilter( + options_.purge_ttl_on_expiration, options_.gc_grace_period_in_seconds)); + return result; +} + +#ifndef ROCKSDB_LITE +int RegisterCassandraObjects(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory( + CassandraValueMergeOperator::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new CassandraValueMergeOperator(0)); + return guard->get(); + }); + library.AddFactory( + CassandraCompactionFilter::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* /*guard */, + std::string* /* errmsg */) { + return new CassandraCompactionFilter(false, 0); + }); + library.AddFactory( + CassandraCompactionFilterFactory::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new CassandraCompactionFilterFactory(false, 0)); + return guard->get(); + }); + size_t num_types; + return static_cast(library.GetFactoryCount(&num_types)); +} +#endif // ROCKSDB_LITE +} // namespace cassandra +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/cassandra/cassandra_compaction_filter.h 
b/src/rocksdb/utilities/cassandra/cassandra_compaction_filter.h new file mode 100644 index 000000000..0325a4c39 --- /dev/null +++ b/src/rocksdb/utilities/cassandra/cassandra_compaction_filter.h @@ -0,0 +1,57 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#include + +#include "rocksdb/compaction_filter.h" +#include "rocksdb/slice.h" +#include "utilities/cassandra/cassandra_options.h" + +namespace ROCKSDB_NAMESPACE { +namespace cassandra { + +/** + * Compaction filter for removing expired Cassandra data with ttl. + * If option `purge_ttl_on_expiration` is set to true, expired data + * will be directly purged. Otherwise expired data will be converted + * tombstones first, then be eventally removed after gc grace period. + * `purge_ttl_on_expiration` should only be on in the case all the + * writes have same ttl setting, otherwise it could bring old data back. + * + * Compaction filter is also in charge of removing tombstone that has been + * promoted to kValue type after serials of merging in compaction. + */ +class CassandraCompactionFilter : public CompactionFilter { + public: + explicit CassandraCompactionFilter(bool purge_ttl_on_expiration, + int32_t gc_grace_period_in_seconds); + static const char* kClassName() { return "CassandraCompactionFilter"; } + const char* Name() const override { return kClassName(); } + + virtual Decision FilterV2(int level, const Slice& key, ValueType value_type, + const Slice& existing_value, std::string* new_value, + std::string* skip_until) const override; + + private: + CassandraOptions options_; +}; + +class CassandraCompactionFilterFactory : public CompactionFilterFactory { + public: + explicit CassandraCompactionFilterFactory(bool purge_ttl_on_expiration, + int32_t gc_grace_period_in_seconds); + ~CassandraCompactionFilterFactory() override {} + + std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) override; + static const char* kClassName() { return "CassandraCompactionFilterFactory"; } + const char* Name() const override { return kClassName(); } + + private: + CassandraOptions options_; +}; +} // namespace cassandra +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/cassandra/cassandra_format_test.cc b/src/rocksdb/utilities/cassandra/cassandra_format_test.cc new file mode 100644 index 000000000..4f12947ad --- /dev/null +++ b/src/rocksdb/utilities/cassandra/cassandra_format_test.cc @@ -0,0 +1,377 @@ +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include +#include + +#include "test_util/testharness.h" +#include "utilities/cassandra/format.h" +#include "utilities/cassandra/serialize.h" +#include "utilities/cassandra/test_utils.h" + +namespace ROCKSDB_NAMESPACE { +namespace cassandra { + +TEST(ColumnTest, Column) { + char data[4] = {'d', 'a', 't', 'a'}; + int8_t mask = 0; + int8_t index = 1; + int64_t timestamp = 1494022807044; + Column c = Column(mask, index, timestamp, sizeof(data), data); + + EXPECT_EQ(c.Index(), index); + EXPECT_EQ(c.Timestamp(), timestamp); + EXPECT_EQ(c.Size(), 14 + sizeof(data)); + + // Verify the serialization. 
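+  // Serialized layout: 1B mask, 1B index, 8B timestamp, 4B value size, then
+  // the value bytes, hence Size() == 14 + sizeof(data).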
+ std::string dest; + dest.reserve(c.Size() * 2); + c.Serialize(&dest); + + EXPECT_EQ(dest.size(), c.Size()); + std::size_t offset = 0; + EXPECT_EQ(Deserialize(dest.c_str(), offset), mask); + offset += sizeof(int8_t); + EXPECT_EQ(Deserialize(dest.c_str(), offset), index); + offset += sizeof(int8_t); + EXPECT_EQ(Deserialize(dest.c_str(), offset), timestamp); + offset += sizeof(int64_t); + EXPECT_EQ(Deserialize(dest.c_str(), offset), sizeof(data)); + offset += sizeof(int32_t); + EXPECT_TRUE(std::memcmp(data, dest.c_str() + offset, sizeof(data)) == 0); + + // Verify the deserialization. + std::string saved_dest = dest; + std::shared_ptr c1 = Column::Deserialize(saved_dest.c_str(), 0); + EXPECT_EQ(c1->Index(), index); + EXPECT_EQ(c1->Timestamp(), timestamp); + EXPECT_EQ(c1->Size(), 14 + sizeof(data)); + + c1->Serialize(&dest); + EXPECT_EQ(dest.size(), 2 * c.Size()); + EXPECT_TRUE(std::memcmp(dest.c_str(), dest.c_str() + c.Size(), c.Size()) == + 0); + + // Verify the ColumnBase::Deserialization. + saved_dest = dest; + std::shared_ptr c2 = + ColumnBase::Deserialize(saved_dest.c_str(), c.Size()); + c2->Serialize(&dest); + EXPECT_EQ(dest.size(), 3 * c.Size()); + EXPECT_TRUE(std::memcmp(dest.c_str() + c.Size(), dest.c_str() + c.Size() * 2, + c.Size()) == 0); +} + +TEST(ExpiringColumnTest, ExpiringColumn) { + char data[4] = {'d', 'a', 't', 'a'}; + int8_t mask = ColumnTypeMask::EXPIRATION_MASK; + int8_t index = 3; + int64_t timestamp = 1494022807044; + int32_t ttl = 3600; + ExpiringColumn c = + ExpiringColumn(mask, index, timestamp, sizeof(data), data, ttl); + + EXPECT_EQ(c.Index(), index); + EXPECT_EQ(c.Timestamp(), timestamp); + EXPECT_EQ(c.Size(), 18 + sizeof(data)); + + // Verify the serialization. + std::string dest; + dest.reserve(c.Size() * 2); + c.Serialize(&dest); + + EXPECT_EQ(dest.size(), c.Size()); + std::size_t offset = 0; + EXPECT_EQ(Deserialize(dest.c_str(), offset), mask); + offset += sizeof(int8_t); + EXPECT_EQ(Deserialize(dest.c_str(), offset), index); + offset += sizeof(int8_t); + EXPECT_EQ(Deserialize(dest.c_str(), offset), timestamp); + offset += sizeof(int64_t); + EXPECT_EQ(Deserialize(dest.c_str(), offset), sizeof(data)); + offset += sizeof(int32_t); + EXPECT_TRUE(std::memcmp(data, dest.c_str() + offset, sizeof(data)) == 0); + offset += sizeof(data); + EXPECT_EQ(Deserialize(dest.c_str(), offset), ttl); + + // Verify the deserialization. + std::string saved_dest = dest; + std::shared_ptr c1 = + ExpiringColumn::Deserialize(saved_dest.c_str(), 0); + EXPECT_EQ(c1->Index(), index); + EXPECT_EQ(c1->Timestamp(), timestamp); + EXPECT_EQ(c1->Size(), 18 + sizeof(data)); + + c1->Serialize(&dest); + EXPECT_EQ(dest.size(), 2 * c.Size()); + EXPECT_TRUE(std::memcmp(dest.c_str(), dest.c_str() + c.Size(), c.Size()) == + 0); + + // Verify the ColumnBase::Deserialization. 
+ saved_dest = dest; + std::shared_ptr c2 = + ColumnBase::Deserialize(saved_dest.c_str(), c.Size()); + c2->Serialize(&dest); + EXPECT_EQ(dest.size(), 3 * c.Size()); + EXPECT_TRUE(std::memcmp(dest.c_str() + c.Size(), dest.c_str() + c.Size() * 2, + c.Size()) == 0); +} + +TEST(TombstoneTest, TombstoneCollectable) { + int32_t now = (int32_t)time(nullptr); + int32_t gc_grace_seconds = 16440; + int32_t time_delta_seconds = 10; + EXPECT_TRUE( + Tombstone(ColumnTypeMask::DELETION_MASK, 0, + now - gc_grace_seconds - time_delta_seconds, + ToMicroSeconds(now - gc_grace_seconds - time_delta_seconds)) + .Collectable(gc_grace_seconds)); + EXPECT_FALSE( + Tombstone(ColumnTypeMask::DELETION_MASK, 0, + now - gc_grace_seconds + time_delta_seconds, + ToMicroSeconds(now - gc_grace_seconds + time_delta_seconds)) + .Collectable(gc_grace_seconds)); +} + +TEST(TombstoneTest, Tombstone) { + int8_t mask = ColumnTypeMask::DELETION_MASK; + int8_t index = 2; + int32_t local_deletion_time = 1494022807; + int64_t marked_for_delete_at = 1494022807044; + Tombstone c = + Tombstone(mask, index, local_deletion_time, marked_for_delete_at); + + EXPECT_EQ(c.Index(), index); + EXPECT_EQ(c.Timestamp(), marked_for_delete_at); + EXPECT_EQ(c.Size(), 14); + + // Verify the serialization. + std::string dest; + dest.reserve(c.Size() * 2); + c.Serialize(&dest); + + EXPECT_EQ(dest.size(), c.Size()); + std::size_t offset = 0; + EXPECT_EQ(Deserialize(dest.c_str(), offset), mask); + offset += sizeof(int8_t); + EXPECT_EQ(Deserialize(dest.c_str(), offset), index); + offset += sizeof(int8_t); + EXPECT_EQ(Deserialize(dest.c_str(), offset), local_deletion_time); + offset += sizeof(int32_t); + EXPECT_EQ(Deserialize(dest.c_str(), offset), marked_for_delete_at); + + // Verify the deserialization. + std::shared_ptr c1 = Tombstone::Deserialize(dest.c_str(), 0); + EXPECT_EQ(c1->Index(), index); + EXPECT_EQ(c1->Timestamp(), marked_for_delete_at); + EXPECT_EQ(c1->Size(), 14); + + c1->Serialize(&dest); + EXPECT_EQ(dest.size(), 2 * c.Size()); + EXPECT_TRUE(std::memcmp(dest.c_str(), dest.c_str() + c.Size(), c.Size()) == + 0); + + // Verify the ColumnBase::Deserialization. + std::shared_ptr c2 = + ColumnBase::Deserialize(dest.c_str(), c.Size()); + c2->Serialize(&dest); + EXPECT_EQ(dest.size(), 3 * c.Size()); + EXPECT_TRUE(std::memcmp(dest.c_str() + c.Size(), dest.c_str() + c.Size() * 2, + c.Size()) == 0); +} + +class RowValueTest : public testing::Test {}; + +TEST(RowValueTest, RowTombstone) { + int32_t local_deletion_time = 1494022807; + int64_t marked_for_delete_at = 1494022807044; + RowValue r = RowValue(local_deletion_time, marked_for_delete_at); + + EXPECT_EQ(r.Size(), 12); + EXPECT_EQ(r.IsTombstone(), true); + EXPECT_EQ(r.LastModifiedTime(), marked_for_delete_at); + + // Verify the serialization. + std::string dest; + dest.reserve(r.Size() * 2); + r.Serialize(&dest); + + EXPECT_EQ(dest.size(), r.Size()); + std::size_t offset = 0; + EXPECT_EQ(Deserialize(dest.c_str(), offset), local_deletion_time); + offset += sizeof(int32_t); + EXPECT_EQ(Deserialize(dest.c_str(), offset), marked_for_delete_at); + + // Verify the deserialization. 
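// A row tombstone carries no columns: its serialized form is only the 4-byte
// local_deletion_time followed by the 8-byte marked_for_delete_at, i.e. the
// 12 bytes asserted for Size() above.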
+ RowValue r1 = RowValue::Deserialize(dest.c_str(), r.Size()); + EXPECT_EQ(r1.Size(), 12); + EXPECT_EQ(r1.IsTombstone(), true); + EXPECT_EQ(r1.LastModifiedTime(), marked_for_delete_at); + + r1.Serialize(&dest); + EXPECT_EQ(dest.size(), 2 * r.Size()); + EXPECT_TRUE(std::memcmp(dest.c_str(), dest.c_str() + r.Size(), r.Size()) == + 0); +} + +TEST(RowValueTest, RowWithColumns) { + std::vector> columns; + int64_t last_modified_time = 1494022807048; + std::size_t columns_data_size = 0; + + char e_data[5] = {'e', 'd', 'a', 't', 'a'}; + int8_t e_index = 0; + int64_t e_timestamp = 1494022807044; + int32_t e_ttl = 3600; + columns.push_back(std::shared_ptr( + new ExpiringColumn(ColumnTypeMask::EXPIRATION_MASK, e_index, e_timestamp, + sizeof(e_data), e_data, e_ttl))); + columns_data_size += columns[0]->Size(); + + char c_data[4] = {'d', 'a', 't', 'a'}; + int8_t c_index = 1; + int64_t c_timestamp = 1494022807048; + columns.push_back(std::shared_ptr( + new Column(0, c_index, c_timestamp, sizeof(c_data), c_data))); + columns_data_size += columns[1]->Size(); + + int8_t t_index = 2; + int32_t t_local_deletion_time = 1494022801; + int64_t t_marked_for_delete_at = 1494022807043; + columns.push_back(std::shared_ptr( + new Tombstone(ColumnTypeMask::DELETION_MASK, t_index, + t_local_deletion_time, t_marked_for_delete_at))); + columns_data_size += columns[2]->Size(); + + RowValue r = RowValue(std::move(columns), last_modified_time); + + EXPECT_EQ(r.Size(), columns_data_size + 12); + EXPECT_EQ(r.IsTombstone(), false); + EXPECT_EQ(r.LastModifiedTime(), last_modified_time); + + // Verify the serialization. + std::string dest; + dest.reserve(r.Size() * 2); + r.Serialize(&dest); + + EXPECT_EQ(dest.size(), r.Size()); + std::size_t offset = 0; + EXPECT_EQ(Deserialize(dest.c_str(), offset), + std::numeric_limits::max()); + offset += sizeof(int32_t); + EXPECT_EQ(Deserialize(dest.c_str(), offset), + std::numeric_limits::min()); + offset += sizeof(int64_t); + + // Column0: ExpiringColumn + EXPECT_EQ(Deserialize(dest.c_str(), offset), + ColumnTypeMask::EXPIRATION_MASK); + offset += sizeof(int8_t); + EXPECT_EQ(Deserialize(dest.c_str(), offset), e_index); + offset += sizeof(int8_t); + EXPECT_EQ(Deserialize(dest.c_str(), offset), e_timestamp); + offset += sizeof(int64_t); + EXPECT_EQ(Deserialize(dest.c_str(), offset), sizeof(e_data)); + offset += sizeof(int32_t); + EXPECT_TRUE(std::memcmp(e_data, dest.c_str() + offset, sizeof(e_data)) == 0); + offset += sizeof(e_data); + EXPECT_EQ(Deserialize(dest.c_str(), offset), e_ttl); + offset += sizeof(int32_t); + + // Column1: Column + EXPECT_EQ(Deserialize(dest.c_str(), offset), 0); + offset += sizeof(int8_t); + EXPECT_EQ(Deserialize(dest.c_str(), offset), c_index); + offset += sizeof(int8_t); + EXPECT_EQ(Deserialize(dest.c_str(), offset), c_timestamp); + offset += sizeof(int64_t); + EXPECT_EQ(Deserialize(dest.c_str(), offset), sizeof(c_data)); + offset += sizeof(int32_t); + EXPECT_TRUE(std::memcmp(c_data, dest.c_str() + offset, sizeof(c_data)) == 0); + offset += sizeof(c_data); + + // Column2: Tombstone + EXPECT_EQ(Deserialize(dest.c_str(), offset), + ColumnTypeMask::DELETION_MASK); + offset += sizeof(int8_t); + EXPECT_EQ(Deserialize(dest.c_str(), offset), t_index); + offset += sizeof(int8_t); + EXPECT_EQ(Deserialize(dest.c_str(), offset), t_local_deletion_time); + offset += sizeof(int32_t); + EXPECT_EQ(Deserialize(dest.c_str(), offset), t_marked_for_delete_at); + + // Verify the deserialization. 
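// For a non-tombstone row the two deletion fields are written as sentinels
// (INT32_MAX and INT64_MIN, as the offset checks above confirmed), followed
// by each column serialized in index order; the total size is therefore the
// 12-byte row header plus the sum of the column sizes.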
+ RowValue r1 = RowValue::Deserialize(dest.c_str(), r.Size()); + EXPECT_EQ(r1.Size(), columns_data_size + 12); + EXPECT_EQ(r1.IsTombstone(), false); + EXPECT_EQ(r1.LastModifiedTime(), last_modified_time); + + r1.Serialize(&dest); + EXPECT_EQ(dest.size(), 2 * r.Size()); + EXPECT_TRUE(std::memcmp(dest.c_str(), dest.c_str() + r.Size(), r.Size()) == + 0); +} + +TEST(RowValueTest, PurgeTtlShouldRemvoeAllColumnsExpired) { + int64_t now = time(nullptr); + + auto row_value = CreateTestRowValue( + {CreateTestColumnSpec(kColumn, 0, ToMicroSeconds(now)), + CreateTestColumnSpec(kExpiringColumn, 1, + ToMicroSeconds(now - kTtl - 10)), // expired + CreateTestColumnSpec(kExpiringColumn, 2, + ToMicroSeconds(now)), // not expired + CreateTestColumnSpec(kTombstone, 3, ToMicroSeconds(now))}); + + bool changed = false; + auto purged = row_value.RemoveExpiredColumns(&changed); + EXPECT_TRUE(changed); + EXPECT_EQ(purged.get_columns().size(), 3); + VerifyRowValueColumns(purged.get_columns(), 0, kColumn, 0, + ToMicroSeconds(now)); + VerifyRowValueColumns(purged.get_columns(), 1, kExpiringColumn, 2, + ToMicroSeconds(now)); + VerifyRowValueColumns(purged.get_columns(), 2, kTombstone, 3, + ToMicroSeconds(now)); + + purged.RemoveExpiredColumns(&changed); + EXPECT_FALSE(changed); +} + +TEST(RowValueTest, ExpireTtlShouldConvertExpiredColumnsToTombstones) { + int64_t now = time(nullptr); + + auto row_value = CreateTestRowValue( + {CreateTestColumnSpec(kColumn, 0, ToMicroSeconds(now)), + CreateTestColumnSpec(kExpiringColumn, 1, + ToMicroSeconds(now - kTtl - 10)), // expired + CreateTestColumnSpec(kExpiringColumn, 2, + ToMicroSeconds(now)), // not expired + CreateTestColumnSpec(kTombstone, 3, ToMicroSeconds(now))}); + + bool changed = false; + auto compacted = row_value.ConvertExpiredColumnsToTombstones(&changed); + EXPECT_TRUE(changed); + EXPECT_EQ(compacted.get_columns().size(), 4); + VerifyRowValueColumns(compacted.get_columns(), 0, kColumn, 0, + ToMicroSeconds(now)); + VerifyRowValueColumns(compacted.get_columns(), 1, kTombstone, 1, + ToMicroSeconds(now - 10)); + VerifyRowValueColumns(compacted.get_columns(), 2, kExpiringColumn, 2, + ToMicroSeconds(now)); + VerifyRowValueColumns(compacted.get_columns(), 3, kTombstone, 3, + ToMicroSeconds(now)); + + compacted.ConvertExpiredColumnsToTombstones(&changed); + EXPECT_FALSE(changed); +} +} // namespace cassandra +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/utilities/cassandra/cassandra_functional_test.cc b/src/rocksdb/utilities/cassandra/cassandra_functional_test.cc new file mode 100644 index 000000000..c5be836e8 --- /dev/null +++ b/src/rocksdb/utilities/cassandra/cassandra_functional_test.cc @@ -0,0 +1,446 @@ +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
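// These functional tests wire the Cassandra pieces together the way an
// application would: the DB is opened with CassandraValueMergeOperator as the
// merge operator and a compaction filter factory that produces
// CassandraCompactionFilter (see OpenDb() below). A minimal sketch of that
// setup, with an illustrative database path:
//
//   Options options;
//   options.create_if_missing = true;
//   options.merge_operator.reset(new CassandraValueMergeOperator(
//       /*gc_grace_period_in_seconds=*/100));
//   options.compaction_filter_factory.reset(
//       new CassandraCompactionFilterFactory(
//           /*purge_ttl_on_expiration=*/false,
//           /*gc_grace_period_in_seconds=*/100));
//   DB* db = nullptr;
//   Status s = DB::Open(options, "/tmp/cassandra_demo", &db);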
+ +#include + +#include "db/db_impl/db_impl.h" +#include "rocksdb/convenience.h" +#include "rocksdb/db.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/utilities/object_registry.h" +#include "test_util/testharness.h" +#include "util/cast_util.h" +#include "util/random.h" +#include "utilities/cassandra/cassandra_compaction_filter.h" +#include "utilities/cassandra/merge_operator.h" +#include "utilities/cassandra/test_utils.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { +namespace cassandra { + +// Path to the database on file system +const std::string kDbName = test::PerThreadDBPath("cassandra_functional_test"); + +class CassandraStore { + public: + explicit CassandraStore(std::shared_ptr db) + : db_(db), write_option_(), get_option_() { + assert(db); + } + + bool Append(const std::string& key, const RowValue& val) { + std::string result; + val.Serialize(&result); + Slice valSlice(result.data(), result.size()); + auto s = db_->Merge(write_option_, key, valSlice); + + if (s.ok()) { + return true; + } else { + std::cerr << "ERROR " << s.ToString() << std::endl; + return false; + } + } + + bool Put(const std::string& key, const RowValue& val) { + std::string result; + val.Serialize(&result); + Slice valSlice(result.data(), result.size()); + auto s = db_->Put(write_option_, key, valSlice); + if (s.ok()) { + return true; + } else { + std::cerr << "ERROR " << s.ToString() << std::endl; + return false; + } + } + + Status Flush() { + Status s = dbfull()->TEST_FlushMemTable(); + if (s.ok()) { + s = dbfull()->TEST_WaitForCompact(); + } + return s; + } + + Status Compact() { + return dbfull()->TEST_CompactRange(0, nullptr, nullptr, + db_->DefaultColumnFamily()); + } + + std::tuple Get(const std::string& key) { + std::string result; + auto s = db_->Get(get_option_, key, &result); + + if (s.ok()) { + return std::make_tuple( + true, RowValue::Deserialize(result.data(), result.size())); + } + + if (!s.IsNotFound()) { + std::cerr << "ERROR " << s.ToString() << std::endl; + } + + return std::make_tuple(false, RowValue(0, 0)); + } + + private: + std::shared_ptr db_; + WriteOptions write_option_; + ReadOptions get_option_; + + DBImpl* dbfull() { return static_cast_with_check(db_.get()); } +}; + +class TestCompactionFilterFactory : public CompactionFilterFactory { + public: + explicit TestCompactionFilterFactory(bool purge_ttl_on_expiration, + int32_t gc_grace_period_in_seconds) + : purge_ttl_on_expiration_(purge_ttl_on_expiration), + gc_grace_period_in_seconds_(gc_grace_period_in_seconds) {} + + std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& /*context*/) override { + return std::unique_ptr(new CassandraCompactionFilter( + purge_ttl_on_expiration_, gc_grace_period_in_seconds_)); + } + + const char* Name() const override { return "TestCompactionFilterFactory"; } + + private: + bool purge_ttl_on_expiration_; + int32_t gc_grace_period_in_seconds_; +}; + +// The class for unit-testing +class CassandraFunctionalTest : public testing::Test { + public: + CassandraFunctionalTest() { + EXPECT_OK( + DestroyDB(kDbName, Options())); // Start each test with a fresh DB + } + + std::shared_ptr OpenDb() { + DB* db; + Options options; + options.create_if_missing = true; + options.merge_operator.reset( + new CassandraValueMergeOperator(gc_grace_period_in_seconds_)); + auto* cf_factory = new TestCompactionFilterFactory( + purge_ttl_on_expiration_, gc_grace_period_in_seconds_); + options.compaction_filter_factory.reset(cf_factory); + EXPECT_OK(DB::Open(options, 
kDbName, &db)); + return std::shared_ptr(db); + } + + bool purge_ttl_on_expiration_ = false; + int32_t gc_grace_period_in_seconds_ = 100; +}; + +// THE TEST CASES BEGIN HERE + +TEST_F(CassandraFunctionalTest, SimpleMergeTest) { + CassandraStore store(OpenDb()); + int64_t now = time(nullptr); + + store.Append( + "k1", + CreateTestRowValue({ + CreateTestColumnSpec(kTombstone, 0, ToMicroSeconds(now + 5)), + CreateTestColumnSpec(kColumn, 1, ToMicroSeconds(now + 8)), + CreateTestColumnSpec(kExpiringColumn, 2, ToMicroSeconds(now + 5)), + })); + store.Append( + "k1", + CreateTestRowValue({ + CreateTestColumnSpec(kColumn, 0, ToMicroSeconds(now + 2)), + CreateTestColumnSpec(kExpiringColumn, 1, ToMicroSeconds(now + 5)), + CreateTestColumnSpec(kTombstone, 2, ToMicroSeconds(now + 7)), + CreateTestColumnSpec(kExpiringColumn, 7, ToMicroSeconds(now + 17)), + })); + store.Append( + "k1", + CreateTestRowValue({ + CreateTestColumnSpec(kExpiringColumn, 0, ToMicroSeconds(now + 6)), + CreateTestColumnSpec(kTombstone, 1, ToMicroSeconds(now + 5)), + CreateTestColumnSpec(kColumn, 2, ToMicroSeconds(now + 4)), + CreateTestColumnSpec(kTombstone, 11, ToMicroSeconds(now + 11)), + })); + + auto ret = store.Get("k1"); + + ASSERT_TRUE(std::get<0>(ret)); + RowValue& merged = std::get<1>(ret); + EXPECT_EQ(merged.get_columns().size(), 5); + VerifyRowValueColumns(merged.get_columns(), 0, kExpiringColumn, 0, + ToMicroSeconds(now + 6)); + VerifyRowValueColumns(merged.get_columns(), 1, kColumn, 1, + ToMicroSeconds(now + 8)); + VerifyRowValueColumns(merged.get_columns(), 2, kTombstone, 2, + ToMicroSeconds(now + 7)); + VerifyRowValueColumns(merged.get_columns(), 3, kExpiringColumn, 7, + ToMicroSeconds(now + 17)); + VerifyRowValueColumns(merged.get_columns(), 4, kTombstone, 11, + ToMicroSeconds(now + 11)); +} + +constexpr int64_t kTestTimeoutSecs = 600; + +TEST_F(CassandraFunctionalTest, + CompactionShouldConvertExpiredColumnsToTombstone) { + CassandraStore store(OpenDb()); + int64_t now = time(nullptr); + + store.Append( + "k1", + CreateTestRowValue( + {CreateTestColumnSpec(kExpiringColumn, 0, + ToMicroSeconds(now - kTtl - 20)), // expired + CreateTestColumnSpec( + kExpiringColumn, 1, + ToMicroSeconds(now - kTtl + kTestTimeoutSecs)), // not expired + CreateTestColumnSpec(kTombstone, 3, ToMicroSeconds(now))})); + + ASSERT_OK(store.Flush()); + + store.Append( + "k1", + CreateTestRowValue( + {CreateTestColumnSpec(kExpiringColumn, 0, + ToMicroSeconds(now - kTtl - 10)), // expired + CreateTestColumnSpec(kColumn, 2, ToMicroSeconds(now))})); + + ASSERT_OK(store.Flush()); + ASSERT_OK(store.Compact()); + + auto ret = store.Get("k1"); + ASSERT_TRUE(std::get<0>(ret)); + RowValue& merged = std::get<1>(ret); + EXPECT_EQ(merged.get_columns().size(), 4); + VerifyRowValueColumns(merged.get_columns(), 0, kTombstone, 0, + ToMicroSeconds(now - 10)); + VerifyRowValueColumns(merged.get_columns(), 1, kExpiringColumn, 1, + ToMicroSeconds(now - kTtl + kTestTimeoutSecs)); + VerifyRowValueColumns(merged.get_columns(), 2, kColumn, 2, + ToMicroSeconds(now)); + VerifyRowValueColumns(merged.get_columns(), 3, kTombstone, 3, + ToMicroSeconds(now)); +} + +TEST_F(CassandraFunctionalTest, + CompactionShouldPurgeExpiredColumnsIfPurgeTtlIsOn) { + purge_ttl_on_expiration_ = true; + CassandraStore store(OpenDb()); + int64_t now = time(nullptr); + + store.Append( + "k1", + CreateTestRowValue( + {CreateTestColumnSpec(kExpiringColumn, 0, + ToMicroSeconds(now - kTtl - 20)), // expired + CreateTestColumnSpec(kExpiringColumn, 1, + ToMicroSeconds(now)), // not expired + 
CreateTestColumnSpec(kTombstone, 3, ToMicroSeconds(now))})); + + ASSERT_OK(store.Flush()); + + store.Append( + "k1", + CreateTestRowValue( + {CreateTestColumnSpec(kExpiringColumn, 0, + ToMicroSeconds(now - kTtl - 10)), // expired + CreateTestColumnSpec(kColumn, 2, ToMicroSeconds(now))})); + + ASSERT_OK(store.Flush()); + ASSERT_OK(store.Compact()); + + auto ret = store.Get("k1"); + ASSERT_TRUE(std::get<0>(ret)); + RowValue& merged = std::get<1>(ret); + EXPECT_EQ(merged.get_columns().size(), 3); + VerifyRowValueColumns(merged.get_columns(), 0, kExpiringColumn, 1, + ToMicroSeconds(now)); + VerifyRowValueColumns(merged.get_columns(), 1, kColumn, 2, + ToMicroSeconds(now)); + VerifyRowValueColumns(merged.get_columns(), 2, kTombstone, 3, + ToMicroSeconds(now)); +} + +TEST_F(CassandraFunctionalTest, + CompactionShouldRemoveRowWhenAllColumnsExpiredIfPurgeTtlIsOn) { + purge_ttl_on_expiration_ = true; + CassandraStore store(OpenDb()); + int64_t now = time(nullptr); + + store.Append("k1", CreateTestRowValue({ + CreateTestColumnSpec(kExpiringColumn, 0, + ToMicroSeconds(now - kTtl - 20)), + CreateTestColumnSpec(kExpiringColumn, 1, + ToMicroSeconds(now - kTtl - 20)), + })); + + ASSERT_OK(store.Flush()); + + store.Append("k1", CreateTestRowValue({ + CreateTestColumnSpec(kExpiringColumn, 0, + ToMicroSeconds(now - kTtl - 10)), + })); + + ASSERT_OK(store.Flush()); + ASSERT_OK(store.Compact()); + ASSERT_FALSE(std::get<0>(store.Get("k1"))); +} + +TEST_F(CassandraFunctionalTest, + CompactionShouldRemoveTombstoneExceedingGCGracePeriod) { + purge_ttl_on_expiration_ = true; + CassandraStore store(OpenDb()); + int64_t now = time(nullptr); + + store.Append("k1", + CreateTestRowValue( + {CreateTestColumnSpec( + kTombstone, 0, + ToMicroSeconds(now - gc_grace_period_in_seconds_ - 1)), + CreateTestColumnSpec(kColumn, 1, ToMicroSeconds(now))})); + + store.Append("k2", CreateTestRowValue({CreateTestColumnSpec( + kColumn, 0, ToMicroSeconds(now))})); + + ASSERT_OK(store.Flush()); + + store.Append("k1", CreateTestRowValue({ + CreateTestColumnSpec(kColumn, 1, ToMicroSeconds(now)), + })); + + ASSERT_OK(store.Flush()); + ASSERT_OK(store.Compact()); + + auto ret = store.Get("k1"); + ASSERT_TRUE(std::get<0>(ret)); + RowValue& gced = std::get<1>(ret); + EXPECT_EQ(gced.get_columns().size(), 1); + VerifyRowValueColumns(gced.get_columns(), 0, kColumn, 1, ToMicroSeconds(now)); +} + +TEST_F(CassandraFunctionalTest, CompactionShouldRemoveTombstoneFromPut) { + purge_ttl_on_expiration_ = true; + CassandraStore store(OpenDb()); + int64_t now = time(nullptr); + + store.Put("k1", + CreateTestRowValue({ + CreateTestColumnSpec( + kTombstone, 0, + ToMicroSeconds(now - gc_grace_period_in_seconds_ - 1)), + })); + + ASSERT_OK(store.Flush()); + ASSERT_OK(store.Compact()); + ASSERT_FALSE(std::get<0>(store.Get("k1"))); +} + +#ifndef ROCKSDB_LITE +TEST_F(CassandraFunctionalTest, LoadMergeOperator) { + ConfigOptions config_options; + std::shared_ptr mo; + config_options.ignore_unsupported_options = false; + + ASSERT_NOK(MergeOperator::CreateFromString( + config_options, CassandraValueMergeOperator::kClassName(), &mo)); + + config_options.registry->AddLibrary("cassandra", RegisterCassandraObjects, + "cassandra"); + + ASSERT_OK(MergeOperator::CreateFromString( + config_options, CassandraValueMergeOperator::kClassName(), &mo)); + ASSERT_NE(mo, nullptr); + ASSERT_STREQ(mo->Name(), CassandraValueMergeOperator::kClassName()); + mo.reset(); + ASSERT_OK(MergeOperator::CreateFromString( + config_options, + 
std::string("operands_limit=20;gc_grace_period_in_seconds=42;id=") + + CassandraValueMergeOperator::kClassName(), + &mo)); + ASSERT_NE(mo, nullptr); + ASSERT_STREQ(mo->Name(), CassandraValueMergeOperator::kClassName()); + const auto* opts = mo->GetOptions(); + ASSERT_NE(opts, nullptr); + ASSERT_EQ(opts->gc_grace_period_in_seconds, 42); + ASSERT_EQ(opts->operands_limit, 20); +} + +TEST_F(CassandraFunctionalTest, LoadCompactionFilter) { + ConfigOptions config_options; + const CompactionFilter* filter = nullptr; + config_options.ignore_unsupported_options = false; + + ASSERT_NOK(CompactionFilter::CreateFromString( + config_options, CassandraCompactionFilter::kClassName(), &filter)); + config_options.registry->AddLibrary("cassandra", RegisterCassandraObjects, + "cassandra"); + + ASSERT_OK(CompactionFilter::CreateFromString( + config_options, CassandraCompactionFilter::kClassName(), &filter)); + ASSERT_NE(filter, nullptr); + ASSERT_STREQ(filter->Name(), CassandraCompactionFilter::kClassName()); + delete filter; + filter = nullptr; + ASSERT_OK(CompactionFilter::CreateFromString( + config_options, + std::string( + "purge_ttl_on_expiration=true;gc_grace_period_in_seconds=42;id=") + + CassandraCompactionFilter::kClassName(), + &filter)); + ASSERT_NE(filter, nullptr); + ASSERT_STREQ(filter->Name(), CassandraCompactionFilter::kClassName()); + const auto* opts = filter->GetOptions(); + ASSERT_NE(opts, nullptr); + ASSERT_EQ(opts->gc_grace_period_in_seconds, 42); + ASSERT_TRUE(opts->purge_ttl_on_expiration); + delete filter; +} + +TEST_F(CassandraFunctionalTest, LoadCompactionFilterFactory) { + ConfigOptions config_options; + std::shared_ptr factory; + + config_options.ignore_unsupported_options = false; + ASSERT_NOK(CompactionFilterFactory::CreateFromString( + config_options, CassandraCompactionFilterFactory::kClassName(), + &factory)); + config_options.registry->AddLibrary("cassandra", RegisterCassandraObjects, + "cassandra"); + + ASSERT_OK(CompactionFilterFactory::CreateFromString( + config_options, CassandraCompactionFilterFactory::kClassName(), + &factory)); + ASSERT_NE(factory, nullptr); + ASSERT_STREQ(factory->Name(), CassandraCompactionFilterFactory::kClassName()); + factory.reset(); + ASSERT_OK(CompactionFilterFactory::CreateFromString( + config_options, + std::string( + "purge_ttl_on_expiration=true;gc_grace_period_in_seconds=42;id=") + + CassandraCompactionFilterFactory::kClassName(), + &factory)); + ASSERT_NE(factory, nullptr); + ASSERT_STREQ(factory->Name(), CassandraCompactionFilterFactory::kClassName()); + const auto* opts = factory->GetOptions(); + ASSERT_NE(opts, nullptr); + ASSERT_EQ(opts->gc_grace_period_in_seconds, 42); + ASSERT_TRUE(opts->purge_ttl_on_expiration); +} +#endif // ROCKSDB_LITE + +} // namespace cassandra +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/utilities/cassandra/cassandra_options.h b/src/rocksdb/utilities/cassandra/cassandra_options.h new file mode 100644 index 000000000..efa73a308 --- /dev/null +++ b/src/rocksdb/utilities/cassandra/cassandra_options.h @@ -0,0 +1,43 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { +class ObjectLibrary; +namespace cassandra { +struct CassandraOptions { + static const char* kName() { return "CassandraOptions"; } + CassandraOptions(int32_t _gc_grace_period_in_seconds, size_t _operands_limit, + bool _purge_ttl_on_expiration = false) + : operands_limit(_operands_limit), + gc_grace_period_in_seconds(_gc_grace_period_in_seconds), + purge_ttl_on_expiration(_purge_ttl_on_expiration) {} + // Limit on the number of merge operands. + size_t operands_limit; + + // How long (in seconds) tombstoned data remains before it is purged + int32_t gc_grace_period_in_seconds; + + // If is set to true, expired data will be directly purged. + // Otherwise expired data will be converted tombstones first, + // then be eventually removed after gc grace period. This value should + // only true if all writes have same ttl setting, otherwise it could bring old + // data back. + bool purge_ttl_on_expiration; +}; +#ifndef ROCKSDB_LITE +extern "C" { +int RegisterCassandraObjects(ObjectLibrary& library, const std::string& arg); +} // extern "C" +#endif // ROCKSDB_LITE +} // namespace cassandra +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/cassandra/cassandra_row_merge_test.cc b/src/rocksdb/utilities/cassandra/cassandra_row_merge_test.cc new file mode 100644 index 000000000..0b4a89287 --- /dev/null +++ b/src/rocksdb/utilities/cassandra/cassandra_row_merge_test.cc @@ -0,0 +1,98 @@ +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include + +#include "test_util/testharness.h" +#include "utilities/cassandra/format.h" +#include "utilities/cassandra/test_utils.h" + +namespace ROCKSDB_NAMESPACE { +namespace cassandra { + +class RowValueMergeTest : public testing::Test {}; + +TEST(RowValueMergeTest, Merge) { + std::vector row_values; + row_values.push_back(CreateTestRowValue({ + CreateTestColumnSpec(kTombstone, 0, 5), + CreateTestColumnSpec(kColumn, 1, 8), + CreateTestColumnSpec(kExpiringColumn, 2, 5), + })); + + row_values.push_back(CreateTestRowValue({ + CreateTestColumnSpec(kColumn, 0, 2), + CreateTestColumnSpec(kExpiringColumn, 1, 5), + CreateTestColumnSpec(kTombstone, 2, 7), + CreateTestColumnSpec(kExpiringColumn, 7, 17), + })); + + row_values.push_back(CreateTestRowValue({ + CreateTestColumnSpec(kExpiringColumn, 0, 6), + CreateTestColumnSpec(kTombstone, 1, 5), + CreateTestColumnSpec(kColumn, 2, 4), + CreateTestColumnSpec(kTombstone, 11, 11), + })); + + RowValue merged = RowValue::Merge(std::move(row_values)); + EXPECT_FALSE(merged.IsTombstone()); + EXPECT_EQ(merged.get_columns().size(), 5); + VerifyRowValueColumns(merged.get_columns(), 0, kExpiringColumn, 0, 6); + VerifyRowValueColumns(merged.get_columns(), 1, kColumn, 1, 8); + VerifyRowValueColumns(merged.get_columns(), 2, kTombstone, 2, 7); + VerifyRowValueColumns(merged.get_columns(), 3, kExpiringColumn, 7, 17); + VerifyRowValueColumns(merged.get_columns(), 4, kTombstone, 11, 11); +} + +TEST(RowValueMergeTest, MergeWithRowTombstone) { + std::vector row_values; + + // A row tombstone. + row_values.push_back(CreateRowTombstone(11)); + + // This row's timestamp is smaller than tombstone. 
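// Both of its columns (timestamps 5 and 6) are older than the row tombstone's
// timestamp of 11, so RowValue::Merge() drops them; only columns newer than
// the tombstone survive, which the assertions below rely on.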
+ row_values.push_back(CreateTestRowValue({ + CreateTestColumnSpec(kColumn, 0, 5), + CreateTestColumnSpec(kColumn, 1, 6), + })); + + // Some of the column's row is smaller, some is larger. + row_values.push_back(CreateTestRowValue({ + CreateTestColumnSpec(kColumn, 2, 10), + CreateTestColumnSpec(kColumn, 3, 12), + })); + + // All of the column's rows are larger than tombstone. + row_values.push_back(CreateTestRowValue({ + CreateTestColumnSpec(kColumn, 4, 13), + CreateTestColumnSpec(kColumn, 5, 14), + })); + + RowValue merged = RowValue::Merge(std::move(row_values)); + EXPECT_FALSE(merged.IsTombstone()); + EXPECT_EQ(merged.get_columns().size(), 3); + VerifyRowValueColumns(merged.get_columns(), 0, kColumn, 3, 12); + VerifyRowValueColumns(merged.get_columns(), 1, kColumn, 4, 13); + VerifyRowValueColumns(merged.get_columns(), 2, kColumn, 5, 14); + + // If the tombstone's timestamp is the latest, then it returns a + // row tombstone. + row_values.push_back(CreateRowTombstone(15)); + + row_values.push_back(CreateRowTombstone(17)); + + merged = RowValue::Merge(std::move(row_values)); + EXPECT_TRUE(merged.IsTombstone()); + EXPECT_EQ(merged.LastModifiedTime(), 17); +} + +} // namespace cassandra +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/utilities/cassandra/cassandra_serialize_test.cc b/src/rocksdb/utilities/cassandra/cassandra_serialize_test.cc new file mode 100644 index 000000000..c14d8fd80 --- /dev/null +++ b/src/rocksdb/utilities/cassandra/cassandra_serialize_test.cc @@ -0,0 +1,164 @@ +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
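// These tests pin down the big-endian, two's-complement encoding produced by
// the Serialize<T>/Deserialize<T> helpers in utilities/cassandra/serialize.h.
// For example, Serialize<int32_t>(1, &dest) appends the bytes 00 00 00 01,
// Serialize<int32_t>(-1, &dest) appends ff ff ff ff, and Deserialize<int32_t>
// reads the same value back from the given offset.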
+ +#include "test_util/testharness.h" +#include "utilities/cassandra/serialize.h" + +namespace ROCKSDB_NAMESPACE { +namespace cassandra { + +TEST(SerializeTest, SerializeI64) { + std::string dest; + Serialize(0, &dest); + EXPECT_EQ(std::string({'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', + '\x00'}), + dest); + + dest.clear(); + Serialize(1, &dest); + EXPECT_EQ(std::string({'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', + '\x01'}), + dest); + + dest.clear(); + Serialize(-1, &dest); + EXPECT_EQ(std::string({'\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', + '\xff'}), + dest); + + dest.clear(); + Serialize(9223372036854775807, &dest); + EXPECT_EQ(std::string({'\x7f', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', + '\xff'}), + dest); + + dest.clear(); + Serialize(-9223372036854775807, &dest); + EXPECT_EQ(std::string({'\x80', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', + '\x01'}), + dest); +} + +TEST(SerializeTest, DeserializeI64) { + std::string dest; + std::size_t offset = dest.size(); + Serialize(0, &dest); + EXPECT_EQ(0, Deserialize(dest.c_str(), offset)); + + offset = dest.size(); + Serialize(1, &dest); + EXPECT_EQ(1, Deserialize(dest.c_str(), offset)); + + offset = dest.size(); + Serialize(-1, &dest); + EXPECT_EQ(-1, Deserialize(dest.c_str(), offset)); + + offset = dest.size(); + Serialize(-9223372036854775807, &dest); + EXPECT_EQ(-9223372036854775807, Deserialize(dest.c_str(), offset)); + + offset = dest.size(); + Serialize(9223372036854775807, &dest); + EXPECT_EQ(9223372036854775807, Deserialize(dest.c_str(), offset)); +} + +TEST(SerializeTest, SerializeI32) { + std::string dest; + Serialize(0, &dest); + EXPECT_EQ(std::string({'\x00', '\x00', '\x00', '\x00'}), dest); + + dest.clear(); + Serialize(1, &dest); + EXPECT_EQ(std::string({'\x00', '\x00', '\x00', '\x01'}), dest); + + dest.clear(); + Serialize(-1, &dest); + EXPECT_EQ(std::string({'\xff', '\xff', '\xff', '\xff'}), dest); + + dest.clear(); + Serialize(2147483647, &dest); + EXPECT_EQ(std::string({'\x7f', '\xff', '\xff', '\xff'}), dest); + + dest.clear(); + Serialize(-2147483648LL, &dest); + EXPECT_EQ(std::string({'\x80', '\x00', '\x00', '\x00'}), dest); +} + +TEST(SerializeTest, DeserializeI32) { + std::string dest; + std::size_t offset = dest.size(); + Serialize(0, &dest); + EXPECT_EQ(0, Deserialize(dest.c_str(), offset)); + + offset = dest.size(); + Serialize(1, &dest); + EXPECT_EQ(1, Deserialize(dest.c_str(), offset)); + + offset = dest.size(); + Serialize(-1, &dest); + EXPECT_EQ(-1, Deserialize(dest.c_str(), offset)); + + offset = dest.size(); + Serialize(2147483647, &dest); + EXPECT_EQ(2147483647, Deserialize(dest.c_str(), offset)); + + offset = dest.size(); + Serialize(-2147483648LL, &dest); + EXPECT_EQ(-2147483648LL, Deserialize(dest.c_str(), offset)); +} + +TEST(SerializeTest, SerializeI8) { + std::string dest; + Serialize(0, &dest); + EXPECT_EQ(std::string({'\x00'}), dest); + + dest.clear(); + Serialize(1, &dest); + EXPECT_EQ(std::string({'\x01'}), dest); + + dest.clear(); + Serialize(-1, &dest); + EXPECT_EQ(std::string({'\xff'}), dest); + + dest.clear(); + Serialize(127, &dest); + EXPECT_EQ(std::string({'\x7f'}), dest); + + dest.clear(); + Serialize(-128, &dest); + EXPECT_EQ(std::string({'\x80'}), dest); +} + +TEST(SerializeTest, DeserializeI8) { + std::string dest; + std::size_t offset = dest.size(); + Serialize(0, &dest); + EXPECT_EQ(0, Deserialize(dest.c_str(), offset)); + + offset = dest.size(); + Serialize(1, &dest); + EXPECT_EQ(1, Deserialize(dest.c_str(), offset)); + + offset = 
dest.size(); + Serialize(-1, &dest); + EXPECT_EQ(-1, Deserialize(dest.c_str(), offset)); + + offset = dest.size(); + Serialize(127, &dest); + EXPECT_EQ(127, Deserialize(dest.c_str(), offset)); + + offset = dest.size(); + Serialize(-128, &dest); + EXPECT_EQ(-128, Deserialize(dest.c_str(), offset)); +} + +} // namespace cassandra +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/utilities/cassandra/format.cc b/src/rocksdb/utilities/cassandra/format.cc new file mode 100644 index 000000000..cc1dd2f28 --- /dev/null +++ b/src/rocksdb/utilities/cassandra/format.cc @@ -0,0 +1,367 @@ +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "format.h" + +#include +#include +#include + +#include "utilities/cassandra/serialize.h" + +namespace ROCKSDB_NAMESPACE { +namespace cassandra { +namespace { +const int32_t kDefaultLocalDeletionTime = std::numeric_limits::max(); +const int64_t kDefaultMarkedForDeleteAt = std::numeric_limits::min(); +} // namespace + +ColumnBase::ColumnBase(int8_t mask, int8_t index) + : mask_(mask), index_(index) {} + +std::size_t ColumnBase::Size() const { return sizeof(mask_) + sizeof(index_); } + +int8_t ColumnBase::Mask() const { return mask_; } + +int8_t ColumnBase::Index() const { return index_; } + +void ColumnBase::Serialize(std::string* dest) const { + ROCKSDB_NAMESPACE::cassandra::Serialize(mask_, dest); + ROCKSDB_NAMESPACE::cassandra::Serialize(index_, dest); +} + +std::shared_ptr ColumnBase::Deserialize(const char* src, + std::size_t offset) { + int8_t mask = ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + if ((mask & ColumnTypeMask::DELETION_MASK) != 0) { + return Tombstone::Deserialize(src, offset); + } else if ((mask & ColumnTypeMask::EXPIRATION_MASK) != 0) { + return ExpiringColumn::Deserialize(src, offset); + } else { + return Column::Deserialize(src, offset); + } +} + +Column::Column(int8_t mask, int8_t index, int64_t timestamp, int32_t value_size, + const char* value) + : ColumnBase(mask, index), + timestamp_(timestamp), + value_size_(value_size), + value_(value) {} + +int64_t Column::Timestamp() const { return timestamp_; } + +std::size_t Column::Size() const { + return ColumnBase::Size() + sizeof(timestamp_) + sizeof(value_size_) + + value_size_; +} + +void Column::Serialize(std::string* dest) const { + ColumnBase::Serialize(dest); + ROCKSDB_NAMESPACE::cassandra::Serialize(timestamp_, dest); + ROCKSDB_NAMESPACE::cassandra::Serialize(value_size_, dest); + dest->append(value_, value_size_); +} + +std::shared_ptr Column::Deserialize(const char* src, + std::size_t offset) { + int8_t mask = ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + offset += sizeof(mask); + int8_t index = ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + offset += sizeof(index); + int64_t timestamp = + ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + offset += sizeof(timestamp); + int32_t value_size = + ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + offset += sizeof(value_size); + return std::make_shared(mask, index, timestamp, value_size, + src + offset); +} + +ExpiringColumn::ExpiringColumn(int8_t mask, int8_t index, int64_t timestamp, + int32_t value_size, 
const char* value, + int32_t ttl) + : Column(mask, index, timestamp, value_size, value), ttl_(ttl) {} + +std::size_t ExpiringColumn::Size() const { + return Column::Size() + sizeof(ttl_); +} + +void ExpiringColumn::Serialize(std::string* dest) const { + Column::Serialize(dest); + ROCKSDB_NAMESPACE::cassandra::Serialize(ttl_, dest); +} + +std::chrono::time_point ExpiringColumn::TimePoint() + const { + return std::chrono::time_point( + std::chrono::microseconds(Timestamp())); +} + +std::chrono::seconds ExpiringColumn::Ttl() const { + return std::chrono::seconds(ttl_); +} + +bool ExpiringColumn::Expired() const { + return TimePoint() + Ttl() < std::chrono::system_clock::now(); +} + +std::shared_ptr ExpiringColumn::ToTombstone() const { + auto expired_at = (TimePoint() + Ttl()).time_since_epoch(); + int32_t local_deletion_time = static_cast( + std::chrono::duration_cast(expired_at).count()); + int64_t marked_for_delete_at = + std::chrono::duration_cast(expired_at).count(); + return std::make_shared( + static_cast(ColumnTypeMask::DELETION_MASK), Index(), + local_deletion_time, marked_for_delete_at); +} + +std::shared_ptr ExpiringColumn::Deserialize( + const char* src, std::size_t offset) { + int8_t mask = ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + offset += sizeof(mask); + int8_t index = ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + offset += sizeof(index); + int64_t timestamp = + ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + offset += sizeof(timestamp); + int32_t value_size = + ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + offset += sizeof(value_size); + const char* value = src + offset; + offset += value_size; + int32_t ttl = ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + return std::make_shared(mask, index, timestamp, value_size, + value, ttl); +} + +Tombstone::Tombstone(int8_t mask, int8_t index, int32_t local_deletion_time, + int64_t marked_for_delete_at) + : ColumnBase(mask, index), + local_deletion_time_(local_deletion_time), + marked_for_delete_at_(marked_for_delete_at) {} + +int64_t Tombstone::Timestamp() const { return marked_for_delete_at_; } + +std::size_t Tombstone::Size() const { + return ColumnBase::Size() + sizeof(local_deletion_time_) + + sizeof(marked_for_delete_at_); +} + +void Tombstone::Serialize(std::string* dest) const { + ColumnBase::Serialize(dest); + ROCKSDB_NAMESPACE::cassandra::Serialize(local_deletion_time_, dest); + ROCKSDB_NAMESPACE::cassandra::Serialize(marked_for_delete_at_, dest); +} + +bool Tombstone::Collectable(int32_t gc_grace_period_in_seconds) const { + auto local_deleted_at = std::chrono::time_point( + std::chrono::seconds(local_deletion_time_)); + auto gc_grace_period = std::chrono::seconds(gc_grace_period_in_seconds); + return local_deleted_at + gc_grace_period < std::chrono::system_clock::now(); +} + +std::shared_ptr Tombstone::Deserialize(const char* src, + std::size_t offset) { + int8_t mask = ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + offset += sizeof(mask); + int8_t index = ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + offset += sizeof(index); + int32_t local_deletion_time = + ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + offset += sizeof(int32_t); + int64_t marked_for_delete_at = + ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + return std::make_shared(mask, index, local_deletion_time, + marked_for_delete_at); +} + +RowValue::RowValue(int32_t local_deletion_time, int64_t marked_for_delete_at) + : 
local_deletion_time_(local_deletion_time), + marked_for_delete_at_(marked_for_delete_at), + columns_(), + last_modified_time_(0) {} + +RowValue::RowValue(Columns columns, int64_t last_modified_time) + : local_deletion_time_(kDefaultLocalDeletionTime), + marked_for_delete_at_(kDefaultMarkedForDeleteAt), + columns_(std::move(columns)), + last_modified_time_(last_modified_time) {} + +std::size_t RowValue::Size() const { + std::size_t size = + sizeof(local_deletion_time_) + sizeof(marked_for_delete_at_); + for (const auto& column : columns_) { + size += column->Size(); + } + return size; +} + +int64_t RowValue::LastModifiedTime() const { + if (IsTombstone()) { + return marked_for_delete_at_; + } else { + return last_modified_time_; + } +} + +bool RowValue::IsTombstone() const { + return marked_for_delete_at_ > kDefaultMarkedForDeleteAt; +} + +void RowValue::Serialize(std::string* dest) const { + ROCKSDB_NAMESPACE::cassandra::Serialize(local_deletion_time_, dest); + ROCKSDB_NAMESPACE::cassandra::Serialize(marked_for_delete_at_, dest); + for (const auto& column : columns_) { + column->Serialize(dest); + } +} + +RowValue RowValue::RemoveExpiredColumns(bool* changed) const { + *changed = false; + Columns new_columns; + for (auto& column : columns_) { + if (column->Mask() == ColumnTypeMask::EXPIRATION_MASK) { + std::shared_ptr expiring_column = + std::static_pointer_cast(column); + + if (expiring_column->Expired()) { + *changed = true; + continue; + } + } + + new_columns.push_back(column); + } + return RowValue(std::move(new_columns), last_modified_time_); +} + +RowValue RowValue::ConvertExpiredColumnsToTombstones(bool* changed) const { + *changed = false; + Columns new_columns; + for (auto& column : columns_) { + if (column->Mask() == ColumnTypeMask::EXPIRATION_MASK) { + std::shared_ptr expiring_column = + std::static_pointer_cast(column); + + if (expiring_column->Expired()) { + std::shared_ptr tombstone = expiring_column->ToTombstone(); + new_columns.push_back(tombstone); + *changed = true; + continue; + } + } + new_columns.push_back(column); + } + return RowValue(std::move(new_columns), last_modified_time_); +} + +RowValue RowValue::RemoveTombstones(int32_t gc_grace_period) const { + Columns new_columns; + for (auto& column : columns_) { + if (column->Mask() == ColumnTypeMask::DELETION_MASK) { + std::shared_ptr tombstone = + std::static_pointer_cast(column); + + if (tombstone->Collectable(gc_grace_period)) { + continue; + } + } + + new_columns.push_back(column); + } + return RowValue(std::move(new_columns), last_modified_time_); +} + +bool RowValue::Empty() const { return columns_.empty(); } + +RowValue RowValue::Deserialize(const char* src, std::size_t size) { + std::size_t offset = 0; + assert(size >= sizeof(local_deletion_time_) + sizeof(marked_for_delete_at_)); + int32_t local_deletion_time = + ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + offset += sizeof(int32_t); + int64_t marked_for_delete_at = + ROCKSDB_NAMESPACE::cassandra::Deserialize(src, offset); + offset += sizeof(int64_t); + if (offset == size) { + return RowValue(local_deletion_time, marked_for_delete_at); + } + + assert(local_deletion_time == kDefaultLocalDeletionTime); + assert(marked_for_delete_at == kDefaultMarkedForDeleteAt); + Columns columns; + int64_t last_modified_time = 0; + while (offset < size) { + auto c = ColumnBase::Deserialize(src, offset); + offset += c->Size(); + assert(offset <= size); + last_modified_time = std::max(last_modified_time, c->Timestamp()); + columns.push_back(std::move(c)); + } + + 
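  // The last modified time is not stored on disk; it is recomputed here as
  // the newest column timestamp seen while decoding.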
return RowValue(std::move(columns), last_modified_time); +} + +// Merge multiple row values into one. +// For each column in rows with same index, we pick the one with latest +// timestamp. And we also take row tombstone into consideration, by iterating +// each row from reverse timestamp order, and stop once we hit the first +// row tombstone. +RowValue RowValue::Merge(std::vector&& values) { + assert(values.size() > 0); + if (values.size() == 1) { + return std::move(values[0]); + } + + // Merge columns by their last modified time, and skip once we hit + // a row tombstone. + std::sort(values.begin(), values.end(), + [](const RowValue& r1, const RowValue& r2) { + return r1.LastModifiedTime() > r2.LastModifiedTime(); + }); + + std::map> merged_columns; + int64_t tombstone_timestamp = 0; + + for (auto& value : values) { + if (value.IsTombstone()) { + if (merged_columns.size() == 0) { + return std::move(value); + } + tombstone_timestamp = value.LastModifiedTime(); + break; + } + for (auto& column : value.columns_) { + int8_t index = column->Index(); + if (merged_columns.find(index) == merged_columns.end()) { + merged_columns[index] = column; + } else { + if (column->Timestamp() > merged_columns[index]->Timestamp()) { + merged_columns[index] = column; + } + } + } + } + + int64_t last_modified_time = 0; + Columns columns; + for (auto& pair : merged_columns) { + // For some row, its last_modified_time > row tombstone_timestamp, but + // it might have rows whose timestamp is ealier than tombstone, so we + // ned to filter these rows. + if (pair.second->Timestamp() <= tombstone_timestamp) { + continue; + } + last_modified_time = std::max(last_modified_time, pair.second->Timestamp()); + columns.push_back(std::move(pair.second)); + } + return RowValue(std::move(columns), last_modified_time); +} + +} // namespace cassandra +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/cassandra/format.h b/src/rocksdb/utilities/cassandra/format.h new file mode 100644 index 000000000..1b2714735 --- /dev/null +++ b/src/rocksdb/utilities/cassandra/format.h @@ -0,0 +1,183 @@ +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +/** + * The encoding of Cassandra Row Value. + * + * A Cassandra Row Value could either be a row tombstone, + * or contains multiple columns, it has following fields: + * + * struct row_value { + * int32_t local_deletion_time; // Time in second when the row is deleted, + * // only used for Cassandra tombstone gc. + * int64_t marked_for_delete_at; // Ms that marked this row is deleted. + * struct column_base columns[]; // For non tombstone row, all columns + * // are stored here. + * } + * + * If the local_deletion_time and marked_for_delete_at is set, then this is + * a tombstone, otherwise it contains multiple columns. + * + * There are three type of Columns: Normal Column, Expiring Column and Column + * Tombstone, which have following fields: + * + * // Identify the type of the column. 
+ * enum mask { + * DELETION_MASK = 0x01, + * EXPIRATION_MASK = 0x02, + * }; + * + * struct column { + * int8_t mask = 0; + * int8_t index; + * int64_t timestamp; + * int32_t value_length; + * char value[value_length]; + * } + * + * struct expiring_column { + * int8_t mask = mask.EXPIRATION_MASK; + * int8_t index; + * int64_t timestamp; + * int32_t value_length; + * char value[value_length]; + * int32_t ttl; + * } + * + * struct tombstone_column { + * int8_t mask = mask.DELETION_MASK; + * int8_t index; + * int32_t local_deletion_time; // Similar to row_value's field. + * int64_t marked_for_delete_at; + * } + */ + +#pragma once +#include +#include +#include + +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { +namespace cassandra { + +// Identify the type of the column. +enum ColumnTypeMask { + DELETION_MASK = 0x01, + EXPIRATION_MASK = 0x02, +}; + +class ColumnBase { + public: + ColumnBase(int8_t mask, int8_t index); + virtual ~ColumnBase() = default; + + virtual int64_t Timestamp() const = 0; + virtual int8_t Mask() const; + virtual int8_t Index() const; + virtual std::size_t Size() const; + virtual void Serialize(std::string* dest) const; + static std::shared_ptr Deserialize(const char* src, + std::size_t offset); + + private: + int8_t mask_; + int8_t index_; +}; + +class Column : public ColumnBase { + public: + Column(int8_t mask, int8_t index, int64_t timestamp, int32_t value_size, + const char* value); + + virtual int64_t Timestamp() const override; + virtual std::size_t Size() const override; + virtual void Serialize(std::string* dest) const override; + static std::shared_ptr Deserialize(const char* src, + std::size_t offset); + + private: + int64_t timestamp_; + int32_t value_size_; + const char* value_; +}; + +class Tombstone : public ColumnBase { + public: + Tombstone(int8_t mask, int8_t index, int32_t local_deletion_time, + int64_t marked_for_delete_at); + + virtual int64_t Timestamp() const override; + virtual std::size_t Size() const override; + virtual void Serialize(std::string* dest) const override; + bool Collectable(int32_t gc_grace_period) const; + static std::shared_ptr Deserialize(const char* src, + std::size_t offset); + + private: + int32_t local_deletion_time_; + int64_t marked_for_delete_at_; +}; + +class ExpiringColumn : public Column { + public: + ExpiringColumn(int8_t mask, int8_t index, int64_t timestamp, + int32_t value_size, const char* value, int32_t ttl); + + virtual std::size_t Size() const override; + virtual void Serialize(std::string* dest) const override; + bool Expired() const; + std::shared_ptr ToTombstone() const; + + static std::shared_ptr Deserialize(const char* src, + std::size_t offset); + + private: + int32_t ttl_; + std::chrono::time_point TimePoint() const; + std::chrono::seconds Ttl() const; +}; + +using Columns = std::vector>; + +class RowValue { + public: + // Create a Row Tombstone. + RowValue(int32_t local_deletion_time, int64_t marked_for_delete_at); + // Create a Row containing columns. + RowValue(Columns columns, int64_t last_modified_time); + RowValue(const RowValue& /*that*/) = delete; + RowValue(RowValue&& /*that*/) noexcept = default; + RowValue& operator=(const RowValue& /*that*/) = delete; + RowValue& operator=(RowValue&& /*that*/) = default; + + std::size_t Size() const; + bool IsTombstone() const; + // For Tombstone this returns the marked_for_delete_at_, + // otherwise it returns the max timestamp of containing columns. 
+ int64_t LastModifiedTime() const; + void Serialize(std::string* dest) const; + RowValue RemoveExpiredColumns(bool* changed) const; + RowValue ConvertExpiredColumnsToTombstones(bool* changed) const; + RowValue RemoveTombstones(int32_t gc_grace_period) const; + bool Empty() const; + + static RowValue Deserialize(const char* src, std::size_t size); + // Merge multiple rows according to their timestamp. + static RowValue Merge(std::vector&& values); + + const Columns& get_columns() { return columns_; } + + private: + int32_t local_deletion_time_; + int64_t marked_for_delete_at_; + Columns columns_; + int64_t last_modified_time_; +}; + +} // namespace cassandra +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/cassandra/merge_operator.cc b/src/rocksdb/utilities/cassandra/merge_operator.cc new file mode 100644 index 000000000..bde5dcbad --- /dev/null +++ b/src/rocksdb/utilities/cassandra/merge_operator.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "merge_operator.h" + +#include + +#include + +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" +#include "rocksdb/utilities/options_type.h" +#include "utilities/cassandra/format.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { +namespace cassandra { +static std::unordered_map + merge_operator_options_info = { +#ifndef ROCKSDB_LITE + {"gc_grace_period_in_seconds", + {offsetof(struct CassandraOptions, gc_grace_period_in_seconds), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"operands_limit", + {offsetof(struct CassandraOptions, operands_limit), OptionType::kSizeT, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, +#endif // ROCKSDB_LITE +}; + +CassandraValueMergeOperator::CassandraValueMergeOperator( + int32_t gc_grace_period_in_seconds, size_t operands_limit) + : options_(gc_grace_period_in_seconds, operands_limit) { + RegisterOptions(&options_, &merge_operator_options_info); +} + +// Implementation for the merge operation (merges two Cassandra values) +bool CassandraValueMergeOperator::FullMergeV2( + const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const { + // Clear the *new_value for writing. + merge_out->new_value.clear(); + std::vector row_values; + if (merge_in.existing_value) { + row_values.push_back(RowValue::Deserialize( + merge_in.existing_value->data(), merge_in.existing_value->size())); + } + + for (auto& operand : merge_in.operand_list) { + row_values.push_back(RowValue::Deserialize(operand.data(), operand.size())); + } + + RowValue merged = RowValue::Merge(std::move(row_values)); + merged = merged.RemoveTombstones(options_.gc_grace_period_in_seconds); + merge_out->new_value.reserve(merged.Size()); + merged.Serialize(&(merge_out->new_value)); + + return true; +} + +bool CassandraValueMergeOperator::PartialMergeMulti( + const Slice& /*key*/, const std::deque& operand_list, + std::string* new_value, Logger* /*logger*/) const { + // Clear the *new_value for writing. 
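  // Note: unlike FullMergeV2 above, the partial merge only folds the operands
  // together; tombstone garbage collection (RemoveTombstones) is applied only
  // in the full merge, where the base value is also available.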
+ assert(new_value); + new_value->clear(); + + std::vector row_values; + for (auto& operand : operand_list) { + row_values.push_back(RowValue::Deserialize(operand.data(), operand.size())); + } + RowValue merged = RowValue::Merge(std::move(row_values)); + new_value->reserve(merged.Size()); + merged.Serialize(new_value); + return true; +} + +} // namespace cassandra + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/cassandra/merge_operator.h b/src/rocksdb/utilities/cassandra/merge_operator.h new file mode 100644 index 000000000..af8725db7 --- /dev/null +++ b/src/rocksdb/utilities/cassandra/merge_operator.h @@ -0,0 +1,44 @@ +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" +#include "utilities/cassandra/cassandra_options.h" + +namespace ROCKSDB_NAMESPACE { +namespace cassandra { + +/** + * A MergeOperator for rocksdb that implements Cassandra row value merge. + */ +class CassandraValueMergeOperator : public MergeOperator { + public: + explicit CassandraValueMergeOperator(int32_t gc_grace_period_in_seconds, + size_t operands_limit = 0); + + virtual bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override; + + virtual bool PartialMergeMulti(const Slice& key, + const std::deque& operand_list, + std::string* new_value, + Logger* logger) const override; + + const char* Name() const override { return kClassName(); } + static const char* kClassName() { return "CassandraValueMergeOperator"; } + + virtual bool AllowSingleOperand() const override { return true; } + + virtual bool ShouldMerge(const std::vector& operands) const override { + return options_.operands_limit > 0 && + operands.size() >= options_.operands_limit; + } + + private: + CassandraOptions options_; +}; +} // namespace cassandra +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/cassandra/serialize.h b/src/rocksdb/utilities/cassandra/serialize.h new file mode 100644 index 000000000..4bd552bfc --- /dev/null +++ b/src/rocksdb/utilities/cassandra/serialize.h @@ -0,0 +1,81 @@ +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +/** + * Helper functions which serialize and deserialize integers + * into bytes in big endian. 
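 *
 * For example, Serialize<int32_t>(1, &dest) appends the bytes
 * 0x00 0x00 0x00 0x01 to *dest, and Deserialize<int32_t>(dest.c_str(), 0)
 * reads the value back; negative values use two's complement, so -1 is
 * encoded as 0xff 0xff 0xff 0xff.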
+ */ + +#pragma once + +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { +namespace cassandra { +namespace { +const int64_t kCharMask = 0xFFLL; +const int32_t kBitsPerByte = 8; +} // namespace + +template +void Serialize(T val, std::string* dest); + +template +T Deserialize(const char* src, std::size_t offset = 0); + +// Specializations +template <> +inline void Serialize(int8_t t, std::string* dest) { + dest->append(1, static_cast(t & kCharMask)); +} + +template <> +inline void Serialize(int32_t t, std::string* dest) { + for (unsigned long i = 0; i < sizeof(int32_t); i++) { + dest->append( + 1, static_cast((t >> (sizeof(int32_t) - 1 - i) * kBitsPerByte) & + kCharMask)); + } +} + +template <> +inline void Serialize(int64_t t, std::string* dest) { + for (unsigned long i = 0; i < sizeof(int64_t); i++) { + dest->append( + 1, static_cast((t >> (sizeof(int64_t) - 1 - i) * kBitsPerByte) & + kCharMask)); + } +} + +template <> +inline int8_t Deserialize(const char* src, std::size_t offset) { + return static_cast(src[offset]); +} + +template <> +inline int32_t Deserialize(const char* src, std::size_t offset) { + int32_t result = 0; + for (unsigned long i = 0; i < sizeof(int32_t); i++) { + result |= static_cast(static_cast(src[offset + i])) + << ((sizeof(int32_t) - 1 - i) * kBitsPerByte); + } + return result; +} + +template <> +inline int64_t Deserialize(const char* src, std::size_t offset) { + int64_t result = 0; + for (unsigned long i = 0; i < sizeof(int64_t); i++) { + result |= static_cast(static_cast(src[offset + i])) + << ((sizeof(int64_t) - 1 - i) * kBitsPerByte); + } + return result; +} + +} // namespace cassandra +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/cassandra/test_utils.cc b/src/rocksdb/utilities/cassandra/test_utils.cc new file mode 100644 index 000000000..ec6e5752d --- /dev/null +++ b/src/rocksdb/utilities/cassandra/test_utils.cc @@ -0,0 +1,69 @@ +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
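// The kColumn/kTombstone/kExpiringColumn constants below double as
// ColumnTypeMask values (0, DELETION_MASK, EXPIRATION_MASK), which is what
// lets CreateTestColumn() pick the concrete column type from a test spec's
// mask byte.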
+
+#include "test_utils.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+const char kData[] = {'d', 'a', 't', 'a'};
+const char kExpiringData[] = {'e', 'd', 'a', 't', 'a'};
+const int32_t kTtl = 86400;
+const int8_t kColumn = 0;
+const int8_t kTombstone = 1;
+const int8_t kExpiringColumn = 2;
+
+std::shared_ptr<ColumnBase> CreateTestColumn(int8_t mask, int8_t index,
+                                             int64_t timestamp) {
+  if ((mask & ColumnTypeMask::DELETION_MASK) != 0) {
+    return std::shared_ptr<ColumnBase>(
+        new Tombstone(mask, index, ToSeconds(timestamp), timestamp));
+  } else if ((mask & ColumnTypeMask::EXPIRATION_MASK) != 0) {
+    return std::shared_ptr<ColumnBase>(new ExpiringColumn(
+        mask, index, timestamp, sizeof(kExpiringData), kExpiringData, kTtl));
+  } else {
+    return std::shared_ptr<ColumnBase>(
+        new Column(mask, index, timestamp, sizeof(kData), kData));
+  }
+}
+
+std::tuple<int8_t, int8_t, int64_t> CreateTestColumnSpec(int8_t mask,
+                                                         int8_t index,
+                                                         int64_t timestamp) {
+  return std::make_tuple(mask, index, timestamp);
+}
+
+RowValue CreateTestRowValue(
+    std::vector<std::tuple<int8_t, int8_t, int64_t>> column_specs) {
+  std::vector<std::shared_ptr<ColumnBase>> columns;
+  int64_t last_modified_time = 0;
+  for (auto spec : column_specs) {
+    auto c = CreateTestColumn(std::get<0>(spec), std::get<1>(spec),
+                              std::get<2>(spec));
+    last_modified_time = std::max(last_modified_time, c->Timestamp());
+    columns.push_back(std::move(c));
+  }
+  return RowValue(std::move(columns), last_modified_time);
+}
+
+RowValue CreateRowTombstone(int64_t timestamp) {
+  return RowValue(ToSeconds(timestamp), timestamp);
+}
+
+void VerifyRowValueColumns(
+    const std::vector<std::shared_ptr<ColumnBase>> &columns,
+    std::size_t index_of_vector, int8_t expected_mask, int8_t expected_index,
+    int64_t expected_timestamp) {
+  EXPECT_EQ(expected_timestamp, columns[index_of_vector]->Timestamp());
+  EXPECT_EQ(expected_mask, columns[index_of_vector]->Mask());
+  EXPECT_EQ(expected_index, columns[index_of_vector]->Index());
+}
+
+int64_t ToMicroSeconds(int64_t seconds) { return seconds * (int64_t)1000000; }
+
+int32_t ToSeconds(int64_t microseconds) {
+  return (int32_t)(microseconds / (int64_t)1000000);
+}
+} // namespace cassandra
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/cassandra/test_utils.h b/src/rocksdb/utilities/cassandra/test_utils.h
new file mode 100644
index 000000000..be23f7076
--- /dev/null
+++ b/src/rocksdb/utilities/cassandra/test_utils.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
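For context, the CassandraValueMergeOperator declared in merge_operator.h above is normally attached through ColumnFamilyOptions::merge_operator. The following is a hedged sketch; the grace period and operands limit are illustrative values, not defaults taken from this patch.

#include <memory>

#include "rocksdb/options.h"
#include "utilities/cassandra/merge_operator.h"

int main() {
  ROCKSDB_NAMESPACE::ColumnFamilyOptions cf_options;
  // 86400s grace period and operands_limit = 32 are example values only.
  cf_options.merge_operator = std::make_shared<
      ROCKSDB_NAMESPACE::cassandra::CassandraValueMergeOperator>(
      86400 /* gc_grace_period_in_seconds */, 32 /* operands_limit */);
  // With operands_limit > 0, ShouldMerge() reports true once at least that
  // many operands have accumulated for a key.
  return 0;
}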
+ +#pragma once +#include + +#include "test_util/testharness.h" +#include "utilities/cassandra/format.h" +#include "utilities/cassandra/serialize.h" + +namespace ROCKSDB_NAMESPACE { +namespace cassandra { +extern const char kData[]; +extern const char kExpiringData[]; +extern const int32_t kTtl; +extern const int8_t kColumn; +extern const int8_t kTombstone; +extern const int8_t kExpiringColumn; + +std::shared_ptr CreateTestColumn(int8_t mask, int8_t index, + int64_t timestamp); + +std::tuple CreateTestColumnSpec(int8_t mask, + int8_t index, + int64_t timestamp); + +RowValue CreateTestRowValue( + std::vector> column_specs); + +RowValue CreateRowTombstone(int64_t timestamp); + +void VerifyRowValueColumns( + const std::vector> &columns, + std::size_t index_of_vector, int8_t expected_mask, int8_t expected_index, + int64_t expected_timestamp); + +int64_t ToMicroSeconds(int64_t seconds); +int32_t ToSeconds(int64_t microseconds); +} // namespace cassandra +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/checkpoint/checkpoint_impl.cc b/src/rocksdb/utilities/checkpoint/checkpoint_impl.cc new file mode 100644 index 000000000..44ce70b1b --- /dev/null +++ b/src/rocksdb/utilities/checkpoint/checkpoint_impl.cc @@ -0,0 +1,469 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 Facebook. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef ROCKSDB_LITE + +#include "utilities/checkpoint/checkpoint_impl.h" + +#include +#include +#include +#include +#include +#include + +#include "db/wal_manager.h" +#include "file/file_util.h" +#include "file/filename.h" +#include "logging/logging.h" +#include "port/port.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/metadata.h" +#include "rocksdb/options.h" +#include "rocksdb/transaction_log.h" +#include "rocksdb/types.h" +#include "rocksdb/utilities/checkpoint.h" +#include "test_util/sync_point.h" +#include "util/cast_util.h" +#include "util/file_checksum_helper.h" + +namespace ROCKSDB_NAMESPACE { + +Status Checkpoint::Create(DB* db, Checkpoint** checkpoint_ptr) { + *checkpoint_ptr = new CheckpointImpl(db); + return Status::OK(); +} + +Status Checkpoint::CreateCheckpoint(const std::string& /*checkpoint_dir*/, + uint64_t /*log_size_for_flush*/, + uint64_t* /*sequence_number_ptr*/) { + return Status::NotSupported(""); +} + +void CheckpointImpl::CleanStagingDirectory(const std::string& full_private_path, + Logger* info_log) { + std::vector subchildren; + Status s = db_->GetEnv()->FileExists(full_private_path); + if (s.IsNotFound()) { + return; + } + ROCKS_LOG_INFO(info_log, "File exists %s -- %s", full_private_path.c_str(), + s.ToString().c_str()); + s = db_->GetEnv()->GetChildren(full_private_path, &subchildren); + if (s.ok()) { + for (auto& subchild : subchildren) { + std::string subchild_path = full_private_path + "/" + subchild; + s = db_->GetEnv()->DeleteFile(subchild_path); + ROCKS_LOG_INFO(info_log, "Delete file %s -- %s", subchild_path.c_str(), + s.ToString().c_str()); + } + } + // finally delete the private dir + s = db_->GetEnv()->DeleteDir(full_private_path); + ROCKS_LOG_INFO(info_log, "Delete dir %s -- %s", full_private_path.c_str(), + s.ToString().c_str()); +} + +Status Checkpoint::ExportColumnFamily( + 
ColumnFamilyHandle* /*handle*/, const std::string& /*export_dir*/, + ExportImportFilesMetaData** /*metadata*/) { + return Status::NotSupported(""); +} + +// Builds an openable snapshot of RocksDB +Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir, + uint64_t log_size_for_flush, + uint64_t* sequence_number_ptr) { + DBOptions db_options = db_->GetDBOptions(); + + Status s = db_->GetEnv()->FileExists(checkpoint_dir); + if (s.ok()) { + return Status::InvalidArgument("Directory exists"); + } else if (!s.IsNotFound()) { + assert(s.IsIOError()); + return s; + } + + ROCKS_LOG_INFO( + db_options.info_log, + "Started the snapshot process -- creating snapshot in directory %s", + checkpoint_dir.c_str()); + + size_t final_nonslash_idx = checkpoint_dir.find_last_not_of('/'); + if (final_nonslash_idx == std::string::npos) { + // npos means it's only slashes or empty. Non-empty means it's the root + // directory, but it shouldn't be because we verified above the directory + // doesn't exist. + assert(checkpoint_dir.empty()); + return Status::InvalidArgument("invalid checkpoint directory name"); + } + + std::string full_private_path = + checkpoint_dir.substr(0, final_nonslash_idx + 1) + ".tmp"; + ROCKS_LOG_INFO(db_options.info_log, + "Snapshot process -- using temporary directory %s", + full_private_path.c_str()); + CleanStagingDirectory(full_private_path, db_options.info_log.get()); + // create snapshot directory + s = db_->GetEnv()->CreateDir(full_private_path); + uint64_t sequence_number = 0; + if (s.ok()) { + // enable file deletions + s = db_->DisableFileDeletions(); + const bool disabled_file_deletions = s.ok(); + + if (s.ok() || s.IsNotSupported()) { + s = CreateCustomCheckpoint( + [&](const std::string& src_dirname, const std::string& fname, + FileType) { + ROCKS_LOG_INFO(db_options.info_log, "Hard Linking %s", + fname.c_str()); + return db_->GetFileSystem()->LinkFile( + src_dirname + "/" + fname, full_private_path + "/" + fname, + IOOptions(), nullptr); + } /* link_file_cb */, + [&](const std::string& src_dirname, const std::string& fname, + uint64_t size_limit_bytes, FileType, + const std::string& /* checksum_func_name */, + const std::string& /* checksum_val */, + const Temperature temperature) { + ROCKS_LOG_INFO(db_options.info_log, "Copying %s", fname.c_str()); + return CopyFile(db_->GetFileSystem(), src_dirname + "/" + fname, + full_private_path + "/" + fname, size_limit_bytes, + db_options.use_fsync, nullptr, temperature); + } /* copy_file_cb */, + [&](const std::string& fname, const std::string& contents, FileType) { + ROCKS_LOG_INFO(db_options.info_log, "Creating %s", fname.c_str()); + return CreateFile(db_->GetFileSystem(), + full_private_path + "/" + fname, contents, + db_options.use_fsync); + } /* create_file_cb */, + &sequence_number, log_size_for_flush); + + // we copied all the files, enable file deletions + if (disabled_file_deletions) { + Status ss = db_->EnableFileDeletions(false); + assert(ss.ok()); + ss.PermitUncheckedError(); + } + } + } + + if (s.ok()) { + // move tmp private backup to real snapshot directory + s = db_->GetEnv()->RenameFile(full_private_path, checkpoint_dir); + } + if (s.ok()) { + std::unique_ptr checkpoint_directory; + s = db_->GetFileSystem()->NewDirectory(checkpoint_dir, IOOptions(), + &checkpoint_directory, nullptr); + if (s.ok() && checkpoint_directory != nullptr) { + s = checkpoint_directory->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kDirRenamed)); + } + } + + if (s.ok()) { + if 
(sequence_number_ptr != nullptr) { + *sequence_number_ptr = sequence_number; + } + // here we know that we succeeded and installed the new snapshot + ROCKS_LOG_INFO(db_options.info_log, "Snapshot DONE. All is good"); + ROCKS_LOG_INFO(db_options.info_log, "Snapshot sequence number: %" PRIu64, + sequence_number); + } else { + // clean all the files we might have created + ROCKS_LOG_INFO(db_options.info_log, "Snapshot failed -- %s", + s.ToString().c_str()); + CleanStagingDirectory(full_private_path, db_options.info_log.get()); + } + return s; +} + +Status CheckpointImpl::CreateCustomCheckpoint( + std::function + link_file_cb, + std::function< + Status(const std::string& src_dirname, const std::string& src_fname, + uint64_t size_limit_bytes, FileType type, + const std::string& checksum_func_name, + const std::string& checksum_val, const Temperature temperature)> + copy_file_cb, + std::function + create_file_cb, + uint64_t* sequence_number, uint64_t log_size_for_flush, + bool get_live_table_checksum) { + *sequence_number = db_->GetLatestSequenceNumber(); + + LiveFilesStorageInfoOptions opts; + opts.include_checksum_info = get_live_table_checksum; + opts.wal_size_for_flush = log_size_for_flush; + + std::vector infos; + { + Status s = db_->GetLiveFilesStorageInfo(opts, &infos); + if (!s.ok()) { + return s; + } + } + + // Verify that everything except WAL files are in same directory + // (db_paths / cf_paths not supported) + std::unordered_set dirs; + for (auto& info : infos) { + if (info.file_type != kWalFile) { + dirs.insert(info.directory); + } + } + if (dirs.size() > 1) { + return Status::NotSupported( + "db_paths / cf_paths not supported for Checkpoint nor BackupEngine"); + } + + bool same_fs = true; + + for (auto& info : infos) { + Status s; + if (!info.replacement_contents.empty()) { + // Currently should only be used for CURRENT file. + assert(info.file_type == kCurrentFile); + + if (info.size != info.replacement_contents.size()) { + s = Status::Corruption("Inconsistent size metadata for " + + info.relative_filename); + } else { + s = create_file_cb(info.relative_filename, info.replacement_contents, + info.file_type); + } + } else { + if (same_fs && !info.trim_to_size) { + s = link_file_cb(info.directory, info.relative_filename, + info.file_type); + if (s.IsNotSupported()) { + same_fs = false; + s = Status::OK(); + } + s.MustCheck(); + } + if (!same_fs || info.trim_to_size) { + assert(info.file_checksum_func_name.empty() == + !opts.include_checksum_info); + // no assertion on file_checksum because empty is used for both "not + // set" and "unknown" + if (opts.include_checksum_info) { + s = copy_file_cb(info.directory, info.relative_filename, info.size, + info.file_type, info.file_checksum_func_name, + info.file_checksum, info.temperature); + } else { + s = copy_file_cb(info.directory, info.relative_filename, info.size, + info.file_type, kUnknownFileChecksumFuncName, + kUnknownFileChecksum, info.temperature); + } + } + } + if (!s.ok()) { + return s; + } + } + + return Status::OK(); +} + +// Exports all live SST files of a specified Column Family onto export_dir, +// returning SST files information in metadata. 
+Status CheckpointImpl::ExportColumnFamily( + ColumnFamilyHandle* handle, const std::string& export_dir, + ExportImportFilesMetaData** metadata) { + auto cfh = static_cast_with_check(handle); + const auto cf_name = cfh->GetName(); + const auto db_options = db_->GetDBOptions(); + + assert(metadata != nullptr); + assert(*metadata == nullptr); + auto s = db_->GetEnv()->FileExists(export_dir); + if (s.ok()) { + return Status::InvalidArgument("Specified export_dir exists"); + } else if (!s.IsNotFound()) { + assert(s.IsIOError()); + return s; + } + + const auto final_nonslash_idx = export_dir.find_last_not_of('/'); + if (final_nonslash_idx == std::string::npos) { + return Status::InvalidArgument("Specified export_dir invalid"); + } + ROCKS_LOG_INFO(db_options.info_log, + "[%s] export column family onto export directory %s", + cf_name.c_str(), export_dir.c_str()); + + // Create a temporary export directory. + const auto tmp_export_dir = + export_dir.substr(0, final_nonslash_idx + 1) + ".tmp"; + s = db_->GetEnv()->CreateDir(tmp_export_dir); + + if (s.ok()) { + s = db_->Flush(ROCKSDB_NAMESPACE::FlushOptions(), handle); + } + + ColumnFamilyMetaData db_metadata; + if (s.ok()) { + // Export live sst files with file deletions disabled. + s = db_->DisableFileDeletions(); + if (s.ok()) { + db_->GetColumnFamilyMetaData(handle, &db_metadata); + + s = ExportFilesInMetaData( + db_options, db_metadata, + [&](const std::string& src_dirname, const std::string& fname) { + ROCKS_LOG_INFO(db_options.info_log, "[%s] HardLinking %s", + cf_name.c_str(), fname.c_str()); + return db_->GetEnv()->LinkFile(src_dirname + fname, + tmp_export_dir + fname); + } /*link_file_cb*/, + [&](const std::string& src_dirname, const std::string& fname) { + ROCKS_LOG_INFO(db_options.info_log, "[%s] Copying %s", + cf_name.c_str(), fname.c_str()); + return CopyFile(db_->GetFileSystem(), src_dirname + fname, + tmp_export_dir + fname, 0, db_options.use_fsync, + nullptr, Temperature::kUnknown); + } /*copy_file_cb*/); + + const auto enable_status = db_->EnableFileDeletions(false /*force*/); + if (s.ok()) { + s = enable_status; + } + } + } + + auto moved_to_user_specified_dir = false; + if (s.ok()) { + // Move temporary export directory to the actual export directory. + s = db_->GetEnv()->RenameFile(tmp_export_dir, export_dir); + } + + if (s.ok()) { + // Fsync export directory. + moved_to_user_specified_dir = true; + std::unique_ptr dir_ptr; + s = db_->GetFileSystem()->NewDirectory(export_dir, IOOptions(), &dir_ptr, + nullptr); + if (s.ok()) { + assert(dir_ptr != nullptr); + s = dir_ptr->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kDirRenamed)); + } + } + + if (s.ok()) { + // Export of files succeeded. Fill in the metadata information. 
+ auto result_metadata = new ExportImportFilesMetaData(); + result_metadata->db_comparator_name = handle->GetComparator()->Name(); + for (const auto& level_metadata : db_metadata.levels) { + for (const auto& file_metadata : level_metadata.files) { + LiveFileMetaData live_file_metadata; + live_file_metadata.size = file_metadata.size; + live_file_metadata.name = std::move(file_metadata.name); + live_file_metadata.file_number = file_metadata.file_number; + live_file_metadata.db_path = export_dir; + live_file_metadata.smallest_seqno = file_metadata.smallest_seqno; + live_file_metadata.largest_seqno = file_metadata.largest_seqno; + live_file_metadata.smallestkey = std::move(file_metadata.smallestkey); + live_file_metadata.largestkey = std::move(file_metadata.largestkey); + live_file_metadata.oldest_blob_file_number = + file_metadata.oldest_blob_file_number; + live_file_metadata.level = level_metadata.level; + result_metadata->files.push_back(live_file_metadata); + } + *metadata = result_metadata; + } + ROCKS_LOG_INFO(db_options.info_log, "[%s] Export succeeded.", + cf_name.c_str()); + } else { + // Failure: Clean up all the files/directories created. + ROCKS_LOG_INFO(db_options.info_log, "[%s] Export failed. %s", + cf_name.c_str(), s.ToString().c_str()); + std::vector subchildren; + const auto cleanup_dir = + moved_to_user_specified_dir ? export_dir : tmp_export_dir; + db_->GetEnv()->GetChildren(cleanup_dir, &subchildren); + for (const auto& subchild : subchildren) { + const auto subchild_path = cleanup_dir + "/" + subchild; + const auto status = db_->GetEnv()->DeleteFile(subchild_path); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options.info_log, "Failed to cleanup file %s: %s", + subchild_path.c_str(), status.ToString().c_str()); + } + } + const auto status = db_->GetEnv()->DeleteDir(cleanup_dir); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options.info_log, "Failed to cleanup dir %s: %s", + cleanup_dir.c_str(), status.ToString().c_str()); + } + } + return s; +} + +Status CheckpointImpl::ExportFilesInMetaData( + const DBOptions& db_options, const ColumnFamilyMetaData& metadata, + std::function + link_file_cb, + std::function + copy_file_cb) { + Status s; + auto hardlink_file = true; + + // Copy/hard link files in metadata. + size_t num_files = 0; + for (const auto& level_metadata : metadata.levels) { + for (const auto& file_metadata : level_metadata.files) { + uint64_t number; + FileType type; + const auto ok = ParseFileName(file_metadata.name, &number, &type); + if (!ok) { + s = Status::Corruption("Could not parse file name"); + break; + } + + // We should only get sst files here. + assert(type == kTableFile); + assert(file_metadata.size > 0 && file_metadata.name[0] == '/'); + const auto src_fname = file_metadata.name; + ++num_files; + + if (hardlink_file) { + s = link_file_cb(db_->GetName(), src_fname); + if (num_files == 1 && s.IsNotSupported()) { + // Fallback to copy if link failed due to cross-device directories. 
+ hardlink_file = false; + s = Status::OK(); + } + } + if (!hardlink_file) { + s = copy_file_cb(db_->GetName(), src_fname); + } + if (!s.ok()) { + break; + } + } + } + ROCKS_LOG_INFO(db_options.info_log, "Number of table files %" ROCKSDB_PRIszt, + num_files); + + return s; +} +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/checkpoint/checkpoint_impl.h b/src/rocksdb/utilities/checkpoint/checkpoint_impl.h new file mode 100644 index 000000000..2947330cc --- /dev/null +++ b/src/rocksdb/utilities/checkpoint/checkpoint_impl.h @@ -0,0 +1,66 @@ +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include + +#include "file/filename.h" +#include "rocksdb/db.h" +#include "rocksdb/utilities/checkpoint.h" + +namespace ROCKSDB_NAMESPACE { + +class CheckpointImpl : public Checkpoint { + public: + explicit CheckpointImpl(DB* db) : db_(db) {} + + Status CreateCheckpoint(const std::string& checkpoint_dir, + uint64_t log_size_for_flush, + uint64_t* sequence_number_ptr) override; + + Status ExportColumnFamily(ColumnFamilyHandle* handle, + const std::string& export_dir, + ExportImportFilesMetaData** metadata) override; + + // Checkpoint logic can be customized by providing callbacks for link, copy, + // or create. + Status CreateCustomCheckpoint( + std::function + link_file_cb, + std::function + copy_file_cb, + std::function + create_file_cb, + uint64_t* sequence_number, uint64_t log_size_for_flush, + bool get_live_table_checksum = false); + + private: + void CleanStagingDirectory(const std::string& path, Logger* info_log); + + // Export logic customization by providing callbacks for link or copy. + Status ExportFilesInMetaData( + const DBOptions& db_options, const ColumnFamilyMetaData& metadata, + std::function + link_file_cb, + std::function + copy_file_cb); + + private: + DB* db_; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/checkpoint/checkpoint_test.cc b/src/rocksdb/utilities/checkpoint/checkpoint_test.cc new file mode 100644 index 000000000..3da753d5f --- /dev/null +++ b/src/rocksdb/utilities/checkpoint/checkpoint_test.cc @@ -0,0 +1,974 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
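Before the tests that follow, a brief sketch of the public Checkpoint API that checkpoint_impl.cc implements. This is not part of the imported sources; the paths are hypothetical and error handling is reduced to status checks.

#include <cassert>
#include <cstdint>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/utilities/checkpoint.h"

using namespace ROCKSDB_NAMESPACE;

int main() {
  DB* db = nullptr;
  Options options;
  options.create_if_missing = true;
  Status s = DB::Open(options, "/tmp/checkpoint_demo_db", &db);  // hypothetical path
  assert(s.ok());

  Checkpoint* checkpoint = nullptr;
  s = Checkpoint::Create(db, &checkpoint);
  assert(s.ok());

  // The destination directory must not exist yet. SST files are hard-linked
  // into it when source and destination share a filesystem, copied otherwise.
  uint64_t sequence_number = 0;
  s = checkpoint->CreateCheckpoint("/tmp/checkpoint_demo_snapshot",
                                   0 /* log_size_for_flush */,
                                   &sequence_number);
  assert(s.ok());

  delete checkpoint;
  delete db;
  return 0;
}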
+ +// Syncpoint prevents us building and running tests in release +#ifndef ROCKSDB_LITE +#include "rocksdb/utilities/checkpoint.h" + +#ifndef OS_WIN +#include +#endif +#include +#include +#include + +#include "db/db_impl/db_impl.h" +#include "file/file_util.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/utilities/transaction_db.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "utilities/fault_injection_env.h" +#include "utilities/fault_injection_fs.h" + +namespace ROCKSDB_NAMESPACE { +class CheckpointTest : public testing::Test { + protected: + // Sequence of option configurations to try + enum OptionConfig { + kDefault = 0, + }; + int option_config_; + + public: + std::string dbname_; + std::string alternative_wal_dir_; + Env* env_; + DB* db_; + Options last_options_; + std::vector handles_; + std::string snapshot_name_; + std::string export_path_; + ColumnFamilyHandle* cfh_reverse_comp_; + ExportImportFilesMetaData* metadata_; + + CheckpointTest() : env_(Env::Default()) { + env_->SetBackgroundThreads(1, Env::LOW); + env_->SetBackgroundThreads(1, Env::HIGH); + dbname_ = test::PerThreadDBPath(env_, "checkpoint_test"); + alternative_wal_dir_ = dbname_ + "/wal"; + auto options = CurrentOptions(); + auto delete_options = options; + delete_options.wal_dir = alternative_wal_dir_; + EXPECT_OK(DestroyDB(dbname_, delete_options)); + // Destroy it for not alternative WAL dir is used. + EXPECT_OK(DestroyDB(dbname_, options)); + db_ = nullptr; + snapshot_name_ = test::PerThreadDBPath(env_, "snapshot"); + std::string snapshot_tmp_name = snapshot_name_ + ".tmp"; + EXPECT_OK(DestroyDB(snapshot_name_, options)); + test::DeleteDir(env_, snapshot_name_); + EXPECT_OK(DestroyDB(snapshot_tmp_name, options)); + test::DeleteDir(env_, snapshot_tmp_name); + Reopen(options); + export_path_ = test::PerThreadDBPath("/export"); + DestroyDir(env_, export_path_).PermitUncheckedError(); + cfh_reverse_comp_ = nullptr; + metadata_ = nullptr; + } + + ~CheckpointTest() override { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + if (cfh_reverse_comp_) { + EXPECT_OK(db_->DestroyColumnFamilyHandle(cfh_reverse_comp_)); + cfh_reverse_comp_ = nullptr; + } + if (metadata_) { + delete metadata_; + metadata_ = nullptr; + } + Close(); + Options options; + options.db_paths.emplace_back(dbname_, 0); + options.db_paths.emplace_back(dbname_ + "_2", 0); + options.db_paths.emplace_back(dbname_ + "_3", 0); + options.db_paths.emplace_back(dbname_ + "_4", 0); + EXPECT_OK(DestroyDB(dbname_, options)); + EXPECT_OK(DestroyDB(snapshot_name_, options)); + DestroyDir(env_, export_path_).PermitUncheckedError(); + } + + // Return the current option configuration. 
+ Options CurrentOptions() { + Options options; + options.env = env_; + options.create_if_missing = true; + return options; + } + + void CreateColumnFamilies(const std::vector& cfs, + const Options& options) { + ColumnFamilyOptions cf_opts(options); + size_t cfi = handles_.size(); + handles_.resize(cfi + cfs.size()); + for (auto cf : cfs) { + ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++])); + } + } + + void CreateAndReopenWithCF(const std::vector& cfs, + const Options& options) { + CreateColumnFamilies(cfs, options); + std::vector cfs_plus_default = cfs; + cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName); + ReopenWithColumnFamilies(cfs_plus_default, options); + } + + void ReopenWithColumnFamilies(const std::vector& cfs, + const std::vector& options) { + ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); + } + + void ReopenWithColumnFamilies(const std::vector& cfs, + const Options& options) { + ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); + } + + Status TryReopenWithColumnFamilies(const std::vector& cfs, + const std::vector& options) { + Close(); + EXPECT_EQ(cfs.size(), options.size()); + std::vector column_families; + for (size_t i = 0; i < cfs.size(); ++i) { + column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i])); + } + DBOptions db_opts = DBOptions(options[0]); + return DB::Open(db_opts, dbname_, column_families, &handles_, &db_); + } + + Status TryReopenWithColumnFamilies(const std::vector& cfs, + const Options& options) { + Close(); + std::vector v_opts(cfs.size(), options); + return TryReopenWithColumnFamilies(cfs, v_opts); + } + + void Reopen(const Options& options) { ASSERT_OK(TryReopen(options)); } + + void CompactAll() { + for (auto h : handles_) { + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), h, nullptr, nullptr)); + } + } + + void Close() { + for (auto h : handles_) { + delete h; + } + handles_.clear(); + delete db_; + db_ = nullptr; + } + + void DestroyAndReopen(const Options& options) { + // Destroy using last options + Destroy(last_options_); + ASSERT_OK(TryReopen(options)); + } + + void Destroy(const Options& options) { + Close(); + ASSERT_OK(DestroyDB(dbname_, options)); + } + + Status ReadOnlyReopen(const Options& options) { + return DB::OpenForReadOnly(options, dbname_, &db_); + } + + Status ReadOnlyReopenWithColumnFamilies(const std::vector& cfs, + const Options& options) { + std::vector column_families; + for (const auto& cf : cfs) { + column_families.emplace_back(cf, options); + } + return DB::OpenForReadOnly(options, dbname_, column_families, &handles_, + &db_); + } + + Status TryReopen(const Options& options) { + Close(); + last_options_ = options; + return DB::Open(options, dbname_, &db_); + } + + Status Flush(int cf = 0) { + if (cf == 0) { + return db_->Flush(FlushOptions()); + } else { + return db_->Flush(FlushOptions(), handles_[cf]); + } + } + + Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions()) { + return db_->Put(wo, k, v); + } + + Status Put(int cf, const Slice& k, const Slice& v, + WriteOptions wo = WriteOptions()) { + return db_->Put(wo, handles_[cf], k, v); + } + + Status Delete(const std::string& k) { return db_->Delete(WriteOptions(), k); } + + Status Delete(int cf, const std::string& k) { + return db_->Delete(WriteOptions(), handles_[cf], k); + } + + std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) { + ReadOptions options; + options.verify_checksums = true; + options.snapshot = snapshot; + std::string result; + Status s = 
db_->Get(options, k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + std::string Get(int cf, const std::string& k, + const Snapshot* snapshot = nullptr) { + ReadOptions options; + options.verify_checksums = true; + options.snapshot = snapshot; + std::string result; + Status s = db_->Get(options, handles_[cf], k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } +}; + +TEST_F(CheckpointTest, GetSnapshotLink) { + for (uint64_t log_size_for_flush : {0, 1000000}) { + Options options; + DB* snapshotDB; + ReadOptions roptions; + std::string result; + Checkpoint* checkpoint; + + options = CurrentOptions(); + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options)); + + // Create a database + options.create_if_missing = true; + ASSERT_OK(DB::Open(options, dbname_, &db_)); + std::string key = std::string("foo"); + ASSERT_OK(Put(key, "v1")); + // Take a snapshot + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_, log_size_for_flush)); + ASSERT_OK(Put(key, "v2")); + ASSERT_EQ("v2", Get(key)); + ASSERT_OK(Flush()); + ASSERT_EQ("v2", Get(key)); + // Open snapshot and verify contents while DB is running + options.create_if_missing = false; + ASSERT_OK(DB::Open(options, snapshot_name_, &snapshotDB)); + ASSERT_OK(snapshotDB->Get(roptions, key, &result)); + ASSERT_EQ("v1", result); + delete snapshotDB; + snapshotDB = nullptr; + delete db_; + db_ = nullptr; + + // Destroy original DB + ASSERT_OK(DestroyDB(dbname_, options)); + + // Open snapshot and verify contents + options.create_if_missing = false; + dbname_ = snapshot_name_; + ASSERT_OK(DB::Open(options, dbname_, &db_)); + ASSERT_EQ("v1", Get(key)); + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options)); + delete checkpoint; + + // Restore DB name + dbname_ = test::PerThreadDBPath(env_, "db_test"); + } +} + +TEST_F(CheckpointTest, CheckpointWithBlob) { + // Create a database with a blob file + Options options = CurrentOptions(); + options.create_if_missing = true; + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + ASSERT_OK(Put(key, blob)); + ASSERT_OK(Flush()); + + // Create a checkpoint + Checkpoint* checkpoint = nullptr; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + + std::unique_ptr checkpoint_guard(checkpoint); + + ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_)); + + // Make sure it contains the blob file + std::vector files; + ASSERT_OK(env_->GetChildren(snapshot_name_, &files)); + + bool blob_file_found = false; + for (const auto& file : files) { + uint64_t number = 0; + FileType type = kWalFile; + + if (ParseFileName(file, &number, &type) && type == kBlobFile) { + blob_file_found = true; + break; + } + } + + ASSERT_TRUE(blob_file_found); + + // Make sure the checkpoint can be opened and the blob value read + options.create_if_missing = false; + DB* checkpoint_db = nullptr; + ASSERT_OK(DB::Open(options, snapshot_name_, &checkpoint_db)); + + std::unique_ptr checkpoint_db_guard(checkpoint_db); + + PinnableSlice value; + ASSERT_OK(checkpoint_db->Get( + ReadOptions(), checkpoint_db->DefaultColumnFamily(), key, &value)); + + ASSERT_EQ(value, blob); +} + +TEST_F(CheckpointTest, ExportColumnFamilyWithLinks) { + // Create a database + auto options = CurrentOptions(); + 
options.create_if_missing = true; + CreateAndReopenWithCF({}, options); + + // Helper to verify the number of files in metadata and export dir + auto verify_files_exported = [&](const ExportImportFilesMetaData& metadata, + int num_files_expected) { + ASSERT_EQ(metadata.files.size(), num_files_expected); + std::vector subchildren; + ASSERT_OK(env_->GetChildren(export_path_, &subchildren)); + ASSERT_EQ(subchildren.size(), num_files_expected); + }; + + // Test DefaultColumnFamily + { + const auto key = std::string("foo"); + ASSERT_OK(Put(key, "v1")); + + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + + // Export the Tables and verify + ASSERT_OK(checkpoint->ExportColumnFamily(db_->DefaultColumnFamily(), + export_path_, &metadata_)); + verify_files_exported(*metadata_, 1); + ASSERT_EQ(metadata_->db_comparator_name, options.comparator->Name()); + ASSERT_OK(DestroyDir(env_, export_path_)); + delete metadata_; + metadata_ = nullptr; + + // Check again after compaction + CompactAll(); + ASSERT_OK(Put(key, "v2")); + ASSERT_OK(checkpoint->ExportColumnFamily(db_->DefaultColumnFamily(), + export_path_, &metadata_)); + verify_files_exported(*metadata_, 2); + ASSERT_EQ(metadata_->db_comparator_name, options.comparator->Name()); + ASSERT_OK(DestroyDir(env_, export_path_)); + delete metadata_; + metadata_ = nullptr; + delete checkpoint; + } + + // Test non default column family with non default comparator + { + auto cf_options = CurrentOptions(); + cf_options.comparator = ReverseBytewiseComparator(); + ASSERT_OK(db_->CreateColumnFamily(cf_options, "yoyo", &cfh_reverse_comp_)); + + const auto key = std::string("foo"); + ASSERT_OK(db_->Put(WriteOptions(), cfh_reverse_comp_, key, "v1")); + + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + + // Export the Tables and verify + ASSERT_OK(checkpoint->ExportColumnFamily(cfh_reverse_comp_, export_path_, + &metadata_)); + verify_files_exported(*metadata_, 1); + ASSERT_EQ(metadata_->db_comparator_name, + ReverseBytewiseComparator()->Name()); + delete checkpoint; + } +} + +TEST_F(CheckpointTest, ExportColumnFamilyNegativeTest) { + // Create a database + auto options = CurrentOptions(); + options.create_if_missing = true; + CreateAndReopenWithCF({}, options); + + const auto key = std::string("foo"); + ASSERT_OK(Put(key, "v1")); + + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + + // Export onto existing directory + ASSERT_OK(env_->CreateDirIfMissing(export_path_)); + ASSERT_EQ(checkpoint->ExportColumnFamily(db_->DefaultColumnFamily(), + export_path_, &metadata_), + Status::InvalidArgument("Specified export_dir exists")); + ASSERT_OK(DestroyDir(env_, export_path_)); + + // Export with invalid directory specification + export_path_ = ""; + ASSERT_EQ(checkpoint->ExportColumnFamily(db_->DefaultColumnFamily(), + export_path_, &metadata_), + Status::InvalidArgument("Specified export_dir invalid")); + delete checkpoint; +} + +TEST_F(CheckpointTest, CheckpointCF) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"one", "two", "three", "four", "five"}, options); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"CheckpointTest::CheckpointCF:2", "DBImpl::GetLiveFiles:2"}, + {"DBImpl::GetLiveFiles:1", "CheckpointTest::CheckpointCF:1"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(0, "Default", "Default")); + ASSERT_OK(Put(1, "one", "one")); + ASSERT_OK(Put(2, "two", "two")); + ASSERT_OK(Put(3, "three", "three")); + 
ASSERT_OK(Put(4, "four", "four")); + ASSERT_OK(Put(5, "five", "five")); + + DB* snapshotDB; + ReadOptions roptions; + std::string result; + std::vector cphandles; + + // Take a snapshot + ROCKSDB_NAMESPACE::port::Thread t([&]() { + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_)); + delete checkpoint; + }); + TEST_SYNC_POINT("CheckpointTest::CheckpointCF:1"); + ASSERT_OK(Put(0, "Default", "Default1")); + ASSERT_OK(Put(1, "one", "eleven")); + ASSERT_OK(Put(2, "two", "twelve")); + ASSERT_OK(Put(3, "three", "thirteen")); + ASSERT_OK(Put(4, "four", "fourteen")); + ASSERT_OK(Put(5, "five", "fifteen")); + TEST_SYNC_POINT("CheckpointTest::CheckpointCF:2"); + t.join(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ASSERT_OK(Put(1, "one", "twentyone")); + ASSERT_OK(Put(2, "two", "twentytwo")); + ASSERT_OK(Put(3, "three", "twentythree")); + ASSERT_OK(Put(4, "four", "twentyfour")); + ASSERT_OK(Put(5, "five", "twentyfive")); + ASSERT_OK(Flush()); + + // Open snapshot and verify contents while DB is running + options.create_if_missing = false; + std::vector cfs; + cfs = {kDefaultColumnFamilyName, "one", "two", "three", "four", "five"}; + std::vector column_families; + for (size_t i = 0; i < cfs.size(); ++i) { + column_families.push_back(ColumnFamilyDescriptor(cfs[i], options)); + } + ASSERT_OK(DB::Open(options, snapshot_name_, column_families, &cphandles, + &snapshotDB)); + ASSERT_OK(snapshotDB->Get(roptions, cphandles[0], "Default", &result)); + ASSERT_EQ("Default1", result); + ASSERT_OK(snapshotDB->Get(roptions, cphandles[1], "one", &result)); + ASSERT_EQ("eleven", result); + ASSERT_OK(snapshotDB->Get(roptions, cphandles[2], "two", &result)); + for (auto h : cphandles) { + delete h; + } + cphandles.clear(); + delete snapshotDB; + snapshotDB = nullptr; +} + +TEST_F(CheckpointTest, CheckpointCFNoFlush) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"one", "two", "three", "four", "five"}, options); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(0, "Default", "Default")); + ASSERT_OK(Put(1, "one", "one")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(2, "two", "two")); + + DB* snapshotDB; + ReadOptions roptions; + std::string result; + std::vector cphandles; + + // Take a snapshot + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCallFlush:start", [&](void* /*arg*/) { + // Flush should never trigger. 
+ FAIL(); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_, 1000000)); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + delete checkpoint; + ASSERT_OK(Put(1, "one", "two")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(2, "two", "twentytwo")); + Close(); + EXPECT_OK(DestroyDB(dbname_, options)); + + // Open snapshot and verify contents while DB is running + options.create_if_missing = false; + std::vector cfs; + cfs = {kDefaultColumnFamilyName, "one", "two", "three", "four", "five"}; + std::vector column_families; + for (size_t i = 0; i < cfs.size(); ++i) { + column_families.push_back(ColumnFamilyDescriptor(cfs[i], options)); + } + ASSERT_OK(DB::Open(options, snapshot_name_, column_families, &cphandles, + &snapshotDB)); + ASSERT_OK(snapshotDB->Get(roptions, cphandles[0], "Default", &result)); + ASSERT_EQ("Default", result); + ASSERT_OK(snapshotDB->Get(roptions, cphandles[1], "one", &result)); + ASSERT_EQ("one", result); + ASSERT_OK(snapshotDB->Get(roptions, cphandles[2], "two", &result)); + ASSERT_EQ("two", result); + for (auto h : cphandles) { + delete h; + } + cphandles.clear(); + delete snapshotDB; + snapshotDB = nullptr; +} + +TEST_F(CheckpointTest, CurrentFileModifiedWhileCheckpointing) { + Options options = CurrentOptions(); + options.max_manifest_file_size = 0; // always rollover manifest for file add + Reopen(options); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {// Get past the flush in the checkpoint thread before adding any keys to + // the db so the checkpoint thread won't hit the WriteManifest + // syncpoints. + {"CheckpointImpl::CreateCheckpoint:FlushDone", + "CheckpointTest::CurrentFileModifiedWhileCheckpointing:PrePut"}, + // Roll the manifest during checkpointing right after live files are + // snapshotted. + {"CheckpointImpl::CreateCheckpoint:SavedLiveFiles1", + "VersionSet::LogAndApply:WriteManifest"}, + {"VersionSet::LogAndApply:WriteManifestDone", + "CheckpointImpl::CreateCheckpoint:SavedLiveFiles2"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::port::Thread t([&]() { + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_)); + delete checkpoint; + }); + TEST_SYNC_POINT( + "CheckpointTest::CurrentFileModifiedWhileCheckpointing:PrePut"); + ASSERT_OK(Put("Default", "Default1")); + ASSERT_OK(Flush()); + t.join(); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + DB* snapshotDB; + // Successful Open() implies that CURRENT pointed to the manifest in the + // checkpoint. 
+ ASSERT_OK(DB::Open(options, snapshot_name_, &snapshotDB)); + delete snapshotDB; + snapshotDB = nullptr; +} + +TEST_F(CheckpointTest, CurrentFileModifiedWhileCheckpointing2PC) { + Close(); + const std::string dbname = test::PerThreadDBPath("transaction_testdb"); + ASSERT_OK(DestroyDB(dbname, CurrentOptions())); + test::DeleteDir(env_, dbname); + + Options options = CurrentOptions(); + options.allow_2pc = true; + // allow_2pc is implicitly set with tx prepare + // options.allow_2pc = true; + TransactionDBOptions txn_db_options; + TransactionDB* txdb; + Status s = TransactionDB::Open(options, txn_db_options, dbname, &txdb); + ASSERT_OK(s); + ColumnFamilyHandle* cfa; + ColumnFamilyHandle* cfb; + ColumnFamilyOptions cf_options; + ASSERT_OK(txdb->CreateColumnFamily(cf_options, "CFA", &cfa)); + + WriteOptions write_options; + // Insert something into CFB so lots of log files will be kept + // before creating the checkpoint. + ASSERT_OK(txdb->CreateColumnFamily(cf_options, "CFB", &cfb)); + ASSERT_OK(txdb->Put(write_options, cfb, "", "")); + + ReadOptions read_options; + std::string value; + TransactionOptions txn_options; + Transaction* txn = txdb->BeginTransaction(write_options, txn_options); + s = txn->SetName("xid"); + ASSERT_OK(s); + ASSERT_EQ(txdb->GetTransactionByName("xid"), txn); + + s = txn->Put(Slice("foo"), Slice("bar")); + ASSERT_OK(s); + s = txn->Put(cfa, Slice("foocfa"), Slice("barcfa")); + ASSERT_OK(s); + // Writing prepare into middle of first WAL, then flush WALs many times + for (int i = 1; i <= 100000; i++) { + Transaction* tx = txdb->BeginTransaction(write_options, txn_options); + ASSERT_OK(tx->SetName("x")); + ASSERT_OK(tx->Put(Slice(std::to_string(i)), Slice("val"))); + ASSERT_OK(tx->Put(cfa, Slice("aaa"), Slice("111"))); + ASSERT_OK(tx->Prepare()); + ASSERT_OK(tx->Commit()); + if (i % 10000 == 0) { + ASSERT_OK(txdb->Flush(FlushOptions())); + } + if (i == 88888) { + ASSERT_OK(txn->Prepare()); + } + delete tx; + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"CheckpointImpl::CreateCheckpoint:SavedLiveFiles1", + "CheckpointTest::CurrentFileModifiedWhileCheckpointing2PC:PreCommit"}, + {"CheckpointTest::CurrentFileModifiedWhileCheckpointing2PC:PostCommit", + "CheckpointImpl::CreateCheckpoint:SavedLiveFiles2"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ROCKSDB_NAMESPACE::port::Thread t([&]() { + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(txdb, &checkpoint)); + ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_)); + delete checkpoint; + }); + TEST_SYNC_POINT( + "CheckpointTest::CurrentFileModifiedWhileCheckpointing2PC:PreCommit"); + ASSERT_OK(txn->Commit()); + delete txn; + TEST_SYNC_POINT( + "CheckpointTest::CurrentFileModifiedWhileCheckpointing2PC:PostCommit"); + t.join(); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + // No more than two logs files should exist. + std::vector files; + ASSERT_OK(env_->GetChildren(snapshot_name_, &files)); + int num_log_files = 0; + for (auto& file : files) { + uint64_t num; + FileType type; + WalFileType log_type; + if (ParseFileName(file, &num, &type, &log_type) && type == kWalFile) { + num_log_files++; + } + } + // One flush after preapare + one outstanding file before checkpoint + one log + // file generated after checkpoint. 
+ ASSERT_LE(num_log_files, 3); + + TransactionDB* snapshotDB; + std::vector column_families; + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions())); + column_families.push_back( + ColumnFamilyDescriptor("CFA", ColumnFamilyOptions())); + column_families.push_back( + ColumnFamilyDescriptor("CFB", ColumnFamilyOptions())); + std::vector cf_handles; + ASSERT_OK(TransactionDB::Open(options, txn_db_options, snapshot_name_, + column_families, &cf_handles, &snapshotDB)); + ASSERT_OK(snapshotDB->Get(read_options, "foo", &value)); + ASSERT_EQ(value, "bar"); + ASSERT_OK(snapshotDB->Get(read_options, cf_handles[1], "foocfa", &value)); + ASSERT_EQ(value, "barcfa"); + + delete cfa; + delete cfb; + delete cf_handles[0]; + delete cf_handles[1]; + delete cf_handles[2]; + delete snapshotDB; + snapshotDB = nullptr; + delete txdb; +} + +TEST_F(CheckpointTest, CheckpointInvalidDirectoryName) { + for (std::string checkpoint_dir : {"", "/", "////"}) { + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_TRUE( + checkpoint->CreateCheckpoint(checkpoint_dir).IsInvalidArgument()); + delete checkpoint; + } +} + +TEST_F(CheckpointTest, CheckpointWithParallelWrites) { + // When run with TSAN, this exposes the data race fixed in + // https://github.com/facebook/rocksdb/pull/3603 + ASSERT_OK(Put("key1", "val1")); + port::Thread thread([this]() { ASSERT_OK(Put("key2", "val2")); }); + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_)); + delete checkpoint; + thread.join(); +} + +TEST_F(CheckpointTest, CheckpointWithUnsyncedDataDropped) { + Options options = CurrentOptions(); + std::unique_ptr env(new FaultInjectionTestEnv(env_)); + options.env = env.get(); + Reopen(options); + ASSERT_OK(Put("key1", "val1")); + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_)); + delete checkpoint; + ASSERT_OK(env->DropUnsyncedFileData()); + + // make sure it's openable even though whatever data that wasn't synced got + // dropped. + options.env = env_; + DB* snapshot_db; + ASSERT_OK(DB::Open(options, snapshot_name_, &snapshot_db)); + ReadOptions read_opts; + std::string get_result; + ASSERT_OK(snapshot_db->Get(read_opts, "key1", &get_result)); + ASSERT_EQ("val1", get_result); + delete snapshot_db; + delete db_; + db_ = nullptr; +} + +TEST_F(CheckpointTest, CheckpointOptionsFileFailedToPersist) { + // Regression test for a bug where checkpoint failed on a DB where persisting + // OPTIONS file failed and the DB was opened with + // `fail_if_options_file_error == false`. + Options options = CurrentOptions(); + options.fail_if_options_file_error = false; + auto fault_fs = std::make_shared(FileSystem::Default()); + + // Setup `FaultInjectionTestFS` and `SyncPoint` callbacks to fail one + // operation when inside the OPTIONS file persisting code. 
+ std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + fault_fs->SetRandomMetadataWriteError(1 /* one_in */); + SyncPoint::GetInstance()->SetCallBack( + "PersistRocksDBOptions:start", [fault_fs](void* /* arg */) { + fault_fs->EnableMetadataWriteErrorInjection(); + }); + SyncPoint::GetInstance()->SetCallBack( + "FaultInjectionTestFS::InjectMetadataWriteError:Injected", + [fault_fs](void* /* arg */) { + fault_fs->DisableMetadataWriteErrorInjection(); + }); + options.env = fault_fs_env.get(); + SyncPoint::GetInstance()->EnableProcessing(); + + Reopen(options); + ASSERT_OK(Put("key1", "val1")); + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_)); + delete checkpoint; + + // Make sure it's usable. + options.env = env_; + DB* snapshot_db; + ASSERT_OK(DB::Open(options, snapshot_name_, &snapshot_db)); + ReadOptions read_opts; + std::string get_result; + ASSERT_OK(snapshot_db->Get(read_opts, "key1", &get_result)); + ASSERT_EQ("val1", get_result); + delete snapshot_db; + delete db_; + db_ = nullptr; +} + +TEST_F(CheckpointTest, CheckpointReadOnlyDB) { + ASSERT_OK(Put("foo", "foo_value")); + ASSERT_OK(Flush()); + Close(); + Options options = CurrentOptions(); + ASSERT_OK(ReadOnlyReopen(options)); + Checkpoint* checkpoint = nullptr; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_)); + delete checkpoint; + checkpoint = nullptr; + Close(); + DB* snapshot_db = nullptr; + ASSERT_OK(DB::Open(options, snapshot_name_, &snapshot_db)); + ReadOptions read_opts; + std::string get_result; + ASSERT_OK(snapshot_db->Get(read_opts, "foo", &get_result)); + ASSERT_EQ("foo_value", get_result); + delete snapshot_db; +} + +TEST_F(CheckpointTest, CheckpointReadOnlyDBWithMultipleColumnFamilies) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + for (int i = 0; i != 3; ++i) { + ASSERT_OK(Put(i, "foo", "foo_value")); + ASSERT_OK(Flush(i)); + } + Close(); + Status s = ReadOnlyReopenWithColumnFamilies( + {kDefaultColumnFamilyName, "pikachu", "eevee"}, options); + ASSERT_OK(s); + Checkpoint* checkpoint = nullptr; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_)); + delete checkpoint; + checkpoint = nullptr; + Close(); + + std::vector column_families{ + {kDefaultColumnFamilyName, options}, + {"pikachu", options}, + {"eevee", options}}; + DB* snapshot_db = nullptr; + std::vector snapshot_handles; + s = DB::Open(options, snapshot_name_, column_families, &snapshot_handles, + &snapshot_db); + ASSERT_OK(s); + ReadOptions read_opts; + for (int i = 0; i != 3; ++i) { + std::string get_result; + s = snapshot_db->Get(read_opts, snapshot_handles[i], "foo", &get_result); + ASSERT_OK(s); + ASSERT_EQ("foo_value", get_result); + } + + for (auto snapshot_h : snapshot_handles) { + delete snapshot_h; + } + snapshot_handles.clear(); + delete snapshot_db; +} + +TEST_F(CheckpointTest, CheckpointWithDbPath) { + Options options = CurrentOptions(); + options.db_paths.emplace_back(dbname_ + "_2", 0); + Reopen(options); + ASSERT_OK(Put("key1", "val1")); + Flush(); + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + // Currently not supported + ASSERT_TRUE(checkpoint->CreateCheckpoint(snapshot_name_).IsNotSupported()); + delete checkpoint; +} + +TEST_F(CheckpointTest, PutRaceWithCheckpointTrackedWalSync) { + // Repro for a race condition where a user write comes in after the 
checkpoint + // syncs WAL for `track_and_verify_wals_in_manifest` but before the + // corresponding MANIFEST update. With the bug, that scenario resulted in an + // unopenable DB with error "Corruption: Size mismatch: WAL ...". + Options options = CurrentOptions(); + std::unique_ptr fault_env( + new FaultInjectionTestEnv(env_)); + options.env = fault_env.get(); + options.track_and_verify_wals_in_manifest = true; + Reopen(options); + + ASSERT_OK(Put("key1", "val1")); + + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::SyncWAL:BeforeMarkLogsSynced:1", + [this](void* /* arg */) { ASSERT_OK(Put("key2", "val2")); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + std::unique_ptr checkpoint; + { + Checkpoint* checkpoint_ptr; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint_ptr)); + checkpoint.reset(checkpoint_ptr); + } + + ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_)); + + // Ensure callback ran. + ASSERT_EQ("val2", Get("key2")); + + Close(); + + // Simulate full loss of unsynced data. This drops "key2" -> "val2" from the + // DB WAL. + fault_env->DropUnsyncedFileData(); + + // Before the bug fix, reopening the DB would fail because the MANIFEST's + // AddWal entry indicated the WAL should be synced through "key2" -> "val2". + Reopen(options); + + // Need to close before `fault_env` goes out of scope. + Close(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED as Checkpoint is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/utilities/compaction_filters.cc b/src/rocksdb/utilities/compaction_filters.cc new file mode 100644 index 000000000..8763901c3 --- /dev/null +++ b/src/rocksdb/utilities/compaction_filters.cc @@ -0,0 +1,56 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include + +#include "rocksdb/compaction_filter.h" +#include "rocksdb/options.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/options_type.h" +#include "utilities/compaction_filters/layered_compaction_filter_base.h" +#include "utilities/compaction_filters/remove_emptyvalue_compactionfilter.h" + +namespace ROCKSDB_NAMESPACE { +#ifndef ROCKSDB_LITE +static int RegisterBuiltinCompactionFilters(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory( + RemoveEmptyValueCompactionFilter::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* /*guard*/, + std::string* /*errmsg*/) { + return new RemoveEmptyValueCompactionFilter(); + }); + return 1; +} +#endif // ROCKSDB_LITE +Status CompactionFilter::CreateFromString(const ConfigOptions& config_options, + const std::string& value, + const CompactionFilter** result) { +#ifndef ROCKSDB_LITE + static std::once_flag once; + std::call_once(once, [&]() { + RegisterBuiltinCompactionFilters(*(ObjectLibrary::Default().get()), ""); + }); +#endif // ROCKSDB_LITE + CompactionFilter* filter = const_cast(*result); + Status status = LoadStaticObject(config_options, value, + nullptr, &filter); + if (status.ok()) { + *result = const_cast(filter); + } + return status; +} + +Status CompactionFilterFactory::CreateFromString( + const ConfigOptions& config_options, const std::string& value, + std::shared_ptr* result) { + // Currently there are no builtin CompactionFilterFactories. + // If any are introduced, they need to be registered here. + Status status = LoadSharedObject( + config_options, value, nullptr, result); + return status; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/compaction_filters/layered_compaction_filter_base.h b/src/rocksdb/utilities/compaction_filters/layered_compaction_filter_base.h new file mode 100644 index 000000000..803fa94ae --- /dev/null +++ b/src/rocksdb/utilities/compaction_filters/layered_compaction_filter_base.h @@ -0,0 +1,41 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include + +#include "rocksdb/compaction_filter.h" + +namespace ROCKSDB_NAMESPACE { + +// Abstract base class for building layered compaction filter on top of +// user compaction filter. +// See BlobIndexCompactionFilter or TtlCompactionFilter for a basic usage. 
+class LayeredCompactionFilterBase : public CompactionFilter { + public: + LayeredCompactionFilterBase( + const CompactionFilter* _user_comp_filter, + std::unique_ptr _user_comp_filter_from_factory) + : user_comp_filter_(_user_comp_filter), + user_comp_filter_from_factory_( + std::move(_user_comp_filter_from_factory)) { + if (!user_comp_filter_) { + user_comp_filter_ = user_comp_filter_from_factory_.get(); + } + } + + // Return a pointer to user compaction filter + const CompactionFilter* user_comp_filter() const { return user_comp_filter_; } + + const Customizable* Inner() const override { return user_comp_filter_; } + + protected: + const CompactionFilter* user_comp_filter_; + + private: + std::unique_ptr user_comp_filter_from_factory_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc b/src/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc new file mode 100644 index 000000000..b788dbf9b --- /dev/null +++ b/src/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "utilities/compaction_filters/remove_emptyvalue_compactionfilter.h" + +#include + +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { + +bool RemoveEmptyValueCompactionFilter::Filter(int /*level*/, + const Slice& /*key*/, + const Slice& existing_value, + std::string* /*new_value*/, + bool* /*value_changed*/) const { + // remove kv pairs that have empty values + return existing_value.empty(); +} + +} // namespace ROCKSDB_NAMESPACE +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h b/src/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h new file mode 100644 index 000000000..864ad15ff --- /dev/null +++ b/src/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h @@ -0,0 +1,28 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#pragma once + +#include + +#include "rocksdb/compaction_filter.h" +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { + +class RemoveEmptyValueCompactionFilter : public CompactionFilter { + public: + static const char* kClassName() { return "RemoveEmptyValueCompactionFilter"; } + + const char* Name() const override { return kClassName(); } + + bool Filter(int level, const Slice& key, const Slice& existing_value, + std::string* new_value, bool* value_changed) const override; +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/utilities/convenience/info_log_finder.cc b/src/rocksdb/utilities/convenience/info_log_finder.cc new file mode 100644 index 000000000..fe62fd561 --- /dev/null +++ b/src/rocksdb/utilities/convenience/info_log_finder.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 Facebook. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "rocksdb/utilities/info_log_finder.h" + +#include "file/filename.h" +#include "rocksdb/env.h" + +namespace ROCKSDB_NAMESPACE { + +Status GetInfoLogList(DB* db, std::vector* info_log_list) { + if (!db) { + return Status::InvalidArgument("DB pointer is not valid"); + } + std::string parent_path; + const Options& options = db->GetOptions(); + return GetInfoLogFiles(options.env->GetFileSystem(), options.db_log_dir, + db->GetName(), &parent_path, info_log_list); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/counted_fs.cc b/src/rocksdb/utilities/counted_fs.cc new file mode 100644 index 000000000..e43f3a191 --- /dev/null +++ b/src/rocksdb/utilities/counted_fs.cc @@ -0,0 +1,379 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "utilities/counted_fs.h" + +#include + +#include "rocksdb/file_system.h" +#include "rocksdb/utilities/options_type.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +class CountedSequentialFile : public FSSequentialFileOwnerWrapper { + private: + CountedFileSystem* fs_; + + public: + CountedSequentialFile(std::unique_ptr&& f, + CountedFileSystem* fs) + : FSSequentialFileOwnerWrapper(std::move(f)), fs_(fs) {} + + ~CountedSequentialFile() override { fs_->counters()->closes++; } + + IOStatus Read(size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) override { + IOStatus rv = target()->Read(n, options, result, scratch, dbg); + fs_->counters()->reads.RecordOp(rv, result->size()); + return rv; + } + + IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) override { + IOStatus rv = + target()->PositionedRead(offset, n, options, result, scratch, dbg); + fs_->counters()->reads.RecordOp(rv, result->size()); + return rv; + } +}; + +class CountedRandomAccessFile : public FSRandomAccessFileOwnerWrapper { + private: + CountedFileSystem* fs_; + + public: + CountedRandomAccessFile(std::unique_ptr&& f, + CountedFileSystem* fs) + : FSRandomAccessFileOwnerWrapper(std::move(f)), fs_(fs) {} + + ~CountedRandomAccessFile() override { fs_->counters()->closes++; } + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override { + IOStatus rv = target()->Read(offset, n, options, result, scratch, dbg); + fs_->counters()->reads.RecordOp(rv, result->size()); + return rv; + } + + IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs, + const IOOptions& options, IODebugContext* dbg) override { + IOStatus rv = target()->MultiRead(reqs, num_reqs, options, dbg); + for (size_t r = 0; r < num_reqs; r++) { + fs_->counters()->reads.RecordOp(reqs[r].status, reqs[r].result.size()); + } + return rv; + } +}; + +class CountedWritableFile : public FSWritableFileOwnerWrapper { + private: + CountedFileSystem* fs_; + + public: + CountedWritableFile(std::unique_ptr&& f, + CountedFileSystem* fs) + : FSWritableFileOwnerWrapper(std::move(f)), 
fs_(fs) {} + + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override { + IOStatus rv = target()->Append(data, options, dbg); + fs_->counters()->writes.RecordOp(rv, data.size()); + return rv; + } + + IOStatus Append(const Slice& data, const IOOptions& options, + const DataVerificationInfo& info, + IODebugContext* dbg) override { + IOStatus rv = target()->Append(data, options, info, dbg); + fs_->counters()->writes.RecordOp(rv, data.size()); + return rv; + } + + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + IODebugContext* dbg) override { + IOStatus rv = target()->PositionedAppend(data, offset, options, dbg); + fs_->counters()->writes.RecordOp(rv, data.size()); + return rv; + } + + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + const DataVerificationInfo& info, + IODebugContext* dbg) override { + IOStatus rv = target()->PositionedAppend(data, offset, options, info, dbg); + fs_->counters()->writes.RecordOp(rv, data.size()); + return rv; + } + + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override { + IOStatus rv = target()->Close(options, dbg); + if (rv.ok()) { + fs_->counters()->closes++; + } + return rv; + } + + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override { + IOStatus rv = target()->Flush(options, dbg); + if (rv.ok()) { + fs_->counters()->flushes++; + } + return rv; + } + + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override { + IOStatus rv = target()->Sync(options, dbg); + if (rv.ok()) { + fs_->counters()->syncs++; + } + return rv; + } + + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override { + IOStatus rv = target()->Fsync(options, dbg); + if (rv.ok()) { + fs_->counters()->fsyncs++; + } + return rv; + } + + IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& options, + IODebugContext* dbg) override { + IOStatus rv = target()->RangeSync(offset, nbytes, options, dbg); + if (rv.ok()) { + fs_->counters()->syncs++; + } + return rv; + } +}; + +class CountedRandomRWFile : public FSRandomRWFileOwnerWrapper { + private: + mutable CountedFileSystem* fs_; + + public: + CountedRandomRWFile(std::unique_ptr&& f, + CountedFileSystem* fs) + : FSRandomRWFileOwnerWrapper(std::move(f)), fs_(fs) {} + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options, + IODebugContext* dbg) override { + IOStatus rv = target()->Write(offset, data, options, dbg); + fs_->counters()->writes.RecordOp(rv, data.size()); + return rv; + } + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override { + IOStatus rv = target()->Read(offset, n, options, result, scratch, dbg); + fs_->counters()->reads.RecordOp(rv, result->size()); + return rv; + } + + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override { + IOStatus rv = target()->Flush(options, dbg); + if (rv.ok()) { + fs_->counters()->flushes++; + } + return rv; + } + + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override { + IOStatus rv = target()->Sync(options, dbg); + if (rv.ok()) { + fs_->counters()->syncs++; + } + return rv; + } + + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override { + IOStatus rv = target()->Fsync(options, dbg); + if (rv.ok()) { + fs_->counters()->fsyncs++; + } + return rv; + } + + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override { + IOStatus rv = 
target()->Close(options, dbg); + if (rv.ok()) { + fs_->counters()->closes++; + } + return rv; + } +}; + +class CountedDirectory : public FSDirectoryWrapper { + private: + mutable CountedFileSystem* fs_; + bool closed_ = false; + + public: + CountedDirectory(std::unique_ptr&& f, CountedFileSystem* fs) + : FSDirectoryWrapper(std::move(f)), fs_(fs) {} + + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override { + IOStatus rv = FSDirectoryWrapper::Fsync(options, dbg); + if (rv.ok()) { + fs_->counters()->dsyncs++; + } + return rv; + } + + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override { + IOStatus rv = FSDirectoryWrapper::Close(options, dbg); + if (rv.ok()) { + fs_->counters()->closes++; + fs_->counters()->dir_closes++; + closed_ = true; + } + return rv; + } + + IOStatus FsyncWithDirOptions(const IOOptions& options, IODebugContext* dbg, + const DirFsyncOptions& dir_options) override { + IOStatus rv = + FSDirectoryWrapper::FsyncWithDirOptions(options, dbg, dir_options); + if (rv.ok()) { + fs_->counters()->dsyncs++; + } + return rv; + } + + ~CountedDirectory() { + if (!closed_) { + // TODO: fix DB+CF code to use explicit Close, not rely on destructor + fs_->counters()->closes++; + fs_->counters()->dir_closes++; + } + } +}; +} // anonymous namespace + +std::string FileOpCounters::PrintCounters() const { + std::stringstream ss; + ss << "Num files opened: " << opens.load(std::memory_order_relaxed) + << std::endl; + ss << "Num files deleted: " << deletes.load(std::memory_order_relaxed) + << std::endl; + ss << "Num files renamed: " << renames.load(std::memory_order_relaxed) + << std::endl; + ss << "Num Flush(): " << flushes.load(std::memory_order_relaxed) << std::endl; + ss << "Num Sync(): " << syncs.load(std::memory_order_relaxed) << std::endl; + ss << "Num Fsync(): " << fsyncs.load(std::memory_order_relaxed) << std::endl; + ss << "Num Dir Fsync(): " << dsyncs.load(std::memory_order_relaxed) + << std::endl; + ss << "Num Close(): " << closes.load(std::memory_order_relaxed) << std::endl; + ss << "Num Dir Open(): " << dir_opens.load(std::memory_order_relaxed) + << std::endl; + ss << "Num Dir Close(): " << dir_closes.load(std::memory_order_relaxed) + << std::endl; + ss << "Num Read(): " << reads.ops.load(std::memory_order_relaxed) + << std::endl; + ss << "Num Append(): " << writes.ops.load(std::memory_order_relaxed) + << std::endl; + ss << "Num bytes read: " << reads.bytes.load(std::memory_order_relaxed) + << std::endl; + ss << "Num bytes written: " << writes.bytes.load(std::memory_order_relaxed) + << std::endl; + return ss.str(); +} + +CountedFileSystem::CountedFileSystem(const std::shared_ptr& base) + : FileSystemWrapper(base) {} + +IOStatus CountedFileSystem::NewSequentialFile( + const std::string& f, const FileOptions& options, + std::unique_ptr* r, IODebugContext* dbg) { + std::unique_ptr base; + IOStatus s = target()->NewSequentialFile(f, options, &base, dbg); + if (s.ok()) { + counters_.opens++; + r->reset(new CountedSequentialFile(std::move(base), this)); + } + return s; +} + +IOStatus CountedFileSystem::NewRandomAccessFile( + const std::string& f, const FileOptions& options, + std::unique_ptr* r, IODebugContext* dbg) { + std::unique_ptr base; + IOStatus s = target()->NewRandomAccessFile(f, options, &base, dbg); + if (s.ok()) { + counters_.opens++; + r->reset(new CountedRandomAccessFile(std::move(base), this)); + } + return s; +} + +IOStatus CountedFileSystem::NewWritableFile(const std::string& f, + const FileOptions& options, + std::unique_ptr* r, + 
IODebugContext* dbg) { + std::unique_ptr base; + IOStatus s = target()->NewWritableFile(f, options, &base, dbg); + if (s.ok()) { + counters_.opens++; + r->reset(new CountedWritableFile(std::move(base), this)); + } + return s; +} + +IOStatus CountedFileSystem::ReopenWritableFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + std::unique_ptr base; + IOStatus s = target()->ReopenWritableFile(fname, options, &base, dbg); + if (s.ok()) { + counters_.opens++; + result->reset(new CountedWritableFile(std::move(base), this)); + } + return s; +} + +IOStatus CountedFileSystem::ReuseWritableFile( + const std::string& fname, const std::string& old_fname, + const FileOptions& options, std::unique_ptr* result, + IODebugContext* dbg) { + std::unique_ptr base; + IOStatus s = + target()->ReuseWritableFile(fname, old_fname, options, &base, dbg); + if (s.ok()) { + counters_.opens++; + result->reset(new CountedWritableFile(std::move(base), this)); + } + return s; +} + +IOStatus CountedFileSystem::NewRandomRWFile( + const std::string& name, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + std::unique_ptr base; + IOStatus s = target()->NewRandomRWFile(name, options, &base, dbg); + if (s.ok()) { + counters_.opens++; + result->reset(new CountedRandomRWFile(std::move(base), this)); + } + return s; +} + +IOStatus CountedFileSystem::NewDirectory(const std::string& name, + const IOOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) { + std::unique_ptr base; + IOStatus s = target()->NewDirectory(name, options, &base, dbg); + if (s.ok()) { + counters_.opens++; + counters_.dir_opens++; + result->reset(new CountedDirectory(std::move(base), this)); + } + return s; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/counted_fs.h b/src/rocksdb/utilities/counted_fs.h new file mode 100644 index 000000000..cb8a8968f --- /dev/null +++ b/src/rocksdb/utilities/counted_fs.h @@ -0,0 +1,158 @@ +// Copyright (c) 2016-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
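The wrappers above funnel every open, read, write, sync and close through a shared FileOpCounters instance (declared in the header that follows). A hedged usage sketch, assuming the in-tree include path and an illustrative helper name; the exact workload run against the wrapped filesystem is up to the caller:

// Hypothetical sketch: wrap the default FileSystem and report I/O counts.
#include <iostream>
#include "rocksdb/file_system.h"
#include "utilities/counted_fs.h"  // in-tree (non-public) header from this patch

void ReportFileOps() {
  using namespace ROCKSDB_NAMESPACE;
  auto counted = std::make_shared<CountedFileSystem>(FileSystem::Default());
  // ... hand `counted` to an Env/DB and do some work ...
  std::cout << counted->PrintCounters();  // opens, closes, reads, writes, ...
  counted->ResetCounters();
}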
+ +#pragma once + +#include +#include + +#include "rocksdb/file_system.h" +#include "rocksdb/io_status.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { +class Logger; + +struct OpCounter { + std::atomic ops; + std::atomic bytes; + + OpCounter() : ops(0), bytes(0) {} + + void Reset() { + ops = 0; + bytes = 0; + } + void RecordOp(const IOStatus& io_s, size_t added_bytes) { + if (!io_s.IsNotSupported()) { + ops.fetch_add(1, std::memory_order_relaxed); + } + if (io_s.ok()) { + bytes.fetch_add(added_bytes, std::memory_order_relaxed); + } + } +}; + +struct FileOpCounters { + static const char* kName() { return "FileOpCounters"; } + + std::atomic opens; + std::atomic closes; + std::atomic deletes; + std::atomic renames; + std::atomic flushes; + std::atomic syncs; + std::atomic dsyncs; + std::atomic fsyncs; + std::atomic dir_opens; + std::atomic dir_closes; + OpCounter reads; + OpCounter writes; + + FileOpCounters() + : opens(0), + closes(0), + deletes(0), + renames(0), + flushes(0), + syncs(0), + dsyncs(0), + fsyncs(0), + dir_opens(0), + dir_closes(0) {} + + void Reset() { + opens = 0; + closes = 0; + deletes = 0; + renames = 0; + flushes = 0; + syncs = 0; + dsyncs = 0; + fsyncs = 0; + dir_opens = 0; + dir_closes = 0; + reads.Reset(); + writes.Reset(); + } + std::string PrintCounters() const; +}; + +// A FileSystem class that counts operations (reads, writes, opens, closes, etc) +class CountedFileSystem : public FileSystemWrapper { + public: + private: + FileOpCounters counters_; + + public: + explicit CountedFileSystem(const std::shared_ptr& base); + static const char* kClassName() { return "CountedFileSystem"; } + const char* Name() const override { return kClassName(); } + + IOStatus NewSequentialFile(const std::string& f, const FileOptions& options, + std::unique_ptr* r, + IODebugContext* dbg) override; + + IOStatus NewRandomAccessFile(const std::string& f, + const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* dbg) override; + + IOStatus NewWritableFile(const std::string& f, const FileOptions& options, + std::unique_ptr* r, + IODebugContext* dbg) override; + IOStatus ReopenWritableFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus NewRandomRWFile(const std::string& name, const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewDirectory(const std::string& name, const IOOptions& io_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus DeleteFile(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override { + IOStatus s = target()->DeleteFile(fname, options, dbg); + if (s.ok()) { + counters_.deletes++; + } + return s; + } + + IOStatus RenameFile(const std::string& s, const std::string& t, + const IOOptions& options, IODebugContext* dbg) override { + IOStatus st = target()->RenameFile(s, t, options, dbg); + if (st.ok()) { + counters_.renames++; + } + return st; + } + + const FileOpCounters* counters() const { return &counters_; } + + FileOpCounters* counters() { return &counters_; } + + const void* GetOptionsPtr(const std::string& name) const override { + if (name == FileOpCounters::kName()) { + return counters(); + } else { + return FileSystemWrapper::GetOptionsPtr(name); + } + } + + // Prints the 
counters to a string + std::string PrintCounters() const { return counters_.PrintCounters(); } + void ResetCounters() { counters_.Reset(); } +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/debug.cc b/src/rocksdb/utilities/debug.cc new file mode 100644 index 000000000..f2c3bb513 --- /dev/null +++ b/src/rocksdb/utilities/debug.cc @@ -0,0 +1,120 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "rocksdb/utilities/debug.h" + +#include "db/db_impl/db_impl.h" +#include "rocksdb/utilities/options_type.h" + +namespace ROCKSDB_NAMESPACE { + +static std::unordered_map value_type_string_map = { + {"TypeDeletion", ValueType::kTypeDeletion}, + {"TypeValue", ValueType::kTypeValue}, + {"TypeMerge", ValueType::kTypeMerge}, + {"TypeLogData", ValueType::kTypeLogData}, + {"TypeColumnFamilyDeletion", ValueType::kTypeColumnFamilyDeletion}, + {"TypeColumnFamilyValue", ValueType::kTypeColumnFamilyValue}, + {"TypeColumnFamilyMerge", ValueType::kTypeColumnFamilyMerge}, + {"TypeSingleDeletion", ValueType::kTypeSingleDeletion}, + {"TypeColumnFamilySingleDeletion", + ValueType::kTypeColumnFamilySingleDeletion}, + {"TypeBeginPrepareXID", ValueType::kTypeBeginPrepareXID}, + {"TypeEndPrepareXID", ValueType::kTypeEndPrepareXID}, + {"TypeCommitXID", ValueType::kTypeCommitXID}, + {"TypeRollbackXID", ValueType::kTypeRollbackXID}, + {"TypeNoop", ValueType::kTypeNoop}, + {"TypeColumnFamilyRangeDeletion", + ValueType::kTypeColumnFamilyRangeDeletion}, + {"TypeRangeDeletion", ValueType::kTypeRangeDeletion}, + {"TypeColumnFamilyBlobIndex", ValueType::kTypeColumnFamilyBlobIndex}, + {"TypeBlobIndex", ValueType::kTypeBlobIndex}, + {"TypeBeginPersistedPrepareXID", ValueType::kTypeBeginPersistedPrepareXID}, + {"TypeBeginUnprepareXID", ValueType::kTypeBeginUnprepareXID}, + {"TypeDeletionWithTimestamp", ValueType::kTypeDeletionWithTimestamp}, + {"TypeCommitXIDAndTimestamp", ValueType::kTypeCommitXIDAndTimestamp}, + {"TypeWideColumnEntity", ValueType::kTypeWideColumnEntity}, + {"TypeColumnFamilyWideColumnEntity", + ValueType::kTypeColumnFamilyWideColumnEntity}}; + +std::string KeyVersion::GetTypeName() const { + std::string type_name; + if (SerializeEnum(value_type_string_map, + static_cast(type), &type_name)) { + return type_name; + } else { + return "Invalid"; + } +} + +Status GetAllKeyVersions(DB* db, Slice begin_key, Slice end_key, + size_t max_num_ikeys, + std::vector* key_versions) { + if (nullptr == db) { + return Status::InvalidArgument("db cannot be null."); + } + return GetAllKeyVersions(db, db->DefaultColumnFamily(), begin_key, end_key, + max_num_ikeys, key_versions); +} + +Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, Slice begin_key, + Slice end_key, size_t max_num_ikeys, + std::vector* key_versions) { + if (nullptr == db) { + return Status::InvalidArgument("db cannot be null."); + } + if (nullptr == cfh) { + return Status::InvalidArgument("Column family handle cannot be null."); + } + if (nullptr == key_versions) { + return Status::InvalidArgument("key_versions cannot be null."); + } + key_versions->clear(); + + DBImpl* idb = static_cast(db->GetRootDB()); + auto icmp = InternalKeyComparator(idb->GetOptions(cfh).comparator); + ReadOptions read_options; + Arena arena; + ScopedArenaIterator iter( + idb->NewInternalIterator(read_options, &arena, 
kMaxSequenceNumber, cfh)); + + if (!begin_key.empty()) { + InternalKey ikey; + ikey.SetMinPossibleForUserKey(begin_key); + iter->Seek(ikey.Encode()); + } else { + iter->SeekToFirst(); + } + + size_t num_keys = 0; + for (; iter->Valid(); iter->Next()) { + ParsedInternalKey ikey; + Status pik_status = + ParseInternalKey(iter->key(), &ikey, true /* log_err_key */); // TODO + if (!pik_status.ok()) { + return pik_status; + } + + if (!end_key.empty() && + icmp.user_comparator()->Compare(ikey.user_key, end_key) > 0) { + break; + } + + key_versions->emplace_back(ikey.user_key.ToString() /* _user_key */, + iter->value().ToString() /* _value */, + ikey.sequence /* _sequence */, + static_cast(ikey.type) /* _type */); + if (++num_keys >= max_num_ikeys) { + break; + } + } + return Status::OK(); +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/env_mirror.cc b/src/rocksdb/utilities/env_mirror.cc new file mode 100644 index 000000000..3ea323b42 --- /dev/null +++ b/src/rocksdb/utilities/env_mirror.cc @@ -0,0 +1,275 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2015, Red Hat, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef ROCKSDB_LITE + +#include "rocksdb/utilities/env_mirror.h" + +namespace ROCKSDB_NAMESPACE { + +// An implementation of Env that mirrors all work over two backend +// Env's. This is useful for debugging purposes. 
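As the comment says, EnvMirror issues every operation against two backend Envs and asserts that their results agree. A minimal sketch of how it might be wired into a database (the function name and path are illustrative; the two-argument EnvMirror constructor matches the test further below); the per-file mirror classes follow right after this sketch:

// Hypothetical setup: mirror all filesystem work of a DB over envs `a` and `b`.
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/utilities/env_mirror.h"

ROCKSDB_NAMESPACE::Status OpenMirroredDB(ROCKSDB_NAMESPACE::Env* a,
                                         ROCKSDB_NAMESPACE::Env* b,
                                         ROCKSDB_NAMESPACE::DB** db) {
  auto* mirror = new ROCKSDB_NAMESPACE::EnvMirror(a, b);
  ROCKSDB_NAMESPACE::Options options;
  options.create_if_missing = true;
  options.env = mirror;  // every file the DB touches goes to both backends
  return ROCKSDB_NAMESPACE::DB::Open(options, "/tmp/mirror_db", db);
}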
+class SequentialFileMirror : public SequentialFile { + public: + std::unique_ptr a_, b_; + std::string fname; + explicit SequentialFileMirror(std::string f) : fname(f) {} + + Status Read(size_t n, Slice* result, char* scratch) override { + Slice aslice; + Status as = a_->Read(n, &aslice, scratch); + if (as == Status::OK()) { + char* bscratch = new char[n]; + Slice bslice; +#ifndef NDEBUG + size_t off = 0; +#endif + size_t left = aslice.size(); + while (left) { + Status bs = b_->Read(left, &bslice, bscratch); +#ifndef NDEBUG + assert(as == bs); + assert(memcmp(bscratch, scratch + off, bslice.size()) == 0); + off += bslice.size(); +#endif + left -= bslice.size(); + } + delete[] bscratch; + *result = aslice; + } else { + Status bs = b_->Read(n, result, scratch); + assert(as == bs); + } + return as; + } + + Status Skip(uint64_t n) override { + Status as = a_->Skip(n); + Status bs = b_->Skip(n); + assert(as == bs); + return as; + } + Status InvalidateCache(size_t offset, size_t length) override { + Status as = a_->InvalidateCache(offset, length); + Status bs = b_->InvalidateCache(offset, length); + assert(as == bs); + return as; + }; +}; + +class RandomAccessFileMirror : public RandomAccessFile { + public: + std::unique_ptr a_, b_; + std::string fname; + explicit RandomAccessFileMirror(std::string f) : fname(f) {} + + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + Status as = a_->Read(offset, n, result, scratch); + if (as == Status::OK()) { + char* bscratch = new char[n]; + Slice bslice; + size_t off = 0; + size_t left = result->size(); + while (left) { + Status bs = b_->Read(offset + off, left, &bslice, bscratch); + assert(as == bs); + assert(memcmp(bscratch, scratch + off, bslice.size()) == 0); + off += bslice.size(); + left -= bslice.size(); + } + delete[] bscratch; + } else { + Status bs = b_->Read(offset, n, result, scratch); + assert(as == bs); + } + return as; + } + + size_t GetUniqueId(char* id, size_t max_size) const override { + // NOTE: not verified + return a_->GetUniqueId(id, max_size); + } +}; + +class WritableFileMirror : public WritableFile { + public: + std::unique_ptr a_, b_; + std::string fname; + explicit WritableFileMirror(std::string f, const EnvOptions& options) + : WritableFile(options), fname(f) {} + + Status Append(const Slice& data) override { + Status as = a_->Append(data); + Status bs = b_->Append(data); + assert(as == bs); + return as; + } + Status Append(const Slice& data, + const DataVerificationInfo& /* verification_info */) override { + return Append(data); + } + Status PositionedAppend(const Slice& data, uint64_t offset) override { + Status as = a_->PositionedAppend(data, offset); + Status bs = b_->PositionedAppend(data, offset); + assert(as == bs); + return as; + } + Status PositionedAppend( + const Slice& data, uint64_t offset, + const DataVerificationInfo& /* verification_info */) override { + return PositionedAppend(data, offset); + } + Status Truncate(uint64_t size) override { + Status as = a_->Truncate(size); + Status bs = b_->Truncate(size); + assert(as == bs); + return as; + } + Status Close() override { + Status as = a_->Close(); + Status bs = b_->Close(); + assert(as == bs); + return as; + } + Status Flush() override { + Status as = a_->Flush(); + Status bs = b_->Flush(); + assert(as == bs); + return as; + } + Status Sync() override { + Status as = a_->Sync(); + Status bs = b_->Sync(); + assert(as == bs); + return as; + } + Status Fsync() override { + Status as = a_->Fsync(); + Status bs = b_->Fsync(); + 
assert(as == bs); + return as; + } + bool IsSyncThreadSafe() const override { + bool as = a_->IsSyncThreadSafe(); + assert(as == b_->IsSyncThreadSafe()); + return as; + } + void SetIOPriority(Env::IOPriority pri) override { + a_->SetIOPriority(pri); + b_->SetIOPriority(pri); + } + Env::IOPriority GetIOPriority() override { + // NOTE: we don't verify this one + return a_->GetIOPriority(); + } + uint64_t GetFileSize() override { + uint64_t as = a_->GetFileSize(); + assert(as == b_->GetFileSize()); + return as; + } + void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) override { + // NOTE: we don't verify this one + return a_->GetPreallocationStatus(block_size, last_allocated_block); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + // NOTE: we don't verify this one + return a_->GetUniqueId(id, max_size); + } + Status InvalidateCache(size_t offset, size_t length) override { + Status as = a_->InvalidateCache(offset, length); + Status bs = b_->InvalidateCache(offset, length); + assert(as == bs); + return as; + } + + protected: + Status Allocate(uint64_t offset, uint64_t length) override { + Status as = a_->Allocate(offset, length); + Status bs = b_->Allocate(offset, length); + assert(as == bs); + return as; + } + Status RangeSync(uint64_t offset, uint64_t nbytes) override { + Status as = a_->RangeSync(offset, nbytes); + Status bs = b_->RangeSync(offset, nbytes); + assert(as == bs); + return as; + } +}; + +Status EnvMirror::NewSequentialFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& options) { + if (f.find("/proc/") == 0) { + return a_->NewSequentialFile(f, r, options); + } + SequentialFileMirror* mf = new SequentialFileMirror(f); + Status as = a_->NewSequentialFile(f, &mf->a_, options); + Status bs = b_->NewSequentialFile(f, &mf->b_, options); + assert(as == bs); + if (as.ok()) + r->reset(mf); + else + delete mf; + return as; +} + +Status EnvMirror::NewRandomAccessFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& options) { + if (f.find("/proc/") == 0) { + return a_->NewRandomAccessFile(f, r, options); + } + RandomAccessFileMirror* mf = new RandomAccessFileMirror(f); + Status as = a_->NewRandomAccessFile(f, &mf->a_, options); + Status bs = b_->NewRandomAccessFile(f, &mf->b_, options); + assert(as == bs); + if (as.ok()) + r->reset(mf); + else + delete mf; + return as; +} + +Status EnvMirror::NewWritableFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& options) { + if (f.find("/proc/") == 0) return a_->NewWritableFile(f, r, options); + WritableFileMirror* mf = new WritableFileMirror(f, options); + Status as = a_->NewWritableFile(f, &mf->a_, options); + Status bs = b_->NewWritableFile(f, &mf->b_, options); + assert(as == bs); + if (as.ok()) + r->reset(mf); + else + delete mf; + return as; +} + +Status EnvMirror::ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + std::unique_ptr* r, + const EnvOptions& options) { + if (fname.find("/proc/") == 0) + return a_->ReuseWritableFile(fname, old_fname, r, options); + WritableFileMirror* mf = new WritableFileMirror(fname, options); + Status as = a_->ReuseWritableFile(fname, old_fname, &mf->a_, options); + Status bs = b_->ReuseWritableFile(fname, old_fname, &mf->b_, options); + assert(as == bs); + if (as.ok()) + r->reset(mf); + else + delete mf; + return as; +} + +} // namespace ROCKSDB_NAMESPACE +#endif diff --git a/src/rocksdb/utilities/env_mirror_test.cc b/src/rocksdb/utilities/env_mirror_test.cc new file mode 100644 index 
000000000..c372de1da --- /dev/null +++ b/src/rocksdb/utilities/env_mirror_test.cc @@ -0,0 +1,226 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2015, Red Hat, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "rocksdb/utilities/env_mirror.h" + +#include "env/mock_env.h" +#include "test_util/testharness.h" + +namespace ROCKSDB_NAMESPACE { + +class EnvMirrorTest : public testing::Test { + public: + Env* default_; + MockEnv *a_, *b_; + EnvMirror* env_; + const EnvOptions soptions_; + + EnvMirrorTest() + : default_(Env::Default()), + a_(new MockEnv(default_)), + b_(new MockEnv(default_)), + env_(new EnvMirror(a_, b_)) {} + ~EnvMirrorTest() { + delete env_; + delete a_; + delete b_; + } +}; + +TEST_F(EnvMirrorTest, Basics) { + uint64_t file_size; + std::unique_ptr writable_file; + std::vector children; + + ASSERT_OK(env_->CreateDir("/dir")); + + // Check that the directory is empty. + ASSERT_EQ(Status::NotFound(), env_->FileExists("/dir/non_existent")); + ASSERT_TRUE(!env_->GetFileSize("/dir/non_existent", &file_size).ok()); + ASSERT_OK(env_->GetChildren("/dir", &children)); + ASSERT_EQ(0U, children.size()); + + // Create a file. + ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_)); + writable_file.reset(); + + // Check that the file exists. + ASSERT_OK(env_->FileExists("/dir/f")); + ASSERT_OK(a_->FileExists("/dir/f")); + ASSERT_OK(b_->FileExists("/dir/f")); + ASSERT_OK(env_->GetFileSize("/dir/f", &file_size)); + ASSERT_EQ(0U, file_size); + ASSERT_OK(env_->GetChildren("/dir", &children)); + ASSERT_EQ(1U, children.size()); + ASSERT_EQ("f", children[0]); + ASSERT_OK(a_->GetChildren("/dir", &children)); + ASSERT_EQ(1U, children.size()); + ASSERT_EQ("f", children[0]); + ASSERT_OK(b_->GetChildren("/dir", &children)); + ASSERT_EQ(1U, children.size()); + ASSERT_EQ("f", children[0]); + + // Write to the file. + ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_)); + ASSERT_OK(writable_file->Append("abc")); + writable_file.reset(); + + // Check for expected size. + ASSERT_OK(env_->GetFileSize("/dir/f", &file_size)); + ASSERT_EQ(3U, file_size); + ASSERT_OK(a_->GetFileSize("/dir/f", &file_size)); + ASSERT_EQ(3U, file_size); + ASSERT_OK(b_->GetFileSize("/dir/f", &file_size)); + ASSERT_EQ(3U, file_size); + + // Check that renaming works. + ASSERT_TRUE(!env_->RenameFile("/dir/non_existent", "/dir/g").ok()); + ASSERT_OK(env_->RenameFile("/dir/f", "/dir/g")); + ASSERT_EQ(Status::NotFound(), env_->FileExists("/dir/f")); + ASSERT_OK(env_->FileExists("/dir/g")); + ASSERT_OK(env_->GetFileSize("/dir/g", &file_size)); + ASSERT_EQ(3U, file_size); + ASSERT_OK(a_->FileExists("/dir/g")); + ASSERT_OK(a_->GetFileSize("/dir/g", &file_size)); + ASSERT_EQ(3U, file_size); + ASSERT_OK(b_->FileExists("/dir/g")); + ASSERT_OK(b_->GetFileSize("/dir/g", &file_size)); + ASSERT_EQ(3U, file_size); + + // Check that opening non-existent file fails. + std::unique_ptr seq_file; + std::unique_ptr rand_file; + ASSERT_TRUE( + !env_->NewSequentialFile("/dir/non_existent", &seq_file, soptions_).ok()); + ASSERT_TRUE(!seq_file); + ASSERT_TRUE( + !env_->NewRandomAccessFile("/dir/non_existent", &rand_file, soptions_) + .ok()); + ASSERT_TRUE(!rand_file); + + // Check that deleting works. 
+ ASSERT_TRUE(!env_->DeleteFile("/dir/non_existent").ok()); + ASSERT_OK(env_->DeleteFile("/dir/g")); + ASSERT_EQ(Status::NotFound(), env_->FileExists("/dir/g")); + ASSERT_OK(env_->GetChildren("/dir", &children)); + ASSERT_EQ(0U, children.size()); + ASSERT_OK(env_->DeleteDir("/dir")); +} + +TEST_F(EnvMirrorTest, ReadWrite) { + std::unique_ptr writable_file; + std::unique_ptr seq_file; + std::unique_ptr rand_file; + Slice result; + char scratch[100]; + + ASSERT_OK(env_->CreateDir("/dir")); + + ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_)); + ASSERT_OK(writable_file->Append("hello ")); + ASSERT_OK(writable_file->Append("world")); + writable_file.reset(); + + // Read sequentially. + ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_)); + ASSERT_OK(seq_file->Read(5, &result, scratch)); // Read "hello". + ASSERT_EQ(0, result.compare("hello")); + ASSERT_OK(seq_file->Skip(1)); + ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Read "world". + ASSERT_EQ(0, result.compare("world")); + ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Try reading past EOF. + ASSERT_EQ(0U, result.size()); + ASSERT_OK(seq_file->Skip(100)); // Try to skip past end of file. + ASSERT_OK(seq_file->Read(1000, &result, scratch)); + ASSERT_EQ(0U, result.size()); + + // Random reads. + ASSERT_OK(env_->NewRandomAccessFile("/dir/f", &rand_file, soptions_)); + ASSERT_OK(rand_file->Read(6, 5, &result, scratch)); // Read "world". + ASSERT_EQ(0, result.compare("world")); + ASSERT_OK(rand_file->Read(0, 5, &result, scratch)); // Read "hello". + ASSERT_EQ(0, result.compare("hello")); + ASSERT_OK(rand_file->Read(10, 100, &result, scratch)); // Read "d". + ASSERT_EQ(0, result.compare("d")); + + // Too high offset. + ASSERT_TRUE(!rand_file->Read(1000, 5, &result, scratch).ok()); +} + +TEST_F(EnvMirrorTest, Locks) { + FileLock* lock; + + // These are no-ops, but we test they return success. + ASSERT_OK(env_->LockFile("some file", &lock)); + ASSERT_OK(env_->UnlockFile(lock)); +} + +TEST_F(EnvMirrorTest, Misc) { + std::string test_dir; + ASSERT_OK(env_->GetTestDirectory(&test_dir)); + ASSERT_TRUE(!test_dir.empty()); + + std::unique_ptr writable_file; + ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file, soptions_)); + + // These are no-ops, but we test they return success. + ASSERT_OK(writable_file->Sync()); + ASSERT_OK(writable_file->Flush()); + ASSERT_OK(writable_file->Close()); + writable_file.reset(); +} + +TEST_F(EnvMirrorTest, LargeWrite) { + const size_t kWriteSize = 300 * 1024; + char* scratch = new char[kWriteSize * 2]; + + std::string write_data; + for (size_t i = 0; i < kWriteSize; ++i) { + write_data.append(1, static_cast(i)); + } + + std::unique_ptr writable_file; + ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_)); + ASSERT_OK(writable_file->Append("foo")); + ASSERT_OK(writable_file->Append(write_data)); + writable_file.reset(); + + std::unique_ptr seq_file; + Slice result; + ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_)); + ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo". 
+ ASSERT_EQ(0, result.compare("foo")); + + size_t read = 0; + std::string read_data; + while (read < kWriteSize) { + ASSERT_OK(seq_file->Read(kWriteSize - read, &result, scratch)); + read_data.append(result.data(), result.size()); + read += result.size(); + } + ASSERT_TRUE(write_data == read_data); + delete[] scratch; +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int argc, char** argv) { + fprintf(stderr, "SKIPPED as EnvMirror is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/utilities/env_timed.cc b/src/rocksdb/utilities/env_timed.cc new file mode 100644 index 000000000..1eb723146 --- /dev/null +++ b/src/rocksdb/utilities/env_timed.cc @@ -0,0 +1,187 @@ +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#include "utilities/env_timed.h" + +#include "env/composite_env_wrapper.h" +#include "monitoring/perf_context_imp.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +#ifndef ROCKSDB_LITE +TimedFileSystem::TimedFileSystem(const std::shared_ptr& base) + : FileSystemWrapper(base) {} +IOStatus TimedFileSystem::NewSequentialFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + PERF_TIMER_GUARD(env_new_sequential_file_nanos); + return FileSystemWrapper::NewSequentialFile(fname, options, result, dbg); +} + +IOStatus TimedFileSystem::NewRandomAccessFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + PERF_TIMER_GUARD(env_new_random_access_file_nanos); + return FileSystemWrapper::NewRandomAccessFile(fname, options, result, dbg); +} + +IOStatus TimedFileSystem::NewWritableFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + PERF_TIMER_GUARD(env_new_writable_file_nanos); + return FileSystemWrapper::NewWritableFile(fname, options, result, dbg); +} + +IOStatus TimedFileSystem::ReuseWritableFile( + const std::string& fname, const std::string& old_fname, + const FileOptions& options, std::unique_ptr* result, + IODebugContext* dbg) { + PERF_TIMER_GUARD(env_reuse_writable_file_nanos); + return FileSystemWrapper::ReuseWritableFile(fname, old_fname, options, result, + dbg); +} + +IOStatus TimedFileSystem::NewRandomRWFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + PERF_TIMER_GUARD(env_new_random_rw_file_nanos); + return FileSystemWrapper::NewRandomRWFile(fname, options, result, dbg); +} + +IOStatus TimedFileSystem::NewDirectory(const std::string& name, + const IOOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) { + PERF_TIMER_GUARD(env_new_directory_nanos); + return FileSystemWrapper::NewDirectory(name, options, result, dbg); +} + +IOStatus TimedFileSystem::FileExists(const std::string& fname, + const IOOptions& options, + IODebugContext* dbg) { + PERF_TIMER_GUARD(env_file_exists_nanos); + return FileSystemWrapper::FileExists(fname, options, dbg); +} + +IOStatus TimedFileSystem::GetChildren(const std::string& dir, + const 
IOOptions& options, + std::vector* result, + IODebugContext* dbg) { + PERF_TIMER_GUARD(env_get_children_nanos); + return FileSystemWrapper::GetChildren(dir, options, result, dbg); +} + +IOStatus TimedFileSystem::GetChildrenFileAttributes( + const std::string& dir, const IOOptions& options, + std::vector* result, IODebugContext* dbg) { + PERF_TIMER_GUARD(env_get_children_file_attributes_nanos); + return FileSystemWrapper::GetChildrenFileAttributes(dir, options, result, + dbg); +} + +IOStatus TimedFileSystem::DeleteFile(const std::string& fname, + const IOOptions& options, + IODebugContext* dbg) { + PERF_TIMER_GUARD(env_delete_file_nanos); + return FileSystemWrapper::DeleteFile(fname, options, dbg); +} + +IOStatus TimedFileSystem::CreateDir(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + PERF_TIMER_GUARD(env_create_dir_nanos); + return FileSystemWrapper::CreateDir(dirname, options, dbg); +} + +IOStatus TimedFileSystem::CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + PERF_TIMER_GUARD(env_create_dir_if_missing_nanos); + return FileSystemWrapper::CreateDirIfMissing(dirname, options, dbg); +} + +IOStatus TimedFileSystem::DeleteDir(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + PERF_TIMER_GUARD(env_delete_dir_nanos); + return FileSystemWrapper::DeleteDir(dirname, options, dbg); +} + +IOStatus TimedFileSystem::GetFileSize(const std::string& fname, + const IOOptions& options, + uint64_t* file_size, + IODebugContext* dbg) { + PERF_TIMER_GUARD(env_get_file_size_nanos); + return FileSystemWrapper::GetFileSize(fname, options, file_size, dbg); +} + +IOStatus TimedFileSystem::GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) { + PERF_TIMER_GUARD(env_get_file_modification_time_nanos); + return FileSystemWrapper::GetFileModificationTime(fname, options, file_mtime, + dbg); +} + +IOStatus TimedFileSystem::RenameFile(const std::string& src, + const std::string& dst, + const IOOptions& options, + IODebugContext* dbg) { + PERF_TIMER_GUARD(env_rename_file_nanos); + return FileSystemWrapper::RenameFile(src, dst, options, dbg); +} + +IOStatus TimedFileSystem::LinkFile(const std::string& src, + const std::string& dst, + const IOOptions& options, + IODebugContext* dbg) { + PERF_TIMER_GUARD(env_link_file_nanos); + return FileSystemWrapper::LinkFile(src, dst, options, dbg); +} + +IOStatus TimedFileSystem::LockFile(const std::string& fname, + const IOOptions& options, FileLock** lock, + IODebugContext* dbg) { + PERF_TIMER_GUARD(env_lock_file_nanos); + return FileSystemWrapper::LockFile(fname, options, lock, dbg); +} + +IOStatus TimedFileSystem::UnlockFile(FileLock* lock, const IOOptions& options, + IODebugContext* dbg) { + PERF_TIMER_GUARD(env_unlock_file_nanos); + return FileSystemWrapper::UnlockFile(lock, options, dbg); +} + +IOStatus TimedFileSystem::NewLogger(const std::string& fname, + const IOOptions& options, + std::shared_ptr* result, + IODebugContext* dbg) { + PERF_TIMER_GUARD(env_new_logger_nanos); + return FileSystemWrapper::NewLogger(fname, options, result, dbg); +} + +std::shared_ptr NewTimedFileSystem( + const std::shared_ptr& base) { + return std::make_shared(base); +} + +// An environment that measures function call times for filesystem +// operations, reporting results to variables in PerfContext. 
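A brief sketch of how the timed wrapper might be used: wrap a base Env, enable timing in the perf context, and read the accumulated nanosecond counter afterwards. This mirrors the BasicTest further below; the function name and file path are illustrative, and wrapping Env::Default() directly (rather than a MemEnv) is an assumption. The NewTimedEnv factory itself follows right after this sketch:

// Hypothetical sketch: time filesystem calls made through a wrapped Env.
#include <cstdio>
#include <memory>
#include "rocksdb/env.h"
#include "rocksdb/perf_context.h"

void TimeWritableFileCreation() {
  using namespace ROCKSDB_NAMESPACE;
  SetPerfLevel(PerfLevel::kEnableTime);
  std::unique_ptr<Env> timed(NewTimedEnv(Env::Default()));
  std::unique_ptr<WritableFile> file;
  Status s = timed->NewWritableFile("/tmp/timed_example", &file, EnvOptions());
  if (s.ok()) {
    // Populated by the PERF_TIMER_GUARD in TimedFileSystem::NewWritableFile.
    std::printf("NewWritableFile took %llu ns\n",
                (unsigned long long)get_perf_context()->env_new_writable_file_nanos);
  }
}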
+Env* NewTimedEnv(Env* base_env) { + std::shared_ptr timed_fs = + NewTimedFileSystem(base_env->GetFileSystem()); + return new CompositeEnvWrapper(base_env, timed_fs); +} + +#else // ROCKSDB_LITE + +Env* NewTimedEnv(Env* /*base_env*/) { return nullptr; } + +#endif // !ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/env_timed.h b/src/rocksdb/utilities/env_timed.h new file mode 100644 index 000000000..2d34fd590 --- /dev/null +++ b/src/rocksdb/utilities/env_timed.h @@ -0,0 +1,97 @@ +// Copyright (c) 2019-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#pragma once +#include "rocksdb/file_system.h" +namespace ROCKSDB_NAMESPACE { +#ifndef ROCKSDB_LITE +class TimedFileSystem : public FileSystemWrapper { + public: + explicit TimedFileSystem(const std::shared_ptr& base); + + static const char* kClassName() { return "TimedFS"; } + const char* Name() const override { return kClassName(); } + + IOStatus NewSequentialFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewWritableFile(const std::string& fname, const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewDirectory(const std::string& name, const IOOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus FileExists(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus GetChildren(const std::string& dir, const IOOptions& options, + std::vector* result, + IODebugContext* dbg) override; + + IOStatus GetChildrenFileAttributes(const std::string& dir, + const IOOptions& options, + std::vector* result, + IODebugContext* dbg) override; + + IOStatus DeleteFile(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus CreateDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus DeleteDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus GetFileSize(const std::string& fname, const IOOptions& options, + uint64_t* file_size, IODebugContext* dbg) override; + + IOStatus GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) override; + + IOStatus RenameFile(const std::string& src, const std::string& dst, + const IOOptions& options, IODebugContext* dbg) override; + + IOStatus LinkFile(const std::string& src, const std::string& dst, + const IOOptions& options, IODebugContext* dbg) override; + + IOStatus LockFile(const std::string& fname, const IOOptions& options, + FileLock** lock, IODebugContext* dbg) override; + + IOStatus 
UnlockFile(FileLock* lock, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus NewLogger(const std::string& fname, const IOOptions& options, + std::shared_ptr* result, + IODebugContext* dbg) override; +}; + +#endif // ROCKSDB_LITE +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/env_timed_test.cc b/src/rocksdb/utilities/env_timed_test.cc new file mode 100644 index 000000000..6e392579d --- /dev/null +++ b/src/rocksdb/utilities/env_timed_test.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "rocksdb/env.h" +#include "rocksdb/perf_context.h" +#include "test_util/testharness.h" + +namespace ROCKSDB_NAMESPACE { + +class TimedEnvTest : public testing::Test {}; + +TEST_F(TimedEnvTest, BasicTest) { + SetPerfLevel(PerfLevel::kEnableTime); + ASSERT_EQ(0, get_perf_context()->env_new_writable_file_nanos); + + std::unique_ptr mem_env(NewMemEnv(Env::Default())); + std::unique_ptr timed_env(NewTimedEnv(mem_env.get())); + std::unique_ptr writable_file; + ASSERT_OK(timed_env->NewWritableFile("f", &writable_file, EnvOptions())); + + ASSERT_GT(get_perf_context()->env_new_writable_file_nanos, 0); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else // ROCKSDB_LITE +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED as TimedEnv is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/fault_injection_env.cc b/src/rocksdb/utilities/fault_injection_env.cc new file mode 100644 index 000000000..b0495a8c1 --- /dev/null +++ b/src/rocksdb/utilities/fault_injection_env.cc @@ -0,0 +1,555 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright 2014 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// This test uses a custom Env to keep track of the state of a filesystem as of +// the last "sync". It then checks for data loss errors by purposely dropping +// file data (or entire files) not protected by a "sync". + +#include "utilities/fault_injection_env.h" + +#include +#include + +#include "util/random.h" +namespace ROCKSDB_NAMESPACE { + +// Assume a filename, and not a directory name like "/foo/bar/" +std::string GetDirName(const std::string filename) { + size_t found = filename.find_last_of("/\\"); + if (found == std::string::npos) { + return ""; + } else { + return filename.substr(0, found); + } +} + +// A basic file truncation function suitable for this test. 
+Status Truncate(Env* env, const std::string& filename, uint64_t length) { + std::unique_ptr orig_file; + const EnvOptions options; + Status s = env->NewSequentialFile(filename, &orig_file, options); + if (!s.ok()) { + fprintf(stderr, "Cannot open file %s for truncation: %s\n", + filename.c_str(), s.ToString().c_str()); + return s; + } + + std::unique_ptr scratch(new char[length]); + ROCKSDB_NAMESPACE::Slice result; + s = orig_file->Read(length, &result, scratch.get()); +#ifdef OS_WIN + orig_file.reset(); +#endif + if (s.ok()) { + std::string tmp_name = GetDirName(filename) + "/truncate.tmp"; + std::unique_ptr tmp_file; + s = env->NewWritableFile(tmp_name, &tmp_file, options); + if (s.ok()) { + s = tmp_file->Append(result); + if (s.ok()) { + s = env->RenameFile(tmp_name, filename); + } else { + fprintf(stderr, "Cannot rename file %s to %s: %s\n", tmp_name.c_str(), + filename.c_str(), s.ToString().c_str()); + env->DeleteFile(tmp_name); + } + } + } + if (!s.ok()) { + fprintf(stderr, "Cannot truncate file %s: %s\n", filename.c_str(), + s.ToString().c_str()); + } + + return s; +} + +// Trim the tailing "/" in the end of `str` +std::string TrimDirname(const std::string& str) { + size_t found = str.find_last_not_of("/"); + if (found == std::string::npos) { + return str; + } + return str.substr(0, found + 1); +} + +// Return pair of a full path. +std::pair GetDirAndName(const std::string& name) { + std::string dirname = GetDirName(name); + std::string fname = name.substr(dirname.size() + 1); + return std::make_pair(dirname, fname); +} + +Status FileState::DropUnsyncedData(Env* env) const { + ssize_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_; + return Truncate(env, filename_, sync_pos); +} + +Status FileState::DropRandomUnsyncedData(Env* env, Random* rand) const { + ssize_t sync_pos = pos_at_last_sync_ == -1 ? 
0 : pos_at_last_sync_; + assert(pos_ >= sync_pos); + int range = static_cast(pos_ - sync_pos); + uint64_t truncated_size = + static_cast(sync_pos) + rand->Uniform(range); + return Truncate(env, filename_, truncated_size); +} + +Status TestDirectory::Fsync() { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + env_->SyncDir(dirname_); + return dir_->Fsync(); +} + +Status TestDirectory::Close() { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + return dir_->Close(); +} + +TestRandomAccessFile::TestRandomAccessFile( + std::unique_ptr&& target, FaultInjectionTestEnv* env) + : target_(std::move(target)), env_(env) { + assert(target_); + assert(env_); +} + +Status TestRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + assert(env_); + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + + assert(target_); + return target_->Read(offset, n, result, scratch); +} + +Status TestRandomAccessFile::Prefetch(uint64_t offset, size_t n) { + assert(env_); + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + + assert(target_); + return target_->Prefetch(offset, n); +} + +Status TestRandomAccessFile::MultiRead(ReadRequest* reqs, size_t num_reqs) { + assert(env_); + if (!env_->IsFilesystemActive()) { + const Status s = env_->GetError(); + + assert(reqs); + for (size_t i = 0; i < num_reqs; ++i) { + reqs[i].status = s; + } + + return s; + } + + assert(target_); + return target_->MultiRead(reqs, num_reqs); +} + +TestWritableFile::TestWritableFile(const std::string& fname, + std::unique_ptr&& f, + FaultInjectionTestEnv* env) + : state_(fname), + target_(std::move(f)), + writable_file_opened_(true), + env_(env) { + assert(target_ != nullptr); + state_.pos_ = 0; +} + +TestWritableFile::~TestWritableFile() { + if (writable_file_opened_) { + Close().PermitUncheckedError(); + } +} + +Status TestWritableFile::Append(const Slice& data) { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + Status s = target_->Append(data); + if (s.ok()) { + state_.pos_ += data.size(); + env_->WritableFileAppended(state_); + } + return s; +} + +Status TestWritableFile::Close() { + writable_file_opened_ = false; + Status s = target_->Close(); + if (s.ok()) { + env_->WritableFileClosed(state_); + } + return s; +} + +Status TestWritableFile::Flush() { + Status s = target_->Flush(); + if (s.ok() && env_->IsFilesystemActive()) { + state_.pos_at_last_flush_ = state_.pos_; + } + return s; +} + +Status TestWritableFile::Sync() { + if (!env_->IsFilesystemActive()) { + return Status::IOError("FaultInjectionTestEnv: not active"); + } + // No need to actual sync. 
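+  // Recording the position is enough for the fault-injection model:
+  // FileState::DropUnsyncedData() later truncates the file back to this point,
+  // simulating the loss of everything written after the last "sync".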
+ state_.pos_at_last_sync_ = state_.pos_; + env_->WritableFileSynced(state_); + return Status::OK(); +} + +TestRandomRWFile::TestRandomRWFile(const std::string& /*fname*/, + std::unique_ptr&& f, + FaultInjectionTestEnv* env) + : target_(std::move(f)), file_opened_(true), env_(env) { + assert(target_ != nullptr); +} + +TestRandomRWFile::~TestRandomRWFile() { + if (file_opened_) { + Close().PermitUncheckedError(); + } +} + +Status TestRandomRWFile::Write(uint64_t offset, const Slice& data) { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + return target_->Write(offset, data); +} + +Status TestRandomRWFile::Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + return target_->Read(offset, n, result, scratch); +} + +Status TestRandomRWFile::Close() { + file_opened_ = false; + return target_->Close(); +} + +Status TestRandomRWFile::Flush() { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + return target_->Flush(); +} + +Status TestRandomRWFile::Sync() { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + return target_->Sync(); +} + +Status FaultInjectionTestEnv::NewDirectory(const std::string& name, + std::unique_ptr* result) { + std::unique_ptr r; + Status s = target()->NewDirectory(name, &r); + assert(s.ok()); + if (!s.ok()) { + return s; + } + result->reset(new TestDirectory(this, TrimDirname(name), r.release())); + return Status::OK(); +} + +Status FaultInjectionTestEnv::NewWritableFile( + const std::string& fname, std::unique_ptr* result, + const EnvOptions& soptions) { + if (!IsFilesystemActive()) { + return GetError(); + } + // Not allow overwriting files + Status s = target()->FileExists(fname); + if (s.ok()) { + return Status::Corruption("File already exists."); + } else if (!s.IsNotFound()) { + assert(s.IsIOError()); + return s; + } + s = target()->NewWritableFile(fname, result, soptions); + if (s.ok()) { + result->reset(new TestWritableFile(fname, std::move(*result), this)); + // WritableFileWriter* file is opened + // again then it will be truncated - so forget our saved state. + UntrackFile(fname); + MutexLock l(&mutex_); + open_managed_files_.insert(fname); + auto dir_and_name = GetDirAndName(fname); + auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; + list.insert(dir_and_name.second); + } + return s; +} + +Status FaultInjectionTestEnv::ReopenWritableFile( + const std::string& fname, std::unique_ptr* result, + const EnvOptions& soptions) { + if (!IsFilesystemActive()) { + return GetError(); + } + + bool exists; + Status s, exists_s = target()->FileExists(fname); + if (exists_s.IsNotFound()) { + exists = false; + } else if (exists_s.ok()) { + exists = true; + } else { + s = exists_s; + exists = false; + } + + if (s.ok()) { + s = target()->ReopenWritableFile(fname, result, soptions); + } + + // Only track files we created. Files created outside of this + // `FaultInjectionTestEnv` are not eligible for tracking/data dropping + // (for example, they may contain data a previous db_stress run expects to + // be recovered). This could be extended to track/drop data appended once + // the file is under `FaultInjectionTestEnv`'s control. + if (s.ok()) { + bool should_track; + { + MutexLock l(&mutex_); + if (db_file_state_.find(fname) != db_file_state_.end()) { + // It was written by this `Env` earlier. + assert(exists); + should_track = true; + } else if (!exists) { + // It was created by this `Env` just now. 
+ should_track = true; + open_managed_files_.insert(fname); + auto dir_and_name = GetDirAndName(fname); + auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; + list.insert(dir_and_name.second); + } else { + should_track = false; + } + } + if (should_track) { + result->reset(new TestWritableFile(fname, std::move(*result), this)); + } + } + return s; +} + +Status FaultInjectionTestEnv::NewRandomRWFile( + const std::string& fname, std::unique_ptr* result, + const EnvOptions& soptions) { + if (!IsFilesystemActive()) { + return GetError(); + } + Status s = target()->NewRandomRWFile(fname, result, soptions); + if (s.ok()) { + result->reset(new TestRandomRWFile(fname, std::move(*result), this)); + // WritableFileWriter* file is opened + // again then it will be truncated - so forget our saved state. + UntrackFile(fname); + MutexLock l(&mutex_); + open_managed_files_.insert(fname); + auto dir_and_name = GetDirAndName(fname); + auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; + list.insert(dir_and_name.second); + } + return s; +} + +Status FaultInjectionTestEnv::NewRandomAccessFile( + const std::string& fname, std::unique_ptr* result, + const EnvOptions& soptions) { + if (!IsFilesystemActive()) { + return GetError(); + } + + assert(target()); + const Status s = target()->NewRandomAccessFile(fname, result, soptions); + if (!s.ok()) { + return s; + } + + assert(result); + result->reset(new TestRandomAccessFile(std::move(*result), this)); + + return Status::OK(); +} + +Status FaultInjectionTestEnv::DeleteFile(const std::string& f) { + if (!IsFilesystemActive()) { + return GetError(); + } + Status s = EnvWrapper::DeleteFile(f); + if (s.ok()) { + UntrackFile(f); + } + return s; +} + +Status FaultInjectionTestEnv::RenameFile(const std::string& s, + const std::string& t) { + if (!IsFilesystemActive()) { + return GetError(); + } + Status ret = EnvWrapper::RenameFile(s, t); + + if (ret.ok()) { + MutexLock l(&mutex_); + if (db_file_state_.find(s) != db_file_state_.end()) { + db_file_state_[t] = db_file_state_[s]; + db_file_state_.erase(s); + } + + auto sdn = GetDirAndName(s); + auto tdn = GetDirAndName(t); + if (dir_to_new_files_since_last_sync_[sdn.first].erase(sdn.second) != 0) { + auto& tlist = dir_to_new_files_since_last_sync_[tdn.first]; + assert(tlist.find(tdn.second) == tlist.end()); + tlist.insert(tdn.second); + } + } + + return ret; +} + +Status FaultInjectionTestEnv::LinkFile(const std::string& s, + const std::string& t) { + if (!IsFilesystemActive()) { + return GetError(); + } + Status ret = EnvWrapper::LinkFile(s, t); + + if (ret.ok()) { + MutexLock l(&mutex_); + if (db_file_state_.find(s) != db_file_state_.end()) { + db_file_state_[t] = db_file_state_[s]; + } + + auto sdn = GetDirAndName(s); + auto tdn = GetDirAndName(t); + if (dir_to_new_files_since_last_sync_[sdn.first].find(sdn.second) != + dir_to_new_files_since_last_sync_[sdn.first].end()) { + auto& tlist = dir_to_new_files_since_last_sync_[tdn.first]; + assert(tlist.find(tdn.second) == tlist.end()); + tlist.insert(tdn.second); + } + } + + return ret; +} + +void FaultInjectionTestEnv::WritableFileClosed(const FileState& state) { + MutexLock l(&mutex_); + if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) { + db_file_state_[state.filename_] = state; + open_managed_files_.erase(state.filename_); + } +} + +void FaultInjectionTestEnv::WritableFileSynced(const FileState& state) { + MutexLock l(&mutex_); + if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) { + if 
(db_file_state_.find(state.filename_) == db_file_state_.end()) { + db_file_state_.insert(std::make_pair(state.filename_, state)); + } else { + db_file_state_[state.filename_] = state; + } + } +} + +void FaultInjectionTestEnv::WritableFileAppended(const FileState& state) { + MutexLock l(&mutex_); + if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) { + if (db_file_state_.find(state.filename_) == db_file_state_.end()) { + db_file_state_.insert(std::make_pair(state.filename_, state)); + } else { + db_file_state_[state.filename_] = state; + } + } +} + +// For every file that is not fully synced, make a call to `func` with +// FileState of the file as the parameter. +Status FaultInjectionTestEnv::DropFileData( + std::function func) { + Status s; + MutexLock l(&mutex_); + for (std::map::const_iterator it = + db_file_state_.begin(); + s.ok() && it != db_file_state_.end(); ++it) { + const FileState& state = it->second; + if (!state.IsFullySynced()) { + s = func(target(), state); + } + } + return s; +} + +Status FaultInjectionTestEnv::DropUnsyncedFileData() { + return DropFileData([&](Env* env, const FileState& state) { + return state.DropUnsyncedData(env); + }); +} + +Status FaultInjectionTestEnv::DropRandomUnsyncedFileData(Random* rnd) { + return DropFileData([&](Env* env, const FileState& state) { + return state.DropRandomUnsyncedData(env, rnd); + }); +} + +Status FaultInjectionTestEnv::DeleteFilesCreatedAfterLastDirSync() { + // Because DeleteFile access this container make a copy to avoid deadlock + std::map> map_copy; + { + MutexLock l(&mutex_); + map_copy.insert(dir_to_new_files_since_last_sync_.begin(), + dir_to_new_files_since_last_sync_.end()); + } + + for (auto& pair : map_copy) { + for (std::string name : pair.second) { + Status s = DeleteFile(pair.first + "/" + name); + if (!s.ok()) { + return s; + } + } + } + return Status::OK(); +} +void FaultInjectionTestEnv::ResetState() { + MutexLock l(&mutex_); + db_file_state_.clear(); + dir_to_new_files_since_last_sync_.clear(); + SetFilesystemActiveNoLock(true); +} + +void FaultInjectionTestEnv::UntrackFile(const std::string& f) { + MutexLock l(&mutex_); + auto dir_and_name = GetDirAndName(f); + dir_to_new_files_since_last_sync_[dir_and_name.first].erase( + dir_and_name.second); + db_file_state_.erase(f); + open_managed_files_.erase(f); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/fault_injection_env.h b/src/rocksdb/utilities/fault_injection_env.h new file mode 100644 index 000000000..549bfe716 --- /dev/null +++ b/src/rocksdb/utilities/fault_injection_env.h @@ -0,0 +1,258 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright 2014 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// This test uses a custom Env to keep track of the state of a filesystem as of +// the last "sync". It then checks for data loss errors by purposely dropping +// file data (or entire files) not protected by a "sync". 
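A minimal usage sketch of the Env declared below (hypothetical test code; the DB path, key and value are placeholders, and error handling is reduced to asserts): write without syncing, simulate a reset, drop the unsynced data, and reopen.

FaultInjectionTestEnv* fault_env = new FaultInjectionTestEnv(Env::Default());
Options options;
options.create_if_missing = true;
options.env = fault_env;

DB* db = nullptr;
Status s = DB::Open(options, "/tmp/fault_injection_example", &db);
assert(s.ok());
s = db->Put(WriteOptions(), "key", "value");  // written but not synced
assert(s.ok());

fault_env->SetFilesystemActive(false);  // simulate a system reset
delete db;                              // close without a successful sync
fault_env->DropUnsyncedFileData().PermitUncheckedError();
fault_env->ResetState();                // filesystem is active again

// Reopening now only sees whatever survived the simulated crash.
s = DB::Open(options, "/tmp/fault_injection_example", &db);
assert(s.ok());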
+ +#pragma once + +#include +#include +#include + +#include "file/filename.h" +#include "rocksdb/env.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { +class Random; +class TestWritableFile; +class FaultInjectionTestEnv; + +struct FileState { + std::string filename_; + ssize_t pos_; + ssize_t pos_at_last_sync_; + ssize_t pos_at_last_flush_; + + explicit FileState(const std::string& filename) + : filename_(filename), + pos_(-1), + pos_at_last_sync_(-1), + pos_at_last_flush_(-1) {} + + FileState() : pos_(-1), pos_at_last_sync_(-1), pos_at_last_flush_(-1) {} + + bool IsFullySynced() const { return pos_ <= 0 || pos_ == pos_at_last_sync_; } + + Status DropUnsyncedData(Env* env) const; + + Status DropRandomUnsyncedData(Env* env, Random* rand) const; +}; + +class TestRandomAccessFile : public RandomAccessFile { + public: + TestRandomAccessFile(std::unique_ptr&& target, + FaultInjectionTestEnv* env); + + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override; + + Status Prefetch(uint64_t offset, size_t n) override; + + Status MultiRead(ReadRequest* reqs, size_t num_reqs) override; + + private: + std::unique_ptr target_; + FaultInjectionTestEnv* env_; +}; + +// A wrapper around WritableFileWriter* file +// is written to or sync'ed. +class TestWritableFile : public WritableFile { + public: + explicit TestWritableFile(const std::string& fname, + std::unique_ptr&& f, + FaultInjectionTestEnv* env); + virtual ~TestWritableFile(); + virtual Status Append(const Slice& data) override; + virtual Status Append( + const Slice& data, + const DataVerificationInfo& /*verification_info*/) override { + return Append(data); + } + virtual Status Truncate(uint64_t size) override { + return target_->Truncate(size); + } + virtual Status Close() override; + virtual Status Flush() override; + virtual Status Sync() override; + virtual bool IsSyncThreadSafe() const override { return true; } + virtual Status PositionedAppend(const Slice& data, uint64_t offset) override { + return target_->PositionedAppend(data, offset); + } + virtual Status PositionedAppend( + const Slice& data, uint64_t offset, + const DataVerificationInfo& /*verification_info*/) override { + return PositionedAppend(data, offset); + } + virtual bool use_direct_io() const override { + return target_->use_direct_io(); + }; + + private: + FileState state_; + std::unique_ptr target_; + bool writable_file_opened_; + FaultInjectionTestEnv* env_; +}; + +// A wrapper around WritableFileWriter* file +// is written to or sync'ed. 
+class TestRandomRWFile : public RandomRWFile { + public: + explicit TestRandomRWFile(const std::string& fname, + std::unique_ptr&& f, + FaultInjectionTestEnv* env); + virtual ~TestRandomRWFile(); + Status Write(uint64_t offset, const Slice& data) override; + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override; + Status Close() override; + Status Flush() override; + Status Sync() override; + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + bool use_direct_io() const override { return target_->use_direct_io(); }; + + private: + std::unique_ptr target_; + bool file_opened_; + FaultInjectionTestEnv* env_; +}; + +class TestDirectory : public Directory { + public: + explicit TestDirectory(FaultInjectionTestEnv* env, std::string dirname, + Directory* dir) + : env_(env), dirname_(dirname), dir_(dir) {} + ~TestDirectory() {} + + virtual Status Fsync() override; + virtual Status Close() override; + + private: + FaultInjectionTestEnv* env_; + std::string dirname_; + std::unique_ptr dir_; +}; + +class FaultInjectionTestEnv : public EnvWrapper { + public: + explicit FaultInjectionTestEnv(Env* base) + : EnvWrapper(base), filesystem_active_(true) {} + virtual ~FaultInjectionTestEnv() { error_.PermitUncheckedError(); } + + static const char* kClassName() { return "FaultInjectionTestEnv"; } + const char* Name() const override { return kClassName(); } + + Status NewDirectory(const std::string& name, + std::unique_ptr* result) override; + + Status NewWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& soptions) override; + + Status ReopenWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& soptions) override; + + Status NewRandomRWFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& soptions) override; + + Status NewRandomAccessFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& soptions) override; + + virtual Status DeleteFile(const std::string& f) override; + + virtual Status RenameFile(const std::string& s, + const std::string& t) override; + + virtual Status LinkFile(const std::string& s, const std::string& t) override; + +// Undef to eliminate clash on Windows +#undef GetFreeSpace + virtual Status GetFreeSpace(const std::string& path, + uint64_t* disk_free) override { + if (!IsFilesystemActive() && + error_.subcode() == IOStatus::SubCode::kNoSpace) { + *disk_free = 0; + return Status::OK(); + } else { + return target()->GetFreeSpace(path, disk_free); + } + } + + void WritableFileClosed(const FileState& state); + + void WritableFileSynced(const FileState& state); + + void WritableFileAppended(const FileState& state); + + // For every file that is not fully synced, make a call to `func` with + // FileState of the file as the parameter. + Status DropFileData(std::function func); + + Status DropUnsyncedFileData(); + + Status DropRandomUnsyncedFileData(Random* rnd); + + Status DeleteFilesCreatedAfterLastDirSync(); + + void ResetState(); + + void UntrackFile(const std::string& f); + + void SyncDir(const std::string& dirname) { + MutexLock l(&mutex_); + dir_to_new_files_since_last_sync_.erase(dirname); + } + + // Setting the filesystem to inactive is the test equivalent to simulating a + // system reset. Setting to inactive will freeze our saved filesystem state so + // that it will stop being recorded. It can then be reset back to the state at + // the time of the reset. 
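The no-space handling in GetFreeSpace() above can be exercised like this (a minimal sketch; `fault_env` is an already-constructed FaultInjectionTestEnv and the path is arbitrary):

fault_env->SetFilesystemActive(false, Status::NoSpace("injected ENOSPC"));
uint64_t free_bytes = 0;
Status s = fault_env->GetFreeSpace("/any/path", &free_bytes);
assert(s.ok() && free_bytes == 0);     // the "disk" now looks full
fault_env->SetFilesystemActive(true);  // back to normal operation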
+ bool IsFilesystemActive() { + MutexLock l(&mutex_); + return filesystem_active_; + } + void SetFilesystemActiveNoLock( + bool active, Status error = Status::Corruption("Not active")) { + error.PermitUncheckedError(); + filesystem_active_ = active; + if (!active) { + error_ = error; + } + error.PermitUncheckedError(); + } + void SetFilesystemActive(bool active, + Status error = Status::Corruption("Not active")) { + error.PermitUncheckedError(); + MutexLock l(&mutex_); + SetFilesystemActiveNoLock(active, error); + error.PermitUncheckedError(); + } + void AssertNoOpenFile() { assert(open_managed_files_.empty()); } + Status GetError() { return error_; } + + private: + port::Mutex mutex_; + std::map db_file_state_; + std::set open_managed_files_; + std::unordered_map> + dir_to_new_files_since_last_sync_; + bool filesystem_active_; // Record flushes, syncs, writes + Status error_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/fault_injection_fs.cc b/src/rocksdb/utilities/fault_injection_fs.cc new file mode 100644 index 000000000..549051856 --- /dev/null +++ b/src/rocksdb/utilities/fault_injection_fs.cc @@ -0,0 +1,1032 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright 2014 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// This test uses a custom FileSystem to keep track of the state of a file +// system the last "Sync". The data being written is cached in a "buffer". +// Only when "Sync" is called, the data will be persistent. It can simulate +// file data loss (or entire files) not protected by a "Sync". For any of the +// FileSystem related operations, by specify the "IOStatus Error", a specific +// error can be returned when file system is not activated. + +#include "utilities/fault_injection_fs.h" + +#include +#include +#include + +#include "env/composite_env_wrapper.h" +#include "port/lang.h" +#include "port/stack_trace.h" +#include "test_util/sync_point.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/random.h" +#include "util/string_util.h" +#include "util/xxhash.h" + +namespace ROCKSDB_NAMESPACE { + +const std::string kNewFileNoOverwrite = ""; + +// Assume a filename, and not a directory name like "/foo/bar/" +std::string TestFSGetDirName(const std::string filename) { + size_t found = filename.find_last_of("/\\"); + if (found == std::string::npos) { + return ""; + } else { + return filename.substr(0, found); + } +} + +// Trim the tailing "/" in the end of `str` +std::string TestFSTrimDirname(const std::string& str) { + size_t found = str.find_last_not_of("/"); + if (found == std::string::npos) { + return str; + } + return str.substr(0, found + 1); +} + +// Return pair of a full path. +std::pair TestFSGetDirAndName( + const std::string& name) { + std::string dirname = TestFSGetDirName(name); + std::string fname = name.substr(dirname.size() + 1); + return std::make_pair(dirname, fname); +} + +// Calculate the checksum of the data with corresponding checksum +// type. If name does not match, no checksum is returned. 
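A minimal illustration of the helper defined next (values arbitrary): a CRC32c handoff checksum is the 4-byte little-endian encoding produced by PutFixed32.

std::string checksum;
CalculateTypedChecksum(ChecksumType::kCRC32c, "abc", 3, &checksum);
// checksum.size() == 4 for kCRC32c and kxxHash; any other ChecksumType
// leaves `checksum` empty.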
+void CalculateTypedChecksum(const ChecksumType& checksum_type, const char* data, + size_t size, std::string* checksum) { + if (checksum_type == ChecksumType::kCRC32c) { + uint32_t v_crc32c = crc32c::Extend(0, data, size); + PutFixed32(checksum, v_crc32c); + return; + } else if (checksum_type == ChecksumType::kxxHash) { + uint32_t v = XXH32(data, size, 0); + PutFixed32(checksum, v); + } + return; +} + +IOStatus FSFileState::DropUnsyncedData() { + buffer_.resize(0); + return IOStatus::OK(); +} + +IOStatus FSFileState::DropRandomUnsyncedData(Random* rand) { + int range = static_cast(buffer_.size()); + size_t truncated_size = static_cast(rand->Uniform(range)); + buffer_.resize(truncated_size); + return IOStatus::OK(); +} + +IOStatus TestFSDirectory::Fsync(const IOOptions& options, IODebugContext* dbg) { + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + { + IOStatus in_s = fs_->InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + fs_->SyncDir(dirname_); + IOStatus s = dir_->Fsync(options, dbg); + { + IOStatus in_s = fs_->InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + return s; +} + +IOStatus TestFSDirectory::Close(const IOOptions& options, IODebugContext* dbg) { + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + IOStatus s = dir_->Close(options, dbg); + return s; +} + +IOStatus TestFSDirectory::FsyncWithDirOptions( + const IOOptions& options, IODebugContext* dbg, + const DirFsyncOptions& dir_fsync_options) { + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + { + IOStatus in_s = fs_->InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + fs_->SyncDir(dirname_); + IOStatus s = dir_->FsyncWithDirOptions(options, dbg, dir_fsync_options); + { + IOStatus in_s = fs_->InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + return s; +} + +TestFSWritableFile::TestFSWritableFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr&& f, + FaultInjectionTestFS* fs) + : state_(fname), + file_opts_(file_opts), + target_(std::move(f)), + writable_file_opened_(true), + fs_(fs) { + assert(target_ != nullptr); + state_.pos_ = 0; +} + +TestFSWritableFile::~TestFSWritableFile() { + if (writable_file_opened_) { + Close(IOOptions(), nullptr).PermitUncheckedError(); + } +} + +IOStatus TestFSWritableFile::Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) { + MutexLock l(&mutex_); + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + if (target_->use_direct_io()) { + target_->Append(data, options, dbg).PermitUncheckedError(); + } else { + state_.buffer_.append(data.data(), data.size()); + state_.pos_ += data.size(); + fs_->WritableFileAppended(state_); + } + IOStatus io_s = fs_->InjectWriteError(state_.filename_); + return io_s; +} + +// By setting the IngestDataCorruptionBeforeWrite(), the data corruption is +// simulated. 
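A short sketch of how the corruption hook is used (hypothetical test code; `fault_fs` is a FaultInjectionTestFS instance): once corruption-before-write is enabled, the checksum-handoff Append() defined below fails immediately.

fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
fault_fs->IngestDataCorruptionBeforeWrite();
// Every subsequent Append() that carries DataVerificationInfo now returns
// IOStatus::Corruption("Data is corrupted!").
fault_fs->NoDataCorruptionBeforeWrite();  // restore normal behavior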
+IOStatus TestFSWritableFile::Append( + const Slice& data, const IOOptions& options, + const DataVerificationInfo& verification_info, IODebugContext* dbg) { + MutexLock l(&mutex_); + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + if (fs_->ShouldDataCorruptionBeforeWrite()) { + return IOStatus::Corruption("Data is corrupted!"); + } + + // Calculate the checksum + std::string checksum; + CalculateTypedChecksum(fs_->GetChecksumHandoffFuncType(), data.data(), + data.size(), &checksum); + if (fs_->GetChecksumHandoffFuncType() != ChecksumType::kNoChecksum && + checksum != verification_info.checksum.ToString()) { + std::string msg = "Data is corrupted! Origin data checksum: " + + verification_info.checksum.ToString() + + "current data checksum: " + checksum; + return IOStatus::Corruption(msg); + } + if (target_->use_direct_io()) { + target_->Append(data, options, dbg).PermitUncheckedError(); + } else { + state_.buffer_.append(data.data(), data.size()); + state_.pos_ += data.size(); + fs_->WritableFileAppended(state_); + } + IOStatus io_s = fs_->InjectWriteError(state_.filename_); + return io_s; +} + +IOStatus TestFSWritableFile::PositionedAppend( + const Slice& data, uint64_t offset, const IOOptions& options, + const DataVerificationInfo& verification_info, IODebugContext* dbg) { + MutexLock l(&mutex_); + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + if (fs_->ShouldDataCorruptionBeforeWrite()) { + return IOStatus::Corruption("Data is corrupted!"); + } + + // Calculate the checksum + std::string checksum; + CalculateTypedChecksum(fs_->GetChecksumHandoffFuncType(), data.data(), + data.size(), &checksum); + if (fs_->GetChecksumHandoffFuncType() != ChecksumType::kNoChecksum && + checksum != verification_info.checksum.ToString()) { + std::string msg = "Data is corrupted! Origin data checksum: " + + verification_info.checksum.ToString() + + "current data checksum: " + checksum; + return IOStatus::Corruption(msg); + } + target_->PositionedAppend(data, offset, options, dbg); + IOStatus io_s = fs_->InjectWriteError(state_.filename_); + return io_s; +} + +IOStatus TestFSWritableFile::Close(const IOOptions& options, + IODebugContext* dbg) { + MutexLock l(&mutex_); + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + { + IOStatus in_s = fs_->InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + writable_file_opened_ = false; + IOStatus io_s; + if (!target_->use_direct_io()) { + io_s = target_->Append(state_.buffer_, options, dbg); + } + if (io_s.ok()) { + state_.buffer_.resize(0); + // Ignore sync errors + target_->Sync(options, dbg).PermitUncheckedError(); + io_s = target_->Close(options, dbg); + } + if (io_s.ok()) { + fs_->WritableFileClosed(state_); + IOStatus in_s = fs_->InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + return io_s; +} + +IOStatus TestFSWritableFile::Flush(const IOOptions&, IODebugContext*) { + MutexLock l(&mutex_); + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + if (fs_->IsFilesystemActive()) { + state_.pos_at_last_flush_ = state_.pos_; + } + return IOStatus::OK(); +} + +IOStatus TestFSWritableFile::Sync(const IOOptions& options, + IODebugContext* dbg) { + MutexLock l(&mutex_); + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + if (target_->use_direct_io()) { + // For Direct IO mode, we don't buffer anything in TestFSWritableFile. 
+ // So just return + return IOStatus::OK(); + } + IOStatus io_s = target_->Append(state_.buffer_, options, dbg); + state_.buffer_.resize(0); + // Ignore sync errors + target_->Sync(options, dbg).PermitUncheckedError(); + state_.pos_at_last_sync_ = state_.pos_; + fs_->WritableFileSynced(state_); + return io_s; +} + +IOStatus TestFSWritableFile::RangeSync(uint64_t offset, uint64_t nbytes, + const IOOptions& options, + IODebugContext* dbg) { + MutexLock l(&mutex_); + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + // Assumes caller passes consecutive byte ranges. + uint64_t sync_limit = offset + nbytes; + uint64_t buf_begin = + state_.pos_at_last_sync_ < 0 ? 0 : state_.pos_at_last_sync_; + + IOStatus io_s; + if (sync_limit < buf_begin) { + return io_s; + } + uint64_t num_to_sync = std::min(static_cast(state_.buffer_.size()), + sync_limit - buf_begin); + Slice buf_to_sync(state_.buffer_.data(), num_to_sync); + io_s = target_->Append(buf_to_sync, options, dbg); + state_.buffer_ = state_.buffer_.substr(num_to_sync); + // Ignore sync errors + target_->RangeSync(offset, nbytes, options, dbg).PermitUncheckedError(); + state_.pos_at_last_sync_ = offset + num_to_sync; + fs_->WritableFileSynced(state_); + return io_s; +} + +TestFSRandomRWFile::TestFSRandomRWFile(const std::string& /*fname*/, + std::unique_ptr&& f, + FaultInjectionTestFS* fs) + : target_(std::move(f)), file_opened_(true), fs_(fs) { + assert(target_ != nullptr); +} + +TestFSRandomRWFile::~TestFSRandomRWFile() { + if (file_opened_) { + Close(IOOptions(), nullptr).PermitUncheckedError(); + } +} + +IOStatus TestFSRandomRWFile::Write(uint64_t offset, const Slice& data, + const IOOptions& options, + IODebugContext* dbg) { + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + return target_->Write(offset, data, options, dbg); +} + +IOStatus TestFSRandomRWFile::Read(uint64_t offset, size_t n, + const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) const { + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + return target_->Read(offset, n, options, result, scratch, dbg); +} + +IOStatus TestFSRandomRWFile::Close(const IOOptions& options, + IODebugContext* dbg) { + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + file_opened_ = false; + return target_->Close(options, dbg); +} + +IOStatus TestFSRandomRWFile::Flush(const IOOptions& options, + IODebugContext* dbg) { + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + return target_->Flush(options, dbg); +} + +IOStatus TestFSRandomRWFile::Sync(const IOOptions& options, + IODebugContext* dbg) { + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + return target_->Sync(options, dbg); +} + +TestFSRandomAccessFile::TestFSRandomAccessFile( + const std::string& /*fname*/, std::unique_ptr&& f, + FaultInjectionTestFS* fs) + : target_(std::move(f)), fs_(fs) { + assert(target_ != nullptr); +} + +IOStatus TestFSRandomAccessFile::Read(uint64_t offset, size_t n, + const IOOptions& options, Slice* result, + char* scratch, + IODebugContext* dbg) const { + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + IOStatus s = target_->Read(offset, n, options, result, scratch, dbg); + if (s.ok()) { + s = fs_->InjectThreadSpecificReadError( + FaultInjectionTestFS::ErrorOperation::kRead, result, use_direct_io(), + scratch, /*need_count_increase=*/true, /*fault_injected=*/nullptr); + } + if (s.ok() && fs_->ShouldInjectRandomReadError()) { + return IOStatus::IOError("Injected read error"); + } + 
return s; +} + +IOStatus TestFSRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs, + const IOOptions& options, + IODebugContext* dbg) { + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + IOStatus s = target_->MultiRead(reqs, num_reqs, options, dbg); + bool injected_error = false; + for (size_t i = 0; i < num_reqs; i++) { + if (!reqs[i].status.ok()) { + // Already seeing an error. + break; + } + bool this_injected_error; + reqs[i].status = fs_->InjectThreadSpecificReadError( + FaultInjectionTestFS::ErrorOperation::kMultiReadSingleReq, + &(reqs[i].result), use_direct_io(), reqs[i].scratch, + /*need_count_increase=*/true, + /*fault_injected=*/&this_injected_error); + injected_error |= this_injected_error; + } + if (s.ok()) { + s = fs_->InjectThreadSpecificReadError( + FaultInjectionTestFS::ErrorOperation::kMultiRead, nullptr, + use_direct_io(), nullptr, /*need_count_increase=*/!injected_error, + /*fault_injected=*/nullptr); + } + if (s.ok() && fs_->ShouldInjectRandomReadError()) { + return IOStatus::IOError("Injected read error"); + } + return s; +} + +size_t TestFSRandomAccessFile::GetUniqueId(char* id, size_t max_size) const { + if (fs_->ShouldFailGetUniqueId()) { + return 0; + } else { + return target_->GetUniqueId(id, max_size); + } +} +IOStatus TestFSSequentialFile::Read(size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) { + IOStatus s = target()->Read(n, options, result, scratch, dbg); + if (s.ok() && fs_->ShouldInjectRandomReadError()) { + return IOStatus::IOError("Injected seq read error"); + } + return s; +} + +IOStatus TestFSSequentialFile::PositionedRead(uint64_t offset, size_t n, + const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) { + IOStatus s = + target()->PositionedRead(offset, n, options, result, scratch, dbg); + if (s.ok() && fs_->ShouldInjectRandomReadError()) { + return IOStatus::IOError("Injected seq positioned read error"); + } + return s; +} + +IOStatus FaultInjectionTestFS::NewDirectory( + const std::string& name, const IOOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + std::unique_ptr r; + IOStatus io_s = target()->NewDirectory(name, options, &r, dbg); + if (!io_s.ok()) { + return io_s; + } + result->reset( + new TestFSDirectory(this, TestFSTrimDirname(name), r.release())); + return IOStatus::OK(); +} + +IOStatus FaultInjectionTestFS::NewWritableFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + if (!IsFilesystemActive()) { + return GetError(); + } + { + IOStatus in_s = InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + + if (ShouldUseDiretWritable(fname)) { + return target()->NewWritableFile(fname, file_opts, result, dbg); + } + + IOStatus io_s = target()->NewWritableFile(fname, file_opts, result, dbg); + if (io_s.ok()) { + result->reset( + new TestFSWritableFile(fname, file_opts, std::move(*result), this)); + // WritableFileWriter* file is opened + // again then it will be truncated - so forget our saved state. + UntrackFile(fname); + { + MutexLock l(&mutex_); + open_managed_files_.insert(fname); + auto dir_and_name = TestFSGetDirAndName(fname); + auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; + // The new file could overwrite an old one. Here we simplify + // the implementation by assuming no file of this name after + // dropping unsynced files. 
+ list[dir_and_name.second] = kNewFileNoOverwrite; + } + { + IOStatus in_s = InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + } + return io_s; +} + +IOStatus FaultInjectionTestFS::ReopenWritableFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + if (!IsFilesystemActive()) { + return GetError(); + } + if (ShouldUseDiretWritable(fname)) { + return target()->ReopenWritableFile(fname, file_opts, result, dbg); + } + { + IOStatus in_s = InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + + bool exists; + IOStatus io_s, + exists_s = target()->FileExists(fname, IOOptions(), nullptr /* dbg */); + if (exists_s.IsNotFound()) { + exists = false; + } else if (exists_s.ok()) { + exists = true; + } else { + io_s = exists_s; + exists = false; + } + + if (io_s.ok()) { + io_s = target()->ReopenWritableFile(fname, file_opts, result, dbg); + } + + // Only track files we created. Files created outside of this + // `FaultInjectionTestFS` are not eligible for tracking/data dropping + // (for example, they may contain data a previous db_stress run expects to + // be recovered). This could be extended to track/drop data appended once + // the file is under `FaultInjectionTestFS`'s control. + if (io_s.ok()) { + bool should_track; + { + MutexLock l(&mutex_); + if (db_file_state_.find(fname) != db_file_state_.end()) { + // It was written by this `FileSystem` earlier. + assert(exists); + should_track = true; + } else if (!exists) { + // It was created by this `FileSystem` just now. + should_track = true; + open_managed_files_.insert(fname); + auto dir_and_name = TestFSGetDirAndName(fname); + auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; + list[dir_and_name.second] = kNewFileNoOverwrite; + } else { + should_track = false; + } + } + if (should_track) { + result->reset( + new TestFSWritableFile(fname, file_opts, std::move(*result), this)); + } + { + IOStatus in_s = InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + } + return io_s; +} + +IOStatus FaultInjectionTestFS::NewRandomRWFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + if (!IsFilesystemActive()) { + return GetError(); + } + if (ShouldUseDiretWritable(fname)) { + return target()->NewRandomRWFile(fname, file_opts, result, dbg); + } + { + IOStatus in_s = InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + IOStatus io_s = target()->NewRandomRWFile(fname, file_opts, result, dbg); + if (io_s.ok()) { + result->reset(new TestFSRandomRWFile(fname, std::move(*result), this)); + // WritableFileWriter* file is opened + // again then it will be truncated - so forget our saved state. + UntrackFile(fname); + { + MutexLock l(&mutex_); + open_managed_files_.insert(fname); + auto dir_and_name = TestFSGetDirAndName(fname); + auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; + // It could be overwriting an old file, but we simplify the + // implementation by ignoring it. 
+ list[dir_and_name.second] = kNewFileNoOverwrite; + } + { + IOStatus in_s = InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + } + return io_s; +} + +IOStatus FaultInjectionTestFS::NewRandomAccessFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + if (!IsFilesystemActive()) { + return GetError(); + } + if (ShouldInjectRandomReadError()) { + return IOStatus::IOError("Injected error when open random access file"); + } + IOStatus io_s = InjectThreadSpecificReadError(ErrorOperation::kOpen, nullptr, + false, nullptr, + /*need_count_increase=*/true, + /*fault_injected=*/nullptr); + if (io_s.ok()) { + io_s = target()->NewRandomAccessFile(fname, file_opts, result, dbg); + } + if (io_s.ok()) { + result->reset(new TestFSRandomAccessFile(fname, std::move(*result), this)); + } + return io_s; +} + +IOStatus FaultInjectionTestFS::NewSequentialFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + if (!IsFilesystemActive()) { + return GetError(); + } + + if (ShouldInjectRandomReadError()) { + return IOStatus::IOError("Injected read error when creating seq file"); + } + IOStatus io_s = target()->NewSequentialFile(fname, file_opts, result, dbg); + if (io_s.ok()) { + result->reset(new TestFSSequentialFile(std::move(*result), this)); + } + return io_s; +} + +IOStatus FaultInjectionTestFS::DeleteFile(const std::string& f, + const IOOptions& options, + IODebugContext* dbg) { + if (!IsFilesystemActive()) { + return GetError(); + } + { + IOStatus in_s = InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + IOStatus io_s = FileSystemWrapper::DeleteFile(f, options, dbg); + if (io_s.ok()) { + UntrackFile(f); + { + IOStatus in_s = InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + } + return io_s; +} + +IOStatus FaultInjectionTestFS::RenameFile(const std::string& s, + const std::string& t, + const IOOptions& options, + IODebugContext* dbg) { + if (!IsFilesystemActive()) { + return GetError(); + } + { + IOStatus in_s = InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + + // We preserve contents of overwritten files up to a size threshold. + // We could keep previous file in another name, but we need to worry about + // garbage collect the those files. We do it if it is needed later. + // We ignore I/O errors here for simplicity. 
+ std::string previous_contents = kNewFileNoOverwrite; + if (target()->FileExists(t, IOOptions(), nullptr).ok()) { + uint64_t file_size; + if (target()->GetFileSize(t, IOOptions(), &file_size, nullptr).ok() && + file_size < 1024) { + ReadFileToString(target(), t, &previous_contents).PermitUncheckedError(); + } + } + IOStatus io_s = FileSystemWrapper::RenameFile(s, t, options, dbg); + + if (io_s.ok()) { + { + MutexLock l(&mutex_); + if (db_file_state_.find(s) != db_file_state_.end()) { + db_file_state_[t] = db_file_state_[s]; + db_file_state_.erase(s); + } + + auto sdn = TestFSGetDirAndName(s); + auto tdn = TestFSGetDirAndName(t); + if (dir_to_new_files_since_last_sync_[sdn.first].erase(sdn.second) != 0) { + auto& tlist = dir_to_new_files_since_last_sync_[tdn.first]; + assert(tlist.find(tdn.second) == tlist.end()); + tlist[tdn.second] = previous_contents; + } + } + IOStatus in_s = InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + + return io_s; +} + +IOStatus FaultInjectionTestFS::LinkFile(const std::string& s, + const std::string& t, + const IOOptions& options, + IODebugContext* dbg) { + if (!IsFilesystemActive()) { + return GetError(); + } + { + IOStatus in_s = InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + + // Using the value in `dir_to_new_files_since_last_sync_` for the source file + // may be a more reasonable choice. + std::string previous_contents = kNewFileNoOverwrite; + + IOStatus io_s = FileSystemWrapper::LinkFile(s, t, options, dbg); + + if (io_s.ok()) { + { + MutexLock l(&mutex_); + if (db_file_state_.find(s) != db_file_state_.end()) { + db_file_state_[t] = db_file_state_[s]; + } + + auto sdn = TestFSGetDirAndName(s); + auto tdn = TestFSGetDirAndName(t); + if (dir_to_new_files_since_last_sync_[sdn.first].find(sdn.second) != + dir_to_new_files_since_last_sync_[sdn.first].end()) { + auto& tlist = dir_to_new_files_since_last_sync_[tdn.first]; + assert(tlist.find(tdn.second) == tlist.end()); + tlist[tdn.second] = previous_contents; + } + } + IOStatus in_s = InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + + return io_s; +} + +void FaultInjectionTestFS::WritableFileClosed(const FSFileState& state) { + MutexLock l(&mutex_); + if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) { + db_file_state_[state.filename_] = state; + open_managed_files_.erase(state.filename_); + } +} + +void FaultInjectionTestFS::WritableFileSynced(const FSFileState& state) { + MutexLock l(&mutex_); + if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) { + if (db_file_state_.find(state.filename_) == db_file_state_.end()) { + db_file_state_.insert(std::make_pair(state.filename_, state)); + } else { + db_file_state_[state.filename_] = state; + } + } +} + +void FaultInjectionTestFS::WritableFileAppended(const FSFileState& state) { + MutexLock l(&mutex_); + if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) { + if (db_file_state_.find(state.filename_) == db_file_state_.end()) { + db_file_state_.insert(std::make_pair(state.filename_, state)); + } else { + db_file_state_[state.filename_] = state; + } + } +} + +IOStatus FaultInjectionTestFS::DropUnsyncedFileData() { + IOStatus io_s; + MutexLock l(&mutex_); + for (std::map::iterator it = db_file_state_.begin(); + io_s.ok() && it != db_file_state_.end(); ++it) { + FSFileState& fs_state = it->second; + if (!fs_state.IsFullySynced()) { + io_s = fs_state.DropUnsyncedData(); + } + } + return io_s; +} + +IOStatus 
FaultInjectionTestFS::DropRandomUnsyncedFileData(Random* rnd) { + IOStatus io_s; + MutexLock l(&mutex_); + for (std::map::iterator it = db_file_state_.begin(); + io_s.ok() && it != db_file_state_.end(); ++it) { + FSFileState& fs_state = it->second; + if (!fs_state.IsFullySynced()) { + io_s = fs_state.DropRandomUnsyncedData(rnd); + } + } + return io_s; +} + +IOStatus FaultInjectionTestFS::DeleteFilesCreatedAfterLastDirSync( + const IOOptions& options, IODebugContext* dbg) { + // Because DeleteFile access this container make a copy to avoid deadlock + std::map> map_copy; + { + MutexLock l(&mutex_); + map_copy.insert(dir_to_new_files_since_last_sync_.begin(), + dir_to_new_files_since_last_sync_.end()); + } + + for (auto& pair : map_copy) { + for (auto& file_pair : pair.second) { + if (file_pair.second == kNewFileNoOverwrite) { + IOStatus io_s = + DeleteFile(pair.first + "/" + file_pair.first, options, dbg); + if (!io_s.ok()) { + return io_s; + } + } else { + IOStatus io_s = + WriteStringToFile(target(), file_pair.second, + pair.first + "/" + file_pair.first, true); + if (!io_s.ok()) { + return io_s; + } + } + } + } + return IOStatus::OK(); +} + +void FaultInjectionTestFS::ResetState() { + MutexLock l(&mutex_); + db_file_state_.clear(); + dir_to_new_files_since_last_sync_.clear(); + SetFilesystemActiveNoLock(true); +} + +void FaultInjectionTestFS::UntrackFile(const std::string& f) { + MutexLock l(&mutex_); + auto dir_and_name = TestFSGetDirAndName(f); + dir_to_new_files_since_last_sync_[dir_and_name.first].erase( + dir_and_name.second); + db_file_state_.erase(f); + open_managed_files_.erase(f); +} + +IOStatus FaultInjectionTestFS::InjectThreadSpecificReadError( + ErrorOperation op, Slice* result, bool direct_io, char* scratch, + bool need_count_increase, bool* fault_injected) { + bool dummy_bool; + bool& ret_fault_injected = fault_injected ? *fault_injected : dummy_bool; + ret_fault_injected = false; + ErrorContext* ctx = static_cast(thread_local_error_->Get()); + if (ctx == nullptr || !ctx->enable_error_injection || !ctx->one_in) { + return IOStatus::OK(); + } + + if (ctx->rand.OneIn(ctx->one_in)) { + if (ctx->count == 0) { + ctx->message = ""; + } + if (need_count_increase) { + ctx->count++; + } + if (ctx->callstack) { + free(ctx->callstack); + } + ctx->callstack = port::SaveStack(&ctx->frames); + + if (op != ErrorOperation::kMultiReadSingleReq) { + // Likely non-per read status code for MultiRead + ctx->message += "error; "; + ret_fault_injected = true; + return IOStatus::IOError(); + } else if (Random::GetTLSInstance()->OneIn(8)) { + assert(result); + // For a small chance, set the failure to status but turn the + // result to be empty, which is supposed to be caught for a check. + *result = Slice(); + ctx->message += "inject empty result; "; + ret_fault_injected = true; + } else if (!direct_io && Random::GetTLSInstance()->OneIn(7) && + scratch != nullptr && result->data() == scratch) { + assert(result); + // With direct I/O, many extra bytes might be read so corrupting + // one byte might not cause checksum mismatch. Skip checksum + // corruption injection. + // We only corrupt data if the result is filled to `scratch`. For other + // cases, the data might not be able to be modified (e.g mmaped files) + // or has unintended side effects. + // For a small chance, set the failure to status but corrupt the + // result in a way that checksum checking is supposed to fail. + // Corrupt the last byte, which is supposed to be a checksum byte + // It would work for CRC. 
Not 100% sure for xxhash and will adjust + // if it is not the case. + const_cast(result->data())[result->size() - 1]++; + ctx->message += "corrupt last byte; "; + ret_fault_injected = true; + } else { + ctx->message += "error result multiget single; "; + ret_fault_injected = true; + return IOStatus::IOError(); + } + } + return IOStatus::OK(); +} + +bool FaultInjectionTestFS::TryParseFileName(const std::string& file_name, + uint64_t* number, FileType* type) { + std::size_t found = file_name.find_last_of("/"); + std::string file = file_name.substr(found); + return ParseFileName(file, number, type); +} + +IOStatus FaultInjectionTestFS::InjectWriteError(const std::string& file_name) { + MutexLock l(&mutex_); + if (!enable_write_error_injection_ || !write_error_one_in_) { + return IOStatus::OK(); + } + bool allowed_type = false; + + if (inject_for_all_file_types_) { + allowed_type = true; + } else { + uint64_t number; + FileType cur_type = kTempFile; + if (TryParseFileName(file_name, &number, &cur_type)) { + for (const auto& type : write_error_allowed_types_) { + if (cur_type == type) { + allowed_type = true; + } + } + } + } + + if (allowed_type) { + if (write_error_rand_.OneIn(write_error_one_in_)) { + return GetError(); + } + } + return IOStatus::OK(); +} + +IOStatus FaultInjectionTestFS::InjectMetadataWriteError() { + { + MutexLock l(&mutex_); + if (!enable_metadata_write_error_injection_ || + !metadata_write_error_one_in_ || + !write_error_rand_.OneIn(metadata_write_error_one_in_)) { + return IOStatus::OK(); + } + } + TEST_SYNC_POINT("FaultInjectionTestFS::InjectMetadataWriteError:Injected"); + return IOStatus::IOError(); +} + +void FaultInjectionTestFS::PrintFaultBacktrace() { +#if defined(OS_LINUX) + ErrorContext* ctx = static_cast(thread_local_error_->Get()); + if (ctx == nullptr) { + return; + } + fprintf(stderr, "Injected error type = %d\n", ctx->type); + fprintf(stderr, "Message: %s\n", ctx->message.c_str()); + port::PrintAndFreeStack(ctx->callstack, ctx->frames); + ctx->callstack = nullptr; +#endif +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/fault_injection_fs.h b/src/rocksdb/utilities/fault_injection_fs.h new file mode 100644 index 000000000..53c9ccb6f --- /dev/null +++ b/src/rocksdb/utilities/fault_injection_fs.h @@ -0,0 +1,584 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright 2014 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// This test uses a custom FileSystem to keep track of the state of a file +// system the last "Sync". The data being written is cached in a "buffer". +// Only when "Sync" is called, the data will be persistent. It can similate +// file data loss (or entire files) not protected by a "Sync". For any of the +// FileSystem related operations, by specify the "IOStatus Error", a specific +// error can be returned when file system is not activated. 
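A hypothetical wiring sketch for this FileSystem (test-harness style; names are placeholders): wrap the default FileSystem, expose it through a CompositeEnvWrapper, and hand it to the DB options.

std::shared_ptr<FaultInjectionTestFS> fault_fs =
    std::make_shared<FaultInjectionTestFS>(FileSystem::Default());
std::unique_ptr<Env> fault_env(
    new CompositeEnvWrapper(Env::Default(), fault_fs));
Options options;
options.create_if_missing = true;
options.env = fault_env.get();
// Open the DB with `options`, run a workload, then for example:
fault_fs->SetFilesystemActive(false, IOStatus::IOError("injected"));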
+ +#pragma once + +#include +#include +#include +#include + +#include "file/filename.h" +#include "rocksdb/file_system.h" +#include "util/mutexlock.h" +#include "util/random.h" +#include "util/thread_local.h" + +namespace ROCKSDB_NAMESPACE { + +class TestFSWritableFile; +class FaultInjectionTestFS; + +struct FSFileState { + std::string filename_; + ssize_t pos_; + ssize_t pos_at_last_sync_; + ssize_t pos_at_last_flush_; + std::string buffer_; + + explicit FSFileState(const std::string& filename) + : filename_(filename), + pos_(-1), + pos_at_last_sync_(-1), + pos_at_last_flush_(-1) {} + + FSFileState() : pos_(-1), pos_at_last_sync_(-1), pos_at_last_flush_(-1) {} + + bool IsFullySynced() const { return pos_ <= 0 || pos_ == pos_at_last_sync_; } + + IOStatus DropUnsyncedData(); + + IOStatus DropRandomUnsyncedData(Random* rand); +}; + +// A wrapper around WritableFileWriter* file +// is written to or sync'ed. +class TestFSWritableFile : public FSWritableFile { + public: + explicit TestFSWritableFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr&& f, + FaultInjectionTestFS* fs); + virtual ~TestFSWritableFile(); + virtual IOStatus Append(const Slice& data, const IOOptions&, + IODebugContext*) override; + virtual IOStatus Append(const Slice& data, const IOOptions& options, + const DataVerificationInfo& verification_info, + IODebugContext* dbg) override; + virtual IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override { + return target_->Truncate(size, options, dbg); + } + virtual IOStatus Close(const IOOptions& options, + IODebugContext* dbg) override; + virtual IOStatus Flush(const IOOptions&, IODebugContext*) override; + virtual IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; + virtual IOStatus RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/, + const IOOptions& options, + IODebugContext* dbg) override; + virtual bool IsSyncThreadSafe() const override { return true; } + virtual IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + IODebugContext* dbg) override { + return target_->PositionedAppend(data, offset, options, dbg); + } + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + const DataVerificationInfo& verification_info, + IODebugContext* dbg) override; + virtual size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + virtual bool use_direct_io() const override { + return target_->use_direct_io(); + }; + + private: + FSFileState state_; // Need protection by mutex_ + FileOptions file_opts_; + std::unique_ptr target_; + bool writable_file_opened_; + FaultInjectionTestFS* fs_; + port::Mutex mutex_; +}; + +// A wrapper around WritableFileWriter* file +// is written to or sync'ed. 
+class TestFSRandomRWFile : public FSRandomRWFile { + public: + explicit TestFSRandomRWFile(const std::string& fname, + std::unique_ptr&& f, + FaultInjectionTestFS* fs); + virtual ~TestFSRandomRWFile(); + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + bool use_direct_io() const override { return target_->use_direct_io(); }; + + private: + std::unique_ptr target_; + bool file_opened_; + FaultInjectionTestFS* fs_; +}; + +class TestFSRandomAccessFile : public FSRandomAccessFile { + public: + explicit TestFSRandomAccessFile(const std::string& fname, + std::unique_ptr&& f, + FaultInjectionTestFS* fs); + ~TestFSRandomAccessFile() override {} + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; + IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs, + const IOOptions& options, IODebugContext* dbg) override; + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + bool use_direct_io() const override { return target_->use_direct_io(); } + + size_t GetUniqueId(char* id, size_t max_size) const override; + + private: + std::unique_ptr target_; + FaultInjectionTestFS* fs_; +}; + +class TestFSSequentialFile : public FSSequentialFileOwnerWrapper { + public: + explicit TestFSSequentialFile(std::unique_ptr&& f, + FaultInjectionTestFS* fs) + : FSSequentialFileOwnerWrapper(std::move(f)), fs_(fs) {} + IOStatus Read(size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) override; + IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) override; + + private: + FaultInjectionTestFS* fs_; +}; + +class TestFSDirectory : public FSDirectory { + public: + explicit TestFSDirectory(FaultInjectionTestFS* fs, std::string dirname, + FSDirectory* dir) + : fs_(fs), dirname_(dirname), dir_(dir) {} + ~TestFSDirectory() {} + + virtual IOStatus Fsync(const IOOptions& options, + IODebugContext* dbg) override; + + virtual IOStatus Close(const IOOptions& options, + IODebugContext* dbg) override; + + virtual IOStatus FsyncWithDirOptions( + const IOOptions& options, IODebugContext* dbg, + const DirFsyncOptions& dir_fsync_options) override; + + private: + FaultInjectionTestFS* fs_; + std::string dirname_; + std::unique_ptr dir_; +}; + +class FaultInjectionTestFS : public FileSystemWrapper { + public: + explicit FaultInjectionTestFS(const std::shared_ptr& base) + : FileSystemWrapper(base), + filesystem_active_(true), + filesystem_writable_(false), + thread_local_error_(new ThreadLocalPtr(DeleteThreadLocalErrorContext)), + enable_write_error_injection_(false), + enable_metadata_write_error_injection_(false), + write_error_rand_(0), + write_error_one_in_(0), + metadata_write_error_one_in_(0), + read_error_one_in_(0), + ingest_data_corruption_before_write_(false), + fail_get_file_unique_id_(false) {} + virtual ~FaultInjectionTestFS() { error_.PermitUncheckedError(); 
} + + static const char* kClassName() { return "FaultInjectionTestFS"; } + const char* Name() const override { return kClassName(); } + + IOStatus NewDirectory(const std::string& name, const IOOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewWritableFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus ReopenWritableFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomRWFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus NewSequentialFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* dbg) override; + + virtual IOStatus DeleteFile(const std::string& f, const IOOptions& options, + IODebugContext* dbg) override; + + virtual IOStatus RenameFile(const std::string& s, const std::string& t, + const IOOptions& options, + IODebugContext* dbg) override; + + virtual IOStatus LinkFile(const std::string& src, const std::string& target, + const IOOptions& options, + IODebugContext* dbg) override; + +// Undef to eliminate clash on Windows +#undef GetFreeSpace + virtual IOStatus GetFreeSpace(const std::string& path, + const IOOptions& options, uint64_t* disk_free, + IODebugContext* dbg) override { + IOStatus io_s; + if (!IsFilesystemActive() && + error_.subcode() == IOStatus::SubCode::kNoSpace) { + *disk_free = 0; + } else { + io_s = target()->GetFreeSpace(path, options, disk_free, dbg); + } + return io_s; + } + + void WritableFileClosed(const FSFileState& state); + + void WritableFileSynced(const FSFileState& state); + + void WritableFileAppended(const FSFileState& state); + + IOStatus DropUnsyncedFileData(); + + IOStatus DropRandomUnsyncedFileData(Random* rnd); + + IOStatus DeleteFilesCreatedAfterLastDirSync(const IOOptions& options, + IODebugContext* dbg); + + void ResetState(); + + void UntrackFile(const std::string& f); + + void SyncDir(const std::string& dirname) { + MutexLock l(&mutex_); + dir_to_new_files_since_last_sync_.erase(dirname); + } + + // Setting the filesystem to inactive is the test equivalent to simulating a + // system reset. Setting to inactive will freeze our saved filesystem state so + // that it will stop being recorded. It can then be reset back to the state at + // the time of the reset. + bool IsFilesystemActive() { + MutexLock l(&mutex_); + return filesystem_active_; + } + + // Setting filesystem_writable_ makes NewWritableFile. 
ReopenWritableFile, + // and NewRandomRWFile bypass FaultInjectionTestFS and go directly to the + // target FS + bool IsFilesystemDirectWritable() { + MutexLock l(&mutex_); + return filesystem_writable_; + } + bool ShouldUseDiretWritable(const std::string& file_name) { + MutexLock l(&mutex_); + if (filesystem_writable_) { + return true; + } + FileType file_type = kTempFile; + uint64_t file_number = 0; + if (!TryParseFileName(file_name, &file_number, &file_type)) { + return false; + } + return skip_direct_writable_types_.find(file_type) != + skip_direct_writable_types_.end(); + } + void SetFilesystemActiveNoLock( + bool active, IOStatus error = IOStatus::Corruption("Not active")) { + error.PermitUncheckedError(); + filesystem_active_ = active; + if (!active) { + error_ = error; + } + } + void SetFilesystemActive( + bool active, IOStatus error = IOStatus::Corruption("Not active")) { + MutexLock l(&mutex_); + error.PermitUncheckedError(); + SetFilesystemActiveNoLock(active, error); + } + void SetFilesystemDirectWritable(bool writable) { + MutexLock l(&mutex_); + filesystem_writable_ = writable; + } + void AssertNoOpenFile() { assert(open_managed_files_.empty()); } + + IOStatus GetError() { return error_; } + + void SetFileSystemIOError(IOStatus io_error) { + MutexLock l(&mutex_); + io_error.PermitUncheckedError(); + error_ = io_error; + } + + // To simulate the data corruption before data is written in FS + void IngestDataCorruptionBeforeWrite() { + MutexLock l(&mutex_); + ingest_data_corruption_before_write_ = true; + } + + void NoDataCorruptionBeforeWrite() { + MutexLock l(&mutex_); + ingest_data_corruption_before_write_ = false; + } + + bool ShouldDataCorruptionBeforeWrite() { + MutexLock l(&mutex_); + return ingest_data_corruption_before_write_; + } + + void SetChecksumHandoffFuncType(const ChecksumType& func_type) { + MutexLock l(&mutex_); + checksum_handoff_func_tpye_ = func_type; + } + + const ChecksumType& GetChecksumHandoffFuncType() { + MutexLock l(&mutex_); + return checksum_handoff_func_tpye_; + } + + void SetFailGetUniqueId(bool flag) { + MutexLock l(&mutex_); + fail_get_file_unique_id_ = flag; + } + + bool ShouldFailGetUniqueId() { + MutexLock l(&mutex_); + return fail_get_file_unique_id_; + } + + // Specify what the operation, so we can inject the right type of error + enum ErrorOperation : char { + kRead = 0, + kMultiReadSingleReq = 1, + kMultiRead = 2, + kOpen, + }; + + // Set thread-local parameters for error injection. The first argument, + // seed is the seed for the random number generator, and one_in determines + // the probability of injecting error (i.e an error is injected with + // 1/one_in probability) + void SetThreadLocalReadErrorContext(uint32_t seed, int one_in) { + struct ErrorContext* ctx = + static_cast(thread_local_error_->Get()); + if (ctx == nullptr) { + ctx = new ErrorContext(seed); + thread_local_error_->Reset(ctx); + } + ctx->one_in = one_in; + ctx->count = 0; + } + + static void DeleteThreadLocalErrorContext(void* p) { + ErrorContext* ctx = static_cast(p); + delete ctx; + } + + // This is to set the parameters for the write error injection. + // seed is the seed for the random number generator, and one_in determines + // the probability of injecting error (i.e an error is injected with + // 1/one_in probability). For write error, we can specify the error we + // want to inject. Types decides the file types we want to inject the + // error (e.g., Wal files, SST files), which is empty by default. 
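For example, the setter declared just below could be used like this (hypothetical values; `fault_fs` is an already-constructed FaultInjectionTestFS): fail roughly 1 in 100 writes to SST files with an injected IOError.

fault_fs->SetRandomWriteError(/*seed=*/12345, /*one_in=*/100,
                              IOStatus::IOError("injected write error"),
                              /*inject_for_all_file_types=*/false,
                              {FileType::kTableFile});
fault_fs->EnableWriteErrorInjection();
// ... run the write-heavy workload ...
fault_fs->DisableWriteErrorInjection();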
+ void SetRandomWriteError(uint32_t seed, int one_in, IOStatus error, + bool inject_for_all_file_types, + const std::vector& types) { + MutexLock l(&mutex_); + Random tmp_rand(seed); + error.PermitUncheckedError(); + error_ = error; + write_error_rand_ = tmp_rand; + write_error_one_in_ = one_in; + inject_for_all_file_types_ = inject_for_all_file_types; + write_error_allowed_types_ = types; + } + + void SetSkipDirectWritableTypes(const std::set& types) { + MutexLock l(&mutex_); + skip_direct_writable_types_ = types; + } + + void SetRandomMetadataWriteError(int one_in) { + MutexLock l(&mutex_); + metadata_write_error_one_in_ = one_in; + } + // If the value is not 0, it is enabled. Otherwise, it is disabled. + void SetRandomReadError(int one_in) { read_error_one_in_ = one_in; } + + bool ShouldInjectRandomReadError() { + return read_error_one_in() && + Random::GetTLSInstance()->OneIn(read_error_one_in()); + } + + // Inject an write error with randomlized parameter and the predefined + // error type. Only the allowed file types will inject the write error + IOStatus InjectWriteError(const std::string& file_name); + + // Ingest error to metadata operations. + IOStatus InjectMetadataWriteError(); + + // Inject an error. For a READ operation, a status of IOError(), a + // corruption in the contents of scratch, or truncation of slice + // are the types of error with equal probability. For OPEN, + // its always an IOError. + // fault_injected returns whether a fault is injected. It is needed + // because some fault is inected with IOStatus to be OK. + IOStatus InjectThreadSpecificReadError(ErrorOperation op, Slice* slice, + bool direct_io, char* scratch, + bool need_count_increase, + bool* fault_injected); + + // Get the count of how many times we injected since the previous call + int GetAndResetErrorCount() { + ErrorContext* ctx = static_cast(thread_local_error_->Get()); + int count = 0; + if (ctx != nullptr) { + count = ctx->count; + ctx->count = 0; + } + return count; + } + + void EnableErrorInjection() { + ErrorContext* ctx = static_cast(thread_local_error_->Get()); + if (ctx) { + ctx->enable_error_injection = true; + } + } + + void EnableWriteErrorInjection() { + MutexLock l(&mutex_); + enable_write_error_injection_ = true; + } + void EnableMetadataWriteErrorInjection() { + MutexLock l(&mutex_); + enable_metadata_write_error_injection_ = true; + } + + void DisableWriteErrorInjection() { + MutexLock l(&mutex_); + enable_write_error_injection_ = false; + } + + void DisableErrorInjection() { + ErrorContext* ctx = static_cast(thread_local_error_->Get()); + if (ctx) { + ctx->enable_error_injection = false; + } + } + + void DisableMetadataWriteErrorInjection() { + MutexLock l(&mutex_); + enable_metadata_write_error_injection_ = false; + } + + int read_error_one_in() const { return read_error_one_in_.load(); } + + int write_error_one_in() const { return write_error_one_in_; } + + // We capture a backtrace every time a fault is injected, for debugging + // purposes. This call prints the backtrace to stderr and frees the + // saved callstack + void PrintFaultBacktrace(); + + private: + port::Mutex mutex_; + std::map db_file_state_; + std::set open_managed_files_; + // directory -> (file name -> file contents to recover) + // When data is recovered from unsyned parent directory, the files with + // empty file contents to recover is deleted. Those with non-empty ones + // will be recovered to content accordingly. 
+ std::unordered_map> + dir_to_new_files_since_last_sync_; + bool filesystem_active_; // Record flushes, syncs, writes + bool filesystem_writable_; // Bypass FaultInjectionTestFS and go directly + // to underlying FS for writable files + IOStatus error_; + + enum ErrorType : int { + kErrorTypeStatus = 0, + kErrorTypeCorruption, + kErrorTypeTruncated, + kErrorTypeMax + }; + + struct ErrorContext { + Random rand; + int one_in; + int count; + bool enable_error_injection; + void* callstack; + std::string message; + int frames; + ErrorType type; + + explicit ErrorContext(uint32_t seed) + : rand(seed), + enable_error_injection(false), + callstack(nullptr), + frames(0) {} + ~ErrorContext() { + if (callstack) { + free(callstack); + } + } + }; + + std::unique_ptr thread_local_error_; + bool enable_write_error_injection_; + bool enable_metadata_write_error_injection_; + Random write_error_rand_; + int write_error_one_in_; + int metadata_write_error_one_in_; + std::atomic read_error_one_in_; + bool inject_for_all_file_types_; + std::vector write_error_allowed_types_; + // File types where direct writable is skipped. + std::set skip_direct_writable_types_; + bool ingest_data_corruption_before_write_; + ChecksumType checksum_handoff_func_tpye_; + bool fail_get_file_unique_id_; + + // Extract number of type from file name. Return false if failing to fine + // them. + bool TryParseFileName(const std::string& file_name, uint64_t* number, + FileType* type); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/fault_injection_secondary_cache.cc b/src/rocksdb/utilities/fault_injection_secondary_cache.cc new file mode 100644 index 000000000..2758c2a19 --- /dev/null +++ b/src/rocksdb/utilities/fault_injection_secondary_cache.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// This class implements a custom SecondaryCache that randomly injects an +// error status into Inserts/Lookups based on a specified probability. 
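Before the secondary-cache variant below, here is a rough sketch of how a test might drive the FaultInjectionTestFS declared above. It is not part of this patch: the database path, the helper name, and the wiring through NewCompositeEnv() are illustrative assumptions; the fault-injection calls themselves (SetThreadLocalReadErrorContext, EnableErrorInjection, GetAndResetErrorCount) are the ones declared in the header above.

#include <memory>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/file_system.h"
#include "utilities/fault_injection_fs.h"

void ReadFaultSketch() {
  using namespace ROCKSDB_NAMESPACE;

  // Wrap the default filesystem, then wrap that in an Env the DB can use.
  auto fault_fs =
      std::make_shared<FaultInjectionTestFS>(FileSystem::Default());
  std::unique_ptr<Env> env = NewCompositeEnv(fault_fs);

  Options options;
  options.create_if_missing = true;
  options.env = env.get();

  DB* db = nullptr;
  if (!DB::Open(options, "/tmp/fault_fs_demo", &db).ok()) {
    return;
  }

  // Arm thread-local read-error injection: roughly 1 in 4 reads should fail.
  fault_fs->SetThreadLocalReadErrorContext(/*seed=*/301, /*one_in=*/4);
  fault_fs->EnableErrorInjection();

  std::string value;
  for (int i = 0; i < 100; ++i) {
    // Some of these are expected to come back with an injected error.
    db->Get(ReadOptions(), "key" + std::to_string(i), &value)
        .PermitUncheckedError();
  }

  // Count of faults actually injected on this thread since the last call.
  int injected = fault_fs->GetAndResetErrorCount();
  (void)injected;

  fault_fs->DisableErrorInjection();
  delete db;
}

A test would typically assert on the injected count, or call SetFilesystemActive(false) instead to emulate the whole-system reset described in the comments above.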
+ +#include "utilities/fault_injection_secondary_cache.h" + +namespace ROCKSDB_NAMESPACE { + +void FaultInjectionSecondaryCache::ResultHandle::UpdateHandleValue( + FaultInjectionSecondaryCache::ResultHandle* handle) { + ErrorContext* ctx = handle->cache_->GetErrorContext(); + if (!ctx->rand.OneIn(handle->cache_->prob_)) { + handle->value_ = handle->base_->Value(); + handle->size_ = handle->base_->Size(); + } + handle->base_.reset(); +} + +bool FaultInjectionSecondaryCache::ResultHandle::IsReady() { + bool ready = true; + if (base_) { + ready = base_->IsReady(); + if (ready) { + UpdateHandleValue(this); + } + } + return ready; +} + +void FaultInjectionSecondaryCache::ResultHandle::Wait() { + base_->Wait(); + UpdateHandleValue(this); +} + +void* FaultInjectionSecondaryCache::ResultHandle::Value() { return value_; } + +size_t FaultInjectionSecondaryCache::ResultHandle::Size() { return size_; } + +void FaultInjectionSecondaryCache::ResultHandle::WaitAll( + FaultInjectionSecondaryCache* cache, + std::vector handles) { + std::vector base_handles; + for (SecondaryCacheResultHandle* hdl : handles) { + FaultInjectionSecondaryCache::ResultHandle* handle = + static_cast(hdl); + if (!handle->base_) { + continue; + } + base_handles.emplace_back(handle->base_.get()); + } + + cache->base_->WaitAll(base_handles); + for (SecondaryCacheResultHandle* hdl : handles) { + FaultInjectionSecondaryCache::ResultHandle* handle = + static_cast(hdl); + if (handle->base_) { + UpdateHandleValue(handle); + } + } +} + +FaultInjectionSecondaryCache::ErrorContext* +FaultInjectionSecondaryCache::GetErrorContext() { + ErrorContext* ctx = static_cast(thread_local_error_->Get()); + if (!ctx) { + ctx = new ErrorContext(seed_); + thread_local_error_->Reset(ctx); + } + + return ctx; +} + +Status FaultInjectionSecondaryCache::Insert( + const Slice& key, void* value, const Cache::CacheItemHelper* helper) { + ErrorContext* ctx = GetErrorContext(); + if (ctx->rand.OneIn(prob_)) { + return Status::IOError(); + } + + return base_->Insert(key, value, helper); +} + +std::unique_ptr +FaultInjectionSecondaryCache::Lookup(const Slice& key, + const Cache::CreateCallback& create_cb, + bool wait, bool advise_erase, + bool& is_in_sec_cache) { + ErrorContext* ctx = GetErrorContext(); + if (base_is_compressed_sec_cache_) { + if (ctx->rand.OneIn(prob_)) { + return nullptr; + } else { + return base_->Lookup(key, create_cb, wait, advise_erase, is_in_sec_cache); + } + } else { + std::unique_ptr hdl = + base_->Lookup(key, create_cb, wait, advise_erase, is_in_sec_cache); + if (wait && ctx->rand.OneIn(prob_)) { + hdl.reset(); + } + return std::unique_ptr( + new FaultInjectionSecondaryCache::ResultHandle(this, std::move(hdl))); + } +} + +void FaultInjectionSecondaryCache::Erase(const Slice& key) { + base_->Erase(key); +} + +void FaultInjectionSecondaryCache::WaitAll( + std::vector handles) { + if (base_is_compressed_sec_cache_) { + ErrorContext* ctx = GetErrorContext(); + std::vector base_handles; + for (SecondaryCacheResultHandle* hdl : handles) { + if (ctx->rand.OneIn(prob_)) { + continue; + } + base_handles.push_back(hdl); + } + base_->WaitAll(base_handles); + } else { + FaultInjectionSecondaryCache::ResultHandle::WaitAll(this, handles); + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/fault_injection_secondary_cache.h b/src/rocksdb/utilities/fault_injection_secondary_cache.h new file mode 100644 index 000000000..5321df626 --- /dev/null +++ b/src/rocksdb/utilities/fault_injection_secondary_cache.h @@ -0,0 +1,108 @@ +// 
Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/secondary_cache.h" +#include "util/random.h" +#include "util/thread_local.h" + +namespace ROCKSDB_NAMESPACE { + +// This class implements a custom SecondaryCache that randomly injects an +// error status into Inserts/Lookups based on a specified probability. +// Its used by db_stress to verify correctness in the presence of +// secondary cache errors. +// +class FaultInjectionSecondaryCache : public SecondaryCache { + public: + explicit FaultInjectionSecondaryCache( + const std::shared_ptr& base, uint32_t seed, int prob) + : base_(base), + seed_(seed), + prob_(prob), + thread_local_error_(new ThreadLocalPtr(DeleteThreadLocalErrorContext)) { + if (std::strcmp(base_->Name(), "CompressedSecondaryCache") == 0) { + base_is_compressed_sec_cache_ = true; + } + } + + virtual ~FaultInjectionSecondaryCache() override {} + + const char* Name() const override { return "FaultInjectionSecondaryCache"; } + + Status Insert(const Slice& key, void* value, + const Cache::CacheItemHelper* helper) override; + + std::unique_ptr Lookup( + const Slice& key, const Cache::CreateCallback& create_cb, bool wait, + bool advise_erase, bool& is_in_sec_cache) override; + + bool SupportForceErase() const override { return base_->SupportForceErase(); } + + void Erase(const Slice& key) override; + + void WaitAll(std::vector handles) override; + + Status SetCapacity(size_t capacity) override { + return base_->SetCapacity(capacity); + } + + Status GetCapacity(size_t& capacity) override { + return base_->GetCapacity(capacity); + } + + std::string GetPrintableOptions() const override { + return base_->GetPrintableOptions(); + } + + private: + class ResultHandle : public SecondaryCacheResultHandle { + public: + ResultHandle(FaultInjectionSecondaryCache* cache, + std::unique_ptr&& base) + : cache_(cache), base_(std::move(base)), value_(nullptr), size_(0) {} + + ~ResultHandle() override {} + + bool IsReady() override; + + void Wait() override; + + void* Value() override; + + size_t Size() override; + + static void WaitAll(FaultInjectionSecondaryCache* cache, + std::vector handles); + + private: + static void UpdateHandleValue(ResultHandle* handle); + + FaultInjectionSecondaryCache* cache_; + std::unique_ptr base_; + void* value_; + size_t size_; + }; + + static void DeleteThreadLocalErrorContext(void* p) { + ErrorContext* ctx = static_cast(p); + delete ctx; + } + + const std::shared_ptr base_; + uint32_t seed_; + int prob_; + bool base_is_compressed_sec_cache_{false}; + + struct ErrorContext { + Random rand; + + explicit ErrorContext(uint32_t seed) : rand(seed) {} + }; + std::unique_ptr thread_local_error_; + + ErrorContext* GetErrorContext(); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/leveldb_options/leveldb_options.cc b/src/rocksdb/utilities/leveldb_options/leveldb_options.cc new file mode 100644 index 000000000..125c3d956 --- /dev/null +++ b/src/rocksdb/utilities/leveldb_options/leveldb_options.cc @@ -0,0 +1,57 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. 
All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/utilities/leveldb_options.h" + +#include "rocksdb/cache.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { + +LevelDBOptions::LevelDBOptions() + : comparator(BytewiseComparator()), + create_if_missing(false), + error_if_exists(false), + paranoid_checks(false), + env(Env::Default()), + info_log(nullptr), + write_buffer_size(4 << 20), + max_open_files(1000), + block_cache(nullptr), + block_size(4096), + block_restart_interval(16), + compression(kSnappyCompression), + filter_policy(nullptr) {} + +Options ConvertOptions(const LevelDBOptions& leveldb_options) { + Options options = Options(); + options.create_if_missing = leveldb_options.create_if_missing; + options.error_if_exists = leveldb_options.error_if_exists; + options.paranoid_checks = leveldb_options.paranoid_checks; + options.env = leveldb_options.env; + options.info_log.reset(leveldb_options.info_log); + options.write_buffer_size = leveldb_options.write_buffer_size; + options.max_open_files = leveldb_options.max_open_files; + options.compression = leveldb_options.compression; + + BlockBasedTableOptions table_options; + table_options.block_cache.reset(leveldb_options.block_cache); + table_options.block_size = leveldb_options.block_size; + table_options.block_restart_interval = leveldb_options.block_restart_interval; + table_options.filter_policy.reset(leveldb_options.filter_policy); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + return options; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/memory/memory_test.cc b/src/rocksdb/utilities/memory/memory_test.cc new file mode 100644 index 000000000..0b043af0e --- /dev/null +++ b/src/rocksdb/utilities/memory/memory_test.cc @@ -0,0 +1,279 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
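Circling back to leveldb_options.cc above: the ConvertOptions() helper is meant to let code written against LevelDB-style options open a RocksDB database. A minimal sketch, with illustrative field values and path that are not taken from this patch:

#include <string>

#include "rocksdb/db.h"
#include "rocksdb/utilities/leveldb_options.h"

ROCKSDB_NAMESPACE::Status OpenWithLevelDBOptions(const std::string& path,
                                                 ROCKSDB_NAMESPACE::DB** db) {
  using namespace ROCKSDB_NAMESPACE;

  LevelDBOptions lopts;
  lopts.create_if_missing = true;
  lopts.write_buffer_size = 8 << 20;  // LevelDB-style knob: 8 MB memtable
  lopts.block_size = 16 * 1024;       // ends up in BlockBasedTableOptions
  lopts.max_open_files = 500;

  // block_size, block_restart_interval, block_cache and filter_policy are
  // mapped onto a block-based table factory, as shown in ConvertOptions().
  Options options = ConvertOptions(lopts);
  return DB::Open(options, path, db);
}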
+ +#ifndef ROCKSDB_LITE + +#include "db/db_impl/db_impl.h" +#include "rocksdb/cache.h" +#include "rocksdb/table.h" +#include "rocksdb/utilities/memory_util.h" +#include "rocksdb/utilities/stackable_db.h" +#include "table/block_based/block_based_table_factory.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/random.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +class MemoryTest : public testing::Test { + public: + MemoryTest() : kDbDir(test::PerThreadDBPath("memory_test")), rnd_(301) { + assert(Env::Default()->CreateDirIfMissing(kDbDir).ok()); + } + + std::string GetDBName(int id) { return kDbDir + "db_" + std::to_string(id); } + + void UpdateUsagesHistory(const std::vector& dbs) { + std::map usage_by_type; + ASSERT_OK(GetApproximateMemoryUsageByType(dbs, &usage_by_type)); + for (int i = 0; i < MemoryUtil::kNumUsageTypes; ++i) { + usage_history_[i].push_back( + usage_by_type[static_cast(i)]); + } + } + + void GetCachePointersFromTableFactory( + const TableFactory* factory, + std::unordered_set* cache_set) { + const auto bbto = factory->GetOptions(); + if (bbto != nullptr) { + cache_set->insert(bbto->block_cache.get()); + cache_set->insert(bbto->block_cache_compressed.get()); + } + } + + void GetCachePointers(const std::vector& dbs, + std::unordered_set* cache_set) { + cache_set->clear(); + + for (auto* db : dbs) { + assert(db); + + // Cache from DBImpl + StackableDB* sdb = dynamic_cast(db); + DBImpl* db_impl = dynamic_cast(sdb ? sdb->GetBaseDB() : db); + if (db_impl != nullptr) { + cache_set->insert(db_impl->TEST_table_cache()); + } + + // Cache from DBOptions + cache_set->insert(db->GetDBOptions().row_cache.get()); + + // Cache from table factories + std::unordered_map iopts_map; + if (db_impl != nullptr) { + ASSERT_OK(db_impl->TEST_GetAllImmutableCFOptions(&iopts_map)); + } + for (auto pair : iopts_map) { + GetCachePointersFromTableFactory(pair.second->table_factory.get(), + cache_set); + } + } + } + + Status GetApproximateMemoryUsageByType( + const std::vector& dbs, + std::map* usage_by_type) { + std::unordered_set cache_set; + GetCachePointers(dbs, &cache_set); + + return MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set, + usage_by_type); + } + + const std::string kDbDir; + Random rnd_; + std::vector usage_history_[MemoryUtil::kNumUsageTypes]; +}; + +TEST_F(MemoryTest, SharedBlockCacheTotal) { + std::vector dbs; + std::vector usage_by_type; + const int kNumDBs = 10; + const int kKeySize = 100; + const int kValueSize = 500; + Options opt; + opt.create_if_missing = true; + opt.write_buffer_size = kKeySize + kValueSize; + opt.max_write_buffer_number = 10; + opt.min_write_buffer_number_to_merge = 10; + opt.disable_auto_compactions = true; + BlockBasedTableOptions bbt_opts; + bbt_opts.block_cache = NewLRUCache(4096 * 1000 * 10); + for (int i = 0; i < kNumDBs; ++i) { + ASSERT_OK(DestroyDB(GetDBName(i), opt)); + DB* db = nullptr; + ASSERT_OK(DB::Open(opt, GetDBName(i), &db)); + dbs.push_back(db); + } + + std::vector keys_by_db[kNumDBs]; + + // Fill one memtable per Put to make memtable use more memory. 
+ for (int p = 0; p < opt.min_write_buffer_number_to_merge / 2; ++p) { + for (int i = 0; i < kNumDBs; ++i) { + for (int j = 0; j < 100; ++j) { + keys_by_db[i].emplace_back(rnd_.RandomString(kKeySize)); + ASSERT_OK(dbs[i]->Put(WriteOptions(), keys_by_db[i].back(), + rnd_.RandomString(kValueSize))); + } + ASSERT_OK(dbs[i]->Flush(FlushOptions())); + } + } + for (int i = 0; i < kNumDBs; ++i) { + for (auto& key : keys_by_db[i]) { + std::string value; + ASSERT_OK(dbs[i]->Get(ReadOptions(), key, &value)); + } + UpdateUsagesHistory(dbs); + } + for (size_t i = 1; i < usage_history_[MemoryUtil::kMemTableTotal].size(); + ++i) { + // Expect EQ as we didn't flush more memtables. + ASSERT_EQ(usage_history_[MemoryUtil::kTableReadersTotal][i], + usage_history_[MemoryUtil::kTableReadersTotal][i - 1]); + } + for (int i = 0; i < kNumDBs; ++i) { + delete dbs[i]; + } +} + +TEST_F(MemoryTest, MemTableAndTableReadersTotal) { + std::vector dbs; + std::vector usage_by_type; + std::vector> vec_handles; + const int kNumDBs = 10; + // These key/value sizes ensure each KV has its own memtable. Note that the + // minimum write_buffer_size allowed is 64 KB. + const int kKeySize = 100; + const int kValueSize = 1 << 16; + Options opt; + opt.create_if_missing = true; + opt.create_missing_column_families = true; + opt.write_buffer_size = kKeySize + kValueSize; + opt.max_write_buffer_number = 10; + opt.min_write_buffer_number_to_merge = 10; + opt.disable_auto_compactions = true; + + std::vector cf_descs = { + {kDefaultColumnFamilyName, ColumnFamilyOptions(opt)}, + {"one", ColumnFamilyOptions(opt)}, + {"two", ColumnFamilyOptions(opt)}, + }; + + for (int i = 0; i < kNumDBs; ++i) { + ASSERT_OK(DestroyDB(GetDBName(i), opt)); + std::vector handles; + dbs.emplace_back(); + vec_handles.emplace_back(); + ASSERT_OK(DB::Open(DBOptions(opt), GetDBName(i), cf_descs, + &vec_handles.back(), &dbs.back())); + } + + // Fill one memtable per Put to make memtable use more memory. + for (int p = 0; p < opt.min_write_buffer_number_to_merge / 2; ++p) { + for (int i = 0; i < kNumDBs; ++i) { + for (auto* handle : vec_handles[i]) { + ASSERT_OK(dbs[i]->Put(WriteOptions(), handle, + rnd_.RandomString(kKeySize), + rnd_.RandomString(kValueSize))); + UpdateUsagesHistory(dbs); + } + } + } + // Expect the usage history is monotonically increasing + for (size_t i = 1; i < usage_history_[MemoryUtil::kMemTableTotal].size(); + ++i) { + ASSERT_GT(usage_history_[MemoryUtil::kMemTableTotal][i], + usage_history_[MemoryUtil::kMemTableTotal][i - 1]); + ASSERT_GT(usage_history_[MemoryUtil::kMemTableUnFlushed][i], + usage_history_[MemoryUtil::kMemTableUnFlushed][i - 1]); + ASSERT_EQ(usage_history_[MemoryUtil::kTableReadersTotal][i], + usage_history_[MemoryUtil::kTableReadersTotal][i - 1]); + } + + size_t usage_check_point = usage_history_[MemoryUtil::kMemTableTotal].size(); + std::vector iters; + + // Create an iterator and flush all memtables for each db + for (int i = 0; i < kNumDBs; ++i) { + iters.push_back(dbs[i]->NewIterator(ReadOptions())); + ASSERT_OK(dbs[i]->Flush(FlushOptions())); + + for (int j = 0; j < 100; ++j) { + std::string value; + ASSERT_NOK( + dbs[i]->Get(ReadOptions(), rnd_.RandomString(kKeySize), &value)); + } + + UpdateUsagesHistory(dbs); + } + for (size_t i = usage_check_point; + i < usage_history_[MemoryUtil::kMemTableTotal].size(); ++i) { + // Since memtables are pinned by iterators, we don't expect the + // memory usage of all the memtables decreases as they are pinned + // by iterators. 
+ ASSERT_GE(usage_history_[MemoryUtil::kMemTableTotal][i], + usage_history_[MemoryUtil::kMemTableTotal][i - 1]); + // Expect the usage history from the "usage_decay_point" is + // monotonically decreasing. + ASSERT_LT(usage_history_[MemoryUtil::kMemTableUnFlushed][i], + usage_history_[MemoryUtil::kMemTableUnFlushed][i - 1]); + // Expect the usage history of the table readers increases + // as we flush tables. + ASSERT_GT(usage_history_[MemoryUtil::kTableReadersTotal][i], + usage_history_[MemoryUtil::kTableReadersTotal][i - 1]); + ASSERT_GT(usage_history_[MemoryUtil::kCacheTotal][i], + usage_history_[MemoryUtil::kCacheTotal][i - 1]); + } + usage_check_point = usage_history_[MemoryUtil::kMemTableTotal].size(); + for (int i = 0; i < kNumDBs; ++i) { + // iterator is not used. + ASSERT_OK(iters[i]->status()); + delete iters[i]; + UpdateUsagesHistory(dbs); + } + for (size_t i = usage_check_point; + i < usage_history_[MemoryUtil::kMemTableTotal].size(); ++i) { + // Expect the usage of all memtables decreasing as we delete iterators. + ASSERT_LT(usage_history_[MemoryUtil::kMemTableTotal][i], + usage_history_[MemoryUtil::kMemTableTotal][i - 1]); + // Since the memory usage of un-flushed memtables is only affected + // by Put and flush, we expect EQ here as we only delete iterators. + ASSERT_EQ(usage_history_[MemoryUtil::kMemTableUnFlushed][i], + usage_history_[MemoryUtil::kMemTableUnFlushed][i - 1]); + // Expect EQ as we didn't flush more memtables. + ASSERT_EQ(usage_history_[MemoryUtil::kTableReadersTotal][i], + usage_history_[MemoryUtil::kTableReadersTotal][i - 1]); + } + + for (int i = 0; i < kNumDBs; ++i) { + for (auto* handle : vec_handles[i]) { + delete handle; + } + delete dbs[i]; + } +} +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { +#if !(defined NDEBUG) || !defined(OS_WIN) + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +#else + return 0; +#endif +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + printf("Skipped in RocksDBLite as utilities are not supported.\n"); + return 0; +} +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/utilities/memory/memory_util.cc b/src/rocksdb/utilities/memory/memory_util.cc new file mode 100644 index 000000000..13c81aec4 --- /dev/null +++ b/src/rocksdb/utilities/memory/memory_util.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
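The test above drives MemoryUtil through DBImpl internals; for reference, a minimal client-side sketch of the API implemented in memory_util.cc below might look like the following. The reporting function and its output format are assumptions for illustration; the usage types and the GetApproximateMemoryUsageByType() call are the ones declared in rocksdb/utilities/memory_util.h.

#include <cinttypes>
#include <cstdio>
#include <map>
#include <memory>
#include <unordered_set>
#include <vector>

#include "rocksdb/cache.h"
#include "rocksdb/db.h"
#include "rocksdb/utilities/memory_util.h"

void ReportMemoryUsage(const std::vector<ROCKSDB_NAMESPACE::DB*>& dbs,
                       const std::shared_ptr<ROCKSDB_NAMESPACE::Cache>& cache) {
  using namespace ROCKSDB_NAMESPACE;

  // Caches are passed separately because they may be shared across DBs.
  std::unordered_set<const Cache*> cache_set;
  cache_set.insert(cache.get());

  std::map<MemoryUtil::UsageType, uint64_t> usage;
  if (!MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set, &usage)
           .ok()) {
    return;
  }
  std::printf("memtables (total):     %" PRIu64 "\n",
              usage[MemoryUtil::kMemTableTotal]);
  std::printf("memtables (unflushed): %" PRIu64 "\n",
              usage[MemoryUtil::kMemTableUnFlushed]);
  std::printf("table readers:         %" PRIu64 "\n",
              usage[MemoryUtil::kTableReadersTotal]);
  std::printf("block cache:           %" PRIu64 "\n",
              usage[MemoryUtil::kCacheTotal]);
}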
+ +#ifndef ROCKSDB_LITE + +#include "rocksdb/utilities/memory_util.h" + +#include "db/db_impl/db_impl.h" + +namespace ROCKSDB_NAMESPACE { + +Status MemoryUtil::GetApproximateMemoryUsageByType( + const std::vector<DB*>& dbs, + const std::unordered_set<const Cache*> cache_set, + std::map<MemoryUtil::UsageType, uint64_t>* usage_by_type) { + usage_by_type->clear(); + + // MemTable + for (auto* db : dbs) { + uint64_t usage = 0; + if (db->GetAggregatedIntProperty(DB::Properties::kSizeAllMemTables, + &usage)) { + (*usage_by_type)[MemoryUtil::kMemTableTotal] += usage; + } + if (db->GetAggregatedIntProperty(DB::Properties::kCurSizeAllMemTables, + &usage)) { + (*usage_by_type)[MemoryUtil::kMemTableUnFlushed] += usage; + } + } + + // Table Readers + for (auto* db : dbs) { + uint64_t usage = 0; + if (db->GetAggregatedIntProperty(DB::Properties::kEstimateTableReadersMem, + &usage)) { + (*usage_by_type)[MemoryUtil::kTableReadersTotal] += usage; + } + } + + // Cache + for (const auto* cache : cache_set) { + if (cache != nullptr) { + (*usage_by_type)[MemoryUtil::kCacheTotal] += cache->GetUsage(); + } + } + + return Status::OK(); +} +} // namespace ROCKSDB_NAMESPACE +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/utilities/memory_allocators.h b/src/rocksdb/utilities/memory_allocators.h new file mode 100644 index 000000000..c9e77a5b7 --- /dev/null +++ b/src/rocksdb/utilities/memory_allocators.h @@ -0,0 +1,104 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <atomic> + +#include "rocksdb/memory_allocator.h" + +namespace ROCKSDB_NAMESPACE { +// A memory allocator using new/delete +class DefaultMemoryAllocator : public MemoryAllocator { + public: + static const char* kClassName() { return "DefaultMemoryAllocator"; } + const char* Name() const override { return kClassName(); } + void* Allocate(size_t size) override { + return static_cast<void*>(new char[size]); + } + + void Deallocate(void* p) override { delete[] static_cast<char*>(p); } +}; + +// Base class for a MemoryAllocator. This implementation does nothing +// and implements the methods in failure mode (assert if the methods are +// invoked). Implementations can extend this class and override these methods +// when they are enabled via compiler switches (e.g., the +// JeMallocMemoryAllocator can define these methods if ROCKSDB_JEMALLOC is +// defined at compile time). If compiled in "disabled" mode, this class provides +// default/failure implementations. If compiled in "enabled" mode, the derived +// class needs to provide the appropriate "enabled" methods for the "real" +// implementation. Failure of the "real" implementation to override +// any of these methods will result in an assert failure. +class BaseMemoryAllocator : public MemoryAllocator { + public: + void* Allocate(size_t /*size*/) override { + assert(false); + return nullptr; + } + + void Deallocate(void* /*p*/) override { assert(false); } +}; + +// A Wrapped MemoryAllocator. Delegates the memory allocator functions to the +// wrapped one.
+class MemoryAllocatorWrapper : public MemoryAllocator { + public: + // Initialize an MemoryAllocatorWrapper that delegates all calls to *t + explicit MemoryAllocatorWrapper(const std::shared_ptr& t); + ~MemoryAllocatorWrapper() override {} + + // Return the target to which to forward all calls + MemoryAllocator* target() const { return target_.get(); } + // Allocate a block of at least size. Has to be thread-safe. + void* Allocate(size_t size) override { return target_->Allocate(size); } + + // Deallocate previously allocated block. Has to be thread-safe. + void Deallocate(void* p) override { return target_->Deallocate(p); } + + // Returns the memory size of the block allocated at p. The default + // implementation that just returns the original allocation_size is fine. + size_t UsableSize(void* p, size_t allocation_size) const override { + return target_->UsableSize(p, allocation_size); + } + + const Customizable* Inner() const override { return target_.get(); } + + protected: + std::shared_ptr target_; +}; + +// A memory allocator that counts the number of allocations and deallocations +// This class is useful if the number of memory allocations/dellocations is +// important. +class CountedMemoryAllocator : public MemoryAllocatorWrapper { + public: + CountedMemoryAllocator() + : MemoryAllocatorWrapper(std::make_shared()), + allocations_(0), + deallocations_(0) {} + + explicit CountedMemoryAllocator(const std::shared_ptr& t) + : MemoryAllocatorWrapper(t), allocations_(0), deallocations_(0) {} + static const char* kClassName() { return "CountedMemoryAllocator"; } + const char* Name() const override { return kClassName(); } + std::string GetId() const override { return std::string(Name()); } + void* Allocate(size_t size) override { + allocations_++; + return MemoryAllocatorWrapper::Allocate(size); + } + + void Deallocate(void* p) override { + deallocations_++; + MemoryAllocatorWrapper::Deallocate(p); + } + uint64_t GetNumAllocations() const { return allocations_; } + uint64_t GetNumDeallocations() const { return deallocations_; } + + private: + std::atomic allocations_; + std::atomic deallocations_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/merge_operators.cc b/src/rocksdb/utilities/merge_operators.cc new file mode 100644 index 000000000..c97e9ce25 --- /dev/null +++ b/src/rocksdb/utilities/merge_operators.cc @@ -0,0 +1,120 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "utilities/merge_operators.h" + +#include + +#include "rocksdb/merge_operator.h" +#include "rocksdb/options.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/object_registry.h" +#include "utilities/merge_operators/bytesxor.h" +#include "utilities/merge_operators/sortlist.h" +#include "utilities/merge_operators/string_append/stringappend.h" +#include "utilities/merge_operators/string_append/stringappend2.h" + +namespace ROCKSDB_NAMESPACE { +static bool LoadMergeOperator(const std::string& id, + std::shared_ptr* result) { + bool success = true; + // TODO: Hook the "name" up to the actual Name() of the MergeOperators? + // Requires these classes be moved into a header file... 
+ if (id == "put" || id == "PutOperator") { + *result = MergeOperators::CreatePutOperator(); + } else if (id == "put_v1") { + *result = MergeOperators::CreateDeprecatedPutOperator(); + } else if (id == "uint64add" || id == "UInt64AddOperator") { + *result = MergeOperators::CreateUInt64AddOperator(); + } else if (id == "max" || id == "MaxOperator") { + *result = MergeOperators::CreateMaxOperator(); +#ifdef ROCKSDB_LITE + // The remainder of the classes are handled by the ObjectRegistry in + // non-LITE mode + } else if (id == StringAppendOperator::kNickName() || + id == StringAppendOperator::kClassName()) { + *result = MergeOperators::CreateStringAppendOperator(); + } else if (id == StringAppendTESTOperator::kNickName() || + id == StringAppendTESTOperator::kClassName()) { + *result = MergeOperators::CreateStringAppendTESTOperator(); + } else if (id == BytesXOROperator::kNickName() || + id == BytesXOROperator::kClassName()) { + *result = MergeOperators::CreateBytesXOROperator(); + } else if (id == SortList::kNickName() || id == SortList::kClassName()) { + *result = MergeOperators::CreateSortOperator(); +#endif // ROCKSDB_LITE + } else { + success = false; + } + return success; +} + +#ifndef ROCKSDB_LITE +static int RegisterBuiltinMergeOperators(ObjectLibrary& library, + const std::string& /*arg*/) { + size_t num_types; + library.AddFactory( + ObjectLibrary::PatternEntry(StringAppendOperator::kClassName()) + .AnotherName(StringAppendOperator::kNickName()), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /*errmsg*/) { + guard->reset(new StringAppendOperator(",")); + return guard->get(); + }); + library.AddFactory( + ObjectLibrary::PatternEntry(StringAppendTESTOperator::kClassName()) + .AnotherName(StringAppendTESTOperator::kNickName()), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /*errmsg*/) { + guard->reset(new StringAppendTESTOperator(",")); + return guard->get(); + }); + library.AddFactory( + ObjectLibrary::PatternEntry(SortList::kClassName()) + .AnotherName(SortList::kNickName()), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /*errmsg*/) { + guard->reset(new SortList()); + return guard->get(); + }); + library.AddFactory( + ObjectLibrary::PatternEntry(BytesXOROperator::kClassName()) + .AnotherName(BytesXOROperator::kNickName()), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /*errmsg*/) { + guard->reset(new BytesXOROperator()); + return guard->get(); + }); + + return static_cast(library.GetFactoryCount(&num_types)); +} +#endif // ROCKSDB_LITE + +Status MergeOperator::CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result) { +#ifndef ROCKSDB_LITE + static std::once_flag once; + std::call_once(once, [&]() { + RegisterBuiltinMergeOperators(*(ObjectLibrary::Default().get()), ""); + }); +#endif // ROCKSDB_LITE + return LoadSharedObject(config_options, value, + LoadMergeOperator, result); +} + +std::shared_ptr MergeOperators::CreateFromStringId( + const std::string& id) { + std::shared_ptr result; + Status s = MergeOperator::CreateFromString(ConfigOptions(), id, &result); + if (s.ok()) { + return result; + } else { + // Empty or unknown, just return nullptr + return nullptr; + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/merge_operators.h b/src/rocksdb/utilities/merge_operators.h new file mode 100644 index 000000000..9b90107e3 --- /dev/null +++ b/src/rocksdb/utilities/merge_operators.h @@ -0,0 +1,36 @@ +// 
Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once +#include + +#include +#include + +#include "rocksdb/merge_operator.h" + +namespace ROCKSDB_NAMESPACE { + +class MergeOperators { + public: + static std::shared_ptr CreatePutOperator(); + static std::shared_ptr CreateDeprecatedPutOperator(); + static std::shared_ptr CreateUInt64AddOperator(); + static std::shared_ptr CreateStringAppendOperator(); + static std::shared_ptr CreateStringAppendOperator( + char delim_char); + static std::shared_ptr CreateStringAppendOperator( + const std::string& delim); + static std::shared_ptr CreateStringAppendTESTOperator(); + static std::shared_ptr CreateMaxOperator(); + static std::shared_ptr CreateBytesXOROperator(); + static std::shared_ptr CreateSortOperator(); + + // Will return a different merge operator depending on the string. + static std::shared_ptr CreateFromStringId( + const std::string& name); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/merge_operators/bytesxor.cc b/src/rocksdb/utilities/merge_operators/bytesxor.cc new file mode 100644 index 000000000..fa09c18ea --- /dev/null +++ b/src/rocksdb/utilities/merge_operators/bytesxor.cc @@ -0,0 +1,57 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "utilities/merge_operators/bytesxor.h" + +#include +#include + +namespace ROCKSDB_NAMESPACE { + +std::shared_ptr MergeOperators::CreateBytesXOROperator() { + return std::make_shared(); +} + +bool BytesXOROperator::Merge(const Slice& /*key*/, const Slice* existing_value, + const Slice& value, std::string* new_value, + Logger* /*logger*/) const { + XOR(existing_value, value, new_value); + return true; +} + +void BytesXOROperator::XOR(const Slice* existing_value, const Slice& value, + std::string* new_value) const { + if (!existing_value) { + new_value->clear(); + new_value->assign(value.data(), value.size()); + return; + } + + size_t min_size = std::min(existing_value->size(), value.size()); + size_t max_size = std::max(existing_value->size(), value.size()); + + new_value->clear(); + new_value->reserve(max_size); + + const char* existing_value_data = existing_value->data(); + const char* value_data = value.data(); + + for (size_t i = 0; i < min_size; i++) { + new_value->push_back(existing_value_data[i] ^ value_data[i]); + } + + if (existing_value->size() == max_size) { + for (size_t i = min_size; i < max_size; i++) { + new_value->push_back(existing_value_data[i]); + } + } else { + assert(value.size() == max_size); + for (size_t i = min_size; i < max_size; i++) { + new_value->push_back(value_data[i]); + } + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/merge_operators/bytesxor.h b/src/rocksdb/utilities/merge_operators/bytesxor.h new file mode 100644 index 000000000..3c7baacce --- /dev/null +++ b/src/rocksdb/utilities/merge_operators/bytesxor.h @@ -0,0 +1,40 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include + +#include "rocksdb/env.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" +#include "util/coding.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { + +// A 'model' merge operator that XORs two (same sized) array of bytes. +// Implemented as an AssociativeMergeOperator for simplicity and example. +class BytesXOROperator : public AssociativeMergeOperator { + public: + // XORs the two array of bytes one byte at a time and stores the result + // in new_value. len is the number of xored bytes, and the length of new_value + virtual bool Merge(const Slice& key, const Slice* existing_value, + const Slice& value, std::string* new_value, + Logger* logger) const override; + + static const char* kClassName() { return "BytesXOR"; } + static const char* kNickName() { return "bytesxor"; } + + const char* NickName() const override { return kNickName(); } + const char* Name() const override { return kClassName(); } + + void XOR(const Slice* existing_value, const Slice& value, + std::string* new_value) const; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/merge_operators/max.cc b/src/rocksdb/utilities/merge_operators/max.cc new file mode 100644 index 000000000..de4abfa6f --- /dev/null +++ b/src/rocksdb/utilities/merge_operators/max.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
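The BytesXOROperator above is easiest to see from the client side. The sketch below is not part of the patch (the database path and helper name are assumptions); MergeOperators::CreateBytesXOROperator() is the factory defined in bytesxor.cc above.

#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "utilities/merge_operators.h"

void BytesXorSketch() {
  using namespace ROCKSDB_NAMESPACE;

  Options options;
  options.create_if_missing = true;
  options.merge_operator = MergeOperators::CreateBytesXOROperator();

  DB* db = nullptr;
  if (!DB::Open(options, "/tmp/bytesxor_demo", &db).ok()) {
    return;
  }

  // Start from four 0x0F bytes, then XOR in four 0xFF bytes via Merge.
  db->Put(WriteOptions(), "bitmask", std::string(4, '\x0f'))
      .PermitUncheckedError();
  db->Merge(WriteOptions(), "bitmask", std::string(4, '\xff'))
      .PermitUncheckedError();

  std::string result;
  if (db->Get(ReadOptions(), "bitmask", &result).ok()) {
    // result now holds four 0xF0 bytes (0x0F ^ 0xFF), applied at read time.
  }
  delete db;
}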
+ +#include + +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" +#include "utilities/merge_operators.h" + +using ROCKSDB_NAMESPACE::Logger; +using ROCKSDB_NAMESPACE::MergeOperator; +using ROCKSDB_NAMESPACE::Slice; + +namespace { // anonymous namespace + +// Merge operator that picks the maximum operand, Comparison is based on +// Slice::compare +class MaxOperator : public MergeOperator { + public: + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override { + Slice& max = merge_out->existing_operand; + if (merge_in.existing_value) { + max = Slice(merge_in.existing_value->data(), + merge_in.existing_value->size()); + } else if (max.data() == nullptr) { + max = Slice(); + } + + for (const auto& op : merge_in.operand_list) { + if (max.compare(op) < 0) { + max = op; + } + } + + return true; + } + + bool PartialMerge(const Slice& /*key*/, const Slice& left_operand, + const Slice& right_operand, std::string* new_value, + Logger* /*logger*/) const override { + if (left_operand.compare(right_operand) >= 0) { + new_value->assign(left_operand.data(), left_operand.size()); + } else { + new_value->assign(right_operand.data(), right_operand.size()); + } + return true; + } + + bool PartialMergeMulti(const Slice& /*key*/, + const std::deque& operand_list, + std::string* new_value, + Logger* /*logger*/) const override { + Slice max; + for (const auto& operand : operand_list) { + if (max.compare(operand) < 0) { + max = operand; + } + } + + new_value->assign(max.data(), max.size()); + return true; + } + + static const char* kClassName() { return "MaxOperator"; } + static const char* kNickName() { return "max"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kNickName(); } +}; + +} // end of anonymous namespace + +namespace ROCKSDB_NAMESPACE { + +std::shared_ptr MergeOperators::CreateMaxOperator() { + return std::make_shared(); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/merge_operators/put.cc b/src/rocksdb/utilities/merge_operators/put.cc new file mode 100644 index 000000000..ccf9ff21f --- /dev/null +++ b/src/rocksdb/utilities/merge_operators/put.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include + +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" +#include "utilities/merge_operators.h" + +namespace { // anonymous namespace + +using ROCKSDB_NAMESPACE::Logger; +using ROCKSDB_NAMESPACE::MergeOperator; +using ROCKSDB_NAMESPACE::Slice; + +// A merge operator that mimics Put semantics +// Since this merge-operator will not be used in production, +// it is implemented as a non-associative merge operator to illustrate the +// new interface and for testing purposes. (That is, we inherit from +// the MergeOperator class rather than the AssociativeMergeOperator +// which would be simpler in this case). +// +// From the client-perspective, semantics are the same. 
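To make the Put-style semantics just described concrete, here is a small client-side sketch. It is not from this patch; the pre-opened db handle and the key are assumptions. With the "put" operator installed, only the newest operand survives a merge:

#include <string>

#include "rocksdb/db.h"
#include "utilities/merge_operators.h"

// Assumes `db` was opened with
//   options.merge_operator = MergeOperators::CreatePutOperator();
void PutMergeSketch(ROCKSDB_NAMESPACE::DB* db) {
  using namespace ROCKSDB_NAMESPACE;

  db->Merge(WriteOptions(), "k", "v1").PermitUncheckedError();
  db->Merge(WriteOptions(), "k", "v2").PermitUncheckedError();

  std::string value;
  if (db->Get(ReadOptions(), "k", &value).ok()) {
    // value == "v2": earlier operands are discarded, exactly like a Put.
  }
}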
+class PutOperator : public MergeOperator { + public: + bool FullMerge(const Slice& /*key*/, const Slice* /*existing_value*/, + const std::deque& operand_sequence, + std::string* new_value, Logger* /*logger*/) const override { + // Put basically only looks at the current/latest value + assert(!operand_sequence.empty()); + assert(new_value != nullptr); + new_value->assign(operand_sequence.back()); + return true; + } + + bool PartialMerge(const Slice& /*key*/, const Slice& /*left_operand*/, + const Slice& right_operand, std::string* new_value, + Logger* /*logger*/) const override { + new_value->assign(right_operand.data(), right_operand.size()); + return true; + } + + using MergeOperator::PartialMergeMulti; + bool PartialMergeMulti(const Slice& /*key*/, + const std::deque& operand_list, + std::string* new_value, + Logger* /*logger*/) const override { + new_value->assign(operand_list.back().data(), operand_list.back().size()); + return true; + } + + static const char* kClassName() { return "PutOperator"; } + static const char* kNickName() { return "put_v1"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kNickName(); } +}; + +class PutOperatorV2 : public PutOperator { + bool FullMerge(const Slice& /*key*/, const Slice* /*existing_value*/, + const std::deque& /*operand_sequence*/, + std::string* /*new_value*/, + Logger* /*logger*/) const override { + assert(false); + return false; + } + + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override { + // Put basically only looks at the current/latest value + assert(!merge_in.operand_list.empty()); + merge_out->existing_operand = merge_in.operand_list.back(); + return true; + } + + static const char* kNickName() { return "put"; } + const char* NickName() const override { return kNickName(); } +}; + +} // end of anonymous namespace + +namespace ROCKSDB_NAMESPACE { + +std::shared_ptr MergeOperators::CreateDeprecatedPutOperator() { + return std::make_shared(); +} + +std::shared_ptr MergeOperators::CreatePutOperator() { + return std::make_shared(); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/merge_operators/sortlist.cc b/src/rocksdb/utilities/merge_operators/sortlist.cc new file mode 100644 index 000000000..67bfc7e5e --- /dev/null +++ b/src/rocksdb/utilities/merge_operators/sortlist.cc @@ -0,0 +1,95 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
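The MaxOperator in max.cc above follows the same client-side pattern. A sketch under the same assumptions, with a db already opened with MergeOperators::CreateMaxOperator():

#include <string>

#include "rocksdb/db.h"
#include "utilities/merge_operators.h"

// Assumes `db` was opened with
//   options.merge_operator = MergeOperators::CreateMaxOperator();
void MaxMergeSketch(ROCKSDB_NAMESPACE::DB* db) {
  using namespace ROCKSDB_NAMESPACE;

  // Operands are compared with Slice::compare, i.e. bytewise.
  db->Merge(WriteOptions(), "fruit", "apple").PermitUncheckedError();
  db->Merge(WriteOptions(), "fruit", "banana").PermitUncheckedError();
  db->Merge(WriteOptions(), "fruit", "apricot").PermitUncheckedError();

  std::string value;
  if (db->Get(ReadOptions(), "fruit", &value).ok()) {
    // value == "banana", the bytewise-largest operand seen so far.
  }
}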
+#include "utilities/merge_operators/sortlist.h" + +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { + +bool SortList::FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const { + std::vector left; + for (Slice slice : merge_in.operand_list) { + std::vector right; + MakeVector(right, slice); + left = Merge(left, right); + } + for (int i = 0; i < static_cast(left.size()) - 1; i++) { + merge_out->new_value.append(std::to_string(left[i])).append(","); + } + merge_out->new_value.append(std::to_string(left.back())); + return true; +} + +bool SortList::PartialMerge(const Slice& /*key*/, const Slice& left_operand, + const Slice& right_operand, std::string* new_value, + Logger* /*logger*/) const { + std::vector left; + std::vector right; + MakeVector(left, left_operand); + MakeVector(right, right_operand); + left = Merge(left, right); + for (int i = 0; i < static_cast(left.size()) - 1; i++) { + new_value->append(std::to_string(left[i])).append(","); + } + new_value->append(std::to_string(left.back())); + return true; +} + +bool SortList::PartialMergeMulti(const Slice& /*key*/, + const std::deque& operand_list, + std::string* new_value, + Logger* /*logger*/) const { + (void)operand_list; + (void)new_value; + return true; +} + +void SortList::MakeVector(std::vector& operand, Slice slice) const { + do { + const char* begin = slice.data_; + while (*slice.data_ != ',' && *slice.data_) slice.data_++; + operand.push_back(std::stoi(std::string(begin, slice.data_))); + } while (0 != *slice.data_++); +} + +std::vector SortList::Merge(std::vector& left, + std::vector& right) const { + // Fill the resultant vector with sorted results from both vectors + std::vector result; + unsigned left_it = 0, right_it = 0; + + while (left_it < left.size() && right_it < right.size()) { + // If the left value is smaller than the right it goes next + // into the resultant vector + if (left[left_it] < right[right_it]) { + result.push_back(left[left_it]); + left_it++; + } else { + result.push_back(right[right_it]); + right_it++; + } + } + + // Push the remaining data from both vectors onto the resultant + while (left_it < left.size()) { + result.push_back(left[left_it]); + left_it++; + } + + while (right_it < right.size()) { + result.push_back(right[right_it]); + right_it++; + } + + return result; +} + +std::shared_ptr MergeOperators::CreateSortOperator() { + return std::make_shared(); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/merge_operators/sortlist.h b/src/rocksdb/utilities/merge_operators/sortlist.h new file mode 100644 index 000000000..eaa4e76fb --- /dev/null +++ b/src/rocksdb/utilities/merge_operators/sortlist.h @@ -0,0 +1,42 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// A MergeOperator for RocksDB that implements Merge Sort. +// It is built using the MergeOperator interface. The operator works by taking +// an input which contains one or more merge operands where each operand is a +// list of sorted ints and merges them to form a large sorted list. 
+#pragma once + +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { + +class SortList : public MergeOperator { + public: + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override; + + bool PartialMerge(const Slice& /*key*/, const Slice& left_operand, + const Slice& right_operand, std::string* new_value, + Logger* /*logger*/) const override; + + bool PartialMergeMulti(const Slice& key, + const std::deque& operand_list, + std::string* new_value, Logger* logger) const override; + + static const char* kClassName() { return "MergeSortOperator"; } + static const char* kNickName() { return "sortlist"; } + + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kNickName(); } + + void MakeVector(std::vector& operand, Slice slice) const; + + private: + std::vector Merge(std::vector& left, std::vector& right) const; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/merge_operators/string_append/stringappend.cc b/src/rocksdb/utilities/merge_operators/string_append/stringappend.cc new file mode 100644 index 000000000..5092cabcb --- /dev/null +++ b/src/rocksdb/utilities/merge_operators/string_append/stringappend.cc @@ -0,0 +1,78 @@ +/** + * A MergeOperator for rocksdb that implements string append. + * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook + */ + +#include "stringappend.h" + +#include + +#include + +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" +#include "rocksdb/utilities/options_type.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +static std::unordered_map + stringappend_merge_type_info = { +#ifndef ROCKSDB_LITE + {"delimiter", + {0, OptionType::kString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +#endif // ROCKSDB_LITE +}; +} // namespace +// Constructor: also specify the delimiter character. +StringAppendOperator::StringAppendOperator(char delim_char) + : delim_(1, delim_char) { + RegisterOptions("Delimiter", &delim_, &stringappend_merge_type_info); +} + +StringAppendOperator::StringAppendOperator(const std::string& delim) + : delim_(delim) { + RegisterOptions("Delimiter", &delim_, &stringappend_merge_type_info); +} + +// Implementation for the merge operation (concatenates two strings) +bool StringAppendOperator::Merge(const Slice& /*key*/, + const Slice* existing_value, + const Slice& value, std::string* new_value, + Logger* /*logger*/) const { + // Clear the *new_value for writing. + assert(new_value); + new_value->clear(); + + if (!existing_value) { + // No existing_value. Set *new_value = value + new_value->assign(value.data(), value.size()); + } else { + // Generic append (existing_value != null). + // Reserve *new_value to correct size, and apply concatenation. 
+ new_value->reserve(existing_value->size() + delim_.size() + value.size()); + new_value->assign(existing_value->data(), existing_value->size()); + new_value->append(delim_); + new_value->append(value.data(), value.size()); + } + + return true; +} + +std::shared_ptr MergeOperators::CreateStringAppendOperator() { + return std::make_shared(','); +} + +std::shared_ptr MergeOperators::CreateStringAppendOperator( + char delim_char) { + return std::make_shared(delim_char); +} + +std::shared_ptr MergeOperators::CreateStringAppendOperator( + const std::string& delim) { + return std::make_shared(delim); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/merge_operators/string_append/stringappend.h b/src/rocksdb/utilities/merge_operators/string_append/stringappend.h new file mode 100644 index 000000000..153532382 --- /dev/null +++ b/src/rocksdb/utilities/merge_operators/string_append/stringappend.h @@ -0,0 +1,32 @@ +/** + * A MergeOperator for rocksdb that implements string append. + * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook + */ + +#pragma once +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { + +class StringAppendOperator : public AssociativeMergeOperator { + public: + // Constructor: specify delimiter + explicit StringAppendOperator(char delim_char); + explicit StringAppendOperator(const std::string& delim); + + virtual bool Merge(const Slice& key, const Slice* existing_value, + const Slice& value, std::string* new_value, + Logger* logger) const override; + + static const char* kClassName() { return "StringAppendOperator"; } + static const char* kNickName() { return "stringappend"; } + virtual const char* Name() const override { return kClassName(); } + virtual const char* NickName() const override { return kNickName(); } + + private: + std::string delim_; // The delimiter is inserted between elements +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/merge_operators/string_append/stringappend2.cc b/src/rocksdb/utilities/merge_operators/string_append/stringappend2.cc new file mode 100644 index 000000000..36cb9ee34 --- /dev/null +++ b/src/rocksdb/utilities/merge_operators/string_append/stringappend2.cc @@ -0,0 +1,132 @@ +/** + * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook + */ + +#include "stringappend2.h" + +#include + +#include +#include + +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" +#include "rocksdb/utilities/options_type.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +static std::unordered_map + stringappend2_merge_type_info = { +#ifndef ROCKSDB_LITE + {"delimiter", + {0, OptionType::kString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +#endif // ROCKSDB_LITE +}; +} // namespace + +// Constructor: also specify the delimiter character. +StringAppendTESTOperator::StringAppendTESTOperator(char delim_char) + : delim_(1, delim_char) { + RegisterOptions("Delimiter", &delim_, &stringappend2_merge_type_info); +} + +StringAppendTESTOperator::StringAppendTESTOperator(const std::string& delim) + : delim_(delim) { + RegisterOptions("Delimiter", &delim_, &stringappend2_merge_type_info); +} + +// Implementation for the merge operation (concatenates two strings) +bool StringAppendTESTOperator::FullMergeV2( + const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const { + // Clear the *new_value for writing. 
+ merge_out->new_value.clear(); + + if (merge_in.existing_value == nullptr && merge_in.operand_list.size() == 1) { + // Only one operand + merge_out->existing_operand = merge_in.operand_list.back(); + return true; + } + + // Compute the space needed for the final result. + size_t numBytes = 0; + + for (auto it = merge_in.operand_list.begin(); + it != merge_in.operand_list.end(); ++it) { + numBytes += it->size() + delim_.size(); + } + + // Only print the delimiter after the first entry has been printed + bool printDelim = false; + + // Prepend the *existing_value if one exists. + if (merge_in.existing_value) { + merge_out->new_value.reserve(numBytes + merge_in.existing_value->size()); + merge_out->new_value.append(merge_in.existing_value->data(), + merge_in.existing_value->size()); + printDelim = true; + } else if (numBytes) { + // Without the existing (initial) value, the delimiter before the first of + // subsequent operands becomes redundant. + merge_out->new_value.reserve(numBytes - delim_.size()); + } + + // Concatenate the sequence of strings (and add a delimiter between each) + for (auto it = merge_in.operand_list.begin(); + it != merge_in.operand_list.end(); ++it) { + if (printDelim) { + merge_out->new_value.append(delim_); + } + merge_out->new_value.append(it->data(), it->size()); + printDelim = true; + } + + return true; +} + +bool StringAppendTESTOperator::PartialMergeMulti( + const Slice& /*key*/, const std::deque& /*operand_list*/, + std::string* /*new_value*/, Logger* /*logger*/) const { + return false; +} + +// A version of PartialMerge that actually performs "partial merging". +// Use this to simulate the exact behaviour of the StringAppendOperator. +bool StringAppendTESTOperator::_AssocPartialMergeMulti( + const Slice& /*key*/, const std::deque& operand_list, + std::string* new_value, Logger* /*logger*/) const { + // Clear the *new_value for writing + assert(new_value); + new_value->clear(); + assert(operand_list.size() >= 2); + + // Generic append + // Determine and reserve correct size for *new_value. + size_t size = 0; + for (const auto& operand : operand_list) { + size += operand.size(); + } + size += (operand_list.size() - 1) * delim_.length(); // Delimiters + new_value->reserve(size); + + // Apply concatenation + new_value->assign(operand_list.front().data(), operand_list.front().size()); + + for (std::deque::const_iterator it = operand_list.begin() + 1; + it != operand_list.end(); ++it) { + new_value->append(delim_); + new_value->append(it->data(), it->size()); + } + + return true; +} + +std::shared_ptr +MergeOperators::CreateStringAppendTESTOperator() { + return std::make_shared(','); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/merge_operators/string_append/stringappend2.h b/src/rocksdb/utilities/merge_operators/string_append/stringappend2.h new file mode 100644 index 000000000..75389e4ae --- /dev/null +++ b/src/rocksdb/utilities/merge_operators/string_append/stringappend2.h @@ -0,0 +1,52 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +/** + * A TEST MergeOperator for rocksdb that implements string append. + * It is built using the MergeOperator interface rather than the simpler + * AssociativeMergeOperator interface. This is useful for testing/benchmarking. + * While the two operators are semantically the same, all production code + * should use the StringAppendOperator defined in stringappend.{h,cc}. The + * operator defined in the present file is primarily for testing. 
+ * + * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook + */ + +#pragma once +#include +#include + +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { + +class StringAppendTESTOperator : public MergeOperator { + public: + // Constructor with delimiter + explicit StringAppendTESTOperator(char delim_char); + explicit StringAppendTESTOperator(const std::string& delim); + + virtual bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override; + + virtual bool PartialMergeMulti(const Slice& key, + const std::deque& operand_list, + std::string* new_value, + Logger* logger) const override; + + static const char* kClassName() { return "StringAppendTESTOperator"; } + static const char* kNickName() { return "stringappendtest"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kNickName(); } + + private: + // A version of PartialMerge that actually performs "partial merging". + // Use this to simulate the exact behaviour of the StringAppendOperator. + bool _AssocPartialMergeMulti(const Slice& key, + const std::deque& operand_list, + std::string* new_value, Logger* logger) const; + + std::string delim_; // The delimiter is inserted between elements +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc b/src/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc new file mode 100644 index 000000000..22b6144af --- /dev/null +++ b/src/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc @@ -0,0 +1,640 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +/** + * An persistent map : key -> (list of strings), using rocksdb merge. + * This file is a test-harness / use-case for the StringAppendOperator. + * + * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook, Inc. 
+ */ + +#include "utilities/merge_operators/string_append/stringappend.h" + +#include +#include +#include + +#include "port/stack_trace.h" +#include "rocksdb/db.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/utilities/db_ttl.h" +#include "test_util/testharness.h" +#include "util/random.h" +#include "utilities/merge_operators.h" +#include "utilities/merge_operators/string_append/stringappend2.h" + +namespace ROCKSDB_NAMESPACE { + +// Path to the database on file system +const std::string kDbName = test::PerThreadDBPath("stringappend_test"); + +namespace { +// OpenDb opens a (possibly new) rocksdb database with a StringAppendOperator +std::shared_ptr OpenNormalDb(const std::string& delim) { + DB* db; + Options options; + options.create_if_missing = true; + MergeOperator* mergeOperator; + if (delim.size() == 1) { + mergeOperator = new StringAppendOperator(delim[0]); + } else { + mergeOperator = new StringAppendOperator(delim); + } + options.merge_operator.reset(mergeOperator); + EXPECT_OK(DB::Open(options, kDbName, &db)); + return std::shared_ptr(db); +} + +#ifndef ROCKSDB_LITE // TtlDb is not supported in Lite +// Open a TtlDB with a non-associative StringAppendTESTOperator +std::shared_ptr OpenTtlDb(const std::string& delim) { + DBWithTTL* db; + Options options; + options.create_if_missing = true; + MergeOperator* mergeOperator; + if (delim.size() == 1) { + mergeOperator = new StringAppendTESTOperator(delim[0]); + } else { + mergeOperator = new StringAppendTESTOperator(delim); + } + options.merge_operator.reset(mergeOperator); + EXPECT_OK(DBWithTTL::Open(options, kDbName, &db, 123456)); + return std::shared_ptr(db); +} +#endif // !ROCKSDB_LITE +} // namespace + +/// StringLists represents a set of string-lists, each with a key-index. +/// Supports Append(list, string) and Get(list) +class StringLists { + public: + // Constructor: specifies the rocksdb db + /* implicit */ + StringLists(std::shared_ptr db) + : db_(db), merge_option_(), get_option_() { + assert(db); + } + + // Append string val onto the list defined by key; return true on success + bool Append(const std::string& key, const std::string& val) { + Slice valSlice(val.data(), val.size()); + auto s = db_->Merge(merge_option_, key, valSlice); + + if (s.ok()) { + return true; + } else { + std::cerr << "ERROR " << s.ToString() << std::endl; + return false; + } + } + + // Returns the list of strings associated with key (or "" if does not exist) + bool Get(const std::string& key, std::string* const result) { + assert(result != nullptr); // we should have a place to store the result + auto s = db_->Get(get_option_, key, result); + + if (s.ok()) { + return true; + } + + // Either key does not exist, or there is some error. 
+ *result = ""; // Always return empty string (just for convention) + + // NotFound is okay; just return empty (similar to std::map) + // But network or db errors, etc, should fail the test (or at least yell) + if (!s.IsNotFound()) { + std::cerr << "ERROR " << s.ToString() << std::endl; + } + + // Always return false if s.ok() was not true + return false; + } + + private: + std::shared_ptr db_; + WriteOptions merge_option_; + ReadOptions get_option_; +}; + +// The class for unit-testing +class StringAppendOperatorTest : public testing::Test, + public ::testing::WithParamInterface { + public: + StringAppendOperatorTest() { + EXPECT_OK( + DestroyDB(kDbName, Options())); // Start each test with a fresh DB + } + + void SetUp() override { +#ifndef ROCKSDB_LITE // TtlDb is not supported in Lite + bool if_use_ttl = GetParam(); + if (if_use_ttl) { + fprintf(stderr, "Running tests with ttl db and generic operator.\n"); + StringAppendOperatorTest::SetOpenDbFunction(&OpenTtlDb); + return; + } +#endif // !ROCKSDB_LITE + fprintf(stderr, "Running tests with regular db and operator.\n"); + StringAppendOperatorTest::SetOpenDbFunction(&OpenNormalDb); + } + + using OpenFuncPtr = std::shared_ptr (*)(const std::string&); + + // Allows user to open databases with different configurations. + // e.g.: Can open a DB or a TtlDB, etc. + static void SetOpenDbFunction(OpenFuncPtr func) { OpenDb = func; } + + protected: + static OpenFuncPtr OpenDb; +}; +StringAppendOperatorTest::OpenFuncPtr StringAppendOperatorTest::OpenDb = + nullptr; + +// THE TEST CASES BEGIN HERE + +TEST_P(StringAppendOperatorTest, IteratorTest) { + auto db_ = OpenDb(","); + StringLists slists(db_); + + slists.Append("k1", "v1"); + slists.Append("k1", "v2"); + slists.Append("k1", "v3"); + + slists.Append("k2", "a1"); + slists.Append("k2", "a2"); + slists.Append("k2", "a3"); + + std::string res; + std::unique_ptr it( + db_->NewIterator(ReadOptions())); + std::string k1("k1"); + std::string k2("k2"); + bool first = true; + for (it->Seek(k1); it->Valid(); it->Next()) { + res = it->value().ToString(); + if (first) { + ASSERT_EQ(res, "v1,v2,v3"); + first = false; + } else { + ASSERT_EQ(res, "a1,a2,a3"); + } + } + slists.Append("k2", "a4"); + slists.Append("k1", "v4"); + + // Snapshot should still be the same. Should ignore a4 and v4. + first = true; + for (it->Seek(k1); it->Valid(); it->Next()) { + res = it->value().ToString(); + if (first) { + ASSERT_EQ(res, "v1,v2,v3"); + first = false; + } else { + ASSERT_EQ(res, "a1,a2,a3"); + } + } + + // Should release the snapshot and be aware of the new stuff now + it.reset(db_->NewIterator(ReadOptions())); + first = true; + for (it->Seek(k1); it->Valid(); it->Next()) { + res = it->value().ToString(); + if (first) { + ASSERT_EQ(res, "v1,v2,v3,v4"); + first = false; + } else { + ASSERT_EQ(res, "a1,a2,a3,a4"); + } + } + + // start from k2 this time. 
+ for (it->Seek(k2); it->Valid(); it->Next()) { + res = it->value().ToString(); + if (first) { + ASSERT_EQ(res, "v1,v2,v3,v4"); + first = false; + } else { + ASSERT_EQ(res, "a1,a2,a3,a4"); + } + } + + slists.Append("k3", "g1"); + + it.reset(db_->NewIterator(ReadOptions())); + first = true; + std::string k3("k3"); + for (it->Seek(k2); it->Valid(); it->Next()) { + res = it->value().ToString(); + if (first) { + ASSERT_EQ(res, "a1,a2,a3,a4"); + first = false; + } else { + ASSERT_EQ(res, "g1"); + } + } + for (it->Seek(k3); it->Valid(); it->Next()) { + res = it->value().ToString(); + if (first) { + // should not be hit + ASSERT_EQ(res, "a1,a2,a3,a4"); + first = false; + } else { + ASSERT_EQ(res, "g1"); + } + } +} + +TEST_P(StringAppendOperatorTest, SimpleTest) { + auto db = OpenDb(","); + StringLists slists(db); + + slists.Append("k1", "v1"); + slists.Append("k1", "v2"); + slists.Append("k1", "v3"); + + std::string res; + ASSERT_TRUE(slists.Get("k1", &res)); + ASSERT_EQ(res, "v1,v2,v3"); +} + +TEST_P(StringAppendOperatorTest, SimpleDelimiterTest) { + auto db = OpenDb("|"); + StringLists slists(db); + + slists.Append("k1", "v1"); + slists.Append("k1", "v2"); + slists.Append("k1", "v3"); + + std::string res; + ASSERT_TRUE(slists.Get("k1", &res)); + ASSERT_EQ(res, "v1|v2|v3"); +} + +TEST_P(StringAppendOperatorTest, EmptyDelimiterTest) { + auto db = OpenDb(""); + StringLists slists(db); + + slists.Append("k1", "v1"); + slists.Append("k1", "v2"); + slists.Append("k1", "v3"); + + std::string res; + ASSERT_TRUE(slists.Get("k1", &res)); + ASSERT_EQ(res, "v1v2v3"); +} + +TEST_P(StringAppendOperatorTest, MultiCharDelimiterTest) { + auto db = OpenDb("<>"); + StringLists slists(db); + + slists.Append("k1", "v1"); + slists.Append("k1", "v2"); + slists.Append("k1", "v3"); + + std::string res; + ASSERT_TRUE(slists.Get("k1", &res)); + ASSERT_EQ(res, "v1<>v2<>v3"); +} + +TEST_P(StringAppendOperatorTest, DelimiterIsDefensivelyCopiedTest) { + std::string delimiter = "<>"; + auto db = OpenDb(delimiter); + StringLists slists(db); + + slists.Append("k1", "v1"); + slists.Append("k1", "v2"); + delimiter.clear(); + slists.Append("k1", "v3"); + + std::string res; + ASSERT_TRUE(slists.Get("k1", &res)); + ASSERT_EQ(res, "v1<>v2<>v3"); +} + +TEST_P(StringAppendOperatorTest, OneValueNoDelimiterTest) { + auto db = OpenDb("!"); + StringLists slists(db); + + slists.Append("random_key", "single_val"); + + std::string res; + ASSERT_TRUE(slists.Get("random_key", &res)); + ASSERT_EQ(res, "single_val"); +} + +TEST_P(StringAppendOperatorTest, VariousKeys) { + auto db = OpenDb("\n"); + StringLists slists(db); + + slists.Append("c", "asdasd"); + slists.Append("a", "x"); + slists.Append("b", "y"); + slists.Append("a", "t"); + slists.Append("a", "r"); + slists.Append("b", "2"); + slists.Append("c", "asdasd"); + + std::string a, b, c; + bool sa, sb, sc; + sa = slists.Get("a", &a); + sb = slists.Get("b", &b); + sc = slists.Get("c", &c); + + ASSERT_TRUE(sa && sb && sc); // All three keys should have been found + + ASSERT_EQ(a, "x\nt\nr"); + ASSERT_EQ(b, "y\n2"); + ASSERT_EQ(c, "asdasd\nasdasd"); +} + +// Generate semi random keys/words from a small distribution. 
+TEST_P(StringAppendOperatorTest, RandomMixGetAppend) { + auto db = OpenDb(" "); + StringLists slists(db); + + // Generate a list of random keys and values + const int kWordCount = 15; + std::string words[] = {"sdasd", "triejf", "fnjsdfn", "dfjisdfsf", + "342839", "dsuha", "mabuais", "sadajsid", + "jf9834hf", "2d9j89", "dj9823jd", "a", + "dk02ed2dh", "$(jd4h984$(*", "mabz"}; + const int kKeyCount = 6; + std::string keys[] = {"dhaiusdhu", "denidw", "daisda", + "keykey", "muki", "shzassdianmd"}; + + // Will store a local copy of all data in order to verify correctness + std::map parallel_copy; + + // Generate a bunch of random queries (Append and Get)! + enum query_t { APPEND_OP, GET_OP, NUM_OPS }; + Random randomGen(1337); // deterministic seed; always get same results! + + const int kNumQueries = 30; + for (int q = 0; q < kNumQueries; ++q) { + // Generate a random query (Append or Get) and random parameters + query_t query = (query_t)randomGen.Uniform((int)NUM_OPS); + std::string key = keys[randomGen.Uniform((int)kKeyCount)]; + std::string word = words[randomGen.Uniform((int)kWordCount)]; + + // Apply the query and any checks. + if (query == APPEND_OP) { + // Apply the rocksdb test-harness Append defined above + slists.Append(key, word); // apply the rocksdb append + + // Apply the similar "Append" to the parallel copy + if (parallel_copy[key].size() > 0) { + parallel_copy[key] += " " + word; + } else { + parallel_copy[key] = word; + } + + } else if (query == GET_OP) { + // Assumes that a non-existent key just returns + std::string res; + slists.Get(key, &res); + ASSERT_EQ(res, parallel_copy[key]); + } + } +} + +TEST_P(StringAppendOperatorTest, BIGRandomMixGetAppend) { + auto db = OpenDb(" "); + StringLists slists(db); + + // Generate a list of random keys and values + const int kWordCount = 15; + std::string words[] = {"sdasd", "triejf", "fnjsdfn", "dfjisdfsf", + "342839", "dsuha", "mabuais", "sadajsid", + "jf9834hf", "2d9j89", "dj9823jd", "a", + "dk02ed2dh", "$(jd4h984$(*", "mabz"}; + const int kKeyCount = 6; + std::string keys[] = {"dhaiusdhu", "denidw", "daisda", + "keykey", "muki", "shzassdianmd"}; + + // Will store a local copy of all data in order to verify correctness + std::map parallel_copy; + + // Generate a bunch of random queries (Append and Get)! + enum query_t { APPEND_OP, GET_OP, NUM_OPS }; + Random randomGen(9138204); // deterministic seed + + const int kNumQueries = 1000; + for (int q = 0; q < kNumQueries; ++q) { + // Generate a random query (Append or Get) and random parameters + query_t query = (query_t)randomGen.Uniform((int)NUM_OPS); + std::string key = keys[randomGen.Uniform((int)kKeyCount)]; + std::string word = words[randomGen.Uniform((int)kWordCount)]; + + // Apply the query and any checks. 
+ if (query == APPEND_OP) { + // Apply the rocksdb test-harness Append defined above + slists.Append(key, word); // apply the rocksdb append + + // Apply the similar "Append" to the parallel copy + if (parallel_copy[key].size() > 0) { + parallel_copy[key] += " " + word; + } else { + parallel_copy[key] = word; + } + + } else if (query == GET_OP) { + // Assumes that a non-existent key just returns + std::string res; + slists.Get(key, &res); + ASSERT_EQ(res, parallel_copy[key]); + } + } +} + +TEST_P(StringAppendOperatorTest, PersistentVariousKeys) { + // Perform the following operations in limited scope + { + auto db = OpenDb("\n"); + StringLists slists(db); + + slists.Append("c", "asdasd"); + slists.Append("a", "x"); + slists.Append("b", "y"); + slists.Append("a", "t"); + slists.Append("a", "r"); + slists.Append("b", "2"); + slists.Append("c", "asdasd"); + + std::string a, b, c; + ASSERT_TRUE(slists.Get("a", &a)); + ASSERT_TRUE(slists.Get("b", &b)); + ASSERT_TRUE(slists.Get("c", &c)); + + ASSERT_EQ(a, "x\nt\nr"); + ASSERT_EQ(b, "y\n2"); + ASSERT_EQ(c, "asdasd\nasdasd"); + } + + // Reopen the database (the previous changes should persist / be remembered) + { + auto db = OpenDb("\n"); + StringLists slists(db); + + slists.Append("c", "bbnagnagsx"); + slists.Append("a", "sa"); + slists.Append("b", "df"); + slists.Append("a", "gh"); + slists.Append("a", "jk"); + slists.Append("b", "l;"); + slists.Append("c", "rogosh"); + + // The previous changes should be on disk (L0) + // The most recent changes should be in memory (MemTable) + // Hence, this will test both Get() paths. + std::string a, b, c; + ASSERT_TRUE(slists.Get("a", &a)); + ASSERT_TRUE(slists.Get("b", &b)); + ASSERT_TRUE(slists.Get("c", &c)); + + ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk"); + ASSERT_EQ(b, "y\n2\ndf\nl;"); + ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh"); + } + + // Reopen the database (the previous changes should persist / be remembered) + { + auto db = OpenDb("\n"); + StringLists slists(db); + + // All changes should be on disk. 
This will test VersionSet Get() + std::string a, b, c; + ASSERT_TRUE(slists.Get("a", &a)); + ASSERT_TRUE(slists.Get("b", &b)); + ASSERT_TRUE(slists.Get("c", &c)); + + ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk"); + ASSERT_EQ(b, "y\n2\ndf\nl;"); + ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh"); + } +} + +TEST_P(StringAppendOperatorTest, PersistentFlushAndCompaction) { + // Perform the following operations in limited scope + { + auto db = OpenDb("\n"); + StringLists slists(db); + std::string a, b, c; + + // Append, Flush, Get + slists.Append("c", "asdasd"); + ASSERT_OK(db->Flush(ROCKSDB_NAMESPACE::FlushOptions())); + ASSERT_TRUE(slists.Get("c", &c)); + ASSERT_EQ(c, "asdasd"); + + // Append, Flush, Append, Get + slists.Append("a", "x"); + slists.Append("b", "y"); + ASSERT_OK(db->Flush(ROCKSDB_NAMESPACE::FlushOptions())); + slists.Append("a", "t"); + slists.Append("a", "r"); + slists.Append("b", "2"); + + ASSERT_TRUE(slists.Get("a", &a)); + ASSERT_EQ(a, "x\nt\nr"); + + ASSERT_TRUE(slists.Get("b", &b)); + ASSERT_EQ(b, "y\n2"); + + // Append, Get + ASSERT_TRUE(slists.Append("c", "asdasd")); + ASSERT_TRUE(slists.Append("b", "monkey")); + + ASSERT_TRUE(slists.Get("a", &a)); + ASSERT_TRUE(slists.Get("b", &b)); + ASSERT_TRUE(slists.Get("c", &c)); + + ASSERT_EQ(a, "x\nt\nr"); + ASSERT_EQ(b, "y\n2\nmonkey"); + ASSERT_EQ(c, "asdasd\nasdasd"); + } + + // Reopen the database (the previous changes should persist / be remembered) + { + auto db = OpenDb("\n"); + StringLists slists(db); + std::string a, b, c; + + // Get (Quick check for persistence of previous database) + ASSERT_TRUE(slists.Get("a", &a)); + ASSERT_EQ(a, "x\nt\nr"); + + // Append, Compact, Get + slists.Append("c", "bbnagnagsx"); + slists.Append("a", "sa"); + slists.Append("b", "df"); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_TRUE(slists.Get("a", &a)); + ASSERT_TRUE(slists.Get("b", &b)); + ASSERT_TRUE(slists.Get("c", &c)); + ASSERT_EQ(a, "x\nt\nr\nsa"); + ASSERT_EQ(b, "y\n2\nmonkey\ndf"); + ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx"); + + // Append, Get + slists.Append("a", "gh"); + slists.Append("a", "jk"); + slists.Append("b", "l;"); + slists.Append("c", "rogosh"); + ASSERT_TRUE(slists.Get("a", &a)); + ASSERT_TRUE(slists.Get("b", &b)); + ASSERT_TRUE(slists.Get("c", &c)); + ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk"); + ASSERT_EQ(b, "y\n2\nmonkey\ndf\nl;"); + ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh"); + + // Compact, Get + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk"); + ASSERT_EQ(b, "y\n2\nmonkey\ndf\nl;"); + ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh"); + + // Append, Flush, Compact, Get + slists.Append("b", "afcg"); + ASSERT_OK(db->Flush(ROCKSDB_NAMESPACE::FlushOptions())); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_TRUE(slists.Get("b", &b)); + ASSERT_EQ(b, "y\n2\nmonkey\ndf\nl;\nafcg"); + } +} + +TEST_P(StringAppendOperatorTest, SimpleTestNullDelimiter) { + auto db = OpenDb(std::string(1, '\0')); + StringLists slists(db); + + slists.Append("k1", "v1"); + slists.Append("k1", "v2"); + slists.Append("k1", "v3"); + + std::string res; + ASSERT_TRUE(slists.Get("k1", &res)); + + // Construct the desired string. Default constructor doesn't like '\0' chars. + std::string checker("v1,v2,v3"); // Verify that the string is right size. + checker[2] = '\0'; // Use null delimiter instead of comma. 
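  // Editorial note: indices 2 and 5 are the comma positions of "v1,v2,v3";
  // overwriting them in place yields "v1\0v2\0v3" while keeping the length at
  // 8, something a string literal containing '\0' could not do, since the
  // const char* constructor would truncate at the first NUL.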
+ checker[5] = '\0'; + ASSERT_EQ(checker.size(), 8); // Verify it is still the correct size + + // Check that the rocksdb result string matches the desired string + ASSERT_EQ(res.size(), checker.size()); + ASSERT_EQ(res, checker); +} + +INSTANTIATE_TEST_CASE_P(StringAppendOperatorTest, StringAppendOperatorTest, + testing::Bool()); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/utilities/merge_operators/uint64add.cc b/src/rocksdb/utilities/merge_operators/uint64add.cc new file mode 100644 index 000000000..5be2f5641 --- /dev/null +++ b/src/rocksdb/utilities/merge_operators/uint64add.cc @@ -0,0 +1,75 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include + +#include "logging/logging.h" +#include "rocksdb/env.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" +#include "util/coding.h" +#include "utilities/merge_operators.h" + +namespace { // anonymous namespace + +using ROCKSDB_NAMESPACE::AssociativeMergeOperator; +using ROCKSDB_NAMESPACE::InfoLogLevel; +using ROCKSDB_NAMESPACE::Logger; +using ROCKSDB_NAMESPACE::Slice; + +// A 'model' merge operator with uint64 addition semantics +// Implemented as an AssociativeMergeOperator for simplicity and example. +class UInt64AddOperator : public AssociativeMergeOperator { + public: + bool Merge(const Slice& /*key*/, const Slice* existing_value, + const Slice& value, std::string* new_value, + Logger* logger) const override { + uint64_t orig_value = 0; + if (existing_value) { + orig_value = DecodeInteger(*existing_value, logger); + } + uint64_t operand = DecodeInteger(value, logger); + + assert(new_value); + new_value->clear(); + ROCKSDB_NAMESPACE::PutFixed64(new_value, orig_value + operand); + + return true; // Return true always since corruption will be treated as 0 + } + + static const char* kClassName() { return "UInt64AddOperator"; } + static const char* kNickName() { return "uint64add"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kNickName(); } + + private: + // Takes the string and decodes it into a uint64_t + // On error, prints a message and returns 0 + uint64_t DecodeInteger(const Slice& value, Logger* logger) const { + uint64_t result = 0; + + if (value.size() == sizeof(uint64_t)) { + result = ROCKSDB_NAMESPACE::DecodeFixed64(value.data()); + } else if (logger != nullptr) { + // If value is corrupted, treat it as 0 + ROCKS_LOG_ERROR(logger, + "uint64 value corruption, size: %" ROCKSDB_PRIszt + " > %" ROCKSDB_PRIszt, + value.size(), sizeof(uint64_t)); + } + + return result; + } +}; + +} // anonymous namespace + +namespace ROCKSDB_NAMESPACE { + +std::shared_ptr MergeOperators::CreateUInt64AddOperator() { + return std::make_shared(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/object_registry.cc b/src/rocksdb/utilities/object_registry.cc new file mode 100644 index 000000000..18834783d --- /dev/null +++ b/src/rocksdb/utilities/object_registry.cc @@ -0,0 +1,383 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/utilities/object_registry.h" + +#include + +#include "logging/logging.h" +#include "port/lang.h" +#include "rocksdb/customizable.h" +#include "rocksdb/env.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +#ifndef ROCKSDB_LITE +namespace { +bool MatchesInteger(const std::string &target, size_t start, size_t pos) { + // If it is numeric, everything up to the match must be a number + int digits = 0; + if (target[start] == '-') { + start++; // Allow negative numbers + } + while (start < pos) { + if (!isdigit(target[start++])) { + return false; + } else { + digits++; + } + } + return (digits > 0); +} + +bool MatchesDecimal(const std::string &target, size_t start, size_t pos) { + int digits = 0; + if (target[start] == '-') { + start++; // Allow negative numbers + } + for (bool point = false; start < pos; start++) { + if (target[start] == '.') { + if (point) { + return false; + } else { + point = true; + } + } else if (!isdigit(target[start])) { + return false; + } else { + digits++; + } + } + return (digits > 0); +} +} // namespace + +size_t ObjectLibrary::PatternEntry::MatchSeparatorAt( + size_t start, Quantifier mode, const std::string &target, size_t tlen, + const std::string &separator) const { + size_t slen = separator.size(); + // See if there is enough space. If so, find the separator + if (tlen < start + slen) { + return std::string::npos; // not enough space left + } else if (mode == kMatchExact) { + // Exact mode means the next thing we are looking for is the separator + if (target.compare(start, slen, separator) != 0) { + return std::string::npos; + } else { + return start + slen; // Found the separator, return where we found it + } + } else { + auto pos = start + 1; + if (!separator.empty()) { + pos = target.find(separator, pos); + } + if (pos == std::string::npos) { + return pos; + } else if (mode == kMatchInteger) { + if (!MatchesInteger(target, start, pos)) { + return std::string::npos; + } + } else if (mode == kMatchDecimal) { + if (!MatchesDecimal(target, start, pos)) { + return std::string::npos; + } + } + return pos + slen; + } +} + +bool ObjectLibrary::PatternEntry::MatchesTarget(const std::string &name, + size_t nlen, + const std::string &target, + size_t tlen) const { + if (separators_.empty()) { + assert(optional_); // If there are no separators, it must be only a name + return nlen == tlen && name == target; + } else if (nlen == tlen) { // The lengths are the same + return optional_ && name == target; + } else if (tlen < nlen + slength_) { + // The target is not long enough + return false; + } else if (target.compare(0, nlen, name) != 0) { + return false; // Target does not start with name + } else { + // Loop through all of the separators one at a time matching them. + // Note that we first match the separator and then its quantifiers. + // Since we expect the separator first, we start with an exact match + // Subsequent matches will use the quantifier of the previous separator + size_t start = nlen; + auto mode = kMatchExact; + for (size_t idx = 0; idx < separators_.size(); ++idx) { + const auto &separator = separators_[idx]; + start = MatchSeparatorAt(start, mode, target, tlen, separator.first); + if (start == std::string::npos) { + return false; + } else { + mode = separator.second; + } + } + // We have matched all of the separators. 
Now check that what is left + // unmatched in the target is acceptable. + if (mode == kMatchExact) { + return (start == tlen); + } else if (start > tlen || (start == tlen && mode != kMatchZeroOrMore)) { + return false; + } else if (mode == kMatchInteger) { + return MatchesInteger(target, start, tlen); + } else if (mode == kMatchDecimal) { + return MatchesDecimal(target, start, tlen); + } + } + return true; +} + +bool ObjectLibrary::PatternEntry::Matches(const std::string &target) const { + auto tlen = target.size(); + if (MatchesTarget(name_, nlength_, target, tlen)) { + return true; + } else if (!names_.empty()) { + for (const auto &alt : names_) { + if (MatchesTarget(alt, alt.size(), target, tlen)) { + return true; + } + } + } + return false; +} + +size_t ObjectLibrary::GetFactoryCount(size_t *types) const { + std::unique_lock lock(mu_); + *types = factories_.size(); + size_t factories = 0; + for (const auto &e : factories_) { + factories += e.second.size(); + } + return factories; +} + +size_t ObjectLibrary::GetFactoryCount(const std::string &type) const { + std::unique_lock lock(mu_); + auto iter = factories_.find(type); + if (iter != factories_.end()) { + return iter->second.size(); + } else { + return 0; + } +} + +void ObjectLibrary::GetFactoryNames(const std::string &type, + std::vector *names) const { + assert(names); + std::unique_lock lock(mu_); + auto iter = factories_.find(type); + if (iter != factories_.end()) { + for (const auto &f : iter->second) { + names->push_back(f->Name()); + } + } +} + +void ObjectLibrary::GetFactoryTypes( + std::unordered_set *types) const { + assert(types); + std::unique_lock lock(mu_); + for (const auto &iter : factories_) { + types->insert(iter.first); + } +} + +void ObjectLibrary::Dump(Logger *logger) const { + std::unique_lock lock(mu_); + if (logger != nullptr && !factories_.empty()) { + ROCKS_LOG_HEADER(logger, " Registered Library: %s\n", id_.c_str()); + for (const auto &iter : factories_) { + ROCKS_LOG_HEADER(logger, " Registered factories for type[%s] ", + iter.first.c_str()); + bool printed_one = false; + for (const auto &e : iter.second) { + ROCKS_LOG_HEADER(logger, "%c %s", (printed_one) ? ',' : ':', e->Name()); + printed_one = true; + } + } + } +} + +// Returns the Default singleton instance of the ObjectLibrary +// This instance will contain most of the "standard" registered objects +std::shared_ptr &ObjectLibrary::Default() { + // Use avoid destruction here so the default ObjectLibrary will not be + // statically destroyed and long-lived. + STATIC_AVOID_DESTRUCTION(std::shared_ptr, instance) + (std::make_shared("default")); + return instance; +} + +ObjectRegistry::ObjectRegistry(const std::shared_ptr &library) { + libraries_.push_back(library); + for (const auto &b : builtins_) { + RegisterPlugin(b.first, b.second); + } +} + +std::shared_ptr ObjectRegistry::Default() { + // Use avoid destruction here so the default ObjectRegistry will not be + // statically destroyed and long-lived. 
+ STATIC_AVOID_DESTRUCTION(std::shared_ptr, instance) + (std::make_shared(ObjectLibrary::Default())); + return instance; +} + +std::shared_ptr ObjectRegistry::NewInstance() { + return std::make_shared(Default()); +} + +std::shared_ptr ObjectRegistry::NewInstance( + const std::shared_ptr &parent) { + return std::make_shared(parent); +} + +Status ObjectRegistry::SetManagedObject( + const std::string &type, const std::string &id, + const std::shared_ptr &object) { + std::string object_key = ToManagedObjectKey(type, id); + std::shared_ptr curr; + if (parent_ != nullptr) { + curr = parent_->GetManagedObject(type, id); + } + if (curr == nullptr) { + // We did not find the object in any parent. Update in the current + std::unique_lock lock(objects_mutex_); + auto iter = managed_objects_.find(object_key); + if (iter != managed_objects_.end()) { // The object exists + curr = iter->second.lock(); + if (curr != nullptr && curr != object) { + return Status::InvalidArgument("Object already exists: ", object_key); + } else { + iter->second = object; + } + } else { + // The object does not exist. Add it + managed_objects_[object_key] = object; + } + } else if (curr != object) { + return Status::InvalidArgument("Object already exists: ", object_key); + } + return Status::OK(); +} + +std::shared_ptr ObjectRegistry::GetManagedObject( + const std::string &type, const std::string &id) const { + { + std::unique_lock lock(objects_mutex_); + auto iter = managed_objects_.find(ToManagedObjectKey(type, id)); + if (iter != managed_objects_.end()) { + return iter->second.lock(); + } + } + if (parent_ != nullptr) { + return parent_->GetManagedObject(type, id); + } else { + return nullptr; + } +} + +Status ObjectRegistry::ListManagedObjects( + const std::string &type, const std::string &name, + std::vector> *results) const { + { + std::string key = ToManagedObjectKey(type, name); + std::unique_lock lock(objects_mutex_); + for (auto iter = managed_objects_.lower_bound(key); + iter != managed_objects_.end() && StartsWith(iter->first, key); + ++iter) { + auto shared = iter->second.lock(); + if (shared != nullptr) { + if (name.empty() || shared->IsInstanceOf(name)) { + results->emplace_back(shared); + } + } + } + } + if (parent_ != nullptr) { + return parent_->ListManagedObjects(type, name, results); + } else { + return Status::OK(); + } +} + +// Returns the number of registered types for this registry. +// If specified (not-null), types is updated to include the names of the +// registered types. 
+size_t ObjectRegistry::GetFactoryCount(const std::string &type) const { + size_t count = 0; + if (parent_ != nullptr) { + count = parent_->GetFactoryCount(type); + } + std::unique_lock lock(library_mutex_); + for (const auto &library : libraries_) { + count += library->GetFactoryCount(type); + } + return count; +} + +void ObjectRegistry::GetFactoryNames(const std::string &type, + std::vector *names) const { + assert(names); + names->clear(); + if (parent_ != nullptr) { + parent_->GetFactoryNames(type, names); + } + std::unique_lock lock(library_mutex_); + for (const auto &library : libraries_) { + library->GetFactoryNames(type, names); + } +} + +void ObjectRegistry::GetFactoryTypes( + std::unordered_set *types) const { + assert(types); + if (parent_ != nullptr) { + parent_->GetFactoryTypes(types); + } + std::unique_lock lock(library_mutex_); + for (const auto &library : libraries_) { + library->GetFactoryTypes(types); + } +} + +void ObjectRegistry::Dump(Logger *logger) const { + if (logger != nullptr) { + std::unique_lock lock(library_mutex_); + if (!plugins_.empty()) { + ROCKS_LOG_HEADER(logger, " Registered Plugins:"); + bool printed_one = false; + for (const auto &plugin : plugins_) { + ROCKS_LOG_HEADER(logger, "%s%s", (printed_one) ? ", " : " ", + plugin.c_str()); + printed_one = true; + } + ROCKS_LOG_HEADER(logger, "\n"); + } + for (auto iter = libraries_.crbegin(); iter != libraries_.crend(); ++iter) { + iter->get()->Dump(logger); + } + } + if (parent_ != nullptr) { + parent_->Dump(logger); + } +} + +int ObjectRegistry::RegisterPlugin(const std::string &name, + const RegistrarFunc &func) { + if (!name.empty() && func != nullptr) { + plugins_.push_back(name); + return AddLibrary(name)->Register(func, name); + } else { + return -1; + } +} + +#endif // ROCKSDB_LITE +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/object_registry_test.cc b/src/rocksdb/utilities/object_registry_test.cc new file mode 100644 index 000000000..90cd155ee --- /dev/null +++ b/src/rocksdb/utilities/object_registry_test.cc @@ -0,0 +1,872 @@ +// Copyright (c) 2016-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
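Editorial note: the tests in the file below exercise the ObjectLibrary / ObjectRegistry code above. As a quick orientation, here is a minimal, hedged sketch of the usual flow: register a pattern-based factory in a library, then resolve an id through a registry. The function name "DemoRegistryUsage", the "demo" name and the "demo://" prefix are illustrative only and are not part of the patch.

#include <cassert>
#include <memory>
#include <string>

#include "rocksdb/env.h"
#include "rocksdb/utilities/object_registry.h"

static void DemoRegistryUsage() {
  using ROCKSDB_NAMESPACE::Env;
  using ROCKSDB_NAMESPACE::ObjectLibrary;
  using ROCKSDB_NAMESPACE::ObjectRegistry;

  // Register a factory whose id matches "demo://<anything>".
  ObjectLibrary::Default()->AddFactory<Env>(
      ObjectLibrary::PatternEntry("demo", false).AddSeparator("://"),
      [](const std::string& /*uri*/, std::unique_ptr<Env>* /*guard*/,
         std::string* /*errmsg*/) { return Env::Default(); });

  // Resolution walks the registry's libraries (and any parent registry)
  // until a pattern matches; an unguarded factory like this one is returned
  // as a raw pointer via NewStaticObject.
  Env* env = nullptr;
  auto registry = ObjectRegistry::NewInstance();
  ROCKSDB_NAMESPACE::Status s =
      registry->NewStaticObject<Env>("demo://anything", &env);
  assert(s.ok() && env != nullptr);
  (void)s;
  (void)env;
}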
+ +#ifndef ROCKSDB_LITE + +#include "rocksdb/utilities/object_registry.h" + +#include "rocksdb/convenience.h" +#include "rocksdb/customizable.h" +#include "test_util/testharness.h" + +namespace ROCKSDB_NAMESPACE { + +class ObjRegistryTest : public testing::Test { + public: + static int num_a, num_b; +}; + +int ObjRegistryTest::num_a = 0; +int ObjRegistryTest::num_b = 0; +static FactoryFunc test_reg_a = ObjectLibrary::Default()->AddFactory( + ObjectLibrary::PatternEntry("a", false).AddSeparator("://"), + [](const std::string& /*uri*/, std::unique_ptr* /*env_guard*/, + std::string* /* errmsg */) { + ++ObjRegistryTest::num_a; + return Env::Default(); + }); + +class WrappedEnv : public EnvWrapper { + private: + std::string id_; + + public: + WrappedEnv(Env* t, const std::string& id) : EnvWrapper(t), id_(id) {} + const char* Name() const override { return id_.c_str(); } + std::string GetId() const override { return id_; } +}; +static FactoryFunc test_reg_b = ObjectLibrary::Default()->AddFactory( + ObjectLibrary::PatternEntry("b", false).AddSeparator("://"), + [](const std::string& uri, std::unique_ptr* env_guard, + std::string* /* errmsg */) { + ++ObjRegistryTest::num_b; + // Env::Default() is a singleton so we can't grant ownership directly to + // the caller - we must wrap it first. + env_guard->reset(new WrappedEnv(Env::Default(), uri)); + return env_guard->get(); + }); + +TEST_F(ObjRegistryTest, Basics) { + std::string msg; + std::unique_ptr guard; + Env* a_env = nullptr; + + auto registry = ObjectRegistry::NewInstance(); + ASSERT_NOK(registry->NewStaticObject("c://test", &a_env)); + ASSERT_NOK(registry->NewUniqueObject("c://test", &guard)); + ASSERT_EQ(a_env, nullptr); + ASSERT_EQ(guard, nullptr); + ASSERT_EQ(0, num_a); + ASSERT_EQ(0, num_b); + + ASSERT_OK(registry->NewStaticObject("a://test", &a_env)); + ASSERT_NE(a_env, nullptr); + ASSERT_EQ(1, num_a); + ASSERT_EQ(0, num_b); + + ASSERT_OK(registry->NewUniqueObject("b://test", &guard)); + ASSERT_NE(guard, nullptr); + ASSERT_EQ(1, num_a); + ASSERT_EQ(1, num_b); + + Env* b_env = nullptr; + ASSERT_NOK(registry->NewStaticObject("b://test", &b_env)); + ASSERT_EQ(b_env, nullptr); + ASSERT_EQ(1, num_a); + ASSERT_EQ(2, num_b); // Created but rejected as not static + + b_env = a_env; + ASSERT_NOK(registry->NewStaticObject("b://test", &b_env)); + ASSERT_EQ(b_env, a_env); + ASSERT_EQ(1, num_a); + ASSERT_EQ(3, num_b); + + b_env = guard.get(); + ASSERT_NOK(registry->NewUniqueObject("a://test", &guard)); + ASSERT_EQ(guard.get(), b_env); // Unchanged + ASSERT_EQ(2, num_a); // Created one but rejected it as not unique + ASSERT_EQ(3, num_b); +} + +TEST_F(ObjRegistryTest, LocalRegistry) { + Env* env = nullptr; + auto registry = ObjectRegistry::NewInstance(); + std::shared_ptr library = + std::make_shared("local"); + registry->AddLibrary(library); + library->AddFactory( + "test-local", + [](const std::string& /*uri*/, std::unique_ptr* /*guard */, + std::string* /* errmsg */) { return Env::Default(); }); + + ObjectLibrary::Default()->AddFactory( + "test-global", + [](const std::string& /*uri*/, std::unique_ptr* /*guard */, + std::string* /* errmsg */) { return Env::Default(); }); + + ASSERT_NOK( + ObjectRegistry::NewInstance()->NewStaticObject("test-local", &env)); + ASSERT_EQ(env, nullptr); + ASSERT_OK( + ObjectRegistry::NewInstance()->NewStaticObject("test-global", &env)); + ASSERT_NE(env, nullptr); + ASSERT_OK(registry->NewStaticObject("test-local", &env)); + ASSERT_NE(env, nullptr); + ASSERT_OK(registry->NewStaticObject("test-global", &env)); + 
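  // Editorial note: "test-local" was added only to the library attached to
  // this registry instance, so a fresh ObjectRegistry::NewInstance() cannot
  // resolve it, while "test-global" went into ObjectLibrary::Default() and is
  // therefore visible to every registry.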
ASSERT_NE(env, nullptr); +} + +static int RegisterTestUnguarded(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory( + "unguarded", + [](const std::string& /*uri*/, std::unique_ptr* /*guard */, + std::string* /* errmsg */) { return Env::Default(); }); + library.AddFactory( + "guarded", [](const std::string& uri, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new WrappedEnv(Env::Default(), uri)); + return guard->get(); + }); + return 2; +} + +TEST_F(ObjRegistryTest, CheckShared) { + std::shared_ptr shared; + std::shared_ptr registry = ObjectRegistry::NewInstance(); + registry->AddLibrary("shared", RegisterTestUnguarded, ""); + + ASSERT_OK(registry->NewSharedObject("guarded", &shared)); + ASSERT_NE(shared, nullptr); + shared.reset(); + ASSERT_NOK(registry->NewSharedObject("unguarded", &shared)); + ASSERT_EQ(shared, nullptr); +} + +TEST_F(ObjRegistryTest, CheckStatic) { + Env* env = nullptr; + std::shared_ptr registry = ObjectRegistry::NewInstance(); + registry->AddLibrary("static", RegisterTestUnguarded, ""); + + ASSERT_NOK(registry->NewStaticObject("guarded", &env)); + ASSERT_EQ(env, nullptr); + env = nullptr; + ASSERT_OK(registry->NewStaticObject("unguarded", &env)); + ASSERT_NE(env, nullptr); +} + +TEST_F(ObjRegistryTest, CheckUnique) { + std::unique_ptr unique; + std::shared_ptr registry = ObjectRegistry::NewInstance(); + registry->AddLibrary("unique", RegisterTestUnguarded, ""); + + ASSERT_OK(registry->NewUniqueObject("guarded", &unique)); + ASSERT_NE(unique, nullptr); + unique.reset(); + ASSERT_NOK(registry->NewUniqueObject("unguarded", &unique)); + ASSERT_EQ(unique, nullptr); +} + +TEST_F(ObjRegistryTest, FailingFactory) { + std::shared_ptr registry = ObjectRegistry::NewInstance(); + std::shared_ptr library = + std::make_shared("failing"); + registry->AddLibrary(library); + library->AddFactory( + "failing", [](const std::string& /*uri*/, + std::unique_ptr* /*guard */, std::string* errmsg) { + *errmsg = "Bad Factory"; + return nullptr; + }); + std::unique_ptr unique; + std::shared_ptr shared; + Env* pointer = nullptr; + Status s; + s = registry->NewUniqueObject("failing", &unique); + ASSERT_TRUE(s.IsInvalidArgument()); + s = registry->NewSharedObject("failing", &shared); + ASSERT_TRUE(s.IsInvalidArgument()); + s = registry->NewStaticObject("failing", &pointer); + ASSERT_TRUE(s.IsInvalidArgument()); + + s = registry->NewUniqueObject("missing", &unique); + ASSERT_TRUE(s.IsNotSupported()); + s = registry->NewSharedObject("missing", &shared); + ASSERT_TRUE(s.IsNotSupported()); + s = registry->NewStaticObject("missing", &pointer); + ASSERT_TRUE(s.IsNotSupported()); +} + +TEST_F(ObjRegistryTest, TestRegistryParents) { + auto grand = ObjectRegistry::Default(); + auto parent = ObjectRegistry::NewInstance(); // parent with a grandparent + auto uncle = ObjectRegistry::NewInstance(grand); + auto child = ObjectRegistry::NewInstance(parent); + auto cousin = ObjectRegistry::NewInstance(uncle); + + auto library = parent->AddLibrary("parent"); + library->AddFactory( + "parent", [](const std::string& uri, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new WrappedEnv(Env::Default(), uri)); + return guard->get(); + }); + library = cousin->AddLibrary("cousin"); + library->AddFactory( + "cousin", [](const std::string& uri, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new WrappedEnv(Env::Default(), uri)); + return guard->get(); + }); + + Env* env = nullptr; + std::unique_ptr guard; + std::string msg; + + // 
a:://* is registered in Default, so they should all work + ASSERT_OK(parent->NewStaticObject("a://test", &env)); + ASSERT_OK(child->NewStaticObject("a://test", &env)); + ASSERT_OK(uncle->NewStaticObject("a://test", &env)); + ASSERT_OK(cousin->NewStaticObject("a://test", &env)); + + // The parent env is only registered for parent, not uncle, + // So parent and child should return success and uncle and cousin should fail + ASSERT_OK(parent->NewUniqueObject("parent", &guard)); + ASSERT_OK(child->NewUniqueObject("parent", &guard)); + ASSERT_NOK(uncle->NewUniqueObject("parent", &guard)); + ASSERT_NOK(cousin->NewUniqueObject("parent", &guard)); + + // The cousin is only registered in the cousin, so all of the others should + // fail + ASSERT_OK(cousin->NewUniqueObject("cousin", &guard)); + ASSERT_NOK(parent->NewUniqueObject("cousin", &guard)); + ASSERT_NOK(child->NewUniqueObject("cousin", &guard)); + ASSERT_NOK(uncle->NewUniqueObject("cousin", &guard)); +} + +class MyCustomizable : public Customizable { + public: + static const char* Type() { return "MyCustomizable"; } + MyCustomizable(const char* prefix, const std::string& id) : id_(id) { + name_ = id_.substr(0, strlen(prefix) - 1); + } + const char* Name() const override { return name_.c_str(); } + std::string GetId() const override { return id_; } + + private: + std::string id_; + std::string name_; +}; + +TEST_F(ObjRegistryTest, TestFactoryCount) { + std::string msg; + auto grand = ObjectRegistry::Default(); + auto local = ObjectRegistry::NewInstance(); + std::unordered_set grand_types, local_types; + std::vector grand_names, local_names; + + // Check how many types we have on startup. + // Grand should equal local + grand->GetFactoryTypes(&grand_types); + local->GetFactoryTypes(&local_types); + ASSERT_EQ(grand_types, local_types); + size_t grand_count = grand->GetFactoryCount(Env::Type()); + size_t local_count = local->GetFactoryCount(Env::Type()); + + ASSERT_EQ(grand_count, local_count); + grand->GetFactoryNames(Env::Type(), &grand_names); + local->GetFactoryNames(Env::Type(), &local_names); + ASSERT_EQ(grand_names.size(), grand_count); + ASSERT_EQ(local_names.size(), local_count); + ASSERT_EQ(grand_names, local_names); + + // Add an Env to the local registry. + // This will add one factory. + auto library = local->AddLibrary("local"); + library->AddFactory( + "A", [](const std::string& /*uri*/, std::unique_ptr* /*guard */, + std::string* /* errmsg */) { return nullptr; }); + ASSERT_EQ(local_count + 1, local->GetFactoryCount(Env::Type())); + ASSERT_EQ(grand_count, grand->GetFactoryCount(Env::Type())); + local->GetFactoryTypes(&local_types); + local->GetFactoryNames(Env::Type(), &local_names); + ASSERT_EQ(grand_names.size() + 1, local_names.size()); + ASSERT_EQ(local_names.size(), local->GetFactoryCount(Env::Type())); + + if (grand_count == 0) { + // There were no Env when we started. Should have one more type + // than previously + ASSERT_NE(grand_types, local_types); + ASSERT_EQ(grand_types.size() + 1, local_types.size()); + } else { + // There was an Env type when we started. The types should match + ASSERT_EQ(grand_types, local_types); + } + + // Add a MyCustomizable to the registry. 
This should be a new type + library->AddFactory( + "MY", [](const std::string& /*uri*/, + std::unique_ptr* /*guard */, + std::string* /* errmsg */) { return nullptr; }); + ASSERT_EQ(local_count + 1, local->GetFactoryCount(Env::Type())); + ASSERT_EQ(grand_count, grand->GetFactoryCount(Env::Type())); + ASSERT_EQ(0U, grand->GetFactoryCount(MyCustomizable::Type())); + ASSERT_EQ(1U, local->GetFactoryCount(MyCustomizable::Type())); + + local->GetFactoryNames(MyCustomizable::Type(), &local_names); + ASSERT_EQ(1U, local_names.size()); + ASSERT_EQ(local_names[0], "MY"); + + local->GetFactoryTypes(&local_types); + ASSERT_EQ(grand_count == 0 ? 2 : grand_types.size() + 1, local_types.size()); + + // Add the same name again. We should now have 2 factories. + library->AddFactory( + "MY", [](const std::string& /*uri*/, + std::unique_ptr* /*guard */, + std::string* /* errmsg */) { return nullptr; }); + local->GetFactoryNames(MyCustomizable::Type(), &local_names); + ASSERT_EQ(2U, local_names.size()); +} + +TEST_F(ObjRegistryTest, TestManagedObjects) { + auto registry = ObjectRegistry::NewInstance(); + auto m_a1 = std::make_shared("", "A"); + auto m_a2 = std::make_shared("", "A"); + + ASSERT_EQ(registry->GetManagedObject("A"), nullptr); + ASSERT_OK(registry->SetManagedObject(m_a1)); + ASSERT_EQ(registry->GetManagedObject("A"), m_a1); + + ASSERT_NOK(registry->SetManagedObject(m_a2)); + ASSERT_OK(registry->SetManagedObject(m_a1)); + m_a1.reset(); + ASSERT_EQ(registry->GetManagedObject("A"), nullptr); + ASSERT_OK(registry->SetManagedObject(m_a2)); + ASSERT_EQ(registry->GetManagedObject("A"), m_a2); +} + +TEST_F(ObjRegistryTest, TestTwoManagedObjects) { + auto registry = ObjectRegistry::NewInstance(); + auto m_a = std::make_shared("", "A"); + auto m_b = std::make_shared("", "B"); + std::vector> objects; + + ASSERT_EQ(registry->GetManagedObject("A"), nullptr); + ASSERT_EQ(registry->GetManagedObject("B"), nullptr); + ASSERT_OK(registry->ListManagedObjects(&objects)); + ASSERT_EQ(objects.size(), 0U); + ASSERT_OK(registry->SetManagedObject(m_a)); + ASSERT_EQ(registry->GetManagedObject("B"), nullptr); + ASSERT_EQ(registry->GetManagedObject("A"), m_a); + ASSERT_OK(registry->ListManagedObjects(&objects)); + ASSERT_EQ(objects.size(), 1U); + ASSERT_EQ(objects.front(), m_a); + + ASSERT_OK(registry->SetManagedObject(m_b)); + ASSERT_EQ(registry->GetManagedObject("A"), m_a); + ASSERT_EQ(registry->GetManagedObject("B"), m_b); + ASSERT_OK(registry->ListManagedObjects(&objects)); + ASSERT_EQ(objects.size(), 2U); + ASSERT_OK(registry->ListManagedObjects("A", &objects)); + ASSERT_EQ(objects.size(), 1U); + ASSERT_EQ(objects.front(), m_a); + ASSERT_OK(registry->ListManagedObjects("B", &objects)); + ASSERT_EQ(objects.size(), 1U); + ASSERT_EQ(objects.front(), m_b); + ASSERT_OK(registry->ListManagedObjects("C", &objects)); + ASSERT_EQ(objects.size(), 0U); + + m_a.reset(); + objects.clear(); + + ASSERT_EQ(registry->GetManagedObject("B"), m_b); + ASSERT_EQ(registry->GetManagedObject("A"), nullptr); + ASSERT_OK(registry->ListManagedObjects(&objects)); + ASSERT_EQ(objects.size(), 1U); + ASSERT_EQ(objects.front(), m_b); + + m_b.reset(); + objects.clear(); + ASSERT_EQ(registry->GetManagedObject("A"), nullptr); + ASSERT_EQ(registry->GetManagedObject("B"), nullptr); +} + +TEST_F(ObjRegistryTest, TestAlternateNames) { + auto registry = ObjectRegistry::NewInstance(); + auto m_a = std::make_shared("", "A"); + auto m_b = std::make_shared("", "B"); + std::vector> objects; + // Test no objects exist + ASSERT_EQ(registry->GetManagedObject("A"), 
nullptr); + ASSERT_EQ(registry->GetManagedObject("B"), nullptr); + ASSERT_EQ(registry->GetManagedObject("TheOne"), nullptr); + ASSERT_OK(registry->ListManagedObjects(&objects)); + ASSERT_EQ(objects.size(), 0U); + + // Mark "TheOne" to be A + ASSERT_OK(registry->SetManagedObject("TheOne", m_a)); + ASSERT_EQ(registry->GetManagedObject("B"), nullptr); + ASSERT_EQ(registry->GetManagedObject("A"), nullptr); + ASSERT_EQ(registry->GetManagedObject("TheOne"), m_a); + ASSERT_OK(registry->ListManagedObjects(&objects)); + ASSERT_EQ(objects.size(), 1U); + ASSERT_EQ(objects.front(), m_a); + + // Try to mark "TheOne" again. + ASSERT_NOK(registry->SetManagedObject("TheOne", m_b)); + ASSERT_OK(registry->SetManagedObject("TheOne", m_a)); + + // Add "A" as a managed object. Registered 2x + ASSERT_OK(registry->SetManagedObject(m_a)); + ASSERT_EQ(registry->GetManagedObject("B"), nullptr); + ASSERT_EQ(registry->GetManagedObject("A"), m_a); + ASSERT_EQ(registry->GetManagedObject("TheOne"), m_a); + ASSERT_OK(registry->ListManagedObjects(&objects)); + ASSERT_EQ(objects.size(), 2U); + + // Delete "A". + m_a.reset(); + objects.clear(); + + ASSERT_EQ(registry->GetManagedObject("TheOne"), nullptr); + ASSERT_OK(registry->SetManagedObject("TheOne", m_b)); + ASSERT_EQ(registry->GetManagedObject("TheOne"), m_b); + ASSERT_OK(registry->ListManagedObjects(&objects)); + ASSERT_EQ(objects.size(), 1U); + ASSERT_EQ(objects.front(), m_b); + + m_b.reset(); + objects.clear(); + ASSERT_EQ(registry->GetManagedObject("A"), nullptr); + ASSERT_EQ(registry->GetManagedObject("A"), nullptr); + ASSERT_EQ(registry->GetManagedObject("TheOne"), nullptr); + ASSERT_OK(registry->ListManagedObjects(&objects)); + ASSERT_EQ(objects.size(), 0U); +} + +TEST_F(ObjRegistryTest, TestTwoManagedClasses) { + class MyCustomizable2 : public MyCustomizable { + public: + static const char* Type() { return "MyCustomizable2"; } + MyCustomizable2(const char* prefix, const std::string& id) + : MyCustomizable(prefix, id) {} + }; + + auto registry = ObjectRegistry::NewInstance(); + auto m_a1 = std::make_shared("", "A"); + auto m_a2 = std::make_shared("", "A"); + std::vector> obj1s; + std::vector> obj2s; + + ASSERT_EQ(registry->GetManagedObject("A"), nullptr); + ASSERT_EQ(registry->GetManagedObject("A"), nullptr); + + ASSERT_OK(registry->SetManagedObject(m_a1)); + ASSERT_EQ(registry->GetManagedObject("A"), m_a1); + ASSERT_EQ(registry->GetManagedObject("A"), nullptr); + + ASSERT_OK(registry->SetManagedObject(m_a2)); + ASSERT_EQ(registry->GetManagedObject("A"), m_a2); + ASSERT_OK(registry->ListManagedObjects(&obj1s)); + ASSERT_OK(registry->ListManagedObjects(&obj2s)); + ASSERT_EQ(obj1s.size(), 1U); + ASSERT_EQ(obj2s.size(), 1U); + ASSERT_EQ(obj1s.front(), m_a1); + ASSERT_EQ(obj2s.front(), m_a2); + m_a1.reset(); + obj1s.clear(); + obj2s.clear(); + ASSERT_EQ(registry->GetManagedObject("A"), nullptr); + ASSERT_EQ(registry->GetManagedObject("A"), m_a2); + + m_a2.reset(); + ASSERT_EQ(registry->GetManagedObject("A"), nullptr); + ASSERT_EQ(registry->GetManagedObject("A"), nullptr); +} + +TEST_F(ObjRegistryTest, TestManagedObjectsWithParent) { + auto base = ObjectRegistry::NewInstance(); + auto registry = ObjectRegistry::NewInstance(base); + + auto m_a = std::make_shared("", "A"); + auto m_b = std::make_shared("", "A"); + + ASSERT_EQ(registry->GetManagedObject("A"), nullptr); + ASSERT_OK(base->SetManagedObject(m_a)); + ASSERT_EQ(registry->GetManagedObject("A"), m_a); + + ASSERT_NOK(registry->SetManagedObject(m_b)); + ASSERT_OK(registry->SetManagedObject(m_a)); + + m_a.reset(); 
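  // Editorial note: managed objects are held only by weak reference (see the
  // iter->second.lock() calls in ObjectRegistry above), so dropping the last
  // shared_ptr to m_a makes the "A" entry resolve to nullptr and lets m_b
  // claim the id below.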
+ ASSERT_EQ(registry->GetManagedObject("A"), nullptr); + ASSERT_OK(registry->SetManagedObject(m_b)); + ASSERT_EQ(registry->GetManagedObject("A"), m_b); +} + +TEST_F(ObjRegistryTest, TestGetOrCreateManagedObject) { + auto registry = ObjectRegistry::NewInstance(); + registry->AddLibrary("test")->AddFactory( + ObjectLibrary::PatternEntry::AsIndividualId("MC"), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new MyCustomizable("MC", uri)); + return guard->get(); + }); + std::shared_ptr m_a, m_b, obj; + std::vector> objs; + + std::unordered_map opt_map; + + ASSERT_EQ(registry->GetManagedObject("MC@A#1"), nullptr); + ASSERT_EQ(registry->GetManagedObject("MC@B#1"), nullptr); + ASSERT_OK(registry->GetOrCreateManagedObject("MC@A#1", &m_a)); + ASSERT_OK(registry->GetOrCreateManagedObject("MC@B#1", &m_b)); + ASSERT_EQ(registry->GetManagedObject("MC@A#1"), m_a); + ASSERT_OK(registry->GetOrCreateManagedObject("MC@A#1", &obj)); + ASSERT_EQ(obj, m_a); + ASSERT_OK(registry->GetOrCreateManagedObject("MC@B#1", &obj)); + ASSERT_EQ(obj, m_b); + ASSERT_OK(registry->ListManagedObjects(&objs)); + ASSERT_EQ(objs.size(), 2U); + + objs.clear(); + m_a.reset(); + obj.reset(); + ASSERT_OK(registry->GetOrCreateManagedObject("MC@A#1", &m_a)); + ASSERT_EQ(1, m_a.use_count()); + ASSERT_OK(registry->GetOrCreateManagedObject("MC@B#1", &obj)); + ASSERT_EQ(2, obj.use_count()); +} + +TEST_F(ObjRegistryTest, RegisterPlugin) { + std::shared_ptr registry = ObjectRegistry::NewInstance(); + std::unique_ptr guard; + Env* env = nullptr; + + ASSERT_NOK(registry->NewObject("unguarded", &env, &guard)); + ASSERT_EQ(registry->RegisterPlugin("Missing", nullptr), -1); + ASSERT_EQ(registry->RegisterPlugin("", RegisterTestUnguarded), -1); + ASSERT_GT(registry->RegisterPlugin("Valid", RegisterTestUnguarded), 0); + ASSERT_OK(registry->NewObject("unguarded", &env, &guard)); + ASSERT_NE(env, nullptr); +} +class PatternEntryTest : public testing::Test {}; + +TEST_F(PatternEntryTest, TestSimpleEntry) { + ObjectLibrary::PatternEntry entry("ABC", true); + + ASSERT_TRUE(entry.Matches("ABC")); + ASSERT_FALSE(entry.Matches("AABC")); + ASSERT_FALSE(entry.Matches("ABCA")); + ASSERT_FALSE(entry.Matches("AABCA")); + ASSERT_FALSE(entry.Matches("AB")); + ASSERT_FALSE(entry.Matches("BC")); + ASSERT_FALSE(entry.Matches("ABD")); + ASSERT_FALSE(entry.Matches("BCA")); +} + +TEST_F(PatternEntryTest, TestPatternEntry) { + // Matches A:+ + ObjectLibrary::PatternEntry entry("A", false); + entry.AddSeparator(":"); + ASSERT_FALSE(entry.Matches("A")); + ASSERT_FALSE(entry.Matches("AA")); + ASSERT_FALSE(entry.Matches("AB")); + ASSERT_FALSE(entry.Matches("B")); + ASSERT_FALSE(entry.Matches("A:")); + ASSERT_FALSE(entry.Matches("AA:")); + ASSERT_FALSE(entry.Matches("AA:B")); + ASSERT_FALSE(entry.Matches("AA:BB")); + ASSERT_TRUE(entry.Matches("A:B")); + ASSERT_TRUE(entry.Matches("A:BB")); + + entry.SetOptional(true); // Now matches "A" or "A:+" + ASSERT_TRUE(entry.Matches("A")); + ASSERT_FALSE(entry.Matches("AA")); + ASSERT_FALSE(entry.Matches("AB")); + ASSERT_FALSE(entry.Matches("B")); + ASSERT_FALSE(entry.Matches("A:")); + ASSERT_FALSE(entry.Matches("AA:")); + ASSERT_FALSE(entry.Matches("AA:B")); + ASSERT_FALSE(entry.Matches("AA:BB")); + ASSERT_TRUE(entry.Matches("A:B")); + ASSERT_TRUE(entry.Matches("A:BB")); +} + +TEST_F(PatternEntryTest, MatchZeroOrMore) { + // Matches A:* + ObjectLibrary::PatternEntry entry("A", false); + entry.AddSeparator(":", false); + ASSERT_FALSE(entry.Matches("A")); + 
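  // Editorial note: AddSeparator(":", false) makes the text after ":"
  // optional (zero or more characters), so "A:" matches below; the separator
  // itself is still required, which is why the bare "A" only matches once
  // SetOptional(true) is called later in this test.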
ASSERT_FALSE(entry.Matches("AA")); + ASSERT_FALSE(entry.Matches("AB")); + ASSERT_FALSE(entry.Matches("B")); + ASSERT_TRUE(entry.Matches("A:")); + ASSERT_FALSE(entry.Matches("B:")); + ASSERT_FALSE(entry.Matches("B:A")); + ASSERT_FALSE(entry.Matches("AA:")); + ASSERT_FALSE(entry.Matches("AA:B")); + ASSERT_FALSE(entry.Matches("AA:BB")); + ASSERT_TRUE(entry.Matches("A:B")); + ASSERT_TRUE(entry.Matches("A:BB")); + + entry.SetOptional(true); // Now matches "A" or "A:*" + ASSERT_TRUE(entry.Matches("A")); + ASSERT_FALSE(entry.Matches("AA")); + ASSERT_FALSE(entry.Matches("AB")); + ASSERT_FALSE(entry.Matches("B")); + ASSERT_TRUE(entry.Matches("A:")); + ASSERT_FALSE(entry.Matches("B:")); + ASSERT_FALSE(entry.Matches("B:A")); + ASSERT_FALSE(entry.Matches("AA:")); + ASSERT_FALSE(entry.Matches("AA:B")); + ASSERT_FALSE(entry.Matches("AA:BB")); + ASSERT_TRUE(entry.Matches("A:B")); + ASSERT_TRUE(entry.Matches("A:BB")); +} + +TEST_F(PatternEntryTest, TestSuffixEntry) { + ObjectLibrary::PatternEntry entry("AA", true); + entry.AddSuffix("BB"); + + ASSERT_TRUE(entry.Matches("AA")); + ASSERT_TRUE(entry.Matches("AABB")); + + ASSERT_FALSE(entry.Matches("A")); + ASSERT_FALSE(entry.Matches("AB")); + ASSERT_FALSE(entry.Matches("B")); + ASSERT_FALSE(entry.Matches("BB")); + ASSERT_FALSE(entry.Matches("ABA")); + ASSERT_FALSE(entry.Matches("BBAA")); + ASSERT_FALSE(entry.Matches("AABBA")); + ASSERT_FALSE(entry.Matches("AABBB")); +} + +TEST_F(PatternEntryTest, TestNumericEntry) { + ObjectLibrary::PatternEntry entry("A", false); + entry.AddNumber(":"); + ASSERT_FALSE(entry.Matches("A")); + ASSERT_FALSE(entry.Matches("AA")); + ASSERT_FALSE(entry.Matches("A:")); + ASSERT_FALSE(entry.Matches("AA:")); + ASSERT_TRUE(entry.Matches("A:1")); + ASSERT_TRUE(entry.Matches("A:11")); + ASSERT_FALSE(entry.Matches("AA:1")); + ASSERT_FALSE(entry.Matches("AA:11")); + ASSERT_FALSE(entry.Matches("A:B")); + ASSERT_FALSE(entry.Matches("A:1B")); + ASSERT_FALSE(entry.Matches("A:B1")); + + entry.AddSeparator(":", false); + ASSERT_FALSE(entry.Matches("A")); + ASSERT_FALSE(entry.Matches("AA")); + ASSERT_FALSE(entry.Matches("A:")); + ASSERT_FALSE(entry.Matches("AA:")); + ASSERT_TRUE(entry.Matches("A:1:")); + ASSERT_TRUE(entry.Matches("A:11:")); + ASSERT_FALSE(entry.Matches("A:1")); + ASSERT_FALSE(entry.Matches("A:B1:")); + ASSERT_FALSE(entry.Matches("A:1B:")); + ASSERT_FALSE(entry.Matches("A::")); +} + +TEST_F(PatternEntryTest, TestDoubleEntry) { + ObjectLibrary::PatternEntry entry("A", false); + entry.AddNumber(":", false); + ASSERT_FALSE(entry.Matches("A")); + ASSERT_FALSE(entry.Matches("AA")); + ASSERT_FALSE(entry.Matches("A:")); + ASSERT_FALSE(entry.Matches("AA:")); + ASSERT_FALSE(entry.Matches("AA:1")); + ASSERT_FALSE(entry.Matches("AA:11")); + ASSERT_FALSE(entry.Matches("A:B")); + ASSERT_FALSE(entry.Matches("A:1B")); + ASSERT_FALSE(entry.Matches("A:B1")); + ASSERT_TRUE(entry.Matches("A:1")); + ASSERT_TRUE(entry.Matches("A:11")); + ASSERT_TRUE(entry.Matches("A:1.1")); + ASSERT_TRUE(entry.Matches("A:11.11")); + ASSERT_TRUE(entry.Matches("A:1.")); + ASSERT_TRUE(entry.Matches("A:.1")); + ASSERT_TRUE(entry.Matches("A:0.1")); + ASSERT_TRUE(entry.Matches("A:1.0")); + ASSERT_TRUE(entry.Matches("A:1.0")); + + ASSERT_FALSE(entry.Matches("A:1.0.")); + ASSERT_FALSE(entry.Matches("A:1.0.2")); + ASSERT_FALSE(entry.Matches("A:.1.0")); + ASSERT_FALSE(entry.Matches("A:..10")); + ASSERT_FALSE(entry.Matches("A:10..")); + ASSERT_FALSE(entry.Matches("A:.")); + + entry.AddSeparator(":", false); + ASSERT_FALSE(entry.Matches("A:1")); + 
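  // Editorial note: once the trailing AddSeparator(":", false) is added, the
  // decimal field must itself be closed by ":", so "A:1" (above) and "A:1.0"
  // (next) no longer match, while "A:1.0:" further down still does.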
ASSERT_FALSE(entry.Matches("A:1.0")); + + ASSERT_TRUE(entry.Matches("A:11:")); + ASSERT_TRUE(entry.Matches("A:1.1:")); + ASSERT_TRUE(entry.Matches("A:11.11:")); + ASSERT_TRUE(entry.Matches("A:1.:")); + ASSERT_TRUE(entry.Matches("A:.1:")); + ASSERT_TRUE(entry.Matches("A:0.1:")); + ASSERT_TRUE(entry.Matches("A:1.0:")); + ASSERT_TRUE(entry.Matches("A:1.0:")); + + ASSERT_FALSE(entry.Matches("A:1.0.:")); + ASSERT_FALSE(entry.Matches("A:1.0.2:")); + ASSERT_FALSE(entry.Matches("A:.1.0:")); + ASSERT_FALSE(entry.Matches("A:..10:")); + ASSERT_FALSE(entry.Matches("A:10..:")); + ASSERT_FALSE(entry.Matches("A:.:")); + ASSERT_FALSE(entry.Matches("A::")); +} + +TEST_F(PatternEntryTest, TestIndividualIdEntry) { + auto entry = ObjectLibrary::PatternEntry::AsIndividualId("AA"); + ASSERT_TRUE(entry.Matches("AA")); + ASSERT_TRUE(entry.Matches("AA@123#456")); + ASSERT_TRUE(entry.Matches("AA@deadbeef#id")); + + ASSERT_FALSE(entry.Matches("A")); + ASSERT_FALSE(entry.Matches("AAA")); + ASSERT_FALSE(entry.Matches("AA@123")); + ASSERT_FALSE(entry.Matches("AA@123#")); + ASSERT_FALSE(entry.Matches("AA@#123")); +} + +TEST_F(PatternEntryTest, TestTwoNameEntry) { + ObjectLibrary::PatternEntry entry("A"); + entry.AnotherName("B"); + ASSERT_TRUE(entry.Matches("A")); + ASSERT_TRUE(entry.Matches("B")); + ASSERT_FALSE(entry.Matches("AA")); + ASSERT_FALSE(entry.Matches("BB")); + ASSERT_FALSE(entry.Matches("AA")); + ASSERT_FALSE(entry.Matches("BA")); + ASSERT_FALSE(entry.Matches("AB")); +} + +TEST_F(PatternEntryTest, TestTwoPatternEntry) { + ObjectLibrary::PatternEntry entry("AA", false); + entry.AddSeparator(":"); + entry.AddSeparator(":"); + ASSERT_FALSE(entry.Matches("AA")); + ASSERT_FALSE(entry.Matches("AA:")); + ASSERT_FALSE(entry.Matches("AA::")); + ASSERT_FALSE(entry.Matches("AA::12")); + ASSERT_TRUE(entry.Matches("AA:1:2")); + ASSERT_TRUE(entry.Matches("AA:1:2:")); + + ObjectLibrary::PatternEntry entry2("AA", false); + entry2.AddSeparator("::"); + entry2.AddSeparator("##"); + ASSERT_FALSE(entry2.Matches("AA")); + ASSERT_FALSE(entry2.Matches("AA:")); + ASSERT_FALSE(entry2.Matches("AA::")); + ASSERT_FALSE(entry2.Matches("AA::#")); + ASSERT_FALSE(entry2.Matches("AA::##")); + ASSERT_FALSE(entry2.Matches("AA##1::2")); + ASSERT_FALSE(entry2.Matches("AA::123##")); + ASSERT_TRUE(entry2.Matches("AA::1##2")); + ASSERT_TRUE(entry2.Matches("AA::12##34:")); + ASSERT_TRUE(entry2.Matches("AA::12::34##56")); + ASSERT_TRUE(entry2.Matches("AA::12##34::56")); +} + +TEST_F(PatternEntryTest, TestTwoNumbersEntry) { + ObjectLibrary::PatternEntry entry("AA", false); + entry.AddNumber(":"); + entry.AddNumber(":"); + ASSERT_FALSE(entry.Matches("AA")); + ASSERT_FALSE(entry.Matches("AA:")); + ASSERT_FALSE(entry.Matches("AA::")); + ASSERT_FALSE(entry.Matches("AA::12")); + ASSERT_FALSE(entry.Matches("AA:1:2:")); + ASSERT_TRUE(entry.Matches("AA:1:2")); + ASSERT_TRUE(entry.Matches("AA:12:23456")); + + ObjectLibrary::PatternEntry entry2("AA", false); + entry2.AddNumber(":"); + entry2.AddNumber("#"); + ASSERT_FALSE(entry2.Matches("AA")); + ASSERT_FALSE(entry2.Matches("AA:")); + ASSERT_FALSE(entry2.Matches("AA:#")); + ASSERT_FALSE(entry2.Matches("AA#:")); + ASSERT_FALSE(entry2.Matches("AA:123#")); + ASSERT_FALSE(entry2.Matches("AA:123#B")); + ASSERT_FALSE(entry2.Matches("AA:B#123")); + ASSERT_TRUE(entry2.Matches("AA:1#2")); + ASSERT_FALSE(entry2.Matches("AA:123#23:")); + ASSERT_FALSE(entry2.Matches("AA::12#234")); +} + +TEST_F(PatternEntryTest, TestPatternAndSuffix) { + ObjectLibrary::PatternEntry entry("AA", false); + entry.AddSeparator("::"); + 
entry.AddSuffix("##"); + ASSERT_FALSE(entry.Matches("AA")); + ASSERT_FALSE(entry.Matches("AA::")); + ASSERT_FALSE(entry.Matches("AA::##")); + ASSERT_FALSE(entry.Matches("AB::1##")); + ASSERT_FALSE(entry.Matches("AB::1##2")); + ASSERT_FALSE(entry.Matches("AA##1::")); + ASSERT_TRUE(entry.Matches("AA::1##")); + ASSERT_FALSE(entry.Matches("AA::1###")); + + ObjectLibrary::PatternEntry entry2("AA", false); + entry2.AddSuffix("::"); + entry2.AddSeparator("##"); + ASSERT_FALSE(entry2.Matches("AA")); + ASSERT_FALSE(entry2.Matches("AA::")); + ASSERT_FALSE(entry2.Matches("AA::##")); + ASSERT_FALSE(entry2.Matches("AB::1##")); + ASSERT_FALSE(entry2.Matches("AB::1##2")); + ASSERT_TRUE(entry2.Matches("AA::##12")); +} + +TEST_F(PatternEntryTest, TestTwoNamesAndPattern) { + ObjectLibrary::PatternEntry entry("AA", true); + entry.AddSeparator("::"); + entry.AnotherName("BBB"); + ASSERT_TRUE(entry.Matches("AA")); + ASSERT_TRUE(entry.Matches("AA::1")); + ASSERT_TRUE(entry.Matches("BBB")); + ASSERT_TRUE(entry.Matches("BBB::2")); + + ASSERT_FALSE(entry.Matches("AA::")); + ASSERT_FALSE(entry.Matches("AAA::")); + ASSERT_FALSE(entry.Matches("BBB::")); + + entry.SetOptional(false); + ASSERT_FALSE(entry.Matches("AA")); + ASSERT_FALSE(entry.Matches("BBB")); + + ASSERT_FALSE(entry.Matches("AA::")); + ASSERT_FALSE(entry.Matches("AAA::")); + ASSERT_FALSE(entry.Matches("BBB::")); + + ASSERT_TRUE(entry.Matches("AA::1")); + ASSERT_TRUE(entry.Matches("BBB::2")); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else // ROCKSDB_LITE +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED as ObjRegistry is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/option_change_migration/option_change_migration.cc b/src/rocksdb/utilities/option_change_migration/option_change_migration.cc new file mode 100644 index 000000000..e93d2152d --- /dev/null +++ b/src/rocksdb/utilities/option_change_migration/option_change_migration.cc @@ -0,0 +1,186 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/utilities/option_change_migration.h" + +#ifndef ROCKSDB_LITE +#include "rocksdb/db.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +// Return a version of Options `opts` that allow us to open/write into a DB +// without triggering an automatic compaction or stalling. This is guaranteed +// by disabling automatic compactions and using huge values for stalling +// triggers. +Options GetNoCompactionOptions(const Options& opts) { + Options ret_opts = opts; + ret_opts.disable_auto_compactions = true; + ret_opts.level0_slowdown_writes_trigger = 999999; + ret_opts.level0_stop_writes_trigger = 999999; + ret_opts.soft_pending_compaction_bytes_limit = 0; + ret_opts.hard_pending_compaction_bytes_limit = 0; + return ret_opts; +} + +Status OpenDb(const Options& options, const std::string& dbname, + std::unique_ptr* db) { + db->reset(); + DB* tmpdb; + Status s = DB::Open(options, dbname, &tmpdb); + if (s.ok()) { + db->reset(tmpdb); + } + return s; +} + +// l0_file_size specifies size of file on L0. 
+// will be range-partitioned after a full compaction, so they likely qualify
+// to be placed on L0. If left as 0, the data is compacted into a single file
+// and placed on L0. Otherwise, the compaction tries to produce files of size
+// l0_file_size.
+Status CompactToLevel(const Options& options, const std::string& dbname,
+                      int dest_level, uint64_t l0_file_size,
+                      bool need_reopen) {
+  std::unique_ptr<DB> db;
+  Options no_compact_opts = GetNoCompactionOptions(options);
+  if (dest_level == 0) {
+    if (l0_file_size == 0) {
+      // Single file.
+      l0_file_size = 999999999999999;
+    }
+    // L0 has strict sequence ID requirements for the files in it. It's safer
+    // to put only one compacted file there.
+    // This is only used for converting to universal compaction with
+    // only one level. In this case, compacting to one file is also
+    // optimal.
+    no_compact_opts.target_file_size_base = l0_file_size;
+    no_compact_opts.max_compaction_bytes = l0_file_size;
+  }
+  Status s = OpenDb(no_compact_opts, dbname, &db);
+  if (!s.ok()) {
+    return s;
+  }
+  CompactRangeOptions cro;
+  cro.change_level = true;
+  cro.target_level = dest_level;
+  if (dest_level == 0) {
+    // cannot use kForceOptimized because the compaction is expected to
+    // generate one output file
+    cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  }
+  s = db->CompactRange(cro, nullptr, nullptr);
+
+  if (s.ok() && need_reopen) {
+    // Need to restart the DB to rewrite the manifest file.
+    // In order to open a DB with a specific num_levels, the manifest file
+    // should contain no record that mentions any level beyond num_levels.
+    // Issuing a full compaction will move all the data to a level not
+    // exceeding num_levels, but the manifest may still contain previous
+    // records mentioning a higher level. Reopening the DB will force the
+    // manifest to be rewritten so that those records are cleared.
+    db.reset();
+    s = OpenDb(no_compact_opts, dbname, &db);
+  }
+  return s;
+}
+
+Status MigrateToUniversal(std::string dbname, const Options& old_opts,
+                          const Options& new_opts) {
+  if (old_opts.num_levels <= new_opts.num_levels ||
+      old_opts.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+    return Status::OK();
+  } else {
+    bool need_compact = false;
+    {
+      std::unique_ptr<DB> db;
+      Options opts = GetNoCompactionOptions(old_opts);
+      Status s = OpenDb(opts, dbname, &db);
+      if (!s.ok()) {
+        return s;
+      }
+      ColumnFamilyMetaData metadata;
+      db->GetColumnFamilyMetaData(&metadata);
+      if (!metadata.levels.empty() &&
+          metadata.levels.back().level >= new_opts.num_levels) {
+        need_compact = true;
+      }
+    }
+    if (need_compact) {
+      return CompactToLevel(old_opts, dbname, new_opts.num_levels - 1,
+                            /*l0_file_size=*/0, true);
+    }
+    return Status::OK();
+  }
+}
+
+Status MigrateToLevelBase(std::string dbname, const Options& old_opts,
+                          const Options& new_opts) {
+  if (!new_opts.level_compaction_dynamic_level_bytes) {
+    if (old_opts.num_levels == 1) {
+      return Status::OK();
+    }
+    // Compact everything to level 1 to guarantee it can be safely opened.
+    Options opts = old_opts;
+    opts.target_file_size_base = new_opts.target_file_size_base;
+    // Although sometimes we can open the DB with the new options without
+    // error, we still want to compact the files so that the LSM tree does
+    // not get stuck in a bad shape. For example, if the user changed the
+    // level size multiplier from 4 to 8, with the same data we will have
+    // fewer levels. Unless we issue a full compaction, the LSM tree may be
+    // stuck with more levels than needed and won't recover automatically.
+    return CompactToLevel(opts, dbname, 1, /*l0_file_size=*/0, true);
+  } else {
+    // Compact everything to the last level to guarantee it can be safely
+    // opened.
+    if (old_opts.num_levels == 1) {
+      return Status::OK();
+    } else if (new_opts.num_levels > old_opts.num_levels) {
+      // Dynamic level mode requires data to be put in the last level first.
+      return CompactToLevel(new_opts, dbname, new_opts.num_levels - 1,
+                            /*l0_file_size=*/0, false);
+    } else {
+      Options opts = old_opts;
+      opts.target_file_size_base = new_opts.target_file_size_base;
+      return CompactToLevel(opts, dbname, new_opts.num_levels - 1,
+                            /*l0_file_size=*/0, true);
+    }
+  }
+}
+}  // namespace
+
+Status OptionChangeMigration(std::string dbname, const Options& old_opts,
+                             const Options& new_opts) {
+  if (old_opts.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+    // An LSM generated by FIFO compaction can be opened by any compaction
+    // style.
+    return Status::OK();
+  } else if (new_opts.compaction_style ==
+             CompactionStyle::kCompactionStyleUniversal) {
+    return MigrateToUniversal(dbname, old_opts, new_opts);
+  } else if (new_opts.compaction_style ==
+             CompactionStyle::kCompactionStyleLevel) {
+    return MigrateToLevelBase(dbname, old_opts, new_opts);
+  } else if (new_opts.compaction_style ==
+             CompactionStyle::kCompactionStyleFIFO) {
+    uint64_t l0_file_size = 0;
+    if (new_opts.compaction_options_fifo.max_table_files_size > 0) {
+      // Create at least 8 files when max_table_files_size is hit, so that the
+      // DB doesn't just disappear. This in fact violates the FIFO condition,
+      // but otherwise the migrated DB is unlikely to be usable.
+      l0_file_size = new_opts.compaction_options_fifo.max_table_files_size / 8;
+    }
+    return CompactToLevel(old_opts, dbname, 0, l0_file_size, true);
+  } else {
+    return Status::NotSupported(
+        "Do not know how to migrate to this compaction style");
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
+#else
+namespace ROCKSDB_NAMESPACE {
+Status OptionChangeMigration(std::string /*dbname*/,
+                             const Options& /*old_opts*/,
+                             const Options& /*new_opts*/) {
+  return Status::NotSupported();
+}
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/option_change_migration/option_change_migration_test.cc b/src/rocksdb/utilities/option_change_migration/option_change_migration_test.cc
new file mode 100644
index 000000000..71af45db1
--- /dev/null
+++ b/src/rocksdb/utilities/option_change_migration/option_change_migration_test.cc
@@ -0,0 +1,550 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
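The OptionChangeMigration() entry point above dispatches on the new compaction style and rewrites the LSM tree (full compaction plus a reopen) so that the DB can be opened safely with the changed options. A minimal usage sketch, assuming a hypothetical db_path and arbitrary option values that are not taken from the upstream sources:

#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/utilities/option_change_migration.h"

using namespace ROCKSDB_NAMESPACE;

// Migrate a DB that was last opened with `old_opts` to universal compaction,
// then reopen it with the new options. `db_path` is a placeholder.
Status SwitchToUniversal(const std::string& db_path, const Options& old_opts) {
  Options new_opts = old_opts;
  new_opts.compaction_style = CompactionStyle::kCompactionStyleUniversal;
  new_opts.num_levels = 1;
  // Performs the compaction/reopen needed so that `new_opts` can open the DB
  // without violating its level and L0 invariants.
  Status s = OptionChangeMigration(db_path, old_opts, new_opts);
  if (!s.ok()) {
    return s;
  }
  DB* db = nullptr;
  s = DB::Open(new_opts, db_path, &db);
  delete db;
  return s;
}

The tests in the file that follows exercise exactly this pattern: fill a DB under one set of options, run OptionChangeMigration, reopen with the new options, and verify that every key survives.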
+ +#include "rocksdb/utilities/option_change_migration.h" + +#include + +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +class DBOptionChangeMigrationTests + : public DBTestBase, + public testing::WithParamInterface< + std::tuple> { + public: + DBOptionChangeMigrationTests() + : DBTestBase("db_option_change_migration_test", /*env_do_fsync=*/true) { + level1_ = std::get<0>(GetParam()); + compaction_style1_ = std::get<1>(GetParam()); + is_dynamic1_ = std::get<2>(GetParam()); + + level2_ = std::get<3>(GetParam()); + compaction_style2_ = std::get<4>(GetParam()); + is_dynamic2_ = std::get<5>(GetParam()); + fifo_max_table_files_size_ = std::get<6>(GetParam()); + } + + // Required if inheriting from testing::WithParamInterface<> + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + int level1_; + int compaction_style1_; + bool is_dynamic1_; + + int level2_; + int compaction_style2_; + bool is_dynamic2_; + + uint64_t fifo_max_table_files_size_; +}; + +#ifndef ROCKSDB_LITE +TEST_P(DBOptionChangeMigrationTests, Migrate1) { + Options old_options = CurrentOptions(); + old_options.compaction_style = + static_cast(compaction_style1_); + if (old_options.compaction_style == CompactionStyle::kCompactionStyleLevel) { + old_options.level_compaction_dynamic_level_bytes = is_dynamic1_; + } + if (old_options.compaction_style == CompactionStyle::kCompactionStyleFIFO) { + old_options.max_open_files = -1; + } + old_options.level0_file_num_compaction_trigger = 3; + old_options.write_buffer_size = 64 * 1024; + old_options.target_file_size_base = 128 * 1024; + // Make level target of L1, L2 to be 200KB and 600KB + old_options.num_levels = level1_; + old_options.max_bytes_for_level_multiplier = 3; + old_options.max_bytes_for_level_base = 200 * 1024; + + Reopen(old_options); + + Random rnd(301); + int key_idx = 0; + + // Generate at least 2MB of data + for (int num = 0; num < 20; num++) { + GenerateNewFile(&rnd, &key_idx); + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // Will make sure exactly those keys are in the DB after migration. 
+ std::set keys; + { + std::unique_ptr it(db_->NewIterator(ReadOptions())); + it->SeekToFirst(); + for (; it->Valid(); it->Next()) { + keys.insert(it->key().ToString()); + } + } + Close(); + + Options new_options = old_options; + new_options.compaction_style = + static_cast(compaction_style2_); + if (new_options.compaction_style == CompactionStyle::kCompactionStyleLevel) { + new_options.level_compaction_dynamic_level_bytes = is_dynamic2_; + } + if (new_options.compaction_style == CompactionStyle::kCompactionStyleFIFO) { + new_options.max_open_files = -1; + } + if (fifo_max_table_files_size_ != 0) { + new_options.compaction_options_fifo.max_table_files_size = + fifo_max_table_files_size_; + } + new_options.target_file_size_base = 256 * 1024; + new_options.num_levels = level2_; + new_options.max_bytes_for_level_base = 150 * 1024; + new_options.max_bytes_for_level_multiplier = 4; + ASSERT_OK(OptionChangeMigration(dbname_, old_options, new_options)); + Reopen(new_options); + + // Wait for compaction to finish and make sure it can reopen + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + Reopen(new_options); + + { + std::unique_ptr it(db_->NewIterator(ReadOptions())); + it->SeekToFirst(); + for (std::string key : keys) { + ASSERT_TRUE(it->Valid()); + ASSERT_EQ(key, it->key().ToString()); + it->Next(); + } + ASSERT_TRUE(!it->Valid()); + } +} + +TEST_P(DBOptionChangeMigrationTests, Migrate2) { + Options old_options = CurrentOptions(); + old_options.compaction_style = + static_cast(compaction_style2_); + if (old_options.compaction_style == CompactionStyle::kCompactionStyleLevel) { + old_options.level_compaction_dynamic_level_bytes = is_dynamic2_; + } + if (old_options.compaction_style == CompactionStyle::kCompactionStyleFIFO) { + old_options.max_open_files = -1; + } + old_options.level0_file_num_compaction_trigger = 3; + old_options.write_buffer_size = 64 * 1024; + old_options.target_file_size_base = 128 * 1024; + // Make level target of L1, L2 to be 200KB and 600KB + old_options.num_levels = level2_; + old_options.max_bytes_for_level_multiplier = 3; + old_options.max_bytes_for_level_base = 200 * 1024; + + Reopen(old_options); + + Random rnd(301); + int key_idx = 0; + + // Generate at least 2MB of data + for (int num = 0; num < 20; num++) { + GenerateNewFile(&rnd, &key_idx); + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // Will make sure exactly those keys are in the DB after migration. 
+ std::set keys; + { + std::unique_ptr it(db_->NewIterator(ReadOptions())); + it->SeekToFirst(); + for (; it->Valid(); it->Next()) { + keys.insert(it->key().ToString()); + } + } + + Close(); + + Options new_options = old_options; + new_options.compaction_style = + static_cast(compaction_style1_); + if (new_options.compaction_style == CompactionStyle::kCompactionStyleLevel) { + new_options.level_compaction_dynamic_level_bytes = is_dynamic1_; + } + if (new_options.compaction_style == CompactionStyle::kCompactionStyleFIFO) { + new_options.max_open_files = -1; + } + if (fifo_max_table_files_size_ != 0) { + new_options.compaction_options_fifo.max_table_files_size = + fifo_max_table_files_size_; + } + new_options.target_file_size_base = 256 * 1024; + new_options.num_levels = level1_; + new_options.max_bytes_for_level_base = 150 * 1024; + new_options.max_bytes_for_level_multiplier = 4; + ASSERT_OK(OptionChangeMigration(dbname_, old_options, new_options)); + Reopen(new_options); + // Wait for compaction to finish and make sure it can reopen + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + Reopen(new_options); + + { + std::unique_ptr it(db_->NewIterator(ReadOptions())); + it->SeekToFirst(); + for (std::string key : keys) { + ASSERT_TRUE(it->Valid()); + ASSERT_EQ(key, it->key().ToString()); + it->Next(); + } + ASSERT_TRUE(!it->Valid()); + } +} + +TEST_P(DBOptionChangeMigrationTests, Migrate3) { + Options old_options = CurrentOptions(); + old_options.compaction_style = + static_cast(compaction_style1_); + if (old_options.compaction_style == CompactionStyle::kCompactionStyleLevel) { + old_options.level_compaction_dynamic_level_bytes = is_dynamic1_; + } + if (old_options.compaction_style == CompactionStyle::kCompactionStyleFIFO) { + old_options.max_open_files = -1; + } + old_options.level0_file_num_compaction_trigger = 3; + old_options.write_buffer_size = 64 * 1024; + old_options.target_file_size_base = 128 * 1024; + // Make level target of L1, L2 to be 200KB and 600KB + old_options.num_levels = level1_; + old_options.max_bytes_for_level_multiplier = 3; + old_options.max_bytes_for_level_base = 200 * 1024; + + Reopen(old_options); + Random rnd(301); + for (int num = 0; num < 20; num++) { + for (int i = 0; i < 50; i++) { + ASSERT_OK(Put(Key(num * 100 + i), rnd.RandomString(900))); + } + Flush(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + if (num == 9) { + // Issue a full compaction to generate some zero-out files + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + } + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // Will make sure exactly those keys are in the DB after migration. 
+ std::set keys; + { + std::unique_ptr it(db_->NewIterator(ReadOptions())); + it->SeekToFirst(); + for (; it->Valid(); it->Next()) { + keys.insert(it->key().ToString()); + } + } + Close(); + + Options new_options = old_options; + new_options.compaction_style = + static_cast(compaction_style2_); + if (new_options.compaction_style == CompactionStyle::kCompactionStyleLevel) { + new_options.level_compaction_dynamic_level_bytes = is_dynamic2_; + } + if (new_options.compaction_style == CompactionStyle::kCompactionStyleFIFO) { + new_options.max_open_files = -1; + } + if (fifo_max_table_files_size_ != 0) { + new_options.compaction_options_fifo.max_table_files_size = + fifo_max_table_files_size_; + } + new_options.target_file_size_base = 256 * 1024; + new_options.num_levels = level2_; + new_options.max_bytes_for_level_base = 150 * 1024; + new_options.max_bytes_for_level_multiplier = 4; + ASSERT_OK(OptionChangeMigration(dbname_, old_options, new_options)); + Reopen(new_options); + + // Wait for compaction to finish and make sure it can reopen + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + Reopen(new_options); + + { + std::unique_ptr it(db_->NewIterator(ReadOptions())); + it->SeekToFirst(); + for (std::string key : keys) { + ASSERT_TRUE(it->Valid()); + ASSERT_EQ(key, it->key().ToString()); + it->Next(); + } + ASSERT_TRUE(!it->Valid()); + } +} + +TEST_P(DBOptionChangeMigrationTests, Migrate4) { + Options old_options = CurrentOptions(); + old_options.compaction_style = + static_cast(compaction_style2_); + if (old_options.compaction_style == CompactionStyle::kCompactionStyleLevel) { + old_options.level_compaction_dynamic_level_bytes = is_dynamic2_; + } + if (old_options.compaction_style == CompactionStyle::kCompactionStyleFIFO) { + old_options.max_open_files = -1; + } + old_options.level0_file_num_compaction_trigger = 3; + old_options.write_buffer_size = 64 * 1024; + old_options.target_file_size_base = 128 * 1024; + // Make level target of L1, L2 to be 200KB and 600KB + old_options.num_levels = level2_; + old_options.max_bytes_for_level_multiplier = 3; + old_options.max_bytes_for_level_base = 200 * 1024; + + Reopen(old_options); + Random rnd(301); + for (int num = 0; num < 20; num++) { + for (int i = 0; i < 50; i++) { + ASSERT_OK(Put(Key(num * 100 + i), rnd.RandomString(900))); + } + Flush(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + if (num == 9) { + // Issue a full compaction to generate some zero-out files + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + } + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // Will make sure exactly those keys are in the DB after migration. 
+ std::set keys; + { + std::unique_ptr it(db_->NewIterator(ReadOptions())); + it->SeekToFirst(); + for (; it->Valid(); it->Next()) { + keys.insert(it->key().ToString()); + } + } + + Close(); + + Options new_options = old_options; + new_options.compaction_style = + static_cast(compaction_style1_); + if (new_options.compaction_style == CompactionStyle::kCompactionStyleLevel) { + new_options.level_compaction_dynamic_level_bytes = is_dynamic1_; + } + if (new_options.compaction_style == CompactionStyle::kCompactionStyleFIFO) { + new_options.max_open_files = -1; + } + if (fifo_max_table_files_size_ != 0) { + new_options.compaction_options_fifo.max_table_files_size = + fifo_max_table_files_size_; + } + new_options.target_file_size_base = 256 * 1024; + new_options.num_levels = level1_; + new_options.max_bytes_for_level_base = 150 * 1024; + new_options.max_bytes_for_level_multiplier = 4; + ASSERT_OK(OptionChangeMigration(dbname_, old_options, new_options)); + Reopen(new_options); + // Wait for compaction to finish and make sure it can reopen + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + Reopen(new_options); + + { + std::unique_ptr it(db_->NewIterator(ReadOptions())); + it->SeekToFirst(); + for (std::string key : keys) { + ASSERT_TRUE(it->Valid()); + ASSERT_EQ(key, it->key().ToString()); + it->Next(); + } + ASSERT_TRUE(!it->Valid()); + } +} + +INSTANTIATE_TEST_CASE_P( + DBOptionChangeMigrationTests, DBOptionChangeMigrationTests, + ::testing::Values( + std::make_tuple(3 /* old num_levels */, 0 /* old compaction style */, + false /* is dynamic leveling in old option */, + 4 /* old num_levels */, 0 /* new compaction style */, + false /* is dynamic leveling in new option */, + 0 /*fifo max_table_files_size*/), + std::make_tuple(3 /* old num_levels */, 0 /* old compaction style */, + true /* is dynamic leveling in old option */, + 4 /* old num_levels */, 0 /* new compaction style */, + true /* is dynamic leveling in new option */, + 0 /*fifo max_table_files_size*/), + std::make_tuple(3 /* old num_levels */, 0 /* old compaction style */, + true /* is dynamic leveling in old option */, + 4 /* old num_levels */, 0 /* new compaction style */, + false, 0 /*fifo max_table_files_size*/), + std::make_tuple(3 /* old num_levels */, 0 /* old compaction style */, + false /* is dynamic leveling in old option */, + 4 /* old num_levels */, 0 /* new compaction style */, + true /* is dynamic leveling in new option */, + 0 /*fifo max_table_files_size*/), + std::make_tuple(3 /* old num_levels */, 1 /* old compaction style */, + false /* is dynamic leveling in old option */, + 4 /* old num_levels */, 1 /* new compaction style */, + false /* is dynamic leveling in new option */, + 0 /*fifo max_table_files_size*/), + std::make_tuple(1 /* old num_levels */, 1 /* old compaction style */, + false /* is dynamic leveling in old option */, + 4 /* old num_levels */, 1 /* new compaction style */, + false /* is dynamic leveling in new option */, + 0 /*fifo max_table_files_size*/), + std::make_tuple(3 /* old num_levels */, 0 /* old compaction style */, + false /* is dynamic leveling in old option */, + 4 /* old num_levels */, 1 /* new compaction style */, + false /* is dynamic leveling in new option */, + 0 /*fifo max_table_files_size*/), + std::make_tuple(3 /* old num_levels */, 0 /* old compaction style */, + false /* is dynamic leveling in old option */, + 1 /* old num_levels */, 1 /* new compaction style */, + false /* is dynamic leveling in new option */, + 0 /*fifo 
max_table_files_size*/), + std::make_tuple(3 /* old num_levels */, 0 /* old compaction style */, + true /* is dynamic leveling in old option */, + 4 /* old num_levels */, 1 /* new compaction style */, + false /* is dynamic leveling in new option */, + 0 /*fifo max_table_files_size*/), + std::make_tuple(3 /* old num_levels */, 0 /* old compaction style */, + true /* is dynamic leveling in old option */, + 1 /* old num_levels */, 1 /* new compaction style */, + false /* is dynamic leveling in new option */, + 0 /*fifo max_table_files_size*/), + std::make_tuple(1 /* old num_levels */, 1 /* old compaction style */, + false /* is dynamic leveling in old option */, + 4 /* old num_levels */, 0 /* new compaction style */, + false /* is dynamic leveling in new option */, + 0 /*fifo max_table_files_size*/), + std::make_tuple(4 /* old num_levels */, 0 /* old compaction style */, + false /* is dynamic leveling in old option */, + 1 /* old num_levels */, 2 /* new compaction style */, + false /* is dynamic leveling in new option */, + 0 /*fifo max_table_files_size*/), + std::make_tuple(3 /* old num_levels */, 0 /* old compaction style */, + true /* is dynamic leveling in old option */, + 2 /* old num_levels */, 2 /* new compaction style */, + false /* is dynamic leveling in new option */, + 0 /*fifo max_table_files_size*/), + std::make_tuple(3 /* old num_levels */, 1 /* old compaction style */, + false /* is dynamic leveling in old option */, + 3 /* old num_levels */, 2 /* new compaction style */, + false /* is dynamic leveling in new option */, + 0 /*fifo max_table_files_size*/), + std::make_tuple(1 /* old num_levels */, 1 /* old compaction style */, + false /* is dynamic leveling in old option */, + 4 /* old num_levels */, 2 /* new compaction style */, + false /* is dynamic leveling in new option */, 0), + std::make_tuple(4 /* old num_levels */, 0 /* old compaction style */, + false /* is dynamic leveling in old option */, + 1 /* old num_levels */, 2 /* new compaction style */, + false /* is dynamic leveling in new option */, + 5 * 1024 * 1024 /*fifo max_table_files_size*/), + std::make_tuple(3 /* old num_levels */, 0 /* old compaction style */, + true /* is dynamic leveling in old option */, + 2 /* old num_levels */, 2 /* new compaction style */, + false /* is dynamic leveling in new option */, + 5 * 1024 * 1024 /*fifo max_table_files_size*/), + std::make_tuple(3 /* old num_levels */, 1 /* old compaction style */, + false /* is dynamic leveling in old option */, + 3 /* old num_levels */, 2 /* new compaction style */, + false /* is dynamic leveling in new option */, + 5 * 1024 * 1024 /*fifo max_table_files_size*/), + std::make_tuple(1 /* old num_levels */, 1 /* old compaction style */, + false /* is dynamic leveling in old option */, + 4 /* old num_levels */, 2 /* new compaction style */, + false /* is dynamic leveling in new option */, + 5 * 1024 * 1024 /*fifo max_table_files_size*/))); + +class DBOptionChangeMigrationTest : public DBTestBase { + public: + DBOptionChangeMigrationTest() + : DBTestBase("db_option_change_migration_test2", /*env_do_fsync=*/true) {} +}; + +TEST_F(DBOptionChangeMigrationTest, CompactedSrcToUniversal) { + Options old_options = CurrentOptions(); + old_options.compaction_style = CompactionStyle::kCompactionStyleLevel; + old_options.max_compaction_bytes = 200 * 1024; + old_options.level_compaction_dynamic_level_bytes = false; + old_options.level0_file_num_compaction_trigger = 3; + old_options.write_buffer_size = 64 * 1024; + old_options.target_file_size_base = 128 * 1024; + // 
Make level target of L1, L2 to be 200KB and 600KB + old_options.num_levels = 4; + old_options.max_bytes_for_level_multiplier = 3; + old_options.max_bytes_for_level_base = 200 * 1024; + + Reopen(old_options); + Random rnd(301); + for (int num = 0; num < 20; num++) { + for (int i = 0; i < 50; i++) { + ASSERT_OK(Put(Key(num * 100 + i), rnd.RandomString(900))); + } + } + Flush(); + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + + // Will make sure exactly those keys are in the DB after migration. + std::set keys; + { + std::unique_ptr it(db_->NewIterator(ReadOptions())); + it->SeekToFirst(); + for (; it->Valid(); it->Next()) { + keys.insert(it->key().ToString()); + } + } + + Close(); + + Options new_options = old_options; + new_options.compaction_style = CompactionStyle::kCompactionStyleUniversal; + new_options.target_file_size_base = 256 * 1024; + new_options.num_levels = 1; + new_options.max_bytes_for_level_base = 150 * 1024; + new_options.max_bytes_for_level_multiplier = 4; + ASSERT_OK(OptionChangeMigration(dbname_, old_options, new_options)); + Reopen(new_options); + // Wait for compaction to finish and make sure it can reopen + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + Reopen(new_options); + + { + std::unique_ptr it(db_->NewIterator(ReadOptions())); + it->SeekToFirst(); + for (std::string key : keys) { + ASSERT_TRUE(it->Valid()); + ASSERT_EQ(key, it->key().ToString()); + it->Next(); + } + ASSERT_TRUE(!it->Valid()); + ASSERT_OK(it->status()); + } +} + +#endif // ROCKSDB_LITE +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/utilities/options/options_util.cc b/src/rocksdb/utilities/options/options_util.cc new file mode 100644 index 000000000..00c4b981a --- /dev/null +++ b/src/rocksdb/utilities/options/options_util.cc @@ -0,0 +1,159 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
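The options_util helpers defined next locate the newest persisted OPTIONS-&lt;number&gt; file in a DB directory and turn it back into a DBOptions plus per-column-family descriptors. A minimal sketch of the intended round trip, assuming a hypothetical db_path; the function name is illustrative only:

#include <string>
#include <vector>

#include "rocksdb/convenience.h"
#include "rocksdb/db.h"
#include "rocksdb/utilities/options_util.h"

using namespace ROCKSDB_NAMESPACE;

// Reopen an existing DB with the options it was last running with.
Status ReopenWithPersistedOptions(const std::string& db_path,
                                  std::vector<ColumnFamilyHandle*>* handles,
                                  DB** db) {
  ConfigOptions config_options;
  config_options.env = Env::Default();
  config_options.sanity_level = ConfigOptions::kSanityLevelLooselyCompatible;

  DBOptions db_opts;
  std::vector<ColumnFamilyDescriptor> cf_descs;
  Status s = LoadLatestOptions(config_options, db_path, &db_opts, &cf_descs);
  if (!s.ok()) {
    return s;
  }
  // Optional: verify the loaded options are still compatible with the files
  // on disk before opening.
  s = CheckOptionsCompatibility(config_options, db_path, db_opts, cf_descs);
  if (!s.ok()) {
    return s;
  }
  return DB::Open(db_opts, db_path, cf_descs, handles, db);
}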
+ +#ifndef ROCKSDB_LITE + +#include "rocksdb/utilities/options_util.h" + +#include "file/filename.h" +#include "options/options_parser.h" +#include "rocksdb/convenience.h" +#include "rocksdb/options.h" +#include "table/block_based/block_based_table_factory.h" + +namespace ROCKSDB_NAMESPACE { +Status LoadOptionsFromFile(const std::string& file_name, Env* env, + DBOptions* db_options, + std::vector* cf_descs, + bool ignore_unknown_options, + std::shared_ptr* cache) { + ConfigOptions config_options; + config_options.ignore_unknown_options = ignore_unknown_options; + config_options.input_strings_escaped = true; + config_options.env = env; + + return LoadOptionsFromFile(config_options, file_name, db_options, cf_descs, + cache); +} + +Status LoadOptionsFromFile(const ConfigOptions& config_options, + const std::string& file_name, DBOptions* db_options, + std::vector* cf_descs, + std::shared_ptr* cache) { + RocksDBOptionsParser parser; + const auto& fs = config_options.env->GetFileSystem(); + Status s = parser.Parse(config_options, file_name, fs.get()); + if (!s.ok()) { + return s; + } + *db_options = *parser.db_opt(); + const std::vector& cf_names = *parser.cf_names(); + const std::vector& cf_opts = *parser.cf_opts(); + cf_descs->clear(); + for (size_t i = 0; i < cf_opts.size(); ++i) { + cf_descs->push_back({cf_names[i], cf_opts[i]}); + if (cache != nullptr) { + TableFactory* tf = cf_opts[i].table_factory.get(); + if (tf != nullptr) { + auto* opts = tf->GetOptions(); + if (opts != nullptr) { + opts->block_cache = *cache; + } + } + } + } + return Status::OK(); +} + +Status GetLatestOptionsFileName(const std::string& dbpath, Env* env, + std::string* options_file_name) { + Status s; + std::string latest_file_name; + uint64_t latest_time_stamp = 0; + std::vector file_names; + s = env->GetChildren(dbpath, &file_names); + if (s.IsNotFound()) { + return Status::NotFound(Status::kPathNotFound, + "No options files found in the DB directory.", + dbpath); + } else if (!s.ok()) { + return s; + } + for (auto& file_name : file_names) { + uint64_t time_stamp; + FileType type; + if (ParseFileName(file_name, &time_stamp, &type) && type == kOptionsFile) { + if (time_stamp > latest_time_stamp) { + latest_time_stamp = time_stamp; + latest_file_name = file_name; + } + } + } + if (latest_file_name.size() == 0) { + return Status::NotFound(Status::kPathNotFound, + "No options files found in the DB directory.", + dbpath); + } + *options_file_name = latest_file_name; + return Status::OK(); +} + +Status LoadLatestOptions(const std::string& dbpath, Env* env, + DBOptions* db_options, + std::vector* cf_descs, + bool ignore_unknown_options, + std::shared_ptr* cache) { + ConfigOptions config_options; + config_options.ignore_unknown_options = ignore_unknown_options; + config_options.input_strings_escaped = true; + config_options.env = env; + + return LoadLatestOptions(config_options, dbpath, db_options, cf_descs, cache); +} + +Status LoadLatestOptions(const ConfigOptions& config_options, + const std::string& dbpath, DBOptions* db_options, + std::vector* cf_descs, + std::shared_ptr* cache) { + std::string options_file_name; + Status s = + GetLatestOptionsFileName(dbpath, config_options.env, &options_file_name); + if (!s.ok()) { + return s; + } + return LoadOptionsFromFile(config_options, dbpath + "/" + options_file_name, + db_options, cf_descs, cache); +} + +Status CheckOptionsCompatibility( + const std::string& dbpath, Env* env, const DBOptions& db_options, + const std::vector& cf_descs, + bool ignore_unknown_options) { + 
ConfigOptions config_options(db_options); + config_options.sanity_level = ConfigOptions::kSanityLevelLooselyCompatible; + config_options.ignore_unknown_options = ignore_unknown_options; + config_options.input_strings_escaped = true; + config_options.env = env; + return CheckOptionsCompatibility(config_options, dbpath, db_options, + cf_descs); +} + +Status CheckOptionsCompatibility( + const ConfigOptions& config_options, const std::string& dbpath, + const DBOptions& db_options, + const std::vector& cf_descs) { + std::string options_file_name; + Status s = + GetLatestOptionsFileName(dbpath, config_options.env, &options_file_name); + if (!s.ok()) { + return s; + } + + std::vector cf_names; + std::vector cf_opts; + for (const auto& cf_desc : cf_descs) { + cf_names.push_back(cf_desc.name); + cf_opts.push_back(cf_desc.options); + } + + const auto& fs = config_options.env->GetFileSystem(); + + return RocksDBOptionsParser::VerifyRocksDBOptionsFromFile( + config_options, db_options, cf_names, cf_opts, + dbpath + "/" + options_file_name, fs.get()); +} + +} // namespace ROCKSDB_NAMESPACE +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/utilities/options/options_util_test.cc b/src/rocksdb/utilities/options/options_util_test.cc new file mode 100644 index 000000000..1c3b41ff2 --- /dev/null +++ b/src/rocksdb/utilities/options/options_util_test.cc @@ -0,0 +1,779 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "rocksdb/utilities/options_util.h" + +#include +#include +#include + +#include "env/mock_env.h" +#include "file/filename.h" +#include "options/options_parser.h" +#include "rocksdb/convenience.h" +#include "rocksdb/db.h" +#include "rocksdb/table.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/random.h" + +#ifndef GFLAGS +bool FLAGS_enable_print = false; +#else +#include "util/gflags_compat.h" +using GFLAGS_NAMESPACE::ParseCommandLineFlags; +DEFINE_bool(enable_print, false, "Print options generated to console."); +#endif // GFLAGS + +namespace ROCKSDB_NAMESPACE { +class OptionsUtilTest : public testing::Test { + public: + OptionsUtilTest() : rnd_(0xFB) { + env_.reset(NewMemEnv(Env::Default())); + dbname_ = test::PerThreadDBPath("options_util_test"); + } + + protected: + std::unique_ptr env_; + std::string dbname_; + Random rnd_; +}; + +TEST_F(OptionsUtilTest, SaveAndLoad) { + const size_t kCFCount = 5; + + DBOptions db_opt; + std::vector cf_names; + std::vector cf_opts; + test::RandomInitDBOptions(&db_opt, &rnd_); + for (size_t i = 0; i < kCFCount; ++i) { + cf_names.push_back(i == 0 ? 
kDefaultColumnFamilyName + : test::RandomName(&rnd_, 10)); + cf_opts.emplace_back(); + test::RandomInitCFOptions(&cf_opts.back(), db_opt, &rnd_); + } + + const std::string kFileName = "OPTIONS-123456"; + ASSERT_OK(PersistRocksDBOptions(db_opt, cf_names, cf_opts, kFileName, + env_->GetFileSystem().get())); + + DBOptions loaded_db_opt; + std::vector loaded_cf_descs; + ASSERT_OK(LoadOptionsFromFile(kFileName, env_.get(), &loaded_db_opt, + &loaded_cf_descs)); + ConfigOptions exact; + exact.sanity_level = ConfigOptions::kSanityLevelExactMatch; + ASSERT_OK( + RocksDBOptionsParser::VerifyDBOptions(exact, db_opt, loaded_db_opt)); + test::RandomInitDBOptions(&db_opt, &rnd_); + ASSERT_NOK( + RocksDBOptionsParser::VerifyDBOptions(exact, db_opt, loaded_db_opt)); + + for (size_t i = 0; i < kCFCount; ++i) { + ASSERT_EQ(cf_names[i], loaded_cf_descs[i].name); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions( + exact, cf_opts[i], loaded_cf_descs[i].options)); + ASSERT_OK(RocksDBOptionsParser::VerifyTableFactory( + exact, cf_opts[i].table_factory.get(), + loaded_cf_descs[i].options.table_factory.get())); + test::RandomInitCFOptions(&cf_opts[i], db_opt, &rnd_); + ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions( + exact, cf_opts[i], loaded_cf_descs[i].options)); + } + + ASSERT_OK(DestroyDB(dbname_, Options(db_opt, cf_opts[0]))); + for (size_t i = 0; i < kCFCount; ++i) { + if (cf_opts[i].compaction_filter) { + delete cf_opts[i].compaction_filter; + } + } +} + +TEST_F(OptionsUtilTest, SaveAndLoadWithCacheCheck) { + // creating db + DBOptions db_opt; + db_opt.create_if_missing = true; + // initialize BlockBasedTableOptions + std::shared_ptr cache = NewLRUCache(1 * 1024); + BlockBasedTableOptions bbt_opts; + bbt_opts.block_size = 32 * 1024; + // saving cf options + std::vector cf_opts; + ColumnFamilyOptions default_column_family_opt = ColumnFamilyOptions(); + default_column_family_opt.table_factory.reset( + NewBlockBasedTableFactory(bbt_opts)); + cf_opts.push_back(default_column_family_opt); + + ColumnFamilyOptions cf_opt_sample = ColumnFamilyOptions(); + cf_opt_sample.table_factory.reset(NewBlockBasedTableFactory(bbt_opts)); + cf_opts.push_back(cf_opt_sample); + + ColumnFamilyOptions cf_opt_plain_table_opt = ColumnFamilyOptions(); + cf_opt_plain_table_opt.table_factory.reset(NewPlainTableFactory()); + cf_opts.push_back(cf_opt_plain_table_opt); + + std::vector cf_names; + cf_names.push_back(kDefaultColumnFamilyName); + cf_names.push_back("cf_sample"); + cf_names.push_back("cf_plain_table_sample"); + // Saving DB in file + const std::string kFileName = "OPTIONS-LOAD_CACHE_123456"; + ASSERT_OK(PersistRocksDBOptions(db_opt, cf_names, cf_opts, kFileName, + env_->GetFileSystem().get())); + DBOptions loaded_db_opt; + std::vector loaded_cf_descs; + + ConfigOptions config_options; + config_options.ignore_unknown_options = false; + config_options.input_strings_escaped = true; + config_options.env = env_.get(); + ASSERT_OK(LoadOptionsFromFile(config_options, kFileName, &loaded_db_opt, + &loaded_cf_descs, &cache)); + for (size_t i = 0; i < loaded_cf_descs.size(); i++) { + auto* loaded_bbt_opt = + loaded_cf_descs[i] + .options.table_factory->GetOptions(); + // Expect the same cache will be loaded + if (loaded_bbt_opt != nullptr) { + ASSERT_EQ(loaded_bbt_opt->block_cache.get(), cache.get()); + } + } + + // Test the old interface + ASSERT_OK(LoadOptionsFromFile(kFileName, env_.get(), &loaded_db_opt, + &loaded_cf_descs, false, &cache)); + for (size_t i = 0; i < loaded_cf_descs.size(); i++) { + auto* loaded_bbt_opt = + 
loaded_cf_descs[i] + .options.table_factory->GetOptions(); + // Expect the same cache will be loaded + if (loaded_bbt_opt != nullptr) { + ASSERT_EQ(loaded_bbt_opt->block_cache.get(), cache.get()); + } + } + ASSERT_OK(DestroyDB(dbname_, Options(loaded_db_opt, cf_opts[0]))); +} + +namespace { +class DummyTableFactory : public TableFactory { + public: + DummyTableFactory() {} + ~DummyTableFactory() override {} + + const char* Name() const override { return "DummyTableFactory"; } + + using TableFactory::NewTableReader; + Status NewTableReader( + const ReadOptions& /*ro*/, + const TableReaderOptions& /*table_reader_options*/, + std::unique_ptr&& /*file*/, + uint64_t /*file_size*/, std::unique_ptr* /*table_reader*/, + bool /*prefetch_index_and_filter_in_cache*/) const override { + return Status::NotSupported(); + } + + TableBuilder* NewTableBuilder( + const TableBuilderOptions& /*table_builder_options*/, + WritableFileWriter* /*file*/) const override { + return nullptr; + } + + Status ValidateOptions( + const DBOptions& /*db_opts*/, + const ColumnFamilyOptions& /*cf_opts*/) const override { + return Status::NotSupported(); + } + + std::string GetPrintableOptions() const override { return ""; } +}; + +class DummyMergeOperator : public MergeOperator { + public: + DummyMergeOperator() {} + ~DummyMergeOperator() override {} + + bool FullMergeV2(const MergeOperationInput& /*merge_in*/, + MergeOperationOutput* /*merge_out*/) const override { + return false; + } + + bool PartialMergeMulti(const Slice& /*key*/, + const std::deque& /*operand_list*/, + std::string* /*new_value*/, + Logger* /*logger*/) const override { + return false; + } + + const char* Name() const override { return "DummyMergeOperator"; } +}; + +class DummySliceTransform : public SliceTransform { + public: + DummySliceTransform() {} + ~DummySliceTransform() override {} + + // Return the name of this transformation. + const char* Name() const override { return "DummySliceTransform"; } + + // transform a src in domain to a dst in the range + Slice Transform(const Slice& src) const override { return src; } + + // determine whether this is a valid src upon the function applies + bool InDomain(const Slice& /*src*/) const override { return false; } + + // determine whether dst=Transform(src) for some src + bool InRange(const Slice& /*dst*/) const override { return false; } +}; + +} // namespace + +TEST_F(OptionsUtilTest, SanityCheck) { + DBOptions db_opt; + std::vector cf_descs; + const size_t kCFCount = 5; + for (size_t i = 0; i < kCFCount; ++i) { + cf_descs.emplace_back(); + cf_descs.back().name = + (i == 0) ? kDefaultColumnFamilyName : test::RandomName(&rnd_, 10); + + cf_descs.back().options.table_factory.reset(NewBlockBasedTableFactory()); + // Assign non-null values to prefix_extractors except the first cf. + cf_descs.back().options.prefix_extractor.reset( + i != 0 ? 
test::RandomSliceTransform(&rnd_) : nullptr); + cf_descs.back().options.merge_operator.reset( + test::RandomMergeOperator(&rnd_)); + } + + db_opt.create_missing_column_families = true; + db_opt.create_if_missing = true; + + ASSERT_OK(DestroyDB(dbname_, Options(db_opt, cf_descs[0].options))); + DB* db; + std::vector handles; + // open and persist the options + ASSERT_OK(DB::Open(db_opt, dbname_, cf_descs, &handles, &db)); + + // close the db + for (auto* handle : handles) { + delete handle; + } + delete db; + + ConfigOptions config_options; + config_options.ignore_unknown_options = false; + config_options.input_strings_escaped = true; + config_options.sanity_level = ConfigOptions::kSanityLevelLooselyCompatible; + // perform sanity check + ASSERT_OK( + CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs)); + + ASSERT_GE(kCFCount, 5); + // merge operator + { + std::shared_ptr merge_op = + cf_descs[0].options.merge_operator; + + ASSERT_NE(merge_op.get(), nullptr); + cf_descs[0].options.merge_operator.reset(); + ASSERT_NOK( + CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs)); + + cf_descs[0].options.merge_operator.reset(new DummyMergeOperator()); + ASSERT_NOK( + CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs)); + + cf_descs[0].options.merge_operator = merge_op; + ASSERT_OK( + CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs)); + } + + // prefix extractor + { + std::shared_ptr prefix_extractor = + cf_descs[1].options.prefix_extractor; + + // It's okay to set prefix_extractor to nullptr. + ASSERT_NE(prefix_extractor, nullptr); + cf_descs[1].options.prefix_extractor.reset(); + ASSERT_OK( + CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs)); + + cf_descs[1].options.prefix_extractor.reset(new DummySliceTransform()); + ASSERT_OK( + CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs)); + + cf_descs[1].options.prefix_extractor = prefix_extractor; + ASSERT_OK( + CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs)); + } + + // prefix extractor nullptr case + { + std::shared_ptr prefix_extractor = + cf_descs[0].options.prefix_extractor; + + // It's okay to set prefix_extractor to nullptr. 
+ ASSERT_EQ(prefix_extractor, nullptr); + cf_descs[0].options.prefix_extractor.reset(); + ASSERT_OK( + CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs)); + + // It's okay to change prefix_extractor from nullptr to non-nullptr + cf_descs[0].options.prefix_extractor.reset(new DummySliceTransform()); + ASSERT_OK( + CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs)); + + cf_descs[0].options.prefix_extractor = prefix_extractor; + ASSERT_OK( + CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs)); + } + + // comparator + { + test::SimpleSuffixReverseComparator comparator; + + auto* prev_comparator = cf_descs[2].options.comparator; + cf_descs[2].options.comparator = &comparator; + ASSERT_NOK( + CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs)); + + cf_descs[2].options.comparator = prev_comparator; + ASSERT_OK( + CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs)); + } + + // table factory + { + std::shared_ptr table_factory = + cf_descs[3].options.table_factory; + + ASSERT_NE(table_factory, nullptr); + cf_descs[3].options.table_factory.reset(new DummyTableFactory()); + ASSERT_NOK( + CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs)); + + cf_descs[3].options.table_factory = table_factory; + ASSERT_OK( + CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs)); + } + ASSERT_OK(DestroyDB(dbname_, Options(db_opt, cf_descs[0].options))); +} + +TEST_F(OptionsUtilTest, LatestOptionsNotFound) { + std::unique_ptr env(NewMemEnv(Env::Default())); + Status s; + Options options; + ConfigOptions config_opts; + std::vector cf_descs; + + options.env = env.get(); + options.create_if_missing = true; + config_opts.env = options.env; + config_opts.ignore_unknown_options = false; + + std::vector children; + + std::string options_file_name; + ASSERT_OK(DestroyDB(dbname_, options)); + // First, test where the db directory does not exist + ASSERT_NOK(options.env->GetChildren(dbname_, &children)); + + s = GetLatestOptionsFileName(dbname_, options.env, &options_file_name); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_TRUE(s.IsPathNotFound()); + + s = LoadLatestOptions(dbname_, options.env, &options, &cf_descs); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_TRUE(s.IsPathNotFound()); + + s = LoadLatestOptions(config_opts, dbname_, &options, &cf_descs); + ASSERT_TRUE(s.IsPathNotFound()); + + s = GetLatestOptionsFileName(dbname_, options.env, &options_file_name); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_TRUE(s.IsPathNotFound()); + + // Second, test where the db directory exists but is empty + ASSERT_OK(options.env->CreateDir(dbname_)); + + s = GetLatestOptionsFileName(dbname_, options.env, &options_file_name); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_TRUE(s.IsPathNotFound()); + + s = LoadLatestOptions(dbname_, options.env, &options, &cf_descs); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_TRUE(s.IsPathNotFound()); + + // Finally, test where a file exists but is not an "Options" file + std::unique_ptr file; + ASSERT_OK( + options.env->NewWritableFile(dbname_ + "/temp.txt", &file, EnvOptions())); + ASSERT_OK(file->Close()); + s = GetLatestOptionsFileName(dbname_, options.env, &options_file_name); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_TRUE(s.IsPathNotFound()); + + s = LoadLatestOptions(config_opts, dbname_, &options, &cf_descs); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_TRUE(s.IsPathNotFound()); + ASSERT_OK(options.env->DeleteFile(dbname_ + "/temp.txt")); + ASSERT_OK(options.env->DeleteDir(dbname_)); +} + 
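As SaveAndLoadWithCacheCheck above exercises, the optional cache argument of LoadOptionsFromFile/LoadLatestOptions replaces the block cache of every loaded column family that uses a block-based table, so they all end up sharing a single cache. A minimal sketch, assuming a hypothetical db_path and an arbitrary 64 MB cache size:

#include <memory>
#include <string>
#include <vector>

#include "rocksdb/cache.h"
#include "rocksdb/convenience.h"
#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/utilities/options_util.h"

using namespace ROCKSDB_NAMESPACE;

// Load the latest persisted options and point every block-based column family
// at one shared LRU block cache.
Status LoadOptionsWithSharedCache(const std::string& db_path,
                                  DBOptions* db_opts,
                                  std::vector<ColumnFamilyDescriptor>* cf_descs,
                                  std::shared_ptr<Cache>* cache) {
  ConfigOptions config_options;
  config_options.env = Env::Default();
  *cache = NewLRUCache(64 << 20);  // 64 MB; the size is arbitrary here
  // Each loaded BlockBasedTableOptions::block_cache is overwritten with *cache.
  return LoadLatestOptions(config_options, db_path, db_opts, cf_descs, cache);
}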
+TEST_F(OptionsUtilTest, LoadLatestOptions) { + Options options; + options.OptimizeForSmallDb(); + ColumnFamilyDescriptor cf_desc; + ConfigOptions config_opts; + DBOptions db_opts; + std::vector cf_descs; + std::vector handles; + DB* db; + options.create_if_missing = true; + + ASSERT_OK(DestroyDB(dbname_, options)); + + cf_descs.emplace_back(); + cf_descs.back().name = kDefaultColumnFamilyName; + cf_descs.back().options.table_factory.reset(NewBlockBasedTableFactory()); + cf_descs.emplace_back(); + cf_descs.back().name = "Plain"; + cf_descs.back().options.table_factory.reset(NewPlainTableFactory()); + db_opts.create_missing_column_families = true; + db_opts.create_if_missing = true; + + // open and persist the options + ASSERT_OK(DB::Open(db_opts, dbname_, cf_descs, &handles, &db)); + + std::string options_file_name; + std::string new_options_file; + + ASSERT_OK(GetLatestOptionsFileName(dbname_, options.env, &options_file_name)); + ASSERT_OK(LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs)); + ASSERT_EQ(cf_descs.size(), 2U); + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(config_opts, + db->GetDBOptions(), db_opts)); + ASSERT_OK(handles[0]->GetDescriptor(&cf_desc)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_opts, cf_desc.options, + cf_descs[0].options)); + ASSERT_OK(handles[1]->GetDescriptor(&cf_desc)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_opts, cf_desc.options, + cf_descs[1].options)); + + // Now change some of the DBOptions + ASSERT_OK(db->SetDBOptions( + {{"delayed_write_rate", "1234"}, {"bytes_per_sync", "32768"}})); + ASSERT_OK(GetLatestOptionsFileName(dbname_, options.env, &new_options_file)); + ASSERT_NE(options_file_name, new_options_file); + ASSERT_OK(LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs)); + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(config_opts, + db->GetDBOptions(), db_opts)); + options_file_name = new_options_file; + + // Now change some of the ColumnFamilyOptions + ASSERT_OK(db->SetOptions(handles[1], {{"write_buffer_size", "32768"}})); + ASSERT_OK(GetLatestOptionsFileName(dbname_, options.env, &new_options_file)); + ASSERT_NE(options_file_name, new_options_file); + ASSERT_OK(LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs)); + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(config_opts, + db->GetDBOptions(), db_opts)); + ASSERT_OK(handles[0]->GetDescriptor(&cf_desc)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_opts, cf_desc.options, + cf_descs[0].options)); + ASSERT_OK(handles[1]->GetDescriptor(&cf_desc)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_opts, cf_desc.options, + cf_descs[1].options)); + + // close the db + for (auto* handle : handles) { + delete handle; + } + delete db; + ASSERT_OK(DestroyDB(dbname_, options, cf_descs)); +} + +static void WriteOptionsFile(Env* env, const std::string& path, + const std::string& options_file, int major, + int minor, const std::string& db_opts, + const std::string& cf_opts, + const std::string& bbt_opts = "") { + std::string options_file_header = + "\n" + "[Version]\n" + " rocksdb_version=" + + std::to_string(major) + "." 
+ std::to_string(minor) + + ".0\n" + " options_file_version=1\n"; + + std::unique_ptr wf; + ASSERT_OK(env->NewWritableFile(path + "/" + options_file, &wf, EnvOptions())); + ASSERT_OK( + wf->Append(options_file_header + "[ DBOptions ]\n" + db_opts + "\n")); + ASSERT_OK(wf->Append( + "[CFOptions \"default\"] # column family must be specified\n" + + cf_opts + "\n")); + ASSERT_OK(wf->Append("[TableOptions/BlockBasedTable \"default\"]\n" + + bbt_opts + "\n")); + ASSERT_OK(wf->Close()); + + std::string latest_options_file; + ASSERT_OK(GetLatestOptionsFileName(path, env, &latest_options_file)); + ASSERT_EQ(latest_options_file, options_file); +} + +TEST_F(OptionsUtilTest, BadLatestOptions) { + Status s; + ConfigOptions config_opts; + DBOptions db_opts; + std::vector cf_descs; + Options options; + options.env = env_.get(); + config_opts.env = env_.get(); + config_opts.ignore_unknown_options = false; + config_opts.delimiter = "\n"; + + ConfigOptions ignore_opts = config_opts; + ignore_opts.ignore_unknown_options = true; + + std::string options_file_name; + + // Test where the db directory exists but is empty + ASSERT_OK(options.env->CreateDir(dbname_)); + ASSERT_NOK( + GetLatestOptionsFileName(dbname_, options.env, &options_file_name)); + ASSERT_NOK(LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs)); + + // Write an options file for a previous major release with an unknown DB + // Option + WriteOptionsFile(options.env, dbname_, "OPTIONS-0001", ROCKSDB_MAJOR - 1, + ROCKSDB_MINOR, "unknown_db_opt=true", ""); + s = LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + // Even though ignore_unknown_options=true, we still return an error... + s = LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + // Write an options file for a previous minor release with an unknown CF + // Option + WriteOptionsFile(options.env, dbname_, "OPTIONS-0002", ROCKSDB_MAJOR, + ROCKSDB_MINOR - 1, "", "unknown_cf_opt=true"); + s = LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + // Even though ignore_unknown_options=true, we still return an error... + s = LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + // Write an options file for a previous minor release with an unknown BBT + // Option + WriteOptionsFile(options.env, dbname_, "OPTIONS-0003", ROCKSDB_MAJOR, + ROCKSDB_MINOR - 1, "", "", "unknown_bbt_opt=true"); + s = LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + // Even though ignore_unknown_options=true, we still return an error... + s = LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + + // Write an options file for the current release with an unknown DB Option + WriteOptionsFile(options.env, dbname_, "OPTIONS-0004", ROCKSDB_MAJOR, + ROCKSDB_MINOR, "unknown_db_opt=true", ""); + s = LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + // Even though ignore_unknown_options=true, we still return an error... 
+ s = LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + + // Write an options file for the current release with an unknown CF Option + WriteOptionsFile(options.env, dbname_, "OPTIONS-0005", ROCKSDB_MAJOR, + ROCKSDB_MINOR, "", "unknown_cf_opt=true"); + s = LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + // Even though ignore_unknown_options=true, we still return an error... + s = LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + + // Write an options file for the current release with an invalid DB Option + WriteOptionsFile(options.env, dbname_, "OPTIONS-0006", ROCKSDB_MAJOR, + ROCKSDB_MINOR, "create_if_missing=hello", ""); + s = LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + // Even though ignore_unknown_options=true, we still return an error... + s = LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + + // Write an options file for the next release with an invalid DB Option + WriteOptionsFile(options.env, dbname_, "OPTIONS-0007", ROCKSDB_MAJOR, + ROCKSDB_MINOR + 1, "create_if_missing=hello", ""); + ASSERT_NOK(LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs)); + ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs)); + + // Write an options file for the next release with an unknown DB Option + WriteOptionsFile(options.env, dbname_, "OPTIONS-0008", ROCKSDB_MAJOR, + ROCKSDB_MINOR + 1, "unknown_db_opt=true", ""); + ASSERT_NOK(LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs)); + // Ignore the errors for future releases when ignore_unknown_options=true + ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs)); + + // Write an options file for the next major release with an unknown CF Option + WriteOptionsFile(options.env, dbname_, "OPTIONS-0009", ROCKSDB_MAJOR + 1, + ROCKSDB_MINOR, "", "unknown_cf_opt=true"); + ASSERT_NOK(LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs)); + // Ignore the errors for future releases when ignore_unknown_options=true + ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs)); +} + +TEST_F(OptionsUtilTest, RenameDatabaseDirectory) { + DB* db; + Options options; + DBOptions db_opts; + std::vector cf_descs; + std::vector handles; + + options.create_if_missing = true; + + ASSERT_OK(DB::Open(options, dbname_, &db)); + ASSERT_OK(db->Put(WriteOptions(), "foo", "value0")); + delete db; + + auto new_dbname = dbname_ + "_2"; + + ASSERT_OK(options.env->RenameFile(dbname_, new_dbname)); + ASSERT_OK(LoadLatestOptions(new_dbname, options.env, &db_opts, &cf_descs)); + ASSERT_EQ(cf_descs.size(), 1U); + + db_opts.create_if_missing = false; + ASSERT_OK(DB::Open(db_opts, new_dbname, cf_descs, &handles, &db)); + std::string value; + ASSERT_OK(db->Get(ReadOptions(), "foo", &value)); + ASSERT_EQ("value0", value); + // close the db + for (auto* handle : handles) { + delete handle; + } + delete db; + Options new_options(db_opts, cf_descs[0].options); + ASSERT_OK(DestroyDB(new_dbname, new_options, cf_descs)); + ASSERT_OK(DestroyDB(dbname_, options)); +} + +TEST_F(OptionsUtilTest, WalDirSettings) { + DB* db; + Options options; + DBOptions db_opts; + std::vector cf_descs; + std::vector handles; + + options.create_if_missing = true; + + // Open a DB with no wal dir set. 
The wal_dir should stay empty + ASSERT_OK(DB::Open(options, dbname_, &db)); + delete db; + ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs)); + ASSERT_EQ(db_opts.wal_dir, ""); + + // Open a DB with wal_dir == dbname. The wal_dir should be set to empty + options.wal_dir = dbname_; + ASSERT_OK(DB::Open(options, dbname_, &db)); + delete db; + ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs)); + ASSERT_EQ(db_opts.wal_dir, ""); + + // Open a DB with no wal_dir but a db_path==dbname_. The wal_dir should be + // empty + options.wal_dir = ""; + options.db_paths.emplace_back(dbname_, std::numeric_limits::max()); + ASSERT_OK(DB::Open(options, dbname_, &db)); + delete db; + ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs)); + ASSERT_EQ(db_opts.wal_dir, ""); + + // Open a DB with no wal_dir==dbname_ and db_path==dbname_. The wal_dir + // should be empty + options.wal_dir = dbname_ + "/"; + options.db_paths.emplace_back(dbname_, std::numeric_limits::max()); + ASSERT_OK(DB::Open(options, dbname_, &db)); + delete db; + ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs)); + ASSERT_EQ(db_opts.wal_dir, ""); + ASSERT_OK(DestroyDB(dbname_, options)); + + // Open a DB with no wal_dir but db_path != db_name. The wal_dir == dbname_ + options.wal_dir = ""; + options.db_paths.clear(); + options.db_paths.emplace_back(dbname_ + "_0", + std::numeric_limits::max()); + ASSERT_OK(DB::Open(options, dbname_, &db)); + delete db; + ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs)); + ASSERT_EQ(db_opts.wal_dir, dbname_); + ASSERT_OK(DestroyDB(dbname_, options)); + + // Open a DB with wal_dir != db_name. The wal_dir remains unchanged + options.wal_dir = dbname_ + "/wal"; + options.db_paths.clear(); + ASSERT_OK(DB::Open(options, dbname_, &db)); + delete db; + ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs)); + ASSERT_EQ(db_opts.wal_dir, dbname_ + "/wal"); + ASSERT_OK(DestroyDB(dbname_, options)); +} + +TEST_F(OptionsUtilTest, WalDirInOptins) { + DB* db; + Options options; + DBOptions db_opts; + std::vector cf_descs; + std::vector handles; + + // Store an options file with wal_dir=dbname_ and make sure it still loads + // when the input wal_dir is empty + options.create_if_missing = true; + options.wal_dir = ""; + ASSERT_OK(DB::Open(options, dbname_, &db)); + delete db; + options.wal_dir = dbname_; + std::string options_file; + ASSERT_OK(GetLatestOptionsFileName(dbname_, options.env, &options_file)); + ASSERT_OK(PersistRocksDBOptions(options, {"default"}, {options}, + dbname_ + "/" + options_file, + options.env->GetFileSystem().get())); + ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs)); + ASSERT_EQ(db_opts.wal_dir, dbname_); + options.wal_dir = ""; + ASSERT_OK(DB::Open(options, dbname_, &db)); + delete db; + ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs)); + ASSERT_EQ(db_opts.wal_dir, ""); +} +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); +#ifdef GFLAGS + ParseCommandLineFlags(&argc, &argv, true); +#endif // GFLAGS + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + printf("Skipped in RocksDBLite as utilities are not supported.\n"); + return 0; +} +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/utilities/persistent_cache/block_cache_tier.cc 
b/src/rocksdb/utilities/persistent_cache/block_cache_tier.cc new file mode 100644 index 000000000..8ad9bb1b1 --- /dev/null +++ b/src/rocksdb/utilities/persistent_cache/block_cache_tier.cc @@ -0,0 +1,422 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#ifndef ROCKSDB_LITE + +#include "utilities/persistent_cache/block_cache_tier.h" + +#include +#include + +#include "logging/logging.h" +#include "port/port.h" +#include "test_util/sync_point.h" +#include "util/stop_watch.h" +#include "utilities/persistent_cache/block_cache_tier_file.h" + +namespace ROCKSDB_NAMESPACE { + +// +// BlockCacheImpl +// +Status BlockCacheTier::Open() { + Status status; + + WriteLock _(&lock_); + + assert(!size_); + + // Check the validity of the options + status = opt_.ValidateSettings(); + assert(status.ok()); + if (!status.ok()) { + Error(opt_.log, "Invalid block cache options"); + return status; + } + + // Create base directory or cleanup existing directory + status = opt_.env->CreateDirIfMissing(opt_.path); + if (!status.ok()) { + Error(opt_.log, "Error creating directory %s. %s", opt_.path.c_str(), + status.ToString().c_str()); + return status; + } + + // Create base/ directory + status = opt_.env->CreateDir(GetCachePath()); + if (!status.ok()) { + // directory already exists, clean it up + status = CleanupCacheFolder(GetCachePath()); + assert(status.ok()); + if (!status.ok()) { + Error(opt_.log, "Error creating directory %s. %s", opt_.path.c_str(), + status.ToString().c_str()); + return status; + } + } + + // create a new file + assert(!cache_file_); + status = NewCacheFile(); + if (!status.ok()) { + Error(opt_.log, "Error creating new file %s. %s", opt_.path.c_str(), + status.ToString().c_str()); + return status; + } + + assert(cache_file_); + + if (opt_.pipeline_writes) { + assert(!insert_th_.joinable()); + insert_th_ = port::Thread(&BlockCacheTier::InsertMain, this); + } + + return Status::OK(); +} + +bool IsCacheFile(const std::string& file) { + // check if the file has .rc suffix + // Unfortunately regex support across compilers is not even, so we use simple + // string parsing + size_t pos = file.find("."); + if (pos == std::string::npos) { + return false; + } + + std::string suffix = file.substr(pos); + return suffix == ".rc"; +} + +Status BlockCacheTier::CleanupCacheFolder(const std::string& folder) { + std::vector files; + Status status = opt_.env->GetChildren(folder, &files); + if (!status.ok()) { + Error(opt_.log, "Error getting files for %s. %s", folder.c_str(), + status.ToString().c_str()); + return status; + } + + // cleanup files with the patter :digi:.rc + for (auto file : files) { + if (IsCacheFile(file)) { + // cache file + Info(opt_.log, "Removing file %s.", file.c_str()); + status = opt_.env->DeleteFile(folder + "/" + file); + if (!status.ok()) { + Error(opt_.log, "Error deleting file %s. 
%s", file.c_str(), + status.ToString().c_str()); + return status; + } + } else { + ROCKS_LOG_DEBUG(opt_.log, "Skipping file %s", file.c_str()); + } + } + return Status::OK(); +} + +Status BlockCacheTier::Close() { + // stop the insert thread + if (opt_.pipeline_writes && insert_th_.joinable()) { + InsertOp op(/*quit=*/true); + insert_ops_.Push(std::move(op)); + insert_th_.join(); + } + + // stop the writer before + writer_.Stop(); + + // clear all metadata + WriteLock _(&lock_); + metadata_.Clear(); + return Status::OK(); +} + +template +void Add(std::map* stats, const std::string& key, + const T& t) { + stats->insert({key, static_cast(t)}); +} + +PersistentCache::StatsType BlockCacheTier::Stats() { + std::map stats; + Add(&stats, "persistentcache.blockcachetier.bytes_piplined", + stats_.bytes_pipelined_.Average()); + Add(&stats, "persistentcache.blockcachetier.bytes_written", + stats_.bytes_written_.Average()); + Add(&stats, "persistentcache.blockcachetier.bytes_read", + stats_.bytes_read_.Average()); + Add(&stats, "persistentcache.blockcachetier.insert_dropped", + stats_.insert_dropped_); + Add(&stats, "persistentcache.blockcachetier.cache_hits", stats_.cache_hits_); + Add(&stats, "persistentcache.blockcachetier.cache_misses", + stats_.cache_misses_); + Add(&stats, "persistentcache.blockcachetier.cache_errors", + stats_.cache_errors_); + Add(&stats, "persistentcache.blockcachetier.cache_hits_pct", + stats_.CacheHitPct()); + Add(&stats, "persistentcache.blockcachetier.cache_misses_pct", + stats_.CacheMissPct()); + Add(&stats, "persistentcache.blockcachetier.read_hit_latency", + stats_.read_hit_latency_.Average()); + Add(&stats, "persistentcache.blockcachetier.read_miss_latency", + stats_.read_miss_latency_.Average()); + Add(&stats, "persistentcache.blockcachetier.write_latency", + stats_.write_latency_.Average()); + + auto out = PersistentCacheTier::Stats(); + out.push_back(stats); + return out; +} + +Status BlockCacheTier::Insert(const Slice& key, const char* data, + const size_t size) { + // update stats + stats_.bytes_pipelined_.Add(size); + + if (opt_.pipeline_writes) { + // off load the write to the write thread + insert_ops_.Push( + InsertOp(key.ToString(), std::move(std::string(data, size)))); + return Status::OK(); + } + + assert(!opt_.pipeline_writes); + return InsertImpl(key, Slice(data, size)); +} + +void BlockCacheTier::InsertMain() { + while (true) { + InsertOp op(insert_ops_.Pop()); + + if (op.signal_) { + // that is a secret signal to exit + break; + } + + size_t retry = 0; + Status s; + while ((s = InsertImpl(Slice(op.key_), Slice(op.data_))).IsTryAgain()) { + if (retry > kMaxRetry) { + break; + } + + // this can happen when the buffers are full, we wait till some buffers + // are free. Why don't we wait inside the code. 
This is because we want + // to support both pipelined and non-pipelined mode + buffer_allocator_.WaitUntilUsable(); + retry++; + } + + if (!s.ok()) { + stats_.insert_dropped_++; + } + } +} + +Status BlockCacheTier::InsertImpl(const Slice& key, const Slice& data) { + // pre-condition + assert(key.size()); + assert(data.size()); + assert(cache_file_); + + StopWatchNano timer(opt_.clock, /*auto_start=*/true); + + WriteLock _(&lock_); + + LBA lba; + if (metadata_.Lookup(key, &lba)) { + // the key already exists, this is duplicate insert + return Status::OK(); + } + + while (!cache_file_->Append(key, data, &lba)) { + if (!cache_file_->Eof()) { + ROCKS_LOG_DEBUG(opt_.log, "Error inserting to cache file %d", + cache_file_->cacheid()); + stats_.write_latency_.Add(timer.ElapsedNanos() / 1000); + return Status::TryAgain(); + } + + assert(cache_file_->Eof()); + Status status = NewCacheFile(); + if (!status.ok()) { + return status; + } + } + + // Insert into lookup index + BlockInfo* info = metadata_.Insert(key, lba); + assert(info); + if (!info) { + return Status::IOError("Unexpected error inserting to index"); + } + + // insert to cache file reverse mapping + cache_file_->Add(info); + + // update stats + stats_.bytes_written_.Add(data.size()); + stats_.write_latency_.Add(timer.ElapsedNanos() / 1000); + return Status::OK(); +} + +Status BlockCacheTier::Lookup(const Slice& key, std::unique_ptr* val, + size_t* size) { + StopWatchNano timer(opt_.clock, /*auto_start=*/true); + + LBA lba; + bool status; + status = metadata_.Lookup(key, &lba); + if (!status) { + stats_.cache_misses_++; + stats_.read_miss_latency_.Add(timer.ElapsedNanos() / 1000); + return Status::NotFound("blockcache: key not found"); + } + + BlockCacheFile* const file = metadata_.Lookup(lba.cache_id_); + if (!file) { + // this can happen because the block index and cache file index are + // different, and the cache file might be removed between the two lookups + stats_.cache_misses_++; + stats_.read_miss_latency_.Add(timer.ElapsedNanos() / 1000); + return Status::NotFound("blockcache: cache file not found"); + } + + assert(file->refs_); + + std::unique_ptr scratch(new char[lba.size_]); + Slice blk_key; + Slice blk_val; + + status = file->Read(lba, &blk_key, &blk_val, scratch.get()); + --file->refs_; + if (!status) { + stats_.cache_misses_++; + stats_.cache_errors_++; + stats_.read_miss_latency_.Add(timer.ElapsedNanos() / 1000); + return Status::NotFound("blockcache: error reading data"); + } + + assert(blk_key == key); + + val->reset(new char[blk_val.size()]); + memcpy(val->get(), blk_val.data(), blk_val.size()); + *size = blk_val.size(); + + stats_.bytes_read_.Add(*size); + stats_.cache_hits_++; + stats_.read_hit_latency_.Add(timer.ElapsedNanos() / 1000); + + return Status::OK(); +} + +bool BlockCacheTier::Erase(const Slice& key) { + WriteLock _(&lock_); + BlockInfo* info = metadata_.Remove(key); + assert(info); + delete info; + return true; +} + +Status BlockCacheTier::NewCacheFile() { + lock_.AssertHeld(); + + TEST_SYNC_POINT_CALLBACK("BlockCacheTier::NewCacheFile:DeleteDir", + (void*)(GetCachePath().c_str())); + + std::unique_ptr f(new WriteableCacheFile( + opt_.env, &buffer_allocator_, &writer_, GetCachePath(), writer_cache_id_, + opt_.cache_file_size, opt_.log)); + + bool status = f->Create(opt_.enable_direct_writes, opt_.enable_direct_reads); + if (!status) { + return Status::IOError("Error creating file"); + } + + Info(opt_.log, "Created cache file %d", writer_cache_id_); + + writer_cache_id_++; + cache_file_ = f.release(); + + 
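+  // From here on the released pointer is owned by the cache-file index: it is
+  // deleted when the file is evicted in Reserve() or when Close() clears the
+  // metadata.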
// insert to cache files tree + status = metadata_.Insert(cache_file_); + assert(status); + if (!status) { + Error(opt_.log, "Error inserting to metadata"); + return Status::IOError("Error inserting to metadata"); + } + + return Status::OK(); +} + +bool BlockCacheTier::Reserve(const size_t size) { + WriteLock _(&lock_); + assert(size_ <= opt_.cache_size); + + if (size + size_ <= opt_.cache_size) { + // there is enough space to write + size_ += size; + return true; + } + + assert(size + size_ >= opt_.cache_size); + // there is not enough space to fit the requested data + // we can clear some space by evicting cold data + + const double retain_fac = (100 - kEvictPct) / static_cast(100); + while (size + size_ > opt_.cache_size * retain_fac) { + std::unique_ptr f(metadata_.Evict()); + if (!f) { + // nothing is evictable + return false; + } + assert(!f->refs_); + uint64_t file_size; + if (!f->Delete(&file_size).ok()) { + // unable to delete file + return false; + } + + assert(file_size <= size_); + size_ -= file_size; + } + + size_ += size; + assert(size_ <= opt_.cache_size * 0.9); + return true; +} + +Status NewPersistentCache(Env* const env, const std::string& path, + const uint64_t size, + const std::shared_ptr& log, + const bool optimized_for_nvm, + std::shared_ptr* cache) { + if (!cache) { + return Status::IOError("invalid argument cache"); + } + + auto opt = PersistentCacheConfig(env, path, size, log); + if (optimized_for_nvm) { + // the default settings are optimized for SSD + // NVM devices are better accessed with 4K direct IO and written with + // parallelism + opt.enable_direct_writes = true; + opt.writer_qdepth = 4; + opt.writer_dispatch_size = 4 * 1024; + } + + auto pcache = std::make_shared(opt); + Status s = pcache->Open(); + + if (!s.ok()) { + return s; + } + + *cache = pcache; + return s; +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // ifndef ROCKSDB_LITE diff --git a/src/rocksdb/utilities/persistent_cache/block_cache_tier.h b/src/rocksdb/utilities/persistent_cache/block_cache_tier.h new file mode 100644 index 000000000..1aac287cc --- /dev/null +++ b/src/rocksdb/utilities/persistent_cache/block_cache_tier.h @@ -0,0 +1,156 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#ifndef ROCKSDB_LITE + +#ifndef OS_WIN +#include +#endif // ! 
OS_WIN + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "memory/arena.h" +#include "memtable/skiplist.h" +#include "monitoring/histogram.h" +#include "port/port.h" +#include "rocksdb/cache.h" +#include "rocksdb/comparator.h" +#include "rocksdb/persistent_cache.h" +#include "rocksdb/system_clock.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/mutexlock.h" +#include "utilities/persistent_cache/block_cache_tier_file.h" +#include "utilities/persistent_cache/block_cache_tier_metadata.h" +#include "utilities/persistent_cache/persistent_cache_util.h" + +namespace ROCKSDB_NAMESPACE { + +// +// Block cache tier implementation +// +class BlockCacheTier : public PersistentCacheTier { + public: + explicit BlockCacheTier(const PersistentCacheConfig& opt) + : opt_(opt), + insert_ops_(static_cast(opt_.max_write_pipeline_backlog_size)), + buffer_allocator_(opt.write_buffer_size, opt.write_buffer_count()), + writer_(this, opt_.writer_qdepth, + static_cast(opt_.writer_dispatch_size)) { + Info(opt_.log, "Initializing allocator. size=%d B count=%" ROCKSDB_PRIszt, + opt_.write_buffer_size, opt_.write_buffer_count()); + } + + virtual ~BlockCacheTier() { + // Close is re-entrant so we can call close even if it is already closed + Close().PermitUncheckedError(); + assert(!insert_th_.joinable()); + } + + Status Insert(const Slice& key, const char* data, const size_t size) override; + Status Lookup(const Slice& key, std::unique_ptr* data, + size_t* size) override; + Status Open() override; + Status Close() override; + bool Erase(const Slice& key) override; + bool Reserve(const size_t size) override; + + bool IsCompressed() override { return opt_.is_compressed; } + + std::string GetPrintableOptions() const override { return opt_.ToString(); } + + PersistentCache::StatsType Stats() override; + + void TEST_Flush() override { + while (insert_ops_.Size()) { + /* sleep override */ + SystemClock::Default()->SleepForMicroseconds(1000000); + } + } + + private: + // Percentage of cache to be evicted when the cache is full + static const size_t kEvictPct = 10; + // Max attempts to insert key, value to cache in pipelined mode + static const size_t kMaxRetry = 3; + + // Pipelined operation + struct InsertOp { + explicit InsertOp(const bool signal) : signal_(signal) {} + explicit InsertOp(std::string&& key, const std::string& data) + : key_(std::move(key)), data_(data) {} + ~InsertOp() {} + + InsertOp() = delete; + InsertOp(InsertOp&& /*rhs*/) = default; + InsertOp& operator=(InsertOp&& rhs) = default; + + // used for estimating size by bounded queue + size_t Size() { return data_.size() + key_.size(); } + + std::string key_; + std::string data_; + bool signal_ = false; // signal to request processing thread to exit + }; + + // entry point for insert thread + void InsertMain(); + // insert implementation + Status InsertImpl(const Slice& key, const Slice& data); + // Create a new cache file + Status NewCacheFile(); + // Get cache directory path + std::string GetCachePath() const { return opt_.path + "/cache"; } + // Cleanup folder + Status CleanupCacheFolder(const std::string& folder); + + // Statistics + struct Statistics { + HistogramImpl bytes_pipelined_; + HistogramImpl bytes_written_; + HistogramImpl bytes_read_; + HistogramImpl read_hit_latency_; + HistogramImpl read_miss_latency_; + HistogramImpl write_latency_; + std::atomic cache_hits_{0}; + std::atomic cache_misses_{0}; + std::atomic cache_errors_{0}; + std::atomic insert_dropped_{0}; + + double 
CacheHitPct() const { + const auto lookups = cache_hits_ + cache_misses_; + return lookups ? 100 * cache_hits_ / static_cast(lookups) : 0.0; + } + + double CacheMissPct() const { + const auto lookups = cache_hits_ + cache_misses_; + return lookups ? 100 * cache_misses_ / static_cast(lookups) : 0.0; + } + }; + + port::RWMutex lock_; // Synchronization + const PersistentCacheConfig opt_; // BlockCache options + BoundedQueue insert_ops_; // Ops waiting for insert + ROCKSDB_NAMESPACE::port::Thread insert_th_; // Insert thread + uint32_t writer_cache_id_ = 0; // Current cache file identifier + WriteableCacheFile* cache_file_ = nullptr; // Current cache file reference + CacheWriteBufferAllocator buffer_allocator_; // Buffer provider + ThreadedWriter writer_; // Writer threads + BlockCacheTierMetadata metadata_; // Cache meta data manager + std::atomic size_{0}; // Size of the cache + Statistics stats_; // Statistics +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git a/src/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc b/src/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc new file mode 100644 index 000000000..f4f8517ab --- /dev/null +++ b/src/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc @@ -0,0 +1,610 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#ifndef ROCKSDB_LITE + +#include "utilities/persistent_cache/block_cache_tier_file.h" + +#ifndef OS_WIN +#include +#endif +#include +#include +#include + +#include "env/composite_env_wrapper.h" +#include "logging/logging.h" +#include "port/port.h" +#include "rocksdb/system_clock.h" +#include "util/crc32c.h" + +namespace ROCKSDB_NAMESPACE { + +// +// File creation factories +// +Status NewWritableCacheFile(Env* const env, const std::string& filepath, + std::unique_ptr* file, + const bool use_direct_writes = false) { + EnvOptions opt; + opt.use_direct_writes = use_direct_writes; + Status s = env->NewWritableFile(filepath, file, opt); + return s; +} + +Status NewRandomAccessCacheFile(const std::shared_ptr& fs, + const std::string& filepath, + std::unique_ptr* file, + const bool use_direct_reads = true) { + assert(fs.get()); + + FileOptions opt; + opt.use_direct_reads = use_direct_reads; + return fs->NewRandomAccessFile(filepath, opt, file, nullptr); +} + +// +// BlockCacheFile +// +Status BlockCacheFile::Delete(uint64_t* size) { + assert(env_); + + Status status = env_->GetFileSize(Path(), size); + if (!status.ok()) { + return status; + } + return env_->DeleteFile(Path()); +} + +// +// CacheRecord +// +// Cache record represents the record on disk +// +// +--------+---------+----------+------------+---------------+-------------+ +// | magic | crc | key size | value size | key data | value data | +// +--------+---------+----------+------------+---------------+-------------+ +// <-- 4 --><-- 4 --><-- 4 --><-- 4 --><-- key size --><-- v-size --> +// +struct CacheRecordHeader { + CacheRecordHeader() : magic_(0), crc_(0), key_size_(0), val_size_(0) {} + CacheRecordHeader(const uint32_t magic, const uint32_t key_size, + const uint32_t val_size) + : magic_(magic), crc_(0), key_size_(key_size), val_size_(val_size) {} + + uint32_t magic_; + uint32_t crc_; + uint32_t key_size_; + uint32_t val_size_; +}; + +struct CacheRecord { + CacheRecord() {} + CacheRecord(const Slice& key, const Slice& val) + : hdr_(MAGIC, 
static_cast(key.size()), + static_cast(val.size())), + key_(key), + val_(val) { + hdr_.crc_ = ComputeCRC(); + } + + uint32_t ComputeCRC() const; + bool Serialize(std::vector* bufs, size_t* woff); + bool Deserialize(const Slice& buf); + + static uint32_t CalcSize(const Slice& key, const Slice& val) { + return static_cast(sizeof(CacheRecordHeader) + key.size() + + val.size()); + } + + static const uint32_t MAGIC = 0xfefa; + + bool Append(std::vector* bufs, size_t* woff, + const char* data, const size_t size); + + CacheRecordHeader hdr_; + Slice key_; + Slice val_; +}; + +static_assert(sizeof(CacheRecordHeader) == 16, "DataHeader is not aligned"); + +uint32_t CacheRecord::ComputeCRC() const { + uint32_t crc = 0; + CacheRecordHeader tmp = hdr_; + tmp.crc_ = 0; + crc = crc32c::Extend(crc, reinterpret_cast(&tmp), sizeof(tmp)); + crc = crc32c::Extend(crc, reinterpret_cast(key_.data()), + key_.size()); + crc = crc32c::Extend(crc, reinterpret_cast(val_.data()), + val_.size()); + return crc; +} + +bool CacheRecord::Serialize(std::vector* bufs, + size_t* woff) { + assert(bufs->size()); + return Append(bufs, woff, reinterpret_cast(&hdr_), + sizeof(hdr_)) && + Append(bufs, woff, reinterpret_cast(key_.data()), + key_.size()) && + Append(bufs, woff, reinterpret_cast(val_.data()), + val_.size()); +} + +bool CacheRecord::Append(std::vector* bufs, size_t* woff, + const char* data, const size_t data_size) { + assert(*woff < bufs->size()); + + const char* p = data; + size_t size = data_size; + + while (size && *woff < bufs->size()) { + CacheWriteBuffer* buf = (*bufs)[*woff]; + const size_t free = buf->Free(); + if (size <= free) { + buf->Append(p, size); + size = 0; + } else { + buf->Append(p, free); + p += free; + size -= free; + assert(!buf->Free()); + assert(buf->Used() == buf->Capacity()); + } + + if (!buf->Free()) { + *woff += 1; + } + } + + assert(!size); + + return !size; +} + +bool CacheRecord::Deserialize(const Slice& data) { + assert(data.size() >= sizeof(CacheRecordHeader)); + if (data.size() < sizeof(CacheRecordHeader)) { + return false; + } + + memcpy(&hdr_, data.data(), sizeof(hdr_)); + + assert(hdr_.key_size_ + hdr_.val_size_ + sizeof(hdr_) == data.size()); + if (hdr_.key_size_ + hdr_.val_size_ + sizeof(hdr_) != data.size()) { + return false; + } + + key_ = Slice(data.data_ + sizeof(hdr_), hdr_.key_size_); + val_ = Slice(key_.data_ + hdr_.key_size_, hdr_.val_size_); + + if (!(hdr_.magic_ == MAGIC && ComputeCRC() == hdr_.crc_)) { + fprintf(stderr, "** magic %d ** \n", hdr_.magic_); + fprintf(stderr, "** key_size %d ** \n", hdr_.key_size_); + fprintf(stderr, "** val_size %d ** \n", hdr_.val_size_); + fprintf(stderr, "** key %s ** \n", key_.ToString().c_str()); + fprintf(stderr, "** val %s ** \n", val_.ToString().c_str()); + for (size_t i = 0; i < hdr_.val_size_; ++i) { + fprintf(stderr, "%d.", (uint8_t)val_.data()[i]); + } + fprintf(stderr, "\n** cksum %d != %d **", hdr_.crc_, ComputeCRC()); + } + + assert(hdr_.magic_ == MAGIC && ComputeCRC() == hdr_.crc_); + return hdr_.magic_ == MAGIC && ComputeCRC() == hdr_.crc_; +} + +// +// RandomAccessFile +// + +bool RandomAccessCacheFile::Open(const bool enable_direct_reads) { + WriteLock _(&rwlock_); + return OpenImpl(enable_direct_reads); +} + +bool RandomAccessCacheFile::OpenImpl(const bool enable_direct_reads) { + rwlock_.AssertHeld(); + + ROCKS_LOG_DEBUG(log_, "Opening cache file %s", Path().c_str()); + assert(env_); + + std::unique_ptr file; + Status status = NewRandomAccessCacheFile(env_->GetFileSystem(), Path(), &file, + enable_direct_reads); + if 
(!status.ok()) { + Error(log_, "Error opening random access file %s. %s", Path().c_str(), + status.ToString().c_str()); + return false; + } + freader_.reset(new RandomAccessFileReader(std::move(file), Path(), + env_->GetSystemClock().get())); + + return true; +} + +bool RandomAccessCacheFile::Read(const LBA& lba, Slice* key, Slice* val, + char* scratch) { + ReadLock _(&rwlock_); + + assert(lba.cache_id_ == cache_id_); + + if (!freader_) { + return false; + } + + Slice result; + Status s = freader_->Read(IOOptions(), lba.off_, lba.size_, &result, scratch, + nullptr, Env::IO_TOTAL /* rate_limiter_priority */); + if (!s.ok()) { + Error(log_, "Error reading from file %s. %s", Path().c_str(), + s.ToString().c_str()); + return false; + } + + assert(result.data() == scratch); + + return ParseRec(lba, key, val, scratch); +} + +bool RandomAccessCacheFile::ParseRec(const LBA& lba, Slice* key, Slice* val, + char* scratch) { + Slice data(scratch, lba.size_); + + CacheRecord rec; + if (!rec.Deserialize(data)) { + assert(!"Error deserializing data"); + Error(log_, "Error de-serializing record from file %s off %d", + Path().c_str(), lba.off_); + return false; + } + + *key = Slice(rec.key_); + *val = Slice(rec.val_); + + return true; +} + +// +// WriteableCacheFile +// + +WriteableCacheFile::~WriteableCacheFile() { + WriteLock _(&rwlock_); + if (!eof_) { + // This file never flushed. We give priority to shutdown since this is a + // cache + // TODO(krad): Figure a way to flush the pending data + if (file_) { + assert(refs_ == 1); + --refs_; + } + } + assert(!refs_); + ClearBuffers(); +} + +bool WriteableCacheFile::Create(const bool /*enable_direct_writes*/, + const bool enable_direct_reads) { + WriteLock _(&rwlock_); + + enable_direct_reads_ = enable_direct_reads; + + ROCKS_LOG_DEBUG(log_, "Creating new cache %s (max size is %d B)", + Path().c_str(), max_size_); + + assert(env_); + + Status s = env_->FileExists(Path()); + if (s.ok()) { + ROCKS_LOG_WARN(log_, "File %s already exists. %s", Path().c_str(), + s.ToString().c_str()); + } + + s = NewWritableCacheFile(env_, Path(), &file_); + if (!s.ok()) { + ROCKS_LOG_WARN(log_, "Unable to create file %s. %s", Path().c_str(), + s.ToString().c_str()); + return false; + } + + assert(!refs_); + ++refs_; + + return true; +} + +bool WriteableCacheFile::Append(const Slice& key, const Slice& val, LBA* lba) { + WriteLock _(&rwlock_); + + if (eof_) { + // We can't append since the file is full + return false; + } + + // estimate the space required to store the (key, val) + uint32_t rec_size = CacheRecord::CalcSize(key, val); + + if (!ExpandBuffer(rec_size)) { + // unable to expand the buffer + ROCKS_LOG_DEBUG(log_, "Error expanding buffers. 
size=%d", rec_size); + return false; + } + + lba->cache_id_ = cache_id_; + lba->off_ = disk_woff_; + lba->size_ = rec_size; + + CacheRecord rec(key, val); + if (!rec.Serialize(&bufs_, &buf_woff_)) { + // unexpected error: unable to serialize the data + assert(!"Error serializing record"); + return false; + } + + disk_woff_ += rec_size; + eof_ = disk_woff_ >= max_size_; + + // dispatch buffer for flush + DispatchBuffer(); + + return true; +} + +bool WriteableCacheFile::ExpandBuffer(const size_t size) { + rwlock_.AssertHeld(); + assert(!eof_); + + // determine if there is enough space + size_t free = 0; // compute the free space left in buffer + for (size_t i = buf_woff_; i < bufs_.size(); ++i) { + free += bufs_[i]->Free(); + if (size <= free) { + // we have enough space in the buffer + return true; + } + } + + // expand the buffer until there is enough space to write `size` bytes + assert(free < size); + assert(alloc_); + + while (free < size) { + CacheWriteBuffer* const buf = alloc_->Allocate(); + if (!buf) { + ROCKS_LOG_DEBUG(log_, "Unable to allocate buffers"); + return false; + } + + size_ += static_cast(buf->Free()); + free += buf->Free(); + bufs_.push_back(buf); + } + + assert(free >= size); + return true; +} + +void WriteableCacheFile::DispatchBuffer() { + rwlock_.AssertHeld(); + + assert(bufs_.size()); + assert(buf_doff_ <= buf_woff_); + assert(buf_woff_ <= bufs_.size()); + + if (pending_ios_) { + return; + } + + if (!eof_ && buf_doff_ == buf_woff_) { + // dispatch buffer is pointing to write buffer and we haven't hit eof + return; + } + + assert(eof_ || buf_doff_ < buf_woff_); + assert(buf_doff_ < bufs_.size()); + assert(file_); + assert(alloc_); + + auto* buf = bufs_[buf_doff_]; + const uint64_t file_off = buf_doff_ * alloc_->BufferSize(); + + assert(!buf->Free() || + (eof_ && buf_doff_ == buf_woff_ && buf_woff_ < bufs_.size())); + // we have reached end of file, and there is space in the last buffer + // pad it with zero for direct IO + buf->FillTrailingZeros(); + + assert(buf->Used() % kFileAlignmentSize == 0); + + writer_->Write(file_.get(), buf, file_off, + std::bind(&WriteableCacheFile::BufferWriteDone, this)); + pending_ios_++; + buf_doff_++; +} + +void WriteableCacheFile::BufferWriteDone() { + WriteLock _(&rwlock_); + + assert(bufs_.size()); + + pending_ios_--; + + if (buf_doff_ < bufs_.size()) { + DispatchBuffer(); + } + + if (eof_ && buf_doff_ >= bufs_.size() && !pending_ios_) { + // end-of-file reached, move to read mode + CloseAndOpenForReading(); + } +} + +void WriteableCacheFile::CloseAndOpenForReading() { + // Our env abstraction do not allow reading from a file opened for appending + // We need close the file and re-open it for reading + Close(); + RandomAccessCacheFile::OpenImpl(enable_direct_reads_); +} + +bool WriteableCacheFile::ReadBuffer(const LBA& lba, Slice* key, Slice* block, + char* scratch) { + rwlock_.AssertHeld(); + + if (!ReadBuffer(lba, scratch)) { + Error(log_, "Error reading from buffer. cache=%d off=%d", cache_id_, + lba.off_); + return false; + } + + return ParseRec(lba, key, block, scratch); +} + +bool WriteableCacheFile::ReadBuffer(const LBA& lba, char* data) { + rwlock_.AssertHeld(); + + assert(lba.off_ < disk_woff_); + assert(alloc_); + + // we read from the buffers like reading from a flat file. 
The list of buffers + // are treated as contiguous stream of data + + char* tmp = data; + size_t pending_nbytes = lba.size_; + // start buffer + size_t start_idx = lba.off_ / alloc_->BufferSize(); + // offset into the start buffer + size_t start_off = lba.off_ % alloc_->BufferSize(); + + assert(start_idx <= buf_woff_); + + for (size_t i = start_idx; pending_nbytes && i < bufs_.size(); ++i) { + assert(i <= buf_woff_); + auto* buf = bufs_[i]; + assert(i == buf_woff_ || !buf->Free()); + // bytes to write to the buffer + size_t nbytes = pending_nbytes > (buf->Used() - start_off) + ? (buf->Used() - start_off) + : pending_nbytes; + memcpy(tmp, buf->Data() + start_off, nbytes); + + // left over to be written + pending_nbytes -= nbytes; + start_off = 0; + tmp += nbytes; + } + + assert(!pending_nbytes); + if (pending_nbytes) { + return false; + } + + assert(tmp == data + lba.size_); + return true; +} + +void WriteableCacheFile::Close() { + rwlock_.AssertHeld(); + + assert(size_ >= max_size_); + assert(disk_woff_ >= max_size_); + assert(buf_doff_ == bufs_.size()); + assert(bufs_.size() - buf_woff_ <= 1); + assert(!pending_ios_); + + Info(log_, "Closing file %s. size=%d written=%d", Path().c_str(), size_, + disk_woff_); + + ClearBuffers(); + file_.reset(); + + assert(refs_); + --refs_; +} + +void WriteableCacheFile::ClearBuffers() { + assert(alloc_); + + for (size_t i = 0; i < bufs_.size(); ++i) { + alloc_->Deallocate(bufs_[i]); + } + + bufs_.clear(); +} + +// +// ThreadedFileWriter implementation +// +ThreadedWriter::ThreadedWriter(PersistentCacheTier* const cache, + const size_t qdepth, const size_t io_size) + : Writer(cache), io_size_(io_size) { + for (size_t i = 0; i < qdepth; ++i) { + port::Thread th(&ThreadedWriter::ThreadMain, this); + threads_.push_back(std::move(th)); + } +} + +void ThreadedWriter::Stop() { + // notify all threads to exit + for (size_t i = 0; i < threads_.size(); ++i) { + q_.Push(IO(/*signal=*/true)); + } + + // wait for all threads to exit + for (auto& th : threads_) { + th.join(); + assert(!th.joinable()); + } + threads_.clear(); +} + +void ThreadedWriter::Write(WritableFile* const file, CacheWriteBuffer* buf, + const uint64_t file_off, + const std::function callback) { + q_.Push(IO(file, buf, file_off, callback)); +} + +void ThreadedWriter::ThreadMain() { + while (true) { + // Fetch the IO to process + IO io(q_.Pop()); + if (io.signal_) { + // that's secret signal to exit + break; + } + + // Reserve space for writing the buffer + while (!cache_->Reserve(io.buf_->Used())) { + // We can fail to reserve space if every file in the system + // is being currently accessed + /* sleep override */ + SystemClock::Default()->SleepForMicroseconds(1000000); + } + + DispatchIO(io); + + io.callback_(); + } +} + +void ThreadedWriter::DispatchIO(const IO& io) { + size_t written = 0; + while (written < io.buf_->Used()) { + Slice data(io.buf_->Data() + written, io_size_); + Status s = io.file_->Append(data); + assert(s.ok()); + if (!s.ok()) { + // That is definite IO error to device. There is not much we can + // do but ignore the failure. This can lead to corruption of data on + // disk, but the cache will skip while reading + fprintf(stderr, "Error writing data to file. 
%s\n", s.ToString().c_str()); + } + written += io_size_; + } +} + +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git a/src/rocksdb/utilities/persistent_cache/block_cache_tier_file.h b/src/rocksdb/utilities/persistent_cache/block_cache_tier_file.h new file mode 100644 index 000000000..1d265ab74 --- /dev/null +++ b/src/rocksdb/utilities/persistent_cache/block_cache_tier_file.h @@ -0,0 +1,293 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include + +#include "file/random_access_file_reader.h" +#include "port/port.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "util/crc32c.h" +#include "util/mutexlock.h" +#include "utilities/persistent_cache/block_cache_tier_file_buffer.h" +#include "utilities/persistent_cache/lrulist.h" +#include "utilities/persistent_cache/persistent_cache_tier.h" +#include "utilities/persistent_cache/persistent_cache_util.h" + +// The io code path of persistent cache uses pipelined architecture +// +// client -> In Queue <-- BlockCacheTier --> Out Queue <-- Writer <--> Kernel +// +// This would enable the system to scale for GB/s of throughput which is +// expected with modern devies like NVM. +// +// The file level operations are encapsulated in the following abstractions +// +// BlockCacheFile +// ^ +// | +// | +// RandomAccessCacheFile (For reading) +// ^ +// | +// | +// WriteableCacheFile (For writing) +// +// Write IO code path : +// +namespace ROCKSDB_NAMESPACE { + +class WriteableCacheFile; +struct BlockInfo; + +// Represents a logical record on device +// +// (L)ogical (B)lock (Address = { cache-file-id, offset, size } +struct LogicalBlockAddress { + LogicalBlockAddress() {} + explicit LogicalBlockAddress(const uint32_t cache_id, const uint32_t off, + const uint16_t size) + : cache_id_(cache_id), off_(off), size_(size) {} + + uint32_t cache_id_ = 0; + uint32_t off_ = 0; + uint32_t size_ = 0; +}; + +using LBA = LogicalBlockAddress; + +// class Writer +// +// Writer is the abstraction used for writing data to file. The component can be +// multithreaded. 
It is the last step of write pipeline +class Writer { + public: + explicit Writer(PersistentCacheTier* const cache) : cache_(cache) {} + virtual ~Writer() {} + + // write buffer to file at the given offset + virtual void Write(WritableFile* const file, CacheWriteBuffer* buf, + const uint64_t file_off, + const std::function callback) = 0; + // stop the writer + virtual void Stop() = 0; + + PersistentCacheTier* const cache_; +}; + +// class BlockCacheFile +// +// Generic interface to support building file specialized for read/writing +class BlockCacheFile : public LRUElement { + public: + explicit BlockCacheFile(const uint32_t cache_id) + : LRUElement(), cache_id_(cache_id) {} + + explicit BlockCacheFile(Env* const env, const std::string& dir, + const uint32_t cache_id) + : LRUElement(), + env_(env), + dir_(dir), + cache_id_(cache_id) {} + + virtual ~BlockCacheFile() {} + + // append key/value to file and return LBA locator to user + virtual bool Append(const Slice& /*key*/, const Slice& /*val*/, + LBA* const /*lba*/) { + assert(!"not implemented"); + return false; + } + + // read from the record locator (LBA) and return key, value and status + virtual bool Read(const LBA& /*lba*/, Slice* /*key*/, Slice* /*block*/, + char* /*scratch*/) { + assert(!"not implemented"); + return false; + } + + // get file path + std::string Path() const { + return dir_ + "/" + std::to_string(cache_id_) + ".rc"; + } + // get cache ID + uint32_t cacheid() const { return cache_id_; } + // Add block information to file data + // Block information is the list of index reference for this file + virtual void Add(BlockInfo* binfo) { + WriteLock _(&rwlock_); + block_infos_.push_back(binfo); + } + // get block information + std::list& block_infos() { return block_infos_; } + // delete file and return the size of the file + virtual Status Delete(uint64_t* size); + + protected: + port::RWMutex rwlock_; // synchronization mutex + Env* const env_ = nullptr; // Env for OS + const std::string dir_; // Directory name + const uint32_t cache_id_; // Cache id for the file + std::list block_infos_; // List of index entries mapping to the + // file content +}; + +// class RandomAccessFile +// +// Thread safe implementation for reading random data from file +class RandomAccessCacheFile : public BlockCacheFile { + public: + explicit RandomAccessCacheFile(Env* const env, const std::string& dir, + const uint32_t cache_id, + const std::shared_ptr& log) + : BlockCacheFile(env, dir, cache_id), log_(log) {} + + virtual ~RandomAccessCacheFile() {} + + // open file for reading + bool Open(const bool enable_direct_reads); + // read data from the disk + bool Read(const LBA& lba, Slice* key, Slice* block, char* scratch) override; + + private: + std::unique_ptr freader_; + + protected: + bool OpenImpl(const bool enable_direct_reads); + bool ParseRec(const LBA& lba, Slice* key, Slice* val, char* scratch); + + std::shared_ptr log_; // log file +}; + +// class WriteableCacheFile +// +// All writes to the files are cached in buffers. The buffers are flushed to +// disk as they get filled up. 
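+// Before a buffer is handed to the writer its unused tail is padded
+// (FillTrailingZeros) so that the dispatched size is a multiple of
+// kFileAlignmentSize (4 KB), as direct IO requires.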
When file size reaches a certain size, a new file +// will be created provided there is free space +class WriteableCacheFile : public RandomAccessCacheFile { + public: + explicit WriteableCacheFile(Env* const env, CacheWriteBufferAllocator* alloc, + Writer* writer, const std::string& dir, + const uint32_t cache_id, const uint32_t max_size, + const std::shared_ptr& log) + : RandomAccessCacheFile(env, dir, cache_id, log), + alloc_(alloc), + writer_(writer), + max_size_(max_size) {} + + virtual ~WriteableCacheFile(); + + // create file on disk + bool Create(const bool enable_direct_writes, const bool enable_direct_reads); + + // read data from logical file + bool Read(const LBA& lba, Slice* key, Slice* block, char* scratch) override { + ReadLock _(&rwlock_); + const bool closed = eof_ && bufs_.empty(); + if (closed) { + // the file is closed, read from disk + return RandomAccessCacheFile::Read(lba, key, block, scratch); + } + // file is still being written, read from buffers + return ReadBuffer(lba, key, block, scratch); + } + + // append data to end of file + bool Append(const Slice&, const Slice&, LBA* const) override; + // End-of-file + bool Eof() const { return eof_; } + + private: + friend class ThreadedWriter; + + static const size_t kFileAlignmentSize = 4 * 1024; // align file size + + bool ReadBuffer(const LBA& lba, Slice* key, Slice* block, char* scratch); + bool ReadBuffer(const LBA& lba, char* data); + bool ExpandBuffer(const size_t size); + void DispatchBuffer(); + void BufferWriteDone(); + void CloseAndOpenForReading(); + void ClearBuffers(); + void Close(); + + // File layout in memory + // + // +------+------+------+------+------+------+ + // | b0 | b1 | b2 | b3 | b4 | b5 | + // +------+------+------+------+------+------+ + // ^ ^ + // | | + // buf_doff_ buf_woff_ + // (next buffer to (next buffer to fill) + // flush to disk) + // + // The buffers are flushed to disk serially for a given file + + CacheWriteBufferAllocator* const alloc_ = nullptr; // Buffer provider + Writer* const writer_ = nullptr; // File writer thread + std::unique_ptr file_; // RocksDB Env file abstraction + std::vector bufs_; // Written buffers + uint32_t size_ = 0; // Size of the file + const uint32_t max_size_; // Max size of the file + bool eof_ = false; // End of file + uint32_t disk_woff_ = 0; // Offset to write on disk + size_t buf_woff_ = 0; // off into bufs_ to write + size_t buf_doff_ = 0; // off into bufs_ to dispatch + size_t pending_ios_ = 0; // Number of ios to disk in-progress + bool enable_direct_reads_ = false; // Should we enable direct reads + // when reading from disk +}; + +// +// Abstraction to do writing to device. It is part of pipelined architecture. 
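+// Write() merely enqueues an IO onto a bounded queue; each worker thread pops
+// an IO, reserves space in the cache (retrying while the cache is full),
+// appends the buffer to the file in io_size_ sized chunks and then runs the
+// completion callback, which lets WriteableCacheFile dispatch its next buffer.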
+// +class ThreadedWriter : public Writer { + public: + // Representation of IO to device + struct IO { + explicit IO(const bool signal) : signal_(signal) {} + explicit IO(WritableFile* const file, CacheWriteBuffer* const buf, + const uint64_t file_off, const std::function callback) + : file_(file), buf_(buf), file_off_(file_off), callback_(callback) {} + + IO(const IO&) = default; + IO& operator=(const IO&) = default; + size_t Size() const { return sizeof(IO); } + + WritableFile* file_ = nullptr; // File to write to + CacheWriteBuffer* buf_ = nullptr; // buffer to write + uint64_t file_off_ = 0; // file offset + bool signal_ = false; // signal to exit thread loop + std::function callback_; // Callback on completion + }; + + explicit ThreadedWriter(PersistentCacheTier* const cache, const size_t qdepth, + const size_t io_size); + virtual ~ThreadedWriter() { assert(threads_.empty()); } + + void Stop() override; + void Write(WritableFile* const file, CacheWriteBuffer* buf, + const uint64_t file_off, + const std::function callback) override; + + private: + void ThreadMain(); + void DispatchIO(const IO& io); + + const size_t io_size_ = 0; + BoundedQueue q_; + std::vector threads_; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git a/src/rocksdb/utilities/persistent_cache/block_cache_tier_file_buffer.h b/src/rocksdb/utilities/persistent_cache/block_cache_tier_file_buffer.h new file mode 100644 index 000000000..d4f02455a --- /dev/null +++ b/src/rocksdb/utilities/persistent_cache/block_cache_tier_file_buffer.h @@ -0,0 +1,127 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+#pragma once + +#include +#include +#include + +#include "memory/arena.h" +#include "rocksdb/comparator.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +// +// CacheWriteBuffer +// +// Buffer abstraction that can be manipulated via append +// (not thread safe) +class CacheWriteBuffer { + public: + explicit CacheWriteBuffer(const size_t size) : size_(size), pos_(0) { + buf_.reset(new char[size_]); + assert(!pos_); + assert(size_); + } + + virtual ~CacheWriteBuffer() {} + + void Append(const char* buf, const size_t size) { + assert(pos_ + size <= size_); + memcpy(buf_.get() + pos_, buf, size); + pos_ += size; + assert(pos_ <= size_); + } + + void FillTrailingZeros() { + assert(pos_ <= size_); + memset(buf_.get() + pos_, '0', size_ - pos_); + pos_ = size_; + } + + void Reset() { pos_ = 0; } + size_t Free() const { return size_ - pos_; } + size_t Capacity() const { return size_; } + size_t Used() const { return pos_; } + char* Data() const { return buf_.get(); } + + private: + std::unique_ptr buf_; + const size_t size_; + size_t pos_; +}; + +// +// CacheWriteBufferAllocator +// +// Buffer pool abstraction(not thread safe) +// +class CacheWriteBufferAllocator { + public: + explicit CacheWriteBufferAllocator(const size_t buffer_size, + const size_t buffer_count) + : cond_empty_(&lock_), buffer_size_(buffer_size) { + MutexLock _(&lock_); + buffer_size_ = buffer_size; + for (uint32_t i = 0; i < buffer_count; i++) { + auto* buf = new CacheWriteBuffer(buffer_size_); + assert(buf); + if (buf) { + bufs_.push_back(buf); + cond_empty_.Signal(); + } + } + } + + virtual ~CacheWriteBufferAllocator() { + MutexLock _(&lock_); + assert(bufs_.size() * buffer_size_ == Capacity()); + for (auto* buf : bufs_) { + delete buf; + } + bufs_.clear(); + } + + CacheWriteBuffer* Allocate() { + MutexLock _(&lock_); + if (bufs_.empty()) { + return nullptr; + } + + assert(!bufs_.empty()); + CacheWriteBuffer* const buf = bufs_.front(); + bufs_.pop_front(); + return buf; + } + + void Deallocate(CacheWriteBuffer* const buf) { + assert(buf); + MutexLock _(&lock_); + buf->Reset(); + bufs_.push_back(buf); + cond_empty_.Signal(); + } + + void WaitUntilUsable() { + // We are asked to wait till we have buffers available + MutexLock _(&lock_); + while (bufs_.empty()) { + cond_empty_.Wait(); + } + } + + size_t Capacity() const { return bufs_.size() * buffer_size_; } + size_t Free() const { return bufs_.size() * buffer_size_; } + size_t BufferSize() const { return buffer_size_; } + + private: + port::Mutex lock_; // Sync lock + port::CondVar cond_empty_; // Condition var for empty buffers + size_t buffer_size_; // Size of each buffer + std::list bufs_; // Buffer stash +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.cc b/src/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.cc new file mode 100644 index 000000000..d73b5d0b4 --- /dev/null +++ b/src/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.cc @@ -0,0 +1,86 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+#ifndef ROCKSDB_LITE + +#include "utilities/persistent_cache/block_cache_tier_metadata.h" + +#include + +namespace ROCKSDB_NAMESPACE { + +bool BlockCacheTierMetadata::Insert(BlockCacheFile* file) { + return cache_file_index_.Insert(file); +} + +BlockCacheFile* BlockCacheTierMetadata::Lookup(const uint32_t cache_id) { + BlockCacheFile* ret = nullptr; + BlockCacheFile lookup_key(cache_id); + bool ok = cache_file_index_.Find(&lookup_key, &ret); + if (ok) { + assert(ret->refs_); + return ret; + } + return nullptr; +} + +BlockCacheFile* BlockCacheTierMetadata::Evict() { + using std::placeholders::_1; + auto fn = std::bind(&BlockCacheTierMetadata::RemoveAllKeys, this, _1); + return cache_file_index_.Evict(fn); +} + +void BlockCacheTierMetadata::Clear() { + cache_file_index_.Clear([](BlockCacheFile* arg) { delete arg; }); + block_index_.Clear([](BlockInfo* arg) { delete arg; }); +} + +BlockInfo* BlockCacheTierMetadata::Insert(const Slice& key, const LBA& lba) { + std::unique_ptr binfo(new BlockInfo(key, lba)); + if (!block_index_.Insert(binfo.get())) { + return nullptr; + } + return binfo.release(); +} + +bool BlockCacheTierMetadata::Lookup(const Slice& key, LBA* lba) { + BlockInfo lookup_key(key); + BlockInfo* block; + port::RWMutex* rlock = nullptr; + if (!block_index_.Find(&lookup_key, &block, &rlock)) { + return false; + } + + ReadUnlock _(rlock); + assert(block->key_ == key.ToString()); + if (lba) { + *lba = block->lba_; + } + return true; +} + +BlockInfo* BlockCacheTierMetadata::Remove(const Slice& key) { + BlockInfo lookup_key(key); + BlockInfo* binfo = nullptr; + bool ok __attribute__((__unused__)); + ok = block_index_.Erase(&lookup_key, &binfo); + assert(ok); + return binfo; +} + +void BlockCacheTierMetadata::RemoveAllKeys(BlockCacheFile* f) { + for (BlockInfo* binfo : f->block_infos()) { + BlockInfo* tmp = nullptr; + bool status = block_index_.Erase(binfo, &tmp); + (void)status; + assert(status); + assert(tmp == binfo); + delete binfo; + } + f->block_infos().clear(); +} + +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git a/src/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.h b/src/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.h new file mode 100644 index 000000000..2fcd50105 --- /dev/null +++ b/src/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.h @@ -0,0 +1,124 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include + +#include "rocksdb/slice.h" +#include "utilities/persistent_cache/block_cache_tier_file.h" +#include "utilities/persistent_cache/hash_table.h" +#include "utilities/persistent_cache/hash_table_evictable.h" +#include "utilities/persistent_cache/lrulist.h" + +namespace ROCKSDB_NAMESPACE { + +// +// Block Cache Tier Metadata +// +// The BlockCacheTierMetadata holds all the metadata associated with block +// cache. It +// fundamentally contains 2 indexes and an LRU. +// +// Block Cache Index +// +// This is a forward index that maps a given key to a LBA (Logical Block +// Address). LBA is a disk pointer that points to a record on the cache. +// +// LBA = { cache-id, offset, size } +// +// Cache File Index +// +// This is a forward index that maps a given cache-id to a cache file object. 
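+// Taken together, a read resolves in two steps; as a rough sketch of the
+// intended usage (cf. BlockCacheTier::Lookup):
+//
+//   LBA lba;
+//   if (metadata_.Lookup(key, &lba)) {                         // block index
+//     BlockCacheFile* file = metadata_.Lookup(lba.cache_id_);  // file index
+//     // read the record for `key` from *file at lba
+//   }
+//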
+// Typically you would lookup using LBA and use the object to read or write +struct BlockInfo { + explicit BlockInfo(const Slice& key, const LBA& lba = LBA()) + : key_(key.ToString()), lba_(lba) {} + + std::string key_; + LBA lba_; +}; + +class BlockCacheTierMetadata { + public: + explicit BlockCacheTierMetadata(const uint32_t blocks_capacity = 1024 * 1024, + const uint32_t cachefile_capacity = 10 * 1024) + : cache_file_index_(cachefile_capacity), block_index_(blocks_capacity) {} + + virtual ~BlockCacheTierMetadata() {} + + // Insert a given cache file + bool Insert(BlockCacheFile* file); + + // Lookup cache file based on cache_id + BlockCacheFile* Lookup(const uint32_t cache_id); + + // Insert block information to block index + BlockInfo* Insert(const Slice& key, const LBA& lba); + // bool Insert(BlockInfo* binfo); + + // Lookup block information from block index + bool Lookup(const Slice& key, LBA* lba); + + // Remove a given from the block index + BlockInfo* Remove(const Slice& key); + + // Find and evict a cache file using LRU policy + BlockCacheFile* Evict(); + + // Clear the metadata contents + virtual void Clear(); + + protected: + // Remove all block information from a given file + virtual void RemoveAllKeys(BlockCacheFile* file); + + private: + // Cache file index definition + // + // cache-id => BlockCacheFile + struct BlockCacheFileHash { + uint64_t operator()(const BlockCacheFile* rec) { + return std::hash()(rec->cacheid()); + } + }; + + struct BlockCacheFileEqual { + uint64_t operator()(const BlockCacheFile* lhs, const BlockCacheFile* rhs) { + return lhs->cacheid() == rhs->cacheid(); + } + }; + + using CacheFileIndexType = + EvictableHashTable; + + // Block Lookup Index + // + // key => LBA + struct Hash { + size_t operator()(BlockInfo* node) const { + return std::hash()(node->key_); + } + }; + + struct Equal { + size_t operator()(BlockInfo* lhs, BlockInfo* rhs) const { + return lhs->key_ == rhs->key_; + } + }; + + using BlockIndexType = HashTable; + + CacheFileIndexType cache_file_index_; + BlockIndexType block_index_; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git a/src/rocksdb/utilities/persistent_cache/hash_table.h b/src/rocksdb/utilities/persistent_cache/hash_table.h new file mode 100644 index 000000000..b00b294ce --- /dev/null +++ b/src/rocksdb/utilities/persistent_cache/hash_table.h @@ -0,0 +1,239 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once + +#ifndef ROCKSDB_LITE + +#include + +#include +#include + +#ifdef OS_LINUX +#include +#endif + +#include "rocksdb/env.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +// HashTable +// +// Traditional implementation of hash table with synchronization built on top +// don't perform very well in multi-core scenarios. This is an implementation +// designed for multi-core scenarios with high lock contention. +// +// |<-------- alpha ------------->| +// Buckets Collision list +// ---- +----+ +---+---+--- ...... ---+---+---+ +// / | |--->| | | | | | +// / +----+ +---+---+--- ...... ---+---+---+ +// / | | +// Locks/ +----+ +// +--+/ . . +// | | . . +// +--+ . . +// | | . . +// +--+ . . +// | | . . +// +--+ . . +// \ +----+ +// \ | | +// \ +----+ +// \ | | +// \---- +----+ +// +// The lock contention is spread over an array of locks. This helps improve +// concurrent access. 
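+// With the constructor defaults used below (capacity = 1024 * 1024,
+// load_factor = 2.0, nlocks = 256) that works out to 524,288 buckets, i.e.
+// roughly 2,048 buckets guarded by each lock.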
The spine is designed for a certain capacity and load +// factor. When the capacity planning is done correctly we can expect +// O(load_factor = 1) insert, access and remove time. +// +// Micro benchmark on debug build gives about .5 Million/sec rate of insert, +// erase and lookup in parallel (total of about 1.5 Million ops/sec). If the +// blocks were of 4K, the hash table can support a virtual throughput of +// 6 GB/s. +// +// T Object type (contains both key and value) +// Hash Function that returns an hash from type T +// Equal Returns if two objects are equal +// (We need explicit equal for pointer type) +// +template +class HashTable { + public: + explicit HashTable(const size_t capacity = 1024 * 1024, + const float load_factor = 2.0, const uint32_t nlocks = 256) + : nbuckets_( + static_cast(load_factor ? capacity / load_factor : 0)), + nlocks_(nlocks) { + // pre-conditions + assert(capacity); + assert(load_factor); + assert(nbuckets_); + assert(nlocks_); + + buckets_.reset(new Bucket[nbuckets_]); +#ifdef OS_LINUX + mlock(buckets_.get(), nbuckets_ * sizeof(Bucket)); +#endif + + // initialize locks + locks_.reset(new port::RWMutex[nlocks_]); +#ifdef OS_LINUX + mlock(locks_.get(), nlocks_ * sizeof(port::RWMutex)); +#endif + + // post-conditions + assert(buckets_); + assert(locks_); + } + + virtual ~HashTable() { AssertEmptyBuckets(); } + + // + // Insert given record to hash table + // + bool Insert(const T& t) { + const uint64_t h = Hash()(t); + const uint32_t bucket_idx = h % nbuckets_; + const uint32_t lock_idx = bucket_idx % nlocks_; + + WriteLock _(&locks_[lock_idx]); + auto& bucket = buckets_[bucket_idx]; + return Insert(&bucket, t); + } + + // Lookup hash table + // + // Please note that read lock should be held by the caller. This is because + // the caller owns the data, and should hold the read lock as long as he + // operates on the data. + bool Find(const T& t, T* ret, port::RWMutex** ret_lock) { + const uint64_t h = Hash()(t); + const uint32_t bucket_idx = h % nbuckets_; + const uint32_t lock_idx = bucket_idx % nlocks_; + + port::RWMutex& lock = locks_[lock_idx]; + lock.ReadLock(); + + auto& bucket = buckets_[bucket_idx]; + if (Find(&bucket, t, ret)) { + *ret_lock = &lock; + return true; + } + + lock.ReadUnlock(); + return false; + } + + // + // Erase a given key from the hash table + // + bool Erase(const T& t, T* ret) { + const uint64_t h = Hash()(t); + const uint32_t bucket_idx = h % nbuckets_; + const uint32_t lock_idx = bucket_idx % nlocks_; + + WriteLock _(&locks_[lock_idx]); + + auto& bucket = buckets_[bucket_idx]; + return Erase(&bucket, t, ret); + } + + // Fetch the mutex associated with a key + // This call is used to hold the lock for a given data for extended period of + // time. 
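+  // For example, a caller that wants to pin an entry calls Find(), which
+  // returns with the stripe read lock still held, and releases it through
+  // the returned mutex once it is done with the data (a sketch):
+  //
+  //   T probe = /* object carrying the key */;
+  //   T found;
+  //   port::RWMutex* rlock = nullptr;
+  //   if (table.Find(probe, &found, &rlock)) {
+  //     // ... use `found` while the stripe read lock is held ...
+  //     rlock->ReadUnlock();
+  //   }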
+ port::RWMutex* GetMutex(const T& t) { + const uint64_t h = Hash()(t); + const uint32_t bucket_idx = h % nbuckets_; + const uint32_t lock_idx = bucket_idx % nlocks_; + + return &locks_[lock_idx]; + } + + void Clear(void (*fn)(T)) { + for (uint32_t i = 0; i < nbuckets_; ++i) { + const uint32_t lock_idx = i % nlocks_; + WriteLock _(&locks_[lock_idx]); + for (auto& t : buckets_[i].list_) { + (*fn)(t); + } + buckets_[i].list_.clear(); + } + } + + protected: + // Models bucket of keys that hash to the same bucket number + struct Bucket { + std::list list_; + }; + + // Substitute for std::find with custom comparator operator + typename std::list::iterator Find(std::list* list, const T& t) { + for (auto it = list->begin(); it != list->end(); ++it) { + if (Equal()(*it, t)) { + return it; + } + } + return list->end(); + } + + bool Insert(Bucket* bucket, const T& t) { + // Check if the key already exists + auto it = Find(&bucket->list_, t); + if (it != bucket->list_.end()) { + return false; + } + + // insert to bucket + bucket->list_.push_back(t); + return true; + } + + bool Find(Bucket* bucket, const T& t, T* ret) { + auto it = Find(&bucket->list_, t); + if (it != bucket->list_.end()) { + if (ret) { + *ret = *it; + } + return true; + } + return false; + } + + bool Erase(Bucket* bucket, const T& t, T* ret) { + auto it = Find(&bucket->list_, t); + if (it != bucket->list_.end()) { + if (ret) { + *ret = *it; + } + + bucket->list_.erase(it); + return true; + } + return false; + } + + // assert that all buckets are empty + void AssertEmptyBuckets() { +#ifndef NDEBUG + for (size_t i = 0; i < nbuckets_; ++i) { + WriteLock _(&locks_[i % nlocks_]); + assert(buckets_[i].list_.empty()); + } +#endif + } + + const uint32_t nbuckets_; // No. of buckets in the spine + std::unique_ptr buckets_; // Spine of the hash buckets + const uint32_t nlocks_; // No. of locks + std::unique_ptr locks_; // Granular locks +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git a/src/rocksdb/utilities/persistent_cache/hash_table_bench.cc b/src/rocksdb/utilities/persistent_cache/hash_table_bench.cc new file mode 100644 index 000000000..74d7e2edf --- /dev/null +++ b/src/rocksdb/utilities/persistent_cache/hash_table_bench.cc @@ -0,0 +1,310 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
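+// This micro-benchmark compares the striped-lock HashTable above against a
+// single-mutex std::unordered_map (GranularLockImpl vs. SimpleImpl below)
+// under a configurable mix of insert, lookup and erase threads.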
+// + +#if !defined(OS_WIN) && !defined(ROCKSDB_LITE) + +#ifndef GFLAGS +#include +int main() { fprintf(stderr, "Please install gflags to run tools\n"); } +#else + +#include +#include + +#include +#include +#include +#include + +#include "port/port_posix.h" +#include "port/sys_time.h" +#include "rocksdb/env.h" +#include "util/gflags_compat.h" +#include "util/mutexlock.h" +#include "util/random.h" +#include "utilities/persistent_cache/hash_table.h" + +using std::string; + +DEFINE_int32(nsec, 10, "nsec"); +DEFINE_int32(nthread_write, 1, "insert %"); +DEFINE_int32(nthread_read, 1, "lookup %"); +DEFINE_int32(nthread_erase, 1, "erase %"); + +namespace ROCKSDB_NAMESPACE { + +// +// HashTableImpl interface +// +// Abstraction of a hash table implementation +template +class HashTableImpl { + public: + virtual ~HashTableImpl() {} + + virtual bool Insert(const Key& key, const Value& val) = 0; + virtual bool Erase(const Key& key) = 0; + virtual bool Lookup(const Key& key, Value* val) = 0; +}; + +// HashTableBenchmark +// +// Abstraction to test a given hash table implementation. The test mostly +// focus on insert, lookup and erase. The test can operate in test mode and +// benchmark mode. +class HashTableBenchmark { + public: + explicit HashTableBenchmark(HashTableImpl* impl, + const size_t sec = 10, + const size_t nthread_write = 1, + const size_t nthread_read = 1, + const size_t nthread_erase = 1) + : impl_(impl), + sec_(sec), + ninserts_(0), + nreads_(0), + nerases_(0), + nerases_failed_(0), + quit_(false) { + Prepop(); + + StartThreads(nthread_write, WriteMain); + StartThreads(nthread_read, ReadMain); + StartThreads(nthread_erase, EraseMain); + + uint64_t start = NowInMillSec(); + while (!quit_) { + quit_ = NowInMillSec() - start > sec_ * 1000; + /* sleep override */ sleep(1); + } + + Env* env = Env::Default(); + env->WaitForJoin(); + + if (sec_) { + printf("Result \n"); + printf("====== \n"); + printf("insert/sec = %f \n", ninserts_ / static_cast(sec_)); + printf("read/sec = %f \n", nreads_ / static_cast(sec_)); + printf("erases/sec = %f \n", nerases_ / static_cast(sec_)); + const uint64_t ops = ninserts_ + nreads_ + nerases_; + printf("ops/sec = %f \n", ops / static_cast(sec_)); + printf("erase fail = %d (%f%%)\n", static_cast(nerases_failed_), + static_cast(nerases_failed_ / nerases_ * 100)); + printf("====== \n"); + } + } + + void RunWrite() { + while (!quit_) { + size_t k = insert_key_++; + std::string tmp(1000, k % 255); + bool status = impl_->Insert(k, tmp); + assert(status); + ninserts_++; + } + } + + void RunRead() { + Random64 rgen(time(nullptr)); + while (!quit_) { + std::string s; + size_t k = rgen.Next() % max_prepop_key; + bool status = impl_->Lookup(k, &s); + assert(status); + assert(s == std::string(1000, k % 255)); + nreads_++; + } + } + + void RunErase() { + while (!quit_) { + size_t k = erase_key_++; + bool status = impl_->Erase(k); + nerases_failed_ += !status; + nerases_++; + } + } + + private: + // Start threads for a given function + void StartThreads(const size_t n, void (*fn)(void*)) { + Env* env = Env::Default(); + for (size_t i = 0; i < n; ++i) { + env->StartThread(fn, this); + } + } + + // Prepop the hash table with 1M keys + void Prepop() { + for (size_t i = 0; i < max_prepop_key; ++i) { + bool status = impl_->Insert(i, std::string(1000, i % 255)); + assert(status); + } + + erase_key_ = insert_key_ = max_prepop_key; + + for (size_t i = 0; i < 10 * max_prepop_key; ++i) { + bool status = impl_->Insert(insert_key_++, std::string(1000, 'x')); + assert(status); + } + } + 
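+  // Note on key ranges: Prepop() fills [0, max_prepop_key) for the readers
+  // and then pre-inserts another 10 * max_prepop_key keys so that the erase
+  // threads (which start at max_prepop_key) have a backlog and rarely
+  // overtake the insert threads; when they do, the failure is counted in
+  // nerases_failed_. Readers only sample [0, max_prepop_key), so lookups
+  // never race with erases.
+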
+ static uint64_t NowInMillSec() { + port::TimeVal tv; + port::GetTimeOfDay(&tv, /*tz=*/nullptr); + return tv.tv_sec * 1000 + tv.tv_usec / 1000; + } + + // + // Wrapper functions for thread entry + // + static void WriteMain(void* args) { + reinterpret_cast(args)->RunWrite(); + } + + static void ReadMain(void* args) { + reinterpret_cast(args)->RunRead(); + } + + static void EraseMain(void* args) { + reinterpret_cast(args)->RunErase(); + } + + HashTableImpl* impl_; // Implementation to test + const size_t sec_; // Test time + const size_t max_prepop_key = 1ULL * 1024 * 1024; // Max prepop key + std::atomic insert_key_; // Last inserted key + std::atomic erase_key_; // Erase key + std::atomic ninserts_; // Number of inserts + std::atomic nreads_; // Number of reads + std::atomic nerases_; // Number of erases + std::atomic nerases_failed_; // Number of erases failed + bool quit_; // Should the threads quit ? +}; + +// +// SimpleImpl +// Lock safe unordered_map implementation +class SimpleImpl : public HashTableImpl { + public: + bool Insert(const size_t& key, const string& val) override { + WriteLock _(&rwlock_); + map_.insert(make_pair(key, val)); + return true; + } + + bool Erase(const size_t& key) override { + WriteLock _(&rwlock_); + auto it = map_.find(key); + if (it == map_.end()) { + return false; + } + map_.erase(it); + return true; + } + + bool Lookup(const size_t& key, string* val) override { + ReadLock _(&rwlock_); + auto it = map_.find(key); + if (it != map_.end()) { + *val = it->second; + } + return it != map_.end(); + } + + private: + port::RWMutex rwlock_; + std::unordered_map map_; +}; + +// +// GranularLockImpl +// Thread safe custom RocksDB implementation of hash table with granular +// locking +class GranularLockImpl : public HashTableImpl { + public: + bool Insert(const size_t& key, const string& val) override { + Node n(key, val); + return impl_.Insert(n); + } + + bool Erase(const size_t& key) override { + Node n(key, string()); + return impl_.Erase(n, nullptr); + } + + bool Lookup(const size_t& key, string* val) override { + Node n(key, string()); + port::RWMutex* rlock; + bool status = impl_.Find(n, &n, &rlock); + if (status) { + ReadUnlock _(rlock); + *val = n.val_; + } + return status; + } + + private: + struct Node { + explicit Node(const size_t key, const string& val) : key_(key), val_(val) {} + + size_t key_ = 0; + string val_; + }; + + struct Hash { + uint64_t operator()(const Node& node) { + return std::hash()(node.key_); + } + }; + + struct Equal { + bool operator()(const Node& lhs, const Node& rhs) { + return lhs.key_ == rhs.key_; + } + }; + + HashTable impl_; +}; + +} // namespace ROCKSDB_NAMESPACE + +// +// main +// +int main(int argc, char** argv) { + GFLAGS_NAMESPACE::SetUsageMessage(std::string("\nUSAGE:\n") + + std::string(argv[0]) + " [OPTIONS]..."); + GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, false); + + // + // Micro benchmark unordered_map + // + printf("Micro benchmarking std::unordered_map \n"); + { + ROCKSDB_NAMESPACE::SimpleImpl impl; + ROCKSDB_NAMESPACE::HashTableBenchmark _( + &impl, FLAGS_nsec, FLAGS_nthread_write, FLAGS_nthread_read, + FLAGS_nthread_erase); + } + // + // Micro benchmark scalable hash table + // + printf("Micro benchmarking scalable hash map \n"); + { + ROCKSDB_NAMESPACE::GranularLockImpl impl; + ROCKSDB_NAMESPACE::HashTableBenchmark _( + &impl, FLAGS_nsec, FLAGS_nthread_write, FLAGS_nthread_read, + FLAGS_nthread_erase); + } + + return 0; +} +#endif // #ifndef GFLAGS +#else +int main(int /*argc*/, char** /*argv*/) { 
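+  // The benchmark is not built on Windows or in ROCKSDB_LITE mode, so this
+  // fallback main() is a no-op.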
return 0; } +#endif diff --git a/src/rocksdb/utilities/persistent_cache/hash_table_evictable.h b/src/rocksdb/utilities/persistent_cache/hash_table_evictable.h new file mode 100644 index 000000000..e10939b2f --- /dev/null +++ b/src/rocksdb/utilities/persistent_cache/hash_table_evictable.h @@ -0,0 +1,168 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once + +#ifndef ROCKSDB_LITE + +#include + +#include "util/random.h" +#include "utilities/persistent_cache/hash_table.h" +#include "utilities/persistent_cache/lrulist.h" + +namespace ROCKSDB_NAMESPACE { + +// Evictable Hash Table +// +// Hash table index where least accessed (or one of the least accessed) elements +// can be evicted. +// +// Please note EvictableHashTable can only be created for pointer type objects +template +class EvictableHashTable : private HashTable { + public: + using hash_table = HashTable; + + explicit EvictableHashTable(const size_t capacity = 1024 * 1024, + const float load_factor = 2.0, + const uint32_t nlocks = 256) + : HashTable(capacity, load_factor, nlocks), + lru_lists_(new LRUList[hash_table::nlocks_]) { + assert(lru_lists_); + } + + virtual ~EvictableHashTable() { AssertEmptyLRU(); } + + // + // Insert given record to hash table (and LRU list) + // + bool Insert(T* t) { + const uint64_t h = Hash()(t); + typename hash_table::Bucket& bucket = GetBucket(h); + LRUListType& lru = GetLRUList(h); + port::RWMutex& lock = GetMutex(h); + + WriteLock _(&lock); + if (hash_table::Insert(&bucket, t)) { + lru.Push(t); + return true; + } + return false; + } + + // + // Lookup hash table + // + // Please note that read lock should be held by the caller. This is because + // the caller owns the data, and should hold the read lock as long as he + // operates on the data. + bool Find(T* t, T** ret) { + const uint64_t h = Hash()(t); + typename hash_table::Bucket& bucket = GetBucket(h); + LRUListType& lru = GetLRUList(h); + port::RWMutex& lock = GetMutex(h); + + ReadLock _(&lock); + if (hash_table::Find(&bucket, t, ret)) { + ++(*ret)->refs_; + lru.Touch(*ret); + return true; + } + return false; + } + + // + // Evict one of the least recently used object + // + T* Evict(const std::function& fn = nullptr) { + uint32_t random = Random::GetTLSInstance()->Next(); + const size_t start_idx = random % hash_table::nlocks_; + T* t = nullptr; + + // iterate from start_idx .. 0 .. 
start_idx + for (size_t i = 0; !t && i < hash_table::nlocks_; ++i) { + const size_t idx = (start_idx + i) % hash_table::nlocks_; + + WriteLock _(&hash_table::locks_[idx]); + LRUListType& lru = lru_lists_[idx]; + if (!lru.IsEmpty() && (t = lru.Pop()) != nullptr) { + assert(!t->refs_); + // We got an item to evict, erase from the bucket + const uint64_t h = Hash()(t); + typename hash_table::Bucket& bucket = GetBucket(h); + T* tmp = nullptr; + bool status = hash_table::Erase(&bucket, t, &tmp); + assert(t == tmp); + (void)status; + assert(status); + if (fn) { + fn(t); + } + break; + } + assert(!t); + } + return t; + } + + void Clear(void (*fn)(T*)) { + for (uint32_t i = 0; i < hash_table::nbuckets_; ++i) { + const uint32_t lock_idx = i % hash_table::nlocks_; + WriteLock _(&hash_table::locks_[lock_idx]); + auto& lru_list = lru_lists_[lock_idx]; + auto& bucket = hash_table::buckets_[i]; + for (auto* t : bucket.list_) { + lru_list.Unlink(t); + (*fn)(t); + } + bucket.list_.clear(); + } + // make sure that all LRU lists are emptied + AssertEmptyLRU(); + } + + void AssertEmptyLRU() { +#ifndef NDEBUG + for (uint32_t i = 0; i < hash_table::nlocks_; ++i) { + WriteLock _(&hash_table::locks_[i]); + auto& lru_list = lru_lists_[i]; + assert(lru_list.IsEmpty()); + } +#endif + } + + // + // Fetch the mutex associated with a key + // This call is used to hold the lock for a given data for extended period of + // time. + port::RWMutex* GetMutex(T* t) { return hash_table::GetMutex(t); } + + private: + using LRUListType = LRUList; + + typename hash_table::Bucket& GetBucket(const uint64_t h) { + const uint32_t bucket_idx = h % hash_table::nbuckets_; + return hash_table::buckets_[bucket_idx]; + } + + LRUListType& GetLRUList(const uint64_t h) { + const uint32_t bucket_idx = h % hash_table::nbuckets_; + const uint32_t lock_idx = bucket_idx % hash_table::nlocks_; + return lru_lists_[lock_idx]; + } + + port::RWMutex& GetMutex(const uint64_t h) { + const uint32_t bucket_idx = h % hash_table::nbuckets_; + const uint32_t lock_idx = bucket_idx % hash_table::nlocks_; + return hash_table::locks_[lock_idx]; + } + + std::unique_ptr lru_lists_; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git a/src/rocksdb/utilities/persistent_cache/hash_table_test.cc b/src/rocksdb/utilities/persistent_cache/hash_table_test.cc new file mode 100644 index 000000000..2f6387f5f --- /dev/null +++ b/src/rocksdb/utilities/persistent_cache/hash_table_test.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
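+// Unit tests for the striped-lock hash table: basic insert/find/erase for
+// HashTable and the eviction path for EvictableHashTable.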
+// +#include "utilities/persistent_cache/hash_table.h" + +#include + +#include +#include +#include + +#include "db/db_test_util.h" +#include "memory/arena.h" +#include "test_util/testharness.h" +#include "util/random.h" +#include "utilities/persistent_cache/hash_table_evictable.h" + +#ifndef ROCKSDB_LITE + +namespace ROCKSDB_NAMESPACE { + +struct HashTableTest : public testing::Test { + ~HashTableTest() override { map_.Clear(&HashTableTest::ClearNode); } + + struct Node { + Node() {} + explicit Node(const uint64_t key, const std::string& val = std::string()) + : key_(key), val_(val) {} + + uint64_t key_ = 0; + std::string val_; + }; + + struct Equal { + bool operator()(const Node& lhs, const Node& rhs) { + return lhs.key_ == rhs.key_; + } + }; + + struct Hash { + uint64_t operator()(const Node& node) { + return std::hash()(node.key_); + } + }; + + static void ClearNode(Node /*node*/) {} + + HashTable map_; +}; + +struct EvictableHashTableTest : public testing::Test { + ~EvictableHashTableTest() override { + map_.Clear(&EvictableHashTableTest::ClearNode); + } + + struct Node : LRUElement { + Node() {} + explicit Node(const uint64_t key, const std::string& val = std::string()) + : key_(key), val_(val) {} + + uint64_t key_ = 0; + std::string val_; + std::atomic refs_{0}; + }; + + struct Equal { + bool operator()(const Node* lhs, const Node* rhs) { + return lhs->key_ == rhs->key_; + } + }; + + struct Hash { + uint64_t operator()(const Node* node) { + return std::hash()(node->key_); + } + }; + + static void ClearNode(Node* /*node*/) {} + + EvictableHashTable map_; +}; + +TEST_F(HashTableTest, TestInsert) { + const uint64_t max_keys = 1024 * 1024; + + // insert + for (uint64_t k = 0; k < max_keys; ++k) { + map_.Insert(Node(k, std::string(1000, k % 255))); + } + + // verify + for (uint64_t k = 0; k < max_keys; ++k) { + Node val; + port::RWMutex* rlock = nullptr; + assert(map_.Find(Node(k), &val, &rlock)); + rlock->ReadUnlock(); + assert(val.val_ == std::string(1000, k % 255)); + } +} + +TEST_F(HashTableTest, TestErase) { + const uint64_t max_keys = 1024 * 1024; + // insert + for (uint64_t k = 0; k < max_keys; ++k) { + map_.Insert(Node(k, std::string(1000, k % 255))); + } + + auto rand = Random64(time(nullptr)); + // erase a few keys randomly + std::set erased; + for (int i = 0; i < 1024; ++i) { + uint64_t k = rand.Next() % max_keys; + if (erased.find(k) != erased.end()) { + continue; + } + assert(map_.Erase(Node(k), /*ret=*/nullptr)); + erased.insert(k); + } + + // verify + for (uint64_t k = 0; k < max_keys; ++k) { + Node val; + port::RWMutex* rlock = nullptr; + bool status = map_.Find(Node(k), &val, &rlock); + if (erased.find(k) == erased.end()) { + assert(status); + rlock->ReadUnlock(); + assert(val.val_ == std::string(1000, k % 255)); + } else { + assert(!status); + } + } +} + +TEST_F(EvictableHashTableTest, TestEvict) { + const uint64_t max_keys = 1024 * 1024; + + // insert + for (uint64_t k = 0; k < max_keys; ++k) { + map_.Insert(new Node(k, std::string(1000, k % 255))); + } + + // verify + for (uint64_t k = 0; k < max_keys; ++k) { + Node* val = map_.Evict(); + // unfortunately we can't predict eviction value since it is from any one of + // the lock stripe + assert(val); + assert(val->val_ == std::string(1000, val->key_ % 255)); + delete val; + } +} + +} // namespace ROCKSDB_NAMESPACE +#endif + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git 
a/src/rocksdb/utilities/persistent_cache/lrulist.h b/src/rocksdb/utilities/persistent_cache/lrulist.h new file mode 100644 index 000000000..a608890fc --- /dev/null +++ b/src/rocksdb/utilities/persistent_cache/lrulist.h @@ -0,0 +1,174 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once + +#ifndef ROCKSDB_LITE + +#include + +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +// LRU element definition +// +// Any object that needs to be part of the LRU algorithm should extend this +// class +template +struct LRUElement { + explicit LRUElement() : next_(nullptr), prev_(nullptr), refs_(0) {} + + virtual ~LRUElement() { assert(!refs_); } + + T* next_; + T* prev_; + std::atomic refs_; +}; + +// LRU implementation +// +// In place LRU implementation. There is no copy or allocation involved when +// inserting or removing an element. This makes the data structure slim +template +class LRUList { + public: + virtual ~LRUList() { + MutexLock _(&lock_); + assert(!head_); + assert(!tail_); + } + + // Push element into the LRU at the cold end + inline void Push(T* const t) { + assert(t); + assert(!t->next_); + assert(!t->prev_); + + MutexLock _(&lock_); + + assert((!head_ && !tail_) || (head_ && tail_)); + assert(!head_ || !head_->prev_); + assert(!tail_ || !tail_->next_); + + t->next_ = head_; + if (head_) { + head_->prev_ = t; + } + + head_ = t; + if (!tail_) { + tail_ = t; + } + } + + // Unlink the element from the LRU + inline void Unlink(T* const t) { + MutexLock _(&lock_); + UnlinkImpl(t); + } + + // Evict an element from the LRU + inline T* Pop() { + MutexLock _(&lock_); + + assert(tail_ && head_); + assert(!tail_->next_); + assert(!head_->prev_); + + T* t = head_; + while (t && t->refs_) { + t = t->next_; + } + + if (!t) { + // nothing can be evicted + return nullptr; + } + + assert(!t->refs_); + + // unlike the element + UnlinkImpl(t); + return t; + } + + // Move the element from the front of the list to the back of the list + inline void Touch(T* const t) { + MutexLock _(&lock_); + UnlinkImpl(t); + PushBackImpl(t); + } + + // Check if the LRU is empty + inline bool IsEmpty() const { + MutexLock _(&lock_); + return !head_ && !tail_; + } + + private: + // Unlink an element from the LRU + void UnlinkImpl(T* const t) { + assert(t); + + lock_.AssertHeld(); + + assert(head_ && tail_); + assert(t->prev_ || head_ == t); + assert(t->next_ || tail_ == t); + + if (t->prev_) { + t->prev_->next_ = t->next_; + } + if (t->next_) { + t->next_->prev_ = t->prev_; + } + + if (tail_ == t) { + tail_ = tail_->prev_; + } + if (head_ == t) { + head_ = head_->next_; + } + + t->next_ = t->prev_ = nullptr; + } + + // Insert an element at the hot end + inline void PushBack(T* const t) { + MutexLock _(&lock_); + PushBackImpl(t); + } + + inline void PushBackImpl(T* const t) { + assert(t); + assert(!t->next_); + assert(!t->prev_); + + lock_.AssertHeld(); + + assert((!head_ && !tail_) || (head_ && tail_)); + assert(!head_ || !head_->prev_); + assert(!tail_ || !tail_->next_); + + t->prev_ = tail_; + if (tail_) { + tail_->next_ = t; + } + + tail_ = t; + if (!head_) { + head_ = tail_; + } + } + + mutable port::Mutex lock_; // synchronization primitive + T* head_ = nullptr; // front (cold) + T* tail_ = nullptr; // back (hot) +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git 
a/src/rocksdb/utilities/persistent_cache/persistent_cache_bench.cc b/src/rocksdb/utilities/persistent_cache/persistent_cache_bench.cc new file mode 100644 index 000000000..9d6e15d6b --- /dev/null +++ b/src/rocksdb/utilities/persistent_cache/persistent_cache_bench.cc @@ -0,0 +1,359 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#ifndef ROCKSDB_LITE + +#ifndef GFLAGS +#include +int main() { fprintf(stderr, "Please install gflags to run tools\n"); } +#else +#include +#include +#include +#include +#include + +#include "monitoring/histogram.h" +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/system_clock.h" +#include "table/block_based/block_builder.h" +#include "util/gflags_compat.h" +#include "util/mutexlock.h" +#include "util/stop_watch.h" +#include "utilities/persistent_cache/block_cache_tier.h" +#include "utilities/persistent_cache/persistent_cache_tier.h" +#include "utilities/persistent_cache/volatile_tier_impl.h" + +DEFINE_int32(nsec, 10, "nsec"); +DEFINE_int32(nthread_write, 1, "Insert threads"); +DEFINE_int32(nthread_read, 1, "Lookup threads"); +DEFINE_string(path, "/tmp/microbench/blkcache", "Path for cachefile"); +DEFINE_string(log_path, "/tmp/log", "Path for the log file"); +DEFINE_uint64(cache_size, std::numeric_limits::max(), "Cache size"); +DEFINE_int32(iosize, 4 * 1024, "Read IO size"); +DEFINE_int32(writer_iosize, 4 * 1024, "File writer IO size"); +DEFINE_int32(writer_qdepth, 1, "File writer qdepth"); +DEFINE_bool(enable_pipelined_writes, false, "Enable async writes"); +DEFINE_string(cache_type, "block_cache", + "Cache type. 
(block_cache, volatile, tiered)"); +DEFINE_bool(benchmark, false, "Benchmark mode"); +DEFINE_int32(volatile_cache_pct, 10, "Percentage of cache in memory tier."); + +namespace ROCKSDB_NAMESPACE { + +std::unique_ptr NewVolatileCache() { + assert(FLAGS_cache_size != std::numeric_limits::max()); + std::unique_ptr pcache( + new VolatileCacheTier(FLAGS_cache_size)); + return pcache; +} + +std::unique_ptr NewBlockCache() { + std::shared_ptr log; + if (!Env::Default()->NewLogger(FLAGS_log_path, &log).ok()) { + fprintf(stderr, "Error creating log %s \n", FLAGS_log_path.c_str()); + return nullptr; + } + + PersistentCacheConfig opt(Env::Default(), FLAGS_path, FLAGS_cache_size, log); + opt.writer_dispatch_size = FLAGS_writer_iosize; + opt.writer_qdepth = FLAGS_writer_qdepth; + opt.pipeline_writes = FLAGS_enable_pipelined_writes; + opt.max_write_pipeline_backlog_size = std::numeric_limits::max(); + std::unique_ptr cache(new BlockCacheTier(opt)); + Status status = cache->Open(); + return cache; +} + +// create a new cache tier +// construct a tiered RAM+Block cache +std::unique_ptr NewTieredCache( + const size_t mem_size, const PersistentCacheConfig& opt) { + std::unique_ptr tcache(new PersistentTieredCache()); + // create primary tier + assert(mem_size); + auto pcache = + std::shared_ptr(new VolatileCacheTier(mem_size)); + tcache->AddTier(pcache); + // create secondary tier + auto scache = std::shared_ptr(new BlockCacheTier(opt)); + tcache->AddTier(scache); + + Status s = tcache->Open(); + assert(s.ok()); + return tcache; +} + +std::unique_ptr NewTieredCache() { + std::shared_ptr log; + if (!Env::Default()->NewLogger(FLAGS_log_path, &log).ok()) { + fprintf(stderr, "Error creating log %s \n", FLAGS_log_path.c_str()); + abort(); + } + + auto pct = FLAGS_volatile_cache_pct / static_cast(100); + PersistentCacheConfig opt(Env::Default(), FLAGS_path, + (1 - pct) * FLAGS_cache_size, log); + opt.writer_dispatch_size = FLAGS_writer_iosize; + opt.writer_qdepth = FLAGS_writer_qdepth; + opt.pipeline_writes = FLAGS_enable_pipelined_writes; + opt.max_write_pipeline_backlog_size = std::numeric_limits::max(); + return NewTieredCache(FLAGS_cache_size * pct, opt); +} + +// +// Benchmark driver +// +class CacheTierBenchmark { + public: + explicit CacheTierBenchmark(std::shared_ptr&& cache) + : cache_(cache) { + if (FLAGS_nthread_read) { + fprintf(stdout, "Pre-populating\n"); + Prepop(); + fprintf(stdout, "Pre-population completed\n"); + } + + stats_.Clear(); + + // Start IO threads + std::list threads; + Spawn(FLAGS_nthread_write, &threads, + std::bind(&CacheTierBenchmark::Write, this)); + Spawn(FLAGS_nthread_read, &threads, + std::bind(&CacheTierBenchmark::Read, this)); + + // Wait till FLAGS_nsec and then signal to quit + StopWatchNano t(SystemClock::Default().get(), /*auto_start=*/true); + size_t sec = t.ElapsedNanos() / 1000000000ULL; + while (!quit_) { + sec = t.ElapsedNanos() / 1000000000ULL; + quit_ = sec > size_t(FLAGS_nsec); + /* sleep override */ sleep(1); + } + + // Wait for threads to exit + Join(&threads); + // Print stats + PrintStats(sec); + // Close the cache + cache_->TEST_Flush(); + cache_->Close(); + } + + private: + void PrintStats(const size_t sec) { + std::ostringstream msg; + msg << "Test stats" << std::endl + << "* Elapsed: " << sec << " s" << std::endl + << "* Write Latency:" << std::endl + << stats_.write_latency_.ToString() << std::endl + << "* Read Latency:" << std::endl + << stats_.read_latency_.ToString() << std::endl + << "* Bytes written:" << std::endl + << 
stats_.bytes_written_.ToString() << std::endl + << "* Bytes read:" << std::endl + << stats_.bytes_read_.ToString() << std::endl + << "Cache stats:" << std::endl + << cache_->PrintStats() << std::endl; + fprintf(stderr, "%s\n", msg.str().c_str()); + } + + // + // Insert implementation and corresponding helper functions + // + void Prepop() { + for (uint64_t i = 0; i < 1024 * 1024; ++i) { + InsertKey(i); + insert_key_limit_++; + read_key_limit_++; + } + + // Wait until data is flushed + cache_->TEST_Flush(); + // warmup the cache + for (uint64_t i = 0; i < 1024 * 1024; ReadKey(i++)) { + } + } + + void Write() { + while (!quit_) { + InsertKey(insert_key_limit_++); + } + } + + void InsertKey(const uint64_t key) { + // construct key + uint64_t k[3]; + Slice block_key = FillKey(k, key); + + // construct value + auto block = NewBlock(key); + + // insert + StopWatchNano timer(SystemClock::Default().get(), /*auto_start=*/true); + while (true) { + Status status = cache_->Insert(block_key, block.get(), FLAGS_iosize); + if (status.ok()) { + break; + } + + // transient error is possible if we run without pipelining + assert(!FLAGS_enable_pipelined_writes); + } + + // adjust stats + const size_t elapsed_micro = timer.ElapsedNanos() / 1000; + stats_.write_latency_.Add(elapsed_micro); + stats_.bytes_written_.Add(FLAGS_iosize); + } + + // + // Read implementation + // + void Read() { + while (!quit_) { + ReadKey(random() % read_key_limit_); + } + } + + void ReadKey(const uint64_t val) { + // construct key + uint64_t k[3]; + Slice key = FillKey(k, val); + + // Lookup in cache + StopWatchNano timer(SystemClock::Default().get(), /*auto_start=*/true); + std::unique_ptr block; + size_t size; + Status status = cache_->Lookup(key, &block, &size); + if (!status.ok()) { + fprintf(stderr, "%s\n", status.ToString().c_str()); + } + assert(status.ok()); + assert(size == (size_t)FLAGS_iosize); + + // adjust stats + const size_t elapsed_micro = timer.ElapsedNanos() / 1000; + stats_.read_latency_.Add(elapsed_micro); + stats_.bytes_read_.Add(FLAGS_iosize); + + // verify content + if (!FLAGS_benchmark) { + auto expected_block = NewBlock(val); + assert(memcmp(block.get(), expected_block.get(), FLAGS_iosize) == 0); + } + } + + // create data for a key by filling with a certain pattern + std::unique_ptr NewBlock(const uint64_t val) { + std::unique_ptr data(new char[FLAGS_iosize]); + memset(data.get(), val % 255, FLAGS_iosize); + return data; + } + + // spawn threads + void Spawn(const size_t n, std::list* threads, + const std::function& fn) { + for (size_t i = 0; i < n; ++i) { + threads->emplace_back(fn); + } + } + + // join threads + void Join(std::list* threads) { + for (auto& th : *threads) { + th.join(); + } + } + + // construct key + Slice FillKey(uint64_t (&k)[3], const uint64_t val) { + k[0] = k[1] = 0; + k[2] = val; + void* p = static_cast(&k); + return Slice(static_cast(p), sizeof(k)); + } + + // benchmark stats + struct Stats { + void Clear() { + bytes_written_.Clear(); + bytes_read_.Clear(); + read_latency_.Clear(); + write_latency_.Clear(); + } + + HistogramImpl bytes_written_; + HistogramImpl bytes_read_; + HistogramImpl read_latency_; + HistogramImpl write_latency_; + }; + + std::shared_ptr cache_; // cache implementation + std::atomic insert_key_limit_{0}; // data inserted upto + std::atomic read_key_limit_{0}; // data can be read safely upto + bool quit_ = false; // Quit thread ? 
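+  // quit_ is set by the timing loop in the constructor once FLAGS_nsec has
+  // elapsed and is polled by the Write/Read worker threads.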
+ mutable Stats stats_; // Stats +}; + +} // namespace ROCKSDB_NAMESPACE + +// +// main +// +int main(int argc, char** argv) { + GFLAGS_NAMESPACE::SetUsageMessage(std::string("\nUSAGE:\n") + + std::string(argv[0]) + " [OPTIONS]..."); + GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, false); + + std::ostringstream msg; + msg << "Config" << std::endl + << "======" << std::endl + << "* nsec=" << FLAGS_nsec << std::endl + << "* nthread_write=" << FLAGS_nthread_write << std::endl + << "* path=" << FLAGS_path << std::endl + << "* cache_size=" << FLAGS_cache_size << std::endl + << "* iosize=" << FLAGS_iosize << std::endl + << "* writer_iosize=" << FLAGS_writer_iosize << std::endl + << "* writer_qdepth=" << FLAGS_writer_qdepth << std::endl + << "* enable_pipelined_writes=" << FLAGS_enable_pipelined_writes + << std::endl + << "* cache_type=" << FLAGS_cache_type << std::endl + << "* benchmark=" << FLAGS_benchmark << std::endl + << "* volatile_cache_pct=" << FLAGS_volatile_cache_pct << std::endl; + + fprintf(stderr, "%s\n", msg.str().c_str()); + + std::shared_ptr cache; + if (FLAGS_cache_type == "block_cache") { + fprintf(stderr, "Using block cache implementation\n"); + cache = ROCKSDB_NAMESPACE::NewBlockCache(); + } else if (FLAGS_cache_type == "volatile") { + fprintf(stderr, "Using volatile cache implementation\n"); + cache = ROCKSDB_NAMESPACE::NewVolatileCache(); + } else if (FLAGS_cache_type == "tiered") { + fprintf(stderr, "Using tiered cache implementation\n"); + cache = ROCKSDB_NAMESPACE::NewTieredCache(); + } else { + fprintf(stderr, "Unknown option for cache\n"); + } + + assert(cache); + if (!cache) { + fprintf(stderr, "Error creating cache\n"); + abort(); + } + + std::unique_ptr benchmark( + new ROCKSDB_NAMESPACE::CacheTierBenchmark(std::move(cache))); + + return 0; +} +#endif // #ifndef GFLAGS +#else +int main(int, char**) { return 0; } +#endif diff --git a/src/rocksdb/utilities/persistent_cache/persistent_cache_test.cc b/src/rocksdb/utilities/persistent_cache/persistent_cache_test.cc new file mode 100644 index 000000000..d1b18b68a --- /dev/null +++ b/src/rocksdb/utilities/persistent_cache/persistent_cache_test.cc @@ -0,0 +1,462 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
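+// The tests below exercise the persistent cache tiers (volatile, block and
+// tiered) through the harness declared in persistent_cache_test.h. Most of
+// them are DISABLED_ because they are too expensive for routine runs;
+// kStressFactor below scales the key counts and sizes down.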
+#if !defined ROCKSDB_LITE + +#include "utilities/persistent_cache/persistent_cache_test.h" + +#include +#include +#include + +#include "file/file_util.h" +#include "utilities/persistent_cache/block_cache_tier.h" + +namespace ROCKSDB_NAMESPACE { + +static const double kStressFactor = .125; + +#ifdef OS_LINUX +static void OnOpenForRead(void* arg) { + int* val = static_cast(arg); + *val &= ~O_DIRECT; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "NewRandomAccessFile:O_DIRECT", + std::bind(OnOpenForRead, std::placeholders::_1)); +} + +static void OnOpenForWrite(void* arg) { + int* val = static_cast(arg); + *val &= ~O_DIRECT; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "NewWritableFile:O_DIRECT", + std::bind(OnOpenForWrite, std::placeholders::_1)); +} +#endif + +static void OnDeleteDir(void* arg) { + char* dir = static_cast(arg); + ASSERT_OK(DestroyDir(Env::Default(), std::string(dir))); +} + +// +// Simple logger that prints message on stdout +// +class ConsoleLogger : public Logger { + public: + using Logger::Logv; + ConsoleLogger() : Logger(InfoLogLevel::ERROR_LEVEL) {} + + void Logv(const char* format, va_list ap) override { + MutexLock _(&lock_); + vprintf(format, ap); + printf("\n"); + } + + port::Mutex lock_; +}; + +// construct a tiered RAM+Block cache +std::unique_ptr NewTieredCache( + const size_t mem_size, const PersistentCacheConfig& opt) { + std::unique_ptr tcache(new PersistentTieredCache()); + // create primary tier + assert(mem_size); + auto pcache = std::shared_ptr(new VolatileCacheTier( + /*is_compressed*/ true, mem_size)); + tcache->AddTier(pcache); + // create secondary tier + auto scache = std::shared_ptr(new BlockCacheTier(opt)); + tcache->AddTier(scache); + + Status s = tcache->Open(); + assert(s.ok()); + return tcache; +} + +// create block cache +std::unique_ptr NewBlockCache( + Env* env, const std::string& path, + const uint64_t max_size = std::numeric_limits::max(), + const bool enable_direct_writes = false) { + const uint32_t max_file_size = + static_cast(12 * 1024 * 1024 * kStressFactor); + auto log = std::make_shared(); + PersistentCacheConfig opt(env, path, max_size, log); + opt.cache_file_size = max_file_size; + opt.max_write_pipeline_backlog_size = std::numeric_limits::max(); + opt.enable_direct_writes = enable_direct_writes; + std::unique_ptr scache(new BlockCacheTier(opt)); + Status s = scache->Open(); + assert(s.ok()); + return scache; +} + +// create a new cache tier +std::unique_ptr NewTieredCache( + Env* env, const std::string& path, const uint64_t max_volatile_cache_size, + const uint64_t max_block_cache_size = + std::numeric_limits::max()) { + const uint32_t max_file_size = + static_cast(12 * 1024 * 1024 * kStressFactor); + auto log = std::make_shared(); + auto opt = PersistentCacheConfig(env, path, max_block_cache_size, log); + opt.cache_file_size = max_file_size; + opt.max_write_pipeline_backlog_size = std::numeric_limits::max(); + // create tier out of the two caches + auto cache = NewTieredCache(max_volatile_cache_size, opt); + return cache; +} + +PersistentCacheTierTest::PersistentCacheTierTest() + : path_(test::PerThreadDBPath("cache_test")) { +#ifdef OS_LINUX + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "NewRandomAccessFile:O_DIRECT", OnOpenForRead); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "NewWritableFile:O_DIRECT", OnOpenForWrite); +#endif +} + +// Block cache tests +TEST_F(PersistentCacheTierTest, 
DISABLED_BlockCacheInsertWithFileCreateError) { + cache_ = NewBlockCache(Env::Default(), path_, + /*size=*/std::numeric_limits::max(), + /*direct_writes=*/false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BlockCacheTier::NewCacheFile:DeleteDir", OnDeleteDir); + + RunNegativeInsertTest(/*nthreads=*/1, + /*max_keys*/ + static_cast(10 * 1024 * kStressFactor)); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +// Travis is unable to handle the normal version of the tests running out of +// fds, out of space and timeouts. This is an easier version of the test +// specifically written for Travis +TEST_F(PersistentCacheTierTest, DISABLED_BasicTest) { + cache_ = std::make_shared(); + RunInsertTest(/*nthreads=*/1, /*max_keys=*/1024); + + cache_ = NewBlockCache(Env::Default(), path_, + /*size=*/std::numeric_limits::max(), + /*direct_writes=*/true); + RunInsertTest(/*nthreads=*/1, /*max_keys=*/1024); + + cache_ = NewTieredCache(Env::Default(), path_, + /*memory_size=*/static_cast(1 * 1024 * 1024)); + RunInsertTest(/*nthreads=*/1, /*max_keys=*/1024); +} + +// Volatile cache tests +// DISABLED for now (somewhat expensive) +TEST_F(PersistentCacheTierTest, DISABLED_VolatileCacheInsert) { + for (auto nthreads : {1, 5}) { + for (auto max_keys : + {10 * 1024 * kStressFactor, 1 * 1024 * 1024 * kStressFactor}) { + cache_ = std::make_shared(); + RunInsertTest(nthreads, static_cast(max_keys)); + } + } +} + +// DISABLED for now (somewhat expensive) +TEST_F(PersistentCacheTierTest, DISABLED_VolatileCacheInsertWithEviction) { + for (auto nthreads : {1, 5}) { + for (auto max_keys : {1 * 1024 * 1024 * kStressFactor}) { + cache_ = std::make_shared( + /*compressed=*/true, + /*size=*/static_cast(1 * 1024 * 1024 * kStressFactor)); + RunInsertTestWithEviction(nthreads, static_cast(max_keys)); + } + } +} + +// Block cache tests +// DISABLED for now (expensive) +TEST_F(PersistentCacheTierTest, DISABLED_BlockCacheInsert) { + for (auto direct_writes : {true, false}) { + for (auto nthreads : {1, 5}) { + for (auto max_keys : + {10 * 1024 * kStressFactor, 1 * 1024 * 1024 * kStressFactor}) { + cache_ = NewBlockCache(Env::Default(), path_, + /*size=*/std::numeric_limits::max(), + direct_writes); + RunInsertTest(nthreads, static_cast(max_keys)); + } + } + } +} + +// DISABLED for now (somewhat expensive) +TEST_F(PersistentCacheTierTest, DISABLED_BlockCacheInsertWithEviction) { + for (auto nthreads : {1, 5}) { + for (auto max_keys : {1 * 1024 * 1024 * kStressFactor}) { + cache_ = NewBlockCache( + Env::Default(), path_, + /*max_size=*/static_cast(200 * 1024 * 1024 * kStressFactor)); + RunInsertTestWithEviction(nthreads, static_cast(max_keys)); + } + } +} + +// Tiered cache tests +// DISABLED for now (expensive) +TEST_F(PersistentCacheTierTest, DISABLED_TieredCacheInsert) { + for (auto nthreads : {1, 5}) { + for (auto max_keys : + {10 * 1024 * kStressFactor, 1 * 1024 * 1024 * kStressFactor}) { + cache_ = NewTieredCache( + Env::Default(), path_, + /*memory_size=*/static_cast(1 * 1024 * 1024 * kStressFactor)); + RunInsertTest(nthreads, static_cast(max_keys)); + } + } +} + +// the tests causes a lot of file deletions which Travis limited testing +// environment cannot handle +// DISABLED for now (somewhat expensive) +TEST_F(PersistentCacheTierTest, DISABLED_TieredCacheInsertWithEviction) { + for (auto nthreads : {1, 5}) { + for (auto max_keys : {1 * 1024 * 1024 * kStressFactor}) { + cache_ = NewTieredCache( + Env::Default(), path_, + /*memory_size=*/static_cast(1 * 1024 * 1024 * 
kStressFactor), + /*block_cache_size*/ + static_cast(200 * 1024 * 1024 * kStressFactor)); + RunInsertTestWithEviction(nthreads, static_cast(max_keys)); + } + } +} + +std::shared_ptr MakeVolatileCache( + Env* /*env*/, const std::string& /*dbname*/) { + return std::make_shared(); +} + +std::shared_ptr MakeBlockCache(Env* env, + const std::string& dbname) { + return NewBlockCache(env, dbname); +} + +std::shared_ptr MakeTieredCache( + Env* env, const std::string& dbname) { + const auto memory_size = 1 * 1024 * 1024 * kStressFactor; + return NewTieredCache(env, dbname, static_cast(memory_size)); +} + +#ifdef OS_LINUX +static void UniqueIdCallback(void* arg) { + int* result = reinterpret_cast(arg); + if (*result == -1) { + *result = 0; + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback); +} +#endif + +TEST_F(PersistentCacheTierTest, FactoryTest) { + for (auto nvm_opt : {true, false}) { + ASSERT_FALSE(cache_); + auto log = std::make_shared(); + std::shared_ptr cache; + ASSERT_OK(NewPersistentCache(Env::Default(), path_, + /*size=*/1 * 1024 * 1024 * 1024, log, nvm_opt, + &cache)); + ASSERT_TRUE(cache); + ASSERT_EQ(cache->Stats().size(), 1); + ASSERT_TRUE(cache->Stats()[0].size()); + cache.reset(); + } +} + +PersistentCacheDBTest::PersistentCacheDBTest() + : DBTestBase("cache_test", /*env_do_fsync=*/true) { +#ifdef OS_LINUX + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "NewRandomAccessFile:O_DIRECT", OnOpenForRead); +#endif +} + +// test template +void PersistentCacheDBTest::RunTest( + const std::function(bool)>& new_pcache, + const size_t max_keys = 100 * 1024, const size_t max_usecase = 5) { + // number of insertion interations + int num_iter = static_cast(max_keys * kStressFactor); + + for (size_t iter = 0; iter < max_usecase; iter++) { + Options options; + options.write_buffer_size = + static_cast(64 * 1024 * kStressFactor); // small write buffer + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options = CurrentOptions(options); + + // setup page cache + std::shared_ptr pcache; + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + + const size_t size_max = std::numeric_limits::max(); + + switch (iter) { + case 0: + // page cache, block cache, no-compressed cache + pcache = new_pcache(/*is_compressed=*/true); + table_options.persistent_cache = pcache; + table_options.block_cache = NewLRUCache(size_max); + table_options.block_cache_compressed = nullptr; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + break; + case 1: + // page cache, block cache, compressed cache + pcache = new_pcache(/*is_compressed=*/true); + table_options.persistent_cache = pcache; + table_options.block_cache = NewLRUCache(size_max); + table_options.block_cache_compressed = NewLRUCache(size_max); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + break; + case 2: + // page cache, block cache, compressed cache + KNoCompression + // both block cache and compressed cache, but DB is not compressed + // also, make block cache sizes bigger, to trigger block cache hits + pcache = new_pcache(/*is_compressed=*/true); + table_options.persistent_cache = pcache; + table_options.block_cache = 
NewLRUCache(size_max); + table_options.block_cache_compressed = NewLRUCache(size_max); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.compression = kNoCompression; + break; + case 3: + // page cache, no block cache, no compressed cache + pcache = new_pcache(/*is_compressed=*/false); + table_options.persistent_cache = pcache; + table_options.block_cache = nullptr; + table_options.block_cache_compressed = nullptr; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + break; + case 4: + // page cache, no block cache, no compressed cache + // Page cache caches compressed blocks + pcache = new_pcache(/*is_compressed=*/true); + table_options.persistent_cache = pcache; + table_options.block_cache = nullptr; + table_options.block_cache_compressed = nullptr; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + break; + default: + FAIL(); + } + + std::vector values; + // insert data + Insert(options, table_options, num_iter, &values); + // flush all data in cache to device + pcache->TEST_Flush(); + // verify data + Verify(num_iter, values); + + auto block_miss = TestGetTickerCount(options, BLOCK_CACHE_MISS); + auto compressed_block_hit = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT); + auto compressed_block_miss = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS); + auto page_hit = TestGetTickerCount(options, PERSISTENT_CACHE_HIT); + auto page_miss = TestGetTickerCount(options, PERSISTENT_CACHE_MISS); + + // check that we triggered the appropriate code paths in the cache + switch (iter) { + case 0: + // page cache, block cache, no-compressed cache + ASSERT_GT(page_miss, 0); + ASSERT_GT(page_hit, 0); + ASSERT_GT(block_miss, 0); + ASSERT_EQ(compressed_block_miss, 0); + ASSERT_EQ(compressed_block_hit, 0); + break; + case 1: + // page cache, block cache, compressed cache + ASSERT_GT(page_miss, 0); + ASSERT_GT(block_miss, 0); + ASSERT_GT(compressed_block_miss, 0); + break; + case 2: + // page cache, block cache, compressed cache + KNoCompression + ASSERT_GT(page_miss, 0); + ASSERT_GT(page_hit, 0); + ASSERT_GT(block_miss, 0); + ASSERT_GT(compressed_block_miss, 0); + // remember kNoCompression + ASSERT_EQ(compressed_block_hit, 0); + break; + case 3: + case 4: + // page cache, no block cache, no compressed cache + ASSERT_GT(page_miss, 0); + ASSERT_GT(page_hit, 0); + ASSERT_EQ(compressed_block_hit, 0); + ASSERT_EQ(compressed_block_miss, 0); + break; + default: + FAIL(); + } + + options.create_if_missing = true; + DestroyAndReopen(options); + + ASSERT_OK(pcache->Close()); + } +} + +// Travis is unable to handle the normal version of the tests running out of +// fds, out of space and timeouts. This is an easier version of the test +// specifically written for Travis. +// Now used generally because main tests are too expensive as unit tests. 
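+// BasicTest therefore binds MakeBlockCache and runs only the first use case
+// (page cache + block cache) over 1024 keys, further scaled by kStressFactor.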
+TEST_F(PersistentCacheDBTest, BasicTest) { + RunTest(std::bind(&MakeBlockCache, env_, dbname_), /*max_keys=*/1024, + /*max_usecase=*/1); +} + +// test table with block page cache +// DISABLED for now (very expensive, especially memory) +TEST_F(PersistentCacheDBTest, DISABLED_BlockCacheTest) { + RunTest(std::bind(&MakeBlockCache, env_, dbname_)); +} + +// test table with volatile page cache +// DISABLED for now (very expensive, especially memory) +TEST_F(PersistentCacheDBTest, DISABLED_VolatileCacheTest) { + RunTest(std::bind(&MakeVolatileCache, env_, dbname_)); +} + +// test table with tiered page cache +// DISABLED for now (very expensive, especially memory) +TEST_F(PersistentCacheDBTest, DISABLED_TieredCacheTest) { + RunTest(std::bind(&MakeTieredCache, env_, dbname_)); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} +#else // !defined ROCKSDB_LITE +int main() { return 0; } +#endif // !defined ROCKSDB_LITE diff --git a/src/rocksdb/utilities/persistent_cache/persistent_cache_test.h b/src/rocksdb/utilities/persistent_cache/persistent_cache_test.h new file mode 100644 index 000000000..f13155ed6 --- /dev/null +++ b/src/rocksdb/utilities/persistent_cache/persistent_cache_test.h @@ -0,0 +1,286 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
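+// For reference, an application wires a persistent cache into RocksDB
+// through the public factory exercised by FactoryTest and the table options,
+// roughly as follows (a sketch; the path and size are placeholders and `log`
+// is any Logger):
+//
+//   std::shared_ptr<PersistentCache> pcache;
+//   Status s = NewPersistentCache(Env::Default(), "/path/to/cache/dir",
+//                                 /*size=*/1024 * 1024 * 1024, log,
+//                                 /*optimized_for_nvm=*/true, &pcache);
+//   BlockBasedTableOptions table_options;
+//   table_options.persistent_cache = pcache;
+//   options.table_factory.reset(NewBlockBasedTableFactory(table_options));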
+#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include +#include +#include +#include + +#include "db/db_test_util.h" +#include "memory/arena.h" +#include "port/port.h" +#include "rocksdb/cache.h" +#include "table/block_based/block_builder.h" +#include "test_util/testharness.h" +#include "util/random.h" +#include "utilities/persistent_cache/volatile_tier_impl.h" + +namespace ROCKSDB_NAMESPACE { + +// +// Unit tests for testing PersistentCacheTier +// +class PersistentCacheTierTest : public testing::Test { + public: + PersistentCacheTierTest(); + virtual ~PersistentCacheTierTest() { + if (cache_) { + Status s = cache_->Close(); + assert(s.ok()); + } + } + + protected: + // Flush cache + void Flush() { + if (cache_) { + cache_->TEST_Flush(); + } + } + + // create threaded workload + template + std::list SpawnThreads(const size_t n, const T& fn) { + std::list threads; + for (size_t i = 0; i < n; i++) { + port::Thread th(fn); + threads.push_back(std::move(th)); + } + return threads; + } + + // Wait for threads to join + void Join(std::list&& threads) { + for (auto& th : threads) { + th.join(); + } + threads.clear(); + } + + // Run insert workload in threads + void Insert(const size_t nthreads, const size_t max_keys) { + key_ = 0; + max_keys_ = max_keys; + // spawn threads + auto fn = std::bind(&PersistentCacheTierTest::InsertImpl, this); + auto threads = SpawnThreads(nthreads, fn); + // join with threads + Join(std::move(threads)); + // Flush cache + Flush(); + } + + // Run verification on the cache + void Verify(const size_t nthreads = 1, const bool eviction_enabled = false) { + stats_verify_hits_ = 0; + stats_verify_missed_ = 0; + key_ = 0; + // spawn threads + auto fn = + std::bind(&PersistentCacheTierTest::VerifyImpl, this, eviction_enabled); + auto threads = SpawnThreads(nthreads, fn); + // join with threads + Join(std::move(threads)); + } + + // pad 0 to numbers + std::string PaddedNumber(const size_t data, const size_t pad_size) { + assert(pad_size); + char* ret = new char[pad_size]; + int pos = static_cast(pad_size) - 1; + size_t count = 0; + size_t t = data; + // copy numbers + while (t) { + count++; + ret[pos--] = '0' + t % 10; + t = t / 10; + } + // copy 0s + while (pos >= 0) { + ret[pos--] = '0'; + } + // post condition + assert(count <= pad_size); + assert(pos == -1); + std::string result(ret, pad_size); + delete[] ret; + return result; + } + + // Insert workload implementation + void InsertImpl() { + const std::string prefix = "key_prefix_"; + + while (true) { + size_t i = key_++; + if (i >= max_keys_) { + break; + } + + char data[4 * 1024]; + memset(data, '0' + (i % 10), sizeof(data)); + auto k = prefix + PaddedNumber(i, /*count=*/8); + Slice key(k); + while (true) { + Status status = cache_->Insert(key, data, sizeof(data)); + if (status.ok()) { + break; + } + ASSERT_TRUE(status.IsTryAgain()); + Env::Default()->SleepForMicroseconds(1 * 1000 * 1000); + } + } + } + + // Verification implementation + void VerifyImpl(const bool eviction_enabled = false) { + const std::string prefix = "key_prefix_"; + while (true) { + size_t i = key_++; + if (i >= max_keys_) { + break; + } + + char edata[4 * 1024]; + memset(edata, '0' + (i % 10), sizeof(edata)); + auto k = prefix + PaddedNumber(i, /*count=*/8); + Slice key(k); + std::unique_ptr block; + size_t block_size; + + if (eviction_enabled) { + if (!cache_->Lookup(key, &block, &block_size).ok()) { + // assume that the key is evicted + stats_verify_missed_++; + continue; + } + } + + ASSERT_OK(cache_->Lookup(key, &block, 
&block_size)); + ASSERT_EQ(block_size, sizeof(edata)); + ASSERT_EQ(memcmp(edata, block.get(), sizeof(edata)), 0); + stats_verify_hits_++; + } + } + + // template for insert test + void RunInsertTest(const size_t nthreads, const size_t max_keys) { + Insert(nthreads, max_keys); + Verify(nthreads); + ASSERT_EQ(stats_verify_hits_, max_keys); + ASSERT_EQ(stats_verify_missed_, 0); + + ASSERT_OK(cache_->Close()); + cache_.reset(); + } + + // template for negative insert test + void RunNegativeInsertTest(const size_t nthreads, const size_t max_keys) { + Insert(nthreads, max_keys); + Verify(nthreads, /*eviction_enabled=*/true); + ASSERT_LT(stats_verify_hits_, max_keys); + ASSERT_GT(stats_verify_missed_, 0); + + ASSERT_OK(cache_->Close()); + cache_.reset(); + } + + // template for insert with eviction test + void RunInsertTestWithEviction(const size_t nthreads, const size_t max_keys) { + Insert(nthreads, max_keys); + Verify(nthreads, /*eviction_enabled=*/true); + ASSERT_EQ(stats_verify_hits_ + stats_verify_missed_, max_keys); + ASSERT_GT(stats_verify_hits_, 0); + ASSERT_GT(stats_verify_missed_, 0); + + ASSERT_OK(cache_->Close()); + cache_.reset(); + } + + const std::string path_; + std::shared_ptr log_; + std::shared_ptr cache_; + std::atomic key_{0}; + size_t max_keys_ = 0; + std::atomic stats_verify_hits_{0}; + std::atomic stats_verify_missed_{0}; +}; + +// +// RocksDB tests +// +class PersistentCacheDBTest : public DBTestBase { + public: + PersistentCacheDBTest(); + + static uint64_t TestGetTickerCount(const Options& options, + Tickers ticker_type) { + return static_cast( + options.statistics->getTickerCount(ticker_type)); + } + + // insert data to table + void Insert(const Options& options, + const BlockBasedTableOptions& /*table_options*/, + const int num_iter, std::vector* values) { + CreateAndReopenWithCF({"pikachu"}, options); + // default column family doesn't have block cache + Options no_block_cache_opts; + no_block_cache_opts.statistics = options.statistics; + no_block_cache_opts = CurrentOptions(no_block_cache_opts); + BlockBasedTableOptions table_options_no_bc; + table_options_no_bc.no_block_cache = true; + no_block_cache_opts.table_factory.reset( + NewBlockBasedTableFactory(table_options_no_bc)); + ReopenWithColumnFamilies( + {"default", "pikachu"}, + std::vector({no_block_cache_opts, options})); + + Random rnd(301); + + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + std::string str; + for (int i = 0; i < num_iter; i++) { + if (i % 4 == 0) { // high compression ratio + str = rnd.RandomString(1000); + } + values->push_back(str); + ASSERT_OK(Put(1, Key(i), (*values)[i])); + } + + // flush all data from memtable so that reads are from block cache + ASSERT_OK(Flush(1)); + } + + // verify data + void Verify(const int num_iter, const std::vector& values) { + for (int j = 0; j < 2; ++j) { + for (int i = 0; i < num_iter; i++) { + ASSERT_EQ(Get(1, Key(i)), values[i]); + } + } + } + + // test template + void RunTest(const std::function(bool)>& + new_pcache, + const size_t max_keys, const size_t max_usecase); +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git a/src/rocksdb/utilities/persistent_cache/persistent_cache_tier.cc b/src/rocksdb/utilities/persistent_cache/persistent_cache_tier.cc new file mode 100644 index 000000000..54cbce8f7 --- /dev/null +++ b/src/rocksdb/utilities/persistent_cache/persistent_cache_tier.cc @@ -0,0 +1,167 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#ifndef ROCKSDB_LITE + +#include "utilities/persistent_cache/persistent_cache_tier.h" + +#include +#include +#include + +namespace ROCKSDB_NAMESPACE { + +std::string PersistentCacheConfig::ToString() const { + std::string ret; + ret.reserve(20000); + const int kBufferSize = 200; + char buffer[kBufferSize]; + + snprintf(buffer, kBufferSize, " path: %s\n", path.c_str()); + ret.append(buffer); + snprintf(buffer, kBufferSize, " enable_direct_reads: %d\n", + enable_direct_reads); + ret.append(buffer); + snprintf(buffer, kBufferSize, " enable_direct_writes: %d\n", + enable_direct_writes); + ret.append(buffer); + snprintf(buffer, kBufferSize, " cache_size: %" PRIu64 "\n", cache_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " cache_file_size: %" PRIu32 "\n", + cache_file_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " writer_qdepth: %" PRIu32 "\n", + writer_qdepth); + ret.append(buffer); + snprintf(buffer, kBufferSize, " pipeline_writes: %d\n", pipeline_writes); + ret.append(buffer); + snprintf(buffer, kBufferSize, + " max_write_pipeline_backlog_size: %" PRIu64 "\n", + max_write_pipeline_backlog_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " write_buffer_size: %" PRIu32 "\n", + write_buffer_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " writer_dispatch_size: %" PRIu64 "\n", + writer_dispatch_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " is_compressed: %d\n", is_compressed); + ret.append(buffer); + + return ret; +} + +// +// PersistentCacheTier implementation +// +Status PersistentCacheTier::Open() { + if (next_tier_) { + return next_tier_->Open(); + } + return Status::OK(); +} + +Status PersistentCacheTier::Close() { + if (next_tier_) { + return next_tier_->Close(); + } + return Status::OK(); +} + +bool PersistentCacheTier::Reserve(const size_t /*size*/) { + // default implementation is a pass through + return true; +} + +bool PersistentCacheTier::Erase(const Slice& /*key*/) { + // default implementation is a pass through since not all cache tiers might + // support erase + return true; +} + +std::string PersistentCacheTier::PrintStats() { + std::ostringstream os; + for (auto tier_stats : Stats()) { + os << "---- next tier -----" << std::endl; + for (auto stat : tier_stats) { + os << stat.first << ": " << stat.second << std::endl; + } + } + return os.str(); +} + +PersistentCache::StatsType PersistentCacheTier::Stats() { + if (next_tier_) { + return next_tier_->Stats(); + } + return PersistentCache::StatsType{}; +} + +uint64_t PersistentCacheTier::NewId() { + return last_id_.fetch_add(1, std::memory_order_relaxed); +} + +// +// PersistentTieredCache implementation +// +PersistentTieredCache::~PersistentTieredCache() { assert(tiers_.empty()); } + +Status PersistentTieredCache::Open() { + assert(!tiers_.empty()); + return tiers_.front()->Open(); +} + +Status PersistentTieredCache::Close() { + assert(!tiers_.empty()); + Status status = tiers_.front()->Close(); + if (status.ok()) { + tiers_.clear(); + } + return status; +} + +bool PersistentTieredCache::Erase(const Slice& key) { + assert(!tiers_.empty()); + return tiers_.front()->Erase(key); +} + +PersistentCache::StatsType PersistentTieredCache::Stats() { + assert(!tiers_.empty()); + return tiers_.front()->Stats(); +} + +std::string PersistentTieredCache::PrintStats() { + 
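+  // Forwarded to the head tier; per-tier Stats() overrides chain through
+  // next_tier_, so a single call here collects stats for every tier below.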
assert(!tiers_.empty()); + return tiers_.front()->PrintStats(); +} + +Status PersistentTieredCache::Insert(const Slice& page_key, const char* data, + const size_t size) { + assert(!tiers_.empty()); + return tiers_.front()->Insert(page_key, data, size); +} + +Status PersistentTieredCache::Lookup(const Slice& page_key, + std::unique_ptr* data, + size_t* size) { + assert(!tiers_.empty()); + return tiers_.front()->Lookup(page_key, data, size); +} + +void PersistentTieredCache::AddTier(const Tier& tier) { + if (!tiers_.empty()) { + tiers_.back()->set_next_tier(tier); + } + tiers_.push_back(tier); +} + +bool PersistentTieredCache::IsCompressed() { + assert(tiers_.size()); + return tiers_.front()->IsCompressed(); +} + +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git a/src/rocksdb/utilities/persistent_cache/persistent_cache_tier.h b/src/rocksdb/utilities/persistent_cache/persistent_cache_tier.h new file mode 100644 index 000000000..65aadcd3f --- /dev/null +++ b/src/rocksdb/utilities/persistent_cache/persistent_cache_tier.h @@ -0,0 +1,342 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include +#include + +#include "monitoring/histogram.h" +#include "rocksdb/env.h" +#include "rocksdb/persistent_cache.h" +#include "rocksdb/status.h" +#include "rocksdb/system_clock.h" + +// Persistent Cache +// +// Persistent cache is tiered key-value cache that can use persistent medium. It +// is a generic design and can leverage any storage medium -- disk/SSD/NVM/RAM. +// The code has been kept generic but significant benchmark/design/development +// time has been spent to make sure the cache performs appropriately for +// respective storage medium. +// The file defines +// PersistentCacheTier : Implementation that handles individual cache tier +// PersistentTieresCache : Implementation that handles all tiers as a logical +// unit +// +// PersistentTieredCache architecture: +// +--------------------------+ PersistentCacheTier that handles multiple tiers +// | +----------------+ | +// | | RAM | PersistentCacheTier that handles RAM (VolatileCacheImpl) +// | +----------------+ | +// | | next | +// | v | +// | +----------------+ | +// | | NVM | PersistentCacheTier implementation that handles NVM +// | +----------------+ (BlockCacheImpl) +// | | next | +// | V | +// | +----------------+ | +// | | LE-SSD | PersistentCacheTier implementation that handles LE-SSD +// | +----------------+ (BlockCacheImpl) +// | | | +// | V | +// | null | +// +--------------------------+ +// | +// V +// null +namespace ROCKSDB_NAMESPACE { + +// Persistent Cache Config +// +// This struct captures all the options that are used to configure persistent +// cache. Some of the terminologies used in naming the options are +// +// dispatch size : +// This is the size in which IO is dispatched to the device +// +// write buffer size : +// This is the size of an individual write buffer size. Write buffers are +// grouped to form buffered file. 
+// +// cache size : +// This is the logical maximum for the cache size +// +// qdepth : +// This is the max number of IOs that can issues to the device in parallel +// +// pepeling : +// The writer code path follows pipelined architecture, which means the +// operations are handed off from one stage to another +// +// pipelining backlog size : +// With the pipelined architecture, there can always be backlogging of ops in +// pipeline queues. This is the maximum backlog size after which ops are dropped +// from queue +struct PersistentCacheConfig { + explicit PersistentCacheConfig( + Env* const _env, const std::string& _path, const uint64_t _cache_size, + const std::shared_ptr& _log, + const uint32_t _write_buffer_size = 1 * 1024 * 1024 /*1MB*/) { + env = _env; + clock = (env != nullptr) ? env->GetSystemClock().get() + : SystemClock::Default().get(); + path = _path; + log = _log; + cache_size = _cache_size; + writer_dispatch_size = write_buffer_size = _write_buffer_size; + } + + // + // Validate the settings. Our intentions are to catch erroneous settings ahead + // of time instead going violating invariants or causing dead locks. + // + Status ValidateSettings() const { + // (1) check pre-conditions for variables + if (!env || path.empty()) { + return Status::InvalidArgument("empty or null args"); + } + + // (2) assert size related invariants + // - cache size cannot be less than cache file size + // - individual write buffer size cannot be greater than cache file size + // - total write buffer size cannot be less than 2X cache file size + if (cache_size < cache_file_size || write_buffer_size >= cache_file_size || + write_buffer_size * write_buffer_count() < 2 * cache_file_size) { + return Status::InvalidArgument("invalid cache size"); + } + + // (2) check writer settings + // - Queue depth cannot be 0 + // - writer_dispatch_size cannot be greater than writer_buffer_size + // - dispatch size and buffer size need to be aligned + if (!writer_qdepth || writer_dispatch_size > write_buffer_size || + write_buffer_size % writer_dispatch_size) { + return Status::InvalidArgument("invalid writer settings"); + } + + return Status::OK(); + } + + // + // Env abstraction to use for system level operations + // + Env* env; + SystemClock* clock; + // + // Path for the block cache where blocks are persisted + // + std::string path; + + // + // Log handle for logging messages + // + std::shared_ptr log; + + // + // Enable direct IO for reading + // + bool enable_direct_reads = true; + + // + // Enable direct IO for writing + // + bool enable_direct_writes = false; + + // + // Logical cache size + // + uint64_t cache_size = std::numeric_limits::max(); + + // cache-file-size + // + // Cache consists of multiples of small files. This parameter defines the + // size of an individual cache file + // + // default: 1M + uint32_t cache_file_size = 100ULL * 1024 * 1024; + + // writer-qdepth + // + // The writers can issues IO to the devices in parallel. This parameter + // controls the max number if IOs that can issues in parallel to the block + // device + // + // default :1 + uint32_t writer_qdepth = 1; + + // pipeline-writes + // + // The write optionally follow pipelined architecture. This helps + // avoid regression in the eviction code path of the primary tier. This + // parameter defines if pipelining is enabled or disabled + // + // default: true + bool pipeline_writes = true; + + // max-write-pipeline-backlog-size + // + // Max pipeline buffer size. 
This is the maximum backlog we can accumulate + // while waiting for writes. After the limit, new ops will be dropped. + // + // Default: 1GiB + uint64_t max_write_pipeline_backlog_size = 1ULL * 1024 * 1024 * 1024; + + // write-buffer-size + // + // This is the size in which buffer slabs are allocated. + // + // Default: 1M + uint32_t write_buffer_size = 1ULL * 1024 * 1024; + + // write-buffer-count + // + // This is the total number of buffer slabs. This is calculated as a factor of + // file size in order to avoid dead lock. + size_t write_buffer_count() const { + assert(write_buffer_size); + return static_cast((writer_qdepth + 1.2) * cache_file_size / + write_buffer_size); + } + + // writer-dispatch-size + // + // The writer thread will dispatch the IO at the specified IO size + // + // default: 1M + uint64_t writer_dispatch_size = 1ULL * 1024 * 1024; + + // is_compressed + // + // This option determines if the cache will run in compressed mode or + // uncompressed mode + bool is_compressed = true; + + PersistentCacheConfig MakePersistentCacheConfig( + const std::string& path, const uint64_t size, + const std::shared_ptr& log); + + std::string ToString() const; +}; + +// Persistent Cache Tier +// +// This a logical abstraction that defines a tier of the persistent cache. Tiers +// can be stacked over one another. PersistentCahe provides the basic definition +// for accessing/storing in the cache. PersistentCacheTier extends the interface +// to enable management and stacking of tiers. +class PersistentCacheTier : public PersistentCache { + public: + using Tier = std::shared_ptr; + + virtual ~PersistentCacheTier() {} + + // Open the persistent cache tier + virtual Status Open(); + + // Close the persistent cache tier + virtual Status Close(); + + // Reserve space up to 'size' bytes + virtual bool Reserve(const size_t size); + + // Erase a key from the cache + virtual bool Erase(const Slice& key); + + // Print stats to string recursively + virtual std::string PrintStats(); + + virtual PersistentCache::StatsType Stats() override; + + // Insert to page cache + virtual Status Insert(const Slice& page_key, const char* data, + const size_t size) override = 0; + + // Lookup page cache by page identifier + virtual Status Lookup(const Slice& page_key, std::unique_ptr* data, + size_t* size) override = 0; + + // Does it store compressed data ? + virtual bool IsCompressed() override = 0; + + virtual std::string GetPrintableOptions() const override = 0; + + virtual uint64_t NewId() override; + + // Return a reference to next tier + virtual Tier& next_tier() { return next_tier_; } + + // Set the value for next tier + virtual void set_next_tier(const Tier& tier) { + assert(!next_tier_); + next_tier_ = tier; + } + + virtual void TEST_Flush() { + if (next_tier_) { + next_tier_->TEST_Flush(); + } + } + + private: + Tier next_tier_; // next tier + std::atomic last_id_{1}; +}; + +// PersistentTieredCache +// +// Abstraction that helps you construct a tiers of persistent caches as a +// unified cache. The tier(s) of cache will act a single tier for management +// ease and support PersistentCache methods for accessing data. 
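+//
+// A minimal usage sketch (illustrative only; the lower, on-disk tier is a
+// stand-in here and its construction is elided, since its options live with
+// the concrete tier implementation):
+//
+//   auto tiered = std::make_shared<PersistentTieredCache>();
+//   // RAM tier first (top of the stack), e.g. capped at 64 MiB.
+//   tiered->AddTier(std::make_shared<VolatileCacheTier>(
+//       /*is_compressed=*/true, /*max_size=*/64 * 1024 * 1024));
+//   tiered->AddTier(block_cache_tier);  // lower persistent tier, built elsewhere
+//   Status s = tiered->Open();
+//   if (s.ok()) {
+//     s = tiered->Insert(page_key, buf, sizeof(buf));  // buf: caller-owned bytes
+//   }
+//
+// AddTier() links each new tier as the next_tier of the previous one, so a
+// Lookup() miss in the RAM tier falls through to the tier below it.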
+class PersistentTieredCache : public PersistentCacheTier { + public: + virtual ~PersistentTieredCache(); + + Status Open() override; + Status Close() override; + bool Erase(const Slice& key) override; + std::string PrintStats() override; + PersistentCache::StatsType Stats() override; + Status Insert(const Slice& page_key, const char* data, + const size_t size) override; + Status Lookup(const Slice& page_key, std::unique_ptr* data, + size_t* size) override; + bool IsCompressed() override; + + std::string GetPrintableOptions() const override { + return "PersistentTieredCache"; + } + + void AddTier(const Tier& tier); + + Tier& next_tier() override { + auto it = tiers_.end(); + return (*it)->next_tier(); + } + + void set_next_tier(const Tier& tier) override { + auto it = tiers_.end(); + (*it)->set_next_tier(tier); + } + + void TEST_Flush() override { + assert(!tiers_.empty()); + tiers_.front()->TEST_Flush(); + PersistentCacheTier::TEST_Flush(); + } + + protected: + std::list tiers_; // list of tiers top-down +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git a/src/rocksdb/utilities/persistent_cache/persistent_cache_util.h b/src/rocksdb/utilities/persistent_cache/persistent_cache_util.h new file mode 100644 index 000000000..2a769652d --- /dev/null +++ b/src/rocksdb/utilities/persistent_cache/persistent_cache_util.h @@ -0,0 +1,67 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include +#include + +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +// +// Simple synchronized queue implementation with the option of +// bounding the queue +// +// On overflow, the elements will be discarded +// +template +class BoundedQueue { + public: + explicit BoundedQueue( + const size_t max_size = std::numeric_limits::max()) + : cond_empty_(&lock_), max_size_(max_size) {} + + virtual ~BoundedQueue() {} + + void Push(T&& t) { + MutexLock _(&lock_); + if (max_size_ != std::numeric_limits::max() && + size_ + t.Size() >= max_size_) { + // overflow + return; + } + + size_ += t.Size(); + q_.push_back(std::move(t)); + cond_empty_.SignalAll(); + } + + T Pop() { + MutexLock _(&lock_); + while (q_.empty()) { + cond_empty_.Wait(); + } + + T t = std::move(q_.front()); + size_ -= t.Size(); + q_.pop_front(); + return t; + } + + size_t Size() const { + MutexLock _(&lock_); + return size_; + } + + private: + mutable port::Mutex lock_; + port::CondVar cond_empty_; + std::list q_; + size_t size_ = 0; + const size_t max_size_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc b/src/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc new file mode 100644 index 000000000..45d2830aa --- /dev/null +++ b/src/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc @@ -0,0 +1,140 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// +#ifndef ROCKSDB_LITE + +#include "utilities/persistent_cache/volatile_tier_impl.h" + +#include + +namespace ROCKSDB_NAMESPACE { + +void VolatileCacheTier::DeleteCacheData(VolatileCacheTier::CacheData* data) { + assert(data); + delete data; +} + +VolatileCacheTier::~VolatileCacheTier() { index_.Clear(&DeleteCacheData); } + +PersistentCache::StatsType VolatileCacheTier::Stats() { + std::map stat; + stat.insert({"persistent_cache.volatile_cache.hits", + static_cast(stats_.cache_hits_)}); + stat.insert({"persistent_cache.volatile_cache.misses", + static_cast(stats_.cache_misses_)}); + stat.insert({"persistent_cache.volatile_cache.inserts", + static_cast(stats_.cache_inserts_)}); + stat.insert({"persistent_cache.volatile_cache.evicts", + static_cast(stats_.cache_evicts_)}); + stat.insert({"persistent_cache.volatile_cache.hit_pct", + static_cast(stats_.CacheHitPct())}); + stat.insert({"persistent_cache.volatile_cache.miss_pct", + static_cast(stats_.CacheMissPct())}); + + auto out = PersistentCacheTier::Stats(); + out.push_back(stat); + return out; +} + +Status VolatileCacheTier::Insert(const Slice& page_key, const char* data, + const size_t size) { + // precondition + assert(data); + assert(size); + + // increment the size + size_ += size; + + // check if we have overshot the limit, if so evict some space + while (size_ > max_size_) { + if (!Evict()) { + // unable to evict data, we give up so we don't spike read + // latency + assert(size_ >= size); + size_ -= size; + return Status::TryAgain("Unable to evict any data"); + } + } + + assert(size_ >= size); + + // insert order: LRU, followed by index + std::string key(page_key.data(), page_key.size()); + std::string value(data, size); + std::unique_ptr cache_data( + new CacheData(std::move(key), std::move(value))); + bool ok = index_.Insert(cache_data.get()); + if (!ok) { + // decrement the size that we incremented ahead of time + assert(size_ >= size); + size_ -= size; + // failed to insert to cache, block already in cache + return Status::TryAgain("key already exists in volatile cache"); + } + + cache_data.release(); + stats_.cache_inserts_++; + return Status::OK(); +} + +Status VolatileCacheTier::Lookup(const Slice& page_key, + std::unique_ptr* result, + size_t* size) { + CacheData key(std::move(page_key.ToString())); + CacheData* kv; + bool ok = index_.Find(&key, &kv); + if (ok) { + // set return data + result->reset(new char[kv->value.size()]); + memcpy(result->get(), kv->value.c_str(), kv->value.size()); + *size = kv->value.size(); + // drop the reference on cache data + kv->refs_--; + // update stats + stats_.cache_hits_++; + return Status::OK(); + } + + stats_.cache_misses_++; + + if (next_tier()) { + return next_tier()->Lookup(page_key, result, size); + } + + return Status::NotFound("key not found in volatile cache"); +} + +bool VolatileCacheTier::Erase(const Slice& /*key*/) { + assert(!"not supported"); + return true; +} + +bool VolatileCacheTier::Evict() { + CacheData* edata = index_.Evict(); + if (!edata) { + // not able to evict any object + return false; + } + + stats_.cache_evicts_++; + + // push the evicted object to the next level + if (next_tier()) { + // TODO: Should the insert error be ignored? 
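+    // Dropping the entry when the downstream insert fails only loses cached
+    // data; it cannot affect correctness, so the status is deliberately ignored.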
+ Status s = next_tier()->Insert(Slice(edata->key), edata->value.c_str(), + edata->value.size()); + s.PermitUncheckedError(); + } + + // adjust size and destroy data + size_ -= edata->value.size(); + delete edata; + + return true; +} + +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git a/src/rocksdb/utilities/persistent_cache/volatile_tier_impl.h b/src/rocksdb/utilities/persistent_cache/volatile_tier_impl.h new file mode 100644 index 000000000..09265e457 --- /dev/null +++ b/src/rocksdb/utilities/persistent_cache/volatile_tier_impl.h @@ -0,0 +1,141 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include +#include + +#include "rocksdb/cache.h" +#include "utilities/persistent_cache/hash_table.h" +#include "utilities/persistent_cache/hash_table_evictable.h" +#include "utilities/persistent_cache/persistent_cache_tier.h" + +// VolatileCacheTier +// +// This file provides persistent cache tier implementation for caching +// key/values in RAM. +// +// key/values +// | +// V +// +-------------------+ +// | VolatileCacheTier | Store in an evictable hash table +// +-------------------+ +// | +// V +// on eviction +// pushed to next tier +// +// The implementation is designed to be concurrent. The evictable hash table +// implementation is not concurrent at this point though. +// +// The eviction algorithm is LRU +namespace ROCKSDB_NAMESPACE { + +class VolatileCacheTier : public PersistentCacheTier { + public: + explicit VolatileCacheTier( + const bool is_compressed = true, + const size_t max_size = std::numeric_limits::max()) + : is_compressed_(is_compressed), max_size_(max_size) {} + + virtual ~VolatileCacheTier(); + + // insert to cache + Status Insert(const Slice& page_key, const char* data, + const size_t size) override; + // lookup key in cache + Status Lookup(const Slice& page_key, std::unique_ptr* data, + size_t* size) override; + + // is compressed cache ? + bool IsCompressed() override { return is_compressed_; } + + // erase key from cache + bool Erase(const Slice& key) override; + + std::string GetPrintableOptions() const override { + return "VolatileCacheTier"; + } + + // Expose stats as map + PersistentCache::StatsType Stats() override; + + private: + // + // Cache data abstraction + // + struct CacheData : LRUElement { + explicit CacheData(CacheData&& rhs) noexcept + : key(std::move(rhs.key)), value(std::move(rhs.value)) {} + + explicit CacheData(const std::string& _key, const std::string& _value = "") + : key(_key), value(_value) {} + + virtual ~CacheData() {} + + const std::string key; + const std::string value; + }; + + static void DeleteCacheData(CacheData* data); + + // + // Index and LRU definition + // + struct CacheDataHash { + uint64_t operator()(const CacheData* obj) const { + assert(obj); + return std::hash()(obj->key); + } + }; + + struct CacheDataEqual { + bool operator()(const CacheData* lhs, const CacheData* rhs) const { + assert(lhs); + assert(rhs); + return lhs->key == rhs->key; + } + }; + + struct Statistics { + std::atomic cache_misses_{0}; + std::atomic cache_hits_{0}; + std::atomic cache_inserts_{0}; + std::atomic cache_evicts_{0}; + + double CacheHitPct() const { + auto lookups = cache_hits_ + cache_misses_; + return lookups ? 
100 * cache_hits_ / static_cast(lookups) : 0.0; + } + + double CacheMissPct() const { + auto lookups = cache_hits_ + cache_misses_; + return lookups ? 100 * cache_misses_ / static_cast(lookups) : 0.0; + } + }; + + using IndexType = + EvictableHashTable; + + // Evict LRU tail + bool Evict(); + + const bool is_compressed_ = true; // does it store compressed data + IndexType index_; // in-memory cache + std::atomic max_size_{0}; // Maximum size of the cache + std::atomic size_{0}; // Size of the cache + Statistics stats_; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git a/src/rocksdb/utilities/simulator_cache/cache_simulator.cc b/src/rocksdb/utilities/simulator_cache/cache_simulator.cc new file mode 100644 index 000000000..dc419e51a --- /dev/null +++ b/src/rocksdb/utilities/simulator_cache/cache_simulator.cc @@ -0,0 +1,288 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "utilities/simulator_cache/cache_simulator.h" + +#include + +#include "db/dbformat.h" +#include "rocksdb/trace_record.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { +const std::string kGhostCachePrefix = "ghost_"; +} // namespace + +GhostCache::GhostCache(std::shared_ptr sim_cache) + : sim_cache_(sim_cache) {} + +bool GhostCache::Admit(const Slice& lookup_key) { + auto handle = sim_cache_->Lookup(lookup_key); + if (handle != nullptr) { + sim_cache_->Release(handle); + return true; + } + // TODO: Should we check for errors here? + auto s = sim_cache_->Insert(lookup_key, /*value=*/nullptr, lookup_key.size(), + /*deleter=*/nullptr); + s.PermitUncheckedError(); + return false; +} + +CacheSimulator::CacheSimulator(std::unique_ptr&& ghost_cache, + std::shared_ptr sim_cache) + : ghost_cache_(std::move(ghost_cache)), sim_cache_(sim_cache) {} + +void CacheSimulator::Access(const BlockCacheTraceRecord& access) { + bool admit = true; + const bool is_user_access = + BlockCacheTraceHelper::IsUserAccess(access.caller); + bool is_cache_miss = true; + if (ghost_cache_ && !access.no_insert) { + admit = ghost_cache_->Admit(access.block_key); + } + auto handle = sim_cache_->Lookup(access.block_key); + if (handle != nullptr) { + sim_cache_->Release(handle); + is_cache_miss = false; + } else { + if (!access.no_insert && admit && access.block_size > 0) { + // Ignore errors on insert + auto s = sim_cache_->Insert(access.block_key, /*value=*/nullptr, + access.block_size, + /*deleter=*/nullptr); + s.PermitUncheckedError(); + } + } + miss_ratio_stats_.UpdateMetrics(access.access_timestamp, is_user_access, + is_cache_miss); +} + +void MissRatioStats::UpdateMetrics(uint64_t timestamp_in_ms, + bool is_user_access, bool is_cache_miss) { + uint64_t timestamp_in_seconds = timestamp_in_ms / kMicrosInSecond; + num_accesses_timeline_[timestamp_in_seconds] += 1; + num_accesses_ += 1; + if (num_misses_timeline_.find(timestamp_in_seconds) == + num_misses_timeline_.end()) { + num_misses_timeline_[timestamp_in_seconds] = 0; + } + if (is_cache_miss) { + num_misses_ += 1; + num_misses_timeline_[timestamp_in_seconds] += 1; + } + if (is_user_access) { + user_accesses_ += 1; + if (is_cache_miss) { + user_misses_ += 1; + } + } +} + +Cache::Priority PrioritizedCacheSimulator::ComputeBlockPriority( + const BlockCacheTraceRecord& access) const { + if (access.block_type == TraceType::kBlockTraceFilterBlock || + access.block_type == 
TraceType::kBlockTraceIndexBlock || + access.block_type == TraceType::kBlockTraceUncompressionDictBlock) { + return Cache::Priority::HIGH; + } + return Cache::Priority::LOW; +} + +void PrioritizedCacheSimulator::AccessKVPair( + const Slice& key, uint64_t value_size, Cache::Priority priority, + const BlockCacheTraceRecord& access, bool no_insert, bool is_user_access, + bool* is_cache_miss, bool* admitted, bool update_metrics) { + assert(is_cache_miss); + assert(admitted); + *is_cache_miss = true; + *admitted = true; + if (ghost_cache_ && !no_insert) { + *admitted = ghost_cache_->Admit(key); + } + auto handle = sim_cache_->Lookup(key); + if (handle != nullptr) { + sim_cache_->Release(handle); + *is_cache_miss = false; + } else if (!no_insert && *admitted && value_size > 0) { + // TODO: Should we check for an error here? + auto s = sim_cache_->Insert(key, /*value=*/nullptr, value_size, + /*deleter=*/nullptr, + /*handle=*/nullptr, priority); + s.PermitUncheckedError(); + } + if (update_metrics) { + miss_ratio_stats_.UpdateMetrics(access.access_timestamp, is_user_access, + *is_cache_miss); + } +} + +void PrioritizedCacheSimulator::Access(const BlockCacheTraceRecord& access) { + bool is_cache_miss = true; + bool admitted = true; + AccessKVPair(access.block_key, access.block_size, + ComputeBlockPriority(access), access, access.no_insert, + BlockCacheTraceHelper::IsUserAccess(access.caller), + &is_cache_miss, &admitted, /*update_metrics=*/true); +} + +void HybridRowBlockCacheSimulator::Access(const BlockCacheTraceRecord& access) { + // TODO (haoyu): We only support Get for now. We need to extend the tracing + // for MultiGet, i.e., non-data block accesses must log all keys in a + // MultiGet. + bool is_cache_miss = true; + bool admitted = false; + if (access.caller == TableReaderCaller::kUserGet && + access.get_id != BlockCacheTraceHelper::kReservedGetId) { + // This is a Get request. + const std::string& row_key = BlockCacheTraceHelper::ComputeRowKey(access); + GetRequestStatus& status = getid_status_map_[access.get_id]; + if (status.is_complete) { + // This Get request completes. + // Skip future accesses to its index/filter/data + // blocks. These block lookups are unnecessary if we observe a hit for the + // referenced key-value pair already. Thus, we treat these lookups as + // hits. This is also to ensure the total number of accesses are the same + // when comparing to other policies. + miss_ratio_stats_.UpdateMetrics(access.access_timestamp, + /*is_user_access=*/true, + /*is_cache_miss=*/false); + return; + } + if (status.row_key_status.find(row_key) == status.row_key_status.end()) { + // This is the first time that this key is accessed. Look up the key-value + // pair first. Do not update the miss/accesses metrics here since it will + // be updated later. + AccessKVPair(row_key, access.referenced_data_size, Cache::Priority::HIGH, + access, + /*no_insert=*/false, + /*is_user_access=*/true, &is_cache_miss, &admitted, + /*update_metrics=*/false); + InsertResult result = InsertResult::NO_INSERT; + if (admitted && access.referenced_data_size > 0) { + result = InsertResult::INSERTED; + } else if (admitted) { + result = InsertResult::ADMITTED; + } + status.row_key_status[row_key] = result; + } + if (!is_cache_miss) { + // A cache hit. + status.is_complete = true; + miss_ratio_stats_.UpdateMetrics(access.access_timestamp, + /*is_user_access=*/true, + /*is_cache_miss=*/false); + return; + } + // The row key-value pair observes a cache miss. We need to access its + // index/filter/data blocks. 
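+    // If the earlier row lookup left the key in the ADMITTED state (admitted
+    // but with an unknown size), it is inserted below once a non-zero
+    // referenced_data_size is observed.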
+ InsertResult inserted = status.row_key_status[row_key]; + AccessKVPair( + access.block_key, access.block_size, ComputeBlockPriority(access), + access, + /*no_insert=*/!insert_blocks_upon_row_kvpair_miss_ || access.no_insert, + /*is_user_access=*/true, &is_cache_miss, &admitted, + /*update_metrics=*/true); + if (access.referenced_data_size > 0 && inserted == InsertResult::ADMITTED) { + // TODO: Should we check for an error here? + auto s = sim_cache_->Insert(row_key, /*value=*/nullptr, + access.referenced_data_size, + /*deleter=*/nullptr, + /*handle=*/nullptr, Cache::Priority::HIGH); + s.PermitUncheckedError(); + status.row_key_status[row_key] = InsertResult::INSERTED; + } + return; + } + AccessKVPair(access.block_key, access.block_size, + ComputeBlockPriority(access), access, access.no_insert, + BlockCacheTraceHelper::IsUserAccess(access.caller), + &is_cache_miss, &admitted, /*update_metrics=*/true); +} + +BlockCacheTraceSimulator::BlockCacheTraceSimulator( + uint64_t warmup_seconds, uint32_t downsample_ratio, + const std::vector& cache_configurations) + : warmup_seconds_(warmup_seconds), + downsample_ratio_(downsample_ratio), + cache_configurations_(cache_configurations) {} + +Status BlockCacheTraceSimulator::InitializeCaches() { + for (auto const& config : cache_configurations_) { + for (auto cache_capacity : config.cache_capacities) { + // Scale down the cache capacity since the trace contains accesses on + // 1/'downsample_ratio' blocks. + uint64_t simulate_cache_capacity = cache_capacity / downsample_ratio_; + std::shared_ptr sim_cache; + std::unique_ptr ghost_cache; + std::string cache_name = config.cache_name; + if (cache_name.find(kGhostCachePrefix) != std::string::npos) { + ghost_cache.reset(new GhostCache( + NewLRUCache(config.ghost_cache_capacity, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0))); + cache_name = cache_name.substr(kGhostCachePrefix.size()); + } + if (cache_name == "lru") { + sim_cache = std::make_shared( + std::move(ghost_cache), + NewLRUCache(simulate_cache_capacity, config.num_shard_bits, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0)); + } else if (cache_name == "lru_priority") { + sim_cache = std::make_shared( + std::move(ghost_cache), + NewLRUCache(simulate_cache_capacity, config.num_shard_bits, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0.5)); + } else if (cache_name == "lru_hybrid") { + sim_cache = std::make_shared( + std::move(ghost_cache), + NewLRUCache(simulate_cache_capacity, config.num_shard_bits, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0.5), + /*insert_blocks_upon_row_kvpair_miss=*/true); + } else if (cache_name == "lru_hybrid_no_insert_on_row_miss") { + sim_cache = std::make_shared( + std::move(ghost_cache), + NewLRUCache(simulate_cache_capacity, config.num_shard_bits, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0.5), + /*insert_blocks_upon_row_kvpair_miss=*/false); + } else { + // Not supported. + return Status::InvalidArgument("Unknown cache name " + + config.cache_name); + } + sim_caches_[config].push_back(sim_cache); + } + } + return Status::OK(); +} + +void BlockCacheTraceSimulator::Access(const BlockCacheTraceRecord& access) { + if (trace_start_time_ == 0) { + trace_start_time_ = access.access_timestamp; + } + // access.access_timestamp is in microseconds. 
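+  // warmup_seconds_ is therefore scaled by kMicrosInSecond before comparing;
+  // once the warmup window has passed, every simulated cache's counters are
+  // reset exactly once.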
+ if (!warmup_complete_ && + trace_start_time_ + warmup_seconds_ * kMicrosInSecond <= + access.access_timestamp) { + for (auto& config_caches : sim_caches_) { + for (auto& sim_cache : config_caches.second) { + sim_cache->reset_counter(); + } + } + warmup_complete_ = true; + } + for (auto& config_caches : sim_caches_) { + for (auto& sim_cache : config_caches.second) { + sim_cache->Access(access); + } + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/simulator_cache/cache_simulator.h b/src/rocksdb/utilities/simulator_cache/cache_simulator.h new file mode 100644 index 000000000..6d4979013 --- /dev/null +++ b/src/rocksdb/utilities/simulator_cache/cache_simulator.h @@ -0,0 +1,231 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "cache/lru_cache.h" +#include "trace_replay/block_cache_tracer.h" + +namespace ROCKSDB_NAMESPACE { + +// A cache configuration provided by user. +struct CacheConfiguration { + std::string cache_name; // LRU. + uint32_t num_shard_bits; + uint64_t ghost_cache_capacity; // ghost cache capacity in bytes. + std::vector + cache_capacities; // simulate cache capacities in bytes. + + bool operator==(const CacheConfiguration& o) const { + return cache_name == o.cache_name && num_shard_bits == o.num_shard_bits && + ghost_cache_capacity == o.ghost_cache_capacity; + } + bool operator<(const CacheConfiguration& o) const { + return cache_name < o.cache_name || + (cache_name == o.cache_name && num_shard_bits < o.num_shard_bits) || + (cache_name == o.cache_name && num_shard_bits == o.num_shard_bits && + ghost_cache_capacity < o.ghost_cache_capacity); + } +}; + +class MissRatioStats { + public: + void reset_counter() { + num_misses_ = 0; + num_accesses_ = 0; + user_accesses_ = 0; + user_misses_ = 0; + } + double miss_ratio() const { + if (num_accesses_ == 0) { + return -1; + } + return static_cast(num_misses_ * 100.0 / num_accesses_); + } + uint64_t total_accesses() const { return num_accesses_; } + uint64_t total_misses() const { return num_misses_; } + + const std::map& num_accesses_timeline() const { + return num_accesses_timeline_; + } + + const std::map& num_misses_timeline() const { + return num_misses_timeline_; + } + + double user_miss_ratio() const { + if (user_accesses_ == 0) { + return -1; + } + return static_cast(user_misses_ * 100.0 / user_accesses_); + } + uint64_t user_accesses() const { return user_accesses_; } + uint64_t user_misses() const { return user_misses_; } + + void UpdateMetrics(uint64_t timestamp_in_ms, bool is_user_access, + bool is_cache_miss); + + private: + uint64_t num_accesses_ = 0; + uint64_t num_misses_ = 0; + uint64_t user_accesses_ = 0; + uint64_t user_misses_ = 0; + + std::map num_accesses_timeline_; + std::map num_misses_timeline_; +}; + +// A ghost cache admits an entry on its second access. +class GhostCache { + public: + explicit GhostCache(std::shared_ptr sim_cache); + ~GhostCache() = default; + // No copy and move. + GhostCache(const GhostCache&) = delete; + GhostCache& operator=(const GhostCache&) = delete; + GhostCache(GhostCache&&) = delete; + GhostCache& operator=(GhostCache&&) = delete; + + // Returns true if the lookup_key is in the ghost cache. + // Returns false otherwise. 
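+  // A miss also records lookup_key in the ghost cache, so the next Admit() for
+  // the same (still resident) key returns true; this implements the
+  // admit-on-second-access policy.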
+ bool Admit(const Slice& lookup_key); + + private: + std::shared_ptr sim_cache_; +}; + +// A cache simulator that runs against a block cache trace. +class CacheSimulator { + public: + CacheSimulator(std::unique_ptr&& ghost_cache, + std::shared_ptr sim_cache); + virtual ~CacheSimulator() = default; + // No copy and move. + CacheSimulator(const CacheSimulator&) = delete; + CacheSimulator& operator=(const CacheSimulator&) = delete; + CacheSimulator(CacheSimulator&&) = delete; + CacheSimulator& operator=(CacheSimulator&&) = delete; + + virtual void Access(const BlockCacheTraceRecord& access); + + void reset_counter() { miss_ratio_stats_.reset_counter(); } + + const MissRatioStats& miss_ratio_stats() const { return miss_ratio_stats_; } + + protected: + MissRatioStats miss_ratio_stats_; + std::unique_ptr ghost_cache_; + std::shared_ptr sim_cache_; +}; + +// A prioritized cache simulator that runs against a block cache trace. +// It inserts missing index/filter/uncompression-dictionary blocks with high +// priority in the cache. +class PrioritizedCacheSimulator : public CacheSimulator { + public: + PrioritizedCacheSimulator(std::unique_ptr&& ghost_cache, + std::shared_ptr sim_cache) + : CacheSimulator(std::move(ghost_cache), sim_cache) {} + void Access(const BlockCacheTraceRecord& access) override; + + protected: + // Access the key-value pair and returns true upon a cache miss. + void AccessKVPair(const Slice& key, uint64_t value_size, + Cache::Priority priority, + const BlockCacheTraceRecord& access, bool no_insert, + bool is_user_access, bool* is_cache_miss, bool* admitted, + bool update_metrics); + + Cache::Priority ComputeBlockPriority( + const BlockCacheTraceRecord& access) const; +}; + +// A hybrid row and block cache simulator. It looks up/inserts key-value pairs +// referenced by Get/MultiGet requests, and not their accessed index/filter/data +// blocks. +// +// Upon a Get/MultiGet request, it looks up the referenced key first. +// If it observes a cache hit, future block accesses on this key-value pair is +// skipped since the request is served already. Otherwise, it continues to look +// up/insert its index/filter/data blocks. It also inserts the referenced +// key-value pair in the cache for future lookups. +class HybridRowBlockCacheSimulator : public PrioritizedCacheSimulator { + public: + HybridRowBlockCacheSimulator(std::unique_ptr&& ghost_cache, + std::shared_ptr sim_cache, + bool insert_blocks_upon_row_kvpair_miss) + : PrioritizedCacheSimulator(std::move(ghost_cache), sim_cache), + insert_blocks_upon_row_kvpair_miss_( + insert_blocks_upon_row_kvpair_miss) {} + void Access(const BlockCacheTraceRecord& access) override; + + private: + enum InsertResult : char { + INSERTED, + ADMITTED, + NO_INSERT, + }; + + // We set is_complete to true when the referenced row-key of a get request + // hits the cache. If is_complete is true, we treat future accesses of this + // get request as hits. + // + // For each row key, it stores an enum. It is INSERTED when the + // kv-pair has been inserted into the cache, ADMITTED if it should be inserted + // but haven't been, NO_INSERT if it should not be inserted. + // + // A kv-pair is in ADMITTED state when we encounter this kv-pair but do not + // know its size. This may happen if the first access on the referenced key is + // an index/filter block. + struct GetRequestStatus { + bool is_complete = false; + std::map row_key_status; + }; + + // A map stores get_id to a map of row keys. 
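+  // Keyed by get_id; each entry records whether that Get has completed and the
+  // InsertResult of every row key it has touched.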
+ std::map getid_status_map_; + bool insert_blocks_upon_row_kvpair_miss_; +}; + +// A block cache simulator that reports miss ratio curves given a set of cache +// configurations. +class BlockCacheTraceSimulator { + public: + // warmup_seconds: The number of seconds to warmup simulated caches. The + // hit/miss counters are reset after the warmup completes. + BlockCacheTraceSimulator( + uint64_t warmup_seconds, uint32_t downsample_ratio, + const std::vector& cache_configurations); + ~BlockCacheTraceSimulator() = default; + // No copy and move. + BlockCacheTraceSimulator(const BlockCacheTraceSimulator&) = delete; + BlockCacheTraceSimulator& operator=(const BlockCacheTraceSimulator&) = delete; + BlockCacheTraceSimulator(BlockCacheTraceSimulator&&) = delete; + BlockCacheTraceSimulator& operator=(BlockCacheTraceSimulator&&) = delete; + + Status InitializeCaches(); + + void Access(const BlockCacheTraceRecord& access); + + const std::map>>& + sim_caches() const { + return sim_caches_; + } + + private: + const uint64_t warmup_seconds_; + const uint32_t downsample_ratio_; + const std::vector cache_configurations_; + + bool warmup_complete_ = false; + std::map>> + sim_caches_; + uint64_t trace_start_time_ = 0; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/simulator_cache/cache_simulator_test.cc b/src/rocksdb/utilities/simulator_cache/cache_simulator_test.cc new file mode 100644 index 000000000..2bc057c92 --- /dev/null +++ b/src/rocksdb/utilities/simulator_cache/cache_simulator_test.cc @@ -0,0 +1,497 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "utilities/simulator_cache/cache_simulator.h" + +#include + +#include "rocksdb/env.h" +#include "rocksdb/trace_record.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +const std::string kBlockKeyPrefix = "test-block-"; +const std::string kRefKeyPrefix = "test-get-"; +const std::string kRefKeySequenceNumber = std::string(8, 'c'); +const uint64_t kGetId = 1; +const uint64_t kGetBlockId = 100; +const uint64_t kCompactionBlockId = 1000; +const uint64_t kCacheSize = 1024 * 1024 * 1024; +const uint64_t kGhostCacheSize = 1024 * 1024; +} // namespace + +class CacheSimulatorTest : public testing::Test { + public: + const size_t kNumBlocks = 5; + const size_t kValueSize = 1000; + + CacheSimulatorTest() { env_ = ROCKSDB_NAMESPACE::Env::Default(); } + + BlockCacheTraceRecord GenerateGetRecord(uint64_t getid) { + BlockCacheTraceRecord record; + record.block_type = TraceType::kBlockTraceDataBlock; + record.block_size = 4096; + record.block_key = kBlockKeyPrefix + std::to_string(kGetBlockId); + record.access_timestamp = env_->NowMicros(); + record.cf_id = 0; + record.cf_name = "test"; + record.caller = TableReaderCaller::kUserGet; + record.level = 6; + record.sst_fd_number = 0; + record.get_id = getid; + record.is_cache_hit = false; + record.no_insert = false; + record.referenced_key = + kRefKeyPrefix + std::to_string(kGetId) + kRefKeySequenceNumber; + record.referenced_key_exist_in_block = true; + record.referenced_data_size = 100; + record.num_keys_in_block = 300; + return record; + } + + BlockCacheTraceRecord GenerateCompactionRecord() { + BlockCacheTraceRecord record; + record.block_type = TraceType::kBlockTraceDataBlock; + record.block_size = 4096; + record.block_key = kBlockKeyPrefix + std::to_string(kCompactionBlockId); + record.access_timestamp = env_->NowMicros(); + record.cf_id = 0; + record.cf_name = "test"; + record.caller = TableReaderCaller::kCompaction; + record.level = 6; + record.sst_fd_number = kCompactionBlockId; + record.is_cache_hit = false; + record.no_insert = true; + return record; + } + + void AssertCache(std::shared_ptr sim_cache, + const MissRatioStats& miss_ratio_stats, + uint64_t expected_usage, uint64_t expected_num_accesses, + uint64_t expected_num_misses, + std::vector blocks, + std::vector keys) { + EXPECT_EQ(expected_usage, sim_cache->GetUsage()); + EXPECT_EQ(expected_num_accesses, miss_ratio_stats.total_accesses()); + EXPECT_EQ(expected_num_misses, miss_ratio_stats.total_misses()); + for (auto const& block : blocks) { + auto handle = sim_cache->Lookup(block); + EXPECT_NE(nullptr, handle); + sim_cache->Release(handle); + } + for (auto const& key : keys) { + std::string row_key = kRefKeyPrefix + key + kRefKeySequenceNumber; + auto handle = + sim_cache->Lookup("0_" + ExtractUserKey(row_key).ToString()); + EXPECT_NE(nullptr, handle); + sim_cache->Release(handle); + } + } + + Env* env_; +}; + +TEST_F(CacheSimulatorTest, GhostCache) { + const std::string key1 = "test1"; + const std::string key2 = "test2"; + std::unique_ptr ghost_cache(new GhostCache( + NewLRUCache(/*capacity=*/kGhostCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0))); + EXPECT_FALSE(ghost_cache->Admit(key1)); + EXPECT_TRUE(ghost_cache->Admit(key1)); + EXPECT_TRUE(ghost_cache->Admit(key1)); + EXPECT_FALSE(ghost_cache->Admit(key2)); + EXPECT_TRUE(ghost_cache->Admit(key2)); +} + +TEST_F(CacheSimulatorTest, CacheSimulator) { + const BlockCacheTraceRecord& access = 
GenerateGetRecord(kGetId); + const BlockCacheTraceRecord& compaction_access = GenerateCompactionRecord(); + std::shared_ptr sim_cache = + NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0); + std::unique_ptr cache_simulator( + new CacheSimulator(nullptr, sim_cache)); + cache_simulator->Access(access); + cache_simulator->Access(access); + ASSERT_EQ(2, cache_simulator->miss_ratio_stats().total_accesses()); + ASSERT_EQ(50, cache_simulator->miss_ratio_stats().miss_ratio()); + ASSERT_EQ(2, cache_simulator->miss_ratio_stats().user_accesses()); + ASSERT_EQ(50, cache_simulator->miss_ratio_stats().user_miss_ratio()); + + cache_simulator->Access(compaction_access); + cache_simulator->Access(compaction_access); + ASSERT_EQ(4, cache_simulator->miss_ratio_stats().total_accesses()); + ASSERT_EQ(75, cache_simulator->miss_ratio_stats().miss_ratio()); + ASSERT_EQ(2, cache_simulator->miss_ratio_stats().user_accesses()); + ASSERT_EQ(50, cache_simulator->miss_ratio_stats().user_miss_ratio()); + + cache_simulator->reset_counter(); + ASSERT_EQ(0, cache_simulator->miss_ratio_stats().total_accesses()); + ASSERT_EQ(-1, cache_simulator->miss_ratio_stats().miss_ratio()); + auto handle = sim_cache->Lookup(access.block_key); + ASSERT_NE(nullptr, handle); + sim_cache->Release(handle); + handle = sim_cache->Lookup(compaction_access.block_key); + ASSERT_EQ(nullptr, handle); +} + +TEST_F(CacheSimulatorTest, GhostCacheSimulator) { + const BlockCacheTraceRecord& access = GenerateGetRecord(kGetId); + std::unique_ptr ghost_cache(new GhostCache( + NewLRUCache(/*capacity=*/kGhostCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0))); + std::unique_ptr cache_simulator(new CacheSimulator( + std::move(ghost_cache), + NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0))); + cache_simulator->Access(access); + cache_simulator->Access(access); + ASSERT_EQ(2, cache_simulator->miss_ratio_stats().total_accesses()); + // Both of them will be miss since we have a ghost cache. 
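+  // (the ghost cache only admits a key on its second access, so the first
+  // access never populates the simulated cache)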
+ ASSERT_EQ(100, cache_simulator->miss_ratio_stats().miss_ratio()); +} + +TEST_F(CacheSimulatorTest, PrioritizedCacheSimulator) { + const BlockCacheTraceRecord& access = GenerateGetRecord(kGetId); + std::shared_ptr sim_cache = + NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0); + std::unique_ptr cache_simulator( + new PrioritizedCacheSimulator(nullptr, sim_cache)); + cache_simulator->Access(access); + cache_simulator->Access(access); + ASSERT_EQ(2, cache_simulator->miss_ratio_stats().total_accesses()); + ASSERT_EQ(50, cache_simulator->miss_ratio_stats().miss_ratio()); + + auto handle = sim_cache->Lookup(access.block_key); + ASSERT_NE(nullptr, handle); + sim_cache->Release(handle); +} + +TEST_F(CacheSimulatorTest, GhostPrioritizedCacheSimulator) { + const BlockCacheTraceRecord& access = GenerateGetRecord(kGetId); + std::unique_ptr ghost_cache(new GhostCache( + NewLRUCache(/*capacity=*/kGhostCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0))); + std::unique_ptr cache_simulator( + new PrioritizedCacheSimulator( + std::move(ghost_cache), + NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0))); + cache_simulator->Access(access); + cache_simulator->Access(access); + ASSERT_EQ(2, cache_simulator->miss_ratio_stats().total_accesses()); + // Both of them will be miss since we have a ghost cache. + ASSERT_EQ(100, cache_simulator->miss_ratio_stats().miss_ratio()); +} + +TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulator) { + uint64_t block_id = 100; + BlockCacheTraceRecord first_get = GenerateGetRecord(kGetId); + first_get.get_from_user_specified_snapshot = true; + BlockCacheTraceRecord second_get = GenerateGetRecord(kGetId + 1); + second_get.referenced_data_size = 0; + second_get.referenced_key_exist_in_block = false; + second_get.get_from_user_specified_snapshot = true; + BlockCacheTraceRecord third_get = GenerateGetRecord(kGetId + 2); + third_get.referenced_data_size = 0; + third_get.referenced_key_exist_in_block = false; + third_get.referenced_key = kRefKeyPrefix + "third_get"; + // We didn't find the referenced key in the third get. + third_get.referenced_key_exist_in_block = false; + third_get.referenced_data_size = 0; + std::shared_ptr sim_cache = + NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0); + std::unique_ptr cache_simulator( + new HybridRowBlockCacheSimulator( + nullptr, sim_cache, /*insert_blocks_row_kvpair_misses=*/true)); + // The first get request accesses 10 blocks. We should only report 10 accesses + // and 100% miss. 
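+  // Each iteration uses a fresh block key, so every block lookup misses; the
+  // referenced row key-value pair is looked up and inserted only on the first
+  // iteration.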
+ for (uint32_t i = 0; i < 10; i++) { + first_get.block_key = kBlockKeyPrefix + std::to_string(block_id); + cache_simulator->Access(first_get); + block_id++; + } + + ASSERT_EQ(10, cache_simulator->miss_ratio_stats().total_accesses()); + ASSERT_EQ(100, cache_simulator->miss_ratio_stats().miss_ratio()); + ASSERT_EQ(10, cache_simulator->miss_ratio_stats().user_accesses()); + ASSERT_EQ(100, cache_simulator->miss_ratio_stats().user_miss_ratio()); + auto handle = + sim_cache->Lookup(std::to_string(first_get.sst_fd_number) + "_" + + ExtractUserKey(first_get.referenced_key).ToString()); + ASSERT_NE(nullptr, handle); + sim_cache->Release(handle); + for (uint32_t i = 100; i < block_id; i++) { + handle = sim_cache->Lookup(kBlockKeyPrefix + std::to_string(i)); + ASSERT_NE(nullptr, handle); + sim_cache->Release(handle); + } + + // The second get request accesses the same key. We should report 15 + // access and 66% miss, 10 misses with 15 accesses. + // We do not consider these 5 block lookups as misses since the row hits the + // cache. + for (uint32_t i = 0; i < 5; i++) { + second_get.block_key = kBlockKeyPrefix + std::to_string(block_id); + cache_simulator->Access(second_get); + block_id++; + } + ASSERT_EQ(15, cache_simulator->miss_ratio_stats().total_accesses()); + ASSERT_EQ(66, static_cast( + cache_simulator->miss_ratio_stats().miss_ratio())); + ASSERT_EQ(15, cache_simulator->miss_ratio_stats().user_accesses()); + ASSERT_EQ(66, static_cast( + cache_simulator->miss_ratio_stats().user_miss_ratio())); + handle = + sim_cache->Lookup(std::to_string(second_get.sst_fd_number) + "_" + + ExtractUserKey(second_get.referenced_key).ToString()); + ASSERT_NE(nullptr, handle); + sim_cache->Release(handle); + for (uint32_t i = 100; i < block_id; i++) { + handle = sim_cache->Lookup(kBlockKeyPrefix + std::to_string(i)); + if (i < 110) { + ASSERT_NE(nullptr, handle) << i; + sim_cache->Release(handle); + } else { + ASSERT_EQ(nullptr, handle) << i; + } + } + + // The third get on a different key and does not have a size. + // This key should not be inserted into the cache. + for (uint32_t i = 0; i < 5; i++) { + third_get.block_key = kBlockKeyPrefix + std::to_string(block_id); + cache_simulator->Access(third_get); + block_id++; + } + ASSERT_EQ(20, cache_simulator->miss_ratio_stats().total_accesses()); + ASSERT_EQ(75, static_cast( + cache_simulator->miss_ratio_stats().miss_ratio())); + ASSERT_EQ(20, cache_simulator->miss_ratio_stats().user_accesses()); + ASSERT_EQ(75, static_cast( + cache_simulator->miss_ratio_stats().user_miss_ratio())); + // Assert that the third key is not inserted into the cache. 
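+  // (third_get always reports referenced_data_size == 0, so its row key stays
+  // admitted-but-unsized and is never actually inserted)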
+ handle = sim_cache->Lookup(std::to_string(third_get.sst_fd_number) + "_" + + third_get.referenced_key); + ASSERT_EQ(nullptr, handle); + for (uint32_t i = 100; i < block_id; i++) { + if (i < 110 || i >= 115) { + handle = sim_cache->Lookup(kBlockKeyPrefix + std::to_string(i)); + ASSERT_NE(nullptr, handle) << i; + sim_cache->Release(handle); + } else { + handle = sim_cache->Lookup(kBlockKeyPrefix + std::to_string(i)); + ASSERT_EQ(nullptr, handle) << i; + } + } +} + +TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulatorGetTest) { + BlockCacheTraceRecord get = GenerateGetRecord(kGetId); + get.block_size = 1; + get.referenced_data_size = 0; + get.access_timestamp = 0; + get.block_key = "1"; + get.get_id = 1; + get.get_from_user_specified_snapshot = false; + get.referenced_key = + kRefKeyPrefix + std::to_string(1) + kRefKeySequenceNumber; + get.no_insert = false; + get.sst_fd_number = 0; + get.get_from_user_specified_snapshot = false; + + LRUCacheOptions co; + co.capacity = 16; + co.num_shard_bits = 1; + co.strict_capacity_limit = false; + co.high_pri_pool_ratio = 0; + co.metadata_charge_policy = kDontChargeCacheMetadata; + std::shared_ptr sim_cache = NewLRUCache(co); + std::unique_ptr cache_simulator( + new HybridRowBlockCacheSimulator( + nullptr, sim_cache, /*insert_blocks_row_kvpair_misses=*/true)); + // Expect a miss and does not insert the row key-value pair since it does not + // have size. + cache_simulator->Access(get); + AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 1, 1, 1, {"1"}, + {}); + get.access_timestamp += 1; + get.referenced_data_size = 1; + get.block_key = "2"; + cache_simulator->Access(get); + AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 3, 2, 2, + {"1", "2"}, {"1"}); + get.access_timestamp += 1; + get.block_key = "3"; + // K1 should not inserted again. + cache_simulator->Access(get); + AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 4, 3, 3, + {"1", "2", "3"}, {"1"}); + + // A second get request referencing the same key. + get.access_timestamp += 1; + get.get_id = 2; + get.block_key = "4"; + get.referenced_data_size = 0; + cache_simulator->Access(get); + AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 4, 4, 3, + {"1", "2", "3"}, {"1"}); + + // A third get request searches three files, three different keys. + // And the second key observes a hit. + get.access_timestamp += 1; + get.referenced_data_size = 1; + get.get_id = 3; + get.block_key = "3"; + get.referenced_key = kRefKeyPrefix + "2" + kRefKeySequenceNumber; + // K2 should observe a miss. Block 3 observes a hit. + cache_simulator->Access(get); + AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 5, 5, 3, + {"1", "2", "3"}, {"1", "2"}); + + get.access_timestamp += 1; + get.referenced_data_size = 1; + get.get_id = 3; + get.block_key = "4"; + get.referenced_data_size = 1; + get.referenced_key = kRefKeyPrefix + "1" + kRefKeySequenceNumber; + // K1 should observe a hit. + cache_simulator->Access(get); + AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 5, 6, 3, + {"1", "2", "3"}, {"1", "2"}); + + get.access_timestamp += 1; + get.referenced_data_size = 1; + get.get_id = 3; + get.block_key = "4"; + get.referenced_data_size = 1; + get.referenced_key = kRefKeyPrefix + "3" + kRefKeySequenceNumber; + // K3 should observe a miss. + // However, as the get already complete, we should not access k3 any more. 
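+  // The access below is therefore counted as a hit, and neither block "4" nor
+  // K3 is looked up or inserted.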
+ cache_simulator->Access(get); + AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 5, 7, 3, + {"1", "2", "3"}, {"1", "2"}); + + // A fourth get request searches one file and two blocks. One row key. + get.access_timestamp += 1; + get.get_id = 4; + get.block_key = "5"; + get.referenced_key = kRefKeyPrefix + "4" + kRefKeySequenceNumber; + get.referenced_data_size = 1; + cache_simulator->Access(get); + AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 7, 8, 4, + {"1", "2", "3", "5"}, {"1", "2", "4"}); + for (auto const& key : {"1", "2", "4"}) { + auto handle = sim_cache->Lookup("0_" + kRefKeyPrefix + key); + ASSERT_NE(nullptr, handle); + sim_cache->Release(handle); + } + + // A bunch of insertions which evict cached row keys. + for (uint32_t i = 6; i < 100; i++) { + get.access_timestamp += 1; + get.get_id = 0; + get.block_key = std::to_string(i); + cache_simulator->Access(get); + } + + get.get_id = 4; + // A different block. + get.block_key = "100"; + // Same row key and should not be inserted again. + get.referenced_key = kRefKeyPrefix + "4" + kRefKeySequenceNumber; + get.referenced_data_size = 1; + cache_simulator->Access(get); + AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 16, 103, 99, {}, + {}); + for (auto const& key : {"1", "2", "4"}) { + auto handle = sim_cache->Lookup("0_" + kRefKeyPrefix + key); + ASSERT_EQ(nullptr, handle); + } +} + +TEST_F(CacheSimulatorTest, HybridRowBlockNoInsertCacheSimulator) { + uint64_t block_id = 100; + BlockCacheTraceRecord first_get = GenerateGetRecord(kGetId); + std::shared_ptr sim_cache = + NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0); + std::unique_ptr cache_simulator( + new HybridRowBlockCacheSimulator( + nullptr, sim_cache, /*insert_blocks_row_kvpair_misses=*/false)); + for (uint32_t i = 0; i < 9; i++) { + first_get.block_key = kBlockKeyPrefix + std::to_string(block_id); + cache_simulator->Access(first_get); + block_id++; + } + auto handle = + sim_cache->Lookup(std::to_string(first_get.sst_fd_number) + "_" + + ExtractUserKey(first_get.referenced_key).ToString()); + ASSERT_NE(nullptr, handle); + sim_cache->Release(handle); + // All blocks are missing from the cache since insert_blocks_row_kvpair_misses + // is set to false. + for (uint32_t i = 100; i < block_id; i++) { + handle = sim_cache->Lookup(kBlockKeyPrefix + std::to_string(i)); + ASSERT_EQ(nullptr, handle); + } +} + +TEST_F(CacheSimulatorTest, GhostHybridRowBlockCacheSimulator) { + std::unique_ptr ghost_cache(new GhostCache( + NewLRUCache(/*capacity=*/kGhostCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0))); + const BlockCacheTraceRecord& first_get = GenerateGetRecord(kGetId); + const BlockCacheTraceRecord& second_get = GenerateGetRecord(kGetId + 1); + const BlockCacheTraceRecord& third_get = GenerateGetRecord(kGetId + 2); + std::unique_ptr cache_simulator( + new HybridRowBlockCacheSimulator( + std::move(ghost_cache), + NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0), + /*insert_blocks_row_kvpair_misses=*/false)); + // Two get requests access the same key. 
+ cache_simulator->Access(first_get); + cache_simulator->Access(second_get); + ASSERT_EQ(2, cache_simulator->miss_ratio_stats().total_accesses()); + ASSERT_EQ(100, cache_simulator->miss_ratio_stats().miss_ratio()); + ASSERT_EQ(2, cache_simulator->miss_ratio_stats().user_accesses()); + ASSERT_EQ(100, cache_simulator->miss_ratio_stats().user_miss_ratio()); + // We insert the key-value pair upon the second get request. A third get + // request should observe a hit. + for (uint32_t i = 0; i < 10; i++) { + cache_simulator->Access(third_get); + } + ASSERT_EQ(12, cache_simulator->miss_ratio_stats().total_accesses()); + ASSERT_EQ(16, static_cast( + cache_simulator->miss_ratio_stats().miss_ratio())); + ASSERT_EQ(12, cache_simulator->miss_ratio_stats().user_accesses()); + ASSERT_EQ(16, static_cast( + cache_simulator->miss_ratio_stats().user_miss_ratio())); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/utilities/simulator_cache/sim_cache.cc b/src/rocksdb/utilities/simulator_cache/sim_cache.cc new file mode 100644 index 000000000..a883b52e7 --- /dev/null +++ b/src/rocksdb/utilities/simulator_cache/sim_cache.cc @@ -0,0 +1,364 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/utilities/sim_cache.h" + +#include +#include + +#include "file/writable_file_writer.h" +#include "monitoring/statistics.h" +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +class CacheActivityLogger { + public: + CacheActivityLogger() + : activity_logging_enabled_(false), max_logging_size_(0) {} + + ~CacheActivityLogger() { + MutexLock l(&mutex_); + + StopLoggingInternal(); + bg_status_.PermitUncheckedError(); + } + + Status StartLogging(const std::string& activity_log_file, Env* env, + uint64_t max_logging_size = 0) { + assert(activity_log_file != ""); + assert(env != nullptr); + + Status status; + FileOptions file_opts; + + MutexLock l(&mutex_); + + // Stop existing logging if any + StopLoggingInternal(); + + // Open log file + status = WritableFileWriter::Create(env->GetFileSystem(), activity_log_file, + file_opts, &file_writer_, nullptr); + if (!status.ok()) { + return status; + } + + max_logging_size_ = max_logging_size; + activity_logging_enabled_.store(true); + + return status; + } + + void StopLogging() { + MutexLock l(&mutex_); + + StopLoggingInternal(); + } + + void ReportLookup(const Slice& key) { + if (activity_logging_enabled_.load() == false) { + return; + } + + std::ostringstream oss; + // line format: "LOOKUP - " + oss << "LOOKUP - " << key.ToString(true) << std::endl; + + MutexLock l(&mutex_); + Status s = file_writer_->Append(oss.str()); + if (!s.ok() && bg_status_.ok()) { + bg_status_ = s; + } + if (MaxLoggingSizeReached() || !bg_status_.ok()) { + // Stop logging if we have reached the max file size or + // encountered an error + StopLoggingInternal(); + } + } + + void ReportAdd(const Slice& key, size_t size) { + if (activity_logging_enabled_.load() == false) { + return; + } + + std::ostringstream oss; + // line format: "ADD - - " + oss << "ADD - " << key.ToString(true) << " - " << size << 
std::endl; + MutexLock l(&mutex_); + Status s = file_writer_->Append(oss.str()); + if (!s.ok() && bg_status_.ok()) { + bg_status_ = s; + } + + if (MaxLoggingSizeReached() || !bg_status_.ok()) { + // Stop logging if we have reached the max file size or + // encountered an error + StopLoggingInternal(); + } + } + + Status& bg_status() { + MutexLock l(&mutex_); + return bg_status_; + } + + private: + bool MaxLoggingSizeReached() { + mutex_.AssertHeld(); + + return (max_logging_size_ > 0 && + file_writer_->GetFileSize() >= max_logging_size_); + } + + void StopLoggingInternal() { + mutex_.AssertHeld(); + + if (!activity_logging_enabled_) { + return; + } + + activity_logging_enabled_.store(false); + Status s = file_writer_->Close(); + if (!s.ok() && bg_status_.ok()) { + bg_status_ = s; + } + } + + // Mutex to sync writes to file_writer, and all following + // class data members + port::Mutex mutex_; + // Indicates if logging is currently enabled + // atomic to allow reads without mutex + std::atomic activity_logging_enabled_; + // When reached, we will stop logging and close the file + // Value of 0 means unlimited + uint64_t max_logging_size_; + std::unique_ptr file_writer_; + Status bg_status_; +}; + +// SimCacheImpl definition +class SimCacheImpl : public SimCache { + public: + // capacity for real cache (ShardedLRUCache) + // test_capacity for key only cache + SimCacheImpl(std::shared_ptr sim_cache, std::shared_ptr cache) + : cache_(cache), + key_only_cache_(sim_cache), + miss_times_(0), + hit_times_(0), + stats_(nullptr) {} + + ~SimCacheImpl() override {} + void SetCapacity(size_t capacity) override { cache_->SetCapacity(capacity); } + + void SetStrictCapacityLimit(bool strict_capacity_limit) override { + cache_->SetStrictCapacityLimit(strict_capacity_limit); + } + + using Cache::Insert; + Status Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value), Handle** handle, + Priority priority) override { + // The handle and value passed in are for real cache, so we pass nullptr + // to key_only_cache_ for both instead. Also, the deleter function pointer + // will be called by user to perform some external operation which should + // be applied only once. Thus key_only_cache accepts an empty function. + // *Lambda function without capture can be assgined to a function pointer + Handle* h = key_only_cache_->Lookup(key); + if (h == nullptr) { + // TODO: Check for error here? 
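+ // The key-only cache is charged the same `charge` as the real entry but
+ // stores a nullptr value with a no-op deleter, so the simulated cache
+ // tracks capacity usage and eviction without duplicating the cached value.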
+ auto s = key_only_cache_->Insert( + key, nullptr, charge, [](const Slice& /*k*/, void* /*v*/) {}, nullptr, + priority); + s.PermitUncheckedError(); + } else { + key_only_cache_->Release(h); + } + + cache_activity_logger_.ReportAdd(key, charge); + if (!cache_) { + return Status::OK(); + } + return cache_->Insert(key, value, charge, deleter, handle, priority); + } + + using Cache::Lookup; + Handle* Lookup(const Slice& key, Statistics* stats) override { + Handle* h = key_only_cache_->Lookup(key); + if (h != nullptr) { + key_only_cache_->Release(h); + inc_hit_counter(); + RecordTick(stats, SIM_BLOCK_CACHE_HIT); + } else { + inc_miss_counter(); + RecordTick(stats, SIM_BLOCK_CACHE_MISS); + } + + cache_activity_logger_.ReportLookup(key); + if (!cache_) { + return nullptr; + } + return cache_->Lookup(key, stats); + } + + bool Ref(Handle* handle) override { return cache_->Ref(handle); } + + using Cache::Release; + bool Release(Handle* handle, bool erase_if_last_ref = false) override { + return cache_->Release(handle, erase_if_last_ref); + } + + void Erase(const Slice& key) override { + cache_->Erase(key); + key_only_cache_->Erase(key); + } + + void* Value(Handle* handle) override { return cache_->Value(handle); } + + uint64_t NewId() override { return cache_->NewId(); } + + size_t GetCapacity() const override { return cache_->GetCapacity(); } + + bool HasStrictCapacityLimit() const override { + return cache_->HasStrictCapacityLimit(); + } + + size_t GetUsage() const override { return cache_->GetUsage(); } + + size_t GetUsage(Handle* handle) const override { + return cache_->GetUsage(handle); + } + + size_t GetCharge(Handle* handle) const override { + return cache_->GetCharge(handle); + } + + DeleterFn GetDeleter(Handle* handle) const override { + return cache_->GetDeleter(handle); + } + + size_t GetPinnedUsage() const override { return cache_->GetPinnedUsage(); } + + void DisownData() override { + cache_->DisownData(); + key_only_cache_->DisownData(); + } + + void ApplyToAllCacheEntries(void (*callback)(void*, size_t), + bool thread_safe) override { + // only apply to _cache since key_only_cache doesn't hold value + cache_->ApplyToAllCacheEntries(callback, thread_safe); + } + + void ApplyToAllEntries( + const std::function& callback, + const ApplyToAllEntriesOptions& opts) override { + cache_->ApplyToAllEntries(callback, opts); + } + + void EraseUnRefEntries() override { + cache_->EraseUnRefEntries(); + key_only_cache_->EraseUnRefEntries(); + } + + size_t GetSimCapacity() const override { + return key_only_cache_->GetCapacity(); + } + size_t GetSimUsage() const override { return key_only_cache_->GetUsage(); } + void SetSimCapacity(size_t capacity) override { + key_only_cache_->SetCapacity(capacity); + } + + uint64_t get_miss_counter() const override { + return miss_times_.load(std::memory_order_relaxed); + } + + uint64_t get_hit_counter() const override { + return hit_times_.load(std::memory_order_relaxed); + } + + void reset_counter() override { + miss_times_.store(0, std::memory_order_relaxed); + hit_times_.store(0, std::memory_order_relaxed); + SetTickerCount(stats_, SIM_BLOCK_CACHE_HIT, 0); + SetTickerCount(stats_, SIM_BLOCK_CACHE_MISS, 0); + } + + std::string ToString() const override { + std::ostringstream oss; + oss << "SimCache MISSes: " << get_miss_counter() << std::endl; + oss << "SimCache HITs: " << get_hit_counter() << std::endl; + auto lookups = get_miss_counter() + get_hit_counter(); + oss << "SimCache HITRATE: " << std::fixed << std::setprecision(2) + << (lookups == 0 ? 
0 : get_hit_counter() * 100.0f / lookups) + << std::endl; + return oss.str(); + } + + std::string GetPrintableOptions() const override { + std::ostringstream oss; + oss << " cache_options:" << std::endl; + oss << cache_->GetPrintableOptions(); + oss << " sim_cache_options:" << std::endl; + oss << key_only_cache_->GetPrintableOptions(); + return oss.str(); + } + + Status StartActivityLogging(const std::string& activity_log_file, Env* env, + uint64_t max_logging_size = 0) override { + return cache_activity_logger_.StartLogging(activity_log_file, env, + max_logging_size); + } + + void StopActivityLogging() override { cache_activity_logger_.StopLogging(); } + + Status GetActivityLoggingStatus() override { + return cache_activity_logger_.bg_status(); + } + + private: + std::shared_ptr cache_; + std::shared_ptr key_only_cache_; + std::atomic miss_times_; + std::atomic hit_times_; + Statistics* stats_; + CacheActivityLogger cache_activity_logger_; + + void inc_miss_counter() { + miss_times_.fetch_add(1, std::memory_order_relaxed); + } + void inc_hit_counter() { hit_times_.fetch_add(1, std::memory_order_relaxed); } +}; + +} // end anonymous namespace + +// For instrumentation purpose, use NewSimCache instead +std::shared_ptr NewSimCache(std::shared_ptr cache, + size_t sim_capacity, int num_shard_bits) { + LRUCacheOptions co; + co.capacity = sim_capacity; + co.num_shard_bits = num_shard_bits; + co.metadata_charge_policy = kDontChargeCacheMetadata; + return NewSimCache(NewLRUCache(co), cache, num_shard_bits); +} + +std::shared_ptr NewSimCache(std::shared_ptr sim_cache, + std::shared_ptr cache, + int num_shard_bits) { + if (num_shard_bits >= 20) { + return nullptr; // the cache cannot be sharded into too many fine pieces + } + return std::make_shared(sim_cache, cache); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/simulator_cache/sim_cache_test.cc b/src/rocksdb/utilities/simulator_cache/sim_cache_test.cc new file mode 100644 index 000000000..2e37cd347 --- /dev/null +++ b/src/rocksdb/utilities/simulator_cache/sim_cache_test.cc @@ -0,0 +1,226 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/utilities/sim_cache.h" + +#include + +#include "db/db_test_util.h" +#include "port/stack_trace.h" + +namespace ROCKSDB_NAMESPACE { + +class SimCacheTest : public DBTestBase { + private: + size_t miss_count_ = 0; + size_t hit_count_ = 0; + size_t insert_count_ = 0; + size_t failure_count_ = 0; + + public: + const size_t kNumBlocks = 5; + const size_t kValueSize = 1000; + + SimCacheTest() : DBTestBase("sim_cache_test", /*env_do_fsync=*/true) {} + + BlockBasedTableOptions GetTableOptions() { + BlockBasedTableOptions table_options; + // Set a small enough block size so that each key-value get its own block. 
+ table_options.block_size = 1; + return table_options; + } + + Options GetOptions(const BlockBasedTableOptions& table_options) { + Options options = CurrentOptions(); + options.create_if_missing = true; + // options.compression = kNoCompression; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + return options; + } + + void InitTable(const Options& /*options*/) { + std::string value(kValueSize, 'a'); + for (size_t i = 0; i < kNumBlocks * 2; i++) { + ASSERT_OK(Put(std::to_string(i), value.c_str())); + } + } + + void RecordCacheCounters(const Options& options) { + miss_count_ = TestGetTickerCount(options, BLOCK_CACHE_MISS); + hit_count_ = TestGetTickerCount(options, BLOCK_CACHE_HIT); + insert_count_ = TestGetTickerCount(options, BLOCK_CACHE_ADD); + failure_count_ = TestGetTickerCount(options, BLOCK_CACHE_ADD_FAILURES); + } + + void CheckCacheCounters(const Options& options, size_t expected_misses, + size_t expected_hits, size_t expected_inserts, + size_t expected_failures) { + size_t new_miss_count = TestGetTickerCount(options, BLOCK_CACHE_MISS); + size_t new_hit_count = TestGetTickerCount(options, BLOCK_CACHE_HIT); + size_t new_insert_count = TestGetTickerCount(options, BLOCK_CACHE_ADD); + size_t new_failure_count = + TestGetTickerCount(options, BLOCK_CACHE_ADD_FAILURES); + ASSERT_EQ(miss_count_ + expected_misses, new_miss_count); + ASSERT_EQ(hit_count_ + expected_hits, new_hit_count); + ASSERT_EQ(insert_count_ + expected_inserts, new_insert_count); + ASSERT_EQ(failure_count_ + expected_failures, new_failure_count); + miss_count_ = new_miss_count; + hit_count_ = new_hit_count; + insert_count_ = new_insert_count; + failure_count_ = new_failure_count; + } +}; + +TEST_F(SimCacheTest, SimCache) { + ReadOptions read_options; + auto table_options = GetTableOptions(); + auto options = GetOptions(table_options); + InitTable(options); + LRUCacheOptions co; + co.capacity = 0; + co.num_shard_bits = 0; + co.strict_capacity_limit = false; + co.metadata_charge_policy = kDontChargeCacheMetadata; + std::shared_ptr simCache = NewSimCache(NewLRUCache(co), 20000, 0); + table_options.block_cache = simCache; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + RecordCacheCounters(options); + // due to cache entry stats collector + uint64_t base_misses = simCache->get_miss_counter(); + + std::vector> iterators(kNumBlocks); + Iterator* iter = nullptr; + + // Load blocks into cache. + for (size_t i = 0; i < kNumBlocks; i++) { + iter = db_->NewIterator(read_options); + iter->Seek(std::to_string(i)); + ASSERT_OK(iter->status()); + CheckCacheCounters(options, 1, 0, 1, 0); + iterators[i].reset(iter); + } + ASSERT_EQ(kNumBlocks, simCache->get_hit_counter() + + simCache->get_miss_counter() - base_misses); + ASSERT_EQ(0, simCache->get_hit_counter()); + size_t usage = simCache->GetUsage(); + ASSERT_LT(0, usage); + ASSERT_EQ(usage, simCache->GetSimUsage()); + simCache->SetCapacity(usage); + ASSERT_EQ(usage, simCache->GetPinnedUsage()); + + // Test with strict capacity limit. + simCache->SetStrictCapacityLimit(true); + iter = db_->NewIterator(read_options); + iter->Seek(std::to_string(kNumBlocks * 2 - 1)); + ASSERT_TRUE(iter->status().IsMemoryLimit()); + CheckCacheCounters(options, 1, 0, 0, 1); + delete iter; + iter = nullptr; + + // Release iterators and access cache again. 
+ for (size_t i = 0; i < kNumBlocks; i++) { + iterators[i].reset(); + CheckCacheCounters(options, 0, 0, 0, 0); + } + // Add kNumBlocks again + for (size_t i = 0; i < kNumBlocks; i++) { + std::unique_ptr it(db_->NewIterator(read_options)); + it->Seek(std::to_string(i)); + ASSERT_OK(it->status()); + CheckCacheCounters(options, 0, 1, 0, 0); + } + ASSERT_EQ(5, simCache->get_hit_counter()); + for (size_t i = kNumBlocks; i < kNumBlocks * 2; i++) { + std::unique_ptr it(db_->NewIterator(read_options)); + it->Seek(std::to_string(i)); + ASSERT_OK(it->status()); + CheckCacheCounters(options, 1, 0, 1, 0); + } + ASSERT_EQ(0, simCache->GetPinnedUsage()); + ASSERT_EQ(3 * kNumBlocks + 1, simCache->get_hit_counter() + + simCache->get_miss_counter() - base_misses); + ASSERT_EQ(6, simCache->get_hit_counter()); +} + +TEST_F(SimCacheTest, SimCacheLogging) { + auto table_options = GetTableOptions(); + auto options = GetOptions(table_options); + options.disable_auto_compactions = true; + LRUCacheOptions co; + co.capacity = 1024 * 1024; + co.metadata_charge_policy = kDontChargeCacheMetadata; + std::shared_ptr sim_cache = NewSimCache(NewLRUCache(co), 20000, 0); + table_options.block_cache = sim_cache; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + + int num_block_entries = 20; + for (int i = 0; i < num_block_entries; i++) { + ASSERT_OK(Put(Key(i), "val")); + ASSERT_OK(Flush()); + } + + std::string log_file = test::PerThreadDBPath(env_, "cache_log.txt"); + ASSERT_OK(sim_cache->StartActivityLogging(log_file, env_)); + for (int i = 0; i < num_block_entries; i++) { + ASSERT_EQ(Get(Key(i)), "val"); + } + for (int i = 0; i < num_block_entries; i++) { + ASSERT_EQ(Get(Key(i)), "val"); + } + sim_cache->StopActivityLogging(); + ASSERT_OK(sim_cache->GetActivityLoggingStatus()); + + std::string file_contents = ""; + ASSERT_OK(ReadFileToString(env_, log_file, &file_contents)); + std::istringstream contents(file_contents); + + int lookup_num = 0; + int add_num = 0; + + std::string line; + // count number of lookups and additions + while (std::getline(contents, line)) { + // check if the line starts with LOOKUP or ADD + if (line.rfind("LOOKUP -", 0) == 0) { + ++lookup_num; + } + if (line.rfind("ADD -", 0) == 0) { + ++add_num; + } + } + + // We asked for every block twice + ASSERT_EQ(lookup_num, num_block_entries * 2); + + // We added every block only once, since the cache can hold all blocks + ASSERT_EQ(add_num, num_block_entries); + + // Log things again but stop logging automatically after reaching 512 bytes + int max_size = 512; + ASSERT_OK(sim_cache->StartActivityLogging(log_file, env_, max_size)); + for (int it = 0; it < 10; it++) { + for (int i = 0; i < num_block_entries; i++) { + ASSERT_EQ(Get(Key(i)), "val"); + } + } + ASSERT_OK(sim_cache->GetActivityLoggingStatus()); + + uint64_t fsize = 0; + ASSERT_OK(env_->GetFileSize(log_file, &fsize)); + // error margin of 100 bytes + ASSERT_LT(fsize, max_size + 100); + ASSERT_GT(fsize, max_size - 100); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc b/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc new file mode 100644 index 000000000..16f33934d --- /dev/null +++ b/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc @@ -0,0 
+1,227 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "utilities/table_properties_collectors/compact_on_deletion_collector.h" + +#include +#include + +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" +#include "rocksdb/utilities/table_properties_collectors.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +#ifndef ROCKSDB_LITE + +CompactOnDeletionCollector::CompactOnDeletionCollector( + size_t sliding_window_size, size_t deletion_trigger, double deletion_ratio) + : bucket_size_((sliding_window_size + kNumBuckets - 1) / kNumBuckets), + current_bucket_(0), + num_keys_in_current_bucket_(0), + num_deletions_in_observation_window_(0), + deletion_trigger_(deletion_trigger), + deletion_ratio_(deletion_ratio), + deletion_ratio_enabled_(deletion_ratio > 0 && deletion_ratio <= 1), + need_compaction_(false), + finished_(false) { + memset(num_deletions_in_buckets_, 0, sizeof(size_t) * kNumBuckets); +} + +// AddUserKey() will be called when a new key/value pair is inserted into the +// table. +// @params key the user key that is inserted into the table. +// @params value the value that is inserted into the table. +// @params file_size file size up to now +Status CompactOnDeletionCollector::AddUserKey(const Slice& /*key*/, + const Slice& /*value*/, + EntryType type, + SequenceNumber /*seq*/, + uint64_t /*file_size*/) { + assert(!finished_); + if (!bucket_size_ && !deletion_ratio_enabled_) { + // This collector is effectively disabled + return Status::OK(); + } + + if (need_compaction_) { + // If the output file already needs to be compacted, skip the check. + return Status::OK(); + } + + if (deletion_ratio_enabled_) { + total_entries_++; + if (type == kEntryDelete) { + deletion_entries_++; + } + } + + if (bucket_size_) { + if (num_keys_in_current_bucket_ == bucket_size_) { + // When the current bucket is full, advance the cursor of the + // ring buffer to the next bucket. + current_bucket_ = (current_bucket_ + 1) % kNumBuckets; + + // Update the current count of observed deletion keys by excluding + // the number of deletion keys in the oldest bucket in the + // observation window. 
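+ // For example, with kNumBuckets = 128 and a sliding window of 1000 keys,
+ // bucket_size_ = (1000 + 127) / 128 = 8 keys per bucket: after every 8
+ // keys the cursor advances, and the deletion count of the bucket being
+ // reused (the oldest one in the window) is subtracted from the running
+ // total before the bucket is cleared.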
+ assert(num_deletions_in_observation_window_ >= + num_deletions_in_buckets_[current_bucket_]); + num_deletions_in_observation_window_ -= + num_deletions_in_buckets_[current_bucket_]; + num_deletions_in_buckets_[current_bucket_] = 0; + num_keys_in_current_bucket_ = 0; + } + + num_keys_in_current_bucket_++; + if (type == kEntryDelete) { + num_deletions_in_observation_window_++; + num_deletions_in_buckets_[current_bucket_]++; + if (num_deletions_in_observation_window_ >= deletion_trigger_) { + need_compaction_ = true; + } + } + } + + return Status::OK(); +} + +Status CompactOnDeletionCollector::Finish( + UserCollectedProperties* /*properties*/) { + if (!need_compaction_ && deletion_ratio_enabled_ && total_entries_ > 0) { + double ratio = static_cast(deletion_entries_) / total_entries_; + need_compaction_ = ratio >= deletion_ratio_; + } + finished_ = true; + return Status::OK(); +} +static std::unordered_map + on_deletion_collector_type_info = { +#ifndef ROCKSDB_LITE + {"window_size", + {0, OptionType::kUnknown, OptionVerificationType::kNormal, + OptionTypeFlags::kCompareNever | OptionTypeFlags::kMutable, + [](const ConfigOptions&, const std::string&, const std::string& value, + void* addr) { + auto* factory = + static_cast(addr); + factory->SetWindowSize(ParseSizeT(value)); + return Status::OK(); + }, + [](const ConfigOptions&, const std::string&, const void* addr, + std::string* value) { + const auto* factory = + static_cast(addr); + *value = std::to_string(factory->GetWindowSize()); + return Status::OK(); + }, + nullptr}}, + {"deletion_trigger", + {0, OptionType::kUnknown, OptionVerificationType::kNormal, + OptionTypeFlags::kCompareNever | OptionTypeFlags::kMutable, + [](const ConfigOptions&, const std::string&, const std::string& value, + void* addr) { + auto* factory = + static_cast(addr); + factory->SetDeletionTrigger(ParseSizeT(value)); + return Status::OK(); + }, + [](const ConfigOptions&, const std::string&, const void* addr, + std::string* value) { + const auto* factory = + static_cast(addr); + *value = std::to_string(factory->GetDeletionTrigger()); + return Status::OK(); + }, + nullptr}}, + {"deletion_ratio", + {0, OptionType::kUnknown, OptionVerificationType::kNormal, + OptionTypeFlags::kCompareNever | OptionTypeFlags::kMutable, + [](const ConfigOptions&, const std::string&, const std::string& value, + void* addr) { + auto* factory = + static_cast(addr); + factory->SetDeletionRatio(ParseDouble(value)); + return Status::OK(); + }, + [](const ConfigOptions&, const std::string&, const void* addr, + std::string* value) { + const auto* factory = + static_cast(addr); + *value = std::to_string(factory->GetDeletionRatio()); + return Status::OK(); + }, + nullptr}}, + +#endif // ROCKSDB_LITE +}; + +CompactOnDeletionCollectorFactory::CompactOnDeletionCollectorFactory( + size_t sliding_window_size, size_t deletion_trigger, double deletion_ratio) + : sliding_window_size_(sliding_window_size), + deletion_trigger_(deletion_trigger), + deletion_ratio_(deletion_ratio) { + RegisterOptions("", this, &on_deletion_collector_type_info); +} + +TablePropertiesCollector* +CompactOnDeletionCollectorFactory::CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /*context*/) { + return new CompactOnDeletionCollector(sliding_window_size_.load(), + deletion_trigger_.load(), + deletion_ratio_.load()); +} + +std::string CompactOnDeletionCollectorFactory::ToString() const { + std::ostringstream cfg; + cfg << Name() << " (Sliding window size = " << sliding_window_size_.load() + << " Deletion 
trigger = " << deletion_trigger_.load() + << " Deletion ratio = " << deletion_ratio_.load() << ')'; + return cfg.str(); +} + +std::shared_ptr +NewCompactOnDeletionCollectorFactory(size_t sliding_window_size, + size_t deletion_trigger, + double deletion_ratio) { + return std::shared_ptr( + new CompactOnDeletionCollectorFactory(sliding_window_size, + deletion_trigger, deletion_ratio)); +} +namespace { +static int RegisterTablePropertiesCollectorFactories( + ObjectLibrary& library, const std::string& /*arg*/) { + library.AddFactory( + CompactOnDeletionCollectorFactory::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* guard, + std::string* /* errmsg */) { + // By default, create a CompactionOnDeletionCollector that is disabled. + // Users will need to provide configuration parameters or call the + // corresponding Setter to enable the factory. + guard->reset(new CompactOnDeletionCollectorFactory(0, 0, 0)); + return guard->get(); + }); + return 1; +} +} // namespace +#endif // !ROCKSDB_LITE + +Status TablePropertiesCollectorFactory::CreateFromString( + const ConfigOptions& options, const std::string& value, + std::shared_ptr* result) { +#ifndef ROCKSDB_LITE + static std::once_flag once; + std::call_once(once, [&]() { + RegisterTablePropertiesCollectorFactories(*(ObjectLibrary::Default().get()), + ""); + }); +#endif // ROCKSDB_LITE + return LoadSharedObject(options, value, + nullptr, result); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.h b/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.h new file mode 100644 index 000000000..2f7dc4f1b --- /dev/null +++ b/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.h @@ -0,0 +1,70 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE +#include "rocksdb/utilities/table_properties_collectors.h" +namespace ROCKSDB_NAMESPACE { + +class CompactOnDeletionCollector : public TablePropertiesCollector { + public: + CompactOnDeletionCollector(size_t sliding_window_size, + size_t deletion_trigger, double deletion_raatio); + + // AddUserKey() will be called when a new key/value pair is inserted into the + // table. + // @params key the user key that is inserted into the table. + // @params value the value that is inserted into the table. + // @params file_size file size up to now + virtual Status AddUserKey(const Slice& key, const Slice& value, + EntryType type, SequenceNumber seq, + uint64_t file_size) override; + + // Finish() will be called when a table has already been built and is ready + // for writing the properties block. + // @params properties User will add their collected statistics to + // `properties`. + virtual Status Finish(UserCollectedProperties* /*properties*/) override; + + // Return the human-readable properties, where the key is property name and + // the value is the human-readable form of value. + virtual UserCollectedProperties GetReadableProperties() const override { + return UserCollectedProperties(); + } + + // The name of the properties collector can be used for debugging purpose. 
+ virtual const char* Name() const override { + return "CompactOnDeletionCollector"; + } + + // EXPERIMENTAL Return whether the output file should be further compacted + virtual bool NeedCompact() const override { return need_compaction_; } + + static const int kNumBuckets = 128; + + private: + void Reset(); + + // A ring buffer that used to count the number of deletion entries for every + // "bucket_size_" keys. + size_t num_deletions_in_buckets_[kNumBuckets]; + // the number of keys in a bucket + size_t bucket_size_; + + size_t current_bucket_; + size_t num_keys_in_current_bucket_; + size_t num_deletions_in_observation_window_; + size_t deletion_trigger_; + const double deletion_ratio_; + const bool deletion_ratio_enabled_; + size_t total_entries_ = 0; + size_t deletion_entries_ = 0; + // true if the current SST file needs to be compacted. + bool need_compaction_; + bool finished_; +}; +} // namespace ROCKSDB_NAMESPACE +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc b/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc new file mode 100644 index 000000000..88aeb8d5c --- /dev/null +++ b/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc @@ -0,0 +1,245 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include + +#ifndef ROCKSDB_LITE +#include +#include +#include + +#include "port/stack_trace.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "rocksdb/utilities/table_properties_collectors.h" +#include "test_util/testharness.h" +#include "util/random.h" +#include "utilities/table_properties_collectors/compact_on_deletion_collector.h" + +namespace ROCKSDB_NAMESPACE { + +TEST(CompactOnDeletionCollector, DeletionRatio) { + TablePropertiesCollectorFactory::Context context; + context.column_family_id = + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily; + const size_t kTotalEntries = 100; + + { + // Disable deletion ratio. + for (double deletion_ratio : {-1.5, -1.0, 0.0, 1.5, 2.0}) { + auto factory = NewCompactOnDeletionCollectorFactory(0, 0, deletion_ratio); + std::unique_ptr collector( + factory->CreateTablePropertiesCollector(context)); + for (size_t i = 0; i < kTotalEntries; i++) { + // All entries are deletion entries. 
+ ASSERT_OK( + collector->AddUserKey("hello", "rocksdb", kEntryDelete, 0, 0)); + ASSERT_FALSE(collector->NeedCompact()); + } + ASSERT_OK(collector->Finish(nullptr)); + ASSERT_FALSE(collector->NeedCompact()); + } + } + + { + for (double deletion_ratio : {0.3, 0.5, 0.8, 1.0}) { + auto factory = NewCompactOnDeletionCollectorFactory(0, 0, deletion_ratio); + const size_t deletion_entries_trigger = + static_cast(deletion_ratio * kTotalEntries); + for (int delta : {-1, 0, 1}) { + // Actual deletion entry ratio <, =, > deletion_ratio + size_t actual_deletion_entries = deletion_entries_trigger + delta; + std::unique_ptr collector( + factory->CreateTablePropertiesCollector(context)); + for (size_t i = 0; i < kTotalEntries; i++) { + if (i < actual_deletion_entries) { + ASSERT_OK( + collector->AddUserKey("hello", "rocksdb", kEntryDelete, 0, 0)); + } else { + ASSERT_OK( + collector->AddUserKey("hello", "rocksdb", kEntryPut, 0, 0)); + } + ASSERT_FALSE(collector->NeedCompact()); + } + ASSERT_OK(collector->Finish(nullptr)); + if (delta >= 0) { + // >= deletion_ratio + ASSERT_TRUE(collector->NeedCompact()); + } else { + ASSERT_FALSE(collector->NeedCompact()); + } + } + } + } +} + +TEST(CompactOnDeletionCollector, SlidingWindow) { + const int kWindowSizes[] = {1000, 10000, 10000, 127, 128, 129, + 255, 256, 257, 2, 10000}; + const int kDeletionTriggers[] = {500, 9500, 4323, 47, 61, 128, + 250, 250, 250, 2, 2}; + TablePropertiesCollectorFactory::Context context; + context.column_family_id = + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily; + + std::vector window_sizes; + std::vector deletion_triggers; + // deterministic tests + for (int test = 0; test < 9; ++test) { + window_sizes.emplace_back(kWindowSizes[test]); + deletion_triggers.emplace_back(kDeletionTriggers[test]); + } + + // randomize tests + Random rnd(301); + const int kMaxTestSize = 100000l; + for (int random_test = 0; random_test < 10; random_test++) { + int window_size = rnd.Uniform(kMaxTestSize) + 1; + int deletion_trigger = rnd.Uniform(window_size); + window_sizes.emplace_back(window_size); + deletion_triggers.emplace_back(deletion_trigger); + } + + assert(window_sizes.size() == deletion_triggers.size()); + + for (size_t test = 0; test < window_sizes.size(); ++test) { + const int kBucketSize = 128; + const int kWindowSize = window_sizes[test]; + const int kPaddedWindowSize = + kBucketSize * ((window_sizes[test] + kBucketSize - 1) / kBucketSize); + const int kNumDeletionTrigger = deletion_triggers[test]; + const int kBias = (kNumDeletionTrigger + kBucketSize - 1) / kBucketSize; + // Simple test + { + auto factory = NewCompactOnDeletionCollectorFactory(kWindowSize, + kNumDeletionTrigger); + const int kSample = 10; + for (int delete_rate = 0; delete_rate <= kSample; ++delete_rate) { + std::unique_ptr collector( + factory->CreateTablePropertiesCollector(context)); + int deletions = 0; + for (int i = 0; i < kPaddedWindowSize; ++i) { + if (i % kSample < delete_rate) { + ASSERT_OK( + collector->AddUserKey("hello", "rocksdb", kEntryDelete, 0, 0)); + deletions++; + } else { + ASSERT_OK( + collector->AddUserKey("hello", "rocksdb", kEntryPut, 0, 0)); + } + } + if (collector->NeedCompact() != (deletions >= kNumDeletionTrigger) && + std::abs(deletions - kNumDeletionTrigger) > kBias) { + fprintf(stderr, + "[Error] collector->NeedCompact() != (%d >= %d)" + " with kWindowSize = %d and kNumDeletionTrigger = %d\n", + deletions, kNumDeletionTrigger, kWindowSize, + kNumDeletionTrigger); + ASSERT_TRUE(false); + } + 
ASSERT_OK(collector->Finish(nullptr)); + } + } + + // Only one section of a file satisfies the compaction trigger + { + auto factory = NewCompactOnDeletionCollectorFactory(kWindowSize, + kNumDeletionTrigger); + const int kSample = 10; + for (int delete_rate = 0; delete_rate <= kSample; ++delete_rate) { + std::unique_ptr collector( + factory->CreateTablePropertiesCollector(context)); + int deletions = 0; + for (int section = 0; section < 5; ++section) { + int initial_entries = rnd.Uniform(kWindowSize) + kWindowSize; + for (int i = 0; i < initial_entries; ++i) { + ASSERT_OK( + collector->AddUserKey("hello", "rocksdb", kEntryPut, 0, 0)); + } + } + for (int i = 0; i < kPaddedWindowSize; ++i) { + if (i % kSample < delete_rate) { + ASSERT_OK( + collector->AddUserKey("hello", "rocksdb", kEntryDelete, 0, 0)); + deletions++; + } else { + ASSERT_OK( + collector->AddUserKey("hello", "rocksdb", kEntryPut, 0, 0)); + } + } + for (int section = 0; section < 5; ++section) { + int ending_entries = rnd.Uniform(kWindowSize) + kWindowSize; + for (int i = 0; i < ending_entries; ++i) { + ASSERT_OK( + collector->AddUserKey("hello", "rocksdb", kEntryPut, 0, 0)); + } + } + if (collector->NeedCompact() != (deletions >= kNumDeletionTrigger) && + std::abs(deletions - kNumDeletionTrigger) > kBias) { + fprintf(stderr, + "[Error] collector->NeedCompact() %d != (%d >= %d)" + " with kWindowSize = %d, kNumDeletionTrigger = %d\n", + collector->NeedCompact(), deletions, kNumDeletionTrigger, + kWindowSize, kNumDeletionTrigger); + ASSERT_TRUE(false); + } + ASSERT_OK(collector->Finish(nullptr)); + } + } + + // TEST 3: Issues a lots of deletes, but their density is not + // high enough to trigger compaction. + { + std::unique_ptr collector; + auto factory = NewCompactOnDeletionCollectorFactory(kWindowSize, + kNumDeletionTrigger); + collector.reset(factory->CreateTablePropertiesCollector(context)); + assert(collector->NeedCompact() == false); + // Insert "kNumDeletionTrigger * 0.95" deletions for every + // "kWindowSize" and verify compaction is not needed. + const int kDeletionsPerSection = kNumDeletionTrigger * 95 / 100; + if (kDeletionsPerSection >= 0) { + for (int section = 0; section < 200; ++section) { + for (int i = 0; i < kPaddedWindowSize; ++i) { + if (i < kDeletionsPerSection) { + ASSERT_OK(collector->AddUserKey("hello", "rocksdb", kEntryDelete, + 0, 0)); + } else { + ASSERT_OK( + collector->AddUserKey("hello", "rocksdb", kEntryPut, 0, 0)); + } + } + } + if (collector->NeedCompact() && + std::abs(kDeletionsPerSection - kNumDeletionTrigger) > kBias) { + fprintf(stderr, + "[Error] collector->NeedCompact() != false" + " with kWindowSize = %d and kNumDeletionTrigger = %d\n", + kWindowSize, kNumDeletionTrigger); + ASSERT_TRUE(false); + } + ASSERT_OK(collector->Finish(nullptr)); + } + } + } +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} +#else +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED as RocksDBLite does not include utilities.\n"); + return 0; +} +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/utilities/trace/file_trace_reader_writer.cc b/src/rocksdb/utilities/trace/file_trace_reader_writer.cc new file mode 100644 index 000000000..5886d3539 --- /dev/null +++ b/src/rocksdb/utilities/trace/file_trace_reader_writer.cc @@ -0,0 +1,133 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "utilities/trace/file_trace_reader_writer.h" + +#include "env/composite_env_wrapper.h" +#include "file/random_access_file_reader.h" +#include "file/writable_file_writer.h" +#include "trace_replay/trace_replay.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +const unsigned int FileTraceReader::kBufferSize = 1024; // 1KB + +FileTraceReader::FileTraceReader( + std::unique_ptr&& reader) + : file_reader_(std::move(reader)), + offset_(0), + buffer_(new char[kBufferSize]) {} + +FileTraceReader::~FileTraceReader() { + Close().PermitUncheckedError(); + delete[] buffer_; +} + +Status FileTraceReader::Close() { + file_reader_.reset(); + return Status::OK(); +} + +Status FileTraceReader::Reset() { + if (file_reader_ == nullptr) { + return Status::IOError("TraceReader is closed."); + } + offset_ = 0; + return Status::OK(); +} + +Status FileTraceReader::Read(std::string* data) { + assert(file_reader_ != nullptr); + Status s = file_reader_->Read(IOOptions(), offset_, kTraceMetadataSize, + &result_, buffer_, nullptr, + Env::IO_TOTAL /* rate_limiter_priority */); + if (!s.ok()) { + return s; + } + if (result_.size() == 0) { + // No more data to read + // Todo: Come up with a better way to indicate end of data. May be this + // could be avoided once footer is introduced. + return Status::Incomplete(); + } + if (result_.size() < kTraceMetadataSize) { + return Status::Corruption("Corrupted trace file."); + } + *data = result_.ToString(); + offset_ += kTraceMetadataSize; + + uint32_t payload_len = + DecodeFixed32(&buffer_[kTraceTimestampSize + kTraceTypeSize]); + + // Read Payload + unsigned int bytes_to_read = payload_len; + unsigned int to_read = + bytes_to_read > kBufferSize ? kBufferSize : bytes_to_read; + while (to_read > 0) { + s = file_reader_->Read(IOOptions(), offset_, to_read, &result_, buffer_, + nullptr, Env::IO_TOTAL /* rate_limiter_priority */); + if (!s.ok()) { + return s; + } + if (result_.size() < to_read) { + return Status::Corruption("Corrupted trace file."); + } + data->append(result_.data(), result_.size()); + + offset_ += to_read; + bytes_to_read -= to_read; + to_read = bytes_to_read > kBufferSize ? 
kBufferSize : bytes_to_read; + } + + return s; +} + +FileTraceWriter::FileTraceWriter( + std::unique_ptr&& file_writer) + : file_writer_(std::move(file_writer)) {} + +FileTraceWriter::~FileTraceWriter() { Close().PermitUncheckedError(); } + +Status FileTraceWriter::Close() { + file_writer_.reset(); + return Status::OK(); +} + +Status FileTraceWriter::Write(const Slice& data) { + return file_writer_->Append(data); +} + +uint64_t FileTraceWriter::GetFileSize() { return file_writer_->GetFileSize(); } + +Status NewFileTraceReader(Env* env, const EnvOptions& env_options, + const std::string& trace_filename, + std::unique_ptr* trace_reader) { + std::unique_ptr file_reader; + Status s = RandomAccessFileReader::Create( + env->GetFileSystem(), trace_filename, FileOptions(env_options), + &file_reader, nullptr); + if (!s.ok()) { + return s; + } + trace_reader->reset(new FileTraceReader(std::move(file_reader))); + return s; +} + +Status NewFileTraceWriter(Env* env, const EnvOptions& env_options, + const std::string& trace_filename, + std::unique_ptr* trace_writer) { + std::unique_ptr file_writer; + Status s = WritableFileWriter::Create(env->GetFileSystem(), trace_filename, + FileOptions(env_options), &file_writer, + nullptr); + if (!s.ok()) { + return s; + } + trace_writer->reset(new FileTraceWriter(std::move(file_writer))); + return s; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/trace/file_trace_reader_writer.h b/src/rocksdb/utilities/trace/file_trace_reader_writer.h new file mode 100644 index 000000000..65d483108 --- /dev/null +++ b/src/rocksdb/utilities/trace/file_trace_reader_writer.h @@ -0,0 +1,48 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/trace_reader_writer.h" + +namespace ROCKSDB_NAMESPACE { + +class RandomAccessFileReader; +class WritableFileWriter; + +// FileTraceReader allows reading RocksDB traces from a file. +class FileTraceReader : public TraceReader { + public: + explicit FileTraceReader(std::unique_ptr&& reader); + ~FileTraceReader(); + + virtual Status Read(std::string* data) override; + virtual Status Close() override; + virtual Status Reset() override; + + private: + std::unique_ptr file_reader_; + Slice result_; + size_t offset_; + char* const buffer_; + + static const unsigned int kBufferSize; +}; + +// FileTraceWriter allows writing RocksDB traces to a file. +class FileTraceWriter : public TraceWriter { + public: + explicit FileTraceWriter(std::unique_ptr&& file_writer); + ~FileTraceWriter(); + + virtual Status Write(const Slice& data) override; + virtual Status Close() override; + virtual uint64_t GetFileSize() override; + + private: + std::unique_ptr file_writer_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/trace/replayer_impl.cc b/src/rocksdb/utilities/trace/replayer_impl.cc new file mode 100644 index 000000000..31023f1a2 --- /dev/null +++ b/src/rocksdb/utilities/trace/replayer_impl.cc @@ -0,0 +1,316 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
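+//
+// A minimal usage sketch (illustrative; `db`, `handles`, and the trace path
+// are assumed to exist elsewhere): open a captured trace with
+// NewFileTraceReader(), then construct, prepare, and run a ReplayerImpl
+// against the running DB.
+//
+//   std::unique_ptr<TraceReader> reader;
+//   Status s = NewFileTraceReader(db->GetEnv(), EnvOptions(), trace_path,
+//                                 &reader);
+//   ReplayerImpl replayer(db, handles, std::move(reader));
+//   s = replayer.Prepare();
+//   ReplayOptions opts;  // assumed defaults: one thread, normal speed
+//   s = replayer.Replay(opts, /*result_callback=*/nullptr);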
+ +#ifndef ROCKSDB_LITE + +#include "utilities/trace/replayer_impl.h" + +#include +#include + +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/system_clock.h" +#include "util/threadpool_imp.h" + +namespace ROCKSDB_NAMESPACE { + +ReplayerImpl::ReplayerImpl(DB* db, + const std::vector& handles, + std::unique_ptr&& reader) + : Replayer(), + trace_reader_(std::move(reader)), + prepared_(false), + trace_end_(false), + header_ts_(0), + exec_handler_(TraceRecord::NewExecutionHandler(db, handles)), + env_(db->GetEnv()), + trace_file_version_(-1) {} + +ReplayerImpl::~ReplayerImpl() { + exec_handler_.reset(); + trace_reader_.reset(); +} + +Status ReplayerImpl::Prepare() { + Trace header; + int db_version; + Status s = ReadHeader(&header); + if (!s.ok()) { + return s; + } + s = TracerHelper::ParseTraceHeader(header, &trace_file_version_, &db_version); + if (!s.ok()) { + return s; + } + header_ts_ = header.ts; + prepared_ = true; + trace_end_ = false; + return Status::OK(); +} + +Status ReplayerImpl::Next(std::unique_ptr* record) { + if (!prepared_) { + return Status::Incomplete("Not prepared!"); + } + if (trace_end_) { + return Status::Incomplete("Trace end."); + } + + Trace trace; + Status s = ReadTrace(&trace); // ReadTrace is atomic + // Reached the trace end. + if (s.ok() && trace.type == kTraceEnd) { + trace_end_ = true; + return Status::Incomplete("Trace end."); + } + if (!s.ok() || record == nullptr) { + return s; + } + + return TracerHelper::DecodeTraceRecord(&trace, trace_file_version_, record); +} + +Status ReplayerImpl::Execute(const std::unique_ptr& record, + std::unique_ptr* result) { + return record->Accept(exec_handler_.get(), result); +} + +Status ReplayerImpl::Replay( + const ReplayOptions& options, + const std::function&&)>& + result_callback) { + if (options.fast_forward <= 0.0) { + return Status::InvalidArgument("Wrong fast forward speed!"); + } + + if (!prepared_) { + return Status::Incomplete("Not prepared!"); + } + if (trace_end_) { + return Status::Incomplete("Trace end."); + } + + Status s = Status::OK(); + + if (options.num_threads <= 1) { + // num_threads == 0 or num_threads == 1 uses single thread. + std::chrono::system_clock::time_point replay_epoch = + std::chrono::system_clock::now(); + + while (s.ok()) { + Trace trace; + s = ReadTrace(&trace); + // If already at trace end, ReadTrace should return Status::Incomplete(). + if (!s.ok()) { + break; + } + + // No need to sleep before breaking the loop if at the trace end. + if (trace.type == kTraceEnd) { + trace_end_ = true; + s = Status::Incomplete("Trace end."); + break; + } + + // In single-threaded replay, decode first then sleep. + std::unique_ptr record; + s = TracerHelper::DecodeTraceRecord(&trace, trace_file_version_, &record); + if (!s.ok() && !s.IsNotSupported()) { + break; + } + + std::chrono::system_clock::time_point sleep_to = + replay_epoch + + std::chrono::microseconds(static_cast(std::llround( + 1.0 * (trace.ts - header_ts_) / options.fast_forward))); + if (sleep_to > std::chrono::system_clock::now()) { + std::this_thread::sleep_until(sleep_to); + } + + // Skip unsupported traces, stop for other errors. + if (s.IsNotSupported()) { + if (result_callback != nullptr) { + result_callback(s, nullptr); + } + s = Status::OK(); + continue; + } + + if (result_callback == nullptr) { + s = Execute(record, nullptr); + } else { + std::unique_ptr res; + s = Execute(record, &res); + result_callback(s, std::move(res)); + } + } + } else { + // Multi-threaded replay. 
+ ThreadPoolImpl thread_pool;
+ thread_pool.SetHostEnv(env_);
+ thread_pool.SetBackgroundThreads(static_cast(options.num_threads));
+
+ std::mutex mtx;
+ // Background decoding and execution status.
+ Status bg_s = Status::OK();
+ uint64_t last_err_ts = static_cast(-1);
+ // Callback function used in background work to update bg_s for the earliest
+ // TraceRecord which has an execution error. This is different from the
+ // timestamp of the first execution error (either start or end timestamp).
+ //
+ // Suppose TraceRecord R1, R2, with timestamps T1 < T2. Their execution
+ // timestamps are T1_start, T1_end, T2_start, T2_end.
+ // Single-thread: there must be T1_start < T1_end < T2_start < T2_end.
+ // Multi-thread: T1_start < T2_start may not be enforced. Their order is
+ // completely unknown.
+ // In order to report the same `first` error in both single-thread and
+ // multi-thread replay, we can only rely on the TraceRecords' timestamps,
+ // rather than their execution timestamps. Note that in single-thread
+ // replay the first error is also the last error, while in multi-thread
+ // replay the first error may not be the first error in execution, and it
+ // may not be the last error in execution either.
+ auto error_cb = [&mtx, &bg_s, &last_err_ts](Status err, uint64_t err_ts) {
+ std::lock_guard gd(mtx);
+ // Only record the first error.
+ if (!err.ok() && !err.IsNotSupported() && err_ts < last_err_ts) {
+ bg_s = err;
+ last_err_ts = err_ts;
+ }
+ };
+
+ std::chrono::system_clock::time_point replay_epoch =
+ std::chrono::system_clock::now();
+
+ while (bg_s.ok() && s.ok()) {
+ Trace trace;
+ s = ReadTrace(&trace);
+ // If already at trace end, ReadTrace should return Status::Incomplete().
+ if (!s.ok()) {
+ break;
+ }
+
+ TraceType trace_type = trace.type;
+
+ // No need to sleep before breaking the loop if at the trace end.
+ if (trace_type == kTraceEnd) {
+ trace_end_ = true;
+ s = Status::Incomplete("Trace end.");
+ break;
+ }
+
+ // In multi-threaded replay, sleep first then start decoding and
+ // execution in a thread.
+ std::chrono::system_clock::time_point sleep_to =
+ replay_epoch +
+ std::chrono::microseconds(static_cast(std::llround(
+ 1.0 * (trace.ts - header_ts_) / options.fast_forward)));
+ if (sleep_to > std::chrono::system_clock::now()) {
+ std::this_thread::sleep_until(sleep_to);
+ }
+
+ if (trace_type == kTraceWrite || trace_type == kTraceGet ||
+ trace_type == kTraceIteratorSeek ||
+ trace_type == kTraceIteratorSeekForPrev ||
+ trace_type == kTraceMultiGet) {
+ std::unique_ptr ra(new ReplayerWorkerArg);
+ ra->trace_entry = std::move(trace);
+ ra->handler = exec_handler_.get();
+ ra->trace_file_version = trace_file_version_;
+ ra->error_cb = error_cb;
+ ra->result_cb = result_callback;
+ thread_pool.Schedule(&ReplayerImpl::BackgroundWork, ra.release(),
+ nullptr, nullptr);
+ } else {
+ // Skip unsupported traces.
+ if (result_callback != nullptr) {
+ result_callback(Status::NotSupported("Unsupported trace type."),
+ nullptr);
+ }
+ }
+ }
+
+ thread_pool.WaitForJobsAndJoinAllThreads();
+ if (!bg_s.ok()) {
+ s = bg_s;
+ }
+ }
+
+ if (s.IsIncomplete()) {
+ // Reaching EOF currently returns an Incomplete status.
+ // This could happen when a process is killed without calling the
+ // EndTrace() API.
+ // TODO: Add better error handling.
+ trace_end_ = true; + return Status::OK(); + } + return s; +} + +uint64_t ReplayerImpl::GetHeaderTimestamp() const { return header_ts_; } + +Status ReplayerImpl::ReadHeader(Trace* header) { + assert(header != nullptr); + Status s = trace_reader_->Reset(); + if (!s.ok()) { + return s; + } + std::string encoded_trace; + // Read the trace head + s = trace_reader_->Read(&encoded_trace); + if (!s.ok()) { + return s; + } + + return TracerHelper::DecodeHeader(encoded_trace, header); +} + +Status ReplayerImpl::ReadTrace(Trace* trace) { + assert(trace != nullptr); + std::string encoded_trace; + // We don't know if TraceReader is implemented thread-safe, so we protect the + // reading trace part with a mutex. The decoding part does not need to be + // protected since it's local. + { + std::lock_guard guard(mutex_); + Status s = trace_reader_->Read(&encoded_trace); + if (!s.ok()) { + return s; + } + } + return TracerHelper::DecodeTrace(encoded_trace, trace); +} + +void ReplayerImpl::BackgroundWork(void* arg) { + std::unique_ptr ra( + reinterpret_cast(arg)); + assert(ra != nullptr); + + std::unique_ptr record; + Status s = TracerHelper::DecodeTraceRecord(&(ra->trace_entry), + ra->trace_file_version, &record); + if (!s.ok()) { + // Stop the replay + if (ra->error_cb != nullptr) { + ra->error_cb(s, ra->trace_entry.ts); + } + // Report the result + if (ra->result_cb != nullptr) { + ra->result_cb(s, nullptr); + } + return; + } + + if (ra->result_cb == nullptr) { + s = record->Accept(ra->handler, nullptr); + } else { + std::unique_ptr res; + s = record->Accept(ra->handler, &res); + ra->result_cb(s, std::move(res)); + } + record.reset(); +} + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/trace/replayer_impl.h b/src/rocksdb/utilities/trace/replayer_impl.h new file mode 100644 index 000000000..367b0b51e --- /dev/null +++ b/src/rocksdb/utilities/trace/replayer_impl.h @@ -0,0 +1,86 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/status.h" +#include "rocksdb/trace_reader_writer.h" +#include "rocksdb/trace_record.h" +#include "rocksdb/trace_record_result.h" +#include "rocksdb/utilities/replayer.h" +#include "trace_replay/trace_replay.h" + +namespace ROCKSDB_NAMESPACE { + +class ReplayerImpl : public Replayer { + public: + ReplayerImpl(DB* db, const std::vector& handles, + std::unique_ptr&& reader); + ~ReplayerImpl() override; + + using Replayer::Prepare; + Status Prepare() override; + + using Replayer::Next; + Status Next(std::unique_ptr* record) override; + + using Replayer::Execute; + Status Execute(const std::unique_ptr& record, + std::unique_ptr* result) override; + + using Replayer::Replay; + Status Replay( + const ReplayOptions& options, + const std::function&&)>& + result_callback) override; + + using Replayer::GetHeaderTimestamp; + uint64_t GetHeaderTimestamp() const override; + + private: + Status ReadHeader(Trace* header); + Status ReadTrace(Trace* trace); + + // Generic function to execute a Trace in a thread pool. 
+ static void BackgroundWork(void* arg); + + std::unique_ptr trace_reader_; + std::mutex mutex_; + std::atomic prepared_; + std::atomic trace_end_; + uint64_t header_ts_; + std::unique_ptr exec_handler_; + Env* env_; + // When reading the trace header, the trace file version can be parsed. + // Replayer will use different decode method to get the trace content based + // on different trace file version. + int trace_file_version_; +}; + +// Arguments passed to BackgroundWork() for replaying in a thread pool. +struct ReplayerWorkerArg { + Trace trace_entry; + int trace_file_version; + // Handler to execute TraceRecord. + TraceRecord::Handler* handler; + // Callback function to report the error status and the timestamp of the + // TraceRecord (not the start/end timestamp of executing the TraceRecord). + std::function error_cb; + // Callback function to report the trace execution status and operation + // execution status/result(s). + std::function&&)> result_cb; +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/lock_manager.cc b/src/rocksdb/utilities/transactions/lock/lock_manager.cc new file mode 100644 index 000000000..df16b32ad --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/lock_manager.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "utilities/transactions/lock/lock_manager.h" + +#include "utilities/transactions/lock/point/point_lock_manager.h" + +namespace ROCKSDB_NAMESPACE { + +std::shared_ptr NewLockManager(PessimisticTransactionDB* db, + const TransactionDBOptions& opt) { + assert(db); + if (opt.lock_mgr_handle) { + // A custom lock manager was provided in options + auto mgr = opt.lock_mgr_handle->getLockManager(); + return std::shared_ptr(opt.lock_mgr_handle, mgr); + } else { + // Use a point lock manager by default + return std::shared_ptr(new PointLockManager(db, opt)); + } +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/lock_manager.h b/src/rocksdb/utilities/transactions/lock/lock_manager.h new file mode 100644 index 000000000..a5ce1948c --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/lock_manager.h @@ -0,0 +1,82 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include "rocksdb/types.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/transaction_db.h" +#include "utilities/transactions/lock/lock_tracker.h" +#include "utilities/transactions/pessimistic_transaction.h" + +namespace ROCKSDB_NAMESPACE { + +class PessimisticTransactionDB; + +class LockManager { + public: + virtual ~LockManager() {} + + // Whether supports locking a specific key. + virtual bool IsPointLockSupported() const = 0; + + // Whether supports locking a range of keys. + virtual bool IsRangeLockSupported() const = 0; + + // Locks acquired through this LockManager should be tracked by + // the LockTrackers created through the returned factory. 
+ virtual const LockTrackerFactory& GetLockTrackerFactory() const = 0; + + // Enable locking for the specified column family. + // Caller should guarantee that this column family is not already enabled. + virtual void AddColumnFamily(const ColumnFamilyHandle* cf) = 0; + + // Disable locking for the specified column family. + // Caller should guarantee that this column family is no longer used. + virtual void RemoveColumnFamily(const ColumnFamilyHandle* cf) = 0; + + // Attempt to lock a key or a key range. If OK status is returned, the caller + // is responsible for calling UnLock() on this key. + virtual Status TryLock(PessimisticTransaction* txn, + ColumnFamilyId column_family_id, + const std::string& key, Env* env, bool exclusive) = 0; + // The range [start, end] are inclusive at both sides. + virtual Status TryLock(PessimisticTransaction* txn, + ColumnFamilyId column_family_id, const Endpoint& start, + const Endpoint& end, Env* env, bool exclusive) = 0; + + // Unlock a key or a range locked by TryLock(). txn must be the same + // Transaction that locked this key. + virtual void UnLock(PessimisticTransaction* txn, const LockTracker& tracker, + Env* env) = 0; + virtual void UnLock(PessimisticTransaction* txn, + ColumnFamilyId column_family_id, const std::string& key, + Env* env) = 0; + virtual void UnLock(PessimisticTransaction* txn, + ColumnFamilyId column_family_id, const Endpoint& start, + const Endpoint& end, Env* env) = 0; + + using PointLockStatus = std::unordered_multimap; + virtual PointLockStatus GetPointLockStatus() = 0; + + using RangeLockStatus = + std::unordered_multimap; + virtual RangeLockStatus GetRangeLockStatus() = 0; + + virtual std::vector GetDeadlockInfoBuffer() = 0; + + virtual void Resize(uint32_t new_size) = 0; +}; + +// LockManager should always be constructed through this factory method, +// instead of constructing through concrete implementations' constructor. +// Caller owns the returned pointer. +std::shared_ptr NewLockManager(PessimisticTransactionDB* db, + const TransactionDBOptions& opt); + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/lock_tracker.h b/src/rocksdb/utilities/transactions/lock/lock_tracker.h new file mode 100644 index 000000000..5fa228a82 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/lock_tracker.h @@ -0,0 +1,209 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" +#include "rocksdb/utilities/transaction_db.h" + +namespace ROCKSDB_NAMESPACE { + +// Request for locking a single key. +struct PointLockRequest { + // The id of the key's column family. + ColumnFamilyId column_family_id = 0; + // The key to lock. + std::string key; + // The sequence number from which there is no concurrent update to key. + SequenceNumber seq = 0; + // Whether the lock is acquired only for read. + bool read_only = false; + // Whether the lock is in exclusive mode. + bool exclusive = true; +}; + +// Request for locking a range of keys. +struct RangeLockRequest { + // The id of the key's column family. 
+ ColumnFamilyId column_family_id; + + // The range to be locked + Endpoint start_endp; + Endpoint end_endp; +}; + +struct PointLockStatus { + // Whether the key is locked. + bool locked = false; + // Whether the key is locked in exclusive mode. + bool exclusive = true; + // The sequence number in the tracked PointLockRequest. + SequenceNumber seq = 0; +}; + +// Return status when calling LockTracker::Untrack. +enum class UntrackStatus { + // The lock is not tracked at all, so no lock to untrack. + NOT_TRACKED, + // The lock is untracked but not removed from the tracker. + UNTRACKED, + // The lock is removed from the tracker. + REMOVED, +}; + +// Tracks the lock requests. +// In PessimisticTransaction, it tracks the locks acquired through LockMgr; +// In OptimisticTransaction, since there is no LockMgr, it tracks the lock +// intention. Not thread-safe. +class LockTracker { + public: + virtual ~LockTracker() {} + + // Whether supports locking a specific key. + virtual bool IsPointLockSupported() const = 0; + + // Whether supports locking a range of keys. + virtual bool IsRangeLockSupported() const = 0; + + // Tracks the acquirement of a lock on key. + // + // If this method is not supported, leave it as a no-op. + virtual void Track(const PointLockRequest& /*lock_request*/) = 0; + + // Untracks the lock on a key. + // seq and exclusive in lock_request are not used. + // + // If this method is not supported, leave it as a no-op and + // returns NOT_TRACKED. + virtual UntrackStatus Untrack(const PointLockRequest& /*lock_request*/) = 0; + + // Counterpart of Track(const PointLockRequest&) for RangeLockRequest. + virtual void Track(const RangeLockRequest& /*lock_request*/) = 0; + + // Counterpart of Untrack(const PointLockRequest&) for RangeLockRequest. + virtual UntrackStatus Untrack(const RangeLockRequest& /*lock_request*/) = 0; + + // Merges lock requests tracked in the specified tracker into the current + // tracker. + // + // E.g. for point lock, if a key in tracker is not yet tracked, + // track this new key; otherwise, merge the tracked information of the key + // such as lock's exclusiveness, read/write statistics. + // + // If this method is not supported, leave it as a no-op. + // + // REQUIRED: the specified tracker must be of the same concrete class type as + // the current tracker. + virtual void Merge(const LockTracker& /*tracker*/) = 0; + + // This is a reverse operation of Merge. + // + // E.g. for point lock, if a key exists in both current and the sepcified + // tracker, then subtract the information (such as read/write statistics) of + // the key in the specified tracker from the current tracker. + // + // If this method is not supported, leave it as a no-op. + // + // REQUIRED: + // The specified tracker must be of the same concrete class type as + // the current tracker. + // The tracked locks in the specified tracker must be a subset of those + // tracked by the current tracker. + virtual void Subtract(const LockTracker& /*tracker*/) = 0; + + // Clears all tracked locks. + virtual void Clear() = 0; + + // Gets the new locks (excluding the locks that have been tracked before the + // save point) tracked since the specified save point, the result is stored + // in an internally constructed LockTracker and returned. + // + // save_point_tracker is the tracker used by a SavePoint to track locks + // tracked after creating the SavePoint. + // + // The implementation should document whether point lock, or range lock, or + // both are considered in this method. 
+ // If this method is not supported, returns nullptr. + // + // REQUIRED: + // The save_point_tracker must be of the same concrete class type as the + // current tracker. + // The tracked locks in the specified tracker must be a subset of those + // tracked by the current tracker. + virtual LockTracker* GetTrackedLocksSinceSavePoint( + const LockTracker& /*save_point_tracker*/) const = 0; + + // Gets lock related information of the key. + // + // If point lock is not supported, always returns LockStatus with + // locked=false. + virtual PointLockStatus GetPointLockStatus( + ColumnFamilyId /*column_family_id*/, + const std::string& /*key*/) const = 0; + + // Gets number of tracked point locks. + // + // If point lock is not supported, always returns 0. + virtual uint64_t GetNumPointLocks() const = 0; + + class ColumnFamilyIterator { + public: + virtual ~ColumnFamilyIterator() {} + + // Whether there are remaining column families. + virtual bool HasNext() const = 0; + + // Gets next column family id. + // + // If HasNext is false, calling this method has undefined behavior. + virtual ColumnFamilyId Next() = 0; + }; + + // Gets an iterator for column families. + // + // Returned iterator must not be nullptr. + // If there is no column family to iterate, + // returns an empty non-null iterator. + // Caller owns the returned pointer. + virtual ColumnFamilyIterator* GetColumnFamilyIterator() const = 0; + + class KeyIterator { + public: + virtual ~KeyIterator() {} + + // Whether there are remaining keys. + virtual bool HasNext() const = 0; + + // Gets the next key. + // + // If HasNext is false, calling this method has undefined behavior. + virtual const std::string& Next() = 0; + }; + + // Gets an iterator for keys with tracked point locks in the column family. + // + // The column family must exist. + // Returned iterator must not be nullptr. + // Caller owns the returned pointer. + virtual KeyIterator* GetKeyIterator( + ColumnFamilyId /*column_family_id*/) const = 0; +}; + +// LockTracker should always be constructed through this factory. +// Each LockManager owns a LockTrackerFactory. +class LockTrackerFactory { + public: + // Caller owns the returned pointer. + virtual LockTracker* Create() const = 0; + virtual ~LockTrackerFactory() {} +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc new file mode 100644 index 000000000..b362a164d --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc @@ -0,0 +1,721 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
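// The LockTracker interface above returns raw-pointer iterators that the
// caller owns. A minimal sketch of walking every tracked (column family, key)
// pair under that contract; `tracker` may be any concrete LockTracker (for
// example one produced by a LockTrackerFactory), the function name is
// illustrative, and lock_tracker.h is assumed to be included.
#include <memory>
#include <string>
#include <utility>
#include <vector>

std::vector<std::pair<ROCKSDB_NAMESPACE::ColumnFamilyId, std::string>>
CollectTrackedPointLocks(const ROCKSDB_NAMESPACE::LockTracker& tracker) {
  std::vector<std::pair<ROCKSDB_NAMESPACE::ColumnFamilyId, std::string>> out;
  // Wrapping the raw iterator pointers in std::unique_ptr satisfies the
  // "caller owns the returned pointer" contract documented above.
  std::unique_ptr<ROCKSDB_NAMESPACE::LockTracker::ColumnFamilyIterator> cf_it(
      tracker.GetColumnFamilyIterator());
  while (cf_it->HasNext()) {
    ROCKSDB_NAMESPACE::ColumnFamilyId cf = cf_it->Next();
    std::unique_ptr<ROCKSDB_NAMESPACE::LockTracker::KeyIterator> key_it(
        tracker.GetKeyIterator(cf));
    while (key_it->HasNext()) {
      out.emplace_back(cf, key_it->Next());
    }
  }
  return out;
}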
+ +#ifndef ROCKSDB_LITE + +#include "utilities/transactions/lock/point/point_lock_manager.h" + +#include +#include +#include + +#include "monitoring/perf_context_imp.h" +#include "rocksdb/slice.h" +#include "rocksdb/utilities/transaction_db_mutex.h" +#include "test_util/sync_point.h" +#include "util/cast_util.h" +#include "util/hash.h" +#include "util/thread_local.h" +#include "utilities/transactions/pessimistic_transaction_db.h" +#include "utilities/transactions/transaction_db_mutex_impl.h" + +namespace ROCKSDB_NAMESPACE { + +struct LockInfo { + bool exclusive; + autovector txn_ids; + + // Transaction locks are not valid after this time in us + uint64_t expiration_time; + + LockInfo(TransactionID id, uint64_t time, bool ex) + : exclusive(ex), expiration_time(time) { + txn_ids.push_back(id); + } + LockInfo(const LockInfo& lock_info) + : exclusive(lock_info.exclusive), + txn_ids(lock_info.txn_ids), + expiration_time(lock_info.expiration_time) {} + void operator=(const LockInfo& lock_info) { + exclusive = lock_info.exclusive; + txn_ids = lock_info.txn_ids; + expiration_time = lock_info.expiration_time; + } + DECLARE_DEFAULT_MOVES(LockInfo); +}; + +struct LockMapStripe { + explicit LockMapStripe(std::shared_ptr factory) { + stripe_mutex = factory->AllocateMutex(); + stripe_cv = factory->AllocateCondVar(); + assert(stripe_mutex); + assert(stripe_cv); + } + + // Mutex must be held before modifying keys map + std::shared_ptr stripe_mutex; + + // Condition Variable per stripe for waiting on a lock + std::shared_ptr stripe_cv; + + // Locked keys mapped to the info about the transactions that locked them. + // TODO(agiardullo): Explore performance of other data structures. + UnorderedMap keys; +}; + +// Map of #num_stripes LockMapStripes +struct LockMap { + explicit LockMap(size_t num_stripes, + std::shared_ptr factory) + : num_stripes_(num_stripes) { + lock_map_stripes_.reserve(num_stripes); + for (size_t i = 0; i < num_stripes; i++) { + LockMapStripe* stripe = new LockMapStripe(factory); + lock_map_stripes_.push_back(stripe); + } + } + + ~LockMap() { + for (auto stripe : lock_map_stripes_) { + delete stripe; + } + } + + // Number of sepearate LockMapStripes to create, each with their own Mutex + const size_t num_stripes_; + + // Count of keys that are currently locked in this column family. + // (Only maintained if PointLockManager::max_num_locks_ is positive.) + std::atomic lock_cnt{0}; + + std::vector lock_map_stripes_; + + size_t GetStripe(const std::string& key) const; +}; + +namespace { +void UnrefLockMapsCache(void* ptr) { + // Called when a thread exits or a ThreadLocalPtr gets destroyed. + auto lock_maps_cache = + static_cast>*>(ptr); + delete lock_maps_cache; +} +} // anonymous namespace + +PointLockManager::PointLockManager(PessimisticTransactionDB* txn_db, + const TransactionDBOptions& opt) + : txn_db_impl_(txn_db), + default_num_stripes_(opt.num_stripes), + max_num_locks_(opt.max_num_locks), + lock_maps_cache_(new ThreadLocalPtr(&UnrefLockMapsCache)), + dlock_buffer_(opt.max_num_deadlocks), + mutex_factory_(opt.custom_mutex_factory + ? 
opt.custom_mutex_factory + : std::make_shared()) {} + +size_t LockMap::GetStripe(const std::string& key) const { + assert(num_stripes_ > 0); + return FastRange64(GetSliceNPHash64(key), num_stripes_); +} + +void PointLockManager::AddColumnFamily(const ColumnFamilyHandle* cf) { + InstrumentedMutexLock l(&lock_map_mutex_); + + if (lock_maps_.find(cf->GetID()) == lock_maps_.end()) { + lock_maps_.emplace(cf->GetID(), std::make_shared( + default_num_stripes_, mutex_factory_)); + } else { + // column_family already exists in lock map + assert(false); + } +} + +void PointLockManager::RemoveColumnFamily(const ColumnFamilyHandle* cf) { + // Remove lock_map for this column family. Since the lock map is stored + // as a shared ptr, concurrent transactions can still keep using it + // until they release their references to it. + { + InstrumentedMutexLock l(&lock_map_mutex_); + + auto lock_maps_iter = lock_maps_.find(cf->GetID()); + if (lock_maps_iter == lock_maps_.end()) { + return; + } + + lock_maps_.erase(lock_maps_iter); + } // lock_map_mutex_ + + // Clear all thread-local caches + autovector local_caches; + lock_maps_cache_->Scrape(&local_caches, nullptr); + for (auto cache : local_caches) { + delete static_cast(cache); + } +} + +// Look up the LockMap std::shared_ptr for a given column_family_id. +// Note: The LockMap is only valid as long as the caller is still holding on +// to the returned std::shared_ptr. +std::shared_ptr PointLockManager::GetLockMap( + ColumnFamilyId column_family_id) { + // First check thread-local cache + if (lock_maps_cache_->Get() == nullptr) { + lock_maps_cache_->Reset(new LockMaps()); + } + + auto lock_maps_cache = static_cast(lock_maps_cache_->Get()); + + auto lock_map_iter = lock_maps_cache->find(column_family_id); + if (lock_map_iter != lock_maps_cache->end()) { + // Found lock map for this column family. + return lock_map_iter->second; + } + + // Not found in local cache, grab mutex and check shared LockMaps + InstrumentedMutexLock l(&lock_map_mutex_); + + lock_map_iter = lock_maps_.find(column_family_id); + if (lock_map_iter == lock_maps_.end()) { + return std::shared_ptr(nullptr); + } else { + // Found lock map. Store in thread-local cache and return. + std::shared_ptr& lock_map = lock_map_iter->second; + lock_maps_cache->insert({column_family_id, lock_map}); + + return lock_map; + } +} + +// Returns true if this lock has expired and can be acquired by another +// transaction. +// If false, sets *expire_time to the expiration time of the lock according +// to Env->GetMicros() or 0 if no expiration. 
+bool PointLockManager::IsLockExpired(TransactionID txn_id, + const LockInfo& lock_info, Env* env, + uint64_t* expire_time) { + if (lock_info.expiration_time == 0) { + *expire_time = 0; + return false; + } + + auto now = env->NowMicros(); + bool expired = lock_info.expiration_time <= now; + if (!expired) { + // return how many microseconds until lock will be expired + *expire_time = lock_info.expiration_time; + } else { + for (auto id : lock_info.txn_ids) { + if (txn_id == id) { + continue; + } + + bool success = txn_db_impl_->TryStealingExpiredTransactionLocks(id); + if (!success) { + expired = false; + *expire_time = 0; + break; + } + } + } + + return expired; +} + +Status PointLockManager::TryLock(PessimisticTransaction* txn, + ColumnFamilyId column_family_id, + const std::string& key, Env* env, + bool exclusive) { + // Lookup lock map for this column family id + std::shared_ptr lock_map_ptr = GetLockMap(column_family_id); + LockMap* lock_map = lock_map_ptr.get(); + if (lock_map == nullptr) { + char msg[255]; + snprintf(msg, sizeof(msg), "Column family id not found: %" PRIu32, + column_family_id); + + return Status::InvalidArgument(msg); + } + + // Need to lock the mutex for the stripe that this key hashes to + size_t stripe_num = lock_map->GetStripe(key); + assert(lock_map->lock_map_stripes_.size() > stripe_num); + LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num); + + LockInfo lock_info(txn->GetID(), txn->GetExpirationTime(), exclusive); + int64_t timeout = txn->GetLockTimeout(); + + return AcquireWithTimeout(txn, lock_map, stripe, column_family_id, key, env, + timeout, lock_info); +} + +// Helper function for TryLock(). +Status PointLockManager::AcquireWithTimeout( + PessimisticTransaction* txn, LockMap* lock_map, LockMapStripe* stripe, + ColumnFamilyId column_family_id, const std::string& key, Env* env, + int64_t timeout, const LockInfo& lock_info) { + Status result; + uint64_t end_time = 0; + + if (timeout > 0) { + uint64_t start_time = env->NowMicros(); + end_time = start_time + timeout; + } + + if (timeout < 0) { + // If timeout is negative, we wait indefinitely to acquire the lock + result = stripe->stripe_mutex->Lock(); + } else { + result = stripe->stripe_mutex->TryLockFor(timeout); + } + + if (!result.ok()) { + // failed to acquire mutex + return result; + } + + // Acquire lock if we are able to + uint64_t expire_time_hint = 0; + autovector wait_ids; + result = AcquireLocked(lock_map, stripe, key, env, lock_info, + &expire_time_hint, &wait_ids); + + if (!result.ok() && timeout != 0) { + PERF_TIMER_GUARD(key_lock_wait_time); + PERF_COUNTER_ADD(key_lock_wait_count, 1); + // If we weren't able to acquire the lock, we will keep retrying as long + // as the timeout allows. + bool timed_out = false; + do { + // Decide how long to wait + int64_t cv_end_time = -1; + if (expire_time_hint > 0 && end_time > 0) { + cv_end_time = std::min(expire_time_hint, end_time); + } else if (expire_time_hint > 0) { + cv_end_time = expire_time_hint; + } else if (end_time > 0) { + cv_end_time = end_time; + } + + assert(result.IsBusy() || wait_ids.size() != 0); + + // We are dependent on a transaction to finish, so perform deadlock + // detection. 
+ if (wait_ids.size() != 0) { + if (txn->IsDeadlockDetect()) { + if (IncrementWaiters(txn, wait_ids, key, column_family_id, + lock_info.exclusive, env)) { + result = Status::Busy(Status::SubCode::kDeadlock); + stripe->stripe_mutex->UnLock(); + return result; + } + } + txn->SetWaitingTxn(wait_ids, column_family_id, &key); + } + + TEST_SYNC_POINT("PointLockManager::AcquireWithTimeout:WaitingTxn"); + if (cv_end_time < 0) { + // Wait indefinitely + result = stripe->stripe_cv->Wait(stripe->stripe_mutex); + } else { + uint64_t now = env->NowMicros(); + if (static_cast(cv_end_time) > now) { + result = stripe->stripe_cv->WaitFor(stripe->stripe_mutex, + cv_end_time - now); + } + } + + if (wait_ids.size() != 0) { + txn->ClearWaitingTxn(); + if (txn->IsDeadlockDetect()) { + DecrementWaiters(txn, wait_ids); + } + } + + if (result.IsTimedOut()) { + timed_out = true; + // Even though we timed out, we will still make one more attempt to + // acquire lock below (it is possible the lock expired and we + // were never signaled). + } + + if (result.ok() || result.IsTimedOut()) { + result = AcquireLocked(lock_map, stripe, key, env, lock_info, + &expire_time_hint, &wait_ids); + } + } while (!result.ok() && !timed_out); + } + + stripe->stripe_mutex->UnLock(); + + return result; +} + +void PointLockManager::DecrementWaiters( + const PessimisticTransaction* txn, + const autovector& wait_ids) { + std::lock_guard lock(wait_txn_map_mutex_); + DecrementWaitersImpl(txn, wait_ids); +} + +void PointLockManager::DecrementWaitersImpl( + const PessimisticTransaction* txn, + const autovector& wait_ids) { + auto id = txn->GetID(); + assert(wait_txn_map_.Contains(id)); + wait_txn_map_.Delete(id); + + for (auto wait_id : wait_ids) { + rev_wait_txn_map_.Get(wait_id)--; + if (rev_wait_txn_map_.Get(wait_id) == 0) { + rev_wait_txn_map_.Delete(wait_id); + } + } +} + +bool PointLockManager::IncrementWaiters( + const PessimisticTransaction* txn, + const autovector& wait_ids, const std::string& key, + const uint32_t& cf_id, const bool& exclusive, Env* const env) { + auto id = txn->GetID(); + std::vector queue_parents( + static_cast(txn->GetDeadlockDetectDepth())); + std::vector queue_values( + static_cast(txn->GetDeadlockDetectDepth())); + std::lock_guard lock(wait_txn_map_mutex_); + assert(!wait_txn_map_.Contains(id)); + + wait_txn_map_.Insert(id, {wait_ids, cf_id, exclusive, key}); + + for (auto wait_id : wait_ids) { + if (rev_wait_txn_map_.Contains(wait_id)) { + rev_wait_txn_map_.Get(wait_id)++; + } else { + rev_wait_txn_map_.Insert(wait_id, 1); + } + } + + // No deadlock if nobody is waiting on self. + if (!rev_wait_txn_map_.Contains(id)) { + return false; + } + + const auto* next_ids = &wait_ids; + int parent = -1; + int64_t deadlock_time = 0; + for (int tail = 0, head = 0; head < txn->GetDeadlockDetectDepth(); head++) { + int i = 0; + if (next_ids) { + for (; i < static_cast(next_ids->size()) && + tail + i < txn->GetDeadlockDetectDepth(); + i++) { + queue_values[tail + i] = (*next_ids)[i]; + queue_parents[tail + i] = parent; + } + tail += i; + } + + // No more items in the list, meaning no deadlock. 
+ if (tail == head) { + return false; + } + + auto next = queue_values[head]; + if (next == id) { + std::vector path; + while (head != -1) { + assert(wait_txn_map_.Contains(queue_values[head])); + + auto extracted_info = wait_txn_map_.Get(queue_values[head]); + path.push_back({queue_values[head], extracted_info.m_cf_id, + extracted_info.m_exclusive, + extracted_info.m_waiting_key}); + head = queue_parents[head]; + } + if (!env->GetCurrentTime(&deadlock_time).ok()) { + /* + TODO(AR) this preserves the current behaviour whilst checking the + status of env->GetCurrentTime to ensure that ASSERT_STATUS_CHECKED + passes. Should we instead raise an error if !ok() ? + */ + deadlock_time = 0; + } + std::reverse(path.begin(), path.end()); + dlock_buffer_.AddNewPath(DeadlockPath(path, deadlock_time)); + deadlock_time = 0; + DecrementWaitersImpl(txn, wait_ids); + return true; + } else if (!wait_txn_map_.Contains(next)) { + next_ids = nullptr; + continue; + } else { + parent = head; + next_ids = &(wait_txn_map_.Get(next).m_neighbors); + } + } + + // Wait cycle too big, just assume deadlock. + if (!env->GetCurrentTime(&deadlock_time).ok()) { + /* + TODO(AR) this preserves the current behaviour whilst checking the status + of env->GetCurrentTime to ensure that ASSERT_STATUS_CHECKED passes. + Should we instead raise an error if !ok() ? + */ + deadlock_time = 0; + } + dlock_buffer_.AddNewPath(DeadlockPath(deadlock_time, true)); + DecrementWaitersImpl(txn, wait_ids); + return true; +} + +// Try to lock this key after we have acquired the mutex. +// Sets *expire_time to the expiration time in microseconds +// or 0 if no expiration. +// REQUIRED: Stripe mutex must be held. +Status PointLockManager::AcquireLocked(LockMap* lock_map, LockMapStripe* stripe, + const std::string& key, Env* env, + const LockInfo& txn_lock_info, + uint64_t* expire_time, + autovector* txn_ids) { + assert(txn_lock_info.txn_ids.size() == 1); + + Status result; + // Check if this key is already locked + auto stripe_iter = stripe->keys.find(key); + if (stripe_iter != stripe->keys.end()) { + // Lock already held + LockInfo& lock_info = stripe_iter->second; + assert(lock_info.txn_ids.size() == 1 || !lock_info.exclusive); + + if (lock_info.exclusive || txn_lock_info.exclusive) { + if (lock_info.txn_ids.size() == 1 && + lock_info.txn_ids[0] == txn_lock_info.txn_ids[0]) { + // The list contains one txn and we're it, so just take it. + lock_info.exclusive = txn_lock_info.exclusive; + lock_info.expiration_time = txn_lock_info.expiration_time; + } else { + // Check if it's expired. Skips over txn_lock_info.txn_ids[0] in case + // it's there for a shared lock with multiple holders which was not + // caught in the first case. + if (IsLockExpired(txn_lock_info.txn_ids[0], lock_info, env, + expire_time)) { + // lock is expired, can steal it + lock_info.txn_ids = txn_lock_info.txn_ids; + lock_info.exclusive = txn_lock_info.exclusive; + lock_info.expiration_time = txn_lock_info.expiration_time; + // lock_cnt does not change + } else { + result = Status::TimedOut(Status::SubCode::kLockTimeout); + *txn_ids = lock_info.txn_ids; + } + } + } else { + // We are requesting shared access to a shared lock, so just grant it. + lock_info.txn_ids.push_back(txn_lock_info.txn_ids[0]); + // Using std::max means that expiration time never goes down even when + // a transaction is removed from the list. The correct solution would be + // to track expiry for every transaction, but this would also work for + // now. 
+ lock_info.expiration_time = + std::max(lock_info.expiration_time, txn_lock_info.expiration_time); + } + } else { // Lock not held. + // Check lock limit + if (max_num_locks_ > 0 && + lock_map->lock_cnt.load(std::memory_order_acquire) >= max_num_locks_) { + result = Status::Busy(Status::SubCode::kLockLimit); + } else { + // acquire lock + stripe->keys.emplace(key, txn_lock_info); + + // Maintain lock count if there is a limit on the number of locks + if (max_num_locks_) { + lock_map->lock_cnt++; + } + } + } + + return result; +} + +void PointLockManager::UnLockKey(PessimisticTransaction* txn, + const std::string& key, LockMapStripe* stripe, + LockMap* lock_map, Env* env) { +#ifdef NDEBUG + (void)env; +#endif + TransactionID txn_id = txn->GetID(); + + auto stripe_iter = stripe->keys.find(key); + if (stripe_iter != stripe->keys.end()) { + auto& txns = stripe_iter->second.txn_ids; + auto txn_it = std::find(txns.begin(), txns.end(), txn_id); + // Found the key we locked. unlock it. + if (txn_it != txns.end()) { + if (txns.size() == 1) { + stripe->keys.erase(stripe_iter); + } else { + auto last_it = txns.end() - 1; + if (txn_it != last_it) { + *txn_it = *last_it; + } + txns.pop_back(); + } + + if (max_num_locks_ > 0) { + // Maintain lock count if there is a limit on the number of locks. + assert(lock_map->lock_cnt.load(std::memory_order_relaxed) > 0); + lock_map->lock_cnt--; + } + } + } else { + // This key is either not locked or locked by someone else. This should + // only happen if the unlocking transaction has expired. + assert(txn->GetExpirationTime() > 0 && + txn->GetExpirationTime() < env->NowMicros()); + } +} + +void PointLockManager::UnLock(PessimisticTransaction* txn, + ColumnFamilyId column_family_id, + const std::string& key, Env* env) { + std::shared_ptr lock_map_ptr = GetLockMap(column_family_id); + LockMap* lock_map = lock_map_ptr.get(); + if (lock_map == nullptr) { + // Column Family must have been dropped. + return; + } + + // Lock the mutex for the stripe that this key hashes to + size_t stripe_num = lock_map->GetStripe(key); + assert(lock_map->lock_map_stripes_.size() > stripe_num); + LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num); + + stripe->stripe_mutex->Lock().PermitUncheckedError(); + UnLockKey(txn, key, stripe, lock_map, env); + stripe->stripe_mutex->UnLock(); + + // Signal waiting threads to retry locking + stripe->stripe_cv->NotifyAll(); +} + +void PointLockManager::UnLock(PessimisticTransaction* txn, + const LockTracker& tracker, Env* env) { + std::unique_ptr cf_it( + tracker.GetColumnFamilyIterator()); + assert(cf_it != nullptr); + while (cf_it->HasNext()) { + ColumnFamilyId cf = cf_it->Next(); + std::shared_ptr lock_map_ptr = GetLockMap(cf); + LockMap* lock_map = lock_map_ptr.get(); + if (!lock_map) { + // Column Family must have been dropped. 
+ return; + } + + // Bucket keys by lock_map_ stripe + UnorderedMap> keys_by_stripe( + lock_map->num_stripes_); + std::unique_ptr key_it( + tracker.GetKeyIterator(cf)); + assert(key_it != nullptr); + while (key_it->HasNext()) { + const std::string& key = key_it->Next(); + size_t stripe_num = lock_map->GetStripe(key); + keys_by_stripe[stripe_num].push_back(&key); + } + + // For each stripe, grab the stripe mutex and unlock all keys in this stripe + for (auto& stripe_iter : keys_by_stripe) { + size_t stripe_num = stripe_iter.first; + auto& stripe_keys = stripe_iter.second; + + assert(lock_map->lock_map_stripes_.size() > stripe_num); + LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num); + + stripe->stripe_mutex->Lock().PermitUncheckedError(); + + for (const std::string* key : stripe_keys) { + UnLockKey(txn, *key, stripe, lock_map, env); + } + + stripe->stripe_mutex->UnLock(); + + // Signal waiting threads to retry locking + stripe->stripe_cv->NotifyAll(); + } + } +} + +PointLockManager::PointLockStatus PointLockManager::GetPointLockStatus() { + PointLockStatus data; + // Lock order here is important. The correct order is lock_map_mutex_, then + // for every column family ID in ascending order lock every stripe in + // ascending order. + InstrumentedMutexLock l(&lock_map_mutex_); + + std::vector cf_ids; + for (const auto& map : lock_maps_) { + cf_ids.push_back(map.first); + } + std::sort(cf_ids.begin(), cf_ids.end()); + + for (auto i : cf_ids) { + const auto& stripes = lock_maps_[i]->lock_map_stripes_; + // Iterate and lock all stripes in ascending order. + for (const auto& j : stripes) { + j->stripe_mutex->Lock().PermitUncheckedError(); + for (const auto& it : j->keys) { + struct KeyLockInfo info; + info.exclusive = it.second.exclusive; + info.key = it.first; + for (const auto& id : it.second.txn_ids) { + info.ids.push_back(id); + } + data.insert({i, info}); + } + } + } + + // Unlock everything. Unlocking order is not important. + for (auto i : cf_ids) { + const auto& stripes = lock_maps_[i]->lock_map_stripes_; + for (const auto& j : stripes) { + j->stripe_mutex->UnLock(); + } + } + + return data; +} + +std::vector PointLockManager::GetDeadlockInfoBuffer() { + return dlock_buffer_.PrepareBuffer(); +} + +void PointLockManager::Resize(uint32_t target_size) { + dlock_buffer_.Resize(target_size); +} + +PointLockManager::RangeLockStatus PointLockManager::GetRangeLockStatus() { + return {}; +} + +Status PointLockManager::TryLock(PessimisticTransaction* /* txn */, + ColumnFamilyId /* cf_id */, + const Endpoint& /* start */, + const Endpoint& /* end */, Env* /* env */, + bool /* exclusive */) { + return Status::NotSupported( + "PointLockManager does not support range locking"); +} + +void PointLockManager::UnLock(PessimisticTransaction* /* txn */, + ColumnFamilyId /* cf_id */, + const Endpoint& /* start */, + const Endpoint& /* end */, Env* /* env */) { + // no-op +} + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/point/point_lock_manager.h b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager.h new file mode 100644 index 000000000..eeb34f3be --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager.h @@ -0,0 +1,224 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
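// A reduced sketch of the lock striping used by LockMap::GetStripe() and the
// per-stripe bucketing in PointLockManager::UnLock(txn, tracker, env) above:
// keys are hashed onto a fixed number of stripes so that unrelated keys rarely
// contend on the same stripe mutex, and batch operations take each stripe
// mutex only once. std::hash stands in for GetSliceNPHash64/FastRange64, and
// all names here are illustrative.
#include <cstdint>
#include <functional>
#include <string>
#include <unordered_map>
#include <vector>

size_t StripeFor(const std::string& key, size_t num_stripes) {
  return std::hash<std::string>{}(key) % num_stripes;
}

std::unordered_map<size_t, std::vector<const std::string*>> BucketByStripe(
    const std::vector<std::string>& keys, size_t num_stripes) {
  std::unordered_map<size_t, std::vector<const std::string*>> buckets;
  for (const std::string& key : keys) {
    // Each key is later unlocked under the mutex of the stripe it hashes to.
    buckets[StripeFor(key, num_stripes)].push_back(&key);
  }
  return buckets;
}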
+ +#pragma once +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include +#include + +#include "monitoring/instrumented_mutex.h" +#include "rocksdb/utilities/transaction.h" +#include "util/autovector.h" +#include "util/hash_containers.h" +#include "util/hash_map.h" +#include "util/thread_local.h" +#include "utilities/transactions/lock/lock_manager.h" +#include "utilities/transactions/lock/point/point_lock_tracker.h" + +namespace ROCKSDB_NAMESPACE { + +class ColumnFamilyHandle; +struct LockInfo; +struct LockMap; +struct LockMapStripe; + +template +class DeadlockInfoBufferTempl { + private: + std::vector paths_buffer_; + uint32_t buffer_idx_; + std::mutex paths_buffer_mutex_; + + std::vector Normalize() { + auto working = paths_buffer_; + + if (working.empty()) { + return working; + } + + // Next write occurs at a nonexistent path's slot + if (paths_buffer_[buffer_idx_].empty()) { + working.resize(buffer_idx_); + } else { + std::rotate(working.begin(), working.begin() + buffer_idx_, + working.end()); + } + + return working; + } + + public: + explicit DeadlockInfoBufferTempl(uint32_t n_latest_dlocks) + : paths_buffer_(n_latest_dlocks), buffer_idx_(0) {} + + void AddNewPath(Path path) { + std::lock_guard lock(paths_buffer_mutex_); + + if (paths_buffer_.empty()) { + return; + } + + paths_buffer_[buffer_idx_] = std::move(path); + buffer_idx_ = (buffer_idx_ + 1) % paths_buffer_.size(); + } + + void Resize(uint32_t target_size) { + std::lock_guard lock(paths_buffer_mutex_); + + paths_buffer_ = Normalize(); + + // Drop the deadlocks that will no longer be needed ater the normalize + if (target_size < paths_buffer_.size()) { + paths_buffer_.erase( + paths_buffer_.begin(), + paths_buffer_.begin() + (paths_buffer_.size() - target_size)); + buffer_idx_ = 0; + } + // Resize the buffer to the target size and restore the buffer's idx + else { + auto prev_size = paths_buffer_.size(); + paths_buffer_.resize(target_size); + buffer_idx_ = (uint32_t)prev_size; + } + } + + std::vector PrepareBuffer() { + std::lock_guard lock(paths_buffer_mutex_); + + // Reversing the normalized vector returns the latest deadlocks first + auto working = Normalize(); + std::reverse(working.begin(), working.end()); + + return working; + } +}; + +using DeadlockInfoBuffer = DeadlockInfoBufferTempl; + +struct TrackedTrxInfo { + autovector m_neighbors; + uint32_t m_cf_id; + bool m_exclusive; + std::string m_waiting_key; +}; + +class PointLockManager : public LockManager { + public: + PointLockManager(PessimisticTransactionDB* db, + const TransactionDBOptions& opt); + // No copying allowed + PointLockManager(const PointLockManager&) = delete; + PointLockManager& operator=(const PointLockManager&) = delete; + + ~PointLockManager() override {} + + bool IsPointLockSupported() const override { return true; } + + bool IsRangeLockSupported() const override { return false; } + + const LockTrackerFactory& GetLockTrackerFactory() const override { + return PointLockTrackerFactory::Get(); + } + + // Creates a new LockMap for this column family. Caller should guarantee + // that this column family does not already exist. + void AddColumnFamily(const ColumnFamilyHandle* cf) override; + // Deletes the LockMap for this column family. Caller should guarantee that + // this column family is no longer in use. 
+ void RemoveColumnFamily(const ColumnFamilyHandle* cf) override; + + Status TryLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id, + const std::string& key, Env* env, bool exclusive) override; + Status TryLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id, + const Endpoint& start, const Endpoint& end, Env* env, + bool exclusive) override; + + void UnLock(PessimisticTransaction* txn, const LockTracker& tracker, + Env* env) override; + void UnLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id, + const std::string& key, Env* env) override; + void UnLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id, + const Endpoint& start, const Endpoint& end, Env* env) override; + + PointLockStatus GetPointLockStatus() override; + + RangeLockStatus GetRangeLockStatus() override; + + std::vector GetDeadlockInfoBuffer() override; + + void Resize(uint32_t new_size) override; + + private: + PessimisticTransactionDB* txn_db_impl_; + + // Default number of lock map stripes per column family + const size_t default_num_stripes_; + + // Limit on number of keys locked per column family + const int64_t max_num_locks_; + + // The following lock order must be satisfied in order to avoid deadlocking + // ourselves. + // - lock_map_mutex_ + // - stripe mutexes in ascending cf id, ascending stripe order + // - wait_txn_map_mutex_ + // + // Must be held when accessing/modifying lock_maps_. + InstrumentedMutex lock_map_mutex_; + + // Map of ColumnFamilyId to locked key info + using LockMaps = UnorderedMap>; + LockMaps lock_maps_; + + // Thread-local cache of entries in lock_maps_. This is an optimization + // to avoid acquiring a mutex in order to look up a LockMap + std::unique_ptr lock_maps_cache_; + + // Must be held when modifying wait_txn_map_ and rev_wait_txn_map_. + std::mutex wait_txn_map_mutex_; + + // Maps from waitee -> number of waiters. + HashMap rev_wait_txn_map_; + // Maps from waiter -> waitee. 
+ HashMap wait_txn_map_; + DeadlockInfoBuffer dlock_buffer_; + + // Used to allocate mutexes/condvars to use when locking keys + std::shared_ptr mutex_factory_; + + bool IsLockExpired(TransactionID txn_id, const LockInfo& lock_info, Env* env, + uint64_t* wait_time); + + std::shared_ptr GetLockMap(uint32_t column_family_id); + + Status AcquireWithTimeout(PessimisticTransaction* txn, LockMap* lock_map, + LockMapStripe* stripe, uint32_t column_family_id, + const std::string& key, Env* env, int64_t timeout, + const LockInfo& lock_info); + + Status AcquireLocked(LockMap* lock_map, LockMapStripe* stripe, + const std::string& key, Env* env, + const LockInfo& lock_info, uint64_t* wait_time, + autovector* txn_ids); + + void UnLockKey(PessimisticTransaction* txn, const std::string& key, + LockMapStripe* stripe, LockMap* lock_map, Env* env); + + bool IncrementWaiters(const PessimisticTransaction* txn, + const autovector& wait_ids, + const std::string& key, const uint32_t& cf_id, + const bool& exclusive, Env* const env); + void DecrementWaiters(const PessimisticTransaction* txn, + const autovector& wait_ids); + void DecrementWaitersImpl(const PessimisticTransaction* txn, + const autovector& wait_ids); +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc new file mode 100644 index 000000000..525fdea71 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc @@ -0,0 +1,181 @@ +// Copyright (c) 2020-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "utilities/transactions/lock/point/point_lock_manager_test.h" + +namespace ROCKSDB_NAMESPACE { + +// This test is not applicable for Range Lock manager as Range Lock Manager +// operates on Column Families, not their ids. 
+TEST_F(PointLockManagerTest, LockNonExistingColumnFamily) { + MockColumnFamilyHandle cf(1024); + locker_->RemoveColumnFamily(&cf); + auto txn = NewTxn(); + auto s = locker_->TryLock(txn, 1024, "k", env_, true); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_STREQ(s.getState(), "Column family id not found: 1024"); + delete txn; +} + +TEST_F(PointLockManagerTest, LockStatus) { + MockColumnFamilyHandle cf1(1024), cf2(2048); + locker_->AddColumnFamily(&cf1); + locker_->AddColumnFamily(&cf2); + + auto txn1 = NewTxn(); + ASSERT_OK(locker_->TryLock(txn1, 1024, "k1", env_, true)); + ASSERT_OK(locker_->TryLock(txn1, 2048, "k1", env_, true)); + + auto txn2 = NewTxn(); + ASSERT_OK(locker_->TryLock(txn2, 1024, "k2", env_, false)); + ASSERT_OK(locker_->TryLock(txn2, 2048, "k2", env_, false)); + + auto s = locker_->GetPointLockStatus(); + ASSERT_EQ(s.size(), 4u); + for (uint32_t cf_id : {1024, 2048}) { + ASSERT_EQ(s.count(cf_id), 2u); + auto range = s.equal_range(cf_id); + for (auto it = range.first; it != range.second; it++) { + ASSERT_TRUE(it->second.key == "k1" || it->second.key == "k2"); + if (it->second.key == "k1") { + ASSERT_EQ(it->second.exclusive, true); + ASSERT_EQ(it->second.ids.size(), 1u); + ASSERT_EQ(it->second.ids[0], txn1->GetID()); + } else if (it->second.key == "k2") { + ASSERT_EQ(it->second.exclusive, false); + ASSERT_EQ(it->second.ids.size(), 1u); + ASSERT_EQ(it->second.ids[0], txn2->GetID()); + } + } + } + + // Cleanup + locker_->UnLock(txn1, 1024, "k1", env_); + locker_->UnLock(txn1, 2048, "k1", env_); + locker_->UnLock(txn2, 1024, "k2", env_); + locker_->UnLock(txn2, 2048, "k2", env_); + + delete txn1; + delete txn2; +} + +TEST_F(PointLockManagerTest, UnlockExclusive) { + MockColumnFamilyHandle cf(1); + locker_->AddColumnFamily(&cf); + + auto txn1 = NewTxn(); + ASSERT_OK(locker_->TryLock(txn1, 1, "k", env_, true)); + locker_->UnLock(txn1, 1, "k", env_); + + auto txn2 = NewTxn(); + ASSERT_OK(locker_->TryLock(txn2, 1, "k", env_, true)); + + // Cleanup + locker_->UnLock(txn2, 1, "k", env_); + + delete txn1; + delete txn2; +} + +TEST_F(PointLockManagerTest, UnlockShared) { + MockColumnFamilyHandle cf(1); + locker_->AddColumnFamily(&cf); + + auto txn1 = NewTxn(); + ASSERT_OK(locker_->TryLock(txn1, 1, "k", env_, false)); + locker_->UnLock(txn1, 1, "k", env_); + + auto txn2 = NewTxn(); + ASSERT_OK(locker_->TryLock(txn2, 1, "k", env_, true)); + + // Cleanup + locker_->UnLock(txn2, 1, "k", env_); + + delete txn1; + delete txn2; +} + +// This test doesn't work with Range Lock Manager, because Range Lock Manager +// doesn't support deadlock_detect_depth. + +TEST_F(PointLockManagerTest, DeadlockDepthExceeded) { + // Tests that when detecting deadlock, if the detection depth is exceeded, + // it's also viewed as deadlock. + MockColumnFamilyHandle cf(1); + locker_->AddColumnFamily(&cf); + TransactionOptions txn_opt; + txn_opt.deadlock_detect = true; + txn_opt.deadlock_detect_depth = 1; + txn_opt.lock_timeout = 1000000; + auto txn1 = NewTxn(txn_opt); + auto txn2 = NewTxn(txn_opt); + auto txn3 = NewTxn(txn_opt); + auto txn4 = NewTxn(txn_opt); + // "a ->(k) b" means transaction a is waiting for transaction b to release + // the held lock on key k. + // txn4 ->(k3) -> txn3 ->(k2) txn2 ->(k1) txn1 + // txn3's deadlock detection will exceed the detection depth 1, + // which will be viewed as a deadlock. 
+ // NOTE: + // txn4 ->(k3) -> txn3 must be set up before + // txn3 ->(k2) -> txn2, because to trigger deadlock detection for txn3, + // it must have another txn waiting on it, which is txn4 in this case. + ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true)); + + port::Thread t1 = BlockUntilWaitingTxn(wait_sync_point_name_, [&]() { + ASSERT_OK(locker_->TryLock(txn2, 1, "k2", env_, true)); + // block because txn1 is holding a lock on k1. + locker_->TryLock(txn2, 1, "k1", env_, true); + }); + + ASSERT_OK(locker_->TryLock(txn3, 1, "k3", env_, true)); + + port::Thread t2 = BlockUntilWaitingTxn(wait_sync_point_name_, [&]() { + // block because txn3 is holding a lock on k1. + locker_->TryLock(txn4, 1, "k3", env_, true); + }); + + auto s = locker_->TryLock(txn3, 1, "k2", env_, true); + ASSERT_TRUE(s.IsBusy()); + ASSERT_EQ(s.subcode(), Status::SubCode::kDeadlock); + + std::vector deadlock_paths = locker_->GetDeadlockInfoBuffer(); + ASSERT_EQ(deadlock_paths.size(), 1u); + ASSERT_TRUE(deadlock_paths[0].limit_exceeded); + + locker_->UnLock(txn1, 1, "k1", env_); + locker_->UnLock(txn3, 1, "k3", env_); + t1.join(); + t2.join(); + + delete txn4; + delete txn3; + delete txn2; + delete txn1; +} + +INSTANTIATE_TEST_CASE_P(PointLockManager, AnyLockManagerTest, + ::testing::Values(nullptr)); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "SKIPPED because Transactions are not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h new file mode 100644 index 000000000..ca9f46bf9 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h @@ -0,0 +1,324 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
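// The DeadlockDepthExceeded test above exercises the bounded wait-for-graph
// search implemented by PointLockManager::IncrementWaiters(). A simplified,
// self-contained sketch of that idea follows; the real code additionally
// records parent links so it can reconstruct the deadlock path, and bounds
// the breadth-first queue by deadlock_detect_depth rather than counting
// visited nodes. All names here are illustrative.
#include <cstdint>
#include <queue>
#include <unordered_map>
#include <vector>

using TxnId = uint64_t;

// Returns true if `waiter` waiting on `wait_ids` closes a cycle in the
// wait-for graph, or if the search exceeds `depth_limit` (which is
// conservatively reported as a deadlock with *limit_exceeded set).
bool WouldDeadlock(
    TxnId waiter, const std::vector<TxnId>& wait_ids,
    const std::unordered_map<TxnId, std::vector<TxnId>>& waits_for,
    int depth_limit, bool* limit_exceeded) {
  *limit_exceeded = false;
  std::queue<TxnId> pending;
  for (TxnId id : wait_ids) {
    pending.push(id);
  }
  int visited = 0;
  while (!pending.empty()) {
    if (++visited > depth_limit) {
      *limit_exceeded = true;  // search budget exhausted: assume deadlock
      return true;
    }
    TxnId next = pending.front();
    pending.pop();
    if (next == waiter) {
      return true;  // found a path back to the waiter: a genuine cycle
    }
    auto it = waits_for.find(next);
    if (it != waits_for.end()) {
      for (TxnId neighbor : it->second) {
        pending.push(neighbor);
      }
    }
  }
  return false;
}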
+ +#include "file/file_util.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/utilities/transaction_db.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "utilities/transactions/lock/point/point_lock_manager.h" +#include "utilities/transactions/pessimistic_transaction_db.h" +#include "utilities/transactions/transaction_db_mutex_impl.h" + +namespace ROCKSDB_NAMESPACE { + +class MockColumnFamilyHandle : public ColumnFamilyHandle { + public: + explicit MockColumnFamilyHandle(ColumnFamilyId cf_id) : cf_id_(cf_id) {} + + ~MockColumnFamilyHandle() override {} + + const std::string& GetName() const override { return name_; } + + ColumnFamilyId GetID() const override { return cf_id_; } + + Status GetDescriptor(ColumnFamilyDescriptor*) override { + return Status::OK(); + } + + const Comparator* GetComparator() const override { + return BytewiseComparator(); + } + + private: + ColumnFamilyId cf_id_; + std::string name_ = "MockCF"; +}; + +class PointLockManagerTest : public testing::Test { + public: + void SetUp() override { + env_ = Env::Default(); + db_dir_ = test::PerThreadDBPath("point_lock_manager_test"); + ASSERT_OK(env_->CreateDir(db_dir_)); + + Options opt; + opt.create_if_missing = true; + TransactionDBOptions txn_opt; + txn_opt.transaction_lock_timeout = 0; + + ASSERT_OK(TransactionDB::Open(opt, txn_opt, db_dir_, &db_)); + + // CAUTION: This test creates a separate lock manager object (right, NOT + // the one that the TransactionDB is using!), and runs tests on it. + locker_.reset(new PointLockManager( + static_cast(db_), txn_opt)); + + wait_sync_point_name_ = "PointLockManager::AcquireWithTimeout:WaitingTxn"; + } + + void TearDown() override { + delete db_; + EXPECT_OK(DestroyDir(env_, db_dir_)); + } + + PessimisticTransaction* NewTxn( + TransactionOptions txn_opt = TransactionOptions()) { + Transaction* txn = db_->BeginTransaction(WriteOptions(), txn_opt); + return reinterpret_cast(txn); + } + + protected: + Env* env_; + std::shared_ptr locker_; + const char* wait_sync_point_name_; + friend void PointLockManagerTestExternalSetup(PointLockManagerTest*); + + private: + std::string db_dir_; + TransactionDB* db_; +}; + +using init_func_t = void (*)(PointLockManagerTest*); + +class AnyLockManagerTest : public PointLockManagerTest, + public testing::WithParamInterface { + public: + void SetUp() override { + // If a custom setup function was provided, use it. Otherwise, use what we + // have inherited. + auto init_func = GetParam(); + if (init_func) + (*init_func)(this); + else + PointLockManagerTest::SetUp(); + } +}; + +TEST_P(AnyLockManagerTest, ReentrantExclusiveLock) { + // Tests that a txn can acquire exclusive lock on the same key repeatedly. + MockColumnFamilyHandle cf(1); + locker_->AddColumnFamily(&cf); + auto txn = NewTxn(); + ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true)); + ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true)); + + // Cleanup + locker_->UnLock(txn, 1, "k", env_); + + delete txn; +} + +TEST_P(AnyLockManagerTest, ReentrantSharedLock) { + // Tests that a txn can acquire shared lock on the same key repeatedly. + MockColumnFamilyHandle cf(1); + locker_->AddColumnFamily(&cf); + auto txn = NewTxn(); + ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false)); + ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false)); + + // Cleanup + locker_->UnLock(txn, 1, "k", env_); + + delete txn; +} + +TEST_P(AnyLockManagerTest, LockUpgrade) { + // Tests that a txn can upgrade from a shared lock to an exclusive lock. 
+ MockColumnFamilyHandle cf(1); + locker_->AddColumnFamily(&cf); + auto txn = NewTxn(); + ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false)); + ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true)); + + // Cleanup + locker_->UnLock(txn, 1, "k", env_); + delete txn; +} + +TEST_P(AnyLockManagerTest, LockDowngrade) { + // Tests that a txn can acquire a shared lock after acquiring an exclusive + // lock on the same key. + MockColumnFamilyHandle cf(1); + locker_->AddColumnFamily(&cf); + auto txn = NewTxn(); + ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true)); + ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false)); + + // Cleanup + locker_->UnLock(txn, 1, "k", env_); + delete txn; +} + +TEST_P(AnyLockManagerTest, LockConflict) { + // Tests that lock conflicts lead to lock timeout. + MockColumnFamilyHandle cf(1); + locker_->AddColumnFamily(&cf); + auto txn1 = NewTxn(); + auto txn2 = NewTxn(); + + { + // exclusive-exclusive conflict. + ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true)); + auto s = locker_->TryLock(txn2, 1, "k1", env_, true); + ASSERT_TRUE(s.IsTimedOut()); + } + + { + // exclusive-shared conflict. + ASSERT_OK(locker_->TryLock(txn1, 1, "k2", env_, true)); + auto s = locker_->TryLock(txn2, 1, "k2", env_, false); + ASSERT_TRUE(s.IsTimedOut()); + } + + { + // shared-exclusive conflict. + ASSERT_OK(locker_->TryLock(txn1, 1, "k2", env_, false)); + auto s = locker_->TryLock(txn2, 1, "k2", env_, true); + ASSERT_TRUE(s.IsTimedOut()); + } + + // Cleanup + locker_->UnLock(txn1, 1, "k1", env_); + locker_->UnLock(txn1, 1, "k2", env_); + + delete txn1; + delete txn2; +} + +port::Thread BlockUntilWaitingTxn(const char* sync_point_name, + std::function f) { + std::atomic reached(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + sync_point_name, [&](void* /*arg*/) { reached.store(true); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + port::Thread t(f); + + while (!reached.load()) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + + return t; +} + +TEST_P(AnyLockManagerTest, SharedLocks) { + // Tests that shared locks can be concurrently held by multiple transactions. + MockColumnFamilyHandle cf(1); + locker_->AddColumnFamily(&cf); + auto txn1 = NewTxn(); + auto txn2 = NewTxn(); + ASSERT_OK(locker_->TryLock(txn1, 1, "k", env_, false)); + ASSERT_OK(locker_->TryLock(txn2, 1, "k", env_, false)); + + // Cleanup + locker_->UnLock(txn1, 1, "k", env_); + locker_->UnLock(txn2, 1, "k", env_); + + delete txn1; + delete txn2; +} + +TEST_P(AnyLockManagerTest, Deadlock) { + // Tests that deadlock can be detected. + // Deadlock scenario: + // txn1 exclusively locks k1, and wants to lock k2; + // txn2 exclusively locks k2, and wants to lock k1. + MockColumnFamilyHandle cf(1); + locker_->AddColumnFamily(&cf); + TransactionOptions txn_opt; + txn_opt.deadlock_detect = true; + txn_opt.lock_timeout = 1000000; + auto txn1 = NewTxn(txn_opt); + auto txn2 = NewTxn(txn_opt); + + ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true)); + ASSERT_OK(locker_->TryLock(txn2, 1, "k2", env_, true)); + + // txn1 tries to lock k2, will block forever. + port::Thread t = BlockUntilWaitingTxn(wait_sync_point_name_, [&]() { + // block because txn2 is holding a lock on k2. 
+ locker_->TryLock(txn1, 1, "k2", env_, true); + }); + + auto s = locker_->TryLock(txn2, 1, "k1", env_, true); + ASSERT_TRUE(s.IsBusy()); + ASSERT_EQ(s.subcode(), Status::SubCode::kDeadlock); + + std::vector deadlock_paths = locker_->GetDeadlockInfoBuffer(); + ASSERT_EQ(deadlock_paths.size(), 1u); + ASSERT_FALSE(deadlock_paths[0].limit_exceeded); + + std::vector deadlocks = deadlock_paths[0].path; + ASSERT_EQ(deadlocks.size(), 2u); + + ASSERT_EQ(deadlocks[0].m_txn_id, txn1->GetID()); + ASSERT_EQ(deadlocks[0].m_cf_id, 1u); + ASSERT_TRUE(deadlocks[0].m_exclusive); + ASSERT_EQ(deadlocks[0].m_waiting_key, "k2"); + + ASSERT_EQ(deadlocks[1].m_txn_id, txn2->GetID()); + ASSERT_EQ(deadlocks[1].m_cf_id, 1u); + ASSERT_TRUE(deadlocks[1].m_exclusive); + ASSERT_EQ(deadlocks[1].m_waiting_key, "k1"); + + locker_->UnLock(txn2, 1, "k2", env_); + t.join(); + + // Cleanup + locker_->UnLock(txn1, 1, "k1", env_); + locker_->UnLock(txn1, 1, "k2", env_); + delete txn2; + delete txn1; +} + +TEST_P(AnyLockManagerTest, GetWaitingTxns_MultipleTxns) { + MockColumnFamilyHandle cf(1); + locker_->AddColumnFamily(&cf); + + auto txn1 = NewTxn(); + ASSERT_OK(locker_->TryLock(txn1, 1, "k", env_, false)); + + auto txn2 = NewTxn(); + ASSERT_OK(locker_->TryLock(txn2, 1, "k", env_, false)); + + auto txn3 = NewTxn(); + txn3->SetLockTimeout(10000); + port::Thread t1 = BlockUntilWaitingTxn(wait_sync_point_name_, [&]() { + ASSERT_OK(locker_->TryLock(txn3, 1, "k", env_, true)); + locker_->UnLock(txn3, 1, "k", env_); + }); + + // Ok, now txn3 is waiting for lock on "k", which is owned by two + // transactions. Check that GetWaitingTxns reports this correctly + uint32_t wait_cf_id; + std::string wait_key; + auto waiters = txn3->GetWaitingTxns(&wait_cf_id, &wait_key); + + ASSERT_EQ(wait_cf_id, 1u); + ASSERT_EQ(wait_key, "k"); + ASSERT_EQ(waiters.size(), 2); + bool waits_correct = + (waiters[0] == txn1->GetID() && waiters[1] == txn2->GetID()) || + (waiters[1] == txn1->GetID() && waiters[0] == txn2->GetID()); + ASSERT_EQ(waits_correct, true); + + // Release locks so txn3 can proceed with execution + locker_->UnLock(txn1, 1, "k", env_); + locker_->UnLock(txn2, 1, "k", env_); + + // Wait until txn3 finishes + t1.join(); + + delete txn1; + delete txn2; + delete txn3; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/transactions/lock/point/point_lock_tracker.cc b/src/rocksdb/utilities/transactions/lock/point/point_lock_tracker.cc new file mode 100644 index 000000000..6204a8f02 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/point/point_lock_tracker.cc @@ -0,0 +1,257 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
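// point_lock_tracker.cc below keeps, for every tracked key, the smallest
// observed sequence number plus separate read and write reference counts; a
// key is dropped only once both counts reach zero. A reduced sketch of that
// bookkeeping under illustrative names (the real per-key state is
// TrackedKeyInfo, which also carries an `exclusive` flag OR-ed on every
// Track() call):
#include <cstdint>
#include <string>
#include <unordered_map>

struct KeyTrackInfo {
  uint64_t seq;             // smallest seq seen so far for this key
  uint32_t num_reads = 0;   // read operations currently tracking the key
  uint32_t num_writes = 0;  // write operations currently tracking the key
};

void TrackKey(std::unordered_map<std::string, KeyTrackInfo>& keys,
              const std::string& key, uint64_t seq, bool read_only) {
  auto result = keys.try_emplace(key, KeyTrackInfo{seq, 0, 0});
  KeyTrackInfo& info = result.first->second;
  if (seq < info.seq) {
    // A smaller seq means "no concurrent update since an earlier point",
    // which is a stronger guarantee, so keep the smaller value.
    info.seq = seq;
  }
  if (read_only) {
    info.num_reads++;
  } else {
    info.num_writes++;
  }
}

void UntrackKey(std::unordered_map<std::string, KeyTrackInfo>& keys,
                const std::string& key, bool read_only) {
  auto it = keys.find(key);
  if (it == keys.end()) {
    return;  // not tracked
  }
  KeyTrackInfo& info = it->second;
  if (read_only && info.num_reads > 0) {
    info.num_reads--;
  } else if (!read_only && info.num_writes > 0) {
    info.num_writes--;
  }
  if (info.num_reads == 0 && info.num_writes == 0) {
    keys.erase(it);  // last reference released: stop tracking the key
  }
}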
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/lock/point/point_lock_tracker.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+class TrackedKeysColumnFamilyIterator
+    : public LockTracker::ColumnFamilyIterator {
+ public:
+  explicit TrackedKeysColumnFamilyIterator(const TrackedKeys& keys)
+      : tracked_keys_(keys), it_(keys.begin()) {}
+
+  bool HasNext() const override { return it_ != tracked_keys_.end(); }
+
+  ColumnFamilyId Next() override { return (it_++)->first; }
+
+ private:
+  const TrackedKeys& tracked_keys_;
+  TrackedKeys::const_iterator it_;
+};
+
+class TrackedKeysIterator : public LockTracker::KeyIterator {
+ public:
+  TrackedKeysIterator(const TrackedKeys& keys, ColumnFamilyId id)
+      : key_infos_(keys.at(id)), it_(key_infos_.begin()) {}
+
+  bool HasNext() const override { return it_ != key_infos_.end(); }
+
+  const std::string& Next() override { return (it_++)->first; }
+
+ private:
+  const TrackedKeyInfos& key_infos_;
+  TrackedKeyInfos::const_iterator it_;
+};
+
+}  // namespace
+
+void PointLockTracker::Track(const PointLockRequest& r) {
+  auto& keys = tracked_keys_[r.column_family_id];
+  auto result = keys.try_emplace(r.key, r.seq);
+  auto it = result.first;
+  if (!result.second && r.seq < it->second.seq) {
+    // Now tracking this key with an earlier sequence number
+    it->second.seq = r.seq;
+  }
+  // else we do not update the seq. The smaller the tracked seq, the stronger
+  // the guarantee, since it implies that from that seq onward there has not
+  // been a concurrent update to the key. So we only update the seq when doing
+  // so strengthens the guarantee, i.e., when the new seq is smaller than the
+  // existing tracked seq.
+
+  if (r.read_only) {
+    it->second.num_reads++;
+  } else {
+    it->second.num_writes++;
+  }
+
+  it->second.exclusive = it->second.exclusive || r.exclusive;
+}
+
+UntrackStatus PointLockTracker::Untrack(const PointLockRequest& r) {
+  auto cf_keys = tracked_keys_.find(r.column_family_id);
+  if (cf_keys == tracked_keys_.end()) {
+    return UntrackStatus::NOT_TRACKED;
+  }
+
+  auto& keys = cf_keys->second;
+  auto it = keys.find(r.key);
+  if (it == keys.end()) {
+    return UntrackStatus::NOT_TRACKED;
+  }
+
+  bool untracked = false;
+  auto& info = it->second;
+  if (r.read_only) {
+    if (info.num_reads > 0) {
+      info.num_reads--;
+      untracked = true;
+    }
+  } else {
+    if (info.num_writes > 0) {
+      info.num_writes--;
+      untracked = true;
+    }
+  }
+
+  bool removed = false;
+  if (info.num_reads == 0 && info.num_writes == 0) {
+    keys.erase(it);
+    if (keys.empty()) {
+      tracked_keys_.erase(cf_keys);
+    }
+    removed = true;
+  }
+
+  if (removed) {
+    return UntrackStatus::REMOVED;
+  }
+  if (untracked) {
+    return UntrackStatus::UNTRACKED;
+  }
+  return UntrackStatus::NOT_TRACKED;
+}
+
+void PointLockTracker::Merge(const LockTracker& tracker) {
+  const PointLockTracker& t = static_cast<const PointLockTracker&>(tracker);
+  for (const auto& cf_keys : t.tracked_keys_) {
+    ColumnFamilyId cf = cf_keys.first;
+    const auto& keys = cf_keys.second;
+
+    auto current_cf_keys = tracked_keys_.find(cf);
+    if (current_cf_keys == tracked_keys_.end()) {
+      tracked_keys_.emplace(cf_keys);
+    } else {
+      auto& current_keys = current_cf_keys->second;
+      for (const auto& key_info : keys) {
+        const std::string& key = key_info.first;
+        const TrackedKeyInfo& info = key_info.second;
+        // If key was not previously tracked, just copy the whole struct over.
+        // Otherwise, some merging needs to occur.
+        auto current_info = current_keys.find(key);
+        if (current_info == current_keys.end()) {
+          current_keys.emplace(key_info);
+        } else {
+          current_info->second.Merge(info);
+        }
+      }
+    }
+  }
+}
+
+void PointLockTracker::Subtract(const LockTracker& tracker) {
+  const PointLockTracker& t = static_cast<const PointLockTracker&>(tracker);
+  for (const auto& cf_keys : t.tracked_keys_) {
+    ColumnFamilyId cf = cf_keys.first;
+    const auto& keys = cf_keys.second;
+
+    auto& current_keys = tracked_keys_.at(cf);
+    for (const auto& key_info : keys) {
+      const std::string& key = key_info.first;
+      const TrackedKeyInfo& info = key_info.second;
+      uint32_t num_reads = info.num_reads;
+      uint32_t num_writes = info.num_writes;
+
+      auto current_key_info = current_keys.find(key);
+      assert(current_key_info != current_keys.end());
+
+      // Decrement the total reads/writes of this key by the number of
+      // reads/writes done since the last SavePoint.
+      if (num_reads > 0) {
+        assert(current_key_info->second.num_reads >= num_reads);
+        current_key_info->second.num_reads -= num_reads;
+      }
+      if (num_writes > 0) {
+        assert(current_key_info->second.num_writes >= num_writes);
+        current_key_info->second.num_writes -= num_writes;
+      }
+      if (current_key_info->second.num_reads == 0 &&
+          current_key_info->second.num_writes == 0) {
+        current_keys.erase(current_key_info);
+      }
+    }
+  }
+}
+
+LockTracker* PointLockTracker::GetTrackedLocksSinceSavePoint(
+    const LockTracker& save_point_tracker) const {
+  // Examine the number of reads/writes performed on all keys written
+  // since the last SavePoint and compare to the total number of reads/writes
+  // for each key.
+  LockTracker* t = new PointLockTracker();
+  const PointLockTracker& save_point_t =
+      static_cast<const PointLockTracker&>(save_point_tracker);
+  for (const auto& cf_keys : save_point_t.tracked_keys_) {
+    ColumnFamilyId cf = cf_keys.first;
+    const auto& keys = cf_keys.second;
+
+    auto& current_keys = tracked_keys_.at(cf);
+    for (const auto& key_info : keys) {
+      const std::string& key = key_info.first;
+      const TrackedKeyInfo& info = key_info.second;
+      uint32_t num_reads = info.num_reads;
+      uint32_t num_writes = info.num_writes;
+
+      auto current_key_info = current_keys.find(key);
+      assert(current_key_info != current_keys.end());
+      assert(current_key_info->second.num_reads >= num_reads);
+      assert(current_key_info->second.num_writes >= num_writes);
+
+      if (current_key_info->second.num_reads == num_reads &&
+          current_key_info->second.num_writes == num_writes) {
+        // All the reads/writes to this key were done in the last savepoint.
+        PointLockRequest r;
+        r.column_family_id = cf;
+        r.key = key;
+        r.seq = info.seq;
+        r.read_only = (num_writes == 0);
+        r.exclusive = info.exclusive;
+        t->Track(r);
+      }
+    }
+  }
+  return t;
+}
+
+PointLockStatus PointLockTracker::GetPointLockStatus(
+    ColumnFamilyId column_family_id, const std::string& key) const {
+  assert(IsPointLockSupported());
+  PointLockStatus status;
+  auto it = tracked_keys_.find(column_family_id);
+  if (it == tracked_keys_.end()) {
+    return status;
+  }
+
+  const auto& keys = it->second;
+  auto key_it = keys.find(key);
+  if (key_it == keys.end()) {
+    return status;
+  }
+
+  const TrackedKeyInfo& key_info = key_it->second;
+  status.locked = true;
+  status.exclusive = key_info.exclusive;
+  status.seq = key_info.seq;
+  return status;
+}
+
+uint64_t PointLockTracker::GetNumPointLocks() const {
+  uint64_t num_keys = 0;
+  for (const auto& cf_keys : tracked_keys_) {
+    num_keys += cf_keys.second.size();
+  }
+  return num_keys;
+}
+
+LockTracker::ColumnFamilyIterator* PointLockTracker::GetColumnFamilyIterator()
+    const {
+  return new TrackedKeysColumnFamilyIterator(tracked_keys_);
+}
+
+LockTracker::KeyIterator* PointLockTracker::GetKeyIterator(
+    ColumnFamilyId column_family_id) const {
+  assert(tracked_keys_.find(column_family_id) != tracked_keys_.end());
+  return new TrackedKeysIterator(tracked_keys_, column_family_id);
+}
+
+void PointLockTracker::Clear() { tracked_keys_.clear(); }
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/point/point_lock_tracker.h b/src/rocksdb/utilities/transactions/lock/point/point_lock_tracker.h
new file mode 100644
index 000000000..daf6f9aa2
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/point/point_lock_tracker.h
@@ -0,0 +1,99 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "utilities/transactions/lock/lock_tracker.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct TrackedKeyInfo {
+  // Earliest sequence number that is relevant to this transaction for this key
+  SequenceNumber seq;
+
+  uint32_t num_writes;
+  uint32_t num_reads;
+
+  bool exclusive;
+
+  explicit TrackedKeyInfo(SequenceNumber seq_no)
+      : seq(seq_no), num_writes(0), num_reads(0), exclusive(false) {}
+
+  void Merge(const TrackedKeyInfo& info) {
+    assert(seq <= info.seq);
+    num_reads += info.num_reads;
+    num_writes += info.num_writes;
+    exclusive = exclusive || info.exclusive;
+  }
+};
+
+using TrackedKeyInfos = std::unordered_map<std::string, TrackedKeyInfo>;
+
+using TrackedKeys = std::unordered_map<ColumnFamilyId, TrackedKeyInfos>;
+
+// Tracks point locks on single keys.
+class PointLockTracker : public LockTracker {
+ public:
+  PointLockTracker() = default;
+
+  PointLockTracker(const PointLockTracker&) = delete;
+  PointLockTracker& operator=(const PointLockTracker&) = delete;
+
+  bool IsPointLockSupported() const override { return true; }
+
+  bool IsRangeLockSupported() const override { return false; }
+
+  void Track(const PointLockRequest& lock_request) override;
+
+  UntrackStatus Untrack(const PointLockRequest& lock_request) override;
+
+  void Track(const RangeLockRequest& /*lock_request*/) override {}
+
+  UntrackStatus Untrack(const RangeLockRequest& /*lock_request*/) override {
+    return UntrackStatus::NOT_TRACKED;
+  }
+
+  void Merge(const LockTracker& tracker) override;
+
+  void Subtract(const LockTracker& tracker) override;
+
+  void Clear() override;
+
+  virtual LockTracker* GetTrackedLocksSinceSavePoint(
+      const LockTracker& save_point_tracker) const override;
+
+  PointLockStatus GetPointLockStatus(ColumnFamilyId column_family_id,
+                                     const std::string& key) const override;
+
+  uint64_t GetNumPointLocks() const override;
+
+  ColumnFamilyIterator* GetColumnFamilyIterator() const override;
+
+  KeyIterator* GetKeyIterator(ColumnFamilyId column_family_id) const override;
+
+ private:
+  TrackedKeys tracked_keys_;
+};
+
+class PointLockTrackerFactory : public LockTrackerFactory {
+ public:
+  static const PointLockTrackerFactory& Get() {
+    static const PointLockTrackerFactory instance;
+    return instance;
+  }
+
+  LockTracker* Create() const override { return new PointLockTracker(); }
+
+ private:
+  PointLockTrackerFactory() {}
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_lock_manager.h b/src/rocksdb/utilities/transactions/lock/range/range_lock_manager.h
new file mode 100644
index 000000000..01899542e
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_lock_manager.h
@@ -0,0 +1,36 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+//
+// Generic definitions for a Range-based Lock Manager
+//
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/lock/lock_manager.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+/*
+  A base class for all Range-based lock managers
+
+  See also class RangeLockManagerHandle in
+  include/rocksdb/utilities/transaction_db.h
+*/
+class RangeLockManagerBase : public LockManager {
+ public:
+  // Getting a point lock is reduced to getting a range lock on a single-point
+  // range
+  using LockManager::TryLock;
+  Status TryLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
+                 const std::string& key, Env* env, bool exclusive) override {
+    Endpoint endp(key.data(), key.size(), false);
+    return TryLock(txn, column_family_id, endp, endp, env, exclusive);
+  }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_locking_test.cc b/src/rocksdb/utilities/transactions/lock/range/range_locking_test.cc
new file mode 100644
index 000000000..bce66c1f3
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_locking_test.cc
@@ -0,0 +1,459 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+
+#include <algorithm>
+#include <functional>
+#include <string>
+#include <thread>
+
+#include "db/db_impl/db_impl.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "utilities/transactions/lock/point/point_lock_manager_test.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+#include "utilities/transactions/transaction_test.h"
+
+using std::string;
+
+namespace ROCKSDB_NAMESPACE {
+
+class RangeLockingTest : public ::testing::Test {
+ public:
+  TransactionDB* db;
+  std::string dbname;
+  Options options;
+
+  std::shared_ptr<RangeLockManagerHandle> range_lock_mgr;
+  TransactionDBOptions txn_db_options;
+
+  RangeLockingTest() : db(nullptr) {
+    options.create_if_missing = true;
+    dbname = test::PerThreadDBPath("range_locking_testdb");
+
+    EXPECT_OK(DestroyDB(dbname, options));
+
+    range_lock_mgr.reset(NewRangeLockManager(nullptr));
+    txn_db_options.lock_mgr_handle = range_lock_mgr;
+
+    auto s = TransactionDB::Open(options, txn_db_options, dbname, &db);
+    assert(s.ok());
+  }
+
+  ~RangeLockingTest() {
+    delete db;
+    db = nullptr;
+    // This is to skip the assert statement in FaultInjectionTestEnv. There
+    // seems to be a bug in btrfs that makes readdir return recently
+    // unlinked files. By using the default fs we simply ignore errors
+    // resulting from attempting to delete such files in DestroyDB.
+    EXPECT_OK(DestroyDB(dbname, options));
+  }
+
+  PessimisticTransaction* NewTxn(
+      TransactionOptions txn_opt = TransactionOptions()) {
+    Transaction* txn = db->BeginTransaction(WriteOptions(), txn_opt);
+    return reinterpret_cast<PessimisticTransaction*>(txn);
+  }
+};
+
+// TODO: set a smaller lock wait timeout so that the test runs faster.
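For orientation before the tests that follow, here is a minimal sketch of the conflict pattern they exercise: one transaction takes a range lock, and an overlapping request from a second transaction times out. The helper function name is hypothetical; it assumes a TransactionDB opened with a range lock manager as in the fixture above, and is not part of the patch.

// Hypothetical helper, condensing the pattern used by the tests below.
#include <cassert>

#include "rocksdb/utilities/transaction_db.h"

namespace ROCKSDB_NAMESPACE {

void RangeLockConflictSketch(TransactionDB* db) {
  ColumnFamilyHandle* cf = db->DefaultColumnFamily();
  Transaction* txn0 = db->BeginTransaction(WriteOptions(), TransactionOptions());
  Transaction* txn1 = db->BeginTransaction(WriteOptions(), TransactionOptions());

  // txn0 locks the whole range ["a", "c"].
  Status s = txn0->GetRangeLock(cf, Endpoint("a"), Endpoint("c"));
  assert(s.ok());

  // An overlapping request from txn1 conflicts and eventually times out.
  s = txn1->GetRangeLock(cf, Endpoint("b"), Endpoint("z"));
  assert(s.IsTimedOut());

  txn0->Rollback();
  txn1->Rollback();
  delete txn0;
  delete txn1;
}

}  // namespace ROCKSDB_NAMESPACE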
+TEST_F(RangeLockingTest, BasicRangeLocking) { + WriteOptions write_options; + TransactionOptions txn_options; + std::string value; + ReadOptions read_options; + auto cf = db->DefaultColumnFamily(); + + Transaction* txn0 = db->BeginTransaction(write_options, txn_options); + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + + // Get a range lock + ASSERT_OK(txn0->GetRangeLock(cf, Endpoint("a"), Endpoint("c"))); + + // Check that range Lock inhibits an overlapping range lock + { + auto s = txn1->GetRangeLock(cf, Endpoint("b"), Endpoint("z")); + ASSERT_TRUE(s.IsTimedOut()); + } + + // Check that range Lock inhibits an overlapping point lock + { + auto s = txn1->GetForUpdate(read_options, cf, Slice("b"), &value); + ASSERT_TRUE(s.IsTimedOut()); + } + + // Get a point lock, check that it inhibits range locks + ASSERT_OK(txn0->Put(cf, Slice("n"), Slice("value"))); + { + auto s = txn1->GetRangeLock(cf, Endpoint("m"), Endpoint("p")); + ASSERT_TRUE(s.IsTimedOut()); + } + + ASSERT_OK(txn0->Commit()); + txn1->Rollback(); + + delete txn0; + delete txn1; +} + +TEST_F(RangeLockingTest, MyRocksLikeUpdate) { + WriteOptions write_options; + TransactionOptions txn_options; + Transaction* txn0 = db->BeginTransaction(write_options, txn_options); + auto cf = db->DefaultColumnFamily(); + Status s; + + // Get a range lock for the range we are about to update + ASSERT_OK(txn0->GetRangeLock(cf, Endpoint("a"), Endpoint("c"))); + + bool try_range_lock_called = false; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "RangeTreeLockManager::TryRangeLock:enter", + [&](void* /*arg*/) { try_range_lock_called = true; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // For performance reasons, the following must NOT call lock_mgr->TryLock(): + // We verify that by checking the value of try_range_lock_called. 
+  ASSERT_OK(txn0->Put(cf, Slice("b"), Slice("value"),
+                      /*assume_tracked=*/true));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  ASSERT_FALSE(try_range_lock_called);
+
+  txn0->Rollback();
+
+  delete txn0;
+}
+
+TEST_F(RangeLockingTest, UpgradeLockAndGetConflict) {
+  WriteOptions write_options;
+  TransactionOptions txn_options;
+  auto cf = db->DefaultColumnFamily();
+  Status s;
+  std::string value;
+  txn_options.lock_timeout = 10;
+
+  Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+
+  // Get the shared lock in txn0
+  s = txn0->GetForUpdate(ReadOptions(), cf, Slice("a"), &value,
+                         false /*exclusive*/);
+  ASSERT_TRUE(s.IsNotFound());
+
+  // Get the shared lock on the same key in txn1
+  s = txn1->GetForUpdate(ReadOptions(), cf, Slice("a"), &value,
+                         false /*exclusive*/);
+  ASSERT_TRUE(s.IsNotFound());
+
+  // Now, try getting an exclusive lock that overlaps with the above
+  s = txn0->GetRangeLock(cf, Endpoint("a"), Endpoint("b"));
+  ASSERT_TRUE(s.IsTimedOut());
+
+  txn0->Rollback();
+  txn1->Rollback();
+
+  delete txn0;
+  delete txn1;
+}
+
+TEST_F(RangeLockingTest, SnapshotValidation) {
+  Status s;
+  Slice key_slice = Slice("k");
+  ColumnFamilyHandle* cfh = db->DefaultColumnFamily();
+
+  auto txn0 = NewTxn();
+  txn0->Put(key_slice, Slice("initial"));
+  txn0->Commit();
+
+  // txn1
+  auto txn1 = NewTxn();
+  txn1->SetSnapshot();
+  std::string val1;
+  ASSERT_OK(txn1->Get(ReadOptions(), cfh, key_slice, &val1));
+  ASSERT_EQ(val1, "initial");
+  val1 = val1 + std::string("-txn1");
+
+  ASSERT_OK(txn1->Put(cfh, key_slice, Slice(val1)));
+
+  // txn2
+  auto txn2 = NewTxn();
+  txn2->SetSnapshot();
+  std::string val2;
+  // This will see the original value as nothing is committed
+  // This is also a Get, so it doesn't acquire any locks.
+  ASSERT_OK(txn2->Get(ReadOptions(), cfh, key_slice, &val2));
+  ASSERT_EQ(val2, "initial");
+
+  // txn1
+  ASSERT_OK(txn1->Commit());
+
+  // txn2
+  val2 = val2 + std::string("-txn2");
+  // Now, this call should do Snapshot Validation and fail:
+  s = txn2->Put(cfh, key_slice, Slice(val2));
+  ASSERT_TRUE(s.IsBusy());
+
+  ASSERT_OK(txn2->Commit());
+
+  delete txn0;
+  delete txn1;
+  delete txn2;
+}
+
+TEST_F(RangeLockingTest, MultipleTrxLockStatusData) {
+  WriteOptions write_options;
+  TransactionOptions txn_options;
+  auto cf = db->DefaultColumnFamily();
+
+  Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+
+  // Get a range lock
+  ASSERT_OK(txn0->GetRangeLock(cf, Endpoint("z"), Endpoint("z")));
+  ASSERT_OK(txn1->GetRangeLock(cf, Endpoint("b"), Endpoint("e")));
+
+  auto s = range_lock_mgr->GetRangeLockStatusData();
+  ASSERT_EQ(s.size(), 2);
+  for (auto it = s.begin(); it != s.end(); ++it) {
+    ASSERT_EQ(it->first, cf->GetID());
+    auto val = it->second;
+    ASSERT_FALSE(val.start.inf_suffix);
+    ASSERT_FALSE(val.end.inf_suffix);
+    ASSERT_TRUE(val.exclusive);
+    ASSERT_EQ(val.ids.size(), 1);
+    if (val.ids[0] == txn0->GetID()) {
+      ASSERT_EQ(val.start.slice, "z");
+      ASSERT_EQ(val.end.slice, "z");
+    } else if (val.ids[0] == txn1->GetID()) {
+      ASSERT_EQ(val.start.slice, "b");
+      ASSERT_EQ(val.end.slice, "e");
+    } else {
+      FAIL();  // Unknown transaction ID.
+ } + } + + delete txn0; + delete txn1; +} + +#if defined(__has_feature) +#if __has_feature(thread_sanitizer) +#define SKIP_LOCK_ESCALATION_TEST 1 +#endif +#else +#define SKIP_LOCK_ESCALATION_TEST 1 +#endif + +#ifndef SKIP_LOCK_ESCALATION_TEST +TEST_F(RangeLockingTest, BasicLockEscalation) { + auto cf = db->DefaultColumnFamily(); + + auto counters = range_lock_mgr->GetStatus(); + + // Initially not using any lock memory + ASSERT_EQ(counters.current_lock_memory, 0); + ASSERT_EQ(counters.escalation_count, 0); + + ASSERT_EQ(0, range_lock_mgr->SetMaxLockMemory(2000)); + + // Insert until we see lock escalations + auto txn = NewTxn(); + + // Get the locks until we hit an escalation + for (int i = 0; i < 2020; i++) { + std::ostringstream buf; + buf << std::setw(8) << std::setfill('0') << i; + std::string buf_str = buf.str(); + ASSERT_OK(txn->GetRangeLock(cf, Endpoint(buf_str), Endpoint(buf_str))); + } + counters = range_lock_mgr->GetStatus(); + ASSERT_GT(counters.escalation_count, 0); + ASSERT_LE(counters.current_lock_memory, 2000); + + delete txn; +} + +// An escalation barrier function. Allow escalation iff the first two bytes are +// identical. +static bool escalation_barrier(const Endpoint& a, const Endpoint& b) { + assert(a.slice.size() > 2); + assert(b.slice.size() > 2); + if (memcmp(a.slice.data(), b.slice.data(), 2)) { + return true; // This is a barrier + } else { + return false; // No barrier + } +} + +TEST_F(RangeLockingTest, LockEscalationBarrier) { + auto cf = db->DefaultColumnFamily(); + + auto counters = range_lock_mgr->GetStatus(); + + // Initially not using any lock memory + ASSERT_EQ(counters.escalation_count, 0); + + range_lock_mgr->SetMaxLockMemory(8000); + range_lock_mgr->SetEscalationBarrierFunc(escalation_barrier); + + // Insert enough locks to cause lock escalations to happen + auto txn = NewTxn(); + const int N = 2000; + for (int i = 0; i < N; i++) { + std::ostringstream buf; + buf << std::setw(4) << std::setfill('0') << i; + std::string buf_str = buf.str(); + ASSERT_OK(txn->GetRangeLock(cf, Endpoint(buf_str), Endpoint(buf_str))); + } + counters = range_lock_mgr->GetStatus(); + ASSERT_GT(counters.escalation_count, 0); + + // Check that lock escalation was not performed across escalation barriers: + // Use another txn to acquire locks near the barriers. 
+  auto txn2 = NewTxn();
+  range_lock_mgr->SetMaxLockMemory(500000);
+  for (int i = 100; i < N; i += 100) {
+    std::ostringstream buf;
+    buf << std::setw(4) << std::setfill('0') << i - 1 << "-a";
+    std::string buf_str = buf.str();
+    // Check that we CAN get a lock near the escalation barrier
+    ASSERT_OK(txn2->GetRangeLock(cf, Endpoint(buf_str), Endpoint(buf_str)));
+  }
+
+  txn->Rollback();
+  txn2->Rollback();
+  delete txn;
+  delete txn2;
+}
+
+#endif
+
+TEST_F(RangeLockingTest, LockWaitCount) {
+  TransactionOptions txn_options;
+  auto cf = db->DefaultColumnFamily();
+  txn_options.lock_timeout = 50;
+  Transaction* txn0 = db->BeginTransaction(WriteOptions(), txn_options);
+  Transaction* txn1 = db->BeginTransaction(WriteOptions(), txn_options);
+
+  // Get a range lock
+  ASSERT_OK(txn0->GetRangeLock(cf, Endpoint("a"), Endpoint("c")));
+
+  uint64_t lock_waits1 = range_lock_mgr->GetStatus().lock_wait_count;
+  // Attempt to get a conflicting lock
+  auto s = txn1->GetRangeLock(cf, Endpoint("b"), Endpoint("z"));
+  ASSERT_TRUE(s.IsTimedOut());
+
+  // Check that the counter was incremented
+  uint64_t lock_waits2 = range_lock_mgr->GetStatus().lock_wait_count;
+  ASSERT_EQ(lock_waits1 + 1, lock_waits2);
+
+  txn0->Rollback();
+  txn1->Rollback();
+
+  delete txn0;
+  delete txn1;
+}
+
+TEST_F(RangeLockingTest, LockWaiteeAccess) {
+  TransactionOptions txn_options;
+  auto cf = db->DefaultColumnFamily();
+  txn_options.lock_timeout = 60;
+  Transaction* txn0 = db->BeginTransaction(WriteOptions(), txn_options);
+  Transaction* txn1 = db->BeginTransaction(WriteOptions(), txn_options);
+
+  // Get a range lock
+  ASSERT_OK(txn0->GetRangeLock(cf, Endpoint("a"), Endpoint("c")));
+
+  std::atomic<bool> reached(false);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "RangeTreeLockManager::TryRangeLock:EnterWaitingTxn", [&](void* /*arg*/) {
+        reached.store(true);
+        std::this_thread::sleep_for(std::chrono::milliseconds(2000));
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  port::Thread t([&]() {
+    // Attempt to get a conflicting lock
+    auto s = txn1->GetRangeLock(cf, Endpoint("b"), Endpoint("z"));
+    ASSERT_TRUE(s.ok());
+    txn1->Rollback();
+  });
+
+  while (!reached.load()) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // Release locks and free the transaction
+  txn0->Rollback();
+  delete txn0;
+
+  t.join();
+
+  delete txn1;
+}
+
+void PointLockManagerTestExternalSetup(PointLockManagerTest* self) {
+  self->env_ = Env::Default();
+  self->db_dir_ = test::PerThreadDBPath("point_lock_manager_test");
+  ASSERT_OK(self->env_->CreateDir(self->db_dir_));
+
+  Options opt;
+  opt.create_if_missing = true;
+  TransactionDBOptions txn_opt;
+  txn_opt.transaction_lock_timeout = 0;
+
+  auto mutex_factory = std::make_shared<TransactionDBMutexFactoryImpl>();
+  self->locker_.reset(NewRangeLockManager(mutex_factory)->getLockManager());
+  std::shared_ptr<RangeLockManagerHandle> range_lock_mgr =
+      std::dynamic_pointer_cast<RangeLockManagerHandle>(self->locker_);
+  txn_opt.lock_mgr_handle = range_lock_mgr;
+
+  ASSERT_OK(TransactionDB::Open(opt, txn_opt, self->db_dir_, &self->db_));
+  self->wait_sync_point_name_ = "RangeTreeLockManager::TryRangeLock:WaitingTxn";
+}
+
+INSTANTIATE_TEST_CASE_P(RangeLockManager, AnyLockManagerTest,
+                        ::testing::Values(PointLockManagerTestExternalSetup));
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+
::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else // OS_WIN + +#include +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "skipped as Range Locking is not supported on Windows\n"); + return 0; +} + +#endif // OS_WIN + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "skipped as transactions are not supported in rocksdb_lite\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.AGPLv3 b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.AGPLv3 new file mode 100644 index 000000000..dba13ed2d --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.AGPLv3 @@ -0,0 +1,661 @@ + GNU AFFERO GENERAL PUBLIC LICENSE + Version 3, 19 November 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU Affero General Public License is a free, copyleft license for +software and other kinds of works, specifically designed to ensure +cooperation with the community in the case of network server software. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +our General Public Licenses are intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + Developers that use our General Public Licenses protect your rights +with two steps: (1) assert copyright on the software, and (2) offer +you this License which gives you legal permission to copy, distribute +and/or modify the software. + + A secondary benefit of defending all users' freedom is that +improvements made in alternate versions of the program, if they +receive widespread use, become available for other developers to +incorporate. Many developers of free software are heartened and +encouraged by the resulting cooperation. However, in the case of +software used on network servers, this result may fail to come about. +The GNU General Public License permits making a modified version and +letting the public access it on a server without ever releasing its +source code to the public. + + The GNU Affero General Public License is designed specifically to +ensure that, in such cases, the modified source code becomes available +to the community. It requires the operator of a network server to +provide the source code of the modified version running there to the +users of that server. Therefore, public use of a modified version, on +a publicly accessible server, gives the public access to the source +code of the modified version. + + An older license, called the Affero General Public License and +published by Affero, was designed to accomplish similar goals. This is +a different license, not a version of the Affero GPL, but Affero has +released a new version of the Affero GPL which permits relicensing under +this license. 
+ + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU Affero General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. 
However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. 
+ + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. 
+ + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. 
Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. 
+ + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. 
For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Remote Network Interaction; Use with the GNU General Public License. + + Notwithstanding any other provision of this License, if you modify the +Program, your modified version must prominently offer all users +interacting with it remotely through a computer network (if your version +supports such interaction) an opportunity to receive the Corresponding +Source of your version by providing access to the Corresponding Source +from a network server at no charge, through some standard or customary +means of facilitating copying of software. This Corresponding Source +shall include the Corresponding Source for any work covered by version 3 +of the GNU General Public License that is incorporated pursuant to the +following paragraph. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the work with which it is combined will remain governed by version +3 of the GNU General Public License. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU Affero General Public License from time to time. Such new versions +will be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU Affero General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. 
If the Program does not specify a version number of the +GNU Affero General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU Affero General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . 
+ +Also add information on how to contact you by electronic and paper mail. + + If your software can interact with users remotely through a computer +network, you should also make sure that it provides a way for users to +get its source. For example, if your program is a web application, its +interface could display a "Source" link that leads users to an archive +of the code. There are many ways you could offer source, and different +solutions will be better for different programs; see section 13 for the +specific requirements. + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU AGPL, see +. diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.APACHEv2 b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.APACHEv2 new file mode 100644 index 000000000..ecbfc770f --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.APACHEv2 @@ -0,0 +1,174 @@ +Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.GPLv2 b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.GPLv2 new file mode 100644 index 000000000..d511905c1 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.GPLv2 @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. 
+ + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. 
+ + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/README b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/README new file mode 100644 index 000000000..2ea86bf46 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/README @@ -0,0 +1,13 @@ +The files in this directory originally come from +https://github.com/percona/PerconaFT/. + +This directory only includes the "locktree" part of PerconaFT, and its +dependencies. + +The following modifications were made: +- Make locktree usable outside of PerconaFT library +- Add shared read-only lock support + +The files named *_subst.* are substitutes of the PerconaFT's files, they +contain replacements of PerconaFT's functionality. 
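
For orientation, the locktree sources added below expose a small range-locking core: a toku::concurrent_tree stores non-overlapping key ranges, and callers go through a locked_keyrange in a fixed sequence (prepare, acquire, insert/remove, release), as documented in concurrent_tree.h further down. The following is only an illustrative sketch of that call sequence, not part of the imported sources; it assumes TXNID is a plain 64-bit transaction id and that a toku_fill_dbt()-style helper from util/dbt.h populates a borrowed DBT.

    // Illustrative sketch only; not part of this patch.
    // Assumptions: TXNID is a 64-bit id; toku_fill_dbt() fills a borrowed DBT.
    #include "locktree/concurrent_tree.h"

    static int bytewise_cmp(void * /*arg*/, const DBT *a, const DBT *b) {
      return toku_builtin_compare_fun(a, b);  // byte-wise key comparison
    }

    void range_lock_sketch(void) {
      toku::comparator cmp;
      cmp.create(bytewise_cmp, /*cmp_arg=*/nullptr);

      toku::concurrent_tree tree;
      tree.create(&cmp);                      // root node always exists

      DBT left, right;                        // borrowed endpoints, no copies
      toku_fill_dbt(&left, "a", 1);           // assumed helper from util/dbt.h
      toku_fill_dbt(&right, "m", 1);

      toku::keyrange range;
      range.create(&left, &right);

      toku::concurrent_tree::locked_keyrange lkr;
      lkr.prepare(&tree);                     // serialization point: locks the root
      lkr.acquire(range);                     // narrow to the overlapping subtree
      lkr.insert(range, /*txnid=*/42, /*is_shared=*/false);
      lkr.release();

      // Later, the sole owner drops its range so the tree can be destroyed.
      lkr.prepare(&tree);
      lkr.acquire(range);
      lkr.remove(range, /*txnid=*/42);
      lkr.release();

      tree.destroy();                         // requires: tree is empty
      cmp.destroy();
    }
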
+ diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/db.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/db.h new file mode 100644 index 000000000..5aa826c8e --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/db.h @@ -0,0 +1,76 @@ +#ifndef _DB_H +#define _DB_H + +#include +#include + +typedef struct __toku_dbt DBT; + +// port: this is currently not used +struct simple_dbt { + uint32_t len; + void *data; +}; + +// engine status info +// engine status is passed to handlerton as an array of +// TOKU_ENGINE_STATUS_ROW_S[] +typedef enum { + STATUS_FS_STATE = 0, // interpret as file system state (redzone) enum + STATUS_UINT64, // interpret as uint64_t + STATUS_CHARSTR, // interpret as char * + STATUS_UNIXTIME, // interpret as time_t + STATUS_TOKUTIME, // interpret as tokutime_t + STATUS_PARCOUNT, // interpret as PARTITIONED_COUNTER + STATUS_DOUBLE // interpret as double +} toku_engine_status_display_type; + +typedef enum { + TOKU_ENGINE_STATUS = (1ULL << 0), // Include when asking for engine status + TOKU_GLOBAL_STATUS = + (1ULL << 1), // Include when asking for information_schema.global_status +} toku_engine_status_include_type; + +typedef struct __toku_engine_status_row { + const char *keyname; // info schema key, should not change across revisions + // without good reason + const char + *columnname; // column for mysql, e.g. information_schema.global_status. + // TOKUDB_ will automatically be prefixed. + const char *legend; // the text that will appear at user interface + toku_engine_status_display_type type; // how to interpret the value + toku_engine_status_include_type + include; // which kinds of callers should get read this row? + union { + double dnum; + uint64_t num; + const char *str; + char datebuf[26]; + struct partitioned_counter *parcount; + } value; +} * TOKU_ENGINE_STATUS_ROW, TOKU_ENGINE_STATUS_ROW_S; + +#define DB_BUFFER_SMALL -30999 +#define DB_LOCK_DEADLOCK -30995 +#define DB_LOCK_NOTGRANTED -30994 +#define DB_NOTFOUND -30989 +#define DB_KEYEXIST -30996 +#define DB_DBT_MALLOC 8 +#define DB_DBT_REALLOC 64 +#define DB_DBT_USERMEM 256 + +/* PerconaFT specific error codes */ +#define TOKUDB_OUT_OF_LOCKS -100000 + +typedef void (*lock_wait_callback)(void *arg, uint64_t requesting_txnid, + uint64_t blocking_txnid); + +struct __toku_dbt { + void *data; + size_t size; + size_t ulen; + // One of DB_DBT_XXX flags + uint32_t flags; +}; + +#endif diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/comparator.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/comparator.h new file mode 100644 index 000000000..718efc623 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/comparator.h @@ -0,0 +1,138 @@ +/* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. 
If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include + +#include "../db.h" +#include "../portability/memory.h" +#include "../util/dbt.h" + +typedef int (*ft_compare_func)(void *arg, const DBT *a, const DBT *b); + +int toku_keycompare(const void *key1, size_t key1len, const void *key2, + size_t key2len); + +int toku_builtin_compare_fun(const DBT *, const DBT *) + __attribute__((__visibility__("default"))); + +namespace toku { + +// a comparator object encapsulates the data necessary for +// comparing two keys in a fractal tree. it further understands +// that points may be positive or negative infinity. + +class comparator { + void init(ft_compare_func cmp, void *cmp_arg, uint8_t memcmp_magic) { + _cmp = cmp; + _cmp_arg = cmp_arg; + _memcmp_magic = memcmp_magic; + } + + public: + // This magic value is reserved to mean that the magic has not been set. + static const uint8_t MEMCMP_MAGIC_NONE = 0; + + void create(ft_compare_func cmp, void *cmp_arg, + uint8_t memcmp_magic = MEMCMP_MAGIC_NONE) { + init(cmp, cmp_arg, memcmp_magic); + } + + // inherit the attributes of another comparator, but keep our own + // copy of fake_db that is owned separately from the one given. + void inherit(const comparator &cmp) { + invariant_notnull(cmp._cmp); + init(cmp._cmp, cmp._cmp_arg, cmp._memcmp_magic); + } + + // like inherit, but doesn't require that the this comparator + // was already created + void create_from(const comparator &cmp) { inherit(cmp); } + + void destroy() {} + + ft_compare_func get_compare_func() const { return _cmp; } + + uint8_t get_memcmp_magic() const { return _memcmp_magic; } + + bool valid() const { return _cmp != nullptr; } + + inline bool dbt_has_memcmp_magic(const DBT *dbt) const { + return *reinterpret_cast(dbt->data) == _memcmp_magic; + } + + int operator()(const DBT *a, const DBT *b) const { + if (__builtin_expect(toku_dbt_is_infinite(a) || toku_dbt_is_infinite(b), + 0)) { + return toku_dbt_infinite_compare(a, b); + } else if (_memcmp_magic != MEMCMP_MAGIC_NONE + // If `a' has the memcmp magic.. + && dbt_has_memcmp_magic(a) + // ..then we expect `b' to also have the memcmp magic + && __builtin_expect(dbt_has_memcmp_magic(b), 1)) { + assert(0); // psergey: this branch should not be taken. 
+ return toku_builtin_compare_fun(a, b); + } else { + // yikes, const sadness here + return _cmp(_cmp_arg, a, b); + } + } + + private: + ft_compare_func _cmp; + void *_cmp_arg; + + uint8_t _memcmp_magic; +}; + +} /* namespace toku */ diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/ft-status.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/ft-status.h new file mode 100644 index 000000000..1b4511172 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/ft-status.h @@ -0,0 +1,102 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." 
+ +#pragma once + +#include "../db.h" +#include "../portability/toku_race_tools.h" +#include "../util/status.h" + +// +// Lock Tree Manager statistics +// +class LTM_STATUS_S { + public: + enum { + LTM_SIZE_CURRENT = 0, + LTM_SIZE_LIMIT, + LTM_ESCALATION_COUNT, + LTM_ESCALATION_TIME, + LTM_ESCALATION_LATEST_RESULT, + LTM_NUM_LOCKTREES, + LTM_LOCK_REQUESTS_PENDING, + LTM_STO_NUM_ELIGIBLE, + LTM_STO_END_EARLY_COUNT, + LTM_STO_END_EARLY_TIME, + LTM_WAIT_COUNT, + LTM_WAIT_TIME, + LTM_LONG_WAIT_COUNT, + LTM_LONG_WAIT_TIME, + LTM_TIMEOUT_COUNT, + LTM_WAIT_ESCALATION_COUNT, + LTM_WAIT_ESCALATION_TIME, + LTM_LONG_WAIT_ESCALATION_COUNT, + LTM_LONG_WAIT_ESCALATION_TIME, + LTM_STATUS_NUM_ROWS // must be last + }; + + void init(void); + void destroy(void); + + TOKU_ENGINE_STATUS_ROW_S status[LTM_STATUS_NUM_ROWS]; + + private: + bool m_initialized = false; +}; +typedef LTM_STATUS_S* LTM_STATUS; +extern LTM_STATUS_S ltm_status; + +#define LTM_STATUS_VAL(x) ltm_status.status[LTM_STATUS_S::x].value.num + +void toku_status_init(void); // just call ltm_status.init(); +void toku_status_destroy(void); // just call ltm_status.destroy(); diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc new file mode 100644 index 000000000..5110cd482 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc @@ -0,0 +1,139 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ifndef ROCKSDB_LITE +#ifndef OS_WIN +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include "concurrent_tree.h" + +// PORT #include +namespace toku { + +void concurrent_tree::create(const comparator *cmp) { + // start with an empty root node. we do this instead of + // setting m_root to null so there's always a root to lock + m_root.create_root(cmp); +} + +void concurrent_tree::destroy(void) { m_root.destroy_root(); } + +bool concurrent_tree::is_empty(void) { return m_root.is_empty(); } + +uint64_t concurrent_tree::get_insertion_memory_overhead(void) { + return sizeof(treenode); +} + +void concurrent_tree::locked_keyrange::prepare(concurrent_tree *tree) { + // the first step in acquiring a locked keyrange is locking the root + treenode *const root = &tree->m_root; + m_tree = tree; + m_subtree = root; + m_range = keyrange::get_infinite_range(); + root->mutex_lock(); +} + +void concurrent_tree::locked_keyrange::acquire(const keyrange &range) { + treenode *const root = &m_tree->m_root; + + treenode *subtree; + if (root->is_empty() || root->range_overlaps(range)) { + subtree = root; + } else { + // we do not have a precomputed comparison hint, so pass null + const keyrange::comparison *cmp_hint = nullptr; + subtree = root->find_node_with_overlapping_child(range, cmp_hint); + } + + // subtree is locked. it will be unlocked when this is release()'d + invariant_notnull(subtree); + m_range = range; + m_subtree = subtree; +} + +bool concurrent_tree::locked_keyrange::add_shared_owner(const keyrange &range, + TXNID new_owner) { + return m_subtree->insert(range, new_owner, /*is_shared*/ true); +} + +void concurrent_tree::locked_keyrange::release(void) { + m_subtree->mutex_unlock(); +} + +void concurrent_tree::locked_keyrange::insert(const keyrange &range, + TXNID txnid, bool is_shared) { + // empty means no children, and only the root should ever be empty + if (m_subtree->is_empty()) { + m_subtree->set_range_and_txnid(range, txnid, is_shared); + } else { + m_subtree->insert(range, txnid, is_shared); + } +} + +void concurrent_tree::locked_keyrange::remove(const keyrange &range, + TXNID txnid) { + invariant(!m_subtree->is_empty()); + treenode *new_subtree = m_subtree->remove(range, txnid); + // if removing range changed the root of the subtree, + // then the subtree must be the root of the entire tree. + if (new_subtree == nullptr) { + invariant(m_subtree->is_root()); + invariant(m_subtree->is_empty()); + } +} + +void concurrent_tree::locked_keyrange::remove_all(void) { + m_subtree->recursive_remove(); +} + +} /* namespace toku */ +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.h new file mode 100644 index 000000000..e1bfb86c5 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.h @@ -0,0 +1,174 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. 
+ + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include "../ft/comparator.h" +#include "keyrange.h" +#include "treenode.h" + +namespace toku { + +// A concurrent_tree stores non-overlapping ranges. +// Access to disjoint parts of the tree usually occurs concurrently. + +class concurrent_tree { + public: + // A locked_keyrange gives you exclusive access to read and write + // operations that occur on any keys in that range. You only have + // the right to operate on keys in that range or keys that were read + // from the keyrange using iterate() + // + // Access model: + // - user prepares a locked keyrange. all threads serialize behind prepare(). + // - user breaks the serialzation point by acquiring a range, or releasing. + // - one thread operates on a certain locked_keyrange object at a time. + // - when the thread is finished, it releases + + class locked_keyrange { + public: + // effect: prepare to acquire a locked keyrange over the given + // concurrent_tree, preventing other threads from preparing + // until this thread either does acquire() or release(). + // note: operations performed on a prepared keyrange are equivalent + // to ones performed on an acquired keyrange over -inf, +inf. + // rationale: this provides the user with a serialization point for + // descending + // or modifying the the tree. it also proives a convenient way of + // doing serializable operations on the tree. + // There are two valid sequences of calls: + // - prepare, acquire, [operations], release + // - prepare, [operations],release + void prepare(concurrent_tree *tree); + + // requires: the locked keyrange was prepare()'d + // effect: acquire a locked keyrange over the given concurrent_tree. 
+ // the locked keyrange represents the range of keys overlapped + // by the given range + void acquire(const keyrange &range); + + // effect: releases a locked keyrange and the mutex it holds + void release(void); + + // effect: iterate over each range this locked_keyrange represents, + // calling function->fn() on each node's keyrange and txnid + // until there are no more or the function returns false + template + void iterate(F *function) const { + // if the subtree is non-empty, traverse it by calling the given + // function on each range, txnid pair found that overlaps. + if (!m_subtree->is_empty()) { + m_subtree->traverse_overlaps(m_range, function); + } + } + + // Adds another owner to the lock on the specified keyrange. + // requires: the keyrange contains one treenode whose bounds are + // exactly equal to the specifed range (no sub/supersets) + bool add_shared_owner(const keyrange &range, TXNID new_owner); + + // inserts the given range into the tree, with an associated txnid. + // requires: range does not overlap with anything in this locked_keyrange + // rationale: caller is responsible for only inserting unique ranges + void insert(const keyrange &range, TXNID txnid, bool is_shared); + + // effect: removes the given range from the tree. + // - txnid=TXNID_ANY means remove the range no matter what its + // owners are + // - Other value means remove the specified txnid from + // ownership (if the range has other owners, it will remain + // in the tree) + // requires: range exists exactly in this locked_keyrange + // rationale: caller is responsible for only removing existing ranges + void remove(const keyrange &range, TXNID txnid); + + // effect: removes all of the keys represented by this locked keyrange + // rationale: we'd like a fast way to empty out a tree + void remove_all(void); + + private: + // the concurrent tree this locked keyrange is for + concurrent_tree *m_tree; + + // the range of keys this locked keyrange represents + keyrange m_range; + + // the subtree under which all overlapping ranges exist + treenode *m_subtree; + + friend class concurrent_tree_unit_test; + }; + + // effect: initialize the tree to an empty state + void create(const comparator *cmp); + + // effect: destroy the tree. + // requires: tree is empty + void destroy(void); + + // returns: true iff the tree is empty + bool is_empty(void); + + // returns: the memory overhead of a single insertion into the tree + static uint64_t get_insertion_memory_overhead(void); + + private: + // the root needs to always exist so there's a lock to grab + // even if the tree is empty. that's why we store a treenode + // here and not a pointer to one. + treenode m_root; + + friend class concurrent_tree_unit_test; +}; + +} /* namespace toku */ diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc new file mode 100644 index 000000000..e50ace5a9 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc @@ -0,0 +1,222 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ifndef ROCKSDB_LITE +#ifndef OS_WIN +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. 
+ + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include "keyrange.h" + +#include "../util/dbt.h" + +namespace toku { + +// create a keyrange by borrowing the left and right dbt +// pointers. no memory is copied. no checks for infinity needed. +void keyrange::create(const DBT *left, const DBT *right) { + init_empty(); + m_left_key = left; + m_right_key = right; +} + +// destroy the key copies. if they were never set, then destroy does nothing. +void keyrange::destroy(void) { + toku_destroy_dbt(&m_left_key_copy); + toku_destroy_dbt(&m_right_key_copy); +} + +// create a keyrange by copying the keys from the given range. +void keyrange::create_copy(const keyrange &range) { + // start with an initialized, empty range + init_empty(); + + // optimize the case where the left and right keys are the same. + // we'd like to only have one copy of the data. + if (toku_dbt_equals(range.get_left_key(), range.get_right_key())) { + set_both_keys(range.get_left_key()); + } else { + // replace our empty left and right keys with + // copies of the range's left and right keys + replace_left_key(range.get_left_key()); + replace_right_key(range.get_right_key()); + } +} + +// extend this keyrange by choosing the leftmost and rightmost +// endpoints between this range and the given. replaced keys +// in this range are freed and inherited keys are copied. +void keyrange::extend(const comparator &cmp, const keyrange &range) { + const DBT *range_left = range.get_left_key(); + const DBT *range_right = range.get_right_key(); + if (cmp(range_left, get_left_key()) < 0) { + replace_left_key(range_left); + } + if (cmp(range_right, get_right_key()) > 0) { + replace_right_key(range_right); + } +} + +// how much memory does this keyrange take? 
+// - the size of the left and right keys +// --- ignore the fact that we may have optimized the point case. +// it complicates things for little gain. +// - the size of the keyrange class itself +uint64_t keyrange::get_memory_size(void) const { + const DBT *left_key = get_left_key(); + const DBT *right_key = get_right_key(); + return left_key->size + right_key->size + sizeof(keyrange); +} + +// compare ranges. +keyrange::comparison keyrange::compare(const comparator &cmp, + const keyrange &range) const { + if (cmp(get_right_key(), range.get_left_key()) < 0) { + return comparison::LESS_THAN; + } else if (cmp(get_left_key(), range.get_right_key()) > 0) { + return comparison::GREATER_THAN; + } else if (cmp(get_left_key(), range.get_left_key()) == 0 && + cmp(get_right_key(), range.get_right_key()) == 0) { + return comparison::EQUALS; + } else { + return comparison::OVERLAPS; + } +} + +bool keyrange::overlaps(const comparator &cmp, const keyrange &range) const { + // equality is a stronger form of overlapping. + // so two ranges "overlap" if they're either equal or just overlapping. + comparison c = compare(cmp, range); + return c == comparison::EQUALS || c == comparison::OVERLAPS; +} + +keyrange keyrange::get_infinite_range(void) { + keyrange range; + range.create(toku_dbt_negative_infinity(), toku_dbt_positive_infinity()); + return range; +} + +void keyrange::init_empty(void) { + m_left_key = nullptr; + m_right_key = nullptr; + toku_init_dbt(&m_left_key_copy); + toku_init_dbt(&m_right_key_copy); + m_point_range = false; +} + +const DBT *keyrange::get_left_key(void) const { + if (m_left_key) { + return m_left_key; + } else { + return &m_left_key_copy; + } +} + +const DBT *keyrange::get_right_key(void) const { + if (m_right_key) { + return m_right_key; + } else { + return &m_right_key_copy; + } +} + +// copy the given once and set both the left and right pointers. +// optimization for point ranges, so the left and right ranges +// are not copied twice. +void keyrange::set_both_keys(const DBT *key) { + if (toku_dbt_is_infinite(key)) { + m_left_key = key; + m_right_key = key; + } else { + toku_clone_dbt(&m_left_key_copy, *key); + toku_copyref_dbt(&m_right_key_copy, m_left_key_copy); + } + m_point_range = true; +} + +// destroy the current left key. set and possibly copy the new one +void keyrange::replace_left_key(const DBT *key) { + // a little magic: + // + // if this is a point range, then the left and right keys share + // one copy of the data, and it lives in the left key copy. so + // if we're replacing the left key, move the real data to the + // right key copy instead of destroying it. now, the memory is + // owned by the right key and the left key may be replaced. + if (m_point_range) { + m_right_key_copy = m_left_key_copy; + } else { + toku_destroy_dbt(&m_left_key_copy); + } + + if (toku_dbt_is_infinite(key)) { + m_left_key = key; + } else { + toku_clone_dbt(&m_left_key_copy, *key); + m_left_key = nullptr; + } + m_point_range = false; +} + +// destroy the current right key. 
set and possibly copy the new one +void keyrange::replace_right_key(const DBT *key) { + toku_destroy_dbt(&m_right_key_copy); + if (toku_dbt_is_infinite(key)) { + m_right_key = key; + } else { + toku_clone_dbt(&m_right_key_copy, *key); + m_right_key = nullptr; + } + m_point_range = false; +} + +} /* namespace toku */ +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.h new file mode 100644 index 000000000..f9aeea0c4 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.h @@ -0,0 +1,141 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include "../ft/comparator.h" + +namespace toku { + +// A keyrange has a left and right key as endpoints. +// +// When a keyrange is created it owns no memory, but when it copies +// or extends another keyrange, it copies memory as necessary. This +// means it is cheap in the common case. + +class keyrange { + public: + // effect: constructor that borrows left and right key pointers. + // no memory is allocated or copied. + void create(const DBT *left_key, const DBT *right_key); + + // effect: constructor that allocates and copies another keyrange's points. 
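An illustrative aside, not from the PerconaFT sources: the comparison semantics this header declares further down (LESS_THAN, GREATER_THAN, EQUALS, OVERLAPS) can be modeled on plain integer endpoints instead of DBTs and a comparator. Every name below is a hypothetical stand-in; the ordering of checks mirrors keyrange::compare() shown earlier in keyrange.cc.

#include <cassert>

enum class interval_cmp { LESS_THAN, GREATER_THAN, EQUALS, OVERLAPS };

// Compares closed interval a = [a_left, a_right] against b, in the same
// order of checks as keyrange::compare(): strictly-left, strictly-right,
// exact match, otherwise overlapping.
static interval_cmp compare_intervals(int a_left, int a_right,
                                      int b_left, int b_right) {
  if (a_right < b_left) return interval_cmp::LESS_THAN;     // a strictly left of b
  if (a_left > b_right) return interval_cmp::GREATER_THAN;  // a strictly right of b
  if (a_left == b_left && a_right == b_right) return interval_cmp::EQUALS;
  return interval_cmp::OVERLAPS;  // some endpoint falls inside the other range
}

int main() {
  assert(compare_intervals(1, 3, 5, 9) == interval_cmp::LESS_THAN);
  assert(compare_intervals(5, 9, 1, 3) == interval_cmp::GREATER_THAN);
  assert(compare_intervals(2, 6, 2, 6) == interval_cmp::EQUALS);
  assert(compare_intervals(2, 6, 5, 9) == interval_cmp::OVERLAPS);
  return 0;
}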
+ void create_copy(const keyrange &range); + + // effect: destroys the keyrange, freeing any allocated memory + void destroy(void); + + // effect: extends the keyrange by choosing the leftmost and rightmost + // endpoints from this range and the given range. + // replaced keys in this range are freed, new keys are copied. + void extend(const comparator &cmp, const keyrange &range); + + // returns: the amount of memory this keyrange takes. does not account + // for point optimizations or malloc overhead. + uint64_t get_memory_size(void) const; + + // returns: pointer to the left key of this range + const DBT *get_left_key(void) const; + + // returns: pointer to the right key of this range + const DBT *get_right_key(void) const; + + // two ranges are either equal, lt, gt, or overlapping + enum comparison { EQUALS, LESS_THAN, GREATER_THAN, OVERLAPS }; + + // effect: compares this range to the given range + // returns: LESS_THAN if given range is strictly to the left + // GREATER_THAN if given range is strictly to the right + // EQUALS if given range has the same left and right endpoints + // OVERLAPS if at least one of the given range's endpoints falls + // between this range's endpoints + comparison compare(const comparator &cmp, const keyrange &range) const; + + // returns: true if the range and the given range are equal or overlapping + bool overlaps(const comparator &cmp, const keyrange &range) const; + + // returns: a keyrange representing -inf, +inf + static keyrange get_infinite_range(void); + + private: + // some keys should be copied, some keys should not be. + // + // to support both, we use two DBTs for copies and two pointers + // for temporaries. the access rule is: + // - if a pointer is non-null, then it reprsents the key. + // - otherwise the pointer is null, and the key is in the copy. + DBT m_left_key_copy; + DBT m_right_key_copy; + const DBT *m_left_key; + const DBT *m_right_key; + + // if this range is a point range, then m_left_key == m_right_key + // and the actual data is stored exactly once in m_left_key_copy. + bool m_point_range; + + // effect: initializes a keyrange to be empty + void init_empty(void); + + // effect: copies the given key once into the left key copy + // and sets the right key copy to share the left. + // rationale: optimization for point ranges to only do one malloc + void set_both_keys(const DBT *key); + + // effect: destroys the current left key. sets and copies the new one. + void replace_left_key(const DBT *key); + + // effect: destroys the current right key. sets and copies the new one. + void replace_right_key(const DBT *key); +}; + +} /* namespace toku */ diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc new file mode 100644 index 000000000..3d217be70 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc @@ -0,0 +1,527 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2: +#ifndef ROCKSDB_LITE +#ifndef OS_WIN +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. 
+ + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include "lock_request.h" + +#include "../portability/toku_race_tools.h" +#include "../portability/txn_subst.h" +#include "../util/dbt.h" +#include "locktree.h" + +namespace toku { + +// initialize a lock request's internals +void lock_request::create(toku_external_mutex_factory_t mutex_factory) { + m_txnid = TXNID_NONE; + m_conflicting_txnid = TXNID_NONE; + m_start_time = 0; + m_left_key = nullptr; + m_right_key = nullptr; + toku_init_dbt(&m_left_key_copy); + toku_init_dbt(&m_right_key_copy); + + m_type = type::UNKNOWN; + m_lt = nullptr; + + m_complete_r = 0; + m_state = state::UNINITIALIZED; + m_info = nullptr; + + // psergey-todo: this condition is for interruptible wait + // note: moved to here from lock_request::create: + toku_external_cond_init(mutex_factory, &m_wait_cond); + + m_start_test_callback = nullptr; + m_start_before_pending_test_callback = nullptr; + m_retry_test_callback = nullptr; +} + +// destroy a lock request. +void lock_request::destroy(void) { + invariant(m_state != state::PENDING); + invariant(m_state != state::DESTROYED); + m_state = state::DESTROYED; + toku_destroy_dbt(&m_left_key_copy); + toku_destroy_dbt(&m_right_key_copy); + toku_external_cond_destroy(&m_wait_cond); +} + +// set the lock request parameters. this API allows a lock request to be reused. +void lock_request::set(locktree *lt, TXNID txnid, const DBT *left_key, + const DBT *right_key, lock_request::type lock_type, + bool big_txn, void *extra) { + invariant(m_state != state::PENDING); + m_lt = lt; + + m_txnid = txnid; + m_left_key = left_key; + m_right_key = right_key; + toku_destroy_dbt(&m_left_key_copy); + toku_destroy_dbt(&m_right_key_copy); + m_type = lock_type; + m_state = state::INITIALIZED; + m_info = lt ? 
lt->get_lock_request_info() : nullptr; + m_big_txn = big_txn; + m_extra = extra; +} + +// get rid of any stored left and right key copies and +// replace them with copies of the given left and right key +void lock_request::copy_keys() { + if (!toku_dbt_is_infinite(m_left_key)) { + toku_clone_dbt(&m_left_key_copy, *m_left_key); + m_left_key = &m_left_key_copy; + } + if (!toku_dbt_is_infinite(m_right_key)) { + toku_clone_dbt(&m_right_key_copy, *m_right_key); + m_right_key = &m_right_key_copy; + } +} + +// what are the conflicts for this pending lock request? +void lock_request::get_conflicts(txnid_set *conflicts) { + invariant(m_state == state::PENDING); + const bool is_write_request = m_type == type::WRITE; + m_lt->get_conflicts(is_write_request, m_txnid, m_left_key, m_right_key, + conflicts); +} + +// build a wait-for-graph for this lock request and the given conflict set +// for each transaction B that blocks A's lock request +// if B is blocked then +// add (A,T) to the WFG and if B is new, fill in the WFG from B +void lock_request::build_wait_graph(wfg *wait_graph, + const txnid_set &conflicts) { + uint32_t num_conflicts = conflicts.size(); + for (uint32_t i = 0; i < num_conflicts; i++) { + TXNID conflicting_txnid = conflicts.get(i); + lock_request *conflicting_request = find_lock_request(conflicting_txnid); + invariant(conflicting_txnid != m_txnid); + invariant(conflicting_request != this); + if (conflicting_request) { + bool already_exists = wait_graph->node_exists(conflicting_txnid); + wait_graph->add_edge(m_txnid, conflicting_txnid); + if (!already_exists) { + // recursively build the wait for graph rooted at the conflicting + // request, given its set of lock conflicts. + txnid_set other_conflicts; + other_conflicts.create(); + conflicting_request->get_conflicts(&other_conflicts); + conflicting_request->build_wait_graph(wait_graph, other_conflicts); + other_conflicts.destroy(); + } + } + } +} + +// returns: true if the current set of lock requests contains +// a deadlock, false otherwise. +bool lock_request::deadlock_exists(const txnid_set &conflicts) { + wfg wait_graph; + wait_graph.create(); + + build_wait_graph(&wait_graph, conflicts); + + std::function reporter; + if (m_deadlock_cb) { + reporter = [this](TXNID a) { + lock_request *req = find_lock_request(a); + if (req) { + m_deadlock_cb(req->m_txnid, (req->m_type == lock_request::WRITE), + req->m_left_key, req->m_right_key); + } + }; + } + + bool deadlock = wait_graph.cycle_exists_from_txnid(m_txnid, reporter); + wait_graph.destroy(); + return deadlock; +} + +// try to acquire a lock described by this lock request. +int lock_request::start(void) { + int r; + + txnid_set conflicts; + conflicts.create(); + if (m_type == type::WRITE) { + r = m_lt->acquire_write_lock(m_txnid, m_left_key, m_right_key, &conflicts, + m_big_txn); + } else { + invariant(m_type == type::READ); + r = m_lt->acquire_read_lock(m_txnid, m_left_key, m_right_key, &conflicts, + m_big_txn); + } + + // if the lock is not granted, save it to the set of lock requests + // and check for a deadlock. 
if there is one, complete it as failed + if (r == DB_LOCK_NOTGRANTED) { + copy_keys(); + m_state = state::PENDING; + m_start_time = toku_current_time_microsec() / 1000; + m_conflicting_txnid = conflicts.get(0); + if (m_start_before_pending_test_callback) + m_start_before_pending_test_callback(); + toku_external_mutex_lock(&m_info->mutex); + insert_into_lock_requests(); + if (deadlock_exists(conflicts)) { + remove_from_lock_requests(); + r = DB_LOCK_DEADLOCK; + } + toku_external_mutex_unlock(&m_info->mutex); + if (m_start_test_callback) m_start_test_callback(); // test callback + } + + if (r != DB_LOCK_NOTGRANTED) { + complete(r); + } + + conflicts.destroy(); + return r; +} + +// sleep on the lock request until it becomes resolved or the wait time has +// elapsed. +int lock_request::wait(uint64_t wait_time_ms) { + return wait(wait_time_ms, 0, nullptr); +} + +int lock_request::wait(uint64_t wait_time_ms, uint64_t killed_time_ms, + int (*killed_callback)(void), + void (*lock_wait_callback)(void *, lock_wait_infos *), + void *callback_arg) { + uint64_t t_now = toku_current_time_microsec(); + uint64_t t_start = t_now; + uint64_t t_end = t_start + wait_time_ms * 1000; + + toku_external_mutex_lock(&m_info->mutex); + + // check again, this time locking out other retry calls + if (m_state == state::PENDING) { + lock_wait_infos conflicts_collector; + retry(&conflicts_collector); + if (m_state == state::PENDING) { + report_waits(&conflicts_collector, lock_wait_callback, callback_arg); + } + } + + while (m_state == state::PENDING) { + // check if this thread is killed + if (killed_callback && killed_callback()) { + remove_from_lock_requests(); + complete(DB_LOCK_NOTGRANTED); + continue; + } + + // compute the time until we should wait + uint64_t t_wait; + if (killed_time_ms == 0) { + t_wait = t_end; + } else { + t_wait = t_now + killed_time_ms * 1000; + if (t_wait > t_end) t_wait = t_end; + } + + int r = toku_external_cond_timedwait(&m_wait_cond, &m_info->mutex, + (int64_t)(t_wait - t_now)); + invariant(r == 0 || r == ETIMEDOUT); + + t_now = toku_current_time_microsec(); + if (m_state == state::PENDING && (t_now >= t_end)) { + m_info->counters.timeout_count += 1; + + // if we're still pending and we timed out, then remove our + // request from the set of lock requests and fail. 
+ remove_from_lock_requests(); + + // complete sets m_state to COMPLETE, breaking us out of the loop + complete(DB_LOCK_NOTGRANTED); + } + } + + uint64_t t_real_end = toku_current_time_microsec(); + uint64_t duration = t_real_end - t_start; + m_info->counters.wait_count += 1; + m_info->counters.wait_time += duration; + if (duration >= 1000000) { + m_info->counters.long_wait_count += 1; + m_info->counters.long_wait_time += duration; + } + toku_external_mutex_unlock(&m_info->mutex); + + invariant(m_state == state::COMPLETE); + return m_complete_r; +} + +// complete this lock request with the given return value +void lock_request::complete(int complete_r) { + m_complete_r = complete_r; + m_state = state::COMPLETE; +} + +const DBT *lock_request::get_left_key(void) const { return m_left_key; } + +const DBT *lock_request::get_right_key(void) const { return m_right_key; } + +TXNID lock_request::get_txnid(void) const { return m_txnid; } + +uint64_t lock_request::get_start_time(void) const { return m_start_time; } + +TXNID lock_request::get_conflicting_txnid(void) const { + return m_conflicting_txnid; +} + +int lock_request::retry(lock_wait_infos *conflicts_collector) { + invariant(m_state == state::PENDING); + int r; + txnid_set conflicts; + conflicts.create(); + + if (m_type == type::WRITE) { + r = m_lt->acquire_write_lock(m_txnid, m_left_key, m_right_key, &conflicts, + m_big_txn); + } else { + r = m_lt->acquire_read_lock(m_txnid, m_left_key, m_right_key, &conflicts, + m_big_txn); + } + + // if the acquisition succeeded then remove ourselves from the + // set of lock requests, complete, and signal the waiting thread. + if (r == 0) { + remove_from_lock_requests(); + complete(r); + if (m_retry_test_callback) m_retry_test_callback(); // test callback + toku_external_cond_broadcast(&m_wait_cond); + } else { + m_conflicting_txnid = conflicts.get(0); + add_conflicts_to_waits(&conflicts, conflicts_collector); + } + conflicts.destroy(); + + return r; +} + +void lock_request::retry_all_lock_requests( + locktree *lt, void (*lock_wait_callback)(void *, lock_wait_infos *), + void *callback_arg, void (*after_retry_all_test_callback)(void)) { + lt_lock_request_info *info = lt->get_lock_request_info(); + + // if there are no pending lock requests than there is nothing to do + // the unlocked data race on pending_is_empty is OK since lock requests + // are retried after added to the pending set. + if (info->pending_is_empty) return; + + // get my retry generation (post increment of retry_want) + unsigned long long my_retry_want = (info->retry_want += 1); + + toku_mutex_lock(&info->retry_mutex); + + // here is the group retry algorithm. + // get the latest retry_want count and use it as the generation number of + // this retry operation. if this retry generation is > the last retry + // generation, then do the lock retries. otherwise, no lock retries + // are needed. 
+ if ((my_retry_want - 1) == info->retry_done) { + for (;;) { + if (!info->running_retry) { + info->running_retry = true; + info->retry_done = info->retry_want; + toku_mutex_unlock(&info->retry_mutex); + retry_all_lock_requests_info(info, lock_wait_callback, callback_arg); + if (after_retry_all_test_callback) after_retry_all_test_callback(); + toku_mutex_lock(&info->retry_mutex); + info->running_retry = false; + toku_cond_broadcast(&info->retry_cv); + break; + } else { + toku_cond_wait(&info->retry_cv, &info->retry_mutex); + } + } + } + toku_mutex_unlock(&info->retry_mutex); +} + +void lock_request::retry_all_lock_requests_info( + lt_lock_request_info *info, + void (*lock_wait_callback)(void *, lock_wait_infos *), void *callback_arg) { + toku_external_mutex_lock(&info->mutex); + // retry all of the pending lock requests. + lock_wait_infos conflicts_collector; + for (uint32_t i = 0; i < info->pending_lock_requests.size();) { + lock_request *request; + int r = info->pending_lock_requests.fetch(i, &request); + invariant_zero(r); + + // retry the lock request. if it didn't succeed, + // move on to the next lock request. otherwise + // the request is gone from the list so we may + // read the i'th entry for the next one. + r = request->retry(&conflicts_collector); + if (r != 0) { + i++; + } + } + + // call report_waits while holding the pending queue lock since + // the waiter object is still valid while it's in the queue + report_waits(&conflicts_collector, lock_wait_callback, callback_arg); + + // future threads should only retry lock requests if some still exist + info->should_retry_lock_requests = info->pending_lock_requests.size() > 0; + toku_external_mutex_unlock(&info->mutex); +} + +void lock_request::add_conflicts_to_waits(txnid_set *conflicts, + lock_wait_infos *wait_conflicts) { + wait_conflicts->push_back({m_lt, get_txnid(), m_extra, {}}); + uint32_t num_conflicts = conflicts->size(); + for (uint32_t i = 0; i < num_conflicts; i++) { + wait_conflicts->back().waitees.push_back(conflicts->get(i)); + } +} + +void lock_request::report_waits(lock_wait_infos *wait_conflicts, + void (*lock_wait_callback)(void *, + lock_wait_infos *), + void *callback_arg) { + if (lock_wait_callback) (*lock_wait_callback)(callback_arg, wait_conflicts); +} + +void *lock_request::get_extra(void) const { return m_extra; } + +void lock_request::kill_waiter(void) { + remove_from_lock_requests(); + complete(DB_LOCK_NOTGRANTED); + toku_external_cond_broadcast(&m_wait_cond); +} + +void lock_request::kill_waiter(locktree *lt, void *extra) { + lt_lock_request_info *info = lt->get_lock_request_info(); + toku_external_mutex_lock(&info->mutex); + for (uint32_t i = 0; i < info->pending_lock_requests.size(); i++) { + lock_request *request; + int r = info->pending_lock_requests.fetch(i, &request); + if (r == 0 && request->get_extra() == extra) { + request->kill_waiter(); + break; + } + } + toku_external_mutex_unlock(&info->mutex); +} + +// find another lock request by txnid. must hold the mutex. +lock_request *lock_request::find_lock_request(const TXNID &txnid) { + lock_request *request; + int r = m_info->pending_lock_requests.find_zero( + txnid, &request, nullptr); + if (r != 0) { + request = nullptr; + } + return request; +} + +// insert this lock request into the locktree's set. must hold the mutex. 
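An illustrative aside, not from the PerconaFT sources: the group-retry scheme implemented in retry_all_lock_requests() above can be reduced to a small standalone model. Each caller bumps a "want" generation; the thread that observes "done" still lagging runs a single retry pass on behalf of everyone and publishes the new generation, so concurrent callers collapse into one pass. This sketch is simplified (it holds the mutex through the pass and omits the condition-variable wait the real code performs while another thread is mid-pass); all names are hypothetical.

#include <atomic>
#include <cassert>
#include <mutex>

struct group_retry {
  std::atomic<unsigned long long> want{0};  // incremented by every retry request
  unsigned long long done = 0;              // highest generation already covered
  bool running = false;
  std::mutex m;
  int passes = 0;                           // how many retry passes actually ran

  void request_retry() {
    unsigned long long my_want = ++want;    // my generation number
    std::lock_guard<std::mutex> lk(m);
    if (my_want - 1 == done && !running) {
      running = true;
      done = want.load();                   // cover every generation requested so far
      ++passes;                             // stand-in for retrying all pending requests
      running = false;
    }
    // otherwise: a pass covering a newer generation already includes this request
  }
};

int main() {
  group_retry g;
  g.request_retry();
  g.request_retry();
  assert(g.passes >= 1);  // every requested generation is covered by some pass
  return 0;
}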
+void lock_request::insert_into_lock_requests(void) { + uint32_t idx; + lock_request *request; + int r = m_info->pending_lock_requests.find_zero( + m_txnid, &request, &idx); + invariant(r == DB_NOTFOUND); + r = m_info->pending_lock_requests.insert_at(this, idx); + invariant_zero(r); + m_info->pending_is_empty = false; +} + +// remove this lock request from the locktree's set. must hold the mutex. +void lock_request::remove_from_lock_requests(void) { + uint32_t idx; + lock_request *request; + int r = m_info->pending_lock_requests.find_zero( + m_txnid, &request, &idx); + invariant_zero(r); + invariant(request == this); + r = m_info->pending_lock_requests.delete_at(idx); + invariant_zero(r); + if (m_info->pending_lock_requests.size() == 0) + m_info->pending_is_empty = true; +} + +int lock_request::find_by_txnid(lock_request *const &request, + const TXNID &txnid) { + TXNID request_txnid = request->m_txnid; + if (request_txnid < txnid) { + return -1; + } else if (request_txnid == txnid) { + return 0; + } else { + return 1; + } +} + +void lock_request::set_start_test_callback(void (*f)(void)) { + m_start_test_callback = f; +} + +void lock_request::set_start_before_pending_test_callback(void (*f)(void)) { + m_start_before_pending_test_callback = f; +} + +void lock_request::set_retry_test_callback(void (*f)(void)) { + m_retry_test_callback = f; +} + +} /* namespace toku */ +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h new file mode 100644 index 000000000..d30e1e2ca --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h @@ -0,0 +1,255 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include "../db.h" +#include "../ft/comparator.h" +#include "../portability/toku_pthread.h" +#include "locktree.h" +#include "txnid_set.h" +#include "wfg.h" + +namespace toku { + +// Information about a lock wait +struct lock_wait_info { + locktree *ltree; // the tree where wait happens + TXNID waiter; // the waiting transaction + void *m_extra; // lock_request's m_extra + + // The transactions that are waited for. + std::vector waitees; +}; + +typedef std::vector lock_wait_infos; + +// A lock request contains the db, the key range, the lock type, and +// the transaction id that describes a potential row range lock. +// +// the typical use case is: +// - initialize a lock request +// - start to try to acquire the lock +// - do something else +// - wait for the lock request to be resolved on a timed condition +// - destroy the lock request +// a lock request is resolved when its state is no longer pending, or +// when it becomes granted, or timedout, or deadlocked. when resolved, the +// state of the lock request is changed and any waiting threads are awakened. + +class lock_request { + public: + enum type { UNKNOWN, READ, WRITE }; + + // effect: Initializes a lock request. + void create(toku_external_mutex_factory_t mutex_factory); + + // effect: Destroys a lock request. + void destroy(void); + + // effect: Resets the lock request parameters, allowing it to be reused. + // requires: Lock request was already created at some point + void set(locktree *lt, TXNID txnid, const DBT *left_key, const DBT *right_key, + type lock_type, bool big_txn, void *extra = nullptr); + + // effect: Tries to acquire a lock described by this lock request. + // returns: The return code of locktree::acquire_[write,read]_lock() + // or DB_LOCK_DEADLOCK if this request would end up deadlocked. + int start(void); + + // effect: Sleeps until either the request is granted or the wait time + // expires. returns: The return code of locktree::acquire_[write,read]_lock() + // or simply DB_LOCK_NOTGRANTED if the wait time expired. + int wait(uint64_t wait_time_ms); + int wait(uint64_t wait_time_ms, uint64_t killed_time_ms, + int (*killed_callback)(void), + void (*lock_wait_callback)(void *, lock_wait_infos *) = nullptr, + void *callback_arg = nullptr); + + // return: left end-point of the lock range + const DBT *get_left_key(void) const; + + // return: right end-point of the lock range + const DBT *get_right_key(void) const; + + // return: the txnid waiting for a lock + TXNID get_txnid(void) const; + + // return: when this lock request started, as milliseconds from epoch + uint64_t get_start_time(void) const; + + // return: which txnid is blocking this request (there may be more, though) + TXNID get_conflicting_txnid(void) const; + + // effect: Retries all of the lock requests for the given locktree. + // Any lock requests successfully restarted is completed and woken + // up. + // The rest remain pending. 
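An illustrative aside, not from the PerconaFT sources: the lifecycle listed in the class comment above (initialize, start, do other work, wait, destroy) would look roughly as follows for a caller that already has a locktree and a mutex factory. This is a hedged sketch written against the declarations in this header; it assumes lock_request.h and its dependencies (db.h, the portability layer) are on the include path, and the caller, key, and transaction-id variables are hypothetical.

#include "lock_request.h"  // assumed include path within this source tree

// Hypothetical caller: lt, mutex_factory, txn_id, left and right come from elsewhere.
static int acquire_range_write_lock(toku::locktree *lt,
                                    toku_external_mutex_factory_t mutex_factory,
                                    TXNID txn_id, const DBT *left, const DBT *right) {
  toku::lock_request req;
  req.create(mutex_factory);                       // initialize internals
  req.set(lt, txn_id, left, right,
          toku::lock_request::type::WRITE,
          /*big_txn=*/false);                      // describe the desired range lock
  int r = req.start();                             // try to acquire immediately
  if (r == DB_LOCK_NOTGRANTED) {
    r = req.wait(/*wait_time_ms=*/1000);           // sleep until granted, timeout, or deadlock
  }
  req.destroy();
  return r;                                        // 0, DB_LOCK_NOTGRANTED, or DB_LOCK_DEADLOCK
}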
+ static void retry_all_lock_requests( + locktree *lt, + void (*lock_wait_callback)(void *, lock_wait_infos *) = nullptr, + void *callback_arg = nullptr, + void (*after_retry_test_callback)(void) = nullptr); + static void retry_all_lock_requests_info( + lt_lock_request_info *info, + void (*lock_wait_callback)(void *, lock_wait_infos *), + void *callback_arg); + + void set_start_test_callback(void (*f)(void)); + void set_start_before_pending_test_callback(void (*f)(void)); + void set_retry_test_callback(void (*f)(void)); + + void *get_extra(void) const; + + void kill_waiter(void); + static void kill_waiter(locktree *lt, void *extra); + + private: + enum state { + UNINITIALIZED, + INITIALIZED, + PENDING, + COMPLETE, + DESTROYED, + }; + + // The keys for a lock request are stored "unowned" in m_left_key + // and m_right_key. When the request is about to go to sleep, it + // copies these keys and stores them in m_left_key_copy etc and + // sets the temporary pointers to null. + TXNID m_txnid; + TXNID m_conflicting_txnid; + uint64_t m_start_time; + const DBT *m_left_key; + const DBT *m_right_key; + DBT m_left_key_copy; + DBT m_right_key_copy; + + // The lock request type and associated locktree + type m_type; + locktree *m_lt; + + // If the lock request is in the completed state, then its + // final return value is stored in m_complete_r + int m_complete_r; + state m_state; + + toku_external_cond_t m_wait_cond; + + bool m_big_txn; + + // the lock request info state stored in the + // locktree that this lock request is for. + struct lt_lock_request_info *m_info; + + void *m_extra; + + // effect: tries again to acquire the lock described by this lock request + // returns: 0 if retrying the request succeeded and is now complete + int retry(lock_wait_infos *collector); + + void complete(int complete_r); + + // effect: Finds another lock request by txnid. + // requires: The lock request info mutex is held + lock_request *find_lock_request(const TXNID &txnid); + + // effect: Insert this lock request into the locktree's set. + // requires: the locktree's mutex is held + void insert_into_lock_requests(void); + + // effect: Removes this lock request from the locktree's set. + // requires: The lock request info mutex is held + void remove_from_lock_requests(void); + + // effect: Asks this request's locktree which txnids are preventing + // us from getting the lock described by this request. + // returns: conflicts is populated with the txnid's that this request + // is blocked on + void get_conflicts(txnid_set *conflicts); + + // effect: Builds a wait-for-graph for this lock request and the given + // conflict set + void build_wait_graph(wfg *wait_graph, const txnid_set &conflicts); + + // returns: True if this lock request is in deadlock with the given conflicts + // set + bool deadlock_exists(const txnid_set &conflicts); + + void copy_keys(void); + + static int find_by_txnid(lock_request *const &request, const TXNID &txnid); + + // Report list of conflicts to lock wait callback. 
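An illustrative aside, not from the PerconaFT sources: build_wait_graph() and deadlock_exists(), declared above, build a wait-for graph from the conflict set and ask whether a cycle is reachable from the requesting transaction. A minimal standalone model of that check, using an adjacency map and a depth-first search over the current path (all names hypothetical):

#include <cassert>
#include <map>
#include <set>
#include <vector>

using TxnId = unsigned long long;

// edges[a] lists the transactions that a is waiting on. Returns true if a
// cycle is reachable from `node`, i.e. some path revisits a transaction
// already on the current path.
static bool dfs_cycle(const std::map<TxnId, std::vector<TxnId>> &edges,
                      TxnId node, std::set<TxnId> &on_path) {
  if (!on_path.insert(node).second) return true;  // back edge: cycle found
  auto it = edges.find(node);
  if (it != edges.end()) {
    for (TxnId next : it->second) {
      if (dfs_cycle(edges, next, on_path)) return true;
    }
  }
  on_path.erase(node);
  return false;
}

int main() {
  // Transaction 1 waits on 2, 2 waits on 3, 3 waits on 1: a deadlock.
  std::map<TxnId, std::vector<TxnId>> wfg = {{1, {2}}, {2, {3}}, {3, {1}}};
  std::set<TxnId> path;
  assert(dfs_cycle(wfg, 1, path));

  std::map<TxnId, std::vector<TxnId>> chain = {{1, {2}}, {2, {3}}};
  path.clear();
  assert(!dfs_cycle(chain, 1, path));  // a simple wait chain is not a deadlock
  return 0;
}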
+ static void report_waits(lock_wait_infos *wait_conflicts, + void (*lock_wait_callback)(void *, + lock_wait_infos *), + void *callback_arg); + void add_conflicts_to_waits(txnid_set *conflicts, + lock_wait_infos *wait_conflicts); + + void (*m_start_test_callback)(void); + void (*m_start_before_pending_test_callback)(void); + void (*m_retry_test_callback)(void); + + public: + std::function m_deadlock_cb; + + friend class lock_request_unit_test; +}; +// PORT: lock_request is not a POD anymore due to use of toku_external_cond_t +// This is ok as the PODness is not really required: lock_request objects are +// not moved in memory or anything. +// ENSURE_POD(lock_request); + +} /* namespace toku */ diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc new file mode 100644 index 000000000..3d6a590c7 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc @@ -0,0 +1,1023 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2: +#ifndef ROCKSDB_LITE +#ifndef OS_WIN +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include "locktree.h" + +#include + +#include "../portability/toku_pthread.h" +#include "../portability/toku_time.h" +#include "../util/growable_array.h" +#include "range_buffer.h" + +// including the concurrent_tree here expands the templates +// and "defines" the implementation, so we do it here in +// the locktree source file instead of the header. 
+#include "concurrent_tree.h" + +namespace toku { +// A locktree represents the set of row locks owned by all transactions +// over an open dictionary. Read and write ranges are represented as +// a left and right key which are compared with the given descriptor +// and comparison fn. +// +// Each locktree has a reference count which it manages +// but does nothing based on the value of the reference count - it is +// up to the user of the locktree to destroy it when it sees fit. + +void locktree::create(locktree_manager *mgr, DICTIONARY_ID dict_id, + const comparator &cmp, + toku_external_mutex_factory_t mutex_factory) { + m_mgr = mgr; + m_dict_id = dict_id; + + m_cmp.create_from(cmp); + m_reference_count = 1; + m_userdata = nullptr; + + XCALLOC(m_rangetree); + m_rangetree->create(&m_cmp); + + m_sto_txnid = TXNID_NONE; + m_sto_buffer.create(); + m_sto_score = STO_SCORE_THRESHOLD; + m_sto_end_early_count = 0; + m_sto_end_early_time = 0; + + m_escalation_barrier = [](const DBT *, const DBT *, void *) -> bool { + return false; + }; + + m_lock_request_info.init(mutex_factory); +} + +void locktree::set_escalation_barrier_func( + lt_escalation_barrier_check_func func, void *extra) { + m_escalation_barrier = func; + m_escalation_barrier_arg = extra; +} + +void lt_lock_request_info::init(toku_external_mutex_factory_t mutex_factory) { + pending_lock_requests.create(); + pending_is_empty = true; + toku_external_mutex_init(mutex_factory, &mutex); + retry_want = retry_done = 0; + ZERO_STRUCT(counters); + ZERO_STRUCT(retry_mutex); + toku_mutex_init(locktree_request_info_retry_mutex_key, &retry_mutex, nullptr); + toku_cond_init(locktree_request_info_retry_cv_key, &retry_cv, nullptr); + running_retry = false; + + TOKU_VALGRIND_HG_DISABLE_CHECKING(&pending_is_empty, + sizeof(pending_is_empty)); + TOKU_DRD_IGNORE_VAR(pending_is_empty); +} + +void locktree::destroy(void) { + invariant(m_reference_count == 0); + invariant(m_lock_request_info.pending_lock_requests.size() == 0); + m_cmp.destroy(); + m_rangetree->destroy(); + toku_free(m_rangetree); + m_sto_buffer.destroy(); + m_lock_request_info.destroy(); +} + +void lt_lock_request_info::destroy(void) { + pending_lock_requests.destroy(); + toku_external_mutex_destroy(&mutex); + toku_mutex_destroy(&retry_mutex); + toku_cond_destroy(&retry_cv); +} + +void locktree::add_reference(void) { + (void)toku_sync_add_and_fetch(&m_reference_count, 1); +} + +uint32_t locktree::release_reference(void) { + return toku_sync_sub_and_fetch(&m_reference_count, 1); +} + +uint32_t locktree::get_reference_count(void) { return m_reference_count; } + +// a container for a range/txnid pair +struct row_lock { + keyrange range; + TXNID txnid; + bool is_shared; + TxnidVector *owners; +}; + +// iterate over a locked keyrange and copy out all of the data, +// storing each row lock into the given growable array. the +// caller does not own the range inside the returned row locks, +// so remove from the tree with care using them as keys. 
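An illustrative aside, not from the PerconaFT sources: the function that follows uses a pattern that recurs throughout this file, a local struct whose fn() member is invoked once per element and returns true to keep iterating or false to stop. A standalone model of that callback shape over a plain vector (names hypothetical):

#include <cassert>
#include <vector>

struct interval { int left, right; };

// Calls visitor.fn(element) for each element until fn returns false.
template <class F>
static void iterate(const std::vector<interval> &tree, F &visitor) {
  for (const interval &iv : tree) {
    if (!visitor.fn(iv)) break;
  }
}

int main() {
  std::vector<interval> tree = {{1, 2}, {5, 9}, {12, 12}};
  struct copy_fn_obj {
    std::vector<interval> out;
    bool fn(const interval &iv) {
      out.push_back(iv);  // copy the element out, as the row-lock copier does
      return true;        // true means "keep iterating"
    }
  } copy_fn;
  iterate(tree, copy_fn);
  assert(copy_fn.out.size() == 3);
  return 0;
}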
+static void iterate_and_get_overlapping_row_locks( + const concurrent_tree::locked_keyrange *lkr, + GrowableArray *row_locks) { + struct copy_fn_obj { + GrowableArray *row_locks; + bool fn(const keyrange &range, TXNID txnid, bool is_shared, + TxnidVector *owners) { + row_lock lock = {.range = range, + .txnid = txnid, + .is_shared = is_shared, + .owners = owners}; + row_locks->push(lock); + return true; + } + } copy_fn; + copy_fn.row_locks = row_locks; + lkr->iterate(©_fn); +} + +// given a txnid and a set of overlapping row locks, determine +// which txnids are conflicting, and store them in the conflicts +// set, if given. +static bool determine_conflicting_txnids( + const GrowableArray &row_locks, const TXNID &txnid, + txnid_set *conflicts) { + bool conflicts_exist = false; + const size_t num_overlaps = row_locks.get_size(); + for (size_t i = 0; i < num_overlaps; i++) { + const row_lock lock = row_locks.fetch_unchecked(i); + const TXNID other_txnid = lock.txnid; + if (other_txnid != txnid) { + if (conflicts) { + if (other_txnid == TXNID_SHARED) { + // Add all shared lock owners, except this transaction. + for (TXNID shared_id : *lock.owners) { + if (shared_id != txnid) conflicts->add(shared_id); + } + } else { + conflicts->add(other_txnid); + } + } + conflicts_exist = true; + } + } + return conflicts_exist; +} + +// how much memory does a row lock take up in a concurrent tree? +static uint64_t row_lock_size_in_tree(const row_lock &lock) { + const uint64_t overhead = concurrent_tree::get_insertion_memory_overhead(); + return lock.range.get_memory_size() + overhead; +} + +// remove and destroy the given row lock from the locked keyrange, +// then notify the memory tracker of the newly freed lock. +static void remove_row_lock_from_tree(concurrent_tree::locked_keyrange *lkr, + const row_lock &lock, TXNID txnid, + locktree_manager *mgr) { + const uint64_t mem_released = row_lock_size_in_tree(lock); + lkr->remove(lock.range, txnid); + if (mgr != nullptr) { + mgr->note_mem_released(mem_released); + } +} + +// insert a row lock into the locked keyrange, then notify +// the memory tracker of this newly acquired lock. 
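An illustrative aside, not from the PerconaFT sources: each insert and remove of a row lock is paired with a memory-tracker notification, sized as the range's key bytes plus a fixed per-insertion overhead (see row_lock_size_in_tree() above). A minimal standalone model of that bookkeeping, with hypothetical names and a made-up overhead constant:

#include <cassert>
#include <cstdint>

struct mem_tracker {
  uint64_t current = 0;
  void note_mem_used(uint64_t n) { current += n; }
  void note_mem_released(uint64_t n) { current -= n; }
};

struct fake_lock { uint64_t left_key_size, right_key_size; };

// Mirrors the idea of row_lock_size_in_tree(): key bytes plus tree overhead.
static uint64_t lock_size(const fake_lock &l, uint64_t overhead) {
  return l.left_key_size + l.right_key_size + overhead;
}

int main() {
  mem_tracker mgr;
  fake_lock l{8, 8};
  const uint64_t kOverhead = 64;                   // hypothetical per-node overhead
  mgr.note_mem_used(lock_size(l, kOverhead));      // on insert into the tree
  mgr.note_mem_released(lock_size(l, kOverhead));  // on removal from the tree
  assert(mgr.current == 0);                        // accounting balances out
  return 0;
}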
+static void insert_row_lock_into_tree(concurrent_tree::locked_keyrange *lkr, + const row_lock &lock, + locktree_manager *mgr) { + uint64_t mem_used = row_lock_size_in_tree(lock); + lkr->insert(lock.range, lock.txnid, lock.is_shared); + if (mgr != nullptr) { + mgr->note_mem_used(mem_used); + } +} + +void locktree::sto_begin(TXNID txnid) { + invariant(m_sto_txnid == TXNID_NONE); + invariant(m_sto_buffer.is_empty()); + m_sto_txnid = txnid; +} + +void locktree::sto_append(const DBT *left_key, const DBT *right_key, + bool is_write_request) { + uint64_t buffer_mem, delta; + + // psergey: the below two lines do not make any sense + // (and it's the same in upstream TokuDB) + keyrange range; + range.create(left_key, right_key); + + buffer_mem = m_sto_buffer.total_memory_size(); + m_sto_buffer.append(left_key, right_key, is_write_request); + delta = m_sto_buffer.total_memory_size() - buffer_mem; + if (m_mgr != nullptr) { + m_mgr->note_mem_used(delta); + } +} + +void locktree::sto_end(void) { + uint64_t mem_size = m_sto_buffer.total_memory_size(); + if (m_mgr != nullptr) { + m_mgr->note_mem_released(mem_size); + } + m_sto_buffer.destroy(); + m_sto_buffer.create(); + m_sto_txnid = TXNID_NONE; +} + +void locktree::sto_end_early_no_accounting(void *prepared_lkr) { + sto_migrate_buffer_ranges_to_tree(prepared_lkr); + sto_end(); + toku_unsafe_set(m_sto_score, 0); +} + +void locktree::sto_end_early(void *prepared_lkr) { + m_sto_end_early_count++; + + tokutime_t t0 = toku_time_now(); + sto_end_early_no_accounting(prepared_lkr); + tokutime_t t1 = toku_time_now(); + + m_sto_end_early_time += (t1 - t0); +} + +void locktree::sto_migrate_buffer_ranges_to_tree(void *prepared_lkr) { + // There should be something to migrate, and nothing in the rangetree. + invariant(!m_sto_buffer.is_empty()); + invariant(m_rangetree->is_empty()); + + concurrent_tree sto_rangetree; + concurrent_tree::locked_keyrange sto_lkr; + sto_rangetree.create(&m_cmp); + + // insert all of the ranges from the single txnid buffer into a new rangtree + range_buffer::iterator iter(&m_sto_buffer); + range_buffer::iterator::record rec; + while (iter.current(&rec)) { + sto_lkr.prepare(&sto_rangetree); + int r = acquire_lock_consolidated(&sto_lkr, m_sto_txnid, rec.get_left_key(), + rec.get_right_key(), + rec.get_exclusive_flag(), nullptr); + invariant_zero(r); + sto_lkr.release(); + iter.next(); + } + + // Iterate the newly created rangetree and insert each range into the + // locktree's rangetree, on behalf of the old single txnid. + struct migrate_fn_obj { + concurrent_tree::locked_keyrange *dst_lkr; + bool fn(const keyrange &range, TXNID txnid, bool is_shared, + TxnidVector *owners) { + // There can't be multiple owners in STO mode + invariant_zero(owners); + dst_lkr->insert(range, txnid, is_shared); + return true; + } + } migrate_fn; + migrate_fn.dst_lkr = + static_cast(prepared_lkr); + sto_lkr.prepare(&sto_rangetree); + sto_lkr.iterate(&migrate_fn); + sto_lkr.remove_all(); + sto_lkr.release(); + sto_rangetree.destroy(); + invariant(!m_rangetree->is_empty()); +} + +bool locktree::sto_try_acquire(void *prepared_lkr, TXNID txnid, + const DBT *left_key, const DBT *right_key, + bool is_write_request) { + if (m_rangetree->is_empty() && m_sto_buffer.is_empty() && + toku_unsafe_fetch(m_sto_score) >= STO_SCORE_THRESHOLD) { + // We can do the optimization because the rangetree is empty, and + // we know its worth trying because the sto score is big enough. 
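An illustrative aside, not from the PerconaFT sources: the single-txnid optimization (STO) decision that sto_try_acquire() makes below can be written as a tiny standalone state check: start buffering when the rangetree is empty and the score has reached the threshold, and cancel early when a second transaction shows up or the buffer grows too large. This sketch leaves out the migration of already-buffered ranges into the tree; the constants and names are hypothetical stand-ins for STO_SCORE_THRESHOLD and STO_BUFFER_MAX_SIZE.

#include <cassert>
#include <cstdint>

struct sto_state {
  bool rangetree_empty = true;
  bool buffer_empty = true;
  uint64_t score = 100;        // stand-in for a score at the threshold
  uint64_t active_txnid = 0;   // 0 plays the role of TXNID_NONE
  uint64_t buffered_ranges = 0;
};

// Returns true if the lock can go into the STO buffer instead of the rangetree.
static bool sto_try_acquire(sto_state &s, uint64_t txnid,
                            uint64_t score_threshold, uint64_t buffer_max) {
  if (s.rangetree_empty && s.buffer_empty && s.score >= score_threshold) {
    s.active_txnid = txnid;                  // begin the optimization
  } else if (s.active_txnid != 0 &&
             (s.active_txnid != txnid || s.buffered_ranges > buffer_max)) {
    s.active_txnid = 0;                      // end early: second writer or buffer too big
    s.score = 0;
  }
  if (s.active_txnid != 0) {
    s.buffer_empty = false;
    ++s.buffered_ranges;                     // append to the STO buffer
    return true;
  }
  return false;
}

int main() {
  sto_state s;
  assert(sto_try_acquire(s, 7, 100, 2048));   // first transaction: buffered
  assert(!sto_try_acquire(s, 9, 100, 2048));  // a second transaction cancels the optimization
  return 0;
}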
+ sto_begin(txnid); + } else if (m_sto_txnid != TXNID_NONE) { + // We are currently doing the optimization. Check if we need to cancel + // it because a new txnid appeared, or if the current single txnid has + // taken too many locks already. + if (m_sto_txnid != txnid || + m_sto_buffer.get_num_ranges() > STO_BUFFER_MAX_SIZE) { + sto_end_early(prepared_lkr); + } + } + + // At this point the sto txnid is properly set. If it is valid, then + // this txnid can append its lock to the sto buffer successfully. + if (m_sto_txnid != TXNID_NONE) { + invariant(m_sto_txnid == txnid); + sto_append(left_key, right_key, is_write_request); + return true; + } else { + invariant(m_sto_buffer.is_empty()); + return false; + } +} + +/* + Do the same as iterate_and_get_overlapping_row_locks does, but also check for + this: + The set of overlapping rows locks consists of just one read-only shared + lock with the same endpoints as specified (in that case, we can just add + ourselves into that list) + + @return true - One compatible shared lock + false - Otherwise +*/ +static bool iterate_and_get_overlapping_row_locks2( + const concurrent_tree::locked_keyrange *lkr, const DBT *left_key, + const DBT *right_key, comparator *cmp, TXNID, + GrowableArray *row_locks) { + struct copy_fn_obj { + GrowableArray *row_locks; + bool first_call = true; + bool matching_lock_found = false; + const DBT *left_key, *right_key; + comparator *cmp; + + bool fn(const keyrange &range, TXNID txnid, bool is_shared, + TxnidVector *owners) { + if (first_call) { + first_call = false; + if (is_shared && !(*cmp)(left_key, range.get_left_key()) && + !(*cmp)(right_key, range.get_right_key())) { + matching_lock_found = true; + } + } else { + // if we see multiple matching locks, it doesn't matter whether + // the first one was matching. + matching_lock_found = false; + } + row_lock lock = {.range = range, + .txnid = txnid, + .is_shared = is_shared, + .owners = owners}; + row_locks->push(lock); + return true; + } + } copy_fn; + copy_fn.row_locks = row_locks; + copy_fn.left_key = left_key; + copy_fn.right_key = right_key; + copy_fn.cmp = cmp; + lkr->iterate(©_fn); + return copy_fn.matching_lock_found; +} + +// try to acquire a lock and consolidate it with existing locks if possible +// param: lkr, a prepared locked keyrange +// return: 0 on success, DB_LOCK_NOTGRANTED if conflicting locks exist. +int locktree::acquire_lock_consolidated(void *prepared_lkr, TXNID txnid, + const DBT *left_key, + const DBT *right_key, + bool is_write_request, + txnid_set *conflicts) { + int r = 0; + concurrent_tree::locked_keyrange *lkr; + + keyrange requested_range; + requested_range.create(left_key, right_key); + lkr = static_cast(prepared_lkr); + lkr->acquire(requested_range); + + // copy out the set of overlapping row locks. + GrowableArray overlapping_row_locks; + overlapping_row_locks.init(); + bool matching_shared_lock_found = false; + + if (is_write_request) + iterate_and_get_overlapping_row_locks(lkr, &overlapping_row_locks); + else { + matching_shared_lock_found = iterate_and_get_overlapping_row_locks2( + lkr, left_key, right_key, &m_cmp, txnid, &overlapping_row_locks); + // psergey-todo: what to do now? So, we have figured we have just one + // shareable lock. Need to add us into it as an owner but the lock + // pointer cannot be kept? + // A: use find_node_with_overlapping_child(key_range, nullptr); + // then, add ourselves to the owner list. + // Dont' foreget to release the subtree after that. 
+ } + + if (matching_shared_lock_found) { + // there is just one non-confliting matching shared lock. + // we are hilding a lock on it (see acquire() call above). + // we need to modify it to indicate there is another locker... + if (lkr->add_shared_owner(requested_range, txnid)) { + // Pretend shared lock uses as much memory. + row_lock new_lock = {.range = requested_range, + .txnid = txnid, + .is_shared = false, + .owners = nullptr}; + uint64_t mem_used = row_lock_size_in_tree(new_lock); + if (m_mgr) { + m_mgr->note_mem_used(mem_used); + } + } + requested_range.destroy(); + overlapping_row_locks.deinit(); + return 0; + } + + size_t num_overlapping_row_locks = overlapping_row_locks.get_size(); + + // if any overlapping row locks conflict with this request, bail out. + + bool conflicts_exist = + determine_conflicting_txnids(overlapping_row_locks, txnid, conflicts); + if (!conflicts_exist) { + // there are no conflicts, so all of the overlaps are for the requesting + // txnid. so, we must consolidate all existing overlapping ranges and the + // requested range into one dominating range. then we insert the dominating + // range. + bool all_shared = !is_write_request; + for (size_t i = 0; i < num_overlapping_row_locks; i++) { + row_lock overlapping_lock = overlapping_row_locks.fetch_unchecked(i); + invariant(overlapping_lock.txnid == txnid); + requested_range.extend(m_cmp, overlapping_lock.range); + remove_row_lock_from_tree(lkr, overlapping_lock, TXNID_ANY, m_mgr); + all_shared = all_shared && overlapping_lock.is_shared; + } + + row_lock new_lock = {.range = requested_range, + .txnid = txnid, + .is_shared = all_shared, + .owners = nullptr}; + insert_row_lock_into_tree(lkr, new_lock, m_mgr); + } else { + r = DB_LOCK_NOTGRANTED; + } + + requested_range.destroy(); + overlapping_row_locks.deinit(); + return r; +} + +// acquire a lock in the given key range, inclusive. if successful, +// return 0. otherwise, populate the conflicts txnid_set with the set of +// transactions that conflict with this request. +int locktree::acquire_lock(bool is_write_request, TXNID txnid, + const DBT *left_key, const DBT *right_key, + txnid_set *conflicts) { + int r = 0; + + // we are only supporting write locks for simplicity + // invariant(is_write_request); + + // acquire and prepare a locked keyrange over the requested range. + // prepare is a serialzation point, so we take the opportunity to + // try the single txnid optimization first. + concurrent_tree::locked_keyrange lkr; + lkr.prepare(m_rangetree); + + bool acquired = + sto_try_acquire(&lkr, txnid, left_key, right_key, is_write_request); + if (!acquired) { + r = acquire_lock_consolidated(&lkr, txnid, left_key, right_key, + is_write_request, conflicts); + } + + lkr.release(); + return r; +} + +int locktree::try_acquire_lock(bool is_write_request, TXNID txnid, + const DBT *left_key, const DBT *right_key, + txnid_set *conflicts, bool big_txn) { + // All ranges in the locktree must have left endpoints <= right endpoints. + // Range comparisons rely on this fact, so we make a paranoid invariant here. + paranoid_invariant(m_cmp(left_key, right_key) <= 0); + int r = m_mgr == nullptr ? 
0 : m_mgr->check_current_lock_constraints(big_txn); + if (r == 0) { + r = acquire_lock(is_write_request, txnid, left_key, right_key, conflicts); + } + return r; +} + +// the locktree silently upgrades read locks to write locks for simplicity +int locktree::acquire_read_lock(TXNID txnid, const DBT *left_key, + const DBT *right_key, txnid_set *conflicts, + bool big_txn) { + return try_acquire_lock(false, txnid, left_key, right_key, conflicts, + big_txn); +} + +int locktree::acquire_write_lock(TXNID txnid, const DBT *left_key, + const DBT *right_key, txnid_set *conflicts, + bool big_txn) { + return try_acquire_lock(true, txnid, left_key, right_key, conflicts, big_txn); +} + +// typedef void (*dump_callback)(void *cdata, const DBT *left, const DBT *right, +// TXNID txnid); +void locktree::dump_locks(void *cdata, dump_callback cb) { + concurrent_tree::locked_keyrange lkr; + keyrange range; + range.create(toku_dbt_negative_infinity(), toku_dbt_positive_infinity()); + + lkr.prepare(m_rangetree); + lkr.acquire(range); + + TXNID sto_txn; + if ((sto_txn = toku_unsafe_fetch(m_sto_txnid)) != TXNID_NONE) { + // insert all of the ranges from the single txnid buffer into a new rangtree + range_buffer::iterator iter(&m_sto_buffer); + range_buffer::iterator::record rec; + while (iter.current(&rec)) { + (*cb)(cdata, rec.get_left_key(), rec.get_right_key(), sto_txn, + !rec.get_exclusive_flag(), nullptr); + iter.next(); + } + } else { + GrowableArray all_locks; + all_locks.init(); + iterate_and_get_overlapping_row_locks(&lkr, &all_locks); + + const size_t n_locks = all_locks.get_size(); + for (size_t i = 0; i < n_locks; i++) { + const row_lock lock = all_locks.fetch_unchecked(i); + (*cb)(cdata, lock.range.get_left_key(), lock.range.get_right_key(), + lock.txnid, lock.is_shared, lock.owners); + } + all_locks.deinit(); + } + lkr.release(); + range.destroy(); +} + +void locktree::get_conflicts(bool is_write_request, TXNID txnid, + const DBT *left_key, const DBT *right_key, + txnid_set *conflicts) { + // because we only support write locks, ignore this bit for now. + (void)is_write_request; + + // preparing and acquire a locked keyrange over the range + keyrange range; + range.create(left_key, right_key); + concurrent_tree::locked_keyrange lkr; + lkr.prepare(m_rangetree); + lkr.acquire(range); + + // copy out the set of overlapping row locks and determine the conflicts + GrowableArray overlapping_row_locks; + overlapping_row_locks.init(); + iterate_and_get_overlapping_row_locks(&lkr, &overlapping_row_locks); + + // we don't care if conflicts exist. we just want the conflicts set populated. + (void)determine_conflicting_txnids(overlapping_row_locks, txnid, conflicts); + + lkr.release(); + overlapping_row_locks.deinit(); + range.destroy(); +} + +// Effect: +// For each range in the lock tree that overlaps the given range and has +// the given txnid, remove it. +// Rationale: +// In the common case, there is only the range [left_key, right_key] and +// it is associated with txnid, so this is a single tree delete. +// +// However, consolidation and escalation change the objects in the tree +// without telling the txn anything. In this case, the txn may own a +// large range lock that represents its ownership of many smaller range +// locks. For example, the txn may think it owns point locks on keys 1, +// 2, and 3, but due to escalation, only the object [1,3] exists in the +// tree. +// +// The first call for a small lock will remove the large range lock, and +// the rest of the calls should do nothing. 
After the first release, +// another thread can acquire one of the locks that the txn thinks it +// still owns. That's ok, because the txn doesn't want it anymore (it +// unlocks everything at once), but it may find a lock that it does not +// own. +// +// In our example, the txn unlocks key 1, which actually removes the +// whole lock [1,3]. Now, someone else can lock 2 before our txn gets +// around to unlocking 2, so we should not remove that lock. +void locktree::remove_overlapping_locks_for_txnid(TXNID txnid, + const DBT *left_key, + const DBT *right_key) { + keyrange release_range; + release_range.create(left_key, right_key); + + // acquire and prepare a locked keyrange over the release range + concurrent_tree::locked_keyrange lkr; + lkr.prepare(m_rangetree); + lkr.acquire(release_range); + + // copy out the set of overlapping row locks. + GrowableArray overlapping_row_locks; + overlapping_row_locks.init(); + iterate_and_get_overlapping_row_locks(&lkr, &overlapping_row_locks); + size_t num_overlapping_row_locks = overlapping_row_locks.get_size(); + + for (size_t i = 0; i < num_overlapping_row_locks; i++) { + row_lock lock = overlapping_row_locks.fetch_unchecked(i); + // If this isn't our lock, that's ok, just don't remove it. + // See rationale above. + // psergey-todo: for shared locks, just remove ourselves from the + // owners. + if (lock.txnid == txnid || (lock.owners && lock.owners->contains(txnid))) { + remove_row_lock_from_tree(&lkr, lock, txnid, m_mgr); + } + } + + lkr.release(); + overlapping_row_locks.deinit(); + release_range.destroy(); +} + +bool locktree::sto_txnid_is_valid_unsafe(void) const { + return toku_unsafe_fetch(m_sto_txnid) != TXNID_NONE; +} + +int locktree::sto_get_score_unsafe(void) const { + return toku_unsafe_fetch(m_sto_score); +} + +bool locktree::sto_try_release(TXNID txnid) { + bool released = false; + if (toku_unsafe_fetch(m_sto_txnid) != TXNID_NONE) { + // check the bit again with a prepared locked keyrange, + // which protects the optimization bits and rangetree data + concurrent_tree::locked_keyrange lkr; + lkr.prepare(m_rangetree); + if (m_sto_txnid != TXNID_NONE) { + // this txnid better be the single txnid on this locktree, + // or else we are in big trouble (meaning the logic is broken) + invariant(m_sto_txnid == txnid); + invariant(m_rangetree->is_empty()); + sto_end(); + released = true; + } + lkr.release(); + } + return released; +} + +// release all of the locks for a txnid whose endpoints are pairs +// in the given range buffer. +void locktree::release_locks(TXNID txnid, const range_buffer *ranges, + bool all_trx_locks_hint) { + // try the single txn optimization. if it worked, then all of the + // locks are already released, otherwise we need to do it here. + bool released; + if (all_trx_locks_hint) { + // This will release all of the locks the transaction is holding + released = sto_try_release(txnid); + } else { + /* + psergey: we are asked to release *Some* of the locks the transaction + is holding. + We could try doing that without leaving the STO mode, but right now, + the easiest way is to exit the STO mode and let the non-STO code path + handle it. 
+ */ + if (toku_unsafe_fetch(m_sto_txnid) != TXNID_NONE) { + // check the bit again with a prepared locked keyrange, + // which protects the optimization bits and rangetree data + concurrent_tree::locked_keyrange lkr; + lkr.prepare(m_rangetree); + if (m_sto_txnid != TXNID_NONE) { + sto_end_early(&lkr); + } + lkr.release(); + } + released = false; + } + if (!released) { + range_buffer::iterator iter(ranges); + range_buffer::iterator::record rec; + while (iter.current(&rec)) { + const DBT *left_key = rec.get_left_key(); + const DBT *right_key = rec.get_right_key(); + // All ranges in the locktree must have left endpoints <= right endpoints. + // Range comparisons rely on this fact, so we make a paranoid invariant + // here. + paranoid_invariant(m_cmp(left_key, right_key) <= 0); + remove_overlapping_locks_for_txnid(txnid, left_key, right_key); + iter.next(); + } + // Increase the sto score slightly. Eventually it will hit + // the threshold and we'll try the optimization again. This + // is how a previously multithreaded system transitions into + // a single threaded system that benefits from the optimization. + if (toku_unsafe_fetch(m_sto_score) < STO_SCORE_THRESHOLD) { + toku_sync_fetch_and_add(&m_sto_score, 1); + } + } +} + +// iterate over a locked keyrange and extract copies of the first N +// row locks, storing each one into the given array of size N, +// then removing each extracted lock from the locked keyrange. +static int extract_first_n_row_locks(concurrent_tree::locked_keyrange *lkr, + locktree_manager *mgr, row_lock *row_locks, + int num_to_extract) { + struct extract_fn_obj { + int num_extracted; + int num_to_extract; + row_lock *row_locks; + bool fn(const keyrange &range, TXNID txnid, bool is_shared, + TxnidVector *owners) { + if (num_extracted < num_to_extract) { + row_lock lock; + lock.range.create_copy(range); + lock.txnid = txnid; + lock.is_shared = is_shared; + // deep-copy the set of owners: + if (owners) + lock.owners = new TxnidVector(*owners); + else + lock.owners = nullptr; + row_locks[num_extracted++] = lock; + return true; + } else { + return false; + } + } + } extract_fn; + + extract_fn.row_locks = row_locks; + extract_fn.num_to_extract = num_to_extract; + extract_fn.num_extracted = 0; + lkr->iterate(&extract_fn); + + // now that the ranges have been copied out, complete + // the extraction by removing the ranges from the tree. + // use remove_row_lock_from_tree() so we properly track the + // amount of memory and number of locks freed. + int num_extracted = extract_fn.num_extracted; + invariant(num_extracted <= num_to_extract); + for (int i = 0; i < num_extracted; i++) { + remove_row_lock_from_tree(lkr, row_locks[i], TXNID_ANY, mgr); + } + + return num_extracted; +} + +// Store each newly escalated lock in a range buffer for appropriate txnid. +// We'll rebuild the locktree by iterating over these ranges, and then we +// can pass back each txnid/buffer pair individually through a callback +// to notify higher layers that locks have changed. +struct txnid_range_buffer { + TXNID txnid; + range_buffer buffer; + + static int find_by_txnid(struct txnid_range_buffer *const &other_buffer, + const TXNID &txnid) { + if (txnid < other_buffer->txnid) { + return -1; + } else if (other_buffer->txnid == txnid) { + return 0; + } else { + return 1; + } + } +}; + +// escalate the locks in the locktree by merging adjacent +// locks that have the same txnid into one larger lock. +// +// if there's only one txnid in the locktree then this +// approach works well. 
if there are many txnids and each +// has locks in a random/alternating order, then this does +// not work so well. +void locktree::escalate(lt_escalate_cb after_escalate_callback, + void *after_escalate_callback_extra) { + omt range_buffers; + range_buffers.create(); + + // prepare and acquire a locked keyrange on the entire locktree + concurrent_tree::locked_keyrange lkr; + keyrange infinite_range = keyrange::get_infinite_range(); + lkr.prepare(m_rangetree); + lkr.acquire(infinite_range); + + // if we're in the single txnid optimization, simply call it off. + // if you have to run escalation, you probably don't care about + // the optimization anyway, and this makes things easier. + if (m_sto_txnid != TXNID_NONE) { + // We are already accounting for this escalation time and + // count, so don't do it for sto_end_early too. + sto_end_early_no_accounting(&lkr); + } + + // extract and remove batches of row locks from the locktree + int num_extracted; + const int num_row_locks_per_batch = 128; + row_lock *XCALLOC_N(num_row_locks_per_batch, extracted_buf); + + // we always remove the "first" n because we are removing n + // each time we do an extraction. so this loops until its empty. + while ((num_extracted = extract_first_n_row_locks( + &lkr, m_mgr, extracted_buf, num_row_locks_per_batch)) > 0) { + int current_index = 0; + while (current_index < num_extracted) { + // every batch of extracted locks is in range-sorted order. search + // through them and merge adjacent locks with the same txnid into + // one dominating lock and save it to a set of escalated locks. + // + // first, find the index of the next row lock that + // - belongs to a different txnid, or + // - belongs to several txnids, or + // - is a shared lock (we could potentially merge those but + // currently we don't), or + // - is across a lock escalation barrier. + int next_txnid_index = current_index + 1; + + while (next_txnid_index < num_extracted && + (extracted_buf[current_index].txnid == + extracted_buf[next_txnid_index].txnid) && + !extracted_buf[next_txnid_index].is_shared && + !extracted_buf[next_txnid_index].owners && + !m_escalation_barrier( + extracted_buf[current_index].range.get_right_key(), + extracted_buf[next_txnid_index].range.get_left_key(), + m_escalation_barrier_arg)) { + next_txnid_index++; + } + + // Create an escalated range for the current txnid that dominates + // each range between the current indext and the next txnid's index. + // const TXNID current_txnid = extracted_buf[current_index].txnid; + const DBT *escalated_left_key = + extracted_buf[current_index].range.get_left_key(); + const DBT *escalated_right_key = + extracted_buf[next_txnid_index - 1].range.get_right_key(); + + // Try to find a range buffer for the current txnid. Create one if it + // doesn't exist. Then, append the new escalated range to the buffer. 
(If + // a lock is shared by multiple txnids, append it to each txnid's list) + TxnidVector *owners_ptr; + TxnidVector singleton_owner; + if (extracted_buf[current_index].owners) + owners_ptr = extracted_buf[current_index].owners; + else { + singleton_owner.insert(extracted_buf[current_index].txnid); + owners_ptr = &singleton_owner; + } + + for (auto cur_txnid : *owners_ptr) { + uint32_t idx; + struct txnid_range_buffer *existing_range_buffer; + int r = + range_buffers.find_zero( + cur_txnid, &existing_range_buffer, &idx); + if (r == DB_NOTFOUND) { + struct txnid_range_buffer *XMALLOC(new_range_buffer); + new_range_buffer->txnid = cur_txnid; + new_range_buffer->buffer.create(); + new_range_buffer->buffer.append( + escalated_left_key, escalated_right_key, + !extracted_buf[current_index].is_shared); + range_buffers.insert_at(new_range_buffer, idx); + } else { + invariant_zero(r); + invariant(existing_range_buffer->txnid == cur_txnid); + existing_range_buffer->buffer.append( + escalated_left_key, escalated_right_key, + !extracted_buf[current_index].is_shared); + } + } + + current_index = next_txnid_index; + } + + // destroy the ranges copied during the extraction + for (int i = 0; i < num_extracted; i++) { + delete extracted_buf[i].owners; + extracted_buf[i].range.destroy(); + } + } + toku_free(extracted_buf); + + // Rebuild the locktree from each range in each range buffer, + // then notify higher layers that the txnid's locks have changed. + // + // (shared locks: if a lock was initially shared between transactions TRX1, + // TRX2, etc, we will now try to acquire it acting on behalf of TRX1, of + // TRX2, etc. This will succeed and an identical shared lock will be + // constructed) + + invariant(m_rangetree->is_empty()); + const uint32_t num_range_buffers = range_buffers.size(); + for (uint32_t i = 0; i < num_range_buffers; i++) { + struct txnid_range_buffer *current_range_buffer; + int r = range_buffers.fetch(i, &current_range_buffer); + invariant_zero(r); + if (r == EINVAL) // Shouldn't happen, avoid compiler warning + continue; + + const TXNID current_txnid = current_range_buffer->txnid; + range_buffer::iterator iter(&current_range_buffer->buffer); + range_buffer::iterator::record rec; + while (iter.current(&rec)) { + keyrange range; + range.create(rec.get_left_key(), rec.get_right_key()); + row_lock lock = {.range = range, + .txnid = current_txnid, + .is_shared = !rec.get_exclusive_flag(), + .owners = nullptr}; + insert_row_lock_into_tree(&lkr, lock, m_mgr); + iter.next(); + } + + // Notify higher layers that locks have changed for the current txnid + if (after_escalate_callback) { + after_escalate_callback(current_txnid, this, current_range_buffer->buffer, + after_escalate_callback_extra); + } + current_range_buffer->buffer.destroy(); + } + + while (range_buffers.size() > 0) { + struct txnid_range_buffer *buffer; + int r = range_buffers.fetch(0, &buffer); + invariant_zero(r); + r = range_buffers.delete_at(0); + invariant_zero(r); + toku_free(buffer); + } + range_buffers.destroy(); + + lkr.release(); +} + +void *locktree::get_userdata(void) const { return m_userdata; } + +void locktree::set_userdata(void *userdata) { m_userdata = userdata; } + +struct lt_lock_request_info *locktree::get_lock_request_info(void) { + return &m_lock_request_info; +} + +void locktree::set_comparator(const comparator &cmp) { m_cmp.inherit(cmp); } + +locktree_manager *locktree::get_manager(void) const { return m_mgr; } + +int locktree::compare(const locktree *lt) const { + if (m_dict_id.dictid < lt->m_dict_id.dictid) { +
return -1; + } else if (m_dict_id.dictid == lt->m_dict_id.dictid) { + return 0; + } else { + return 1; + } +} + +DICTIONARY_ID locktree::get_dict_id() const { return m_dict_id; } + +} /* namespace toku */ +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h new file mode 100644 index 000000000..f0f4b042d --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h @@ -0,0 +1,580 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include + +#include "../db.h" +#include "../ft/comparator.h" +#include "../portability/toku_external_pthread.h" +#include "../portability/toku_pthread.h" +#include "../portability/toku_time.h" +// PORT #include // just for DICTIONARY_ID.. 
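+
+// A minimal usage sketch of the API declared below. The dictionary id,
+// comparator, mutex factory, txnid and DBT keys are placeholders and the
+// error handling is elided; see the class comments further down for the
+// authoritative contracts.
+//
+//   locktree_manager mgr;
+//   mgr.create(nullptr /* on_create */, nullptr /* on_destroy */,
+//              nullptr /* on_escalate */, nullptr /* extra */, mutex_factory);
+//
+//   DICTIONARY_ID dict_id = {1};
+//   locktree *lt = mgr.get_lt(dict_id, cmp, nullptr /* on_create_extra */);
+//
+//   txnid_set conflicts;
+//   int r = lt->acquire_write_lock(txnid, &left_key, &right_key, &conflicts,
+//                                  false /* big_txn */);
+//   // r is 0 on success, DB_LOCK_NOTGRANTED on conflict (with `conflicts`
+//   // populated), or TOKUDB_OUT_OF_LOCKS if escalation cannot free memory.
+//
+//   range_buffer ranges;
+//   ranges.create();
+//   ranges.append(&left_key, &right_key, true /* is_write_request */);
+//   lt->release_locks(txnid, &ranges);
+//   ranges.destroy();
+//
+//   mgr.release_lt(lt);
+//   mgr.destroy();
+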
+// PORT: ft-status for LTM_STATUS: +#include "../ft/ft-status.h" + +struct DICTIONARY_ID { + uint64_t dictid; +}; + +#include "../util/omt.h" +#include "range_buffer.h" +#include "txnid_set.h" +#include "wfg.h" + +namespace toku { + +class locktree; +class locktree_manager; +class lock_request; +class concurrent_tree; + +typedef int (*lt_create_cb)(locktree *lt, void *extra); +typedef void (*lt_destroy_cb)(locktree *lt); +typedef void (*lt_escalate_cb)(TXNID txnid, const locktree *lt, + const range_buffer &buffer, void *extra); + +typedef bool (*lt_escalation_barrier_check_func)(const DBT *a, const DBT *b, + void *extra); + +struct lt_counters { + uint64_t wait_count, wait_time; + uint64_t long_wait_count, long_wait_time; + uint64_t timeout_count; + + void add(const lt_counters &rhs) { + wait_count += rhs.wait_count; + wait_time += rhs.wait_time; + long_wait_count += rhs.long_wait_count; + long_wait_time += rhs.long_wait_time; + timeout_count += rhs.timeout_count; + } +}; + +// Lock request state for some locktree +struct lt_lock_request_info { + omt pending_lock_requests; + std::atomic_bool pending_is_empty; + toku_external_mutex_t mutex; + bool should_retry_lock_requests; + lt_counters counters; + std::atomic_ullong retry_want; + unsigned long long retry_done; + toku_mutex_t retry_mutex; + toku_cond_t retry_cv; + bool running_retry; + + void init(toku_external_mutex_factory_t mutex_factory); + void destroy(void); +}; + +// The locktree manager manages a set of locktrees, one for each open +// dictionary. Locktrees are retrieved from the manager. When they are no +// longer needed, they are be released by the user. +class locktree_manager { + public: + // param: create_cb, called just after a locktree is first created. + // destroy_cb, called just before a locktree is destroyed. + // escalate_cb, called after a locktree is escalated (with extra + // param) + void create(lt_create_cb create_cb, lt_destroy_cb destroy_cb, + lt_escalate_cb escalate_cb, void *extra, + toku_external_mutex_factory_t mutex_factory_arg); + + void destroy(void); + + size_t get_max_lock_memory(void); + + int set_max_lock_memory(size_t max_lock_memory); + + // effect: Get a locktree from the manager. If a locktree exists with the + // given + // dict_id, it is referenced and then returned. If one did not exist, + // it is created. It will use the comparator for comparing keys. The + // on_create callback (passed to locktree_manager::create()) will be + // called with the given extra parameter. + locktree *get_lt(DICTIONARY_ID dict_id, const comparator &cmp, + void *on_create_extra); + + void reference_lt(locktree *lt); + + // effect: Releases one reference on a locktree. If the reference count + // transitions + // to zero, the on_destroy callback is called before it gets + // destroyed. + void release_lt(locktree *lt); + + void get_status(LTM_STATUS status); + + // effect: calls the iterate function on each pending lock request + // note: holds the manager's mutex + typedef int (*lock_request_iterate_callback)(DICTIONARY_ID dict_id, + TXNID txnid, const DBT *left_key, + const DBT *right_key, + TXNID blocking_txnid, + uint64_t start_time, + void *extra); + int iterate_pending_lock_requests(lock_request_iterate_callback cb, + void *extra); + + // effect: Determines if too many locks or too much memory is being used, + // Runs escalation on the manager if so. 
+ // param: big_txn, if the current transaction is 'big' (has spilled rollback + // logs) returns: 0 if there enough resources to create a new lock, or + // TOKUDB_OUT_OF_LOCKS + // if there are not enough resources and lock escalation failed to + // free up enough resources for a new lock. + int check_current_lock_constraints(bool big_txn); + + bool over_big_threshold(void); + + void note_mem_used(uint64_t mem_used); + + void note_mem_released(uint64_t mem_freed); + + bool out_of_locks(void) const; + + // Escalate all locktrees + void escalate_all_locktrees(void); + + // Escalate a set of locktrees + void escalate_locktrees(locktree **locktrees, int num_locktrees); + + // effect: calls the private function run_escalation(), only ok to + // do for tests. + // rationale: to get better stress test coverage, we want a way to + // deterministicly trigger lock escalation. + void run_escalation_for_test(void); + void run_escalation(void); + + // Add time t to the escalator's wait time statistics + void add_escalator_wait_time(uint64_t t); + + void kill_waiter(void *extra); + + private: + static const uint64_t DEFAULT_MAX_LOCK_MEMORY = 64L * 1024 * 1024; + + // tracks the current number of locks and lock memory + uint64_t m_max_lock_memory; + uint64_t m_current_lock_memory; + + struct lt_counters m_lt_counters; + + // the create and destroy callbacks for the locktrees + lt_create_cb m_lt_create_callback; + lt_destroy_cb m_lt_destroy_callback; + lt_escalate_cb m_lt_escalate_callback; + void *m_lt_escalate_callback_extra; + + omt m_locktree_map; + + toku_external_mutex_factory_t mutex_factory; + + // the manager's mutex protects the locktree map + toku_mutex_t m_mutex; + + void mutex_lock(void); + + void mutex_unlock(void); + + // Manage the set of open locktrees + locktree *locktree_map_find(const DICTIONARY_ID &dict_id); + void locktree_map_put(locktree *lt); + void locktree_map_remove(locktree *lt); + + static int find_by_dict_id(locktree *const <, const DICTIONARY_ID &dict_id); + + void escalator_init(void); + void escalator_destroy(void); + + // statistics about lock escalation. + toku_mutex_t m_escalation_mutex; + uint64_t m_escalation_count; + tokutime_t m_escalation_time; + uint64_t m_escalation_latest_result; + uint64_t m_wait_escalation_count; + uint64_t m_wait_escalation_time; + uint64_t m_long_wait_escalation_count; + uint64_t m_long_wait_escalation_time; + + // the escalator coordinates escalation on a set of locktrees for a bunch of + // threads + class locktree_escalator { + public: + void create(void); + void destroy(void); + void run(locktree_manager *mgr, void (*escalate_locktrees_fun)(void *extra), + void *extra); + + private: + toku_mutex_t m_escalator_mutex; + toku_cond_t m_escalator_done; + bool m_escalator_running; + }; + + locktree_escalator m_escalator; + + friend class manager_unit_test; +}; + +// A locktree represents the set of row locks owned by all transactions +// over an open dictionary. Read and write ranges are represented as +// a left and right key which are compared with the given comparator +// +// Locktrees are not created and destroyed by the user. Instead, they are +// referenced and released using the locktree manager. +// +// A sample workflow looks like this: +// - Create a manager. +// - Get a locktree by dictionaroy id from the manager. +// - Perform read/write lock acquision on the locktree, add references to +// the locktree using the manager, release locks, release references, etc. +// - ... +// - Release the final reference to the locktree. 
It will be destroyed. +// - Destroy the manager. +class locktree { + public: + // effect: Creates a locktree + void create(locktree_manager *mgr, DICTIONARY_ID dict_id, + const comparator &cmp, + toku_external_mutex_factory_t mutex_factory); + + void destroy(void); + + // For thread-safe, external reference counting + void add_reference(void); + + // requires: the reference count is > 0 + // returns: the reference count, after decrementing it by one + uint32_t release_reference(void); + + // returns: the current reference count + uint32_t get_reference_count(void); + + // effect: Attempts to grant a read lock for the range of keys between + // [left_key, right_key]. returns: If the lock cannot be granted, return + // DB_LOCK_NOTGRANTED, and populate the + // given conflicts set with the txnids that hold conflicting locks in + // the range. If the locktree cannot create more locks, return + // TOKUDB_OUT_OF_LOCKS. + // note: Read locks cannot be shared between txnids, as one would expect. + // This is for simplicity since read locks are rare in MySQL. + int acquire_read_lock(TXNID txnid, const DBT *left_key, const DBT *right_key, + txnid_set *conflicts, bool big_txn); + + // effect: Attempts to grant a write lock for the range of keys between + // [left_key, right_key]. returns: If the lock cannot be granted, return + // DB_LOCK_NOTGRANTED, and populate the + // given conflicts set with the txnids that hold conflicting locks in + // the range. If the locktree cannot create more locks, return + // TOKUDB_OUT_OF_LOCKS. + int acquire_write_lock(TXNID txnid, const DBT *left_key, const DBT *right_key, + txnid_set *conflicts, bool big_txn); + + // effect: populate the conflicts set with the txnids that would preventing + // the given txnid from getting a lock on [left_key, right_key] + void get_conflicts(bool is_write_request, TXNID txnid, const DBT *left_key, + const DBT *right_key, txnid_set *conflicts); + + // effect: Release all of the lock ranges represented by the range buffer for + // a txnid. + void release_locks(TXNID txnid, const range_buffer *ranges, + bool all_trx_locks_hint = false); + + // effect: Runs escalation on this locktree + void escalate(lt_escalate_cb after_escalate_callback, void *extra); + + // returns: The userdata associated with this locktree, or null if it has not + // been set. + void *get_userdata(void) const; + + void set_userdata(void *userdata); + + locktree_manager *get_manager(void) const; + + void set_comparator(const comparator &cmp); + + // Set the user-provided Lock Escalation Barrier check function and its + // argument + // + // Lock Escalation Barrier limits the scope of Lock Escalation. + // For two keys A and B (such that A < B), + // escalation_barrier_check_func(A, B)==true means that there's a lock + // escalation barrier between A and B, and lock escalation is not allowed to + // bridge the gap between A and B. + // + // This method sets the user-provided barrier check function and its + // parameter. + void set_escalation_barrier_func(lt_escalation_barrier_check_func func, + void *extra); + + int compare(const locktree *lt) const; + + DICTIONARY_ID get_dict_id() const; + + // Private info struct for storing pending lock request state. + // Only to be used by lock requests. We store it here as + // something less opaque than usual to strike a tradeoff between + // abstraction and code complexity. 
It is still fairly abstract + // since the lock_request object is opaque + struct lt_lock_request_info *get_lock_request_info(void); + + typedef void (*dump_callback)(void *cdata, const DBT *left, const DBT *right, + TXNID txnid, bool is_shared, + TxnidVector *owners); + void dump_locks(void *cdata, dump_callback cb); + + private: + locktree_manager *m_mgr; + DICTIONARY_ID m_dict_id; + uint32_t m_reference_count; + + // Since the memory referenced by this comparator is not owned by the + // locktree, the user must guarantee it will outlive the locktree. + // + // The ydb API accomplishes this by opening an ft_handle in the on_create + // callback, which will keep the underlying FT (and its descriptor) in memory + // for as long as the handle is open. The ft_handle is stored opaquely in the + // userdata pointer below. see locktree_manager::get_lt w/ on_create_extra + comparator m_cmp; + + lt_escalation_barrier_check_func m_escalation_barrier; + void *m_escalation_barrier_arg; + + concurrent_tree *m_rangetree; + + void *m_userdata; + struct lt_lock_request_info m_lock_request_info; + + // psergey-todo: + // Each transaction also keeps a list of ranges it has locked. + // So, when a transaction is running in STO mode, two identical + // lists are kept: the STO lock list and transaction's owned locks + // list. Why can't we do with just one list? + + // The following fields and members prefixed with "sto_" are for + // the single txnid optimization, intended to speed up the case + // when only one transaction is using the locktree. If we know + // the locktree has only one transaction, then acquiring locks + // takes O(1) work and releasing all locks takes O(1) work. + // + // How do we know that the locktree only has a single txnid? + // What do we do if it does? + // + // When a txn with txnid T requests a lock: + // - If the tree is empty, the optimization is possible. Set the single + // txnid to T, and insert the lock range into the buffer. + // - If the tree is not empty, check if the single txnid is T. If so, + // append the lock range to the buffer. Otherwise, migrate all of + // the locks in the buffer into the rangetree on behalf of txnid T, + // and invalid the single txnid. + // + // When a txn with txnid T releases its locks: + // - If the single txnid is valid, it must be for T. Destroy the buffer. + // - If it's not valid, release locks the normal way in the rangetree. + // + // To carry out the optimization we need to record a single txnid + // and a range buffer for each locktree, each protected by the root + // lock of the locktree's rangetree. The root lock for a rangetree + // is grabbed by preparing a locked keyrange on the rangetree. + TXNID m_sto_txnid; + range_buffer m_sto_buffer; + + // The single txnid optimization speeds up the case when only one + // transaction is using the locktree. But it has the potential to + // hurt the case when more than one txnid exists. + // + // There are two things we need to do to make the optimization only + // optimize the case we care about, and not hurt the general case. + // + // Bound the worst-case latency for lock migration when the + // optimization stops working: + // - Idea: Stop the optimization and migrate immediate if we notice + // the single txnid has takes many locks in the range buffer. + // - Implementation: Enforce a max size on the single txnid range buffer. + // - Analysis: Choosing the perfect max value, M, is difficult to do + // without some feedback from the field. 
Intuition tells us that M should + // not be so small that the optimization is worthless, and it should not + // be so big that it's unreasonable to have to wait behind a thread doing + // the work of converting M buffer locks into rangetree locks. + // + // Prevent concurrent-transaction workloads from trying the optimization + // in vain: + // - Idea: Don't even bother trying the optimization if we think the + // system is in a concurrent-transaction state. + // - Implementation: Do something even simpler than detecting whether the + // system is in a concurent-transaction state. Just keep a "score" value + // and some threshold. If at any time the locktree is eligible for the + // optimization, only do it if the score is at this threshold. When you + // actually do the optimization but someone has to migrate locks in the buffer + // (expensive), then reset the score back to zero. Each time a txn + // releases locks, the score is incremented by 1. + // - Analysis: If you let the threshold be "C", then at most 1 / C txns will + // do the optimization in a concurrent-transaction system. Similarly, it + // takes at most C txns to start using the single txnid optimzation, which + // is good when the system transitions from multithreaded to single threaded. + // + // STO_BUFFER_MAX_SIZE: + // + // We choose the max value to be 1 million since most transactions are smaller + // than 1 million and we can create a rangetree of 1 million elements in + // less than a second. So we can be pretty confident that this threshold + // enables the optimization almost always, and prevents super pathological + // latency issues for the first lock taken by a second thread. + // + // STO_SCORE_THRESHOLD: + // + // A simple first guess at a good value for the score threshold is 100. + // By our analysis, we'd end up doing the optimization in vain for + // around 1% of all transactions, which seems reasonable. Further, + // if the system goes single threaded, it ought to be pretty quick + // for 100 transactions to go by, so we won't have to wait long before + // we start doing the single txind optimzation again. + static const int STO_BUFFER_MAX_SIZE = 50 * 1024; + static const int STO_SCORE_THRESHOLD = 100; + int m_sto_score; + + // statistics about time spent ending the STO early + uint64_t m_sto_end_early_count; + tokutime_t m_sto_end_early_time; + + // effect: begins the single txnid optimizaiton, setting m_sto_txnid + // to the given txnid. + // requires: m_sto_txnid is invalid + void sto_begin(TXNID txnid); + + // effect: append a range to the sto buffer + // requires: m_sto_txnid is valid + void sto_append(const DBT *left_key, const DBT *right_key, + bool is_write_request); + + // effect: ends the single txnid optimization, releaseing any memory + // stored in the sto buffer, notifying the tracker, and + // invalidating m_sto_txnid. + // requires: m_sto_txnid is valid + void sto_end(void); + + // params: prepared_lkr is a void * to a prepared locked keyrange. see below. + // effect: ends the single txnid optimization early, migrating buffer locks + // into the rangetree, calling sto_end(), and then setting the + // sto_score back to zero. + // requires: m_sto_txnid is valid + void sto_end_early(void *prepared_lkr); + void sto_end_early_no_accounting(void *prepared_lkr); + + // params: prepared_lkr is a void * to a prepared locked keyrange. 
we can't + // use + // the real type because the compiler won't allow us to forward + // declare concurrent_tree::locked_keyrange without including + // concurrent_tree.h, which we cannot do here because it is a template + // implementation. + // requires: the prepared locked keyrange is for the locktree's rangetree + // requires: m_sto_txnid is valid + // effect: migrates each lock in the single txnid buffer into the locktree's + // rangetree, notifying the memory tracker as necessary. + void sto_migrate_buffer_ranges_to_tree(void *prepared_lkr); + + // effect: If m_sto_txnid is valid, then release the txnid's locks + // by ending the optimization. + // requires: If m_sto_txnid is valid, it is equal to the given txnid + // returns: True if locks were released for this txnid + bool sto_try_release(TXNID txnid); + + // params: prepared_lkr is a void * to a prepared locked keyrange. see above. + // requires: the prepared locked keyrange is for the locktree's rangetree + // effect: If m_sto_txnid is valid and equal to the given txnid, then + // append a range onto the buffer. Otherwise, if m_sto_txnid is valid + // but not equal to this txnid, then migrate the buffer's locks + // into the rangetree and end the optimization, setting the score + // back to zero. + // returns: true if the lock was acquired for this txnid + bool sto_try_acquire(void *prepared_lkr, TXNID txnid, const DBT *left_key, + const DBT *right_key, bool is_write_request); + + // Effect: + // Provides a hook for a helgrind suppression. + // Returns: + // true if m_sto_txnid is not TXNID_NONE + bool sto_txnid_is_valid_unsafe(void) const; + + // Effect: + // Provides a hook for a helgrind suppression. + // Returns: + // m_sto_score + int sto_get_score_unsafe(void) const; + + void remove_overlapping_locks_for_txnid(TXNID txnid, const DBT *left_key, + const DBT *right_key); + + int acquire_lock_consolidated(void *prepared_lkr, TXNID txnid, + const DBT *left_key, const DBT *right_key, + bool is_write_request, txnid_set *conflicts); + + int acquire_lock(bool is_write_request, TXNID txnid, const DBT *left_key, + const DBT *right_key, txnid_set *conflicts); + + int try_acquire_lock(bool is_write_request, TXNID txnid, const DBT *left_key, + const DBT *right_key, txnid_set *conflicts, + bool big_txn); + + friend class locktree_unit_test; + friend class manager_unit_test; + friend class lock_request_unit_test; + + // engine status reaches into the locktree to read some stats + friend void locktree_manager::get_status(LTM_STATUS status); +}; + +} /* namespace toku */ diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc new file mode 100644 index 000000000..4186182be --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc @@ -0,0 +1,527 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ifndef ROCKSDB_LITE +#ifndef OS_WIN +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. 
+ + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include +#include + +#include "../portability/toku_pthread.h" +#include "../util/status.h" +#include "lock_request.h" +#include "locktree.h" + +namespace toku { + +void locktree_manager::create(lt_create_cb create_cb, lt_destroy_cb destroy_cb, + lt_escalate_cb escalate_cb, void *escalate_extra, + toku_external_mutex_factory_t mutex_factory_arg) { + mutex_factory = mutex_factory_arg; + m_max_lock_memory = DEFAULT_MAX_LOCK_MEMORY; + m_current_lock_memory = 0; + + m_locktree_map.create(); + m_lt_create_callback = create_cb; + m_lt_destroy_callback = destroy_cb; + m_lt_escalate_callback = escalate_cb; + m_lt_escalate_callback_extra = escalate_extra; + ZERO_STRUCT(m_mutex); + toku_mutex_init(manager_mutex_key, &m_mutex, nullptr); + + ZERO_STRUCT(m_lt_counters); + + escalator_init(); +} + +void locktree_manager::destroy(void) { + escalator_destroy(); + invariant(m_current_lock_memory == 0); + invariant(m_locktree_map.size() == 0); + m_locktree_map.destroy(); + toku_mutex_destroy(&m_mutex); +} + +void locktree_manager::mutex_lock(void) { toku_mutex_lock(&m_mutex); } + +void locktree_manager::mutex_unlock(void) { toku_mutex_unlock(&m_mutex); } + +size_t locktree_manager::get_max_lock_memory(void) { return m_max_lock_memory; } + +int locktree_manager::set_max_lock_memory(size_t max_lock_memory) { + int r = 0; + mutex_lock(); + if (max_lock_memory < m_current_lock_memory) { + r = EDOM; + } else { + m_max_lock_memory = max_lock_memory; + } + mutex_unlock(); + return r; +} + +int locktree_manager::find_by_dict_id(locktree *const <, + const DICTIONARY_ID &dict_id) { + if (lt->get_dict_id().dictid < dict_id.dictid) { + return -1; + } else if (lt->get_dict_id().dictid == dict_id.dictid) { + return 0; + } else { + return 1; + } +} + +locktree *locktree_manager::locktree_map_find(const DICTIONARY_ID &dict_id) { + locktree *lt; + int r = m_locktree_map.find_zero(dict_id, <, + nullptr); + return r 
== 0 ? lt : nullptr; +} + +void locktree_manager::locktree_map_put(locktree *lt) { + int r = m_locktree_map.insert( + lt, lt->get_dict_id(), nullptr); + invariant_zero(r); +} + +void locktree_manager::locktree_map_remove(locktree *lt) { + uint32_t idx; + locktree *found_lt; + int r = m_locktree_map.find_zero( + lt->get_dict_id(), &found_lt, &idx); + invariant_zero(r); + invariant(found_lt == lt); + r = m_locktree_map.delete_at(idx); + invariant_zero(r); +} + +locktree *locktree_manager::get_lt(DICTIONARY_ID dict_id, const comparator &cmp, + void *on_create_extra) { + // hold the mutex around searching and maybe + // inserting into the locktree map + mutex_lock(); + + locktree *lt = locktree_map_find(dict_id); + if (lt == nullptr) { + XCALLOC(lt); + lt->create(this, dict_id, cmp, mutex_factory); + + // new locktree created - call the on_create callback + // and put it in the locktree map + if (m_lt_create_callback) { + int r = m_lt_create_callback(lt, on_create_extra); + if (r != 0) { + lt->release_reference(); + lt->destroy(); + toku_free(lt); + lt = nullptr; + } + } + if (lt) { + locktree_map_put(lt); + } + } else { + reference_lt(lt); + } + + mutex_unlock(); + + return lt; +} + +void locktree_manager::reference_lt(locktree *lt) { + // increment using a sync fetch and add. + // the caller guarantees that the lt won't be + // destroyed while we increment the count here. + // + // the caller can do this by already having an lt + // reference or by holding the manager mutex. + // + // if the manager's mutex is held, it is ok for the + // reference count to transition from 0 to 1 (no race), + // since we're serialized with other opens and closes. + lt->add_reference(); +} + +void locktree_manager::release_lt(locktree *lt) { + bool do_destroy = false; + DICTIONARY_ID dict_id = lt->get_dict_id(); + + // Release a reference on the locktree. If the count transitions to zero, + // then we *may* need to do the cleanup. + // + // Grab the manager's mutex and look for a locktree with this locktree's + // dictionary id. Since dictionary id's never get reused, any locktree + // found must be the one we just released a reference on. + // + // At least two things could have happened since we got the mutex: + // - Another thread gets a locktree with the same dict_id, increments + // the reference count. In this case, we shouldn't destroy it. + // - Another thread gets a locktree with the same dict_id and then + // releases it quickly, transitioning the reference count from zero to + // one and back to zero. In this case, only one of us should destroy it. + // It doesn't matter which. We originally missed this case, see #5776. + // + // After 5776, the high level rule for release is described below. + // + // If a thread releases a locktree and notices the reference count transition + // to zero, then that thread must immediately: + // - assume the locktree object is invalid + // - grab the manager's mutex + // - search the locktree map for a locktree with the same dict_id and remove + // it, if it exists. the destroy may be deferred. + // - release the manager's mutex + // + // This way, if many threads transition the same locktree's reference count + // from 1 to zero and wait behind the manager's mutex, only one of them will + // do the actual destroy and the others will happily do nothing. + uint32_t refs = lt->release_reference(); + if (refs == 0) { + mutex_lock(); + // lt may not have already been destroyed, so look it up. 
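+    // (dict_id was copied out before our reference was dropped: the possibly
+    // stale `lt` pointer is only compared against the map entry below and is
+    // not dereferenced until the map confirms the object is still registered,
+    // at which point it cannot be freed while we hold the manager's mutex.)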
+ locktree *find_lt = locktree_map_find(dict_id); + if (find_lt != nullptr) { + // A locktree is still in the map with that dict_id, so it must be + // equal to lt. This is true because dictionary ids are never reused. + // If the reference count is zero, it's our responsibility to remove + // it and do the destroy. Otherwise, someone still wants it. + // If the locktree is still valid then check if it should be deleted. + if (find_lt == lt) { + if (lt->get_reference_count() == 0) { + locktree_map_remove(lt); + do_destroy = true; + } + m_lt_counters.add(lt->get_lock_request_info()->counters); + } + } + mutex_unlock(); + } + + // if necessary, do the destroy without holding the mutex + if (do_destroy) { + if (m_lt_destroy_callback) { + m_lt_destroy_callback(lt); + } + lt->destroy(); + toku_free(lt); + } +} + +void locktree_manager::run_escalation(void) { + struct escalation_fn { + static void run(void *extra) { + locktree_manager *mgr = (locktree_manager *)extra; + mgr->escalate_all_locktrees(); + }; + }; + m_escalator.run(this, escalation_fn::run, this); +} + +// test-only version of lock escalation +void locktree_manager::run_escalation_for_test(void) { run_escalation(); } + +void locktree_manager::escalate_all_locktrees(void) { + uint64_t t0 = toku_current_time_microsec(); + + // get all locktrees + mutex_lock(); + int num_locktrees = m_locktree_map.size(); + locktree **locktrees = new locktree *[num_locktrees]; + for (int i = 0; i < num_locktrees; i++) { + int r = m_locktree_map.fetch(i, &locktrees[i]); + invariant_zero(r); + reference_lt(locktrees[i]); + } + mutex_unlock(); + + // escalate them + escalate_locktrees(locktrees, num_locktrees); + + delete[] locktrees; + + uint64_t t1 = toku_current_time_microsec(); + add_escalator_wait_time(t1 - t0); +} + +void locktree_manager::note_mem_used(uint64_t mem_used) { + (void)toku_sync_fetch_and_add(&m_current_lock_memory, mem_used); +} + +void locktree_manager::note_mem_released(uint64_t mem_released) { + uint64_t old_mem_used = + toku_sync_fetch_and_sub(&m_current_lock_memory, mem_released); + invariant(old_mem_used >= mem_released); +} + +bool locktree_manager::out_of_locks(void) const { + return m_current_lock_memory >= m_max_lock_memory; +} + +bool locktree_manager::over_big_threshold(void) { + return m_current_lock_memory >= m_max_lock_memory / 2; +} + +int locktree_manager::iterate_pending_lock_requests( + lock_request_iterate_callback callback, void *extra) { + mutex_lock(); + int r = 0; + uint32_t num_locktrees = m_locktree_map.size(); + for (uint32_t i = 0; i < num_locktrees && r == 0; i++) { + locktree *lt; + r = m_locktree_map.fetch(i, <); + invariant_zero(r); + if (r == EINVAL) // Shouldn't happen, avoid compiler warning + continue; + + struct lt_lock_request_info *info = lt->get_lock_request_info(); + toku_external_mutex_lock(&info->mutex); + + uint32_t num_requests = info->pending_lock_requests.size(); + for (uint32_t k = 0; k < num_requests && r == 0; k++) { + lock_request *req; + r = info->pending_lock_requests.fetch(k, &req); + invariant_zero(r); + if (r == EINVAL) /* Shouldn't happen, avoid compiler warning */ + continue; + r = callback(lt->get_dict_id(), req->get_txnid(), req->get_left_key(), + req->get_right_key(), req->get_conflicting_txnid(), + req->get_start_time(), extra); + } + + toku_external_mutex_unlock(&info->mutex); + } + mutex_unlock(); + return r; +} + +int locktree_manager::check_current_lock_constraints(bool big_txn) { + int r = 0; + if (big_txn && over_big_threshold()) { + run_escalation(); + if 
(over_big_threshold()) { + r = TOKUDB_OUT_OF_LOCKS; + } + } + if (r == 0 && out_of_locks()) { + run_escalation(); + if (out_of_locks()) { + // return an error if we're still out of locks after escalation. + r = TOKUDB_OUT_OF_LOCKS; + } + } + return r; +} + +void locktree_manager::escalator_init(void) { + ZERO_STRUCT(m_escalation_mutex); + toku_mutex_init(manager_escalation_mutex_key, &m_escalation_mutex, nullptr); + m_escalation_count = 0; + m_escalation_time = 0; + m_wait_escalation_count = 0; + m_wait_escalation_time = 0; + m_long_wait_escalation_count = 0; + m_long_wait_escalation_time = 0; + m_escalation_latest_result = 0; + m_escalator.create(); +} + +void locktree_manager::escalator_destroy(void) { + m_escalator.destroy(); + toku_mutex_destroy(&m_escalation_mutex); +} + +void locktree_manager::add_escalator_wait_time(uint64_t t) { + toku_mutex_lock(&m_escalation_mutex); + m_wait_escalation_count += 1; + m_wait_escalation_time += t; + if (t >= 1000000) { + m_long_wait_escalation_count += 1; + m_long_wait_escalation_time += t; + } + toku_mutex_unlock(&m_escalation_mutex); +} + +void locktree_manager::escalate_locktrees(locktree **locktrees, + int num_locktrees) { + // there are too many row locks in the system and we need to tidy up. + // + // a simple implementation of escalation does not attempt + // to reduce the memory foot print of each txn's range buffer. + // doing so would require some layering hackery (or a callback) + // and more complicated locking. for now, just escalate each + // locktree individually, in-place. + tokutime_t t0 = toku_time_now(); + for (int i = 0; i < num_locktrees; i++) { + locktrees[i]->escalate(m_lt_escalate_callback, + m_lt_escalate_callback_extra); + release_lt(locktrees[i]); + } + tokutime_t t1 = toku_time_now(); + + toku_mutex_lock(&m_escalation_mutex); + m_escalation_count++; + m_escalation_time += (t1 - t0); + m_escalation_latest_result = m_current_lock_memory; + toku_mutex_unlock(&m_escalation_mutex); +} + +struct escalate_args { + locktree_manager *mgr; + locktree **locktrees; + int num_locktrees; +}; + +void locktree_manager::locktree_escalator::create(void) { + ZERO_STRUCT(m_escalator_mutex); + toku_mutex_init(manager_escalator_mutex_key, &m_escalator_mutex, nullptr); + toku_cond_init(manager_m_escalator_done_key, &m_escalator_done, nullptr); + m_escalator_running = false; +} + +void locktree_manager::locktree_escalator::destroy(void) { + toku_cond_destroy(&m_escalator_done); + toku_mutex_destroy(&m_escalator_mutex); +} + +void locktree_manager::locktree_escalator::run( + locktree_manager *mgr, void (*escalate_locktrees_fun)(void *extra), + void *extra) { + uint64_t t0 = toku_current_time_microsec(); + toku_mutex_lock(&m_escalator_mutex); + if (!m_escalator_running) { + // run escalation on this thread + m_escalator_running = true; + toku_mutex_unlock(&m_escalator_mutex); + escalate_locktrees_fun(extra); + toku_mutex_lock(&m_escalator_mutex); + m_escalator_running = false; + toku_cond_broadcast(&m_escalator_done); + } else { + toku_cond_wait(&m_escalator_done, &m_escalator_mutex); + } + toku_mutex_unlock(&m_escalator_mutex); + uint64_t t1 = toku_current_time_microsec(); + mgr->add_escalator_wait_time(t1 - t0); +} + +void locktree_manager::get_status(LTM_STATUS statp) { + ltm_status.init(); + LTM_STATUS_VAL(LTM_SIZE_CURRENT) = m_current_lock_memory; + LTM_STATUS_VAL(LTM_SIZE_LIMIT) = m_max_lock_memory; + LTM_STATUS_VAL(LTM_ESCALATION_COUNT) = m_escalation_count; + LTM_STATUS_VAL(LTM_ESCALATION_TIME) = m_escalation_time; + 
LTM_STATUS_VAL(LTM_ESCALATION_LATEST_RESULT) = m_escalation_latest_result; + LTM_STATUS_VAL(LTM_WAIT_ESCALATION_COUNT) = m_wait_escalation_count; + LTM_STATUS_VAL(LTM_WAIT_ESCALATION_TIME) = m_wait_escalation_time; + LTM_STATUS_VAL(LTM_LONG_WAIT_ESCALATION_COUNT) = m_long_wait_escalation_count; + LTM_STATUS_VAL(LTM_LONG_WAIT_ESCALATION_TIME) = m_long_wait_escalation_time; + + uint64_t lock_requests_pending = 0; + uint64_t sto_num_eligible = 0; + uint64_t sto_end_early_count = 0; + tokutime_t sto_end_early_time = 0; + uint32_t num_locktrees = 0; + struct lt_counters lt_counters; + ZERO_STRUCT(lt_counters); // PORT: instead of ={}. + + if (toku_mutex_trylock(&m_mutex) == 0) { + lt_counters = m_lt_counters; + num_locktrees = m_locktree_map.size(); + for (uint32_t i = 0; i < num_locktrees; i++) { + locktree *lt; + int r = m_locktree_map.fetch(i, &lt); + invariant_zero(r); + if (r == EINVAL) // Shouldn't happen, avoid compiler warning + continue; + if (toku_external_mutex_trylock(&lt->m_lock_request_info.mutex) == 0) { + lock_requests_pending += + lt->m_lock_request_info.pending_lock_requests.size(); + lt_counters.add(lt->get_lock_request_info()->counters); + toku_external_mutex_unlock(&lt->m_lock_request_info.mutex); + } + sto_num_eligible += lt->sto_txnid_is_valid_unsafe() ? 1 : 0; + sto_end_early_count += lt->m_sto_end_early_count; + sto_end_early_time += lt->m_sto_end_early_time; + } + mutex_unlock(); + } + + LTM_STATUS_VAL(LTM_NUM_LOCKTREES) = num_locktrees; + LTM_STATUS_VAL(LTM_LOCK_REQUESTS_PENDING) = lock_requests_pending; + LTM_STATUS_VAL(LTM_STO_NUM_ELIGIBLE) = sto_num_eligible; + LTM_STATUS_VAL(LTM_STO_END_EARLY_COUNT) = sto_end_early_count; + LTM_STATUS_VAL(LTM_STO_END_EARLY_TIME) = sto_end_early_time; + LTM_STATUS_VAL(LTM_WAIT_COUNT) = lt_counters.wait_count; + LTM_STATUS_VAL(LTM_WAIT_TIME) = lt_counters.wait_time; + LTM_STATUS_VAL(LTM_LONG_WAIT_COUNT) = lt_counters.long_wait_count; + LTM_STATUS_VAL(LTM_LONG_WAIT_TIME) = lt_counters.long_wait_time; + LTM_STATUS_VAL(LTM_TIMEOUT_COUNT) = lt_counters.timeout_count; + *statp = ltm_status; +} + +void locktree_manager::kill_waiter(void *extra) { + mutex_lock(); + int r = 0; + uint32_t num_locktrees = m_locktree_map.size(); + for (uint32_t i = 0; i < num_locktrees; i++) { + locktree *lt; + r = m_locktree_map.fetch(i, &lt); + invariant_zero(r); + if (r) continue; // Get rid of "may be used uninitialized" warning + lock_request::kill_waiter(lt, extra); + } + mutex_unlock(); +} + +} /* namespace toku */ +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc new file mode 100644 index 000000000..1e1d23ef8 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc @@ -0,0 +1,265 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ifndef ROCKSDB_LITE +#ifndef OS_WIN +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation.
+ + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include "range_buffer.h" + +#include + +#include "../portability/memory.h" +#include "../util/dbt.h" + +namespace toku { + +bool range_buffer::record_header::left_is_infinite(void) const { + return left_neg_inf || left_pos_inf; +} + +bool range_buffer::record_header::right_is_infinite(void) const { + return right_neg_inf || right_pos_inf; +} + +void range_buffer::record_header::init(const DBT *left_key, + const DBT *right_key, + bool is_exclusive) { + is_exclusive_lock = is_exclusive; + left_neg_inf = left_key == toku_dbt_negative_infinity(); + left_pos_inf = left_key == toku_dbt_positive_infinity(); + left_key_size = toku_dbt_is_infinite(left_key) ? 0 : left_key->size; + if (right_key) { + right_neg_inf = right_key == toku_dbt_negative_infinity(); + right_pos_inf = right_key == toku_dbt_positive_infinity(); + right_key_size = toku_dbt_is_infinite(right_key) ? 
0 : right_key->size; + } else { + right_neg_inf = left_neg_inf; + right_pos_inf = left_pos_inf; + right_key_size = 0; + } +} + +const DBT *range_buffer::iterator::record::get_left_key(void) const { + if (_header.left_neg_inf) { + return toku_dbt_negative_infinity(); + } else if (_header.left_pos_inf) { + return toku_dbt_positive_infinity(); + } else { + return &_left_key; + } +} + +const DBT *range_buffer::iterator::record::get_right_key(void) const { + if (_header.right_neg_inf) { + return toku_dbt_negative_infinity(); + } else if (_header.right_pos_inf) { + return toku_dbt_positive_infinity(); + } else { + return &_right_key; + } +} + +size_t range_buffer::iterator::record::size(void) const { + return sizeof(record_header) + _header.left_key_size + _header.right_key_size; +} + +void range_buffer::iterator::record::deserialize(const char *buf) { + size_t current = 0; + + // deserialize the header + memcpy(&_header, buf, sizeof(record_header)); + current += sizeof(record_header); + + // deserialize the left key if necessary + if (!_header.left_is_infinite()) { + // point the left DBT's buffer into ours + toku_fill_dbt(&_left_key, buf + current, _header.left_key_size); + current += _header.left_key_size; + } + + // deserialize the right key if necessary + if (!_header.right_is_infinite()) { + if (_header.right_key_size == 0) { + toku_copyref_dbt(&_right_key, _left_key); + } else { + toku_fill_dbt(&_right_key, buf + current, _header.right_key_size); + } + } +} + +toku::range_buffer::iterator::iterator() + : _ma_chunk_iterator(nullptr), + _current_chunk_base(nullptr), + _current_chunk_offset(0), + _current_chunk_max(0), + _current_rec_size(0) {} + +toku::range_buffer::iterator::iterator(const range_buffer *buffer) + : _ma_chunk_iterator(&buffer->_arena), + _current_chunk_base(nullptr), + _current_chunk_offset(0), + _current_chunk_max(0), + _current_rec_size(0) { + reset_current_chunk(); +} + +void range_buffer::iterator::reset_current_chunk() { + _current_chunk_base = _ma_chunk_iterator.current(&_current_chunk_max); + _current_chunk_offset = 0; +} + +bool range_buffer::iterator::current(record *rec) { + if (_current_chunk_offset < _current_chunk_max) { + const char *buf = reinterpret_cast(_current_chunk_base); + rec->deserialize(buf + _current_chunk_offset); + _current_rec_size = rec->size(); + return true; + } else { + return false; + } +} + +// move the iterator to the next record in the buffer +void range_buffer::iterator::next(void) { + invariant(_current_chunk_offset < _current_chunk_max); + invariant(_current_rec_size > 0); + + // the next record is _current_rec_size bytes forward + _current_chunk_offset += _current_rec_size; + // now, we don't know how big the current is, set it to 0. + _current_rec_size = 0; + + if (_current_chunk_offset >= _current_chunk_max) { + // current chunk is exhausted, try moving to the next one + if (_ma_chunk_iterator.more()) { + _ma_chunk_iterator.next(); + reset_current_chunk(); + } + } +} + +void range_buffer::create(void) { + // allocate buffer space lazily instead of on creation. this way, + // no malloc/free is done if the transaction ends up taking no locks. + _arena.create(0); + _num_ranges = 0; +} + +void range_buffer::append(const DBT *left_key, const DBT *right_key, + bool is_write_request) { + // if the keys are equal, then only one copy is stored. 
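+  // Layout of each record in the arena:
+  //   [record_header][left key bytes, if finite][right key bytes, if finite]
+  // Infinite endpoints store no key bytes and are encoded by the header's
+  // neg_inf/pos_inf flags. A point range (left == right) stores its key bytes
+  // once; on read, record::deserialize() aliases the right DBT to the left.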
+ if (toku_dbt_equals(left_key, right_key)) { + invariant(left_key->size <= MAX_KEY_SIZE); + append_point(left_key, is_write_request); + } else { + invariant(left_key->size <= MAX_KEY_SIZE); + invariant(right_key->size <= MAX_KEY_SIZE); + append_range(left_key, right_key, is_write_request); + } + _num_ranges++; +} + +bool range_buffer::is_empty(void) const { return total_memory_size() == 0; } + +uint64_t range_buffer::total_memory_size(void) const { + return _arena.total_size_in_use(); +} + +int range_buffer::get_num_ranges(void) const { return _num_ranges; } + +void range_buffer::destroy(void) { _arena.destroy(); } + +void range_buffer::append_range(const DBT *left_key, const DBT *right_key, + bool is_exclusive) { + size_t record_length = + sizeof(record_header) + left_key->size + right_key->size; + char *buf = reinterpret_cast(_arena.malloc_from_arena(record_length)); + + record_header h; + h.init(left_key, right_key, is_exclusive); + + // serialize the header + memcpy(buf, &h, sizeof(record_header)); + buf += sizeof(record_header); + + // serialize the left key if necessary + if (!h.left_is_infinite()) { + memcpy(buf, left_key->data, left_key->size); + buf += left_key->size; + } + + // serialize the right key if necessary + if (!h.right_is_infinite()) { + memcpy(buf, right_key->data, right_key->size); + } +} + +void range_buffer::append_point(const DBT *key, bool is_exclusive) { + size_t record_length = sizeof(record_header) + key->size; + char *buf = reinterpret_cast(_arena.malloc_from_arena(record_length)); + + record_header h; + h.init(key, nullptr, is_exclusive); + + // serialize the header + memcpy(buf, &h, sizeof(record_header)); + buf += sizeof(record_header); + + // serialize the key if necessary + if (!h.left_is_infinite()) { + memcpy(buf, key->data, key->size); + } +} + +} /* namespace toku */ +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.h new file mode 100644 index 000000000..76e28d747 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.h @@ -0,0 +1,178 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. 
+ + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include +#include + +#include "../util/dbt.h" +#include "../util/memarena.h" + +namespace toku { + +// a key range buffer represents a set of key ranges that can +// be stored, iterated over, and then destroyed all at once. +class range_buffer { + private: + // the key range buffer is a bunch of records in a row. + // each record has the following header, followed by the + // left key and right key data payload, if applicable. + // we limit keys to be 2^16, since we store lengths as 2 bytes. + static const size_t MAX_KEY_SIZE = 1 << 16; + + struct record_header { + bool left_neg_inf; + bool left_pos_inf; + bool right_pos_inf; + bool right_neg_inf; + uint16_t left_key_size; + uint16_t right_key_size; + bool is_exclusive_lock; + + bool left_is_infinite(void) const; + + bool right_is_infinite(void) const; + + void init(const DBT *left_key, const DBT *right_key, bool is_exclusive); + }; + // PORT static_assert(sizeof(record_header) == 8, "record header format is + // off"); + + public: + // the iterator abstracts reading over a buffer of variable length + // records one by one until there are no more left. + class iterator { + public: + iterator(); + iterator(const range_buffer *buffer); + + // a record represents the user-view of a serialized key range. + // it handles positive and negative infinity and the optimized + // point range case, where left and right points share memory. + class record { + public: + // get a read-only pointer to the left key of this record's range + const DBT *get_left_key(void) const; + + // get a read-only pointer to the right key of this record's range + const DBT *get_right_key(void) const; + + // how big is this record? this tells us where the next record is + size_t size(void) const; + + bool get_exclusive_flag() const { return _header.is_exclusive_lock; } + + // populate a record header and point our DBT's + // buffers into ours if they are not infinite. + void deserialize(const char *buf); + + private: + record_header _header; + DBT _left_key; + DBT _right_key; + }; + + // populate the given record object with the current + // the memory referred to by record is valid for only + // as long as the record exists. + bool current(record *rec); + + // move the iterator to the next record in the buffer + void next(void); + + private: + void reset_current_chunk(); + + // the key range buffer we are iterating over, the current + // offset in that buffer, and the size of the current record. + memarena::chunk_iterator _ma_chunk_iterator; + const void *_current_chunk_base; + size_t _current_chunk_offset; + size_t _current_chunk_max; + size_t _current_rec_size; + }; + + // allocate buffer space lazily instead of on creation. 
this way, + // no malloc/free is done if the transaction ends up taking no locks. + void create(void); + + // append a left/right key range to the buffer. + // if the keys are equal, then only one copy is stored. + void append(const DBT *left_key, const DBT *right_key, + bool is_write_request = false); + + // is this range buffer empty? + bool is_empty(void) const; + + // how much memory is being used by this range buffer? + uint64_t total_memory_size(void) const; + + // how many ranges are stored in this range buffer? + int get_num_ranges(void) const; + + void destroy(void); + + private: + memarena _arena; + int _num_ranges; + + void append_range(const DBT *left_key, const DBT *right_key, + bool is_write_request); + + // append a point to the buffer. this is the space/time saving + // optimization for key ranges where left == right. + void append_point(const DBT *key, bool is_write_request); +}; + +} /* namespace toku */ diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc new file mode 100644 index 000000000..8997f634b --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc @@ -0,0 +1,520 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ifndef ROCKSDB_LITE +#ifndef OS_WIN +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." 
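As an illustration of the range_buffer interface declared in range_buffer.h above, the following stand-alone sketch (not part of the patch) shows the intended lifecycle: create the buffer, append a full range and a point range, walk the records back out with the iterator, then destroy it. It assumes only the members shown above plus toku_fill_dbt() from ../util/dbt.h, which points a DBT at existing bytes.

    static void example_range_buffer_usage() {
      toku::range_buffer buffer;
      buffer.create();  // arena space is allocated lazily, so this is cheap

      DBT left, right;
      toku_fill_dbt(&left, "a", 1);   // key "a"
      toku_fill_dbt(&right, "m", 1);  // key "m"
      buffer.append(&left, &right, /*is_write_request=*/true);  // range [a, m]
      buffer.append(&left, &left);                              // point lock on "a"

      // Replay the buffered ranges; each record's keys alias buffer memory,
      // so they are only valid while the buffer is alive.
      toku::range_buffer::iterator iter(&buffer);
      toku::range_buffer::iterator::record rec;
      while (iter.current(&rec)) {
        const DBT *lk = rec.get_left_key();
        const DBT *rk = rec.get_right_key();
        (void)lk;
        (void)rk;  // e.g. hand [lk, rk] to the lock tree for release
        iter.next();
      }

      buffer.destroy();
    }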
+ +#include "treenode.h" + +#include "../portability/toku_race_tools.h" + +namespace toku { + +// TODO: source location info might have to be pulled up one caller +// to be useful +void treenode::mutex_lock(void) { toku_mutex_lock(&m_mutex); } + +void treenode::mutex_unlock(void) { toku_mutex_unlock(&m_mutex); } + +void treenode::init(const comparator *cmp) { + m_txnid = TXNID_NONE; + m_is_root = false; + m_is_empty = true; + m_cmp = cmp; + + m_is_shared = false; + m_owners = nullptr; + + // use an adaptive mutex at each node since we expect the time the + // lock is held to be relatively short compared to a context switch. + // indeed, this improves performance at high thread counts considerably. + memset(&m_mutex, 0, sizeof(toku_mutex_t)); + toku_pthread_mutexattr_t attr; + toku_mutexattr_init(&attr); + toku_mutexattr_settype(&attr, TOKU_MUTEX_ADAPTIVE); + toku_mutex_init(treenode_mutex_key, &m_mutex, &attr); + toku_mutexattr_destroy(&attr); + m_left_child.set(nullptr); + m_right_child.set(nullptr); +} + +void treenode::create_root(const comparator *cmp) { + init(cmp); + m_is_root = true; +} + +void treenode::destroy_root(void) { + invariant(is_root()); + invariant(is_empty()); + toku_mutex_destroy(&m_mutex); + m_cmp = nullptr; +} + +void treenode::set_range_and_txnid(const keyrange &range, TXNID txnid, + bool is_shared) { + // allocates a new copy of the range for this node + m_range.create_copy(range); + m_txnid = txnid; + m_is_shared = is_shared; + m_is_empty = false; +} + +bool treenode::is_root(void) { return m_is_root; } + +bool treenode::is_empty(void) { return m_is_empty; } + +bool treenode::range_overlaps(const keyrange &range) { + return m_range.overlaps(*m_cmp, range); +} + +treenode *treenode::alloc(const comparator *cmp, const keyrange &range, + TXNID txnid, bool is_shared) { + treenode *XCALLOC(node); + node->init(cmp); + node->set_range_and_txnid(range, txnid, is_shared); + return node; +} + +void treenode::swap_in_place(treenode *node1, treenode *node2) { + keyrange tmp_range = node1->m_range; + TXNID tmp_txnid = node1->m_txnid; + node1->m_range = node2->m_range; + node1->m_txnid = node2->m_txnid; + node2->m_range = tmp_range; + node2->m_txnid = tmp_txnid; + + bool tmp_is_shared = node1->m_is_shared; + node1->m_is_shared = node2->m_is_shared; + node2->m_is_shared = tmp_is_shared; + + auto tmp_m_owners = node1->m_owners; + node1->m_owners = node2->m_owners; + node2->m_owners = tmp_m_owners; +} + +bool treenode::add_shared_owner(TXNID txnid) { + assert(m_is_shared); + if (txnid == m_txnid) + return false; // acquiring a lock on the same range by the same trx + + if (m_txnid != TXNID_SHARED) { + m_owners = new TxnidVector; + m_owners->insert(m_txnid); + m_txnid = TXNID_SHARED; + } + m_owners->insert(txnid); + return true; +} + +void treenode::free(treenode *node) { + // destroy the range, freeing any copied keys + node->m_range.destroy(); + + if (node->m_owners) { + delete node->m_owners; + node->m_owners = nullptr; // need this? + } + + // the root is simply marked as empty. + if (node->is_root()) { + // PORT toku_mutex_assert_locked(&node->m_mutex); + node->m_is_empty = true; + } else { + // PORT toku_mutex_assert_unlocked(&node->m_mutex); + toku_mutex_destroy(&node->m_mutex); + toku_free(node); + } +} + +uint32_t treenode::get_depth_estimate(void) const { + const uint32_t left_est = m_left_child.depth_est; + const uint32_t right_est = m_right_child.depth_est; + return (left_est > right_est ? 
left_est : right_est) + 1; +} + +treenode *treenode::find_node_with_overlapping_child( + const keyrange &range, const keyrange::comparison *cmp_hint) { + // determine which child to look at based on a comparison. if we were + // given a comparison hint, use that. otherwise, compare them now. + keyrange::comparison c = + cmp_hint ? *cmp_hint : range.compare(*m_cmp, m_range); + + treenode *child; + if (c == keyrange::comparison::LESS_THAN) { + child = lock_and_rebalance_left(); + } else { + // The caller (locked_keyrange::acquire) handles the case where + // the root of the locked_keyrange is the node that overlaps. + // range is guaranteed not to overlap this node. + invariant(c == keyrange::comparison::GREATER_THAN); + child = lock_and_rebalance_right(); + } + + // if the search would lead us to an empty subtree (child == nullptr), + // or the child overlaps, then we know this node is the parent we want. + // otherwise we need to recur into that child. + if (child == nullptr) { + return this; + } else { + c = range.compare(*m_cmp, child->m_range); + if (c == keyrange::comparison::EQUALS || + c == keyrange::comparison::OVERLAPS) { + child->mutex_unlock(); + return this; + } else { + // unlock this node before recurring into the locked child, + // passing in a comparison hint since we just comapred range + // to the child's range. + mutex_unlock(); + return child->find_node_with_overlapping_child(range, &c); + } + } +} + +bool treenode::insert(const keyrange &range, TXNID txnid, bool is_shared) { + int rc = true; + // choose a child to check. if that child is null, then insert the new node + // there. otherwise recur down that child's subtree + keyrange::comparison c = range.compare(*m_cmp, m_range); + if (c == keyrange::comparison::LESS_THAN) { + treenode *left_child = lock_and_rebalance_left(); + if (left_child == nullptr) { + left_child = treenode::alloc(m_cmp, range, txnid, is_shared); + m_left_child.set(left_child); + } else { + left_child->insert(range, txnid, is_shared); + left_child->mutex_unlock(); + } + } else if (c == keyrange::comparison::GREATER_THAN) { + // invariant(c == keyrange::comparison::GREATER_THAN); + treenode *right_child = lock_and_rebalance_right(); + if (right_child == nullptr) { + right_child = treenode::alloc(m_cmp, range, txnid, is_shared); + m_right_child.set(right_child); + } else { + right_child->insert(range, txnid, is_shared); + right_child->mutex_unlock(); + } + } else if (c == keyrange::comparison::EQUALS) { + invariant(is_shared); + invariant(m_is_shared); + rc = add_shared_owner(txnid); + } else { + invariant(0); + } + return rc; +} + +treenode *treenode::find_child_at_extreme(int direction, treenode **parent) { + treenode *child = + direction > 0 ? 
m_right_child.get_locked() : m_left_child.get_locked(); + + if (child) { + *parent = this; + treenode *child_extreme = child->find_child_at_extreme(direction, parent); + child->mutex_unlock(); + return child_extreme; + } else { + return this; + } +} + +treenode *treenode::find_leftmost_child(treenode **parent) { + return find_child_at_extreme(-1, parent); +} + +treenode *treenode::find_rightmost_child(treenode **parent) { + return find_child_at_extreme(1, parent); +} + +treenode *treenode::remove_root_of_subtree() { + // if this node has no children, just free it and return null + if (m_left_child.ptr == nullptr && m_right_child.ptr == nullptr) { + // treenode::free requires that non-root nodes are unlocked + if (!is_root()) { + mutex_unlock(); + } + treenode::free(this); + return nullptr; + } + + // we have a child, so get either the in-order successor or + // predecessor of this node to be our replacement. + // replacement_parent is updated by the find functions as + // they recur down the tree, so initialize it to this. + treenode *child, *replacement; + treenode *replacement_parent = this; + if (m_left_child.ptr != nullptr) { + child = m_left_child.get_locked(); + replacement = child->find_rightmost_child(&replacement_parent); + invariant(replacement == child || replacement_parent != this); + + // detach the replacement from its parent + if (replacement_parent == this) { + m_left_child = replacement->m_left_child; + } else { + replacement_parent->m_right_child = replacement->m_left_child; + } + } else { + child = m_right_child.get_locked(); + replacement = child->find_leftmost_child(&replacement_parent); + invariant(replacement == child || replacement_parent != this); + + // detach the replacement from its parent + if (replacement_parent == this) { + m_right_child = replacement->m_right_child; + } else { + replacement_parent->m_left_child = replacement->m_right_child; + } + } + child->mutex_unlock(); + + // swap in place with the detached replacement, then destroy it + treenode::swap_in_place(replacement, this); + treenode::free(replacement); + + return this; +} + +void treenode::recursive_remove(void) { + treenode *left = m_left_child.ptr; + if (left) { + left->recursive_remove(); + } + m_left_child.set(nullptr); + + treenode *right = m_right_child.ptr; + if (right) { + right->recursive_remove(); + } + m_right_child.set(nullptr); + + // we do not take locks on the way down, so we know non-root nodes + // are unlocked here and the caller is required to pass a locked + // root, so this free is correct. + treenode::free(this); +} + +void treenode::remove_shared_owner(TXNID txnid) { + assert(m_owners->size() > 1); + m_owners->erase(txnid); + assert(m_owners->size() > 0); + /* if there is just one owner left, move it to m_txnid */ + if (m_owners->size() == 1) { + m_txnid = *m_owners->begin(); + delete m_owners; + m_owners = nullptr; + } +} + +treenode *treenode::remove(const keyrange &range, TXNID txnid) { + treenode *child; + // if the range is equal to this node's range, then just remove + // the root of this subtree. otherwise search down the tree + // in either the left or right children. + keyrange::comparison c = range.compare(*m_cmp, m_range); + switch (c) { + case keyrange::comparison::EQUALS: { + // if we are the only owners, remove. Otherwise, just remove + // us from the owners list. 
+ if (txnid != TXNID_ANY && has_multiple_owners()) { + remove_shared_owner(txnid); + return this; + } else { + return remove_root_of_subtree(); + } + } + case keyrange::comparison::LESS_THAN: + child = m_left_child.get_locked(); + invariant_notnull(child); + child = child->remove(range, txnid); + + // unlock the child if there still is one. + // regardless, set the right child pointer + if (child) { + child->mutex_unlock(); + } + m_left_child.set(child); + break; + case keyrange::comparison::GREATER_THAN: + child = m_right_child.get_locked(); + invariant_notnull(child); + child = child->remove(range, txnid); + + // unlock the child if there still is one. + // regardless, set the right child pointer + if (child) { + child->mutex_unlock(); + } + m_right_child.set(child); + break; + case keyrange::comparison::OVERLAPS: + // shouldn't be overlapping, since the tree is + // non-overlapping and this range must exist + abort(); + } + + return this; +} + +bool treenode::left_imbalanced(int threshold) const { + uint32_t left_depth = m_left_child.depth_est; + uint32_t right_depth = m_right_child.depth_est; + return m_left_child.ptr != nullptr && left_depth > threshold + right_depth; +} + +bool treenode::right_imbalanced(int threshold) const { + uint32_t left_depth = m_left_child.depth_est; + uint32_t right_depth = m_right_child.depth_est; + return m_right_child.ptr != nullptr && right_depth > threshold + left_depth; +} + +// effect: rebalances the subtree rooted at this node +// using AVL style O(1) rotations. unlocks this +// node if it is not the new root of the subtree. +// requires: node is locked by this thread, children are not +// returns: locked root node of the rebalanced tree +treenode *treenode::maybe_rebalance(void) { + // if we end up not rotating at all, the new root is this + treenode *new_root = this; + treenode *child = nullptr; + + if (left_imbalanced(IMBALANCE_THRESHOLD)) { + child = m_left_child.get_locked(); + if (child->right_imbalanced(0)) { + treenode *grandchild = child->m_right_child.get_locked(); + + child->m_right_child = grandchild->m_left_child; + grandchild->m_left_child.set(child); + + m_left_child = grandchild->m_right_child; + grandchild->m_right_child.set(this); + + new_root = grandchild; + } else { + m_left_child = child->m_right_child; + child->m_right_child.set(this); + new_root = child; + } + } else if (right_imbalanced(IMBALANCE_THRESHOLD)) { + child = m_right_child.get_locked(); + if (child->left_imbalanced(0)) { + treenode *grandchild = child->m_left_child.get_locked(); + + child->m_left_child = grandchild->m_right_child; + grandchild->m_right_child.set(child); + + m_right_child = grandchild->m_left_child; + grandchild->m_left_child.set(this); + + new_root = grandchild; + } else { + m_right_child = child->m_left_child; + child->m_left_child.set(this); + new_root = child; + } + } + + // up to three nodes may be locked. + // - this + // - child + // - grandchild (but if it is locked, its the new root) + // + // one of them is the new root. we unlock everything except the new root. 
+ if (child && child != new_root) { + TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(&child->m_mutex); + child->mutex_unlock(); + } + if (this != new_root) { + TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(&m_mutex); + mutex_unlock(); + } + TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(&new_root->m_mutex); + return new_root; +} + +treenode *treenode::lock_and_rebalance_left(void) { + treenode *child = m_left_child.get_locked(); + if (child) { + treenode *new_root = child->maybe_rebalance(); + m_left_child.set(new_root); + child = new_root; + } + return child; +} + +treenode *treenode::lock_and_rebalance_right(void) { + treenode *child = m_right_child.get_locked(); + if (child) { + treenode *new_root = child->maybe_rebalance(); + m_right_child.set(new_root); + child = new_root; + } + return child; +} + +void treenode::child_ptr::set(treenode *node) { + ptr = node; + depth_est = ptr ? ptr->get_depth_estimate() : 0; +} + +treenode *treenode::child_ptr::get_locked(void) { + if (ptr) { + ptr->mutex_lock(); + depth_est = ptr->get_depth_estimate(); + } + return ptr; +} + +} /* namespace toku */ +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.h new file mode 100644 index 000000000..ec25a8c58 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.h @@ -0,0 +1,302 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." 
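treenode::add_shared_owner() and treenode::remove_shared_owner() above keep shared ownership compact: a single owner lives inline in m_txnid, and only when a second shared owner arrives is a TxnidVector allocated and m_txnid set to the TXNID_SHARED sentinel; removal collapses back to inline storage once one owner remains. A minimal stand-alone model of that state machine, with hypothetical names standing in for the real TXNID/TxnidVector types, looks like this:

    #include <cstdint>
    #include <set>

    // Stand-alone model of treenode's shared-owner bookkeeping (hypothetical
    // names; the real code uses TXNID, TXNID_SHARED and TxnidVector).
    using TxnId = uint64_t;
    static const TxnId kTxnIdShared = ~TxnId(0);  // sentinel: "look in owners"

    struct SharedOwnerState {
      TxnId txnid;                        // single owner, or kTxnIdShared
      std::set<TxnId> *owners = nullptr;  // allocated only once there are 2+ owners

      explicit SharedOwnerState(TxnId first_owner) : txnid(first_owner) {}

      // Mirrors add_shared_owner(): returns false if this txn already owns the lock.
      bool add_owner(TxnId id) {
        if (id == txnid) return false;
        if (txnid != kTxnIdShared) {
          owners = new std::set<TxnId>{txnid};  // second owner: spill to a set
          txnid = kTxnIdShared;
        }
        owners->insert(id);
        return true;
      }

      // Mirrors remove_shared_owner(): collapse back to one inline owner.
      void remove_owner(TxnId id) {
        owners->erase(id);
        if (owners->size() == 1) {
          txnid = *owners->begin();
          delete owners;
          owners = nullptr;
        }
      }
    };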
+ +#pragma once + +#include + +#include "../ft/comparator.h" +#include "../portability/memory.h" +#include "../portability/toku_pthread.h" +// PORT: we need LTM_STATUS +#include "../ft/ft-status.h" +#include "../portability/txn_subst.h" +#include "keyrange.h" + +namespace toku { + +// a node in a tree with its own mutex +// - range is the "key" of this node +// - txnid is the single txnid associated with this node +// - left and right children may be null +// +// to build a tree on top of this abstraction, the user: +// - provides memory for a root node, initializes it via create_root() +// - performs tree operations on the root node. memory management +// below the root node is handled by the abstraction, not the user. +// this pattern: +// - guaruntees a root node always exists. +// - does not allow for rebalances on the root node + +class treenode { + public: + // every treenode function has some common requirements: + // - node is locked and children are never locked + // - node may be unlocked if no other thread has visibility + + // effect: create the root node + void create_root(const comparator *cmp); + + // effect: destroys the root node + void destroy_root(void); + + // effect: sets the txnid and copies the given range for this node + void set_range_and_txnid(const keyrange &range, TXNID txnid, bool is_shared); + + // returns: true iff this node is marked as empty + bool is_empty(void); + + // returns: true if this is the root node, denoted by a null parent + bool is_root(void); + + // returns: true if the given range overlaps with this node's range + bool range_overlaps(const keyrange &range); + + // effect: locks the node + void mutex_lock(void); + + // effect: unlocks the node + void mutex_unlock(void); + + // return: node whose child overlaps, or a child that is empty + // and would contain range if it existed + // given: if cmp_hint is non-null, then it is a precomputed + // comparison of this node's range to the given range. + treenode *find_node_with_overlapping_child( + const keyrange &range, const keyrange::comparison *cmp_hint); + + // effect: performs an in-order traversal of the ranges that overlap the + // given range, calling function->fn() on each node that does + // requires: function signature is: bool fn(const keyrange &range, TXNID + // txnid) requires: fn returns true to keep iterating, false to stop iterating + // requires: fn does not attempt to use any ranges read out by value + // after removing a node with an overlapping range from the tree. + template + void traverse_overlaps(const keyrange &range, F *function) { + keyrange::comparison c = range.compare(*m_cmp, m_range); + if (c == keyrange::comparison::EQUALS) { + // Doesn't matter if fn wants to keep going, there + // is nothing left, so return. + function->fn(m_range, m_txnid, m_is_shared, m_owners); + return; + } + + treenode *left = m_left_child.get_locked(); + if (left) { + if (c != keyrange::comparison::GREATER_THAN) { + // Target range is less than this node, or it overlaps this + // node. There may be something on the left. + left->traverse_overlaps(range, function); + } + left->mutex_unlock(); + } + + if (c == keyrange::comparison::OVERLAPS) { + bool keep_going = function->fn(m_range, m_txnid, m_is_shared, m_owners); + if (!keep_going) { + return; + } + } + + treenode *right = m_right_child.get_locked(); + if (right) { + if (c != keyrange::comparison::LESS_THAN) { + // Target range is greater than this node, or it overlaps this + // node. There may be something on the right. 
+ right->traverse_overlaps(range, function); + } + right->mutex_unlock(); + } + } + + // effect: inserts the given range and txnid into a subtree, recursively + // requires: range does not overlap with any node below the subtree + bool insert(const keyrange &range, TXNID txnid, bool is_shared); + + // effect: removes the given range from the subtree + // requires: range exists in the subtree + // returns: the root of the resulting subtree + treenode *remove(const keyrange &range, TXNID txnid); + + // effect: removes this node and all of its children, recursively + // requires: every node at and below this node is unlocked + void recursive_remove(void); + + private: + // the child_ptr is a light abstraction for the locking of + // a child and the maintenence of its depth estimate. + + struct child_ptr { + // set the child pointer + void set(treenode *node); + + // get and lock this child if it exists + treenode *get_locked(void); + + treenode *ptr; + uint32_t depth_est; + }; + + // the balance factor at which a node is considered imbalanced + static const int32_t IMBALANCE_THRESHOLD = 2; + + // node-level mutex + toku_mutex_t m_mutex; + + // the range and txnid for this node. the range contains a copy + // of the keys originally inserted into the tree. nodes may + // swap ranges. but at the end of the day, when a node is + // destroyed, it frees the memory associated with whatever range + // it has at the time of destruction. + keyrange m_range; + + void remove_shared_owner(TXNID txnid); + + bool has_multiple_owners() { return (m_txnid == TXNID_SHARED); } + + private: + // Owner transaction id. + // A value of TXNID_SHARED means this node has multiple owners + TXNID m_txnid; + + // If true, this lock is a non-exclusive lock, and it can have either + // one or several owners. + bool m_is_shared; + + // List of the owners, or nullptr if there's just one owner. + TxnidVector *m_owners; + + // two child pointers + child_ptr m_left_child; + child_ptr m_right_child; + + // comparator for ranges + // psergey-todo: Is there any sense to store the comparator in each tree + // node? + const comparator *m_cmp; + + // marked for the root node. the root node is never free()'d + // when removed, but instead marked as empty. + bool m_is_root; + + // marked for an empty node. only valid for the root. + bool m_is_empty; + + // effect: initializes an empty node with the given comparator + void init(const comparator *cmp); + + // requires: this is a shared node (m_is_shared==true) + // effect: another transaction is added as an owner. + // returns: true <=> added another owner + // false <=> this transaction is already an owner + bool add_shared_owner(TXNID txnid); + + // requires: *parent is initialized to something meaningful. + // requires: subtree is non-empty + // returns: the leftmost child of the given subtree + // returns: a pointer to the parent of said child in *parent, only + // if this function recurred, otherwise it is untouched. + treenode *find_leftmost_child(treenode **parent); + + // requires: *parent is initialized to something meaningful. + // requires: subtree is non-empty + // returns: the rightmost child of the given subtree + // returns: a pointer to the parent of said child in *parent, only + // if this function recurred, otherwise it is untouched. 
+ treenode *find_rightmost_child(treenode **parent); + + // effect: remove the root of this subtree, destroying the old root + // returns: the new root of the subtree + treenode *remove_root_of_subtree(void); + + // requires: subtree is non-empty, direction is not 0 + // returns: the child of the subtree at either the left or rightmost extreme + treenode *find_child_at_extreme(int direction, treenode **parent); + + // effect: retrieves and possibly rebalances the left child + // returns: a locked left child, if it exists + treenode *lock_and_rebalance_left(void); + + // effect: retrieves and possibly rebalances the right child + // returns: a locked right child, if it exists + treenode *lock_and_rebalance_right(void); + + // returns: the estimated depth of this subtree + uint32_t get_depth_estimate(void) const; + + // returns: true iff left subtree depth is sufficiently less than the right + bool left_imbalanced(int threshold) const; + + // returns: true iff right subtree depth is sufficiently greater than the left + bool right_imbalanced(int threshold) const; + + // effect: performs an O(1) rebalance, which will "heal" an imbalance by at + // most 1. effect: if the new root is not this node, then this node is + // unlocked. returns: locked node representing the new root of the rebalanced + // subtree + treenode *maybe_rebalance(void); + + // returns: allocated treenode populated with a copy of the range and txnid + static treenode *alloc(const comparator *cmp, const keyrange &range, + TXNID txnid, bool is_shared); + + // requires: node is a locked root node, or an unlocked non-root node + static void free(treenode *node); + + // effect: swaps the range/txnid pairs for node1 and node2. + static void swap_in_place(treenode *node1, treenode *node2); + + friend class concurrent_tree_unit_test; +}; + +} /* namespace toku */ diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc new file mode 100644 index 000000000..4caf1e26f --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc @@ -0,0 +1,120 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ifndef ROCKSDB_LITE +#ifndef OS_WIN +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. 
+ + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include "txnid_set.h" + +#include "../db.h" + +namespace toku { + +int find_by_txnid(const TXNID &txnid_a, const TXNID &txnid_b); +int find_by_txnid(const TXNID &txnid_a, const TXNID &txnid_b) { + if (txnid_a < txnid_b) { + return -1; + } else if (txnid_a == txnid_b) { + return 0; + } else { + return 1; + } +} + +void txnid_set::create(void) { + // lazily allocate the underlying omt, since it is common + // to create a txnid set and never put anything in it. + m_txnids.create_no_array(); +} + +void txnid_set::destroy(void) { m_txnids.destroy(); } + +// Return true if the given transaction id is a member of the set. +// Otherwise, return false. +bool txnid_set::contains(TXNID txnid) const { + TXNID find_txnid; + int r = m_txnids.find_zero(txnid, &find_txnid, nullptr); + return r == 0 ? true : false; +} + +// Add a given txnid to the set +void txnid_set::add(TXNID txnid) { + int r = m_txnids.insert(txnid, txnid, nullptr); + invariant(r == 0 || r == DB_KEYEXIST); +} + +// Delete a given txnid from the set. +void txnid_set::remove(TXNID txnid) { + uint32_t idx; + int r = m_txnids.find_zero(txnid, nullptr, &idx); + if (r == 0) { + r = m_txnids.delete_at(idx); + invariant_zero(r); + } +} + +// Return the size of the set +uint32_t txnid_set::size(void) const { return m_txnids.size(); } + +// Get the ith id in the set, assuming that the set is sorted. +TXNID txnid_set::get(uint32_t i) const { + TXNID txnid; + int r = m_txnids.fetch(i, &txnid); + if (r == EINVAL) /* Shouldn't happen, avoid compiler warning */ + return TXNID_NONE; + invariant_zero(r); + return txnid; +} + +} /* namespace toku */ +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.h new file mode 100644 index 000000000..d79c24fb0 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.h @@ -0,0 +1,92 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include "../portability/txn_subst.h" +#include "../util/omt.h" + +namespace toku { + +class txnid_set { + public: + // effect: Creates an empty set. Does not malloc space for + // any entries yet. That is done lazily on add(). + void create(void); + + // effect: Destroy the set's internals. + void destroy(void); + + // returns: True if the given txnid is a member of the set. + bool contains(TXNID id) const; + + // effect: Adds a given txnid to the set if it did not exist + void add(TXNID txnid); + + // effect: Deletes a txnid from the set if it exists. + void remove(TXNID txnid); + + // returns: Size of the set + uint32_t size(void) const; + + // returns: The "i'th" id in the set, as if it were sorted. + TXNID get(uint32_t i) const; + + private: + toku::omt m_txnids; + + friend class txnid_set_unit_test; +}; +ENSURE_POD(txnid_set); + +} /* namespace toku */ diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc new file mode 100644 index 000000000..24536c88e --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc @@ -0,0 +1,213 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ifndef ROCKSDB_LITE +#ifndef OS_WIN +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . 
+ +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include "../db.h" +#include "../portability/memory.h" +// PORT #include +#include +#include + +#include "txnid_set.h" +#include "wfg.h" + +namespace toku { + +// Create a lock request graph +void wfg::create(void) { m_nodes.create(); } + +// Destroy the internals of the lock request graph +void wfg::destroy(void) { + uint32_t n_nodes = m_nodes.size(); + for (uint32_t i = 0; i < n_nodes; i++) { + node *n; + int r = m_nodes.fetch(i, &n); + invariant_zero(r); + invariant_notnull(n); + if (r) continue; // Get rid of "may be used uninitialized" warning + node::free(n); + } + m_nodes.destroy(); +} + +// Add an edge (a_id, b_id) to the graph +void wfg::add_edge(TXNID a_txnid, TXNID b_txnid) { + node *a_node = find_create_node(a_txnid); + node *b_node = find_create_node(b_txnid); + a_node->edges.add(b_node->txnid); +} + +// Return true if a node with the given transaction id exists in the graph. +// Return false otherwise. +bool wfg::node_exists(TXNID txnid) { + node *n = find_node(txnid); + return n != NULL; +} + +bool wfg::cycle_exists_from_node(node *target, node *head, + std::function reporter) { + bool cycle_found = false; + head->visited = true; + uint32_t n_edges = head->edges.size(); + for (uint32_t i = 0; i < n_edges && !cycle_found; i++) { + TXNID edge_id = head->edges.get(i); + if (target->txnid == edge_id) { + cycle_found = true; + if (reporter) reporter(edge_id); + } else { + node *new_head = find_node(edge_id); + if (new_head && !new_head->visited) { + cycle_found = cycle_exists_from_node(target, new_head, reporter); + if (cycle_found && reporter) reporter(edge_id); + } + } + } + head->visited = false; + return cycle_found; +} + +// Return true if there exists a cycle from a given transaction id in the graph. +// Return false otherwise. +bool wfg::cycle_exists_from_txnid(TXNID txnid, + std::function reporter) { + node *a_node = find_node(txnid); + bool cycles_found = false; + if (a_node) { + cycles_found = cycle_exists_from_node(a_node, a_node, reporter); + } + return cycles_found; +} + +// Apply a given function f to all of the nodes in the graph. The apply +// function returns when the function f is called for all of the nodes in the +// graph, or the function f returns non-zero. 
+void wfg::apply_nodes(int (*fn)(TXNID id, void *extra), void *extra) { + int r = 0; + uint32_t n_nodes = m_nodes.size(); + for (uint32_t i = 0; i < n_nodes && r == 0; i++) { + node *n; + r = m_nodes.fetch(i, &n); + invariant_zero(r); + if (r) continue; // Get rid of "may be used uninitialized" warning + r = fn(n->txnid, extra); + } +} + +// Apply a given function f to all of the edges whose origin is a given node id. +// The apply function returns when the function f is called for all edges in the +// graph rooted at node id, or the function f returns non-zero. +void wfg::apply_edges(TXNID txnid, + int (*fn)(TXNID txnid, TXNID edge_txnid, void *extra), + void *extra) { + node *n = find_node(txnid); + if (n) { + int r = 0; + uint32_t n_edges = n->edges.size(); + for (uint32_t i = 0; i < n_edges && r == 0; i++) { + r = fn(txnid, n->edges.get(i), extra); + } + } +} + +// find node by id +wfg::node *wfg::find_node(TXNID txnid) { + node *n = nullptr; + int r = m_nodes.find_zero(txnid, &n, nullptr); + invariant(r == 0 || r == DB_NOTFOUND); + return n; +} + +// this is the omt comparison function +// nodes are compared by their txnid. +int wfg::find_by_txnid(node *const &node_a, const TXNID &txnid_b) { + TXNID txnid_a = node_a->txnid; + if (txnid_a < txnid_b) { + return -1; + } else if (txnid_a == txnid_b) { + return 0; + } else { + return 1; + } +} + +// insert a new node +wfg::node *wfg::find_create_node(TXNID txnid) { + node *n; + uint32_t idx; + int r = m_nodes.find_zero(txnid, &n, &idx); + if (r == DB_NOTFOUND) { + n = node::alloc(txnid); + r = m_nodes.insert_at(n, idx); + invariant_zero(r); + } + invariant_notnull(n); + return n; +} + +wfg::node *wfg::node::alloc(TXNID txnid) { + node *XCALLOC(n); + n->txnid = txnid; + n->visited = false; + n->edges.create(); + return n; +} + +void wfg::node::free(wfg::node *n) { + n->edges.destroy(); + toku_free(n); +} + +} /* namespace toku */ +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.h new file mode 100644 index 000000000..804202170 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.h @@ -0,0 +1,124 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include + +#include "../util/omt.h" +#include "txnid_set.h" + +namespace toku { + +// A wfg is a 'wait-for' graph. A directed edge in represents one +// txn waiting for another to finish before it can acquire a lock. + +class wfg { + public: + // Create a lock request graph + void create(void); + + // Destroy the internals of the lock request graph + void destroy(void); + + // Add an edge (a_id, b_id) to the graph + void add_edge(TXNID a_txnid, TXNID b_txnid); + + // Return true if a node with the given transaction id exists in the graph. + // Return false otherwise. + bool node_exists(TXNID txnid); + + // Return true if there exists a cycle from a given transaction id in the + // graph. Return false otherwise. + bool cycle_exists_from_txnid(TXNID txnid, + std::function reporter); + + // Apply a given function f to all of the nodes in the graph. The apply + // function returns when the function f is called for all of the nodes in the + // graph, or the function f returns non-zero. + void apply_nodes(int (*fn)(TXNID txnid, void *extra), void *extra); + + // Apply a given function f to all of the edges whose origin is a given node + // id. The apply function returns when the function f is called for all edges + // in the graph rooted at node id, or the function f returns non-zero. + void apply_edges(TXNID txnid, + int (*fn)(TXNID txnid, TXNID edge_txnid, void *extra), + void *extra); + + private: + struct node { + // txnid for this node and the associated set of edges + TXNID txnid; + txnid_set edges; + bool visited; + + static node *alloc(TXNID txnid); + + static void free(node *n); + }; + ENSURE_POD(node); + + toku::omt m_nodes; + + node *find_node(TXNID txnid); + + node *find_create_node(TXNID txnid); + + bool cycle_exists_from_node(node *target, node *head, + std::function reporter); + + static int find_by_txnid(node *const &node_a, const TXNID &txnid_b); +}; +ENSURE_POD(wfg); + +} /* namespace toku */ diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/memory.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/memory.h new file mode 100644 index 000000000..0a621f8e0 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/memory.h @@ -0,0 +1,215 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. 
+ + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include + +#include "toku_portability.h" + +/* Percona memory allocation functions and macros. + * These are functions for malloc and free */ + +int toku_memory_startup(void) __attribute__((constructor)); +void toku_memory_shutdown(void) __attribute__((destructor)); + +/* Generally: errno is set to 0 or a value to indicate problems. */ + +// Everything should call toku_malloc() instead of malloc(), and toku_calloc() +// instead of calloc() That way the tests can can, e.g., replace the malloc +// function using toku_set_func_malloc(). +void *toku_calloc(size_t nmemb, size_t size) + __attribute__((__visibility__("default"))); +void *toku_xcalloc(size_t nmemb, size_t size) + __attribute__((__visibility__("default"))); +void *toku_malloc(size_t size) __attribute__((__visibility__("default"))); +void *toku_malloc_aligned(size_t alignment, size_t size) + __attribute__((__visibility__("default"))); + +// xmalloc aborts instead of return NULL if we run out of memory +void *toku_xmalloc(size_t size) __attribute__((__visibility__("default"))); +void *toku_xrealloc(void *, size_t size) + __attribute__((__visibility__("default"))); +void *toku_xmalloc_aligned(size_t alignment, size_t size) + __attribute__((__visibility__("default"))); +// Effect: Perform a os_malloc_aligned(size) with the additional property that +// the returned pointer is a multiple of ALIGNMENT. +// Fail with a resource_assert if the allocation fails (don't return an error +// code). If the alloc_aligned function has been set then call it instead. +// Requires: alignment is a power of two. 
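To make the calling conventions above concrete, here is a small usage sketch (illustrative only, using a hypothetical row struct): the plain toku_malloc()/toku_calloc() functions report failure by returning NULL, while the toku_x*() variants abort on exhaustion so callers can skip the error path. The MALLOC_N()/XMALLOC_N() macros defined further down in this header wrap the same pattern with the element size computed automatically.

    #include <stddef.h>
    #include <stdint.h>

    // Illustrative sketch of the two failure conventions declared above.
    struct row {
      uint64_t id;
      uint32_t len;
    };

    static struct row *alloc_rows(size_t n) {
      struct row *rows = (struct row *)toku_malloc(n * sizeof(*rows));
      if (rows == NULL) {
        return NULL;  // plain toku_malloc() can fail; the caller handles it
      }
      return rows;
    }

    static struct row *alloc_rows_or_abort(size_t n) {
      // toku_xmalloc() aborts instead of returning NULL, so no check is needed
      return (struct row *)toku_xmalloc(n * sizeof(struct row));
    }

    static void release_rows(struct row *rows) {
      toku_free(rows);  // memory from this family goes back through toku_free()
    }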
+ +void toku_free(void *) __attribute__((__visibility__("default"))); + +size_t toku_malloc_usable_size(void *p) + __attribute__((__visibility__("default"))); + +/* MALLOC is a macro that helps avoid a common error: + * Suppose I write + * struct foo *x = malloc(sizeof(struct foo)); + * That works fine. But if I change it to this, I've probably made an mistake: + * struct foo *x = malloc(sizeof(struct bar)); + * It can get worse, since one might have something like + * struct foo *x = malloc(sizeof(struct foo *)) + * which looks reasonable, but it allocoates enough to hold a pointer instead of + * the amount needed for the struct. So instead, write struct foo *MALLOC(x); + * and you cannot go wrong. + */ +#define MALLOC(v) CAST_FROM_VOIDP(v, toku_malloc(sizeof(*v))) +/* MALLOC_N is like calloc(Except no 0ing of data): It makes an array. Write + * int *MALLOC_N(5,x); + * to make an array of 5 integers. + */ +#define MALLOC_N(n, v) CAST_FROM_VOIDP(v, toku_malloc((n) * sizeof(*v))) +#define MALLOC_N_ALIGNED(align, n, v) \ + CAST_FROM_VOIDP(v, toku_malloc_aligned((align), (n) * sizeof(*v))) + +// CALLOC_N is like calloc with auto-figuring out size of members +#define CALLOC_N(n, v) CAST_FROM_VOIDP(v, toku_calloc((n), sizeof(*v))) + +#define CALLOC(v) CALLOC_N(1, v) + +// XMALLOC macros are like MALLOC except they abort if the operation fails +#define XMALLOC(v) CAST_FROM_VOIDP(v, toku_xmalloc(sizeof(*v))) +#define XMALLOC_N(n, v) CAST_FROM_VOIDP(v, toku_xmalloc((n) * sizeof(*v))) +#define XCALLOC_N(n, v) CAST_FROM_VOIDP(v, toku_xcalloc((n), (sizeof(*v)))) +#define XCALLOC(v) XCALLOC_N(1, v) +#define XREALLOC(v, s) CAST_FROM_VOIDP(v, toku_xrealloc(v, s)) +#define XREALLOC_N(n, v) CAST_FROM_VOIDP(v, toku_xrealloc(v, (n) * sizeof(*v))) + +#define XMALLOC_N_ALIGNED(align, n, v) \ + CAST_FROM_VOIDP(v, toku_xmalloc_aligned((align), (n) * sizeof(*v))) + +#define XMEMDUP(dst, src) CAST_FROM_VOIDP(dst, toku_xmemdup(src, sizeof(*src))) +#define XMEMDUP_N(dst, src, len) CAST_FROM_VOIDP(dst, toku_xmemdup(src, len)) + +// ZERO_ARRAY writes zeroes to a stack-allocated array +#define ZERO_ARRAY(o) \ + do { \ + memset((o), 0, sizeof(o)); \ + } while (0) +// ZERO_STRUCT writes zeroes to a stack-allocated struct +#define ZERO_STRUCT(o) \ + do { \ + memset(&(o), 0, sizeof(o)); \ + } while (0) + +/* Copy memory. Analogous to strdup() */ +void *toku_memdup(const void *v, size_t len); +/* Toku-version of strdup. Use this so that it calls toku_malloc() */ +char *toku_strdup(const char *s) __attribute__((__visibility__("default"))); +/* Toku-version of strndup. Use this so that it calls toku_malloc() */ +char *toku_strndup(const char *s, size_t n) + __attribute__((__visibility__("default"))); +/* Copy memory. Analogous to strdup() Crashes instead of returning NULL */ +void *toku_xmemdup(const void *v, size_t len) + __attribute__((__visibility__("default"))); +/* Toku-version of strdup. Use this so that it calls toku_xmalloc() Crashes + * instead of returning NULL */ +char *toku_xstrdup(const char *s) __attribute__((__visibility__("default"))); + +void toku_malloc_cleanup( + void); /* Before exiting, call this function to free up any internal data + structures from toku_malloc. Otherwise valgrind will complain of + memory leaks. */ + +/* Check to see if everything malloc'd was free. Might be a no-op depending on + * how memory.c is configured. */ +void toku_memory_check_all_free(void); +/* Check to see if memory is "sane". Might be a no-op. Probably better to + * simply use valgrind. 
*/ +void toku_do_memory_check(void); + +typedef void *(*malloc_fun_t)(size_t); +typedef void (*free_fun_t)(void *); +typedef void *(*realloc_fun_t)(void *, size_t); +typedef void *(*malloc_aligned_fun_t)(size_t /*alignment*/, size_t /*size*/); +typedef void *(*realloc_aligned_fun_t)(size_t /*alignment*/, void * /*pointer*/, + size_t /*size*/); + +void toku_set_func_malloc(malloc_fun_t f); +void toku_set_func_xmalloc_only(malloc_fun_t f); +void toku_set_func_malloc_only(malloc_fun_t f); +void toku_set_func_realloc(realloc_fun_t f); +void toku_set_func_xrealloc_only(realloc_fun_t f); +void toku_set_func_realloc_only(realloc_fun_t f); +void toku_set_func_free(free_fun_t f); + +typedef struct memory_status { + uint64_t malloc_count; // number of malloc operations + uint64_t free_count; // number of free operations + uint64_t realloc_count; // number of realloc operations + uint64_t malloc_fail; // number of malloc operations that failed + uint64_t realloc_fail; // number of realloc operations that failed + uint64_t requested; // number of bytes requested + uint64_t used; // number of bytes used (requested + overhead), obtained from + // malloc_usable_size() + uint64_t freed; // number of bytes freed; + uint64_t max_requested_size; // largest attempted allocation size + uint64_t last_failed_size; // size of the last failed allocation attempt + volatile uint64_t + max_in_use; // maximum memory footprint (used - freed), approximate (not + // worth threadsafety overhead for exact) + const char *mallocator_version; + uint64_t mmap_threshold; +} LOCAL_MEMORY_STATUS_S, *LOCAL_MEMORY_STATUS; + +void toku_memory_get_status(LOCAL_MEMORY_STATUS s); + +// Effect: Like toku_memory_footprint, except instead of passing p, +// we pass toku_malloc_usable_size(p). +size_t toku_memory_footprint_given_usable_size(size_t touched, size_t usable); + +// Effect: Return an estimate how how much space an object is using, possibly by +// using toku_malloc_usable_size(p). +// If p is NULL then returns 0. 
+size_t toku_memory_footprint(void *p, size_t touched); diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_assert_subst.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_assert_subst.h new file mode 100644 index 000000000..af47800fb --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_assert_subst.h @@ -0,0 +1,39 @@ +// +// A replacement for toku_assert.h +// +#pragma once + +#include +#include + +#ifdef NDEBUG + +#define assert_zero(a) ((void)(a)) +#define invariant(a) ((void)(a)) +#define invariant_notnull(a) ((void)(a)) +#define invariant_zero(a) ((void)(a)) + +#else + +#define assert_zero(a) assert((a) == 0) +#define invariant(a) assert(a) +#define invariant_notnull(a) assert(a) +#define invariant_zero(a) assert_zero(a) + +#endif + +#define lazy_assert_zero(a) assert_zero(a) + +#define paranoid_invariant_zero(a) assert_zero(a) +#define paranoid_invariant_notnull(a) assert(a) +#define paranoid_invariant(a) assert(a) + +#define ENSURE_POD(type) \ + static_assert( \ + std::is_standard_layout::value && std::is_trivial::value, \ + #type "isn't POD") + +inline int get_error_errno(void) { + invariant(errno); + return errno; +} diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_atomic.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_atomic.h new file mode 100644 index 000000000..aaa2298fa --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_atomic.h @@ -0,0 +1,130 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +// PORT2: #include +#include +#include +#include + +#include "toku_assert_subst.h" + +__attribute__((const, always_inline)) static inline intptr_t which_cache_line( + intptr_t addr) { + static const size_t assumed_cache_line_size = 64; + return addr / assumed_cache_line_size; +} +template +__attribute__((const, always_inline)) static inline bool crosses_boundary( + T *addr, size_t width) { + const intptr_t int_addr = reinterpret_cast(addr); + const intptr_t last_byte = int_addr + width - 1; + return which_cache_line(int_addr) != which_cache_line(last_byte); +} + +template +__attribute__((always_inline)) static inline T toku_sync_fetch_and_add(T *addr, + U diff) { + paranoid_invariant(!crosses_boundary(addr, sizeof *addr)); + return __sync_fetch_and_add(addr, diff); +} +template +__attribute__((always_inline)) static inline T toku_sync_add_and_fetch(T *addr, + U diff) { + paranoid_invariant(!crosses_boundary(addr, sizeof *addr)); + return __sync_add_and_fetch(addr, diff); +} +template +__attribute__((always_inline)) static inline T toku_sync_fetch_and_sub(T *addr, + U diff) { + paranoid_invariant(!crosses_boundary(addr, sizeof *addr)); + return __sync_fetch_and_sub(addr, diff); +} +template +__attribute__((always_inline)) static inline T toku_sync_sub_and_fetch(T *addr, + U diff) { + paranoid_invariant(!crosses_boundary(addr, sizeof *addr)); + return __sync_sub_and_fetch(addr, diff); +} +template +__attribute__((always_inline)) static inline T toku_sync_val_compare_and_swap( + T *addr, U oldval, V newval) { + paranoid_invariant(!crosses_boundary(addr, sizeof *addr)); + return __sync_val_compare_and_swap(addr, oldval, newval); +} +template +__attribute__((always_inline)) static inline bool +toku_sync_bool_compare_and_swap(T *addr, U oldval, V newval) { + paranoid_invariant(!crosses_boundary(addr, sizeof *addr)); + return __sync_bool_compare_and_swap(addr, oldval, newval); +} + +// in case you include this but not toku_portability.h +#pragma GCC poison __sync_fetch_and_add +#pragma GCC poison __sync_fetch_and_sub +#pragma GCC poison __sync_fetch_and_or +#pragma GCC poison __sync_fetch_and_and +#pragma GCC poison __sync_fetch_and_xor +#pragma GCC poison __sync_fetch_and_nand +#pragma GCC poison __sync_add_and_fetch +#pragma GCC poison __sync_sub_and_fetch +#pragma GCC poison __sync_or_and_fetch +#pragma GCC poison __sync_and_and_fetch +#pragma GCC poison __sync_xor_and_fetch +#pragma GCC poison __sync_nand_and_fetch +#pragma GCC poison __sync_bool_compare_and_swap +#pragma GCC poison __sync_val_compare_and_swap +#pragma GCC poison __sync_synchronize +#pragma GCC poison __sync_lock_test_and_set +#pragma GCC poison __sync_release diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_external_pthread.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_external_pthread.h new file mode 100644 index 000000000..eb8291c1d --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_external_pthread.h @@ -0,0 +1,83 @@ +/* + A wrapper around ROCKSDB_NAMESPACE::TransactionDBMutexFactory-provided + condition and mutex that provides toku_pthread_*-like interface. 
The functions + are named + + toku_external_{mutex|cond}_XXX + + Lock Tree uses this mutex and condition for interruptible (long) lock waits. + + (It also still uses toku_pthread_XXX calls for mutexes/conditions for + shorter waits on internal objects) +*/ + +#pragma once + +#include +#include +#include + +#include "rocksdb/utilities/transaction_db.h" +#include "rocksdb/utilities/transaction_db_mutex.h" +#include "toku_portability.h" + +using ROCKSDB_NAMESPACE::TransactionDBCondVar; +using ROCKSDB_NAMESPACE::TransactionDBMutex; + +typedef std::shared_ptr + toku_external_mutex_factory_t; + +typedef std::shared_ptr toku_external_mutex_t; +typedef std::shared_ptr toku_external_cond_t; + +static inline void toku_external_cond_init( + toku_external_mutex_factory_t mutex_factory, toku_external_cond_t *cond) { + *cond = mutex_factory->AllocateCondVar(); +} + +inline void toku_external_cond_destroy(toku_external_cond_t *cond) { + cond->reset(); // this will destroy the managed object +} + +inline void toku_external_cond_signal(toku_external_cond_t *cond) { + (*cond)->Notify(); +} + +inline void toku_external_cond_broadcast(toku_external_cond_t *cond) { + (*cond)->NotifyAll(); +} + +inline int toku_external_cond_timedwait(toku_external_cond_t *cond, + toku_external_mutex_t *mutex, + int64_t timeout_microsec) { + auto res = (*cond)->WaitFor(*mutex, timeout_microsec); + if (res.ok()) + return 0; + else + return ETIMEDOUT; +} + +inline void toku_external_mutex_init(toku_external_mutex_factory_t factory, + toku_external_mutex_t *mutex) { + // Use placement new: the memory has been allocated but constructor wasn't + // called + new (mutex) toku_external_mutex_t; + *mutex = factory->AllocateMutex(); +} + +inline void toku_external_mutex_lock(toku_external_mutex_t *mutex) { + (*mutex)->Lock(); +} + +inline int toku_external_mutex_trylock(toku_external_mutex_t *mutex) { + (*mutex)->Lock(); + return 0; +} + +inline void toku_external_mutex_unlock(toku_external_mutex_t *mutex) { + (*mutex)->UnLock(); +} + +inline void toku_external_mutex_destroy(toku_external_mutex_t *mutex) { + mutex->reset(); // this will destroy the managed object +} diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_instrumentation.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_instrumentation.h new file mode 100644 index 000000000..c967e7177 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_instrumentation.h @@ -0,0 +1,286 @@ +/*====== +This file is part of PerconaFT. + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. 
+ + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#pragma once + +#include // FILE + +// Performance instrumentation object identifier type +typedef unsigned int pfs_key_t; + +enum class toku_instr_object_type { mutex, rwlock, cond, thread, file }; + +struct PSI_file; + +struct TOKU_FILE { + /** The real file. */ + FILE *file; + struct PSI_file *key; + TOKU_FILE() : file(nullptr), key(nullptr) {} +}; + +struct PSI_mutex; +struct PSI_cond; +struct PSI_rwlock; + +struct toku_mutex_t; +struct toku_cond_t; +struct toku_pthread_rwlock_t; + +class toku_instr_key; + +class toku_instr_probe_empty { + public: + explicit toku_instr_probe_empty(UU(const toku_instr_key &key)) {} + + void start_with_source_location(UU(const char *src_file), UU(int src_line)) {} + + void stop() {} +}; + +#define TOKU_PROBE_START(p) p->start_with_source_location(__FILE__, __LINE__) +#define TOKU_PROBE_STOP(p) p->stop + +extern toku_instr_key toku_uninstrumented; + +#ifndef MYSQL_TOKUDB_ENGINE + +#include + +class toku_instr_key { + public: + toku_instr_key(UU(toku_instr_object_type type), UU(const char *group), + UU(const char *name)) {} + + explicit toku_instr_key(UU(pfs_key_t key_id)) {} + // No-instrumentation constructor: + toku_instr_key() {} + ~toku_instr_key() {} +}; + +typedef toku_instr_probe_empty toku_instr_probe; + +enum class toku_instr_file_op { + file_stream_open, + file_create, + file_open, + file_delete, + file_rename, + file_read, + file_write, + file_sync, + file_stream_close, + file_close, + file_stat +}; + +struct PSI_file {}; +struct PSI_mutex {}; + +struct toku_io_instrumentation {}; + +inline int toku_pthread_create(UU(const toku_instr_key &key), pthread_t *thread, + const pthread_attr_t *attr, + void *(*start_routine)(void *), void *arg) { + return pthread_create(thread, attr, start_routine, arg); +} + +inline void toku_instr_register_current_thread() {} + +inline void toku_instr_delete_current_thread() {} + +// Instrument file creation, opening, closing, and renaming +inline void toku_instr_file_open_begin(UU(toku_io_instrumentation &io_instr), + UU(const toku_instr_key &key), + UU(toku_instr_file_op op), + UU(const char *name), + UU(const char *src_file), + UU(int src_line)) {} + +inline void toku_instr_file_stream_open_end( + UU(toku_io_instrumentation &io_instr), UU(TOKU_FILE &file)) {} + +inline void toku_instr_file_open_end(UU(toku_io_instrumentation &io_instr), + UU(int fd)) {} + +inline void toku_instr_file_name_close_begin( + UU(toku_io_instrumentation &io_instr), UU(const toku_instr_key &key), + UU(toku_instr_file_op op), UU(const char *name), UU(const char *src_file), + UU(int src_line)) {} 
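+
+/* Illustrative sketch, not part of the original header: callers are expected
+ * to bracket the real I/O call between the matching *_begin and *_end shims,
+ * all of which compile to nothing in this standalone (non-MySQL) build.  The
+ * key and file name below are hypothetical.
+ *
+ *   TOKU_FILE f;
+ *   toku_io_instrumentation io_instr;
+ *   toku_instr_file_open_begin(io_instr, example_key,
+ *                              toku_instr_file_op::file_stream_open,
+ *                              "example.log", __FILE__, __LINE__);
+ *   f.file = fopen("example.log", "a");       // the real, uninstrumented call
+ *   toku_instr_file_stream_open_end(io_instr, f);
+ */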
+ +inline void toku_instr_file_stream_close_begin( + UU(toku_io_instrumentation &io_instr), UU(toku_instr_file_op op), + UU(TOKU_FILE &file), UU(const char *src_file), UU(int src_line)) {} + +inline void toku_instr_file_fd_close_begin( + UU(toku_io_instrumentation &io_instr), UU(toku_instr_file_op op), + UU(int fd), UU(const char *src_file), UU(int src_line)) {} + +inline void toku_instr_file_close_end(UU(toku_io_instrumentation &io_instr), + UU(int result)) {} + +inline void toku_instr_file_io_begin(UU(toku_io_instrumentation &io_instr), + UU(toku_instr_file_op op), UU(int fd), + UU(unsigned int count), + UU(const char *src_file), + UU(int src_line)) {} + +inline void toku_instr_file_name_io_begin( + UU(toku_io_instrumentation &io_instr), UU(const toku_instr_key &key), + UU(toku_instr_file_op op), UU(const char *name), UU(unsigned int count), + UU(const char *src_file), UU(int src_line)) {} + +inline void toku_instr_file_stream_io_begin( + UU(toku_io_instrumentation &io_instr), UU(toku_instr_file_op op), + UU(TOKU_FILE &file), UU(unsigned int count), UU(const char *src_file), + UU(int src_line)) {} + +inline void toku_instr_file_io_end(UU(toku_io_instrumentation &io_instr), + UU(unsigned int count)) {} + +struct toku_mutex_t; + +struct toku_mutex_instrumentation {}; + +inline PSI_mutex *toku_instr_mutex_init(UU(const toku_instr_key &key), + UU(toku_mutex_t &mutex)) { + return nullptr; +} + +inline void toku_instr_mutex_destroy(UU(PSI_mutex *&mutex_instr)) {} + +inline void toku_instr_mutex_lock_start( + UU(toku_mutex_instrumentation &mutex_instr), UU(toku_mutex_t &mutex), + UU(const char *src_file), UU(int src_line)) {} + +inline void toku_instr_mutex_trylock_start( + UU(toku_mutex_instrumentation &mutex_instr), UU(toku_mutex_t &mutex), + UU(const char *src_file), UU(int src_line)) {} + +inline void toku_instr_mutex_lock_end( + UU(toku_mutex_instrumentation &mutex_instr), + UU(int pthread_mutex_lock_result)) {} + +inline void toku_instr_mutex_unlock(UU(PSI_mutex *mutex_instr)) {} + +struct toku_cond_instrumentation {}; + +enum class toku_instr_cond_op { + cond_wait, + cond_timedwait, +}; + +inline PSI_cond *toku_instr_cond_init(UU(const toku_instr_key &key), + UU(toku_cond_t &cond)) { + return nullptr; +} + +inline void toku_instr_cond_destroy(UU(PSI_cond *&cond_instr)) {} + +inline void toku_instr_cond_wait_start( + UU(toku_cond_instrumentation &cond_instr), UU(toku_instr_cond_op op), + UU(toku_cond_t &cond), UU(toku_mutex_t &mutex), UU(const char *src_file), + UU(int src_line)) {} + +inline void toku_instr_cond_wait_end(UU(toku_cond_instrumentation &cond_instr), + UU(int pthread_cond_wait_result)) {} + +inline void toku_instr_cond_signal(UU(toku_cond_t &cond)) {} + +inline void toku_instr_cond_broadcast(UU(toku_cond_t &cond)) {} + +#if 0 +// rw locks are not used +// rwlock instrumentation +struct toku_rwlock_instrumentation {}; + +inline PSI_rwlock *toku_instr_rwlock_init(UU(const toku_instr_key &key), + UU(toku_pthread_rwlock_t &rwlock)) { + return nullptr; +} + +inline void toku_instr_rwlock_destroy(UU(PSI_rwlock *&rwlock_instr)) {} + +inline void toku_instr_rwlock_rdlock_wait_start( + UU(toku_rwlock_instrumentation &rwlock_instr), + UU(toku_pthread_rwlock_t &rwlock), + UU(const char *src_file), + UU(int src_line)) {} + +inline void toku_instr_rwlock_wrlock_wait_start( + UU(toku_rwlock_instrumentation &rwlock_instr), + UU(toku_pthread_rwlock_t &rwlock), + UU(const char *src_file), + UU(int src_line)) {} + +inline void toku_instr_rwlock_rdlock_wait_end( + 
UU(toku_rwlock_instrumentation &rwlock_instr), + UU(int pthread_rwlock_wait_result)) {} + +inline void toku_instr_rwlock_wrlock_wait_end( + UU(toku_rwlock_instrumentation &rwlock_instr), + UU(int pthread_rwlock_wait_result)) {} + +inline void toku_instr_rwlock_unlock(UU(toku_pthread_rwlock_t &rwlock)) {} +#endif + +#else // MYSQL_TOKUDB_ENGINE +// There can be not only mysql but also mongodb or any other PFS stuff +#include +#endif // MYSQL_TOKUDB_ENGINE + +// Mutexes +extern toku_instr_key manager_escalation_mutex_key; +extern toku_instr_key manager_escalator_mutex_key; +extern toku_instr_key manager_mutex_key; +extern toku_instr_key treenode_mutex_key; +extern toku_instr_key locktree_request_info_mutex_key; +extern toku_instr_key locktree_request_info_retry_mutex_key; + +// condition vars +extern toku_instr_key lock_request_m_wait_cond_key; +extern toku_instr_key locktree_request_info_retry_cv_key; +extern toku_instr_key manager_m_escalator_done_key; // unused diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_portability.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_portability.h new file mode 100644 index 000000000..9a95b38bd --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_portability.h @@ -0,0 +1,87 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." 
+ +#pragma once + +#if defined(__clang__) +#define constexpr_static_assert(a, b) +#else +#define constexpr_static_assert(a, b) static_assert(a, b) +#endif + +// include here, before they get deprecated +#include +#include +#include +#include +#include +#include +#include + +#include "toku_atomic.h" + +#if defined(__cplusplus) +#include +#endif + +#if defined(__cplusplus) +// decltype() here gives a reference-to-pointer instead of just a pointer, +// just use __typeof__ +#define CAST_FROM_VOIDP(name, value) name = static_cast<__typeof__(name)>(value) +#else +#define CAST_FROM_VOIDP(name, value) name = cast_to_typeof(name)(value) +#endif + +#define UU(x) x __attribute__((__unused__)) + +#include "toku_instrumentation.h" diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_pthread.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_pthread.h new file mode 100644 index 000000000..571b950e1 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_pthread.h @@ -0,0 +1,520 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." 
+ +#pragma once + +#include +#include +#include + +#include "toku_portability.h" +// PORT2: #include "toku_assert.h" + +// TODO: some things moved toku_instrumentation.h, not necessarily the best +// place +typedef pthread_attr_t toku_pthread_attr_t; +typedef pthread_t toku_pthread_t; +typedef pthread_mutex_t toku_pthread_mutex_t; +typedef pthread_condattr_t toku_pthread_condattr_t; +typedef pthread_cond_t toku_pthread_cond_t; +typedef pthread_rwlockattr_t toku_pthread_rwlockattr_t; +typedef pthread_key_t toku_pthread_key_t; +typedef struct timespec toku_timespec_t; + +// TODO: break this include loop +#include +typedef pthread_mutexattr_t toku_pthread_mutexattr_t; + +struct toku_mutex_t { + pthread_mutex_t pmutex; + struct PSI_mutex *psi_mutex; /* The performance schema instrumentation hook */ +#if defined(TOKU_PTHREAD_DEBUG) + pthread_t owner; // = pthread_self(); // for debugging + bool locked; + bool valid; + pfs_key_t instr_key_id; +#endif // defined(TOKU_PTHREAD_DEBUG) +}; + +struct toku_cond_t { + pthread_cond_t pcond; + struct PSI_cond *psi_cond; +#if defined(TOKU_PTHREAD_DEBUG) + pfs_key_t instr_key_id; +#endif // defined(TOKU_PTHREAD_DEBUG) +}; + +#if defined(TOKU_PTHREAD_DEBUG) +#define TOKU_COND_INITIALIZER \ + { .pcond = PTHREAD_COND_INITIALIZER, .psi_cond = nullptr, .instr_key_id = 0 } +#else +#define TOKU_COND_INITIALIZER \ + { .pcond = PTHREAD_COND_INITIALIZER, .psi_cond = nullptr } +#endif // defined(TOKU_PTHREAD_DEBUG) + +struct toku_pthread_rwlock_t { + pthread_rwlock_t rwlock; + struct PSI_rwlock *psi_rwlock; +#if defined(TOKU_PTHREAD_DEBUG) + pfs_key_t instr_key_id; +#endif // defined(TOKU_PTHREAD_DEBUG) +}; + +typedef struct toku_mutex_aligned { + toku_mutex_t aligned_mutex __attribute__((__aligned__(64))); +} toku_mutex_aligned_t; + +// Initializing with {} will fill in a struct with all zeros. +// But you may also need a pragma to suppress the warnings, as follows +// +// #pragma GCC diagnostic push +// #pragma GCC diagnostic ignored "-Wmissing-field-initializers" +// toku_mutex_t foo = ZERO_MUTEX_INITIALIZER; +// #pragma GCC diagnostic pop +// +// In general it will be a lot of busy work to make this codebase compile +// cleanly with -Wmissing-field-initializers + +#define ZERO_MUTEX_INITIALIZER \ + {} + +#if defined(TOKU_PTHREAD_DEBUG) +#define TOKU_MUTEX_INITIALIZER \ + { \ + .pmutex = PTHREAD_MUTEX_INITIALIZER, .psi_mutex = nullptr, .owner = 0, \ + .locked = false, .valid = true, .instr_key_id = 0 \ + } +#else +#define TOKU_MUTEX_INITIALIZER \ + { .pmutex = PTHREAD_MUTEX_INITIALIZER, .psi_mutex = nullptr } +#endif // defined(TOKU_PTHREAD_DEBUG) + +// Darwin doesn't provide adaptive mutexes +#if defined(__APPLE__) +#define TOKU_MUTEX_ADAPTIVE PTHREAD_MUTEX_DEFAULT +#if defined(TOKU_PTHREAD_DEBUG) +#define TOKU_ADAPTIVE_MUTEX_INITIALIZER \ + { \ + .pmutex = PTHREAD_MUTEX_INITIALIZER, .psi_mutex = nullptr, .owner = 0, \ + .locked = false, .valid = true, .instr_key_id = 0 \ + } +#else +#define TOKU_ADAPTIVE_MUTEX_INITIALIZER \ + { .pmutex = PTHREAD_MUTEX_INITIALIZER, .psi_mutex = nullptr } +#endif // defined(TOKU_PTHREAD_DEBUG) +#else // __FreeBSD__, __linux__, at least +#if defined(__GLIBC__) +#define TOKU_MUTEX_ADAPTIVE PTHREAD_MUTEX_ADAPTIVE_NP +#else +// not all libc (e.g. 
musl) implement NP (Non-POSIX) attributes +#define TOKU_MUTEX_ADAPTIVE PTHREAD_MUTEX_DEFAULT +#endif +#if defined(TOKU_PTHREAD_DEBUG) +#define TOKU_ADAPTIVE_MUTEX_INITIALIZER \ + { \ + .pmutex = PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP, .psi_mutex = nullptr, \ + .owner = 0, .locked = false, .valid = true, .instr_key_id = 0 \ + } +#else +#define TOKU_ADAPTIVE_MUTEX_INITIALIZER \ + { .pmutex = PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP, .psi_mutex = nullptr } +#endif // defined(TOKU_PTHREAD_DEBUG) +#endif // defined(__APPLE__) + +// Different OSes implement mutexes as different amounts of nested structs. +// C++ will fill out all missing values with zeroes if you provide at least one +// zero, but it needs the right amount of nesting. +#if defined(__FreeBSD__) +#define ZERO_COND_INITIALIZER \ + { 0 } +#elif defined(__APPLE__) +#define ZERO_COND_INITIALIZER \ + { \ + { 0 } \ + } +#else // __linux__, at least +#define ZERO_COND_INITIALIZER \ + {} +#endif + +static inline void toku_mutexattr_init(toku_pthread_mutexattr_t *attr) { + int r = pthread_mutexattr_init(attr); + assert_zero(r); +} + +static inline void toku_mutexattr_settype(toku_pthread_mutexattr_t *attr, + int type) { + int r = pthread_mutexattr_settype(attr, type); + assert_zero(r); +} + +static inline void toku_mutexattr_destroy(toku_pthread_mutexattr_t *attr) { + int r = pthread_mutexattr_destroy(attr); + assert_zero(r); +} + +#if defined(TOKU_PTHREAD_DEBUG) +static inline void toku_mutex_assert_locked(const toku_mutex_t *mutex) { + invariant(mutex->locked); + invariant(mutex->owner == pthread_self()); +} +#else +static inline void toku_mutex_assert_locked(const toku_mutex_t *mutex + __attribute__((unused))) {} +#endif // defined(TOKU_PTHREAD_DEBUG) + +// asserting that a mutex is unlocked only makes sense +// if the calling thread can guaruntee that no other threads +// are trying to lock this mutex at the time of the assertion +// +// a good example of this is a tree with mutexes on each node. +// when a node is locked the caller knows that no other threads +// can be trying to lock its childrens' mutexes. the children +// are in one of two fixed states: locked or unlocked. 
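+//
+// an illustrative sketch of that pattern (not part of the original header;
+// node_t and its fields are hypothetical):
+//
+//   void detach_child(node_t *parent, int i) {
+//     toku_mutex_lock(&parent->mutex);            // parent is exclusively ours
+//     node_t *child = parent->children[i];
+//     toku_mutex_assert_unlocked(&child->mutex);  // nobody else can be locking it
+//     parent->children[i] = nullptr;
+//     toku_mutex_unlock(&parent->mutex);
+//   }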
+#if defined(TOKU_PTHREAD_DEBUG) +static inline void toku_mutex_assert_unlocked(toku_mutex_t *mutex) { + invariant(mutex->owner == 0); + invariant(!mutex->locked); +} +#else +static inline void toku_mutex_assert_unlocked(toku_mutex_t *mutex + __attribute__((unused))) {} +#endif // defined(TOKU_PTHREAD_DEBUG) + +#define toku_mutex_lock(M) \ + toku_mutex_lock_with_source_location(M, __FILE__, __LINE__) + +static inline void toku_cond_init(toku_cond_t *cond, + const toku_pthread_condattr_t *attr) { + int r = pthread_cond_init(&cond->pcond, attr); + assert_zero(r); +} + +#define toku_mutex_trylock(M) \ + toku_mutex_trylock_with_source_location(M, __FILE__, __LINE__) + +inline void toku_mutex_unlock(toku_mutex_t *mutex) { +#if defined(TOKU_PTHREAD_DEBUG) + invariant(mutex->owner == pthread_self()); + invariant(mutex->valid); + invariant(mutex->locked); + mutex->locked = false; + mutex->owner = 0; +#endif // defined(TOKU_PTHREAD_DEBUG) + toku_instr_mutex_unlock(mutex->psi_mutex); + int r = pthread_mutex_unlock(&mutex->pmutex); + assert_zero(r); +} + +inline void toku_mutex_lock_with_source_location(toku_mutex_t *mutex, + const char *src_file, + int src_line) { + toku_mutex_instrumentation mutex_instr; + toku_instr_mutex_lock_start(mutex_instr, *mutex, src_file, src_line); + + const int r = pthread_mutex_lock(&mutex->pmutex); + toku_instr_mutex_lock_end(mutex_instr, r); + + assert_zero(r); +#if defined(TOKU_PTHREAD_DEBUG) + invariant(mutex->valid); + invariant(!mutex->locked); + invariant(mutex->owner == 0); + mutex->locked = true; + mutex->owner = pthread_self(); +#endif // defined(TOKU_PTHREAD_DEBUG) +} + +inline int toku_mutex_trylock_with_source_location(toku_mutex_t *mutex, + const char *src_file, + int src_line) { + toku_mutex_instrumentation mutex_instr; + toku_instr_mutex_trylock_start(mutex_instr, *mutex, src_file, src_line); + + const int r = pthread_mutex_lock(&mutex->pmutex); + toku_instr_mutex_lock_end(mutex_instr, r); + +#if defined(TOKU_PTHREAD_DEBUG) + if (r == 0) { + invariant(mutex->valid); + invariant(!mutex->locked); + invariant(mutex->owner == 0); + mutex->locked = true; + mutex->owner = pthread_self(); + } +#endif // defined(TOKU_PTHREAD_DEBUG) + return r; +} + +#define toku_cond_wait(C, M) \ + toku_cond_wait_with_source_location(C, M, __FILE__, __LINE__) + +#define toku_cond_timedwait(C, M, W) \ + toku_cond_timedwait_with_source_location(C, M, W, __FILE__, __LINE__) + +inline void toku_cond_init(const toku_instr_key &key, toku_cond_t *cond, + const pthread_condattr_t *attr) { + toku_instr_cond_init(key, *cond); + int r = pthread_cond_init(&cond->pcond, attr); + assert_zero(r); +} + +inline void toku_cond_destroy(toku_cond_t *cond) { + toku_instr_cond_destroy(cond->psi_cond); + int r = pthread_cond_destroy(&cond->pcond); + assert_zero(r); +} + +inline void toku_cond_wait_with_source_location(toku_cond_t *cond, + toku_mutex_t *mutex, + const char *src_file, + int src_line) { +#if defined(TOKU_PTHREAD_DEBUG) + invariant(mutex->locked); + mutex->locked = false; + mutex->owner = 0; +#endif // defined(TOKU_PTHREAD_DEBUG) + + /* Instrumentation start */ + toku_cond_instrumentation cond_instr; + toku_instr_cond_wait_start(cond_instr, toku_instr_cond_op::cond_wait, *cond, + *mutex, src_file, src_line); + + /* Instrumented code */ + const int r = pthread_cond_wait(&cond->pcond, &mutex->pmutex); + + /* Instrumentation end */ + toku_instr_cond_wait_end(cond_instr, r); + + assert_zero(r); +#if defined(TOKU_PTHREAD_DEBUG) + invariant(!mutex->locked); + mutex->locked = true; + 
mutex->owner = pthread_self(); +#endif // defined(TOKU_PTHREAD_DEBUG) +} + +inline int toku_cond_timedwait_with_source_location(toku_cond_t *cond, + toku_mutex_t *mutex, + toku_timespec_t *wakeup_at, + const char *src_file, + int src_line) { +#if defined(TOKU_PTHREAD_DEBUG) + invariant(mutex->locked); + mutex->locked = false; + mutex->owner = 0; +#endif // defined(TOKU_PTHREAD_DEBUG) + + /* Instrumentation start */ + toku_cond_instrumentation cond_instr; + toku_instr_cond_wait_start(cond_instr, toku_instr_cond_op::cond_timedwait, + *cond, *mutex, src_file, src_line); + + /* Instrumented code */ + const int r = pthread_cond_timedwait(&cond->pcond, &mutex->pmutex, wakeup_at); + + /* Instrumentation end */ + toku_instr_cond_wait_end(cond_instr, r); + +#if defined(TOKU_PTHREAD_DEBUG) + invariant(!mutex->locked); + mutex->locked = true; + mutex->owner = pthread_self(); +#endif // defined(TOKU_PTHREAD_DEBUG) + return r; +} + +inline void toku_cond_signal(toku_cond_t *cond) { + toku_instr_cond_signal(*cond); + const int r = pthread_cond_signal(&cond->pcond); + assert_zero(r); +} + +inline void toku_cond_broadcast(toku_cond_t *cond) { + toku_instr_cond_broadcast(*cond); + const int r = pthread_cond_broadcast(&cond->pcond); + assert_zero(r); +} + +inline void toku_mutex_init(const toku_instr_key &key, toku_mutex_t *mutex, + const toku_pthread_mutexattr_t *attr) { +#if defined(TOKU_PTHREAD_DEBUG) + mutex->valid = true; +#endif // defined(TOKU_PTHREAD_DEBUG) + toku_instr_mutex_init(key, *mutex); + const int r = pthread_mutex_init(&mutex->pmutex, attr); + assert_zero(r); +#if defined(TOKU_PTHREAD_DEBUG) + mutex->locked = false; + invariant(mutex->valid); + mutex->valid = true; + mutex->owner = 0; +#endif // defined(TOKU_PTHREAD_DEBUG) +} + +inline void toku_mutex_destroy(toku_mutex_t *mutex) { +#if defined(TOKU_PTHREAD_DEBUG) + invariant(mutex->valid); + mutex->valid = false; + invariant(!mutex->locked); +#endif // defined(TOKU_PTHREAD_DEBUG) + toku_instr_mutex_destroy(mutex->psi_mutex); + int r = pthread_mutex_destroy(&mutex->pmutex); + assert_zero(r); +} + +#define toku_pthread_rwlock_rdlock(RW) \ + toku_pthread_rwlock_rdlock_with_source_location(RW, __FILE__, __LINE__) + +#define toku_pthread_rwlock_wrlock(RW) \ + toku_pthread_rwlock_wrlock_with_source_location(RW, __FILE__, __LINE__) + +#if 0 +inline void toku_pthread_rwlock_init( + const toku_instr_key &key, + toku_pthread_rwlock_t *__restrict rwlock, + const toku_pthread_rwlockattr_t *__restrict attr) { + toku_instr_rwlock_init(key, *rwlock); + int r = pthread_rwlock_init(&rwlock->rwlock, attr); + assert_zero(r); +} + +inline void toku_pthread_rwlock_destroy(toku_pthread_rwlock_t *rwlock) { + toku_instr_rwlock_destroy(rwlock->psi_rwlock); + int r = pthread_rwlock_destroy(&rwlock->rwlock); + assert_zero(r); +} + +inline void toku_pthread_rwlock_rdlock_with_source_location( + toku_pthread_rwlock_t *rwlock, + const char *src_file, + uint src_line) { + + /* Instrumentation start */ + toku_rwlock_instrumentation rwlock_instr; + toku_instr_rwlock_rdlock_wait_start( + rwlock_instr, *rwlock, src_file, src_line); + /* Instrumented code */ + const int r = pthread_rwlock_rdlock(&rwlock->rwlock); + + /* Instrumentation end */ + toku_instr_rwlock_rdlock_wait_end(rwlock_instr, r); + + assert_zero(r); +} + +inline void toku_pthread_rwlock_wrlock_with_source_location( + toku_pthread_rwlock_t *rwlock, + const char *src_file, + uint src_line) { + + /* Instrumentation start */ + toku_rwlock_instrumentation rwlock_instr; + toku_instr_rwlock_wrlock_wait_start( + 
rwlock_instr, *rwlock, src_file, src_line); + /* Instrumented code */ + const int r = pthread_rwlock_wrlock(&rwlock->rwlock); + + /* Instrumentation end */ + toku_instr_rwlock_wrlock_wait_end(rwlock_instr, r); + + assert_zero(r); +} + +inline void toku_pthread_rwlock_rdunlock(toku_pthread_rwlock_t *rwlock) { + toku_instr_rwlock_unlock(*rwlock); + const int r = pthread_rwlock_unlock(&rwlock->rwlock); + assert_zero(r); +} + +inline void toku_pthread_rwlock_wrunlock(toku_pthread_rwlock_t *rwlock) { + toku_instr_rwlock_unlock(*rwlock); + const int r = pthread_rwlock_unlock(&rwlock->rwlock); + assert_zero(r); +} +#endif + +static inline int toku_pthread_join(toku_pthread_t thread, void **value_ptr) { + return pthread_join(thread, value_ptr); +} + +static inline int toku_pthread_detach(toku_pthread_t thread) { + return pthread_detach(thread); +} + +static inline int toku_pthread_key_create(toku_pthread_key_t *key, + void (*destroyf)(void *)) { + return pthread_key_create(key, destroyf); +} + +static inline int toku_pthread_key_delete(toku_pthread_key_t key) { + return pthread_key_delete(key); +} + +static inline void *toku_pthread_getspecific(toku_pthread_key_t key) { + return pthread_getspecific(key); +} + +static inline int toku_pthread_setspecific(toku_pthread_key_t key, void *data) { + return pthread_setspecific(key, data); +} + +int toku_pthread_yield(void) __attribute__((__visibility__("default"))); + +static inline toku_pthread_t toku_pthread_self(void) { return pthread_self(); } + +static inline void *toku_pthread_done(void *exit_value) { + toku_instr_delete_current_thread(); + pthread_exit(exit_value); + return nullptr; // Avoid compiler warning +} diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_race_tools.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_race_tools.h new file mode 100644 index 000000000..3cb5b5790 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_race_tools.h @@ -0,0 +1,179 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . 
+ +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +// PORT2: #include + +#ifdef HAVE_valgrind +#undef USE_VALGRIND +#define USE_VALGRIND 1 +#endif + +#if defined(__linux__) && USE_VALGRIND + +#include +#include + +#define TOKU_ANNOTATE_NEW_MEMORY(p, size) ANNOTATE_NEW_MEMORY(p, size) +#define TOKU_VALGRIND_HG_ENABLE_CHECKING(p, size) \ + VALGRIND_HG_ENABLE_CHECKING(p, size) +#define TOKU_VALGRIND_HG_DISABLE_CHECKING(p, size) \ + VALGRIND_HG_DISABLE_CHECKING(p, size) +#define TOKU_DRD_IGNORE_VAR(v) DRD_IGNORE_VAR(v) +#define TOKU_DRD_STOP_IGNORING_VAR(v) DRD_STOP_IGNORING_VAR(v) +#define TOKU_ANNOTATE_IGNORE_READS_BEGIN() ANNOTATE_IGNORE_READS_BEGIN() +#define TOKU_ANNOTATE_IGNORE_READS_END() ANNOTATE_IGNORE_READS_END() +#define TOKU_ANNOTATE_IGNORE_WRITES_BEGIN() ANNOTATE_IGNORE_WRITES_BEGIN() +#define TOKU_ANNOTATE_IGNORE_WRITES_END() ANNOTATE_IGNORE_WRITES_END() + +/* + * How to make helgrind happy about tree rotations and new mutex orderings: + * + * // Tell helgrind that we unlocked it so that the next call doesn't get a + * "destroyed a locked mutex" error. + * // Tell helgrind that we destroyed the mutex. + * VALGRIND_HG_MUTEX_UNLOCK_PRE(&locka); + * VALGRIND_HG_MUTEX_DESTROY_PRE(&locka); + * + * // And recreate it. It would be better to simply be able to say that the + * order on these two can now be reversed, because this code forgets all the + * ordering information for this mutex. + * // Then tell helgrind that we have locked it again. + * VALGRIND_HG_MUTEX_INIT_POST(&locka, 0); + * VALGRIND_HG_MUTEX_LOCK_POST(&locka); + * + * When the ordering of two locks changes, we don't need tell Helgrind about do + * both locks. Just one is good enough. + */ + +#define TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(mutex) \ + VALGRIND_HG_MUTEX_UNLOCK_PRE(mutex); \ + VALGRIND_HG_MUTEX_DESTROY_PRE(mutex); \ + VALGRIND_HG_MUTEX_INIT_POST(mutex, 0); \ + VALGRIND_HG_MUTEX_LOCK_POST(mutex); + +#else // !defined(__linux__) || !USE_VALGRIND + +#define NVALGRIND 1 +#define TOKU_ANNOTATE_NEW_MEMORY(p, size) ((void)0) +#define TOKU_VALGRIND_HG_ENABLE_CHECKING(p, size) ((void)0) +#define TOKU_VALGRIND_HG_DISABLE_CHECKING(p, size) ((void)0) +#define TOKU_DRD_IGNORE_VAR(v) +#define TOKU_DRD_STOP_IGNORING_VAR(v) +#define TOKU_ANNOTATE_IGNORE_READS_BEGIN() ((void)0) +#define TOKU_ANNOTATE_IGNORE_READS_END() ((void)0) +#define TOKU_ANNOTATE_IGNORE_WRITES_BEGIN() ((void)0) +#define TOKU_ANNOTATE_IGNORE_WRITES_END() ((void)0) +#define TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(mutex) +#undef RUNNING_ON_VALGRIND +#define RUNNING_ON_VALGRIND (0U) +#endif + +// Valgrind 3.10.1 (and previous versions). +// Problems with VALGRIND_HG_DISABLE_CHECKING and VALGRIND_HG_ENABLE_CHECKING. +// Helgrind's implementation of disable and enable checking causes false races +// to be reported. 
In addition, the race report does not include ANY +// information about the code that uses the helgrind disable and enable +// functions. Therefore, it is very difficult to figure out the cause of the +// race. DRD does implement the disable and enable functions. + +// Problems with ANNOTATE_IGNORE_READS. +// Helgrind does not implement ignore reads. +// Annotate ignore reads is the way to inform DRD to ignore racy reads. + +// FT code uses unsafe reads in several places. These unsafe reads have been +// noted as valid since they use the toku_unsafe_fetch function. Unfortunately, +// this causes helgrind to report erroneous data races which makes use of +// helgrind problematic. + +// Unsafely fetch and return a `T' from src, telling drd to ignore +// racey access to src for the next sizeof(*src) bytes +template +T toku_unsafe_fetch(T *src) { + if (0) + TOKU_VALGRIND_HG_DISABLE_CHECKING(src, + sizeof *src); // disabled, see comment + TOKU_ANNOTATE_IGNORE_READS_BEGIN(); + T r = *src; + TOKU_ANNOTATE_IGNORE_READS_END(); + if (0) + TOKU_VALGRIND_HG_ENABLE_CHECKING(src, + sizeof *src); // disabled, see comment + return r; +} + +template +T toku_unsafe_fetch(T &src) { + return toku_unsafe_fetch(&src); +} + +// Unsafely set a `T' value into *dest from src, telling drd to ignore +// racey access to dest for the next sizeof(*dest) bytes +template +void toku_unsafe_set(T *dest, const T src) { + if (0) + TOKU_VALGRIND_HG_DISABLE_CHECKING(dest, + sizeof *dest); // disabled, see comment + TOKU_ANNOTATE_IGNORE_WRITES_BEGIN(); + *dest = src; + TOKU_ANNOTATE_IGNORE_WRITES_END(); + if (0) + TOKU_VALGRIND_HG_ENABLE_CHECKING(dest, + sizeof *dest); // disabled, see comment +} + +template +void toku_unsafe_set(T &dest, const T src) { + toku_unsafe_set(&dest, src); +} diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h new file mode 100644 index 000000000..46111e7f0 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h @@ -0,0 +1,193 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . 
+ +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +// PORT2: #include "toku_config.h" + +#include +#include +#include +#if defined(__powerpc__) +#include +#endif + +#if 0 +static inline float toku_tdiff (struct timeval *a, struct timeval *b) { + return (float)((a->tv_sec - b->tv_sec) + 1e-6 * (a->tv_usec - b->tv_usec)); +} +// PORT2: temporary: +#define HAVE_CLOCK_REALTIME +#if !defined(HAVE_CLOCK_REALTIME) +// OS X does not have clock_gettime, we fake clockid_t for the interface, and we'll implement it with clock_get_time. +typedef int clockid_t; +// just something bogus, it doesn't matter, we just want to make sure we're +// only supporting this mode because we're not sure we can support other modes +// without a real clock_gettime() +#define CLOCK_REALTIME 0x01867234 +#endif +int toku_clock_gettime(clockid_t clk_id, struct timespec *ts) __attribute__((__visibility__("default"))); +#endif + +// *************** Performance timers ************************ +// What do you really want from a performance timer: +// (1) Can determine actual time of day from the performance time. +// (2) Time goes forward, never backward. +// (3) Same time on different processors (or even different machines). +// (4) Time goes forward at a constant rate (doesn't get faster and slower) +// (5) Portable. +// (6) Getting the time is cheap. +// Unfortuately it seems tough to get Properties 1-5. So we go for Property 6,, +// but we abstract it. We offer a type tokutime_t which can hold the time. This +// type can be subtracted to get a time difference. We can get the present time +// cheaply. We can convert this type to seconds (but that can be expensive). The +// implementation is to use RDTSC (hence we lose property 3: not portable). +// Recent machines have constant_tsc in which case we get property (4). +// Recent OSs on recent machines (that have RDTSCP) fix the per-processor clock +// skew, so we get property (3). We get property 2 with RDTSC (as long as +// there's not any skew). We don't even try to get propety 1, since we don't +// need it. The decision here is that these times are really accurate only on +// modern machines with modern OSs. +typedef uint64_t tokutime_t; // Time type used in by tokutek timers. + +#if 0 +// The value of tokutime_t is not specified here. +// It might be microseconds since 1/1/1970 (if gettimeofday() is +// used), or clock cycles since boot (if rdtsc is used). Or something +// else. +// Two tokutime_t values can be subtracted to get a time difference. +// Use tokutime_to_seconds to that convert difference to seconds. 
+// We want get_tokutime() to be fast, but don't care so much about tokutime_to_seconds(); +// +// For accurate time calculations do the subtraction in the right order: +// Right: tokutime_to_seconds(t1-t2); +// Wrong tokutime_to_seconds(t1)-toku_time_to_seconds(t2); +// Doing it the wrong way is likely to result in loss of precision. +// A double can hold numbers up to about 53 bits. RDTSC which uses about 33 bits every second, so that leaves +// 2^20 seconds from booting (about 2 weeks) before the RDTSC value cannot be represented accurately as a double. +// +double tokutime_to_seconds(tokutime_t) __attribute__((__visibility__("default"))); // Convert tokutime to seconds. + +#endif + +// Get the value of tokutime for right now. We want this to be fast, so we +// expose the implementation as RDTSC. +static inline tokutime_t toku_time_now(void) { +#if defined(__x86_64__) || defined(__i386__) + uint32_t lo, hi; + __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); + return (uint64_t)hi << 32 | lo; +#elif defined(__aarch64__) + uint64_t result; + __asm __volatile__("mrs %[rt], cntvct_el0" : [rt] "=r"(result)); + return result; +#elif defined(__powerpc__) + return __ppc_get_timebase(); +#elif defined(__s390x__) + uint64_t result; + asm volatile("stckf %0" : "=Q"(result) : : "cc"); + return result; +#elif defined(__riscv) && __riscv_xlen == 32 + uint32_t cycles_lo, cycles_hi0, cycles_hi1; + // Implemented in assembly because Clang insisted on branching. + asm volatile( + "rdcycleh %0\n" + "rdcycle %1\n" + "rdcycleh %2\n" + "sub %0, %0, %2\n" + "seqz %0, %0\n" + "sub %0, zero, %0\n" + "and %1, %1, %0\n" + : "=r"(cycles_hi0), "=r"(cycles_lo), "=r"(cycles_hi1)); + return (static_cast(cycles_hi1) << 32) | cycles_lo; +#elif defined(__riscv) && __riscv_xlen == 64 + uint64_t cycles; + asm volatile("rdcycle %0" : "=r"(cycles)); + return cycles; +#else +#error No timer implementation for this platform +#endif +} + +static inline uint64_t toku_current_time_microsec(void) { + struct timeval t; + gettimeofday(&t, NULL); + return t.tv_sec * (1UL * 1000 * 1000) + t.tv_usec; +} + +#if 0 +// sleep microseconds +static inline void toku_sleep_microsec(uint64_t ms) { + struct timeval t; + + t.tv_sec = ms / 1000000; + t.tv_usec = ms % 1000000; + + select(0, NULL, NULL, NULL, &t); +} +#endif + +/* + PORT: Usage of this file: + + uint64_t toku_current_time_microsec() // uses gettimeoday + is used to track how much time various operations took (for example, lock + escalation). (TODO: it is not clear why these operations are tracked with + microsecond precision while others use nanoseconds) + + tokutime_t toku_time_now() // uses rdtsc + seems to be used for a very similar purpose. This has greater precision + + RocksDB environment provides Env::Default()->NowMicros() and NowNanos() which + should be adequate substitutes. +*/ diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/txn_subst.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/txn_subst.h new file mode 100644 index 000000000..803914862 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/txn_subst.h @@ -0,0 +1,27 @@ +// +// A substitute for ft/txn/txn.h +// +#pragma once + +#include + +#include "../util/omt.h" + +typedef uint64_t TXNID; +#define TXNID_NONE ((TXNID)0) + +// A set of transactions +// (TODO: consider using class toku::txnid_set. 
The reason for using STL +// container was that its API is easier) +class TxnidVector : public std::set { + public: + bool contains(TXNID txnid) { return find(txnid) != end(); } +}; + +// A value for lock structures with a meaning "the lock is owned by multiple +// transactions (and one has to check the TxnidVector to get their ids) +#define TXNID_SHARED (TXNID(-1)) + +// Auxiliary value meaning "any transaction id will do". No real transaction +// may have this is as id. +#define TXNID_ANY (TXNID(-2)) diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/standalone_port.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/standalone_port.cc new file mode 100644 index 000000000..50dc879ce --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/standalone_port.cc @@ -0,0 +1,132 @@ +#ifndef ROCKSDB_LITE +#ifndef OS_WIN +/* + This is a dump ground to make Lock Tree work without the rest of TokuDB. +*/ +#include + +#include "db.h" +#include "ft/ft-status.h" +#include "portability/memory.h" +#include "util/dbt.h" + +// portability/os_malloc.cc + +void toku_free(void *p) { free(p); } + +void *toku_xmalloc(size_t size) { return malloc(size); } + +void *toku_xrealloc(void *v, size_t size) { return realloc(v, size); } + +void *toku_xmemdup(const void *v, size_t len) { + void *p = toku_xmalloc(len); + memcpy(p, v, len); + return p; +} + +// TODO: what are the X-functions? Xcalloc, Xrealloc? +void *toku_xcalloc(size_t nmemb, size_t size) { return calloc(nmemb, size); } + +// ft-ft-opts.cc: + +// locktree +toku_instr_key lock_request_m_wait_cond_key; +toku_instr_key manager_m_escalator_done_key; +toku_instr_key locktree_request_info_mutex_key; +toku_instr_key locktree_request_info_retry_mutex_key; +toku_instr_key locktree_request_info_retry_cv_key; + +toku_instr_key treenode_mutex_key; +toku_instr_key manager_mutex_key; +toku_instr_key manager_escalation_mutex_key; +toku_instr_key manager_escalator_mutex_key; + +// portability/memory.cc +size_t toku_memory_footprint(void *, size_t touched) { return touched; } + +// ft/ft-status.c +// PORT2: note: the @c parameter to TOKUFT_STATUS_INIT must not start with +// "TOKU" +LTM_STATUS_S ltm_status; +void LTM_STATUS_S::init() { + if (m_initialized) return; +#define LTM_STATUS_INIT(k, c, t, l) \ + TOKUFT_STATUS_INIT((*this), k, c, t, "locktree: " l, \ + TOKU_ENGINE_STATUS | TOKU_GLOBAL_STATUS) + LTM_STATUS_INIT(LTM_SIZE_CURRENT, LOCKTREE_MEMORY_SIZE, STATUS_UINT64, + "memory size"); + LTM_STATUS_INIT(LTM_SIZE_LIMIT, LOCKTREE_MEMORY_SIZE_LIMIT, STATUS_UINT64, + "memory size limit"); + LTM_STATUS_INIT(LTM_ESCALATION_COUNT, LOCKTREE_ESCALATION_NUM, STATUS_UINT64, + "number of times lock escalation ran"); + LTM_STATUS_INIT(LTM_ESCALATION_TIME, LOCKTREE_ESCALATION_SECONDS, + STATUS_TOKUTIME, "time spent running escalation (seconds)"); + LTM_STATUS_INIT(LTM_ESCALATION_LATEST_RESULT, + LOCKTREE_LATEST_POST_ESCALATION_MEMORY_SIZE, STATUS_UINT64, + "latest post-escalation memory size"); + LTM_STATUS_INIT(LTM_NUM_LOCKTREES, LOCKTREE_OPEN_CURRENT, STATUS_UINT64, + "number of locktrees open now"); + LTM_STATUS_INIT(LTM_LOCK_REQUESTS_PENDING, LOCKTREE_PENDING_LOCK_REQUESTS, + STATUS_UINT64, "number of pending lock requests"); + LTM_STATUS_INIT(LTM_STO_NUM_ELIGIBLE, LOCKTREE_STO_ELIGIBLE_NUM, + STATUS_UINT64, "number of locktrees eligible for the STO"); + LTM_STATUS_INIT(LTM_STO_END_EARLY_COUNT, LOCKTREE_STO_ENDED_NUM, + STATUS_UINT64, + "number of times a locktree ended the STO early"); + 
LTM_STATUS_INIT(LTM_STO_END_EARLY_TIME, LOCKTREE_STO_ENDED_SECONDS, + STATUS_TOKUTIME, "time spent ending the STO early (seconds)"); + LTM_STATUS_INIT(LTM_WAIT_COUNT, LOCKTREE_WAIT_COUNT, STATUS_UINT64, + "number of wait locks"); + LTM_STATUS_INIT(LTM_WAIT_TIME, LOCKTREE_WAIT_TIME, STATUS_UINT64, + "time waiting for locks"); + LTM_STATUS_INIT(LTM_LONG_WAIT_COUNT, LOCKTREE_LONG_WAIT_COUNT, STATUS_UINT64, + "number of long wait locks"); + LTM_STATUS_INIT(LTM_LONG_WAIT_TIME, LOCKTREE_LONG_WAIT_TIME, STATUS_UINT64, + "long time waiting for locks"); + LTM_STATUS_INIT(LTM_TIMEOUT_COUNT, LOCKTREE_TIMEOUT_COUNT, STATUS_UINT64, + "number of lock timeouts"); + LTM_STATUS_INIT(LTM_WAIT_ESCALATION_COUNT, LOCKTREE_WAIT_ESCALATION_COUNT, + STATUS_UINT64, "number of waits on lock escalation"); + LTM_STATUS_INIT(LTM_WAIT_ESCALATION_TIME, LOCKTREE_WAIT_ESCALATION_TIME, + STATUS_UINT64, "time waiting on lock escalation"); + LTM_STATUS_INIT(LTM_LONG_WAIT_ESCALATION_COUNT, + LOCKTREE_LONG_WAIT_ESCALATION_COUNT, STATUS_UINT64, + "number of long waits on lock escalation"); + LTM_STATUS_INIT(LTM_LONG_WAIT_ESCALATION_TIME, + LOCKTREE_LONG_WAIT_ESCALATION_TIME, STATUS_UINT64, + "long time waiting on lock escalation"); + + m_initialized = true; +#undef LTM_STATUS_INIT +} +void LTM_STATUS_S::destroy() { + if (!m_initialized) return; + for (int i = 0; i < LTM_STATUS_NUM_ROWS; ++i) { + if (status[i].type == STATUS_PARCOUNT) { + // PORT: TODO?? destroy_partitioned_counter(status[i].value.parcount); + } + } +} + +int toku_keycompare(const void *key1, size_t key1len, const void *key2, + size_t key2len) { + size_t comparelen = key1len < key2len ? key1len : key2len; + int c = memcmp(key1, key2, comparelen); + if (__builtin_expect(c != 0, 1)) { + return c; + } else { + if (key1len < key2len) { + return -1; + } else if (key1len > key2len) { + return 1; + } else { + return 0; + } + } +} + +int toku_builtin_compare_fun(const DBT *a, const DBT *b) { + return toku_keycompare(a->data, a->size, b->data, b->size); +} +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc new file mode 100644 index 000000000..63cc3a267 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc @@ -0,0 +1,153 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ifndef ROCKSDB_LITE +#ifndef OS_WIN +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. 
+ + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include "dbt.h" + +#include + +#include "../db.h" +#include "../portability/memory.h" + +DBT *toku_init_dbt(DBT *dbt) { + memset(dbt, 0, sizeof(*dbt)); + return dbt; +} + +DBT toku_empty_dbt(void) { + static const DBT empty_dbt = {.data = 0, .size = 0, .ulen = 0, .flags = 0}; + return empty_dbt; +} + +DBT *toku_init_dbt_flags(DBT *dbt, uint32_t flags) { + toku_init_dbt(dbt); + dbt->flags = flags; + return dbt; +} + +void toku_destroy_dbt(DBT *dbt) { + switch (dbt->flags) { + case DB_DBT_MALLOC: + case DB_DBT_REALLOC: + toku_free(dbt->data); + toku_init_dbt(dbt); + break; + } +} + +DBT *toku_fill_dbt(DBT *dbt, const void *k, size_t len) { + toku_init_dbt(dbt); + dbt->size = len; + dbt->data = (char *)k; + return dbt; +} + +DBT *toku_memdup_dbt(DBT *dbt, const void *k, size_t len) { + toku_init_dbt_flags(dbt, DB_DBT_MALLOC); + dbt->size = len; + dbt->data = toku_xmemdup(k, len); + return dbt; +} + +DBT *toku_copyref_dbt(DBT *dst, const DBT src) { + dst->flags = 0; + dst->ulen = 0; + dst->size = src.size; + dst->data = src.data; + return dst; +} + +DBT *toku_clone_dbt(DBT *dst, const DBT &src) { + return toku_memdup_dbt(dst, src.data, src.size); +} + +void toku_sdbt_cleanup(struct simple_dbt *sdbt) { + if (sdbt->data) toku_free(sdbt->data); + memset(sdbt, 0, sizeof(*sdbt)); +} + +const DBT *toku_dbt_positive_infinity(void) { + static DBT positive_infinity_dbt = { + .data = 0, .size = 0, .ulen = 0, .flags = 0}; // port + return &positive_infinity_dbt; +} + +const DBT *toku_dbt_negative_infinity(void) { + static DBT negative_infinity_dbt = { + .data = 0, .size = 0, .ulen = 0, .flags = 0}; // port + return &negative_infinity_dbt; +} + +bool toku_dbt_is_infinite(const DBT *dbt) { + return dbt == toku_dbt_positive_infinity() || + dbt == toku_dbt_negative_infinity(); +} + +bool toku_dbt_is_empty(const DBT *dbt) { + // can't have a null data field with a non-zero size + paranoid_invariant(dbt->data != nullptr || dbt->size == 0); + return dbt->data == nullptr; +} + +int toku_dbt_infinite_compare(const DBT *a, const DBT *b) { + if (a == b) { + return 0; + } else if (a == toku_dbt_positive_infinity()) { + return 1; + } else if (b == toku_dbt_positive_infinity()) { + return -1; + } else if (a == toku_dbt_negative_infinity()) { + return -1; + } else { + invariant(b == toku_dbt_negative_infinity()); + return 1; + } +} + +bool toku_dbt_equals(const DBT *a, const DBT *b) { + if (!toku_dbt_is_infinite(a) && !toku_dbt_is_infinite(b)) { + return a->data == b->data && a->size == b->size; + } else { + // a or b is infinite, so they're equal if they are the same infinite + return a == b ? 
true : false; + } +} +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.h new file mode 100644 index 000000000..d86c440f8 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.h @@ -0,0 +1,98 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include "../db.h" + +// TODO: John +// Document this API a little better so that DBT +// memory management can be morm widely understood. 
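(Port note, not part of the imported file.) Since the TODO above asks for clearer documentation of DBT memory management, here is a minimal sketch of how the helpers defined in dbt.cc are intended to be combined. It assumes only the dbt.h/db.h headers from this import; the function name dbt_usage_sketch is hypothetical.

    #include "dbt.h"  // declares DBT helpers; db.h (pulled in by dbt.h) defines DB_DBT_MALLOC

    static void dbt_usage_sketch(void) {
      const char key[] = "some-key";

      // Borrowing: toku_fill_dbt() just points the DBT at caller-owned memory;
      // flags stay 0, so toku_destroy_dbt() would be a no-op and the caller
      // keeps ownership of `key`.
      DBT borrowed;
      toku_fill_dbt(&borrowed, key, sizeof(key));

      // Owning: toku_memdup_dbt() stores a malloc'd copy and sets DB_DBT_MALLOC,
      // so toku_destroy_dbt() frees that copy and re-initializes the DBT.
      DBT owned;
      toku_memdup_dbt(&owned, key, sizeof(key));
      toku_destroy_dbt(&owned);
    }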
+ +DBT *toku_init_dbt(DBT *); + +// returns: an initialized but empty dbt (for which toku_dbt_is_empty() is true) +DBT toku_empty_dbt(void); + +DBT *toku_init_dbt_flags(DBT *, uint32_t flags); + +void toku_destroy_dbt(DBT *); + +DBT *toku_fill_dbt(DBT *dbt, const void *k, size_t len); + +DBT *toku_memdup_dbt(DBT *dbt, const void *k, size_t len); + +DBT *toku_copyref_dbt(DBT *dst, const DBT src); + +DBT *toku_clone_dbt(DBT *dst, const DBT &src); + +void toku_sdbt_cleanup(struct simple_dbt *sdbt); + +// returns: special DBT pointer representing positive infinity +const DBT *toku_dbt_positive_infinity(void); + +// returns: special DBT pointer representing negative infinity +const DBT *toku_dbt_negative_infinity(void); + +// returns: true if the given dbt is either positive or negative infinity +bool toku_dbt_is_infinite(const DBT *dbt); + +// returns: true if the given dbt has no data (ie: dbt->data == nullptr) +bool toku_dbt_is_empty(const DBT *dbt); + +// effect: compares two potentially infinity-valued dbts +// requires: at least one is infinite (assert otherwise) +int toku_dbt_infinite_compare(const DBT *a, const DBT *b); + +// returns: true if the given dbts have the same data pointer and size +bool toku_dbt_equals(const DBT *a, const DBT *b); diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/growable_array.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/growable_array.h new file mode 100644 index 000000000..158750fdb --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/growable_array.h @@ -0,0 +1,144 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include + +//****************************************************************************** +// +// Overview: A growable array is a little bit like std::vector except that +// it doesn't have constructors (hence can be used in static constructs, since +// the google style guide says no constructors), and it's a little simpler. +// Operations: +// init and deinit (we don't have constructors and destructors). +// fetch_unchecked to get values out. +// store_unchecked to put values in. +// push to add an element at the end +// get_size to find out the size +// get_memory_size to find out how much memory the data stucture is using. +// +//****************************************************************************** + +namespace toku { + +template +class GrowableArray { + public: + void init(void) + // Effect: Initialize the array to contain no elements. + { + m_array = NULL; + m_size = 0; + m_size_limit = 0; + } + + void deinit(void) + // Effect: Deinitialize the array (freeing any memory it uses, for example). + { + toku_free(m_array); + m_array = NULL; + m_size = 0; + m_size_limit = 0; + } + + T fetch_unchecked(size_t i) const + // Effect: Fetch the ith element. If i is out of range, the system asserts. + { + return m_array[i]; + } + + void store_unchecked(size_t i, T v) + // Effect: Store v in the ith element. If i is out of range, the system + // asserts. + { + paranoid_invariant(i < m_size); + m_array[i] = v; + } + + void push(T v) + // Effect: Add v to the end of the array (increasing the size). The amortized + // cost of this operation is constant. Implementation hint: Double the size + // of the array when it gets too big so that the amortized cost stays + // constant. + { + if (m_size >= m_size_limit) { + if (m_array == NULL) { + m_size_limit = 1; + } else { + m_size_limit *= 2; + } + XREALLOC_N(m_size_limit, m_array); + } + m_array[m_size++] = v; + } + + size_t get_size(void) const + // Effect: Return the number of elements in the array. + { + return m_size; + } + size_t memory_size(void) const + // Effect: Return the size (in bytes) that the array occupies in memory. This + // is really only an estimate. + { + return sizeof(*this) + sizeof(T) * m_size_limit; + } + + private: + T *m_array; + size_t m_size; + size_t m_size_limit; // How much space is allocated in array. +}; + +} // namespace toku diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.cc new file mode 100644 index 000000000..0e7a9880b --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.cc @@ -0,0 +1,201 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ifndef ROCKSDB_LITE +#ifndef OS_WIN +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. 
+ + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include "memarena.h" + +#include + +#include + +#include "../portability/memory.h" + +void memarena::create(size_t initial_size) { + _current_chunk = arena_chunk(); + _other_chunks = nullptr; + _size_of_other_chunks = 0; + _footprint_of_other_chunks = 0; + _n_other_chunks = 0; + + _current_chunk.size = initial_size; + if (_current_chunk.size > 0) { + XMALLOC_N(_current_chunk.size, _current_chunk.buf); + } +} + +void memarena::destroy(void) { + if (_current_chunk.buf) { + toku_free(_current_chunk.buf); + } + for (int i = 0; i < _n_other_chunks; i++) { + toku_free(_other_chunks[i].buf); + } + if (_other_chunks) { + toku_free(_other_chunks); + } + _current_chunk = arena_chunk(); + _other_chunks = nullptr; + _n_other_chunks = 0; +} + +static size_t round_to_page(size_t size) { + const size_t page_size = 4096; + const size_t r = page_size + ((size - 1) & ~(page_size - 1)); + assert((r & (page_size - 1)) == 0); // make sure it's aligned + assert(r >= size); // make sure it's not too small + assert(r < + size + page_size); // make sure we didn't grow by more than a page. + return r; +} + +static const size_t MEMARENA_MAX_CHUNK_SIZE = 64 * 1024 * 1024; + +void *memarena::malloc_from_arena(size_t size) { + if (_current_chunk.buf == nullptr || + _current_chunk.size < _current_chunk.used + size) { + // The existing block isn't big enough. + // Add the block to the vector of blocks. + if (_current_chunk.buf) { + invariant(_current_chunk.size > 0); + int old_n = _n_other_chunks; + XREALLOC_N(old_n + 1, _other_chunks); + _other_chunks[old_n] = _current_chunk; + _n_other_chunks = old_n + 1; + _size_of_other_chunks += _current_chunk.size; + _footprint_of_other_chunks += + toku_memory_footprint(_current_chunk.buf, _current_chunk.used); + } + + // Make a new one. 
Grow the buffer size exponentially until we hit + // the max chunk size, but make it at least `size' bytes so the + // current allocation always fit. + size_t new_size = + std::min(MEMARENA_MAX_CHUNK_SIZE, 2 * _current_chunk.size); + if (new_size < size) { + new_size = size; + } + new_size = round_to_page( + new_size); // at least size, but round to the next page size + XMALLOC_N(new_size, _current_chunk.buf); + _current_chunk.used = 0; + _current_chunk.size = new_size; + } + invariant(_current_chunk.buf != nullptr); + + // allocate in the existing block. + char *p = _current_chunk.buf + _current_chunk.used; + _current_chunk.used += size; + return p; +} + +void memarena::move_memory(memarena *dest) { + // Move memory to dest + XREALLOC_N(dest->_n_other_chunks + _n_other_chunks + 1, dest->_other_chunks); + dest->_size_of_other_chunks += _size_of_other_chunks + _current_chunk.size; + dest->_footprint_of_other_chunks += + _footprint_of_other_chunks + + toku_memory_footprint(_current_chunk.buf, _current_chunk.used); + for (int i = 0; i < _n_other_chunks; i++) { + dest->_other_chunks[dest->_n_other_chunks++] = _other_chunks[i]; + } + dest->_other_chunks[dest->_n_other_chunks++] = _current_chunk; + + // Clear out this memarena's memory + toku_free(_other_chunks); + _current_chunk = arena_chunk(); + _other_chunks = nullptr; + _size_of_other_chunks = 0; + _footprint_of_other_chunks = 0; + _n_other_chunks = 0; +} + +size_t memarena::total_memory_size(void) const { + return sizeof(*this) + total_size_in_use() + + _n_other_chunks * sizeof(*_other_chunks); +} + +size_t memarena::total_size_in_use(void) const { + return _size_of_other_chunks + _current_chunk.used; +} + +size_t memarena::total_footprint(void) const { + return sizeof(*this) + _footprint_of_other_chunks + + toku_memory_footprint(_current_chunk.buf, _current_chunk.used) + + _n_other_chunks * sizeof(*_other_chunks); +} + +//////////////////////////////////////////////////////////////////////////////// + +const void *memarena::chunk_iterator::current(size_t *used) const { + if (_chunk_idx < 0) { + *used = _ma->_current_chunk.used; + return _ma->_current_chunk.buf; + } else if (_chunk_idx < _ma->_n_other_chunks) { + *used = _ma->_other_chunks[_chunk_idx].used; + return _ma->_other_chunks[_chunk_idx].buf; + } + *used = 0; + return nullptr; +} + +void memarena::chunk_iterator::next() { _chunk_idx++; } + +bool memarena::chunk_iterator::more() const { + if (_chunk_idx < 0) { + return _ma->_current_chunk.buf != nullptr; + } + return _chunk_idx < _ma->_n_other_chunks; +} +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.h new file mode 100644 index 000000000..ddcc1144f --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.h @@ -0,0 +1,141 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. 
+ + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include + +/* + * A memarena is used to efficiently store a collection of objects that never + * move The pattern is allocate more and more stuff and free all of the items at + * once. The underlying memory will store 1 or more objects per chunk. Each + * chunk is contiguously laid out in memory but chunks are not necessarily + * contiguous with each other. + */ +class memarena { + public: + memarena() + : _current_chunk(arena_chunk()), + _other_chunks(nullptr), + _n_other_chunks(0), + _size_of_other_chunks(0), + _footprint_of_other_chunks(0) {} + + // Effect: Create a memarena with the specified initial size + void create(size_t initial_size); + + void destroy(void); + + // Effect: Allocate some memory. The returned value remains valid until the + // memarena is cleared or closed. + // In case of ENOMEM, aborts. + void *malloc_from_arena(size_t size); + + // Effect: Move all the memory from this memarena into DEST. + // When SOURCE is closed the memory won't be freed. + // When DEST is closed, the memory will be freed, unless DEST moves + // its memory to another memarena... + void move_memory(memarena *dest); + + // Effect: Calculate the amount of memory used by a memory arena. + size_t total_memory_size(void) const; + + // Effect: Calculate the used space of the memory arena (ie: excludes unused + // space) + size_t total_size_in_use(void) const; + + // Effect: Calculate the amount of memory used, according to + // toku_memory_footprint(), + // which is a more expensive but more accurate count of memory used. + size_t total_footprint(void) const; + + // iterator over the underlying chunks that store objects in the memarena. + // a chunk is represented by a pointer to const memory and a usable byte + // count. 
+ class chunk_iterator { + public: + chunk_iterator(const memarena *ma) : _ma(ma), _chunk_idx(-1) {} + + // returns: base pointer to the current chunk + // *used set to the number of usable bytes + // if more() is false, returns nullptr and *used = 0 + const void *current(size_t *used) const; + + // requires: more() is true + void next(); + + bool more() const; + + private: + // -1 represents the 'initial' chunk in a memarena, ie: ma->_current_chunk + // >= 0 represents the i'th chunk in the ma->_other_chunks array + const memarena *_ma; + int _chunk_idx; + }; + + private: + struct arena_chunk { + arena_chunk() : buf(nullptr), used(0), size(0) {} + char *buf; + size_t used; + size_t size; + }; + + struct arena_chunk _current_chunk; + struct arena_chunk *_other_chunks; + int _n_other_chunks; + size_t _size_of_other_chunks; // the buf_size of all the other chunks. + size_t _footprint_of_other_chunks; // the footprint of all the other chunks. + + friend class memarena_unit_test; +}; diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt.h new file mode 100644 index 000000000..f208002d3 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt.h @@ -0,0 +1,794 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." 
+ +#pragma once + +#include +#include + +#include "../portability/toku_portability.h" +#include "../portability/toku_race_tools.h" +#include "growable_array.h" + +namespace toku { + +/** + * Order Maintenance Tree (OMT) + * + * Maintains a collection of totally ordered values, where each value has an + * integer weight. The OMT is a mutable datatype. + * + * The Abstraction: + * + * An OMT is a vector of values, $V$, where $|V|$ is the length of the vector. + * The vector is numbered from $0$ to $|V|-1$. + * Each value has a weight. The weight of the $i$th element is denoted + * $w(V_i)$. + * + * We can create a new OMT, which is the empty vector. + * + * We can insert a new element $x$ into slot $i$, changing $V$ into $V'$ where + * $|V'|=1+|V|$ and + * + * V'_j = V_j if $ji$. + * + * We can specify $i$ using a kind of function instead of as an integer. + * Let $b$ be a function mapping from values to nonzero integers, such that + * the signum of $b$ is monotically increasing. + * We can specify $i$ as the minimum integer such that $b(V_i)>0$. + * + * We look up a value using its index, or using a Heaviside function. + * For lookups, we allow $b$ to be zero for some values, and again the signum of + * $b$ must be monotonically increasing. When lookup up values, we can look up + * $V_i$ where $i$ is the minimum integer such that $b(V_i)=0$. (With a + * special return code if no such value exists.) (Rationale: Ordinarily we want + * $i$ to be unique. But for various reasons we want to allow multiple zeros, + * and we want the smallest $i$ in that case.) $V_i$ where $i$ is the minimum + * integer such that $b(V_i)>0$. (Or an indication that no such value exists.) + * $V_i$ where $i$ is the maximum integer such that $b(V_i)<0$. (Or an + * indication that no such value exists.) + * + * When looking up a value using a Heaviside function, we get the value and its + * index. + * + * We can also split an OMT into two OMTs, splitting the weight of the values + * evenly. Find a value $j$ such that the values to the left of $j$ have about + * the same total weight as the values to the right of $j$. The resulting two + * OMTs contain the values to the left of $j$ and the values to the right of $j$ + * respectively. All of the values from the original OMT go into one of the new + * OMTs. If the weights of the values don't split exactly evenly, then the + * implementation has the freedom to choose whether the new left OMT or the new + * right OMT is larger. + * + * Performance: + * Insertion and deletion should run with $O(\log |V|)$ time and $O(\log |V|)$ + * calls to the Heaviside function. The memory required is O(|V|). + * + * Usage: + * The omt is templated by two parameters: + * - omtdata_t is what will be stored within the omt. These could be pointers + * or real data types (ints, structs). + * - omtdataout_t is what will be returned by find and related functions. By + * default, it is the same as omtdata_t, but you can set it to (omtdata_t *). To + * create an omt which will store "TXNID"s, for example, it is a good idea to + * typedef the template: typedef omt txnid_omt_t; If you are storing + * structs, you may want to be able to get a pointer to the data actually stored + * in the omt (see find_zero). 
To do this, use the second template parameter: + * typedef omt foo_omt_t; + */ + +namespace omt_internal { + +template +class subtree_templated { + private: + uint32_t m_index; + + public: + static const uint32_t NODE_NULL = UINT32_MAX; + inline void set_to_null(void) { m_index = NODE_NULL; } + + inline bool is_null(void) const { return NODE_NULL == this->get_index(); } + + inline uint32_t get_index(void) const { return m_index; } + + inline void set_index(uint32_t index) { + paranoid_invariant(index != NODE_NULL); + m_index = index; + } +} __attribute__((__packed__, aligned(4))); + +template <> +class subtree_templated { + private: + uint32_t m_bitfield; + static const uint32_t MASK_INDEX = ~(((uint32_t)1) << 31); + static const uint32_t MASK_BIT = ((uint32_t)1) << 31; + + inline void set_index_internal(uint32_t new_index) { + m_bitfield = (m_bitfield & MASK_BIT) | new_index; + } + + public: + static const uint32_t NODE_NULL = INT32_MAX; + inline void set_to_null(void) { this->set_index_internal(NODE_NULL); } + + inline bool is_null(void) const { return NODE_NULL == this->get_index(); } + + inline uint32_t get_index(void) const { + TOKU_DRD_IGNORE_VAR(m_bitfield); + const uint32_t bits = m_bitfield; + TOKU_DRD_STOP_IGNORING_VAR(m_bitfield); + return bits & MASK_INDEX; + } + + inline void set_index(uint32_t index) { + paranoid_invariant(index < NODE_NULL); + this->set_index_internal(index); + } + + inline bool get_bit(void) const { + TOKU_DRD_IGNORE_VAR(m_bitfield); + const uint32_t bits = m_bitfield; + TOKU_DRD_STOP_IGNORING_VAR(m_bitfield); + return (bits & MASK_BIT) != 0; + } + + inline void enable_bit(void) { + // These bits may be set by a thread with a write lock on some + // leaf, and the index can be read by another thread with a (read + // or write) lock on another thread. Also, the has_marks_below + // bit can be set by two threads simultaneously. Neither of these + // are real races, so if we are using DRD we should tell it to + // ignore these bits just while we set this bit. If there were a + // race in setting the index, that would be a real race. + TOKU_DRD_IGNORE_VAR(m_bitfield); + m_bitfield |= MASK_BIT; + TOKU_DRD_STOP_IGNORING_VAR(m_bitfield); + } + + inline void disable_bit(void) { m_bitfield &= MASK_INDEX; } +} __attribute__((__packed__)); + +template +class omt_node_templated { + public: + omtdata_t value; + uint32_t weight; + subtree_templated left; + subtree_templated right; + + // this needs to be in both implementations because we don't have + // a "static if" the caller can use + inline void clear_stolen_bits(void) {} +}; // note: originally this class had __attribute__((__packed__, aligned(4))) + +template +class omt_node_templated { + public: + omtdata_t value; + uint32_t weight; + subtree_templated left; + subtree_templated right; + inline bool get_marked(void) const { return left.get_bit(); } + inline void set_marked_bit(void) { return left.enable_bit(); } + inline void unset_marked_bit(void) { return left.disable_bit(); } + + inline bool get_marks_below(void) const { return right.get_bit(); } + inline void set_marks_below_bit(void) { + // This function can be called by multiple threads. + // Checking first reduces cache invalidation. 
+ if (!this->get_marks_below()) { + right.enable_bit(); + } + } + inline void unset_marks_below_bit(void) { right.disable_bit(); } + + inline void clear_stolen_bits(void) { + this->unset_marked_bit(); + this->unset_marks_below_bit(); + } +}; // note: originally this class had __attribute__((__packed__, aligned(4))) + +} // namespace omt_internal + +template +class omt { + public: + /** + * Effect: Create an empty OMT. + * Performance: constant time. + */ + void create(void); + + /** + * Effect: Create an empty OMT with no internal allocated space. + * Performance: constant time. + * Rationale: In some cases we need a valid omt but don't want to malloc. + */ + void create_no_array(void); + + /** + * Effect: Create a OMT containing values. The number of values is in + * numvalues. Stores the new OMT in *omtp. Requires: this has not been created + * yet Requires: values != NULL Requires: values is sorted Performance: + * time=O(numvalues) Rationale: Normally to insert N values takes O(N lg N) + * amortized time. If the N values are known in advance, are sorted, and the + * structure is empty, we can batch insert them much faster. + */ + __attribute__((nonnull)) void create_from_sorted_array( + const omtdata_t *const values, const uint32_t numvalues); + + /** + * Effect: Create an OMT containing values. The number of values is in + * numvalues. On success the OMT takes ownership of *values array, and sets + * values=NULL. Requires: this has not been created yet Requires: values != + * NULL Requires: *values is sorted Requires: *values was allocated with + * toku_malloc Requires: Capacity of the *values array is <= new_capacity + * Requires: On success, *values may not be accessed again by the caller. + * Performance: time=O(1) + * Rational: create_from_sorted_array takes O(numvalues) time. + * By taking ownership of the array, we save a malloc and + * memcpy, and possibly a free (if the caller is done with the array). + */ + void create_steal_sorted_array(omtdata_t **const values, + const uint32_t numvalues, + const uint32_t new_capacity); + + /** + * Effect: Create a new OMT, storing it in *newomt. + * The values to the right of index (starting at index) are moved to *newomt. + * Requires: newomt != NULL + * Returns + * 0 success, + * EINVAL if index > toku_omt_size(omt) + * On nonzero return, omt and *newomt are unmodified. + * Performance: time=O(n) + * Rationale: We don't need a split-evenly operation. We need to split items + * so that their total sizes are even, and other similar splitting criteria. + * It's easy to split evenly by calling size(), and dividing by two. + */ + __attribute__((nonnull)) int split_at(omt *const newomt, const uint32_t idx); + + /** + * Effect: Appends leftomt and rightomt to produce a new omt. + * Creates this as the new omt. + * leftomt and rightomt are destroyed. + * Performance: time=O(n) is acceptable, but one can imagine implementations + * that are O(\log n) worst-case. + */ + __attribute__((nonnull)) void merge(omt *const leftomt, omt *const rightomt); + + /** + * Effect: Creates a copy of an omt. + * Creates this as the clone. + * Each element is copied directly. If they are pointers, the underlying + * data is not duplicated. Performance: O(n) or the running time of + * fill_array_with_subtree_values() + */ + void clone(const omt &src); + + /** + * Effect: Set the tree to be empty. + * Note: Will not reallocate or resize any memory. + * Performance: time=O(1) + */ + void clear(void); + + /** + * Effect: Destroy an OMT, freeing all its memory. 
+ * If the values being stored are pointers, their underlying data is not + * freed. See free_items() Those values may be freed before or after calling + * toku_omt_destroy. Rationale: Returns no values since free() cannot fail. + * Rationale: Does not free the underlying pointers to reduce complexity. + * Performance: time=O(1) + */ + void destroy(void); + + /** + * Effect: return |this|. + * Performance: time=O(1) + */ + uint32_t size(void) const; + + /** + * Effect: Insert value into the OMT. + * If there is some i such that $h(V_i, v)=0$ then returns DB_KEYEXIST. + * Otherwise, let i be the minimum value such that $h(V_i, v)>0$. + * If no such i exists, then let i be |V| + * Then this has the same effect as + * insert_at(tree, value, i); + * If idx!=NULL then i is stored in *idx + * Requires: The signum of h must be monotonically increasing. + * Returns: + * 0 success + * DB_KEYEXIST the key is present (h was equal to zero for some value) + * On nonzero return, omt is unchanged. + * Performance: time=O(\log N) amortized. + * Rationale: Some future implementation may be O(\log N) worst-case time, but + * O(\log N) amortized is good enough for now. + */ + template + int insert(const omtdata_t &value, const omtcmp_t &v, uint32_t *const idx); + + /** + * Effect: Increases indexes of all items at slot >= idx by 1. + * Insert value into the position at idx. + * Returns: + * 0 success + * EINVAL if idx > this->size() + * On error, omt is unchanged. + * Performance: time=O(\log N) amortized time. + * Rationale: Some future implementation may be O(\log N) worst-case time, but + * O(\log N) amortized is good enough for now. + */ + int insert_at(const omtdata_t &value, const uint32_t idx); + + /** + * Effect: Replaces the item at idx with value. + * Returns: + * 0 success + * EINVAL if idx>=this->size() + * On error, omt is unchanged. + * Performance: time=O(\log N) + * Rationale: The FT needs to be able to replace a value with another copy of + * the same value (allocated in a different location) + * + */ + int set_at(const omtdata_t &value, const uint32_t idx); + + /** + * Effect: Delete the item in slot idx. + * Decreases indexes of all items at slot > idx by 1. + * Returns + * 0 success + * EINVAL if idx>=this->size() + * On error, omt is unchanged. + * Rationale: To delete an item, first find its index using find or find_zero, + * then delete it. Performance: time=O(\log N) amortized. + */ + int delete_at(const uint32_t idx); + + /** + * Effect: Iterate over the values of the omt, from left to right, calling f + * on each value. The first argument passed to f is a ref-to-const of the + * value stored in the omt. The second argument passed to f is the index of + * the value. The third argument passed to f is iterate_extra. The indices run + * from 0 (inclusive) to this->size() (exclusive). Requires: f != NULL + * Returns: + * If f ever returns nonzero, then the iteration stops, and the value + * returned by f is returned by iterate. If f always returns zero, then + * iterate returns 0. Requires: Don't modify the omt while running. (E.g., f + * may not insert or delete values from the omt.) Performance: time=O(i+\log + * N) where i is the number of times f is called, and N is the number of + * elements in the omt. Rationale: Although the functional iterator requires + * defining another function (as opposed to C++ style iterator), it is much + * easier to read. Rationale: We may at some point use functors, but for now + * this is a smaller change from the old OMT. 
+ */ + template + int iterate(iterate_extra_t *const iterate_extra) const; + + /** + * Effect: Iterate over the values of the omt, from left to right, calling f + * on each value. The first argument passed to f is a ref-to-const of the + * value stored in the omt. The second argument passed to f is the index of + * the value. The third argument passed to f is iterate_extra. The indices run + * from 0 (inclusive) to this->size() (exclusive). We will iterate only over + * [left,right) + * + * Requires: left <= right + * Requires: f != NULL + * Returns: + * EINVAL if right > this->size() + * If f ever returns nonzero, then the iteration stops, and the value + * returned by f is returned by iterate_on_range. If f always returns zero, + * then iterate_on_range returns 0. Requires: Don't modify the omt while + * running. (E.g., f may not insert or delete values from the omt.) + * Performance: time=O(i+\log N) where i is the number of times f is called, + * and N is the number of elements in the omt. Rational: Although the + * functional iterator requires defining another function (as opposed to C++ + * style iterator), it is much easier to read. + */ + template + int iterate_on_range(const uint32_t left, const uint32_t right, + iterate_extra_t *const iterate_extra) const; + + /** + * Effect: Iterate over the values of the omt, and mark the nodes that are + * visited. Other than the marks, this behaves the same as iterate_on_range. + * Requires: supports_marks == true + * Performance: time=O(i+\log N) where i is the number of times f is called, + * and N is the number of elements in the omt. Notes: This function MAY be + * called concurrently by multiple threads, but not concurrently with any + * other non-const function. + */ + template + int iterate_and_mark_range(const uint32_t left, const uint32_t right, + iterate_extra_t *const iterate_extra); + + /** + * Effect: Iterate over the values of the omt, from left to right, calling f + * on each value whose node has been marked. Other than the marks, this + * behaves the same as iterate. Requires: supports_marks == true Performance: + * time=O(i+\log N) where i is the number of times f is called, and N is the + * number of elements in the omt. + */ + template + int iterate_over_marked(iterate_extra_t *const iterate_extra) const; + + /** + * Effect: Delete all elements from the omt, whose nodes have been marked. + * Requires: supports_marks == true + * Performance: time=O(N + i\log N) where i is the number of marked elements, + * {c,sh}ould be faster + */ + void delete_all_marked(void); + + /** + * Effect: Verify that the internal state of the marks in the tree are + * self-consistent. Crashes the system if the marks are in a bad state. + * Requires: supports_marks == true + * Performance: time=O(N) + * Notes: + * Even though this is a const function, it requires exclusive access. + * Rationale: + * The current implementation of the marks relies on a sort of + * "cache" bit representing the state of bits below it in the tree. + * This allows glass-box testing that these bits are correct. + */ + void verify_marks_consistent(void) const; + + /** + * Effect: None + * Returns whether there are any marks in the tree. + */ + bool has_marks(void) const; + + /** + * Effect: Iterate over the values of the omt, from left to right, calling f + * on each value. The first argument passed to f is a pointer to the value + * stored in the omt. The second argument passed to f is the index of the + * value. The third argument passed to f is iterate_extra. 
The indices run + * from 0 (inclusive) to this->size() (exclusive). Requires: same as for + * iterate() Returns: same as for iterate() Performance: same as for iterate() + * Rationale: In general, most iterators should use iterate() since they + * should not modify the data stored in the omt. This function is for + * iterators which need to modify values (for example, free_items). Rationale: + * We assume if you are transforming the data in place, you want to do it to + * everything at once, so there is not yet an iterate_on_range_ptr (but there + * could be). + */ + template + void iterate_ptr(iterate_extra_t *const iterate_extra); + + /** + * Effect: Set *value=V_idx + * Returns + * 0 success + * EINVAL if index>=toku_omt_size(omt) + * On nonzero return, *value is unchanged + * Performance: time=O(\log N) + */ + int fetch(const uint32_t idx, omtdataout_t *const value) const; + + /** + * Effect: Find the smallest i such that h(V_i, extra)>=0 + * If there is such an i and h(V_i,extra)==0 then set *idxp=i, set *value = + * V_i, and return 0. If there is such an i and h(V_i,extra)>0 then set + * *idxp=i and return DB_NOTFOUND. If there is no such i then set + * *idx=this->size() and return DB_NOTFOUND. Note: value is of type + * omtdataout_t, which may be of type (omtdata_t) or (omtdata_t *) but is + * fixed by the instantiation. If it is the value type, then the value is + * copied out (even if the value type is a pointer to something else) If it is + * the pointer type, then *value is set to a pointer to the data within the + * omt. This is determined by the type of the omt as initially declared. If + * the omt is declared as omt, then foo_t's will be stored and foo_t's + * will be returned by find and related functions. If the omt is declared as + * omt, then foo_t's will be stored, and pointers to the + * stored items will be returned by find and related functions. Rationale: + * Structs too small for malloc should be stored directly in the omt. + * These structs may need to be edited as they exist inside the omt, so we + * need a way to get a pointer within the omt. Using separate functions for + * returning pointers and values increases code duplication and reduces + * type-checking. That also reduces the ability of the creator of a data + * structure to give advice to its future users. Slight overloading in this + * case seemed to provide a better API and better type checking. + */ + template + int find_zero(const omtcmp_t &extra, omtdataout_t *const value, + uint32_t *const idxp) const; + + /** + * Effect: + * If direction >0 then find the smallest i such that h(V_i,extra)>0. + * If direction <0 then find the largest i such that h(V_i,extra)<0. + * (Direction may not be equal to zero.) + * If value!=NULL then store V_i in *value + * If idxp!=NULL then store i in *idxp. + * Requires: The signum of h is monotically increasing. + * Returns + * 0 success + * DB_NOTFOUND no such value is found. + * On nonzero return, *value and *idxp are unchanged + * Performance: time=O(\log N) + * Rationale: + * Here's how to use the find function to find various things + * Cases for find: + * find first value: ( h(v)=+1, direction=+1 ) + * find last value ( h(v)=-1, direction=-1 ) + * find first X ( h(v)=(v< x) ? -1 : 1 direction=+1 ) + * find last X ( h(v)=(v<=x) ? -1 : 1 direction=-1 ) + * find X or successor to X ( same as find first X. ) + * + * Rationale: To help understand heaviside functions and behavor of find: + * There are 7 kinds of heaviside functions. 
+ * The signus of the h must be monotonically increasing. + * Given a function of the following form, A is the element + * returned for direction>0, B is the element returned + * for direction<0, C is the element returned for + * direction==0 (see find_zero) (with a return of 0), and D is the element + * returned for direction==0 (see find_zero) with a return of DB_NOTFOUND. + * If any of A, B, or C are not found, then asking for the + * associated direction will return DB_NOTFOUND. + * See find_zero for more information. + * + * Let the following represent the signus of the heaviside function. + * + * -...- + * A + * D + * + * +...+ + * B + * D + * + * 0...0 + * C + * + * -...-0...0 + * AC + * + * 0...0+...+ + * C B + * + * -...-+...+ + * AB + * D + * + * -...-0...0+...+ + * AC B + */ + template + int find(const omtcmp_t &extra, int direction, omtdataout_t *const value, + uint32_t *const idxp) const; + + /** + * Effect: Return the size (in bytes) of the omt, as it resides in main + * memory. If the data stored are pointers, don't include the size of what + * they all point to. + */ + size_t memory_size(void); + + private: + typedef uint32_t node_idx; + typedef omt_internal::subtree_templated subtree; + typedef omt_internal::omt_node_templated omt_node; + ENSURE_POD(subtree); + + struct omt_array { + uint32_t start_idx; + uint32_t num_values; + omtdata_t *values; + }; + + struct omt_tree { + subtree root; + uint32_t free_idx; + omt_node *nodes; + }; + + bool is_array; + uint32_t capacity; + union { + struct omt_array a; + struct omt_tree t; + } d; + + __attribute__((nonnull)) void unmark(const subtree &subtree, + const uint32_t index, + GrowableArray *const indexes); + + void create_internal_no_array(const uint32_t new_capacity); + + void create_internal(const uint32_t new_capacity); + + uint32_t nweight(const subtree &subtree) const; + + node_idx node_malloc(void); + + void node_free(const node_idx idx); + + void maybe_resize_array(const uint32_t n); + + __attribute__((nonnull)) void fill_array_with_subtree_values( + omtdata_t *const array, const subtree &subtree) const; + + void convert_to_array(void); + + __attribute__((nonnull)) void rebuild_from_sorted_array( + subtree *const subtree, const omtdata_t *const values, + const uint32_t numvalues); + + void convert_to_tree(void); + + void maybe_resize_or_convert(const uint32_t n); + + bool will_need_rebalance(const subtree &subtree, const int leftmod, + const int rightmod) const; + + __attribute__((nonnull)) void insert_internal( + subtree *const subtreep, const omtdata_t &value, const uint32_t idx, + subtree **const rebalance_subtree); + + void set_at_internal_array(const omtdata_t &value, const uint32_t idx); + + void set_at_internal(const subtree &subtree, const omtdata_t &value, + const uint32_t idx); + + void delete_internal(subtree *const subtreep, const uint32_t idx, + omt_node *const copyn, + subtree **const rebalance_subtree); + + template + int iterate_internal_array(const uint32_t left, const uint32_t right, + iterate_extra_t *const iterate_extra) const; + + template + void iterate_ptr_internal(const uint32_t left, const uint32_t right, + const subtree &subtree, const uint32_t idx, + iterate_extra_t *const iterate_extra); + + template + void iterate_ptr_internal_array(const uint32_t left, const uint32_t right, + iterate_extra_t *const iterate_extra); + + template + int iterate_internal(const uint32_t left, const uint32_t right, + const subtree &subtree, const uint32_t idx, + iterate_extra_t *const iterate_extra) const; + + 
template + int iterate_and_mark_range_internal(const uint32_t left, const uint32_t right, + const subtree &subtree, + const uint32_t idx, + iterate_extra_t *const iterate_extra); + + template + int iterate_over_marked_internal(const subtree &subtree, const uint32_t idx, + iterate_extra_t *const iterate_extra) const; + + uint32_t verify_marks_consistent_internal(const subtree &subtree, + const bool allow_marks) const; + + void fetch_internal_array(const uint32_t i, omtdataout_t *const value) const; + + void fetch_internal(const subtree &subtree, const uint32_t i, + omtdataout_t *const value) const; + + __attribute__((nonnull)) void fill_array_with_subtree_idxs( + node_idx *const array, const subtree &subtree) const; + + __attribute__((nonnull)) void rebuild_subtree_from_idxs( + subtree *const subtree, const node_idx *const idxs, + const uint32_t numvalues); + + __attribute__((nonnull)) void rebalance(subtree *const subtree); + + __attribute__((nonnull)) static void copyout(omtdata_t *const out, + const omt_node *const n); + + __attribute__((nonnull)) static void copyout(omtdata_t **const out, + omt_node *const n); + + __attribute__((nonnull)) static void copyout( + omtdata_t *const out, const omtdata_t *const stored_value_ptr); + + __attribute__((nonnull)) static void copyout( + omtdata_t **const out, omtdata_t *const stored_value_ptr); + + template + int find_internal_zero_array(const omtcmp_t &extra, omtdataout_t *const value, + uint32_t *const idxp) const; + + template + int find_internal_zero(const subtree &subtree, const omtcmp_t &extra, + omtdataout_t *const value, uint32_t *const idxp) const; + + template + int find_internal_plus_array(const omtcmp_t &extra, omtdataout_t *const value, + uint32_t *const idxp) const; + + template + int find_internal_plus(const subtree &subtree, const omtcmp_t &extra, + omtdataout_t *const value, uint32_t *const idxp) const; + + template + int find_internal_minus_array(const omtcmp_t &extra, + omtdataout_t *const value, + uint32_t *const idxp) const; + + template + int find_internal_minus(const subtree &subtree, const omtcmp_t &extra, + omtdataout_t *const value, + uint32_t *const idxp) const; +}; + +} // namespace toku + +// include the implementation here +#include "omt_impl.h" diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt_impl.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt_impl.h new file mode 100644 index 000000000..e77986716 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt_impl.h @@ -0,0 +1,1295 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . 
+ +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include + +#include "../db.h" +#include "../portability/memory.h" + +namespace toku { + +template +void omt::create(void) { + this->create_internal(2); + if (supports_marks) { + this->convert_to_tree(); + } +} + +template +void omt::create_no_array(void) { + if (!supports_marks) { + this->create_internal_no_array(0); + } else { + this->is_array = false; + this->capacity = 0; + this->d.t.nodes = nullptr; + this->d.t.root.set_to_null(); + this->d.t.free_idx = 0; + } +} + +template +void omt::create_from_sorted_array( + const omtdata_t *const values, const uint32_t numvalues) { + this->create_internal(numvalues); + memcpy(this->d.a.values, values, numvalues * (sizeof values[0])); + this->d.a.num_values = numvalues; + if (supports_marks) { + this->convert_to_tree(); + } +} + +template +void omt::create_steal_sorted_array( + omtdata_t **const values, const uint32_t numvalues, + const uint32_t new_capacity) { + paranoid_invariant_notnull(values); + this->create_internal_no_array(new_capacity); + this->d.a.num_values = numvalues; + this->d.a.values = *values; + *values = nullptr; + if (supports_marks) { + this->convert_to_tree(); + } +} + +template +int omt::split_at(omt *const newomt, + const uint32_t idx) { + barf_if_marked(*this); + paranoid_invariant_notnull(newomt); + if (idx > this->size()) { + return EINVAL; + } + this->convert_to_array(); + const uint32_t newsize = this->size() - idx; + newomt->create_from_sorted_array(&this->d.a.values[this->d.a.start_idx + idx], + newsize); + this->d.a.num_values = idx; + this->maybe_resize_array(idx); + if (supports_marks) { + this->convert_to_tree(); + } + return 0; +} + +template +void omt::merge(omt *const leftomt, + omt *const rightomt) { + barf_if_marked(*this); + paranoid_invariant_notnull(leftomt); + paranoid_invariant_notnull(rightomt); + const uint32_t leftsize = leftomt->size(); + const uint32_t rightsize = rightomt->size(); + const uint32_t newsize = leftsize + rightsize; + + if (leftomt->is_array) { + if (leftomt->capacity - + (leftomt->d.a.start_idx + leftomt->d.a.num_values) >= + rightsize) { + this->create_steal_sorted_array( + &leftomt->d.a.values, leftomt->d.a.num_values, leftomt->capacity); + this->d.a.start_idx = leftomt->d.a.start_idx; + } else { + 
this->create_internal(newsize); + memcpy(&this->d.a.values[0], &leftomt->d.a.values[leftomt->d.a.start_idx], + leftomt->d.a.num_values * (sizeof this->d.a.values[0])); + } + } else { + this->create_internal(newsize); + leftomt->fill_array_with_subtree_values(&this->d.a.values[0], + leftomt->d.t.root); + } + leftomt->destroy(); + this->d.a.num_values = leftsize; + + if (rightomt->is_array) { + memcpy(&this->d.a.values[this->d.a.start_idx + this->d.a.num_values], + &rightomt->d.a.values[rightomt->d.a.start_idx], + rightomt->d.a.num_values * (sizeof this->d.a.values[0])); + } else { + rightomt->fill_array_with_subtree_values( + &this->d.a.values[this->d.a.start_idx + this->d.a.num_values], + rightomt->d.t.root); + } + rightomt->destroy(); + this->d.a.num_values += rightsize; + paranoid_invariant(this->size() == newsize); + if (supports_marks) { + this->convert_to_tree(); + } +} + +template +void omt::clone(const omt &src) { + barf_if_marked(*this); + this->create_internal(src.size()); + if (src.is_array) { + memcpy(&this->d.a.values[0], &src.d.a.values[src.d.a.start_idx], + src.d.a.num_values * (sizeof this->d.a.values[0])); + } else { + src.fill_array_with_subtree_values(&this->d.a.values[0], src.d.t.root); + } + this->d.a.num_values = src.size(); + if (supports_marks) { + this->convert_to_tree(); + } +} + +template +void omt::clear(void) { + if (this->is_array) { + this->d.a.start_idx = 0; + this->d.a.num_values = 0; + } else { + this->d.t.root.set_to_null(); + this->d.t.free_idx = 0; + } +} + +template +void omt::destroy(void) { + this->clear(); + this->capacity = 0; + if (this->is_array) { + if (this->d.a.values != nullptr) { + toku_free(this->d.a.values); + } + this->d.a.values = nullptr; + } else { + if (this->d.t.nodes != nullptr) { + toku_free(this->d.t.nodes); + } + this->d.t.nodes = nullptr; + } +} + +template +uint32_t omt::size(void) const { + if (this->is_array) { + return this->d.a.num_values; + } else { + return this->nweight(this->d.t.root); + } +} + +template +template +int omt::insert(const omtdata_t &value, + const omtcmp_t &v, + uint32_t *const idx) { + int r; + uint32_t insert_idx; + + r = this->find_zero(v, nullptr, &insert_idx); + if (r == 0) { + if (idx) *idx = insert_idx; + return DB_KEYEXIST; + } + if (r != DB_NOTFOUND) return r; + + if ((r = this->insert_at(value, insert_idx))) return r; + if (idx) *idx = insert_idx; + + return 0; +} + +// The following 3 functions implement a static if for us. 
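[Editor's note] In other words, overload resolution on the supports_marks template parameter selects either a checking or a no-op version at compile time. Before the overloads themselves, a minimal standalone illustration of the pattern, independent of the omt types (Container and check_unmarked are invented names):

    #include <cassert>

    // Two overloads chosen at compile time on a bool template parameter,
    // the same "static if" trick the barf_if_marked overloads below rely on.
    template <bool supports_marks>
    struct Container {
      bool marked = false;
    };

    static inline void check_unmarked(const Container<false> &) {}  // no-op

    static inline void check_unmarked(const Container<true> &c) {
      assert(!c.marked);  // only the marking variant pays for the check
    }

    // check_unmarked(Container<false>{}) compiles to nothing;
    // check_unmarked(Container<true>{})  asserts the container holds no marks.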
+template +static void barf_if_marked(const omt &UU(omt)) { +} + +template +static void barf_if_marked(const omt &omt) { + invariant(!omt.has_marks()); +} + +template +bool omt::has_marks(void) const { + static_assert(supports_marks, "Does not support marks"); + if (this->d.t.root.is_null()) { + return false; + } + const omt_node &node = this->d.t.nodes[this->d.t.root.get_index()]; + return node.get_marks_below() || node.get_marked(); +} + +template +int omt::insert_at( + const omtdata_t &value, const uint32_t idx) { + barf_if_marked(*this); + if (idx > this->size()) { + return EINVAL; + } + + this->maybe_resize_or_convert(this->size() + 1); + if (this->is_array && idx != this->d.a.num_values && + (idx != 0 || this->d.a.start_idx == 0)) { + this->convert_to_tree(); + } + if (this->is_array) { + if (idx == this->d.a.num_values) { + this->d.a.values[this->d.a.start_idx + this->d.a.num_values] = value; + } else { + this->d.a.values[--this->d.a.start_idx] = value; + } + this->d.a.num_values++; + } else { + subtree *rebalance_subtree = nullptr; + this->insert_internal(&this->d.t.root, value, idx, &rebalance_subtree); + if (rebalance_subtree != nullptr) { + this->rebalance(rebalance_subtree); + } + } + return 0; +} + +template +int omt::set_at(const omtdata_t &value, + const uint32_t idx) { + barf_if_marked(*this); + if (idx >= this->size()) { + return EINVAL; + } + + if (this->is_array) { + this->set_at_internal_array(value, idx); + } else { + this->set_at_internal(this->d.t.root, value, idx); + } + return 0; +} + +template +int omt::delete_at( + const uint32_t idx) { + barf_if_marked(*this); + if (idx >= this->size()) { + return EINVAL; + } + + this->maybe_resize_or_convert(this->size() - 1); + if (this->is_array && idx != 0 && idx != this->d.a.num_values - 1) { + this->convert_to_tree(); + } + if (this->is_array) { + // Testing for 0 does not rule out it being the last entry. + // Test explicitly for num_values-1 + if (idx != this->d.a.num_values - 1) { + this->d.a.start_idx++; + } + this->d.a.num_values--; + } else { + subtree *rebalance_subtree = nullptr; + this->delete_internal(&this->d.t.root, idx, nullptr, &rebalance_subtree); + if (rebalance_subtree != nullptr) { + this->rebalance(rebalance_subtree); + } + } + return 0; +} + +template +template +int omt::iterate( + iterate_extra_t *const iterate_extra) const { + return this->iterate_on_range(0, this->size(), + iterate_extra); +} + +template +template +int omt::iterate_on_range( + const uint32_t left, const uint32_t right, + iterate_extra_t *const iterate_extra) const { + if (right > this->size()) { + return EINVAL; + } + if (left == right) { + return 0; + } + if (this->is_array) { + return this->iterate_internal_array(left, right, + iterate_extra); + } + return this->iterate_internal(left, right, this->d.t.root, + 0, iterate_extra); +} + +template +template +int omt::iterate_and_mark_range( + const uint32_t left, const uint32_t right, + iterate_extra_t *const iterate_extra) { + static_assert(supports_marks, "does not support marks"); + if (right > this->size()) { + return EINVAL; + } + if (left == right) { + return 0; + } + paranoid_invariant(!this->is_array); + return this->iterate_and_mark_range_internal( + left, right, this->d.t.root, 0, iterate_extra); +} + +// TODO: We can optimize this if we steal 3 bits. 1 bit: this node is +// marked. 1 bit: left subtree has marks. 1 bit: right subtree has marks. 
+template +template +int omt::iterate_over_marked( + iterate_extra_t *const iterate_extra) const { + static_assert(supports_marks, "does not support marks"); + paranoid_invariant(!this->is_array); + return this->iterate_over_marked_internal( + this->d.t.root, 0, iterate_extra); +} + +template +void omt::unmark( + const subtree &st, const uint32_t index, + GrowableArray *const indexes) { + if (st.is_null()) { + return; + } + omt_node &n = this->d.t.nodes[st.get_index()]; + const uint32_t index_root = index + this->nweight(n.left); + + const bool below = n.get_marks_below(); + if (below) { + this->unmark(n.left, index, indexes); + } + if (n.get_marked()) { + indexes->push(index_root); + } + n.clear_stolen_bits(); + if (below) { + this->unmark(n.right, index_root + 1, indexes); + } +} + +template +void omt::delete_all_marked(void) { + static_assert(supports_marks, "does not support marks"); + if (!this->has_marks()) { + return; + } + paranoid_invariant(!this->is_array); + GrowableArray marked_indexes; + marked_indexes.init(); + + // Remove all marks. + // We need to delete all the stolen bits before calling delete_at to + // prevent barfing. + this->unmark(this->d.t.root, 0, &marked_indexes); + + for (uint32_t i = 0; i < marked_indexes.get_size(); i++) { + // Delete from left to right, shift by number already deleted. + // Alternative is delete from right to left. + int r = this->delete_at(marked_indexes.fetch_unchecked(i) - i); + lazy_assert_zero(r); + } + marked_indexes.deinit(); + barf_if_marked(*this); +} + +template +uint32_t +omt::verify_marks_consistent_internal( + const subtree &st, const bool UU(allow_marks)) const { + if (st.is_null()) { + return 0; + } + const omt_node &node = this->d.t.nodes[st.get_index()]; + uint32_t num_marks = + verify_marks_consistent_internal(node.left, node.get_marks_below()); + num_marks += + verify_marks_consistent_internal(node.right, node.get_marks_below()); + if (node.get_marks_below()) { + paranoid_invariant(allow_marks); + paranoid_invariant(num_marks > 0); + } else { + // redundant with invariant below, but nice to have explicitly + paranoid_invariant(num_marks == 0); + } + if (node.get_marked()) { + paranoid_invariant(allow_marks); + ++num_marks; + } + return num_marks; +} + +template +void omt::verify_marks_consistent( + void) const { + static_assert(supports_marks, "does not support marks"); + paranoid_invariant(!this->is_array); + this->verify_marks_consistent_internal(this->d.t.root, true); +} + +template +template +void omt::iterate_ptr( + iterate_extra_t *const iterate_extra) { + if (this->is_array) { + this->iterate_ptr_internal_array(0, this->size(), + iterate_extra); + } else { + this->iterate_ptr_internal( + 0, this->size(), this->d.t.root, 0, iterate_extra); + } +} + +template +int omt::fetch( + const uint32_t idx, omtdataout_t *const value) const { + if (idx >= this->size()) { + return EINVAL; + } + if (this->is_array) { + this->fetch_internal_array(idx, value); + } else { + this->fetch_internal(this->d.t.root, idx, value); + } + return 0; +} + +template +template +int omt::find_zero( + const omtcmp_t &extra, omtdataout_t *const value, + uint32_t *const idxp) const { + uint32_t tmp_index; + uint32_t *const child_idxp = (idxp != nullptr) ? 
idxp : &tmp_index; + int r; + if (this->is_array) { + r = this->find_internal_zero_array(extra, value, child_idxp); + } else { + r = this->find_internal_zero(this->d.t.root, extra, value, + child_idxp); + } + return r; +} + +template +template +int omt::find( + const omtcmp_t &extra, int direction, omtdataout_t *const value, + uint32_t *const idxp) const { + uint32_t tmp_index; + uint32_t *const child_idxp = (idxp != nullptr) ? idxp : &tmp_index; + paranoid_invariant(direction != 0); + if (direction < 0) { + if (this->is_array) { + return this->find_internal_minus_array(extra, value, + child_idxp); + } else { + return this->find_internal_minus(this->d.t.root, extra, + value, child_idxp); + } + } else { + if (this->is_array) { + return this->find_internal_plus_array(extra, value, + child_idxp); + } else { + return this->find_internal_plus(this->d.t.root, extra, value, + child_idxp); + } + } +} + +template +size_t omt::memory_size(void) { + if (this->is_array) { + return (sizeof *this) + this->capacity * (sizeof this->d.a.values[0]); + } + return (sizeof *this) + this->capacity * (sizeof this->d.t.nodes[0]); +} + +template +void omt::create_internal_no_array( + const uint32_t new_capacity) { + this->is_array = true; + this->d.a.start_idx = 0; + this->d.a.num_values = 0; + this->d.a.values = nullptr; + this->capacity = new_capacity; +} + +template +void omt::create_internal( + const uint32_t new_capacity) { + this->create_internal_no_array(new_capacity); + XMALLOC_N(this->capacity, this->d.a.values); +} + +template +uint32_t omt::nweight( + const subtree &st) const { + if (st.is_null()) { + return 0; + } else { + return this->d.t.nodes[st.get_index()].weight; + } +} + +template +typename omt::node_idx +omt::node_malloc(void) { + paranoid_invariant(this->d.t.free_idx < this->capacity); + omt_node &n = this->d.t.nodes[this->d.t.free_idx]; + n.clear_stolen_bits(); + return this->d.t.free_idx++; +} + +template +void omt::node_free( + const node_idx UU(idx)) { + paranoid_invariant(idx < this->capacity); +} + +template +void omt::maybe_resize_array( + const uint32_t n) { + const uint32_t new_size = n <= 2 ? 4 : 2 * n; + const uint32_t room = this->capacity - this->d.a.start_idx; + + if (room < n || this->capacity / 2 >= new_size) { + omtdata_t *XMALLOC_N(new_size, tmp_values); + if (this->d.a.num_values) { + memcpy(tmp_values, &this->d.a.values[this->d.a.start_idx], + this->d.a.num_values * (sizeof tmp_values[0])); + } + this->d.a.start_idx = 0; + this->capacity = new_size; + toku_free(this->d.a.values); + this->d.a.values = tmp_values; + } +} + +template +void omt::fill_array_with_subtree_values(omtdata_t *const array, + const subtree &st) + const { + if (st.is_null()) return; + const omt_node &tree = this->d.t.nodes[st.get_index()]; + this->fill_array_with_subtree_values(&array[0], tree.left); + array[this->nweight(tree.left)] = tree.value; + this->fill_array_with_subtree_values(&array[this->nweight(tree.left) + 1], + tree.right); +} + +template +void omt::convert_to_array(void) { + if (!this->is_array) { + const uint32_t num_values = this->size(); + uint32_t new_size = 2 * num_values; + new_size = new_size < 4 ? 
4 : new_size; + + omtdata_t *XMALLOC_N(new_size, tmp_values); + this->fill_array_with_subtree_values(tmp_values, this->d.t.root); + toku_free(this->d.t.nodes); + this->is_array = true; + this->capacity = new_size; + this->d.a.num_values = num_values; + this->d.a.values = tmp_values; + this->d.a.start_idx = 0; + } +} + +template +void omt::rebuild_from_sorted_array( + subtree *const st, const omtdata_t *const values, + const uint32_t numvalues) { + if (numvalues == 0) { + st->set_to_null(); + } else { + const uint32_t halfway = numvalues / 2; + const node_idx newidx = this->node_malloc(); + omt_node *const newnode = &this->d.t.nodes[newidx]; + newnode->weight = numvalues; + newnode->value = values[halfway]; + st->set_index(newidx); + // update everything before the recursive calls so the second call + // can be a tail call. + this->rebuild_from_sorted_array(&newnode->left, &values[0], halfway); + this->rebuild_from_sorted_array(&newnode->right, &values[halfway + 1], + numvalues - (halfway + 1)); + } +} + +template +void omt::convert_to_tree(void) { + if (this->is_array) { + const uint32_t num_nodes = this->size(); + uint32_t new_size = num_nodes * 2; + new_size = new_size < 4 ? 4 : new_size; + + omt_node *XMALLOC_N(new_size, new_nodes); + omtdata_t *const values = this->d.a.values; + omtdata_t *const tmp_values = &values[this->d.a.start_idx]; + this->is_array = false; + this->d.t.nodes = new_nodes; + this->capacity = new_size; + this->d.t.free_idx = 0; + this->d.t.root.set_to_null(); + this->rebuild_from_sorted_array(&this->d.t.root, tmp_values, num_nodes); + toku_free(values); + } +} + +template +void omt::maybe_resize_or_convert( + const uint32_t n) { + if (this->is_array) { + this->maybe_resize_array(n); + } else { + const uint32_t new_size = n <= 2 ? 4 : 2 * n; + const uint32_t num_nodes = this->nweight(this->d.t.root); + if ((this->capacity / 2 >= new_size) || + (this->d.t.free_idx >= this->capacity && num_nodes < n) || + (this->capacity < n)) { + this->convert_to_array(); + // if we had a free list, the "supports_marks" version could + // just resize, as it is now, we have to convert to and back + // from an array. + if (supports_marks) { + this->convert_to_tree(); + } + } + } +} + +template +bool omt::will_need_rebalance( + const subtree &st, const int leftmod, const int rightmod) const { + if (st.is_null()) { + return false; + } + const omt_node &n = this->d.t.nodes[st.get_index()]; + // one of the 1's is for the root. 
+ // the other is to take ceil(n/2) + const uint32_t weight_left = this->nweight(n.left) + leftmod; + const uint32_t weight_right = this->nweight(n.right) + rightmod; + return ((1 + weight_left < (1 + 1 + weight_right) / 2) || + (1 + weight_right < (1 + 1 + weight_left) / 2)); +} + +template +void omt::insert_internal( + subtree *const subtreep, const omtdata_t &value, const uint32_t idx, + subtree **const rebalance_subtree) { + if (subtreep->is_null()) { + paranoid_invariant_zero(idx); + const node_idx newidx = this->node_malloc(); + omt_node *const newnode = &this->d.t.nodes[newidx]; + newnode->weight = 1; + newnode->left.set_to_null(); + newnode->right.set_to_null(); + newnode->value = value; + subtreep->set_index(newidx); + } else { + omt_node &n = this->d.t.nodes[subtreep->get_index()]; + n.weight++; + if (idx <= this->nweight(n.left)) { + if (*rebalance_subtree == nullptr && + this->will_need_rebalance(*subtreep, 1, 0)) { + *rebalance_subtree = subtreep; + } + this->insert_internal(&n.left, value, idx, rebalance_subtree); + } else { + if (*rebalance_subtree == nullptr && + this->will_need_rebalance(*subtreep, 0, 1)) { + *rebalance_subtree = subtreep; + } + const uint32_t sub_index = idx - this->nweight(n.left) - 1; + this->insert_internal(&n.right, value, sub_index, rebalance_subtree); + } + } +} + +template +void omt::set_at_internal_array( + const omtdata_t &value, const uint32_t idx) { + this->d.a.values[this->d.a.start_idx + idx] = value; +} + +template +void omt::set_at_internal( + const subtree &st, const omtdata_t &value, const uint32_t idx) { + paranoid_invariant(!st.is_null()); + omt_node &n = this->d.t.nodes[st.get_index()]; + const uint32_t leftweight = this->nweight(n.left); + if (idx < leftweight) { + this->set_at_internal(n.left, value, idx); + } else if (idx == leftweight) { + n.value = value; + } else { + this->set_at_internal(n.right, value, idx - leftweight - 1); + } +} + +template +void omt::delete_internal( + subtree *const subtreep, const uint32_t idx, omt_node *const copyn, + subtree **const rebalance_subtree) { + paranoid_invariant_notnull(subtreep); + paranoid_invariant_notnull(rebalance_subtree); + paranoid_invariant(!subtreep->is_null()); + omt_node &n = this->d.t.nodes[subtreep->get_index()]; + const uint32_t leftweight = this->nweight(n.left); + if (idx < leftweight) { + n.weight--; + if (*rebalance_subtree == nullptr && + this->will_need_rebalance(*subtreep, -1, 0)) { + *rebalance_subtree = subtreep; + } + this->delete_internal(&n.left, idx, copyn, rebalance_subtree); + } else if (idx == leftweight) { + if (n.left.is_null()) { + const uint32_t oldidx = subtreep->get_index(); + *subtreep = n.right; + if (copyn != nullptr) { + copyn->value = n.value; + } + this->node_free(oldidx); + } else if (n.right.is_null()) { + const uint32_t oldidx = subtreep->get_index(); + *subtreep = n.left; + if (copyn != nullptr) { + copyn->value = n.value; + } + this->node_free(oldidx); + } else { + if (*rebalance_subtree == nullptr && + this->will_need_rebalance(*subtreep, 0, -1)) { + *rebalance_subtree = subtreep; + } + // don't need to copy up value, it's only used by this + // next call, and when that gets to the bottom there + // won't be any more recursion + n.weight--; + this->delete_internal(&n.right, 0, &n, rebalance_subtree); + } + } else { + n.weight--; + if (*rebalance_subtree == nullptr && + this->will_need_rebalance(*subtreep, 0, -1)) { + *rebalance_subtree = subtreep; + } + this->delete_internal(&n.right, idx - leftweight - 1, copyn, + rebalance_subtree); + } +} + 
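[Editor's note] The will_need_rebalance() predicate above encodes the weight-balance rule for this tree: counting the node itself on each side, a subtree is flagged for rebuilding when one side would fall below roughly half of the other after the pending insert or delete (the leftmod/rightmod adjustment). A standalone restatement with a few worked values, as a sketch:

    #include <cstdint>

    // Mirrors the test above, with the leftmod/rightmod adjustment already
    // folded into the two weights.
    static bool needs_rebalance(uint32_t weight_left, uint32_t weight_right) {
      return (1 + weight_left < (1 + 1 + weight_right) / 2) ||
             (1 + weight_right < (1 + 1 + weight_left) / 2);
    }

    // needs_rebalance(0, 1) == false : a single child opposite an empty side is fine.
    // needs_rebalance(0, 2) == true  : two nodes opposite an empty subtree trip it.
    // needs_rebalance(3, 8) == true  : 1 + 3 < (1 + 1 + 8) / 2.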
+template +template +int omt::iterate_internal_array( + const uint32_t left, const uint32_t right, + iterate_extra_t *const iterate_extra) const { + int r; + for (uint32_t i = left; i < right; ++i) { + r = f(this->d.a.values[this->d.a.start_idx + i], i, iterate_extra); + if (r != 0) { + return r; + } + } + return 0; +} + +template +template +void omt::iterate_ptr_internal( + const uint32_t left, const uint32_t right, const subtree &st, + const uint32_t idx, iterate_extra_t *const iterate_extra) { + if (!st.is_null()) { + omt_node &n = this->d.t.nodes[st.get_index()]; + const uint32_t idx_root = idx + this->nweight(n.left); + if (left < idx_root) { + this->iterate_ptr_internal(left, right, n.left, idx, + iterate_extra); + } + if (left <= idx_root && idx_root < right) { + int r = f(&n.value, idx_root, iterate_extra); + lazy_assert_zero(r); + } + if (idx_root + 1 < right) { + this->iterate_ptr_internal( + left, right, n.right, idx_root + 1, iterate_extra); + } + } +} + +template +template +void omt::iterate_ptr_internal_array( + const uint32_t left, const uint32_t right, + iterate_extra_t *const iterate_extra) { + for (uint32_t i = left; i < right; ++i) { + int r = f(&this->d.a.values[this->d.a.start_idx + i], i, iterate_extra); + lazy_assert_zero(r); + } +} + +template +template +int omt::iterate_internal( + const uint32_t left, const uint32_t right, const subtree &st, + const uint32_t idx, iterate_extra_t *const iterate_extra) const { + if (st.is_null()) { + return 0; + } + int r; + const omt_node &n = this->d.t.nodes[st.get_index()]; + const uint32_t idx_root = idx + this->nweight(n.left); + if (left < idx_root) { + r = this->iterate_internal(left, right, n.left, idx, + iterate_extra); + if (r != 0) { + return r; + } + } + if (left <= idx_root && idx_root < right) { + r = f(n.value, idx_root, iterate_extra); + if (r != 0) { + return r; + } + } + if (idx_root + 1 < right) { + return this->iterate_internal( + left, right, n.right, idx_root + 1, iterate_extra); + } + return 0; +} + +template +template +int omt:: + iterate_and_mark_range_internal(const uint32_t left, const uint32_t right, + const subtree &st, const uint32_t idx, + iterate_extra_t *const iterate_extra) { + paranoid_invariant(!st.is_null()); + int r; + omt_node &n = this->d.t.nodes[st.get_index()]; + const uint32_t idx_root = idx + this->nweight(n.left); + if (left < idx_root && !n.left.is_null()) { + n.set_marks_below_bit(); + r = this->iterate_and_mark_range_internal( + left, right, n.left, idx, iterate_extra); + if (r != 0) { + return r; + } + } + if (left <= idx_root && idx_root < right) { + n.set_marked_bit(); + r = f(n.value, idx_root, iterate_extra); + if (r != 0) { + return r; + } + } + if (idx_root + 1 < right && !n.right.is_null()) { + n.set_marks_below_bit(); + return this->iterate_and_mark_range_internal( + left, right, n.right, idx_root + 1, iterate_extra); + } + return 0; +} + +template +template +int omt::iterate_over_marked_internal( + const subtree &st, const uint32_t idx, + iterate_extra_t *const iterate_extra) const { + if (st.is_null()) { + return 0; + } + int r; + const omt_node &n = this->d.t.nodes[st.get_index()]; + const uint32_t idx_root = idx + this->nweight(n.left); + if (n.get_marks_below()) { + r = this->iterate_over_marked_internal(n.left, idx, + iterate_extra); + if (r != 0) { + return r; + } + } + if (n.get_marked()) { + r = f(n.value, idx_root, iterate_extra); + if (r != 0) { + return r; + } + } + if (n.get_marks_below()) { + return this->iterate_over_marked_internal( + n.right, idx_root + 1, 
iterate_extra); + } + return 0; +} + +template +void omt::fetch_internal_array( + const uint32_t i, omtdataout_t *const value) const { + if (value != nullptr) { + copyout(value, &this->d.a.values[this->d.a.start_idx + i]); + } +} + +template +void omt::fetch_internal( + const subtree &st, const uint32_t i, omtdataout_t *const value) const { + omt_node &n = this->d.t.nodes[st.get_index()]; + const uint32_t leftweight = this->nweight(n.left); + if (i < leftweight) { + this->fetch_internal(n.left, i, value); + } else if (i == leftweight) { + if (value != nullptr) { + copyout(value, &n); + } + } else { + this->fetch_internal(n.right, i - leftweight - 1, value); + } +} + +template +void omt::fill_array_with_subtree_idxs( + node_idx *const array, const subtree &st) const { + if (!st.is_null()) { + const omt_node &tree = this->d.t.nodes[st.get_index()]; + this->fill_array_with_subtree_idxs(&array[0], tree.left); + array[this->nweight(tree.left)] = st.get_index(); + this->fill_array_with_subtree_idxs(&array[this->nweight(tree.left) + 1], + tree.right); + } +} + +template +void omt::rebuild_subtree_from_idxs( + subtree *const st, const node_idx *const idxs, const uint32_t numvalues) { + if (numvalues == 0) { + st->set_to_null(); + } else { + uint32_t halfway = numvalues / 2; + st->set_index(idxs[halfway]); + // node_idx newidx = idxs[halfway]; + omt_node &newnode = this->d.t.nodes[st->get_index()]; + newnode.weight = numvalues; + // value is already in there. + this->rebuild_subtree_from_idxs(&newnode.left, &idxs[0], halfway); + this->rebuild_subtree_from_idxs(&newnode.right, &idxs[halfway + 1], + numvalues - (halfway + 1)); + // n_idx = newidx; + } +} + +template +void omt::rebalance( + subtree *const st) { + node_idx idx = st->get_index(); + if (idx == this->d.t.root.get_index()) { + // Try to convert to an array. + // If this fails, (malloc) nothing will have changed. + // In the failure case we continue on to the standard rebalance + // algorithm. + this->convert_to_array(); + if (supports_marks) { + this->convert_to_tree(); + } + } else { + const omt_node &n = this->d.t.nodes[idx]; + node_idx *tmp_array; + size_t mem_needed = n.weight * (sizeof tmp_array[0]); + size_t mem_free = + (this->capacity - this->d.t.free_idx) * (sizeof this->d.t.nodes[0]); + bool malloced; + if (mem_needed <= mem_free) { + // There is sufficient free space at the end of the nodes array + // to hold enough node indexes to rebalance. 
+ malloced = false; + tmp_array = + reinterpret_cast(&this->d.t.nodes[this->d.t.free_idx]); + } else { + malloced = true; + XMALLOC_N(n.weight, tmp_array); + } + this->fill_array_with_subtree_idxs(tmp_array, *st); + this->rebuild_subtree_from_idxs(st, tmp_array, n.weight); + if (malloced) toku_free(tmp_array); + } +} + +template +void omt::copyout( + omtdata_t *const out, const omt_node *const n) { + *out = n->value; +} + +template +void omt::copyout( + omtdata_t **const out, omt_node *const n) { + *out = &n->value; +} + +template +void omt::copyout( + omtdata_t *const out, const omtdata_t *const stored_value_ptr) { + *out = *stored_value_ptr; +} + +template +void omt::copyout( + omtdata_t **const out, omtdata_t *const stored_value_ptr) { + *out = stored_value_ptr; +} + +template +template +int omt::find_internal_zero_array( + const omtcmp_t &extra, omtdataout_t *const value, + uint32_t *const idxp) const { + paranoid_invariant_notnull(idxp); + uint32_t min = this->d.a.start_idx; + uint32_t limit = this->d.a.start_idx + this->d.a.num_values; + uint32_t best_pos = subtree::NODE_NULL; + uint32_t best_zero = subtree::NODE_NULL; + + while (min != limit) { + uint32_t mid = (min + limit) / 2; + int hv = h(this->d.a.values[mid], extra); + if (hv < 0) { + min = mid + 1; + } else if (hv > 0) { + best_pos = mid; + limit = mid; + } else { + best_zero = mid; + limit = mid; + } + } + if (best_zero != subtree::NODE_NULL) { + // Found a zero + if (value != nullptr) { + copyout(value, &this->d.a.values[best_zero]); + } + *idxp = best_zero - this->d.a.start_idx; + return 0; + } + if (best_pos != subtree::NODE_NULL) + *idxp = best_pos - this->d.a.start_idx; + else + *idxp = this->d.a.num_values; + return DB_NOTFOUND; +} + +template +template +int omt::find_internal_zero( + const subtree &st, const omtcmp_t &extra, omtdataout_t *const value, + uint32_t *const idxp) const { + paranoid_invariant_notnull(idxp); + if (st.is_null()) { + *idxp = 0; + return DB_NOTFOUND; + } + omt_node &n = this->d.t.nodes[st.get_index()]; + int hv = h(n.value, extra); + if (hv < 0) { + int r = this->find_internal_zero(n.right, extra, value, idxp); + *idxp += this->nweight(n.left) + 1; + return r; + } else if (hv > 0) { + return this->find_internal_zero(n.left, extra, value, idxp); + } else { + int r = this->find_internal_zero(n.left, extra, value, idxp); + if (r == DB_NOTFOUND) { + *idxp = this->nweight(n.left); + if (value != nullptr) { + copyout(value, &n); + } + r = 0; + } + return r; + } +} + +template +template +int omt::find_internal_plus_array( + const omtcmp_t &extra, omtdataout_t *const value, + uint32_t *const idxp) const { + paranoid_invariant_notnull(idxp); + uint32_t min = this->d.a.start_idx; + uint32_t limit = this->d.a.start_idx + this->d.a.num_values; + uint32_t best = subtree::NODE_NULL; + + while (min != limit) { + const uint32_t mid = (min + limit) / 2; + const int hv = h(this->d.a.values[mid], extra); + if (hv > 0) { + best = mid; + limit = mid; + } else { + min = mid + 1; + } + } + if (best == subtree::NODE_NULL) { + return DB_NOTFOUND; + } + if (value != nullptr) { + copyout(value, &this->d.a.values[best]); + } + *idxp = best - this->d.a.start_idx; + return 0; +} + +template +template +int omt::find_internal_plus( + const subtree &st, const omtcmp_t &extra, omtdataout_t *const value, + uint32_t *const idxp) const { + paranoid_invariant_notnull(idxp); + if (st.is_null()) { + return DB_NOTFOUND; + } + omt_node *const n = &this->d.t.nodes[st.get_index()]; + int hv = h(n->value, extra); + int r; + if (hv > 0) { 
+ r = this->find_internal_plus(n->left, extra, value, idxp); + if (r == DB_NOTFOUND) { + *idxp = this->nweight(n->left); + if (value != nullptr) { + copyout(value, n); + } + r = 0; + } + } else { + r = this->find_internal_plus(n->right, extra, value, idxp); + if (r == 0) { + *idxp += this->nweight(n->left) + 1; + } + } + return r; +} + +template +template +int omt::find_internal_minus_array( + const omtcmp_t &extra, omtdataout_t *const value, + uint32_t *const idxp) const { + paranoid_invariant_notnull(idxp); + uint32_t min = this->d.a.start_idx; + uint32_t limit = this->d.a.start_idx + this->d.a.num_values; + uint32_t best = subtree::NODE_NULL; + + while (min != limit) { + const uint32_t mid = (min + limit) / 2; + const int hv = h(this->d.a.values[mid], extra); + if (hv < 0) { + best = mid; + min = mid + 1; + } else { + limit = mid; + } + } + if (best == subtree::NODE_NULL) { + return DB_NOTFOUND; + } + if (value != nullptr) { + copyout(value, &this->d.a.values[best]); + } + *idxp = best - this->d.a.start_idx; + return 0; +} + +template +template +int omt::find_internal_minus( + const subtree &st, const omtcmp_t &extra, omtdataout_t *const value, + uint32_t *const idxp) const { + paranoid_invariant_notnull(idxp); + if (st.is_null()) { + return DB_NOTFOUND; + } + omt_node *const n = &this->d.t.nodes[st.get_index()]; + int hv = h(n->value, extra); + if (hv < 0) { + int r = + this->find_internal_minus(n->right, extra, value, idxp); + if (r == 0) { + *idxp += this->nweight(n->left) + 1; + } else if (r == DB_NOTFOUND) { + *idxp = this->nweight(n->left); + if (value != nullptr) { + copyout(value, n); + } + r = 0; + } + return r; + } else { + return this->find_internal_minus(n->left, extra, value, idxp); + } +} +} // namespace toku diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/partitioned_counter.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/partitioned_counter.h new file mode 100644 index 000000000..f20eeedf2 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/partitioned_counter.h @@ -0,0 +1,165 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . 
+ +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +// Overview: A partitioned_counter provides a counter that can be incremented +// and the running sum can be read at any time. +// We assume that increments are frequent, whereas reading is infrequent. +// Implementation hint: Use thread-local storage so each thread increments its +// own data. The increment does not require a lock or atomic operation. +// Reading the data can be performed by iterating over the thread-local +// versions, summing them up. The data structure also includes a sum for all +// the threads that have died. Use a pthread_key to create the thread-local +// versions. When a thread finishes, the system calls pthread_key destructor +// which can add that thread's copy into the sum_of_dead counter. +// Rationale: For statistics such as are found in engine status, we need a +// counter that requires no cache misses to increment. We've seen significant +// performance speedups by removing certain counters. Rather than removing +// those statistics, we would like to just make the counter fast. We generally +// increment the counters frequently, and want to fetch the values +// infrequently. The counters are monotonic. The counters can be split into +// many counters, which can be summed up at the end. We don't care if we get +// slightly out-of-date counter sums when we read the counter. We don't care +// if there is a race on reading the a counter +// variable and incrementing. +// See tests/test_partitioned_counter.c for some performance measurements. +// Operations: +// create_partitioned_counter Create a counter initialized to zero. +// destroy_partitioned_counter Destroy it. +// increment_partitioned_counter Increment it. This is the frequent +// operation. read_partitioned_counter Get the current value. This is +// infrequent. +// See partitioned_counter.cc for the abstraction function and representation +// invariant. +// +// The google style guide says to avoid using constructors, and it appears that +// constructors may have broken all the tests, because they called +// pthread_key_create before the key was actually created. So the google style +// guide may have some wisdom there... +// +// This version does not use constructors, essentially reverrting to the google +// C++ style guide. +// + +// The old C interface. This required a bunch of explicit +// ___attribute__((__destructor__)) functions to remember to destroy counters at +// the end. +#if defined(__cplusplus) +extern "C" { +#endif + +typedef struct partitioned_counter *PARTITIONED_COUNTER; +PARTITIONED_COUNTER create_partitioned_counter(void); +// Effect: Create a counter, initialized to zero. + +void destroy_partitioned_counter(PARTITIONED_COUNTER); +// Effect: Destroy the counter. No operations on that counter are permitted +// after this. 
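[Editor's note] As a companion to the design notes above, here is a minimal C++ sketch of the same idea: per-thread counts that are cheap to bump, a mutex-protected registry, and a sum-of-dead total that absorbs the counts of exited threads. It is not the PerconaFT implementation: it supports a single counter instance, assumes the counter outlives the threads that touch it, substitutes thread_local for the pthread_key machinery, and makes the per-thread count a relaxed atomic so the deliberately racy read stays well-defined. All names are invented for illustration.

    #include <atomic>
    #include <cstdint>
    #include <mutex>
    #include <unordered_set>

    class PartitionedCounterSketch {
     public:
      // Frequent path: touches only this thread's slot; no lock is taken after
      // the first increment on a given thread.
      void increment(uint64_t amount) {
        ThreadSlot &slot = local();
        slot.count.store(slot.count.load(std::memory_order_relaxed) + amount,
                         std::memory_order_relaxed);
      }

      // Infrequent path: sum the live per-thread counts plus everything
      // carried over from threads that have already exited.
      uint64_t read() {
        std::lock_guard<std::mutex> lock(mu_);
        uint64_t sum = sum_of_dead_;
        for (const ThreadSlot *slot : live_) {
          sum += slot->count.load(std::memory_order_relaxed);
        }
        return sum;
      }

     private:
      struct ThreadSlot {
        PartitionedCounterSketch *owner = nullptr;
        std::atomic<uint64_t> count{0};
        ~ThreadSlot() {
          // Runs at thread exit, standing in for the pthread_key destructor
          // described above.
          if (owner != nullptr) owner->retire(this);
        }
      };

      ThreadSlot &local() {
        thread_local ThreadSlot slot;  // one slot per thread (single counter only)
        if (slot.owner == nullptr) {
          slot.owner = this;
          std::lock_guard<std::mutex> lock(mu_);
          live_.insert(&slot);
        }
        return slot;
      }

      void retire(ThreadSlot *slot) {
        std::lock_guard<std::mutex> lock(mu_);
        sum_of_dead_ += slot->count.load(std::memory_order_relaxed);
        live_.erase(slot);
      }

      std::mutex mu_;
      uint64_t sum_of_dead_ = 0;               // counts from exited threads
      std::unordered_set<ThreadSlot *> live_;  // slots of live threads
    };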
+ +void increment_partitioned_counter(PARTITIONED_COUNTER, uint64_t amount); +// Effect: Increment the counter by amount. +// Requires: No overflows. This is a 64-bit unsigned counter. + +uint64_t read_partitioned_counter(PARTITIONED_COUNTER) + __attribute__((__visibility__("default"))); +// Effect: Return the current value of the counter. + +void partitioned_counters_init(void); +// Effect: Initialize any partitioned counters data structures that must be set +// up before any partitioned counters run. + +void partitioned_counters_destroy(void); +// Effect: Destroy any partitioned counters data structures. + +#if defined(__cplusplus) +}; +#endif + +#if 0 +#include + +#include "fttypes.h" + +// Used inside the PARTITIONED_COUNTER. +struct linked_list_head { + struct linked_list_element *first; +}; + + +class PARTITIONED_COUNTER { +public: + PARTITIONED_COUNTER(void); + // Effect: Construct a counter, initialized to zero. + + ~PARTITIONED_COUNTER(void); + // Effect: Destruct the counter. + + void increment(uint64_t amount); + // Effect: Increment the counter by amount. This is a 64-bit unsigned counter, and if you overflow it, you will get overflowed results (that is mod 2^64). + // Requires: Don't use this from a static constructor or destructor. + + uint64_t read(void); + // Effect: Read the sum. + // Requires: Don't use this from a static constructor or destructor. + +private: + uint64_t _sum_of_dead; // The sum of all thread-local counts from threads that have terminated. + pthread_key_t _key; // The pthread_key which gives us the hook to construct and destruct thread-local storage. + struct linked_list_head _ll_counter_head; // A linked list of all the thread-local information for this counter. + + // This function is used to destroy the thread-local part of the state when a thread terminates. + // But it's not the destructor for the local part of the counter, it's a destructor on a "dummy" key just so that we get a notification when a thread ends. + friend void destroy_thread_local_part_of_partitioned_counters (void *); +}; +#endif diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/status.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/status.h new file mode 100644 index 000000000..3fd0095d0 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/status.h @@ -0,0 +1,76 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. 
+ + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include "partitioned_counter.h" +// PORT2: #include + +#define TOKUFT_STATUS_INIT(array, k, c, t, l, inc) \ + do { \ + array.status[k].keyname = #k; \ + array.status[k].columnname = #c; \ + array.status[k].type = t; \ + array.status[k].legend = l; \ + constexpr_static_assert( \ + strcmp(#c, "NULL") && strcmp(#c, "0"), \ + "Use nullptr for no column name instead of NULL, 0, etc..."); \ + constexpr_static_assert( \ + (inc) == TOKU_ENGINE_STATUS || strcmp(#c, "nullptr"), \ + "Missing column name."); \ + array.status[k].include = \ + static_cast(inc); \ + if (t == STATUS_PARCOUNT) { \ + array.status[k].value.parcount = create_partitioned_counter(); \ + } \ + } while (0) diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc new file mode 100644 index 000000000..531165dea --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc @@ -0,0 +1,503 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#ifndef ROCKSDB_LITE +#ifndef OS_WIN + +#include "utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h" + +#include +#include +#include + +#include "monitoring/perf_context_imp.h" +#include "rocksdb/slice.h" +#include "rocksdb/utilities/transaction_db_mutex.h" +#include "test_util/sync_point.h" +#include "util/cast_util.h" +#include "util/hash.h" +#include "util/thread_local.h" +#include "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h" +#include "utilities/transactions/pessimistic_transaction_db.h" +#include "utilities/transactions/transaction_db_mutex_impl.h" + +namespace ROCKSDB_NAMESPACE { + +RangeLockManagerHandle* NewRangeLockManager( + std::shared_ptr mutex_factory) { + std::shared_ptr use_factory; + + if (mutex_factory) { + use_factory = mutex_factory; + } else { + use_factory.reset(new TransactionDBMutexFactoryImpl()); + } + return new RangeTreeLockManager(use_factory); +} + +static const char SUFFIX_INFIMUM = 0x0; +static const char SUFFIX_SUPREMUM = 0x1; + +// Convert Endpoint into an internal format used for storing it in locktree +// (DBT structure is used for passing endpoints to locktree and getting back) +void serialize_endpoint(const Endpoint& endp, std::string* buf) { + buf->push_back(endp.inf_suffix ? SUFFIX_SUPREMUM : SUFFIX_INFIMUM); + buf->append(endp.slice.data(), endp.slice.size()); +} + +// Decode the endpoint from the format it is stored in the locktree (DBT) to +// the one used outside: either Endpoint or EndpointWithString +template +void deserialize_endpoint(const DBT* dbt, EndpointStruct* endp) { + assert(dbt->size >= 1); + const char* dbt_data = (const char*)dbt->data; + char suffix = dbt_data[0]; + assert(suffix == SUFFIX_INFIMUM || suffix == SUFFIX_SUPREMUM); + endp->inf_suffix = (suffix == SUFFIX_SUPREMUM); + endp->slice = decltype(EndpointStruct::slice)(dbt_data + 1, dbt->size - 1); +} + +// Get a range lock on [start_key; end_key] range +Status RangeTreeLockManager::TryLock(PessimisticTransaction* txn, + uint32_t column_family_id, + const Endpoint& start_endp, + const Endpoint& end_endp, Env*, + bool exclusive) { + toku::lock_request request; + request.create(mutex_factory_); + DBT start_key_dbt, end_key_dbt; + + TEST_SYNC_POINT("RangeTreeLockManager::TryRangeLock:enter"); + std::string start_key; + std::string end_key; + serialize_endpoint(start_endp, &start_key); + serialize_endpoint(end_endp, &end_key); + + toku_fill_dbt(&start_key_dbt, start_key.data(), start_key.size()); + toku_fill_dbt(&end_key_dbt, end_key.data(), end_key.size()); + + auto lt = GetLockTreeForCF(column_family_id); + + // Put the key waited on into request's m_extra. See + // wait_callback_for_locktree for details. + std::string wait_key(start_endp.slice.data(), start_endp.slice.size()); + + request.set(lt.get(), (TXNID)txn, &start_key_dbt, &end_key_dbt, + exclusive ? toku::lock_request::WRITE : toku::lock_request::READ, + false /* not a big txn */, &wait_key); + + // This is for "periodically wake up and check if the wait is killed" feature + // which we are not using. + uint64_t killed_time_msec = 0; + uint64_t wait_time_msec = txn->GetLockTimeout(); + + if (wait_time_msec == static_cast(-1)) { + // The transaction has no wait timeout. lock_request::wait doesn't support + // this, it needs a number of milliseconds to wait. Pass it one year to + // be safe. 
+ wait_time_msec = uint64_t(1000) * 60 * 60 * 24 * 365; + } else { + // convert microseconds to milliseconds + wait_time_msec = (wait_time_msec + 500) / 1000; + } + + std::vector di_path; + request.m_deadlock_cb = [&](TXNID txnid, bool is_exclusive, + const DBT* start_dbt, const DBT* end_dbt) { + EndpointWithString start; + EndpointWithString end; + deserialize_endpoint(start_dbt, &start); + deserialize_endpoint(end_dbt, &end); + + di_path.push_back({txnid, column_family_id, is_exclusive, std::move(start), + std::move(end)}); + }; + + request.start(); + + const int r = request.wait(wait_time_msec, killed_time_msec, + nullptr, // killed_callback + wait_callback_for_locktree, nullptr); + + // Inform the txn that we are no longer waiting: + txn->ClearWaitingTxn(); + + request.destroy(); + switch (r) { + case 0: + break; // fall through + case DB_LOCK_NOTGRANTED: + return Status::TimedOut(Status::SubCode::kLockTimeout); + case TOKUDB_OUT_OF_LOCKS: + return Status::Busy(Status::SubCode::kLockLimit); + case DB_LOCK_DEADLOCK: { + std::reverse(di_path.begin(), di_path.end()); + dlock_buffer_.AddNewPath( + RangeDeadlockPath(di_path, request.get_start_time())); + return Status::Busy(Status::SubCode::kDeadlock); + } + default: + assert(0); + return Status::Busy(Status::SubCode::kLockLimit); + } + + return Status::OK(); +} + +// Wait callback that locktree library will call to inform us about +// the lock waits that are in progress. +void wait_callback_for_locktree(void*, toku::lock_wait_infos* infos) { + TEST_SYNC_POINT("RangeTreeLockManager::TryRangeLock:EnterWaitingTxn"); + for (auto wait_info : *infos) { + // As long as we hold the lock on the locktree's pending request queue + // this should be safe. + auto txn = (PessimisticTransaction*)wait_info.waiter; + auto cf_id = (ColumnFamilyId)wait_info.ltree->get_dict_id().dictid; + + autovector waitee_ids; + for (auto waitee : wait_info.waitees) { + waitee_ids.push_back(waitee); + } + txn->SetWaitingTxn(waitee_ids, cf_id, (std::string*)wait_info.m_extra); + } + + // Here we can assume that the locktree code will now wait for some lock + TEST_SYNC_POINT("RangeTreeLockManager::TryRangeLock:WaitingTxn"); +} + +void RangeTreeLockManager::UnLock(PessimisticTransaction* txn, + ColumnFamilyId column_family_id, + const std::string& key, Env*) { + auto locktree = GetLockTreeForCF(column_family_id); + std::string endp_image; + serialize_endpoint({key.data(), key.size(), false}, &endp_image); + + DBT key_dbt; + toku_fill_dbt(&key_dbt, endp_image.data(), endp_image.size()); + + toku::range_buffer range_buf; + range_buf.create(); + range_buf.append(&key_dbt, &key_dbt); + + locktree->release_locks((TXNID)txn, &range_buf); + range_buf.destroy(); + + toku::lock_request::retry_all_lock_requests( + locktree.get(), wait_callback_for_locktree, nullptr); +} + +void RangeTreeLockManager::UnLock(PessimisticTransaction* txn, + const LockTracker& tracker, Env*) { + const RangeTreeLockTracker* range_tracker = + static_cast(&tracker); + + RangeTreeLockTracker* range_trx_tracker = + static_cast(&txn->GetTrackedLocks()); + bool all_keys = (range_trx_tracker == range_tracker); + + // tracked_locks_->range_list may hold nullptr if the transaction has never + // acquired any locks. 
+ ((RangeTreeLockTracker*)range_tracker)->ReleaseLocks(this, txn, all_keys); +} + +int RangeTreeLockManager::CompareDbtEndpoints(void* arg, const DBT* a_key, + const DBT* b_key) { + const char* a = (const char*)a_key->data; + const char* b = (const char*)b_key->data; + + size_t a_len = a_key->size; + size_t b_len = b_key->size; + + size_t min_len = std::min(a_len, b_len); + + // Compare the values. The first byte encodes the endpoint type, its value + // is either SUFFIX_INFIMUM or SUFFIX_SUPREMUM. + Comparator* cmp = (Comparator*)arg; + int res = cmp->Compare(Slice(a + 1, min_len - 1), Slice(b + 1, min_len - 1)); + if (!res) { + if (b_len > min_len) { + // a is shorter; + if (a[0] == SUFFIX_INFIMUM) { + return -1; //"a is smaller" + } else { + // a is considered padded with 0xFF:FF:FF:FF... + return 1; // "a" is bigger + } + } else if (a_len > min_len) { + // the opposite of the above: b is shorter. + if (b[0] == SUFFIX_INFIMUM) { + return 1; //"b is smaller" + } else { + // b is considered padded with 0xFF:FF:FF:FF... + return -1; // "b" is bigger + } + } else { + // the lengths are equal (and the key values, too) + if (a[0] < b[0]) { + return -1; + } else if (a[0] > b[0]) { + return 1; + } else { + return 0; + } + } + } else { + return res; + } +} + +namespace { +void UnrefLockTreeMapsCache(void* ptr) { + // Called when a thread exits or a ThreadLocalPtr gets destroyed. + auto lock_tree_map_cache = static_cast< + std::unordered_map>*>( + ptr); + delete lock_tree_map_cache; +} +} // anonymous namespace + +RangeTreeLockManager::RangeTreeLockManager( + std::shared_ptr mutex_factory) + : mutex_factory_(mutex_factory), + ltree_lookup_cache_(new ThreadLocalPtr(&UnrefLockTreeMapsCache)), + dlock_buffer_(10) { + ltm_.create(on_create, on_destroy, on_escalate, nullptr, mutex_factory_); +} + +int RangeTreeLockManager::on_create(toku::locktree* lt, void* arg) { + // arg is a pointer to RangeTreeLockManager + lt->set_escalation_barrier_func(&OnEscalationBarrierCheck, arg); + return 0; +} + +bool RangeTreeLockManager::OnEscalationBarrierCheck(const DBT* a, const DBT* b, + void* extra) { + Endpoint a_endp, b_endp; + deserialize_endpoint(a, &a_endp); + deserialize_endpoint(b, &b_endp); + auto self = static_cast(extra); + return self->barrier_func_(a_endp, b_endp); +} + +void RangeTreeLockManager::SetRangeDeadlockInfoBufferSize( + uint32_t target_size) { + dlock_buffer_.Resize(target_size); +} + +void RangeTreeLockManager::Resize(uint32_t target_size) { + SetRangeDeadlockInfoBufferSize(target_size); +} + +std::vector +RangeTreeLockManager::GetRangeDeadlockInfoBuffer() { + return dlock_buffer_.PrepareBuffer(); +} + +std::vector RangeTreeLockManager::GetDeadlockInfoBuffer() { + std::vector res; + std::vector data = GetRangeDeadlockInfoBuffer(); + // report left endpoints + for (auto it = data.begin(); it != data.end(); ++it) { + std::vector path; + + for (auto it2 = it->path.begin(); it2 != it->path.end(); ++it2) { + path.push_back( + {it2->m_txn_id, it2->m_cf_id, it2->m_exclusive, it2->m_start.slice}); + } + res.push_back(DeadlockPath(path, it->deadlock_time)); + } + return res; +} + +// @brief Lock Escalation Callback function +// +// @param txnid Transaction whose locks got escalated +// @param lt Lock Tree where escalation is happening +// @param buffer Escalation result: list of locks that this transaction now +// owns in this lock tree. 
+// @param void* Callback context +void RangeTreeLockManager::on_escalate(TXNID txnid, const toku::locktree* lt, + const toku::range_buffer& buffer, + void*) { + auto txn = (PessimisticTransaction*)txnid; + ((RangeTreeLockTracker*)&txn->GetTrackedLocks())->ReplaceLocks(lt, buffer); +} + +RangeTreeLockManager::~RangeTreeLockManager() { + autovector local_caches; + ltree_lookup_cache_->Scrape(&local_caches, nullptr); + for (auto cache : local_caches) { + delete static_cast(cache); + } + ltree_map_.clear(); // this will call release_lt() for all locktrees + ltm_.destroy(); +} + +RangeLockManagerHandle::Counters RangeTreeLockManager::GetStatus() { + LTM_STATUS_S ltm_status_test; + ltm_.get_status(<m_status_test); + Counters res; + + // Searching status variable by its string name is how Toku's unit tests + // do it (why didn't they make LTM_ESCALATION_COUNT constant visible?) + // lookup keyname in status + for (int i = 0; i < LTM_STATUS_S::LTM_STATUS_NUM_ROWS; i++) { + TOKU_ENGINE_STATUS_ROW status = <m_status_test.status[i]; + if (strcmp(status->keyname, "LTM_ESCALATION_COUNT") == 0) { + res.escalation_count = status->value.num; + continue; + } + if (strcmp(status->keyname, "LTM_WAIT_COUNT") == 0) { + res.lock_wait_count = status->value.num; + continue; + } + if (strcmp(status->keyname, "LTM_SIZE_CURRENT") == 0) { + res.current_lock_memory = status->value.num; + } + } + return res; +} + +std::shared_ptr RangeTreeLockManager::MakeLockTreePtr( + toku::locktree* lt) { + toku::locktree_manager* ltm = <m_; + return std::shared_ptr( + lt, [ltm](toku::locktree* p) { ltm->release_lt(p); }); +} + +void RangeTreeLockManager::AddColumnFamily(const ColumnFamilyHandle* cfh) { + uint32_t column_family_id = cfh->GetID(); + + InstrumentedMutexLock l(<ree_map_mutex_); + if (ltree_map_.find(column_family_id) == ltree_map_.end()) { + DICTIONARY_ID dict_id = {.dictid = column_family_id}; + toku::comparator cmp; + cmp.create(CompareDbtEndpoints, (void*)cfh->GetComparator()); + toku::locktree* ltree = + ltm_.get_lt(dict_id, cmp, + /* on_create_extra*/ static_cast(this)); + // This is ok to because get_lt has copied the comparator: + cmp.destroy(); + + ltree_map_.insert({column_family_id, MakeLockTreePtr(ltree)}); + } +} + +void RangeTreeLockManager::RemoveColumnFamily(const ColumnFamilyHandle* cfh) { + uint32_t column_family_id = cfh->GetID(); + // Remove lock_map for this column family. Since the lock map is stored + // as a shared ptr, concurrent transactions can still keep using it + // until they release their references to it. + + // TODO what if one drops a column family while transaction(s) still have + // locks in it? + // locktree uses column family'c Comparator* as the criteria to do tree + // ordering. If the comparator is gone, we won't even be able to remove the + // elements from the locktree. + // A possible solution might be to remove everything right now: + // - wait until everyone traversing the locktree are gone + // - remove everything from the locktree. + // - some transactions may have acquired locks in their LockTracker objects. + // Arrange something so we don't blow up when they try to release them. + // - ... + // This use case (drop column family while somebody is using it) doesn't seem + // the priority, though. 
+ + { + InstrumentedMutexLock l(<ree_map_mutex_); + + auto lock_maps_iter = ltree_map_.find(column_family_id); + assert(lock_maps_iter != ltree_map_.end()); + ltree_map_.erase(lock_maps_iter); + } // lock_map_mutex_ + + autovector local_caches; + ltree_lookup_cache_->Scrape(&local_caches, nullptr); + for (auto cache : local_caches) { + delete static_cast(cache); + } +} + +std::shared_ptr RangeTreeLockManager::GetLockTreeForCF( + ColumnFamilyId column_family_id) { + // First check thread-local cache + if (ltree_lookup_cache_->Get() == nullptr) { + ltree_lookup_cache_->Reset(new LockTreeMap()); + } + + auto ltree_map_cache = static_cast(ltree_lookup_cache_->Get()); + + auto it = ltree_map_cache->find(column_family_id); + if (it != ltree_map_cache->end()) { + // Found lock map for this column family. + return it->second; + } + + // Not found in local cache, grab mutex and check shared LockMaps + InstrumentedMutexLock l(<ree_map_mutex_); + + it = ltree_map_.find(column_family_id); + if (it == ltree_map_.end()) { + return nullptr; + } else { + // Found lock map. Store in thread-local cache and return. + ltree_map_cache->insert({column_family_id, it->second}); + return it->second; + } +} + +struct LOCK_PRINT_CONTEXT { + RangeLockManagerHandle::RangeLockStatus* data; // Save locks here + uint32_t cfh_id; // Column Family whose tree we are traversing +}; + +// Report left endpoints of the acquired locks +LockManager::PointLockStatus RangeTreeLockManager::GetPointLockStatus() { + PointLockStatus res; + LockManager::RangeLockStatus data = GetRangeLockStatus(); + // report left endpoints + for (auto it = data.begin(); it != data.end(); ++it) { + auto& val = it->second; + res.insert({it->first, {val.start.slice, val.ids, val.exclusive}}); + } + return res; +} + +static void push_into_lock_status_data(void* param, const DBT* left, + const DBT* right, TXNID txnid_arg, + bool is_shared, TxnidVector* owners) { + struct LOCK_PRINT_CONTEXT* ctx = (LOCK_PRINT_CONTEXT*)param; + struct RangeLockInfo info; + + info.exclusive = !is_shared; + + deserialize_endpoint(left, &info.start); + deserialize_endpoint(right, &info.end); + + if (txnid_arg != TXNID_SHARED) { + info.ids.push_back(txnid_arg); + } else { + for (auto it : *owners) { + info.ids.push_back(it); + } + } + ctx->data->insert({ctx->cfh_id, info}); +} + +LockManager::RangeLockStatus RangeTreeLockManager::GetRangeLockStatus() { + LockManager::RangeLockStatus data; + { + InstrumentedMutexLock l(<ree_map_mutex_); + for (auto it : ltree_map_) { + LOCK_PRINT_CONTEXT ctx = {&data, it.first}; + it.second->dump_locks((void*)&ctx, push_into_lock_status_data); + } + } + return data; +} + +} // namespace ROCKSDB_NAMESPACE +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h new file mode 100644 index 000000000..e4236d600 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h @@ -0,0 +1,137 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
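For orientation, here is a minimal, hypothetical sketch of how this lock manager is plugged into a TransactionDB. It assumes the public NewRangeLockManager() factory, TransactionDBOptions::lock_mgr_handle and Transaction::GetRangeLock() entry points from RocksDB's transaction headers, which are not part of this file; exact signatures may differ between versions.

#include <memory>
#include <string>

#include "rocksdb/utilities/transaction.h"
#include "rocksdb/utilities/transaction_db.h"

// Sketch only: wire the range-tree lock manager into a TransactionDB and take
// one range lock. Error handling is elided.
void RangeLockingSketch(const std::string& path) {
  rocksdb::Options options;
  options.create_if_missing = true;

  rocksdb::TransactionDBOptions txn_db_options;
  // Assumed factory; returns a RangeLockManagerHandle backed by the class
  // declared below. Passing nullptr selects the default mutex factory.
  std::shared_ptr<rocksdb::RangeLockManagerHandle> range_lock_mgr =
      rocksdb::NewRangeLockManager(nullptr);
  txn_db_options.lock_mgr_handle = range_lock_mgr;

  rocksdb::TransactionDB* db = nullptr;
  if (!rocksdb::TransactionDB::Open(options, txn_db_options, path, &db).ok()) {
    return;
  }

  rocksdb::Transaction* txn = db->BeginTransaction(rocksdb::WriteOptions());
  // Lock the whole ["a", "m"] range in the default column family, then write
  // a key covered by it.
  rocksdb::Status s = txn->GetRangeLock(db->DefaultColumnFamily(),
                                        rocksdb::Endpoint("a"),
                                        rocksdb::Endpoint("m"));
  if (s.ok()) {
    s = txn->Put("b", "value");
    s = txn->Commit();
  }
  delete txn;

  // Escalation/wait counters come from RangeTreeLockManager::GetStatus().
  rocksdb::RangeLockManagerHandle::Counters counters =
      range_lock_mgr->GetStatus();
  (void)counters.escalation_count;

  delete db;
}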
+ +#pragma once +#ifndef ROCKSDB_LITE +#ifndef OS_WIN + +// For DeadlockInfoBuffer: +#include "util/thread_local.h" +#include "utilities/transactions/lock/point/point_lock_manager.h" +#include "utilities/transactions/lock/range/range_lock_manager.h" + +// Lock Tree library: +#include "utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h" +#include "utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h" +#include "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h" + +namespace ROCKSDB_NAMESPACE { + +typedef DeadlockInfoBufferTempl RangeDeadlockInfoBuffer; + +// A Range Lock Manager that uses PerconaFT's locktree library +class RangeTreeLockManager : public RangeLockManagerBase, + public RangeLockManagerHandle { + public: + LockManager* getLockManager() override { return this; } + + void AddColumnFamily(const ColumnFamilyHandle* cfh) override; + void RemoveColumnFamily(const ColumnFamilyHandle* cfh) override; + + void Resize(uint32_t) override; + std::vector GetDeadlockInfoBuffer() override; + + std::vector GetRangeDeadlockInfoBuffer() override; + void SetRangeDeadlockInfoBufferSize(uint32_t target_size) override; + + // Get a lock on a range + // @note only exclusive locks are currently supported (requesting a + // non-exclusive lock will get an exclusive one) + using LockManager::TryLock; + Status TryLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id, + const Endpoint& start_endp, const Endpoint& end_endp, Env* env, + bool exclusive) override; + + void UnLock(PessimisticTransaction* txn, const LockTracker& tracker, + Env* env) override; + void UnLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id, + const std::string& key, Env* env) override; + void UnLock(PessimisticTransaction*, ColumnFamilyId, const Endpoint&, + const Endpoint&, Env*) override { + // TODO: range unlock does nothing... + } + + explicit RangeTreeLockManager( + std::shared_ptr mutex_factory); + + ~RangeTreeLockManager() override; + + int SetMaxLockMemory(size_t max_lock_memory) override { + return ltm_.set_max_lock_memory(max_lock_memory); + } + + size_t GetMaxLockMemory() override { return ltm_.get_max_lock_memory(); } + + Counters GetStatus() override; + + bool IsPointLockSupported() const override { + // One could have acquired a point lock (it is reduced to range lock) + return true; + } + + PointLockStatus GetPointLockStatus() override; + + // This is from LockManager + LockManager::RangeLockStatus GetRangeLockStatus() override; + + // This has the same meaning as GetRangeLockStatus but is from + // RangeLockManagerHandle + RangeLockManagerHandle::RangeLockStatus GetRangeLockStatusData() override { + return GetRangeLockStatus(); + } + + bool IsRangeLockSupported() const override { return true; } + + const LockTrackerFactory& GetLockTrackerFactory() const override { + return RangeTreeLockTrackerFactory::Get(); + } + + // Get the locktree which stores locks for the Column Family with given cf_id + std::shared_ptr GetLockTreeForCF(ColumnFamilyId cf_id); + + void SetEscalationBarrierFunc(EscalationBarrierFunc func) override { + barrier_func_ = func; + } + + private: + toku::locktree_manager ltm_; + + EscalationBarrierFunc barrier_func_ = + [](const Endpoint&, const Endpoint&) -> bool { return false; }; + + std::shared_ptr mutex_factory_; + + // Map from cf_id to locktree*. Can only be accessed while holding the + // ltree_map_mutex_. 
Must use a custom deleter that calls ltm_.release_lt + using LockTreeMap = + std::unordered_map>; + LockTreeMap ltree_map_; + + InstrumentedMutex ltree_map_mutex_; + + // Per-thread cache of ltree_map_. + // (uses the same approach as TransactionLockMgr::lock_maps_cache_) + std::unique_ptr ltree_lookup_cache_; + + RangeDeadlockInfoBuffer dlock_buffer_; + + std::shared_ptr MakeLockTreePtr(toku::locktree* lt); + static int CompareDbtEndpoints(void* arg, const DBT* a_key, const DBT* b_key); + + // Callbacks + static int on_create(toku::locktree*, void*); + static void on_destroy(toku::locktree*) {} + static void on_escalate(TXNID txnid, const toku::locktree* lt, + const toku::range_buffer& buffer, void* extra); + + static bool OnEscalationBarrierCheck(const DBT* a, const DBT* b, void* extra); +}; + +void serialize_endpoint(const Endpoint& endp, std::string* buf); +void wait_callback_for_locktree(void* cdata, toku::lock_wait_infos* infos); + +} // namespace ROCKSDB_NAMESPACE +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc new file mode 100644 index 000000000..be1e1478b --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc @@ -0,0 +1,156 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE +#ifndef OS_WIN + +#include "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h" + +#include "utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h" + +namespace ROCKSDB_NAMESPACE { + +RangeLockList *RangeTreeLockTracker::getOrCreateList() { + if (range_list_) return range_list_.get(); + + // Doesn't exist, create + range_list_.reset(new RangeLockList()); + return range_list_.get(); +} + +void RangeTreeLockTracker::Track(const PointLockRequest &lock_req) { + DBT key_dbt; + std::string key; + serialize_endpoint(Endpoint(lock_req.key, false), &key); + toku_fill_dbt(&key_dbt, key.data(), key.size()); + RangeLockList *rl = getOrCreateList(); + rl->Append(lock_req.column_family_id, &key_dbt, &key_dbt); +} + +void RangeTreeLockTracker::Track(const RangeLockRequest &lock_req) { + DBT start_dbt, end_dbt; + std::string start_key, end_key; + + serialize_endpoint(lock_req.start_endp, &start_key); + serialize_endpoint(lock_req.end_endp, &end_key); + + toku_fill_dbt(&start_dbt, start_key.data(), start_key.size()); + toku_fill_dbt(&end_dbt, end_key.data(), end_key.size()); + + RangeLockList *rl = getOrCreateList(); + rl->Append(lock_req.column_family_id, &start_dbt, &end_dbt); +} + +PointLockStatus RangeTreeLockTracker::GetPointLockStatus( + ColumnFamilyId /*cf_id*/, const std::string & /*key*/) const { + // This function is not expected to be called as RangeTreeLockTracker:: + // IsPointLockSupported() returns false. Return the status which indicates + // the point is not locked. + PointLockStatus p; + p.locked = false; + p.exclusive = true; + p.seq = 0; + return p; +} + +void RangeTreeLockTracker::Clear() { range_list_.reset(); } + +void RangeLockList::Append(ColumnFamilyId cf_id, const DBT *left_key, + const DBT *right_key) { + MutexLock l(&mutex_); + // Only the transaction owner thread calls this function. 
+ // The same thread does the lock release, so we can be certain nobody is + // releasing the locks concurrently. + assert(!releasing_locks_.load()); + auto it = buffers_.find(cf_id); + if (it == buffers_.end()) { + // create a new one + it = buffers_.emplace(cf_id, std::make_shared()).first; + it->second->create(); + } + it->second->append(left_key, right_key); +} + +void RangeLockList::ReleaseLocks(RangeTreeLockManager *mgr, + PessimisticTransaction *txn, + bool all_trx_locks) { + { + MutexLock l(&mutex_); + // The lt->release_locks() call below will walk range_list->buffer_. We + // need to prevent lock escalation callback from replacing + // range_list->buffer_ while we are doing that. + // + // Additional complication here is internal mutex(es) in the locktree + // (let's call them latches): + // - Lock escalation first obtains latches on the lock tree + // - Then, it calls RangeTreeLockManager::on_escalate to replace + // transaction's range_list->buffer_. = Access to that buffer must be + // synchronized, so it will want to acquire the range_list->mutex_. + // + // While in this function we would want to do the reverse: + // - Acquire range_list->mutex_ to prevent access to the range_list. + // - Then, lt->release_locks() call will walk through the range_list + // - and acquire latches on parts of the lock tree to remove locks from + // it. + // + // How do we avoid the deadlock? The idea is that here we set + // releasing_locks_=true, and release the mutex. + // All other users of the range_list must: + // - Acquire the mutex, then check that releasing_locks_=false. + // (the code in this function doesnt do that as there's only one thread + // that releases transaction's locks) + releasing_locks_.store(true); + } + + for (auto it : buffers_) { + // Don't try to call release_locks() if the buffer is empty! if we are + // not holding any locks, the lock tree might be in the STO-mode with + // another transaction, and our attempt to release an empty set of locks + // will cause an assertion failure. + if (it.second->get_num_ranges()) { + auto lt_ptr = mgr->GetLockTreeForCF(it.first); + toku::locktree *lt = lt_ptr.get(); + + lt->release_locks((TXNID)txn, it.second.get(), all_trx_locks); + + it.second->destroy(); + it.second->create(); + + toku::lock_request::retry_all_lock_requests(lt, + wait_callback_for_locktree); + } + } + + Clear(); + releasing_locks_.store(false); +} + +void RangeLockList::ReplaceLocks(const toku::locktree *lt, + const toku::range_buffer &buffer) { + MutexLock l(&mutex_); + if (releasing_locks_.load()) { + // Do nothing. The transaction is releasing its locks, so it will not care + // about having a correct list of ranges. 
(In TokuDB, + // toku_db_txn_escalate_callback() makes use of this property, too) + return; + } + + ColumnFamilyId cf_id = (ColumnFamilyId)lt->get_dict_id().dictid; + + auto it = buffers_.find(cf_id); + it->second->destroy(); + it->second->create(); + + toku::range_buffer::iterator iter(&buffer); + toku::range_buffer::iterator::record rec; + while (iter.current(&rec)) { + it->second->append(rec.get_left_key(), rec.get_right_key()); + iter.next(); + } +} + +} // namespace ROCKSDB_NAMESPACE +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h new file mode 100644 index 000000000..4ef48d252 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h @@ -0,0 +1,146 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include + +#include "util/mutexlock.h" +#include "utilities/transactions/lock/lock_tracker.h" +#include "utilities/transactions/pessimistic_transaction.h" + +// Range Locking: +#include "lib/locktree/lock_request.h" +#include "lib/locktree/locktree.h" + +namespace ROCKSDB_NAMESPACE { + +class RangeTreeLockManager; + +// Storage for locks that are currently held by a transaction. +// +// Locks are kept in toku::range_buffer because toku::locktree::release_locks() +// accepts that as an argument. +// +// Note: the list of locks may differ slighly from the contents of the lock +// tree, due to concurrency between lock acquisition, lock release, and lock +// escalation. See MDEV-18227 and RangeTreeLockManager::UnLock for details. +// This property is currently harmless. +// +// Append() and ReleaseLocks() are not thread-safe, as they are expected to be +// called only by the owner transaction. ReplaceLocks() is safe to call from +// other threads. +class RangeLockList { + public: + ~RangeLockList() { Clear(); } + + RangeLockList() : releasing_locks_(false) {} + + void Append(ColumnFamilyId cf_id, const DBT* left_key, const DBT* right_key); + void ReleaseLocks(RangeTreeLockManager* mgr, PessimisticTransaction* txn, + bool all_trx_locks); + void ReplaceLocks(const toku::locktree* lt, const toku::range_buffer& buffer); + + private: + void Clear() { + for (auto it : buffers_) { + it.second->destroy(); + } + buffers_.clear(); + } + + std::unordered_map> + buffers_; + port::Mutex mutex_; + std::atomic releasing_locks_; +}; + +// A LockTracker-based object that is used together with RangeTreeLockManager. +class RangeTreeLockTracker : public LockTracker { + public: + RangeTreeLockTracker() : range_list_(nullptr) {} + + RangeTreeLockTracker(const RangeTreeLockTracker&) = delete; + RangeTreeLockTracker& operator=(const RangeTreeLockTracker&) = delete; + + void Track(const PointLockRequest&) override; + void Track(const RangeLockRequest&) override; + + bool IsPointLockSupported() const override { + // This indicates that we don't implement GetPointLockStatus() + return false; + } + bool IsRangeLockSupported() const override { return true; } + + // a Not-supported dummy implementation. 
+ UntrackStatus Untrack(const RangeLockRequest& /*lock_request*/) override { + return UntrackStatus::NOT_TRACKED; + } + + UntrackStatus Untrack(const PointLockRequest& /*lock_request*/) override { + return UntrackStatus::NOT_TRACKED; + } + + // "If this method is not supported, leave it as a no-op." + void Merge(const LockTracker&) override {} + + // "If this method is not supported, leave it as a no-op." + void Subtract(const LockTracker&) override {} + + void Clear() override; + + // "If this method is not supported, returns nullptr." + virtual LockTracker* GetTrackedLocksSinceSavePoint( + const LockTracker&) const override { + return nullptr; + } + + PointLockStatus GetPointLockStatus(ColumnFamilyId column_family_id, + const std::string& key) const override; + + // The return value is only used for tests + uint64_t GetNumPointLocks() const override { return 0; } + + ColumnFamilyIterator* GetColumnFamilyIterator() const override { + return nullptr; + } + + KeyIterator* GetKeyIterator( + ColumnFamilyId /*column_family_id*/) const override { + return nullptr; + } + + void ReleaseLocks(RangeTreeLockManager* mgr, PessimisticTransaction* txn, + bool all_trx_locks) { + if (range_list_) range_list_->ReleaseLocks(mgr, txn, all_trx_locks); + } + + void ReplaceLocks(const toku::locktree* lt, + const toku::range_buffer& buffer) { + // range_list_ cannot be NULL here + range_list_->ReplaceLocks(lt, buffer); + } + + private: + RangeLockList* getOrCreateList(); + std::unique_ptr range_list_; +}; + +class RangeTreeLockTrackerFactory : public LockTrackerFactory { + public: + static const RangeTreeLockTrackerFactory& Get() { + static const RangeTreeLockTrackerFactory instance; + return instance; + } + + LockTracker* Create() const override { return new RangeTreeLockTracker(); } + + private: + RangeTreeLockTrackerFactory() {} +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/transactions/optimistic_transaction.cc b/src/rocksdb/utilities/transactions/optimistic_transaction.cc new file mode 100644 index 000000000..0ee0f28b6 --- /dev/null +++ b/src/rocksdb/utilities/transactions/optimistic_transaction.cc @@ -0,0 +1,196 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
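Before the implementation, a condensed usage sketch of the behaviour this file provides, distilled from the optimistic_transaction_test.cc cases later in this patch: writes are buffered in the transaction and conflicts are only detected when Commit() validates the tracked keys.

#include <cassert>
#include <string>

#include "rocksdb/utilities/optimistic_transaction_db.h"

// Sketch only: an external write between txn->Put() and txn->Commit() makes
// commit-time validation fail with Status::Busy, and the caller must retry.
void OccConflictSketch(const std::string& path) {
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::OptimisticTransactionDB* txn_db = nullptr;
  assert(rocksdb::OptimisticTransactionDB::Open(options, path, &txn_db).ok());

  rocksdb::Transaction* txn =
      txn_db->BeginTransaction(rocksdb::WriteOptions());
  assert(txn->Put("foo", "from-txn").ok());  // tracked via TryLock() below

  // A non-transactional write to the same key after the key was tracked...
  assert(txn_db->Put(rocksdb::WriteOptions(), "foo", "external").ok());

  // ...is detected by CheckTransactionForConflicts() at commit time.
  rocksdb::Status s = txn->Commit();
  assert(s.IsBusy());

  delete txn;
  delete txn_db;
}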
+ +#ifndef ROCKSDB_LITE + +#include "utilities/transactions/optimistic_transaction.h" + +#include + +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/status.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" +#include "util/cast_util.h" +#include "util/string_util.h" +#include "utilities/transactions/lock/point/point_lock_tracker.h" +#include "utilities/transactions/optimistic_transaction.h" +#include "utilities/transactions/optimistic_transaction_db_impl.h" +#include "utilities/transactions/transaction_util.h" + +namespace ROCKSDB_NAMESPACE { + +struct WriteOptions; + +OptimisticTransaction::OptimisticTransaction( + OptimisticTransactionDB* txn_db, const WriteOptions& write_options, + const OptimisticTransactionOptions& txn_options) + : TransactionBaseImpl(txn_db->GetBaseDB(), write_options, + PointLockTrackerFactory::Get()), + txn_db_(txn_db) { + Initialize(txn_options); +} + +void OptimisticTransaction::Initialize( + const OptimisticTransactionOptions& txn_options) { + if (txn_options.set_snapshot) { + SetSnapshot(); + } +} + +void OptimisticTransaction::Reinitialize( + OptimisticTransactionDB* txn_db, const WriteOptions& write_options, + const OptimisticTransactionOptions& txn_options) { + TransactionBaseImpl::Reinitialize(txn_db->GetBaseDB(), write_options); + Initialize(txn_options); +} + +OptimisticTransaction::~OptimisticTransaction() {} + +void OptimisticTransaction::Clear() { TransactionBaseImpl::Clear(); } + +Status OptimisticTransaction::Prepare() { + return Status::InvalidArgument( + "Two phase commit not supported for optimistic transactions."); +} + +Status OptimisticTransaction::Commit() { + auto txn_db_impl = static_cast_with_check(txn_db_); + assert(txn_db_impl); + switch (txn_db_impl->GetValidatePolicy()) { + case OccValidationPolicy::kValidateParallel: + return CommitWithParallelValidate(); + case OccValidationPolicy::kValidateSerial: + return CommitWithSerialValidate(); + default: + assert(0); + } + // unreachable, just void compiler complain + return Status::OK(); +} + +Status OptimisticTransaction::CommitWithSerialValidate() { + // Set up callback which will call CheckTransactionForConflicts() to + // check whether this transaction is safe to be committed. + OptimisticTransactionCallback callback(this); + + DBImpl* db_impl = static_cast_with_check(db_->GetRootDB()); + + Status s = db_impl->WriteWithCallback( + write_options_, GetWriteBatch()->GetWriteBatch(), &callback); + + if (s.ok()) { + Clear(); + } + + return s; +} + +Status OptimisticTransaction::CommitWithParallelValidate() { + auto txn_db_impl = static_cast_with_check(txn_db_); + assert(txn_db_impl); + DBImpl* db_impl = static_cast_with_check(db_->GetRootDB()); + assert(db_impl); + const size_t space = txn_db_impl->GetLockBucketsSize(); + std::set lk_idxes; + std::vector> lks; + std::unique_ptr cf_it( + tracked_locks_->GetColumnFamilyIterator()); + assert(cf_it != nullptr); + while (cf_it->HasNext()) { + ColumnFamilyId cf = cf_it->Next(); + std::unique_ptr key_it( + tracked_locks_->GetKeyIterator(cf)); + assert(key_it != nullptr); + while (key_it->HasNext()) { + const std::string& key = key_it->Next(); + lk_idxes.insert(FastRange64(GetSliceNPHash64(key), space)); + } + } + // NOTE: in a single txn, all bucket-locks are taken in ascending order. + // In this way, txns from different threads all obey this rule so that + // deadlock can be avoided. 
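+  // lk_idxes is an ordered std::set, so the loop below visits bucket indices
+  // in strictly increasing order; since every committing transaction follows
+  // the same order, no lock-order cycle (and hence no deadlock) can form.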
+ for (auto v : lk_idxes) { + lks.emplace_back(txn_db_impl->LockBucket(v)); + } + + Status s = TransactionUtil::CheckKeysForConflicts(db_impl, *tracked_locks_, + true /* cache_only */); + if (!s.ok()) { + return s; + } + + s = db_impl->Write(write_options_, GetWriteBatch()->GetWriteBatch()); + if (s.ok()) { + Clear(); + } + + return s; +} + +Status OptimisticTransaction::Rollback() { + Clear(); + return Status::OK(); +} + +// Record this key so that we can check it for conflicts at commit time. +// +// 'exclusive' is unused for OptimisticTransaction. +Status OptimisticTransaction::TryLock(ColumnFamilyHandle* column_family, + const Slice& key, bool read_only, + bool exclusive, const bool do_validate, + const bool assume_tracked) { + assert(!assume_tracked); // not supported + (void)assume_tracked; + if (!do_validate) { + return Status::OK(); + } + uint32_t cfh_id = GetColumnFamilyID(column_family); + + SetSnapshotIfNeeded(); + + SequenceNumber seq; + if (snapshot_) { + seq = snapshot_->GetSequenceNumber(); + } else { + seq = db_->GetLatestSequenceNumber(); + } + + std::string key_str = key.ToString(); + + TrackKey(cfh_id, key_str, seq, read_only, exclusive); + + // Always return OK. Confilct checking will happen at commit time. + return Status::OK(); +} + +// Returns OK if it is safe to commit this transaction. Returns Status::Busy +// if there are read or write conflicts that would prevent us from committing OR +// if we can not determine whether there would be any such conflicts. +// +// Should only be called on writer thread in order to avoid any race conditions +// in detecting write conflicts. +Status OptimisticTransaction::CheckTransactionForConflicts(DB* db) { + auto db_impl = static_cast_with_check(db); + + // Since we are on the write thread and do not want to block other writers, + // we will do a cache-only conflict check. This can result in TryAgain + // getting returned if there is not sufficient memtable history to check + // for conflicts. + return TransactionUtil::CheckKeysForConflicts(db_impl, *tracked_locks_, + true /* cache_only */); +} + +Status OptimisticTransaction::SetName(const TransactionName& /* unused */) { + return Status::InvalidArgument("Optimistic transactions cannot be named."); +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/optimistic_transaction.h b/src/rocksdb/utilities/transactions/optimistic_transaction.h new file mode 100644 index 000000000..de23233d5 --- /dev/null +++ b/src/rocksdb/utilities/transactions/optimistic_transaction.h @@ -0,0 +1,101 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
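The commit-time check performed by TransactionUtil::CheckKeysForConflicts() (called above) boils down to a per-key sequence-number comparison. The following is a simplified, illustrative model only; the real code also handles the cache_only mode and the case where memtable history is too short (TryAgain):

#include <cstdint>

// Illustrative helper (not part of RocksDB): a tracked key is conflict-free
// iff nothing was written to it after the sequence number recorded when the
// transaction tracked the key in TryLock()/TrackKey().
bool KeyIsConflictFree(uint64_t tracked_seq, uint64_t latest_write_seq) {
  return latest_write_seq <= tracked_seq;
}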
+ +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include + +#include "db/write_callback.h" +#include "rocksdb/db.h" +#include "rocksdb/slice.h" +#include "rocksdb/snapshot.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/write_batch_with_index.h" +#include "utilities/transactions/transaction_base.h" +#include "utilities/transactions/transaction_util.h" + +namespace ROCKSDB_NAMESPACE { + +class OptimisticTransaction : public TransactionBaseImpl { + public: + OptimisticTransaction(OptimisticTransactionDB* db, + const WriteOptions& write_options, + const OptimisticTransactionOptions& txn_options); + // No copying allowed + OptimisticTransaction(const OptimisticTransaction&) = delete; + void operator=(const OptimisticTransaction&) = delete; + + virtual ~OptimisticTransaction(); + + void Reinitialize(OptimisticTransactionDB* txn_db, + const WriteOptions& write_options, + const OptimisticTransactionOptions& txn_options); + + Status Prepare() override; + + Status Commit() override; + + Status Rollback() override; + + Status SetName(const TransactionName& name) override; + + protected: + Status TryLock(ColumnFamilyHandle* column_family, const Slice& key, + bool read_only, bool exclusive, const bool do_validate = true, + const bool assume_tracked = false) override; + + private: + ROCKSDB_FIELD_UNUSED OptimisticTransactionDB* const txn_db_; + + friend class OptimisticTransactionCallback; + + void Initialize(const OptimisticTransactionOptions& txn_options); + + // Returns OK if it is safe to commit this transaction. Returns Status::Busy + // if there are read or write conflicts that would prevent us from committing + // OR if we can not determine whether there would be any such conflicts. + // + // Should only be called on writer thread. + Status CheckTransactionForConflicts(DB* db); + + void Clear() override; + + void UnlockGetForUpdate(ColumnFamilyHandle* /* unused */, + const Slice& /* unused */) override { + // Nothing to unlock. + } + + Status CommitWithSerialValidate(); + + Status CommitWithParallelValidate(); +}; + +// Used at commit time to trigger transaction validation +class OptimisticTransactionCallback : public WriteCallback { + public: + explicit OptimisticTransactionCallback(OptimisticTransaction* txn) + : txn_(txn) {} + + Status Callback(DB* db) override { + return txn_->CheckTransactionForConflicts(db); + } + + bool AllowWriteBatching() override { return false; } + + private: + OptimisticTransaction* txn_; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc b/src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc new file mode 100644 index 000000000..bffb3d5ed --- /dev/null +++ b/src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
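The Open() overloads defined below take an OptimisticTransactionDBOptions; a short sketch of opening a database with the parallel (bucket-lock) validation policy that OptimisticTransactionDBImpl implements:

#include <string>
#include <vector>

#include "rocksdb/utilities/optimistic_transaction_db.h"

// Sketch only: request parallel commit validation instead of serial
// validation on the write thread.
rocksdb::Status OpenWithParallelValidation(
    const std::string& path, rocksdb::OptimisticTransactionDB** db) {
  rocksdb::Options options;
  options.create_if_missing = true;

  rocksdb::OptimisticTransactionDBOptions occ_opts;
  occ_opts.validate_policy = rocksdb::OccValidationPolicy::kValidateParallel;
  occ_opts.occ_lock_buckets = 1 << 20;  // number of striped bucket mutexes

  std::vector<rocksdb::ColumnFamilyDescriptor> column_families;
  column_families.emplace_back(rocksdb::kDefaultColumnFamilyName,
                               rocksdb::ColumnFamilyOptions(options));
  std::vector<rocksdb::ColumnFamilyHandle*> handles;
  rocksdb::Status s = rocksdb::OptimisticTransactionDB::Open(
      rocksdb::DBOptions(options), occ_opts, path, column_families, &handles,
      db);
  if (s.ok()) {
    delete handles[0];  // the DB keeps its own default-CF handle
  }
  return s;
}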
+ +#ifndef ROCKSDB_LITE + +#include "utilities/transactions/optimistic_transaction_db_impl.h" + +#include +#include + +#include "db/db_impl/db_impl.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" +#include "utilities/transactions/optimistic_transaction.h" + +namespace ROCKSDB_NAMESPACE { + +Transaction* OptimisticTransactionDBImpl::BeginTransaction( + const WriteOptions& write_options, + const OptimisticTransactionOptions& txn_options, Transaction* old_txn) { + if (old_txn != nullptr) { + ReinitializeTransaction(old_txn, write_options, txn_options); + return old_txn; + } else { + return new OptimisticTransaction(this, write_options, txn_options); + } +} + +std::unique_lock OptimisticTransactionDBImpl::LockBucket( + size_t idx) { + assert(idx < bucketed_locks_.size()); + return std::unique_lock(*bucketed_locks_[idx]); +} + +Status OptimisticTransactionDB::Open(const Options& options, + const std::string& dbname, + OptimisticTransactionDB** dbptr) { + DBOptions db_options(options); + ColumnFamilyOptions cf_options(options); + std::vector column_families; + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + std::vector handles; + Status s = Open(db_options, dbname, column_families, &handles, dbptr); + if (s.ok()) { + assert(handles.size() == 1); + // i can delete the handle since DBImpl is always holding a reference to + // default column family + delete handles[0]; + } + + return s; +} + +Status OptimisticTransactionDB::Open( + const DBOptions& db_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, + OptimisticTransactionDB** dbptr) { + return OptimisticTransactionDB::Open(db_options, + OptimisticTransactionDBOptions(), dbname, + column_families, handles, dbptr); +} + +Status OptimisticTransactionDB::Open( + const DBOptions& db_options, + const OptimisticTransactionDBOptions& occ_options, + const std::string& dbname, + const std::vector& column_families, + std::vector* handles, + OptimisticTransactionDB** dbptr) { + Status s; + DB* db; + + std::vector column_families_copy = column_families; + + // Enable MemTable History if not already enabled + for (auto& column_family : column_families_copy) { + ColumnFamilyOptions* options = &column_family.options; + + if (options->max_write_buffer_size_to_maintain == 0 && + options->max_write_buffer_number_to_maintain == 0) { + // Setting to -1 will set the History size to + // max_write_buffer_number * write_buffer_size. + options->max_write_buffer_size_to_maintain = -1; + } + } + + s = DB::Open(db_options, dbname, column_families_copy, handles, &db); + + if (s.ok()) { + *dbptr = new OptimisticTransactionDBImpl(db, occ_options); + } + + return s; +} + +void OptimisticTransactionDBImpl::ReinitializeTransaction( + Transaction* txn, const WriteOptions& write_options, + const OptimisticTransactionOptions& txn_options) { + assert(dynamic_cast(txn) != nullptr); + auto txn_impl = reinterpret_cast(txn); + + txn_impl->Reinitialize(this, write_options, txn_options); +} + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h b/src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h new file mode 100644 index 000000000..88e86ea4a --- /dev/null +++ b/src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h @@ -0,0 +1,88 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" + +namespace ROCKSDB_NAMESPACE { + +class OptimisticTransactionDBImpl : public OptimisticTransactionDB { + public: + explicit OptimisticTransactionDBImpl( + DB* db, const OptimisticTransactionDBOptions& occ_options, + bool take_ownership = true) + : OptimisticTransactionDB(db), + db_owner_(take_ownership), + validate_policy_(occ_options.validate_policy) { + if (validate_policy_ == OccValidationPolicy::kValidateParallel) { + uint32_t bucket_size = std::max(16u, occ_options.occ_lock_buckets); + bucketed_locks_.reserve(bucket_size); + for (size_t i = 0; i < bucket_size; ++i) { + bucketed_locks_.emplace_back( + std::unique_ptr(new std::mutex)); + } + } + } + + ~OptimisticTransactionDBImpl() { + // Prevent this stackable from destroying + // base db + if (!db_owner_) { + db_ = nullptr; + } + } + + Transaction* BeginTransaction(const WriteOptions& write_options, + const OptimisticTransactionOptions& txn_options, + Transaction* old_txn) override; + + // Transactional `DeleteRange()` is not yet supported. + using StackableDB::DeleteRange; + virtual Status DeleteRange(const WriteOptions&, ColumnFamilyHandle*, + const Slice&, const Slice&) override { + return Status::NotSupported(); + } + + // Range deletions also must not be snuck into `WriteBatch`es as they are + // incompatible with `OptimisticTransactionDB`. + virtual Status Write(const WriteOptions& write_opts, + WriteBatch* batch) override { + if (batch->HasDeleteRange()) { + return Status::NotSupported(); + } + return OptimisticTransactionDB::Write(write_opts, batch); + } + + size_t GetLockBucketsSize() const { return bucketed_locks_.size(); } + + OccValidationPolicy GetValidatePolicy() const { return validate_policy_; } + + std::unique_lock LockBucket(size_t idx); + + private: + // NOTE: used in validation phase. Each key is hashed into some + // bucket. We then take the lock in the hash value order to avoid deadlock. + std::vector> bucketed_locks_; + + bool db_owner_; + + const OccValidationPolicy validate_policy_; + + void ReinitializeTransaction(Transaction* txn, + const WriteOptions& write_options, + const OptimisticTransactionOptions& txn_options = + OptimisticTransactionOptions()); +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/optimistic_transaction_test.cc b/src/rocksdb/utilities/transactions/optimistic_transaction_test.cc new file mode 100644 index 000000000..aa8192c32 --- /dev/null +++ b/src/rocksdb/utilities/transactions/optimistic_transaction_test.cc @@ -0,0 +1,1491 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
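For reference, the bucket index that CommitWithParallelValidate() computes for each tracked key maps onto the bucketed_locks_ vector declared above. A small sketch of that mapping, assuming RocksDB's GetSliceNPHash64()/FastRange64() hashing helpers (their exact header locations may differ between versions):

#include <cstddef>

#include "rocksdb/slice.h"
#include "util/hash.h"  // GetSliceNPHash64, FastRange64 (assumed location)

// Sketch only: mirror the lk_idxes computation used during parallel
// validation, mapping a user key onto one of `num_buckets` striped mutexes.
size_t KeyToLockBucket(const rocksdb::Slice& key, size_t num_buckets) {
  return rocksdb::FastRange64(rocksdb::GetSliceNPHash64(key), num_buckets);
}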
+ +#ifndef ROCKSDB_LITE + +#include +#include +#include + +#include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" +#include "port/port.h" +#include "rocksdb/db.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" +#include "rocksdb/utilities/transaction.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/transaction_test_util.h" +#include "util/crc32c.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +class OptimisticTransactionTest + : public testing::Test, + public testing::WithParamInterface { + public: + OptimisticTransactionDB* txn_db; + std::string dbname; + Options options; + + OptimisticTransactionTest() { + options.create_if_missing = true; + options.max_write_buffer_number = 2; + options.max_write_buffer_size_to_maintain = 2 * Arena::kInlineSize; + options.merge_operator.reset(new TestPutOperator()); + dbname = test::PerThreadDBPath("optimistic_transaction_testdb"); + + EXPECT_OK(DestroyDB(dbname, options)); + Open(); + } + ~OptimisticTransactionTest() override { + delete txn_db; + EXPECT_OK(DestroyDB(dbname, options)); + } + + void Reopen() { + delete txn_db; + txn_db = nullptr; + Open(); + } + + private: + void Open() { + ColumnFamilyOptions cf_options(options); + OptimisticTransactionDBOptions occ_opts; + occ_opts.validate_policy = GetParam(); + std::vector column_families; + std::vector handles; + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + Status s = + OptimisticTransactionDB::Open(DBOptions(options), occ_opts, dbname, + column_families, &handles, &txn_db); + + ASSERT_OK(s); + ASSERT_NE(txn_db, nullptr); + ASSERT_EQ(handles.size(), 1); + delete handles[0]; + } +}; + +TEST_P(OptimisticTransactionTest, SuccessTest) { + WriteOptions write_options; + ReadOptions read_options; + std::string value; + + ASSERT_OK(txn_db->Put(write_options, Slice("foo"), Slice("bar"))); + ASSERT_OK(txn_db->Put(write_options, Slice("foo2"), Slice("bar"))); + + Transaction* txn = txn_db->BeginTransaction(write_options); + ASSERT_NE(txn, nullptr); + + ASSERT_OK(txn->GetForUpdate(read_options, "foo", &value)); + ASSERT_EQ(value, "bar"); + + ASSERT_OK(txn->Put(Slice("foo"), Slice("bar2"))); + + ASSERT_OK(txn->GetForUpdate(read_options, "foo", &value)); + ASSERT_EQ(value, "bar2"); + + ASSERT_OK(txn->Commit()); + + ASSERT_OK(txn_db->Get(read_options, "foo", &value)); + ASSERT_EQ(value, "bar2"); + + delete txn; +} + +TEST_P(OptimisticTransactionTest, WriteConflictTest) { + WriteOptions write_options; + ReadOptions read_options; + std::string value; + + ASSERT_OK(txn_db->Put(write_options, "foo", "bar")); + ASSERT_OK(txn_db->Put(write_options, "foo2", "bar")); + + Transaction* txn = txn_db->BeginTransaction(write_options); + ASSERT_NE(txn, nullptr); + + ASSERT_OK(txn->Put("foo", "bar2")); + + // This Put outside of a transaction will conflict with the previous write + ASSERT_OK(txn_db->Put(write_options, "foo", "barz")); + + ASSERT_OK(txn_db->Get(read_options, "foo", &value)); + ASSERT_EQ(value, "barz"); + ASSERT_EQ(1, txn->GetNumKeys()); + + Status s = txn->Commit(); + ASSERT_TRUE(s.IsBusy()); // Txn should not commit + + // Verify that transaction did not write anything + ASSERT_OK(txn_db->Get(read_options, "foo", &value)); + ASSERT_EQ(value, "barz"); + ASSERT_OK(txn_db->Get(read_options, "foo2", &value)); + ASSERT_EQ(value, "bar"); + + delete txn; +} + +TEST_P(OptimisticTransactionTest, WriteConflictTest2) { + WriteOptions write_options; + ReadOptions 
read_options; + OptimisticTransactionOptions txn_options; + std::string value; + + ASSERT_OK(txn_db->Put(write_options, "foo", "bar")); + ASSERT_OK(txn_db->Put(write_options, "foo2", "bar")); + + txn_options.set_snapshot = true; + Transaction* txn = txn_db->BeginTransaction(write_options, txn_options); + ASSERT_NE(txn, nullptr); + + // This Put outside of a transaction will conflict with a later write + ASSERT_OK(txn_db->Put(write_options, "foo", "barz")); + + ASSERT_OK(txn->Put( + "foo", "bar2")); // Conflicts with write done after snapshot taken + + ASSERT_OK(txn_db->Get(read_options, "foo", &value)); + ASSERT_EQ(value, "barz"); + + Status s = txn->Commit(); + ASSERT_TRUE(s.IsBusy()); // Txn should not commit + + // Verify that transaction did not write anything + ASSERT_OK(txn_db->Get(read_options, "foo", &value)); + ASSERT_EQ(value, "barz"); + ASSERT_OK(txn_db->Get(read_options, "foo2", &value)); + ASSERT_EQ(value, "bar"); + + delete txn; +} + +TEST_P(OptimisticTransactionTest, WriteConflictTest3) { + ASSERT_OK(txn_db->Put(WriteOptions(), "foo", "bar")); + + Transaction* txn = txn_db->BeginTransaction(WriteOptions()); + ASSERT_NE(txn, nullptr); + + std::string value; + ASSERT_OK(txn->GetForUpdate(ReadOptions(), "foo", &value)); + ASSERT_EQ(value, "bar"); + ASSERT_OK(txn->Merge("foo", "bar3")); + + // Merge outside of a transaction should conflict with the previous merge + ASSERT_OK(txn_db->Merge(WriteOptions(), "foo", "bar2")); + ASSERT_OK(txn_db->Get(ReadOptions(), "foo", &value)); + ASSERT_EQ(value, "bar2"); + + ASSERT_EQ(1, txn->GetNumKeys()); + + Status s = txn->Commit(); + EXPECT_TRUE(s.IsBusy()); // Txn should not commit + + // Verify that transaction did not write anything + ASSERT_OK(txn_db->Get(ReadOptions(), "foo", &value)); + ASSERT_EQ(value, "bar2"); + + delete txn; +} + +TEST_P(OptimisticTransactionTest, WriteConflict4) { + ASSERT_OK(txn_db->Put(WriteOptions(), "foo", "bar")); + + Transaction* txn = txn_db->BeginTransaction(WriteOptions()); + ASSERT_NE(txn, nullptr); + + std::string value; + ASSERT_OK(txn->GetForUpdate(ReadOptions(), "foo", &value)); + ASSERT_EQ(value, "bar"); + ASSERT_OK(txn->Merge("foo", "bar3")); + + // Range delete outside of a transaction should conflict with the previous + // merge inside txn + auto* dbimpl = static_cast_with_check(txn_db->GetRootDB()); + ColumnFamilyHandle* default_cf = dbimpl->DefaultColumnFamily(); + ASSERT_OK(dbimpl->DeleteRange(WriteOptions(), default_cf, "foo", "foo1")); + Status s = txn_db->Get(ReadOptions(), "foo", &value); + ASSERT_TRUE(s.IsNotFound()); + + ASSERT_EQ(1, txn->GetNumKeys()); + + s = txn->Commit(); + EXPECT_TRUE(s.IsBusy()); // Txn should not commit + + // Verify that transaction did not write anything + s = txn_db->Get(ReadOptions(), "foo", &value); + ASSERT_TRUE(s.IsNotFound()); + + delete txn; +} + +TEST_P(OptimisticTransactionTest, ReadConflictTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + OptimisticTransactionOptions txn_options; + std::string value; + + ASSERT_OK(txn_db->Put(write_options, "foo", "bar")); + ASSERT_OK(txn_db->Put(write_options, "foo2", "bar")); + + txn_options.set_snapshot = true; + Transaction* txn = txn_db->BeginTransaction(write_options, txn_options); + ASSERT_NE(txn, nullptr); + + txn->SetSnapshot(); + snapshot_read_options.snapshot = txn->GetSnapshot(); + + ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value)); + ASSERT_EQ(value, "bar"); + + // This Put outside of a transaction will conflict with the previous read + 
ASSERT_OK(txn_db->Put(write_options, "foo", "barz")); + + ASSERT_OK(txn_db->Get(read_options, "foo", &value)); + ASSERT_EQ(value, "barz"); + + Status s = txn->Commit(); + ASSERT_TRUE(s.IsBusy()); // Txn should not commit + + // Verify that transaction did not write anything + ASSERT_OK(txn->GetForUpdate(read_options, "foo", &value)); + ASSERT_EQ(value, "barz"); + ASSERT_OK(txn->GetForUpdate(read_options, "foo2", &value)); + ASSERT_EQ(value, "bar"); + + delete txn; +} + +TEST_P(OptimisticTransactionTest, TxnOnlyTest) { + // Test to make sure transactions work when there are no other writes in an + // empty db. + + WriteOptions write_options; + ReadOptions read_options; + std::string value; + + Transaction* txn = txn_db->BeginTransaction(write_options); + ASSERT_NE(txn, nullptr); + + ASSERT_OK(txn->Put("x", "y")); + + ASSERT_OK(txn->Commit()); + + delete txn; +} + +TEST_P(OptimisticTransactionTest, FlushTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + std::string value; + + ASSERT_OK(txn_db->Put(write_options, Slice("foo"), Slice("bar"))); + ASSERT_OK(txn_db->Put(write_options, Slice("foo2"), Slice("bar"))); + + Transaction* txn = txn_db->BeginTransaction(write_options); + ASSERT_NE(txn, nullptr); + + snapshot_read_options.snapshot = txn->GetSnapshot(); + + ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value)); + ASSERT_EQ(value, "bar"); + + ASSERT_OK(txn->Put(Slice("foo"), Slice("bar2"))); + + ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value)); + ASSERT_EQ(value, "bar2"); + + // Put a random key so we have a memtable to flush + ASSERT_OK(txn_db->Put(write_options, "dummy", "dummy")); + + // force a memtable flush + FlushOptions flush_ops; + ASSERT_OK(txn_db->Flush(flush_ops)); + + // txn should commit since the flushed table is still in MemtableList History + ASSERT_OK(txn->Commit()); + + ASSERT_OK(txn_db->Get(read_options, "foo", &value)); + ASSERT_EQ(value, "bar2"); + + delete txn; +} + +TEST_P(OptimisticTransactionTest, FlushTest2) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + std::string value; + + ASSERT_OK(txn_db->Put(write_options, Slice("foo"), Slice("bar"))); + ASSERT_OK(txn_db->Put(write_options, Slice("foo2"), Slice("bar"))); + + Transaction* txn = txn_db->BeginTransaction(write_options); + ASSERT_NE(txn, nullptr); + + snapshot_read_options.snapshot = txn->GetSnapshot(); + + ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value)); + ASSERT_EQ(value, "bar"); + + ASSERT_OK(txn->Put(Slice("foo"), Slice("bar2"))); + + ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value)); + ASSERT_EQ(value, "bar2"); + + // Put a random key so we have a MemTable to flush + ASSERT_OK(txn_db->Put(write_options, "dummy", "dummy")); + + // force a memtable flush + FlushOptions flush_ops; + ASSERT_OK(txn_db->Flush(flush_ops)); + + // Put a random key so we have a MemTable to flush + ASSERT_OK(txn_db->Put(write_options, "dummy", "dummy2")); + + // force a memtable flush + ASSERT_OK(txn_db->Flush(flush_ops)); + + ASSERT_OK(txn_db->Put(write_options, "dummy", "dummy3")); + + // force a memtable flush + // Since our test db has max_write_buffer_number=2, this flush will cause + // the first memtable to get purged from the MemtableList history. 
+ ASSERT_OK(txn_db->Flush(flush_ops)); + + Status s = txn->Commit(); + // txn should not commit since MemTableList History is not large enough + ASSERT_TRUE(s.IsTryAgain()); + + ASSERT_OK(txn_db->Get(read_options, "foo", &value)); + ASSERT_EQ(value, "bar"); + + delete txn; +} + +// Trigger the condition where some old memtables are skipped when doing +// TransactionUtil::CheckKey(), and make sure the result is still correct. +TEST_P(OptimisticTransactionTest, CheckKeySkipOldMemtable) { + const int kAttemptHistoryMemtable = 0; + const int kAttemptImmMemTable = 1; + for (int attempt = kAttemptHistoryMemtable; attempt <= kAttemptImmMemTable; + attempt++) { + Reopen(); + + WriteOptions write_options; + ReadOptions read_options; + ReadOptions snapshot_read_options; + ReadOptions snapshot_read_options2; + std::string value; + + ASSERT_OK(txn_db->Put(write_options, Slice("foo"), Slice("bar"))); + ASSERT_OK(txn_db->Put(write_options, Slice("foo2"), Slice("bar"))); + + Transaction* txn = txn_db->BeginTransaction(write_options); + ASSERT_TRUE(txn != nullptr); + + Transaction* txn2 = txn_db->BeginTransaction(write_options); + ASSERT_TRUE(txn2 != nullptr); + + snapshot_read_options.snapshot = txn->GetSnapshot(); + ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value)); + ASSERT_EQ(value, "bar"); + ASSERT_OK(txn->Put(Slice("foo"), Slice("bar2"))); + + snapshot_read_options2.snapshot = txn2->GetSnapshot(); + ASSERT_OK(txn2->GetForUpdate(snapshot_read_options2, "foo2", &value)); + ASSERT_EQ(value, "bar"); + ASSERT_OK(txn2->Put(Slice("foo2"), Slice("bar2"))); + + // txn updates "foo" and txn2 updates "foo2", and now a write is + // issued for "foo", which conflicts with txn but not txn2 + ASSERT_OK(txn_db->Put(write_options, "foo", "bar")); + + if (attempt == kAttemptImmMemTable) { + // For the second attempt, hold flush from beginning. The memtable + // will be switched to immutable after calling TEST_SwitchMemtable() + // while CheckKey() is called. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"OptimisticTransactionTest.CheckKeySkipOldMemtable", + "FlushJob::Start"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + } + + // force a memtable flush. The memtable should still be kept + FlushOptions flush_ops; + if (attempt == kAttemptHistoryMemtable) { + ASSERT_OK(txn_db->Flush(flush_ops)); + } else { + ASSERT_EQ(attempt, kAttemptImmMemTable); + DBImpl* db_impl = static_cast(txn_db->GetRootDB()); + ASSERT_OK(db_impl->TEST_SwitchMemtable()); + } + uint64_t num_imm_mems; + ASSERT_TRUE(txn_db->GetIntProperty(DB::Properties::kNumImmutableMemTable, + &num_imm_mems)); + if (attempt == kAttemptHistoryMemtable) { + ASSERT_EQ(0, num_imm_mems); + } else { + ASSERT_EQ(attempt, kAttemptImmMemTable); + ASSERT_EQ(1, num_imm_mems); + } + + // Put something in active memtable + ASSERT_OK(txn_db->Put(write_options, Slice("foo3"), Slice("bar"))); + + // Create txn3 after flushing, when this transaction is commited, + // only need to check the active memtable + Transaction* txn3 = txn_db->BeginTransaction(write_options); + ASSERT_TRUE(txn3 != nullptr); + + // Commit both of txn and txn2. txn will conflict but txn2 will + // pass. In both ways, both memtables are queried. 
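+    // The perf-context counters enabled next let the test verify exactly how
+    // many memtables were consulted by TransactionUtil::CheckKey() during
+    // each commit's conflict check.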
+ SetPerfLevel(PerfLevel::kEnableCount); + + get_perf_context()->Reset(); + Status s = txn->Commit(); + // We should have checked two memtables + ASSERT_EQ(2, get_perf_context()->get_from_memtable_count); + // txn should fail because of conflict, even if the memtable + // has flushed, because it is still preserved in history. + ASSERT_TRUE(s.IsBusy()); + + get_perf_context()->Reset(); + s = txn2->Commit(); + // We should have checked two memtables + ASSERT_EQ(2, get_perf_context()->get_from_memtable_count); + ASSERT_TRUE(s.ok()); + + ASSERT_OK(txn3->Put(Slice("foo2"), Slice("bar2"))); + get_perf_context()->Reset(); + s = txn3->Commit(); + // txn3 is created after the active memtable is created, so that is the only + // memtable to check. + ASSERT_EQ(1, get_perf_context()->get_from_memtable_count); + ASSERT_TRUE(s.ok()); + + TEST_SYNC_POINT("OptimisticTransactionTest.CheckKeySkipOldMemtable"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + SetPerfLevel(PerfLevel::kDisable); + + delete txn; + delete txn2; + delete txn3; + } +} + +TEST_P(OptimisticTransactionTest, NoSnapshotTest) { + WriteOptions write_options; + ReadOptions read_options; + std::string value; + + ASSERT_OK(txn_db->Put(write_options, "AAA", "bar")); + + Transaction* txn = txn_db->BeginTransaction(write_options); + ASSERT_NE(txn, nullptr); + + // Modify key after transaction start + ASSERT_OK(txn_db->Put(write_options, "AAA", "bar1")); + + // Read and write without a snapshot + ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value)); + ASSERT_EQ(value, "bar1"); + ASSERT_OK(txn->Put("AAA", "bar2")); + + // Should commit since read/write was done after data changed + ASSERT_OK(txn->Commit()); + + ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value)); + ASSERT_EQ(value, "bar2"); + + delete txn; +} + +TEST_P(OptimisticTransactionTest, MultipleSnapshotTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + std::string value; + + ASSERT_OK(txn_db->Put(write_options, "AAA", "bar")); + ASSERT_OK(txn_db->Put(write_options, "BBB", "bar")); + ASSERT_OK(txn_db->Put(write_options, "CCC", "bar")); + + Transaction* txn = txn_db->BeginTransaction(write_options); + ASSERT_NE(txn, nullptr); + + ASSERT_OK(txn_db->Put(write_options, "AAA", "bar1")); + + // Read and write without a snapshot + ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value)); + ASSERT_EQ(value, "bar1"); + ASSERT_OK(txn->Put("AAA", "bar2")); + + // Modify BBB before snapshot is taken + ASSERT_OK(txn_db->Put(write_options, "BBB", "bar1")); + + txn->SetSnapshot(); + snapshot_read_options.snapshot = txn->GetSnapshot(); + + // Read and write with snapshot + ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "BBB", &value)); + ASSERT_EQ(value, "bar1"); + ASSERT_OK(txn->Put("BBB", "bar2")); + + ASSERT_OK(txn_db->Put(write_options, "CCC", "bar1")); + + // Set a new snapshot + txn->SetSnapshot(); + snapshot_read_options.snapshot = txn->GetSnapshot(); + + // Read and write with snapshot + ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "CCC", &value)); + ASSERT_EQ(value, "bar1"); + ASSERT_OK(txn->Put("CCC", "bar2")); + + ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value)); + ASSERT_EQ(value, "bar2"); + ASSERT_OK(txn->GetForUpdate(read_options, "BBB", &value)); + ASSERT_EQ(value, "bar2"); + ASSERT_OK(txn->GetForUpdate(read_options, "CCC", &value)); + ASSERT_EQ(value, "bar2"); + + ASSERT_OK(txn_db->Get(read_options, "AAA", &value)); + ASSERT_EQ(value, "bar1"); + ASSERT_OK(txn_db->Get(read_options, "BBB", 
&value)); + ASSERT_EQ(value, "bar1"); + ASSERT_OK(txn_db->Get(read_options, "CCC", &value)); + ASSERT_EQ(value, "bar1"); + + ASSERT_OK(txn->Commit()); + + ASSERT_OK(txn_db->Get(read_options, "AAA", &value)); + ASSERT_EQ(value, "bar2"); + ASSERT_OK(txn_db->Get(read_options, "BBB", &value)); + ASSERT_EQ(value, "bar2"); + ASSERT_OK(txn_db->Get(read_options, "CCC", &value)); + ASSERT_EQ(value, "bar2"); + + // verify that we track multiple writes to the same key at different snapshots + delete txn; + txn = txn_db->BeginTransaction(write_options); + + // Potentially conflicting writes + ASSERT_OK(txn_db->Put(write_options, "ZZZ", "zzz")); + ASSERT_OK(txn_db->Put(write_options, "XXX", "xxx")); + + txn->SetSnapshot(); + + OptimisticTransactionOptions txn_options; + txn_options.set_snapshot = true; + Transaction* txn2 = txn_db->BeginTransaction(write_options, txn_options); + txn2->SetSnapshot(); + + // This should not conflict in txn since the snapshot is later than the + // previous write (spoiler alert: it will later conflict with txn2). + ASSERT_OK(txn->Put("ZZZ", "zzzz")); + ASSERT_OK(txn->Commit()); + + delete txn; + + // This will conflict since the snapshot is earlier than another write to ZZZ + ASSERT_OK(txn2->Put("ZZZ", "xxxxx")); + + Status s = txn2->Commit(); + ASSERT_TRUE(s.IsBusy()); + + delete txn2; +} + +TEST_P(OptimisticTransactionTest, ColumnFamiliesTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + OptimisticTransactionOptions txn_options; + std::string value; + + ColumnFamilyHandle *cfa, *cfb; + ColumnFamilyOptions cf_options; + + // Create 2 new column families + ASSERT_OK(txn_db->CreateColumnFamily(cf_options, "CFA", &cfa)); + ASSERT_OK(txn_db->CreateColumnFamily(cf_options, "CFB", &cfb)); + + delete cfa; + delete cfb; + delete txn_db; + txn_db = nullptr; + + // open DB with three column families + std::vector column_families; + // have to open default column family + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions())); + // open the new column families + column_families.push_back( + ColumnFamilyDescriptor("CFA", ColumnFamilyOptions())); + column_families.push_back( + ColumnFamilyDescriptor("CFB", ColumnFamilyOptions())); + std::vector handles; + ASSERT_OK(OptimisticTransactionDB::Open(options, dbname, column_families, + &handles, &txn_db)); + assert(txn_db != nullptr); + ASSERT_NE(txn_db, nullptr); + + Transaction* txn = txn_db->BeginTransaction(write_options); + ASSERT_NE(txn, nullptr); + + txn->SetSnapshot(); + snapshot_read_options.snapshot = txn->GetSnapshot(); + + txn_options.set_snapshot = true; + Transaction* txn2 = txn_db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn2); + + // Write some data to the db + WriteBatch batch; + ASSERT_OK(batch.Put("foo", "foo")); + ASSERT_OK(batch.Put(handles[1], "AAA", "bar")); + ASSERT_OK(batch.Put(handles[1], "AAAZZZ", "bar")); + ASSERT_OK(txn_db->Write(write_options, &batch)); + ASSERT_OK(txn_db->Delete(write_options, handles[1], "AAAZZZ")); + + // These keys do no conflict with existing writes since they're in + // different column families + ASSERT_OK(txn->Delete("AAA")); + Status s = + txn->GetForUpdate(snapshot_read_options, handles[1], "foo", &value); + ASSERT_TRUE(s.IsNotFound()); + Slice key_slice("AAAZZZ"); + Slice value_slices[2] = {Slice("bar"), Slice("bar")}; + ASSERT_OK(txn->Put(handles[2], SliceParts(&key_slice, 1), + SliceParts(value_slices, 2))); + + ASSERT_EQ(3, txn->GetNumKeys()); + + // Txn should commit + 
ASSERT_OK(txn->Commit()); + s = txn_db->Get(read_options, "AAA", &value); + ASSERT_TRUE(s.IsNotFound()); + s = txn_db->Get(read_options, handles[2], "AAAZZZ", &value); + ASSERT_EQ(value, "barbar"); + + Slice key_slices[3] = {Slice("AAA"), Slice("ZZ"), Slice("Z")}; + Slice value_slice("barbarbar"); + // This write will cause a conflict with the earlier batch write + ASSERT_OK(txn2->Put(handles[1], SliceParts(key_slices, 3), + SliceParts(&value_slice, 1))); + + ASSERT_OK(txn2->Delete(handles[2], "XXX")); + ASSERT_OK(txn2->Delete(handles[1], "XXX")); + s = txn2->GetForUpdate(snapshot_read_options, handles[1], "AAA", &value); + ASSERT_TRUE(s.IsNotFound()); + + // Verify txn did not commit + s = txn2->Commit(); + ASSERT_TRUE(s.IsBusy()); + s = txn_db->Get(read_options, handles[1], "AAAZZZ", &value); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_EQ(value, "barbar"); + + delete txn; + delete txn2; + + txn = txn_db->BeginTransaction(write_options, txn_options); + snapshot_read_options.snapshot = txn->GetSnapshot(); + + txn2 = txn_db->BeginTransaction(write_options, txn_options); + ASSERT_NE(txn, nullptr); + + std::vector multiget_cfh = {handles[1], handles[2], + handles[0], handles[2]}; + std::vector multiget_keys = {"AAA", "AAAZZZ", "foo", "foo"}; + std::vector values(4); + + std::vector results = txn->MultiGetForUpdate( + snapshot_read_options, multiget_cfh, multiget_keys, &values); + ASSERT_OK(results[0]); + ASSERT_OK(results[1]); + ASSERT_OK(results[2]); + ASSERT_TRUE(results[3].IsNotFound()); + ASSERT_EQ(values[0], "bar"); + ASSERT_EQ(values[1], "barbar"); + ASSERT_EQ(values[2], "foo"); + + ASSERT_OK(txn->Delete(handles[2], "ZZZ")); + ASSERT_OK(txn->Put(handles[2], "ZZZ", "YYY")); + ASSERT_OK(txn->Put(handles[2], "ZZZ", "YYYY")); + ASSERT_OK(txn->Delete(handles[2], "ZZZ")); + ASSERT_OK(txn->Put(handles[2], "AAAZZZ", "barbarbar")); + + ASSERT_EQ(5, txn->GetNumKeys()); + + // Txn should commit + ASSERT_OK(txn->Commit()); + s = txn_db->Get(read_options, handles[2], "ZZZ", &value); + ASSERT_TRUE(s.IsNotFound()); + + // Put a key which will conflict with the next txn using the previous snapshot + ASSERT_OK(txn_db->Put(write_options, handles[2], "foo", "000")); + + results = txn2->MultiGetForUpdate(snapshot_read_options, multiget_cfh, + multiget_keys, &values); + ASSERT_OK(results[0]); + ASSERT_OK(results[1]); + ASSERT_OK(results[2]); + ASSERT_TRUE(results[3].IsNotFound()); + ASSERT_EQ(values[0], "bar"); + ASSERT_EQ(values[1], "barbar"); + ASSERT_EQ(values[2], "foo"); + + // Verify Txn Did not Commit + s = txn2->Commit(); + ASSERT_TRUE(s.IsBusy()); + + s = txn_db->DropColumnFamily(handles[1]); + ASSERT_OK(s); + s = txn_db->DropColumnFamily(handles[2]); + ASSERT_OK(s); + + delete txn; + delete txn2; + + for (auto handle : handles) { + delete handle; + } +} + +TEST_P(OptimisticTransactionTest, EmptyTest) { + WriteOptions write_options; + ReadOptions read_options; + std::string value; + + ASSERT_OK(txn_db->Put(write_options, "aaa", "aaa")); + + Transaction* txn = txn_db->BeginTransaction(write_options); + ASSERT_OK(txn->Commit()); + delete txn; + + txn = txn_db->BeginTransaction(write_options); + ASSERT_OK(txn->Rollback()); + delete txn; + + txn = txn_db->BeginTransaction(write_options); + ASSERT_OK(txn->GetForUpdate(read_options, "aaa", &value)); + ASSERT_EQ(value, "aaa"); + + ASSERT_OK(txn->Commit()); + delete txn; + + txn = txn_db->BeginTransaction(write_options); + txn->SetSnapshot(); + ASSERT_OK(txn->GetForUpdate(read_options, "aaa", &value)); + ASSERT_EQ(value, "aaa"); + + 
ASSERT_OK(txn_db->Put(write_options, "aaa", "xxx")); + Status s = txn->Commit(); + ASSERT_TRUE(s.IsBusy()); + delete txn; +} + +TEST_P(OptimisticTransactionTest, PredicateManyPreceders) { + WriteOptions write_options; + ReadOptions read_options1, read_options2; + OptimisticTransactionOptions txn_options; + std::string value; + + txn_options.set_snapshot = true; + Transaction* txn1 = txn_db->BeginTransaction(write_options, txn_options); + read_options1.snapshot = txn1->GetSnapshot(); + + Transaction* txn2 = txn_db->BeginTransaction(write_options); + txn2->SetSnapshot(); + read_options2.snapshot = txn2->GetSnapshot(); + + std::vector multiget_keys = {"1", "2", "3"}; + std::vector multiget_values; + + std::vector results = + txn1->MultiGetForUpdate(read_options1, multiget_keys, &multiget_values); + ASSERT_TRUE(results[0].IsNotFound()); + ASSERT_TRUE(results[1].IsNotFound()); + ASSERT_TRUE(results[2].IsNotFound()); + + ASSERT_OK(txn2->Put("2", "x")); + + ASSERT_OK(txn2->Commit()); + + multiget_values.clear(); + results = + txn1->MultiGetForUpdate(read_options1, multiget_keys, &multiget_values); + ASSERT_TRUE(results[0].IsNotFound()); + ASSERT_TRUE(results[1].IsNotFound()); + ASSERT_TRUE(results[2].IsNotFound()); + + // should not commit since txn2 wrote a key txn has read + Status s = txn1->Commit(); + ASSERT_TRUE(s.IsBusy()); + + delete txn1; + delete txn2; + + txn1 = txn_db->BeginTransaction(write_options, txn_options); + read_options1.snapshot = txn1->GetSnapshot(); + + txn2 = txn_db->BeginTransaction(write_options, txn_options); + read_options2.snapshot = txn2->GetSnapshot(); + + ASSERT_OK(txn1->Put("4", "x")); + + ASSERT_OK(txn2->Delete("4")); + + // txn1 can commit since txn2's delete hasn't happened yet (it's just batched) + ASSERT_OK(txn1->Commit()); + + s = txn2->GetForUpdate(read_options2, "4", &value); + ASSERT_TRUE(s.IsNotFound()); + + // txn2 cannot commit since txn1 changed "4" + s = txn2->Commit(); + ASSERT_TRUE(s.IsBusy()); + + delete txn1; + delete txn2; +} + +TEST_P(OptimisticTransactionTest, LostUpdate) { + WriteOptions write_options; + ReadOptions read_options, read_options1, read_options2; + OptimisticTransactionOptions txn_options; + std::string value; + + // Test 2 transactions writing to the same key in multiple orders and + // with/without snapshots + + Transaction* txn1 = txn_db->BeginTransaction(write_options); + Transaction* txn2 = txn_db->BeginTransaction(write_options); + + ASSERT_OK(txn1->Put("1", "1")); + ASSERT_OK(txn2->Put("1", "2")); + + ASSERT_OK(txn1->Commit()); + + Status s = txn2->Commit(); + ASSERT_TRUE(s.IsBusy()); + + delete txn1; + delete txn2; + + txn_options.set_snapshot = true; + txn1 = txn_db->BeginTransaction(write_options, txn_options); + read_options1.snapshot = txn1->GetSnapshot(); + + txn2 = txn_db->BeginTransaction(write_options, txn_options); + read_options2.snapshot = txn2->GetSnapshot(); + + ASSERT_OK(txn1->Put("1", "3")); + ASSERT_OK(txn2->Put("1", "4")); + + ASSERT_OK(txn1->Commit()); + + s = txn2->Commit(); + ASSERT_TRUE(s.IsBusy()); + + delete txn1; + delete txn2; + + txn1 = txn_db->BeginTransaction(write_options, txn_options); + read_options1.snapshot = txn1->GetSnapshot(); + + txn2 = txn_db->BeginTransaction(write_options, txn_options); + read_options2.snapshot = txn2->GetSnapshot(); + + ASSERT_OK(txn1->Put("1", "5")); + ASSERT_OK(txn1->Commit()); + + ASSERT_OK(txn2->Put("1", "6")); + s = txn2->Commit(); + ASSERT_TRUE(s.IsBusy()); + + delete txn1; + delete txn2; + + txn1 = txn_db->BeginTransaction(write_options, txn_options); + 
read_options1.snapshot = txn1->GetSnapshot(); + + txn2 = txn_db->BeginTransaction(write_options, txn_options); + read_options2.snapshot = txn2->GetSnapshot(); + + ASSERT_OK(txn1->Put("1", "5")); + ASSERT_OK(txn1->Commit()); + + txn2->SetSnapshot(); + ASSERT_OK(txn2->Put("1", "6")); + ASSERT_OK(txn2->Commit()); + + delete txn1; + delete txn2; + + txn1 = txn_db->BeginTransaction(write_options); + txn2 = txn_db->BeginTransaction(write_options); + + ASSERT_OK(txn1->Put("1", "7")); + ASSERT_OK(txn1->Commit()); + + ASSERT_OK(txn2->Put("1", "8")); + ASSERT_OK(txn2->Commit()); + + delete txn1; + delete txn2; + + ASSERT_OK(txn_db->Get(read_options, "1", &value)); + ASSERT_EQ(value, "8"); +} + +TEST_P(OptimisticTransactionTest, UntrackedWrites) { + WriteOptions write_options; + ReadOptions read_options; + std::string value; + Status s; + + // Verify transaction rollback works for untracked keys. + Transaction* txn = txn_db->BeginTransaction(write_options); + ASSERT_OK(txn->PutUntracked("untracked", "0")); + ASSERT_OK(txn->Rollback()); + s = txn_db->Get(read_options, "untracked", &value); + ASSERT_TRUE(s.IsNotFound()); + + delete txn; + txn = txn_db->BeginTransaction(write_options); + + ASSERT_OK(txn->Put("tracked", "1")); + ASSERT_OK(txn->PutUntracked("untracked", "1")); + ASSERT_OK(txn->MergeUntracked("untracked", "2")); + ASSERT_OK(txn->DeleteUntracked("untracked")); + + // Write to the untracked key outside of the transaction and verify + // it doesn't prevent the transaction from committing. + ASSERT_OK(txn_db->Put(write_options, "untracked", "x")); + + ASSERT_OK(txn->Commit()); + + s = txn_db->Get(read_options, "untracked", &value); + ASSERT_TRUE(s.IsNotFound()); + + delete txn; + txn = txn_db->BeginTransaction(write_options); + + ASSERT_OK(txn->Put("tracked", "10")); + ASSERT_OK(txn->PutUntracked("untracked", "A")); + + // Write to tracked key outside of the transaction and verify that the + // untracked keys are not written when the commit fails. 
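+  // (A failed optimistic commit discards the whole write batch, so the
+  // untracked Put above is dropped together with the tracked one.)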
+ ASSERT_OK(txn_db->Delete(write_options, "tracked")); + + s = txn->Commit(); + ASSERT_TRUE(s.IsBusy()); + + s = txn_db->Get(read_options, "untracked", &value); + ASSERT_TRUE(s.IsNotFound()); + + delete txn; +} + +TEST_P(OptimisticTransactionTest, IteratorTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + OptimisticTransactionOptions txn_options; + std::string value; + + // Write some keys to the db + ASSERT_OK(txn_db->Put(write_options, "A", "a")); + ASSERT_OK(txn_db->Put(write_options, "G", "g")); + ASSERT_OK(txn_db->Put(write_options, "F", "f")); + ASSERT_OK(txn_db->Put(write_options, "C", "c")); + ASSERT_OK(txn_db->Put(write_options, "D", "d")); + + Transaction* txn = txn_db->BeginTransaction(write_options); + ASSERT_NE(txn, nullptr); + + // Write some keys in a txn + ASSERT_OK(txn->Put("B", "b")); + ASSERT_OK(txn->Put("H", "h")); + ASSERT_OK(txn->Delete("D")); + ASSERT_OK(txn->Put("E", "e")); + + txn->SetSnapshot(); + const Snapshot* snapshot = txn->GetSnapshot(); + + // Write some keys to the db after the snapshot + ASSERT_OK(txn_db->Put(write_options, "BB", "xx")); + ASSERT_OK(txn_db->Put(write_options, "C", "xx")); + + read_options.snapshot = snapshot; + Iterator* iter = txn->GetIterator(read_options); + ASSERT_OK(iter->status()); + iter->SeekToFirst(); + + // Read all keys via iter and lock them all + std::string results[] = {"a", "b", "c", "e", "f", "g", "h"}; + for (int i = 0; i < 7; i++) { + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(results[i], iter->value().ToString()); + + ASSERT_OK(txn->GetForUpdate(read_options, iter->key(), nullptr)); + + iter->Next(); + } + ASSERT_FALSE(iter->Valid()); + + iter->Seek("G"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("g", iter->value().ToString()); + + iter->Prev(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("f", iter->value().ToString()); + + iter->Seek("D"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("e", iter->value().ToString()); + + iter->Seek("C"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("c", iter->value().ToString()); + + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("e", iter->value().ToString()); + + iter->Seek(""); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a", iter->value().ToString()); + + iter->Seek("X"); + ASSERT_OK(iter->status()); + ASSERT_FALSE(iter->Valid()); + + iter->SeekToLast(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("h", iter->value().ToString()); + + // key "C" was modified in the db after txn's snapshot. txn will not commit. + Status s = txn->Commit(); + ASSERT_TRUE(s.IsBusy()); + + delete iter; + delete txn; +} + +TEST_P(OptimisticTransactionTest, DeleteRangeSupportTest) { + // `OptimisticTransactionDB` does not allow range deletion in any API. 
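+  // Illustrative sketch only (not exercised by this test): callers that need
+  // range deletion under optimistic concurrency control can fall back to
+  // deleting key-by-key inside a transaction, so every key stays
+  // conflict-checked:
+  //
+  //   Transaction* txn = txn_db->BeginTransaction(WriteOptions());
+  //   std::vector<std::string> keys_in_range;
+  //   std::unique_ptr<Iterator> it(txn->GetIterator(ReadOptions()));
+  //   for (it->Seek("a"); it->Valid() && it->key().compare("b") < 0;
+  //        it->Next()) {
+  //     keys_in_range.push_back(it->key().ToString());
+  //   }
+  //   for (const auto& k : keys_in_range) {
+  //     ASSERT_OK(txn->Delete(k));
+  //   }
+  //   Status commit_status = txn->Commit();  // Busy() if a key changed
+  //   delete txn;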
+ ASSERT_TRUE( + txn_db + ->DeleteRange(WriteOptions(), txn_db->DefaultColumnFamily(), "a", "b") + .IsNotSupported()); + WriteBatch wb; + ASSERT_OK(wb.DeleteRange("a", "b")); + ASSERT_NOK(txn_db->Write(WriteOptions(), &wb)); +} + +TEST_P(OptimisticTransactionTest, SavepointTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + OptimisticTransactionOptions txn_options; + std::string value; + + Transaction* txn = txn_db->BeginTransaction(write_options); + ASSERT_NE(txn, nullptr); + + Status s = txn->RollbackToSavePoint(); + ASSERT_TRUE(s.IsNotFound()); + + txn->SetSavePoint(); // 1 + + ASSERT_OK(txn->RollbackToSavePoint()); // Rollback to beginning of txn + s = txn->RollbackToSavePoint(); + ASSERT_TRUE(s.IsNotFound()); + + ASSERT_OK(txn->Put("B", "b")); + + ASSERT_OK(txn->Commit()); + + ASSERT_OK(txn_db->Get(read_options, "B", &value)); + ASSERT_EQ("b", value); + + delete txn; + txn = txn_db->BeginTransaction(write_options); + ASSERT_NE(txn, nullptr); + + ASSERT_OK(txn->Put("A", "a")); + ASSERT_OK(txn->Put("B", "bb")); + ASSERT_OK(txn->Put("C", "c")); + + txn->SetSavePoint(); // 2 + + ASSERT_OK(txn->Delete("B")); + ASSERT_OK(txn->Put("C", "cc")); + ASSERT_OK(txn->Put("D", "d")); + + ASSERT_OK(txn->RollbackToSavePoint()); // Rollback to 2 + + ASSERT_OK(txn->Get(read_options, "A", &value)); + ASSERT_EQ("a", value); + ASSERT_OK(txn->Get(read_options, "B", &value)); + ASSERT_EQ("bb", value); + ASSERT_OK(txn->Get(read_options, "C", &value)); + ASSERT_EQ("c", value); + s = txn->Get(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + + ASSERT_OK(txn->Put("A", "a")); + ASSERT_OK(txn->Put("E", "e")); + + // Rollback to beginning of txn + s = txn->RollbackToSavePoint(); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_OK(txn->Rollback()); + + s = txn->Get(read_options, "A", &value); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_OK(txn->Get(read_options, "B", &value)); + ASSERT_EQ("b", value); + s = txn->Get(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + s = txn->Get(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + s = txn->Get(read_options, "E", &value); + ASSERT_TRUE(s.IsNotFound()); + + ASSERT_OK(txn->Put("A", "aa")); + ASSERT_OK(txn->Put("F", "f")); + + txn->SetSavePoint(); // 3 + txn->SetSavePoint(); // 4 + + ASSERT_OK(txn->Put("G", "g")); + ASSERT_OK(txn->Delete("F")); + ASSERT_OK(txn->Delete("B")); + + ASSERT_OK(txn->Get(read_options, "A", &value)); + ASSERT_EQ("aa", value); + + s = txn->Get(read_options, "F", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Get(read_options, "B", &value); + ASSERT_TRUE(s.IsNotFound()); + + ASSERT_OK(txn->RollbackToSavePoint()); // Rollback to 3 + + ASSERT_OK(txn->Get(read_options, "F", &value)); + ASSERT_EQ("f", value); + + s = txn->Get(read_options, "G", &value); + ASSERT_TRUE(s.IsNotFound()); + + ASSERT_OK(txn->Commit()); + + ASSERT_OK(txn_db->Get(read_options, "F", &value)); + ASSERT_EQ("f", value); + + s = txn_db->Get(read_options, "G", &value); + ASSERT_TRUE(s.IsNotFound()); + + ASSERT_OK(txn_db->Get(read_options, "A", &value)); + ASSERT_EQ("aa", value); + + ASSERT_OK(txn_db->Get(read_options, "B", &value)); + ASSERT_EQ("b", value); + + s = txn_db->Get(read_options, "C", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn_db->Get(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn_db->Get(read_options, "E", &value); + ASSERT_TRUE(s.IsNotFound()); + + delete txn; +} + +TEST_P(OptimisticTransactionTest, UndoGetForUpdateTest) { + WriteOptions write_options; + ReadOptions 
read_options, snapshot_read_options; + OptimisticTransactionOptions txn_options; + std::string value; + + ASSERT_OK(txn_db->Put(write_options, "A", "")); + + Transaction* txn1 = txn_db->BeginTransaction(write_options); + ASSERT_TRUE(txn1); + + ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value)); + + txn1->UndoGetForUpdate("A"); + + Transaction* txn2 = txn_db->BeginTransaction(write_options); + txn2->Put("A", "x"); + ASSERT_OK(txn2->Commit()); + delete txn2; + + // Verify that txn1 can commit since A isn't conflict checked + ASSERT_OK(txn1->Commit()); + delete txn1; + + txn1 = txn_db->BeginTransaction(write_options); + ASSERT_OK(txn1->Put("A", "a")); + + ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value)); + + txn1->UndoGetForUpdate("A"); + + txn2 = txn_db->BeginTransaction(write_options); + ASSERT_OK(txn2->Put("A", "x")); + ASSERT_OK(txn2->Commit()); + delete txn2; + + // Verify that txn1 cannot commit since A will still be conflict checked + Status s = txn1->Commit(); + ASSERT_TRUE(s.IsBusy()); + delete txn1; + + txn1 = txn_db->BeginTransaction(write_options); + + ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value)); + ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value)); + + txn1->UndoGetForUpdate("A"); + + txn2 = txn_db->BeginTransaction(write_options); + ASSERT_OK(txn2->Put("A", "x")); + ASSERT_OK(txn2->Commit()); + delete txn2; + + // Verify that txn1 cannot commit since A will still be conflict checked + s = txn1->Commit(); + ASSERT_TRUE(s.IsBusy()); + delete txn1; + + txn1 = txn_db->BeginTransaction(write_options); + + ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value)); + ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value)); + + txn1->UndoGetForUpdate("A"); + txn1->UndoGetForUpdate("A"); + + txn2 = txn_db->BeginTransaction(write_options); + ASSERT_OK(txn2->Put("A", "x")); + ASSERT_OK(txn2->Commit()); + delete txn2; + + // Verify that txn1 can commit since A isn't conflict checked + ASSERT_OK(txn1->Commit()); + delete txn1; + + txn1 = txn_db->BeginTransaction(write_options); + + ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value)); + + txn1->SetSavePoint(); + txn1->UndoGetForUpdate("A"); + + txn2 = txn_db->BeginTransaction(write_options); + ASSERT_OK(txn2->Put("A", "x")); + ASSERT_OK(txn2->Commit()); + delete txn2; + + // Verify that txn1 cannot commit since A will still be conflict checked + s = txn1->Commit(); + ASSERT_TRUE(s.IsBusy()); + delete txn1; + + txn1 = txn_db->BeginTransaction(write_options); + + ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value)); + + txn1->SetSavePoint(); + ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value)); + txn1->UndoGetForUpdate("A"); + + txn2 = txn_db->BeginTransaction(write_options); + ASSERT_OK(txn2->Put("A", "x")); + ASSERT_OK(txn2->Commit()); + delete txn2; + + // Verify that txn1 cannot commit since A will still be conflict checked + s = txn1->Commit(); + ASSERT_TRUE(s.IsBusy()); + delete txn1; + + txn1 = txn_db->BeginTransaction(write_options); + + ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value)); + + txn1->SetSavePoint(); + ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value)); + txn1->UndoGetForUpdate("A"); + + ASSERT_OK(txn1->RollbackToSavePoint()); + txn1->UndoGetForUpdate("A"); + + txn2 = txn_db->BeginTransaction(write_options); + ASSERT_OK(txn2->Put("A", "x")); + ASSERT_OK(txn2->Commit()); + delete txn2; + + // Verify that txn1 can commit since A isn't conflict checked + ASSERT_OK(txn1->Commit()); + delete txn1; +} + +namespace { +Status 
OptimisticTransactionStressTestInserter(OptimisticTransactionDB* db, + const size_t num_transactions, + const size_t num_sets, + const size_t num_keys_per_set) { + size_t seed = std::hash()(std::this_thread::get_id()); + Random64 _rand(seed); + WriteOptions write_options; + ReadOptions read_options; + OptimisticTransactionOptions txn_options; + txn_options.set_snapshot = true; + + RandomTransactionInserter inserter(&_rand, write_options, read_options, + num_keys_per_set, + static_cast(num_sets)); + + for (size_t t = 0; t < num_transactions; t++) { + bool success = inserter.OptimisticTransactionDBInsert(db, txn_options); + if (!success) { + // unexpected failure + return inserter.GetLastStatus(); + } + } + + inserter.GetLastStatus().PermitUncheckedError(); + + // Make sure at least some of the transactions succeeded. It's ok if + // some failed due to write-conflicts. + if (inserter.GetFailureCount() > num_transactions / 2) { + return Status::TryAgain("Too many transactions failed! " + + std::to_string(inserter.GetFailureCount()) + " / " + + std::to_string(num_transactions)); + } + + return Status::OK(); +} +} // namespace + +TEST_P(OptimisticTransactionTest, OptimisticTransactionStressTest) { + const size_t num_threads = 4; + const size_t num_transactions_per_thread = 10000; + const size_t num_sets = 3; + const size_t num_keys_per_set = 100; + // Setting the key-space to be 100 keys should cause enough write-conflicts + // to make this test interesting. + + std::vector threads; + + std::function call_inserter = [&] { + ASSERT_OK(OptimisticTransactionStressTestInserter( + txn_db, num_transactions_per_thread, num_sets, num_keys_per_set)); + }; + + // Create N threads that use RandomTransactionInserter to write + // many transactions. + for (uint32_t i = 0; i < num_threads; i++) { + threads.emplace_back(call_inserter); + } + + // Wait for all threads to run + for (auto& t : threads) { + t.join(); + } + + // Verify that data is consistent + Status s = RandomTransactionInserter::Verify(txn_db, num_sets); + ASSERT_OK(s); +} + +TEST_P(OptimisticTransactionTest, SequenceNumberAfterRecoverTest) { + WriteOptions write_options; + OptimisticTransactionOptions transaction_options; + + Transaction* transaction( + txn_db->BeginTransaction(write_options, transaction_options)); + Status s = transaction->Put("foo", "val"); + ASSERT_OK(s); + s = transaction->Put("foo2", "val"); + ASSERT_OK(s); + s = transaction->Put("foo3", "val"); + ASSERT_OK(s); + s = transaction->Commit(); + ASSERT_OK(s); + delete transaction; + + Reopen(); + transaction = txn_db->BeginTransaction(write_options, transaction_options); + s = transaction->Put("bar", "val"); + ASSERT_OK(s); + s = transaction->Put("bar2", "val"); + ASSERT_OK(s); + s = transaction->Commit(); + ASSERT_OK(s); + + delete transaction; +} + +TEST_P(OptimisticTransactionTest, TimestampedSnapshotMissingCommitTs) { + std::unique_ptr txn(txn_db->BeginTransaction(WriteOptions())); + ASSERT_OK(txn->Put("a", "v")); + Status s = txn->CommitAndTryCreateSnapshot(); + ASSERT_TRUE(s.IsInvalidArgument()); +} + +TEST_P(OptimisticTransactionTest, TimestampedSnapshotSetCommitTs) { + std::unique_ptr txn(txn_db->BeginTransaction(WriteOptions())); + ASSERT_OK(txn->Put("a", "v")); + std::shared_ptr snapshot; + Status s = txn->CommitAndTryCreateSnapshot(nullptr, /*ts=*/100, &snapshot); + ASSERT_TRUE(s.IsNotSupported()); +} + +INSTANTIATE_TEST_CASE_P( + InstanceOccGroup, OptimisticTransactionTest, + testing::Values(OccValidationPolicy::kValidateSerial, + 
OccValidationPolicy::kValidateParallel)); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf( + stderr, + "SKIPPED as optimistic_transaction is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/pessimistic_transaction.cc b/src/rocksdb/utilities/transactions/pessimistic_transaction.cc new file mode 100644 index 000000000..cb8fd3bb6 --- /dev/null +++ b/src/rocksdb/utilities/transactions/pessimistic_transaction.cc @@ -0,0 +1,1175 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "utilities/transactions/pessimistic_transaction.h" + +#include +#include +#include +#include + +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" +#include "logging/logging.h" +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/snapshot.h" +#include "rocksdb/status.h" +#include "rocksdb/utilities/transaction_db.h" +#include "test_util/sync_point.h" +#include "util/cast_util.h" +#include "util/string_util.h" +#include "utilities/transactions/pessimistic_transaction_db.h" +#include "utilities/transactions/transaction_util.h" +#include "utilities/write_batch_with_index/write_batch_with_index_internal.h" + +namespace ROCKSDB_NAMESPACE { + +struct WriteOptions; + +std::atomic PessimisticTransaction::txn_id_counter_(1); + +TransactionID PessimisticTransaction::GenTxnID() { + return txn_id_counter_.fetch_add(1); +} + +PessimisticTransaction::PessimisticTransaction( + TransactionDB* txn_db, const WriteOptions& write_options, + const TransactionOptions& txn_options, const bool init) + : TransactionBaseImpl( + txn_db->GetRootDB(), write_options, + static_cast_with_check(txn_db) + ->GetLockTrackerFactory()), + txn_db_impl_(nullptr), + expiration_time_(0), + txn_id_(0), + waiting_cf_id_(0), + waiting_key_(nullptr), + lock_timeout_(0), + deadlock_detect_(false), + deadlock_detect_depth_(0), + skip_concurrency_control_(false) { + txn_db_impl_ = static_cast_with_check(txn_db); + db_impl_ = static_cast_with_check(db_); + if (init) { + Initialize(txn_options); + } +} + +void PessimisticTransaction::Initialize(const TransactionOptions& txn_options) { + // Range lock manager uses address of transaction object as TXNID + const TransactionDBOptions& db_options = txn_db_impl_->GetTxnDBOptions(); + if (db_options.lock_mgr_handle && + db_options.lock_mgr_handle->getLockManager()->IsRangeLockSupported()) { + txn_id_ = reinterpret_cast(this); + } else { + txn_id_ = GenTxnID(); + } + + txn_state_ = STARTED; + + deadlock_detect_ = txn_options.deadlock_detect; + deadlock_detect_depth_ = txn_options.deadlock_detect_depth; + write_batch_.SetMaxBytes(txn_options.max_write_batch_size); + skip_concurrency_control_ = txn_options.skip_concurrency_control; + + lock_timeout_ = txn_options.lock_timeout * 1000; + if (lock_timeout_ < 0) { + // Lock timeout not set, use default + lock_timeout_ = + txn_db_impl_->GetTxnDBOptions().transaction_lock_timeout * 1000; + } + + if (txn_options.expiration >= 0) { + expiration_time_ = start_time_ + txn_options.expiration * 1000; + } else { + 
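+    // A negative expiration in TransactionOptions means the transaction
+    // never expires.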
expiration_time_ = 0; + } + + if (txn_options.set_snapshot) { + SetSnapshot(); + } + + if (expiration_time_ > 0) { + txn_db_impl_->InsertExpirableTransaction(txn_id_, this); + } + use_only_the_last_commit_time_batch_for_recovery_ = + txn_options.use_only_the_last_commit_time_batch_for_recovery; + skip_prepare_ = txn_options.skip_prepare; + + read_timestamp_ = kMaxTxnTimestamp; + commit_timestamp_ = kMaxTxnTimestamp; +} + +PessimisticTransaction::~PessimisticTransaction() { + txn_db_impl_->UnLock(this, *tracked_locks_); + if (expiration_time_ > 0) { + txn_db_impl_->RemoveExpirableTransaction(txn_id_); + } + if (!name_.empty() && txn_state_ != COMMITTED) { + txn_db_impl_->UnregisterTransaction(this); + } +} + +void PessimisticTransaction::Clear() { + txn_db_impl_->UnLock(this, *tracked_locks_); + TransactionBaseImpl::Clear(); +} + +void PessimisticTransaction::Reinitialize( + TransactionDB* txn_db, const WriteOptions& write_options, + const TransactionOptions& txn_options) { + if (!name_.empty() && txn_state_ != COMMITTED) { + txn_db_impl_->UnregisterTransaction(this); + } + TransactionBaseImpl::Reinitialize(txn_db->GetRootDB(), write_options); + Initialize(txn_options); +} + +bool PessimisticTransaction::IsExpired() const { + if (expiration_time_ > 0) { + if (dbimpl_->GetSystemClock()->NowMicros() >= expiration_time_) { + // Transaction is expired. + return true; + } + } + + return false; +} + +WriteCommittedTxn::WriteCommittedTxn(TransactionDB* txn_db, + const WriteOptions& write_options, + const TransactionOptions& txn_options) + : PessimisticTransaction(txn_db, write_options, txn_options) {} + +Status WriteCommittedTxn::GetForUpdate(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, std::string* value, + bool exclusive, const bool do_validate) { + return GetForUpdateImpl(read_options, column_family, key, value, exclusive, + do_validate); +} + +Status WriteCommittedTxn::GetForUpdate(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, + PinnableSlice* pinnable_val, + bool exclusive, const bool do_validate) { + return GetForUpdateImpl(read_options, column_family, key, pinnable_val, + exclusive, do_validate); +} + +template +inline Status WriteCommittedTxn::GetForUpdateImpl( + const ReadOptions& read_options, ColumnFamilyHandle* column_family, + const Slice& key, TValue* value, bool exclusive, const bool do_validate) { + column_family = + column_family ? 
column_family : db_impl_->DefaultColumnFamily(); + assert(column_family); + if (!read_options.timestamp) { + const Comparator* const ucmp = column_family->GetComparator(); + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); + if (0 == ts_sz) { + return TransactionBaseImpl::GetForUpdate(read_options, column_family, key, + value, exclusive, do_validate); + } + } else { + Status s = db_impl_->FailIfTsMismatchCf( + column_family, *(read_options.timestamp), /*ts_for_read=*/true); + if (!s.ok()) { + return s; + } + } + + if (!do_validate) { + return Status::InvalidArgument( + "If do_validate is false then GetForUpdate with read_timestamp is not " + "defined."); + } else if (kMaxTxnTimestamp == read_timestamp_) { + return Status::InvalidArgument("read_timestamp must be set for validation"); + } + + if (!read_options.timestamp) { + ReadOptions read_opts_copy = read_options; + char ts_buf[sizeof(kMaxTxnTimestamp)]; + EncodeFixed64(ts_buf, read_timestamp_); + Slice ts(ts_buf, sizeof(ts_buf)); + read_opts_copy.timestamp = &ts; + return TransactionBaseImpl::GetForUpdate(read_opts_copy, column_family, key, + value, exclusive, do_validate); + } + assert(read_options.timestamp); + const char* const ts_buf = read_options.timestamp->data(); + assert(read_options.timestamp->size() == sizeof(kMaxTxnTimestamp)); + TxnTimestamp ts = DecodeFixed64(ts_buf); + if (ts != read_timestamp_) { + return Status::InvalidArgument("Must read from the same read_timestamp"); + } + return TransactionBaseImpl::GetForUpdate(read_options, column_family, key, + value, exclusive, do_validate); +} + +Status WriteCommittedTxn::Put(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + return Operate(column_family, key, do_validate, assume_tracked, + [column_family, &key, &value, this]() { + Status s = + GetBatchForWrite()->Put(column_family, key, value); + if (s.ok()) { + ++num_puts_; + } + return s; + }); +} + +Status WriteCommittedTxn::Put(ColumnFamilyHandle* column_family, + const SliceParts& key, const SliceParts& value, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + return Operate(column_family, key, do_validate, assume_tracked, + [column_family, &key, &value, this]() { + Status s = + GetBatchForWrite()->Put(column_family, key, value); + if (s.ok()) { + ++num_puts_; + } + return s; + }); +} + +Status WriteCommittedTxn::PutUntracked(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) { + return Operate( + column_family, key, /*do_validate=*/false, + /*assume_tracked=*/false, [column_family, &key, &value, this]() { + Status s = GetBatchForWrite()->Put(column_family, key, value); + if (s.ok()) { + ++num_puts_; + } + return s; + }); +} + +Status WriteCommittedTxn::PutUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key, + const SliceParts& value) { + return Operate( + column_family, key, /*do_validate=*/false, + /*assume_tracked=*/false, [column_family, &key, &value, this]() { + Status s = GetBatchForWrite()->Put(column_family, key, value); + if (s.ok()) { + ++num_puts_; + } + return s; + }); +} + +Status WriteCommittedTxn::Delete(ColumnFamilyHandle* column_family, + const Slice& key, const bool assume_tracked) { + const bool do_validate = !assume_tracked; + return Operate(column_family, key, do_validate, assume_tracked, + [column_family, &key, this]() { + Status s = GetBatchForWrite()->Delete(column_family, key); + if (s.ok()) { + ++num_deletes_; + } + return s; 
+ }); +} + +Status WriteCommittedTxn::Delete(ColumnFamilyHandle* column_family, + const SliceParts& key, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + return Operate(column_family, key, do_validate, assume_tracked, + [column_family, &key, this]() { + Status s = GetBatchForWrite()->Delete(column_family, key); + if (s.ok()) { + ++num_deletes_; + } + return s; + }); +} + +Status WriteCommittedTxn::DeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) { + return Operate(column_family, key, /*do_validate=*/false, + /*assume_tracked=*/false, [column_family, &key, this]() { + Status s = GetBatchForWrite()->Delete(column_family, key); + if (s.ok()) { + ++num_deletes_; + } + return s; + }); +} + +Status WriteCommittedTxn::DeleteUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key) { + return Operate(column_family, key, /*do_validate=*/false, + /*assume_tracked=*/false, [column_family, &key, this]() { + Status s = GetBatchForWrite()->Delete(column_family, key); + if (s.ok()) { + ++num_deletes_; + } + return s; + }); +} + +Status WriteCommittedTxn::SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + return Operate(column_family, key, do_validate, assume_tracked, + [column_family, &key, this]() { + Status s = + GetBatchForWrite()->SingleDelete(column_family, key); + if (s.ok()) { + ++num_deletes_; + } + return s; + }); +} + +Status WriteCommittedTxn::SingleDelete(ColumnFamilyHandle* column_family, + const SliceParts& key, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + return Operate(column_family, key, do_validate, assume_tracked, + [column_family, &key, this]() { + Status s = + GetBatchForWrite()->SingleDelete(column_family, key); + if (s.ok()) { + ++num_deletes_; + } + return s; + }); +} + +Status WriteCommittedTxn::SingleDeleteUntracked( + ColumnFamilyHandle* column_family, const Slice& key) { + return Operate(column_family, key, /*do_validate=*/false, + /*assume_tracked=*/false, [column_family, &key, this]() { + Status s = + GetBatchForWrite()->SingleDelete(column_family, key); + if (s.ok()) { + ++num_deletes_; + } + return s; + }); +} + +Status WriteCommittedTxn::Merge(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + return Operate(column_family, key, do_validate, assume_tracked, + [column_family, &key, &value, this]() { + Status s = + GetBatchForWrite()->Merge(column_family, key, value); + if (s.ok()) { + ++num_merges_; + } + return s; + }); +} + +template +Status WriteCommittedTxn::Operate(ColumnFamilyHandle* column_family, + const TKey& key, const bool do_validate, + const bool assume_tracked, + TOperation&& operation) { + Status s; + if constexpr (std::is_same_v) { + s = TryLock(column_family, key, /*read_only=*/false, /*exclusive=*/true, + do_validate, assume_tracked); + } else if constexpr (std::is_same_v) { + std::string key_buf; + Slice contiguous_key(key, &key_buf); + s = TryLock(column_family, contiguous_key, /*read_only=*/false, + /*exclusive=*/true, do_validate, assume_tracked); + } + if (!s.ok()) { + return s; + } + column_family = + column_family ? 
column_family : db_impl_->DefaultColumnFamily(); + assert(column_family); + const Comparator* const ucmp = column_family->GetComparator(); + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); + if (ts_sz > 0) { + assert(ts_sz == sizeof(TxnTimestamp)); + if (!IndexingEnabled()) { + cfs_with_ts_tracked_when_indexing_disabled_.insert( + column_family->GetID()); + } + } + return operation(); +} + +Status WriteCommittedTxn::SetReadTimestampForValidation(TxnTimestamp ts) { + if (read_timestamp_ < kMaxTxnTimestamp && ts < read_timestamp_) { + return Status::InvalidArgument( + "Cannot decrease read timestamp for validation"); + } + read_timestamp_ = ts; + return Status::OK(); +} + +Status WriteCommittedTxn::SetCommitTimestamp(TxnTimestamp ts) { + if (read_timestamp_ < kMaxTxnTimestamp && ts <= read_timestamp_) { + return Status::InvalidArgument( + "Cannot commit at timestamp smaller than or equal to read timestamp"); + } + commit_timestamp_ = ts; + return Status::OK(); +} + +Status PessimisticTransaction::CommitBatch(WriteBatch* batch) { + if (batch && WriteBatchInternal::HasKeyWithTimestamp(*batch)) { + // CommitBatch() needs to lock the keys in the batch. + // However, the application also needs to specify the timestamp for the + // keys in batch before calling this API. + // This means timestamp order may violate the order of locking, thus + // violate the sequence number order for the same user key. + // Therefore, we disallow this operation for now. + return Status::NotSupported( + "Batch to commit includes timestamp assigned before locking"); + } + + std::unique_ptr keys_to_unlock(lock_tracker_factory_.Create()); + Status s = LockBatch(batch, keys_to_unlock.get()); + + if (!s.ok()) { + return s; + } + + bool can_commit = false; + + if (IsExpired()) { + s = Status::Expired(); + } else if (expiration_time_ > 0) { + TransactionState expected = STARTED; + can_commit = std::atomic_compare_exchange_strong(&txn_state_, &expected, + AWAITING_COMMIT); + } else if (txn_state_ == STARTED) { + // lock stealing is not a concern + can_commit = true; + } + + if (can_commit) { + txn_state_.store(AWAITING_COMMIT); + s = CommitBatchInternal(batch); + if (s.ok()) { + txn_state_.store(COMMITTED); + } + } else if (txn_state_ == LOCKS_STOLEN) { + s = Status::Expired(); + } else { + s = Status::InvalidArgument("Transaction is not in state for commit."); + } + + txn_db_impl_->UnLock(this, *keys_to_unlock); + + return s; +} + +Status PessimisticTransaction::Prepare() { + if (name_.empty()) { + return Status::InvalidArgument( + "Cannot prepare a transaction that has not been named."); + } + + if (IsExpired()) { + return Status::Expired(); + } + + Status s; + bool can_prepare = false; + + if (expiration_time_ > 0) { + // must concern ourselves with expiraton and/or lock stealing + // need to compare/exchange bc locks could be stolen under us here + TransactionState expected = STARTED; + can_prepare = std::atomic_compare_exchange_strong(&txn_state_, &expected, + AWAITING_PREPARE); + } else if (txn_state_ == STARTED) { + // expiration and lock stealing is not possible + txn_state_.store(AWAITING_PREPARE); + can_prepare = true; + } + + if (can_prepare) { + // transaction can't expire after preparation + expiration_time_ = 0; + assert(log_number_ == 0 || + txn_db_impl_->GetTxnDBOptions().write_policy == WRITE_UNPREPARED); + + s = PrepareInternal(); + if (s.ok()) { + txn_state_.store(PREPARED); + } + } else if (txn_state_ == LOCKS_STOLEN) { + s = Status::Expired(); + } else if (txn_state_ == PREPARED) { + s = 
Status::InvalidArgument("Transaction has already been prepared."); + } else if (txn_state_ == COMMITTED) { + s = Status::InvalidArgument("Transaction has already been committed."); + } else if (txn_state_ == ROLLEDBACK) { + s = Status::InvalidArgument("Transaction has already been rolledback."); + } else { + s = Status::InvalidArgument("Transaction is not in state for commit."); + } + + return s; +} + +Status WriteCommittedTxn::PrepareInternal() { + WriteOptions write_options = write_options_; + write_options.disableWAL = false; + auto s = WriteBatchInternal::MarkEndPrepare(GetWriteBatch()->GetWriteBatch(), + name_); + assert(s.ok()); + class MarkLogCallback : public PreReleaseCallback { + public: + MarkLogCallback(DBImpl* db, bool two_write_queues) + : db_(db), two_write_queues_(two_write_queues) { + (void)two_write_queues_; // to silence unused private field warning + } + virtual Status Callback(SequenceNumber, bool is_mem_disabled, + uint64_t log_number, size_t /*index*/, + size_t /*total*/) override { +#ifdef NDEBUG + (void)is_mem_disabled; +#endif + assert(log_number != 0); + assert(!two_write_queues_ || is_mem_disabled); // implies the 2nd queue + db_->logs_with_prep_tracker()->MarkLogAsContainingPrepSection(log_number); + return Status::OK(); + } + + private: + DBImpl* db_; + bool two_write_queues_; + } mark_log_callback(db_impl_, + db_impl_->immutable_db_options().two_write_queues); + + WriteCallback* const kNoWriteCallback = nullptr; + const uint64_t kRefNoLog = 0; + const bool kDisableMemtable = true; + SequenceNumber* const KIgnoreSeqUsed = nullptr; + const size_t kNoBatchCount = 0; + s = db_impl_->WriteImpl(write_options, GetWriteBatch()->GetWriteBatch(), + kNoWriteCallback, &log_number_, kRefNoLog, + kDisableMemtable, KIgnoreSeqUsed, kNoBatchCount, + &mark_log_callback); + return s; +} + +Status PessimisticTransaction::Commit() { + bool commit_without_prepare = false; + bool commit_prepared = false; + + if (IsExpired()) { + return Status::Expired(); + } + + if (expiration_time_ > 0) { + // we must atomicaly compare and exchange the state here because at + // this state in the transaction it is possible for another thread + // to change our state out from under us in the even that we expire and have + // our locks stolen. In this case the only valid state is STARTED because + // a state of PREPARED would have a cleared expiration_time_. 
+ TransactionState expected = STARTED; + commit_without_prepare = std::atomic_compare_exchange_strong( + &txn_state_, &expected, AWAITING_COMMIT); + TEST_SYNC_POINT("TransactionTest::ExpirableTransactionDataRace:1"); + } else if (txn_state_ == PREPARED) { + // expiration and lock stealing is not a concern + commit_prepared = true; + } else if (txn_state_ == STARTED) { + // expiration and lock stealing is not a concern + if (skip_prepare_) { + commit_without_prepare = true; + } else { + return Status::TxnNotPrepared(); + } + } + + Status s; + if (commit_without_prepare) { + assert(!commit_prepared); + if (WriteBatchInternal::Count(GetCommitTimeWriteBatch()) > 0) { + s = Status::InvalidArgument( + "Commit-time batch contains values that will not be committed."); + } else { + txn_state_.store(AWAITING_COMMIT); + if (log_number_ > 0) { + dbimpl_->logs_with_prep_tracker()->MarkLogAsHavingPrepSectionFlushed( + log_number_); + } + s = CommitWithoutPrepareInternal(); + if (!name_.empty()) { + txn_db_impl_->UnregisterTransaction(this); + } + Clear(); + if (s.ok()) { + txn_state_.store(COMMITTED); + } + } + } else if (commit_prepared) { + txn_state_.store(AWAITING_COMMIT); + + s = CommitInternal(); + + if (!s.ok()) { + ROCKS_LOG_WARN(db_impl_->immutable_db_options().info_log, + "Commit write failed"); + return s; + } + + // FindObsoleteFiles must now look to the memtables + // to determine what prep logs must be kept around, + // not the prep section heap. + assert(log_number_ > 0); + dbimpl_->logs_with_prep_tracker()->MarkLogAsHavingPrepSectionFlushed( + log_number_); + txn_db_impl_->UnregisterTransaction(this); + + Clear(); + txn_state_.store(COMMITTED); + } else if (txn_state_ == LOCKS_STOLEN) { + s = Status::Expired(); + } else if (txn_state_ == COMMITTED) { + s = Status::InvalidArgument("Transaction has already been committed."); + } else if (txn_state_ == ROLLEDBACK) { + s = Status::InvalidArgument("Transaction has already been rolledback."); + } else { + s = Status::InvalidArgument("Transaction is not in state for commit."); + } + + return s; +} + +Status WriteCommittedTxn::CommitWithoutPrepareInternal() { + WriteBatchWithIndex* wbwi = GetWriteBatch(); + assert(wbwi); + WriteBatch* wb = wbwi->GetWriteBatch(); + assert(wb); + + const bool needs_ts = WriteBatchInternal::HasKeyWithTimestamp(*wb); + if (needs_ts && commit_timestamp_ == kMaxTxnTimestamp) { + return Status::InvalidArgument("Must assign a commit timestamp"); + } + + if (needs_ts) { + assert(commit_timestamp_ != kMaxTxnTimestamp); + char commit_ts_buf[sizeof(kMaxTxnTimestamp)]; + EncodeFixed64(commit_ts_buf, commit_timestamp_); + Slice commit_ts(commit_ts_buf, sizeof(commit_ts_buf)); + + Status s = + wb->UpdateTimestamps(commit_ts, [wbwi, this](uint32_t cf) -> size_t { + auto cf_iter = cfs_with_ts_tracked_when_indexing_disabled_.find(cf); + if (cf_iter != cfs_with_ts_tracked_when_indexing_disabled_.end()) { + return sizeof(kMaxTxnTimestamp); + } + const Comparator* ucmp = + WriteBatchWithIndexInternal::GetUserComparator(*wbwi, cf); + return ucmp ? 
ucmp->timestamp_size() + : std::numeric_limits::max(); + }); + if (!s.ok()) { + return s; + } + } + + uint64_t seq_used = kMaxSequenceNumber; + SnapshotCreationCallback snapshot_creation_cb(db_impl_, commit_timestamp_, + snapshot_notifier_, snapshot_); + PostMemTableCallback* post_mem_cb = nullptr; + if (snapshot_needed_) { + if (commit_timestamp_ == kMaxTxnTimestamp) { + return Status::InvalidArgument("Must set transaction commit timestamp"); + } else { + post_mem_cb = &snapshot_creation_cb; + } + } + auto s = db_impl_->WriteImpl(write_options_, wb, + /*callback*/ nullptr, /*log_used*/ nullptr, + /*log_ref*/ 0, /*disable_memtable*/ false, + &seq_used, /*batch_cnt=*/0, + /*pre_release_callback=*/nullptr, post_mem_cb); + assert(!s.ok() || seq_used != kMaxSequenceNumber); + if (s.ok()) { + SetId(seq_used); + } + return s; +} + +Status WriteCommittedTxn::CommitBatchInternal(WriteBatch* batch, size_t) { + uint64_t seq_used = kMaxSequenceNumber; + auto s = db_impl_->WriteImpl(write_options_, batch, /*callback*/ nullptr, + /*log_used*/ nullptr, /*log_ref*/ 0, + /*disable_memtable*/ false, &seq_used); + assert(!s.ok() || seq_used != kMaxSequenceNumber); + if (s.ok()) { + SetId(seq_used); + } + return s; +} + +Status WriteCommittedTxn::CommitInternal() { + WriteBatchWithIndex* wbwi = GetWriteBatch(); + assert(wbwi); + WriteBatch* wb = wbwi->GetWriteBatch(); + assert(wb); + + const bool needs_ts = WriteBatchInternal::HasKeyWithTimestamp(*wb); + if (needs_ts && commit_timestamp_ == kMaxTxnTimestamp) { + return Status::InvalidArgument("Must assign a commit timestamp"); + } + // We take the commit-time batch and append the Commit marker. + // The Memtable will ignore the Commit marker in non-recovery mode + WriteBatch* working_batch = GetCommitTimeWriteBatch(); + + Status s; + if (!needs_ts) { + s = WriteBatchInternal::MarkCommit(working_batch, name_); + } else { + assert(commit_timestamp_ != kMaxTxnTimestamp); + char commit_ts_buf[sizeof(kMaxTxnTimestamp)]; + EncodeFixed64(commit_ts_buf, commit_timestamp_); + Slice commit_ts(commit_ts_buf, sizeof(commit_ts_buf)); + s = WriteBatchInternal::MarkCommitWithTimestamp(working_batch, name_, + commit_ts); + if (s.ok()) { + s = wb->UpdateTimestamps(commit_ts, [wbwi, this](uint32_t cf) -> size_t { + if (cfs_with_ts_tracked_when_indexing_disabled_.find(cf) != + cfs_with_ts_tracked_when_indexing_disabled_.end()) { + return sizeof(kMaxTxnTimestamp); + } + const Comparator* ucmp = + WriteBatchWithIndexInternal::GetUserComparator(*wbwi, cf); + return ucmp ? ucmp->timestamp_size() + : std::numeric_limits::max(); + }); + } + } + + if (!s.ok()) { + return s; + } + + // any operations appended to this working_batch will be ignored from WAL + working_batch->MarkWalTerminationPoint(); + + // insert prepared batch into Memtable only skipping WAL. 
+ // Memtable will ignore BeginPrepare/EndPrepare markers + // in non recovery mode and simply insert the values + s = WriteBatchInternal::Append(working_batch, wb); + assert(s.ok()); + + uint64_t seq_used = kMaxSequenceNumber; + SnapshotCreationCallback snapshot_creation_cb(db_impl_, commit_timestamp_, + snapshot_notifier_, snapshot_); + PostMemTableCallback* post_mem_cb = nullptr; + if (snapshot_needed_) { + if (commit_timestamp_ == kMaxTxnTimestamp) { + s = Status::InvalidArgument("Must set transaction commit timestamp"); + return s; + } else { + post_mem_cb = &snapshot_creation_cb; + } + } + s = db_impl_->WriteImpl(write_options_, working_batch, /*callback*/ nullptr, + /*log_used*/ nullptr, /*log_ref*/ log_number_, + /*disable_memtable*/ false, &seq_used, + /*batch_cnt=*/0, /*pre_release_callback=*/nullptr, + post_mem_cb); + assert(!s.ok() || seq_used != kMaxSequenceNumber); + if (s.ok()) { + SetId(seq_used); + } + return s; +} + +Status PessimisticTransaction::Rollback() { + Status s; + if (txn_state_ == PREPARED) { + txn_state_.store(AWAITING_ROLLBACK); + + s = RollbackInternal(); + + if (s.ok()) { + // we do not need to keep our prepared section around + assert(log_number_ > 0); + dbimpl_->logs_with_prep_tracker()->MarkLogAsHavingPrepSectionFlushed( + log_number_); + Clear(); + txn_state_.store(ROLLEDBACK); + } + } else if (txn_state_ == STARTED) { + if (log_number_ > 0) { + assert(txn_db_impl_->GetTxnDBOptions().write_policy == WRITE_UNPREPARED); + assert(GetId() > 0); + s = RollbackInternal(); + + if (s.ok()) { + dbimpl_->logs_with_prep_tracker()->MarkLogAsHavingPrepSectionFlushed( + log_number_); + } + } + // prepare couldn't have taken place + Clear(); + } else if (txn_state_ == COMMITTED) { + s = Status::InvalidArgument("This transaction has already been committed."); + } else { + s = Status::InvalidArgument( + "Two phase transaction is not in state for rollback."); + } + + return s; +} + +Status WriteCommittedTxn::RollbackInternal() { + WriteBatch rollback_marker; + auto s = WriteBatchInternal::MarkRollback(&rollback_marker, name_); + assert(s.ok()); + s = db_impl_->WriteImpl(write_options_, &rollback_marker); + return s; +} + +Status PessimisticTransaction::RollbackToSavePoint() { + if (txn_state_ != STARTED) { + return Status::InvalidArgument("Transaction is beyond state for rollback."); + } + + if (save_points_ != nullptr && !save_points_->empty()) { + // Unlock any keys locked since last transaction + auto& save_point_tracker = *save_points_->top().new_locks_; + std::unique_ptr t( + tracked_locks_->GetTrackedLocksSinceSavePoint(save_point_tracker)); + if (t) { + txn_db_impl_->UnLock(this, *t); + } + } + + return TransactionBaseImpl::RollbackToSavePoint(); +} + +// Lock all keys in this batch. +// On success, caller should unlock keys_to_unlock +Status PessimisticTransaction::LockBatch(WriteBatch* batch, + LockTracker* keys_to_unlock) { + if (!batch) { + return Status::InvalidArgument("batch is nullptr"); + } + + class Handler : public WriteBatch::Handler { + public: + // Sorted map of column_family_id to sorted set of keys. + // Since LockBatch() always locks keys in sorted order, it cannot deadlock + // with itself. We're not using a comparator here since it doesn't matter + // what the sorting is as long as it's consistent. 
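+    // Because every LockBatch() call acquires keys in this same
+    // (column family id, key) order, two concurrent batch commits cannot
+    // acquire their locks in conflicting orders.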
+ std::map> keys_; + + Handler() {} + + void RecordKey(uint32_t column_family_id, const Slice& key) { + std::string key_str = key.ToString(); + + auto& cfh_keys = keys_[column_family_id]; + auto iter = cfh_keys.find(key_str); + if (iter == cfh_keys.end()) { + // key not yet seen, store it. + cfh_keys.insert({std::move(key_str)}); + } + } + + Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& /* unused */) override { + RecordKey(column_family_id, key); + return Status::OK(); + } + Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& /* unused */) override { + RecordKey(column_family_id, key); + return Status::OK(); + } + Status DeleteCF(uint32_t column_family_id, const Slice& key) override { + RecordKey(column_family_id, key); + return Status::OK(); + } + }; + + // Iterating on this handler will add all keys in this batch into keys + Handler handler; + Status s = batch->Iterate(&handler); + if (!s.ok()) { + return s; + } + + // Attempt to lock all keys + for (const auto& cf_iter : handler.keys_) { + uint32_t cfh_id = cf_iter.first; + auto& cfh_keys = cf_iter.second; + + for (const auto& key_iter : cfh_keys) { + const std::string& key = key_iter; + + s = txn_db_impl_->TryLock(this, cfh_id, key, true /* exclusive */); + if (!s.ok()) { + break; + } + PointLockRequest r; + r.column_family_id = cfh_id; + r.key = key; + r.seq = kMaxSequenceNumber; + r.read_only = false; + r.exclusive = true; + keys_to_unlock->Track(r); + } + + if (!s.ok()) { + break; + } + } + + if (!s.ok()) { + txn_db_impl_->UnLock(this, *keys_to_unlock); + } + + return s; +} + +// Attempt to lock this key. +// Returns OK if the key has been successfully locked. Non-ok, otherwise. +// If check_shapshot is true and this transaction has a snapshot set, +// this key will only be locked if there have been no writes to this key since +// the snapshot time. +Status PessimisticTransaction::TryLock(ColumnFamilyHandle* column_family, + const Slice& key, bool read_only, + bool exclusive, const bool do_validate, + const bool assume_tracked) { + assert(!assume_tracked || !do_validate); + Status s; + if (UNLIKELY(skip_concurrency_control_)) { + return s; + } + uint32_t cfh_id = GetColumnFamilyID(column_family); + std::string key_str = key.ToString(); + + PointLockStatus status; + bool lock_upgrade; + bool previously_locked; + if (tracked_locks_->IsPointLockSupported()) { + status = tracked_locks_->GetPointLockStatus(cfh_id, key_str); + previously_locked = status.locked; + lock_upgrade = previously_locked && exclusive && !status.exclusive; + } else { + // If the record is tracked, we can assume it was locked, too. + previously_locked = assume_tracked; + status.locked = false; + lock_upgrade = false; + } + + // Lock this key if this transactions hasn't already locked it or we require + // an upgrade. + if (!previously_locked || lock_upgrade) { + s = txn_db_impl_->TryLock(this, cfh_id, key_str, exclusive); + } + + const ColumnFamilyHandle* const cfh = + column_family ? column_family : db_impl_->DefaultColumnFamily(); + assert(cfh); + const Comparator* const ucmp = cfh->GetComparator(); + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); + + SetSnapshotIfNeeded(); + + // Even though we do not care about doing conflict checking for this write, + // we still need to take a lock to make sure we do not cause a conflict with + // some other write. However, we do not need to check if there have been + // any writes since this transaction's snapshot. 
+ // TODO(agiardullo): could optimize by supporting shared txn locks in the + // future. + SequenceNumber tracked_at_seq = + status.locked ? status.seq : kMaxSequenceNumber; + if (!do_validate || (snapshot_ == nullptr && + (0 == ts_sz || kMaxTxnTimestamp == read_timestamp_))) { + if (assume_tracked && !previously_locked && + tracked_locks_->IsPointLockSupported()) { + s = Status::InvalidArgument( + "assume_tracked is set but it is not tracked yet"); + } + // Need to remember the earliest sequence number that we know that this + // key has not been modified after. This is useful if this same + // transaction later tries to lock this key again. + if (tracked_at_seq == kMaxSequenceNumber) { + // Since we haven't checked a snapshot, we only know this key has not + // been modified since after we locked it. + // Note: when last_seq_same_as_publish_seq_==false this is less than the + // latest allocated seq but it is ok since i) this is just a heuristic + // used only as a hint to avoid actual check for conflicts, ii) this would + // cause a false positive only if the snapthot is taken right after the + // lock, which would be an unusual sequence. + tracked_at_seq = db_->GetLatestSequenceNumber(); + } + } else if (s.ok()) { + // If a snapshot is set, we need to make sure the key hasn't been modified + // since the snapshot. This must be done after we locked the key. + // If we already have validated an earilier snapshot it must has been + // reflected in tracked_at_seq and ValidateSnapshot will return OK. + s = ValidateSnapshot(column_family, key, &tracked_at_seq); + + if (!s.ok()) { + // Failed to validate key + // Unlock key we just locked + if (lock_upgrade) { + s = txn_db_impl_->TryLock(this, cfh_id, key_str, false /* exclusive */); + assert(s.ok()); + } else if (!previously_locked) { + txn_db_impl_->UnLock(this, cfh_id, key.ToString()); + } + } + } + + if (s.ok()) { + // We must track all the locked keys so that we can unlock them later. If + // the key is already locked, this func will update some stats on the + // tracked key. It could also update the tracked_at_seq if it is lower + // than the existing tracked key seq. These stats are necessary for + // RollbackToSavePoint to determine whether a key can be safely removed + // from tracked_keys_. Removal can only be done if a key was only locked + // during the current savepoint. + // + // Recall that if assume_tracked is true, we assume that TrackKey has been + // called previously since the last savepoint, with the same exclusive + // setting, and at a lower sequence number, so skipping here should be + // safe. + if (!assume_tracked) { + TrackKey(cfh_id, key_str, tracked_at_seq, read_only, exclusive); + } else { +#ifndef NDEBUG + if (tracked_locks_->IsPointLockSupported()) { + PointLockStatus lock_status = + tracked_locks_->GetPointLockStatus(cfh_id, key_str); + assert(lock_status.locked); + assert(lock_status.seq <= tracked_at_seq); + assert(lock_status.exclusive == exclusive); + } +#endif + } + } + + return s; +} + +Status PessimisticTransaction::GetRangeLock(ColumnFamilyHandle* column_family, + const Endpoint& start_endp, + const Endpoint& end_endp) { + ColumnFamilyHandle* cfh = + column_family ? 
column_family : db_impl_->DefaultColumnFamily(); + uint32_t cfh_id = GetColumnFamilyID(cfh); + + Status s = txn_db_impl_->TryRangeLock(this, cfh_id, start_endp, end_endp); + + if (s.ok()) { + RangeLockRequest req{cfh_id, start_endp, end_endp}; + tracked_locks_->Track(req); + } + return s; +} + +// Return OK() if this key has not been modified more recently than the +// transaction snapshot_. +// tracked_at_seq is the global seq at which we either locked the key or already +// have done ValidateSnapshot. +Status PessimisticTransaction::ValidateSnapshot( + ColumnFamilyHandle* column_family, const Slice& key, + SequenceNumber* tracked_at_seq) { + assert(snapshot_ || read_timestamp_ < kMaxTxnTimestamp); + + SequenceNumber snap_seq = 0; + if (snapshot_) { + snap_seq = snapshot_->GetSequenceNumber(); + if (*tracked_at_seq <= snap_seq) { + // If the key has been previous validated (or locked) at a sequence number + // earlier than the current snapshot's sequence number, we already know it + // has not been modified aftter snap_seq either. + return Status::OK(); + } + } else { + snap_seq = db_impl_->GetLatestSequenceNumber(); + } + + // Otherwise we have either + // 1: tracked_at_seq == kMaxSequenceNumber, i.e., first time tracking the key + // 2: snap_seq < tracked_at_seq: last time we lock the key was via + // do_validate=false which means we had skipped ValidateSnapshot. In both + // cases we should do ValidateSnapshot now. + + *tracked_at_seq = snap_seq; + + ColumnFamilyHandle* cfh = + column_family ? column_family : db_impl_->DefaultColumnFamily(); + + assert(cfh); + const Comparator* const ucmp = cfh->GetComparator(); + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); + std::string ts_buf; + if (ts_sz > 0 && read_timestamp_ < kMaxTxnTimestamp) { + assert(ts_sz == sizeof(read_timestamp_)); + PutFixed64(&ts_buf, read_timestamp_); + } + + return TransactionUtil::CheckKeyForConflicts( + db_impl_, cfh, key.ToString(), snap_seq, ts_sz == 0 ? nullptr : &ts_buf, + false /* cache_only */); +} + +bool PessimisticTransaction::TryStealingLocks() { + assert(IsExpired()); + TransactionState expected = STARTED; + return std::atomic_compare_exchange_strong(&txn_state_, &expected, + LOCKS_STOLEN); +} + +void PessimisticTransaction::UnlockGetForUpdate( + ColumnFamilyHandle* column_family, const Slice& key) { + txn_db_impl_->UnLock(this, GetColumnFamilyID(column_family), key.ToString()); +} + +Status PessimisticTransaction::SetName(const TransactionName& name) { + Status s; + if (txn_state_ == STARTED) { + if (name_.length()) { + s = Status::InvalidArgument("Transaction has already been named."); + } else if (txn_db_impl_->GetTransactionByName(name) != nullptr) { + s = Status::InvalidArgument("Transaction name must be unique."); + } else if (name.length() < 1 || name.length() > 512) { + s = Status::InvalidArgument( + "Transaction name length must be between 1 and 512 chars."); + } else { + name_ = name; + txn_db_impl_->RegisterTransaction(this); + } + } else { + s = Status::InvalidArgument("Transaction is beyond state for naming."); + } + return s; +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/pessimistic_transaction.h b/src/rocksdb/utilities/transactions/pessimistic_transaction.h new file mode 100644 index 000000000..d43d1d3ac --- /dev/null +++ b/src/rocksdb/utilities/transactions/pessimistic_transaction.h @@ -0,0 +1,313 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include +#include +#include +#include + +#include "db/write_callback.h" +#include "rocksdb/db.h" +#include "rocksdb/slice.h" +#include "rocksdb/snapshot.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/transaction_db.h" +#include "rocksdb/utilities/write_batch_with_index.h" +#include "util/autovector.h" +#include "utilities/transactions/transaction_base.h" +#include "utilities/transactions/transaction_util.h" + +namespace ROCKSDB_NAMESPACE { + +class PessimisticTransactionDB; + +// A transaction under pessimistic concurrency control. This class implements +// the locking API and interfaces with the lock manager as well as the +// pessimistic transactional db. +class PessimisticTransaction : public TransactionBaseImpl { + public: + PessimisticTransaction(TransactionDB* db, const WriteOptions& write_options, + const TransactionOptions& txn_options, + const bool init = true); + // No copying allowed + PessimisticTransaction(const PessimisticTransaction&) = delete; + void operator=(const PessimisticTransaction&) = delete; + + ~PessimisticTransaction() override; + + void Reinitialize(TransactionDB* txn_db, const WriteOptions& write_options, + const TransactionOptions& txn_options); + + Status Prepare() override; + + Status Commit() override; + + // It is basically Commit without going through Prepare phase. The write batch + // is also directly provided instead of expecting txn to gradually batch the + // transactions writes to an internal write batch. + Status CommitBatch(WriteBatch* batch); + + Status Rollback() override; + + Status RollbackToSavePoint() override; + + Status SetName(const TransactionName& name) override; + + // Generate a new unique transaction identifier + static TransactionID GenTxnID(); + + TransactionID GetID() const override { return txn_id_; } + + std::vector GetWaitingTxns(uint32_t* column_family_id, + std::string* key) const override { + std::lock_guard lock(wait_mutex_); + std::vector ids(waiting_txn_ids_.size()); + if (key) *key = waiting_key_ ? *waiting_key_ : ""; + if (column_family_id) *column_family_id = waiting_cf_id_; + std::copy(waiting_txn_ids_.begin(), waiting_txn_ids_.end(), ids.begin()); + return ids; + } + + void SetWaitingTxn(autovector ids, uint32_t column_family_id, + const std::string* key) { + std::lock_guard lock(wait_mutex_); + waiting_txn_ids_ = ids; + waiting_cf_id_ = column_family_id; + waiting_key_ = key; + } + + void ClearWaitingTxn() { + std::lock_guard lock(wait_mutex_); + waiting_txn_ids_.clear(); + waiting_cf_id_ = 0; + waiting_key_ = nullptr; + } + + // Returns the time (in microseconds according to Env->GetMicros()) + // that this transaction will be expired. Returns 0 if this transaction does + // not expire. + uint64_t GetExpirationTime() const { return expiration_time_; } + + // returns true if this transaction has an expiration_time and has expired. + bool IsExpired() const; + + // Returns the number of microseconds a transaction can wait on acquiring a + // lock or -1 if there is no timeout. 
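+  // Note: GetLockTimeout() reports this value in microseconds, while
+  // SetLockTimeout() below takes milliseconds and multiplies by 1000 before
+  // storing it in lock_timeout_.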
+ int64_t GetLockTimeout() const { return lock_timeout_; } + void SetLockTimeout(int64_t timeout) override { + lock_timeout_ = timeout * 1000; + } + + // Returns true if locks were stolen successfully, false otherwise. + bool TryStealingLocks(); + + bool IsDeadlockDetect() const override { return deadlock_detect_; } + + int64_t GetDeadlockDetectDepth() const { return deadlock_detect_depth_; } + + virtual Status GetRangeLock(ColumnFamilyHandle* column_family, + const Endpoint& start_key, + const Endpoint& end_key) override; + + protected: + // Refer to + // TransactionOptions::use_only_the_last_commit_time_batch_for_recovery + bool use_only_the_last_commit_time_batch_for_recovery_ = false; + // Refer to + // TransactionOptions::skip_prepare + bool skip_prepare_ = false; + + virtual Status PrepareInternal() = 0; + + virtual Status CommitWithoutPrepareInternal() = 0; + + // batch_cnt if non-zero is the number of sub-batches. A sub-batch is a batch + // with no duplicate keys. If zero, then the number of sub-batches is unknown. + virtual Status CommitBatchInternal(WriteBatch* batch, + size_t batch_cnt = 0) = 0; + + virtual Status CommitInternal() = 0; + + virtual Status RollbackInternal() = 0; + + virtual void Initialize(const TransactionOptions& txn_options); + + Status LockBatch(WriteBatch* batch, LockTracker* keys_to_unlock); + + Status TryLock(ColumnFamilyHandle* column_family, const Slice& key, + bool read_only, bool exclusive, const bool do_validate = true, + const bool assume_tracked = false) override; + + void Clear() override; + + PessimisticTransactionDB* txn_db_impl_; + DBImpl* db_impl_; + + // If non-zero, this transaction should not be committed after this time (in + // microseconds according to Env->NowMicros()) + uint64_t expiration_time_; + + // Timestamp used by the transaction to perform all GetForUpdate. + // Use this timestamp for conflict checking. + // read_timestamp_ == kMaxTxnTimestamp means this transaction has not + // performed any GetForUpdate. It is possible that the transaction has + // performed blind writes or Get, though. + TxnTimestamp read_timestamp_{kMaxTxnTimestamp}; + TxnTimestamp commit_timestamp_{kMaxTxnTimestamp}; + + private: + friend class TransactionTest_ValidateSnapshotTest_Test; + // Used to create unique ids for transactions. + static std::atomic txn_id_counter_; + + // Unique ID for this transaction + TransactionID txn_id_; + + // IDs for the transactions that are blocking the current transaction. + // + // empty if current transaction is not waiting. + autovector waiting_txn_ids_; + + // The following two represents the (cf, key) that a transaction is waiting + // on. + // + // If waiting_key_ is not null, then the pointer should always point to + // a valid string object. The reason is that it is only non-null when the + // transaction is blocked in the PointLockManager::AcquireWithTimeout + // function. At that point, the key string object is one of the function + // parameters. + uint32_t waiting_cf_id_; + const std::string* waiting_key_; + + // Mutex protecting waiting_txn_ids_, waiting_cf_id_ and waiting_key_. + mutable std::mutex wait_mutex_; + + // Timeout in microseconds when locking a key or -1 if there is no timeout. + int64_t lock_timeout_; + + // Whether to perform deadlock detection or not. + bool deadlock_detect_; + + // Whether to perform deadlock detection or not. 
+ int64_t deadlock_detect_depth_; + + // Refer to TransactionOptions::skip_concurrency_control + bool skip_concurrency_control_; + + virtual Status ValidateSnapshot(ColumnFamilyHandle* column_family, + const Slice& key, + SequenceNumber* tracked_at_seq); + + void UnlockGetForUpdate(ColumnFamilyHandle* column_family, + const Slice& key) override; +}; + +class WriteCommittedTxn : public PessimisticTransaction { + public: + WriteCommittedTxn(TransactionDB* db, const WriteOptions& write_options, + const TransactionOptions& txn_options); + // No copying allowed + WriteCommittedTxn(const WriteCommittedTxn&) = delete; + void operator=(const WriteCommittedTxn&) = delete; + + ~WriteCommittedTxn() override {} + + using TransactionBaseImpl::GetForUpdate; + Status GetForUpdate(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, bool exclusive, + const bool do_validate) override; + Status GetForUpdate(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* pinnable_val, bool exclusive, + const bool do_validate) override; + + using TransactionBaseImpl::Put; + // `key` does NOT include timestamp even when it's enabled. + Status Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, const bool assume_tracked = false) override; + Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value, + const bool assume_tracked = false) override; + + using TransactionBaseImpl::PutUntracked; + Status PutUntracked(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + Status PutUntracked(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value) override; + + using TransactionBaseImpl::Delete; + // `key` does NOT include timestamp even when it's enabled. + Status Delete(ColumnFamilyHandle* column_family, const Slice& key, + const bool assume_tracked = false) override; + Status Delete(ColumnFamilyHandle* column_family, const SliceParts& key, + const bool assume_tracked = false) override; + + using TransactionBaseImpl::DeleteUntracked; + Status DeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) override; + Status DeleteUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key) override; + + using TransactionBaseImpl::SingleDelete; + // `key` does NOT include timestamp even when it's enabled. 
+ Status SingleDelete(ColumnFamilyHandle* column_family, const Slice& key, + const bool assume_tracked = false) override; + Status SingleDelete(ColumnFamilyHandle* column_family, const SliceParts& key, + const bool assume_tracked = false) override; + + using TransactionBaseImpl::SingleDeleteUntracked; + Status SingleDeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) override; + + using TransactionBaseImpl::Merge; + Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, const bool assume_tracked = false) override; + + Status SetReadTimestampForValidation(TxnTimestamp ts) override; + Status SetCommitTimestamp(TxnTimestamp ts) override; + TxnTimestamp GetCommitTimestamp() const override { return commit_timestamp_; } + + private: + template + Status GetForUpdateImpl(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + TValue* value, bool exclusive, + const bool do_validate); + + template + Status Operate(ColumnFamilyHandle* column_family, const TKey& key, + const bool do_validate, const bool assume_tracked, + TOperation&& operation); + + Status PrepareInternal() override; + + Status CommitWithoutPrepareInternal() override; + + Status CommitBatchInternal(WriteBatch* batch, size_t batch_cnt) override; + + Status CommitInternal() override; + + Status RollbackInternal() override; + + // Column families that enable timestamps and whose data are written when + // indexing_enabled_ is false. If a key is written when indexing_enabled_ is + // true, then the corresponding column family is not added to cfs_with_ts + // even if it enables timestamp. + std::unordered_set cfs_with_ts_tracked_when_indexing_disabled_; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/pessimistic_transaction_db.cc b/src/rocksdb/utilities/transactions/pessimistic_transaction_db.cc new file mode 100644 index 000000000..950ef8042 --- /dev/null +++ b/src/rocksdb/utilities/transactions/pessimistic_transaction_db.cc @@ -0,0 +1,782 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
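The header above declares the locking and two-phase-commit surface of pessimistic transactions; the file that follows implements the TransactionDB side. As a rough usage sketch of that public API (illustrative only, not part of the patch; the database path and key names are placeholders and error handling is abbreviated):

// Illustrative sketch, not part of the upstream sources.
#include <cassert>
#include <string>
#include "rocksdb/utilities/transaction.h"
#include "rocksdb/utilities/transaction_db.h"
using namespace ROCKSDB_NAMESPACE;

int main() {
  Options options;
  options.create_if_missing = true;
  TransactionDBOptions txn_db_options;  // default write_policy is WRITE_COMMITTED
  TransactionDB* txn_db = nullptr;
  Status s = TransactionDB::Open(options, txn_db_options, "/tmp/txn_example",
                                 &txn_db);
  assert(s.ok());

  // Pessimistically locks "key" before reading it, then writes and commits.
  Transaction* txn = txn_db->BeginTransaction(WriteOptions());
  std::string value;
  s = txn->GetForUpdate(ReadOptions(), "key", &value);  // NotFound is possible
  s = txn->Put("key", "value");
  s = txn->Commit();
  delete txn;
  delete txn_db;
  return 0;
}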
+ +#ifndef ROCKSDB_LITE + +#include "utilities/transactions/pessimistic_transaction_db.h" + +#include +#include +#include +#include +#include + +#include "db/db_impl/db_impl.h" +#include "logging/logging.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/utilities/transaction_db.h" +#include "test_util/sync_point.h" +#include "util/cast_util.h" +#include "util/mutexlock.h" +#include "utilities/transactions/pessimistic_transaction.h" +#include "utilities/transactions/transaction_db_mutex_impl.h" +#include "utilities/transactions/write_prepared_txn_db.h" +#include "utilities/transactions/write_unprepared_txn_db.h" + +namespace ROCKSDB_NAMESPACE { + +PessimisticTransactionDB::PessimisticTransactionDB( + DB* db, const TransactionDBOptions& txn_db_options) + : TransactionDB(db), + db_impl_(static_cast_with_check(db)), + txn_db_options_(txn_db_options), + lock_manager_(NewLockManager(this, txn_db_options)) { + assert(db_impl_ != nullptr); + info_log_ = db_impl_->GetDBOptions().info_log; +} + +// Support initiliazing PessimisticTransactionDB from a stackable db +// +// PessimisticTransactionDB +// ^ ^ +// | | +// | + +// | StackableDB +// | ^ +// | | +// + + +// DBImpl +// ^ +// |(inherit) +// + +// DB +// +PessimisticTransactionDB::PessimisticTransactionDB( + StackableDB* db, const TransactionDBOptions& txn_db_options) + : TransactionDB(db), + db_impl_(static_cast_with_check(db->GetRootDB())), + txn_db_options_(txn_db_options), + lock_manager_(NewLockManager(this, txn_db_options)) { + assert(db_impl_ != nullptr); +} + +PessimisticTransactionDB::~PessimisticTransactionDB() { + while (!transactions_.empty()) { + delete transactions_.begin()->second; + // TODO(myabandeh): this seems to be an unsafe approach as it is not quite + // clear whether delete would also remove the entry from transactions_. + } +} + +Status PessimisticTransactionDB::VerifyCFOptions( + const ColumnFamilyOptions& cf_options) { + const Comparator* const ucmp = cf_options.comparator; + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); + if (0 == ts_sz) { + return Status::OK(); + } + if (ts_sz != sizeof(TxnTimestamp)) { + std::ostringstream oss; + oss << "Timestamp of transaction must have " << sizeof(TxnTimestamp) + << " bytes. CF comparator " << std::string(ucmp->Name()) + << " timestamp size is " << ts_sz << " bytes"; + return Status::InvalidArgument(oss.str()); + } + if (txn_db_options_.write_policy != WRITE_COMMITTED) { + return Status::NotSupported("Only WriteCommittedTxn supports timestamp"); + } + return Status::OK(); +} + +Status PessimisticTransactionDB::Initialize( + const std::vector& compaction_enabled_cf_indices, + const std::vector& handles) { + for (auto cf_ptr : handles) { + AddColumnFamily(cf_ptr); + } + // Verify cf options + for (auto handle : handles) { + ColumnFamilyDescriptor cfd; + Status s = handle->GetDescriptor(&cfd); + if (!s.ok()) { + return s; + } + s = VerifyCFOptions(cfd.options); + if (!s.ok()) { + return s; + } + } + + // Re-enable compaction for the column families that initially had + // compaction enabled. 
+ std::vector compaction_enabled_cf_handles; + compaction_enabled_cf_handles.reserve(compaction_enabled_cf_indices.size()); + for (auto index : compaction_enabled_cf_indices) { + compaction_enabled_cf_handles.push_back(handles[index]); + } + + Status s = EnableAutoCompaction(compaction_enabled_cf_handles); + + // create 'real' transactions from recovered shell transactions + auto dbimpl = static_cast_with_check(GetRootDB()); + assert(dbimpl != nullptr); + auto rtrxs = dbimpl->recovered_transactions(); + + for (auto it = rtrxs.begin(); it != rtrxs.end(); ++it) { + auto recovered_trx = it->second; + assert(recovered_trx); + assert(recovered_trx->batches_.size() == 1); + const auto& seq = recovered_trx->batches_.begin()->first; + const auto& batch_info = recovered_trx->batches_.begin()->second; + assert(batch_info.log_number_); + assert(recovered_trx->name_.length()); + + WriteOptions w_options; + w_options.sync = true; + TransactionOptions t_options; + // This would help avoiding deadlock for keys that although exist in the WAL + // did not go through concurrency control. This includes the merge that + // MyRocks uses for auto-inc columns. It is safe to do so, since (i) if + // there is a conflict between the keys of two transactions that must be + // avoided, it is already avoided by the application, MyRocks, before the + // restart (ii) application, MyRocks, guarntees to rollback/commit the + // recovered transactions before new transactions start. + t_options.skip_concurrency_control = true; + + Transaction* real_trx = BeginTransaction(w_options, t_options, nullptr); + assert(real_trx); + real_trx->SetLogNumber(batch_info.log_number_); + assert(seq != kMaxSequenceNumber); + if (GetTxnDBOptions().write_policy != WRITE_COMMITTED) { + real_trx->SetId(seq); + } + + s = real_trx->SetName(recovered_trx->name_); + if (!s.ok()) { + break; + } + + s = real_trx->RebuildFromWriteBatch(batch_info.batch_); + // WriteCommitted set this to to disable this check that is specific to + // WritePrepared txns + assert(batch_info.batch_cnt_ == 0 || + real_trx->GetWriteBatch()->SubBatchCnt() == batch_info.batch_cnt_); + real_trx->SetState(Transaction::PREPARED); + if (!s.ok()) { + break; + } + } + if (s.ok()) { + dbimpl->DeleteAllRecoveredTransactions(); + } + return s; +} + +Transaction* WriteCommittedTxnDB::BeginTransaction( + const WriteOptions& write_options, const TransactionOptions& txn_options, + Transaction* old_txn) { + if (old_txn != nullptr) { + ReinitializeTransaction(old_txn, write_options, txn_options); + return old_txn; + } else { + return new WriteCommittedTxn(this, write_options, txn_options); + } +} + +TransactionDBOptions PessimisticTransactionDB::ValidateTxnDBOptions( + const TransactionDBOptions& txn_db_options) { + TransactionDBOptions validated = txn_db_options; + + if (txn_db_options.num_stripes == 0) { + validated.num_stripes = 1; + } + + return validated; +} + +Status TransactionDB::Open(const Options& options, + const TransactionDBOptions& txn_db_options, + const std::string& dbname, TransactionDB** dbptr) { + DBOptions db_options(options); + ColumnFamilyOptions cf_options(options); + std::vector column_families; + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + std::vector handles; + Status s = TransactionDB::Open(db_options, txn_db_options, dbname, + column_families, &handles, dbptr); + if (s.ok()) { + assert(handles.size() == 1); + // i can delete the handle since DBImpl is always holding a reference to + // default column family + 
delete handles[0]; + } + + return s; +} + +Status TransactionDB::Open( + const DBOptions& db_options, const TransactionDBOptions& txn_db_options, + const std::string& dbname, + const std::vector& column_families, + std::vector* handles, TransactionDB** dbptr) { + Status s; + DB* db = nullptr; + if (txn_db_options.write_policy == WRITE_COMMITTED && + db_options.unordered_write) { + return Status::NotSupported( + "WRITE_COMMITTED is incompatible with unordered_writes"); + } + if (txn_db_options.write_policy == WRITE_UNPREPARED && + db_options.unordered_write) { + // TODO(lth): support it + return Status::NotSupported( + "WRITE_UNPREPARED is currently incompatible with unordered_writes"); + } + if (txn_db_options.write_policy == WRITE_PREPARED && + db_options.unordered_write && !db_options.two_write_queues) { + return Status::NotSupported( + "WRITE_PREPARED is incompatible with unordered_writes if " + "two_write_queues is not enabled."); + } + + std::vector column_families_copy = column_families; + std::vector compaction_enabled_cf_indices; + DBOptions db_options_2pc = db_options; + PrepareWrap(&db_options_2pc, &column_families_copy, + &compaction_enabled_cf_indices); + const bool use_seq_per_batch = + txn_db_options.write_policy == WRITE_PREPARED || + txn_db_options.write_policy == WRITE_UNPREPARED; + const bool use_batch_per_txn = + txn_db_options.write_policy == WRITE_COMMITTED || + txn_db_options.write_policy == WRITE_PREPARED; + s = DBImpl::Open(db_options_2pc, dbname, column_families_copy, handles, &db, + use_seq_per_batch, use_batch_per_txn); + if (s.ok()) { + ROCKS_LOG_WARN(db->GetDBOptions().info_log, + "Transaction write_policy is %" PRId32, + static_cast(txn_db_options.write_policy)); + // if WrapDB return non-ok, db will be deleted in WrapDB() via + // ~StackableDB(). + s = WrapDB(db, txn_db_options, compaction_enabled_cf_indices, *handles, + dbptr); + } + return s; +} + +void TransactionDB::PrepareWrap( + DBOptions* db_options, std::vector* column_families, + std::vector* compaction_enabled_cf_indices) { + compaction_enabled_cf_indices->clear(); + + // Enable MemTable History if not already enabled + for (size_t i = 0; i < column_families->size(); i++) { + ColumnFamilyOptions* cf_options = &(*column_families)[i].options; + + if (cf_options->max_write_buffer_size_to_maintain == 0 && + cf_options->max_write_buffer_number_to_maintain == 0) { + // Setting to -1 will set the History size to + // max_write_buffer_number * write_buffer_size. + cf_options->max_write_buffer_size_to_maintain = -1; + } + if (!cf_options->disable_auto_compactions) { + // Disable compactions momentarily to prevent race with DB::Open + cf_options->disable_auto_compactions = true; + compaction_enabled_cf_indices->push_back(i); + } + } + db_options->allow_2pc = true; +} + +namespace { +template +Status WrapAnotherDBInternal( + DBType* db, const TransactionDBOptions& txn_db_options, + const std::vector& compaction_enabled_cf_indices, + const std::vector& handles, TransactionDB** dbptr) { + assert(db != nullptr); + assert(dbptr != nullptr); + *dbptr = nullptr; + std::unique_ptr txn_db; + // txn_db owns object pointed to by the raw db pointer. 
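+  // The write_policy selected in TransactionDBOptions decides which concrete
+  // implementation wraps the db: WRITE_UNPREPARED, WRITE_PREPARED, or the
+  // default WRITE_COMMITTED.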
+ switch (txn_db_options.write_policy) { + case WRITE_UNPREPARED: + txn_db.reset(new WriteUnpreparedTxnDB( + db, PessimisticTransactionDB::ValidateTxnDBOptions(txn_db_options))); + break; + case WRITE_PREPARED: + txn_db.reset(new WritePreparedTxnDB( + db, PessimisticTransactionDB::ValidateTxnDBOptions(txn_db_options))); + break; + case WRITE_COMMITTED: + default: + txn_db.reset(new WriteCommittedTxnDB( + db, PessimisticTransactionDB::ValidateTxnDBOptions(txn_db_options))); + } + txn_db->UpdateCFComparatorMap(handles); + Status s = txn_db->Initialize(compaction_enabled_cf_indices, handles); + // In case of a failure at this point, db is deleted via the txn_db destructor + // and set to nullptr. + if (s.ok()) { + *dbptr = txn_db.release(); + } else { + for (auto* h : handles) { + delete h; + } + // txn_db still owns db, and ~StackableDB() will be called when txn_db goes + // out of scope, deleting the input db pointer. + ROCKS_LOG_FATAL(db->GetDBOptions().info_log, + "Failed to initialize txn_db: %s", s.ToString().c_str()); + } + return s; +} +} // namespace + +Status TransactionDB::WrapDB( + // make sure this db is already opened with memtable history enabled, + // auto compaction distabled and 2 phase commit enabled + DB* db, const TransactionDBOptions& txn_db_options, + const std::vector& compaction_enabled_cf_indices, + const std::vector& handles, TransactionDB** dbptr) { + return WrapAnotherDBInternal(db, txn_db_options, + compaction_enabled_cf_indices, handles, dbptr); +} + +Status TransactionDB::WrapStackableDB( + // make sure this stackable_db is already opened with memtable history + // enabled, auto compaction distabled and 2 phase commit enabled + StackableDB* db, const TransactionDBOptions& txn_db_options, + const std::vector& compaction_enabled_cf_indices, + const std::vector& handles, TransactionDB** dbptr) { + return WrapAnotherDBInternal(db, txn_db_options, + compaction_enabled_cf_indices, handles, dbptr); +} + +// Let LockManager know that this column family exists so it can +// allocate a LockMap for it. 
+void PessimisticTransactionDB::AddColumnFamily( + const ColumnFamilyHandle* handle) { + lock_manager_->AddColumnFamily(handle); +} + +Status PessimisticTransactionDB::CreateColumnFamily( + const ColumnFamilyOptions& options, const std::string& column_family_name, + ColumnFamilyHandle** handle) { + InstrumentedMutexLock l(&column_family_mutex_); + Status s = VerifyCFOptions(options); + if (!s.ok()) { + return s; + } + + s = db_->CreateColumnFamily(options, column_family_name, handle); + if (s.ok()) { + lock_manager_->AddColumnFamily(*handle); + UpdateCFComparatorMap(*handle); + } + + return s; +} + +Status PessimisticTransactionDB::CreateColumnFamilies( + const ColumnFamilyOptions& options, + const std::vector& column_family_names, + std::vector* handles) { + InstrumentedMutexLock l(&column_family_mutex_); + + Status s = VerifyCFOptions(options); + if (!s.ok()) { + return s; + } + + s = db_->CreateColumnFamilies(options, column_family_names, handles); + if (s.ok()) { + for (auto* handle : *handles) { + lock_manager_->AddColumnFamily(handle); + UpdateCFComparatorMap(handle); + } + } + + return s; +} + +Status PessimisticTransactionDB::CreateColumnFamilies( + const std::vector& column_families, + std::vector* handles) { + InstrumentedMutexLock l(&column_family_mutex_); + + for (auto& cf_desc : column_families) { + Status s = VerifyCFOptions(cf_desc.options); + if (!s.ok()) { + return s; + } + } + + Status s = db_->CreateColumnFamilies(column_families, handles); + if (s.ok()) { + for (auto* handle : *handles) { + lock_manager_->AddColumnFamily(handle); + UpdateCFComparatorMap(handle); + } + } + + return s; +} + +// Let LockManager know that it can deallocate the LockMap for this +// column family. +Status PessimisticTransactionDB::DropColumnFamily( + ColumnFamilyHandle* column_family) { + InstrumentedMutexLock l(&column_family_mutex_); + + Status s = db_->DropColumnFamily(column_family); + if (s.ok()) { + lock_manager_->RemoveColumnFamily(column_family); + } + + return s; +} + +Status PessimisticTransactionDB::DropColumnFamilies( + const std::vector& column_families) { + InstrumentedMutexLock l(&column_family_mutex_); + + Status s = db_->DropColumnFamilies(column_families); + if (s.ok()) { + for (auto* handle : column_families) { + lock_manager_->RemoveColumnFamily(handle); + } + } + + return s; +} + +Status PessimisticTransactionDB::TryLock(PessimisticTransaction* txn, + uint32_t cfh_id, + const std::string& key, + bool exclusive) { + return lock_manager_->TryLock(txn, cfh_id, key, GetEnv(), exclusive); +} + +Status PessimisticTransactionDB::TryRangeLock(PessimisticTransaction* txn, + uint32_t cfh_id, + const Endpoint& start_endp, + const Endpoint& end_endp) { + return lock_manager_->TryLock(txn, cfh_id, start_endp, end_endp, GetEnv(), + /*exclusive=*/true); +} + +void PessimisticTransactionDB::UnLock(PessimisticTransaction* txn, + const LockTracker& keys) { + lock_manager_->UnLock(txn, keys, GetEnv()); +} + +void PessimisticTransactionDB::UnLock(PessimisticTransaction* txn, + uint32_t cfh_id, const std::string& key) { + lock_manager_->UnLock(txn, cfh_id, key, GetEnv()); +} + +// Used when wrapping DB write operations in a transaction +Transaction* PessimisticTransactionDB::BeginInternalTransaction( + const WriteOptions& options) { + TransactionOptions txn_options; + Transaction* txn = BeginTransaction(options, txn_options, nullptr); + + // Use default timeout for non-transactional writes + txn->SetLockTimeout(txn_db_options_.default_lock_timeout); + return txn; +} + +// All user Put, Merge, 
Delete, and Write requests must be intercepted to make +// sure that they lock all keys that they are writing to avoid causing conflicts +// with any concurrent transactions. The easiest way to do this is to wrap all +// write operations in a transaction. +// +// Put(), Merge(), and Delete() only lock a single key per call. Write() will +// sort its keys before locking them. This guarantees that TransactionDB write +// methods cannot deadlock with each other (but still could deadlock with a +// Transaction). +Status PessimisticTransactionDB::Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, const Slice& val) { + Status s = FailIfCfEnablesTs(this, column_family); + if (!s.ok()) { + return s; + } + + Transaction* txn = BeginInternalTransaction(options); + txn->DisableIndexing(); + + // Since the client didn't create a transaction, they don't care about + // conflict checking for this write. So we just need to do PutUntracked(). + s = txn->PutUntracked(column_family, key, val); + + if (s.ok()) { + s = txn->Commit(); + } + + delete txn; + + return s; +} + +Status PessimisticTransactionDB::Delete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) { + Status s = FailIfCfEnablesTs(this, column_family); + if (!s.ok()) { + return s; + } + + Transaction* txn = BeginInternalTransaction(wopts); + txn->DisableIndexing(); + + // Since the client didn't create a transaction, they don't care about + // conflict checking for this write. So we just need to do + // DeleteUntracked(). + s = txn->DeleteUntracked(column_family, key); + + if (s.ok()) { + s = txn->Commit(); + } + + delete txn; + + return s; +} + +Status PessimisticTransactionDB::SingleDelete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) { + Status s = FailIfCfEnablesTs(this, column_family); + if (!s.ok()) { + return s; + } + + Transaction* txn = BeginInternalTransaction(wopts); + txn->DisableIndexing(); + + // Since the client didn't create a transaction, they don't care about + // conflict checking for this write. So we just need to do + // SingleDeleteUntracked(). + s = txn->SingleDeleteUntracked(column_family, key); + + if (s.ok()) { + s = txn->Commit(); + } + + delete txn; + + return s; +} + +Status PessimisticTransactionDB::Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) { + Status s = FailIfCfEnablesTs(this, column_family); + if (!s.ok()) { + return s; + } + + Transaction* txn = BeginInternalTransaction(options); + txn->DisableIndexing(); + + // Since the client didn't create a transaction, they don't care about + // conflict checking for this write. So we just need to do + // MergeUntracked(). 
+ s = txn->MergeUntracked(column_family, key, value); + + if (s.ok()) { + s = txn->Commit(); + } + + delete txn; + + return s; +} + +Status PessimisticTransactionDB::Write(const WriteOptions& opts, + WriteBatch* updates) { + return WriteWithConcurrencyControl(opts, updates); +} + +Status WriteCommittedTxnDB::Write(const WriteOptions& opts, + WriteBatch* updates) { + Status s = FailIfBatchHasTs(updates); + if (!s.ok()) { + return s; + } + if (txn_db_options_.skip_concurrency_control) { + return db_impl_->Write(opts, updates); + } else { + return WriteWithConcurrencyControl(opts, updates); + } +} + +Status WriteCommittedTxnDB::Write( + const WriteOptions& opts, + const TransactionDBWriteOptimizations& optimizations, WriteBatch* updates) { + Status s = FailIfBatchHasTs(updates); + if (!s.ok()) { + return s; + } + if (optimizations.skip_concurrency_control) { + return db_impl_->Write(opts, updates); + } else { + return WriteWithConcurrencyControl(opts, updates); + } +} + +void PessimisticTransactionDB::InsertExpirableTransaction( + TransactionID tx_id, PessimisticTransaction* tx) { + assert(tx->GetExpirationTime() > 0); + std::lock_guard lock(map_mutex_); + expirable_transactions_map_.insert({tx_id, tx}); +} + +void PessimisticTransactionDB::RemoveExpirableTransaction(TransactionID tx_id) { + std::lock_guard lock(map_mutex_); + expirable_transactions_map_.erase(tx_id); +} + +bool PessimisticTransactionDB::TryStealingExpiredTransactionLocks( + TransactionID tx_id) { + std::lock_guard lock(map_mutex_); + + auto tx_it = expirable_transactions_map_.find(tx_id); + if (tx_it == expirable_transactions_map_.end()) { + return true; + } + PessimisticTransaction& tx = *(tx_it->second); + return tx.TryStealingLocks(); +} + +void PessimisticTransactionDB::ReinitializeTransaction( + Transaction* txn, const WriteOptions& write_options, + const TransactionOptions& txn_options) { + auto txn_impl = static_cast_with_check(txn); + + txn_impl->Reinitialize(this, write_options, txn_options); +} + +Transaction* PessimisticTransactionDB::GetTransactionByName( + const TransactionName& name) { + std::lock_guard lock(name_map_mutex_); + auto it = transactions_.find(name); + if (it == transactions_.end()) { + return nullptr; + } else { + return it->second; + } +} + +void PessimisticTransactionDB::GetAllPreparedTransactions( + std::vector* transv) { + assert(transv); + transv->clear(); + std::lock_guard lock(name_map_mutex_); + for (auto it = transactions_.begin(); it != transactions_.end(); ++it) { + if (it->second->GetState() == Transaction::PREPARED) { + transv->push_back(it->second); + } + } +} + +LockManager::PointLockStatus PessimisticTransactionDB::GetLockStatusData() { + return lock_manager_->GetPointLockStatus(); +} + +std::vector PessimisticTransactionDB::GetDeadlockInfoBuffer() { + return lock_manager_->GetDeadlockInfoBuffer(); +} + +void PessimisticTransactionDB::SetDeadlockInfoBufferSize(uint32_t target_size) { + lock_manager_->Resize(target_size); +} + +void PessimisticTransactionDB::RegisterTransaction(Transaction* txn) { + assert(txn); + assert(txn->GetName().length() > 0); + assert(GetTransactionByName(txn->GetName()) == nullptr); + assert(txn->GetState() == Transaction::STARTED); + std::lock_guard lock(name_map_mutex_); + transactions_[txn->GetName()] = txn; +} + +void PessimisticTransactionDB::UnregisterTransaction(Transaction* txn) { + assert(txn); + std::lock_guard lock(name_map_mutex_); + auto it = transactions_.find(txn->GetName()); + assert(it != transactions_.end()); + transactions_.erase(it); +} 
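The registration bookkeeping above is what makes named, two-phase transactions discoverable during recovery. A minimal sketch of that flow through the public API (illustrative only; it assumes a txn_db opened as in the earlier sketch, and the name "xid-1" is a placeholder):

// Illustrative sketch, not part of the upstream sources.
Transaction* txn = txn_db->BeginTransaction(WriteOptions());
Status s = txn->SetName("xid-1");  // RegisterTransaction() makes it visible
if (s.ok()) s = txn->Put("k", "v");
if (s.ok()) s = txn->Prepare();    // after a crash, Initialize() above rebuilds
                                   // this as a PREPARED transaction
if (s.ok()) s = txn->Commit();     // committing (or rolling back) unregisters it
delete txn;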
+ +std::pair> +PessimisticTransactionDB::CreateTimestampedSnapshot(TxnTimestamp ts) { + if (kMaxTxnTimestamp == ts) { + return std::make_pair(Status::InvalidArgument("invalid ts"), nullptr); + } + assert(db_impl_); + return db_impl_->CreateTimestampedSnapshot(kMaxSequenceNumber, ts); +} + +std::shared_ptr +PessimisticTransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const { + assert(db_impl_); + return db_impl_->GetTimestampedSnapshot(ts); +} + +void PessimisticTransactionDB::ReleaseTimestampedSnapshotsOlderThan( + TxnTimestamp ts) { + assert(db_impl_); + db_impl_->ReleaseTimestampedSnapshotsOlderThan(ts); +} + +Status PessimisticTransactionDB::GetTimestampedSnapshots( + TxnTimestamp ts_lb, TxnTimestamp ts_ub, + std::vector>& timestamped_snapshots) const { + assert(db_impl_); + return db_impl_->GetTimestampedSnapshots(ts_lb, ts_ub, timestamped_snapshots); +} + +Status SnapshotCreationCallback::operator()(SequenceNumber seq, + bool disable_memtable) { + assert(db_impl_); + assert(commit_ts_ != kMaxTxnTimestamp); + + const bool two_write_queues = + db_impl_->immutable_db_options().two_write_queues; + assert(!two_write_queues || !disable_memtable); +#ifdef NDEBUG + (void)two_write_queues; + (void)disable_memtable; +#endif + + const bool seq_per_batch = db_impl_->seq_per_batch(); + if (!seq_per_batch) { + assert(db_impl_->GetLastPublishedSequence() <= seq); + } else { + assert(db_impl_->GetLastPublishedSequence() < seq); + } + + // Create a snapshot which can also be used for write conflict checking. + auto ret = db_impl_->CreateTimestampedSnapshot(seq, commit_ts_); + snapshot_creation_status_ = ret.first; + snapshot_ = ret.second; + if (snapshot_creation_status_.ok()) { + assert(snapshot_); + } else { + assert(!snapshot_); + } + if (snapshot_ && snapshot_notifier_) { + snapshot_notifier_->SnapshotCreated(snapshot_.get()); + } + return Status::OK(); +} + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/pessimistic_transaction_db.h b/src/rocksdb/utilities/transactions/pessimistic_transaction_db.h new file mode 100644 index 000000000..25cd11054 --- /dev/null +++ b/src/rocksdb/utilities/transactions/pessimistic_transaction_db.h @@ -0,0 +1,318 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
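The timestamped-snapshot plumbing implemented in the preceding .cc file is normally reached through Transaction::CommitAndTryCreateSnapshot() and the TransactionDB getters, as exercised by the tests later in this patch. A brief sketch (illustrative only; the timestamp 100 is arbitrary and txn/txn_db are assumed to exist as in the earlier sketches):

// Illustrative sketch, not part of the upstream sources.
std::shared_ptr<const Snapshot> snap;
Status s = txn->CommitAndTryCreateSnapshot(/*notifier=*/nullptr,
                                           /*ts=*/100, &snap);
if (s.ok() && snap) {
  // The snapshot can later be looked up by timestamp or used for reads.
  std::shared_ptr<const Snapshot> same = txn_db->GetTimestampedSnapshot(100);
  ReadOptions ropts;
  ropts.snapshot = snap.get();
  std::string value;
  s = txn_db->Get(ropts, "k", &value);
}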
+ +#pragma once +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include +#include +#include + +#include "db/db_iter.h" +#include "db/read_callback.h" +#include "db/snapshot_checker.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/utilities/transaction_db.h" +#include "util/cast_util.h" +#include "utilities/transactions/lock/lock_manager.h" +#include "utilities/transactions/lock/range/range_lock_manager.h" +#include "utilities/transactions/pessimistic_transaction.h" +#include "utilities/transactions/write_prepared_txn.h" + +namespace ROCKSDB_NAMESPACE { + +class PessimisticTransactionDB : public TransactionDB { + public: + explicit PessimisticTransactionDB(DB* db, + const TransactionDBOptions& txn_db_options); + + explicit PessimisticTransactionDB(StackableDB* db, + const TransactionDBOptions& txn_db_options); + + virtual ~PessimisticTransactionDB(); + + virtual const Snapshot* GetSnapshot() override { return db_->GetSnapshot(); } + + virtual Status Initialize( + const std::vector& compaction_enabled_cf_indices, + const std::vector& handles); + + Transaction* BeginTransaction(const WriteOptions& write_options, + const TransactionOptions& txn_options, + Transaction* old_txn) override = 0; + + using StackableDB::Put; + virtual Status Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& val) override; + + using StackableDB::Delete; + virtual Status Delete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) override; + + using StackableDB::SingleDelete; + virtual Status SingleDelete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) override; + + using StackableDB::Merge; + virtual Status Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + + using TransactionDB::Write; + virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; + inline Status WriteWithConcurrencyControl(const WriteOptions& opts, + WriteBatch* updates) { + Status s; + if (opts.protection_bytes_per_key > 0) { + s = WriteBatchInternal::UpdateProtectionInfo( + updates, opts.protection_bytes_per_key); + } + if (s.ok()) { + // Need to lock all keys in this batch to prevent write conflicts with + // concurrent transactions. + Transaction* txn = BeginInternalTransaction(opts); + txn->DisableIndexing(); + + auto txn_impl = static_cast_with_check(txn); + + // Since commitBatch sorts the keys before locking, concurrent Write() + // operations will not cause a deadlock. + // In order to avoid a deadlock with a concurrent Transaction, + // Transactions should use a lock timeout. 
+ s = txn_impl->CommitBatch(updates); + + delete txn; + } + + return s; + } + + using StackableDB::CreateColumnFamily; + virtual Status CreateColumnFamily(const ColumnFamilyOptions& options, + const std::string& column_family_name, + ColumnFamilyHandle** handle) override; + + Status CreateColumnFamilies( + const ColumnFamilyOptions& options, + const std::vector& column_family_names, + std::vector* handles) override; + + Status CreateColumnFamilies( + const std::vector& column_families, + std::vector* handles) override; + + using StackableDB::DropColumnFamily; + virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override; + + Status DropColumnFamilies( + const std::vector& column_families) override; + + Status TryLock(PessimisticTransaction* txn, uint32_t cfh_id, + const std::string& key, bool exclusive); + Status TryRangeLock(PessimisticTransaction* txn, uint32_t cfh_id, + const Endpoint& start_endp, const Endpoint& end_endp); + + void UnLock(PessimisticTransaction* txn, const LockTracker& keys); + void UnLock(PessimisticTransaction* txn, uint32_t cfh_id, + const std::string& key); + + void AddColumnFamily(const ColumnFamilyHandle* handle); + + static TransactionDBOptions ValidateTxnDBOptions( + const TransactionDBOptions& txn_db_options); + + const TransactionDBOptions& GetTxnDBOptions() const { + return txn_db_options_; + } + + void InsertExpirableTransaction(TransactionID tx_id, + PessimisticTransaction* tx); + void RemoveExpirableTransaction(TransactionID tx_id); + + // If transaction is no longer available, locks can be stolen + // If transaction is available, try stealing locks directly from transaction + // It is the caller's responsibility to ensure that the referred transaction + // is expirable (GetExpirationTime() > 0) and that it is expired. + bool TryStealingExpiredTransactionLocks(TransactionID tx_id); + + Transaction* GetTransactionByName(const TransactionName& name) override; + + void RegisterTransaction(Transaction* txn); + void UnregisterTransaction(Transaction* txn); + + // not thread safe. current use case is during recovery (single thread) + void GetAllPreparedTransactions(std::vector* trans) override; + + LockManager::PointLockStatus GetLockStatusData() override; + + std::vector GetDeadlockInfoBuffer() override; + void SetDeadlockInfoBufferSize(uint32_t target_size) override; + + // The default implementation does nothing. The actual implementation is moved + // to the child classes that actually need this information. This was due to + // an odd performance drop we observed when the added std::atomic member to + // the base class even when the subclass do not read it in the fast path. + virtual void UpdateCFComparatorMap(const std::vector&) {} + virtual void UpdateCFComparatorMap(ColumnFamilyHandle*) {} + + // Use the returned factory to create LockTrackers in transactions. 
+ const LockTrackerFactory& GetLockTrackerFactory() const { + return lock_manager_->GetLockTrackerFactory(); + } + + std::pair> CreateTimestampedSnapshot( + TxnTimestamp ts) override; + + std::shared_ptr GetTimestampedSnapshot( + TxnTimestamp ts) const override; + + void ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts) override; + + Status GetTimestampedSnapshots(TxnTimestamp ts_lb, TxnTimestamp ts_ub, + std::vector>& + timestamped_snapshots) const override; + + protected: + DBImpl* db_impl_; + std::shared_ptr info_log_; + const TransactionDBOptions txn_db_options_; + + static Status FailIfBatchHasTs(const WriteBatch* wb); + + static Status FailIfCfEnablesTs(const DB* db, + const ColumnFamilyHandle* column_family); + + void ReinitializeTransaction( + Transaction* txn, const WriteOptions& write_options, + const TransactionOptions& txn_options = TransactionOptions()); + + virtual Status VerifyCFOptions(const ColumnFamilyOptions& cf_options); + + private: + friend class WritePreparedTxnDB; + friend class WritePreparedTxnDBMock; + friend class WriteUnpreparedTxn; + friend class TransactionTest_DoubleCrashInRecovery_Test; + friend class TransactionTest_DoubleEmptyWrite_Test; + friend class TransactionTest_DuplicateKeys_Test; + friend class TransactionTest_PersistentTwoPhaseTransactionTest_Test; + friend class TransactionTest_TwoPhaseDoubleRecoveryTest_Test; + friend class TransactionTest_TwoPhaseOutOfOrderDelete_Test; + friend class TransactionStressTest_TwoPhaseLongPrepareTest_Test; + friend class WriteUnpreparedTransactionTest_RecoveryTest_Test; + friend class WriteUnpreparedTransactionTest_MarkLogWithPrepSection_Test; + + Transaction* BeginInternalTransaction(const WriteOptions& options); + + std::shared_ptr lock_manager_; + + // Must be held when adding/dropping column families. + InstrumentedMutex column_family_mutex_; + + // Used to ensure that no locks are stolen from an expirable transaction + // that has started a commit. Only transactions with an expiration time + // should be in this map. + std::mutex map_mutex_; + std::unordered_map + expirable_transactions_map_; + + // map from name to two phase transaction instance + std::mutex name_map_mutex_; + std::unordered_map transactions_; + + // Signal that we are testing a crash scenario. Some asserts could be relaxed + // in such cases. + virtual void TEST_Crash() {} +}; + +// A PessimisticTransactionDB that writes the data to the DB after the commit. +// In this way the DB only contains the committed data. 
+class WriteCommittedTxnDB : public PessimisticTransactionDB { + public: + explicit WriteCommittedTxnDB(DB* db, + const TransactionDBOptions& txn_db_options) + : PessimisticTransactionDB(db, txn_db_options) {} + + explicit WriteCommittedTxnDB(StackableDB* db, + const TransactionDBOptions& txn_db_options) + : PessimisticTransactionDB(db, txn_db_options) {} + + virtual ~WriteCommittedTxnDB() {} + + Transaction* BeginTransaction(const WriteOptions& write_options, + const TransactionOptions& txn_options, + Transaction* old_txn) override; + + // Optimized version of ::Write that makes use of skip_concurrency_control + // hint + using TransactionDB::Write; + virtual Status Write(const WriteOptions& opts, + const TransactionDBWriteOptimizations& optimizations, + WriteBatch* updates) override; + virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; +}; + +inline Status PessimisticTransactionDB::FailIfBatchHasTs( + const WriteBatch* batch) { + if (batch != nullptr && WriteBatchInternal::HasKeyWithTimestamp(*batch)) { + return Status::NotSupported( + "Writes with timestamp must go through transaction API instead of " + "TransactionDB."); + } + return Status::OK(); +} + +inline Status PessimisticTransactionDB::FailIfCfEnablesTs( + const DB* db, const ColumnFamilyHandle* column_family) { + assert(db); + column_family = column_family ? column_family : db->DefaultColumnFamily(); + assert(column_family); + const Comparator* const ucmp = column_family->GetComparator(); + assert(ucmp); + if (ucmp->timestamp_size() > 0) { + return Status::NotSupported( + "Write operation with user timestamp must go through the transaction " + "API instead of TransactionDB."); + } + return Status::OK(); +} + +class SnapshotCreationCallback : public PostMemTableCallback { + public: + explicit SnapshotCreationCallback( + DBImpl* dbi, TxnTimestamp commit_ts, + const std::shared_ptr& notifier, + std::shared_ptr& snapshot) + : db_impl_(dbi), + commit_ts_(commit_ts), + snapshot_notifier_(notifier), + snapshot_(snapshot) { + assert(db_impl_); + } + + ~SnapshotCreationCallback() override { + snapshot_creation_status_.PermitUncheckedError(); + } + + Status operator()(SequenceNumber seq, bool disable_memtable) override; + + private: + DBImpl* const db_impl_; + const TxnTimestamp commit_ts_; + std::shared_ptr snapshot_notifier_; + std::shared_ptr& snapshot_; + + Status snapshot_creation_status_; +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/snapshot_checker.cc b/src/rocksdb/utilities/transactions/snapshot_checker.cc new file mode 100644 index 000000000..76d16681a --- /dev/null +++ b/src/rocksdb/utilities/transactions/snapshot_checker.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/snapshot_checker.h" + +#ifdef ROCKSDB_LITE +#include +#endif // ROCKSDB_LITE + +#include "port/lang.h" +#include "utilities/transactions/write_prepared_txn_db.h" + +namespace ROCKSDB_NAMESPACE { + +#ifdef ROCKSDB_LITE +WritePreparedSnapshotChecker::WritePreparedSnapshotChecker( + WritePreparedTxnDB* /*txn_db*/) {} + +SnapshotCheckerResult WritePreparedSnapshotChecker::CheckInSnapshot( + SequenceNumber /*sequence*/, SequenceNumber /*snapshot_sequence*/) const { + // Should never be called in LITE mode. 
+ assert(false); + return SnapshotCheckerResult::kInSnapshot; +} + +#else + +WritePreparedSnapshotChecker::WritePreparedSnapshotChecker( + WritePreparedTxnDB* txn_db) + : txn_db_(txn_db){}; + +SnapshotCheckerResult WritePreparedSnapshotChecker::CheckInSnapshot( + SequenceNumber sequence, SequenceNumber snapshot_sequence) const { + bool snapshot_released = false; + // TODO(myabandeh): set min_uncommitted + bool in_snapshot = txn_db_->IsInSnapshot( + sequence, snapshot_sequence, kMinUnCommittedSeq, &snapshot_released); + if (snapshot_released) { + return SnapshotCheckerResult::kSnapshotReleased; + } + return in_snapshot ? SnapshotCheckerResult::kInSnapshot + : SnapshotCheckerResult::kNotInSnapshot; +} + +#endif // ROCKSDB_LITE + +DisableGCSnapshotChecker* DisableGCSnapshotChecker::Instance() { + STATIC_AVOID_DESTRUCTION(DisableGCSnapshotChecker, instance); + return &instance; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/transactions/timestamped_snapshot_test.cc b/src/rocksdb/utilities/transactions/timestamped_snapshot_test.cc new file mode 100644 index 000000000..e9b474415 --- /dev/null +++ b/src/rocksdb/utilities/transactions/timestamped_snapshot_test.cc @@ -0,0 +1,466 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifdef ROCKSDB_LITE +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED as Transactions are not supported in LITE mode\n"); + return 0; +} +#else // ROCKSDB_LITE +#include + +#include "util/cast_util.h" +#include "utilities/transactions/transaction_test.h" + +namespace ROCKSDB_NAMESPACE { +INSTANTIATE_TEST_CASE_P( + Unsupported, TimestampedSnapshotWithTsSanityCheck, + ::testing::Values( + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite), + std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite))); + +INSTANTIATE_TEST_CASE_P(WriteCommitted, TransactionTest, + ::testing::Combine(::testing::Bool(), ::testing::Bool(), + ::testing::Values(WRITE_COMMITTED), + ::testing::Values(kOrderedWrite))); + +namespace { +// Not thread-safe. Caller needs to provide external synchronization. 
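+// Used by the tests below to record each snapshot created at commit time and
+// to assert that snapshot sequence numbers and timestamps never decrease.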
+class TsCheckingTxnNotifier : public TransactionNotifier { + public: + explicit TsCheckingTxnNotifier() = default; + + ~TsCheckingTxnNotifier() override {} + + void SnapshotCreated(const Snapshot* new_snapshot) override { + assert(new_snapshot); + if (prev_snapshot_seq_ != kMaxSequenceNumber) { + assert(prev_snapshot_seq_ <= new_snapshot->GetSequenceNumber()); + } + prev_snapshot_seq_ = new_snapshot->GetSequenceNumber(); + if (prev_snapshot_ts_ != kMaxTxnTimestamp) { + assert(prev_snapshot_ts_ <= new_snapshot->GetTimestamp()); + } + prev_snapshot_ts_ = new_snapshot->GetTimestamp(); + } + + TxnTimestamp prev_snapshot_ts() const { return prev_snapshot_ts_; } + + private: + SequenceNumber prev_snapshot_seq_ = kMaxSequenceNumber; + TxnTimestamp prev_snapshot_ts_ = kMaxTxnTimestamp; +}; +} // anonymous namespace + +TEST_P(TimestampedSnapshotWithTsSanityCheck, WithoutCommitTs) { + std::unique_ptr txn( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + assert(txn); + ASSERT_OK(txn->SetName("txn0")); + ASSERT_OK(txn->Put("a", "v")); + ASSERT_OK(txn->Prepare()); + Status s = txn->CommitAndTryCreateSnapshot(); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_OK(txn->Rollback()); + + txn.reset(db->BeginTransaction(WriteOptions(), TransactionOptions())); + assert(txn); + ASSERT_OK(txn->SetName("txn0")); + ASSERT_OK(txn->Put("a", "v")); + s = txn->CommitAndTryCreateSnapshot(); + ASSERT_TRUE(s.IsInvalidArgument()); +} + +TEST_P(TimestampedSnapshotWithTsSanityCheck, SetCommitTs) { + std::unique_ptr txn( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + assert(txn); + ASSERT_OK(txn->SetName("txn0")); + ASSERT_OK(txn->Put("a", "v")); + ASSERT_OK(txn->Prepare()); + std::shared_ptr snapshot; + Status s = txn->CommitAndTryCreateSnapshot(nullptr, 10, &snapshot); + ASSERT_TRUE(s.IsNotSupported()); + ASSERT_OK(txn->Rollback()); + + txn.reset(db->BeginTransaction(WriteOptions(), TransactionOptions())); + assert(txn); + ASSERT_OK(txn->SetName("txn0")); + ASSERT_OK(txn->Put("a", "v")); + s = txn->CommitAndTryCreateSnapshot(nullptr, 10, &snapshot); + ASSERT_TRUE(s.IsNotSupported()); +} + +TEST_P(TransactionTest, WithoutCommitTs) { + std::unique_ptr txn( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + assert(txn); + ASSERT_OK(txn->SetName("txn0")); + ASSERT_OK(txn->Put("a", "v")); + ASSERT_OK(txn->Prepare()); + Status s = txn->CommitAndTryCreateSnapshot(); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_OK(txn->Rollback()); + + txn.reset(db->BeginTransaction(WriteOptions(), TransactionOptions())); + assert(txn); + ASSERT_OK(txn->SetName("txn0")); + ASSERT_OK(txn->Put("a", "v")); + s = txn->CommitAndTryCreateSnapshot(); + ASSERT_TRUE(s.IsInvalidArgument()); +} + +TEST_P(TransactionTest, ReuseExistingTxn) { + Transaction* txn = db->BeginTransaction(WriteOptions(), TransactionOptions()); + assert(txn); + ASSERT_OK(txn->SetName("txn0")); + ASSERT_OK(txn->Put("a", "v1")); + ASSERT_OK(txn->Prepare()); + + auto notifier = std::make_shared(); + std::shared_ptr snapshot1; + Status s = + txn->CommitAndTryCreateSnapshot(notifier, /*commit_ts=*/100, &snapshot1); + ASSERT_OK(s); + ASSERT_EQ(100, snapshot1->GetTimestamp()); + + Transaction* txn1 = + db->BeginTransaction(WriteOptions(), TransactionOptions(), txn); + assert(txn1 == txn); + ASSERT_OK(txn1->SetName("txn1")); + ASSERT_OK(txn->Put("a", "v2")); + ASSERT_OK(txn->Prepare()); + std::shared_ptr snapshot2; + s = txn->CommitAndTryCreateSnapshot(notifier, /*commit_ts=*/110, &snapshot2); + ASSERT_OK(s); + ASSERT_EQ(110, 
snapshot2->GetTimestamp()); + delete txn; + + { + std::string value; + ReadOptions read_opts; + read_opts.snapshot = snapshot1.get(); + ASSERT_OK(db->Get(read_opts, "a", &value)); + ASSERT_EQ("v1", value); + + read_opts.snapshot = snapshot2.get(); + ASSERT_OK(db->Get(read_opts, "a", &value)); + ASSERT_EQ("v2", value); + } +} + +TEST_P(TransactionTest, CreateSnapshotWhenCommit) { + std::unique_ptr txn( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + assert(txn); + + constexpr int batch_size = 10; + for (int i = 0; i < batch_size; ++i) { + ASSERT_OK(db->Put(WriteOptions(), "k" + std::to_string(i), "v0")); + } + const SequenceNumber seq0 = db->GetLatestSequenceNumber(); + ASSERT_EQ(static_cast(batch_size), seq0); + + txn->SetSnapshot(); + { + const Snapshot* const snapshot = txn->GetSnapshot(); + assert(snapshot); + ASSERT_EQ(seq0, snapshot->GetSequenceNumber()); + } + + for (int i = 0; i < batch_size; ++i) { + ASSERT_OK(txn->Put("k" + std::to_string(i), "v1")); + } + ASSERT_OK(txn->SetName("txn0")); + ASSERT_OK(txn->Prepare()); + + std::shared_ptr snapshot; + constexpr TxnTimestamp timestamp = 1; + auto notifier = std::make_shared(); + Status s = txn->CommitAndTryCreateSnapshot(notifier, timestamp, &snapshot); + ASSERT_OK(s); + ASSERT_LT(notifier->prev_snapshot_ts(), kMaxTxnTimestamp); + assert(snapshot); + ASSERT_EQ(timestamp, snapshot->GetTimestamp()); + ASSERT_EQ(seq0 + batch_size, snapshot->GetSequenceNumber()); + const Snapshot* const raw_snapshot_ptr = txn->GetSnapshot(); + ASSERT_EQ(raw_snapshot_ptr, snapshot.get()); + ASSERT_EQ(snapshot, txn->GetTimestampedSnapshot()); + + { + std::shared_ptr snapshot1 = + db->GetLatestTimestampedSnapshot(); + ASSERT_EQ(snapshot, snapshot1); + } + { + std::shared_ptr snapshot1 = + db->GetTimestampedSnapshot(timestamp); + ASSERT_EQ(snapshot, snapshot1); + } + { + std::vector > snapshots; + s = db->GetAllTimestampedSnapshots(snapshots); + ASSERT_OK(s); + ASSERT_EQ(std::vector >{snapshot}, + snapshots); + } +} + +TEST_P(TransactionTest, CreateSnapshot) { + // First create a non-timestamped snapshot + ManagedSnapshot snapshot_guard(db); + for (int i = 0; i < 10; ++i) { + ASSERT_OK(db->Put(WriteOptions(), "k" + std::to_string(i), + "v0_" + std::to_string(i))); + } + { + auto ret = db->CreateTimestampedSnapshot(kMaxTxnTimestamp); + ASSERT_TRUE(ret.first.IsInvalidArgument()); + auto snapshot = ret.second; + ASSERT_EQ(nullptr, snapshot.get()); + } + constexpr TxnTimestamp timestamp = 100; + Status s; + std::shared_ptr ts_snap0; + std::tie(s, ts_snap0) = db->CreateTimestampedSnapshot(timestamp); + ASSERT_OK(s); + assert(ts_snap0); + ASSERT_EQ(timestamp, ts_snap0->GetTimestamp()); + for (int i = 0; i < 10; ++i) { + ASSERT_OK(db->Delete(WriteOptions(), "k" + std::to_string(i))); + } + { + ReadOptions read_opts; + read_opts.snapshot = ts_snap0.get(); + for (int i = 0; i < 10; ++i) { + std::string value; + s = db->Get(read_opts, "k" + std::to_string(i), &value); + ASSERT_OK(s); + ASSERT_EQ("v0_" + std::to_string(i), value); + } + } + { + std::shared_ptr snapshot = + db->GetLatestTimestampedSnapshot(); + ASSERT_EQ(ts_snap0, snapshot); + } + { + std::shared_ptr snapshot = + db->GetTimestampedSnapshot(timestamp); + ASSERT_OK(s); + ASSERT_EQ(ts_snap0, snapshot); + } + { + std::vector > snapshots; + s = db->GetAllTimestampedSnapshots(snapshots); + ASSERT_OK(s); + ASSERT_EQ(std::vector >{ts_snap0}, + snapshots); + } +} + +TEST_P(TransactionTest, SequenceAndTsOrder) { + Status s; + std::shared_ptr snapshot; + std::tie(s, snapshot) = 
db->CreateTimestampedSnapshot(100); + ASSERT_OK(s); + assert(snapshot); + { + // Cannot request smaller timestamp for the new timestamped snapshot. + std::shared_ptr tmp_snapshot; + std::tie(s, tmp_snapshot) = db->CreateTimestampedSnapshot(50); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_EQ(nullptr, tmp_snapshot.get()); + } + + // If requesting a new timestamped snapshot with the same timestamp and + // sequence number, we avoid creating new snapshot object but reuse + // exisisting one. + std::shared_ptr snapshot1; + std::tie(s, snapshot1) = db->CreateTimestampedSnapshot(100); + ASSERT_OK(s); + ASSERT_EQ(snapshot.get(), snapshot1.get()); + + // If there is no write, but we request a larger timestamp, we still create + // a new snapshot object. + std::shared_ptr snapshot2; + std::tie(s, snapshot2) = db->CreateTimestampedSnapshot(200); + ASSERT_OK(s); + assert(snapshot2); + ASSERT_NE(snapshot.get(), snapshot2.get()); + ASSERT_EQ(snapshot2->GetSequenceNumber(), snapshot->GetSequenceNumber()); + ASSERT_EQ(200, snapshot2->GetTimestamp()); + + // Increase sequence number. + ASSERT_OK(db->Put(WriteOptions(), "foo", "v0")); + { + // We are requesting the same timestamp for a larger sequence number, thus + // we cannot create timestamped snapshot. + std::shared_ptr tmp_snapshot; + std::tie(s, tmp_snapshot) = db->CreateTimestampedSnapshot(200); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_EQ(nullptr, tmp_snapshot.get()); + } + { + std::unique_ptr txn1( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + ASSERT_OK(txn1->Put("bar", "v0")); + std::shared_ptr ss; + ASSERT_OK(txn1->CommitAndTryCreateSnapshot(nullptr, 200, &ss)); + // Cannot create snapshot because requested timestamp is the same as the + // latest timestamped snapshot while sequence number is strictly higher. + ASSERT_EQ(nullptr, ss); + } + { + std::unique_ptr txn2( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + ASSERT_OK(txn2->Put("bar", "v0")); + std::shared_ptr ss; + // Application should never do this. This is just to demonstrate error + // handling. + ASSERT_OK(txn2->CommitAndTryCreateSnapshot(nullptr, 100, &ss)); + // Cannot create snapshot because requested timestamp is smaller than + // latest timestamped snapshot. + ASSERT_EQ(nullptr, ss); + } +} + +TEST_P(TransactionTest, CloseDbWithSnapshots) { + std::unique_ptr txn( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + ASSERT_OK(txn->SetName("txn0")); + ASSERT_OK(txn->Put("foo", "v")); + ASSERT_OK(txn->Prepare()); + std::shared_ptr snapshot; + constexpr TxnTimestamp timestamp = 121; + auto notifier = std::make_shared(); + ASSERT_OK(txn->CommitAndTryCreateSnapshot(notifier, timestamp, &snapshot)); + assert(snapshot); + ASSERT_LT(notifier->prev_snapshot_ts(), kMaxTxnTimestamp); + ASSERT_EQ(timestamp, snapshot->GetTimestamp()); + ASSERT_TRUE(db->Close().IsAborted()); +} + +TEST_P(TransactionTest, MultipleTimestampedSnapshots) { + auto* dbimpl = static_cast_with_check(db->GetRootDB()); + assert(dbimpl); + const bool seq_per_batch = dbimpl->seq_per_batch(); + // TODO: remove the following assert(!seq_per_batch) once timestamped snapshot + // is supported in write-prepared/write-unprepared transactions. 
+ assert(!seq_per_batch); + constexpr size_t txn_size = 10; + constexpr TxnTimestamp ts_delta = 10; + constexpr size_t num_txns = 100; + std::vector > snapshots(num_txns); + constexpr TxnTimestamp start_ts = 10000; + auto notifier = std::make_shared(); + for (size_t i = 0; i < num_txns; ++i) { + std::unique_ptr txn( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + ASSERT_OK(txn->SetName("txn" + std::to_string(i))); + for (size_t j = 0; j < txn_size; ++j) { + ASSERT_OK(txn->Put("k" + std::to_string(j), + "v" + std::to_string(j) + "_" + std::to_string(i))); + } + if (0 == (i % 2)) { + ASSERT_OK(txn->Prepare()); + } + ASSERT_OK(txn->CommitAndTryCreateSnapshot(notifier, start_ts + i * ts_delta, + &snapshots[i])); + assert(snapshots[i]); + ASSERT_LT(notifier->prev_snapshot_ts(), kMaxTxnTimestamp); + ASSERT_EQ(start_ts + i * ts_delta, snapshots[i]->GetTimestamp()); + } + + { + auto snapshot = db->GetTimestampedSnapshot(start_ts + 1); + ASSERT_EQ(nullptr, snapshot); + } + + constexpr TxnTimestamp max_ts = start_ts + num_txns * ts_delta; + for (size_t i = 0; i < num_txns; ++i) { + auto snapshot = db->GetTimestampedSnapshot(start_ts + i * ts_delta); + ASSERT_EQ(snapshots[i], snapshot); + + std::vector > tmp_snapshots; + Status s = db->GetTimestampedSnapshots(max_ts, start_ts + i * ts_delta, + tmp_snapshots); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_TRUE(tmp_snapshots.empty()); + + for (size_t j = i; j < num_txns; ++j) { + std::vector > expected_snapshots( + snapshots.begin() + i, snapshots.begin() + j); + tmp_snapshots.clear(); + s = db->GetTimestampedSnapshots(start_ts + i * ts_delta, + start_ts + j * ts_delta, tmp_snapshots); + if (i < j) { + ASSERT_OK(s); + } else { + ASSERT_TRUE(s.IsInvalidArgument()); + } + ASSERT_EQ(expected_snapshots, tmp_snapshots); + } + } + + { + std::vector > tmp_snapshots; + const Status s = db->GetAllTimestampedSnapshots(tmp_snapshots); + ASSERT_OK(s); + ASSERT_EQ(snapshots, tmp_snapshots); + + const std::shared_ptr latest_snapshot = + db->GetLatestTimestampedSnapshot(); + ASSERT_EQ(snapshots.back(), latest_snapshot); + } + + for (size_t i = 0; i <= num_txns; ++i) { + std::vector > snapshots1( + snapshots.begin() + i, snapshots.end()); + if (i > 0) { + auto snapshot1 = + db->GetTimestampedSnapshot(start_ts + (i - 1) * ts_delta); + assert(snapshot1); + ASSERT_EQ(start_ts + (i - 1) * ts_delta, snapshot1->GetTimestamp()); + } + + db->ReleaseTimestampedSnapshotsOlderThan(start_ts + i * ts_delta); + + if (i > 0) { + auto snapshot1 = + db->GetTimestampedSnapshot(start_ts + (i - 1) * ts_delta); + ASSERT_EQ(nullptr, snapshot1); + } + + std::vector > tmp_snapshots; + const Status s = db->GetAllTimestampedSnapshots(tmp_snapshots); + ASSERT_OK(s); + ASSERT_EQ(snapshots1, tmp_snapshots); + } + + // Even after released by db, the applications still hold reference to shared + // snapshots. 
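+  // (ReleaseTimestampedSnapshotsOlderThan only drops the DB's own references;
+  // the std::shared_ptrs held in `snapshots` keep the Snapshot objects alive,
+  // so their timestamps are still readable below.)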
+ for (size_t i = 0; i < num_txns; ++i) { + assert(snapshots[i]); + ASSERT_EQ(start_ts + i * ts_delta, snapshots[i]->GetTimestamp()); + } + + snapshots.clear(); + ASSERT_OK(db->Close()); + delete db; + db = nullptr; +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/transaction_base.cc b/src/rocksdb/utilities/transactions/transaction_base.cc new file mode 100644 index 000000000..83fd94ac8 --- /dev/null +++ b/src/rocksdb/utilities/transactions/transaction_base.cc @@ -0,0 +1,731 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "utilities/transactions/transaction_base.h" + +#include + +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" +#include "logging/logging.h" +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/status.h" +#include "util/cast_util.h" +#include "util/string_util.h" +#include "utilities/transactions/lock/lock_tracker.h" + +namespace ROCKSDB_NAMESPACE { + +Status Transaction::CommitAndTryCreateSnapshot( + std::shared_ptr notifier, TxnTimestamp ts, + std::shared_ptr* snapshot) { + if (snapshot) { + snapshot->reset(); + } + TxnTimestamp commit_ts = GetCommitTimestamp(); + if (commit_ts == kMaxTxnTimestamp) { + if (ts == kMaxTxnTimestamp) { + return Status::InvalidArgument("Commit timestamp unset"); + } else { + const Status s = SetCommitTimestamp(ts); + if (!s.ok()) { + return s; + } + } + } else if (ts != kMaxTxnTimestamp) { + if (ts != commit_ts) { + // For now we treat this as error. + return Status::InvalidArgument("Different commit ts specified"); + } + } + SetSnapshotOnNextOperation(notifier); + Status s = Commit(); + if (!s.ok()) { + return s; + } + assert(s.ok()); + // If we reach here, we must return ok status for this function. 
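+  // Commit() installed the snapshot requested above via
+  // SetSnapshotOnNextOperation(). When the implementation supports it, this is
+  // a timestamped snapshot taken as part of the commit, so the snapshot
+  // returned below carries the commit timestamp; it may also be null (e.g. if
+  // the requested timestamp is not newer than the latest timestamped
+  // snapshot), which is still reported as OK to the caller.
+  // A minimal usage sketch (names are illustrative):
+  //   std::shared_ptr<const Snapshot> snap;
+  //   Status s = txn->CommitAndTryCreateSnapshot(notifier, commit_ts, &snap);
+  //   // on success, snap is either null or satisfies
+  //   // snap->GetTimestamp() == commit_ts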
+ std::shared_ptr new_snapshot = GetTimestampedSnapshot(); + + if (snapshot) { + *snapshot = new_snapshot; + } + return Status::OK(); +} + +TransactionBaseImpl::TransactionBaseImpl( + DB* db, const WriteOptions& write_options, + const LockTrackerFactory& lock_tracker_factory) + : db_(db), + dbimpl_(static_cast_with_check(db)), + write_options_(write_options), + cmp_(GetColumnFamilyUserComparator(db->DefaultColumnFamily())), + lock_tracker_factory_(lock_tracker_factory), + start_time_(dbimpl_->GetSystemClock()->NowMicros()), + write_batch_(cmp_, 0, true, 0, write_options.protection_bytes_per_key), + tracked_locks_(lock_tracker_factory_.Create()), + commit_time_batch_(0 /* reserved_bytes */, 0 /* max_bytes */, + write_options.protection_bytes_per_key, + 0 /* default_cf_ts_sz */), + indexing_enabled_(true) { + assert(dynamic_cast(db_) != nullptr); + log_number_ = 0; + if (dbimpl_->allow_2pc()) { + InitWriteBatch(); + } +} + +TransactionBaseImpl::~TransactionBaseImpl() { + // Release snapshot if snapshot is set + SetSnapshotInternal(nullptr); +} + +void TransactionBaseImpl::Clear() { + save_points_.reset(nullptr); + write_batch_.Clear(); + commit_time_batch_.Clear(); + tracked_locks_->Clear(); + num_puts_ = 0; + num_deletes_ = 0; + num_merges_ = 0; + + if (dbimpl_->allow_2pc()) { + InitWriteBatch(); + } +} + +void TransactionBaseImpl::Reinitialize(DB* db, + const WriteOptions& write_options) { + Clear(); + ClearSnapshot(); + id_ = 0; + db_ = db; + name_.clear(); + log_number_ = 0; + write_options_ = write_options; + start_time_ = dbimpl_->GetSystemClock()->NowMicros(); + indexing_enabled_ = true; + cmp_ = GetColumnFamilyUserComparator(db_->DefaultColumnFamily()); + WriteBatchInternal::UpdateProtectionInfo( + write_batch_.GetWriteBatch(), write_options_.protection_bytes_per_key) + .PermitUncheckedError(); + WriteBatchInternal::UpdateProtectionInfo( + &commit_time_batch_, write_options_.protection_bytes_per_key) + .PermitUncheckedError(); +} + +void TransactionBaseImpl::SetSnapshot() { + const Snapshot* snapshot = dbimpl_->GetSnapshotForWriteConflictBoundary(); + SetSnapshotInternal(snapshot); +} + +void TransactionBaseImpl::SetSnapshotInternal(const Snapshot* snapshot) { + // Set a custom deleter for the snapshot_ SharedPtr as the snapshot needs to + // be released, not deleted when it is no longer referenced. 
+ snapshot_.reset(snapshot, std::bind(&TransactionBaseImpl::ReleaseSnapshot, + this, std::placeholders::_1, db_)); + snapshot_needed_ = false; + snapshot_notifier_ = nullptr; +} + +void TransactionBaseImpl::SetSnapshotOnNextOperation( + std::shared_ptr notifier) { + snapshot_needed_ = true; + snapshot_notifier_ = notifier; +} + +void TransactionBaseImpl::SetSnapshotIfNeeded() { + if (snapshot_needed_) { + std::shared_ptr notifier = snapshot_notifier_; + SetSnapshot(); + if (notifier != nullptr) { + notifier->SnapshotCreated(GetSnapshot()); + } + } +} + +Status TransactionBaseImpl::TryLock(ColumnFamilyHandle* column_family, + const SliceParts& key, bool read_only, + bool exclusive, const bool do_validate, + const bool assume_tracked) { + size_t key_size = 0; + for (int i = 0; i < key.num_parts; ++i) { + key_size += key.parts[i].size(); + } + + std::string str; + str.reserve(key_size); + + for (int i = 0; i < key.num_parts; ++i) { + str.append(key.parts[i].data(), key.parts[i].size()); + } + + return TryLock(column_family, str, read_only, exclusive, do_validate, + assume_tracked); +} + +void TransactionBaseImpl::SetSavePoint() { + if (save_points_ == nullptr) { + save_points_.reset( + new std::stack>()); + } + save_points_->emplace(snapshot_, snapshot_needed_, snapshot_notifier_, + num_puts_, num_deletes_, num_merges_, + lock_tracker_factory_); + write_batch_.SetSavePoint(); +} + +Status TransactionBaseImpl::RollbackToSavePoint() { + if (save_points_ != nullptr && save_points_->size() > 0) { + // Restore saved SavePoint + TransactionBaseImpl::SavePoint& save_point = save_points_->top(); + snapshot_ = save_point.snapshot_; + snapshot_needed_ = save_point.snapshot_needed_; + snapshot_notifier_ = save_point.snapshot_notifier_; + num_puts_ = save_point.num_puts_; + num_deletes_ = save_point.num_deletes_; + num_merges_ = save_point.num_merges_; + + // Rollback batch + Status s = write_batch_.RollbackToSavePoint(); + assert(s.ok()); + + // Rollback any keys that were tracked since the last savepoint + tracked_locks_->Subtract(*save_point.new_locks_); + + save_points_->pop(); + + return s; + } else { + assert(write_batch_.RollbackToSavePoint().IsNotFound()); + return Status::NotFound(); + } +} + +Status TransactionBaseImpl::PopSavePoint() { + if (save_points_ == nullptr || save_points_->empty()) { + // No SavePoint yet. + assert(write_batch_.PopSavePoint().IsNotFound()); + return Status::NotFound(); + } + + assert(!save_points_->empty()); + // If there is another savepoint A below the current savepoint B, then A needs + // to inherit tracked_keys in B so that if we rollback to savepoint A, we + // remember to unlock keys in B. If there is no other savepoint below, then we + // can safely discard savepoint info. 
+ if (save_points_->size() == 1) { + save_points_->pop(); + } else { + TransactionBaseImpl::SavePoint top(lock_tracker_factory_); + std::swap(top, save_points_->top()); + save_points_->pop(); + + save_points_->top().new_locks_->Merge(*top.new_locks_); + } + + return write_batch_.PopSavePoint(); +} + +Status TransactionBaseImpl::Get(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, std::string* value) { + assert(value != nullptr); + PinnableSlice pinnable_val(value); + assert(!pinnable_val.IsPinned()); + auto s = Get(read_options, column_family, key, &pinnable_val); + if (s.ok() && pinnable_val.IsPinned()) { + value->assign(pinnable_val.data(), pinnable_val.size()); + } // else value is already assigned + return s; +} + +Status TransactionBaseImpl::Get(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* pinnable_val) { + return write_batch_.GetFromBatchAndDB(db_, read_options, column_family, key, + pinnable_val); +} + +Status TransactionBaseImpl::GetForUpdate(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, std::string* value, + bool exclusive, + const bool do_validate) { + if (!do_validate && read_options.snapshot != nullptr) { + return Status::InvalidArgument( + "If do_validate is false then GetForUpdate with snapshot is not " + "defined."); + } + Status s = + TryLock(column_family, key, true /* read_only */, exclusive, do_validate); + + if (s.ok() && value != nullptr) { + assert(value != nullptr); + PinnableSlice pinnable_val(value); + assert(!pinnable_val.IsPinned()); + s = Get(read_options, column_family, key, &pinnable_val); + if (s.ok() && pinnable_val.IsPinned()) { + value->assign(pinnable_val.data(), pinnable_val.size()); + } // else value is already assigned + } + return s; +} + +Status TransactionBaseImpl::GetForUpdate(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, + PinnableSlice* pinnable_val, + bool exclusive, + const bool do_validate) { + if (!do_validate && read_options.snapshot != nullptr) { + return Status::InvalidArgument( + "If do_validate is false then GetForUpdate with snapshot is not " + "defined."); + } + Status s = + TryLock(column_family, key, true /* read_only */, exclusive, do_validate); + + if (s.ok() && pinnable_val != nullptr) { + s = Get(read_options, column_family, key, pinnable_val); + } + return s; +} + +std::vector TransactionBaseImpl::MultiGet( + const ReadOptions& read_options, + const std::vector& column_family, + const std::vector& keys, std::vector* values) { + size_t num_keys = keys.size(); + values->resize(num_keys); + + std::vector stat_list(num_keys); + for (size_t i = 0; i < num_keys; ++i) { + stat_list[i] = Get(read_options, column_family[i], keys[i], &(*values)[i]); + } + + return stat_list; +} + +void TransactionBaseImpl::MultiGet(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool sorted_input) { + write_batch_.MultiGetFromBatchAndDB(db_, read_options, column_family, + num_keys, keys, values, statuses, + sorted_input); +} + +std::vector TransactionBaseImpl::MultiGetForUpdate( + const ReadOptions& read_options, + const std::vector& column_family, + const std::vector& keys, std::vector* values) { + // Regardless of whether the MultiGet succeeded, track these keys. 
+ size_t num_keys = keys.size(); + values->resize(num_keys); + + // Lock all keys + for (size_t i = 0; i < num_keys; ++i) { + Status s = TryLock(column_family[i], keys[i], true /* read_only */, + true /* exclusive */); + if (!s.ok()) { + // Fail entire multiget if we cannot lock all keys + return std::vector(num_keys, s); + } + } + + // TODO(agiardullo): optimize multiget? + std::vector stat_list(num_keys); + for (size_t i = 0; i < num_keys; ++i) { + stat_list[i] = Get(read_options, column_family[i], keys[i], &(*values)[i]); + } + + return stat_list; +} + +Iterator* TransactionBaseImpl::GetIterator(const ReadOptions& read_options) { + Iterator* db_iter = db_->NewIterator(read_options); + assert(db_iter); + + return write_batch_.NewIteratorWithBase(db_->DefaultColumnFamily(), db_iter, + &read_options); +} + +Iterator* TransactionBaseImpl::GetIterator(const ReadOptions& read_options, + ColumnFamilyHandle* column_family) { + Iterator* db_iter = db_->NewIterator(read_options, column_family); + assert(db_iter); + + return write_batch_.NewIteratorWithBase(column_family, db_iter, + &read_options); +} + +Status TransactionBaseImpl::Put(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, do_validate, assume_tracked); + + if (s.ok()) { + s = GetBatchForWrite()->Put(column_family, key, value); + if (s.ok()) { + num_puts_++; + } + } + + return s; +} + +Status TransactionBaseImpl::Put(ColumnFamilyHandle* column_family, + const SliceParts& key, const SliceParts& value, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, do_validate, assume_tracked); + + if (s.ok()) { + s = GetBatchForWrite()->Put(column_family, key, value); + if (s.ok()) { + num_puts_++; + } + } + + return s; +} + +Status TransactionBaseImpl::Merge(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, do_validate, assume_tracked); + + if (s.ok()) { + s = GetBatchForWrite()->Merge(column_family, key, value); + if (s.ok()) { + num_merges_++; + } + } + + return s; +} + +Status TransactionBaseImpl::Delete(ColumnFamilyHandle* column_family, + const Slice& key, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, do_validate, assume_tracked); + + if (s.ok()) { + s = GetBatchForWrite()->Delete(column_family, key); + if (s.ok()) { + num_deletes_++; + } + } + + return s; +} + +Status TransactionBaseImpl::Delete(ColumnFamilyHandle* column_family, + const SliceParts& key, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, do_validate, assume_tracked); + + if (s.ok()) { + s = GetBatchForWrite()->Delete(column_family, key); + if (s.ok()) { + num_deletes_++; + } + } + + return s; +} + +Status TransactionBaseImpl::SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, do_validate, 
assume_tracked); + + if (s.ok()) { + s = GetBatchForWrite()->SingleDelete(column_family, key); + if (s.ok()) { + num_deletes_++; + } + } + + return s; +} + +Status TransactionBaseImpl::SingleDelete(ColumnFamilyHandle* column_family, + const SliceParts& key, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, do_validate, assume_tracked); + + if (s.ok()) { + s = GetBatchForWrite()->SingleDelete(column_family, key); + if (s.ok()) { + num_deletes_++; + } + } + + return s; +} + +Status TransactionBaseImpl::PutUntracked(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) { + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, false /* do_validate */); + + if (s.ok()) { + s = GetBatchForWrite()->Put(column_family, key, value); + if (s.ok()) { + num_puts_++; + } + } + + return s; +} + +Status TransactionBaseImpl::PutUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key, + const SliceParts& value) { + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, false /* do_validate */); + + if (s.ok()) { + s = GetBatchForWrite()->Put(column_family, key, value); + if (s.ok()) { + num_puts_++; + } + } + + return s; +} + +Status TransactionBaseImpl::MergeUntracked(ColumnFamilyHandle* column_family, + const Slice& key, + const Slice& value) { + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, false /* do_validate */); + + if (s.ok()) { + s = GetBatchForWrite()->Merge(column_family, key, value); + if (s.ok()) { + num_merges_++; + } + } + + return s; +} + +Status TransactionBaseImpl::DeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) { + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, false /* do_validate */); + + if (s.ok()) { + s = GetBatchForWrite()->Delete(column_family, key); + if (s.ok()) { + num_deletes_++; + } + } + + return s; +} + +Status TransactionBaseImpl::DeleteUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key) { + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, false /* do_validate */); + + if (s.ok()) { + s = GetBatchForWrite()->Delete(column_family, key); + if (s.ok()) { + num_deletes_++; + } + } + + return s; +} + +Status TransactionBaseImpl::SingleDeleteUntracked( + ColumnFamilyHandle* column_family, const Slice& key) { + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, false /* do_validate */); + + if (s.ok()) { + s = GetBatchForWrite()->SingleDelete(column_family, key); + if (s.ok()) { + num_deletes_++; + } + } + + return s; +} + +void TransactionBaseImpl::PutLogData(const Slice& blob) { + auto s = write_batch_.PutLogData(blob); + (void)s; + assert(s.ok()); +} + +WriteBatchWithIndex* TransactionBaseImpl::GetWriteBatch() { + return &write_batch_; +} + +uint64_t TransactionBaseImpl::GetElapsedTime() const { + return (dbimpl_->GetSystemClock()->NowMicros() - start_time_) / 1000; +} + +uint64_t TransactionBaseImpl::GetNumPuts() const { return num_puts_; } + +uint64_t TransactionBaseImpl::GetNumDeletes() const { return num_deletes_; } + +uint64_t TransactionBaseImpl::GetNumMerges() const { return num_merges_; } + +uint64_t TransactionBaseImpl::GetNumKeys() const { + return tracked_locks_->GetNumPointLocks(); +} + +void TransactionBaseImpl::TrackKey(uint32_t cfh_id, const std::string& key, + 
SequenceNumber seq, bool read_only, + bool exclusive) { + PointLockRequest r; + r.column_family_id = cfh_id; + r.key = key; + r.seq = seq; + r.read_only = read_only; + r.exclusive = exclusive; + + // Update map of all tracked keys for this transaction + tracked_locks_->Track(r); + + if (save_points_ != nullptr && !save_points_->empty()) { + // Update map of tracked keys in this SavePoint + save_points_->top().new_locks_->Track(r); + } +} + +// Gets the write batch that should be used for Put/Merge/Deletes. +// +// Returns either a WriteBatch or WriteBatchWithIndex depending on whether +// DisableIndexing() has been called. +WriteBatchBase* TransactionBaseImpl::GetBatchForWrite() { + if (indexing_enabled_) { + // Use WriteBatchWithIndex + return &write_batch_; + } else { + // Don't use WriteBatchWithIndex. Return base WriteBatch. + return write_batch_.GetWriteBatch(); + } +} + +void TransactionBaseImpl::ReleaseSnapshot(const Snapshot* snapshot, DB* db) { + if (snapshot != nullptr) { + ROCKS_LOG_DETAILS(dbimpl_->immutable_db_options().info_log, + "ReleaseSnapshot %" PRIu64 " Set", + snapshot->GetSequenceNumber()); + db->ReleaseSnapshot(snapshot); + } +} + +void TransactionBaseImpl::UndoGetForUpdate(ColumnFamilyHandle* column_family, + const Slice& key) { + PointLockRequest r; + r.column_family_id = GetColumnFamilyID(column_family); + r.key = key.ToString(); + r.read_only = true; + + bool can_untrack = false; + if (save_points_ != nullptr && !save_points_->empty()) { + // If there is no GetForUpdate of the key in this save point, + // then cannot untrack from the global lock tracker. + UntrackStatus s = save_points_->top().new_locks_->Untrack(r); + can_untrack = (s != UntrackStatus::NOT_TRACKED); + } else { + // No save point, so can untrack from the global lock tracker. + can_untrack = true; + } + + if (can_untrack) { + // If erased from the global tracker, then can unlock the key. + UntrackStatus s = tracked_locks_->Untrack(r); + bool can_unlock = (s == UntrackStatus::REMOVED); + if (can_unlock) { + UnlockGetForUpdate(column_family, key); + } + } +} + +Status TransactionBaseImpl::RebuildFromWriteBatch(WriteBatch* src_batch) { + struct IndexedWriteBatchBuilder : public WriteBatch::Handler { + Transaction* txn_; + DBImpl* db_; + IndexedWriteBatchBuilder(Transaction* txn, DBImpl* db) + : txn_(txn), db_(db) { + assert(dynamic_cast(txn_) != nullptr); + } + + Status PutCF(uint32_t cf, const Slice& key, const Slice& val) override { + return txn_->Put(db_->GetColumnFamilyHandle(cf), key, val); + } + + Status DeleteCF(uint32_t cf, const Slice& key) override { + return txn_->Delete(db_->GetColumnFamilyHandle(cf), key); + } + + Status SingleDeleteCF(uint32_t cf, const Slice& key) override { + return txn_->SingleDelete(db_->GetColumnFamilyHandle(cf), key); + } + + Status MergeCF(uint32_t cf, const Slice& key, const Slice& val) override { + return txn_->Merge(db_->GetColumnFamilyHandle(cf), key, val); + } + + // this is used for reconstructing prepared transactions upon + // recovery. there should not be any meta markers in the batches + // we are processing. 
+ Status MarkBeginPrepare(bool) override { return Status::InvalidArgument(); } + + Status MarkEndPrepare(const Slice&) override { + return Status::InvalidArgument(); + } + + Status MarkCommit(const Slice&) override { + return Status::InvalidArgument(); + } + + Status MarkCommitWithTimestamp(const Slice&, const Slice&) override { + return Status::InvalidArgument(); + } + + Status MarkRollback(const Slice&) override { + return Status::InvalidArgument(); + } + }; + + IndexedWriteBatchBuilder copycat(this, dbimpl_); + return src_batch->Iterate(©cat); +} + +WriteBatch* TransactionBaseImpl::GetCommitTimeWriteBatch() { + return &commit_time_batch_; +} +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/transaction_base.h b/src/rocksdb/utilities/transactions/transaction_base.h new file mode 100644 index 000000000..1bcb20ca9 --- /dev/null +++ b/src/rocksdb/utilities/transactions/transaction_base.h @@ -0,0 +1,384 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include + +#include "db/write_batch_internal.h" +#include "rocksdb/db.h" +#include "rocksdb/slice.h" +#include "rocksdb/snapshot.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/transaction_db.h" +#include "rocksdb/utilities/write_batch_with_index.h" +#include "util/autovector.h" +#include "utilities/transactions/lock/lock_tracker.h" +#include "utilities/transactions/transaction_util.h" + +namespace ROCKSDB_NAMESPACE { + +class TransactionBaseImpl : public Transaction { + public: + TransactionBaseImpl(DB* db, const WriteOptions& write_options, + const LockTrackerFactory& lock_tracker_factory); + + ~TransactionBaseImpl() override; + + // Remove pending operations queued in this transaction. + virtual void Clear(); + + void Reinitialize(DB* db, const WriteOptions& write_options); + + // Called before executing Put, Merge, Delete, and GetForUpdate. If TryLock + // returns non-OK, the Put/Merge/Delete/GetForUpdate will be failed. 
+ // do_validate will be false if called from PutUntracked, DeleteUntracked, + // MergeUntracked, or GetForUpdate(do_validate=false) + virtual Status TryLock(ColumnFamilyHandle* column_family, const Slice& key, + bool read_only, bool exclusive, + const bool do_validate = true, + const bool assume_tracked = false) = 0; + + void SetSavePoint() override; + + Status RollbackToSavePoint() override; + + Status PopSavePoint() override; + + using Transaction::Get; + Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, std::string* value) override; + + Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value) override; + + Status Get(const ReadOptions& options, const Slice& key, + std::string* value) override { + return Get(options, db_->DefaultColumnFamily(), key, value); + } + + using Transaction::GetForUpdate; + Status GetForUpdate(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, bool exclusive, + const bool do_validate) override; + + Status GetForUpdate(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* pinnable_val, bool exclusive, + const bool do_validate) override; + + Status GetForUpdate(const ReadOptions& options, const Slice& key, + std::string* value, bool exclusive, + const bool do_validate) override { + return GetForUpdate(options, db_->DefaultColumnFamily(), key, value, + exclusive, do_validate); + } + + using Transaction::MultiGet; + std::vector MultiGet( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, + std::vector* values) override; + + std::vector MultiGet(const ReadOptions& options, + const std::vector& keys, + std::vector* values) override { + return MultiGet(options, + std::vector( + keys.size(), db_->DefaultColumnFamily()), + keys, values); + } + + void MultiGet(const ReadOptions& options, ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, PinnableSlice* values, + Status* statuses, const bool sorted_input = false) override; + + using Transaction::MultiGetForUpdate; + std::vector MultiGetForUpdate( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, + std::vector* values) override; + + std::vector MultiGetForUpdate( + const ReadOptions& options, const std::vector& keys, + std::vector* values) override { + return MultiGetForUpdate(options, + std::vector( + keys.size(), db_->DefaultColumnFamily()), + keys, values); + } + + Iterator* GetIterator(const ReadOptions& read_options) override; + Iterator* GetIterator(const ReadOptions& read_options, + ColumnFamilyHandle* column_family) override; + + Status Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, const bool assume_tracked = false) override; + Status Put(const Slice& key, const Slice& value) override { + return Put(nullptr, key, value); + } + + Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value, + const bool assume_tracked = false) override; + Status Put(const SliceParts& key, const SliceParts& value) override { + return Put(nullptr, key, value); + } + + Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, const bool assume_tracked = false) override; + Status Merge(const Slice& key, const Slice& value) override { + return Merge(nullptr, key, value); + } + + Status Delete(ColumnFamilyHandle* column_family, const 
Slice& key, + const bool assume_tracked = false) override; + Status Delete(const Slice& key) override { return Delete(nullptr, key); } + Status Delete(ColumnFamilyHandle* column_family, const SliceParts& key, + const bool assume_tracked = false) override; + Status Delete(const SliceParts& key) override { return Delete(nullptr, key); } + + Status SingleDelete(ColumnFamilyHandle* column_family, const Slice& key, + const bool assume_tracked = false) override; + Status SingleDelete(const Slice& key) override { + return SingleDelete(nullptr, key); + } + Status SingleDelete(ColumnFamilyHandle* column_family, const SliceParts& key, + const bool assume_tracked = false) override; + Status SingleDelete(const SliceParts& key) override { + return SingleDelete(nullptr, key); + } + + Status PutUntracked(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + Status PutUntracked(const Slice& key, const Slice& value) override { + return PutUntracked(nullptr, key, value); + } + + Status PutUntracked(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value) override; + Status PutUntracked(const SliceParts& key, const SliceParts& value) override { + return PutUntracked(nullptr, key, value); + } + + Status MergeUntracked(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + Status MergeUntracked(const Slice& key, const Slice& value) override { + return MergeUntracked(nullptr, key, value); + } + + Status DeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) override; + Status DeleteUntracked(const Slice& key) override { + return DeleteUntracked(nullptr, key); + } + Status DeleteUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key) override; + Status DeleteUntracked(const SliceParts& key) override { + return DeleteUntracked(nullptr, key); + } + + Status SingleDeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) override; + Status SingleDeleteUntracked(const Slice& key) override { + return SingleDeleteUntracked(nullptr, key); + } + + void PutLogData(const Slice& blob) override; + + WriteBatchWithIndex* GetWriteBatch() override; + + virtual void SetLockTimeout(int64_t /*timeout*/) override { /* Do nothing */ + } + + const Snapshot* GetSnapshot() const override { + // will return nullptr when there is no snapshot + return snapshot_.get(); + } + + std::shared_ptr GetTimestampedSnapshot() const override { + return snapshot_; + } + + virtual void SetSnapshot() override; + void SetSnapshotOnNextOperation( + std::shared_ptr notifier = nullptr) override; + + void ClearSnapshot() override { + snapshot_.reset(); + snapshot_needed_ = false; + snapshot_notifier_ = nullptr; + } + + void DisableIndexing() override { indexing_enabled_ = false; } + + void EnableIndexing() override { indexing_enabled_ = true; } + + bool IndexingEnabled() const { return indexing_enabled_; } + + uint64_t GetElapsedTime() const override; + + uint64_t GetNumPuts() const override; + + uint64_t GetNumDeletes() const override; + + uint64_t GetNumMerges() const override; + + uint64_t GetNumKeys() const override; + + void UndoGetForUpdate(ColumnFamilyHandle* column_family, + const Slice& key) override; + void UndoGetForUpdate(const Slice& key) override { + return UndoGetForUpdate(nullptr, key); + }; + + WriteOptions* GetWriteOptions() override { return &write_options_; } + + void SetWriteOptions(const WriteOptions& write_options) override { + write_options_ = write_options; + } + + // Used for memory management for 
snapshot_ + void ReleaseSnapshot(const Snapshot* snapshot, DB* db); + + // iterates over the given batch and makes the appropriate inserts. + // used for rebuilding prepared transactions after recovery. + virtual Status RebuildFromWriteBatch(WriteBatch* src_batch) override; + + WriteBatch* GetCommitTimeWriteBatch() override; + + LockTracker& GetTrackedLocks() { return *tracked_locks_; } + + protected: + // Add a key to the list of tracked keys. + // + // seqno is the earliest seqno this key was involved with this transaction. + // readonly should be set to true if no data was written for this key + void TrackKey(uint32_t cfh_id, const std::string& key, SequenceNumber seqno, + bool readonly, bool exclusive); + + // Called when UndoGetForUpdate determines that this key can be unlocked. + virtual void UnlockGetForUpdate(ColumnFamilyHandle* column_family, + const Slice& key) = 0; + + // Sets a snapshot if SetSnapshotOnNextOperation() has been called. + void SetSnapshotIfNeeded(); + + // Initialize write_batch_ for 2PC by inserting Noop. + inline void InitWriteBatch(bool clear = false) { + if (clear) { + write_batch_.Clear(); + } + assert(write_batch_.GetDataSize() == WriteBatchInternal::kHeader); + auto s = WriteBatchInternal::InsertNoop(write_batch_.GetWriteBatch()); + assert(s.ok()); + } + + WriteBatchBase* GetBatchForWrite(); + + DB* db_; + DBImpl* dbimpl_; + + WriteOptions write_options_; + + const Comparator* cmp_; + + const LockTrackerFactory& lock_tracker_factory_; + + // Stores that time the txn was constructed, in microseconds. + uint64_t start_time_; + + // Stores the current snapshot that was set by SetSnapshot or null if + // no snapshot is currently set. + std::shared_ptr snapshot_; + + // Count of various operations pending in this transaction + uint64_t num_puts_ = 0; + uint64_t num_deletes_ = 0; + uint64_t num_merges_ = 0; + + struct SavePoint { + std::shared_ptr snapshot_; + bool snapshot_needed_ = false; + std::shared_ptr snapshot_notifier_; + uint64_t num_puts_ = 0; + uint64_t num_deletes_ = 0; + uint64_t num_merges_ = 0; + + // Record all locks tracked since the last savepoint + std::shared_ptr new_locks_; + + SavePoint(std::shared_ptr snapshot, bool snapshot_needed, + std::shared_ptr snapshot_notifier, + uint64_t num_puts, uint64_t num_deletes, uint64_t num_merges, + const LockTrackerFactory& lock_tracker_factory) + : snapshot_(snapshot), + snapshot_needed_(snapshot_needed), + snapshot_notifier_(snapshot_notifier), + num_puts_(num_puts), + num_deletes_(num_deletes), + num_merges_(num_merges), + new_locks_(lock_tracker_factory.Create()) {} + + explicit SavePoint(const LockTrackerFactory& lock_tracker_factory) + : new_locks_(lock_tracker_factory.Create()) {} + }; + + // Records writes pending in this transaction + WriteBatchWithIndex write_batch_; + + // For Pessimistic Transactions this is the set of acquired locks. + // Optimistic Transactions will keep note the requested locks (not actually + // locked), and do conflict checking until commit time based on the tracked + // lock requests. + std::unique_ptr tracked_locks_; + + // Stack of the Snapshot saved at each save point. Saved snapshots may be + // nullptr if there was no snapshot at the time SetSavePoint() was called. + std::unique_ptr>> + save_points_; + + private: + friend class WriteCommittedTxn; + friend class WritePreparedTxn; + + // Extra data to be persisted with the commit. Note this is only used when + // prepare phase is not skipped. 
+ WriteBatch commit_time_batch_; + + // If true, future Put/Merge/Deletes will be indexed in the + // WriteBatchWithIndex. + // If false, future Put/Merge/Deletes will be inserted directly into the + // underlying WriteBatch and not indexed in the WriteBatchWithIndex. + bool indexing_enabled_; + + // SetSnapshotOnNextOperation() has been called and the snapshot has not yet + // been reset. + bool snapshot_needed_ = false; + + // SetSnapshotOnNextOperation() has been called and the caller would like + // a notification through the TransactionNotifier interface + std::shared_ptr snapshot_notifier_ = nullptr; + + Status TryLock(ColumnFamilyHandle* column_family, const SliceParts& key, + bool read_only, bool exclusive, const bool do_validate = true, + const bool assume_tracked = false); + + void SetSnapshotInternal(const Snapshot* snapshot); +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/transaction_db_mutex_impl.cc b/src/rocksdb/utilities/transactions/transaction_db_mutex_impl.cc new file mode 100644 index 000000000..345c4be90 --- /dev/null +++ b/src/rocksdb/utilities/transactions/transaction_db_mutex_impl.cc @@ -0,0 +1,135 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "utilities/transactions/transaction_db_mutex_impl.h" + +#include +#include +#include +#include + +#include "rocksdb/utilities/transaction_db_mutex.h" + +namespace ROCKSDB_NAMESPACE { + +class TransactionDBMutexImpl : public TransactionDBMutex { + public: + TransactionDBMutexImpl() {} + ~TransactionDBMutexImpl() override {} + + Status Lock() override; + + Status TryLockFor(int64_t timeout_time) override; + + void UnLock() override { mutex_.unlock(); } + + friend class TransactionDBCondVarImpl; + + private: + std::mutex mutex_; +}; + +class TransactionDBCondVarImpl : public TransactionDBCondVar { + public: + TransactionDBCondVarImpl() {} + ~TransactionDBCondVarImpl() override {} + + Status Wait(std::shared_ptr mutex) override; + + Status WaitFor(std::shared_ptr mutex, + int64_t timeout_time) override; + + void Notify() override { cv_.notify_one(); } + + void NotifyAll() override { cv_.notify_all(); } + + private: + std::condition_variable cv_; +}; + +std::shared_ptr +TransactionDBMutexFactoryImpl::AllocateMutex() { + return std::shared_ptr(new TransactionDBMutexImpl()); +} + +std::shared_ptr +TransactionDBMutexFactoryImpl::AllocateCondVar() { + return std::shared_ptr(new TransactionDBCondVarImpl()); +} + +Status TransactionDBMutexImpl::Lock() { + mutex_.lock(); + return Status::OK(); +} + +Status TransactionDBMutexImpl::TryLockFor(int64_t timeout_time) { + bool locked = true; + + if (timeout_time == 0) { + locked = mutex_.try_lock(); + } else { + // Previously, this code used a std::timed_mutex. However, this was changed + // due to known bugs in gcc versions < 4.9. + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54562 + // + // Since this mutex isn't held for long and only a single mutex is ever + // held at a time, it is reasonable to ignore the lock timeout_time here + // and only check it when waiting on the condition_variable. 
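+  // Consequently, when timeout_time is non-zero this call simply blocks until
+  // the mutex is acquired: `locked` stays true and the timeout branch below is
+  // never taken on this path.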
+ mutex_.lock(); + } + + if (!locked) { + // timeout acquiring mutex + return Status::TimedOut(Status::SubCode::kMutexTimeout); + } + + return Status::OK(); +} + +Status TransactionDBCondVarImpl::Wait( + std::shared_ptr mutex) { + auto mutex_impl = reinterpret_cast(mutex.get()); + + std::unique_lock lock(mutex_impl->mutex_, std::adopt_lock); + cv_.wait(lock); + + // Make sure unique_lock doesn't unlock mutex when it destructs + lock.release(); + + return Status::OK(); +} + +Status TransactionDBCondVarImpl::WaitFor( + std::shared_ptr mutex, int64_t timeout_time) { + Status s; + + auto mutex_impl = reinterpret_cast(mutex.get()); + std::unique_lock lock(mutex_impl->mutex_, std::adopt_lock); + + if (timeout_time < 0) { + // If timeout is negative, do not use a timeout + cv_.wait(lock); + } else { + auto duration = std::chrono::microseconds(timeout_time); + auto cv_status = cv_.wait_for(lock, duration); + + // Check if the wait stopped due to timing out. + if (cv_status == std::cv_status::timeout) { + s = Status::TimedOut(Status::SubCode::kMutexTimeout); + } + } + + // Make sure unique_lock doesn't unlock mutex when it destructs + lock.release(); + + // CV was signaled, or we spuriously woke up (but didn't time out) + return s; +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/transaction_db_mutex_impl.h b/src/rocksdb/utilities/transactions/transaction_db_mutex_impl.h new file mode 100644 index 000000000..fbee92832 --- /dev/null +++ b/src/rocksdb/utilities/transactions/transaction_db_mutex_impl.h @@ -0,0 +1,26 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include "rocksdb/utilities/transaction_db_mutex.h" + +namespace ROCKSDB_NAMESPACE { + +class TransactionDBMutex; +class TransactionDBCondVar; + +// Default implementation of TransactionDBMutexFactory. May be overridden +// by TransactionDBOptions.custom_mutex_factory. +class TransactionDBMutexFactoryImpl : public TransactionDBMutexFactory { + public: + std::shared_ptr AllocateMutex() override; + std::shared_ptr AllocateCondVar() override; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/transaction_test.cc b/src/rocksdb/utilities/transactions/transaction_test.cc new file mode 100644 index 000000000..caf1566b9 --- /dev/null +++ b/src/rocksdb/utilities/transactions/transaction_test.cc @@ -0,0 +1,6550 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#ifndef ROCKSDB_LITE + +#include "utilities/transactions/transaction_test.h" + +#include +#include +#include +#include + +#include "db/db_impl/db_impl.h" +#include "port/port.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/transaction_db.h" +#include "table/mock_table.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "test_util/transaction_test_util.h" +#include "util/random.h" +#include "util/string_util.h" +#include "utilities/fault_injection_env.h" +#include "utilities/merge_operators.h" +#include "utilities/merge_operators/string_append/stringappend.h" +#include "utilities/transactions/pessimistic_transaction_db.h" + +namespace ROCKSDB_NAMESPACE { + +INSTANTIATE_TEST_CASE_P( + DBAsBaseDB, TransactionTest, + ::testing::Values( + std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite), + std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite), + std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite), + std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite))); +INSTANTIATE_TEST_CASE_P( + DBAsBaseDB, TransactionStressTest, + ::testing::Values( + std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite), + std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite), + std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite), + std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite))); +INSTANTIATE_TEST_CASE_P( + StackableDBAsBaseDB, TransactionTest, + ::testing::Values( + std::make_tuple(true, true, WRITE_COMMITTED, kOrderedWrite), + std::make_tuple(true, true, WRITE_PREPARED, kOrderedWrite), + std::make_tuple(true, true, WRITE_UNPREPARED, kOrderedWrite))); + +// MySQLStyleTransactionTest takes far too long for valgrind to run. Only do it +// in full mode (`ROCKSDB_FULL_VALGRIND_RUN` compiler flag is set). 
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +INSTANTIATE_TEST_CASE_P( + MySQLStyleTransactionTest, MySQLStyleTransactionTest, + ::testing::Values( + std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite, false), + std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite, false), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, false), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, true), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, false), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, true), + std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite, false), + std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite, true), + std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite, false), + std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite, true), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, false), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, true))); +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +TEST_P(TransactionTest, DoubleEmptyWrite) { + WriteOptions write_options; + write_options.sync = true; + write_options.disableWAL = false; + + WriteBatch batch; + + ASSERT_OK(db->Write(write_options, &batch)); + ASSERT_OK(db->Write(write_options, &batch)); + + // Also test committing empty transactions in 2PC + TransactionOptions txn_options; + Transaction* txn0 = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn0->SetName("xid")); + ASSERT_OK(txn0->Prepare()); + ASSERT_OK(txn0->Commit()); + delete txn0; + + // Also test that it works during recovery + txn0 = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn0->SetName("xid2")); + ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0a"))); + ASSERT_OK(txn0->Prepare()); + delete txn0; + reinterpret_cast(db)->TEST_Crash(); + ASSERT_OK(ReOpenNoDelete()); + assert(db != nullptr); + txn0 = db->GetTransactionByName("xid2"); + ASSERT_OK(txn0->Commit()); + delete txn0; +} + +TEST_P(TransactionTest, SuccessTest) { + ASSERT_OK(db->ResetStats()); + + WriteOptions write_options; + ReadOptions read_options; + std::string value; + + ASSERT_OK(db->Put(write_options, Slice("foo"), Slice("bar"))); + ASSERT_OK(db->Put(write_options, Slice("foo2"), Slice("bar"))); + + Transaction* txn = db->BeginTransaction(write_options, TransactionOptions()); + ASSERT_TRUE(txn); + + ASSERT_EQ(0, txn->GetNumPuts()); + ASSERT_LE(0, txn->GetID()); + + ASSERT_OK(txn->GetForUpdate(read_options, "foo", &value)); + ASSERT_EQ(value, "bar"); + + ASSERT_OK(txn->Put(Slice("foo"), Slice("bar2"))); + + ASSERT_EQ(1, txn->GetNumPuts()); + + ASSERT_OK(txn->GetForUpdate(read_options, "foo", &value)); + ASSERT_EQ(value, "bar2"); + + ASSERT_OK(txn->Commit()); + + ASSERT_OK(db->Get(read_options, "foo", &value)); + ASSERT_EQ(value, "bar2"); + + delete txn; +} + +TEST_P(TransactionTest, SwitchMemtableDuringPrepareAndCommit_WC) { + const TxnDBWritePolicy write_policy = std::get<2>(GetParam()); + + if (write_policy != TxnDBWritePolicy::WRITE_COMMITTED) { + ROCKSDB_GTEST_BYPASS("Test applies to write-committed only"); + return; + } + + ASSERT_OK(db->Put(WriteOptions(), "key0", "value")); + + TransactionOptions txn_opts; + txn_opts.use_only_the_last_commit_time_batch_for_recovery = true; + Transaction* txn = db->BeginTransaction(WriteOptions(), txn_opts); + assert(txn); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + 
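// The callback below fires on the background flush thread inside
+  // FlushJob::WriteLevel0Table (db mutex not held): it appends to the
+  // transaction's commit-time write batch and commits the prepared transaction
+  // while its memtable is being flushed, exercising the race between memtable
+  // switch/flush and Prepare/Commit.
+ 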
SyncPoint::GetInstance()->SetCallBack( + "FlushJob::WriteLevel0Table", [&](void* arg) { + // db mutex not held. + auto* mems = reinterpret_cast*>(arg); + assert(mems); + ASSERT_EQ(1, mems->size()); + auto* ctwb = txn->GetCommitTimeWriteBatch(); + ASSERT_OK(ctwb->Put("gtid", "123")); + ASSERT_OK(txn->Commit()); + delete txn; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(txn->Put("key1", "value")); + ASSERT_OK(txn->SetName("txn1")); + + ASSERT_OK(txn->Prepare()); + + auto dbimpl = static_cast_with_check(db->GetRootDB()); + ASSERT_OK(dbimpl->TEST_SwitchMemtable(nullptr)); + ASSERT_OK(dbimpl->TEST_FlushMemTable( + /*wait=*/false, /*allow_write_stall=*/true, /*cfh=*/nullptr)); + + ASSERT_OK(dbimpl->TEST_WaitForFlushMemTable()); + + { + std::string value; + ASSERT_OK(db->Get(ReadOptions(), "key1", &value)); + ASSERT_EQ("value", value); + } + + delete db; + db = nullptr; + Status s; + if (use_stackable_db_ == false) { + s = TransactionDB::Open(options, txn_db_options, dbname, &db); + } else { + s = OpenWithStackableDB(); + } + ASSERT_OK(s); + assert(db); + + { + std::string value; + ASSERT_OK(db->Get(ReadOptions(), "gtid", &value)); + ASSERT_EQ("123", value); + + ASSERT_OK(db->Get(ReadOptions(), "key1", &value)); + ASSERT_EQ("value", value); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +// The test clarifies the contract of do_validate and assume_tracked +// in GetForUpdate and Put/Merge/Delete +TEST_P(TransactionTest, AssumeExclusiveTracked) { + WriteOptions write_options; + ReadOptions read_options; + std::string value; + Status s; + TransactionOptions txn_options; + txn_options.lock_timeout = 1; + const bool EXCLUSIVE = true; + const bool DO_VALIDATE = true; + const bool ASSUME_LOCKED = true; + + Transaction* txn = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn); + txn->SetSnapshot(); + + // commit a value after the snapshot is taken + ASSERT_OK(db->Put(write_options, Slice("foo"), Slice("bar"))); + + // By default write should fail to the commit after our snapshot + s = txn->GetForUpdate(read_options, "foo", &value, EXCLUSIVE); + ASSERT_TRUE(s.IsBusy()); + // But the user could direct the db to skip validating the snapshot. The read + // value then should be the most recently committed + ASSERT_OK( + txn->GetForUpdate(read_options, "foo", &value, EXCLUSIVE, !DO_VALIDATE)); + ASSERT_EQ(value, "bar"); + + // Although ValidateSnapshot is skipped the key must have still got locked + s = db->Put(write_options, Slice("foo"), Slice("bar")); + ASSERT_TRUE(s.IsTimedOut()); + + // By default the write operations should fail due to the commit after the + // snapshot + s = txn->Put(Slice("foo"), Slice("bar1")); + ASSERT_TRUE(s.IsBusy()); + s = txn->Put(db->DefaultColumnFamily(), Slice("foo"), Slice("bar1"), + !ASSUME_LOCKED); + ASSERT_TRUE(s.IsBusy()); + // But the user could direct the db that it already assumes exclusive lock on + // the key due to the previous GetForUpdate call. 
+ ASSERT_OK(txn->Put(db->DefaultColumnFamily(), Slice("foo"), Slice("bar1"), + ASSUME_LOCKED)); + ASSERT_OK(txn->Merge(db->DefaultColumnFamily(), Slice("foo"), Slice("bar2"), + ASSUME_LOCKED)); + ASSERT_OK( + txn->Delete(db->DefaultColumnFamily(), Slice("foo"), ASSUME_LOCKED)); + ASSERT_OK(txn->SingleDelete(db->DefaultColumnFamily(), Slice("foo"), + ASSUME_LOCKED)); + + ASSERT_OK(txn->Rollback()); + delete txn; +} + +// This test clarifies the contract of ValidateSnapshot +TEST_P(TransactionTest, ValidateSnapshotTest) { + for (bool with_flush : {true}) { + for (bool with_2pc : {true}) { + ASSERT_OK(ReOpen()); + WriteOptions write_options; + ReadOptions read_options; + std::string value; + + assert(db != nullptr); + Transaction* txn1 = + db->BeginTransaction(write_options, TransactionOptions()); + ASSERT_TRUE(txn1); + ASSERT_OK(txn1->Put(Slice("foo"), Slice("bar1"))); + if (with_2pc) { + ASSERT_OK(txn1->SetName("xid1")); + ASSERT_OK(txn1->Prepare()); + } + + if (with_flush) { + auto db_impl = static_cast_with_check(db->GetRootDB()); + ASSERT_OK(db_impl->TEST_FlushMemTable(true)); + // Make sure the flushed memtable is not kept in memory + int max_memtable_in_history = + std::max( + options.max_write_buffer_number, + static_cast(options.max_write_buffer_size_to_maintain) / + static_cast(options.write_buffer_size)) + + 1; + for (int i = 0; i < max_memtable_in_history; i++) { + ASSERT_OK(db->Put(write_options, Slice("key"), Slice("value"))); + ASSERT_OK(db_impl->TEST_FlushMemTable(true)); + } + } + + Transaction* txn2 = + db->BeginTransaction(write_options, TransactionOptions()); + ASSERT_TRUE(txn2); + txn2->SetSnapshot(); + + ASSERT_OK(txn1->Commit()); + delete txn1; + + auto pes_txn2 = dynamic_cast(txn2); + // Test the simple case where the key is not tracked yet + auto trakced_seq = kMaxSequenceNumber; + auto s = pes_txn2->ValidateSnapshot(db->DefaultColumnFamily(), "foo", + &trakced_seq); + ASSERT_TRUE(s.IsBusy()); + delete txn2; + } + } +} + +TEST_P(TransactionTest, WaitingTxn) { + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + std::string value; + Status s; + + txn_options.lock_timeout = 1; + s = db->Put(write_options, Slice("foo"), Slice("bar")); + ASSERT_OK(s); + + /* create second cf */ + ColumnFamilyHandle* cfa; + ColumnFamilyOptions cf_options; + s = db->CreateColumnFamily(cf_options, "CFA", &cfa); + ASSERT_OK(s); + s = db->Put(write_options, cfa, Slice("foo"), Slice("bar")); + ASSERT_OK(s); + + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + TransactionID id1 = txn1->GetID(); + ASSERT_TRUE(txn1); + ASSERT_TRUE(txn2); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "PointLockManager::AcquireWithTimeout:WaitingTxn", [&](void* /*arg*/) { + std::string key; + uint32_t cf_id; + std::vector wait = txn2->GetWaitingTxns(&cf_id, &key); + ASSERT_EQ(key, "foo"); + ASSERT_EQ(wait.size(), 1); + ASSERT_EQ(wait[0], id1); + ASSERT_EQ(cf_id, 0U); + }); + + get_perf_context()->Reset(); + // lock key in default cf + s = txn1->GetForUpdate(read_options, "foo", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar"); + ASSERT_EQ(get_perf_context()->key_lock_wait_count, 0); + + // lock key in cfa + s = txn1->GetForUpdate(read_options, cfa, "foo", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar"); + ASSERT_EQ(get_perf_context()->key_lock_wait_count, 0); + + auto lock_data = db->GetLockStatusData(); + // Locked keys exist in both column family. 
+ ASSERT_EQ(lock_data.size(), 2); + + auto cf_iterator = lock_data.begin(); + + // The iterator points to an unordered_multimap + // thus the test can not assume any particular order. + + // Column family is 1 or 0 (cfa). + if (cf_iterator->first != 1 && cf_iterator->first != 0) { + FAIL(); + } + // The locked key is "foo" and is locked by txn1 + ASSERT_EQ(cf_iterator->second.key, "foo"); + ASSERT_EQ(cf_iterator->second.ids.size(), 1); + ASSERT_EQ(cf_iterator->second.ids[0], txn1->GetID()); + + cf_iterator++; + + // Column family is 0 (default) or 1. + if (cf_iterator->first != 1 && cf_iterator->first != 0) { + FAIL(); + } + // The locked key is "foo" and is locked by txn1 + ASSERT_EQ(cf_iterator->second.key, "foo"); + ASSERT_EQ(cf_iterator->second.ids.size(), 1); + ASSERT_EQ(cf_iterator->second.ids[0], txn1->GetID()); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + s = txn2->GetForUpdate(read_options, "foo", &value); + ASSERT_TRUE(s.IsTimedOut()); + ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key"); + ASSERT_EQ(get_perf_context()->key_lock_wait_count, 1); + ASSERT_GE(get_perf_context()->key_lock_wait_time, 0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + + delete cfa; + delete txn1; + delete txn2; +} + +TEST_P(TransactionTest, SharedLocks) { + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + Status s; + + txn_options.lock_timeout = 1; + s = db->Put(write_options, Slice("foo"), Slice("bar")); + ASSERT_OK(s); + + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + Transaction* txn3 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn1); + ASSERT_TRUE(txn2); + ASSERT_TRUE(txn3); + + // Test shared access between txns + s = txn1->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + ASSERT_OK(s); + + s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + ASSERT_OK(s); + + s = txn3->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + ASSERT_OK(s); + + auto lock_data = db->GetLockStatusData(); + ASSERT_EQ(lock_data.size(), 1); + + auto cf_iterator = lock_data.begin(); + ASSERT_EQ(cf_iterator->second.key, "foo"); + + // We compare whether the set of txns locking this key is the same. To do + // this, we need to sort both vectors so that the comparison is done + // correctly. + std::vector expected_txns = {txn1->GetID(), txn2->GetID(), + txn3->GetID()}; + std::vector lock_txns = cf_iterator->second.ids; + ASSERT_EQ(expected_txns, lock_txns); + ASSERT_FALSE(cf_iterator->second.exclusive); + + ASSERT_OK(txn1->Rollback()); + ASSERT_OK(txn2->Rollback()); + ASSERT_OK(txn3->Rollback()); + + // Test txn1 and txn2 sharing a lock and txn3 trying to obtain it. 
+ s = txn1->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + ASSERT_OK(s); + + s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + ASSERT_OK(s); + + s = txn3->GetForUpdate(read_options, "foo", nullptr); + ASSERT_TRUE(s.IsTimedOut()); + ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key"); + + txn1->UndoGetForUpdate("foo"); + s = txn3->GetForUpdate(read_options, "foo", nullptr); + ASSERT_TRUE(s.IsTimedOut()); + ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key"); + + txn2->UndoGetForUpdate("foo"); + s = txn3->GetForUpdate(read_options, "foo", nullptr); + ASSERT_OK(s); + + ASSERT_OK(txn1->Rollback()); + ASSERT_OK(txn2->Rollback()); + ASSERT_OK(txn3->Rollback()); + + // Test txn1 and txn2 sharing a lock and txn2 trying to upgrade lock. + s = txn1->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + ASSERT_OK(s); + + s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + ASSERT_OK(s); + + s = txn2->GetForUpdate(read_options, "foo", nullptr); + ASSERT_TRUE(s.IsTimedOut()); + ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key"); + + txn1->UndoGetForUpdate("foo"); + s = txn2->GetForUpdate(read_options, "foo", nullptr); + ASSERT_OK(s); + + ASSERT_OK(txn1->Rollback()); + ASSERT_OK(txn2->Rollback()); + + // Test txn1 trying to downgrade its lock. + s = txn1->GetForUpdate(read_options, "foo", nullptr, true /* exclusive */); + ASSERT_OK(s); + + s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + ASSERT_TRUE(s.IsTimedOut()); + ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key"); + + // Should still fail after "downgrading". + s = txn1->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + ASSERT_OK(s); + + s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + ASSERT_TRUE(s.IsTimedOut()); + ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key"); + + ASSERT_OK(txn1->Rollback()); + ASSERT_OK(txn2->Rollback()); + + // Test txn1 holding an exclusive lock and txn2 trying to obtain shared + // access. + s = txn1->GetForUpdate(read_options, "foo", nullptr); + ASSERT_OK(s); + + s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + ASSERT_TRUE(s.IsTimedOut()); + ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key"); + + txn1->UndoGetForUpdate("foo"); + s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + ASSERT_OK(s); + + delete txn1; + delete txn2; + delete txn3; +} + +TEST_P(TransactionTest, DeadlockCycleShared) { + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + + txn_options.lock_timeout = 1000000; + txn_options.deadlock_detect = true; + + // Set up a wait for chain like this: + // + // Tn -> T(n*2) + // Tn -> T(n*2 + 1) + // + // So we have: + // T1 -> T2 -> T4 ... + // | |> T5 ... + // |> T3 -> T6 ... + // |> T7 ... + // up to T31, then T[16 - 31] -> T1. + // Note that Tn holds lock on floor(n / 2). 
+
+ std::vector<Transaction*> txns(31);
+
+ for (uint32_t i = 0; i < 31; i++) {
+ txns[i] = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txns[i]);
+ auto s = txns[i]->GetForUpdate(read_options, std::to_string((i + 1) / 2),
+ nullptr, false /* exclusive */);
+ ASSERT_OK(s);
+ }
+
+ std::atomic<uint32_t> checkpoints(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "PointLockManager::AcquireWithTimeout:WaitingTxn",
+ [&](void* /*arg*/) { checkpoints.fetch_add(1); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // We want the leaf transactions to block and hold everyone back.
+ std::vector<port::Thread> threads;
+ for (uint32_t i = 0; i < 15; i++) {
+ std::function<void()> blocking_thread = [&, i] {
+ auto s = txns[i]->GetForUpdate(read_options, std::to_string(i + 1),
+ nullptr, true /* exclusive */);
+ ASSERT_OK(s);
+ ASSERT_OK(txns[i]->Rollback());
+ delete txns[i];
+ };
+ threads.emplace_back(blocking_thread);
+ }
+
+ // Wait until all threads are waiting on each other.
+ while (checkpoints.load() != 15) {
+ /* sleep override */
+ std::this_thread::sleep_for(std::chrono::milliseconds(100));
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ // Complete the cycle T[16 - 31] -> T1
+ for (uint32_t i = 15; i < 31; i++) {
+ auto s =
+ txns[i]->GetForUpdate(read_options, "0", nullptr, true /* exclusive */);
+ ASSERT_TRUE(s.IsDeadlock());
+
+ // Calculate next buffer len, plateau at 5 when 5 records are inserted.
+ const uint32_t curr_dlock_buffer_len_ =
+ (i - 14 > kInitialMaxDeadlocks) ? kInitialMaxDeadlocks : (i - 14);
+
+ auto dlock_buffer = db->GetDeadlockInfoBuffer();
+ ASSERT_EQ(dlock_buffer.size(), curr_dlock_buffer_len_);
+ auto dlock_entry = dlock_buffer[0].path;
+ ASSERT_EQ(dlock_entry.size(), kInitialMaxDeadlocks);
+ int64_t pre_deadlock_time = dlock_buffer[0].deadlock_time;
+ int64_t cur_deadlock_time = 0;
+ for (auto const& dl_path_rec : dlock_buffer) {
+ cur_deadlock_time = dl_path_rec.deadlock_time;
+ ASSERT_NE(cur_deadlock_time, 0);
+ ASSERT_TRUE(cur_deadlock_time <= pre_deadlock_time);
+ pre_deadlock_time = cur_deadlock_time;
+ }
+
+ int64_t curr_waiting_key = 0;
+
+ // Offset of each txn id from the root of the shared dlock tree's txn id.
+ int64_t offset_root = dlock_entry[0].m_txn_id - 1;
+ // Offset of the final entry in the dlock path from the root's txn id.
+ TransactionID leaf_id =
+ dlock_entry[dlock_entry.size() - 1].m_txn_id - offset_root;
+
+ for (auto it = dlock_entry.rbegin(); it != dlock_entry.rend(); ++it) {
+ auto dl_node = *it;
+ ASSERT_EQ(dl_node.m_txn_id, offset_root + leaf_id);
+ ASSERT_EQ(dl_node.m_cf_id, 0U);
+ ASSERT_EQ(dl_node.m_waiting_key, std::to_string(curr_waiting_key));
+ ASSERT_EQ(dl_node.m_exclusive, true);
+
+ if (curr_waiting_key == 0) {
+ curr_waiting_key = leaf_id;
+ }
+ curr_waiting_key /= 2;
+ leaf_id /= 2;
+ }
+ }
+
+ // Rollback the leaf transactions.
+ for (uint32_t i = 15; i < 31; i++) {
+ ASSERT_OK(txns[i]->Rollback());
+ delete txns[i];
+ }
+
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ // Downsize the buffer and verify the 3 latest deadlocks are preserved.
+ auto dlock_buffer_before_resize = db->GetDeadlockInfoBuffer();
+ db->SetDeadlockInfoBufferSize(3);
+ auto dlock_buffer_after_resize = db->GetDeadlockInfoBuffer();
+ ASSERT_EQ(dlock_buffer_after_resize.size(), 3);
+
+ for (uint32_t i = 0; i < dlock_buffer_after_resize.size(); i++) {
+ for (uint32_t j = 0; j < dlock_buffer_after_resize[i].path.size(); j++) {
+ ASSERT_EQ(dlock_buffer_after_resize[i].path[j].m_txn_id,
+ dlock_buffer_before_resize[i].path[j].m_txn_id);
+ }
+ }
+
+ // Upsize the buffer and verify the 3 latest deadlocks are preserved.
+ dlock_buffer_before_resize = db->GetDeadlockInfoBuffer();
+ db->SetDeadlockInfoBufferSize(5);
+ dlock_buffer_after_resize = db->GetDeadlockInfoBuffer();
+ ASSERT_EQ(dlock_buffer_after_resize.size(), 3);
+
+ for (uint32_t i = 0; i < dlock_buffer_before_resize.size(); i++) {
+ for (uint32_t j = 0; j < dlock_buffer_before_resize[i].path.size(); j++) {
+ ASSERT_EQ(dlock_buffer_after_resize[i].path[j].m_txn_id,
+ dlock_buffer_before_resize[i].path[j].m_txn_id);
+ }
+ }
+
+ // Downsize to 0 and verify the size is consistent.
+ dlock_buffer_before_resize = db->GetDeadlockInfoBuffer();
+ db->SetDeadlockInfoBufferSize(0);
+ dlock_buffer_after_resize = db->GetDeadlockInfoBuffer();
+ ASSERT_EQ(dlock_buffer_after_resize.size(), 0);
+
+ // Upsize from 0 to verify the size is persistent.
+ dlock_buffer_before_resize = db->GetDeadlockInfoBuffer();
+ db->SetDeadlockInfoBufferSize(3);
+ dlock_buffer_after_resize = db->GetDeadlockInfoBuffer();
+ ASSERT_EQ(dlock_buffer_after_resize.size(), 0);
+
+ // Contrived case of shared lock of cycle size 2 to verify that a shared
+ // lock causing a deadlock is correctly reported as "shared" in the buffer.
+ std::vector<Transaction*> txns_shared(2);
+
+ // Create a cycle of size 2.
+ for (uint32_t i = 0; i < 2; i++) {
+ txns_shared[i] = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txns_shared[i]);
+ auto s =
+ txns_shared[i]->GetForUpdate(read_options, std::to_string(i), nullptr);
+ ASSERT_OK(s);
+ }
+
+ std::atomic<uint32_t> checkpoints_shared(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "PointLockManager::AcquireWithTimeout:WaitingTxn",
+ [&](void* /*arg*/) { checkpoints_shared.fetch_add(1); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<port::Thread> threads_shared;
+ for (uint32_t i = 0; i < 1; i++) {
+ std::function<void()> blocking_thread = [&, i] {
+ auto s = txns_shared[i]->GetForUpdate(read_options, std::to_string(i + 1),
+ nullptr);
+ ASSERT_OK(s);
+ ASSERT_OK(txns_shared[i]->Rollback());
+ delete txns_shared[i];
+ };
+ threads_shared.emplace_back(blocking_thread);
+ }
+
+ // Wait until all threads are waiting on each other.
+ while (checkpoints_shared.load() != 1) {
+ /* sleep override */
+ std::this_thread::sleep_for(std::chrono::milliseconds(100));
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ // Complete the cycle T2 -> T1 with a shared lock.
+ auto s = txns_shared[1]->GetForUpdate(read_options, "0", nullptr, false);
+ ASSERT_TRUE(s.IsDeadlock());
+
+ auto dlock_buffer = db->GetDeadlockInfoBuffer();
+
+ // Verify the size of the buffer and the single path.
+ ASSERT_EQ(dlock_buffer.size(), 1);
+ ASSERT_EQ(dlock_buffer[0].path.size(), 2);
+
+ // Verify the exclusivity field of the transactions in the deadlock path.
+ ASSERT_TRUE(dlock_buffer[0].path[0].m_exclusive); + ASSERT_FALSE(dlock_buffer[0].path[1].m_exclusive); + ASSERT_OK(txns_shared[1]->Rollback()); + delete txns_shared[1]; + + for (auto& t : threads_shared) { + t.join(); + } +} + +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +TEST_P(TransactionStressTest, DeadlockCycle) { + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + + // offset by 2 from the max depth to test edge case + const uint32_t kMaxCycleLength = 52; + + txn_options.lock_timeout = 1000000; + txn_options.deadlock_detect = true; + + for (uint32_t len = 2; len < kMaxCycleLength; len++) { + // Set up a long wait for chain like this: + // + // T1 -> T2 -> T3 -> ... -> Tlen + + std::vector txns(len); + + for (uint32_t i = 0; i < len; i++) { + txns[i] = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txns[i]); + auto s = txns[i]->GetForUpdate(read_options, std::to_string(i), nullptr); + ASSERT_OK(s); + } + + std::atomic checkpoints(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "PointLockManager::AcquireWithTimeout:WaitingTxn", + [&](void* /*arg*/) { checkpoints.fetch_add(1); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // We want the last transaction in the chain to block and hold everyone + // back. + std::vector threads; + for (uint32_t i = 0; i + 1 < len; i++) { + std::function blocking_thread = [&, i] { + auto s = + txns[i]->GetForUpdate(read_options, std::to_string(i + 1), nullptr); + ASSERT_OK(s); + ASSERT_OK(txns[i]->Rollback()); + delete txns[i]; + }; + threads.emplace_back(blocking_thread); + } + + // Wait until all threads are waiting on each other. + while (checkpoints.load() != len - 1) { + /* sleep override */ + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Complete the cycle Tlen -> T1 + auto s = txns[len - 1]->GetForUpdate(read_options, "0", nullptr); + ASSERT_TRUE(s.IsDeadlock()); + + const uint32_t dlock_buffer_size_ = (len - 1 > 5) ? 5 : (len - 1); + uint32_t curr_waiting_key = 0; + TransactionID curr_txn_id = txns[0]->GetID(); + + auto dlock_buffer = db->GetDeadlockInfoBuffer(); + ASSERT_EQ(dlock_buffer.size(), dlock_buffer_size_); + uint32_t check_len = len; + bool check_limit_flag = false; + + // Special case for a deadlock path that exceeds the maximum depth. + if (len > 50) { + check_len = 0; + check_limit_flag = true; + } + auto dlock_entry = dlock_buffer[0].path; + ASSERT_EQ(dlock_entry.size(), check_len); + ASSERT_EQ(dlock_buffer[0].limit_exceeded, check_limit_flag); + + int64_t pre_deadlock_time = dlock_buffer[0].deadlock_time; + int64_t cur_deadlock_time = 0; + for (auto const& dl_path_rec : dlock_buffer) { + cur_deadlock_time = dl_path_rec.deadlock_time; + ASSERT_NE(cur_deadlock_time, 0); + ASSERT_TRUE(cur_deadlock_time <= pre_deadlock_time); + pre_deadlock_time = cur_deadlock_time; + } + + // Iterates backwards over path verifying decreasing txn_ids. 
+ for (auto it = dlock_entry.rbegin(); it != dlock_entry.rend(); ++it) { + auto dl_node = *it; + ASSERT_EQ(dl_node.m_txn_id, len + curr_txn_id - 1); + ASSERT_EQ(dl_node.m_cf_id, 0u); + ASSERT_EQ(dl_node.m_waiting_key, std::to_string(curr_waiting_key)); + ASSERT_EQ(dl_node.m_exclusive, true); + + curr_txn_id--; + if (curr_waiting_key == 0) { + curr_waiting_key = len; + } + curr_waiting_key--; + } + + // Rollback the last transaction. + ASSERT_OK(txns[len - 1]->Rollback()); + delete txns[len - 1]; + + for (auto& t : threads) { + t.join(); + } + } +} + +TEST_P(TransactionStressTest, DeadlockStress) { + const uint32_t NUM_TXN_THREADS = 10; + const uint32_t NUM_KEYS = 100; + const uint32_t NUM_ITERS = 1000; + + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + + txn_options.lock_timeout = 1000000; + txn_options.deadlock_detect = true; + std::vector keys; + + for (uint32_t i = 0; i < NUM_KEYS; i++) { + ASSERT_OK(db->Put(write_options, Slice(std::to_string(i)), Slice(""))); + keys.push_back(std::to_string(i)); + } + + size_t tid = std::hash()(std::this_thread::get_id()); + Random rnd(static_cast(tid)); + std::function stress_thread = [&](uint32_t seed) { + std::default_random_engine g(seed); + + Transaction* txn; + for (uint32_t i = 0; i < NUM_ITERS; i++) { + txn = db->BeginTransaction(write_options, txn_options); + auto random_keys = keys; + std::shuffle(random_keys.begin(), random_keys.end(), g); + + // Lock keys in random order. + for (const auto& k : random_keys) { + // Lock mostly for shared access, but exclusive 1/4 of the time. + auto s = + txn->GetForUpdate(read_options, k, nullptr, txn->GetID() % 4 == 0); + if (!s.ok()) { + ASSERT_TRUE(s.IsDeadlock()); + ASSERT_OK(txn->Rollback()); + break; + } + } + + delete txn; + } + }; + + std::vector threads; + for (uint32_t i = 0; i < NUM_TXN_THREADS; i++) { + threads.emplace_back(stress_thread, rnd.Next()); + } + + for (auto& t : threads) { + t.join(); + } +} +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +TEST_P(TransactionTest, CommitTimeBatchFailTest) { + WriteOptions write_options; + TransactionOptions txn_options; + + std::string value; + Status s; + + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn1); + + ASSERT_OK(txn1->GetCommitTimeWriteBatch()->Put("cat", "dog")); + + s = txn1->Put("foo", "bar"); + ASSERT_OK(s); + + // fails due to non-empty commit-time batch + s = txn1->Commit(); + ASSERT_EQ(s, Status::InvalidArgument()); + + delete txn1; +} + +TEST_P(TransactionTest, LogMarkLeakTest) { + TransactionOptions txn_options; + WriteOptions write_options; + options.write_buffer_size = 1024; + ASSERT_OK(ReOpenNoDelete()); + assert(db != nullptr); + Random rnd(47); + std::vector txns; + DBImpl* db_impl = static_cast_with_check(db->GetRootDB()); + // At the beginning there should be no log containing prepare data + ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0); + for (size_t i = 0; i < 100; i++) { + Transaction* txn = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn->SetName("xid" + std::to_string(i))); + ASSERT_OK(txn->Put(Slice("foo" + std::to_string(i)), Slice("bar"))); + ASSERT_OK(txn->Prepare()); + ASSERT_GT(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0); + if (rnd.OneIn(5)) { + txns.push_back(txn); + } else { + ASSERT_OK(txn->Commit()); + delete txn; + } + ASSERT_OK(db_impl->TEST_FlushMemTable(true)); + } + for (auto txn : txns) { + ASSERT_OK(txn->Commit()); + delete txn; + } + // 
At the end there should be no log left containing prepare data + ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0); + // Make sure that the underlying data structures are properly truncated and + // cause not leak + ASSERT_EQ(db_impl->TEST_PreparedSectionCompletedSize(), 0); + ASSERT_EQ(db_impl->TEST_LogsWithPrepSize(), 0); +} + +TEST_P(TransactionTest, SimpleTwoPhaseTransactionTest) { + for (bool cwb4recovery : {true, false}) { + ASSERT_OK(ReOpen()); + WriteOptions write_options; + ReadOptions read_options; + + TransactionOptions txn_options; + txn_options.use_only_the_last_commit_time_batch_for_recovery = cwb4recovery; + + std::string value; + Status s; + + DBImpl* db_impl = static_cast_with_check(db->GetRootDB()); + + Transaction* txn = db->BeginTransaction(write_options, txn_options); + s = txn->SetName("xid"); + ASSERT_OK(s); + + ASSERT_EQ(db->GetTransactionByName("xid"), txn); + + // transaction put + s = txn->Put(Slice("foo"), Slice("bar")); + ASSERT_OK(s); + ASSERT_EQ(1, txn->GetNumPuts()); + + // regular db put + s = db->Put(write_options, Slice("foo2"), Slice("bar2")); + ASSERT_OK(s); + ASSERT_EQ(1, txn->GetNumPuts()); + + // regular db read + ASSERT_OK(db->Get(read_options, "foo2", &value)); + ASSERT_EQ(value, "bar2"); + + // commit time put + if (cwb4recovery) { + ASSERT_OK( + txn->GetCommitTimeWriteBatch()->Put(Slice("gtid"), Slice("dogs"))); + ASSERT_OK( + txn->GetCommitTimeWriteBatch()->Put(Slice("gtid2"), Slice("cats"))); + } + + // nothing has been prepped yet + ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0); + + s = txn->Prepare(); + ASSERT_OK(s); + + // data not im mem yet + s = db->Get(read_options, Slice("foo"), &value); + ASSERT_TRUE(s.IsNotFound()); + s = db->Get(read_options, Slice("gtid"), &value); + ASSERT_TRUE(s.IsNotFound()); + + // find trans in list of prepared transactions + std::vector prepared_trans; + db->GetAllPreparedTransactions(&prepared_trans); + ASSERT_EQ(prepared_trans.size(), 1); + ASSERT_EQ(prepared_trans.front()->GetName(), "xid"); + + auto log_containing_prep = + db_impl->TEST_FindMinLogContainingOutstandingPrep(); + ASSERT_GT(log_containing_prep, 0); + + // make commit + s = txn->Commit(); + ASSERT_OK(s); + + // value is now available + s = db->Get(read_options, "foo", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar"); + + // we already committed + s = txn->Commit(); + ASSERT_EQ(s, Status::InvalidArgument()); + + // no longer is prepared results + db->GetAllPreparedTransactions(&prepared_trans); + ASSERT_EQ(prepared_trans.size(), 0); + ASSERT_EQ(db->GetTransactionByName("xid"), nullptr); + + // heap should not care about prepared section anymore + ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0); + + switch (txn_db_options.write_policy) { + case WRITE_COMMITTED: + // but now our memtable should be referencing the prep section + ASSERT_GE(log_containing_prep, db_impl->MinLogNumberToKeep()); + ASSERT_EQ(log_containing_prep, + db_impl->TEST_FindMinPrepLogReferencedByMemTable()); + break; + case WRITE_PREPARED: + case WRITE_UNPREPARED: + // In these modes memtable do not ref the prep sections + ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable()); + break; + default: + assert(false); + } + + ASSERT_OK(db_impl->TEST_FlushMemTable(true)); + // After flush the recoverable state must be visible + if (cwb4recovery) { + s = db->Get(read_options, "gtid", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "dogs"); + + s = db->Get(read_options, "gtid2", &value); + ASSERT_OK(s); + ASSERT_EQ(value, 
"cats"); + } + + // after memtable flush we can now relese the log + ASSERT_GT(db_impl->MinLogNumberToKeep(), log_containing_prep); + ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable()); + + delete txn; + + if (cwb4recovery) { + // kill and reopen to trigger recovery + s = ReOpenNoDelete(); + ASSERT_OK(s); + assert(db != nullptr); + s = db->Get(read_options, "gtid", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "dogs"); + + s = db->Get(read_options, "gtid2", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "cats"); + } + } +} + +TEST_P(TransactionTest, TwoPhaseNameTest) { + Status s; + + WriteOptions write_options; + TransactionOptions txn_options; + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + Transaction* txn3 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn3); + delete txn3; + + // cant prepare txn without name + s = txn1->Prepare(); + ASSERT_EQ(s, Status::InvalidArgument()); + + // name too short + s = txn1->SetName(""); + ASSERT_EQ(s, Status::InvalidArgument()); + + // name too long + s = txn1->SetName(std::string(513, 'x')); + ASSERT_EQ(s, Status::InvalidArgument()); + + // valid set name + s = txn1->SetName("name1"); + ASSERT_OK(s); + + // cant have duplicate name + s = txn2->SetName("name1"); + ASSERT_EQ(s, Status::InvalidArgument()); + + // shouldn't be able to prepare + s = txn2->Prepare(); + ASSERT_EQ(s, Status::InvalidArgument()); + + // valid name set + s = txn2->SetName("name2"); + ASSERT_OK(s); + + // cant reset name + s = txn2->SetName("name3"); + ASSERT_EQ(s, Status::InvalidArgument()); + + ASSERT_EQ(txn1->GetName(), "name1"); + ASSERT_EQ(txn2->GetName(), "name2"); + + s = txn1->Prepare(); + ASSERT_OK(s); + + // can't rename after prepare + s = txn1->SetName("name4"); + ASSERT_EQ(s, Status::InvalidArgument()); + + ASSERT_OK(txn1->Rollback()); + ASSERT_OK(txn2->Rollback()); + delete txn1; + delete txn2; +} + +TEST_P(TransactionTest, TwoPhaseEmptyWriteTest) { + for (bool cwb4recovery : {true, false}) { + for (bool test_with_empty_wal : {true, false}) { + if (!cwb4recovery && test_with_empty_wal) { + continue; + } + ASSERT_OK(ReOpen()); + Status s; + std::string value; + + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + txn_options.use_only_the_last_commit_time_batch_for_recovery = + cwb4recovery; + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn1); + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn2); + + s = txn1->SetName("joe"); + ASSERT_OK(s); + + s = txn2->SetName("bob"); + ASSERT_OK(s); + + s = txn1->Prepare(); + ASSERT_OK(s); + + s = txn1->Commit(); + ASSERT_OK(s); + + delete txn1; + + if (cwb4recovery) { + ASSERT_OK( + txn2->GetCommitTimeWriteBatch()->Put(Slice("foo"), Slice("bar"))); + } + + s = txn2->Prepare(); + ASSERT_OK(s); + + s = txn2->Commit(); + ASSERT_OK(s); + + delete txn2; + if (cwb4recovery) { + if (test_with_empty_wal) { + DBImpl* db_impl = static_cast_with_check(db->GetRootDB()); + ASSERT_OK(db_impl->TEST_FlushMemTable(true)); + // After flush the state must be visible + s = db->Get(read_options, "foo", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar"); + } + ASSERT_OK(db->FlushWAL(true)); + // kill and reopen to trigger recovery + s = ReOpenNoDelete(); + ASSERT_OK(s); + assert(db != nullptr); + s = db->Get(read_options, "foo", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar"); + } + } + } +} + +#if 
!defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +TEST_P(TransactionStressTest, TwoPhaseExpirationTest) { + Status s; + + WriteOptions write_options; + TransactionOptions txn_options; + txn_options.expiration = 500; // 500ms + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn1); + ASSERT_TRUE(txn1); + + s = txn1->SetName("joe"); + ASSERT_OK(s); + s = txn2->SetName("bob"); + ASSERT_OK(s); + + s = txn1->Prepare(); + ASSERT_OK(s); + + /* sleep override */ + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + + s = txn1->Commit(); + ASSERT_OK(s); + + s = txn2->Prepare(); + ASSERT_EQ(s, Status::Expired()); + + delete txn1; + delete txn2; +} + +TEST_P(TransactionTest, TwoPhaseRollbackTest) { + WriteOptions write_options; + ReadOptions read_options; + + TransactionOptions txn_options; + + std::string value; + Status s; + + DBImpl* db_impl = static_cast_with_check(db->GetRootDB()); + Transaction* txn = db->BeginTransaction(write_options, txn_options); + s = txn->SetName("xid"); + ASSERT_OK(s); + + // transaction put + s = txn->Put(Slice("tfoo"), Slice("tbar")); + ASSERT_OK(s); + + // value is readable form txn + s = txn->Get(read_options, Slice("tfoo"), &value); + ASSERT_OK(s); + ASSERT_EQ(value, "tbar"); + + // issue rollback + s = txn->Rollback(); + ASSERT_OK(s); + + // value is nolonger readable + s = txn->Get(read_options, Slice("tfoo"), &value); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_EQ(txn->GetNumPuts(), 0); + + // put new txn values + s = txn->Put(Slice("tfoo2"), Slice("tbar2")); + ASSERT_OK(s); + + // new value is readable from txn + s = txn->Get(read_options, Slice("tfoo2"), &value); + ASSERT_OK(s); + ASSERT_EQ(value, "tbar2"); + + s = txn->Prepare(); + ASSERT_OK(s); + + // flush to next wal + s = db->Put(write_options, Slice("foo"), Slice("bar")); + ASSERT_OK(s); + ASSERT_OK(db_impl->TEST_FlushMemTable(true)); + + // issue rollback (marker written to WAL) + s = txn->Rollback(); + ASSERT_OK(s); + + // value is nolonger readable + s = txn->Get(read_options, Slice("tfoo2"), &value); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_EQ(txn->GetNumPuts(), 0); + + // make commit + s = txn->Commit(); + ASSERT_EQ(s, Status::InvalidArgument()); + + // try rollback again + s = txn->Rollback(); + ASSERT_EQ(s, Status::InvalidArgument()); + + delete txn; +} + +TEST_P(TransactionTest, PersistentTwoPhaseTransactionTest) { + WriteOptions write_options; + write_options.sync = true; + write_options.disableWAL = false; + ReadOptions read_options; + + TransactionOptions txn_options; + + std::string value; + Status s; + + DBImpl* db_impl = static_cast_with_check(db->GetRootDB()); + + Transaction* txn = db->BeginTransaction(write_options, txn_options); + s = txn->SetName("xid"); + ASSERT_OK(s); + + ASSERT_EQ(db->GetTransactionByName("xid"), txn); + + // transaction put + s = txn->Put(Slice("foo"), Slice("bar")); + ASSERT_OK(s); + ASSERT_EQ(1, txn->GetNumPuts()); + + // txn read + s = txn->Get(read_options, "foo", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar"); + + // regular db put + s = db->Put(write_options, Slice("foo2"), Slice("bar2")); + ASSERT_OK(s); + ASSERT_EQ(1, txn->GetNumPuts()); + + ASSERT_OK(db_impl->TEST_FlushMemTable(true)); + + // regular db read + db->Get(read_options, "foo2", &value); + ASSERT_EQ(value, "bar2"); + + // nothing has been prepped yet + ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0); + + // prepare + s = txn->Prepare(); + 
ASSERT_OK(s);
+
+ // still not available to db
+ s = db->Get(read_options, Slice("foo"), &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ ASSERT_OK(db->FlushWAL(false));
+ delete txn;
+ // kill and reopen
+ reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
+ s = ReOpenNoDelete();
+ ASSERT_OK(s);
+ assert(db != nullptr);
+ db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+
+ // find trans in list of prepared transactions
+ std::vector<Transaction*> prepared_trans;
+ db->GetAllPreparedTransactions(&prepared_trans);
+ ASSERT_EQ(prepared_trans.size(), 1);
+
+ txn = prepared_trans.front();
+ ASSERT_TRUE(txn);
+ ASSERT_EQ(txn->GetName(), "xid");
+ ASSERT_EQ(db->GetTransactionByName("xid"), txn);
+
+ // log has been marked
+ auto log_containing_prep =
+ db_impl->TEST_FindMinLogContainingOutstandingPrep();
+ ASSERT_GT(log_containing_prep, 0);
+
+ // value is readable from txn
+ s = txn->Get(read_options, "foo", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "bar");
+
+ // make commit
+ s = txn->Commit();
+ ASSERT_OK(s);
+
+ // value is now available
+ db->Get(read_options, "foo", &value);
+ ASSERT_EQ(value, "bar");
+
+ // we already committed
+ s = txn->Commit();
+ ASSERT_EQ(s, Status::InvalidArgument());
+
+ // no longer in prepared results
+ prepared_trans.clear();
+ db->GetAllPreparedTransactions(&prepared_trans);
+ ASSERT_EQ(prepared_trans.size(), 0);
+
+ // transaction should no longer be visible
+ ASSERT_EQ(db->GetTransactionByName("xid"), nullptr);
+
+ // heap should not care about prepared section anymore
+ ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
+
+ switch (txn_db_options.write_policy) {
+ case WRITE_COMMITTED:
+ // but now our memtable should be referencing the prep section
+ ASSERT_EQ(log_containing_prep,
+ db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+ ASSERT_GE(log_containing_prep, db_impl->MinLogNumberToKeep());
+
+ break;
+ case WRITE_PREPARED:
+ case WRITE_UNPREPARED:
+ // In these modes memtables do not ref the prep sections
+ ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+ break;
+ default:
+ assert(false);
+ }
+
+ // Add a dummy record to memtable before a flush. Otherwise, the
+ // memtable will be empty and flush will be skipped.
+ s = db->Put(write_options, Slice("foo3"), Slice("bar3")); + ASSERT_OK(s); + + ASSERT_OK(db_impl->TEST_FlushMemTable(true)); + + // after memtable flush we can now release the log + ASSERT_GT(db_impl->MinLogNumberToKeep(), log_containing_prep); + ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable()); + + delete txn; + + // deleting transaction should unregister transaction + ASSERT_EQ(db->GetTransactionByName("xid"), nullptr); +} +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +// TODO this test needs to be updated with serial commits +TEST_P(TransactionTest, DISABLED_TwoPhaseMultiThreadTest) { + // mix transaction writes and regular writes + const uint32_t NUM_TXN_THREADS = 50; + std::atomic txn_thread_num(0); + + std::function txn_write_thread = [&]() { + uint32_t id = txn_thread_num.fetch_add(1); + + WriteOptions write_options; + write_options.sync = true; + write_options.disableWAL = false; + TransactionOptions txn_options; + txn_options.lock_timeout = 1000000; + if (id % 2 == 0) { + txn_options.expiration = 1000000; + } + TransactionName name("xid_" + std::string(1, 'A' + static_cast(id))); + Transaction* txn = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn->SetName(name)); + for (int i = 0; i < 10; i++) { + std::string key(name + "_" + std::string(1, static_cast('A' + i))); + ASSERT_OK(txn->Put(key, "val")); + } + ASSERT_OK(txn->Prepare()); + ASSERT_OK(txn->Commit()); + delete txn; + }; + + // assure that all thread are in the same write group + std::atomic t_wait_on_prepare(0); + std::atomic t_wait_on_commit(0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::JoinBatchGroup:Wait", [&](void* arg) { + auto* writer = reinterpret_cast(arg); + + if (writer->ShouldWriteToWAL()) { + t_wait_on_prepare.fetch_add(1); + // wait for friends + while (t_wait_on_prepare.load() < NUM_TXN_THREADS) { + env->SleepForMicroseconds(10); + } + } else if (writer->ShouldWriteToMemtable()) { + t_wait_on_commit.fetch_add(1); + // wait for friends + while (t_wait_on_commit.load() < NUM_TXN_THREADS) { + env->SleepForMicroseconds(10); + } + } else { + FAIL(); + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // do all the writes + std::vector threads; + for (uint32_t i = 0; i < NUM_TXN_THREADS; i++) { + threads.emplace_back(txn_write_thread); + } + for (auto& t : threads) { + t.join(); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + + ReadOptions read_options; + std::string value; + Status s; + for (uint32_t t = 0; t < NUM_TXN_THREADS; t++) { + TransactionName name("xid_" + std::string(1, 'A' + static_cast(t))); + for (int i = 0; i < 10; i++) { + std::string key(name + "_" + std::string(1, static_cast('A' + i))); + s = db->Get(read_options, key, &value); + ASSERT_OK(s); + ASSERT_EQ(value, "val"); + } + } +} + +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +TEST_P(TransactionStressTest, TwoPhaseLongPrepareTest) { + WriteOptions write_options; + write_options.sync = true; + write_options.disableWAL = false; + ReadOptions read_options; + TransactionOptions txn_options; + + std::string value; + Status s; + + Transaction* txn = db->BeginTransaction(write_options, txn_options); + s = txn->SetName("bob"); + ASSERT_OK(s); + + // transaction put + s = txn->Put(Slice("foo"), Slice("bar")); + ASSERT_OK(s); + + // prepare + s = txn->Prepare(); + ASSERT_OK(s); + + delete txn; + 
+ for (int i = 0; i < 1000; i++) { + std::string key(i, 'k'); + std::string val(1000, 'v'); + assert(db != nullptr); + s = db->Put(write_options, key, val); + ASSERT_OK(s); + + if (i % 29 == 0) { + // crash + env->SetFilesystemActive(false); + reinterpret_cast(db)->TEST_Crash(); + ReOpenNoDelete(); + } else if (i % 37 == 0) { + // close + ReOpenNoDelete(); + } + } + + // commit old txn + txn = db->GetTransactionByName("bob"); + ASSERT_TRUE(txn); + s = txn->Commit(); + ASSERT_OK(s); + + // verify data txn data + s = db->Get(read_options, "foo", &value); + ASSERT_EQ(s, Status::OK()); + ASSERT_EQ(value, "bar"); + + // verify non txn data + for (int i = 0; i < 1000; i++) { + std::string key(i, 'k'); + std::string val(1000, 'v'); + s = db->Get(read_options, key, &value); + ASSERT_EQ(s, Status::OK()); + ASSERT_EQ(value, val); + } + + delete txn; +} + +TEST_P(TransactionTest, TwoPhaseSequenceTest) { + WriteOptions write_options; + write_options.sync = true; + write_options.disableWAL = false; + ReadOptions read_options; + + TransactionOptions txn_options; + + std::string value; + Status s; + + Transaction* txn = db->BeginTransaction(write_options, txn_options); + s = txn->SetName("xid"); + ASSERT_OK(s); + + // transaction put + s = txn->Put(Slice("foo"), Slice("bar")); + ASSERT_OK(s); + s = txn->Put(Slice("foo2"), Slice("bar2")); + ASSERT_OK(s); + s = txn->Put(Slice("foo3"), Slice("bar3")); + ASSERT_OK(s); + s = txn->Put(Slice("foo4"), Slice("bar4")); + ASSERT_OK(s); + + // prepare + s = txn->Prepare(); + ASSERT_OK(s); + + // make commit + s = txn->Commit(); + ASSERT_OK(s); + + delete txn; + + // kill and reopen + env->SetFilesystemActive(false); + ReOpenNoDelete(); + assert(db != nullptr); + + // value is now available + s = db->Get(read_options, "foo4", &value); + ASSERT_EQ(s, Status::OK()); + ASSERT_EQ(value, "bar4"); +} +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +TEST_P(TransactionTest, TwoPhaseDoubleRecoveryTest) { + WriteOptions write_options; + write_options.sync = true; + write_options.disableWAL = false; + ReadOptions read_options; + + TransactionOptions txn_options; + + std::string value; + Status s; + + Transaction* txn = db->BeginTransaction(write_options, txn_options); + s = txn->SetName("a"); + ASSERT_OK(s); + + // transaction put + s = txn->Put(Slice("foo"), Slice("bar")); + ASSERT_OK(s); + + // prepare + s = txn->Prepare(); + ASSERT_OK(s); + + delete txn; + + // kill and reopen + env->SetFilesystemActive(false); + reinterpret_cast(db)->TEST_Crash(); + ReOpenNoDelete(); + + // commit old txn + assert(db != nullptr); // Make clang analyze happy. 
+ txn = db->GetTransactionByName("a"); + assert(txn != nullptr); + s = txn->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "foo", &value); + ASSERT_EQ(s, Status::OK()); + ASSERT_EQ(value, "bar"); + + delete txn; + + txn = db->BeginTransaction(write_options, txn_options); + s = txn->SetName("b"); + ASSERT_OK(s); + + s = txn->Put(Slice("foo2"), Slice("bar2")); + ASSERT_OK(s); + + s = txn->Prepare(); + ASSERT_OK(s); + + s = txn->Commit(); + ASSERT_OK(s); + + delete txn; + + // kill and reopen + env->SetFilesystemActive(false); + ASSERT_OK(ReOpenNoDelete()); + assert(db != nullptr); + + // value is now available + s = db->Get(read_options, "foo", &value); + ASSERT_EQ(s, Status::OK()); + ASSERT_EQ(value, "bar"); + + s = db->Get(read_options, "foo2", &value); + ASSERT_EQ(s, Status::OK()); + ASSERT_EQ(value, "bar2"); +} + +TEST_P(TransactionTest, TwoPhaseLogRollingTest) { + DBImpl* db_impl = static_cast_with_check(db->GetRootDB()); + + Status s; + std::string v; + ColumnFamilyHandle *cfa, *cfb; + + // Create 2 new column families + ColumnFamilyOptions cf_options; + s = db->CreateColumnFamily(cf_options, "CFA", &cfa); + ASSERT_OK(s); + s = db->CreateColumnFamily(cf_options, "CFB", &cfb); + ASSERT_OK(s); + + WriteOptions wopts; + wopts.disableWAL = false; + wopts.sync = true; + + TransactionOptions topts1; + Transaction* txn1 = db->BeginTransaction(wopts, topts1); + s = txn1->SetName("xid1"); + ASSERT_OK(s); + + TransactionOptions topts2; + Transaction* txn2 = db->BeginTransaction(wopts, topts2); + s = txn2->SetName("xid2"); + ASSERT_OK(s); + + // transaction put in two column families + s = txn1->Put(cfa, "ka1", "va1"); + ASSERT_OK(s); + + // transaction put in two column families + s = txn2->Put(cfa, "ka2", "va2"); + ASSERT_OK(s); + s = txn2->Put(cfb, "kb2", "vb2"); + ASSERT_OK(s); + + // write prep section to wal + s = txn1->Prepare(); + ASSERT_OK(s); + + // our log should be in the heap + ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), + txn1->GetLogNumber()); + ASSERT_EQ(db_impl->TEST_LogfileNumber(), txn1->GetLastLogNumber()); + + // flush default cf to crate new log + s = db->Put(wopts, "foo", "bar"); + ASSERT_OK(s); + s = db_impl->TEST_FlushMemTable(true); + ASSERT_OK(s); + + // make sure we are on a new log + ASSERT_GT(db_impl->TEST_LogfileNumber(), txn1->GetLastLogNumber()); + + // put txn2 prep section in this log + s = txn2->Prepare(); + ASSERT_OK(s); + ASSERT_EQ(db_impl->TEST_LogfileNumber(), txn2->GetLastLogNumber()); + + // heap should still see first log + ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), + txn1->GetLogNumber()); + + // commit txn1 + s = txn1->Commit(); + ASSERT_OK(s); + + // heap should now show txn2s log + ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), + txn2->GetLogNumber()); + + switch (txn_db_options.write_policy) { + case WRITE_COMMITTED: + // we should see txn1s log refernced by the memtables + ASSERT_EQ(txn1->GetLogNumber(), + db_impl->TEST_FindMinPrepLogReferencedByMemTable()); + break; + case WRITE_PREPARED: + case WRITE_UNPREPARED: + // In these modes memtable do not ref the prep sections + ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable()); + break; + default: + assert(false); + } + + // flush default cf to crate new log + s = db->Put(wopts, "foo", "bar2"); + ASSERT_OK(s); + s = db_impl->TEST_FlushMemTable(true); + ASSERT_OK(s); + + // make sure we are on a new log + ASSERT_GT(db_impl->TEST_LogfileNumber(), txn2->GetLastLogNumber()); + + // commit txn2 + s = txn2->Commit(); + 
ASSERT_OK(s);
+
+ // heap should not show any logs
+ ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
+
+ switch (txn_db_options.write_policy) {
+ case WRITE_COMMITTED:
+ // should show the first txn log
+ ASSERT_EQ(txn1->GetLogNumber(),
+ db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+ break;
+ case WRITE_PREPARED:
+ case WRITE_UNPREPARED:
+ // In these modes memtables do not ref the prep sections
+ ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+ break;
+ default:
+ assert(false);
+ }
+
+ // flush only cfa memtable
+ s = db_impl->TEST_FlushMemTable(true, false, cfa);
+ ASSERT_OK(s);
+
+ switch (txn_db_options.write_policy) {
+ case WRITE_COMMITTED:
+ // should show the first txn log
+ ASSERT_EQ(txn2->GetLogNumber(),
+ db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+ break;
+ case WRITE_PREPARED:
+ case WRITE_UNPREPARED:
+ // In these modes memtables do not ref the prep sections
+ ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+ break;
+ default:
+ assert(false);
+ }
+
+ // flush only cfb memtable
+ s = db_impl->TEST_FlushMemTable(true, false, cfb);
+ ASSERT_OK(s);
+
+ // should show no dependency on logs
+ ASSERT_EQ(db_impl->TEST_FindMinPrepLogReferencedByMemTable(), 0);
+ ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
+
+ delete txn1;
+ delete txn2;
+ delete cfa;
+ delete cfb;
+}
+
+TEST_P(TransactionTest, TwoPhaseLogRollingTest2) {
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+
+ Status s;
+ ColumnFamilyHandle *cfa, *cfb;
+
+ ColumnFamilyOptions cf_options;
+ s = db->CreateColumnFamily(cf_options, "CFA", &cfa);
+ ASSERT_OK(s);
+ s = db->CreateColumnFamily(cf_options, "CFB", &cfb);
+ ASSERT_OK(s);
+
+ WriteOptions wopts;
+ wopts.disableWAL = false;
+ wopts.sync = true;
+
+ auto cfh_a = static_cast_with_check<ColumnFamilyHandleImpl>(cfa);
+ auto cfh_b = static_cast_with_check<ColumnFamilyHandleImpl>(cfb);
+
+ TransactionOptions topts1;
+ Transaction* txn1 = db->BeginTransaction(wopts, topts1);
+ s = txn1->SetName("xid1");
+ ASSERT_OK(s);
+ s = txn1->Put(cfa, "boys", "girls1");
+ ASSERT_OK(s);
+
+ Transaction* txn2 = db->BeginTransaction(wopts, topts1);
+ s = txn2->SetName("xid2");
+ ASSERT_OK(s);
+ s = txn2->Put(cfb, "up", "down1");
+ ASSERT_OK(s);
+
+ // prepare transaction in LOG A
+ s = txn1->Prepare();
+ ASSERT_OK(s);
+
+ // prepare transaction in LOG A
+ s = txn2->Prepare();
+ ASSERT_OK(s);
+
+ // regular put so that mem table can actually be flushed for log rolling
+ s = db->Put(wopts, "cats", "dogs1");
+ ASSERT_OK(s);
+
+ auto prepare_log_no = txn1->GetLastLogNumber();
+
+ // roll to LOG B
+ s = db_impl->TEST_FlushMemTable(true);
+ ASSERT_OK(s);
+
+ // now we pause background work so that
+ // imm()s are not flushed before we can check their status
+ s = db_impl->PauseBackgroundWork();
+ ASSERT_OK(s);
+
+ ASSERT_GT(db_impl->TEST_LogfileNumber(), prepare_log_no);
+ switch (txn_db_options.write_policy) {
+ case WRITE_COMMITTED:
+ // This cf is empty and should ref the latest log
+ ASSERT_GT(cfh_a->cfd()->GetLogNumber(), prepare_log_no);
+ ASSERT_EQ(cfh_a->cfd()->GetLogNumber(), db_impl->TEST_LogfileNumber());
+ break;
+ case WRITE_PREPARED:
+ case WRITE_UNPREPARED:
+ // This cf is not flushed yet and should ref the log that has its data
+ ASSERT_EQ(cfh_a->cfd()->GetLogNumber(), prepare_log_no);
+ break;
+ default:
+ assert(false);
+ }
+ ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(),
+ txn1->GetLogNumber());
+ ASSERT_EQ(db_impl->TEST_FindMinPrepLogReferencedByMemTable(), 0);
+
+ // commit in LOG B
+ s = txn1->Commit();
+ ASSERT_OK(s);
+
+ switch (txn_db_options.write_policy) {
+ case WRITE_COMMITTED:
+ ASSERT_EQ(db_impl->TEST_FindMinPrepLogReferencedByMemTable(),
+ prepare_log_no);
+ break;
+ case WRITE_PREPARED:
+ case WRITE_UNPREPARED:
+ // In these modes memtables do not ref the prep sections
+ ASSERT_EQ(db_impl->TEST_FindMinPrepLogReferencedByMemTable(), 0);
+ break;
+ default:
+ assert(false);
+ }
+
+ ASSERT_TRUE(!db_impl->TEST_UnableToReleaseOldestLog());
+
+ // request a flush for all column families such that the earliest
+ // alive log file can be killed
+ ASSERT_OK(db_impl->TEST_SwitchWAL());
+ // log cannot be flushed because txn2 has not been committed
+ ASSERT_TRUE(!db_impl->TEST_IsLogGettingFlushed());
+ ASSERT_TRUE(db_impl->TEST_UnableToReleaseOldestLog());
+
+ // assert that cfa has a flush requested
+ ASSERT_TRUE(cfh_a->cfd()->imm()->HasFlushRequested());
+
+ switch (txn_db_options.write_policy) {
+ case WRITE_COMMITTED:
+ // cfb should not be flushed because it has no data from LOG A
+ ASSERT_TRUE(!cfh_b->cfd()->imm()->HasFlushRequested());
+ break;
+ case WRITE_PREPARED:
+ case WRITE_UNPREPARED:
+ // cfb should be flushed because it has prepared data from LOG A
+ ASSERT_TRUE(cfh_b->cfd()->imm()->HasFlushRequested());
+ break;
+ default:
+ assert(false);
+ }
+
+ // cfb now has data from LOG A
+ s = txn2->Commit();
+ ASSERT_OK(s);
+
+ ASSERT_OK(db_impl->TEST_SwitchWAL());
+ ASSERT_TRUE(!db_impl->TEST_UnableToReleaseOldestLog());
+
+ // we should see that cfb now has a flush requested
+ ASSERT_TRUE(cfh_b->cfd()->imm()->HasFlushRequested());
+
+ // all data in LOG A resides in a memtable that has been
+ // requested for a flush
+ ASSERT_TRUE(db_impl->TEST_IsLogGettingFlushed());
+
+ delete txn1;
+ delete txn2;
+ delete cfa;
+ delete cfb;
+}
+/*
+ * 1) use prepare to keep first log around to determine starting sequence
+ * during recovery.
+ * 2) insert many values, skipping wal, to increase seqid.
+ * 3) insert final value into wal
+ * 4) recover and see that final value was properly recovered - not
+ * hidden behind improperly summed sequence ids
+ */
+TEST_P(TransactionTest, TwoPhaseOutOfOrderDelete) {
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ WriteOptions wal_on, wal_off;
+ wal_on.sync = true;
+ wal_on.disableWAL = false;
+ wal_off.disableWAL = true;
+ ReadOptions read_options;
+ TransactionOptions txn_options;
+
+ std::string value;
+ Status s;
+
+ Transaction* txn1 = db->BeginTransaction(wal_on, txn_options);
+
+ s = txn1->SetName("1");
+ ASSERT_OK(s);
+
+ s = db->Put(wal_on, "first", "first");
+ ASSERT_OK(s);
+
+ s = txn1->Put(Slice("dummy"), Slice("dummy"));
+ ASSERT_OK(s);
+ s = txn1->Prepare();
+ ASSERT_OK(s);
+
+ s = db->Put(wal_off, "cats", "dogs1");
+ ASSERT_OK(s);
+ s = db->Put(wal_off, "cats", "dogs2");
+ ASSERT_OK(s);
+ s = db->Put(wal_off, "cats", "dogs3");
+ ASSERT_OK(s);
+
+ s = db_impl->TEST_FlushMemTable(true);
+ ASSERT_OK(s);
+
+ s = db->Put(wal_on, "cats", "dogs4");
+ ASSERT_OK(s);
+
+ ASSERT_OK(db->FlushWAL(false));
+
+ // kill and reopen
+ env->SetFilesystemActive(false);
+ reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
+ ASSERT_OK(ReOpenNoDelete());
+ assert(db != nullptr);
+
+ s = db->Get(read_options, "first", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "first");
+
+ s = db->Get(read_options, "cats", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "dogs4");
+}
+
+TEST_P(TransactionTest, FirstWriteTest) {
+ WriteOptions write_options;
+
+ // Test conflict checking against the very first write to a db.
+ // The transaction's snapshot will have seq 1 and the following write + // will have sequence 1. + Status s = db->Put(write_options, "A", "a"); + + Transaction* txn = db->BeginTransaction(write_options); + txn->SetSnapshot(); + + ASSERT_OK(s); + + s = txn->Put("A", "b"); + ASSERT_OK(s); + + delete txn; +} + +TEST_P(TransactionTest, FirstWriteTest2) { + WriteOptions write_options; + + Transaction* txn = db->BeginTransaction(write_options); + txn->SetSnapshot(); + + // Test conflict checking against the very first write to a db. + // The transaction's snapshot is a seq 0 while the following write + // will have sequence 1. + Status s = db->Put(write_options, "A", "a"); + ASSERT_OK(s); + + s = txn->Put("A", "b"); + ASSERT_TRUE(s.IsBusy()); + + delete txn; +} + +TEST_P(TransactionTest, WriteOptionsTest) { + WriteOptions write_options; + write_options.sync = true; + write_options.disableWAL = true; + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + ASSERT_TRUE(txn->GetWriteOptions()->sync); + + write_options.sync = false; + txn->SetWriteOptions(write_options); + ASSERT_FALSE(txn->GetWriteOptions()->sync); + ASSERT_TRUE(txn->GetWriteOptions()->disableWAL); + + delete txn; +} + +TEST_P(TransactionTest, WriteConflictTest) { + WriteOptions write_options; + ReadOptions read_options; + std::string value; + Status s; + + ASSERT_OK(db->Put(write_options, "foo", "A")); + ASSERT_OK(db->Put(write_options, "foo2", "B")); + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + s = txn->Put("foo", "A2"); + ASSERT_OK(s); + + s = txn->Put("foo2", "B2"); + ASSERT_OK(s); + + // This Put outside of a transaction will conflict with the previous write + s = db->Put(write_options, "foo", "xxx"); + ASSERT_TRUE(s.IsTimedOut()); + + s = db->Get(read_options, "foo", &value); + ASSERT_EQ(value, "A"); + + s = txn->Commit(); + ASSERT_OK(s); + + db->Get(read_options, "foo", &value); + ASSERT_EQ(value, "A2"); + db->Get(read_options, "foo2", &value); + ASSERT_EQ(value, "B2"); + + delete txn; +} + +TEST_P(TransactionTest, WriteConflictTest2) { + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + std::string value; + Status s; + + ASSERT_OK(db->Put(write_options, "foo", "bar")); + + txn_options.set_snapshot = true; + Transaction* txn = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn); + + // This Put outside of a transaction will conflict with a later write + s = db->Put(write_options, "foo", "barz"); + ASSERT_OK(s); + + s = txn->Put("foo2", "X"); + ASSERT_OK(s); + + s = txn->Put("foo", + "bar2"); // Conflicts with write done after snapshot taken + ASSERT_TRUE(s.IsBusy()); + + s = txn->Put("foo3", "Y"); + ASSERT_OK(s); + + s = db->Get(read_options, "foo", &value); + ASSERT_EQ(value, "barz"); + + ASSERT_EQ(2, txn->GetNumKeys()); + + s = txn->Commit(); + ASSERT_OK(s); // Txn should commit, but only write foo2 and foo3 + + // Verify that transaction wrote foo2 and foo3 but not foo + db->Get(read_options, "foo", &value); + ASSERT_EQ(value, "barz"); + + db->Get(read_options, "foo2", &value); + ASSERT_EQ(value, "X"); + + db->Get(read_options, "foo3", &value); + ASSERT_EQ(value, "Y"); + + delete txn; +} + +TEST_P(TransactionTest, ReadConflictTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + TransactionOptions txn_options; + std::string value; + Status s; + + ASSERT_OK(db->Put(write_options, "foo", "bar")); + ASSERT_OK(db->Put(write_options, "foo2", "bar")); + + 
txn_options.set_snapshot = true; + Transaction* txn = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn); + + txn->SetSnapshot(); + snapshot_read_options.snapshot = txn->GetSnapshot(); + + ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value)); + ASSERT_EQ(value, "bar"); + + // This Put outside of a transaction will conflict with the previous read + s = db->Put(write_options, "foo", "barz"); + ASSERT_TRUE(s.IsTimedOut()); + + s = db->Get(read_options, "foo", &value); + ASSERT_EQ(value, "bar"); + + s = txn->Get(read_options, "foo", &value); + ASSERT_EQ(value, "bar"); + + s = txn->Commit(); + ASSERT_OK(s); + + delete txn; +} + +TEST_P(TransactionTest, TxnOnlyTest) { + // Test to make sure transactions work when there are no other writes in an + // empty db. + + WriteOptions write_options; + ReadOptions read_options; + std::string value; + Status s; + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + s = txn->Put("x", "y"); + ASSERT_OK(s); + + s = txn->Commit(); + ASSERT_OK(s); + + delete txn; +} + +TEST_P(TransactionTest, FlushTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + std::string value; + Status s; + + ASSERT_OK(db->Put(write_options, Slice("foo"), Slice("bar"))); + ASSERT_OK(db->Put(write_options, Slice("foo2"), Slice("bar"))); + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + snapshot_read_options.snapshot = txn->GetSnapshot(); + + ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value)); + ASSERT_EQ(value, "bar"); + + s = txn->Put(Slice("foo"), Slice("bar2")); + ASSERT_OK(s); + + ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value)); + ASSERT_EQ(value, "bar2"); + + // Put a random key so we have a memtable to flush + s = db->Put(write_options, "dummy", "dummy"); + ASSERT_OK(s); + + // force a memtable flush + FlushOptions flush_ops; + db->Flush(flush_ops); + + s = txn->Commit(); + // txn should commit since the flushed table is still in MemtableList History + ASSERT_OK(s); + + db->Get(read_options, "foo", &value); + ASSERT_EQ(value, "bar2"); + + delete txn; +} + +TEST_P(TransactionTest, FlushTest2) { + const size_t num_tests = 3; + + for (size_t n = 0; n < num_tests; n++) { + // Test different table factories + switch (n) { + case 0: + break; + case 1: + options.table_factory.reset(new mock::MockTableFactory()); + break; + case 2: { + PlainTableOptions pt_opts; + pt_opts.hash_table_ratio = 0; + options.table_factory.reset(NewPlainTableFactory(pt_opts)); + break; + } + } + + Status s = ReOpen(); + ASSERT_OK(s); + assert(db != nullptr); + + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + TransactionOptions txn_options; + std::string value; + + DBImpl* db_impl = static_cast_with_check(db->GetRootDB()); + + ASSERT_OK(db->Put(write_options, Slice("foo"), Slice("bar"))); + ASSERT_OK(db->Put(write_options, Slice("foo2"), Slice("bar2"))); + ASSERT_OK(db->Put(write_options, Slice("foo3"), Slice("bar3"))); + + txn_options.set_snapshot = true; + Transaction* txn = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn); + + snapshot_read_options.snapshot = txn->GetSnapshot(); + + ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value)); + ASSERT_EQ(value, "bar"); + + s = txn->Put(Slice("foo"), Slice("bar2")); + ASSERT_OK(s); + + ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value)); + ASSERT_EQ(value, "bar2"); + // verify foo is locked by txn + s = 
db->Delete(write_options, "foo"); + ASSERT_TRUE(s.IsTimedOut()); + + s = db->Put(write_options, "Z", "z"); + ASSERT_OK(s); + s = db->Put(write_options, "dummy", "dummy"); + ASSERT_OK(s); + + s = db->Put(write_options, "S", "s"); + ASSERT_OK(s); + s = db->SingleDelete(write_options, "S"); + ASSERT_OK(s); + + s = txn->Delete("S"); + // Should fail after encountering a write to S in memtable + ASSERT_TRUE(s.IsBusy()); + + // force a memtable flush + s = db_impl->TEST_FlushMemTable(true); + ASSERT_OK(s); + + // Put a random key so we have a MemTable to flush + s = db->Put(write_options, "dummy", "dummy2"); + ASSERT_OK(s); + + // force a memtable flush + ASSERT_OK(db_impl->TEST_FlushMemTable(true)); + + s = db->Put(write_options, "dummy", "dummy3"); + ASSERT_OK(s); + + // force a memtable flush + // Since our test db has max_write_buffer_number=2, this flush will cause + // the first memtable to get purged from the MemtableList history. + ASSERT_OK(db_impl->TEST_FlushMemTable(true)); + + s = txn->Put("X", "Y"); + // Should succeed after verifying there is no write to X in SST file + ASSERT_OK(s); + + s = txn->Put("Z", "zz"); + // Should fail after encountering a write to Z in SST file + ASSERT_TRUE(s.IsBusy()); + + s = txn->GetForUpdate(read_options, "foo2", &value); + // should succeed since key was written before txn started + ASSERT_OK(s); + // verify foo2 is locked by txn + s = db->Delete(write_options, "foo2"); + ASSERT_TRUE(s.IsTimedOut()); + + s = txn->Delete("S"); + // Should fail after encountering a write to S in SST file + ASSERT_TRUE(s.IsBusy()); + + // Write a bunch of keys to db to force a compaction + Random rnd(47); + for (int i = 0; i < 1000; i++) { + s = db->Put(write_options, std::to_string(i), + test::CompressibleString(&rnd, 0.8, 100, &value)); + ASSERT_OK(s); + } + + s = txn->Put("X", "yy"); + // Should succeed after verifying there is no write to X in SST file + ASSERT_OK(s); + + s = txn->Put("Z", "zzz"); + // Should fail after encountering a write to Z in SST file + ASSERT_TRUE(s.IsBusy()); + + s = txn->Delete("S"); + // Should fail after encountering a write to S in SST file + ASSERT_TRUE(s.IsBusy()); + + s = txn->GetForUpdate(read_options, "foo3", &value); + // should succeed since key was written before txn started + ASSERT_OK(s); + // verify foo3 is locked by txn + s = db->Delete(write_options, "foo3"); + ASSERT_TRUE(s.IsTimedOut()); + + ASSERT_OK(db_impl->TEST_WaitForCompact()); + + s = txn->Commit(); + ASSERT_OK(s); + + // Transaction should only write the keys that succeeded. 
+ s = db->Get(read_options, "foo", &value); + ASSERT_EQ(value, "bar2"); + + s = db->Get(read_options, "X", &value); + ASSERT_OK(s); + ASSERT_EQ("yy", value); + + s = db->Get(read_options, "Z", &value); + ASSERT_OK(s); + ASSERT_EQ("z", value); + + delete txn; + } +} + +TEST_P(TransactionTest, NoSnapshotTest) { + WriteOptions write_options; + ReadOptions read_options; + std::string value; + Status s; + + ASSERT_OK(db->Put(write_options, "AAA", "bar")); + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + // Modify key after transaction start + ASSERT_OK(db->Put(write_options, "AAA", "bar1")); + + // Read and write without a snap + ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value)); + ASSERT_EQ(value, "bar1"); + s = txn->Put("AAA", "bar2"); + ASSERT_OK(s); + + // Should commit since read/write was done after data changed + s = txn->Commit(); + ASSERT_OK(s); + + ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value)); + ASSERT_EQ(value, "bar2"); + + delete txn; +} + +TEST_P(TransactionTest, MultipleSnapshotTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + std::string value; + Status s; + + ASSERT_OK(db->Put(write_options, "AAA", "bar")); + ASSERT_OK(db->Put(write_options, "BBB", "bar")); + ASSERT_OK(db->Put(write_options, "CCC", "bar")); + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + ASSERT_OK(db->Put(write_options, "AAA", "bar1")); + + // Read and write without a snapshot + ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value)); + ASSERT_EQ(value, "bar1"); + s = txn->Put("AAA", "bar2"); + ASSERT_OK(s); + + // Modify BBB before snapshot is taken + ASSERT_OK(db->Put(write_options, "BBB", "bar1")); + + txn->SetSnapshot(); + snapshot_read_options.snapshot = txn->GetSnapshot(); + + // Read and write with snapshot + ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "BBB", &value)); + ASSERT_EQ(value, "bar1"); + s = txn->Put("BBB", "bar2"); + ASSERT_OK(s); + + ASSERT_OK(db->Put(write_options, "CCC", "bar1")); + + // Set a new snapshot + txn->SetSnapshot(); + snapshot_read_options.snapshot = txn->GetSnapshot(); + + // Read and write with snapshot + ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "CCC", &value)); + ASSERT_EQ(value, "bar1"); + s = txn->Put("CCC", "bar2"); + ASSERT_OK(s); + + s = txn->GetForUpdate(read_options, "AAA", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar2"); + s = txn->GetForUpdate(read_options, "BBB", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar2"); + s = txn->GetForUpdate(read_options, "CCC", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar2"); + + s = db->Get(read_options, "AAA", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar1"); + s = db->Get(read_options, "BBB", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar1"); + s = db->Get(read_options, "CCC", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar1"); + + s = txn->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "AAA", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar2"); + s = db->Get(read_options, "BBB", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar2"); + s = db->Get(read_options, "CCC", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar2"); + + // verify that we track multiple writes to the same key at different snapshots + delete txn; + txn = db->BeginTransaction(write_options); + + // Potentially conflicting writes + ASSERT_OK(db->Put(write_options, "ZZZ", "zzz")); + ASSERT_OK(db->Put(write_options, "XXX", "xxx")); + + txn->SetSnapshot(); + + TransactionOptions txn_options; + 
txn_options.set_snapshot = true; + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + txn2->SetSnapshot(); + + // This should not conflict in txn since the snapshot is later than the + // previous write (spoiler alert: it will later conflict with txn2). + s = txn->Put("ZZZ", "zzzz"); + ASSERT_OK(s); + + s = txn->Commit(); + ASSERT_OK(s); + + delete txn; + + // This will conflict since the snapshot is earlier than another write to ZZZ + s = txn2->Put("ZZZ", "xxxxx"); + ASSERT_TRUE(s.IsBusy()); + + s = txn2->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "ZZZ", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "zzzz"); + + delete txn2; +} + +TEST_P(TransactionTest, ColumnFamiliesTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + TransactionOptions txn_options; + std::string value; + Status s; + + ColumnFamilyHandle *cfa, *cfb; + ColumnFamilyOptions cf_options; + + // Create 2 new column families + s = db->CreateColumnFamily(cf_options, "CFA", &cfa); + ASSERT_OK(s); + s = db->CreateColumnFamily(cf_options, "CFB", &cfb); + ASSERT_OK(s); + + delete cfa; + delete cfb; + delete db; + db = nullptr; + + // open DB with three column families + std::vector<ColumnFamilyDescriptor> column_families; + // have to open default column family + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions())); + // open the new column families + column_families.push_back( + ColumnFamilyDescriptor("CFA", ColumnFamilyOptions())); + column_families.push_back( + ColumnFamilyDescriptor("CFB", ColumnFamilyOptions())); + + std::vector<ColumnFamilyHandle*> handles; + + ASSERT_OK(ReOpenNoDelete(column_families, &handles)); + assert(db != nullptr); + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + txn->SetSnapshot(); + snapshot_read_options.snapshot = txn->GetSnapshot(); + + txn_options.set_snapshot = true; + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn2); + + // Write some data to the db + WriteBatch batch; + ASSERT_OK(batch.Put("foo", "foo")); + ASSERT_OK(batch.Put(handles[1], "AAA", "bar")); + ASSERT_OK(batch.Put(handles[1], "AAAZZZ", "bar")); + s = db->Write(write_options, &batch); + ASSERT_OK(s); + ASSERT_OK(db->Delete(write_options, handles[1], "AAAZZZ")); + + // These keys do not conflict with existing writes since they're in + // different column families + s = txn->Delete("AAA"); + ASSERT_OK(s); + s = txn->GetForUpdate(snapshot_read_options, handles[1], "foo", &value); + ASSERT_TRUE(s.IsNotFound()); + Slice key_slice("AAAZZZ"); + Slice value_slices[2] = {Slice("bar"), Slice("bar")}; + s = txn->Put(handles[2], SliceParts(&key_slice, 1), + SliceParts(value_slices, 2)); + ASSERT_OK(s); + ASSERT_EQ(3, txn->GetNumKeys()); + + s = txn->Commit(); + ASSERT_OK(s); + s = db->Get(read_options, "AAA", &value); + ASSERT_TRUE(s.IsNotFound()); + s = db->Get(read_options, handles[2], "AAAZZZ", &value); + ASSERT_EQ(value, "barbar"); + + Slice key_slices[3] = {Slice("AAA"), Slice("ZZ"), Slice("Z")}; + Slice value_slice("barbarbar"); + + s = txn2->Delete(handles[2], "XXX"); + ASSERT_OK(s); + s = txn2->Delete(handles[1], "XXX"); + ASSERT_OK(s); + + // This write will cause a conflict with the earlier batch write + s = txn2->Put(handles[1], SliceParts(key_slices, 3), + SliceParts(&value_slice, 1)); + ASSERT_TRUE(s.IsBusy()); + + s = txn2->Commit(); + ASSERT_OK(s); + // In the above, the latest change to AAAZZZ in handles[1] is a delete. 
+ s = db->Get(read_options, handles[1], "AAAZZZ", &value); + ASSERT_TRUE(s.IsNotFound()); + + delete txn; + delete txn2; + + txn = db->BeginTransaction(write_options, txn_options); + snapshot_read_options.snapshot = txn->GetSnapshot(); + + txn2 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn); + + std::vector<ColumnFamilyHandle*> multiget_cfh = {handles[1], handles[2], + handles[0], handles[2]}; + std::vector<Slice> multiget_keys = {"AAA", "AAAZZZ", "foo", "foo"}; + std::vector<std::string> values(4); + std::vector<Status> results = txn->MultiGetForUpdate( + snapshot_read_options, multiget_cfh, multiget_keys, &values); + ASSERT_OK(results[0]); + ASSERT_OK(results[1]); + ASSERT_OK(results[2]); + ASSERT_TRUE(results[3].IsNotFound()); + ASSERT_EQ(values[0], "bar"); + ASSERT_EQ(values[1], "barbar"); + ASSERT_EQ(values[2], "foo"); + + s = txn->SingleDelete(handles[2], "ZZZ"); + ASSERT_OK(s); + s = txn->Put(handles[2], "ZZZ", "YYY"); + ASSERT_OK(s); + s = txn->Put(handles[2], "ZZZ", "YYYY"); + ASSERT_OK(s); + s = txn->Delete(handles[2], "ZZZ"); + ASSERT_OK(s); + s = txn->Put(handles[2], "AAAZZZ", "barbarbar"); + ASSERT_OK(s); + + ASSERT_EQ(5, txn->GetNumKeys()); + + // Txn should commit + s = txn->Commit(); + ASSERT_OK(s); + s = db->Get(read_options, handles[2], "ZZZ", &value); + ASSERT_TRUE(s.IsNotFound()); + + // Put a key which will conflict with the next txn using the previous snapshot + ASSERT_OK(db->Put(write_options, handles[2], "foo", "000")); + + results = txn2->MultiGetForUpdate(snapshot_read_options, multiget_cfh, + multiget_keys, &values); + // All results should fail since there was a conflict + ASSERT_TRUE(results[0].IsBusy()); + ASSERT_TRUE(results[1].IsBusy()); + ASSERT_TRUE(results[2].IsBusy()); + ASSERT_TRUE(results[3].IsBusy()); + + s = db->Get(read_options, handles[2], "foo", &value); + ASSERT_EQ(value, "000"); + + s = txn2->Commit(); + ASSERT_OK(s); + + s = db->DropColumnFamily(handles[1]); + ASSERT_OK(s); + s = db->DropColumnFamily(handles[2]); + ASSERT_OK(s); + + delete txn; + delete txn2; + + for (auto handle : handles) { + delete handle; + } +} + +TEST_P(TransactionTest, MultiGetBatchedTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + TransactionOptions txn_options; + std::string value; + Status s; + + ColumnFamilyHandle* cf; + ColumnFamilyOptions cf_options; + + // Create a new column family + s = db->CreateColumnFamily(cf_options, "CF", &cf); + ASSERT_OK(s); + + delete cf; + delete db; + db = nullptr; + + // open DB with two column families + std::vector<ColumnFamilyDescriptor> column_families; + // have to open default column family + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions())); + // open the new column families + cf_options.merge_operator = MergeOperators::CreateStringAppendOperator(); + column_families.push_back(ColumnFamilyDescriptor("CF", cf_options)); + + std::vector<ColumnFamilyHandle*> handles; + + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + ASSERT_OK(ReOpenNoDelete(column_families, &handles)); + assert(db != nullptr); + + // Write some data to the db + WriteBatch batch; + ASSERT_OK(batch.Put(handles[1], "aaa", "val1")); + ASSERT_OK(batch.Put(handles[1], "bbb", "val2")); + ASSERT_OK(batch.Put(handles[1], "ccc", "val3")); + ASSERT_OK(batch.Put(handles[1], "ddd", "foo")); + ASSERT_OK(batch.Put(handles[1], "eee", "val5")); + ASSERT_OK(batch.Put(handles[1], "fff", "val6")); + ASSERT_OK(batch.Merge(handles[1], "ggg", "foo")); + s = db->Write(write_options, &batch); + ASSERT_OK(s); + + Transaction* txn = 
db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + txn->SetSnapshot(); + snapshot_read_options.snapshot = txn->GetSnapshot(); + + txn_options.set_snapshot = true; + // Write some data to the db + s = txn->Delete(handles[1], "bbb"); + ASSERT_OK(s); + s = txn->Put(handles[1], "ccc", "val3_new"); + ASSERT_OK(s); + s = txn->Merge(handles[1], "ddd", "bar"); + ASSERT_OK(s); + + std::vector<Slice> keys = {"aaa", "bbb", "ccc", "ddd", "eee", "fff", "ggg"}; + std::vector<PinnableSlice> values(keys.size()); + std::vector<Status> statuses(keys.size()); + + txn->MultiGet(snapshot_read_options, handles[1], keys.size(), keys.data(), + values.data(), statuses.data()); + ASSERT_TRUE(statuses[0].ok()); + ASSERT_EQ(values[0], "val1"); + ASSERT_TRUE(statuses[1].IsNotFound()); + ASSERT_TRUE(statuses[2].ok()); + ASSERT_EQ(values[2], "val3_new"); + ASSERT_TRUE(statuses[3].ok()); + ASSERT_EQ(values[3], "foo,bar"); + ASSERT_TRUE(statuses[4].ok()); + ASSERT_EQ(values[4], "val5"); + ASSERT_TRUE(statuses[5].ok()); + ASSERT_EQ(values[5], "val6"); + ASSERT_TRUE(statuses[6].ok()); + ASSERT_EQ(values[6], "foo"); + delete txn; + for (auto handle : handles) { + delete handle; + } +} + +// This test calls WriteBatchWithIndex::MultiGetFromBatchAndDB with a large +// number of keys, i.e. greater than MultiGetContext::MAX_BATCH_SIZE, which is +// 32. This forces autovector allocations in the MultiGet code paths +// to use std::vector in addition to stack allocations. The MultiGet keys +// include Merges, which are handled specially in MultiGetFromBatchAndDB by +// allocating an autovector of MergeContexts. +TEST_P(TransactionTest, MultiGetLargeBatchedTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + std::string value; + Status s; + + ColumnFamilyHandle* cf; + ColumnFamilyOptions cf_options; + + std::vector<std::string> key_str; + for (int i = 0; i < 100; ++i) { + key_str.emplace_back(std::to_string(i)); + } + // Create a new column family + s = db->CreateColumnFamily(cf_options, "CF", &cf); + ASSERT_OK(s); + + delete cf; + delete db; + db = nullptr; + + // open DB with two column families + std::vector<ColumnFamilyDescriptor> column_families; + // have to open default column family + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions())); + // open the new column families + cf_options.merge_operator = MergeOperators::CreateStringAppendOperator(); + column_families.push_back(ColumnFamilyDescriptor("CF", cf_options)); + + std::vector<ColumnFamilyHandle*> handles; + + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + ASSERT_OK(ReOpenNoDelete(column_families, &handles)); + assert(db != nullptr); + + // Write some data to the db + WriteBatch batch; + for (int i = 0; i < 3 * MultiGetContext::MAX_BATCH_SIZE; ++i) { + std::string val = "val" + std::to_string(i); + ASSERT_OK(batch.Put(handles[1], key_str[i], val)); + } + s = db->Write(write_options, &batch); + ASSERT_OK(s); + + WriteBatchWithIndex wb; + // Write some data to the write batch + s = wb.Delete(handles[1], std::to_string(1)); + ASSERT_OK(s); + s = wb.Put(handles[1], std::to_string(2), "new_val" + std::to_string(2)); + ASSERT_OK(s); + // Write a lot of merges so when we call MultiGetFromBatchAndDB later on, + // it is forced to use std::vector in ROCKSDB_NAMESPACE::autovector to + // allocate MergeContexts. 
The number of merges needs to be > + // MultiGetContext::MAX_BATCH_SIZE + for (int i = 8; i < MultiGetContext::MAX_BATCH_SIZE + 24; ++i) { + s = wb.Merge(handles[1], std::to_string(i), "merge"); + ASSERT_OK(s); + } + + // MultiGet a lot of keys in order to force std::vector reallocations + std::vector<Slice> keys; + for (int i = 0; i < MultiGetContext::MAX_BATCH_SIZE + 32; ++i) { + keys.emplace_back(key_str[i]); + } + std::vector<PinnableSlice> values(keys.size()); + std::vector<Status> statuses(keys.size()); + + wb.MultiGetFromBatchAndDB(db, snapshot_read_options, handles[1], keys.size(), + keys.data(), values.data(), statuses.data(), false); + for (size_t i = 0; i < keys.size(); ++i) { + if (i == 1) { + ASSERT_TRUE(statuses[1].IsNotFound()); + } else if (i == 2) { + ASSERT_TRUE(statuses[2].ok()); + ASSERT_EQ(values[2], "new_val" + std::to_string(2)); + } else if (i >= 8 && i < 56) { + ASSERT_TRUE(statuses[i].ok()); + ASSERT_EQ(values[i], "val" + std::to_string(i) + ",merge"); + } else { + ASSERT_TRUE(statuses[i].ok()); + if (values[i] != "val" + std::to_string(i)) { + ASSERT_EQ(values[i], "val" + std::to_string(i)); + } + } + } + + for (auto handle : handles) { + delete handle; + } +} + +TEST_P(TransactionTest, MultiGetSnapshot) { + WriteOptions write_options; + TransactionOptions transaction_options; + Transaction* txn1 = db->BeginTransaction(write_options, transaction_options); + + Slice key = "foo"; + + Status s = txn1->Put(key, "bar"); + ASSERT_OK(s); + + s = txn1->SetName("test"); + ASSERT_OK(s); + + s = txn1->Prepare(); + ASSERT_OK(s); + + // Get snapshot between prepare and commit + // Un-committed data should be invisible to other transactions + const Snapshot* s1 = db->GetSnapshot(); + + s = txn1->Commit(); + ASSERT_OK(s); + delete txn1; + + Transaction* txn2 = db->BeginTransaction(write_options, transaction_options); + ReadOptions read_options; + read_options.snapshot = s1; + + std::vector<Slice> keys; + std::vector<PinnableSlice> values(1); + std::vector<Status> statuses(1); + keys.push_back(key); + auto cfd = db->DefaultColumnFamily(); + txn2->MultiGet(read_options, cfd, 1, keys.data(), values.data(), + statuses.data()); + ASSERT_TRUE(statuses[0].IsNotFound()); + delete txn2; + + db->ReleaseSnapshot(s1); +} + +TEST_P(TransactionTest, ColumnFamiliesTest2) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + std::string value; + Status s; + + ColumnFamilyHandle *one, *two; + ColumnFamilyOptions cf_options; + + // Create 2 new column families + s = db->CreateColumnFamily(cf_options, "ONE", &one); + ASSERT_OK(s); + s = db->CreateColumnFamily(cf_options, "TWO", &two); + ASSERT_OK(s); + + Transaction* txn1 = db->BeginTransaction(write_options); + ASSERT_TRUE(txn1); + Transaction* txn2 = db->BeginTransaction(write_options); + ASSERT_TRUE(txn2); + + s = txn1->Put(one, "X", "1"); + ASSERT_OK(s); + s = txn1->Put(two, "X", "2"); + ASSERT_OK(s); + s = txn1->Put("X", "0"); + ASSERT_OK(s); + + s = txn2->Put(one, "X", "11"); + ASSERT_TRUE(s.IsTimedOut()); + + s = txn1->Commit(); + ASSERT_OK(s); + + // Drop first column family + s = db->DropColumnFamily(one); + ASSERT_OK(s); + + // txn2's only write timed out and was never added to the transaction, so its + // commit should succeed even though the column family was dropped. 
+ s = txn2->Commit(); + ASSERT_OK(s); + + delete txn1; + txn1 = db->BeginTransaction(write_options); + ASSERT_TRUE(txn1); + + // Should fail since column family was dropped + s = txn1->Put(one, "X", "111"); + ASSERT_TRUE(s.IsInvalidArgument()); + + s = txn1->Put(two, "X", "222"); + ASSERT_OK(s); + + s = txn1->Put("X", "000"); + ASSERT_OK(s); + + s = txn1->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, two, "X", &value); + ASSERT_OK(s); + ASSERT_EQ("222", value); + + s = db->Get(read_options, "X", &value); + ASSERT_OK(s); + ASSERT_EQ("000", value); + + s = db->DropColumnFamily(two); + ASSERT_OK(s); + + delete txn1; + delete txn2; + + delete one; + delete two; +} + +TEST_P(TransactionTest, EmptyTest) { + WriteOptions write_options; + ReadOptions read_options; + std::string value; + Status s; + + s = db->Put(write_options, "aaa", "aaa"); + ASSERT_OK(s); + + Transaction* txn = db->BeginTransaction(write_options); + s = txn->Commit(); + ASSERT_OK(s); + delete txn; + + txn = db->BeginTransaction(write_options); + ASSERT_OK(txn->Rollback()); + delete txn; + + txn = db->BeginTransaction(write_options); + s = txn->GetForUpdate(read_options, "aaa", &value); + ASSERT_EQ(value, "aaa"); + + s = txn->Commit(); + ASSERT_OK(s); + delete txn; + + txn = db->BeginTransaction(write_options); + txn->SetSnapshot(); + + s = txn->GetForUpdate(read_options, "aaa", &value); + ASSERT_EQ(value, "aaa"); + + // Conflicts with previous GetForUpdate + s = db->Put(write_options, "aaa", "xxx"); + ASSERT_TRUE(s.IsTimedOut()); + + // The conflicting db write above timed out without changing "aaa", so the + // transaction can still commit. + s = txn->Commit(); + ASSERT_OK(s); + delete txn; +} + +TEST_P(TransactionTest, PredicateManyPreceders) { + WriteOptions write_options; + ReadOptions read_options1, read_options2; + TransactionOptions txn_options; + std::string value; + Status s; + + txn_options.set_snapshot = true; + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + read_options1.snapshot = txn1->GetSnapshot(); + + Transaction* txn2 = db->BeginTransaction(write_options); + txn2->SetSnapshot(); + read_options2.snapshot = txn2->GetSnapshot(); + + std::vector<Slice> multiget_keys = {"1", "2", "3"}; + std::vector<std::string> multiget_values; + + std::vector<Status> results = + txn1->MultiGetForUpdate(read_options1, multiget_keys, &multiget_values); + ASSERT_EQ(results.size(), 3); + ASSERT_TRUE(results[0].IsNotFound()); + ASSERT_TRUE(results[1].IsNotFound()); + ASSERT_TRUE(results[2].IsNotFound()); + + s = txn2->Put("2", "x"); // Conflicts with txn1's MultiGetForUpdate + ASSERT_TRUE(s.IsTimedOut()); + + ASSERT_OK(txn2->Rollback()); + + multiget_values.clear(); + results = + txn1->MultiGetForUpdate(read_options1, multiget_keys, &multiget_values); + ASSERT_EQ(results.size(), 3); + ASSERT_TRUE(results[0].IsNotFound()); + ASSERT_TRUE(results[1].IsNotFound()); + ASSERT_TRUE(results[2].IsNotFound()); + + s = txn1->Commit(); + ASSERT_OK(s); + + delete txn1; + delete txn2; + + txn1 = db->BeginTransaction(write_options, txn_options); + read_options1.snapshot = txn1->GetSnapshot(); + + txn2 = db->BeginTransaction(write_options, txn_options); + read_options2.snapshot = txn2->GetSnapshot(); + + s = txn1->Put("4", "x"); + ASSERT_OK(s); + + s = txn2->Delete("4"); // conflict + ASSERT_TRUE(s.IsTimedOut()); + + s = txn1->Commit(); + ASSERT_OK(s); + + s = txn2->GetForUpdate(read_options2, "4", &value); + ASSERT_TRUE(s.IsBusy()); + + ASSERT_OK(txn2->Rollback()); + + delete txn1; + delete txn2; +} + +TEST_P(TransactionTest, LostUpdate) { + WriteOptions write_options; + ReadOptions read_options, read_options1, read_options2; 
+ TransactionOptions txn_options; + std::string value; + Status s; + + // Test 2 transactions writing to the same key in multiple orders and + // with/without snapshots + + Transaction* txn1 = db->BeginTransaction(write_options); + Transaction* txn2 = db->BeginTransaction(write_options); + + s = txn1->Put("1", "1"); + ASSERT_OK(s); + + s = txn2->Put("1", "2"); // conflict + ASSERT_TRUE(s.IsTimedOut()); + + s = txn2->Commit(); + ASSERT_OK(s); + + s = txn1->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "1", &value); + ASSERT_OK(s); + ASSERT_EQ("1", value); + + delete txn1; + delete txn2; + + txn_options.set_snapshot = true; + txn1 = db->BeginTransaction(write_options, txn_options); + read_options1.snapshot = txn1->GetSnapshot(); + + txn2 = db->BeginTransaction(write_options, txn_options); + read_options2.snapshot = txn2->GetSnapshot(); + + s = txn1->Put("1", "3"); + ASSERT_OK(s); + s = txn2->Put("1", "4"); // conflict + ASSERT_TRUE(s.IsTimedOut()); + + s = txn1->Commit(); + ASSERT_OK(s); + + s = txn2->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "1", &value); + ASSERT_OK(s); + ASSERT_EQ("3", value); + + delete txn1; + delete txn2; + + txn1 = db->BeginTransaction(write_options, txn_options); + read_options1.snapshot = txn1->GetSnapshot(); + + txn2 = db->BeginTransaction(write_options, txn_options); + read_options2.snapshot = txn2->GetSnapshot(); + + s = txn1->Put("1", "5"); + ASSERT_OK(s); + + s = txn1->Commit(); + ASSERT_OK(s); + + s = txn2->Put("1", "6"); + ASSERT_TRUE(s.IsBusy()); + s = txn2->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "1", &value); + ASSERT_OK(s); + ASSERT_EQ("5", value); + + delete txn1; + delete txn2; + + txn1 = db->BeginTransaction(write_options, txn_options); + read_options1.snapshot = txn1->GetSnapshot(); + + txn2 = db->BeginTransaction(write_options, txn_options); + read_options2.snapshot = txn2->GetSnapshot(); + + s = txn1->Put("1", "7"); + ASSERT_OK(s); + s = txn1->Commit(); + ASSERT_OK(s); + + txn2->SetSnapshot(); + s = txn2->Put("1", "8"); + ASSERT_OK(s); + s = txn2->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "1", &value); + ASSERT_OK(s); + ASSERT_EQ("8", value); + + delete txn1; + delete txn2; + + txn1 = db->BeginTransaction(write_options); + txn2 = db->BeginTransaction(write_options); + + s = txn1->Put("1", "9"); + ASSERT_OK(s); + s = txn1->Commit(); + ASSERT_OK(s); + + s = txn2->Put("1", "10"); + ASSERT_OK(s); + s = txn2->Commit(); + ASSERT_OK(s); + + delete txn1; + delete txn2; + + s = db->Get(read_options, "1", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "10"); +} + +TEST_P(TransactionTest, UntrackedWrites) { + if (txn_db_options.write_policy == WRITE_UNPREPARED) { + // TODO(lth): For WriteUnprepared, validate that untracked writes are + // not supported. + return; + } + + WriteOptions write_options; + ReadOptions read_options; + std::string value; + Status s; + + // Verify transaction rollback works for untracked keys. 
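+ // (Untracked writes skip conflict checking, but they still go into the + // transaction's write batch, so a Rollback must discard them as well.)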
+ Transaction* txn = db->BeginTransaction(write_options); + txn->SetSnapshot(); + + s = txn->PutUntracked("untracked", "0"); + ASSERT_OK(s); + ASSERT_OK(txn->Rollback()); + s = db->Get(read_options, "untracked", &value); + ASSERT_TRUE(s.IsNotFound()); + + delete txn; + txn = db->BeginTransaction(write_options); + txn->SetSnapshot(); + + s = db->Put(write_options, "untracked", "x"); + ASSERT_OK(s); + + // Untracked writes should succeed even though key was written after snapshot + s = txn->PutUntracked("untracked", "1"); + ASSERT_OK(s); + s = txn->MergeUntracked("untracked", "2"); + ASSERT_OK(s); + s = txn->DeleteUntracked("untracked"); + ASSERT_OK(s); + + // Conflict + s = txn->Put("untracked", "3"); + ASSERT_TRUE(s.IsBusy()); + + s = txn->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "untracked", &value); + ASSERT_TRUE(s.IsNotFound()); + + delete txn; +} + +TEST_P(TransactionTest, ExpiredTransaction) { + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + std::string value; + Status s; + + // Set txn expiration timeout to 0 microseconds (expires instantly) + txn_options.expiration = 0; + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + + s = txn1->Put("X", "1"); + ASSERT_OK(s); + + s = txn1->Put("Y", "1"); + ASSERT_OK(s); + + Transaction* txn2 = db->BeginTransaction(write_options); + + // txn2 should be able to write to X since txn1 has expired + s = txn2->Put("X", "2"); + ASSERT_OK(s); + + s = txn2->Commit(); + ASSERT_OK(s); + s = db->Get(read_options, "X", &value); + ASSERT_OK(s); + ASSERT_EQ("2", value); + + s = txn1->Put("Z", "1"); + ASSERT_OK(s); + + // txn1 should fail to commit since it is expired + s = txn1->Commit(); + ASSERT_TRUE(s.IsExpired()); + + s = db->Get(read_options, "Y", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = db->Get(read_options, "Z", &value); + ASSERT_TRUE(s.IsNotFound()); + + delete txn1; + delete txn2; +} + +TEST_P(TransactionTest, ReinitializeTest) { + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + std::string value; + Status s; + + // Set txn expiration timeout to 0 microseconds (expires instantly) + txn_options.expiration = 0; + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + + // Reinitialize transaction to no long expire + txn_options.expiration = -1; + txn1 = db->BeginTransaction(write_options, txn_options, txn1); + + s = txn1->Put("Z", "z"); + ASSERT_OK(s); + + // Should commit since not expired + s = txn1->Commit(); + ASSERT_OK(s); + + txn1 = db->BeginTransaction(write_options, txn_options, txn1); + + s = txn1->Put("Z", "zz"); + ASSERT_OK(s); + + // Reinitilize txn1 and verify that Z gets unlocked + txn1 = db->BeginTransaction(write_options, txn_options, txn1); + + Transaction* txn2 = db->BeginTransaction(write_options, txn_options, nullptr); + s = txn2->Put("Z", "zzz"); + ASSERT_OK(s); + s = txn2->Commit(); + ASSERT_OK(s); + delete txn2; + + s = db->Get(read_options, "Z", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "zzz"); + + // Verify snapshots get reinitialized correctly + txn1->SetSnapshot(); + s = txn1->Put("Z", "zzzz"); + ASSERT_OK(s); + + s = txn1->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "Z", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "zzzz"); + + txn1 = db->BeginTransaction(write_options, txn_options, txn1); + const Snapshot* snapshot = txn1->GetSnapshot(); + ASSERT_FALSE(snapshot); + + txn_options.set_snapshot = true; + txn1 = db->BeginTransaction(write_options, txn_options, txn1); + 
snapshot = txn1->GetSnapshot(); + ASSERT_TRUE(snapshot); + + s = txn1->Put("Z", "a"); + ASSERT_OK(s); + + ASSERT_OK(txn1->Rollback()); + + s = txn1->Put("Y", "y"); + ASSERT_OK(s); + + txn_options.set_snapshot = false; + txn1 = db->BeginTransaction(write_options, txn_options, txn1); + snapshot = txn1->GetSnapshot(); + ASSERT_FALSE(snapshot); + + s = txn1->Put("X", "x"); + ASSERT_OK(s); + + s = txn1->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "Z", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "zzzz"); + + s = db->Get(read_options, "Y", &value); + ASSERT_TRUE(s.IsNotFound()); + + txn1 = db->BeginTransaction(write_options, txn_options, txn1); + + s = txn1->SetName("name"); + ASSERT_OK(s); + + s = txn1->Prepare(); + ASSERT_OK(s); + s = txn1->Commit(); + ASSERT_OK(s); + + txn1 = db->BeginTransaction(write_options, txn_options, txn1); + + s = txn1->SetName("name"); + ASSERT_OK(s); + + delete txn1; +} + +TEST_P(TransactionTest, Rollback) { + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + std::string value; + Status s; + + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + + ASSERT_OK(s); + + s = txn1->Put("X", "1"); + ASSERT_OK(s); + + Transaction* txn2 = db->BeginTransaction(write_options); + + // txn2 should not be able to write to X since txn1 has it locked + s = txn2->Put("X", "2"); + ASSERT_TRUE(s.IsTimedOut()); + + ASSERT_OK(txn1->Rollback()); + delete txn1; + + // txn2 should now be able to write to X + s = txn2->Put("X", "3"); + ASSERT_OK(s); + + s = txn2->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "X", &value); + ASSERT_OK(s); + ASSERT_EQ("3", value); + + delete txn2; +} + +TEST_P(TransactionTest, LockLimitTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + TransactionOptions txn_options; + std::string value; + Status s; + + delete db; + db = nullptr; + + // Open DB with a lock limit of 3 + txn_db_options.max_num_locks = 3; + ASSERT_OK(ReOpen()); + assert(db != nullptr); + ASSERT_OK(s); + + // Create a txn and verify we can only lock up to 3 keys + Transaction* txn = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn); + + s = txn->Put("X", "x"); + ASSERT_OK(s); + + s = txn->Put("Y", "y"); + ASSERT_OK(s); + + s = txn->Put("Z", "z"); + ASSERT_OK(s); + + // lock limit reached + s = txn->Put("W", "w"); + ASSERT_TRUE(s.IsBusy()); + + // re-locking same key shouldn't put us over the limit + s = txn->Put("X", "xx"); + ASSERT_OK(s); + + s = txn->GetForUpdate(read_options, "W", &value); + ASSERT_TRUE(s.IsBusy()); + s = txn->GetForUpdate(read_options, "V", &value); + ASSERT_TRUE(s.IsBusy()); + + // re-locking same key shouldn't put us over the limit + s = txn->GetForUpdate(read_options, "Y", &value); + ASSERT_OK(s); + ASSERT_EQ("y", value); + + s = txn->Get(read_options, "W", &value); + ASSERT_TRUE(s.IsNotFound()); + + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn2); + + // "X" currently locked + s = txn2->Put("X", "x"); + ASSERT_TRUE(s.IsTimedOut()); + + // lock limit reached + s = txn2->Put("M", "m"); + ASSERT_TRUE(s.IsBusy()); + + s = txn->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "X", &value); + ASSERT_OK(s); + ASSERT_EQ("xx", value); + + s = db->Get(read_options, "W", &value); + ASSERT_TRUE(s.IsNotFound()); + + // Committing txn should release its locks and allow txn2 to proceed + s = txn2->Put("X", "x2"); + ASSERT_OK(s); + + s = txn2->Delete("X"); + ASSERT_OK(s); + + s = txn2->Put("M", "m"); + 
ASSERT_OK(s); + + s = txn2->Put("Z", "z2"); + ASSERT_OK(s); + + // lock limit reached + s = txn2->Delete("Y"); + ASSERT_TRUE(s.IsBusy()); + + s = txn2->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "Z", &value); + ASSERT_OK(s); + ASSERT_EQ("z2", value); + + s = db->Get(read_options, "Y", &value); + ASSERT_OK(s); + ASSERT_EQ("y", value); + + s = db->Get(read_options, "X", &value); + ASSERT_TRUE(s.IsNotFound()); + + delete txn; + delete txn2; +} + +TEST_P(TransactionTest, IteratorTest) { + // This test does writes without snapshot validation, and then tries to create + // iterator later, which is unsupported in write unprepared. + if (txn_db_options.write_policy == WRITE_UNPREPARED) { + return; + } + + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + std::string value; + Status s; + + // Write some keys to the db + s = db->Put(write_options, "A", "a"); + ASSERT_OK(s); + + s = db->Put(write_options, "G", "g"); + ASSERT_OK(s); + + s = db->Put(write_options, "F", "f"); + ASSERT_OK(s); + + s = db->Put(write_options, "C", "c"); + ASSERT_OK(s); + + s = db->Put(write_options, "D", "d"); + ASSERT_OK(s); + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + // Write some keys in a txn + s = txn->Put("B", "b"); + ASSERT_OK(s); + + s = txn->Put("H", "h"); + ASSERT_OK(s); + + s = txn->Delete("D"); + ASSERT_OK(s); + + s = txn->Put("E", "e"); + ASSERT_OK(s); + + txn->SetSnapshot(); + const Snapshot* snapshot = txn->GetSnapshot(); + + // Write some keys to the db after the snapshot + s = db->Put(write_options, "BB", "xx"); + ASSERT_OK(s); + + s = db->Put(write_options, "C", "xx"); + ASSERT_OK(s); + + read_options.snapshot = snapshot; + Iterator* iter = txn->GetIterator(read_options); + ASSERT_OK(iter->status()); + iter->SeekToFirst(); + + // Read all keys via iter and lock them all + std::string results[] = {"a", "b", "c", "e", "f", "g", "h"}; + for (int i = 0; i < 7; i++) { + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(results[i], iter->value().ToString()); + + s = txn->GetForUpdate(read_options, iter->key(), nullptr); + if (i == 2) { + // "C" was modified after txn's snapshot + ASSERT_TRUE(s.IsBusy()); + } else { + ASSERT_OK(s); + } + + iter->Next(); + } + ASSERT_FALSE(iter->Valid()); + + iter->Seek("G"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("g", iter->value().ToString()); + + iter->Prev(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("f", iter->value().ToString()); + + iter->Seek("D"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("e", iter->value().ToString()); + + iter->Seek("C"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("c", iter->value().ToString()); + + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("e", iter->value().ToString()); + + iter->Seek(""); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a", iter->value().ToString()); + + iter->Seek("X"); + ASSERT_OK(iter->status()); + ASSERT_FALSE(iter->Valid()); + + iter->SeekToLast(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("h", iter->value().ToString()); + + s = txn->Commit(); + ASSERT_OK(s); + + delete iter; + delete txn; +} + +TEST_P(TransactionTest, DisableIndexingTest) { + // Skip this test for write unprepared. 
It does not solely rely on WBWI for + // read your own writes, so depending on whether batches are flushed or not, + // only some writes will be visible. + // + // Also, write unprepared does not support creating iterators if there has + // been txn->Put() without snapshot validation. + if (txn_db_options.write_policy == WRITE_UNPREPARED) { + return; + } + + WriteOptions write_options; + ReadOptions read_options; + std::string value; + Status s; + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + s = txn->Put("A", "a"); + ASSERT_OK(s); + + s = txn->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("a", value); + + txn->DisableIndexing(); + + s = txn->Put("B", "b"); + ASSERT_OK(s); + + s = txn->Get(read_options, "B", &value); + ASSERT_TRUE(s.IsNotFound()); + + Iterator* iter = txn->GetIterator(read_options); + ASSERT_OK(iter->status()); + + iter->Seek("B"); + ASSERT_OK(iter->status()); + ASSERT_FALSE(iter->Valid()); + + s = txn->Delete("A"); + + s = txn->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("a", value); + + txn->EnableIndexing(); + + s = txn->Put("B", "bb"); + ASSERT_OK(s); + + iter->Seek("B"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bb", iter->value().ToString()); + + s = txn->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_EQ("bb", value); + + s = txn->Put("A", "aa"); + ASSERT_OK(s); + + s = txn->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("aa", value); + + delete iter; + delete txn; +} + +TEST_P(TransactionTest, SavepointTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + std::string value; + Status s; + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + ASSERT_EQ(0, txn->GetNumPuts()); + + s = txn->RollbackToSavePoint(); + ASSERT_TRUE(s.IsNotFound()); + + txn->SetSavePoint(); // 1 + + ASSERT_OK(txn->RollbackToSavePoint()); // Rollback to beginning of txn + s = txn->RollbackToSavePoint(); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Put("B", "b"); + ASSERT_OK(s); + + ASSERT_EQ(1, txn->GetNumPuts()); + ASSERT_EQ(0, txn->GetNumDeletes()); + + s = txn->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_EQ("b", value); + + delete txn; + txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + s = txn->Put("A", "a"); + ASSERT_OK(s); + + s = txn->Put("B", "bb"); + ASSERT_OK(s); + + s = txn->Put("C", "c"); + ASSERT_OK(s); + + txn->SetSavePoint(); // 2 + + s = txn->Delete("B"); + ASSERT_OK(s); + + s = txn->Put("C", "cc"); + ASSERT_OK(s); + + s = txn->Put("D", "d"); + ASSERT_OK(s); + + ASSERT_EQ(5, txn->GetNumPuts()); + ASSERT_EQ(1, txn->GetNumDeletes()); + + ASSERT_OK(txn->RollbackToSavePoint()); // Rollback to 2 + + ASSERT_EQ(3, txn->GetNumPuts()); + ASSERT_EQ(0, txn->GetNumDeletes()); + + s = txn->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("a", value); + + s = txn->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_EQ("bb", value); + + s = txn->Get(read_options, "C", &value); + ASSERT_OK(s); + ASSERT_EQ("c", value); + + s = txn->Get(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Put("A", "a"); + ASSERT_OK(s); + + s = txn->Put("E", "e"); + ASSERT_OK(s); + + ASSERT_EQ(5, txn->GetNumPuts()); + ASSERT_EQ(0, txn->GetNumDeletes()); + + // Rollback to beginning of txn + s = txn->RollbackToSavePoint(); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_OK(txn->Rollback()); + + ASSERT_EQ(0, txn->GetNumPuts()); + ASSERT_EQ(0, 
txn->GetNumDeletes()); + + s = txn->Get(read_options, "A", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_EQ("b", value); + + s = txn->Get(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Get(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Get(read_options, "E", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Put("A", "aa"); + ASSERT_OK(s); + + s = txn->Put("F", "f"); + ASSERT_OK(s); + + ASSERT_EQ(2, txn->GetNumPuts()); + ASSERT_EQ(0, txn->GetNumDeletes()); + + txn->SetSavePoint(); // 3 + txn->SetSavePoint(); // 4 + + s = txn->Put("G", "g"); + ASSERT_OK(s); + + s = txn->SingleDelete("F"); + ASSERT_OK(s); + + s = txn->Delete("B"); + ASSERT_OK(s); + + s = txn->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("aa", value); + + s = txn->Get(read_options, "F", &value); + // According to db.h, doing a SingleDelete on a key that has been + // overwritten will have undefinied behavior. So it is unclear what the + // result of fetching "F" should be. The current implementation will + // return NotFound in this case. + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Get(read_options, "B", &value); + ASSERT_TRUE(s.IsNotFound()); + + ASSERT_EQ(3, txn->GetNumPuts()); + ASSERT_EQ(2, txn->GetNumDeletes()); + + ASSERT_OK(txn->RollbackToSavePoint()); // Rollback to 3 + + ASSERT_EQ(2, txn->GetNumPuts()); + ASSERT_EQ(0, txn->GetNumDeletes()); + + s = txn->Get(read_options, "F", &value); + ASSERT_OK(s); + ASSERT_EQ("f", value); + + s = txn->Get(read_options, "G", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "F", &value); + ASSERT_OK(s); + ASSERT_EQ("f", value); + + s = db->Get(read_options, "G", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = db->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("aa", value); + + s = db->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_EQ("b", value); + + s = db->Get(read_options, "C", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = db->Get(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = db->Get(read_options, "E", &value); + ASSERT_TRUE(s.IsNotFound()); + + delete txn; +} + +TEST_P(TransactionTest, SavepointTest2) { + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + Status s; + + txn_options.lock_timeout = 1; // 1 ms + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn1); + + s = txn1->Put("A", ""); + ASSERT_OK(s); + + txn1->SetSavePoint(); // 1 + + s = txn1->Put("A", "a"); + ASSERT_OK(s); + + s = txn1->Put("C", "c"); + ASSERT_OK(s); + + txn1->SetSavePoint(); // 2 + + s = txn1->Put("A", "a"); + ASSERT_OK(s); + s = txn1->Put("B", "b"); + ASSERT_OK(s); + + ASSERT_OK(txn1->RollbackToSavePoint()); // Rollback to 2 + + // Verify that "A" and "C" is still locked while "B" is not + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn2); + + s = txn2->Put("A", "a2"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("C", "c2"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("B", "b2"); + ASSERT_OK(s); + + s = txn1->Put("A", "aa"); + ASSERT_OK(s); + s = txn1->Put("B", "bb"); + ASSERT_TRUE(s.IsTimedOut()); + + s = txn2->Commit(); + ASSERT_OK(s); + delete txn2; + + s = txn1->Put("A", "aaa"); + ASSERT_OK(s); + s = txn1->Put("B", "bbb"); + ASSERT_OK(s); + s = txn1->Put("C", "ccc"); + ASSERT_OK(s); + + txn1->SetSavePoint(); // 3 + 
ASSERT_OK(txn1->RollbackToSavePoint()); // Rollback to 3 + + // Verify that "A", "B", "C" are still locked + txn2 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn2); + + s = txn2->Put("A", "a2"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("B", "b2"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("C", "c2"); + ASSERT_TRUE(s.IsTimedOut()); + + ASSERT_OK(txn1->RollbackToSavePoint()); // Rollback to 1 + + // Verify that only "A" is locked + s = txn2->Put("A", "a3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("B", "b3"); + ASSERT_OK(s); + s = txn2->Put("C", "c3po"); + ASSERT_OK(s); + + s = txn1->Commit(); + ASSERT_OK(s); + delete txn1; + + // Verify "A" "C" "B" are no longer locked + s = txn2->Put("A", "a4"); + ASSERT_OK(s); + s = txn2->Put("B", "b4"); + ASSERT_OK(s); + s = txn2->Put("C", "c4"); + ASSERT_OK(s); + + s = txn2->Commit(); + ASSERT_OK(s); + delete txn2; +} + +TEST_P(TransactionTest, SavepointTest3) { + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + Status s; + + txn_options.lock_timeout = 1; // 1 ms + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn1); + + s = txn1->PopSavePoint(); // No SavePoint present + ASSERT_TRUE(s.IsNotFound()); + + s = txn1->Put("A", ""); + ASSERT_OK(s); + + s = txn1->PopSavePoint(); // Still no SavePoint present + ASSERT_TRUE(s.IsNotFound()); + + txn1->SetSavePoint(); // 1 + + s = txn1->Put("A", "a"); + ASSERT_OK(s); + + s = txn1->PopSavePoint(); // Remove 1 + ASSERT_TRUE(txn1->RollbackToSavePoint().IsNotFound()); + + // Verify that "A" is still locked + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn2); + + s = txn2->Put("A", "a2"); + ASSERT_TRUE(s.IsTimedOut()); + delete txn2; + + txn1->SetSavePoint(); // 2 + + s = txn1->Put("B", "b"); + ASSERT_OK(s); + + txn1->SetSavePoint(); // 3 + + s = txn1->Put("B", "b2"); + ASSERT_OK(s); + + ASSERT_OK(txn1->RollbackToSavePoint()); // Roll back to 2 + + s = txn1->PopSavePoint(); + ASSERT_OK(s); + + s = txn1->PopSavePoint(); + ASSERT_TRUE(s.IsNotFound()); + + s = txn1->Commit(); + ASSERT_OK(s); + delete txn1; + + std::string value; + + // tnx1 should have modified "A" to "a" + s = db->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("a", value); + + // tnx1 should have set "B" to just "b" + s = db->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_EQ("b", value); + + s = db->Get(read_options, "C", &value); + ASSERT_TRUE(s.IsNotFound()); +} + +TEST_P(TransactionTest, SavepointTest4) { + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + Status s; + + txn_options.lock_timeout = 1; // 1 ms + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn1); + + txn1->SetSavePoint(); // 1 + s = txn1->Put("A", "a"); + ASSERT_OK(s); + + txn1->SetSavePoint(); // 2 + s = txn1->Put("B", "b"); + ASSERT_OK(s); + + s = txn1->PopSavePoint(); // Remove 2 + ASSERT_OK(s); + + // Verify that A/B still exists. + std::string value; + ASSERT_OK(txn1->Get(read_options, "A", &value)); + ASSERT_EQ("a", value); + + ASSERT_OK(txn1->Get(read_options, "B", &value)); + ASSERT_EQ("b", value); + + ASSERT_OK(txn1->RollbackToSavePoint()); // Rollback to 1 + + // Verify that everything was rolled back. 
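+ // (PopSavePoint discarded savepoint 2 without rolling anything back, so the + // rollback above went all the way back to savepoint 1 and undid both Puts.)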
+ s = txn1->Get(read_options, "A", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn1->Get(read_options, "B", &value); + ASSERT_TRUE(s.IsNotFound()); + + // Nothing should be locked + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn2); + + s = txn2->Put("A", ""); + ASSERT_OK(s); + + s = txn2->Put("B", ""); + ASSERT_OK(s); + + delete txn2; + delete txn1; +} + +TEST_P(TransactionTest, UndoGetForUpdateTest) { + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + std::string value; + Status s; + + txn_options.lock_timeout = 1; // 1 ms + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn1); + + txn1->UndoGetForUpdate("A"); + + s = txn1->Commit(); + ASSERT_OK(s); + delete txn1; + + txn1 = db->BeginTransaction(write_options, txn_options); + + txn1->UndoGetForUpdate("A"); + s = txn1->GetForUpdate(read_options, "A", &value); + ASSERT_TRUE(s.IsNotFound()); + + // Verify that A is locked + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + s = txn2->Put("A", "a"); + ASSERT_TRUE(s.IsTimedOut()); + + txn1->UndoGetForUpdate("A"); + + // Verify that A is now unlocked + s = txn2->Put("A", "a2"); + ASSERT_OK(s); + ASSERT_OK(txn2->Commit()); + delete txn2; + s = db->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("a2", value); + + s = txn1->Delete("A"); + ASSERT_OK(s); + s = txn1->GetForUpdate(read_options, "A", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn1->Put("B", "b3"); + ASSERT_OK(s); + s = txn1->GetForUpdate(read_options, "B", &value); + ASSERT_OK(s); + + txn1->UndoGetForUpdate("A"); + txn1->UndoGetForUpdate("B"); + + // Verify that A and B are still locked + txn2 = db->BeginTransaction(write_options, txn_options); + s = txn2->Put("A", "a4"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("B", "b4"); + ASSERT_TRUE(s.IsTimedOut()); + + ASSERT_OK(txn1->Rollback()); + delete txn1; + + // Verify that A and B are no longer locked + s = txn2->Put("A", "a5"); + ASSERT_OK(s); + s = txn2->Put("B", "b5"); + ASSERT_OK(s); + s = txn2->Commit(); + delete txn2; + ASSERT_OK(s); + + txn1 = db->BeginTransaction(write_options, txn_options); + + s = txn1->GetForUpdate(read_options, "A", &value); + ASSERT_OK(s); + s = txn1->GetForUpdate(read_options, "A", &value); + ASSERT_OK(s); + s = txn1->GetForUpdate(read_options, "C", &value); + ASSERT_TRUE(s.IsNotFound()); + s = txn1->GetForUpdate(read_options, "A", &value); + ASSERT_OK(s); + s = txn1->GetForUpdate(read_options, "C", &value); + ASSERT_TRUE(s.IsNotFound()); + s = txn1->GetForUpdate(read_options, "B", &value); + ASSERT_OK(s); + s = txn1->Put("B", "b5"); + s = txn1->GetForUpdate(read_options, "B", &value); + ASSERT_OK(s); + + txn1->UndoGetForUpdate("A"); + txn1->UndoGetForUpdate("B"); + txn1->UndoGetForUpdate("C"); + txn1->UndoGetForUpdate("X"); + + // Verify A,B,C are locked + txn2 = db->BeginTransaction(write_options, txn_options); + s = txn2->Put("A", "a6"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Delete("B"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("C", "c6"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("X", "x6"); + ASSERT_OK(s); + + txn1->UndoGetForUpdate("A"); + txn1->UndoGetForUpdate("B"); + txn1->UndoGetForUpdate("C"); + txn1->UndoGetForUpdate("X"); + + // Verify A,B are locked and C is not + s = txn2->Put("A", "a6"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Delete("B"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("C", "c6"); + ASSERT_OK(s); + s = txn2->Put("X", "x6"); + ASSERT_OK(s); + 
+ txn1->UndoGetForUpdate("A"); + txn1->UndoGetForUpdate("B"); + txn1->UndoGetForUpdate("C"); + txn1->UndoGetForUpdate("X"); + + // Verify B is locked and A and C are not + s = txn2->Put("A", "a7"); + ASSERT_OK(s); + s = txn2->Delete("B"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("C", "c7"); + ASSERT_OK(s); + s = txn2->Put("X", "x7"); + ASSERT_OK(s); + + s = txn2->Commit(); + ASSERT_OK(s); + delete txn2; + + s = txn1->Commit(); + ASSERT_OK(s); + delete txn1; +} + +TEST_P(TransactionTest, UndoGetForUpdateTest2) { + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + std::string value; + Status s; + + s = db->Put(write_options, "A", ""); + ASSERT_OK(s); + + txn_options.lock_timeout = 1; // 1 ms + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn1); + + s = txn1->GetForUpdate(read_options, "A", &value); + ASSERT_OK(s); + s = txn1->GetForUpdate(read_options, "B", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn1->Put("F", "f"); + ASSERT_OK(s); + + txn1->SetSavePoint(); // 1 + + txn1->UndoGetForUpdate("A"); + + s = txn1->GetForUpdate(read_options, "C", &value); + ASSERT_TRUE(s.IsNotFound()); + s = txn1->GetForUpdate(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn1->Put("E", "e"); + ASSERT_OK(s); + s = txn1->GetForUpdate(read_options, "E", &value); + ASSERT_OK(s); + + s = txn1->GetForUpdate(read_options, "F", &value); + ASSERT_OK(s); + + // Verify A,B,C,D,E,F are still locked + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + s = txn2->Put("A", "a1"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("B", "b1"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("C", "c1"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("D", "d1"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("E", "e1"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("F", "f1"); + ASSERT_TRUE(s.IsTimedOut()); + + txn1->UndoGetForUpdate("C"); + txn1->UndoGetForUpdate("E"); + + // Verify A,B,D,E,F are still locked and C is not. + s = txn2->Put("A", "a2"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("B", "b2"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("D", "d2"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("E", "e2"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("F", "f2"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("C", "c2"); + ASSERT_OK(s); + + txn1->SetSavePoint(); // 2 + + s = txn1->Put("H", "h"); + ASSERT_OK(s); + + txn1->UndoGetForUpdate("A"); + txn1->UndoGetForUpdate("B"); + txn1->UndoGetForUpdate("C"); + txn1->UndoGetForUpdate("D"); + txn1->UndoGetForUpdate("E"); + txn1->UndoGetForUpdate("F"); + txn1->UndoGetForUpdate("G"); + txn1->UndoGetForUpdate("H"); + + // Verify A,B,D,E,F,H are still locked and C,G are not. + s = txn2->Put("A", "a3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("B", "b3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("D", "d3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("E", "e3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("F", "f3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("H", "h3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("C", "c3"); + ASSERT_OK(s); + s = txn2->Put("G", "g3"); + ASSERT_OK(s); + + ASSERT_OK(txn1->RollbackToSavePoint()); // rollback to 2 + + // Verify A,B,D,E,F are still locked and C,G,H are not. 
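+ // ("H" was only written after savepoint 2 was set, so the rollback above + // released its lock.)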
+ s = txn2->Put("A", "a3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("B", "b3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("D", "d3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("E", "e3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("F", "f3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("C", "c3"); + ASSERT_OK(s); + s = txn2->Put("G", "g3"); + ASSERT_OK(s); + s = txn2->Put("H", "h3"); + ASSERT_OK(s); + + txn1->UndoGetForUpdate("A"); + txn1->UndoGetForUpdate("B"); + txn1->UndoGetForUpdate("C"); + txn1->UndoGetForUpdate("D"); + txn1->UndoGetForUpdate("E"); + txn1->UndoGetForUpdate("F"); + txn1->UndoGetForUpdate("G"); + txn1->UndoGetForUpdate("H"); + + // Verify A,B,E,F are still locked and C,D,G,H are not. + s = txn2->Put("A", "a3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("B", "b3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("E", "e3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("F", "f3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("C", "c3"); + ASSERT_OK(s); + s = txn2->Put("D", "d3"); + ASSERT_OK(s); + s = txn2->Put("G", "g3"); + ASSERT_OK(s); + s = txn2->Put("H", "h3"); + ASSERT_OK(s); + + ASSERT_OK(txn1->RollbackToSavePoint()); // rollback to 1 + + // Verify A,B,F are still locked and C,D,E,G,H are not. + s = txn2->Put("A", "a3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("B", "b3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("F", "f3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("C", "c3"); + ASSERT_OK(s); + s = txn2->Put("D", "d3"); + ASSERT_OK(s); + s = txn2->Put("E", "e3"); + ASSERT_OK(s); + s = txn2->Put("G", "g3"); + ASSERT_OK(s); + s = txn2->Put("H", "h3"); + ASSERT_OK(s); + + txn1->UndoGetForUpdate("A"); + txn1->UndoGetForUpdate("B"); + txn1->UndoGetForUpdate("C"); + txn1->UndoGetForUpdate("D"); + txn1->UndoGetForUpdate("E"); + txn1->UndoGetForUpdate("F"); + txn1->UndoGetForUpdate("G"); + txn1->UndoGetForUpdate("H"); + + // Verify F is still locked and A,B,C,D,E,G,H are not. + s = txn2->Put("F", "f3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("A", "a3"); + ASSERT_OK(s); + s = txn2->Put("B", "b3"); + ASSERT_OK(s); + s = txn2->Put("C", "c3"); + ASSERT_OK(s); + s = txn2->Put("D", "d3"); + ASSERT_OK(s); + s = txn2->Put("E", "e3"); + ASSERT_OK(s); + s = txn2->Put("G", "g3"); + ASSERT_OK(s); + s = txn2->Put("H", "h3"); + ASSERT_OK(s); + + s = txn1->Commit(); + ASSERT_OK(s); + s = txn2->Commit(); + ASSERT_OK(s); + + delete txn1; + delete txn2; +} + +TEST_P(TransactionTest, TimeoutTest) { + WriteOptions write_options; + ReadOptions read_options; + std::string value; + Status s; + + delete db; + db = nullptr; + + // transaction writes have an infinite timeout, + // but we will override this when we start a txn + // db writes have infinite timeout + txn_db_options.transaction_lock_timeout = -1; + txn_db_options.default_lock_timeout = -1; + + s = TransactionDB::Open(options, txn_db_options, dbname, &db); + assert(db != nullptr); + ASSERT_OK(s); + + s = db->Put(write_options, "aaa", "aaa"); + ASSERT_OK(s); + + TransactionOptions txn_options0; + txn_options0.expiration = 100; // 100ms + txn_options0.lock_timeout = 50; // txn timeout no longer infinite + Transaction* txn1 = db->BeginTransaction(write_options, txn_options0); + + s = txn1->GetForUpdate(read_options, "aaa", nullptr); + ASSERT_OK(s); + + // Conflicts with previous GetForUpdate. + // Since db writes do not have a timeout, this should eventually succeed when + // the transaction expires. 
+ s = db->Put(write_options, "aaa", "xxx"); + ASSERT_OK(s); + + ASSERT_GE(txn1->GetElapsedTime(), + static_cast<uint64_t>(txn_options0.expiration)); + + s = txn1->Commit(); + ASSERT_TRUE(s.IsExpired()); // expired! + + s = db->Get(read_options, "aaa", &value); + ASSERT_OK(s); + ASSERT_EQ("xxx", value); + + delete txn1; + delete db; + + // transaction writes have a 50ms lock timeout, + // db writes have infinite timeout + txn_db_options.transaction_lock_timeout = 50; + txn_db_options.default_lock_timeout = -1; + + s = TransactionDB::Open(options, txn_db_options, dbname, &db); + ASSERT_OK(s); + + s = db->Put(write_options, "aaa", "aaa"); + ASSERT_OK(s); + + TransactionOptions txn_options; + txn_options.expiration = 100; // 100ms + txn1 = db->BeginTransaction(write_options, txn_options); + + s = txn1->GetForUpdate(read_options, "aaa", nullptr); + ASSERT_OK(s); + + // Conflicts with previous GetForUpdate. + // Since db writes do not have a timeout, this should eventually succeed when + // the transaction expires. + s = db->Put(write_options, "aaa", "xxx"); + ASSERT_OK(s); + + s = txn1->Commit(); + ASSERT_NOK(s); // expired! + + s = db->Get(read_options, "aaa", &value); + ASSERT_OK(s); + ASSERT_EQ("xxx", value); + + delete txn1; + txn_options.expiration = 6000000; // 100 minutes + txn_options.lock_timeout = 1; // 1ms + txn1 = db->BeginTransaction(write_options, txn_options); + txn1->SetLockTimeout(100); + + TransactionOptions txn_options2; + txn_options2.expiration = 10; // 10ms + Transaction* txn2 = db->BeginTransaction(write_options, txn_options2); + ASSERT_OK(s); + + s = txn2->Put("a", "2"); + ASSERT_OK(s); + + // txn1 has a lock timeout longer than txn2's expiration, so it will win + s = txn1->Delete("a"); + ASSERT_OK(s); + + s = txn1->Commit(); + ASSERT_OK(s); + + // txn2 should have expired, since txn1 kept waiting for the lock until + // txn2's expiration had passed. 
+ s = txn2->Commit(); + ASSERT_TRUE(s.IsExpired()); + + delete txn1; + delete txn2; + txn_options.expiration = 6000000; // 100 minutes + txn1 = db->BeginTransaction(write_options, txn_options); + txn_options2.expiration = 100000000; + txn2 = db->BeginTransaction(write_options, txn_options2); + + s = txn1->Delete("asdf"); + ASSERT_OK(s); + + // txn2 has a smaller lock timeout than txn1's expiration, so it will time out + s = txn2->Delete("asdf"); + ASSERT_TRUE(s.IsTimedOut()); + ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key"); + + s = txn1->Commit(); + ASSERT_OK(s); + + s = txn2->Put("asdf", "asdf"); + ASSERT_OK(s); + + s = txn2->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "asdf", &value); + ASSERT_OK(s); + ASSERT_EQ("asdf", value); + + delete txn1; + delete txn2; +} + +TEST_P(TransactionTest, SingleDeleteTest) { + WriteOptions write_options; + ReadOptions read_options; + std::string value; + Status s; + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + s = txn->SingleDelete("A"); + ASSERT_OK(s); + + s = txn->Get(read_options, "A", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Commit(); + ASSERT_OK(s); + delete txn; + + txn = db->BeginTransaction(write_options); + + s = txn->SingleDelete("A"); + ASSERT_OK(s); + + s = txn->Put("A", "a"); + ASSERT_OK(s); + + s = txn->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("a", value); + + s = txn->Commit(); + ASSERT_OK(s); + delete txn; + + s = db->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("a", value); + + txn = db->BeginTransaction(write_options); + + s = txn->SingleDelete("A"); + ASSERT_OK(s); + + s = txn->Get(read_options, "A", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Commit(); + ASSERT_OK(s); + delete txn; + + s = db->Get(read_options, "A", &value); + ASSERT_TRUE(s.IsNotFound()); + + txn = db->BeginTransaction(write_options); + Transaction* txn2 = db->BeginTransaction(write_options); + txn2->SetSnapshot(); + + s = txn->Put("A", "a"); + ASSERT_OK(s); + + s = txn->Put("A", "a2"); + ASSERT_OK(s); + + s = txn->SingleDelete("A"); + ASSERT_OK(s); + + s = txn->SingleDelete("B"); + ASSERT_OK(s); + + // According to db.h, doing a SingleDelete on a key that has been + // overwritten will have undefinied behavior. So it is unclear what the + // result of fetching "A" should be. The current implementation will + // return NotFound in this case. + s = txn->Get(read_options, "A", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn2->Put("B", "b"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Commit(); + ASSERT_OK(s); + delete txn2; + + s = txn->Commit(); + ASSERT_OK(s); + delete txn; + + // According to db.h, doing a SingleDelete on a key that has been + // overwritten will have undefinied behavior. So it is unclear what the + // result of fetching "A" should be. The current implementation will + // return NotFound in this case. 
+ s = db->Get(read_options, "A", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = db->Get(read_options, "B", &value); + ASSERT_TRUE(s.IsNotFound()); +} + +TEST_P(TransactionTest, MergeTest) { + WriteOptions write_options; + ReadOptions read_options; + std::string value; + Status s; + + Transaction* txn = db->BeginTransaction(write_options, TransactionOptions()); + ASSERT_TRUE(txn); + + s = db->Put(write_options, "A", "a0"); + ASSERT_OK(s); + + s = txn->Merge("A", "1"); + ASSERT_OK(s); + + s = txn->Merge("A", "2"); + ASSERT_OK(s); + + s = txn->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("a0,1,2", value); + + s = txn->Put("A", "a"); + ASSERT_OK(s); + + s = txn->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("a", value); + + s = txn->Merge("A", "3"); + ASSERT_OK(s); + + s = txn->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("a,3", value); + + TransactionOptions txn_options; + txn_options.lock_timeout = 1; // 1 ms + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn2); + + // verify that txn has "A" locked + s = txn2->Merge("A", "4"); + ASSERT_TRUE(s.IsTimedOut()); + + s = txn2->Commit(); + ASSERT_OK(s); + delete txn2; + + s = txn->Commit(); + ASSERT_OK(s); + delete txn; + + s = db->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("a,3", value); +} + +TEST_P(TransactionTest, DeleteRangeSupportTest) { + // The `DeleteRange()` API is banned everywhere. + ASSERT_TRUE( + db->DeleteRange(WriteOptions(), db->DefaultColumnFamily(), "a", "b") + .IsNotSupported()); + + // But range deletions can be added via the `Write()` API by specifying the + // proper flags to promise there are no conflicts according to the DB type + // (see `TransactionDB::DeleteRange()` API doc for details). + for (bool skip_concurrency_control : {false, true}) { + for (bool skip_duplicate_key_check : {false, true}) { + ASSERT_OK(db->Put(WriteOptions(), "a", "val")); + WriteBatch wb; + ASSERT_OK(wb.DeleteRange("a", "b")); + TransactionDBWriteOptimizations flags; + flags.skip_concurrency_control = skip_concurrency_control; + flags.skip_duplicate_key_check = skip_duplicate_key_check; + Status s = db->Write(WriteOptions(), flags, &wb); + std::string value; + switch (txn_db_options.write_policy) { + case WRITE_COMMITTED: + if (skip_concurrency_control) { + ASSERT_OK(s); + ASSERT_TRUE(db->Get(ReadOptions(), "a", &value).IsNotFound()); + } else { + ASSERT_NOK(s); + ASSERT_OK(db->Get(ReadOptions(), "a", &value)); + } + break; + case WRITE_PREPARED: + // Intentional fall-through + case WRITE_UNPREPARED: + if (skip_concurrency_control && skip_duplicate_key_check) { + ASSERT_OK(s); + ASSERT_TRUE(db->Get(ReadOptions(), "a", &value).IsNotFound()); + } else { + ASSERT_NOK(s); + ASSERT_OK(db->Get(ReadOptions(), "a", &value)); + } + break; + } + // Without any promises from the user, range deletion via other `Write()` + // APIs are still banned. 
+ ASSERT_OK(db->Put(WriteOptions(), "a", "val")); + ASSERT_NOK(db->Write(WriteOptions(), &wb)); + ASSERT_OK(db->Get(ReadOptions(), "a", &value)); + } + } +} + +TEST_P(TransactionTest, DeferSnapshotTest) { + WriteOptions write_options; + ReadOptions read_options; + std::string value; + Status s; + + s = db->Put(write_options, "A", "a0"); + ASSERT_OK(s); + + Transaction* txn1 = db->BeginTransaction(write_options); + Transaction* txn2 = db->BeginTransaction(write_options); + + txn1->SetSnapshotOnNextOperation(); + auto snapshot = txn1->GetSnapshot(); + ASSERT_FALSE(snapshot); + + s = txn2->Put("A", "a2"); + ASSERT_OK(s); + s = txn2->Commit(); + ASSERT_OK(s); + delete txn2; + + s = txn1->GetForUpdate(read_options, "A", &value); + // Should not conflict with txn2 since snapshot wasn't set until + // GetForUpdate was called. + ASSERT_OK(s); + ASSERT_EQ("a2", value); + + s = txn1->Put("A", "a1"); + ASSERT_OK(s); + + s = db->Put(write_options, "B", "b0"); + ASSERT_OK(s); + + // Cannot lock B since it was written after the snapshot was set + s = txn1->Put("B", "b1"); + ASSERT_TRUE(s.IsBusy()); + + s = txn1->Commit(); + ASSERT_OK(s); + delete txn1; + + s = db->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("a1", value); + + s = db->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_EQ("b0", value); +} + +TEST_P(TransactionTest, DeferSnapshotTest2) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + std::string value; + Status s; + + Transaction* txn1 = db->BeginTransaction(write_options); + + txn1->SetSnapshot(); + + s = txn1->Put("A", "a1"); + ASSERT_OK(s); + + s = db->Put(write_options, "C", "c0"); + ASSERT_OK(s); + s = db->Put(write_options, "D", "d0"); + ASSERT_OK(s); + + snapshot_read_options.snapshot = txn1->GetSnapshot(); + + txn1->SetSnapshotOnNextOperation(); + + s = txn1->Get(snapshot_read_options, "C", &value); + // Snapshot was set before C was written + ASSERT_TRUE(s.IsNotFound()); + s = txn1->Get(snapshot_read_options, "D", &value); + // Snapshot was set before D was written + ASSERT_TRUE(s.IsNotFound()); + + // Snapshot should not have changed yet. 
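+  // (A plain Get does not trigger the armed SetSnapshotOnNextOperation(); the
+  // snapshot only advances at the GetForUpdate() further down, so GetSnapshot()
+  // here still returns the snapshot taken before "C" and "D" were written.)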
+ snapshot_read_options.snapshot = txn1->GetSnapshot(); + + s = txn1->Get(snapshot_read_options, "C", &value); + // Snapshot was set before C was written + ASSERT_TRUE(s.IsNotFound()); + s = txn1->Get(snapshot_read_options, "D", &value); + // Snapshot was set before D was written + ASSERT_TRUE(s.IsNotFound()); + + s = txn1->GetForUpdate(read_options, "C", &value); + ASSERT_OK(s); + ASSERT_EQ("c0", value); + + s = db->Put(write_options, "D", "d00"); + ASSERT_OK(s); + + // Snapshot is now set + snapshot_read_options.snapshot = txn1->GetSnapshot(); + s = txn1->Get(snapshot_read_options, "D", &value); + ASSERT_OK(s); + ASSERT_EQ("d0", value); + + s = txn1->Commit(); + ASSERT_OK(s); + delete txn1; +} + +TEST_P(TransactionTest, DeferSnapshotSavePointTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + std::string value; + Status s; + + Transaction* txn1 = db->BeginTransaction(write_options); + + txn1->SetSavePoint(); // 1 + + s = db->Put(write_options, "T", "1"); + ASSERT_OK(s); + + txn1->SetSnapshotOnNextOperation(); + + s = db->Put(write_options, "T", "2"); + ASSERT_OK(s); + + txn1->SetSavePoint(); // 2 + + s = db->Put(write_options, "T", "3"); + ASSERT_OK(s); + + s = txn1->Put("A", "a"); + ASSERT_OK(s); + + txn1->SetSavePoint(); // 3 + + s = db->Put(write_options, "T", "4"); + ASSERT_OK(s); + + txn1->SetSnapshot(); + txn1->SetSnapshotOnNextOperation(); + + txn1->SetSavePoint(); // 4 + + s = db->Put(write_options, "T", "5"); + ASSERT_OK(s); + + snapshot_read_options.snapshot = txn1->GetSnapshot(); + s = txn1->Get(snapshot_read_options, "T", &value); + ASSERT_OK(s); + ASSERT_EQ("4", value); + + s = txn1->Put("A", "a1"); + ASSERT_OK(s); + + snapshot_read_options.snapshot = txn1->GetSnapshot(); + s = txn1->Get(snapshot_read_options, "T", &value); + ASSERT_OK(s); + ASSERT_EQ("5", value); + + s = txn1->RollbackToSavePoint(); // Rollback to 4 + ASSERT_OK(s); + + snapshot_read_options.snapshot = txn1->GetSnapshot(); + s = txn1->Get(snapshot_read_options, "T", &value); + ASSERT_OK(s); + ASSERT_EQ("4", value); + + s = txn1->RollbackToSavePoint(); // Rollback to 3 + ASSERT_OK(s); + + snapshot_read_options.snapshot = txn1->GetSnapshot(); + s = txn1->Get(snapshot_read_options, "T", &value); + ASSERT_OK(s); + ASSERT_EQ("3", value); + + s = txn1->Get(read_options, "T", &value); + ASSERT_OK(s); + ASSERT_EQ("5", value); + + s = txn1->RollbackToSavePoint(); // Rollback to 2 + ASSERT_OK(s); + + snapshot_read_options.snapshot = txn1->GetSnapshot(); + ASSERT_FALSE(snapshot_read_options.snapshot); + s = txn1->Get(snapshot_read_options, "T", &value); + ASSERT_OK(s); + ASSERT_EQ("5", value); + + s = txn1->Delete("A"); + ASSERT_OK(s); + + snapshot_read_options.snapshot = txn1->GetSnapshot(); + ASSERT_TRUE(snapshot_read_options.snapshot); + s = txn1->Get(snapshot_read_options, "T", &value); + ASSERT_OK(s); + ASSERT_EQ("5", value); + + s = txn1->RollbackToSavePoint(); // Rollback to 1 + ASSERT_OK(s); + + s = txn1->Delete("A"); + ASSERT_OK(s); + + snapshot_read_options.snapshot = txn1->GetSnapshot(); + ASSERT_FALSE(snapshot_read_options.snapshot); + s = txn1->Get(snapshot_read_options, "T", &value); + ASSERT_OK(s); + ASSERT_EQ("5", value); + + s = txn1->Commit(); + ASSERT_OK(s); + + delete txn1; +} + +TEST_P(TransactionTest, SetSnapshotOnNextOperationWithNotification) { + WriteOptions write_options; + ReadOptions read_options; + std::string value; + + class Notifier : public TransactionNotifier { + private: + const Snapshot** snapshot_ptr_; + + public: + explicit Notifier(const 
Snapshot** snapshot_ptr) + : snapshot_ptr_(snapshot_ptr) {} + + void SnapshotCreated(const Snapshot* newSnapshot) override { + *snapshot_ptr_ = newSnapshot; + } + }; + + std::shared_ptr notifier = + std::make_shared(&read_options.snapshot); + Status s; + + s = db->Put(write_options, "B", "0"); + ASSERT_OK(s); + + Transaction* txn1 = db->BeginTransaction(write_options); + + txn1->SetSnapshotOnNextOperation(notifier); + ASSERT_FALSE(read_options.snapshot); + + s = db->Put(write_options, "B", "1"); + ASSERT_OK(s); + + // A Get does not generate the snapshot + s = txn1->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_FALSE(read_options.snapshot); + ASSERT_EQ(value, "1"); + + // Any other operation does + s = txn1->Put("A", "0"); + ASSERT_OK(s); + + // Now change "B". + s = db->Put(write_options, "B", "2"); + ASSERT_OK(s); + + // The original value should still be read + s = txn1->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_TRUE(read_options.snapshot); + ASSERT_EQ(value, "1"); + + s = txn1->Commit(); + ASSERT_OK(s); + + delete txn1; +} + +TEST_P(TransactionTest, ClearSnapshotTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + std::string value; + Status s; + + s = db->Put(write_options, "foo", "0"); + ASSERT_OK(s); + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + s = db->Put(write_options, "foo", "1"); + ASSERT_OK(s); + + snapshot_read_options.snapshot = txn->GetSnapshot(); + ASSERT_FALSE(snapshot_read_options.snapshot); + + // No snapshot created yet + s = txn->Get(snapshot_read_options, "foo", &value); + ASSERT_EQ(value, "1"); + + txn->SetSnapshot(); + snapshot_read_options.snapshot = txn->GetSnapshot(); + ASSERT_TRUE(snapshot_read_options.snapshot); + + s = db->Put(write_options, "foo", "2"); + ASSERT_OK(s); + + // Snapshot was created before change to '2' + s = txn->Get(snapshot_read_options, "foo", &value); + ASSERT_EQ(value, "1"); + + txn->ClearSnapshot(); + snapshot_read_options.snapshot = txn->GetSnapshot(); + ASSERT_FALSE(snapshot_read_options.snapshot); + + // Snapshot has now been cleared + s = txn->Get(snapshot_read_options, "foo", &value); + ASSERT_EQ(value, "2"); + + s = txn->Commit(); + ASSERT_OK(s); + + delete txn; +} + +TEST_P(TransactionTest, ToggleAutoCompactionTest) { + Status s; + + ColumnFamilyHandle *cfa, *cfb; + ColumnFamilyOptions cf_options; + + // Create 2 new column families + s = db->CreateColumnFamily(cf_options, "CFA", &cfa); + ASSERT_OK(s); + s = db->CreateColumnFamily(cf_options, "CFB", &cfb); + ASSERT_OK(s); + + delete cfa; + delete cfb; + delete db; + + // open DB with three column families + std::vector column_families; + // have to open default column family + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions())); + // open the new column families + column_families.push_back( + ColumnFamilyDescriptor("CFA", ColumnFamilyOptions())); + column_families.push_back( + ColumnFamilyDescriptor("CFB", ColumnFamilyOptions())); + + ColumnFamilyOptions* cf_opt_default = &column_families[0].options; + ColumnFamilyOptions* cf_opt_cfa = &column_families[1].options; + ColumnFamilyOptions* cf_opt_cfb = &column_families[2].options; + cf_opt_default->disable_auto_compactions = false; + cf_opt_cfa->disable_auto_compactions = true; + cf_opt_cfb->disable_auto_compactions = false; + + std::vector handles; + + s = TransactionDB::Open(options, txn_db_options, dbname, column_families, + &handles, &db); + ASSERT_OK(s); + + auto cfh_default = 
static_cast_with_check(handles[0]); + auto opt_default = *cfh_default->cfd()->GetLatestMutableCFOptions(); + + auto cfh_a = static_cast_with_check(handles[1]); + auto opt_a = *cfh_a->cfd()->GetLatestMutableCFOptions(); + + auto cfh_b = static_cast_with_check(handles[2]); + auto opt_b = *cfh_b->cfd()->GetLatestMutableCFOptions(); + + ASSERT_EQ(opt_default.disable_auto_compactions, false); + ASSERT_EQ(opt_a.disable_auto_compactions, true); + ASSERT_EQ(opt_b.disable_auto_compactions, false); + + for (auto handle : handles) { + delete handle; + } +} + +TEST_P(TransactionStressTest, ExpiredTransactionDataRace1) { + // In this test, txn1 should succeed committing, + // as the callback is called after txn1 starts committing. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"TransactionTest::ExpirableTransactionDataRace:1"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "TransactionTest::ExpirableTransactionDataRace:1", [&](void* /*arg*/) { + WriteOptions write_options; + TransactionOptions txn_options; + + // Force txn1 to expire + /* sleep override */ + std::this_thread::sleep_for(std::chrono::milliseconds(1500)); + + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + Status s; + s = txn2->Put("X", "2"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Commit(); + ASSERT_OK(s); + delete txn2; + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + WriteOptions write_options; + TransactionOptions txn_options; + + txn_options.expiration = 1000; // 1 second + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + + Status s; + s = txn1->Put("X", "1"); + ASSERT_OK(s); + s = txn1->Commit(); + ASSERT_OK(s); + + ReadOptions read_options; + std::string value; + s = db->Get(read_options, "X", &value); + ASSERT_OK(s); + ASSERT_EQ("1", value); + + delete txn1; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +namespace { +// cmt_delay_ms is the delay between prepare and commit +// first_id is the id of the first transaction +Status TransactionStressTestInserter( + TransactionDB* db, const size_t num_transactions, const size_t num_sets, + const size_t num_keys_per_set, Random64* rand, + const uint64_t cmt_delay_ms = 0, const uint64_t first_id = 0) { + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + txn_options.use_only_the_last_commit_time_batch_for_recovery = true; + + // Inside the inserter we might also retake the snapshot. We do both since two + // separte functions are engaged for each. + txn_options.set_snapshot = rand->OneIn(2); + + RandomTransactionInserter inserter( + rand, write_options, read_options, num_keys_per_set, + static_cast(num_sets), cmt_delay_ms, first_id); + + for (size_t t = 0; t < num_transactions; t++) { + bool success = inserter.TransactionDBInsert(db, txn_options); + if (!success) { + // unexpected failure + return inserter.GetLastStatus(); + } + } + inserter.GetLastStatus().PermitUncheckedError(); + + // Make sure at least some of the transactions succeeded. It's ok if + // some failed due to write-conflicts. + if (num_transactions != 1 && + inserter.GetFailureCount() > num_transactions / 2) { + return Status::TryAgain("Too many transactions failed! 
" + + std::to_string(inserter.GetFailureCount()) + " / " + + std::to_string(num_transactions)); + } + + return Status::OK(); +} +} // namespace + +// Worker threads add a number to a key from each set of keys. The checker +// threads verify that the sum of all keys in each set are equal. +TEST_P(MySQLStyleTransactionTest, TransactionStressTest) { + // Small write buffer to trigger more compactions + options.write_buffer_size = 1024; + txn_db_options.rollback_deletion_type_callback = + [](TransactionDB*, ColumnFamilyHandle*, const Slice& key) { + return RandomTransactionInserter::RollbackDeletionTypeCallback(key); + }; + ASSERT_OK(ReOpenNoDelete()); + constexpr size_t num_workers = 4; // worker threads count + constexpr size_t num_checkers = 2; // checker threads count + constexpr size_t num_slow_checkers = 2; // checker threads emulating backups + constexpr size_t num_slow_workers = 1; // slow worker threads count + constexpr size_t num_transactions_per_thread = 1000; + constexpr uint16_t num_sets = 3; + constexpr size_t num_keys_per_set = 100; + // Setting the key-space to be 100 keys should cause enough write-conflicts + // to make this test interesting. + + std::vector threads; + std::atomic finished = {0}; + constexpr bool TAKE_SNAPSHOT = true; + uint64_t time_seed = env->NowMicros(); + printf("time_seed is %" PRIu64 "\n", time_seed); // would help to reproduce + + std::function call_inserter = [&] { + size_t thd_seed = std::hash()(std::this_thread::get_id()); + Random64 rand(time_seed * thd_seed); + ASSERT_OK(TransactionStressTestInserter(db, num_transactions_per_thread, + num_sets, num_keys_per_set, &rand)); + finished++; + }; + std::function call_checker = [&] { + size_t thd_seed = std::hash()(std::this_thread::get_id()); + Random64 rand(time_seed * thd_seed); + // Verify that data is consistent + while (finished < num_workers) { + ASSERT_OK(RandomTransactionInserter::Verify( + db, num_sets, num_keys_per_set, TAKE_SNAPSHOT, &rand)); + } + }; + std::function call_slow_checker = [&] { + size_t thd_seed = std::hash()(std::this_thread::get_id()); + Random64 rand(time_seed * thd_seed); + // Verify that data is consistent + while (finished < num_workers) { + uint64_t delay_ms = rand.Uniform(100) + 1; + Status s = RandomTransactionInserter::Verify( + db, num_sets, num_keys_per_set, TAKE_SNAPSHOT, &rand, delay_ms); + ASSERT_OK(s); + } + }; + std::function call_slow_inserter = [&] { + size_t thd_seed = std::hash()(std::this_thread::get_id()); + Random64 rand(time_seed * thd_seed); + uint64_t id = 0; + // Verify that data is consistent + while (finished < num_workers) { + uint64_t delay_ms = rand.Uniform(500) + 1; + ASSERT_OK(TransactionStressTestInserter(db, 1, num_sets, num_keys_per_set, + &rand, delay_ms, id++)); + } + }; + + for (uint32_t i = 0; i < num_workers; i++) { + threads.emplace_back(call_inserter); + } + for (uint32_t i = 0; i < num_checkers; i++) { + threads.emplace_back(call_checker); + } + if (with_slow_threads_) { + for (uint32_t i = 0; i < num_slow_checkers; i++) { + threads.emplace_back(call_slow_checker); + } + for (uint32_t i = 0; i < num_slow_workers; i++) { + threads.emplace_back(call_slow_inserter); + } + } + + // Wait for all threads to finish + for (auto& t : threads) { + t.join(); + } + + // Verify that data is consistent + Status s = RandomTransactionInserter::Verify(db, num_sets, num_keys_per_set, + !TAKE_SNAPSHOT); + ASSERT_OK(s); +} +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +TEST_P(TransactionTest, MemoryLimitTest) { + 
+  TransactionOptions txn_options;
+  // Header (12 bytes) + NOOP (1 byte) + 2 * 8 bytes for data.
+  txn_options.max_write_batch_size = 29;
+  // Set threshold to unlimited so that the write batch does not get flushed,
+  // and can hit the memory limit.
+  txn_options.write_batch_flush_threshold = 0;
+  std::string value;
+  Status s;
+
+  Transaction* txn = db->BeginTransaction(WriteOptions(), txn_options);
+  ASSERT_TRUE(txn);
+
+  ASSERT_EQ(0, txn->GetNumPuts());
+  ASSERT_LE(0, txn->GetID());
+
+  s = txn->Put(Slice("a"), Slice("...."));
+  ASSERT_OK(s);
+  ASSERT_EQ(1, txn->GetNumPuts());
+
+  s = txn->Put(Slice("b"), Slice("...."));
+  ASSERT_OK(s);
+  ASSERT_EQ(2, txn->GetNumPuts());
+
+  s = txn->Put(Slice("b"), Slice("...."));
+  ASSERT_TRUE(s.IsMemoryLimit());
+  ASSERT_EQ(2, txn->GetNumPuts());
+
+  ASSERT_OK(txn->Rollback());
+  delete txn;
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+// This test clarifies the existing expectation from the sequence number
+// algorithm. It could detect mistakes in updating the code, but it is not
+// necessarily the only acceptable behavior. If the algorithm is legitimately
+// changed, this unit test should be updated as well.
+TEST_P(TransactionStressTest, SeqAdvanceTest) {
+  // TODO(myabandeh): must be tested with false before new releases
+  const bool short_test = true;
+  WriteOptions wopts;
+  FlushOptions fopt;
+
+  options.disable_auto_compactions = true;
+  ASSERT_OK(ReOpen());
+
+  // Do the test with NUM_BRANCHES branches in it. Each run of the test takes
+  // some of the branches. This is the same as counting in binary, where the
+  // i-th bit represents whether branch i is taken in the run represented by
+  // the number n.
+  const size_t NUM_BRANCHES = short_test ? 6 : 10;
+  // Helper function that shows if the branch is to be taken in the run
+  // represented by the number n.
+ auto branch_do = [&](size_t n, size_t* branch) { + assert(*branch < NUM_BRANCHES); + const size_t filter = static_cast(1) << *branch; + return n & filter; + }; + const size_t max_n = static_cast(1) << NUM_BRANCHES; + for (size_t n = 0; n < max_n; n++) { + DBImpl* db_impl = static_cast_with_check(db->GetRootDB()); + size_t branch = 0; + auto seq = db_impl->GetLatestSequenceNumber(); + exp_seq = seq; + TestTxn0(0); + seq = db_impl->TEST_GetLastVisibleSequence(); + ASSERT_EQ(exp_seq, seq); + + if (branch_do(n, &branch)) { + ASSERT_OK(db_impl->Flush(fopt)); + seq = db_impl->TEST_GetLastVisibleSequence(); + ASSERT_EQ(exp_seq, seq); + } + if (!short_test && branch_do(n, &branch)) { + ASSERT_OK(db_impl->FlushWAL(true)); + ASSERT_OK(ReOpenNoDelete()); + db_impl = static_cast_with_check(db->GetRootDB()); + seq = db_impl->GetLatestSequenceNumber(); + ASSERT_EQ(exp_seq, seq); + } + + // Doing it twice might detect some bugs + TestTxn0(1); + seq = db_impl->TEST_GetLastVisibleSequence(); + ASSERT_EQ(exp_seq, seq); + + TestTxn1(0); + seq = db_impl->TEST_GetLastVisibleSequence(); + ASSERT_EQ(exp_seq, seq); + + if (branch_do(n, &branch)) { + ASSERT_OK(db_impl->Flush(fopt)); + seq = db_impl->TEST_GetLastVisibleSequence(); + ASSERT_EQ(exp_seq, seq); + } + if (!short_test && branch_do(n, &branch)) { + ASSERT_OK(db_impl->FlushWAL(true)); + ASSERT_OK(ReOpenNoDelete()); + db_impl = static_cast_with_check(db->GetRootDB()); + seq = db_impl->GetLatestSequenceNumber(); + ASSERT_EQ(exp_seq, seq); + } + + TestTxn3(0); + seq = db_impl->TEST_GetLastVisibleSequence(); + ASSERT_EQ(exp_seq, seq); + + if (branch_do(n, &branch)) { + ASSERT_OK(db_impl->Flush(fopt)); + seq = db_impl->TEST_GetLastVisibleSequence(); + ASSERT_EQ(exp_seq, seq); + } + if (!short_test && branch_do(n, &branch)) { + ASSERT_OK(db_impl->FlushWAL(true)); + ASSERT_OK(ReOpenNoDelete()); + db_impl = static_cast_with_check(db->GetRootDB()); + seq = db_impl->GetLatestSequenceNumber(); + ASSERT_EQ(exp_seq, seq); + } + + TestTxn4(0); + seq = db_impl->TEST_GetLastVisibleSequence(); + + ASSERT_EQ(exp_seq, seq); + + if (branch_do(n, &branch)) { + ASSERT_OK(db_impl->Flush(fopt)); + seq = db_impl->TEST_GetLastVisibleSequence(); + ASSERT_EQ(exp_seq, seq); + } + if (!short_test && branch_do(n, &branch)) { + ASSERT_OK(db_impl->FlushWAL(true)); + ASSERT_OK(ReOpenNoDelete()); + db_impl = static_cast_with_check(db->GetRootDB()); + seq = db_impl->GetLatestSequenceNumber(); + ASSERT_EQ(exp_seq, seq); + } + + TestTxn2(0); + seq = db_impl->TEST_GetLastVisibleSequence(); + ASSERT_EQ(exp_seq, seq); + + if (branch_do(n, &branch)) { + ASSERT_OK(db_impl->Flush(fopt)); + seq = db_impl->TEST_GetLastVisibleSequence(); + ASSERT_EQ(exp_seq, seq); + } + if (!short_test && branch_do(n, &branch)) { + ASSERT_OK(db_impl->FlushWAL(true)); + ASSERT_OK(ReOpenNoDelete()); + db_impl = static_cast_with_check(db->GetRootDB()); + seq = db_impl->GetLatestSequenceNumber(); + ASSERT_EQ(exp_seq, seq); + } + ASSERT_OK(ReOpen()); + } +} +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +// Verify that the optimization would not compromize the correctness +TEST_P(TransactionTest, Optimizations) { + size_t comb_cnt = size_t(1) << 2; // 2 is number of optimization vars + for (size_t new_comb = 0; new_comb < comb_cnt; new_comb++) { + TransactionDBWriteOptimizations optimizations; + optimizations.skip_concurrency_control = IsInCombination(0, new_comb); + optimizations.skip_duplicate_key_check = IsInCombination(1, new_comb); + + ASSERT_OK(ReOpen()); + WriteOptions 
write_options; + WriteBatch batch; + ASSERT_OK(batch.Put(Slice("k"), Slice("v1"))); + ASSERT_OK(db->Write(write_options, &batch)); + + ReadOptions ropt; + PinnableSlice pinnable_val; + ASSERT_OK(db->Get(ropt, db->DefaultColumnFamily(), "k", &pinnable_val)); + ASSERT_TRUE(pinnable_val == ("v1")); + } +} + +// A comparator that uses only the first three bytes +class ThreeBytewiseComparator : public Comparator { + public: + ThreeBytewiseComparator() {} + const char* Name() const override { return "test.ThreeBytewiseComparator"; } + int Compare(const Slice& a, const Slice& b) const override { + Slice na = Slice(a.data(), a.size() < 3 ? a.size() : 3); + Slice nb = Slice(b.data(), b.size() < 3 ? b.size() : 3); + return na.compare(nb); + } + bool Equal(const Slice& a, const Slice& b) const override { + Slice na = Slice(a.data(), a.size() < 3 ? a.size() : 3); + Slice nb = Slice(b.data(), b.size() < 3 ? b.size() : 3); + return na == nb; + } + // These methods below don't seem relevant to this test. Implement them if + // proven othersize. + void FindShortestSeparator(std::string* start, + const Slice& limit) const override { + const Comparator* bytewise_comp = BytewiseComparator(); + bytewise_comp->FindShortestSeparator(start, limit); + } + void FindShortSuccessor(std::string* key) const override { + const Comparator* bytewise_comp = BytewiseComparator(); + bytewise_comp->FindShortSuccessor(key); + } +}; + +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +TEST_P(TransactionTest, GetWithoutSnapshot) { + WriteOptions write_options; + std::atomic finish = {false}; + ASSERT_OK(db->Put(write_options, "key", "value")); + ROCKSDB_NAMESPACE::port::Thread commit_thread([&]() { + for (int i = 0; i < 100; i++) { + TransactionOptions txn_options; + Transaction* txn = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn->SetName("xid")); + ASSERT_OK(txn->Put("key", "overridedvalue")); + ASSERT_OK(txn->Put("key", "value")); + ASSERT_OK(txn->Prepare()); + ASSERT_OK(txn->Commit()); + delete txn; + } + finish = true; + }); + ROCKSDB_NAMESPACE::port::Thread read_thread([&]() { + while (!finish) { + ReadOptions ropt; + PinnableSlice pinnable_val; + ASSERT_OK(db->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val)); + ASSERT_TRUE(pinnable_val == ("value")); + } + }); + commit_thread.join(); + read_thread.join(); +} +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +// Test that the transactional db can handle duplicate keys in the write batch +TEST_P(TransactionTest, DuplicateKeys) { + ColumnFamilyOptions cf_options; + std::string cf_name = "two"; + ColumnFamilyHandle* cf_handle = nullptr; + { + ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle)); + WriteOptions write_options; + WriteBatch batch; + ASSERT_OK(batch.Put(Slice("key"), Slice("value"))); + ASSERT_OK(batch.Put(Slice("key2"), Slice("value2"))); + // duplicate the keys + ASSERT_OK(batch.Put(Slice("key"), Slice("value3"))); + // duplicate the 2nd key. It should not be counted duplicate since a + // sub-patch is cut after the last duplicate. + ASSERT_OK(batch.Put(Slice("key2"), Slice("value4"))); + // duplicate the keys but in a different cf. 
It should not be counted as + // duplicate keys + ASSERT_OK(batch.Put(cf_handle, Slice("key"), Slice("value5"))); + + ASSERT_OK(db->Write(write_options, &batch)); + + ReadOptions ropt; + PinnableSlice pinnable_val; + auto s = db->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("value3")); + s = db->Get(ropt, db->DefaultColumnFamily(), "key2", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("value4")); + s = db->Get(ropt, cf_handle, "key", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("value5")); + + delete cf_handle; + } + + // Test with non-bytewise comparator + { + ASSERT_OK(ReOpen()); + std::unique_ptr comp_gc(new ThreeBytewiseComparator()); + cf_options.comparator = comp_gc.get(); + ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle)); + WriteOptions write_options; + WriteBatch batch; + ASSERT_OK(batch.Put(cf_handle, Slice("key"), Slice("value"))); + // The first three bytes are the same, do it must be counted as duplicate + ASSERT_OK(batch.Put(cf_handle, Slice("key2"), Slice("value2"))); + // check for 2nd duplicate key in cf with non-default comparator + ASSERT_OK(batch.Put(cf_handle, Slice("key2b"), Slice("value2b"))); + ASSERT_OK(db->Write(write_options, &batch)); + + // The value must be the most recent value for all the keys equal to "key", + // including "key2" + ReadOptions ropt; + PinnableSlice pinnable_val; + ASSERT_OK(db->Get(ropt, cf_handle, "key", &pinnable_val)); + ASSERT_TRUE(pinnable_val == ("value2b")); + + // Test duplicate keys with rollback + TransactionOptions txn_options; + Transaction* txn0 = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn0->SetName("xid")); + ASSERT_OK(txn0->Put(cf_handle, Slice("key3"), Slice("value3"))); + ASSERT_OK(txn0->Merge(cf_handle, Slice("key4"), Slice("value4"))); + ASSERT_OK(txn0->Rollback()); + ASSERT_OK(db->Get(ropt, cf_handle, "key5", &pinnable_val)); + ASSERT_TRUE(pinnable_val == ("value2b")); + delete txn0; + + delete cf_handle; + cf_options.comparator = BytewiseComparator(); + } + + for (bool do_prepare : {true, false}) { + for (bool do_rollback : {true, false}) { + for (bool with_commit_batch : {true, false}) { + if (with_commit_batch && !do_prepare) { + continue; + } + if (with_commit_batch && do_rollback) { + continue; + } + ASSERT_OK(ReOpen()); + ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle)); + TransactionOptions txn_options; + txn_options.use_only_the_last_commit_time_batch_for_recovery = true; + WriteOptions write_options; + Transaction* txn0 = db->BeginTransaction(write_options, txn_options); + auto s = txn0->SetName("xid"); + ASSERT_OK(s); + s = txn0->Put(Slice("foo0"), Slice("bar0a")); + ASSERT_OK(s); + s = txn0->Put(Slice("foo0"), Slice("bar0b")); + ASSERT_OK(s); + s = txn0->Put(Slice("foo1"), Slice("bar1")); + ASSERT_OK(s); + s = txn0->Merge(Slice("foo2"), Slice("bar2a")); + ASSERT_OK(s); + // Repeat a key after the start of a sub-patch. This should not cause a + // duplicate in the most recent sub-patch and hence not creating a new + // sub-patch. + s = txn0->Put(Slice("foo0"), Slice("bar0c")); + ASSERT_OK(s); + s = txn0->Merge(Slice("foo2"), Slice("bar2b")); + ASSERT_OK(s); + // duplicate the keys but in a different cf. It should not be counted as + // duplicate. 
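+        // (Duplicate detection keys on the (column family, key) pair, so the
+        // same user key under cf_handle does not force a new sub-patch to be
+        // cut in the default column family.)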
+ s = txn0->Put(cf_handle, Slice("foo0"), Slice("bar0-cf1")); + ASSERT_OK(s); + s = txn0->Put(Slice("foo3"), Slice("bar3")); + ASSERT_OK(s); + s = txn0->Merge(Slice("foo3"), Slice("bar3")); + ASSERT_OK(s); + s = txn0->Put(Slice("foo4"), Slice("bar4")); + ASSERT_OK(s); + s = txn0->Delete(Slice("foo4")); + ASSERT_OK(s); + s = txn0->SingleDelete(Slice("foo4")); + ASSERT_OK(s); + if (do_prepare) { + s = txn0->Prepare(); + ASSERT_OK(s); + } + if (do_rollback) { + // Test rolling back the batch with duplicates + s = txn0->Rollback(); + ASSERT_OK(s); + } else { + if (with_commit_batch) { + assert(do_prepare); + auto cb = txn0->GetCommitTimeWriteBatch(); + // duplicate a key in the original batch + // TODO(myabandeh): the behavior of GetCommitTimeWriteBatch + // conflicting with the prepared batch is currently undefined and + // gives different results in different implementations. + + // s = cb->Put(Slice("foo0"), Slice("bar0d")); + // ASSERT_OK(s); + // add a new duplicate key + s = cb->Put(Slice("foo6"), Slice("bar6a")); + ASSERT_OK(s); + s = cb->Put(Slice("foo6"), Slice("bar6b")); + ASSERT_OK(s); + // add a duplicate key that is removed in the same batch + s = cb->Put(Slice("foo7"), Slice("bar7a")); + ASSERT_OK(s); + s = cb->Delete(Slice("foo7")); + ASSERT_OK(s); + } + s = txn0->Commit(); + ASSERT_OK(s); + } + delete txn0; + ReadOptions ropt; + PinnableSlice pinnable_val; + + if (do_rollback) { + s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + s = db->Get(ropt, cf_handle, "foo0", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo1", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo2", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo3", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo4", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + } else { + s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar0c")); + s = db->Get(ropt, cf_handle, "foo0", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar0-cf1")); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo1", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar1")); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo2", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar2a,bar2b")); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo3", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar3,bar3")); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo4", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + if (with_commit_batch) { + s = db->Get(ropt, db->DefaultColumnFamily(), "foo6", &pinnable_val); + if (txn_db_options.write_policy == + TxnDBWritePolicy::WRITE_COMMITTED) { + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar6b")); + } else { + ASSERT_TRUE(s.IsNotFound()); + } + s = db->Get(ropt, db->DefaultColumnFamily(), "foo7", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + } + } + delete cf_handle; + } // with_commit_batch + } // do_rollback + } // do_prepare + + if (!options.unordered_write) { + // Also test with max_successive_merges > 0. max_successive_merges will not + // affect our algorithm for duplicate key insertion but we add the test to + // verify that. 
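+    // (max_successive_merges only changes *when* merge operands get folded:
+    // once a key accumulates more than that many consecutive operands in the
+    // memtable, the write path resolves them eagerly. The expectation below is
+    // simply that the final merged value is the same either way.)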
+ cf_options.max_successive_merges = 2; + cf_options.merge_operator = MergeOperators::CreateStringAppendOperator(); + ASSERT_OK(ReOpen()); + db->CreateColumnFamily(cf_options, cf_name, &cf_handle); + WriteOptions write_options; + // Ensure one value for the key + ASSERT_OK(db->Put(write_options, cf_handle, Slice("key"), Slice("value"))); + WriteBatch batch; + // Merge more than max_successive_merges times + ASSERT_OK(batch.Merge(cf_handle, Slice("key"), Slice("1"))); + ASSERT_OK(batch.Merge(cf_handle, Slice("key"), Slice("2"))); + ASSERT_OK(batch.Merge(cf_handle, Slice("key"), Slice("3"))); + ASSERT_OK(batch.Merge(cf_handle, Slice("key"), Slice("4"))); + ASSERT_OK(db->Write(write_options, &batch)); + ReadOptions read_options; + std::string value; + ASSERT_OK(db->Get(read_options, cf_handle, "key", &value)); + ASSERT_EQ(value, "value,1,2,3,4"); + delete cf_handle; + } + + { + // Test that the duplicate detection is not compromised after rolling back + // to a save point + TransactionOptions txn_options; + WriteOptions write_options; + Transaction* txn0 = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0a"))); + ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0b"))); + txn0->SetSavePoint(); + ASSERT_OK(txn0->RollbackToSavePoint()); + ASSERT_OK(txn0->Commit()); + delete txn0; + } + + // Test sucessfull recovery after a crash + { + ASSERT_OK(ReOpen()); + TransactionOptions txn_options; + WriteOptions write_options; + ReadOptions ropt; + Transaction* txn0; + PinnableSlice pinnable_val; + Status s; + + std::unique_ptr comp_gc(new ThreeBytewiseComparator()); + cf_options.comparator = comp_gc.get(); + cf_options.merge_operator = MergeOperators::CreateStringAppendOperator(); + ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle)); + delete cf_handle; + std::vector cfds{ + ColumnFamilyDescriptor(kDefaultColumnFamilyName, + ColumnFamilyOptions(options)), + ColumnFamilyDescriptor(cf_name, cf_options), + }; + std::vector handles; + ASSERT_OK(ReOpenNoDelete(cfds, &handles)); + + assert(db != nullptr); + ASSERT_OK(db->Put(write_options, "foo0", "init")); + ASSERT_OK(db->Put(write_options, "foo1", "init")); + ASSERT_OK(db->Put(write_options, handles[1], "foo0", "init")); + ASSERT_OK(db->Put(write_options, handles[1], "foo1", "init")); + + // one entry + txn0 = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn0->SetName("xid")); + ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0a"))); + ASSERT_OK(txn0->Prepare()); + delete txn0; + // This will check the asserts inside recovery code + ASSERT_OK(db->FlushWAL(true)); + reinterpret_cast(db)->TEST_Crash(); + ASSERT_OK(ReOpenNoDelete(cfds, &handles)); + txn0 = db->GetTransactionByName("xid"); + ASSERT_TRUE(txn0 != nullptr); + ASSERT_OK(txn0->Commit()); + delete txn0; + s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar0a")); + + // two entries, no duplicate + txn0 = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn0->SetName("xid")); + ASSERT_OK(txn0->Put(handles[1], Slice("foo0"), Slice("bar0b"))); + ASSERT_OK(txn0->Put(handles[1], Slice("fol1"), Slice("bar1b"))); + ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0b"))); + ASSERT_OK(txn0->Put(Slice("foo1"), Slice("bar1b"))); + ASSERT_OK(txn0->Prepare()); + delete txn0; + // This will check the asserts inside recovery code + ASSERT_OK(db->FlushWAL(true)); + // Flush only cf 1 + ASSERT_OK(static_cast_with_check(db->GetRootDB()) + ->TEST_FlushMemTable(true, 
false, handles[1])); + reinterpret_cast(db)->TEST_Crash(); + ASSERT_OK(ReOpenNoDelete(cfds, &handles)); + txn0 = db->GetTransactionByName("xid"); + ASSERT_TRUE(txn0 != nullptr); + ASSERT_OK(txn0->Commit()); + delete txn0; + pinnable_val.Reset(); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar0b")); + pinnable_val.Reset(); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo1", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar1b")); + pinnable_val.Reset(); + s = db->Get(ropt, handles[1], "foo0", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar0b")); + pinnable_val.Reset(); + s = db->Get(ropt, handles[1], "fol1", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar1b")); + + // one duplicate with ::Put + txn0 = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn0->SetName("xid")); + ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey0"), Slice("bar0c"))); + ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey1"), Slice("bar1d"))); + ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0c"))); + ASSERT_OK(txn0->Put(Slice("foo1"), Slice("bar1c"))); + ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0d"))); + ASSERT_OK(txn0->Prepare()); + delete txn0; + // This will check the asserts inside recovery code + ASSERT_OK(db->FlushWAL(true)); + // Flush only cf 1 + ASSERT_OK(static_cast_with_check(db->GetRootDB()) + ->TEST_FlushMemTable(true, false, handles[1])); + reinterpret_cast(db)->TEST_Crash(); + ASSERT_OK(ReOpenNoDelete(cfds, &handles)); + txn0 = db->GetTransactionByName("xid"); + ASSERT_TRUE(txn0 != nullptr); + ASSERT_OK(txn0->Commit()); + delete txn0; + pinnable_val.Reset(); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar0d")); + pinnable_val.Reset(); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo1", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar1c")); + pinnable_val.Reset(); + s = db->Get(ropt, handles[1], "key-nonkey2", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar1d")); + + // Duplicate with ::Put, ::Delete + txn0 = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn0->SetName("xid")); + ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey0"), Slice("bar0e"))); + ASSERT_OK(txn0->Delete(handles[1], Slice("key-nonkey1"))); + ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0e"))); + ASSERT_OK(txn0->Delete(Slice("foo0"))); + ASSERT_OK(txn0->Prepare()); + delete txn0; + // This will check the asserts inside recovery code + ASSERT_OK(db->FlushWAL(true)); + // Flush only cf 1 + ASSERT_OK(static_cast_with_check(db->GetRootDB()) + ->TEST_FlushMemTable(true, false, handles[1])); + reinterpret_cast(db)->TEST_Crash(); + ASSERT_OK(ReOpenNoDelete(cfds, &handles)); + txn0 = db->GetTransactionByName("xid"); + ASSERT_TRUE(txn0 != nullptr); + ASSERT_OK(txn0->Commit()); + delete txn0; + pinnable_val.Reset(); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + pinnable_val.Reset(); + s = db->Get(ropt, handles[1], "key-nonkey2", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + + // Duplicate with ::Put, ::SingleDelete + txn0 = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn0->SetName("xid")); + ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey0"), Slice("bar0g"))); + ASSERT_OK(txn0->SingleDelete(handles[1], Slice("key-nonkey1"))); + ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0e"))); + 
ASSERT_OK(txn0->SingleDelete(Slice("foo0"))); + ASSERT_OK(txn0->Prepare()); + delete txn0; + // This will check the asserts inside recovery code + ASSERT_OK(db->FlushWAL(true)); + // Flush only cf 1 + ASSERT_OK(static_cast_with_check(db->GetRootDB()) + ->TEST_FlushMemTable(true, false, handles[1])); + reinterpret_cast(db)->TEST_Crash(); + ASSERT_OK(ReOpenNoDelete(cfds, &handles)); + txn0 = db->GetTransactionByName("xid"); + ASSERT_TRUE(txn0 != nullptr); + ASSERT_OK(txn0->Commit()); + delete txn0; + pinnable_val.Reset(); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + pinnable_val.Reset(); + s = db->Get(ropt, handles[1], "key-nonkey2", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + + // Duplicate with ::Put, ::Merge + txn0 = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn0->SetName("xid")); + ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey0"), Slice("bar1i"))); + ASSERT_OK(txn0->Merge(handles[1], Slice("key-nonkey1"), Slice("bar1j"))); + ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0f"))); + ASSERT_OK(txn0->Merge(Slice("foo0"), Slice("bar0g"))); + ASSERT_OK(txn0->Prepare()); + delete txn0; + // This will check the asserts inside recovery code + ASSERT_OK(db->FlushWAL(true)); + // Flush only cf 1 + ASSERT_OK(static_cast_with_check(db->GetRootDB()) + ->TEST_FlushMemTable(true, false, handles[1])); + reinterpret_cast(db)->TEST_Crash(); + ASSERT_OK(ReOpenNoDelete(cfds, &handles)); + txn0 = db->GetTransactionByName("xid"); + ASSERT_TRUE(txn0 != nullptr); + ASSERT_OK(txn0->Commit()); + delete txn0; + pinnable_val.Reset(); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar0f,bar0g")); + pinnable_val.Reset(); + s = db->Get(ropt, handles[1], "key-nonkey2", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar1i,bar1j")); + + for (auto h : handles) { + delete h; + } + delete db; + db = nullptr; + } +} + +// Test that the reseek optimization in iterators will not result in an infinite +// loop if there are too many uncommitted entries before the snapshot. 
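+// (The optimization being exercised: after max_sequential_skip_in_iterations
+// consecutive internal versions of the same user key have been skipped, the
+// iterator switches from Next() to a fresh Seek(). The prepared transaction
+// below stacks 2 * max_skip uncommitted versions of "foo1" to hit that path
+// in both directions.)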
+TEST_P(TransactionTest, ReseekOptimization) { + WriteOptions write_options; + write_options.sync = true; + write_options.disableWAL = false; + ColumnFamilyDescriptor cfd; + ASSERT_OK(db->DefaultColumnFamily()->GetDescriptor(&cfd)); + auto max_skip = cfd.options.max_sequential_skip_in_iterations; + + ASSERT_OK(db->Put(write_options, Slice("foo0"), Slice("initv"))); + + TransactionOptions txn_options; + Transaction* txn0 = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn0->SetName("xid")); + // Duplicate keys will result into separate sequence numbers in WritePrepared + // and WriteUnPrepared + for (size_t i = 0; i < 2 * max_skip; i++) { + ASSERT_OK(txn0->Put(Slice("foo1"), Slice("bar"))); + } + ASSERT_OK(txn0->Prepare()); + ASSERT_OK(db->Put(write_options, Slice("foo2"), Slice("initv"))); + + ReadOptions read_options; + // To avoid loops + read_options.max_skippable_internal_keys = 10 * max_skip; + Iterator* iter = db->NewIterator(read_options); + ASSERT_OK(iter->status()); + size_t cnt = 0; + iter->SeekToFirst(); + while (iter->Valid()) { + iter->Next(); + ASSERT_OK(iter->status()); + cnt++; + } + ASSERT_EQ(cnt, 2); + cnt = 0; + iter->SeekToLast(); + while (iter->Valid()) { + iter->Prev(); + ASSERT_OK(iter->status()); + cnt++; + } + ASSERT_EQ(cnt, 2); + delete iter; + ASSERT_OK(txn0->Rollback()); + delete txn0; +} + +// After recovery in kPointInTimeRecovery mode, the corrupted log file remains +// there. The new log files should be still read succesfully during recovery of +// the 2nd crash. +TEST_P(TransactionTest, DoubleCrashInRecovery) { + for (const bool manual_wal_flush : {false, true}) { + for (const bool write_after_recovery : {false, true}) { + options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; + options.manual_wal_flush = manual_wal_flush; + ASSERT_OK(ReOpen()); + std::string cf_name = "two"; + ColumnFamilyOptions cf_options; + ColumnFamilyHandle* cf_handle = nullptr; + ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle)); + + // Add a prepare entry to prevent the older logs from being deleted. + WriteOptions write_options; + TransactionOptions txn_options; + Transaction* txn = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn->SetName("xid")); + ASSERT_OK(txn->Put(Slice("foo-prepare"), Slice("bar-prepare"))); + ASSERT_OK(txn->Prepare()); + + FlushOptions flush_ops; + ASSERT_OK(db->Flush(flush_ops)); + // Now we have a log that cannot be deleted + + ASSERT_OK(db->Put(write_options, cf_handle, "foo1", "bar1")); + // Flush only the 2nd cf + ASSERT_OK(db->Flush(flush_ops, cf_handle)); + + // The value is large enough to be touched by the corruption we ingest + // below. + std::string large_value(400, ' '); + // key/value not touched by corruption + ASSERT_OK(db->Put(write_options, "foo2", "bar2")); + // key/value touched by corruption + ASSERT_OK(db->Put(write_options, "foo3", large_value)); + // key/value not touched by corruption + ASSERT_OK(db->Put(write_options, "foo4", "bar4")); + + ASSERT_OK(db->FlushWAL(true)); + DBImpl* db_impl = static_cast_with_check(db->GetRootDB()); + uint64_t wal_file_id = db_impl->TEST_LogfileNumber(); + std::string fname = LogFileName(dbname, wal_file_id); + reinterpret_cast(db)->TEST_Crash(); + delete txn; + delete cf_handle; + delete db; + db = nullptr; + + // Corrupt the last log file in the middle, so that it is not corrupted + // in the tail. 
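+      // (As the comment at the top of this test says: kPointInTimeRecovery
+      // stops replaying at the corruption but leaves the corrupted file in
+      // place; the second crash further down verifies that the newer, valid
+      // WAL written after it is still read back.)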
+ std::string file_content; + ASSERT_OK(ReadFileToString(env, fname, &file_content)); + file_content[400] = 'h'; + file_content[401] = 'a'; + ASSERT_OK(env->DeleteFile(fname)); + ASSERT_OK(WriteStringToFile(env, file_content, fname, true)); + + // Recover from corruption + std::vector handles; + std::vector column_families; + column_families.push_back(ColumnFamilyDescriptor(kDefaultColumnFamilyName, + ColumnFamilyOptions())); + column_families.push_back( + ColumnFamilyDescriptor("two", ColumnFamilyOptions())); + ASSERT_OK(ReOpenNoDelete(column_families, &handles)); + assert(db != nullptr); + + if (write_after_recovery) { + // Write data to the log right after the corrupted log + ASSERT_OK(db->Put(write_options, "foo5", large_value)); + } + + // Persist data written to WAL during recovery or by the last Put + ASSERT_OK(db->FlushWAL(true)); + // 2nd crash to recover while having a valid log after the corrupted one. + ASSERT_OK(ReOpenNoDelete(column_families, &handles)); + assert(db != nullptr); + txn = db->GetTransactionByName("xid"); + ASSERT_TRUE(txn != nullptr); + ASSERT_OK(txn->Commit()); + delete txn; + for (auto handle : handles) { + delete handle; + } + } + } +} + +TEST_P(TransactionTest, CommitWithoutPrepare) { + { + // skip_prepare = false. + WriteOptions write_options; + TransactionOptions txn_options; + txn_options.skip_prepare = false; + Transaction* txn = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn->Commit().IsTxnNotPrepared()); + delete txn; + } + + { + // skip_prepare = true. + WriteOptions write_options; + TransactionOptions txn_options; + txn_options.skip_prepare = true; + Transaction* txn = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn->Commit()); + delete txn; + } +} + +TEST_P(TransactionTest, OpenAndEnableU64Timestamp) { + ASSERT_OK(ReOpenNoDelete()); + + assert(db); + + const std::string test_cf_name = "test_cf"; + ColumnFamilyOptions cf_opts; + cf_opts.comparator = test::BytewiseComparatorWithU64TsWrapper(); + { + ColumnFamilyHandle* cfh = nullptr; + const Status s = db->CreateColumnFamily(cf_opts, test_cf_name, &cfh); + if (txn_db_options.write_policy == WRITE_COMMITTED) { + ASSERT_OK(s); + delete cfh; + } else { + ASSERT_TRUE(s.IsNotSupported()); + assert(!cfh); + } + } + + // Bypass transaction db layer. 
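+  // (For the non-WRITE_COMMITTED policies the TransactionDB layer refused the
+  // timestamped column family above, so it is created directly on the
+  // underlying DBImpl here; reopening through the transaction layer with that
+  // column family is then expected to fail with NotSupported.)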
+ if (txn_db_options.write_policy != WRITE_COMMITTED) { + DBImpl* db_impl = static_cast_with_check(db->GetRootDB()); + assert(db_impl); + ColumnFamilyHandle* cfh = nullptr; + ASSERT_OK(db_impl->CreateColumnFamily(cf_opts, test_cf_name, &cfh)); + delete cfh; + } + + { + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + cf_descs.emplace_back(test_cf_name, cf_opts); + std::vector handles; + const Status s = ReOpenNoDelete(cf_descs, &handles); + if (txn_db_options.write_policy == WRITE_COMMITTED) { + ASSERT_OK(s); + for (auto* h : handles) { + delete h; + } + } else { + ASSERT_TRUE(s.IsNotSupported()); + } + } +} + +TEST_P(TransactionTest, OpenAndEnableU32Timestamp) { + class DummyComparatorWithU32Ts : public Comparator { + public: + DummyComparatorWithU32Ts() : Comparator(sizeof(uint32_t)) {} + const char* Name() const override { return "DummyComparatorWithU32Ts"; } + void FindShortSuccessor(std::string*) const override {} + void FindShortestSeparator(std::string*, const Slice&) const override {} + int Compare(const Slice&, const Slice&) const override { return 0; } + }; + + std::unique_ptr dummy_ucmp(new DummyComparatorWithU32Ts()); + + ASSERT_OK(ReOpenNoDelete()); + + assert(db); + + const std::string test_cf_name = "test_cf"; + + ColumnFamilyOptions cf_opts; + cf_opts.comparator = dummy_ucmp.get(); + { + ColumnFamilyHandle* cfh = nullptr; + ASSERT_TRUE(db->CreateColumnFamily(cf_opts, test_cf_name, &cfh) + .IsInvalidArgument()); + } + + // Bypass transaction db layer. + { + ColumnFamilyHandle* cfh = nullptr; + DBImpl* db_impl = static_cast_with_check(db->GetRootDB()); + assert(db_impl); + ASSERT_OK(db_impl->CreateColumnFamily(cf_opts, test_cf_name, &cfh)); + delete cfh; + } + + { + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + cf_descs.emplace_back(test_cf_name, cf_opts); + std::vector handles; + ASSERT_TRUE(ReOpenNoDelete(cf_descs, &handles).IsInvalidArgument()); + } +} + +TEST_P(TransactionTest, WriteWithBulkCreatedColumnFamilies) { + ColumnFamilyOptions cf_options; + WriteOptions write_options; + + std::vector cf_names; + std::vector cf_handles; + + cf_names.push_back("test_cf"); + + ASSERT_OK(db->CreateColumnFamilies(cf_options, cf_names, &cf_handles)); + ASSERT_OK(db->Put(write_options, cf_handles[0], "foo", "bar")); + ASSERT_OK(db->DropColumnFamilies(cf_handles)); + + for (auto* h : cf_handles) { + delete h; + } + cf_handles.clear(); + + std::vector cf_descriptors; + + cf_descriptors.emplace_back("test_cf", ColumnFamilyOptions()); + + ASSERT_OK(db->CreateColumnFamilies(cf_options, cf_names, &cf_handles)); + ASSERT_OK(db->Put(write_options, cf_handles[0], "foo", "bar")); + ASSERT_OK(db->DropColumnFamilies(cf_handles)); + for (auto* h : cf_handles) { + delete h; + } + cf_handles.clear(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "SKIPPED as Transactions are not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/transaction_test.h b/src/rocksdb/utilities/transactions/transaction_test.h new file mode 100644 index 000000000..0b86453a4 --- /dev/null +++ b/src/rocksdb/utilities/transactions/transaction_test.h @@ -0,0 +1,578 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include +#include + +#include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" +#include "port/port.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/transaction_db.h" +#include "table/mock_table.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "test_util/transaction_test_util.h" +#include "util/random.h" +#include "util/string_util.h" +#include "utilities/fault_injection_env.h" +#include "utilities/merge_operators.h" +#include "utilities/merge_operators/string_append/stringappend.h" +#include "utilities/transactions/pessimistic_transaction_db.h" +#include "utilities/transactions/write_unprepared_txn_db.h" + +namespace ROCKSDB_NAMESPACE { + +// Return true if the ith bit is set in combination represented by comb +bool IsInCombination(size_t i, size_t comb) { return comb & (size_t(1) << i); } + +enum WriteOrdering : bool { kOrderedWrite, kUnorderedWrite }; + +class TransactionTestBase : public ::testing::Test { + public: + TransactionDB* db; + SpecialEnv special_env; + FaultInjectionTestEnv* env; + std::string dbname; + Options options; + + TransactionDBOptions txn_db_options; + bool use_stackable_db_; + + TransactionTestBase(bool use_stackable_db, bool two_write_queue, + TxnDBWritePolicy write_policy, + WriteOrdering write_ordering) + : db(nullptr), + special_env(Env::Default()), + env(nullptr), + use_stackable_db_(use_stackable_db) { + options.create_if_missing = true; + options.max_write_buffer_number = 2; + options.write_buffer_size = 4 * 1024; + options.unordered_write = write_ordering == kUnorderedWrite; + options.level0_file_num_compaction_trigger = 2; + options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); + special_env.skip_fsync_ = true; + env = new FaultInjectionTestEnv(&special_env); + options.env = env; + options.two_write_queues = two_write_queue; + dbname = test::PerThreadDBPath("transaction_testdb"); + + EXPECT_OK(DestroyDB(dbname, options)); + txn_db_options.transaction_lock_timeout = 0; + txn_db_options.default_lock_timeout = 0; + txn_db_options.write_policy = write_policy; + txn_db_options.rollback_merge_operands = true; + // This will stress write unprepared, by forcing write batch flush on every + // write. + txn_db_options.default_write_batch_flush_threshold = 1; + // Write unprepared requires all transactions to be named. This setting + // autogenerates the name so that existing tests can pass. + txn_db_options.autogenerate_name = true; + Status s; + if (use_stackable_db == false) { + s = TransactionDB::Open(options, txn_db_options, dbname, &db); + } else { + s = OpenWithStackableDB(); + } + EXPECT_OK(s); + } + + ~TransactionTestBase() { + delete db; + db = nullptr; + // This is to skip the assert statement in FaultInjectionTestEnv. There + // seems to be a bug in btrfs that the makes readdir return recently + // unlink-ed files. By using the default fs we simply ignore errors resulted + // from attempting to delete such files in DestroyDB. 
+ if (getenv("KEEP_DB") == nullptr) { + options.env = Env::Default(); + EXPECT_OK(DestroyDB(dbname, options)); + } else { + fprintf(stdout, "db is still in %s\n", dbname.c_str()); + } + delete env; + } + + Status ReOpenNoDelete() { + delete db; + db = nullptr; + env->AssertNoOpenFile(); + env->DropUnsyncedFileData(); + env->ResetState(); + Status s; + if (use_stackable_db_ == false) { + s = TransactionDB::Open(options, txn_db_options, dbname, &db); + } else { + s = OpenWithStackableDB(); + } + assert(!s.ok() || db != nullptr); + return s; + } + + Status ReOpenNoDelete(std::vector& cfs, + std::vector* handles) { + for (auto h : *handles) { + delete h; + } + handles->clear(); + delete db; + db = nullptr; + env->AssertNoOpenFile(); + env->DropUnsyncedFileData(); + env->ResetState(); + Status s; + if (use_stackable_db_ == false) { + s = TransactionDB::Open(options, txn_db_options, dbname, cfs, handles, + &db); + } else { + s = OpenWithStackableDB(cfs, handles); + } + assert(!s.ok() || db != nullptr); + return s; + } + + Status ReOpen() { + delete db; + db = nullptr; + DestroyDB(dbname, options); + Status s; + if (use_stackable_db_ == false) { + s = TransactionDB::Open(options, txn_db_options, dbname, &db); + } else { + s = OpenWithStackableDB(); + } + assert(db != nullptr); + return s; + } + + Status OpenWithStackableDB(std::vector& cfs, + std::vector* handles) { + std::vector compaction_enabled_cf_indices; + TransactionDB::PrepareWrap(&options, &cfs, &compaction_enabled_cf_indices); + DB* root_db = nullptr; + Options options_copy(options); + const bool use_seq_per_batch = + txn_db_options.write_policy == WRITE_PREPARED || + txn_db_options.write_policy == WRITE_UNPREPARED; + const bool use_batch_per_txn = + txn_db_options.write_policy == WRITE_COMMITTED || + txn_db_options.write_policy == WRITE_PREPARED; + Status s = DBImpl::Open(options_copy, dbname, cfs, handles, &root_db, + use_seq_per_batch, use_batch_per_txn); + auto stackable_db = std::make_unique(root_db); + if (s.ok()) { + assert(root_db != nullptr); + // If WrapStackableDB() returns non-ok, then stackable_db is already + // deleted within WrapStackableDB(). 
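+      // (Hence ownership is handed over with release() below rather than
+      // letting the unique_ptr delete the StackableDB on scope exit.)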
+ s = TransactionDB::WrapStackableDB(stackable_db.release(), txn_db_options, + compaction_enabled_cf_indices, + *handles, &db); + } + return s; + } + + Status OpenWithStackableDB() { + std::vector compaction_enabled_cf_indices; + std::vector column_families{ColumnFamilyDescriptor( + kDefaultColumnFamilyName, ColumnFamilyOptions(options))}; + + TransactionDB::PrepareWrap(&options, &column_families, + &compaction_enabled_cf_indices); + std::vector handles; + DB* root_db = nullptr; + Options options_copy(options); + const bool use_seq_per_batch = + txn_db_options.write_policy == WRITE_PREPARED || + txn_db_options.write_policy == WRITE_UNPREPARED; + const bool use_batch_per_txn = + txn_db_options.write_policy == WRITE_COMMITTED || + txn_db_options.write_policy == WRITE_PREPARED; + Status s = DBImpl::Open(options_copy, dbname, column_families, &handles, + &root_db, use_seq_per_batch, use_batch_per_txn); + if (!s.ok()) { + delete root_db; + return s; + } + StackableDB* stackable_db = new StackableDB(root_db); + assert(root_db != nullptr); + assert(handles.size() == 1); + s = TransactionDB::WrapStackableDB(stackable_db, txn_db_options, + compaction_enabled_cf_indices, handles, + &db); + delete handles[0]; + if (!s.ok()) { + delete stackable_db; + } + return s; + } + + std::atomic linked = {0}; + std::atomic exp_seq = {0}; + std::atomic commit_writes = {0}; + std::atomic expected_commits = {0}; + // Without Prepare, the commit does not write to WAL + std::atomic with_empty_commits = {0}; + void TestTxn0(size_t index) { + // Test DB's internal txn. It involves no prepare phase nor a commit marker. + auto s = db->Put(WriteOptions(), "key" + std::to_string(index), "value"); + ASSERT_OK(s); + if (txn_db_options.write_policy == TxnDBWritePolicy::WRITE_COMMITTED) { + // Consume one seq per key + exp_seq++; + } else { + // Consume one seq per batch + exp_seq++; + if (options.two_write_queues) { + // Consume one seq for commit + exp_seq++; + } + } + with_empty_commits++; + } + + void TestTxn1(size_t index) { + // Testing directly writing a write batch. Functionality-wise it is + // equivalent to commit without prepare. + WriteBatch wb; + auto istr = std::to_string(index); + ASSERT_OK(wb.Put("k1" + istr, "v1")); + ASSERT_OK(wb.Put("k2" + istr, "v2")); + ASSERT_OK(wb.Put("k3" + istr, "v3")); + auto s = db->Write(WriteOptions(), &wb); + if (txn_db_options.write_policy == TxnDBWritePolicy::WRITE_COMMITTED) { + // Consume one seq per key + exp_seq += 3; + } else { + // Consume one seq per batch + exp_seq++; + if (options.two_write_queues) { + // Consume one seq for commit + exp_seq++; + } + } + ASSERT_OK(s); + with_empty_commits++; + } + + void TestTxn2(size_t index) { + // Commit without prepare. It should write to DB without a commit marker. 
+ Transaction* txn = + db->BeginTransaction(WriteOptions(), TransactionOptions()); + auto istr = std::to_string(index); + ASSERT_OK(txn->SetName("xid" + istr)); + ASSERT_OK(txn->Put(Slice("foo" + istr), Slice("bar"))); + ASSERT_OK(txn->Put(Slice("foo2" + istr), Slice("bar2"))); + ASSERT_OK(txn->Put(Slice("foo3" + istr), Slice("bar3"))); + ASSERT_OK(txn->Put(Slice("foo4" + istr), Slice("bar4"))); + ASSERT_OK(txn->Commit()); + if (txn_db_options.write_policy == TxnDBWritePolicy::WRITE_COMMITTED) { + // Consume one seq per key + exp_seq += 4; + } else if (txn_db_options.write_policy == + TxnDBWritePolicy::WRITE_PREPARED) { + // Consume one seq per batch + exp_seq++; + if (options.two_write_queues) { + // Consume one seq for commit + exp_seq++; + } + } else { + // Flushed after each key, consume one seq per flushed batch + exp_seq += 4; + // WriteUnprepared implements CommitWithoutPrepareInternal by simply + // calling Prepare then Commit. Consume one seq for the prepare. + exp_seq++; + } + delete txn; + with_empty_commits++; + } + + void TestTxn3(size_t index) { + // A full 2pc txn that also involves a commit marker. + Transaction* txn = + db->BeginTransaction(WriteOptions(), TransactionOptions()); + auto istr = std::to_string(index); + ASSERT_OK(txn->SetName("xid" + istr)); + ASSERT_OK(txn->Put(Slice("foo" + istr), Slice("bar"))); + ASSERT_OK(txn->Put(Slice("foo2" + istr), Slice("bar2"))); + ASSERT_OK(txn->Put(Slice("foo3" + istr), Slice("bar3"))); + ASSERT_OK(txn->Put(Slice("foo4" + istr), Slice("bar4"))); + ASSERT_OK(txn->Put(Slice("foo5" + istr), Slice("bar5"))); + expected_commits++; + ASSERT_OK(txn->Prepare()); + commit_writes++; + ASSERT_OK(txn->Commit()); + if (txn_db_options.write_policy == TxnDBWritePolicy::WRITE_COMMITTED) { + // Consume one seq per key + exp_seq += 5; + } else if (txn_db_options.write_policy == + TxnDBWritePolicy::WRITE_PREPARED) { + // Consume one seq per batch + exp_seq++; + // Consume one seq per commit marker + exp_seq++; + } else { + // Flushed after each key, consume one seq per flushed batch + exp_seq += 5; + // Consume one seq per commit marker + exp_seq++; + } + delete txn; + } + + void TestTxn4(size_t index) { + // A full 2pc txn that also involves a commit marker. 
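+ // Editorial note: unlike TestTxn3 above, this transaction prepares and then
+ // rolls back. As a tally of the expectations encoded below, with five Put()s
+ // the sequence number is expected to advance by:
+ //   WRITE_COMMITTED : 0            (the rollback only drops the txn buffer)
+ //   WRITE_PREPARED  : 1 + 1 (+1)   (prepared batch, rollback batch, and one
+ //                                   rollback-commit seq with two_write_queues)
+ //   WRITE_UNPREPARED: 5 + 1 (+1)   (one per flushed key, then the same
+ //                                   rollback batch / rollback-commit seqs)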
+ Transaction* txn = + db->BeginTransaction(WriteOptions(), TransactionOptions()); + auto istr = std::to_string(index); + ASSERT_OK(txn->SetName("xid" + istr)); + ASSERT_OK(txn->Put(Slice("foo" + istr), Slice("bar"))); + ASSERT_OK(txn->Put(Slice("foo2" + istr), Slice("bar2"))); + ASSERT_OK(txn->Put(Slice("foo3" + istr), Slice("bar3"))); + ASSERT_OK(txn->Put(Slice("foo4" + istr), Slice("bar4"))); + ASSERT_OK(txn->Put(Slice("foo5" + istr), Slice("bar5"))); + expected_commits++; + ASSERT_OK(txn->Prepare()); + commit_writes++; + ASSERT_OK(txn->Rollback()); + if (txn_db_options.write_policy == TxnDBWritePolicy::WRITE_COMMITTED) { + // No seq is consumed for deleting the txn buffer + exp_seq += 0; + } else if (txn_db_options.write_policy == + TxnDBWritePolicy::WRITE_PREPARED) { + // Consume one seq per batch + exp_seq++; + // Consume one seq per rollback batch + exp_seq++; + if (options.two_write_queues) { + // Consume one seq for rollback commit + exp_seq++; + } + } else { + // Flushed after each key, consume one seq per flushed batch + exp_seq += 5; + // Consume one seq per rollback batch + exp_seq++; + if (options.two_write_queues) { + // Consume one seq for rollback commit + exp_seq++; + } + } + delete txn; + } + + // Test that we can change write policy after a clean shutdown (which would + // empty the WAL) + void CrossCompatibilityTest(TxnDBWritePolicy from_policy, + TxnDBWritePolicy to_policy, bool empty_wal) { + TransactionOptions txn_options; + ReadOptions read_options; + WriteOptions write_options; + uint32_t index = 0; + Random rnd(1103); + options.write_buffer_size = 1024; // To create more sst files + std::unordered_map committed_kvs; + Transaction* txn; + + txn_db_options.write_policy = from_policy; + if (txn_db_options.write_policy == WRITE_COMMITTED) { + options.unordered_write = false; + } + ASSERT_OK(ReOpen()); + + for (int i = 0; i < 1024; i++) { + auto istr = std::to_string(index); + auto k = Slice("foo-" + istr).ToString(); + auto v = Slice("bar-" + istr).ToString(); + // For test the duplicate keys + auto v2 = Slice("bar2-" + istr).ToString(); + auto type = rnd.Uniform(4); + switch (type) { + case 0: + committed_kvs[k] = v; + ASSERT_OK(db->Put(write_options, k, v)); + committed_kvs[k] = v2; + ASSERT_OK(db->Put(write_options, k, v2)); + break; + case 1: { + WriteBatch wb; + committed_kvs[k] = v; + ASSERT_OK(wb.Put(k, v)); + committed_kvs[k] = v2; + ASSERT_OK(wb.Put(k, v2)); + ASSERT_OK(db->Write(write_options, &wb)); + + } break; + case 2: + case 3: + txn = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn->SetName("xid" + istr)); + committed_kvs[k] = v; + ASSERT_OK(txn->Put(k, v)); + committed_kvs[k] = v2; + ASSERT_OK(txn->Put(k, v2)); + + if (type == 3) { + ASSERT_OK(txn->Prepare()); + } + ASSERT_OK(txn->Commit()); + delete txn; + break; + default: + FAIL(); + } + + index++; + } // for i + + txn_db_options.write_policy = to_policy; + if (txn_db_options.write_policy == WRITE_COMMITTED) { + options.unordered_write = false; + } + auto db_impl = static_cast_with_check(db->GetRootDB()); + // Before upgrade/downgrade the WAL must be emptied + if (empty_wal) { + ASSERT_OK(db_impl->TEST_FlushMemTable()); + } else { + ASSERT_OK(db_impl->FlushWAL(true)); + } + auto s = ReOpenNoDelete(); + if (empty_wal) { + ASSERT_OK(s); + } else { + // Test that we can detect the WAL that is produced by an incompatible + // WritePolicy and fail fast before mis-interpreting the WAL. 
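+ // Editorial sketch of the policy-change procedure exercised above
+ // (illustrative; the calls are the same ones used earlier in this test):
+ //   txn_db_options.write_policy = to_policy;
+ //   ASSERT_OK(db_impl->TEST_FlushMemTable());  // empties the WAL
+ //   ASSERT_OK(ReOpenNoDelete());
+ // Reopening over a non-empty WAL written under a different write policy is
+ // instead expected to fail fast with Status::NotSupported(), as asserted
+ // right below.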
+ ASSERT_TRUE(s.IsNotSupported()); + return; + } + db_impl = static_cast_with_check(db->GetRootDB()); + // Check that WAL is empty + VectorLogPtr log_files; + ASSERT_OK(db_impl->GetSortedWalFiles(log_files)); + ASSERT_EQ(0, log_files.size()); + + for (auto& kv : committed_kvs) { + std::string value; + s = db->Get(read_options, kv.first, &value); + if (s.IsNotFound()) { + printf("key = %s\n", kv.first.c_str()); + } + ASSERT_OK(s); + if (kv.second != value) { + printf("key = %s\n", kv.first.c_str()); + } + ASSERT_EQ(kv.second, value); + } + } +}; + +class TransactionTest + : public TransactionTestBase, + virtual public ::testing::WithParamInterface< + std::tuple> { + public: + TransactionTest() + : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()), + std::get<2>(GetParam()), std::get<3>(GetParam())){}; +}; + +class TransactionStressTest : public TransactionTest {}; + +class MySQLStyleTransactionTest + : public TransactionTestBase, + virtual public ::testing::WithParamInterface< + std::tuple> { + public: + MySQLStyleTransactionTest() + : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()), + std::get<2>(GetParam()), std::get<3>(GetParam())), + with_slow_threads_(std::get<4>(GetParam())) { + if (with_slow_threads_ && + (txn_db_options.write_policy == WRITE_PREPARED || + txn_db_options.write_policy == WRITE_UNPREPARED)) { + // The corner case with slow threads involves the caches filling + // over which would not happen even with artifial delays. To help + // such cases to show up we lower the size of the cache-related data + // structures. + txn_db_options.wp_snapshot_cache_bits = 1; + txn_db_options.wp_commit_cache_bits = 10; + options.write_buffer_size = 1024; + EXPECT_OK(ReOpen()); + } + }; + + protected: + // Also emulate slow threads by addin artiftial delays + const bool with_slow_threads_; +}; + +class WriteCommittedTxnWithTsTest + : public TransactionTestBase, + public ::testing::WithParamInterface> { + public: + WriteCommittedTxnWithTsTest() + : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()), + WRITE_COMMITTED, kOrderedWrite) {} + ~WriteCommittedTxnWithTsTest() override { + for (auto* h : handles_) { + delete h; + } + } + + Status GetFromDb(ReadOptions read_opts, ColumnFamilyHandle* column_family, + const Slice& key, TxnTimestamp ts, std::string* value) { + std::string ts_buf; + PutFixed64(&ts_buf, ts); + Slice ts_slc = ts_buf; + read_opts.timestamp = &ts_slc; + assert(db); + return db->Get(read_opts, column_family, key, value); + } + + Transaction* NewTxn(WriteOptions write_opts, TransactionOptions txn_opts) { + assert(db); + auto* txn = db->BeginTransaction(write_opts, txn_opts); + assert(txn); + const bool enable_indexing = std::get<2>(GetParam()); + if (enable_indexing) { + txn->EnableIndexing(); + } else { + txn->DisableIndexing(); + } + return txn; + } + + protected: + std::vector handles_{}; +}; + +class TimestampedSnapshotWithTsSanityCheck + : public TransactionTestBase, + public ::testing::WithParamInterface< + std::tuple> { + public: + explicit TimestampedSnapshotWithTsSanityCheck() + : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()), + std::get<2>(GetParam()), std::get<3>(GetParam())) {} + ~TimestampedSnapshotWithTsSanityCheck() override { + for (auto* h : handles_) { + delete h; + } + } + + protected: + std::vector handles_{}; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/transactions/transaction_util.cc b/src/rocksdb/utilities/transactions/transaction_util.cc new 
file mode 100644 index 000000000..360edc8ec --- /dev/null +++ b/src/rocksdb/utilities/transactions/transaction_util.cc @@ -0,0 +1,206 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "utilities/transactions/transaction_util.h" + +#include +#include +#include + +#include "db/db_impl/db_impl.h" +#include "rocksdb/status.h" +#include "rocksdb/utilities/write_batch_with_index.h" +#include "util/cast_util.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +Status TransactionUtil::CheckKeyForConflicts( + DBImpl* db_impl, ColumnFamilyHandle* column_family, const std::string& key, + SequenceNumber snap_seq, const std::string* const read_ts, bool cache_only, + ReadCallback* snap_checker, SequenceNumber min_uncommitted) { + Status result; + + auto cfh = static_cast_with_check(column_family); + auto cfd = cfh->cfd(); + SuperVersion* sv = db_impl->GetAndRefSuperVersion(cfd); + + if (sv == nullptr) { + result = Status::InvalidArgument("Could not access column family " + + cfh->GetName()); + } + + if (result.ok()) { + SequenceNumber earliest_seq = + db_impl->GetEarliestMemTableSequenceNumber(sv, true); + + result = CheckKey(db_impl, sv, earliest_seq, snap_seq, key, read_ts, + cache_only, snap_checker, min_uncommitted); + + db_impl->ReturnAndCleanupSuperVersion(cfd, sv); + } + + return result; +} + +Status TransactionUtil::CheckKey(DBImpl* db_impl, SuperVersion* sv, + SequenceNumber earliest_seq, + SequenceNumber snap_seq, + const std::string& key, + const std::string* const read_ts, + bool cache_only, ReadCallback* snap_checker, + SequenceNumber min_uncommitted) { + // When `min_uncommitted` is provided, keys are not always committed + // in sequence number order, and `snap_checker` is used to check whether + // specific sequence number is in the database is visible to the transaction. + // So `snap_checker` must be provided. + assert(min_uncommitted == kMaxSequenceNumber || snap_checker != nullptr); + + Status result; + bool need_to_read_sst = false; + + // Since it would be too slow to check the SST files, we will only use + // the memtables to check whether there have been any recent writes + // to this key after it was accessed in this transaction. But if the + // Memtables do not contain a long enough history, we must fail the + // transaction. + if (earliest_seq == kMaxSequenceNumber) { + // The age of this memtable is unknown. Cannot rely on it to check + // for recent writes. This error shouldn't happen often in practice as + // the Memtable should have a valid earliest sequence number except in some + // corner cases (such as error cases during recovery). + need_to_read_sst = true; + + if (cache_only) { + result = Status::TryAgain( + "Transaction could not check for conflicts as the MemTable does not " + "contain a long enough history to check write at SequenceNumber: ", + std::to_string(snap_seq)); + } + } else if (snap_seq < earliest_seq || min_uncommitted <= earliest_seq) { + // Use <= for min_uncommitted since earliest_seq is actually the largest sec + // before this memtable was created + need_to_read_sst = true; + + if (cache_only) { + // The age of this memtable is too new to use to check for recent + // writes. 
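+ // Editorial summary of the decision so far (illustrative): the memtable
+ // history is sufficient only when
+ //   earliest_seq != kMaxSequenceNumber  and
+ //   snap_seq >= earliest_seq            and
+ //   min_uncommitted > earliest_seq.
+ // In every other case the SST files would have to be consulted, and with
+ // cache_only == true that degrades to Status::TryAgain() because the
+ // memtables alone cannot prove the absence of a conflicting write.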
+ char msg[300]; + snprintf(msg, sizeof(msg), + "Transaction could not check for conflicts for operation at " + "SequenceNumber %" PRIu64 + " as the MemTable only contains changes newer than " + "SequenceNumber %" PRIu64 + ". Increasing the value of the " + "max_write_buffer_size_to_maintain option could reduce the " + "frequency " + "of this error.", + snap_seq, earliest_seq); + result = Status::TryAgain(msg); + } + } + + if (result.ok()) { + SequenceNumber seq = kMaxSequenceNumber; + std::string timestamp; + bool found_record_for_key = false; + + // When min_uncommitted == kMaxSequenceNumber, writes are committed in + // sequence number order, so only keys larger than `snap_seq` can cause + // conflict. + // When min_uncommitted != kMaxSequenceNumber, keys lower than + // min_uncommitted will not triggered conflicts, while keys larger than + // min_uncommitted might create conflicts, so we need to read them out + // from the DB, and call callback to snap_checker to determine. So only + // keys lower than min_uncommitted can be skipped. + SequenceNumber lower_bound_seq = + (min_uncommitted == kMaxSequenceNumber) ? snap_seq : min_uncommitted; + Status s = db_impl->GetLatestSequenceForKey( + sv, key, !need_to_read_sst, lower_bound_seq, &seq, + !read_ts ? nullptr : ×tamp, &found_record_for_key, + /*is_blob_index=*/nullptr); + + if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { + result = s; + } else if (found_record_for_key) { + bool write_conflict = snap_checker == nullptr + ? snap_seq < seq + : !snap_checker->IsVisible(seq); + // Perform conflict checking based on timestamp if applicable. + if (!write_conflict && read_ts != nullptr) { + ColumnFamilyData* cfd = sv->cfd; + assert(cfd); + const Comparator* const ucmp = cfd->user_comparator(); + assert(ucmp); + assert(read_ts->size() == ucmp->timestamp_size()); + assert(read_ts->size() == timestamp.size()); + // Write conflict if *ts < timestamp. + write_conflict = ucmp->CompareTimestamp(*read_ts, timestamp) < 0; + } + if (write_conflict) { + result = Status::Busy(); + } + } + } + + return result; +} + +Status TransactionUtil::CheckKeysForConflicts(DBImpl* db_impl, + const LockTracker& tracker, + bool cache_only) { + Status result; + + std::unique_ptr cf_it( + tracker.GetColumnFamilyIterator()); + assert(cf_it != nullptr); + while (cf_it->HasNext()) { + ColumnFamilyId cf = cf_it->Next(); + + SuperVersion* sv = db_impl->GetAndRefSuperVersion(cf); + if (sv == nullptr) { + result = Status::InvalidArgument("Could not access column family " + + std::to_string(cf)); + break; + } + + SequenceNumber earliest_seq = + db_impl->GetEarliestMemTableSequenceNumber(sv, true); + + // For each of the keys in this transaction, check to see if someone has + // written to this key since the start of the transaction. + std::unique_ptr key_it( + tracker.GetKeyIterator(cf)); + assert(key_it != nullptr); + while (key_it->HasNext()) { + const std::string& key = key_it->Next(); + PointLockStatus status = tracker.GetPointLockStatus(cf, key); + const SequenceNumber key_seq = status.seq; + + // TODO: support timestamp-based conflict checking. + // CheckKeysForConflicts() is currently used only by optimistic + // transactions. 
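+ // Editorial note: status.seq recorded when the key was first locked plays
+ // the role of snap_seq in CheckKey() below. For example, a key tracked at
+ // seq 100 that another writer later committed at seq 120 is expected to
+ // come back as Status::Busy(), which aborts the validation loop.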
+ result = CheckKey(db_impl, sv, earliest_seq, key_seq, key, + /*read_ts=*/nullptr, cache_only); + if (!result.ok()) { + break; + } + } + + db_impl->ReturnAndCleanupSuperVersion(cf, sv); + + if (!result.ok()) { + break; + } + } + + return result; +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/transaction_util.h b/src/rocksdb/utilities/transactions/transaction_util.h new file mode 100644 index 000000000..a349ba87a --- /dev/null +++ b/src/rocksdb/utilities/transactions/transaction_util.h @@ -0,0 +1,85 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include + +#include "db/dbformat.h" +#include "db/read_callback.h" +#include "rocksdb/db.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" +#include "utilities/transactions/lock/lock_tracker.h" + +namespace ROCKSDB_NAMESPACE { + +class DBImpl; +struct SuperVersion; +class WriteBatchWithIndex; + +class TransactionUtil { + public: + // Verifies there have been no commits to this key in the db since this + // sequence number. If user-defined timestamp is enabled, then also check + // no commits to this key in the db since the given ts. + // + // If cache_only is true, then this function will not attempt to read any + // SST files. This will make it more likely this function will + // return an error if it is unable to determine if there are any conflicts. + // + // See comment of CheckKey() for explanation of `snap_seq`, `ts`, + // `snap_checker` and `min_uncommitted`. + // + // Returns OK on success, BUSY if there is a conflicting write, or other error + // status for any unexpected errors. + static Status CheckKeyForConflicts( + DBImpl* db_impl, ColumnFamilyHandle* column_family, + const std::string& key, SequenceNumber snap_seq, + const std::string* const ts, bool cache_only, + ReadCallback* snap_checker = nullptr, + SequenceNumber min_uncommitted = kMaxSequenceNumber); + + // For each key,SequenceNumber pair tracked by the LockTracker, this function + // will verify there have been no writes to the key in the db since that + // sequence number. + // + // Returns OK on success, BUSY if there is a conflicting write, or other error + // status for any unexpected errors. + // + // REQUIRED: + // This function should only be called on the write thread or if the + // mutex is held. + // tracker must support point lock. + static Status CheckKeysForConflicts(DBImpl* db_impl, + const LockTracker& tracker, + bool cache_only); + + private: + // If `snap_checker` == nullptr, writes are always commited in sequence number + // order. All sequence number <= `snap_seq` will not conflict with any + // write, and all keys > `snap_seq` of `key` will trigger conflict. + // If `snap_checker` != nullptr, writes may not commit in sequence number + // order. In this case `min_uncommitted` is a lower bound. + // seq < `min_uncommitted`: no conflict + // seq > `snap_seq`: applicable to conflict + // `min_uncommitted` <= seq <= `snap_seq`: call `snap_checker` to determine. + // + // If user-defined timestamp is enabled, a write conflict is detected if an + // operation for `key` with timestamp greater than `ts` exists. 
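+ // Illustrative example (editorial): with snap_seq = 100 and
+ // min_uncommitted = 90, a key last written at seq 80 can never conflict, a
+ // write at seq 120 is a conflict candidate, and a write at seq 95 is
+ // resolved by asking snap_checker->IsVisible(95).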
+ static Status CheckKey(DBImpl* db_impl, SuperVersion* sv, + SequenceNumber earliest_seq, SequenceNumber snap_seq, + const std::string& key, const std::string* const ts, + bool cache_only, ReadCallback* snap_checker = nullptr, + SequenceNumber min_uncommitted = kMaxSequenceNumber); +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc b/src/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc new file mode 100644 index 000000000..94b8201f7 --- /dev/null +++ b/src/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc @@ -0,0 +1,588 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/utilities/transaction_db.h" +#include "utilities/merge_operators.h" +#ifndef ROCKSDB_LITE + +#include "test_util/testutil.h" +#include "utilities/transactions/transaction_test.h" + +namespace ROCKSDB_NAMESPACE { + +INSTANTIATE_TEST_CASE_P( + DBAsBaseDB, WriteCommittedTxnWithTsTest, + ::testing::Values(std::make_tuple(false, /*two_write_queue=*/false, + /*enable_indexing=*/false), + std::make_tuple(false, /*two_write_queue=*/true, + /*enable_indexing=*/false), + std::make_tuple(false, /*two_write_queue=*/false, + /*enable_indexing=*/true), + std::make_tuple(false, /*two_write_queue=*/true, + /*enable_indexing=*/true))); + +INSTANTIATE_TEST_CASE_P( + DBAsStackableDB, WriteCommittedTxnWithTsTest, + ::testing::Values(std::make_tuple(true, /*two_write_queue=*/false, + /*enable_indexing=*/false), + std::make_tuple(true, /*two_write_queue=*/true, + /*enable_indexing=*/false), + std::make_tuple(true, /*two_write_queue=*/false, + /*enable_indexing=*/true), + std::make_tuple(true, /*two_write_queue=*/true, + /*enable_indexing=*/true))); + +TEST_P(WriteCommittedTxnWithTsTest, SanityChecks) { + ASSERT_OK(ReOpenNoDelete()); + + ColumnFamilyOptions cf_opts; + cf_opts.comparator = test::BytewiseComparatorWithU64TsWrapper(); + const std::string test_cf_name = "test_cf"; + ColumnFamilyHandle* cfh = nullptr; + assert(db); + ASSERT_OK(db->CreateColumnFamily(cf_opts, test_cf_name, &cfh)); + delete cfh; + cfh = nullptr; + + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + cf_descs.emplace_back(test_cf_name, cf_opts); + ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_)); + + std::unique_ptr txn( + NewTxn(WriteOptions(), TransactionOptions())); + assert(txn); + ASSERT_OK(txn->Put(handles_[1], "foo", "value")); + ASSERT_TRUE(txn->Commit().IsInvalidArgument()); + + auto* pessimistic_txn = + static_cast_with_check(txn.get()); + ASSERT_TRUE( + pessimistic_txn->CommitBatch(/*batch=*/nullptr).IsInvalidArgument()); + + { + WriteBatchWithIndex* wbwi = txn->GetWriteBatch(); + assert(wbwi); + WriteBatch* wb = wbwi->GetWriteBatch(); + assert(wb); + // Write a key to the batch for nonexisting cf. 
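+ // Editorial note: column family id 10 was never created, so the Commit()
+ // a few lines below is expected to be rejected with
+ // Status::InvalidArgument() rather than applied.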
+ ASSERT_OK(WriteBatchInternal::Put(wb, /*column_family_id=*/10, /*key=*/"", + /*value=*/"")); + } + + ASSERT_OK(txn->SetCommitTimestamp(20)); + + ASSERT_TRUE(txn->Commit().IsInvalidArgument()); + txn.reset(); + + std::unique_ptr txn1( + NewTxn(WriteOptions(), TransactionOptions())); + assert(txn1); + ASSERT_OK(txn1->SetName("txn1")); + ASSERT_OK(txn1->Put(handles_[1], "foo", "value")); + { + WriteBatchWithIndex* wbwi = txn1->GetWriteBatch(); + assert(wbwi); + WriteBatch* wb = wbwi->GetWriteBatch(); + assert(wb); + // Write a key to the batch for non-existing cf. + ASSERT_OK(WriteBatchInternal::Put(wb, /*column_family_id=*/10, /*key=*/"", + /*value=*/"")); + } + ASSERT_OK(txn1->Prepare()); + ASSERT_OK(txn1->SetCommitTimestamp(21)); + ASSERT_TRUE(txn1->Commit().IsInvalidArgument()); + txn1.reset(); +} + +TEST_P(WriteCommittedTxnWithTsTest, ReOpenWithTimestamp) { + options.merge_operator = MergeOperators::CreateUInt64AddOperator(); + ASSERT_OK(ReOpenNoDelete()); + + ColumnFamilyOptions cf_opts; + cf_opts.comparator = test::BytewiseComparatorWithU64TsWrapper(); + const std::string test_cf_name = "test_cf"; + ColumnFamilyHandle* cfh = nullptr; + assert(db); + ASSERT_OK(db->CreateColumnFamily(cf_opts, test_cf_name, &cfh)); + delete cfh; + cfh = nullptr; + + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + cf_descs.emplace_back(test_cf_name, cf_opts); + ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_)); + + std::unique_ptr txn0( + NewTxn(WriteOptions(), TransactionOptions())); + assert(txn0); + ASSERT_OK(txn0->Put(handles_[1], "foo", "value")); + ASSERT_OK(txn0->SetName("txn0")); + ASSERT_OK(txn0->Prepare()); + ASSERT_TRUE(txn0->Commit().IsInvalidArgument()); + txn0.reset(); + + std::unique_ptr txn1( + NewTxn(WriteOptions(), TransactionOptions())); + assert(txn1); + ASSERT_OK(txn1->Put(handles_[1], "foo", "value1")); + { + std::string buf; + PutFixed64(&buf, 23); + ASSERT_OK(txn1->Put("id", buf)); + ASSERT_OK(txn1->Merge("id", buf)); + } + ASSERT_OK(txn1->SetName("txn1")); + ASSERT_OK(txn1->Prepare()); + ASSERT_OK(txn1->SetCommitTimestamp(/*ts=*/23)); + ASSERT_OK(txn1->Commit()); + txn1.reset(); + + { + std::string value; + const Status s = + GetFromDb(ReadOptions(), handles_[1], "foo", /*ts=*/23, &value); + ASSERT_OK(s); + ASSERT_EQ("value1", value); + } + + { + std::string value; + const Status s = db->Get(ReadOptions(), handles_[0], "id", &value); + ASSERT_OK(s); + uint64_t ival = 0; + Slice value_slc = value; + bool result = GetFixed64(&value_slc, &ival); + assert(result); + ASSERT_EQ(46, ival); + } +} + +TEST_P(WriteCommittedTxnWithTsTest, RecoverFromWal) { + ASSERT_OK(ReOpenNoDelete()); + + ColumnFamilyOptions cf_opts; + cf_opts.comparator = test::BytewiseComparatorWithU64TsWrapper(); + const std::string test_cf_name = "test_cf"; + ColumnFamilyHandle* cfh = nullptr; + assert(db); + ASSERT_OK(db->CreateColumnFamily(cf_opts, test_cf_name, &cfh)); + delete cfh; + cfh = nullptr; + + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + cf_descs.emplace_back(test_cf_name, cf_opts); + options.avoid_flush_during_shutdown = true; + ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_)); + + std::unique_ptr txn0( + NewTxn(WriteOptions(), TransactionOptions())); + assert(txn0); + ASSERT_OK(txn0->Put(handles_[1], "foo", "foo_value")); + ASSERT_OK(txn0->SetName("txn0")); + ASSERT_OK(txn0->Prepare()); + + WriteOptions write_opts; + write_opts.sync = true; + std::unique_ptr txn1(NewTxn(write_opts, TransactionOptions())); + assert(txn1); + 
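+ // Editorial note: txn0 above is prepared but deliberately never committed,
+ // so after the reopen further below its key "foo" must come back NotFound,
+ // while the committed txn1/txn2 writes must be recovered from the synced
+ // WAL (write_opts.sync = true).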
ASSERT_OK(txn1->Put("bar", "bar_value_1")); + ASSERT_OK(txn1->Put(handles_[1], "bar", "bar_value_1")); + ASSERT_OK(txn1->SetName("txn1")); + ASSERT_OK(txn1->Prepare()); + ASSERT_OK(txn1->SetCommitTimestamp(/*ts=*/23)); + ASSERT_OK(txn1->Commit()); + txn1.reset(); + + std::unique_ptr txn2(NewTxn(write_opts, TransactionOptions())); + assert(txn2); + ASSERT_OK(txn2->Put("key1", "value_3")); + ASSERT_OK(txn2->Put(handles_[1], "key1", "value_3")); + ASSERT_OK(txn2->SetCommitTimestamp(/*ts=*/24)); + ASSERT_OK(txn2->Commit()); + txn2.reset(); + + txn0.reset(); + + ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_)); + + { + std::string value; + Status s = GetFromDb(ReadOptions(), handles_[1], "foo", /*ts=*/23, &value); + ASSERT_TRUE(s.IsNotFound()); + + s = db->Get(ReadOptions(), handles_[0], "bar", &value); + ASSERT_OK(s); + ASSERT_EQ("bar_value_1", value); + + value.clear(); + s = GetFromDb(ReadOptions(), handles_[1], "bar", /*ts=*/23, &value); + ASSERT_OK(s); + ASSERT_EQ("bar_value_1", value); + + s = GetFromDb(ReadOptions(), handles_[1], "key1", /*ts=*/23, &value); + ASSERT_TRUE(s.IsNotFound()); + + s = db->Get(ReadOptions(), handles_[0], "key1", &value); + ASSERT_OK(s); + ASSERT_EQ("value_3", value); + + s = GetFromDb(ReadOptions(), handles_[1], "key1", /*ts=*/24, &value); + ASSERT_OK(s); + ASSERT_EQ("value_3", value); + } +} + +TEST_P(WriteCommittedTxnWithTsTest, TransactionDbLevelApi) { + ASSERT_OK(ReOpenNoDelete()); + + ColumnFamilyOptions cf_options; + cf_options.merge_operator = MergeOperators::CreateStringAppendOperator(); + cf_options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + const std::string test_cf_name = "test_cf"; + ColumnFamilyHandle* cfh = nullptr; + assert(db); + ASSERT_OK(db->CreateColumnFamily(cf_options, test_cf_name, &cfh)); + delete cfh; + cfh = nullptr; + + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + cf_descs.emplace_back(test_cf_name, cf_options); + + ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_)); + + std::string key_str = "tes_key"; + std::string ts_str; + std::string value_str = "test_value"; + PutFixed64(&ts_str, 100); + Slice value = value_str; + + assert(db); + ASSERT_TRUE( + db->Put(WriteOptions(), handles_[1], "foo", "bar").IsNotSupported()); + ASSERT_TRUE(db->Delete(WriteOptions(), handles_[1], "foo").IsNotSupported()); + ASSERT_TRUE( + db->SingleDelete(WriteOptions(), handles_[1], "foo").IsNotSupported()); + ASSERT_TRUE( + db->Merge(WriteOptions(), handles_[1], "foo", "+1").IsNotSupported()); + WriteBatch wb1(/*reserved_bytes=*/0, /*max_bytes=*/0, + /*protection_bytes_per_key=*/0, /*default_cf_ts_sz=*/0); + ASSERT_OK(wb1.Put(handles_[1], key_str, ts_str, value)); + ASSERT_TRUE(db->Write(WriteOptions(), &wb1).IsNotSupported()); + ASSERT_TRUE(db->Write(WriteOptions(), TransactionDBWriteOptimizations(), &wb1) + .IsNotSupported()); + auto* pessimistic_txn_db = + static_cast_with_check(db); + assert(pessimistic_txn_db); + ASSERT_TRUE( + pessimistic_txn_db->WriteWithConcurrencyControl(WriteOptions(), &wb1) + .IsNotSupported()); + + ASSERT_OK(db->Put(WriteOptions(), "foo", "value")); + ASSERT_OK(db->Put(WriteOptions(), "bar", "value")); + ASSERT_OK(db->Delete(WriteOptions(), "bar")); + ASSERT_OK(db->SingleDelete(WriteOptions(), "foo")); + ASSERT_OK(db->Put(WriteOptions(), "key", "value")); + ASSERT_OK(db->Merge(WriteOptions(), "key", "_more")); + WriteBatch wb2(/*reserved_bytes=*/0, /*max_bytes=*/0, + /*protection_bytes_per_key=*/0, /*default_cf_ts_sz=*/0); + ASSERT_OK(wb2.Put(key_str, value)); + 
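+ // Editorial note: in contrast to wb1 above, wb2 only touches the default
+ // column family, which has no user-defined timestamp, so the same
+ // TransactionDB-level write entry points are expected to succeed below.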
ASSERT_OK(db->Write(WriteOptions(), &wb2)); + ASSERT_OK(db->Write(WriteOptions(), TransactionDBWriteOptimizations(), &wb2)); + ASSERT_OK( + pessimistic_txn_db->WriteWithConcurrencyControl(WriteOptions(), &wb2)); + + std::unique_ptr txn( + NewTxn(WriteOptions(), TransactionOptions())); + assert(txn); + + WriteBatch wb3(/*reserved_bytes=*/0, /*max_bytes=*/0, + /*protection_bytes_per_key=*/0, /*default_cf_ts_sz=*/0); + + ASSERT_OK(wb3.Put(handles_[1], "key", "value")); + auto* pessimistic_txn = + static_cast_with_check(txn.get()); + assert(pessimistic_txn); + ASSERT_TRUE(pessimistic_txn->CommitBatch(&wb3).IsNotSupported()); + + txn.reset(); +} + +TEST_P(WriteCommittedTxnWithTsTest, Merge) { + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + ASSERT_OK(ReOpenNoDelete()); + + ColumnFamilyOptions cf_options; + cf_options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + cf_options.merge_operator = MergeOperators::CreateStringAppendOperator(); + const std::string test_cf_name = "test_cf"; + ColumnFamilyHandle* cfh = nullptr; + assert(db); + ASSERT_OK(db->CreateColumnFamily(cf_options, test_cf_name, &cfh)); + delete cfh; + cfh = nullptr; + + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + cf_descs.emplace_back(test_cf_name, Options(DBOptions(), cf_options)); + options.avoid_flush_during_shutdown = true; + + ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_)); + + std::unique_ptr txn( + NewTxn(WriteOptions(), TransactionOptions())); + assert(txn); + ASSERT_OK(txn->Put(handles_[1], "foo", "bar")); + ASSERT_OK(txn->Merge(handles_[1], "foo", "1")); + ASSERT_OK(txn->SetCommitTimestamp(24)); + ASSERT_OK(txn->Commit()); + txn.reset(); + { + std::string value; + const Status s = + GetFromDb(ReadOptions(), handles_[1], "foo", /*ts=*/24, &value); + ASSERT_OK(s); + ASSERT_EQ("bar,1", value); + } +} + +TEST_P(WriteCommittedTxnWithTsTest, GetForUpdate) { + ASSERT_OK(ReOpenNoDelete()); + + ColumnFamilyOptions cf_options; + cf_options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + const std::string test_cf_name = "test_cf"; + ColumnFamilyHandle* cfh = nullptr; + assert(db); + ASSERT_OK(db->CreateColumnFamily(cf_options, test_cf_name, &cfh)); + delete cfh; + cfh = nullptr; + + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + cf_descs.emplace_back(test_cf_name, Options(DBOptions(), cf_options)); + options.avoid_flush_during_shutdown = true; + + ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_)); + + std::unique_ptr txn0( + NewTxn(WriteOptions(), TransactionOptions())); + + std::unique_ptr txn1( + NewTxn(WriteOptions(), TransactionOptions())); + ASSERT_OK(txn1->Put(handles_[1], "key", "value1")); + ASSERT_OK(txn1->SetCommitTimestamp(24)); + ASSERT_OK(txn1->Commit()); + txn1.reset(); + + std::string value; + ASSERT_OK(txn0->SetReadTimestampForValidation(23)); + ASSERT_TRUE( + txn0->GetForUpdate(ReadOptions(), handles_[1], "key", &value).IsBusy()); + ASSERT_OK(txn0->Rollback()); + txn0.reset(); + + std::unique_ptr txn2( + NewTxn(WriteOptions(), TransactionOptions())); + ASSERT_OK(txn2->SetReadTimestampForValidation(25)); + ASSERT_OK(txn2->GetForUpdate(ReadOptions(), handles_[1], "key", &value)); + ASSERT_OK(txn2->SetCommitTimestamp(26)); + ASSERT_OK(txn2->Commit()); + txn2.reset(); +} + +TEST_P(WriteCommittedTxnWithTsTest, BlindWrite) { + ASSERT_OK(ReOpenNoDelete()); + + ColumnFamilyOptions cf_options; + cf_options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + const std::string test_cf_name = 
"test_cf"; + ColumnFamilyHandle* cfh = nullptr; + assert(db); + ASSERT_OK(db->CreateColumnFamily(cf_options, test_cf_name, &cfh)); + delete cfh; + cfh = nullptr; + + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + cf_descs.emplace_back(test_cf_name, Options(DBOptions(), cf_options)); + options.avoid_flush_during_shutdown = true; + ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_)); + + std::unique_ptr txn0( + NewTxn(WriteOptions(), TransactionOptions())); + assert(txn0); + std::unique_ptr txn1( + NewTxn(WriteOptions(), TransactionOptions())); + assert(txn1); + + { + std::string value; + ASSERT_OK(txn0->SetReadTimestampForValidation(100)); + // Lock "key". + ASSERT_TRUE(txn0->GetForUpdate(ReadOptions(), handles_[1], "key", &value) + .IsNotFound()); + } + + ASSERT_OK(txn0->Put(handles_[1], "key", "value0")); + ASSERT_OK(txn0->SetCommitTimestamp(101)); + ASSERT_OK(txn0->Commit()); + + ASSERT_OK(txn1->Put(handles_[1], "key", "value1")); + // In reality, caller needs to ensure commit_ts of txn1 is greater than the + // commit_ts of txn0, which is true for lock-based concurrency control. + ASSERT_OK(txn1->SetCommitTimestamp(102)); + ASSERT_OK(txn1->Commit()); + + txn0.reset(); + txn1.reset(); +} + +TEST_P(WriteCommittedTxnWithTsTest, RefineReadTimestamp) { + ASSERT_OK(ReOpenNoDelete()); + + ColumnFamilyOptions cf_options; + cf_options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + const std::string test_cf_name = "test_cf"; + ColumnFamilyHandle* cfh = nullptr; + assert(db); + ASSERT_OK(db->CreateColumnFamily(cf_options, test_cf_name, &cfh)); + delete cfh; + cfh = nullptr; + + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + cf_descs.emplace_back(test_cf_name, Options(DBOptions(), cf_options)); + options.avoid_flush_during_shutdown = true; + + ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_)); + + std::unique_ptr txn0( + NewTxn(WriteOptions(), TransactionOptions())); + assert(txn0); + + std::unique_ptr txn1( + NewTxn(WriteOptions(), TransactionOptions())); + assert(txn1); + + { + ASSERT_OK(txn0->SetReadTimestampForValidation(100)); + // Lock "key0", "key1", ..., "key4". 
+ for (int i = 0; i < 5; ++i) { + std::string value; + ASSERT_TRUE(txn0->GetForUpdate(ReadOptions(), handles_[1], + "key" + std::to_string(i), &value) + .IsNotFound()); + } + } + ASSERT_OK(txn1->Put(handles_[1], "key5", "value5_0")); + ASSERT_OK(txn1->SetName("txn1")); + ASSERT_OK(txn1->Prepare()); + ASSERT_OK(txn1->SetCommitTimestamp(101)); + ASSERT_OK(txn1->Commit()); + txn1.reset(); + + { + std::string value; + ASSERT_TRUE(txn0->GetForUpdate(ReadOptions(), handles_[1], "key5", &value) + .IsBusy()); + ASSERT_OK(txn0->SetReadTimestampForValidation(102)); + ASSERT_OK(txn0->GetForUpdate(ReadOptions(), handles_[1], "key5", &value)); + ASSERT_EQ("value5_0", value); + } + + for (int i = 0; i < 6; ++i) { + ASSERT_OK(txn0->Put(handles_[1], "key" + std::to_string(i), + "value" + std::to_string(i))); + } + ASSERT_OK(txn0->SetName("txn0")); + ASSERT_OK(txn0->Prepare()); + ASSERT_OK(txn0->SetCommitTimestamp(103)); + ASSERT_OK(txn0->Commit()); + txn0.reset(); +} + +TEST_P(WriteCommittedTxnWithTsTest, CheckKeysForConflicts) { + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + ASSERT_OK(ReOpen()); + + std::unique_ptr txn1( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + assert(txn1); + + std::unique_ptr txn2( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + assert(txn2); + ASSERT_OK(txn2->Put("foo", "v0")); + ASSERT_OK(txn2->SetCommitTimestamp(10)); + ASSERT_OK(txn2->Commit()); + txn2.reset(); + + // txn1 takes a snapshot after txn2 commits. The writes of txn2 have + // a smaller seqno than txn1's snapshot, thus should not affect conflict + // checking. + txn1->SetSnapshot(); + + std::unique_ptr txn3( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + assert(txn3); + ASSERT_OK(txn3->SetReadTimestampForValidation(20)); + std::string dontcare; + ASSERT_OK(txn3->GetForUpdate(ReadOptions(), "foo", &dontcare)); + ASSERT_OK(txn3->SingleDelete("foo")); + ASSERT_OK(txn3->SetName("txn3")); + ASSERT_OK(txn3->Prepare()); + ASSERT_OK(txn3->SetCommitTimestamp(30)); + // txn3 reads at ts=20 > txn2's commit timestamp, and commits at ts=30. + // txn3 can commit successfully, leaving a tombstone with ts=30. + ASSERT_OK(txn3->Commit()); + txn3.reset(); + + bool called = false; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::GetLatestSequenceForKey:mem", [&](void* arg) { + auto* const ts_ptr = reinterpret_cast(arg); + assert(ts_ptr); + Slice ts_slc = *ts_ptr; + uint64_t last_ts = 0; + ASSERT_TRUE(GetFixed64(&ts_slc, &last_ts)); + ASSERT_EQ(30, last_ts); + called = true; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + // txn1's read timestamp is 25 < 30 (commit timestamp of txn3). Therefore, + // the tombstone written by txn3 causes the conflict checking to fail. 
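+ // Editorial sketch: this exercises the timestamp branch of
+ // TransactionUtil::CheckKey(), which reports a conflict whenever
+ // ucmp->CompareTimestamp(read_ts, committed_ts) < 0 (here 25 < 30), hence
+ // the IsBusy() expectation below and the sync-point check that the
+ // tombstone's timestamp read from the memtable is 30.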
+ ASSERT_OK(txn1->SetReadTimestampForValidation(25)); + ASSERT_TRUE(txn1->GetForUpdate(ReadOptions(), "foo", &dontcare).IsBusy()); + ASSERT_TRUE(called); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED as Transactions not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/write_prepared_transaction_test.cc b/src/rocksdb/utilities/transactions/write_prepared_transaction_test.cc new file mode 100644 index 000000000..86a9511a4 --- /dev/null +++ b/src/rocksdb/utilities/transactions/write_prepared_transaction_test.cc @@ -0,0 +1,4078 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include +#include +#include + +#include "db/db_impl/db_impl.h" +#include "db/dbformat.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/types.h" +#include "rocksdb/utilities/debug.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/transaction_db.h" +#include "table/mock_table.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "test_util/transaction_test_util.h" +#include "util/mutexlock.h" +#include "util/random.h" +#include "util/string_util.h" +#include "utilities/fault_injection_env.h" +#include "utilities/merge_operators.h" +#include "utilities/merge_operators/string_append/stringappend.h" +#include "utilities/transactions/pessimistic_transaction_db.h" +#include "utilities/transactions/transaction_test.h" +#include "utilities/transactions/write_prepared_txn_db.h" + +using std::string; + +namespace ROCKSDB_NAMESPACE { + +using CommitEntry = WritePreparedTxnDB::CommitEntry; +using CommitEntry64b = WritePreparedTxnDB::CommitEntry64b; +using CommitEntry64bFormat = WritePreparedTxnDB::CommitEntry64bFormat; + +TEST(PreparedHeap, BasicsTest) { + WritePreparedTxnDB::PreparedHeap heap; + { + MutexLock ml(heap.push_pop_mutex()); + heap.push(14l); + // Test with one element + ASSERT_EQ(14l, heap.top()); + heap.push(24l); + heap.push(34l); + // Test that old min is still on top + ASSERT_EQ(14l, heap.top()); + heap.push(44l); + heap.push(54l); + heap.push(64l); + heap.push(74l); + heap.push(84l); + } + // Test that old min is still on top + ASSERT_EQ(14l, heap.top()); + heap.erase(24l); + // Test that old min is still on top + ASSERT_EQ(14l, heap.top()); + heap.erase(14l); + // Test that the new comes to the top after multiple erase + ASSERT_EQ(34l, heap.top()); + heap.erase(34l); + // Test that the new comes to the top after single erase + ASSERT_EQ(44l, heap.top()); + heap.erase(54l); + ASSERT_EQ(44l, heap.top()); + heap.pop(); // pop 44l + // Test that the erased items are ignored after pop + ASSERT_EQ(64l, heap.top()); + heap.erase(44l); + // Test that erasing an already popped item would work + ASSERT_EQ(64l, heap.top()); + heap.erase(84l); + ASSERT_EQ(64l, heap.top()); + { + 
MutexLock ml(heap.push_pop_mutex()); + heap.push(85l); + heap.push(86l); + heap.push(87l); + heap.push(88l); + heap.push(89l); + } + heap.erase(87l); + heap.erase(85l); + heap.erase(89l); + heap.erase(86l); + heap.erase(88l); + // Test top remains the same after a random order of many erases + ASSERT_EQ(64l, heap.top()); + heap.pop(); + // Test that pop works with a series of random pending erases + ASSERT_EQ(74l, heap.top()); + ASSERT_FALSE(heap.empty()); + heap.pop(); + // Test that empty works + ASSERT_TRUE(heap.empty()); +} + +// This is a scenario reconstructed from a buggy trace. Test that the bug does +// not resurface again. +TEST(PreparedHeap, EmptyAtTheEnd) { + WritePreparedTxnDB::PreparedHeap heap; + { + MutexLock ml(heap.push_pop_mutex()); + heap.push(40l); + } + ASSERT_EQ(40l, heap.top()); + // Although not a recommended scenario, we must be resilient against erase + // without a prior push. + heap.erase(50l); + ASSERT_EQ(40l, heap.top()); + { + MutexLock ml(heap.push_pop_mutex()); + heap.push(60l); + } + ASSERT_EQ(40l, heap.top()); + + heap.erase(60l); + ASSERT_EQ(40l, heap.top()); + heap.erase(40l); + ASSERT_TRUE(heap.empty()); + + { + MutexLock ml(heap.push_pop_mutex()); + heap.push(40l); + } + ASSERT_EQ(40l, heap.top()); + heap.erase(50l); + ASSERT_EQ(40l, heap.top()); + { + MutexLock ml(heap.push_pop_mutex()); + heap.push(60l); + } + ASSERT_EQ(40l, heap.top()); + + heap.erase(40l); + // Test that the erase has not emptied the heap (we had a bug doing that) + ASSERT_FALSE(heap.empty()); + ASSERT_EQ(60l, heap.top()); + heap.erase(60l); + ASSERT_TRUE(heap.empty()); +} + +// Generate random order of PreparedHeap access and test that the heap will be +// successfully emptied at the end. +TEST(PreparedHeap, Concurrent) { + const size_t t_cnt = 10; + ROCKSDB_NAMESPACE::port::Thread t[t_cnt + 1]; + WritePreparedTxnDB::PreparedHeap heap; + port::RWMutex prepared_mutex; + std::atomic last; + + for (size_t n = 0; n < 100; n++) { + last = 0; + t[0] = ROCKSDB_NAMESPACE::port::Thread([&]() { + Random rnd(1103); + for (size_t seq = 1; seq <= t_cnt; seq++) { + // This is not recommended usage but we should be resilient against it. 
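+ // Editorial note: this writer pushes sequence numbers in increasing order
+ // under push_pop_mutex and publishes them through `last`, while the eraser
+ // threads below spin on `last` and erase under a WriteLock. Randomly
+ // skipping a push also exercises PreparedHeap's tolerance of erase() calls
+ // that never had a matching push().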
+ bool skip_push = rnd.OneIn(5); + if (!skip_push) { + MutexLock ml(heap.push_pop_mutex()); + std::this_thread::yield(); + heap.push(seq); + last.store(seq); + } + } + }); + for (size_t i = 1; i <= t_cnt; i++) { + t[i] = + ROCKSDB_NAMESPACE::port::Thread([&heap, &prepared_mutex, &last, i]() { + auto seq = i; + do { + std::this_thread::yield(); + } while (last.load() < seq); + WriteLock wl(&prepared_mutex); + heap.erase(seq); + }); + } + for (size_t i = 0; i <= t_cnt; i++) { + t[i].join(); + } + ASSERT_TRUE(heap.empty()); + } +} + +// Test that WriteBatchWithIndex correctly counts the number of sub-batches +TEST(WriteBatchWithIndex, SubBatchCnt) { + ColumnFamilyOptions cf_options; + std::string cf_name = "two"; + DB* db; + Options options; + options.create_if_missing = true; + const std::string dbname = test::PerThreadDBPath("transaction_testdb"); + EXPECT_OK(DestroyDB(dbname, options)); + ASSERT_OK(DB::Open(options, dbname, &db)); + ColumnFamilyHandle* cf_handle = nullptr; + ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle)); + WriteOptions write_options; + size_t batch_cnt = 1; + size_t save_points = 0; + std::vector batch_cnt_at; + WriteBatchWithIndex batch(db->DefaultColumnFamily()->GetComparator(), 0, true, + 0); + ASSERT_EQ(batch_cnt, batch.SubBatchCnt()); + batch_cnt_at.push_back(batch_cnt); + batch.SetSavePoint(); + save_points++; + ASSERT_OK(batch.Put(Slice("key"), Slice("value"))); + ASSERT_EQ(batch_cnt, batch.SubBatchCnt()); + batch_cnt_at.push_back(batch_cnt); + batch.SetSavePoint(); + save_points++; + ASSERT_OK(batch.Put(Slice("key2"), Slice("value2"))); + ASSERT_EQ(batch_cnt, batch.SubBatchCnt()); + // duplicate the keys + batch_cnt_at.push_back(batch_cnt); + batch.SetSavePoint(); + save_points++; + ASSERT_OK(batch.Put(Slice("key"), Slice("value3"))); + batch_cnt++; + ASSERT_EQ(batch_cnt, batch.SubBatchCnt()); + // duplicate the 2nd key. It should not be counted duplicate since a + // sub-patch is cut after the last duplicate. + batch_cnt_at.push_back(batch_cnt); + batch.SetSavePoint(); + save_points++; + ASSERT_OK(batch.Put(Slice("key2"), Slice("value4"))); + ASSERT_EQ(batch_cnt, batch.SubBatchCnt()); + // duplicate the keys but in a different cf. 
It should not be counted as + // duplicate keys + batch_cnt_at.push_back(batch_cnt); + batch.SetSavePoint(); + save_points++; + ASSERT_OK(batch.Put(cf_handle, Slice("key"), Slice("value5"))); + ASSERT_EQ(batch_cnt, batch.SubBatchCnt()); + + // Test that the number of sub-batches matches what we count with + // SubBatchCounter + std::map comparators; + comparators[0] = db->DefaultColumnFamily()->GetComparator(); + comparators[cf_handle->GetID()] = cf_handle->GetComparator(); + SubBatchCounter counter(comparators); + ASSERT_OK(batch.GetWriteBatch()->Iterate(&counter)); + ASSERT_EQ(batch_cnt, counter.BatchCount()); + + // Test that RollbackToSavePoint will properly resets the number of + // sub-batches + for (size_t i = save_points; i > 0; i--) { + ASSERT_OK(batch.RollbackToSavePoint()); + ASSERT_EQ(batch_cnt_at[i - 1], batch.SubBatchCnt()); + } + + // Test the count is right with random batches + { + const size_t TOTAL_KEYS = 20; // 20 ~= 10 to cause a few randoms + Random rnd(1131); + std::string keys[TOTAL_KEYS]; + for (size_t k = 0; k < TOTAL_KEYS; k++) { + int len = static_cast(rnd.Uniform(50)); + keys[k] = test::RandomKey(&rnd, len); + } + for (size_t i = 0; i < 1000; i++) { // 1000 random batches + WriteBatchWithIndex rndbatch(db->DefaultColumnFamily()->GetComparator(), + 0, true, 0); + for (size_t k = 0; k < 10; k++) { // 10 key per batch + size_t ki = static_cast(rnd.Uniform(TOTAL_KEYS)); + Slice key = Slice(keys[ki]); + std::string tmp = rnd.RandomString(16); + Slice value = Slice(tmp); + ASSERT_OK(rndbatch.Put(key, value)); + } + SubBatchCounter batch_counter(comparators); + ASSERT_OK(rndbatch.GetWriteBatch()->Iterate(&batch_counter)); + ASSERT_EQ(rndbatch.SubBatchCnt(), batch_counter.BatchCount()); + } + } + + delete cf_handle; + delete db; +} + +TEST(CommitEntry64b, BasicTest) { + const size_t INDEX_BITS = static_cast(21); + const size_t INDEX_SIZE = static_cast(1ull << INDEX_BITS); + const CommitEntry64bFormat FORMAT(static_cast(INDEX_BITS)); + + // zero-initialized CommitEntry64b should indicate an empty entry + CommitEntry64b empty_entry64b; + uint64_t empty_index = 11ul; + CommitEntry empty_entry; + bool ok = empty_entry64b.Parse(empty_index, &empty_entry, FORMAT); + ASSERT_FALSE(ok); + + // the zero entry is reserved for un-initialized entries + const size_t MAX_COMMIT = (1 << FORMAT.COMMIT_BITS) - 1 - 1; + // Samples over the numbers that are covered by that many index bits + std::array is = {{0, 1, INDEX_SIZE / 2 + 1, INDEX_SIZE - 1}}; + // Samples over the numbers that are covered by that many commit bits + std::array ds = {{0, 1, MAX_COMMIT / 2 + 1, MAX_COMMIT}}; + // Iterate over prepare numbers that have i) cover all bits of a sequence + // number, and ii) include some bits that fall into the range of index or + // commit bits + for (uint64_t base = 1; base < kMaxSequenceNumber; base *= 2) { + for (uint64_t i : is) { + for (uint64_t d : ds) { + uint64_t p = base + i + d; + for (uint64_t c : {p, p + d / 2, p + d}) { + uint64_t index = p % INDEX_SIZE; + CommitEntry before(p, c), after; + CommitEntry64b entry64b(before, FORMAT); + ok = entry64b.Parse(index, &after, FORMAT); + ASSERT_TRUE(ok); + if (!(before == after)) { + printf("base %" PRIu64 " i %" PRIu64 " d %" PRIu64 " p %" PRIu64 + " c %" PRIu64 " index %" PRIu64 "\n", + base, i, d, p, c, index); + } + ASSERT_EQ(before, after); + } + } + } + } +} + +class WritePreparedTxnDBMock : public WritePreparedTxnDB { + public: + WritePreparedTxnDBMock(DBImpl* db_impl, TransactionDBOptions& opt) + : WritePreparedTxnDB(db_impl, 
opt) {} + void SetDBSnapshots(const std::vector& snapshots) { + snapshots_ = snapshots; + } + void TakeSnapshot(SequenceNumber seq) { snapshots_.push_back(seq); } + + protected: + const std::vector GetSnapshotListFromDB( + SequenceNumber /* unused */) override { + return snapshots_; + } + + private: + std::vector snapshots_; +}; + +class WritePreparedTransactionTestBase : public TransactionTestBase { + public: + WritePreparedTransactionTestBase(bool use_stackable_db, bool two_write_queue, + TxnDBWritePolicy write_policy, + WriteOrdering write_ordering) + : TransactionTestBase(use_stackable_db, two_write_queue, write_policy, + write_ordering){}; + + protected: + void UpdateTransactionDBOptions(size_t snapshot_cache_bits, + size_t commit_cache_bits) { + txn_db_options.wp_snapshot_cache_bits = snapshot_cache_bits; + txn_db_options.wp_commit_cache_bits = commit_cache_bits; + } + void UpdateTransactionDBOptions(size_t snapshot_cache_bits) { + txn_db_options.wp_snapshot_cache_bits = snapshot_cache_bits; + } + // If expect_update is set, check if it actually updated old_commit_map_. If + // it did not and yet suggested not to check the next snapshot, do the + // opposite to check if it was not a bad suggestion. + void MaybeUpdateOldCommitMapTestWithNext(uint64_t prepare, uint64_t commit, + uint64_t snapshot, + uint64_t next_snapshot, + bool expect_update) { + WritePreparedTxnDB* wp_db = dynamic_cast(db); + // reset old_commit_map_empty_ so that its value indicate whether + // old_commit_map_ was updated + wp_db->old_commit_map_empty_ = true; + bool check_next = wp_db->MaybeUpdateOldCommitMap(prepare, commit, snapshot, + snapshot < next_snapshot); + if (expect_update == wp_db->old_commit_map_empty_) { + printf("prepare: %" PRIu64 " commit: %" PRIu64 " snapshot: %" PRIu64 + " next: %" PRIu64 "\n", + prepare, commit, snapshot, next_snapshot); + } + EXPECT_EQ(!expect_update, wp_db->old_commit_map_empty_); + if (!check_next && wp_db->old_commit_map_empty_) { + // do the opposite to make sure it was not a bad suggestion + const bool dont_care_bool = true; + wp_db->MaybeUpdateOldCommitMap(prepare, commit, next_snapshot, + dont_care_bool); + if (!wp_db->old_commit_map_empty_) { + printf("prepare: %" PRIu64 " commit: %" PRIu64 " snapshot: %" PRIu64 + " next: %" PRIu64 "\n", + prepare, commit, snapshot, next_snapshot); + } + EXPECT_TRUE(wp_db->old_commit_map_empty_); + } + } + + // Test that a CheckAgainstSnapshots thread reading old_snapshots will not + // miss a snapshot because of a concurrent update by UpdateSnapshots that is + // writing new_snapshots. Both threads are broken at two points. The sync + // points to enforce them are specified by a1, a2, b1, and b2. CommitEntry + // entry is expected to be vital for one of the snapshots that is common + // between the old and new list of snapshots. 
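+ // Editorial sketch of the enforced interleaving (first LoadDependency list
+ // below): the reader runs CheckAgainstSnapshots up to a1, the writer then
+ // starts UpdateSnapshots and runs up to b1, the reader continues to a2, the
+ // writer continues to b2, and the reader finishes before the writer does.
+ // The second round swaps which thread is cut first. In both rounds
+ // old_commit_map_ must end up non-empty, i.e. the vital commit entry was
+ // not missed despite the concurrent snapshot-list update.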
+ void SnapshotConcurrentAccessTestInternal( + WritePreparedTxnDB* wp_db, + const std::vector& old_snapshots, + const std::vector& new_snapshots, CommitEntry& entry, + SequenceNumber& version, size_t a1, size_t a2, size_t b1, size_t b2) { + // First reset the snapshot list + const std::vector empty_snapshots; + wp_db->old_commit_map_empty_ = true; + wp_db->UpdateSnapshots(empty_snapshots, ++version); + // Then initialize it with the old_snapshots + wp_db->UpdateSnapshots(old_snapshots, ++version); + + // Starting from the first thread, cut each thread at two points + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"WritePreparedTxnDB::CheckAgainstSnapshots:p:" + std::to_string(a1), + "WritePreparedTxnDB::UpdateSnapshots:s:start"}, + {"WritePreparedTxnDB::UpdateSnapshots:p:" + std::to_string(b1), + "WritePreparedTxnDB::CheckAgainstSnapshots:s:" + std::to_string(a1)}, + {"WritePreparedTxnDB::CheckAgainstSnapshots:p:" + std::to_string(a2), + "WritePreparedTxnDB::UpdateSnapshots:s:" + std::to_string(b1)}, + {"WritePreparedTxnDB::UpdateSnapshots:p:" + std::to_string(b2), + "WritePreparedTxnDB::CheckAgainstSnapshots:s:" + std::to_string(a2)}, + {"WritePreparedTxnDB::CheckAgainstSnapshots:p:end", + "WritePreparedTxnDB::UpdateSnapshots:s:" + std::to_string(b2)}, + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + { + ASSERT_TRUE(wp_db->old_commit_map_empty_); + ROCKSDB_NAMESPACE::port::Thread t1( + [&]() { wp_db->UpdateSnapshots(new_snapshots, version); }); + wp_db->CheckAgainstSnapshots(entry); + t1.join(); + ASSERT_FALSE(wp_db->old_commit_map_empty_); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + wp_db->old_commit_map_empty_ = true; + wp_db->UpdateSnapshots(empty_snapshots, ++version); + wp_db->UpdateSnapshots(old_snapshots, ++version); + // Starting from the second thread, cut each thread at two points + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"WritePreparedTxnDB::UpdateSnapshots:p:" + std::to_string(a1), + "WritePreparedTxnDB::CheckAgainstSnapshots:s:start"}, + {"WritePreparedTxnDB::CheckAgainstSnapshots:p:" + std::to_string(b1), + "WritePreparedTxnDB::UpdateSnapshots:s:" + std::to_string(a1)}, + {"WritePreparedTxnDB::UpdateSnapshots:p:" + std::to_string(a2), + "WritePreparedTxnDB::CheckAgainstSnapshots:s:" + std::to_string(b1)}, + {"WritePreparedTxnDB::CheckAgainstSnapshots:p:" + std::to_string(b2), + "WritePreparedTxnDB::UpdateSnapshots:s:" + std::to_string(a2)}, + {"WritePreparedTxnDB::UpdateSnapshots:p:end", + "WritePreparedTxnDB::CheckAgainstSnapshots:s:" + std::to_string(b2)}, + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + { + ASSERT_TRUE(wp_db->old_commit_map_empty_); + ROCKSDB_NAMESPACE::port::Thread t1( + [&]() { wp_db->UpdateSnapshots(new_snapshots, version); }); + wp_db->CheckAgainstSnapshots(entry); + t1.join(); + ASSERT_FALSE(wp_db->old_commit_map_empty_); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } + + // Verify value of keys. 
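+ // Both the Get() and MultiGet() read paths are checked, optionally under a
+ // snapshot; an expected value of "NOT_FOUND" means the key must be absent.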
+ void VerifyKeys(const std::unordered_map& data, + const Snapshot* snapshot = nullptr) { + std::string value; + ReadOptions read_options; + read_options.snapshot = snapshot; + for (auto& kv : data) { + auto s = db->Get(read_options, kv.first, &value); + ASSERT_TRUE(s.ok() || s.IsNotFound()); + if (s.ok()) { + if (kv.second != value) { + printf("key = %s\n", kv.first.c_str()); + } + ASSERT_EQ(kv.second, value); + } else { + ASSERT_EQ(kv.second, "NOT_FOUND"); + } + + // Try with MultiGet API too + std::vector values; + auto s_vec = db->MultiGet(read_options, {db->DefaultColumnFamily()}, + {kv.first}, &values); + ASSERT_EQ(1, values.size()); + ASSERT_EQ(1, s_vec.size()); + s = s_vec[0]; + ASSERT_TRUE(s.ok() || s.IsNotFound()); + if (s.ok()) { + ASSERT_TRUE(kv.second == values[0]); + } else { + ASSERT_EQ(kv.second, "NOT_FOUND"); + } + } + } + + // Verify all versions of keys. + void VerifyInternalKeys(const std::vector& expected_versions) { + std::vector versions; + const size_t kMaxKeys = 100000; + ASSERT_OK(GetAllKeyVersions(db, expected_versions.front().user_key, + expected_versions.back().user_key, kMaxKeys, + &versions)); + ASSERT_EQ(expected_versions.size(), versions.size()); + for (size_t i = 0; i < versions.size(); i++) { + ASSERT_EQ(expected_versions[i].user_key, versions[i].user_key); + ASSERT_EQ(expected_versions[i].sequence, versions[i].sequence); + ASSERT_EQ(expected_versions[i].type, versions[i].type); + if (versions[i].type != kTypeDeletion && + versions[i].type != kTypeSingleDeletion) { + ASSERT_EQ(expected_versions[i].value, versions[i].value); + } + // Range delete not supported. + ASSERT_NE(expected_versions[i].type, kTypeRangeDeletion); + } + } +}; + +class WritePreparedTransactionTest + : public WritePreparedTransactionTestBase, + virtual public ::testing::WithParamInterface< + std::tuple> { + public: + WritePreparedTransactionTest() + : WritePreparedTransactionTestBase( + std::get<0>(GetParam()), std::get<1>(GetParam()), + std::get<2>(GetParam()), std::get<3>(GetParam())){}; +}; + +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +class SnapshotConcurrentAccessTest + : public WritePreparedTransactionTestBase, + virtual public ::testing::WithParamInterface> { + public: + SnapshotConcurrentAccessTest() + : WritePreparedTransactionTestBase( + std::get<0>(GetParam()), std::get<1>(GetParam()), + std::get<2>(GetParam()), std::get<3>(GetParam())), + split_id_(std::get<4>(GetParam())), + split_cnt_(std::get<5>(GetParam())){}; + + protected: + // A test is split into split_cnt_ tests, each identified with split_id_ where + // 0 <= split_id_ < split_cnt_ + size_t split_id_; + size_t split_cnt_; +}; +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +class SeqAdvanceConcurrentTest + : public WritePreparedTransactionTestBase, + virtual public ::testing::WithParamInterface> { + public: + SeqAdvanceConcurrentTest() + : WritePreparedTransactionTestBase( + std::get<0>(GetParam()), std::get<1>(GetParam()), + std::get<2>(GetParam()), std::get<3>(GetParam())), + split_id_(std::get<4>(GetParam())), + split_cnt_(std::get<5>(GetParam())) { + special_env.skip_fsync_ = true; + }; + + protected: + // A test is split into split_cnt_ tests, each identified with split_id_ where + // 0 <= split_id_ < split_cnt_ + size_t split_id_; + size_t split_cnt_; +}; + +INSTANTIATE_TEST_CASE_P( + WritePreparedTransaction, WritePreparedTransactionTest, + ::testing::Values( + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite), + 
std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite))); + +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +INSTANTIATE_TEST_CASE_P( + TwoWriteQueues, SnapshotConcurrentAccessTest, + ::testing::Values( + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 0, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 1, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 2, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 3, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 4, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 5, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 6, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 7, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 8, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 9, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 10, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 11, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 12, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 13, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 14, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 15, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 16, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 17, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 18, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 19, 20), + + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 0, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 1, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 2, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 3, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 4, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 5, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 6, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 7, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 8, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 9, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 10, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 11, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 12, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 13, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 14, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 15, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 16, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 17, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 18, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 19, 20))); + +INSTANTIATE_TEST_CASE_P( + OneWriteQueue, SnapshotConcurrentAccessTest, + ::testing::Values( + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 0, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 1, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 2, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 3, 20), + 
std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 4, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 5, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 6, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 7, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 8, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 9, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 10, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 11, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 12, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 13, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 14, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 15, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 16, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 17, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 18, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 19, 20))); + +INSTANTIATE_TEST_CASE_P( + TwoWriteQueues, SeqAdvanceConcurrentTest, + ::testing::Values( + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 0, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 1, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 2, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 3, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 4, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 5, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 6, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 7, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 8, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 9, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 0, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 1, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 2, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 3, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 4, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 5, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 6, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 7, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 8, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 9, 10))); + +INSTANTIATE_TEST_CASE_P( + OneWriteQueue, SeqAdvanceConcurrentTest, + ::testing::Values( + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 0, 10), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 1, 10), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 2, 10), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 3, 10), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 4, 10), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 5, 10), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 6, 10), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 7, 10), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 8, 10), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 9, 10))); +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + 
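+// The CommitMap test below drives the fixed-size commit cache through its +// add/get/exchange paths. The test indexes the cache with prep_seq % +// COMMIT_CACHE_SIZE, so two entries whose prepare sequence numbers differ by +// exactly the cache size land in the same slot and adding the newer one evicts +// the older one. A minimal sketch of that slot mapping (illustrative only, not +// a helper that exists in this file): +// +// uint64_t SlotFor(uint64_t prep_seq, uint64_t cache_size) { +// return prep_seq % cache_size; // e.g. 5 and 5 + cache_size share a slot +// } +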
+TEST_P(WritePreparedTransactionTest, CommitMap) { + WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db); + ASSERT_NE(wp_db, nullptr); + ASSERT_NE(wp_db->db_impl_, nullptr); + size_t size = wp_db->COMMIT_CACHE_SIZE; + CommitEntry c = {5, 12}, e; + bool evicted = wp_db->AddCommitEntry(c.prep_seq % size, c, &e); + ASSERT_FALSE(evicted); + + // Should be able to read the same value + CommitEntry64b dont_care; + bool found = wp_db->GetCommitEntry(c.prep_seq % size, &dont_care, &e); + ASSERT_TRUE(found); + ASSERT_EQ(c, e); + // Should be able to distinguish between overlapping entries + found = wp_db->GetCommitEntry((c.prep_seq + size) % size, &dont_care, &e); + ASSERT_TRUE(found); + ASSERT_NE(c.prep_seq + size, e.prep_seq); + // Should be able to detect non-existent entry + found = wp_db->GetCommitEntry((c.prep_seq + 1) % size, &dont_care, &e); + ASSERT_FALSE(found); + + // Reject an invalid exchange + CommitEntry e2 = {c.prep_seq + size, c.commit_seq + size}; + CommitEntry64b e2_64b(e2, wp_db->FORMAT); + bool exchanged = wp_db->ExchangeCommitEntry(e2.prep_seq % size, e2_64b, e); + ASSERT_FALSE(exchanged); + // Check that it actually rejected the exchange + found = wp_db->GetCommitEntry(e2.prep_seq % size, &dont_care, &e); + ASSERT_TRUE(found); + ASSERT_EQ(c, e); + + // Accept a valid exchange + CommitEntry64b c_64b(c, wp_db->FORMAT); + CommitEntry e3 = {c.prep_seq + size, c.commit_seq + size + 1}; + exchanged = wp_db->ExchangeCommitEntry(c.prep_seq % size, c_64b, e3); + ASSERT_TRUE(exchanged); + // Check that it actually accepted the exchange + found = wp_db->GetCommitEntry(c.prep_seq % size, &dont_care, &e); + ASSERT_TRUE(found); + ASSERT_EQ(e3, e); + + // Rewrite an entry + CommitEntry e4 = {e3.prep_seq + size, e3.commit_seq + size + 1}; + evicted = wp_db->AddCommitEntry(e4.prep_seq % size, e4, &e); + ASSERT_TRUE(evicted); + ASSERT_EQ(e3, e); + found = wp_db->GetCommitEntry(e4.prep_seq % size, &dont_care, &e); + ASSERT_TRUE(found); + ASSERT_EQ(e4, e); +} + +TEST_P(WritePreparedTransactionTest, MaybeUpdateOldCommitMap) { + // If prepare <= snapshot < commit we should keep the entry around since its + // nonexistence could be interpreted as committed in the snapshot while it is + // not true. We keep such entries around by adding them to the + // old_commit_map_. + uint64_t p /*prepare*/, c /*commit*/, s /*snapshot*/, ns /*next_snapshot*/; + p = 10l, c = 15l, s = 20l, ns = 21l; + MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false); + // If we do not expect the old commit map to be updated, try also with a next + // snapshot that is expected to update the old commit map. This would test + // that MaybeUpdateOldCommitMap would not prevent us from checking the next + // snapshot that must be checked.
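+ // In all of the cases below, the evicted entry {p, c} must be remembered for + // snapshot s exactly when p <= s && s < c. A minimal sketch of that predicate + // (the name is illustrative; it is not a helper used by this test): + // + // bool MustRememberForSnapshot(uint64_t p, uint64_t c, uint64_t s) { + // return p <= s && s < c; + // }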
+ p = 10l, c = 15l, s = 20l, ns = 11l; + MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false); + + p = 10l, c = 20l, s = 20l, ns = 19l; + MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false); + p = 10l, c = 20l, s = 20l, ns = 21l; + MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false); + + p = 20l, c = 20l, s = 20l, ns = 21l; + MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false); + p = 20l, c = 20l, s = 20l, ns = 19l; + MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false); + + p = 10l, c = 25l, s = 20l, ns = 21l; + MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, true); + + p = 20l, c = 25l, s = 20l, ns = 21l; + MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, true); + + p = 21l, c = 25l, s = 20l, ns = 22l; + MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false); + p = 21l, c = 25l, s = 20l, ns = 19l; + MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false); +} + +// Trigger the condition where some old memtables are skipped when doing +// TransactionUtil::CheckKey(), and make sure the result is still correct. +TEST_P(WritePreparedTransactionTest, CheckKeySkipOldMemtable) { + const int kAttemptHistoryMemtable = 0; + const int kAttemptImmMemTable = 1; + for (int attempt = kAttemptHistoryMemtable; attempt <= kAttemptImmMemTable; + attempt++) { + options.max_write_buffer_number_to_maintain = 3; + ASSERT_OK(ReOpen()); + + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + txn_options.set_snapshot = true; + string value; + + ASSERT_OK(db->Put(write_options, Slice("foo"), Slice("bar"))); + ASSERT_OK(db->Put(write_options, Slice("foo2"), Slice("bar"))); + + Transaction* txn = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn != nullptr); + ASSERT_OK(txn->SetName("txn")); + + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn2 != nullptr); + ASSERT_OK(txn2->SetName("txn2")); + + // This transaction is created to cause potential conflict. + Transaction* txn_x = db->BeginTransaction(write_options); + ASSERT_OK(txn_x->SetName("txn_x")); + ASSERT_OK(txn_x->Put(Slice("foo"), Slice("bar3"))); + ASSERT_OK(txn_x->Prepare()); + + // Create snapshots after the prepare, but there should still + // be a conflict when trying to read "foo". + + if (attempt == kAttemptImmMemTable) { + // For the second attempt, hold flush from beginning. The memtable + // will be switched to immutable after calling TEST_SwitchMemtable() + // while CheckKey() is called. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"WritePreparedTransactionTest.CheckKeySkipOldMemtable", + "FlushJob::Start"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + } + + // force a memtable flush. The memtable should still be kept + FlushOptions flush_ops; + if (attempt == kAttemptHistoryMemtable) { + ASSERT_OK(db->Flush(flush_ops)); + } else { + ASSERT_EQ(attempt, kAttemptImmMemTable); + DBImpl* db_impl = static_cast(db->GetRootDB()); + ASSERT_OK(db_impl->TEST_SwitchMemtable()); + } + uint64_t num_imm_mems; + ASSERT_TRUE(db->GetIntProperty(DB::Properties::kNumImmutableMemTable, + &num_imm_mems)); + if (attempt == kAttemptHistoryMemtable) { + ASSERT_EQ(0, num_imm_mems); + } else { + ASSERT_EQ(attempt, kAttemptImmMemTable); + ASSERT_EQ(1, num_imm_mems); + } + + // Put something in active memtable + ASSERT_OK(db->Put(write_options, Slice("foo3"), Slice("bar"))); + + // Create txn3 after flushing, but this transaction also needs to + // check all memtables because of they contains uncommitted data. 
+ Transaction* txn3 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn3 != nullptr); + ASSERT_OK(txn3->SetName("txn3")); + + // Commit the pending write + ASSERT_OK(txn_x->Commit()); + + // Commit txn, txn2 and tx3. txn and tx3 will conflict but txn2 will + // pass. In all cases, both memtables are queried. + SetPerfLevel(PerfLevel::kEnableCount); + get_perf_context()->Reset(); + ASSERT_TRUE(txn3->GetForUpdate(read_options, "foo", &value).IsBusy()); + // We should have checked two memtables, active and either immutable + // or history memtable, depending on the test case. + ASSERT_EQ(2, get_perf_context()->get_from_memtable_count); + + get_perf_context()->Reset(); + ASSERT_TRUE(txn->GetForUpdate(read_options, "foo", &value).IsBusy()); + // We should have checked two memtables, active and either immutable + // or history memtable, depending on the test case. + ASSERT_EQ(2, get_perf_context()->get_from_memtable_count); + + get_perf_context()->Reset(); + ASSERT_OK(txn2->GetForUpdate(read_options, "foo2", &value)); + ASSERT_EQ(value, "bar"); + // We should have checked two memtables, and since there is no + // conflict, another Get() will be made and fetch the data from + // DB. If it is in immutable memtable, two extra memtable reads + // will be issued. If it is not (in history), only one will + // be made, which is to the active memtable. + if (attempt == kAttemptHistoryMemtable) { + ASSERT_EQ(3, get_perf_context()->get_from_memtable_count); + } else { + ASSERT_EQ(attempt, kAttemptImmMemTable); + ASSERT_EQ(4, get_perf_context()->get_from_memtable_count); + } + + Transaction* txn4 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn4 != nullptr); + ASSERT_OK(txn4->SetName("txn4")); + get_perf_context()->Reset(); + ASSERT_OK(txn4->GetForUpdate(read_options, "foo", &value)); + if (attempt == kAttemptHistoryMemtable) { + // Active memtable will be checked in snapshot validation and when + // getting the value. + ASSERT_EQ(2, get_perf_context()->get_from_memtable_count); + } else { + // Only active memtable will be checked in snapshot validation but + // both of active and immutable snapshot will be queried when + // getting the value. 
+ ASSERT_EQ(attempt, kAttemptImmMemTable); + ASSERT_EQ(3, get_perf_context()->get_from_memtable_count); + } + + ASSERT_OK(txn2->Commit()); + ASSERT_OK(txn4->Commit()); + + TEST_SYNC_POINT("WritePreparedTransactionTest.CheckKeySkipOldMemtable"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + SetPerfLevel(PerfLevel::kDisable); + + delete txn; + delete txn2; + delete txn3; + delete txn4; + delete txn_x; + } +} + +// Reproduce the bug with two snapshots with the same seuqence number and test +// that the release of the first snapshot will not affect the reads by the other +// snapshot +TEST_P(WritePreparedTransactionTest, DoubleSnapshot) { + TransactionOptions txn_options; + Status s; + + // Insert initial value + ASSERT_OK(db->Put(WriteOptions(), "key", "value1")); + + WritePreparedTxnDB* wp_db = dynamic_cast(db); + Transaction* txn = + wp_db->BeginTransaction(WriteOptions(), txn_options, nullptr); + ASSERT_OK(txn->SetName("txn")); + ASSERT_OK(txn->Put("key", "value2")); + ASSERT_OK(txn->Prepare()); + // Three snapshots with the same seq number + const Snapshot* snapshot0 = wp_db->GetSnapshot(); + const Snapshot* snapshot1 = wp_db->GetSnapshot(); + const Snapshot* snapshot2 = wp_db->GetSnapshot(); + ASSERT_OK(txn->Commit()); + SequenceNumber cache_size = wp_db->COMMIT_CACHE_SIZE; + SequenceNumber overlap_seq = txn->GetId() + cache_size; + delete txn; + + // 4th snapshot with a larger seq + const Snapshot* snapshot3 = wp_db->GetSnapshot(); + // Cause an eviction to advance max evicted seq number + // This also fetches the 4 snapshots from db since their seq is lower than the + // new max + wp_db->AddCommitted(overlap_seq, overlap_seq); + + ReadOptions ropt; + // It should see the value before commit + ropt.snapshot = snapshot2; + PinnableSlice pinnable_val; + s = wp_db->Get(ropt, wp_db->DefaultColumnFamily(), "key", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == "value1"); + pinnable_val.Reset(); + + wp_db->ReleaseSnapshot(snapshot1); + + // It should still see the value before commit + s = wp_db->Get(ropt, wp_db->DefaultColumnFamily(), "key", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == "value1"); + pinnable_val.Reset(); + + // Cause an eviction to advance max evicted seq number and trigger updating + // the snapshot list + overlap_seq += cache_size; + wp_db->AddCommitted(overlap_seq, overlap_seq); + + // It should still see the value before commit + s = wp_db->Get(ropt, wp_db->DefaultColumnFamily(), "key", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == "value1"); + pinnable_val.Reset(); + + wp_db->ReleaseSnapshot(snapshot0); + wp_db->ReleaseSnapshot(snapshot2); + wp_db->ReleaseSnapshot(snapshot3); +} + +size_t UniqueCnt(std::vector vec) { + std::set aset; + for (auto i : vec) { + aset.insert(i); + } + return aset.size(); +} +// Test that the entries in old_commit_map_ get garbage collected properly +TEST_P(WritePreparedTransactionTest, OldCommitMapGC) { + const size_t snapshot_cache_bits = 0; + const size_t commit_cache_bits = 0; + DBImpl* mock_db = new DBImpl(options, dbname); + UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits); + std::unique_ptr wp_db( + new WritePreparedTxnDBMock(mock_db, txn_db_options)); + + SequenceNumber seq = 0; + // Take the first snapshot that overlaps with two txn + auto prep_seq = ++seq; + wp_db->AddPrepared(prep_seq); + auto prep_seq2 = ++seq; + wp_db->AddPrepared(prep_seq2); + auto snap_seq1 = seq; + wp_db->TakeSnapshot(snap_seq1); + auto commit_seq = ++seq; + 
wp_db->AddCommitted(prep_seq, commit_seq); + wp_db->RemovePrepared(prep_seq); + auto commit_seq2 = ++seq; + wp_db->AddCommitted(prep_seq2, commit_seq2); + wp_db->RemovePrepared(prep_seq2); + // Take the 2nd and 3rd snapshot that overlap with the same txn + prep_seq = ++seq; + wp_db->AddPrepared(prep_seq); + auto snap_seq2 = seq; + wp_db->TakeSnapshot(snap_seq2); + seq++; + auto snap_seq3 = seq; + wp_db->TakeSnapshot(snap_seq3); + seq++; + commit_seq = ++seq; + wp_db->AddCommitted(prep_seq, commit_seq); + wp_db->RemovePrepared(prep_seq); + // Make sure max_evicted_seq_ will be larger than 2nd snapshot by evicting the + // only item in the commit_cache_ via another commit. + prep_seq = ++seq; + wp_db->AddPrepared(prep_seq); + commit_seq = ++seq; + wp_db->AddCommitted(prep_seq, commit_seq); + wp_db->RemovePrepared(prep_seq); + + // Verify that the evicted commit entries for all snapshots are in the + // old_commit_map_ + { + ASSERT_FALSE(wp_db->old_commit_map_empty_.load()); + ReadLock rl(&wp_db->old_commit_map_mutex_); + ASSERT_EQ(3, wp_db->old_commit_map_.size()); + ASSERT_EQ(2, UniqueCnt(wp_db->old_commit_map_[snap_seq1])); + ASSERT_EQ(1, UniqueCnt(wp_db->old_commit_map_[snap_seq2])); + ASSERT_EQ(1, UniqueCnt(wp_db->old_commit_map_[snap_seq3])); + } + + // Verify that the 2nd snapshot is cleaned up after the release + wp_db->ReleaseSnapshotInternal(snap_seq2); + { + ASSERT_FALSE(wp_db->old_commit_map_empty_.load()); + ReadLock rl(&wp_db->old_commit_map_mutex_); + ASSERT_EQ(2, wp_db->old_commit_map_.size()); + ASSERT_EQ(2, UniqueCnt(wp_db->old_commit_map_[snap_seq1])); + ASSERT_EQ(1, UniqueCnt(wp_db->old_commit_map_[snap_seq3])); + } + + // Verify that the 1st snapshot is cleaned up after the release + wp_db->ReleaseSnapshotInternal(snap_seq1); + { + ASSERT_FALSE(wp_db->old_commit_map_empty_.load()); + ReadLock rl(&wp_db->old_commit_map_mutex_); + ASSERT_EQ(1, wp_db->old_commit_map_.size()); + ASSERT_EQ(1, UniqueCnt(wp_db->old_commit_map_[snap_seq3])); + } + + // Verify that the 3rd snapshot is cleaned up after the release + wp_db->ReleaseSnapshotInternal(snap_seq3); + { + ASSERT_TRUE(wp_db->old_commit_map_empty_.load()); + ReadLock rl(&wp_db->old_commit_map_mutex_); + ASSERT_EQ(0, wp_db->old_commit_map_.size()); + } +} + +TEST_P(WritePreparedTransactionTest, CheckAgainstSnapshots) { + std::vector snapshots = {100l, 200l, 300l, 400l, 500l, + 600l, 700l, 800l, 900l}; + const size_t snapshot_cache_bits = 2; + const uint64_t cache_size = 1ul << snapshot_cache_bits; + // Safety check to express the intended size in the test. Can be adjusted if + // the snapshots lists changed. + ASSERT_EQ((1ul << snapshot_cache_bits) * 2 + 1, snapshots.size()); + DBImpl* mock_db = new DBImpl(options, dbname); + UpdateTransactionDBOptions(snapshot_cache_bits); + std::unique_ptr wp_db( + new WritePreparedTxnDBMock(mock_db, txn_db_options)); + SequenceNumber version = 1000l; + ASSERT_EQ(0, wp_db->snapshots_total_); + wp_db->UpdateSnapshots(snapshots, version); + ASSERT_EQ(snapshots.size(), wp_db->snapshots_total_); + // seq numbers are chosen so that we have two of them between each two + // snapshots. If the diff of two consecutive seq is more than 5, there is a + // snapshot between them. 
+ std::vector seqs = {50l, 55l, 150l, 155l, 250l, 255l, 350l, + 355l, 450l, 455l, 550l, 555l, 650l, 655l, + 750l, 755l, 850l, 855l, 950l, 955l}; + ASSERT_GT(seqs.size(), 1); + for (size_t i = 0; i + 1 < seqs.size(); i++) { + wp_db->old_commit_map_empty_ = true; // reset + CommitEntry commit_entry = {seqs[i], seqs[i + 1]}; + wp_db->CheckAgainstSnapshots(commit_entry); + // Expect update if there is snapshot in between the prepare and commit + bool expect_update = commit_entry.commit_seq - commit_entry.prep_seq > 5 && + commit_entry.commit_seq >= snapshots.front() && + commit_entry.prep_seq <= snapshots.back(); + ASSERT_EQ(expect_update, !wp_db->old_commit_map_empty_); + } + + // Test that search will include multiple snapshot from snapshot cache + { + // exclude first and last item in the cache + CommitEntry commit_entry = {snapshots.front() + 1, + snapshots[cache_size - 1] - 1}; + wp_db->old_commit_map_empty_ = true; // reset + wp_db->old_commit_map_.clear(); + wp_db->CheckAgainstSnapshots(commit_entry); + ASSERT_EQ(wp_db->old_commit_map_.size(), cache_size - 2); + } + + // Test that search will include multiple snapshot from old snapshots + { + // include two in the middle + CommitEntry commit_entry = {snapshots[cache_size] + 1, + snapshots[cache_size + 2] + 1}; + wp_db->old_commit_map_empty_ = true; // reset + wp_db->old_commit_map_.clear(); + wp_db->CheckAgainstSnapshots(commit_entry); + ASSERT_EQ(wp_db->old_commit_map_.size(), 2); + } + + // Test that search will include both snapshot cache and old snapshots + // Case 1: includes all in snapshot cache + { + CommitEntry commit_entry = {snapshots.front() - 1, snapshots.back() + 1}; + wp_db->old_commit_map_empty_ = true; // reset + wp_db->old_commit_map_.clear(); + wp_db->CheckAgainstSnapshots(commit_entry); + ASSERT_EQ(wp_db->old_commit_map_.size(), snapshots.size()); + } + + // Case 2: includes all snapshot caches except the smallest + { + CommitEntry commit_entry = {snapshots.front() + 1, snapshots.back() + 1}; + wp_db->old_commit_map_empty_ = true; // reset + wp_db->old_commit_map_.clear(); + wp_db->CheckAgainstSnapshots(commit_entry); + ASSERT_EQ(wp_db->old_commit_map_.size(), snapshots.size() - 1); + } + + // Case 3: includes only the largest of snapshot cache + { + CommitEntry commit_entry = {snapshots[cache_size - 1] - 1, + snapshots.back() + 1}; + wp_db->old_commit_map_empty_ = true; // reset + wp_db->old_commit_map_.clear(); + wp_db->CheckAgainstSnapshots(commit_entry); + ASSERT_EQ(wp_db->old_commit_map_.size(), snapshots.size() - cache_size + 1); + } +} + +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +// Test that CheckAgainstSnapshots will not miss a live snapshot if it is run in +// parallel with UpdateSnapshots. +TEST_P(SnapshotConcurrentAccessTest, SnapshotConcurrentAccess) { + // We have a sync point in the method under test after checking each snapshot. + // If you increase the max number of snapshots in this test, more sync points + // in the methods must also be added. + const std::vector snapshots = {10l, 20l, 30l, 40l, 50l, + 60l, 70l, 80l, 90l, 100l}; + const size_t snapshot_cache_bits = 2; + // Safety check to express the intended size in the test. Can be adjusted if + // the snapshots lists changed. + ASSERT_EQ((1ul << snapshot_cache_bits) * 2 + 2, snapshots.size()); + SequenceNumber version = 1000l; + // Choose the cache size so that the new snapshot list could replace all the + // existing items in the cache and also have some overflow. 
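+ // With snapshot_cache_bits = 2 the cache holds 1 << 2 = 4 snapshots, so the + // full 10-entry list above would fill the cache and leave 6 snapshots in the + // overflow portion; growing the old list up to SNAPSHOT_CACHE_SIZE + 2 entries + // below exercises both the cached and the overflowed snapshots.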
+ DBImpl* mock_db = new DBImpl(options, dbname); + UpdateTransactionDBOptions(snapshot_cache_bits); + std::unique_ptr wp_db( + new WritePreparedTxnDBMock(mock_db, txn_db_options)); + const size_t extra = 2; + size_t loop_id = 0; + // Add up to extra items that do not fit into the cache + for (size_t old_size = 1; old_size <= wp_db->SNAPSHOT_CACHE_SIZE + extra; + old_size++) { + const std::vector old_snapshots( + snapshots.begin(), snapshots.begin() + old_size); + + // Each member of old snapshot might or might not appear in the new list. We + // create a common_snapshots for each combination. + size_t new_comb_cnt = size_t(1) << old_size; + for (size_t new_comb = 0; new_comb < new_comb_cnt; new_comb++, loop_id++) { + if (loop_id % split_cnt_ != split_id_) continue; + printf("."); // To signal progress + fflush(stdout); + std::vector common_snapshots; + for (size_t i = 0; i < old_snapshots.size(); i++) { + if (IsInCombination(i, new_comb)) { + common_snapshots.push_back(old_snapshots[i]); + } + } + // And add some new snapshots to the common list + for (size_t added_snapshots = 0; + added_snapshots <= snapshots.size() - old_snapshots.size(); + added_snapshots++) { + std::vector new_snapshots = common_snapshots; + for (size_t i = 0; i < added_snapshots; i++) { + new_snapshots.push_back(snapshots[old_snapshots.size() + i]); + } + for (auto it = common_snapshots.begin(); it != common_snapshots.end(); + ++it) { + auto snapshot = *it; + // Create a commit entry that is around the snapshot and thus should + // be not be discarded + CommitEntry entry = {static_cast(snapshot - 1), + snapshot + 1}; + // The critical part is when iterating the snapshot cache. Afterwards, + // we are operating under the lock + size_t a_range = + std::min(old_snapshots.size(), wp_db->SNAPSHOT_CACHE_SIZE) + 1; + size_t b_range = + std::min(new_snapshots.size(), wp_db->SNAPSHOT_CACHE_SIZE) + 1; + // Break each thread at two points + for (size_t a1 = 1; a1 <= a_range; a1++) { + for (size_t a2 = a1 + 1; a2 <= a_range; a2++) { + for (size_t b1 = 1; b1 <= b_range; b1++) { + for (size_t b2 = b1 + 1; b2 <= b_range; b2++) { + SnapshotConcurrentAccessTestInternal( + wp_db.get(), old_snapshots, new_snapshots, entry, version, + a1, a2, b1, b2); + } + } + } + } + } + } + } + } + printf("\n"); +} +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +// This test clarifies the contract of AdvanceMaxEvictedSeq method +TEST_P(WritePreparedTransactionTest, AdvanceMaxEvictedSeqBasic) { + DBImpl* mock_db = new DBImpl(options, dbname); + std::unique_ptr wp_db( + new WritePreparedTxnDBMock(mock_db, txn_db_options)); + + // 1. Set the initial values for max, prepared, and snapshots + SequenceNumber zero_max = 0l; + // Set the initial list of prepared txns + const std::vector initial_prepared = {10, 30, 50, 100, + 150, 200, 250}; + for (auto p : initial_prepared) { + wp_db->AddPrepared(p); + } + // This updates the max value and also set old prepared + SequenceNumber init_max = 100; + wp_db->AdvanceMaxEvictedSeq(zero_max, init_max); + const std::vector initial_snapshots = {20, 40}; + wp_db->SetDBSnapshots(initial_snapshots); + // This will update the internal cache of snapshots from the DB + wp_db->UpdateSnapshots(initial_snapshots, init_max); + + // 2. Invoke AdvanceMaxEvictedSeq + const std::vector latest_snapshots = {20, 110, 220, 300}; + wp_db->SetDBSnapshots(latest_snapshots); + SequenceNumber new_max = 200; + wp_db->AdvanceMaxEvictedSeq(init_max, new_max); + + // 3. 
Verify that the state matches the AdvanceMaxEvictedSeq contract + // a. max should be updated to new_max + ASSERT_EQ(wp_db->max_evicted_seq_, new_max); + // b. delayed prepared should contain every txn <= max and prepared should + // only contain txns > max + auto it = initial_prepared.begin(); + for (; it != initial_prepared.end() && *it <= new_max; ++it) { + ASSERT_EQ(1, wp_db->delayed_prepared_.erase(*it)); + } + ASSERT_TRUE(wp_db->delayed_prepared_.empty()); + for (; it != initial_prepared.end() && !wp_db->prepared_txns_.empty(); + ++it, wp_db->prepared_txns_.pop()) { + ASSERT_EQ(*it, wp_db->prepared_txns_.top()); + } + ASSERT_TRUE(it == initial_prepared.end()); + ASSERT_TRUE(wp_db->prepared_txns_.empty()); + // c. snapshots should contain everything below new_max + auto sit = latest_snapshots.begin(); + for (size_t i = 0; sit != latest_snapshots.end() && *sit <= new_max && + i < wp_db->snapshots_total_; + sit++, i++) { + ASSERT_TRUE(i < wp_db->snapshots_total_); + // This test is small scale and the list of snapshots is assumed to be + // within the cache size limit. This is just a safety check to double check + // that assumption. + ASSERT_TRUE(i < wp_db->SNAPSHOT_CACHE_SIZE); + ASSERT_EQ(*sit, wp_db->snapshot_cache_[i]); + } +} + +// A new snapshot should always be larger than max_evicted_seq_. +// Otherwise the snapshot does not go through AdvanceMaxEvictedSeq. +TEST_P(WritePreparedTransactionTest, NewSnapshotLargerThanMax) { + WriteOptions woptions; + TransactionOptions txn_options; + WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db); + Transaction* txn0 = db->BeginTransaction(woptions, txn_options); + ASSERT_OK(txn0->Put(Slice("key"), Slice("value"))); + ASSERT_OK(txn0->Commit()); + const SequenceNumber seq = txn0->GetId(); // is also the prepare seq + delete txn0; + std::vector<Transaction*> txns; + // Inc seq without committing anything + for (int i = 0; i < 10; i++) { + Transaction* txn = db->BeginTransaction(woptions, txn_options); + ASSERT_OK(txn->SetName("xid" + std::to_string(i))); + ASSERT_OK(txn->Put(Slice("key" + std::to_string(i)), Slice("value"))); + ASSERT_OK(txn->Prepare()); + txns.push_back(txn); + } + + // The new commit is seq + 10 + ASSERT_OK(db->Put(woptions, "key", "value")); + auto snap = wp_db->GetSnapshot(); + const SequenceNumber last_seq = snap->GetSequenceNumber(); + wp_db->ReleaseSnapshot(snap); + ASSERT_LT(seq, last_seq); + // Otherwise our test is not effective + ASSERT_LT(last_seq - seq, wp_db->INC_STEP_FOR_MAX_EVICTED); + + // Evict seq out of commit cache + const SequenceNumber overwrite_seq = seq + wp_db->COMMIT_CACHE_SIZE; + // Check that the next write could make max go beyond last + auto last_max = wp_db->max_evicted_seq_.load(); + wp_db->AddCommitted(overwrite_seq, overwrite_seq); + // Check that eviction has advanced the max + ASSERT_LT(last_max, wp_db->max_evicted_seq_.load()); + // Check that the new max has not advanced the last seq + ASSERT_LT(wp_db->max_evicted_seq_.load(), last_seq); + for (auto txn : txns) { + txn->Rollback(); + delete txn; + } +} + +// A new snapshot should always be larger than max_evicted_seq_. +// In very rare cases max could be below last published seq. Test that +// taking snapshot will wait for max to catch up.
+TEST_P(WritePreparedTransactionTest, MaxCatchupWithNewSnapshot) { + const size_t snapshot_cache_bits = 7; // same as default + const size_t commit_cache_bits = 0; // only 1 entry => frequent eviction + UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits); + ASSERT_OK(ReOpen()); + WriteOptions woptions; + WritePreparedTxnDB* wp_db = dynamic_cast(db); + + const int writes = 50; + const int batch_cnt = 4; + ROCKSDB_NAMESPACE::port::Thread t1([&]() { + for (int i = 0; i < writes; i++) { + WriteBatch batch; + // For duplicate keys cause 4 commit entries, each evicting an entry that + // is not published yet, thus causing max evicted seq go higher than last + // published. + for (int b = 0; b < batch_cnt; b++) { + ASSERT_OK(batch.Put("foo", "foo")); + } + ASSERT_OK(db->Write(woptions, &batch)); + } + }); + + ROCKSDB_NAMESPACE::port::Thread t2([&]() { + while (wp_db->max_evicted_seq_ == 0) { // wait for insert thread + std::this_thread::yield(); + } + for (int i = 0; i < 10; i++) { + SequenceNumber max_lower_bound = wp_db->max_evicted_seq_; + auto snap = db->GetSnapshot(); + if (snap->GetSequenceNumber() != 0) { + // Value of max_evicted_seq_ when snapshot was taken in unknown. We thus + // compare with the lower bound instead as an approximation. + ASSERT_LT(max_lower_bound, snap->GetSequenceNumber()); + } // seq 0 is ok to be less than max since nothing is visible to it + db->ReleaseSnapshot(snap); + } + }); + + t1.join(); + t2.join(); + + // Make sure that the test has worked and seq number has advanced as we + // thought + auto snap = db->GetSnapshot(); + ASSERT_GT(snap->GetSequenceNumber(), batch_cnt * writes - 1); + db->ReleaseSnapshot(snap); +} + +// Test that reads without snapshots would not hit an undefined state +TEST_P(WritePreparedTransactionTest, MaxCatchupWithUnbackedSnapshot) { + const size_t snapshot_cache_bits = 7; // same as default + const size_t commit_cache_bits = 0; // only 1 entry => frequent eviction + UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits); + ASSERT_OK(ReOpen()); + WriteOptions woptions; + WritePreparedTxnDB* wp_db = dynamic_cast(db); + + const int writes = 50; + ROCKSDB_NAMESPACE::port::Thread t1([&]() { + for (int i = 0; i < writes; i++) { + WriteBatch batch; + ASSERT_OK(batch.Put("key", "foo")); + ASSERT_OK(db->Write(woptions, &batch)); + } + }); + + ROCKSDB_NAMESPACE::port::Thread t2([&]() { + while (wp_db->max_evicted_seq_ == 0) { // wait for insert thread + std::this_thread::yield(); + } + ReadOptions ropt; + PinnableSlice pinnable_val; + TransactionOptions txn_options; + for (int i = 0; i < 10; i++) { + auto s = db->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val); + ASSERT_TRUE(s.ok() || s.IsTryAgain()); + pinnable_val.Reset(); + Transaction* txn = db->BeginTransaction(woptions, txn_options); + s = txn->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val); + ASSERT_TRUE(s.ok() || s.IsTryAgain()); + pinnable_val.Reset(); + std::vector values; + auto s_vec = + txn->MultiGet(ropt, {db->DefaultColumnFamily()}, {"key"}, &values); + ASSERT_EQ(1, values.size()); + ASSERT_EQ(1, s_vec.size()); + s = s_vec[0]; + ASSERT_TRUE(s.ok() || s.IsTryAgain()); + Slice key("key"); + txn->MultiGet(ropt, db->DefaultColumnFamily(), 1, &key, &pinnable_val, &s, + true); + ASSERT_TRUE(s.ok() || s.IsTryAgain()); + delete txn; + } + }); + + t1.join(); + t2.join(); + + // Make sure that the test has worked and seq number has advanced as we + // thought + auto snap = db->GetSnapshot(); + ASSERT_GT(snap->GetSequenceNumber(), writes - 
1); + db->ReleaseSnapshot(snap); +} + +// Check that old_commit_map_ cleanup works correctly if the snapshot equals +// max_evicted_seq_. +TEST_P(WritePreparedTransactionTest, CleanupSnapshotEqualToMax) { + const size_t snapshot_cache_bits = 7; // same as default + const size_t commit_cache_bits = 0; // only 1 entry => frequent eviction + UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits); + ASSERT_OK(ReOpen()); + WriteOptions woptions; + WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db); + // Insert something to increase seq + ASSERT_OK(db->Put(woptions, "key", "value")); + auto snap = db->GetSnapshot(); + auto snap_seq = snap->GetSequenceNumber(); + // Another insert should trigger eviction + load snapshot from db + ASSERT_OK(db->Put(woptions, "key", "value")); + // This is the scenario that we check against + ASSERT_EQ(snap_seq, wp_db->max_evicted_seq_); + // old_commit_map_ now has some data that needs gc + ASSERT_EQ(1, wp_db->snapshots_total_); + ASSERT_EQ(1, wp_db->old_commit_map_.size()); + + db->ReleaseSnapshot(snap); + + // Another insert should trigger eviction + load snapshot from db + ASSERT_OK(db->Put(woptions, "key", "value")); + + // The snapshot and related metadata must be properly garbage collected + ASSERT_EQ(0, wp_db->snapshots_total_); + ASSERT_TRUE(wp_db->snapshots_all_.empty()); + ASSERT_EQ(0, wp_db->old_commit_map_.size()); +} + +TEST_P(WritePreparedTransactionTest, AdvanceSeqByOne) { + auto snap = db->GetSnapshot(); + auto seq1 = snap->GetSequenceNumber(); + db->ReleaseSnapshot(snap); + + WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db); + wp_db->AdvanceSeqByOne(); + + snap = db->GetSnapshot(); + auto seq2 = snap->GetSequenceNumber(); + db->ReleaseSnapshot(snap); + + ASSERT_LT(seq1, seq2); +} + +// Test that the txn Initialize calls the overridden functions +TEST_P(WritePreparedTransactionTest, TxnInitialize) { + TransactionOptions txn_options; + WriteOptions write_options; + ASSERT_OK(db->Put(write_options, "key", "value")); + Transaction* txn0 = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn0->SetName("xid")); + ASSERT_OK(txn0->Put(Slice("key"), Slice("value1"))); + ASSERT_OK(txn0->Prepare()); + + // SetSnapshot is overridden to update min_uncommitted_ + txn_options.set_snapshot = true; + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + auto snap = txn1->GetSnapshot(); + auto snap_impl = reinterpret_cast<const SnapshotImpl*>(snap); + // If ::Initialize calls the overridden SetSnapshot, min_uncommitted_ must be + // updated + ASSERT_GT(snap_impl->min_uncommitted_, kMinUnCommittedSeq); + + ASSERT_OK(txn0->Rollback()); + ASSERT_OK(txn1->Rollback()); + delete txn0; + delete txn1; +} + +// This tests that transactions with duplicate keys perform correctly after max +// advances past their prepared sequence numbers. This will not be the case if, +// for example, the txn does not add the prepared seq for the second sub-batch to +// the PreparedHeap structure.
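+// For illustration (the numbers are made up): a txn that does Put("key", "v1") +// and then Put("key", "v2") is split into two sub-batches, so Prepare() +// consumes two consecutive sequence numbers, say 5 and 6, and both must stay +// tracked as uncommitted until the commit covers them; dropping the second one +// from PreparedHeap is exactly the kind of bug this test guards against.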
+TEST_P(WritePreparedTransactionTest, AdvanceMaxEvictedSeqWithDuplicates) { + const size_t snapshot_cache_bits = 7; // same as default + const size_t commit_cache_bits = 1; // disable commit cache + UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits); + ASSERT_OK(ReOpen()); + + ReadOptions ropt; + PinnableSlice pinnable_val; + WriteOptions write_options; + TransactionOptions txn_options; + Transaction* txn0 = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn0->SetName("xid")); + ASSERT_OK(txn0->Put(Slice("key"), Slice("value1"))); + ASSERT_OK(txn0->Put(Slice("key"), Slice("value2"))); + ASSERT_OK(txn0->Prepare()); + + ASSERT_OK(db->Put(write_options, "key2", "value")); + // Will cause max advance due to disabled commit cache + ASSERT_OK(db->Put(write_options, "key3", "value")); + + auto s = db->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + delete txn0; + + WritePreparedTxnDB* wp_db = dynamic_cast(db); + ASSERT_OK(wp_db->db_impl_->FlushWAL(true)); + wp_db->TEST_Crash(); + ASSERT_OK(ReOpenNoDelete()); + ASSERT_NE(db, nullptr); + s = db->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + + txn0 = db->GetTransactionByName("xid"); + ASSERT_OK(txn0->Rollback()); + delete txn0; +} + +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +// Stress SmallestUnCommittedSeq, which reads from both prepared_txns_ and +// delayed_prepared_, when is run concurrently with advancing max_evicted_seq, +// which moves prepared txns from prepared_txns_ to delayed_prepared_. +TEST_P(WritePreparedTransactionTest, SmallestUnCommittedSeq) { + const size_t snapshot_cache_bits = 7; // same as default + const size_t commit_cache_bits = 1; // disable commit cache + UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits); + ASSERT_OK(ReOpen()); + WritePreparedTxnDB* wp_db = dynamic_cast(db); + ReadOptions ropt; + PinnableSlice pinnable_val; + WriteOptions write_options; + TransactionOptions txn_options; + std::vector txns, committed_txns; + + const int cnt = 100; + for (int i = 0; i < cnt; i++) { + Transaction* txn = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn->SetName("xid" + std::to_string(i))); + auto key = "key1" + std::to_string(i); + auto value = "value1" + std::to_string(i); + ASSERT_OK(txn->Put(Slice(key), Slice(value))); + ASSERT_OK(txn->Prepare()); + txns.push_back(txn); + } + + port::Mutex mutex; + Random rnd(1103); + ROCKSDB_NAMESPACE::port::Thread commit_thread([&]() { + for (int i = 0; i < cnt; i++) { + uint32_t index = rnd.Uniform(cnt - i); + Transaction* txn; + { + MutexLock l(&mutex); + txn = txns[index]; + txns.erase(txns.begin() + index); + } + // Since commit cache is practically disabled, commit results in immediate + // advance in max_evicted_seq_ and subsequently moving some prepared txns + // to delayed_prepared_. 
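+ // (With commit_cache_bits = 1 the cache has only 1 << 1 = 2 slots, so almost + // every commit evicts a recent entry and pushes max_evicted_seq_ forward.)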
+ ASSERT_OK(txn->Commit()); + committed_txns.push_back(txn); + } + }); + ROCKSDB_NAMESPACE::port::Thread read_thread([&]() { + while (1) { + MutexLock l(&mutex); + if (txns.empty()) { + break; + } + auto min_uncommitted = wp_db->SmallestUnCommittedSeq(); + ASSERT_LE(min_uncommitted, (*txns.begin())->GetId()); + } + }); + + commit_thread.join(); + read_thread.join(); + for (auto txn : committed_txns) { + delete txn; + } +} +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +TEST_P(SeqAdvanceConcurrentTest, SeqAdvanceConcurrent) { + // Given the sequential run of txns, with this timeout we should never see a + // deadlock nor a timeout unless we have a key conflict, which should be + // almost infeasible. + txn_db_options.transaction_lock_timeout = 1000; + txn_db_options.default_lock_timeout = 1000; + ASSERT_OK(ReOpen()); + FlushOptions fopt; + + // Number of different txn types we use in this test + const size_t type_cnt = 5; + // The size of the first write group + // TODO(myabandeh): This should be increase for pre-release tests + const size_t first_group_size = 2; + // Total number of txns we run in each test + // TODO(myabandeh): This should be increase for pre-release tests + const size_t txn_cnt = first_group_size + 1; + + size_t base[txn_cnt + 1] = { + 1, + }; + for (size_t bi = 1; bi <= txn_cnt; bi++) { + base[bi] = base[bi - 1] * type_cnt; + } + const size_t max_n = static_cast(std::pow(type_cnt, txn_cnt)); + printf("Number of cases being tested is %" ROCKSDB_PRIszt "\n", max_n); + for (size_t n = 0; n < max_n; n++) { + if (n > 0) { + ASSERT_OK(ReOpen()); + } + + if (n % split_cnt_ != split_id_) continue; + if (n % 1000 == 0) { + printf("Tested %" ROCKSDB_PRIszt " cases so far\n", n); + } + DBImpl* db_impl = static_cast_with_check(db->GetRootDB()); + auto seq = db_impl->TEST_GetLastVisibleSequence(); + with_empty_commits = 0; + exp_seq = seq; + // This is increased before writing the batch for commit + commit_writes = 0; + // This is increased before txn starts linking if it expects to do a commit + // eventually + expected_commits = 0; + std::vector threads; + + linked.store(0, std::memory_order_release); + std::atomic batch_formed(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::EnterAsBatchGroupLeader:End", + [&](void* /*arg*/) { batch_formed = true; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::JoinBatchGroup:Wait", [&](void* /*arg*/) { + size_t orig_linked = linked.fetch_add(1, std::memory_order_acq_rel); + if (orig_linked == 0) { + // Wait until the others are linked too. + while (linked.load(std::memory_order_acquire) < first_group_size) { + } + } else if (orig_linked == first_group_size) { + // Make the 2nd batch of the rest of writes plus any followup + // commits from the first batch + while (linked.load(std::memory_order_acquire) < + txn_cnt + commit_writes) { + } + } + // Then we will have one or more batches consisting of follow-up + // commits from the 2nd batch. There is a bit of non-determinism here + // but it should be tolerable. 
+ }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + for (size_t bi = 0; bi < txn_cnt; bi++) { + // get the bi-th digit in number system based on type_cnt + size_t d = (n % base[bi + 1]) / base[bi]; + switch (d) { + case 0: + threads.emplace_back(&TransactionTestBase::TestTxn0, this, bi); + break; + case 1: + threads.emplace_back(&TransactionTestBase::TestTxn1, this, bi); + break; + case 2: + threads.emplace_back(&TransactionTestBase::TestTxn2, this, bi); + break; + case 3: + threads.emplace_back(&TransactionTestBase::TestTxn3, this, bi); + break; + case 4: + threads.emplace_back(&TransactionTestBase::TestTxn3, this, bi); + break; + default: + FAIL(); + } + // wait to be linked + while (linked.load(std::memory_order_acquire) <= bi) { + } + // after a queue of size first_group_size + if (bi + 1 == first_group_size) { + while (!batch_formed) { + } + // to make it more deterministic, wait until the commits are linked + while (linked.load(std::memory_order_acquire) <= + bi + expected_commits) { + } + } + } + for (auto& t : threads) { + t.join(); + } + if (options.two_write_queues) { + // In this case none of the above scheduling tricks to deterministically + // form merged batches works because the writes go to separate queues. + // This would result in different write groups in each run of the test. We + // still keep the test since although non-deterministic and hard to debug, + // it is still useful to have. + // TODO(myabandeh): Add a deterministic unit test for two_write_queues + } + + // Check if memtable inserts advanced seq number as expected + seq = db_impl->TEST_GetLastVisibleSequence(); + ASSERT_EQ(exp_seq, seq); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Check if recovery preserves the last sequence number + ASSERT_OK(db_impl->FlushWAL(true)); + ASSERT_OK(ReOpenNoDelete()); + ASSERT_NE(db, nullptr); + db_impl = static_cast_with_check(db->GetRootDB()); + seq = db_impl->TEST_GetLastVisibleSequence(); + ASSERT_LE(exp_seq, seq + with_empty_commits); + + // Check if flush preserves the last sequence number + ASSERT_OK(db_impl->Flush(fopt)); + seq = db_impl->GetLatestSequenceNumber(); + ASSERT_LE(exp_seq, seq + with_empty_commits); + + // Check if recovery after flush preserves the last sequence number + ASSERT_OK(db_impl->FlushWAL(true)); + ASSERT_OK(ReOpenNoDelete()); + ASSERT_NE(db, nullptr); + db_impl = static_cast_with_check(db->GetRootDB()); + seq = db_impl->GetLatestSequenceNumber(); + ASSERT_LE(exp_seq, seq + with_empty_commits); + } +} + +// Run a couple of different txns among them some uncommitted. Restart the db at +// a couple points to check whether the list of uncommitted txns are recovered +// properly. 
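+// After each crash and ReOpenNoDelete() the WAL is replayed and max_evicted_seq_ +// ends up at or beyond the replayed prepare sequence numbers, so the still +// uncommitted txns are expected to reappear in delayed_prepared_ rather than in +// prepared_txns_, which is what the assertions below verify.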
+TEST_P(WritePreparedTransactionTest, BasicRecovery) { + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + WritePreparedTxnDB* wp_db = dynamic_cast(db); + + TestTxn0(0); + + TransactionOptions txn_options; + WriteOptions write_options; + size_t index = 1000; + Transaction* txn0 = db->BeginTransaction(write_options, txn_options); + auto istr0 = std::to_string(index); + auto s = txn0->SetName("xid" + istr0); + ASSERT_OK(s); + s = txn0->Put(Slice("foo0" + istr0), Slice("bar0" + istr0)); + ASSERT_OK(s); + s = txn0->Prepare(); + ASSERT_OK(s); + auto prep_seq_0 = txn0->GetId(); + + TestTxn1(0); + + index++; + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + auto istr1 = std::to_string(index); + s = txn1->SetName("xid" + istr1); + ASSERT_OK(s); + s = txn1->Put(Slice("foo1" + istr1), Slice("bar")); + ASSERT_OK(s); + s = txn1->Prepare(); + ASSERT_OK(s); + auto prep_seq_1 = txn1->GetId(); + + TestTxn2(0); + + ReadOptions ropt; + PinnableSlice pinnable_val; + // Check the value is not committed before restart + s = db->Get(ropt, db->DefaultColumnFamily(), "foo0" + istr0, &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + pinnable_val.Reset(); + + delete txn0; + delete txn1; + ASSERT_OK(wp_db->db_impl_->FlushWAL(true)); + wp_db->TEST_Crash(); + ASSERT_OK(ReOpenNoDelete()); + ASSERT_NE(db, nullptr); + wp_db = dynamic_cast(db); + // After recovery, all the uncommitted txns (0 and 1) should be inserted into + // delayed_prepared_ + ASSERT_TRUE(wp_db->prepared_txns_.empty()); + ASSERT_FALSE(wp_db->delayed_prepared_empty_); + ASSERT_LE(prep_seq_0, wp_db->max_evicted_seq_); + ASSERT_LE(prep_seq_1, wp_db->max_evicted_seq_); + { + ReadLock rl(&wp_db->prepared_mutex_); + ASSERT_EQ(2, wp_db->delayed_prepared_.size()); + ASSERT_TRUE(wp_db->delayed_prepared_.find(prep_seq_0) != + wp_db->delayed_prepared_.end()); + ASSERT_TRUE(wp_db->delayed_prepared_.find(prep_seq_1) != + wp_db->delayed_prepared_.end()); + } + + // Check the value is still not committed after restart + s = db->Get(ropt, db->DefaultColumnFamily(), "foo0" + istr0, &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + pinnable_val.Reset(); + + TestTxn3(0); + + // Test that a recovered txns will be properly marked committed for the next + // recovery + txn1 = db->GetTransactionByName("xid" + istr1); + ASSERT_NE(txn1, nullptr); + ASSERT_OK(txn1->Commit()); + delete txn1; + + index++; + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + auto istr2 = std::to_string(index); + s = txn2->SetName("xid" + istr2); + ASSERT_OK(s); + s = txn2->Put(Slice("foo2" + istr2), Slice("bar")); + ASSERT_OK(s); + s = txn2->Prepare(); + ASSERT_OK(s); + auto prep_seq_2 = txn2->GetId(); + + delete txn2; + ASSERT_OK(wp_db->db_impl_->FlushWAL(true)); + wp_db->TEST_Crash(); + ASSERT_OK(ReOpenNoDelete()); + ASSERT_NE(db, nullptr); + wp_db = dynamic_cast(db); + ASSERT_TRUE(wp_db->prepared_txns_.empty()); + ASSERT_FALSE(wp_db->delayed_prepared_empty_); + + // 0 and 2 are prepared and 1 is committed + { + ReadLock rl(&wp_db->prepared_mutex_); + ASSERT_EQ(2, wp_db->delayed_prepared_.size()); + const auto& end = wp_db->delayed_prepared_.end(); + ASSERT_NE(wp_db->delayed_prepared_.find(prep_seq_0), end); + ASSERT_EQ(wp_db->delayed_prepared_.find(prep_seq_1), end); + ASSERT_NE(wp_db->delayed_prepared_.find(prep_seq_2), end); + } + ASSERT_LE(prep_seq_0, wp_db->max_evicted_seq_); + ASSERT_LE(prep_seq_2, wp_db->max_evicted_seq_); + + // Commit all the remaining txns + txn0 = db->GetTransactionByName("xid" + istr0); + ASSERT_NE(txn0, 
nullptr); + ASSERT_OK(txn0->Commit()); + txn2 = db->GetTransactionByName("xid" + istr2); + ASSERT_NE(txn2, nullptr); + ASSERT_OK(txn2->Commit()); + + // Check the value is committed after commit + s = db->Get(ropt, db->DefaultColumnFamily(), "foo0" + istr0, &pinnable_val); + ASSERT_TRUE(s.ok()); + ASSERT_TRUE(pinnable_val == ("bar0" + istr0)); + pinnable_val.Reset(); + + delete txn0; + delete txn2; + ASSERT_OK(wp_db->db_impl_->FlushWAL(true)); + ASSERT_OK(ReOpenNoDelete()); + ASSERT_NE(db, nullptr); + wp_db = dynamic_cast(db); + ASSERT_TRUE(wp_db->prepared_txns_.empty()); + ASSERT_TRUE(wp_db->delayed_prepared_empty_); + + // Check the value is still committed after recovery + s = db->Get(ropt, db->DefaultColumnFamily(), "foo0" + istr0, &pinnable_val); + ASSERT_TRUE(s.ok()); + ASSERT_TRUE(pinnable_val == ("bar0" + istr0)); + pinnable_val.Reset(); +} + +// After recovery the commit map is empty while the max is set. The code would +// go through a different path which requires a separate test. Test that the +// committed data before the restart is visible to all snapshots. +TEST_P(WritePreparedTransactionTest, IsInSnapshotEmptyMap) { + for (bool end_with_prepare : {false, true}) { + ASSERT_OK(ReOpen()); + WriteOptions woptions; + ASSERT_OK(db->Put(woptions, "key", "value")); + ASSERT_OK(db->Put(woptions, "key", "value")); + ASSERT_OK(db->Put(woptions, "key", "value")); + SequenceNumber prepare_seq = kMaxSequenceNumber; + if (end_with_prepare) { + TransactionOptions txn_options; + Transaction* txn = db->BeginTransaction(woptions, txn_options); + ASSERT_OK(txn->SetName("xid0")); + ASSERT_OK(txn->Prepare()); + prepare_seq = txn->GetId(); + delete txn; + } + dynamic_cast(db)->TEST_Crash(); + auto db_impl = static_cast_with_check(db->GetRootDB()); + ASSERT_OK(db_impl->FlushWAL(true)); + ASSERT_OK(ReOpenNoDelete()); + WritePreparedTxnDB* wp_db = dynamic_cast(db); + ASSERT_NE(wp_db, nullptr); + ASSERT_GT(wp_db->max_evicted_seq_, 0); // max after recovery + // Take a snapshot right after recovery + const Snapshot* snap = db->GetSnapshot(); + auto snap_seq = snap->GetSequenceNumber(); + ASSERT_GT(snap_seq, 0); + + for (SequenceNumber seq = 0; + seq <= wp_db->max_evicted_seq_ && seq != prepare_seq; seq++) { + ASSERT_TRUE(wp_db->IsInSnapshot(seq, snap_seq)); + } + if (end_with_prepare) { + ASSERT_FALSE(wp_db->IsInSnapshot(prepare_seq, snap_seq)); + } + // trivial check + ASSERT_FALSE(wp_db->IsInSnapshot(snap_seq + 1, snap_seq)); + + db->ReleaseSnapshot(snap); + + ASSERT_OK(db->Put(woptions, "key", "value")); + // Take a snapshot after some writes + snap = db->GetSnapshot(); + snap_seq = snap->GetSequenceNumber(); + for (SequenceNumber seq = 0; + seq <= wp_db->max_evicted_seq_ && seq != prepare_seq; seq++) { + ASSERT_TRUE(wp_db->IsInSnapshot(seq, snap_seq)); + } + if (end_with_prepare) { + ASSERT_FALSE(wp_db->IsInSnapshot(prepare_seq, snap_seq)); + } + // trivial check + ASSERT_FALSE(wp_db->IsInSnapshot(snap_seq + 1, snap_seq)); + + db->ReleaseSnapshot(snap); + } +} + +// Shows the contract of IsInSnapshot when called on invalid/released snapshots +TEST_P(WritePreparedTransactionTest, IsInSnapshotReleased) { + WritePreparedTxnDB* wp_db = dynamic_cast(db); + WriteOptions woptions; + ASSERT_OK(db->Put(woptions, "key", "value")); + // snap seq = 1 + const Snapshot* snap1 = db->GetSnapshot(); + ASSERT_OK(db->Put(woptions, "key", "value")); + ASSERT_OK(db->Put(woptions, "key", "value")); + // snap seq = 3 + const Snapshot* snap2 = db->GetSnapshot(); + const SequenceNumber seq = 1; + // Evict seq out of 
commit cache + size_t overwrite_seq = wp_db->COMMIT_CACHE_SIZE + seq; + wp_db->AddCommitted(overwrite_seq, overwrite_seq); + SequenceNumber snap_seq; + uint64_t min_uncommitted = kMinUnCommittedSeq; + bool released; + + released = false; + snap_seq = snap1->GetSequenceNumber(); + ASSERT_LE(seq, snap_seq); + // Valid snapshot lower than max + ASSERT_LE(snap_seq, wp_db->max_evicted_seq_); + ASSERT_TRUE(wp_db->IsInSnapshot(seq, snap_seq, min_uncommitted, &released)); + ASSERT_FALSE(released); + + released = false; + snap_seq = snap1->GetSequenceNumber(); + // Invalid snapshot lower than max + ASSERT_LE(snap_seq + 1, wp_db->max_evicted_seq_); + ASSERT_TRUE( + wp_db->IsInSnapshot(seq, snap_seq + 1, min_uncommitted, &released)); + ASSERT_TRUE(released); + + db->ReleaseSnapshot(snap1); + + released = false; + // Released snapshot lower than max + ASSERT_TRUE(wp_db->IsInSnapshot(seq, snap_seq, min_uncommitted, &released)); + // The release does not take effect until the next max advance + ASSERT_FALSE(released); + + released = false; + // Invalid snapshot lower than max + ASSERT_TRUE( + wp_db->IsInSnapshot(seq, snap_seq + 1, min_uncommitted, &released)); + ASSERT_TRUE(released); + + // This makes the snapshot release take effect in the txn db structures + wp_db->AdvanceMaxEvictedSeq(wp_db->max_evicted_seq_, + wp_db->max_evicted_seq_ + 1); + + released = false; + // Released snapshot lower than max + ASSERT_TRUE(wp_db->IsInSnapshot(seq, snap_seq, min_uncommitted, &released)); + ASSERT_TRUE(released); + + released = false; + // Invalid snapshot lower than max + ASSERT_TRUE( + wp_db->IsInSnapshot(seq, snap_seq + 1, min_uncommitted, &released)); + ASSERT_TRUE(released); + + snap_seq = snap2->GetSequenceNumber(); + + released = false; + // Unreleased snapshot lower than max + ASSERT_TRUE(wp_db->IsInSnapshot(seq, snap_seq, min_uncommitted, &released)); + ASSERT_FALSE(released); + + db->ReleaseSnapshot(snap2); +} + +// Test WritePreparedTxnDB's IsInSnapshot against different orderings of +// snapshot, max_committed_seq_, prepared, and commit entries. +TEST_P(WritePreparedTransactionTest, IsInSnapshot) { + WriteOptions wo; + // Use a small commit cache to trigger lots of eviction and fast advance of + // max_evicted_seq_ + const size_t commit_cache_bits = 3; + // Same for snapshot cache size + const size_t snapshot_cache_bits = 2; + + // Take some preliminary snapshots first. This is to stress the data structure + // that holds the old snapshots, as it is designed to be efficient when + // only a few snapshots are below the max_evicted_seq_. + for (int max_snapshots = 1; max_snapshots < 20; max_snapshots++) { + // Leave some gap between the preliminary snapshots and the final snapshot + // that we check. This should also test different overlapping scenarios + // between the last snapshot and the commits. + for (int max_gap = 1; max_gap < 10; max_gap++) { + // Since we do not actually write to db, we mock the seq as it would be + // increased by the db. The only exception is that we need db seq to + // advance for our snapshots, for which we apply a dummy put each time we + // increase our mock of seq. + uint64_t seq = 0; + // At each step we prepare a txn and then we commit it in the next txn.
+ // This emulates the consecutive transactions that write to the same key + uint64_t cur_txn = 0; + // Number of snapshots taken so far + int num_snapshots = 0; + // Number of gaps applied so far + int gap_cnt = 0; + // The final snapshot that we will inspect + uint64_t snapshot = 0; + bool found_committed = false; + // To stress the data structure that maintain prepared txns, at each cycle + // we add a new prepare txn. These do not mean to be committed for + // snapshot inspection. + std::set prepared; + // We keep the list of txns committed before we take the last snapshot. + // These should be the only seq numbers that will be found in the snapshot + std::set committed_before; + // The set of commit seq numbers to be excluded from IsInSnapshot queries + std::set commit_seqs; + DBImpl* mock_db = new DBImpl(options, dbname); + UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits); + std::unique_ptr wp_db( + new WritePreparedTxnDBMock(mock_db, txn_db_options)); + // We continue until max advances a bit beyond the snapshot. + while (!snapshot || wp_db->max_evicted_seq_ < snapshot + 100) { + // do prepare for a transaction + seq++; + wp_db->AddPrepared(seq); + prepared.insert(seq); + + // If cur_txn is not started, do prepare for it. + if (!cur_txn) { + seq++; + cur_txn = seq; + wp_db->AddPrepared(cur_txn); + } else { // else commit it + seq++; + wp_db->AddCommitted(cur_txn, seq); + wp_db->RemovePrepared(cur_txn); + commit_seqs.insert(seq); + if (!snapshot) { + committed_before.insert(cur_txn); + } + cur_txn = 0; + } + + if (num_snapshots < max_snapshots - 1) { + // Take preliminary snapshots + wp_db->TakeSnapshot(seq); + num_snapshots++; + } else if (gap_cnt < max_gap) { + // Wait for some gap before taking the final snapshot + gap_cnt++; + } else if (!snapshot) { + // Take the final snapshot if it is not already taken + snapshot = seq; + wp_db->TakeSnapshot(snapshot); + num_snapshots++; + } + + // If the snapshot is taken, verify seq numbers visible to it. We redo + // it at each cycle to test that the system is still sound when + // max_evicted_seq_ advances. + if (snapshot) { + for (uint64_t s = 1; + s <= seq && commit_seqs.find(s) == commit_seqs.end(); s++) { + bool was_committed = + (committed_before.find(s) != committed_before.end()); + bool is_in_snapshot = wp_db->IsInSnapshot(s, snapshot); + if (was_committed != is_in_snapshot) { + printf("max_snapshots %d max_gap %d seq %" PRIu64 " max %" PRIu64 + " snapshot %" PRIu64 + " gap_cnt %d num_snapshots %d s %" PRIu64 "\n", + max_snapshots, max_gap, seq, + wp_db->max_evicted_seq_.load(), snapshot, gap_cnt, + num_snapshots, s); + } + ASSERT_EQ(was_committed, is_in_snapshot); + found_committed = found_committed || is_in_snapshot; + } + } + } + // Safety check to make sure the test actually ran + ASSERT_TRUE(found_committed); + // As an extra check, check if prepared set will be properly empty after + // they are committed. 
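// For intuition, each iteration of the check above asserts, for a data seq s
// that is not itself a commit marker:
//
//   bool expected = (committed_before.count(s) != 0);  // committed before the
//                                                      // snapshot was taken?
//   ASSERT_EQ(expected, wp_db->IsInSnapshot(s, snapshot));
//
// (a rough paraphrase; the real visibility decision in
// WritePreparedTxnDB::IsInSnapshot also consults the commit cache,
// max_evicted_seq_ and the old-snapshot bookkeeping). With commit_cache_bits
// = 3 the commit cache has only 2^3 = 8 slots, so nearly every AddCommitted()
// evicts an older entry and pushes max_evicted_seq_ forward, which is the
// condition this loop is meant to stress.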
+ if (cur_txn) { + wp_db->AddCommitted(cur_txn, seq); + wp_db->RemovePrepared(cur_txn); + } + for (auto p : prepared) { + wp_db->AddCommitted(p, seq); + wp_db->RemovePrepared(p); + } + ASSERT_TRUE(wp_db->delayed_prepared_.empty()); + ASSERT_TRUE(wp_db->prepared_txns_.empty()); + } + } +} + +void ASSERT_SAME(ReadOptions roptions, TransactionDB* db, Status exp_s, + PinnableSlice& exp_v, Slice key) { + Status s; + PinnableSlice v; + s = db->Get(roptions, db->DefaultColumnFamily(), key, &v); + ASSERT_EQ(exp_s, s); + ASSERT_TRUE(s.ok() || s.IsNotFound()); + if (s.ok()) { + ASSERT_TRUE(exp_v == v); + } + + // Try with MultiGet API too + std::vector values; + auto s_vec = + db->MultiGet(roptions, {db->DefaultColumnFamily()}, {key}, &values); + ASSERT_EQ(1, values.size()); + ASSERT_EQ(1, s_vec.size()); + s = s_vec[0]; + ASSERT_EQ(exp_s, s); + ASSERT_TRUE(s.ok() || s.IsNotFound()); + if (s.ok()) { + ASSERT_TRUE(exp_v == values[0]); + } +} + +void ASSERT_SAME(TransactionDB* db, Status exp_s, PinnableSlice& exp_v, + Slice key) { + ASSERT_SAME(ReadOptions(), db, exp_s, exp_v, key); +} + +TEST_P(WritePreparedTransactionTest, Rollback) { + ReadOptions roptions; + WriteOptions woptions; + TransactionOptions txn_options; + const size_t num_keys = 4; + const size_t num_values = 5; + for (size_t ikey = 1; ikey <= num_keys; ikey++) { + for (size_t ivalue = 0; ivalue < num_values; ivalue++) { + for (bool crash : {false, true}) { + ASSERT_OK(ReOpen()); + WritePreparedTxnDB* wp_db = dynamic_cast(db); + std::string key_str = "key" + std::to_string(ikey); + switch (ivalue) { + case 0: + break; + case 1: + ASSERT_OK(db->Put(woptions, key_str, "initvalue1")); + break; + case 2: + ASSERT_OK(db->Merge(woptions, key_str, "initvalue2")); + break; + case 3: + ASSERT_OK(db->Delete(woptions, key_str)); + break; + case 4: + ASSERT_OK(db->SingleDelete(woptions, key_str)); + break; + default: + FAIL(); + } + + PinnableSlice v1; + auto s1 = + db->Get(roptions, db->DefaultColumnFamily(), Slice("key1"), &v1); + PinnableSlice v2; + auto s2 = + db->Get(roptions, db->DefaultColumnFamily(), Slice("key2"), &v2); + PinnableSlice v3; + auto s3 = + db->Get(roptions, db->DefaultColumnFamily(), Slice("key3"), &v3); + PinnableSlice v4; + auto s4 = + db->Get(roptions, db->DefaultColumnFamily(), Slice("key4"), &v4); + Transaction* txn = db->BeginTransaction(woptions, txn_options); + auto s = txn->SetName("xid0"); + ASSERT_OK(s); + s = txn->Put(Slice("key1"), Slice("value1")); + ASSERT_OK(s); + s = txn->Merge(Slice("key2"), Slice("value2")); + ASSERT_OK(s); + s = txn->Delete(Slice("key3")); + ASSERT_OK(s); + s = txn->SingleDelete(Slice("key4")); + ASSERT_OK(s); + s = txn->Prepare(); + ASSERT_OK(s); + + { + ReadLock rl(&wp_db->prepared_mutex_); + ASSERT_FALSE(wp_db->prepared_txns_.empty()); + ASSERT_EQ(txn->GetId(), wp_db->prepared_txns_.top()); + } + + ASSERT_SAME(db, s1, v1, "key1"); + ASSERT_SAME(db, s2, v2, "key2"); + ASSERT_SAME(db, s3, v3, "key3"); + ASSERT_SAME(db, s4, v4, "key4"); + + if (crash) { + delete txn; + auto db_impl = static_cast_with_check(db->GetRootDB()); + ASSERT_OK(db_impl->FlushWAL(true)); + dynamic_cast(db)->TEST_Crash(); + ASSERT_OK(ReOpenNoDelete()); + ASSERT_NE(db, nullptr); + wp_db = dynamic_cast(db); + txn = db->GetTransactionByName("xid0"); + ASSERT_FALSE(wp_db->delayed_prepared_empty_); + ReadLock rl(&wp_db->prepared_mutex_); + ASSERT_TRUE(wp_db->prepared_txns_.empty()); + ASSERT_FALSE(wp_db->delayed_prepared_.empty()); + ASSERT_TRUE(wp_db->delayed_prepared_.find(txn->GetId()) != + 
wp_db->delayed_prepared_.end()); + } + + ASSERT_SAME(db, s1, v1, "key1"); + ASSERT_SAME(db, s2, v2, "key2"); + ASSERT_SAME(db, s3, v3, "key3"); + ASSERT_SAME(db, s4, v4, "key4"); + + s = txn->Rollback(); + ASSERT_OK(s); + + { + ASSERT_TRUE(wp_db->delayed_prepared_empty_); + ReadLock rl(&wp_db->prepared_mutex_); + ASSERT_TRUE(wp_db->prepared_txns_.empty()); + ASSERT_TRUE(wp_db->delayed_prepared_.empty()); + } + + ASSERT_SAME(db, s1, v1, "key1"); + ASSERT_SAME(db, s2, v2, "key2"); + ASSERT_SAME(db, s3, v3, "key3"); + ASSERT_SAME(db, s4, v4, "key4"); + delete txn; + } + } + } +} + +TEST_P(WritePreparedTransactionTest, DisableGCDuringRecovery) { + // Use large buffer to avoid memtable flush after 1024 insertions + options.write_buffer_size = 1024 * 1024; + ASSERT_OK(ReOpen()); + std::vector versions; + uint64_t seq = 0; + for (uint64_t i = 1; i <= 1024; i++) { + std::string v = "bar" + std::to_string(i); + ASSERT_OK(db->Put(WriteOptions(), "foo", v)); + VerifyKeys({{"foo", v}}); + seq++; // one for the key/value + KeyVersion kv = {"foo", v, seq, kTypeValue}; + if (options.two_write_queues) { + seq++; // one for the commit + } + versions.emplace_back(kv); + } + std::reverse(std::begin(versions), std::end(versions)); + VerifyInternalKeys(versions); + DBImpl* db_impl = static_cast_with_check(db->GetRootDB()); + ASSERT_OK(db_impl->FlushWAL(true)); + // Use small buffer to ensure memtable flush during recovery + options.write_buffer_size = 1024; + ASSERT_OK(ReOpenNoDelete()); + VerifyInternalKeys(versions); +} + +TEST_P(WritePreparedTransactionTest, SequenceNumberZero) { + ASSERT_OK(db->Put(WriteOptions(), "foo", "bar")); + VerifyKeys({{"foo", "bar"}}); + const Snapshot* snapshot = db->GetSnapshot(); + ASSERT_OK(db->Flush(FlushOptions())); + // Dummy keys to avoid compaction trivially move files and get around actual + // compaction logic. + ASSERT_OK(db->Put(WriteOptions(), "a", "dummy")); + ASSERT_OK(db->Put(WriteOptions(), "z", "dummy")); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + // Compaction will output keys with sequence number 0, if it is visible to + // earliest snapshot. Make sure IsInSnapshot() report sequence number 0 is + // visible to any snapshot. + VerifyKeys({{"foo", "bar"}}); + VerifyKeys({{"foo", "bar"}}, snapshot); + VerifyInternalKeys({{"foo", "bar", 0, kTypeValue}}); + db->ReleaseSnapshot(snapshot); +} + +// Compaction should not remove a key if it is not committed, and should +// proceed with older versions of the key as-if the new version doesn't exist. +TEST_P(WritePreparedTransactionTest, CompactionShouldKeepUncommittedKeys) { + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + DBImpl* db_impl = static_cast_with_check(db->GetRootDB()); + // Snapshots to avoid keys get evicted. + std::vector snapshots; + // Keep track of expected sequence number. + SequenceNumber expected_seq = 0; + + auto add_key = [&](std::function func) { + ASSERT_OK(func()); + expected_seq++; + if (options.two_write_queues) { + expected_seq++; // 1 for commit + } + ASSERT_EQ(expected_seq, db_impl->TEST_GetLastVisibleSequence()); + snapshots.push_back(db->GetSnapshot()); + }; + + // Each key here represent a standalone test case. 
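// For intuition on the expected_seq bookkeeping in add_key() above: every user
// write consumes one sequence number for its data, and with two_write_queues
// the commit marker issued by the second queue consumes one more. A rough
// sketch with hypothetical numbers:
//
//   // single write queue:            // two write queues:
//   Put("key1", "value1_1");          Put("key1", "value1_1");
//   // last visible seq: n + 1        // data seq: n + 1, commit seq: n + 2
//
// which is why the lambda bumps expected_seq once unconditionally and a second
// time only when options.two_write_queues is set.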
+ add_key([&]() { return db->Put(WriteOptions(), "key1", "value1_1"); }); + add_key([&]() { return db->Put(WriteOptions(), "key2", "value2_1"); }); + add_key([&]() { return db->Put(WriteOptions(), "key3", "value3_1"); }); + add_key([&]() { return db->Put(WriteOptions(), "key4", "value4_1"); }); + add_key([&]() { return db->Merge(WriteOptions(), "key5", "value5_1"); }); + add_key([&]() { return db->Merge(WriteOptions(), "key5", "value5_2"); }); + add_key([&]() { return db->Put(WriteOptions(), "key6", "value6_1"); }); + add_key([&]() { return db->Put(WriteOptions(), "key7", "value7_1"); }); + ASSERT_OK(db->Flush(FlushOptions())); + add_key([&]() { return db->Delete(WriteOptions(), "key6"); }); + add_key([&]() { return db->SingleDelete(WriteOptions(), "key7"); }); + + auto* transaction = db->BeginTransaction(WriteOptions()); + ASSERT_OK(transaction->SetName("txn")); + ASSERT_OK(transaction->Put("key1", "value1_2")); + ASSERT_OK(transaction->Delete("key2")); + ASSERT_OK(transaction->SingleDelete("key3")); + ASSERT_OK(transaction->Merge("key4", "value4_2")); + ASSERT_OK(transaction->Merge("key5", "value5_3")); + ASSERT_OK(transaction->Put("key6", "value6_2")); + ASSERT_OK(transaction->Put("key7", "value7_2")); + // Prepare but not commit. + ASSERT_OK(transaction->Prepare()); + ASSERT_EQ(++expected_seq, db->GetLatestSequenceNumber()); + ASSERT_OK(db->Flush(FlushOptions())); + for (auto* s : snapshots) { + db->ReleaseSnapshot(s); + } + // Dummy keys to avoid compaction trivially move files and get around actual + // compaction logic. + ASSERT_OK(db->Put(WriteOptions(), "a", "dummy")); + ASSERT_OK(db->Put(WriteOptions(), "z", "dummy")); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + VerifyKeys({ + {"key1", "value1_1"}, + {"key2", "value2_1"}, + {"key3", "value3_1"}, + {"key4", "value4_1"}, + {"key5", "value5_1,value5_2"}, + {"key6", "NOT_FOUND"}, + {"key7", "NOT_FOUND"}, + }); + VerifyInternalKeys({ + {"key1", "value1_2", expected_seq, kTypeValue}, + {"key1", "value1_1", 0, kTypeValue}, + {"key2", "", expected_seq, kTypeDeletion}, + {"key2", "value2_1", 0, kTypeValue}, + {"key3", "", expected_seq, kTypeSingleDeletion}, + {"key3", "value3_1", 0, kTypeValue}, + {"key4", "value4_2", expected_seq, kTypeMerge}, + {"key4", "value4_1", 0, kTypeValue}, + {"key5", "value5_3", expected_seq, kTypeMerge}, + {"key5", "value5_1,value5_2", 0, kTypeValue}, + {"key6", "value6_2", expected_seq, kTypeValue}, + {"key7", "value7_2", expected_seq, kTypeValue}, + }); + ASSERT_OK(transaction->Commit()); + VerifyKeys({ + {"key1", "value1_2"}, + {"key2", "NOT_FOUND"}, + {"key3", "NOT_FOUND"}, + {"key4", "value4_1,value4_2"}, + {"key5", "value5_1,value5_2,value5_3"}, + {"key6", "value6_2"}, + {"key7", "value7_2"}, + }); + delete transaction; +} + +// Compaction should keep keys visible to a snapshot based on commit sequence, +// not just prepare sequence. +TEST_P(WritePreparedTransactionTest, CompactionShouldKeepSnapshotVisibleKeys) { + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + // Keep track of expected sequence number. 
+ SequenceNumber expected_seq = 0; + auto* txn1 = db->BeginTransaction(WriteOptions()); + ASSERT_OK(txn1->SetName("txn1")); + ASSERT_OK(txn1->Put("key1", "value1_1")); + ASSERT_OK(txn1->Prepare()); + ASSERT_EQ(++expected_seq, db->GetLatestSequenceNumber()); + ASSERT_OK(txn1->Commit()); + DBImpl* db_impl = static_cast_with_check(db->GetRootDB()); + ASSERT_EQ(++expected_seq, db_impl->TEST_GetLastVisibleSequence()); + delete txn1; + // Take a snapshots to avoid keys get evicted before compaction. + const Snapshot* snapshot1 = db->GetSnapshot(); + auto* txn2 = db->BeginTransaction(WriteOptions()); + ASSERT_OK(txn2->SetName("txn2")); + ASSERT_OK(txn2->Put("key2", "value2_1")); + ASSERT_OK(txn2->Prepare()); + ASSERT_EQ(++expected_seq, db->GetLatestSequenceNumber()); + // txn1 commit before snapshot2 and it is visible to snapshot2. + // txn2 commit after snapshot2 and it is not visible. + const Snapshot* snapshot2 = db->GetSnapshot(); + ASSERT_OK(txn2->Commit()); + ASSERT_EQ(++expected_seq, db_impl->TEST_GetLastVisibleSequence()); + delete txn2; + // Take a snapshots to avoid keys get evicted before compaction. + const Snapshot* snapshot3 = db->GetSnapshot(); + ASSERT_OK(db->Put(WriteOptions(), "key1", "value1_2")); + expected_seq++; // 1 for write + SequenceNumber seq1 = expected_seq; + if (options.two_write_queues) { + expected_seq++; // 1 for commit + } + ASSERT_EQ(expected_seq, db_impl->TEST_GetLastVisibleSequence()); + ASSERT_OK(db->Put(WriteOptions(), "key2", "value2_2")); + expected_seq++; // 1 for write + SequenceNumber seq2 = expected_seq; + if (options.two_write_queues) { + expected_seq++; // 1 for commit + } + ASSERT_EQ(expected_seq, db_impl->TEST_GetLastVisibleSequence()); + ASSERT_OK(db->Flush(FlushOptions())); + db->ReleaseSnapshot(snapshot1); + db->ReleaseSnapshot(snapshot3); + // Dummy keys to avoid compaction trivially move files and get around actual + // compaction logic. + ASSERT_OK(db->Put(WriteOptions(), "a", "dummy")); + ASSERT_OK(db->Put(WriteOptions(), "z", "dummy")); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + VerifyKeys({{"key1", "value1_2"}, {"key2", "value2_2"}}); + VerifyKeys({{"key1", "value1_1"}, {"key2", "NOT_FOUND"}}, snapshot2); + VerifyInternalKeys({ + {"key1", "value1_2", seq1, kTypeValue}, + // "value1_1" is visible to snapshot2. Also keys at bottom level visible + // to earliest snapshot will output with seq = 0. + {"key1", "value1_1", 0, kTypeValue}, + {"key2", "value2_2", seq2, kTypeValue}, + }); + db->ReleaseSnapshot(snapshot2); +} + +TEST_P(WritePreparedTransactionTest, SmallestUncommittedOptimization) { + const size_t snapshot_cache_bits = 7; // same as default + const size_t commit_cache_bits = 0; // disable commit cache + for (bool has_recent_prepare : {true, false}) { + UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits); + ASSERT_OK(ReOpen()); + + ASSERT_OK(db->Put(WriteOptions(), "key1", "value1")); + auto* transaction = + db->BeginTransaction(WriteOptions(), TransactionOptions(), nullptr); + ASSERT_OK(transaction->SetName("txn")); + ASSERT_OK(transaction->Delete("key1")); + ASSERT_OK(transaction->Prepare()); + // snapshot1 should get min_uncommitted from prepared_txns_ heap. + auto snapshot1 = db->GetSnapshot(); + ASSERT_EQ(transaction->GetId(), + ((SnapshotImpl*)snapshot1)->min_uncommitted_); + // Add a commit to advance max_evicted_seq and move the prepared transaction + // into delayed_prepared_ set. 
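// For intuition, min_uncommitted is the smallest sequence number that might
// still be uncommitted at the time the snapshot is taken; a read under that
// snapshot can treat anything below it as committed without consulting the
// commit cache. A rough sketch of where the two snapshots in this test get it
// from (not the production code path):
//
//   // while the prepared delete is still tracked in the heap:
//   min_uncommitted = prepared_txns_.top();                 // snapshot1's case
//   // after max_evicted_seq_ has passed the prepared delete:
//   min_uncommitted = smallest entry in delayed_prepared_;  // snapshot2's case
//
// With commit_cache_bits = 0 the commit cache has a single slot, which is why
// the single Put below is enough to push max_evicted_seq_ past the prepared
// transaction.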
+ ASSERT_OK(db->Put(WriteOptions(), "key2", "value2")); + Transaction* txn2 = nullptr; + if (has_recent_prepare) { + txn2 = + db->BeginTransaction(WriteOptions(), TransactionOptions(), nullptr); + ASSERT_OK(txn2->SetName("txn2")); + ASSERT_OK(txn2->Put("key3", "value3")); + ASSERT_OK(txn2->Prepare()); + } + // snapshot2 should get min_uncommitted from delayed_prepared_ set. + auto snapshot2 = db->GetSnapshot(); + ASSERT_EQ(transaction->GetId(), + ((SnapshotImpl*)snapshot1)->min_uncommitted_); + ASSERT_OK(transaction->Commit()); + delete transaction; + if (has_recent_prepare) { + ASSERT_OK(txn2->Commit()); + delete txn2; + } + VerifyKeys({{"key1", "NOT_FOUND"}}); + VerifyKeys({{"key1", "value1"}}, snapshot1); + VerifyKeys({{"key1", "value1"}}, snapshot2); + db->ReleaseSnapshot(snapshot1); + db->ReleaseSnapshot(snapshot2); + } +} + +// Insert two values, v1 and v2, for a key. Between prepare and commit of v2 +// take two snapshots, s1 and s2. Release s1 during compaction. +// Test to make sure compaction doesn't get confused and think s1 can see both +// values, and thus compact out the older value by mistake. +TEST_P(WritePreparedTransactionTest, ReleaseSnapshotDuringCompaction) { + const size_t snapshot_cache_bits = 7; // same as default + const size_t commit_cache_bits = 0; // minimum commit cache + UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits); + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + + ASSERT_OK(db->Put(WriteOptions(), "key1", "value1_1")); + auto* transaction = + db->BeginTransaction(WriteOptions(), TransactionOptions(), nullptr); + ASSERT_OK(transaction->SetName("txn")); + ASSERT_OK(transaction->Put("key1", "value1_2")); + ASSERT_OK(transaction->Prepare()); + auto snapshot1 = db->GetSnapshot(); + // Increment sequence number. + ASSERT_OK(db->Put(WriteOptions(), "key2", "value2")); + auto snapshot2 = db->GetSnapshot(); + ASSERT_OK(transaction->Commit()); + delete transaction; + VerifyKeys({{"key1", "value1_2"}}); + VerifyKeys({{"key1", "value1_1"}}, snapshot1); + VerifyKeys({{"key1", "value1_1"}}, snapshot2); + // Add a flush to avoid compaction to fallback to trivial move. + + // The callback might be called twice, record the calling state to + // prevent double calling. + bool callback_finished = false; + auto callback = [&](void*) { + if (callback_finished) { + return; + } + // Release snapshot1 after CompactionIterator init. + // CompactionIterator need to figure out the earliest snapshot + // that can see key1:value1_2 is kMaxSequenceNumber, not + // snapshot1 or snapshot2. + db->ReleaseSnapshot(snapshot1); + // Add some keys to advance max_evicted_seq. + ASSERT_OK(db->Put(WriteOptions(), "key3", "value3")); + ASSERT_OK(db->Put(WriteOptions(), "key4", "value4")); + callback_finished = true; + }; + SyncPoint::GetInstance()->SetCallBack("CompactionIterator:AfterInit", + callback); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db->Flush(FlushOptions())); + VerifyKeys({{"key1", "value1_2"}}); + VerifyKeys({{"key1", "value1_1"}}, snapshot2); + db->ReleaseSnapshot(snapshot2); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +// Insert two values, v1 and v2, for a key. Take two snapshots, s1 and s2, +// after committing v2. Release s1 during compaction, right after compaction +// processes v2 and before processes v1. Test to make sure compaction doesn't +// get confused and believe v1 and v2 are visible to different snapshot +// (v1 by s2, v2 by s1) and refuse to compact out v1. 
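// For intuition, compaction may drop the older of two versions of a key only
// when no live snapshot separates them, since every reader would then see the
// newer version anyway. A rough sketch of the check, with hypothetical names
// (the real logic also routes through the write-prepared snapshot checker so
// that "visible" is judged by commit sequence):
//
//   bool can_drop_older(SequenceNumber older, SequenceNumber newer) {
//     // no remaining snapshot s with older <= s < newer
//     return !exists_snapshot_in_range(older, newer);
//   }
//
// The tests in this group release a snapshot from a sync-point callback while
// the compaction or flush job is running, to verify the check stays sound when
// the snapshot list shrinks mid-job.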
+TEST_P(WritePreparedTransactionTest, ReleaseSnapshotDuringCompaction2) { + const size_t snapshot_cache_bits = 7; // same as default + const size_t commit_cache_bits = 0; // minimum commit cache + UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits); + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + + ASSERT_OK(db->Put(WriteOptions(), "key1", "value1")); + ASSERT_OK(db->Put(WriteOptions(), "key1", "value2")); + SequenceNumber v2_seq = db->GetLatestSequenceNumber(); + auto* s1 = db->GetSnapshot(); + // Advance sequence number. + ASSERT_OK(db->Put(WriteOptions(), "key2", "dummy")); + auto* s2 = db->GetSnapshot(); + + int count_value = 0; + auto callback = [&](void* arg) { + auto* ikey = reinterpret_cast(arg); + if (ikey->user_key == "key1") { + count_value++; + if (count_value == 2) { + // Processing v1. + db->ReleaseSnapshot(s1); + // Add some keys to advance max_evicted_seq and update + // old_commit_map. + ASSERT_OK(db->Put(WriteOptions(), "key3", "dummy")); + ASSERT_OK(db->Put(WriteOptions(), "key4", "dummy")); + } + } + }; + SyncPoint::GetInstance()->SetCallBack("CompactionIterator:ProcessKV", + callback); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db->Flush(FlushOptions())); + // value1 should be compact out. + VerifyInternalKeys({{"key1", "value2", v2_seq, kTypeValue}}); + + // cleanup + db->ReleaseSnapshot(s2); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +// Insert two values, v1 and v2, for a key. Insert another dummy key +// so to evict the commit cache for v2, while v1 is still in commit cache. +// Take two snapshots, s1 and s2. Release s1 during compaction. +// Since commit cache for v2 is evicted, and old_commit_map don't have +// s1 (it is released), +// TODO(myabandeh): how can we be sure that the v2's commit info is evicted +// (and not v1's)? Instead of putting a dummy, we can directly call +// AddCommitted(v2_seq + cache_size, ...) to evict v2's entry from commit cache. +TEST_P(WritePreparedTransactionTest, ReleaseSnapshotDuringCompaction3) { + const size_t snapshot_cache_bits = 7; // same as default + const size_t commit_cache_bits = 1; // commit cache size = 2 + UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits); + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + + // Add a dummy key to evict v2 commit cache, but keep v1 commit cache. + // It also advance max_evicted_seq and can trigger old_commit_map cleanup. + auto add_dummy = [&]() { + auto* txn_dummy = + db->BeginTransaction(WriteOptions(), TransactionOptions(), nullptr); + ASSERT_OK(txn_dummy->SetName("txn_dummy")); + ASSERT_OK(txn_dummy->Put("dummy", "dummy")); + ASSERT_OK(txn_dummy->Prepare()); + ASSERT_OK(txn_dummy->Commit()); + delete txn_dummy; + }; + + ASSERT_OK(db->Put(WriteOptions(), "key1", "value1")); + auto* txn = + db->BeginTransaction(WriteOptions(), TransactionOptions(), nullptr); + ASSERT_OK(txn->SetName("txn")); + ASSERT_OK(txn->Put("key1", "value2")); + ASSERT_OK(txn->Prepare()); + // TODO(myabandeh): replace it with GetId()? + auto v2_seq = db->GetLatestSequenceNumber(); + ASSERT_OK(txn->Commit()); + delete txn; + auto* s1 = db->GetSnapshot(); + // Dummy key to advance sequence number. + add_dummy(); + auto* s2 = db->GetSnapshot(); + + // The callback might be called twice, record the calling state to + // prevent double calling. 
+ bool callback_finished = false; + auto callback = [&](void*) { + if (callback_finished) { + return; + } + db->ReleaseSnapshot(s1); + // Add some dummy entries to trigger s1 being cleanup from old_commit_map. + add_dummy(); + add_dummy(); + callback_finished = true; + }; + SyncPoint::GetInstance()->SetCallBack("CompactionIterator:AfterInit", + callback); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db->Flush(FlushOptions())); + // value1 should be compact out. + VerifyInternalKeys({{"key1", "value2", v2_seq, kTypeValue}}); + + db->ReleaseSnapshot(s2); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(WritePreparedTransactionTest, ReleaseEarliestSnapshotDuringCompaction) { + const size_t snapshot_cache_bits = 7; // same as default + const size_t commit_cache_bits = 0; // minimum commit cache + UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits); + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + + ASSERT_OK(db->Put(WriteOptions(), "key1", "value1")); + SequenceNumber put_seq = db->GetLatestSequenceNumber(); + auto* transaction = + db->BeginTransaction(WriteOptions(), TransactionOptions(), nullptr); + ASSERT_OK(transaction->SetName("txn")); + ASSERT_OK(transaction->Delete("key1")); + ASSERT_OK(transaction->Prepare()); + SequenceNumber del_seq = db->GetLatestSequenceNumber(); + auto snapshot1 = db->GetSnapshot(); + // Increment sequence number. + ASSERT_OK(db->Put(WriteOptions(), "key2", "value2")); + auto snapshot2 = db->GetSnapshot(); + ASSERT_OK(transaction->Commit()); + delete transaction; + VerifyKeys({{"key1", "NOT_FOUND"}}); + VerifyKeys({{"key1", "value1"}}, snapshot1); + VerifyKeys({{"key1", "value1"}}, snapshot2); + ASSERT_OK(db->Flush(FlushOptions())); + + auto callback = [&](void* compaction) { + // Release snapshot1 after CompactionIterator init. + // CompactionIterator need to double check and find out snapshot2 is now + // the earliest existing snapshot. + if (compaction != nullptr) { + db->ReleaseSnapshot(snapshot1); + // Add some keys to advance max_evicted_seq. + ASSERT_OK(db->Put(WriteOptions(), "key3", "value3")); + ASSERT_OK(db->Put(WriteOptions(), "key4", "value4")); + } + }; + SyncPoint::GetInstance()->SetCallBack("CompactionIterator:AfterInit", + callback); + SyncPoint::GetInstance()->EnableProcessing(); + + // Dummy keys to avoid compaction trivially move files and get around actual + // compaction logic. + ASSERT_OK(db->Put(WriteOptions(), "a", "dummy")); + ASSERT_OK(db->Put(WriteOptions(), "z", "dummy")); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + // Only verify for key1. Both the put and delete for the key should be kept. + // Since the delete tombstone is not visible to snapshot2, we need to keep + // at least one version of the key, for write-conflict check. 
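// For intuition, the expected layout checked below (sequence numbers are the
// ones captured above):
//
//   key1 @ del_seq : kTypeDeletion  // committed only after snapshot2, so it
//                                   // must not hide value1 from snapshot2
//   key1 @ put_seq : kTypeValue     // the version snapshot2 is entitled to
//
// Dropping either entry would be wrong: without the tombstone a read at the
// latest sequence would resurrect value1, and without the put snapshot2 would
// lose a value it can still see; the newest version is also what write
// conflict checking looks at.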
+ VerifyInternalKeys({{"key1", "", del_seq, kTypeDeletion}, + {"key1", "value1", put_seq, kTypeValue}}); + db->ReleaseSnapshot(snapshot2); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(WritePreparedTransactionTest, + ReleaseEarliestSnapshotDuringCompaction_WithSD) { + constexpr size_t kSnapshotCacheBits = 7; // same as default + constexpr size_t kCommitCacheBits = 0; // minimum commit cache + UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits); + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + + ASSERT_OK(db->Put(WriteOptions(), "key", "value")); + ASSERT_OK(db->Put(WriteOptions(), "foo", "value")); + ASSERT_OK(db->Flush(FlushOptions())); + + auto* txn = db->BeginTransaction(WriteOptions(), TransactionOptions(), + /*old_txn=*/nullptr); + ASSERT_OK(txn->SingleDelete("key")); + ASSERT_OK(txn->Put("wow", "value")); + ASSERT_OK(txn->SetName("txn")); + ASSERT_OK(txn->Prepare()); + ASSERT_OK(db->Flush(FlushOptions())); + + const bool two_write_queues = std::get<1>(GetParam()); + if (two_write_queues) { + // In the case of two queues, commit another txn just to bump + // last_published_seq so that a subsequent GetSnapshot() call can return + // a snapshot with higher sequence. + auto* dummy_txn = db->BeginTransaction(WriteOptions(), TransactionOptions(), + /*old_txn=*/nullptr); + ASSERT_OK(dummy_txn->Put("haha", "value")); + ASSERT_OK(dummy_txn->Commit()); + delete dummy_txn; + } + auto* snapshot = db->GetSnapshot(); + + ASSERT_OK(txn->Commit()); + delete txn; + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::NextFromInput:SingleDelete:1", [&](void* arg) { + if (!arg) { + return; + } + db->ReleaseSnapshot(snapshot); + + // Advance max_evicted_seq + ASSERT_OK(db->Put(WriteOptions(), "bar", "value")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(WritePreparedTransactionTest, + ReleaseEarliestSnapshotDuringCompaction_WithSD2) { + constexpr size_t kSnapshotCacheBits = 7; // same as default + constexpr size_t kCommitCacheBits = 0; // minimum commit cache + UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits); + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + + ASSERT_OK(db->Put(WriteOptions(), "foo", "value")); + ASSERT_OK(db->Put(WriteOptions(), "key", "value")); + ASSERT_OK(db->Flush(FlushOptions())); + + auto* txn = db->BeginTransaction(WriteOptions(), TransactionOptions(), + /*old_txn=*/nullptr); + ASSERT_OK(txn->Put("bar", "value")); + ASSERT_OK(txn->SingleDelete("key")); + ASSERT_OK(txn->SetName("txn")); + ASSERT_OK(txn->Prepare()); + ASSERT_OK(db->Flush(FlushOptions())); + + ASSERT_OK(txn->Commit()); + delete txn; + + ASSERT_OK(db->Put(WriteOptions(), "haha", "value")); + + // Create a dummy transaction to take a snapshot for ww-conflict detection. 
+ TransactionOptions txn_opts; + txn_opts.set_snapshot = true; + auto* dummy_txn = + db->BeginTransaction(WriteOptions(), txn_opts, /*old_txn=*/nullptr); + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::NextFromInput:SingleDelete:2", [&](void* /*arg*/) { + ASSERT_OK(dummy_txn->Rollback()); + delete dummy_txn; + + ASSERT_OK(db->Put(WriteOptions(), "dontcare", "value")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db->Put(WriteOptions(), "haha2", "value")); + auto* snapshot = db->GetSnapshot(); + + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + db->ReleaseSnapshot(snapshot); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(WritePreparedTransactionTest, + ReleaseEarliestSnapshotDuringCompaction_WithDelete) { + constexpr size_t kSnapshotCacheBits = 7; // same as default + constexpr size_t kCommitCacheBits = 0; // minimum commit cache + UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits); + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + + ASSERT_OK(db->Put(WriteOptions(), "a", "value")); + ASSERT_OK(db->Put(WriteOptions(), "b", "value")); + ASSERT_OK(db->Put(WriteOptions(), "c", "value")); + ASSERT_OK(db->Flush(FlushOptions())); + + auto* txn = db->BeginTransaction(WriteOptions(), TransactionOptions(), + /*old_txn=*/nullptr); + ASSERT_OK(txn->Delete("b")); + ASSERT_OK(txn->SetName("txn")); + ASSERT_OK(txn->Prepare()); + + const bool two_write_queues = std::get<1>(GetParam()); + if (two_write_queues) { + // In the case of two queues, commit another txn just to bump + // last_published_seq so that a subsequent GetSnapshot() call can return + // a snapshot with higher sequence. + auto* dummy_txn = db->BeginTransaction(WriteOptions(), TransactionOptions(), + /*old_txn=*/nullptr); + ASSERT_OK(dummy_txn->Put("haha", "value")); + ASSERT_OK(dummy_txn->Commit()); + delete dummy_txn; + } + auto* snapshot1 = db->GetSnapshot(); + ASSERT_OK(txn->Commit()); + delete txn; + auto* snapshot2 = db->GetSnapshot(); + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::NextFromInput:BottommostDelete:1", [&](void* arg) { + if (!arg) { + return; + } + db->ReleaseSnapshot(snapshot1); + + // Advance max_evicted_seq + ASSERT_OK(db->Put(WriteOptions(), "dummy1", "value")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + db->ReleaseSnapshot(snapshot2); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(WritePreparedTransactionTest, + ReleaseSnapshotBetweenSDAndPutDuringCompaction) { + constexpr size_t kSnapshotCacheBits = 7; // same as default + constexpr size_t kCommitCacheBits = 0; // minimum commit cache + UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits); + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + + // Create a dummy transaction to take a snapshot for ww-conflict detection. 
+ TransactionOptions txn_opts; + txn_opts.set_snapshot = true; + auto* dummy_txn = + db->BeginTransaction(WriteOptions(), txn_opts, /*old_txn=*/nullptr); + // Increment seq + ASSERT_OK(db->Put(WriteOptions(), "bar", "value")); + + ASSERT_OK(db->Put(WriteOptions(), "foo", "value")); + ASSERT_OK(db->SingleDelete(WriteOptions(), "foo")); + auto* snapshot1 = db->GetSnapshot(); + // Increment seq + ASSERT_OK(db->Put(WriteOptions(), "dontcare", "value")); + auto* snapshot2 = db->GetSnapshot(); + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::NextFromInput:KeepSDForWW", [&](void* /*arg*/) { + db->ReleaseSnapshot(snapshot1); + + ASSERT_OK(db->Put(WriteOptions(), "dontcare2", "value2")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db->Flush(FlushOptions())); + db->ReleaseSnapshot(snapshot2); + ASSERT_OK(dummy_txn->Commit()); + delete dummy_txn; + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(WritePreparedTransactionTest, + ReleaseEarliestWriteConflictSnapshot_SingleDelete) { + constexpr size_t kSnapshotCacheBits = 7; // same as default + constexpr size_t kCommitCacheBits = 0; // minimum commit cache + UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits); + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + + ASSERT_OK(db->Put(WriteOptions(), "a", "value")); + ASSERT_OK(db->Put(WriteOptions(), "b", "value")); + ASSERT_OK(db->Put(WriteOptions(), "c", "value")); + ASSERT_OK(db->Flush(FlushOptions())); + + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + ASSERT_OK(db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr)); + } + + std::unique_ptr txn; + txn.reset(db->BeginTransaction(WriteOptions(), TransactionOptions(), + /*old_txn=*/nullptr)); + ASSERT_OK(txn->SetName("txn1")); + ASSERT_OK(txn->SingleDelete("b")); + ASSERT_OK(txn->Prepare()); + ASSERT_OK(txn->Commit()); + + auto* snapshot1 = db->GetSnapshot(); + + // Bump seq of the db by performing writes so that + // earliest_snapshot_ < earliest_write_conflict_snapshot_ in + // CompactionIterator. + ASSERT_OK(db->Put(WriteOptions(), "z", "dontcare")); + + // Create another snapshot for write conflict checking + std::unique_ptr txn2; + { + TransactionOptions txn_opts; + txn_opts.set_snapshot = true; + txn2.reset( + db->BeginTransaction(WriteOptions(), txn_opts, /*old_txn=*/nullptr)); + } + + // Bump seq so that the subsequent bg flush won't create a snapshot with the + // same seq as the previous snapshot for conflict checking. + ASSERT_OK(db->Put(WriteOptions(), "y", "dont")); + + ASSERT_OK(db->Flush(FlushOptions())); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::NextFromInput:SingleDelete:1", [&](void* /*arg*/) { + // Rolling back txn2 should release its snapshot(for ww checking). 
+ ASSERT_OK(txn2->Rollback()); + txn2.reset(); + // Advance max_evicted_seq + ASSERT_OK(db->Put(WriteOptions(), "x", "value")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + db->ReleaseSnapshot(snapshot1); +} + +TEST_P(WritePreparedTransactionTest, ReleaseEarliestSnapshotAfterSeqZeroing) { + constexpr size_t kSnapshotCacheBits = 7; // same as default + constexpr size_t kCommitCacheBits = 0; // minimum commit cache + UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits); + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + + ASSERT_OK(db->Put(WriteOptions(), "a", "value")); + ASSERT_OK(db->Put(WriteOptions(), "b", "value")); + ASSERT_OK(db->Put(WriteOptions(), "c", "value")); + ASSERT_OK(db->Flush(FlushOptions())); + + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + ASSERT_OK(db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr)); + } + + ASSERT_OK(db->SingleDelete(WriteOptions(), "b")); + + // Take a snapshot so that the SD won't be dropped during flush. + auto* tmp_snapshot = db->GetSnapshot(); + + ASSERT_OK(db->Put(WriteOptions(), "b", "value2")); + auto* snapshot = db->GetSnapshot(); + ASSERT_OK(db->Flush(FlushOptions())); + + db->ReleaseSnapshot(tmp_snapshot); + + // Bump the sequence so that the below bg compaction job's snapshot will be + // different from snapshot's sequence. + ASSERT_OK(db->Put(WriteOptions(), "z", "foo")); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::PrepareOutput:ZeroingSeq", [&](void* arg) { + const auto* const ikey = + reinterpret_cast(arg); + assert(ikey); + if (ikey->user_key == "b") { + assert(ikey->type == kTypeValue); + db->ReleaseSnapshot(snapshot); + + // Bump max_evicted_seq. + ASSERT_OK(db->Put(WriteOptions(), "z", "dontcare")); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(WritePreparedTransactionTest, ReleaseEarliestSnapshotAfterSeqZeroing2) { + constexpr size_t kSnapshotCacheBits = 7; // same as default + constexpr size_t kCommitCacheBits = 0; // minimum commit cache + UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits); + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + + // Generate an L0 with only SD for one key "b". + ASSERT_OK(db->Put(WriteOptions(), "a", "value")); + ASSERT_OK(db->Put(WriteOptions(), "b", "value")); + // Take a snapshot so that subsequent flush outputs the SD for "b". + auto* tmp_snapshot = db->GetSnapshot(); + ASSERT_OK(db->SingleDelete(WriteOptions(), "b")); + ASSERT_OK(db->Put(WriteOptions(), "c", "value")); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::NextFromInput:SingleDelete:3", [&](void* arg) { + if (!arg) { + db->ReleaseSnapshot(tmp_snapshot); + // Bump max_evicted_seq + ASSERT_OK(db->Put(WriteOptions(), "x", "dontcare")); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db->Flush(FlushOptions())); + // Finish generating L0 with only SD for "b". 
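// For intuition on the "seq zeroing" optimization exercised by the two
// SeqZeroing tests (rough sketch, not the exact production predicate): at the
// bottommost level a key's sequence number may be rewritten to 0 only if the
// version is already visible to the earliest live snapshot, e.g.
//
//   if (bottommost && visible_to_earliest_snapshot(ikey)) ikey.sequence = 0;
//
// Under write-prepared, "visible" has to be judged by commit sequence, and
// releasing the earliest snapshot while the job runs can advance
// max_evicted_seq_ concurrently; the ZeroingSeq sync-point callbacks are
// placed exactly in that window.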
+ + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Move the L0 to L2. + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + ASSERT_OK(db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr)); + } + + ASSERT_OK(db->Put(WriteOptions(), "b", "value1")); + + auto* snapshot = db->GetSnapshot(); + + // Bump seq so that a subsequent flush/compaction job's snapshot is larger + // than the above snapshot's seq. + ASSERT_OK(db->Put(WriteOptions(), "x", "dontcare")); + + // Generate a second L0. + ASSERT_OK(db->Flush(FlushOptions())); + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::PrepareOutput:ZeroingSeq", [&](void* arg) { + const auto* const ikey = + reinterpret_cast(arg); + assert(ikey); + if (ikey->user_key == "b") { + assert(ikey->type == kTypeValue); + db->ReleaseSnapshot(snapshot); + + // Bump max_evicted_seq. + ASSERT_OK(db->Put(WriteOptions(), "z", "dontcare")); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +// Although the user-contract indicates that a SD can only be issued for a key +// that exists and has not been overwritten, it is still possible for a Delete +// to be present when write-prepared transaction is rolled back. +TEST_P(WritePreparedTransactionTest, SingleDeleteAfterRollback) { + constexpr size_t kSnapshotCacheBits = 7; // same as default + constexpr size_t kCommitCacheBits = 0; // minimum commit cache + txn_db_options.rollback_deletion_type_callback = + [](TransactionDB*, ColumnFamilyHandle*, const Slice&) { return true; }; + UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits); + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + + // Get a write conflict snapshot by creating a transaction with + // set_snapshot=true. + TransactionOptions txn_opts; + txn_opts.set_snapshot = true; + std::unique_ptr dummy_txn( + db->BeginTransaction(WriteOptions(), txn_opts)); + + std::unique_ptr txn0( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + ASSERT_OK(txn0->Put("foo", "value")); + ASSERT_OK(txn0->SetName("xid0")); + ASSERT_OK(txn0->Prepare()); + + // Create an SST with only {"foo": "value"}. + ASSERT_OK(db->Flush(FlushOptions())); + + // Insert a Delete to cancel out the prior Put by txn0. + ASSERT_OK(txn0->Rollback()); + txn0.reset(); + + // Create a second SST. + ASSERT_OK(db->Flush(FlushOptions())); + + ASSERT_OK(db->Put(WriteOptions(), "foo", "value1")); + + auto* snapshot = db->GetSnapshot(); + + ASSERT_OK(db->SingleDelete(WriteOptions(), "foo")); + + int count = 0; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::NextFromInput:SingleDelete:1", [&](void* arg) { + const auto* const c = reinterpret_cast(arg); + assert(!c); + // Trigger once only for SingleDelete during flush. + if (0 == count) { + ++count; + db->ReleaseSnapshot(snapshot); + // Bump max_evicted_seq + ASSERT_OK(db->Put(WriteOptions(), "x", "dontcare")); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + // Create a third SST containing a SD without its matching PUT. 
+ ASSERT_OK(db->Flush(FlushOptions())); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->EnableProcessing(); + + DBImpl* dbimpl = static_cast_with_check(db->GetRootDB()); + assert(dbimpl); + ASSERT_OK(dbimpl->TEST_CompactRange( + /*level=*/0, /*begin=*/nullptr, /*end=*/nullptr, + /*column_family=*/nullptr, /*disallow_trivial_mode=*/true)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Release the conflict-checking snapshot. + ASSERT_OK(dummy_txn->Rollback()); +} + +// A more complex test to verify compaction/flush should keep keys visible +// to snapshots. +TEST_P(WritePreparedTransactionTest, + CompactionKeepSnapshotVisibleKeysRandomized) { + constexpr size_t kNumTransactions = 10; + constexpr size_t kNumIterations = 1000; + + std::vector transactions(kNumTransactions, nullptr); + std::vector versions(kNumTransactions, 0); + std::unordered_map current_data; + std::vector snapshots; + std::vector> snapshot_data; + + Random rnd(1103); + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + + for (size_t i = 0; i < kNumTransactions; i++) { + std::string key = "key" + std::to_string(i); + std::string value = "value0"; + ASSERT_OK(db->Put(WriteOptions(), key, value)); + current_data[key] = value; + } + VerifyKeys(current_data); + + for (size_t iter = 0; iter < kNumIterations; iter++) { + auto r = rnd.Next() % (kNumTransactions + 1); + if (r < kNumTransactions) { + std::string key = "key" + std::to_string(r); + if (transactions[r] == nullptr) { + std::string value = "value" + std::to_string(versions[r] + 1); + auto* txn = db->BeginTransaction(WriteOptions()); + ASSERT_OK(txn->SetName("txn" + std::to_string(r))); + ASSERT_OK(txn->Put(key, value)); + ASSERT_OK(txn->Prepare()); + transactions[r] = txn; + } else { + std::string value = "value" + std::to_string(++versions[r]); + ASSERT_OK(transactions[r]->Commit()); + delete transactions[r]; + transactions[r] = nullptr; + current_data[key] = value; + } + } else { + auto* snapshot = db->GetSnapshot(); + VerifyKeys(current_data, snapshot); + snapshots.push_back(snapshot); + snapshot_data.push_back(current_data); + } + VerifyKeys(current_data); + } + // Take a last snapshot to test compaction with uncommitted prepared + // transaction. + snapshots.push_back(db->GetSnapshot()); + snapshot_data.push_back(current_data); + + ASSERT_EQ(snapshots.size(), snapshot_data.size()); + for (size_t i = 0; i < snapshots.size(); i++) { + VerifyKeys(snapshot_data[i], snapshots[i]); + } + ASSERT_OK(db->Flush(FlushOptions())); + for (size_t i = 0; i < snapshots.size(); i++) { + VerifyKeys(snapshot_data[i], snapshots[i]); + } + // Dummy keys to avoid compaction trivially move files and get around actual + // compaction logic. 
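// For intuition, the invariant this randomized test maintains throughout
// (a sketch of the checks already performed above and repeated after flush and
// compaction):
//
//   // current_data      == what a read with no snapshot must return now
//   // snapshot_data[i]  == what a read under snapshots[i] must keep returning
//   for (size_t i = 0; i < snapshots.size(); i++) {
//     VerifyKeys(snapshot_data[i], snapshots[i]);
//   }
//
// Neither the flush above nor the full-range compaction below may disturb any
// of these views, even though some transactions are still only prepared when
// the jobs run.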
+ ASSERT_OK(db->Put(WriteOptions(), "a", "dummy")); + ASSERT_OK(db->Put(WriteOptions(), "z", "dummy")); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + for (size_t i = 0; i < snapshots.size(); i++) { + VerifyKeys(snapshot_data[i], snapshots[i]); + } + // cleanup + for (size_t i = 0; i < kNumTransactions; i++) { + if (transactions[i] == nullptr) { + continue; + } + ASSERT_OK(transactions[i]->Commit()); + delete transactions[i]; + } + for (size_t i = 0; i < snapshots.size(); i++) { + db->ReleaseSnapshot(snapshots[i]); + } +} + +// Compaction should not apply the optimization to output key with sequence +// number equal to 0 if the key is not visible to earliest snapshot, based on +// commit sequence number. +TEST_P(WritePreparedTransactionTest, + CompactionShouldKeepSequenceForUncommittedKeys) { + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + // Keep track of expected sequence number. + SequenceNumber expected_seq = 0; + auto* transaction = db->BeginTransaction(WriteOptions()); + ASSERT_OK(transaction->SetName("txn")); + ASSERT_OK(transaction->Put("key1", "value1")); + ASSERT_OK(transaction->Prepare()); + ASSERT_EQ(++expected_seq, db->GetLatestSequenceNumber()); + SequenceNumber seq1 = expected_seq; + ASSERT_OK(db->Put(WriteOptions(), "key2", "value2")); + DBImpl* db_impl = static_cast_with_check(db->GetRootDB()); + expected_seq++; // one for data + if (options.two_write_queues) { + expected_seq++; // one for commit + } + ASSERT_EQ(expected_seq, db_impl->TEST_GetLastVisibleSequence()); + ASSERT_OK(db->Flush(FlushOptions())); + // Dummy keys to avoid compaction trivially move files and get around actual + // compaction logic. + ASSERT_OK(db->Put(WriteOptions(), "a", "dummy")); + ASSERT_OK(db->Put(WriteOptions(), "z", "dummy")); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + VerifyKeys({ + {"key1", "NOT_FOUND"}, + {"key2", "value2"}, + }); + VerifyInternalKeys({ + // "key1" has not been committed. It keeps its sequence number. + {"key1", "value1", seq1, kTypeValue}, + // "key2" is committed and output with seq = 0. + {"key2", "value2", 0, kTypeValue}, + }); + ASSERT_OK(transaction->Commit()); + VerifyKeys({ + {"key1", "value1"}, + {"key2", "value2"}, + }); + delete transaction; +} + +TEST_P(WritePreparedTransactionTest, CommitAndSnapshotDuringCompaction) { + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + + const Snapshot* snapshot = nullptr; + ASSERT_OK(db->Put(WriteOptions(), "key1", "value1")); + auto* txn = db->BeginTransaction(WriteOptions()); + ASSERT_OK(txn->SetName("txn")); + ASSERT_OK(txn->Put("key1", "value2")); + ASSERT_OK(txn->Prepare()); + + auto callback = [&](void*) { + // Snapshot is taken after compaction start. It should be taken into + // consideration for whether to compact out value1. 
+ snapshot = db->GetSnapshot(); + ASSERT_OK(txn->Commit()); + delete txn; + }; + SyncPoint::GetInstance()->SetCallBack("CompactionIterator:AfterInit", + callback); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(db->Flush(FlushOptions())); + ASSERT_NE(nullptr, snapshot); + VerifyKeys({{"key1", "value2"}}); + VerifyKeys({{"key1", "value1"}}, snapshot); + db->ReleaseSnapshot(snapshot); +} + +TEST_P(WritePreparedTransactionTest, Iterate) { + auto verify_state = [](Iterator* iter, const std::string& key, + const std::string& value) { + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(key, iter->key().ToString()); + ASSERT_EQ(value, iter->value().ToString()); + }; + + auto verify_iter = [&](const std::string& expected_val) { + // Get iterator from a concurrent transaction and make sure it has the + // same view as an iterator from the DB. + auto* txn = db->BeginTransaction(WriteOptions()); + + for (int i = 0; i < 2; i++) { + Iterator* iter = (i == 0) ? db->NewIterator(ReadOptions()) + : txn->GetIterator(ReadOptions()); + // Seek + iter->Seek("foo"); + verify_state(iter, "foo", expected_val); + // Next + iter->Seek("a"); + verify_state(iter, "a", "va"); + iter->Next(); + verify_state(iter, "foo", expected_val); + // SeekForPrev + iter->SeekForPrev("y"); + verify_state(iter, "foo", expected_val); + // Prev + iter->SeekForPrev("z"); + verify_state(iter, "z", "vz"); + iter->Prev(); + verify_state(iter, "foo", expected_val); + delete iter; + } + delete txn; + }; + + ASSERT_OK(db->Put(WriteOptions(), "foo", "v1")); + auto* transaction = db->BeginTransaction(WriteOptions()); + ASSERT_OK(transaction->SetName("txn")); + ASSERT_OK(transaction->Put("foo", "v2")); + ASSERT_OK(transaction->Prepare()); + VerifyKeys({{"foo", "v1"}}); + // dummy keys + ASSERT_OK(db->Put(WriteOptions(), "a", "va")); + ASSERT_OK(db->Put(WriteOptions(), "z", "vz")); + verify_iter("v1"); + ASSERT_OK(transaction->Commit()); + VerifyKeys({{"foo", "v2"}}); + verify_iter("v2"); + delete transaction; +} + +TEST_P(WritePreparedTransactionTest, IteratorRefreshNotSupported) { + Iterator* iter = db->NewIterator(ReadOptions()); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Refresh().IsNotSupported()); + delete iter; +} + +// Committing an delayed prepared has two non-atomic steps: update commit cache, +// remove seq from delayed_prepared_. The read in IsInSnapshot also involves two +// non-atomic steps of checking these two data structures. This test breaks each +// in the middle to ensure correctness in spite of non-atomic execution. +// Note: This test is limitted to the case where snapshot is larger than the +// max_evicted_seq_. 
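// For intuition, the interleaving this test targets (simplified; the member
// names are the ones referenced elsewhere in this file, but the sketch is not
// the production code):
//
//   committer (commit of a delayed prepared)   reader (IsInSnapshot)
//   ----------------------------------------   ------------------------------
//   1) add (prepare_seq -> commit_seq)         a) look up the commit cache
//      to the commit cache                     b) check delayed_prepared_
//   2) erase prepare_seq from                     under prepared_mutex_
//      delayed_prepared_
//
// The sync-point dependencies below pause one side between its two steps while
// the other side completes, and the read thread then asserts that the snapshot
// still observes the pre-commit value rather than a half-applied commit.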
+TEST_P(WritePreparedTransactionTest, NonAtomicCommitOfDelayedPrepared) { + const size_t snapshot_cache_bits = 7; // same as default + const size_t commit_cache_bits = 3; // 8 entries + for (auto split_read : {true, false}) { + std::vector split_options = {false}; + if (split_read) { + // Also test for break before mutex + split_options.push_back(true); + } + for (auto split_before_mutex : split_options) { + UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits); + ASSERT_OK(ReOpen()); + WritePreparedTxnDB* wp_db = dynamic_cast(db); + DBImpl* db_impl = static_cast_with_check(db->GetRootDB()); + // Fill up the commit cache + std::string init_value("value1"); + for (int i = 0; i < 10; i++) { + ASSERT_OK(db->Put(WriteOptions(), Slice("key1"), Slice(init_value))); + } + // Prepare a transaction but do not commit it + Transaction* txn = + db->BeginTransaction(WriteOptions(), TransactionOptions()); + ASSERT_OK(txn->SetName("xid")); + ASSERT_OK(txn->Put(Slice("key1"), Slice("value2"))); + ASSERT_OK(txn->Prepare()); + // Commit a bunch of entries to advance max evicted seq and make the + // prepared a delayed prepared + for (int i = 0; i < 10; i++) { + ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3"))); + } + // The snapshot should not see the delayed prepared entry + auto snap = db->GetSnapshot(); + + if (split_read) { + if (split_before_mutex) { + // split before acquiring prepare_mutex_ + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"WritePreparedTxnDB::IsInSnapshot:prepared_mutex_:pause", + "AtomicCommitOfDelayedPrepared:Commit:before"}, + {"AtomicCommitOfDelayedPrepared:Commit:after", + "WritePreparedTxnDB::IsInSnapshot:prepared_mutex_:resume"}}); + } else { + // split right after reading from the commit cache + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"WritePreparedTxnDB::IsInSnapshot:GetCommitEntry:pause", + "AtomicCommitOfDelayedPrepared:Commit:before"}, + {"AtomicCommitOfDelayedPrepared:Commit:after", + "WritePreparedTxnDB::IsInSnapshot:GetCommitEntry:resume"}}); + } + } else { // split commit + // split right before removing from delayed_prepared_ + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"WritePreparedTxnDB::RemovePrepared:pause", + "AtomicCommitOfDelayedPrepared:Read:before"}, + {"AtomicCommitOfDelayedPrepared:Read:after", + "WritePreparedTxnDB::RemovePrepared:resume"}}); + } + SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::port::Thread commit_thread([&]() { + TEST_SYNC_POINT("AtomicCommitOfDelayedPrepared:Commit:before"); + ASSERT_OK(txn->Commit()); + if (split_before_mutex) { + // Do bunch of inserts to evict the commit entry from the cache. This + // would prevent the 2nd look into commit cache under prepare_mutex_ + // to see the commit entry. 
+ auto seq = db_impl->TEST_GetLastVisibleSequence(); + size_t tries = 0; + while (wp_db->max_evicted_seq_ < seq && tries < 50) { + ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3"))); + tries++; + }; + ASSERT_LT(tries, 50); + } + TEST_SYNC_POINT("AtomicCommitOfDelayedPrepared:Commit:after"); + delete txn; + }); + + ROCKSDB_NAMESPACE::port::Thread read_thread([&]() { + TEST_SYNC_POINT("AtomicCommitOfDelayedPrepared:Read:before"); + ReadOptions roptions; + roptions.snapshot = snap; + PinnableSlice value; + auto s = db->Get(roptions, db->DefaultColumnFamily(), "key1", &value); + ASSERT_OK(s); + // It should not see the commit of delayed prepared + ASSERT_TRUE(value == init_value); + TEST_SYNC_POINT("AtomicCommitOfDelayedPrepared:Read:after"); + db->ReleaseSnapshot(snap); + }); + + read_thread.join(); + commit_thread.join(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + } // for split_before_mutex + } // for split_read +} + +// When max evicted seq advances a prepared seq, it involves two updates: i) +// adding prepared seq to delayed_prepared_, ii) updating max_evicted_seq_. +// ::IsInSnapshot also reads these two values in a non-atomic way. This test +// ensures correctness if the update occurs after ::IsInSnapshot reads +// delayed_prepared_empty_ and before it reads max_evicted_seq_. +// Note: this test focuses on read snapshot larger than max_evicted_seq_. +TEST_P(WritePreparedTransactionTest, NonAtomicUpdateOfDelayedPrepared) { + const size_t snapshot_cache_bits = 7; // same as default + const size_t commit_cache_bits = 3; // 8 entries + UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits); + ASSERT_OK(ReOpen()); + WritePreparedTxnDB* wp_db = dynamic_cast(db); + // Fill up the commit cache + std::string init_value("value1"); + for (int i = 0; i < 10; i++) { + ASSERT_OK(db->Put(WriteOptions(), Slice("key1"), Slice(init_value))); + } + // Prepare a transaction but do not commit it + Transaction* txn = db->BeginTransaction(WriteOptions(), TransactionOptions()); + ASSERT_OK(txn->SetName("xid")); + ASSERT_OK(txn->Put(Slice("key1"), Slice("value2"))); + ASSERT_OK(txn->Prepare()); + // Create a gap between prepare seq and snapshot seq + ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3"))); + ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3"))); + // The snapshot should not see the delayed prepared entry + auto snap = db->GetSnapshot(); + ASSERT_LT(txn->GetId(), snap->GetSequenceNumber()); + + // split right after reading delayed_prepared_empty_ + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"WritePreparedTxnDB::IsInSnapshot:delayed_prepared_empty_:pause", + "AtomicUpdateOfDelayedPrepared:before"}, + {"AtomicUpdateOfDelayedPrepared:after", + "WritePreparedTxnDB::IsInSnapshot:delayed_prepared_empty_:resume"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::port::Thread commit_thread([&]() { + TEST_SYNC_POINT("AtomicUpdateOfDelayedPrepared:before"); + // Commit a bunch of entries to advance max evicted seq and make the + // prepared a delayed prepared + size_t tries = 0; + while (wp_db->max_evicted_seq_ < txn->GetId() && tries < 50) { + ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3"))); + tries++; + }; + ASSERT_LT(tries, 50); + // This is the case on which the test focuses + ASSERT_LT(wp_db->max_evicted_seq_, snap->GetSequenceNumber()); + 
TEST_SYNC_POINT("AtomicUpdateOfDelayedPrepared:after"); + }); + + ROCKSDB_NAMESPACE::port::Thread read_thread([&]() { + ReadOptions roptions; + roptions.snapshot = snap; + PinnableSlice value; + auto s = db->Get(roptions, db->DefaultColumnFamily(), "key1", &value); + ASSERT_OK(s); + // It should not see the uncommitted value of delayed prepared + ASSERT_TRUE(value == init_value); + db->ReleaseSnapshot(snap); + }); + + read_thread.join(); + commit_thread.join(); + ASSERT_OK(txn->Commit()); + delete txn; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +// Eviction from commit cache and update of max evicted seq are two non-atomic +// steps. Similarly the read of max_evicted_seq_ in ::IsInSnapshot and reading +// from commit cache are two non-atomic steps. This tests if the update occurs +// after reading max_evicted_seq_ and before reading the commit cache. +// Note: the test focuses on snapshot larger than max_evicted_seq_ +TEST_P(WritePreparedTransactionTest, NonAtomicUpdateOfMaxEvictedSeq) { + const size_t snapshot_cache_bits = 7; // same as default + const size_t commit_cache_bits = 3; // 8 entries + UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits); + ASSERT_OK(ReOpen()); + WritePreparedTxnDB* wp_db = dynamic_cast(db); + // Fill up the commit cache + std::string init_value("value1"); + std::string last_value("value_final"); + for (int i = 0; i < 10; i++) { + ASSERT_OK(db->Put(WriteOptions(), Slice("key1"), Slice(init_value))); + } + // Do an uncommitted write to prevent min_uncommitted optimization + Transaction* txn1 = + db->BeginTransaction(WriteOptions(), TransactionOptions()); + ASSERT_OK(txn1->SetName("xid1")); + ASSERT_OK(txn1->Put(Slice("key0"), last_value)); + ASSERT_OK(txn1->Prepare()); + // Do a write with prepare to get the prepare seq + Transaction* txn = db->BeginTransaction(WriteOptions(), TransactionOptions()); + ASSERT_OK(txn->SetName("xid")); + ASSERT_OK(txn->Put(Slice("key1"), last_value)); + ASSERT_OK(txn->Prepare()); + ASSERT_OK(txn->Commit()); + // Create a gap between commit entry and snapshot seq + ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3"))); + ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3"))); + // The snapshot should see the last commit + auto snap = db->GetSnapshot(); + ASSERT_LE(txn->GetId(), snap->GetSequenceNumber()); + + // split right after reading max_evicted_seq_ + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"WritePreparedTxnDB::IsInSnapshot:max_evicted_seq_:pause", + "NonAtomicUpdateOfMaxEvictedSeq:before"}, + {"NonAtomicUpdateOfMaxEvictedSeq:after", + "WritePreparedTxnDB::IsInSnapshot:max_evicted_seq_:resume"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::port::Thread commit_thread([&]() { + TEST_SYNC_POINT("NonAtomicUpdateOfMaxEvictedSeq:before"); + // Commit a bunch of entries to advance max evicted seq beyond txn->GetId() + size_t tries = 0; + while (wp_db->max_evicted_seq_ < txn->GetId() && tries < 50) { + ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3"))); + tries++; + }; + ASSERT_LT(tries, 50); + // This is the case on which the test focuses + ASSERT_LT(wp_db->max_evicted_seq_, snap->GetSequenceNumber()); + TEST_SYNC_POINT("NonAtomicUpdateOfMaxEvictedSeq:after"); + }); + + ROCKSDB_NAMESPACE::port::Thread read_thread([&]() { + ReadOptions roptions; + roptions.snapshot = snap; + PinnableSlice value; + auto s = db->Get(roptions, 
db->DefaultColumnFamily(), "key1", &value); + ASSERT_OK(s); + // It should see the committed value of the evicted entry + ASSERT_TRUE(value == last_value); + db->ReleaseSnapshot(snap); + }); + + read_thread.join(); + commit_thread.join(); + delete txn; + ASSERT_OK(txn1->Commit()); + delete txn1; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +// Test when we add a prepared seq when the max_evicted_seq_ already goes beyond +// that. The test focuses on a race condition between AddPrepared and +// AdvanceMaxEvictedSeq functions. +TEST_P(WritePreparedTransactionTest, AddPreparedBeforeMax) { + if (!options.two_write_queues) { + // This test is only for two write queues + return; + } + const size_t snapshot_cache_bits = 7; // same as default + // 1 entry to advance max after the 2nd commit + const size_t commit_cache_bits = 0; + UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits); + ASSERT_OK(ReOpen()); + WritePreparedTxnDB* wp_db = dynamic_cast(db); + std::string some_value("value_some"); + std::string uncommitted_value("value_uncommitted"); + // Prepare two uncommitted transactions + Transaction* txn1 = + db->BeginTransaction(WriteOptions(), TransactionOptions()); + ASSERT_OK(txn1->SetName("xid1")); + ASSERT_OK(txn1->Put(Slice("key1"), some_value)); + ASSERT_OK(txn1->Prepare()); + Transaction* txn2 = + db->BeginTransaction(WriteOptions(), TransactionOptions()); + ASSERT_OK(txn2->SetName("xid2")); + ASSERT_OK(txn2->Put(Slice("key2"), some_value)); + ASSERT_OK(txn2->Prepare()); + // Start the txn here so the other thread could get its id + Transaction* txn = db->BeginTransaction(WriteOptions(), TransactionOptions()); + ASSERT_OK(txn->SetName("xid")); + ASSERT_OK(txn->Put(Slice("key0"), uncommitted_value)); + port::Mutex txn_mutex_; + + // t1) Insert prepared entry, t2) commit other entries to advance max + // evicted sec and finish checking the existing prepared entries, t1) + // AddPrepared, t2) update max_evicted_seq_ + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"AddPreparedCallback::AddPrepared::begin:pause", + "AddPreparedBeforeMax::read_thread:start"}, + {"AdvanceMaxEvictedSeq::update_max:pause", + "AddPreparedCallback::AddPrepared::begin:resume"}, + {"AddPreparedCallback::AddPrepared::end", + "AdvanceMaxEvictedSeq::update_max:resume"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::port::Thread write_thread([&]() { + txn_mutex_.Lock(); + ASSERT_OK(txn->Prepare()); + txn_mutex_.Unlock(); + }); + + ROCKSDB_NAMESPACE::port::Thread read_thread([&]() { + TEST_SYNC_POINT("AddPreparedBeforeMax::read_thread:start"); + // Publish seq number with a commit + ASSERT_OK(txn1->Commit()); + // Since the commit cache size is one the 2nd commit evict the 1st one and + // invokes AdcanceMaxEvictedSeq + ASSERT_OK(txn2->Commit()); + + ReadOptions roptions; + PinnableSlice value; + // The snapshot should not see the uncommitted value from write_thread + auto snap = db->GetSnapshot(); + ASSERT_LT(wp_db->max_evicted_seq_, snap->GetSequenceNumber()); + // This is the scenario that we test for + txn_mutex_.Lock(); + ASSERT_GT(wp_db->max_evicted_seq_, txn->GetId()); + txn_mutex_.Unlock(); + roptions.snapshot = snap; + auto s = db->Get(roptions, db->DefaultColumnFamily(), "key0", &value); + ASSERT_TRUE(s.IsNotFound()); + db->ReleaseSnapshot(snap); + }); + + read_thread.join(); + write_thread.join(); + delete txn1; + delete txn2; + ASSERT_OK(txn->Commit()); 
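+  // txn was prepared while max_evicted_seq_ advanced past its seq, so its
+  // entry should have been kept tracked as a delayed prepared (see
+  // CheckPreparedAgainstMax); the commit above is therefore still expected to
+  // succeed and to clean that entry up.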
+ delete txn; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +// When an old prepared entry gets committed, there is a gap between the time +// that it is published and when it is cleaned up from old_prepared_. This test +// stresses such cases. +TEST_P(WritePreparedTransactionTest, CommitOfDelayedPrepared) { + const size_t snapshot_cache_bits = 7; // same as default + for (const size_t commit_cache_bits : {0, 2, 3}) { + for (const size_t sub_batch_cnt : {1, 2, 3}) { + UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits); + ASSERT_OK(ReOpen()); + std::atomic snap = {nullptr}; + std::atomic exp_prepare = {0}; + ROCKSDB_NAMESPACE::port::Thread callback_thread; + // Value is synchronized via snap + PinnableSlice value; + // Take a snapshot after publish and before RemovePrepared:Start + auto snap_callback = [&]() { + ASSERT_EQ(nullptr, snap.load()); + snap.store(db->GetSnapshot()); + ReadOptions roptions; + roptions.snapshot = snap.load(); + auto s = db->Get(roptions, db->DefaultColumnFamily(), "key2", &value); + ASSERT_OK(s); + }; + auto callback = [&](void* param) { + SequenceNumber prep_seq = *((SequenceNumber*)param); + if (prep_seq == exp_prepare.load()) { // only for write_thread + // We need to spawn a thread to avoid deadlock since getting a + // snpashot might end up calling AdvanceSeqByOne which needs joining + // the write queue. + callback_thread = ROCKSDB_NAMESPACE::port::Thread(snap_callback); + TEST_SYNC_POINT("callback:end"); + } + }; + // Wait for the first snapshot be taken in GetSnapshotInternal. Although + // it might be updated before GetSnapshotInternal finishes but this should + // cover most of the cases. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"WritePreparedTxnDB::GetSnapshotInternal:first", "callback:end"}, + }); + SyncPoint::GetInstance()->SetCallBack("RemovePrepared:Start", callback); + SyncPoint::GetInstance()->EnableProcessing(); + // Thread to cause frequent evictions + ROCKSDB_NAMESPACE::port::Thread eviction_thread([&]() { + // Too many txns might cause commit_seq - prepare_seq in another thread + // to go beyond DELTA_UPPERBOUND + for (int i = 0; i < 25 * (1 << commit_cache_bits); i++) { + ASSERT_OK(db->Put(WriteOptions(), Slice("key1"), Slice("value1"))); + } + }); + ROCKSDB_NAMESPACE::port::Thread write_thread([&]() { + for (int i = 0; i < 25 * (1 << commit_cache_bits); i++) { + Transaction* txn = + db->BeginTransaction(WriteOptions(), TransactionOptions()); + ASSERT_OK(txn->SetName("xid")); + std::string val_str = "value" + std::to_string(i); + for (size_t b = 0; b < sub_batch_cnt; b++) { + ASSERT_OK(txn->Put(Slice("key2"), val_str)); + } + ASSERT_OK(txn->Prepare()); + // Let an eviction to kick in + std::this_thread::yield(); + + exp_prepare.store(txn->GetId()); + ASSERT_OK(txn->Commit()); + delete txn; + // Wait for the snapshot taking that is triggered by + // RemovePrepared:Start callback + callback_thread.join(); + + // Read with the snapshot taken before delayed_prepared_ cleanup + ReadOptions roptions; + roptions.snapshot = snap.load(); + ASSERT_NE(nullptr, roptions.snapshot); + PinnableSlice value2; + auto s = + db->Get(roptions, db->DefaultColumnFamily(), "key2", &value2); + ASSERT_OK(s); + // It should see its own write + ASSERT_TRUE(val_str == value2); + // The value read by snapshot should not change + ASSERT_STREQ(value2.ToString().c_str(), value.ToString().c_str()); + + db->ReleaseSnapshot(roptions.snapshot); 
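+        // Reset snap so that the RemovePrepared:Start callback of the next
+        // iteration can take a fresh snapshot; snap_callback asserts that
+        // snap is nullptr before storing the new one.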
+ snap.store(nullptr); + } + }); + write_thread.join(); + eviction_thread.join(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + } + } +} + +// Test that updating the commit map will not affect the existing snapshots +TEST_P(WritePreparedTransactionTest, AtomicCommit) { + for (bool skip_prepare : {true, false}) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"WritePreparedTxnDB::AddCommitted:start", + "AtomicCommit::GetSnapshot:start"}, + {"AtomicCommit::Get:end", + "WritePreparedTxnDB::AddCommitted:start:pause"}, + {"WritePreparedTxnDB::AddCommitted:end", "AtomicCommit::Get2:start"}, + {"AtomicCommit::Get2:end", + "WritePreparedTxnDB::AddCommitted:end:pause:"}, + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ROCKSDB_NAMESPACE::port::Thread write_thread([&]() { + if (skip_prepare) { + ASSERT_OK(db->Put(WriteOptions(), Slice("key"), Slice("value"))); + } else { + Transaction* txn = + db->BeginTransaction(WriteOptions(), TransactionOptions()); + ASSERT_OK(txn->SetName("xid")); + ASSERT_OK(txn->Put(Slice("key"), Slice("value"))); + ASSERT_OK(txn->Prepare()); + ASSERT_OK(txn->Commit()); + delete txn; + } + }); + ROCKSDB_NAMESPACE::port::Thread read_thread([&]() { + ReadOptions roptions; + TEST_SYNC_POINT("AtomicCommit::GetSnapshot:start"); + roptions.snapshot = db->GetSnapshot(); + PinnableSlice val; + auto s = db->Get(roptions, db->DefaultColumnFamily(), "key", &val); + TEST_SYNC_POINT("AtomicCommit::Get:end"); + TEST_SYNC_POINT("AtomicCommit::Get2:start"); + ASSERT_SAME(roptions, db, s, val, "key"); + TEST_SYNC_POINT("AtomicCommit::Get2:end"); + db->ReleaseSnapshot(roptions.snapshot); + }); + read_thread.join(); + write_thread.join(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } +} + +TEST_P(WritePreparedTransactionTest, BasicRollbackDeletionTypeCb) { + options.level0_file_num_compaction_trigger = 2; + // Always use SingleDelete to rollback Put. 
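+  // Returning true unconditionally means a rolled-back Put is cancelled with
+  // a SingleDelete rather than a Delete (the rollback path consults this via
+  // ShouldRollbackWithSingleDelete), which is what this test exercises.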
+ txn_db_options.rollback_deletion_type_callback = + [](TransactionDB*, ColumnFamilyHandle*, const Slice&) { return true; }; + + const auto write_to_db = [&]() { + assert(db); + std::unique_ptr txn0( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + ASSERT_OK(txn0->SetName("txn0")); + ASSERT_OK(txn0->Put("a", "v0")); + ASSERT_OK(txn0->Prepare()); + + // Generate sst1: [PUT('a')] + ASSERT_OK(db->Flush(FlushOptions())); + + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = options.num_levels - 1; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr)); + } + + ASSERT_OK(txn0->Rollback()); + txn0.reset(); + + ASSERT_OK(db->Put(WriteOptions(), "a", "v1")); + + ASSERT_OK(db->SingleDelete(WriteOptions(), "a")); + // Generate another SST with a SD to cover the oldest PUT('a') + ASSERT_OK(db->Flush(FlushOptions())); + + auto* dbimpl = static_cast_with_check(db->GetRootDB()); + assert(dbimpl); + ASSERT_OK(dbimpl->TEST_WaitForCompact()); + + { + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr)); + } + + { + std::string value; + const Status s = db->Get(ReadOptions(), "a", &value); + ASSERT_TRUE(s.IsNotFound()); + } + }; + + // Destroy and reopen + ASSERT_OK(ReOpen()); + write_to_db(); +} + +// Test that we can change write policy from WriteCommitted to WritePrepared +// after a clean shutdown (which would empty the WAL) +TEST_P(WritePreparedTransactionTest, WP_WC_DBBackwardCompatibility) { + bool empty_wal = true; + CrossCompatibilityTest(WRITE_COMMITTED, WRITE_PREPARED, empty_wal); +} + +// Test that we fail fast if WAL is not emptied between changing the write +// policy from WriteCommitted to WritePrepared +TEST_P(WritePreparedTransactionTest, WP_WC_WALBackwardIncompatibility) { + bool empty_wal = true; + CrossCompatibilityTest(WRITE_COMMITTED, WRITE_PREPARED, !empty_wal); +} + +// Test that we can change write policy from WritePrepare back to WriteCommitted +// after a clean shutdown (which would empty the WAL) +TEST_P(WritePreparedTransactionTest, WC_WP_ForwardCompatibility) { + bool empty_wal = true; + CrossCompatibilityTest(WRITE_PREPARED, WRITE_COMMITTED, empty_wal); +} + +// Test that we fail fast if WAL is not emptied between changing the write +// policy from WriteCommitted to WritePrepared +TEST_P(WritePreparedTransactionTest, WC_WP_WALForwardIncompatibility) { + bool empty_wal = true; + CrossCompatibilityTest(WRITE_PREPARED, WRITE_COMMITTED, !empty_wal); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + if (getenv("CIRCLECI")) { + // Looking for backtrace on "Resource temporarily unavailable" exceptions + ::testing::FLAGS_gtest_catch_exceptions = false; + } + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "SKIPPED as Transactions are not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/write_prepared_txn.cc b/src/rocksdb/utilities/transactions/write_prepared_txn.cc new file mode 100644 index 000000000..16b5cc1cb --- /dev/null +++ b/src/rocksdb/utilities/transactions/write_prepared_txn.cc @@ -0,0 +1,512 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "utilities/transactions/write_prepared_txn.h" + +#include +#include +#include + +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" +#include "rocksdb/db.h" +#include "rocksdb/status.h" +#include "rocksdb/utilities/transaction_db.h" +#include "util/cast_util.h" +#include "utilities/transactions/pessimistic_transaction.h" +#include "utilities/transactions/write_prepared_txn_db.h" + +namespace ROCKSDB_NAMESPACE { + +struct WriteOptions; + +WritePreparedTxn::WritePreparedTxn(WritePreparedTxnDB* txn_db, + const WriteOptions& write_options, + const TransactionOptions& txn_options) + : PessimisticTransaction(txn_db, write_options, txn_options, false), + wpt_db_(txn_db) { + // Call Initialize outside PessimisticTransaction constructor otherwise it + // would skip overridden functions in WritePreparedTxn since they are not + // defined yet in the constructor of PessimisticTransaction + Initialize(txn_options); +} + +void WritePreparedTxn::Initialize(const TransactionOptions& txn_options) { + PessimisticTransaction::Initialize(txn_options); + prepare_batch_cnt_ = 0; +} + +void WritePreparedTxn::MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool sorted_input) { + SequenceNumber min_uncommitted, snap_seq; + const SnapshotBackup backed_by_snapshot = + wpt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); + WritePreparedTxnReadCallback callback(wpt_db_, snap_seq, min_uncommitted, + backed_by_snapshot); + write_batch_.MultiGetFromBatchAndDB(db_, options, column_family, num_keys, + keys, values, statuses, sorted_input, + &callback); + if (UNLIKELY(!callback.valid() || + !wpt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) { + wpt_db_->WPRecordTick(TXN_GET_TRY_AGAIN); + for (size_t i = 0; i < num_keys; i++) { + statuses[i] = Status::TryAgain(); + } + } +} + +Status WritePreparedTxn::Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* pinnable_val) { + SequenceNumber min_uncommitted, snap_seq; + const SnapshotBackup backed_by_snapshot = + wpt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); + WritePreparedTxnReadCallback callback(wpt_db_, snap_seq, min_uncommitted, + backed_by_snapshot); + Status res = write_batch_.GetFromBatchAndDB(db_, options, column_family, key, + pinnable_val, &callback); + const bool callback_valid = + callback.valid(); // NOTE: validity of callback must always be checked + // before it is destructed + if (res.ok()) { + if (!LIKELY(callback_valid && + wpt_db_->ValidateSnapshot(callback.max_visible_seq(), + backed_by_snapshot))) { + wpt_db_->WPRecordTick(TXN_GET_TRY_AGAIN); + res = Status::TryAgain(); + } + } + + return res; +} + +Iterator* WritePreparedTxn::GetIterator(const ReadOptions& options) { + // Make sure to get iterator from WritePrepareTxnDB, not the root db. + Iterator* db_iter = wpt_db_->NewIterator(options); + assert(db_iter); + + return write_batch_.NewIteratorWithBase(db_iter); +} + +Iterator* WritePreparedTxn::GetIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family) { + // Make sure to get iterator from WritePrepareTxnDB, not the root db. 
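+  // The base iterator from WritePreparedTxnDB installs a
+  // WritePreparedTxnReadCallback so that only data committed as of the
+  // snapshot is visible; NewIteratorWithBase then overlays this transaction's
+  // own pending writes from write_batch_.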
+ Iterator* db_iter = wpt_db_->NewIterator(options, column_family); + assert(db_iter); + + return write_batch_.NewIteratorWithBase(column_family, db_iter); +} + +Status WritePreparedTxn::PrepareInternal() { + WriteOptions write_options = write_options_; + write_options.disableWAL = false; + const bool WRITE_AFTER_COMMIT = true; + const bool kFirstPrepareBatch = true; + auto s = WriteBatchInternal::MarkEndPrepare(GetWriteBatch()->GetWriteBatch(), + name_, !WRITE_AFTER_COMMIT); + assert(s.ok()); + // For each duplicate key we account for a new sub-batch + prepare_batch_cnt_ = GetWriteBatch()->SubBatchCnt(); + // Having AddPrepared in the PreReleaseCallback allows in-order addition of + // prepared entries to PreparedHeap and hence enables an optimization. Refer + // to SmallestUnCommittedSeq for more details. + AddPreparedCallback add_prepared_callback( + wpt_db_, db_impl_, prepare_batch_cnt_, + db_impl_->immutable_db_options().two_write_queues, kFirstPrepareBatch); + const bool DISABLE_MEMTABLE = true; + uint64_t seq_used = kMaxSequenceNumber; + s = db_impl_->WriteImpl(write_options, GetWriteBatch()->GetWriteBatch(), + /*callback*/ nullptr, &log_number_, /*log ref*/ 0, + !DISABLE_MEMTABLE, &seq_used, prepare_batch_cnt_, + &add_prepared_callback); + assert(!s.ok() || seq_used != kMaxSequenceNumber); + auto prepare_seq = seq_used; + SetId(prepare_seq); + return s; +} + +Status WritePreparedTxn::CommitWithoutPrepareInternal() { + // For each duplicate key we account for a new sub-batch + const size_t batch_cnt = GetWriteBatch()->SubBatchCnt(); + return CommitBatchInternal(GetWriteBatch()->GetWriteBatch(), batch_cnt); +} + +Status WritePreparedTxn::CommitBatchInternal(WriteBatch* batch, + size_t batch_cnt) { + return wpt_db_->WriteInternal(write_options_, batch, batch_cnt, this); +} + +Status WritePreparedTxn::CommitInternal() { + ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log, + "CommitInternal prepare_seq: %" PRIu64, GetID()); + // We take the commit-time batch and append the Commit marker. + // The Memtable will ignore the Commit marker in non-recovery mode + WriteBatch* working_batch = GetCommitTimeWriteBatch(); + const bool empty = working_batch->Count() == 0; + auto s = WriteBatchInternal::MarkCommit(working_batch, name_); + assert(s.ok()); + + const bool for_recovery = use_only_the_last_commit_time_batch_for_recovery_; + if (!empty) { + // When not writing to memtable, we can still cache the latest write batch. 
+ // The cached batch will be written to memtable in WriteRecoverableState + // during FlushMemTable + if (for_recovery) { + WriteBatchInternal::SetAsLatestPersistentState(working_batch); + } else { + return Status::InvalidArgument( + "Commit-time-batch can only be used if " + "use_only_the_last_commit_time_batch_for_recovery is true"); + } + } + + auto prepare_seq = GetId(); + const bool includes_data = !empty && !for_recovery; + assert(prepare_batch_cnt_); + size_t commit_batch_cnt = 0; + if (UNLIKELY(includes_data)) { + ROCKS_LOG_WARN(db_impl_->immutable_db_options().info_log, + "Duplicate key overhead"); + SubBatchCounter counter(*wpt_db_->GetCFComparatorMap()); + s = working_batch->Iterate(&counter); + assert(s.ok()); + commit_batch_cnt = counter.BatchCount(); + } + const bool disable_memtable = !includes_data; + const bool do_one_write = + !db_impl_->immutable_db_options().two_write_queues || disable_memtable; + WritePreparedCommitEntryPreReleaseCallback update_commit_map( + wpt_db_, db_impl_, prepare_seq, prepare_batch_cnt_, commit_batch_cnt); + // This is to call AddPrepared on CommitTimeWriteBatch + const bool kFirstPrepareBatch = true; + AddPreparedCallback add_prepared_callback( + wpt_db_, db_impl_, commit_batch_cnt, + db_impl_->immutable_db_options().two_write_queues, !kFirstPrepareBatch); + PreReleaseCallback* pre_release_callback; + if (do_one_write) { + pre_release_callback = &update_commit_map; + } else { + pre_release_callback = &add_prepared_callback; + } + uint64_t seq_used = kMaxSequenceNumber; + // Since the prepared batch is directly written to memtable, there is already + // a connection between the memtable and its WAL, so there is no need to + // redundantly reference the log that contains the prepared data. + const uint64_t zero_log_number = 0ull; + size_t batch_cnt = UNLIKELY(commit_batch_cnt) ? commit_batch_cnt : 1; + // If `two_write_queues && includes_data`, then `do_one_write` is false. The + // following `WriteImpl` will insert the data of the commit-time-batch into + // the database before updating the commit cache. Therefore, the data of the + // commmit-time-batch is considered uncommitted. Furthermore, since data of + // the commit-time-batch are not locked, it is possible for two uncommitted + // versions of the same key to co-exist for a (short) period of time until + // the commit cache is updated by the second write. If the two uncommitted + // keys are compacted to the bottommost level in the meantime, it is possible + // that compaction iterator will zero out the sequence numbers of both, thus + // violating the invariant that an SST does not have two identical internal + // keys. To prevent this situation, we should allow the usage of + // commit-time-batch only if the user sets + // TransactionOptions::use_only_the_last_commit_time_batch_for_recovery to + // true. See the comments about GetCommitTimeWriteBatch() in + // include/rocksdb/utilities/transaction.h. + s = db_impl_->WriteImpl(write_options_, working_batch, nullptr, nullptr, + zero_log_number, disable_memtable, &seq_used, + batch_cnt, pre_release_callback); + assert(!s.ok() || seq_used != kMaxSequenceNumber); + const SequenceNumber commit_batch_seq = seq_used; + if (LIKELY(do_one_write || !s.ok())) { + if (UNLIKELY(!db_impl_->immutable_db_options().two_write_queues && + s.ok())) { + // Note: RemovePrepared should be called after WriteImpl that publishsed + // the seq. Otherwise SmallestUnCommittedSeq optimization breaks. 
+ wpt_db_->RemovePrepared(prepare_seq, prepare_batch_cnt_); + } // else RemovePrepared is called from within PreReleaseCallback + if (UNLIKELY(!do_one_write)) { + assert(!s.ok()); + // Cleanup the prepared entry we added with add_prepared_callback + wpt_db_->RemovePrepared(commit_batch_seq, commit_batch_cnt); + } + return s; + } // else do the 2nd write to publish seq + // Note: the 2nd write comes with a performance penality. So if we have too + // many of commits accompanied with ComitTimeWriteBatch and yet we cannot + // enable use_only_the_last_commit_time_batch_for_recovery_ optimization, + // two_write_queues should be disabled to avoid many additional writes here. + const size_t kZeroData = 0; + // Update commit map only from the 2nd queue + WritePreparedCommitEntryPreReleaseCallback update_commit_map_with_aux_batch( + wpt_db_, db_impl_, prepare_seq, prepare_batch_cnt_, kZeroData, + commit_batch_seq, commit_batch_cnt); + WriteBatch empty_batch; + s = empty_batch.PutLogData(Slice()); + assert(s.ok()); + // In the absence of Prepare markers, use Noop as a batch separator + s = WriteBatchInternal::InsertNoop(&empty_batch); + assert(s.ok()); + const bool DISABLE_MEMTABLE = true; + const size_t ONE_BATCH = 1; + const uint64_t NO_REF_LOG = 0; + s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr, + NO_REF_LOG, DISABLE_MEMTABLE, &seq_used, ONE_BATCH, + &update_commit_map_with_aux_batch); + assert(!s.ok() || seq_used != kMaxSequenceNumber); + return s; +} + +Status WritePreparedTxn::RollbackInternal() { + ROCKS_LOG_WARN(db_impl_->immutable_db_options().info_log, + "RollbackInternal prepare_seq: %" PRIu64, GetId()); + + assert(db_impl_); + assert(wpt_db_); + + WriteBatch rollback_batch(0 /* reserved_bytes */, 0 /* max_bytes */, + write_options_.protection_bytes_per_key, + 0 /* default_cf_ts_sz */); + assert(GetId() != kMaxSequenceNumber); + assert(GetId() > 0); + auto cf_map_shared_ptr = wpt_db_->GetCFHandleMap(); + auto cf_comp_map_shared_ptr = wpt_db_->GetCFComparatorMap(); + auto read_at_seq = kMaxSequenceNumber; + ReadOptions roptions; + // to prevent callback's seq to be overrriden inside DBImpk::Get + roptions.snapshot = wpt_db_->GetMaxSnapshot(); + struct RollbackWriteBatchBuilder : public WriteBatch::Handler { + DBImpl* const db_; + WritePreparedTxnDB* const wpt_db_; + WritePreparedTxnReadCallback callback_; + WriteBatch* rollback_batch_; + std::map& comparators_; + std::map& handles_; + using CFKeys = std::set; + std::map keys_; + bool rollback_merge_operands_; + ReadOptions roptions_; + + RollbackWriteBatchBuilder( + DBImpl* db, WritePreparedTxnDB* wpt_db, SequenceNumber snap_seq, + WriteBatch* dst_batch, + std::map& comparators, + std::map& handles, + bool rollback_merge_operands, const ReadOptions& _roptions) + : db_(db), + wpt_db_(wpt_db), + callback_(wpt_db, snap_seq), // disable min_uncommitted optimization + rollback_batch_(dst_batch), + comparators_(comparators), + handles_(handles), + rollback_merge_operands_(rollback_merge_operands), + roptions_(_roptions) {} + + Status Rollback(uint32_t cf, const Slice& key) { + Status s; + CFKeys& cf_keys = keys_[cf]; + if (cf_keys.size() == 0) { // just inserted + auto cmp = comparators_[cf]; + keys_[cf] = CFKeys(SetComparator(cmp)); + } + auto it = cf_keys.insert(key); + // second is false if a element already existed. 
+ if (it.second == false) { + return s; + } + + PinnableSlice pinnable_val; + bool not_used; + auto cf_handle = handles_[cf]; + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = cf_handle; + get_impl_options.value = &pinnable_val; + get_impl_options.value_found = ¬_used; + get_impl_options.callback = &callback_; + s = db_->GetImpl(roptions_, key, get_impl_options); + assert(s.ok() || s.IsNotFound()); + if (s.ok()) { + s = rollback_batch_->Put(cf_handle, key, pinnable_val); + assert(s.ok()); + } else if (s.IsNotFound()) { + // There has been no readable value before txn. By adding a delete we + // make sure that there will be none afterwards either. + if (wpt_db_->ShouldRollbackWithSingleDelete(cf_handle, key)) { + s = rollback_batch_->SingleDelete(cf_handle, key); + } else { + s = rollback_batch_->Delete(cf_handle, key); + } + assert(s.ok()); + } else { + // Unexpected status. Return it to the user. + } + return s; + } + + Status PutCF(uint32_t cf, const Slice& key, const Slice& /*val*/) override { + return Rollback(cf, key); + } + + Status DeleteCF(uint32_t cf, const Slice& key) override { + return Rollback(cf, key); + } + + Status SingleDeleteCF(uint32_t cf, const Slice& key) override { + return Rollback(cf, key); + } + + Status MergeCF(uint32_t cf, const Slice& key, + const Slice& /*val*/) override { + if (rollback_merge_operands_) { + return Rollback(cf, key); + } else { + return Status::OK(); + } + } + + Status MarkNoop(bool) override { return Status::OK(); } + Status MarkBeginPrepare(bool) override { return Status::OK(); } + Status MarkEndPrepare(const Slice&) override { return Status::OK(); } + Status MarkCommit(const Slice&) override { return Status::OK(); } + Status MarkRollback(const Slice&) override { + return Status::InvalidArgument(); + } + + protected: + Handler::OptionState WriteAfterCommit() const override { + return Handler::OptionState::kDisabled; + } + } rollback_handler(db_impl_, wpt_db_, read_at_seq, &rollback_batch, + *cf_comp_map_shared_ptr.get(), *cf_map_shared_ptr.get(), + wpt_db_->txn_db_options_.rollback_merge_operands, + roptions); + auto s = GetWriteBatch()->GetWriteBatch()->Iterate(&rollback_handler); + if (!s.ok()) { + return s; + } + // The Rollback marker will be used as a batch separator + s = WriteBatchInternal::MarkRollback(&rollback_batch, name_); + assert(s.ok()); + bool do_one_write = !db_impl_->immutable_db_options().two_write_queues; + const bool DISABLE_MEMTABLE = true; + const uint64_t NO_REF_LOG = 0; + uint64_t seq_used = kMaxSequenceNumber; + const size_t ONE_BATCH = 1; + const bool kFirstPrepareBatch = true; + // We commit the rolled back prepared batches. Although this is + // counter-intuitive, i) it is safe to do so, since the prepared batches are + // already canceled out by the rollback batch, ii) adding the commit entry to + // CommitCache will allow us to benefit from the existing mechanism in + // CommitCache that keeps an entry evicted due to max advance and yet overlaps + // with a live snapshot around so that the live snapshot properly skips the + // entry even if its prepare seq is lower than max_evicted_seq_. 
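+  // With two_write_queues, the first WriteImpl below only persists the
+  // rollback batch (registered via add_prepared_callback); the commit of both
+  // the prepared batch and the rollback batch is then published by the
+  // second, memtable-disabled write further down.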
+ AddPreparedCallback add_prepared_callback( + wpt_db_, db_impl_, ONE_BATCH, + db_impl_->immutable_db_options().two_write_queues, !kFirstPrepareBatch); + WritePreparedCommitEntryPreReleaseCallback update_commit_map( + wpt_db_, db_impl_, GetId(), prepare_batch_cnt_, ONE_BATCH); + PreReleaseCallback* pre_release_callback; + if (do_one_write) { + pre_release_callback = &update_commit_map; + } else { + pre_release_callback = &add_prepared_callback; + } + // Note: the rollback batch does not need AddPrepared since it is written to + // DB in one shot. min_uncommitted still works since it requires capturing + // data that is written to DB but not yet committed, while + // the rollback batch commits with PreReleaseCallback. + s = db_impl_->WriteImpl(write_options_, &rollback_batch, nullptr, nullptr, + NO_REF_LOG, !DISABLE_MEMTABLE, &seq_used, ONE_BATCH, + pre_release_callback); + assert(!s.ok() || seq_used != kMaxSequenceNumber); + if (!s.ok()) { + return s; + } + if (do_one_write) { + assert(!db_impl_->immutable_db_options().two_write_queues); + wpt_db_->RemovePrepared(GetId(), prepare_batch_cnt_); + return s; + } // else do the 2nd write for commit + uint64_t rollback_seq = seq_used; + ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log, + "RollbackInternal 2nd write rollback_seq: %" PRIu64, + rollback_seq); + // Commit the batch by writing an empty batch to the queue that will release + // the commit sequence number to readers. + WritePreparedRollbackPreReleaseCallback update_commit_map_with_prepare( + wpt_db_, db_impl_, GetId(), rollback_seq, prepare_batch_cnt_); + WriteBatch empty_batch; + s = empty_batch.PutLogData(Slice()); + assert(s.ok()); + // In the absence of Prepare markers, use Noop as a batch separator + s = WriteBatchInternal::InsertNoop(&empty_batch); + assert(s.ok()); + s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr, + NO_REF_LOG, DISABLE_MEMTABLE, &seq_used, ONE_BATCH, + &update_commit_map_with_prepare); + assert(!s.ok() || seq_used != kMaxSequenceNumber); + ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log, + "RollbackInternal (status=%s) commit: %" PRIu64, + s.ToString().c_str(), GetId()); + // TODO(lth): For WriteUnPrepared that rollback is called frequently, + // RemovePrepared could be moved to the callback to reduce lock contention. + if (s.ok()) { + wpt_db_->RemovePrepared(GetId(), prepare_batch_cnt_); + } + // Note: RemovePrepared for prepared batch is called from within + // PreReleaseCallback + wpt_db_->RemovePrepared(rollback_seq, ONE_BATCH); + + return s; +} + +Status WritePreparedTxn::ValidateSnapshot(ColumnFamilyHandle* column_family, + const Slice& key, + SequenceNumber* tracked_at_seq) { + assert(snapshot_); + + SequenceNumber min_uncommitted = + static_cast_with_check(snapshot_.get()) + ->min_uncommitted_; + SequenceNumber snap_seq = snapshot_->GetSequenceNumber(); + // tracked_at_seq is either max or the last snapshot with which this key was + // trackeed so there is no need to apply the IsInSnapshot to this comparison + // here as tracked_at_seq is not a prepare seq. + if (*tracked_at_seq <= snap_seq) { + // If the key has been previous validated at a sequence number earlier + // than the curent snapshot's sequence number, we already know it has not + // been modified. + return Status::OK(); + } + + *tracked_at_seq = snap_seq; + + ColumnFamilyHandle* cfh = + column_family ? 
column_family : db_impl_->DefaultColumnFamily(); + + WritePreparedTxnReadCallback snap_checker(wpt_db_, snap_seq, min_uncommitted, + kBackedByDBSnapshot); + // TODO(yanqin): support user-defined timestamp + return TransactionUtil::CheckKeyForConflicts( + db_impl_, cfh, key.ToString(), snap_seq, /*ts=*/nullptr, + false /* cache_only */, &snap_checker, min_uncommitted); +} + +void WritePreparedTxn::SetSnapshot() { + const bool kForWWConflictCheck = true; + SnapshotImpl* snapshot = wpt_db_->GetSnapshotInternal(kForWWConflictCheck); + SetSnapshotInternal(snapshot); +} + +Status WritePreparedTxn::RebuildFromWriteBatch(WriteBatch* src_batch) { + auto ret = PessimisticTransaction::RebuildFromWriteBatch(src_batch); + prepare_batch_cnt_ = GetWriteBatch()->SubBatchCnt(); + return ret; +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/write_prepared_txn.h b/src/rocksdb/utilities/transactions/write_prepared_txn.h new file mode 100644 index 000000000..30d9bdb99 --- /dev/null +++ b/src/rocksdb/utilities/transactions/write_prepared_txn.h @@ -0,0 +1,119 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include +#include +#include +#include + +#include "db/write_callback.h" +#include "rocksdb/db.h" +#include "rocksdb/slice.h" +#include "rocksdb/snapshot.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/transaction_db.h" +#include "rocksdb/utilities/write_batch_with_index.h" +#include "util/autovector.h" +#include "utilities/transactions/pessimistic_transaction.h" +#include "utilities/transactions/pessimistic_transaction_db.h" +#include "utilities/transactions/transaction_base.h" +#include "utilities/transactions/transaction_util.h" + +namespace ROCKSDB_NAMESPACE { + +class WritePreparedTxnDB; + +// This impl could write to DB also uncommitted data and then later tell apart +// committed data from uncommitted data. Uncommitted data could be after the +// Prepare phase in 2PC (WritePreparedTxn) or before that +// (WriteUnpreparedTxnImpl). +class WritePreparedTxn : public PessimisticTransaction { + public: + WritePreparedTxn(WritePreparedTxnDB* db, const WriteOptions& write_options, + const TransactionOptions& txn_options); + // No copying allowed + WritePreparedTxn(const WritePreparedTxn&) = delete; + void operator=(const WritePreparedTxn&) = delete; + + virtual ~WritePreparedTxn() {} + + // To make WAL commit markers visible, the snapshot will be based on the last + // seq in the WAL that is also published, LastPublishedSequence, as opposed to + // the last seq in the memtable. + using Transaction::Get; + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) override; + + using Transaction::MultiGet; + virtual void MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool sorted_input = false) override; + + // Note: The behavior is undefined in presence of interleaved writes to the + // same transaction. 
+ // To make WAL commit markers visible, the snapshot will be + // based on the last seq in the WAL that is also published, + // LastPublishedSequence, as opposed to the last seq in the memtable. + using Transaction::GetIterator; + virtual Iterator* GetIterator(const ReadOptions& options) override; + virtual Iterator* GetIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family) override; + + virtual void SetSnapshot() override; + + protected: + void Initialize(const TransactionOptions& txn_options) override; + // Override the protected SetId to make it visible to the friend class + // WritePreparedTxnDB + inline void SetId(uint64_t id) override { Transaction::SetId(id); } + + private: + friend class WritePreparedTransactionTest_BasicRecoveryTest_Test; + friend class WritePreparedTxnDB; + friend class WriteUnpreparedTxnDB; + friend class WriteUnpreparedTxn; + + Status PrepareInternal() override; + + Status CommitWithoutPrepareInternal() override; + + Status CommitBatchInternal(WriteBatch* batch, size_t batch_cnt) override; + + // Since the data is already written to memtables at the Prepare phase, the + // commit entails writing only a commit marker in the WAL. The sequence number + // of the commit marker is then the commit timestamp of the transaction. To + // make WAL commit markers visible, the snapshot will be based on the last seq + // in the WAL that is also published, LastPublishedSequence, as opposed to the + // last seq in the memtable. + Status CommitInternal() override; + + Status RollbackInternal() override; + + virtual Status ValidateSnapshot(ColumnFamilyHandle* column_family, + const Slice& key, + SequenceNumber* tracked_at_seq) override; + + virtual Status RebuildFromWriteBatch(WriteBatch* src_batch) override; + + WritePreparedTxnDB* wpt_db_; + // Number of sub-batches in prepare + size_t prepare_batch_cnt_ = 0; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/write_prepared_txn_db.cc b/src/rocksdb/utilities/transactions/write_prepared_txn_db.cc new file mode 100644 index 000000000..595c3df8f --- /dev/null +++ b/src/rocksdb/utilities/transactions/write_prepared_txn_db.cc @@ -0,0 +1,1030 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "utilities/transactions/write_prepared_txn_db.h" + +#include +#include +#include +#include +#include + +#include "db/arena_wrapped_db_iter.h" +#include "db/db_impl/db_impl.h" +#include "logging/logging.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/utilities/transaction_db.h" +#include "test_util/sync_point.h" +#include "util/cast_util.h" +#include "util/mutexlock.h" +#include "util/string_util.h" +#include "utilities/transactions/pessimistic_transaction.h" +#include "utilities/transactions/transaction_db_mutex_impl.h" + +// This function is for testing only. If it returns true, then all entries in +// the commit cache will be evicted. Unit and/or stress tests (db_stress) +// can implement this function and customize how frequently commit cache +// eviction occurs. +// TODO: remove this function once we can configure commit cache to be very +// small so that eviction occurs very frequently. 
This requires the commit +// cache entry to be able to encode prepare and commit sequence numbers so that +// the commit sequence number does not have to be within a certain range of +// prepare sequence number. +extern "C" bool rocksdb_write_prepared_TEST_ShouldClearCommitCache(void) + __attribute__((__weak__)); + +namespace ROCKSDB_NAMESPACE { + +Status WritePreparedTxnDB::Initialize( + const std::vector& compaction_enabled_cf_indices, + const std::vector& handles) { + auto dbimpl = static_cast_with_check(GetRootDB()); + assert(dbimpl != nullptr); + auto rtxns = dbimpl->recovered_transactions(); + std::map ordered_seq_cnt; + for (auto rtxn : rtxns) { + // There should only one batch for WritePrepared policy. + assert(rtxn.second->batches_.size() == 1); + const auto& seq = rtxn.second->batches_.begin()->first; + const auto& batch_info = rtxn.second->batches_.begin()->second; + auto cnt = batch_info.batch_cnt_ ? batch_info.batch_cnt_ : 1; + ordered_seq_cnt[seq] = cnt; + } + // AddPrepared must be called in order + for (auto seq_cnt : ordered_seq_cnt) { + auto seq = seq_cnt.first; + auto cnt = seq_cnt.second; + for (size_t i = 0; i < cnt; i++) { + AddPrepared(seq + i); + } + } + SequenceNumber prev_max = max_evicted_seq_; + SequenceNumber last_seq = db_impl_->GetLatestSequenceNumber(); + AdvanceMaxEvictedSeq(prev_max, last_seq); + // Create a gap between max and the next snapshot. This simplifies the logic + // in IsInSnapshot by not having to consider the special case of max == + // snapshot after recovery. This is tested in IsInSnapshotEmptyMapTest. + if (last_seq) { + db_impl_->versions_->SetLastAllocatedSequence(last_seq + 1); + db_impl_->versions_->SetLastSequence(last_seq + 1); + db_impl_->versions_->SetLastPublishedSequence(last_seq + 1); + } + + db_impl_->SetSnapshotChecker(new WritePreparedSnapshotChecker(this)); + // A callback to commit a single sub-batch + class CommitSubBatchPreReleaseCallback : public PreReleaseCallback { + public: + explicit CommitSubBatchPreReleaseCallback(WritePreparedTxnDB* db) + : db_(db) {} + Status Callback(SequenceNumber commit_seq, + bool is_mem_disabled __attribute__((__unused__)), uint64_t, + size_t /*index*/, size_t /*total*/) override { + assert(!is_mem_disabled); + db_->AddCommitted(commit_seq, commit_seq); + return Status::OK(); + } + + private: + WritePreparedTxnDB* db_; + }; + db_impl_->SetRecoverableStatePreReleaseCallback( + new CommitSubBatchPreReleaseCallback(this)); + + auto s = PessimisticTransactionDB::Initialize(compaction_enabled_cf_indices, + handles); + return s; +} + +Status WritePreparedTxnDB::VerifyCFOptions( + const ColumnFamilyOptions& cf_options) { + Status s = PessimisticTransactionDB::VerifyCFOptions(cf_options); + if (!s.ok()) { + return s; + } + if (!cf_options.memtable_factory->CanHandleDuplicatedKey()) { + return Status::InvalidArgument( + "memtable_factory->CanHandleDuplicatedKey() cannot be false with " + "WritePrpeared transactions"); + } + return Status::OK(); +} + +Transaction* WritePreparedTxnDB::BeginTransaction( + const WriteOptions& write_options, const TransactionOptions& txn_options, + Transaction* old_txn) { + if (old_txn != nullptr) { + ReinitializeTransaction(old_txn, write_options, txn_options); + return old_txn; + } else { + return new WritePreparedTxn(this, write_options, txn_options); + } +} + +Status WritePreparedTxnDB::Write(const WriteOptions& opts, + WriteBatch* updates) { + if (txn_db_options_.skip_concurrency_control) { + // Skip locking the rows + const size_t UNKNOWN_BATCH_CNT = 0; + 
WritePreparedTxn* NO_TXN = nullptr; + return WriteInternal(opts, updates, UNKNOWN_BATCH_CNT, NO_TXN); + } else { + return PessimisticTransactionDB::WriteWithConcurrencyControl(opts, updates); + } +} + +Status WritePreparedTxnDB::Write( + const WriteOptions& opts, + const TransactionDBWriteOptimizations& optimizations, WriteBatch* updates) { + if (optimizations.skip_concurrency_control) { + // Skip locking the rows + const size_t UNKNOWN_BATCH_CNT = 0; + const size_t ONE_BATCH_CNT = 1; + const size_t batch_cnt = optimizations.skip_duplicate_key_check + ? ONE_BATCH_CNT + : UNKNOWN_BATCH_CNT; + WritePreparedTxn* NO_TXN = nullptr; + return WriteInternal(opts, updates, batch_cnt, NO_TXN); + } else { + // TODO(myabandeh): Make use of skip_duplicate_key_check hint + // Fall back to unoptimized version + return PessimisticTransactionDB::WriteWithConcurrencyControl(opts, updates); + } +} + +Status WritePreparedTxnDB::WriteInternal(const WriteOptions& write_options_orig, + WriteBatch* batch, size_t batch_cnt, + WritePreparedTxn* txn) { + ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log, + "CommitBatchInternal"); + if (batch->Count() == 0) { + // Otherwise our 1 seq per batch logic will break since there is no seq + // increased for this batch. + return Status::OK(); + } + + if (write_options_orig.protection_bytes_per_key > 0) { + auto s = WriteBatchInternal::UpdateProtectionInfo( + batch, write_options_orig.protection_bytes_per_key); + if (!s.ok()) { + return s; + } + } + + if (batch_cnt == 0) { // not provided, then compute it + // TODO(myabandeh): add an option to allow user skipping this cost + SubBatchCounter counter(*GetCFComparatorMap()); + auto s = batch->Iterate(&counter); + if (!s.ok()) { + return s; + } + batch_cnt = counter.BatchCount(); + WPRecordTick(TXN_DUPLICATE_KEY_OVERHEAD); + ROCKS_LOG_DETAILS(info_log_, "Duplicate key overhead: %" PRIu64 " batches", + static_cast(batch_cnt)); + } + assert(batch_cnt); + + bool do_one_write = !db_impl_->immutable_db_options().two_write_queues; + WriteOptions write_options(write_options_orig); + // In the absence of Prepare markers, use Noop as a batch separator + auto s = WriteBatchInternal::InsertNoop(batch); + assert(s.ok()); + const bool DISABLE_MEMTABLE = true; + const uint64_t no_log_ref = 0; + uint64_t seq_used = kMaxSequenceNumber; + const size_t ZERO_PREPARES = 0; + const bool kSeperatePrepareCommitBatches = true; + // Since this is not 2pc, there is no need for AddPrepared but having it in + // the PreReleaseCallback enables an optimization. Refer to + // SmallestUnCommittedSeq for more details. 
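+  // As in the 2PC commit path, the callback attached to the first write
+  // depends on whether a second write will follow: with a single write queue
+  // the commit map is updated directly, otherwise the entry is first added as
+  // prepared and the commit is published by the follow-up write below.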
+ AddPreparedCallback add_prepared_callback( + this, db_impl_, batch_cnt, + db_impl_->immutable_db_options().two_write_queues, + !kSeperatePrepareCommitBatches); + WritePreparedCommitEntryPreReleaseCallback update_commit_map( + this, db_impl_, kMaxSequenceNumber, ZERO_PREPARES, batch_cnt); + PreReleaseCallback* pre_release_callback; + if (do_one_write) { + pre_release_callback = &update_commit_map; + } else { + pre_release_callback = &add_prepared_callback; + } + s = db_impl_->WriteImpl(write_options, batch, nullptr, nullptr, no_log_ref, + !DISABLE_MEMTABLE, &seq_used, batch_cnt, + pre_release_callback); + assert(!s.ok() || seq_used != kMaxSequenceNumber); + uint64_t prepare_seq = seq_used; + if (txn != nullptr) { + txn->SetId(prepare_seq); + } + if (!s.ok()) { + return s; + } + if (do_one_write) { + return s; + } // else do the 2nd write for commit + ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log, + "CommitBatchInternal 2nd write prepare_seq: %" PRIu64, + prepare_seq); + // Commit the batch by writing an empty batch to the 2nd queue that will + // release the commit sequence number to readers. + const size_t ZERO_COMMITS = 0; + WritePreparedCommitEntryPreReleaseCallback update_commit_map_with_prepare( + this, db_impl_, prepare_seq, batch_cnt, ZERO_COMMITS); + WriteBatch empty_batch; + write_options.disableWAL = true; + write_options.sync = false; + const size_t ONE_BATCH = 1; // Just to inc the seq + s = db_impl_->WriteImpl(write_options, &empty_batch, nullptr, nullptr, + no_log_ref, DISABLE_MEMTABLE, &seq_used, ONE_BATCH, + &update_commit_map_with_prepare); + assert(!s.ok() || seq_used != kMaxSequenceNumber); + // Note: RemovePrepared is called from within PreReleaseCallback + return s; +} + +Status WritePreparedTxnDB::Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value) { + SequenceNumber min_uncommitted, snap_seq; + const SnapshotBackup backed_by_snapshot = + AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); + WritePreparedTxnReadCallback callback(this, snap_seq, min_uncommitted, + backed_by_snapshot); + bool* dont_care = nullptr; + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = column_family; + get_impl_options.value = value; + get_impl_options.value_found = dont_care; + get_impl_options.callback = &callback; + auto res = db_impl_->GetImpl(options, key, get_impl_options); + if (LIKELY(callback.valid() && ValidateSnapshot(callback.max_visible_seq(), + backed_by_snapshot))) { + return res; + } else { + res.PermitUncheckedError(); + WPRecordTick(TXN_GET_TRY_AGAIN); + return Status::TryAgain(); + } +} + +void WritePreparedTxnDB::UpdateCFComparatorMap( + const std::vector& handles) { + auto cf_map = new std::map(); + auto handle_map = new std::map(); + for (auto h : handles) { + auto id = h->GetID(); + const Comparator* comparator = h->GetComparator(); + (*cf_map)[id] = comparator; + if (id != 0) { + (*handle_map)[id] = h; + } else { + // The pointer to the default cf handle in the handles will be deleted. + // Use the pointer maintained by the db instead. 
+ (*handle_map)[id] = DefaultColumnFamily(); + } + } + cf_map_.reset(cf_map); + handle_map_.reset(handle_map); +} + +void WritePreparedTxnDB::UpdateCFComparatorMap(ColumnFamilyHandle* h) { + auto old_cf_map_ptr = cf_map_.get(); + assert(old_cf_map_ptr); + auto cf_map = new std::map(*old_cf_map_ptr); + auto old_handle_map_ptr = handle_map_.get(); + assert(old_handle_map_ptr); + auto handle_map = + new std::map(*old_handle_map_ptr); + auto id = h->GetID(); + const Comparator* comparator = h->GetComparator(); + (*cf_map)[id] = comparator; + (*handle_map)[id] = h; + cf_map_.reset(cf_map); + handle_map_.reset(handle_map); +} + +std::vector WritePreparedTxnDB::MultiGet( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, std::vector* values) { + assert(values); + size_t num_keys = keys.size(); + values->resize(num_keys); + + std::vector stat_list(num_keys); + for (size_t i = 0; i < num_keys; ++i) { + stat_list[i] = this->Get(options, column_family[i], keys[i], &(*values)[i]); + } + return stat_list; +} + +// Struct to hold ownership of snapshot and read callback for iterator cleanup. +struct WritePreparedTxnDB::IteratorState { + IteratorState(WritePreparedTxnDB* txn_db, SequenceNumber sequence, + std::shared_ptr s, + SequenceNumber min_uncommitted) + : callback(txn_db, sequence, min_uncommitted, kBackedByDBSnapshot), + snapshot(s) {} + + WritePreparedTxnReadCallback callback; + std::shared_ptr snapshot; +}; + +namespace { +static void CleanupWritePreparedTxnDBIterator(void* arg1, void* /*arg2*/) { + delete reinterpret_cast(arg1); +} +} // anonymous namespace + +Iterator* WritePreparedTxnDB::NewIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family) { + constexpr bool expose_blob_index = false; + constexpr bool allow_refresh = false; + std::shared_ptr own_snapshot = nullptr; + SequenceNumber snapshot_seq = kMaxSequenceNumber; + SequenceNumber min_uncommitted = 0; + if (options.snapshot != nullptr) { + snapshot_seq = options.snapshot->GetSequenceNumber(); + min_uncommitted = + static_cast_with_check(options.snapshot) + ->min_uncommitted_; + } else { + auto* snapshot = GetSnapshot(); + // We take a snapshot to make sure that the related data in the commit map + // are not deleted. + snapshot_seq = snapshot->GetSequenceNumber(); + min_uncommitted = + static_cast_with_check(snapshot)->min_uncommitted_; + own_snapshot = std::make_shared(db_impl_, snapshot); + } + assert(snapshot_seq != kMaxSequenceNumber); + auto* cfd = + static_cast_with_check(column_family)->cfd(); + auto* state = + new IteratorState(this, snapshot_seq, own_snapshot, min_uncommitted); + auto* db_iter = + db_impl_->NewIteratorImpl(options, cfd, snapshot_seq, &state->callback, + expose_blob_index, allow_refresh); + db_iter->RegisterCleanup(CleanupWritePreparedTxnDBIterator, state, nullptr); + return db_iter; +} + +Status WritePreparedTxnDB::NewIterators( + const ReadOptions& options, + const std::vector& column_families, + std::vector* iterators) { + constexpr bool expose_blob_index = false; + constexpr bool allow_refresh = false; + std::shared_ptr own_snapshot = nullptr; + SequenceNumber snapshot_seq = kMaxSequenceNumber; + SequenceNumber min_uncommitted = 0; + if (options.snapshot != nullptr) { + snapshot_seq = options.snapshot->GetSequenceNumber(); + min_uncommitted = + static_cast_with_check(options.snapshot) + ->min_uncommitted_; + } else { + auto* snapshot = GetSnapshot(); + // We take a snapshot to make sure that the related data in the commit map + // are not deleted. 
+ snapshot_seq = snapshot->GetSequenceNumber(); + own_snapshot = std::make_shared(db_impl_, snapshot); + min_uncommitted = + static_cast_with_check(snapshot)->min_uncommitted_; + } + iterators->clear(); + iterators->reserve(column_families.size()); + for (auto* column_family : column_families) { + auto* cfd = + static_cast_with_check(column_family)->cfd(); + auto* state = + new IteratorState(this, snapshot_seq, own_snapshot, min_uncommitted); + auto* db_iter = + db_impl_->NewIteratorImpl(options, cfd, snapshot_seq, &state->callback, + expose_blob_index, allow_refresh); + db_iter->RegisterCleanup(CleanupWritePreparedTxnDBIterator, state, nullptr); + iterators->push_back(db_iter); + } + return Status::OK(); +} + +void WritePreparedTxnDB::Init(const TransactionDBOptions& txn_db_opts) { + // Adcance max_evicted_seq_ no more than 100 times before the cache wraps + // around. + INC_STEP_FOR_MAX_EVICTED = + std::max(COMMIT_CACHE_SIZE / 100, static_cast(1)); + snapshot_cache_ = std::unique_ptr[]>( + new std::atomic[SNAPSHOT_CACHE_SIZE] {}); + commit_cache_ = std::unique_ptr[]>( + new std::atomic[COMMIT_CACHE_SIZE] {}); + dummy_max_snapshot_.number_ = kMaxSequenceNumber; + rollback_deletion_type_callback_ = + txn_db_opts.rollback_deletion_type_callback; +} + +void WritePreparedTxnDB::CheckPreparedAgainstMax(SequenceNumber new_max, + bool locked) { + // When max_evicted_seq_ advances, move older entries from prepared_txns_ + // to delayed_prepared_. This guarantees that if a seq is lower than max, + // then it is not in prepared_txns_ and save an expensive, synchronized + // lookup from a shared set. delayed_prepared_ is expected to be empty in + // normal cases. + ROCKS_LOG_DETAILS( + info_log_, + "CheckPreparedAgainstMax prepared_txns_.empty() %d top: %" PRIu64, + prepared_txns_.empty(), + prepared_txns_.empty() ? 0 : prepared_txns_.top()); + const SequenceNumber prepared_top = prepared_txns_.top(); + const bool empty = prepared_top == kMaxSequenceNumber; + // Preliminary check to avoid the synchronization cost + if (!empty && prepared_top <= new_max) { + if (locked) { + // Needed to avoid double locking in pop(). + prepared_txns_.push_pop_mutex()->Unlock(); + } + WriteLock wl(&prepared_mutex_); + // Need to fetch fresh values of ::top after mutex is acquired + while (!prepared_txns_.empty() && prepared_txns_.top() <= new_max) { + auto to_be_popped = prepared_txns_.top(); + delayed_prepared_.insert(to_be_popped); + ROCKS_LOG_WARN(info_log_, + "prepared_mutex_ overhead %" PRIu64 " (prep=%" PRIu64 + " new_max=%" PRIu64 ")", + static_cast(delayed_prepared_.size()), + to_be_popped, new_max); + delayed_prepared_empty_.store(false, std::memory_order_release); + // Update prepared_txns_ after updating delayed_prepared_empty_ otherwise + // there will be a point in time that the entry is neither in + // prepared_txns_ nor in delayed_prepared_, which will not be checked if + // delayed_prepared_empty_ is false. 
+ prepared_txns_.pop(); + } + if (locked) { + prepared_txns_.push_pop_mutex()->Lock(); + } + } +} + +void WritePreparedTxnDB::AddPrepared(uint64_t seq, bool locked) { + ROCKS_LOG_DETAILS(info_log_, "Txn %" PRIu64 " Preparing with max %" PRIu64, + seq, max_evicted_seq_.load()); + TEST_SYNC_POINT("AddPrepared::begin:pause"); + TEST_SYNC_POINT("AddPrepared::begin:resume"); + if (!locked) { + prepared_txns_.push_pop_mutex()->Lock(); + } + prepared_txns_.push_pop_mutex()->AssertHeld(); + prepared_txns_.push(seq); + auto new_max = future_max_evicted_seq_.load(); + if (UNLIKELY(seq <= new_max)) { + // This should not happen in normal case + ROCKS_LOG_ERROR( + info_log_, + "Added prepare_seq is not larger than max_evicted_seq_: %" PRIu64 + " <= %" PRIu64, + seq, new_max); + CheckPreparedAgainstMax(new_max, true /*locked*/); + } + if (!locked) { + prepared_txns_.push_pop_mutex()->Unlock(); + } + TEST_SYNC_POINT("AddPrepared::end"); +} + +void WritePreparedTxnDB::AddCommitted(uint64_t prepare_seq, uint64_t commit_seq, + uint8_t loop_cnt) { + ROCKS_LOG_DETAILS(info_log_, "Txn %" PRIu64 " Committing with %" PRIu64, + prepare_seq, commit_seq); + TEST_SYNC_POINT("WritePreparedTxnDB::AddCommitted:start"); + TEST_SYNC_POINT("WritePreparedTxnDB::AddCommitted:start:pause"); + auto indexed_seq = prepare_seq % COMMIT_CACHE_SIZE; + CommitEntry64b evicted_64b; + CommitEntry evicted; + bool to_be_evicted = GetCommitEntry(indexed_seq, &evicted_64b, &evicted); + if (LIKELY(to_be_evicted)) { + assert(evicted.prep_seq != prepare_seq); + auto prev_max = max_evicted_seq_.load(std::memory_order_acquire); + ROCKS_LOG_DETAILS(info_log_, + "Evicting %" PRIu64 ",%" PRIu64 " with max %" PRIu64, + evicted.prep_seq, evicted.commit_seq, prev_max); + if (prev_max < evicted.commit_seq) { + auto last = db_impl_->GetLastPublishedSequence(); // could be 0 + SequenceNumber max_evicted_seq; + if (LIKELY(evicted.commit_seq < last)) { + assert(last > 0); + // Inc max in larger steps to avoid frequent updates + max_evicted_seq = + std::min(evicted.commit_seq + INC_STEP_FOR_MAX_EVICTED, last - 1); + } else { + // legit when a commit entry in a write batch overwrite the previous one + max_evicted_seq = evicted.commit_seq; + } +#ifdef OS_LINUX + if (rocksdb_write_prepared_TEST_ShouldClearCommitCache && + rocksdb_write_prepared_TEST_ShouldClearCommitCache()) { + max_evicted_seq = last; + } +#endif // OS_LINUX + ROCKS_LOG_DETAILS(info_log_, + "%lu Evicting %" PRIu64 ",%" PRIu64 " with max %" PRIu64 + " => %lu", + prepare_seq, evicted.prep_seq, evicted.commit_seq, + prev_max, max_evicted_seq); + AdvanceMaxEvictedSeq(prev_max, max_evicted_seq); + } + if (UNLIKELY(!delayed_prepared_empty_.load(std::memory_order_acquire))) { + WriteLock wl(&prepared_mutex_); + auto dp_iter = delayed_prepared_.find(evicted.prep_seq); + if (dp_iter != delayed_prepared_.end()) { + // This is a rare case that txn is committed but prepared_txns_ is not + // cleaned up yet. Refer to delayed_prepared_commits_ definition for + // why it should be kept updated. + delayed_prepared_commits_[evicted.prep_seq] = evicted.commit_seq; + ROCKS_LOG_DEBUG(info_log_, + "delayed_prepared_commits_[%" PRIu64 "]=%" PRIu64, + evicted.prep_seq, evicted.commit_seq); + } + } + // After each eviction from commit cache, check if the commit entry should + // be kept around because it overlaps with a live snapshot. 
+ CheckAgainstSnapshots(evicted); + } + bool succ = + ExchangeCommitEntry(indexed_seq, evicted_64b, {prepare_seq, commit_seq}); + if (UNLIKELY(!succ)) { + ROCKS_LOG_ERROR(info_log_, + "ExchangeCommitEntry failed on [%" PRIu64 "] %" PRIu64 + ",%" PRIu64 " retrying...", + indexed_seq, prepare_seq, commit_seq); + // A very rare event, in which the commit entry is updated before we do. + // Here we apply a very simple solution of retrying. + if (loop_cnt > 100) { + throw std::runtime_error("Infinite loop in AddCommitted!"); + } + AddCommitted(prepare_seq, commit_seq, ++loop_cnt); + return; + } + TEST_SYNC_POINT("WritePreparedTxnDB::AddCommitted:end"); + TEST_SYNC_POINT("WritePreparedTxnDB::AddCommitted:end:pause"); +} + +void WritePreparedTxnDB::RemovePrepared(const uint64_t prepare_seq, + const size_t batch_cnt) { + TEST_SYNC_POINT_CALLBACK( + "RemovePrepared:Start", + const_cast(reinterpret_cast(&prepare_seq))); + TEST_SYNC_POINT("WritePreparedTxnDB::RemovePrepared:pause"); + TEST_SYNC_POINT("WritePreparedTxnDB::RemovePrepared:resume"); + ROCKS_LOG_DETAILS(info_log_, + "RemovePrepared %" PRIu64 " cnt: %" ROCKSDB_PRIszt, + prepare_seq, batch_cnt); + WriteLock wl(&prepared_mutex_); + for (size_t i = 0; i < batch_cnt; i++) { + prepared_txns_.erase(prepare_seq + i); + bool was_empty = delayed_prepared_.empty(); + if (!was_empty) { + delayed_prepared_.erase(prepare_seq + i); + auto it = delayed_prepared_commits_.find(prepare_seq + i); + if (it != delayed_prepared_commits_.end()) { + ROCKS_LOG_DETAILS(info_log_, "delayed_prepared_commits_.erase %" PRIu64, + prepare_seq + i); + delayed_prepared_commits_.erase(it); + } + bool is_empty = delayed_prepared_.empty(); + if (was_empty != is_empty) { + delayed_prepared_empty_.store(is_empty, std::memory_order_release); + } + } + } +} + +bool WritePreparedTxnDB::GetCommitEntry(const uint64_t indexed_seq, + CommitEntry64b* entry_64b, + CommitEntry* entry) const { + *entry_64b = commit_cache_[static_cast(indexed_seq)].load( + std::memory_order_acquire); + bool valid = entry_64b->Parse(indexed_seq, entry, FORMAT); + return valid; +} + +bool WritePreparedTxnDB::AddCommitEntry(const uint64_t indexed_seq, + const CommitEntry& new_entry, + CommitEntry* evicted_entry) { + CommitEntry64b new_entry_64b(new_entry, FORMAT); + CommitEntry64b evicted_entry_64b = + commit_cache_[static_cast(indexed_seq)].exchange( + new_entry_64b, std::memory_order_acq_rel); + bool valid = evicted_entry_64b.Parse(indexed_seq, evicted_entry, FORMAT); + return valid; +} + +bool WritePreparedTxnDB::ExchangeCommitEntry(const uint64_t indexed_seq, + CommitEntry64b& expected_entry_64b, + const CommitEntry& new_entry) { + auto& atomic_entry = commit_cache_[static_cast(indexed_seq)]; + CommitEntry64b new_entry_64b(new_entry, FORMAT); + bool succ = atomic_entry.compare_exchange_strong( + expected_entry_64b, new_entry_64b, std::memory_order_acq_rel, + std::memory_order_acquire); + return succ; +} + +void WritePreparedTxnDB::AdvanceMaxEvictedSeq(const SequenceNumber& prev_max, + const SequenceNumber& new_max) { + ROCKS_LOG_DETAILS(info_log_, + "AdvanceMaxEvictedSeq overhead %" PRIu64 " => %" PRIu64, + prev_max, new_max); + // Declare the intention before getting snapshot from the DB. This helps a + // concurrent GetSnapshot to wait to catch up with future_max_evicted_seq_ if + // it has not already. Otherwise the new snapshot is when we ask DB for + // snapshots smaller than future max. 
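+ // Illustrative example of the monotonic update below (hypothetical values):
+ // if two threads enter with new_max values 150 and 180 while
+ // future_max_evicted_seq_ is 100, the compare-and-swap loop only ever raises
+ // the value, so regardless of the interleaving it ends up at 180.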
+ auto updated_future_max = prev_max;
+ while (updated_future_max < new_max &&
+ !future_max_evicted_seq_.compare_exchange_weak(
+ updated_future_max, new_max, std::memory_order_acq_rel,
+ std::memory_order_relaxed)) {
+ };
+
+ CheckPreparedAgainstMax(new_max, false /*locked*/);
+
+ // With each change to max_evicted_seq_ fetch the live snapshots behind it.
+ // We use max as the version of snapshots to identify how fresh the
+ // snapshot list is. This works because the snapshots are between 0 and
+ // max, so the larger the max, the more complete they are.
+ SequenceNumber new_snapshots_version = new_max;
+ std::vector<SequenceNumber> snapshots;
+ bool update_snapshots = false;
+ if (new_snapshots_version > snapshots_version_) {
+ // This is to avoid updating the snapshots_ if it has already been updated
+ // with a more recent version by a concurrent thread
+ update_snapshots = true;
+ // We only care about snapshots lower than max
+ snapshots = GetSnapshotListFromDB(new_max);
+ }
+ if (update_snapshots) {
+ UpdateSnapshots(snapshots, new_snapshots_version);
+ if (!snapshots.empty()) {
+ WriteLock wl(&old_commit_map_mutex_);
+ for (auto snap : snapshots) {
+ // This allows IsInSnapshot to tell apart reads from invalid (released)
+ // snapshots from reads of committed values in valid snapshots.
+ old_commit_map_[snap];
+ }
+ old_commit_map_empty_.store(false, std::memory_order_release);
+ }
+ }
+ auto updated_prev_max = prev_max;
+ TEST_SYNC_POINT("AdvanceMaxEvictedSeq::update_max:pause");
+ TEST_SYNC_POINT("AdvanceMaxEvictedSeq::update_max:resume");
+ while (updated_prev_max < new_max &&
+ !max_evicted_seq_.compare_exchange_weak(updated_prev_max, new_max,
+ std::memory_order_acq_rel,
+ std::memory_order_relaxed)) {
+ };
+}
+
+const Snapshot* WritePreparedTxnDB::GetSnapshot() {
+ const bool kForWWConflictCheck = true;
+ return GetSnapshotInternal(!kForWWConflictCheck);
+}
+
+SnapshotImpl* WritePreparedTxnDB::GetSnapshotInternal(
+ bool for_ww_conflict_check) {
+ // Note: for this optimization setting the last sequence number and obtaining
+ // the smallest uncommitted seq should be done atomically. However to avoid
+ // the mutex overhead, we call SmallestUnCommittedSeq BEFORE taking the
+ // snapshot. Since we always update the list of unprepared seqs (via
+ // AddPrepared) AFTER the last sequence is updated, this guarantees that the
+ // smallest uncommitted seq that we pair with the snapshot is smaller than or
+ // equal to the value that would be obtained otherwise atomically. That is ok
+ // since this optimization works as long as min_uncommitted is less than or
+ // equal to the smallest uncommitted seq when the snapshot was taken.
+ auto min_uncommitted = WritePreparedTxnDB::SmallestUnCommittedSeq();
+ SnapshotImpl* snap_impl = db_impl_->GetSnapshotImpl(for_ww_conflict_check);
+ TEST_SYNC_POINT("WritePreparedTxnDB::GetSnapshotInternal:first");
+ assert(snap_impl);
+ SequenceNumber snap_seq = snap_impl->GetSequenceNumber();
+ // Note: Check against future_max_evicted_seq_ (in contrast with
+ // max_evicted_seq_) in case there is a concurrent AdvanceMaxEvictedSeq.
+ if (UNLIKELY(snap_seq != 0 && snap_seq <= future_max_evicted_seq_)) {
+ // There is a very rare case in which the commit entry evicts another commit
+ // entry that is not published yet thus advancing max evicted seq beyond the
+ // last published seq. This case is not likely in real-world setup so we
+ // handle it with a few retries.
+ size_t retry = 0;
+ SequenceNumber max;
+ while ((max = future_max_evicted_seq_.load()) != 0 &&
+ snap_impl->GetSequenceNumber() <= max && retry < 100) {
+ ROCKS_LOG_WARN(info_log_,
+ "GetSnapshot snap: %" PRIu64 " max: %" PRIu64
+ " retry %" ROCKSDB_PRIszt,
+ snap_impl->GetSequenceNumber(), max, retry);
+ ReleaseSnapshot(snap_impl);
+ // Wait for last visible seq to catch up with max, and also go beyond it
+ // by one.
+ AdvanceSeqByOne();
+ snap_impl = db_impl_->GetSnapshotImpl(for_ww_conflict_check);
+ assert(snap_impl);
+ retry++;
+ }
+ assert(snap_impl->GetSequenceNumber() > max);
+ if (snap_impl->GetSequenceNumber() <= max) {
+ throw std::runtime_error(
+ "Snapshot seq " + std::to_string(snap_impl->GetSequenceNumber()) +
+ " after " + std::to_string(retry) +
+ " retries is still less than future_max_evicted_seq_" +
+ std::to_string(max));
+ }
+ }
+ EnhanceSnapshot(snap_impl, min_uncommitted);
+ ROCKS_LOG_DETAILS(
+ db_impl_->immutable_db_options().info_log,
+ "GetSnapshot %" PRIu64 " ww:%" PRIi32 " min_uncommitted: %" PRIu64,
+ snap_impl->GetSequenceNumber(), for_ww_conflict_check, min_uncommitted);
+ TEST_SYNC_POINT("WritePreparedTxnDB::GetSnapshotInternal:end");
+ return snap_impl;
+}
+
+void WritePreparedTxnDB::AdvanceSeqByOne() {
+ // Inserting an empty value will i) let the max evicted entry be published,
+ // i.e., max == last_published, and ii) increase the last published seq to
+ // be one beyond max, i.e., max < last_published.
+ WriteOptions woptions;
+ TransactionOptions txn_options;
+ Transaction* txn0 = BeginTransaction(woptions, txn_options, nullptr);
+ std::hash<std::thread::id> hasher;
+ char name[64];
+ snprintf(name, 64, "txn%" ROCKSDB_PRIszt, hasher(std::this_thread::get_id()));
+ assert(strlen(name) < 64 - 1);
+ Status s = txn0->SetName(name);
+ assert(s.ok());
+ if (s.ok()) {
+ // Without prepare it would simply skip the commit
+ s = txn0->Prepare();
+ }
+ assert(s.ok());
+ if (s.ok()) {
+ s = txn0->Commit();
+ }
+ assert(s.ok());
+ delete txn0;
+}
+
+const std::vector<SequenceNumber> WritePreparedTxnDB::GetSnapshotListFromDB(
+ SequenceNumber max) {
+ ROCKS_LOG_DETAILS(info_log_, "GetSnapshotListFromDB with max %" PRIu64, max);
+ InstrumentedMutexLock dblock(db_impl_->mutex());
+ db_impl_->mutex()->AssertHeld();
+ return db_impl_->snapshots().GetAll(nullptr, max);
+}
+
+void WritePreparedTxnDB::ReleaseSnapshotInternal(
+ const SequenceNumber snap_seq) {
+ // TODO(myabandeh): relaxed should be enough since the synchronization is
+ // already done by snapshots_mutex_ under which this function is called.
+ if (snap_seq <= max_evicted_seq_.load(std::memory_order_acquire)) {
+ // Then this is a rare case in which a transaction did not finish before max
+ // advances. It is expected for a few read-only backup snapshots. For such
+ // snapshots we might have kept around a couple of entries in the
+ // old_commit_map_. Check and do garbage collection if that is the case.
+ bool need_gc = false;
+ {
+ WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD);
+ ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead for %" PRIu64,
+ snap_seq);
+ ReadLock rl(&old_commit_map_mutex_);
+ auto prep_set_entry = old_commit_map_.find(snap_seq);
+ need_gc = prep_set_entry != old_commit_map_.end();
+ }
+ if (need_gc) {
+ WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD);
+ ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead for %" PRIu64,
+ snap_seq);
+ WriteLock wl(&old_commit_map_mutex_);
+ old_commit_map_.erase(snap_seq);
+ old_commit_map_empty_.store(old_commit_map_.empty(),
+ std::memory_order_release);
+ }
+ }
+}
+
+void WritePreparedTxnDB::CleanupReleasedSnapshots(
+ const std::vector<SequenceNumber>& new_snapshots,
+ const std::vector<SequenceNumber>& old_snapshots) {
+ auto newi = new_snapshots.begin();
+ auto oldi = old_snapshots.begin();
+ for (; newi != new_snapshots.end() && oldi != old_snapshots.end();) {
+ assert(*newi >= *oldi); // cannot have new snapshots with lower seq
+ if (*newi == *oldi) { // still not released
+ auto value = *newi;
+ while (newi != new_snapshots.end() && *newi == value) {
+ newi++;
+ }
+ while (oldi != old_snapshots.end() && *oldi == value) {
+ oldi++;
+ }
+ } else {
+ assert(*newi > *oldi); // *oldi is released
+ ReleaseSnapshotInternal(*oldi);
+ oldi++;
+ }
+ }
+ // Everything remaining in old_snapshots has been released and must be
+ // cleaned up
+ for (; oldi != old_snapshots.end(); oldi++) {
+ ReleaseSnapshotInternal(*oldi);
+ }
+}
+
+void WritePreparedTxnDB::UpdateSnapshots(
+ const std::vector<SequenceNumber>& snapshots,
+ const SequenceNumber& version) {
+ ROCKS_LOG_DETAILS(info_log_, "UpdateSnapshots with version %" PRIu64,
+ version);
+ TEST_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:p:start");
+ TEST_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:s:start");
+#ifndef NDEBUG
+ size_t sync_i = 0;
+#endif
+ ROCKS_LOG_DETAILS(info_log_, "snapshots_mutex_ overhead");
+ WriteLock wl(&snapshots_mutex_);
+ snapshots_version_ = version;
+ // We update the list concurrently with the readers.
+ // Both new and old lists are sorted and the new list is a subset of the
+ // previous list plus some new items. Thus if a snapshot repeats in
+ // both new and old lists, it will appear upper in the new list. So if
+ // we simply insert the new snapshots in order, an overwritten item that
+ // is still valid in the new list is either written to the same place in
+ // the array or it is written in a higher place before it gets
+ // overwritten by another item. This guarantees that a reader that reads
+ // the list bottom-up will eventually see a snapshot that repeats in the
+ // update, either before it gets overwritten by the writer or
+ // afterwards.
+ size_t i = 0; + auto it = snapshots.begin(); + for (; it != snapshots.end() && i < SNAPSHOT_CACHE_SIZE; ++it, ++i) { + snapshot_cache_[i].store(*it, std::memory_order_release); + TEST_IDX_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:p:", ++sync_i); + TEST_IDX_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:s:", sync_i); + } +#ifndef NDEBUG + // Release the remaining sync points since they are useless given that the + // reader would also use lock to access snapshots + for (++sync_i; sync_i <= 10; ++sync_i) { + TEST_IDX_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:p:", sync_i); + TEST_IDX_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:s:", sync_i); + } +#endif + snapshots_.clear(); + for (; it != snapshots.end(); ++it) { + // Insert them to a vector that is less efficient to access + // concurrently + snapshots_.push_back(*it); + } + // Update the size at the end. Otherwise a parallel reader might read + // items that are not set yet. + snapshots_total_.store(snapshots.size(), std::memory_order_release); + + // Note: this must be done after the snapshots data structures are updated + // with the new list of snapshots. + CleanupReleasedSnapshots(snapshots, snapshots_all_); + snapshots_all_ = snapshots; + + TEST_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:p:end"); + TEST_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:s:end"); +} + +void WritePreparedTxnDB::CheckAgainstSnapshots(const CommitEntry& evicted) { + TEST_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:p:start"); + TEST_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:s:start"); +#ifndef NDEBUG + size_t sync_i = 0; +#endif + // First check the snapshot cache that is efficient for concurrent access + auto cnt = snapshots_total_.load(std::memory_order_acquire); + // The list might get updated concurrently as we are reading from it. The + // reader should be able to read all the snapshots that are still valid + // after the update. Since the survived snapshots are written in a higher + // place before gets overwritten the reader that reads bottom-up will + // eventully see it. + const bool next_is_larger = true; + // We will set to true if the border line snapshot suggests that. 
+ bool search_larger_list = false; + size_t ip1 = std::min(cnt, SNAPSHOT_CACHE_SIZE); + for (; 0 < ip1; ip1--) { + SequenceNumber snapshot_seq = + snapshot_cache_[ip1 - 1].load(std::memory_order_acquire); + TEST_IDX_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:p:", + ++sync_i); + TEST_IDX_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:s:", sync_i); + if (ip1 == SNAPSHOT_CACHE_SIZE) { // border line snapshot + // snapshot_seq < commit_seq => larger_snapshot_seq <= commit_seq + // then later also continue the search to larger snapshots + search_larger_list = snapshot_seq < evicted.commit_seq; + } + if (!MaybeUpdateOldCommitMap(evicted.prep_seq, evicted.commit_seq, + snapshot_seq, !next_is_larger)) { + break; + } + } +#ifndef NDEBUG + // Release the remaining sync points before accquiring the lock + for (++sync_i; sync_i <= 10; ++sync_i) { + TEST_IDX_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:p:", sync_i); + TEST_IDX_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:s:", sync_i); + } +#endif + TEST_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:p:end"); + TEST_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:s:end"); + if (UNLIKELY(SNAPSHOT_CACHE_SIZE < cnt && search_larger_list)) { + // Then access the less efficient list of snapshots_ + WPRecordTick(TXN_SNAPSHOT_MUTEX_OVERHEAD); + ROCKS_LOG_WARN(info_log_, + "snapshots_mutex_ overhead for <%" PRIu64 ",%" PRIu64 + "> with %" ROCKSDB_PRIszt " snapshots", + evicted.prep_seq, evicted.commit_seq, cnt); + ReadLock rl(&snapshots_mutex_); + // Items could have moved from the snapshots_ to snapshot_cache_ before + // accquiring the lock. To make sure that we do not miss a valid snapshot, + // read snapshot_cache_ again while holding the lock. + for (size_t i = 0; i < SNAPSHOT_CACHE_SIZE; i++) { + SequenceNumber snapshot_seq = + snapshot_cache_[i].load(std::memory_order_acquire); + if (!MaybeUpdateOldCommitMap(evicted.prep_seq, evicted.commit_seq, + snapshot_seq, next_is_larger)) { + break; + } + } + for (auto snapshot_seq_2 : snapshots_) { + if (!MaybeUpdateOldCommitMap(evicted.prep_seq, evicted.commit_seq, + snapshot_seq_2, next_is_larger)) { + break; + } + } + } +} + +bool WritePreparedTxnDB::MaybeUpdateOldCommitMap( + const uint64_t& prep_seq, const uint64_t& commit_seq, + const uint64_t& snapshot_seq, const bool next_is_larger = true) { + // If we do not store an entry in old_commit_map_ we assume it is committed in + // all snapshots. If commit_seq <= snapshot_seq, it is considered already in + // the snapshot so we need not to keep the entry around for this snapshot. + if (commit_seq <= snapshot_seq) { + // continue the search if the next snapshot could be smaller than commit_seq + return !next_is_larger; + } + // then snapshot_seq < commit_seq + if (prep_seq <= snapshot_seq) { // overlapping range + WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD); + ROCKS_LOG_WARN(info_log_, + "old_commit_map_mutex_ overhead for %" PRIu64 + " commit entry: <%" PRIu64 ",%" PRIu64 ">", + snapshot_seq, prep_seq, commit_seq); + WriteLock wl(&old_commit_map_mutex_); + old_commit_map_empty_.store(false, std::memory_order_release); + auto& vec = old_commit_map_[snapshot_seq]; + vec.insert(std::upper_bound(vec.begin(), vec.end(), prep_seq), prep_seq); + // We need to store it once for each overlapping snapshot. Returning true to + // continue the search if there is more overlapping snapshot. 
+ return true;
+ }
+ // continue the search if the next snapshot could be larger than prep_seq
+ return next_is_larger;
+}
+
+WritePreparedTxnDB::~WritePreparedTxnDB() {
+ // At this point there could be running compaction/flush holding a
+ // SnapshotChecker, which holds a pointer back to WritePreparedTxnDB.
+ // Make sure those jobs finished before destructing WritePreparedTxnDB.
+ if (!db_impl_->shutting_down_) {
+ db_impl_->CancelAllBackgroundWork(true /*wait*/);
+ }
+}
+
+void SubBatchCounter::InitWithComp(const uint32_t cf) {
+ auto cmp = comparators_[cf];
+ keys_[cf] = CFKeys(SetComparator(cmp));
+}
+
+void SubBatchCounter::AddKey(const uint32_t cf, const Slice& key) {
+ CFKeys& cf_keys = keys_[cf];
+ if (cf_keys.size() == 0) { // just inserted
+ InitWithComp(cf);
+ }
+ auto it = cf_keys.insert(key);
+ if (it.second == false) { // second is false if an element already existed.
+ batches_++;
+ keys_.clear();
+ InitWithComp(cf);
+ keys_[cf].insert(key);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_prepared_txn_db.h b/src/rocksdb/utilities/transactions/write_prepared_txn_db.h
new file mode 100644
index 000000000..25a382473
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/write_prepared_txn_db.h
@@ -0,0 +1,1125 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <cinttypes>
+#include <mutex>
+#include <queue>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "db/db_iter.h"
+#include "db/pre_release_callback.h"
+#include "db/read_callback.h"
+#include "db/snapshot_checker.h"
+#include "logging/logging.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "util/cast_util.h"
+#include "util/set_comparator.h"
+#include "util/string_util.h"
+#include "utilities/transactions/pessimistic_transaction.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+#include "utilities/transactions/write_prepared_txn.h"
+
+namespace ROCKSDB_NAMESPACE {
+enum SnapshotBackup : bool { kUnbackedByDBSnapshot, kBackedByDBSnapshot };
+
+// A PessimisticTransactionDB that writes data to DB after prepare phase of 2PC.
+// In this way some data in the DB might not be committed. The DB provides
+// mechanisms to tell such data apart from committed data.
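+//
+// A rough usage sketch (for illustration only; `db_path` is a placeholder and
+// error handling is omitted):
+//
+//   Options options;
+//   options.create_if_missing = true;
+//   TransactionDBOptions txn_db_options;
+//   txn_db_options.write_policy = TxnDBWritePolicy::WRITE_PREPARED;
+//   TransactionDB* txn_db = nullptr;
+//   Status s = TransactionDB::Open(options, txn_db_options, db_path, &txn_db);
+//   Transaction* txn = txn_db->BeginTransaction(WriteOptions());
+//   s = txn->SetName("xid1");
+//   s = txn->Put("key", "value");  // buffered in the transaction
+//   s = txn->Prepare();            // data written to the DB at prepare_seq
+//   s = txn->Commit();             // commit marker maps prepare_seq to commit_seq
+//   delete txn;
+//   delete txn_db;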
+class WritePreparedTxnDB : public PessimisticTransactionDB {
+ public:
+ explicit WritePreparedTxnDB(DB* db,
+ const TransactionDBOptions& txn_db_options)
+ : PessimisticTransactionDB(db, txn_db_options),
+ SNAPSHOT_CACHE_BITS(txn_db_options.wp_snapshot_cache_bits),
+ SNAPSHOT_CACHE_SIZE(static_cast<size_t>(1ull << SNAPSHOT_CACHE_BITS)),
+ COMMIT_CACHE_BITS(txn_db_options.wp_commit_cache_bits),
+ COMMIT_CACHE_SIZE(static_cast<size_t>(1ull << COMMIT_CACHE_BITS)),
+ FORMAT(COMMIT_CACHE_BITS) {
+ Init(txn_db_options);
+ }
+
+ explicit WritePreparedTxnDB(StackableDB* db,
+ const TransactionDBOptions& txn_db_options)
+ : PessimisticTransactionDB(db, txn_db_options),
+ SNAPSHOT_CACHE_BITS(txn_db_options.wp_snapshot_cache_bits),
+ SNAPSHOT_CACHE_SIZE(static_cast<size_t>(1ull << SNAPSHOT_CACHE_BITS)),
+ COMMIT_CACHE_BITS(txn_db_options.wp_commit_cache_bits),
+ COMMIT_CACHE_SIZE(static_cast<size_t>(1ull << COMMIT_CACHE_BITS)),
+ FORMAT(COMMIT_CACHE_BITS) {
+ Init(txn_db_options);
+ }
+
+ virtual ~WritePreparedTxnDB();
+
+ virtual Status Initialize(
+ const std::vector<size_t>& compaction_enabled_cf_indices,
+ const std::vector<ColumnFamilyHandle*>& handles) override;
+
+ Transaction* BeginTransaction(const WriteOptions& write_options,
+ const TransactionOptions& txn_options,
+ Transaction* old_txn) override;
+
+ using TransactionDB::Write;
+ Status Write(const WriteOptions& opts, WriteBatch* updates) override;
+
+ // Optimized version of ::Write that receives more optimization requests such
+ // as skip_concurrency_control.
+ using PessimisticTransactionDB::Write;
+ Status Write(const WriteOptions& opts, const TransactionDBWriteOptimizations&,
+ WriteBatch* updates) override;
+
+ // Write the batch to the underlying DB and mark it as committed. Could be
+ // used both directly from TxnDB or through a transaction.
+ Status WriteInternal(const WriteOptions& write_options, WriteBatch* batch,
+ size_t batch_cnt, WritePreparedTxn* txn);
+
+ using DB::Get;
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) override;
+
+ using DB::MultiGet;
+ virtual std::vector<Status> MultiGet(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys,
+ std::vector<std::string>* values) override;
+
+ using DB::NewIterator;
+ virtual Iterator* NewIterator(const ReadOptions& options,
+ ColumnFamilyHandle* column_family) override;
+
+ using DB::NewIterators;
+ virtual Status NewIterators(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) override;
+
+ // Check whether the transaction that wrote the value with sequence number seq
+ // is visible to the snapshot with sequence number snapshot_seq.
+ // Returns true if commit_seq <= snapshot_seq
+ // If the snapshot_seq is already released and snapshot_seq <= max, sets
+ // *snap_released to true and returns true as well.
+ inline bool IsInSnapshot(uint64_t prep_seq, uint64_t snapshot_seq,
+ uint64_t min_uncommitted = kMinUnCommittedSeq,
+ bool* snap_released = nullptr) const {
+ ROCKS_LOG_DETAILS(info_log_,
+ "IsInSnapshot %" PRIu64 " in %" PRIu64
+ " min_uncommitted %" PRIu64,
+ prep_seq, snapshot_seq, min_uncommitted);
+ assert(min_uncommitted >= kMinUnCommittedSeq);
+ // Caller is responsible to initialize snap_released.
+ assert(snap_released == nullptr || *snap_released == false);
+ // Here we try to infer the return value without looking into prepare list.
+ // This would help avoid synchronization over a shared map.
+ // TODO(myabandeh): optimize this.
This sequence of checks must be correct + // but not necessary efficient + if (prep_seq == 0) { + // Compaction will output keys to bottom-level with sequence number 0 if + // it is visible to the earliest snapshot. + ROCKS_LOG_DETAILS( + info_log_, "IsInSnapshot %" PRIu64 " in %" PRIu64 " returns %" PRId32, + prep_seq, snapshot_seq, 1); + return true; + } + if (snapshot_seq < prep_seq) { + // snapshot_seq < prep_seq <= commit_seq => snapshot_seq < commit_seq + ROCKS_LOG_DETAILS( + info_log_, "IsInSnapshot %" PRIu64 " in %" PRIu64 " returns %" PRId32, + prep_seq, snapshot_seq, 0); + return false; + } + if (prep_seq < min_uncommitted) { + ROCKS_LOG_DETAILS(info_log_, + "IsInSnapshot %" PRIu64 " in %" PRIu64 + " returns %" PRId32 + " because of min_uncommitted %" PRIu64, + prep_seq, snapshot_seq, 1, min_uncommitted); + return true; + } + // Commit of delayed prepared has two non-atomic steps: add to commit cache, + // remove from delayed prepared. Our reads from these two is also + // non-atomic. By looking into commit cache first thus we might not find the + // prep_seq neither in commit cache not in delayed_prepared_. To fix that i) + // we check if there was any delayed prepared BEFORE looking into commit + // cache, ii) if there was, we complete the search steps to be these: i) + // commit cache, ii) delayed prepared, commit cache again. In this way if + // the first query to commit cache missed the commit, the 2nd will catch it. + bool was_empty; + SequenceNumber max_evicted_seq_lb, max_evicted_seq_ub; + CommitEntry64b dont_care; + auto indexed_seq = prep_seq % COMMIT_CACHE_SIZE; + size_t repeats = 0; + do { + repeats++; + assert(repeats < 100); + if (UNLIKELY(repeats >= 100)) { + throw std::runtime_error( + "The read was intrupted 100 times by update to max_evicted_seq_. " + "This is unexpected in all setups"); + } + max_evicted_seq_lb = max_evicted_seq_.load(std::memory_order_acquire); + TEST_SYNC_POINT( + "WritePreparedTxnDB::IsInSnapshot:max_evicted_seq_:pause"); + TEST_SYNC_POINT( + "WritePreparedTxnDB::IsInSnapshot:max_evicted_seq_:resume"); + was_empty = delayed_prepared_empty_.load(std::memory_order_acquire); + TEST_SYNC_POINT( + "WritePreparedTxnDB::IsInSnapshot:delayed_prepared_empty_:pause"); + TEST_SYNC_POINT( + "WritePreparedTxnDB::IsInSnapshot:delayed_prepared_empty_:resume"); + CommitEntry cached; + bool exist = GetCommitEntry(indexed_seq, &dont_care, &cached); + TEST_SYNC_POINT("WritePreparedTxnDB::IsInSnapshot:GetCommitEntry:pause"); + TEST_SYNC_POINT("WritePreparedTxnDB::IsInSnapshot:GetCommitEntry:resume"); + if (exist && prep_seq == cached.prep_seq) { + // It is committed and also not evicted from commit cache + ROCKS_LOG_DETAILS( + info_log_, + "IsInSnapshot %" PRIu64 " in %" PRIu64 " returns %" PRId32, + prep_seq, snapshot_seq, cached.commit_seq <= snapshot_seq); + return cached.commit_seq <= snapshot_seq; + } + // else it could be committed but not inserted in the map which could + // happen after recovery, or it could be committed and evicted by another + // commit, or never committed. 
+ + // At this point we don't know if it was committed or it is still prepared + max_evicted_seq_ub = max_evicted_seq_.load(std::memory_order_acquire); + if (UNLIKELY(max_evicted_seq_lb != max_evicted_seq_ub)) { + continue; + } + // Note: max_evicted_seq_ when we did GetCommitEntry <= max_evicted_seq_ub + if (max_evicted_seq_ub < prep_seq) { + // Not evicted from cache and also not present, so must be still + // prepared + ROCKS_LOG_DETAILS(info_log_, + "IsInSnapshot %" PRIu64 " in %" PRIu64 + " returns %" PRId32, + prep_seq, snapshot_seq, 0); + return false; + } + TEST_SYNC_POINT("WritePreparedTxnDB::IsInSnapshot:prepared_mutex_:pause"); + TEST_SYNC_POINT( + "WritePreparedTxnDB::IsInSnapshot:prepared_mutex_:resume"); + if (!was_empty) { + // We should not normally reach here + WPRecordTick(TXN_PREPARE_MUTEX_OVERHEAD); + ReadLock rl(&prepared_mutex_); + ROCKS_LOG_WARN( + info_log_, "prepared_mutex_ overhead %" PRIu64 " for %" PRIu64, + static_cast(delayed_prepared_.size()), prep_seq); + if (delayed_prepared_.find(prep_seq) != delayed_prepared_.end()) { + // This is the order: 1) delayed_prepared_commits_ update, 2) publish + // 3) delayed_prepared_ clean up. So check if it is the case of a late + // clenaup. + auto it = delayed_prepared_commits_.find(prep_seq); + if (it == delayed_prepared_commits_.end()) { + // Then it is not committed yet + ROCKS_LOG_DETAILS(info_log_, + "IsInSnapshot %" PRIu64 " in %" PRIu64 + " returns %" PRId32, + prep_seq, snapshot_seq, 0); + return false; + } else { + ROCKS_LOG_DETAILS(info_log_, + "IsInSnapshot %" PRIu64 " in %" PRIu64 + " commit: %" PRIu64 " returns %" PRId32, + prep_seq, snapshot_seq, it->second, + snapshot_seq <= it->second); + return it->second <= snapshot_seq; + } + } else { + // 2nd query to commit cache. Refer to was_empty comment above. + exist = GetCommitEntry(indexed_seq, &dont_care, &cached); + if (exist && prep_seq == cached.prep_seq) { + ROCKS_LOG_DETAILS( + info_log_, + "IsInSnapshot %" PRIu64 " in %" PRIu64 " returns %" PRId32, + prep_seq, snapshot_seq, cached.commit_seq <= snapshot_seq); + return cached.commit_seq <= snapshot_seq; + } + max_evicted_seq_ub = max_evicted_seq_.load(std::memory_order_acquire); + } + } + } while (UNLIKELY(max_evicted_seq_lb != max_evicted_seq_ub)); + // When advancing max_evicted_seq_, we move older entires from prepared to + // delayed_prepared_. Also we move evicted entries from commit cache to + // old_commit_map_ if it overlaps with any snapshot. Since prep_seq <= + // max_evicted_seq_, we have three cases: i) in delayed_prepared_, ii) in + // old_commit_map_, iii) committed with no conflict with any snapshot. Case + // (i) delayed_prepared_ is checked above + if (max_evicted_seq_ub < snapshot_seq) { // then (ii) cannot be the case + // only (iii) is the case: committed + // commit_seq <= max_evicted_seq_ < snapshot_seq => commit_seq < + // snapshot_seq + ROCKS_LOG_DETAILS( + info_log_, "IsInSnapshot %" PRIu64 " in %" PRIu64 " returns %" PRId32, + prep_seq, snapshot_seq, 1); + return true; + } + // else (ii) might be the case: check the commit data saved for this + // snapshot. If there was no overlapping commit entry, then it is committed + // with a commit_seq lower than any live snapshot, including snapshot_seq. + if (old_commit_map_empty_.load(std::memory_order_acquire)) { + ROCKS_LOG_DETAILS(info_log_, + "IsInSnapshot %" PRIu64 " in %" PRIu64 + " returns %" PRId32 " released=1", + prep_seq, snapshot_seq, 0); + assert(snap_released); + // This snapshot is not valid anymore. 
We cannot tell if prep_seq is + // committed before or after the snapshot. Return true but also set + // snap_released to true. + *snap_released = true; + return true; + } + { + // We should not normally reach here unless sapshot_seq is old. This is a + // rare case and it is ok to pay the cost of mutex ReadLock for such old, + // reading transactions. + WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD); + ReadLock rl(&old_commit_map_mutex_); + auto prep_set_entry = old_commit_map_.find(snapshot_seq); + bool found = prep_set_entry != old_commit_map_.end(); + if (found) { + auto& vec = prep_set_entry->second; + found = std::binary_search(vec.begin(), vec.end(), prep_seq); + } else { + // coming from compaction + ROCKS_LOG_DETAILS(info_log_, + "IsInSnapshot %" PRIu64 " in %" PRIu64 + " returns %" PRId32 " released=1", + prep_seq, snapshot_seq, 0); + // This snapshot is not valid anymore. We cannot tell if prep_seq is + // committed before or after the snapshot. Return true but also set + // snap_released to true. + assert(snap_released); + *snap_released = true; + return true; + } + + if (!found) { + ROCKS_LOG_DETAILS(info_log_, + "IsInSnapshot %" PRIu64 " in %" PRIu64 + " returns %" PRId32, + prep_seq, snapshot_seq, 1); + return true; + } + } + // (ii) it the case: it is committed but after the snapshot_seq + ROCKS_LOG_DETAILS( + info_log_, "IsInSnapshot %" PRIu64 " in %" PRIu64 " returns %" PRId32, + prep_seq, snapshot_seq, 0); + return false; + } + + // Add the transaction with prepare sequence seq to the prepared list. + // Note: must be called serially with increasing seq on each call. + // locked is true if prepared_mutex_ is already locked. + void AddPrepared(uint64_t seq, bool locked = false); + // Check if any of the prepared txns are less than new max_evicted_seq_. Must + // be called with prepared_mutex_ write locked. + void CheckPreparedAgainstMax(SequenceNumber new_max, bool locked); + // Remove the transaction with prepare sequence seq from the prepared list + void RemovePrepared(const uint64_t seq, const size_t batch_cnt = 1); + // Add the transaction with prepare sequence prepare_seq and commit sequence + // commit_seq to the commit map. loop_cnt is to detect infinite loops. + // Note: must be called serially. + void AddCommitted(uint64_t prepare_seq, uint64_t commit_seq, + uint8_t loop_cnt = 0); + + struct CommitEntry { + uint64_t prep_seq; + uint64_t commit_seq; + CommitEntry() : prep_seq(0), commit_seq(0) {} + CommitEntry(uint64_t ps, uint64_t cs) : prep_seq(ps), commit_seq(cs) {} + bool operator==(const CommitEntry& rhs) const { + return prep_seq == rhs.prep_seq && commit_seq == rhs.commit_seq; + } + }; + + struct CommitEntry64bFormat { + explicit CommitEntry64bFormat(size_t index_bits) + : INDEX_BITS(index_bits), + PREP_BITS(static_cast(64 - PAD_BITS - INDEX_BITS)), + COMMIT_BITS(static_cast(64 - PREP_BITS)), + COMMIT_FILTER(static_cast((1ull << COMMIT_BITS) - 1)), + DELTA_UPPERBOUND(static_cast((1ull << COMMIT_BITS))) {} + // Number of higher bits of a sequence number that is not used. They are + // used to encode the value type, ... + const size_t PAD_BITS = static_cast(8); + // Number of lower bits from prepare seq that can be skipped as they are + // implied by the index of the entry in the array + const size_t INDEX_BITS; + // Number of bits we use to encode the prepare seq + const size_t PREP_BITS; + // Number of bits we use to encode the commit seq. 
+ const size_t COMMIT_BITS; + // Filter to encode/decode commit seq + const uint64_t COMMIT_FILTER; + // The value of commit_seq - prepare_seq + 1 must be less than this bound + const uint64_t DELTA_UPPERBOUND; + }; + + // Prepare Seq (64 bits) = PAD ... PAD PREP PREP ... PREP INDEX INDEX ... + // INDEX Delta Seq (64 bits) = 0 0 0 0 0 0 0 0 0 0 0 0 DELTA DELTA ... + // DELTA DELTA Encoded Value = PREP PREP .... PREP PREP DELTA DELTA + // ... DELTA DELTA PAD: first bits of a seq that is reserved for tagging and + // hence ignored PREP/INDEX: the used bits in a prepare seq number INDEX: the + // bits that do not have to be encoded (will be provided externally) DELTA: + // prep seq - commit seq + 1 Number of DELTA bits should be equal to number of + // index bits + PADs + struct CommitEntry64b { + constexpr CommitEntry64b() noexcept : rep_(0) {} + + CommitEntry64b(const CommitEntry& entry, const CommitEntry64bFormat& format) + : CommitEntry64b(entry.prep_seq, entry.commit_seq, format) {} + + CommitEntry64b(const uint64_t ps, const uint64_t cs, + const CommitEntry64bFormat& format) { + assert(ps < static_cast( + (1ull << (format.PREP_BITS + format.INDEX_BITS)))); + assert(ps <= cs); + uint64_t delta = cs - ps + 1; // make initialized delta always >= 1 + // zero is reserved for uninitialized entries + assert(0 < delta); + assert(delta < format.DELTA_UPPERBOUND); + if (delta >= format.DELTA_UPPERBOUND) { + throw std::runtime_error( + "commit_seq >> prepare_seq. The allowed distance is " + + std::to_string(format.DELTA_UPPERBOUND) + " commit_seq is " + + std::to_string(cs) + " prepare_seq is " + std::to_string(ps)); + } + rep_ = (ps << format.PAD_BITS) & ~format.COMMIT_FILTER; + rep_ = rep_ | delta; + } + + // Return false if the entry is empty + bool Parse(const uint64_t indexed_seq, CommitEntry* entry, + const CommitEntry64bFormat& format) { + uint64_t delta = rep_ & format.COMMIT_FILTER; + // zero is reserved for uninitialized entries + assert(delta < static_cast((1ull << format.COMMIT_BITS))); + if (delta == 0) { + return false; // initialized entry would have non-zero delta + } + + assert(indexed_seq < static_cast((1ull << format.INDEX_BITS))); + uint64_t prep_up = rep_ & ~format.COMMIT_FILTER; + prep_up >>= format.PAD_BITS; + const uint64_t& prep_low = indexed_seq; + entry->prep_seq = prep_up | prep_low; + + entry->commit_seq = entry->prep_seq + delta - 1; + return true; + } + + private: + uint64_t rep_; + }; + + // Struct to hold ownership of snapshot and read callback for cleanup. + struct IteratorState; + + std::shared_ptr> GetCFComparatorMap() { + return cf_map_; + } + std::shared_ptr> GetCFHandleMap() { + return handle_map_; + } + void UpdateCFComparatorMap( + const std::vector& handles) override; + void UpdateCFComparatorMap(ColumnFamilyHandle* handle) override; + + virtual const Snapshot* GetSnapshot() override; + SnapshotImpl* GetSnapshotInternal(bool for_ww_conflict_check); + + protected: + virtual Status VerifyCFOptions( + const ColumnFamilyOptions& cf_options) override; + // Assign the min and max sequence numbers for reading from the db. A seq > + // max is not valid, and a seq < min is valid, and a min <= seq < max requires + // further checking. Normally max is defined by the snapshot and min is by + // minimum uncommitted seq. + inline SnapshotBackup AssignMinMaxSeqs(const Snapshot* snapshot, + SequenceNumber* min, + SequenceNumber* max); + // Validate is a snapshot sequence number is still valid based on the latest + // db status. 
backed_by_snapshot specifies if the number is baked by an actual + // snapshot object. order specified the memory order with which we load the + // atomic variables: relax is enough for the default since we care about last + // value seen by same thread. + inline bool ValidateSnapshot( + const SequenceNumber snap_seq, const SnapshotBackup backed_by_snapshot, + std::memory_order order = std::memory_order_relaxed); + // Get a dummy snapshot that refers to kMaxSequenceNumber + Snapshot* GetMaxSnapshot() { return &dummy_max_snapshot_; } + + bool ShouldRollbackWithSingleDelete(ColumnFamilyHandle* column_family, + const Slice& key) { + return rollback_deletion_type_callback_ + ? rollback_deletion_type_callback_(this, column_family, key) + : false; + } + + std::function + rollback_deletion_type_callback_; + + private: + friend class AddPreparedCallback; + friend class PreparedHeap_BasicsTest_Test; + friend class PreparedHeap_Concurrent_Test; + friend class PreparedHeap_EmptyAtTheEnd_Test; + friend class SnapshotConcurrentAccessTest_SnapshotConcurrentAccess_Test; + friend class WritePreparedCommitEntryPreReleaseCallback; + friend class WritePreparedTransactionTestBase; + friend class WritePreparedTxn; + friend class WritePreparedTxnDBMock; + friend class WritePreparedTransactionTest_AddPreparedBeforeMax_Test; + friend class WritePreparedTransactionTest_AdvanceMaxEvictedSeqBasic_Test; + friend class + WritePreparedTransactionTest_AdvanceMaxEvictedSeqWithDuplicates_Test; + friend class WritePreparedTransactionTest_AdvanceSeqByOne_Test; + friend class WritePreparedTransactionTest_BasicRecovery_Test; + friend class WritePreparedTransactionTest_CheckAgainstSnapshots_Test; + friend class WritePreparedTransactionTest_CleanupSnapshotEqualToMax_Test; + friend class WritePreparedTransactionTest_ConflictDetectionAfterRecovery_Test; + friend class WritePreparedTransactionTest_CommitMap_Test; + friend class WritePreparedTransactionTest_DoubleSnapshot_Test; + friend class WritePreparedTransactionTest_IsInSnapshotEmptyMap_Test; + friend class WritePreparedTransactionTest_IsInSnapshotReleased_Test; + friend class WritePreparedTransactionTest_IsInSnapshot_Test; + friend class WritePreparedTransactionTest_NewSnapshotLargerThanMax_Test; + friend class WritePreparedTransactionTest_MaxCatchupWithNewSnapshot_Test; + friend class WritePreparedTransactionTest_MaxCatchupWithUnbackedSnapshot_Test; + friend class + WritePreparedTransactionTest_NonAtomicCommitOfDelayedPrepared_Test; + friend class + WritePreparedTransactionTest_NonAtomicUpdateOfDelayedPrepared_Test; + friend class WritePreparedTransactionTest_NonAtomicUpdateOfMaxEvictedSeq_Test; + friend class WritePreparedTransactionTest_OldCommitMapGC_Test; + friend class WritePreparedTransactionTest_Rollback_Test; + friend class WritePreparedTransactionTest_SmallestUnCommittedSeq_Test; + friend class WriteUnpreparedTxn; + friend class WriteUnpreparedTxnDB; + friend class WriteUnpreparedTransactionTest_RecoveryTest_Test; + friend class MultiOpsTxnsStressTest; + + void Init(const TransactionDBOptions& txn_db_opts); + + void WPRecordTick(uint32_t ticker_type) const { + RecordTick(db_impl_->immutable_db_options_.statistics.get(), ticker_type); + } + + // A heap with the amortized O(1) complexity for erase. It uses one extra heap + // to keep track of erased entries that are not yet on top of the main heap. + class PreparedHeap { + // The mutex is required for push and pop from PreparedHeap. ::erase will + // use external synchronization via prepared_mutex_. 
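+ // Illustrative behavior (hypothetical values): after push(5), push(8),
+ // push(11), a call to erase(8) only records 8 in erased_heap_ because 8 is
+ // not on top; a later pop() removes 5 and then discards the deferred 8,
+ // leaving top() == 11.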
+ port::Mutex push_pop_mutex_;
+ std::deque<uint64_t> heap_;
+ std::priority_queue<uint64_t, std::vector<uint64_t>, std::greater<uint64_t>>
+ erased_heap_;
+ std::atomic<uint64_t> heap_top_ = {kMaxSequenceNumber};
+ // True when testing crash recovery
+ bool TEST_CRASH_ = false;
+ friend class WritePreparedTxnDB;
+
+ public:
+ ~PreparedHeap() {
+ if (!TEST_CRASH_) {
+ assert(heap_.empty());
+ assert(erased_heap_.empty());
+ }
+ }
+ port::Mutex* push_pop_mutex() { return &push_pop_mutex_; }
+
+ inline bool empty() { return top() == kMaxSequenceNumber; }
+ // Returns kMaxSequenceNumber if empty() and the smallest otherwise.
+ inline uint64_t top() { return heap_top_.load(std::memory_order_acquire); }
+ inline void push(uint64_t v) {
+ push_pop_mutex_.AssertHeld();
+ if (heap_.empty()) {
+ heap_top_.store(v, std::memory_order_release);
+ } else {
+ assert(heap_top_.load() < v);
+ }
+ heap_.push_back(v);
+ }
+ void pop(bool locked = false) {
+ if (!locked) {
+ push_pop_mutex()->Lock();
+ }
+ push_pop_mutex_.AssertHeld();
+ heap_.pop_front();
+ while (!heap_.empty() && !erased_heap_.empty() &&
+ // heap_.top() > erased_heap_.top() could happen if we have erased
+ // a non-existent entry. Ideally the user should not do that but we
+ // should be resilient against it.
+ heap_.front() >= erased_heap_.top()) {
+ if (heap_.front() == erased_heap_.top()) {
+ heap_.pop_front();
+ }
+ uint64_t erased __attribute__((__unused__));
+ erased = erased_heap_.top();
+ erased_heap_.pop();
+ // No duplicate prepare sequence numbers
+ assert(erased_heap_.empty() || erased_heap_.top() != erased);
+ }
+ while (heap_.empty() && !erased_heap_.empty()) {
+ erased_heap_.pop();
+ }
+ heap_top_.store(!heap_.empty() ? heap_.front() : kMaxSequenceNumber,
+ std::memory_order_release);
+ if (!locked) {
+ push_pop_mutex()->Unlock();
+ }
+ }
+ // Concurrent calls need external synchronization. It is safe to be called
+ // concurrently with push and pop though.
+ void erase(uint64_t seq) {
+ if (!empty()) {
+ auto top_seq = top();
+ if (seq < top_seq) {
+ // Already popped, ignore it.
+ } else if (top_seq == seq) {
+ pop();
+#ifndef NDEBUG
+ MutexLock ml(push_pop_mutex());
+ assert(heap_.empty() || heap_.front() != seq);
+#endif
+ } else { // top() > seq
+ // Down the heap, remember to pop it later
+ erased_heap_.push(seq);
+ }
+ }
+ }
+ };
+
+ void TEST_Crash() override { prepared_txns_.TEST_CRASH_ = true; }
+
+ // Get the commit entry with index indexed_seq from the commit table. It
+ // returns true if such entry exists.
+ bool GetCommitEntry(const uint64_t indexed_seq, CommitEntry64b* entry_64b,
+ CommitEntry* entry) const;
+
+ // Rewrite the entry with the index indexed_seq in the commit table with the
+ // commit entry <prepare_seq, commit_seq>. If the rewrite results in eviction,
+ // sets the evicted_entry and returns true.
+ bool AddCommitEntry(const uint64_t indexed_seq, const CommitEntry& new_entry,
+ CommitEntry* evicted_entry);
+
+ // Rewrite the entry with the index indexed_seq in the commit table with the
+ // commit entry new_entry only if the existing entry matches the
+ // expected_entry. Returns false otherwise.
+ bool ExchangeCommitEntry(const uint64_t indexed_seq,
+ CommitEntry64b& expected_entry,
+ const CommitEntry& new_entry);
+
+ // Increase max_evicted_seq_ from the previous value prev_max to the new
+ // value. This also involves taking care of prepared txns that are not
+ // committed before new_max, as well as updating the list of live snapshots at
+ // the time of updating the max. Thread-safety: this function can be called
+ // concurrently.
The concurrent invocations of this function is equivalent to + // a serial invocation in which the last invocation is the one with the + // largest new_max value. + void AdvanceMaxEvictedSeq(const SequenceNumber& prev_max, + const SequenceNumber& new_max); + + inline SequenceNumber SmallestUnCommittedSeq() { + // Note: We have two lists to look into, but for performance reasons they + // are not read atomically. Since CheckPreparedAgainstMax copies the entry + // to delayed_prepared_ before removing it from prepared_txns_, to ensure + // that a prepared entry will not go unmissed, we look into them in opposite + // order: first read prepared_txns_ and then delayed_prepared_. + + // This must be called before calling ::top. This is because the concurrent + // thread would call ::RemovePrepared before updating + // GetLatestSequenceNumber(). Reading then in opposite order here guarantees + // that the ::top that we read would be lower the ::top if we had otherwise + // update/read them atomically. + auto next_prepare = db_impl_->GetLatestSequenceNumber() + 1; + auto min_prepare = prepared_txns_.top(); + // Since we update the prepare_heap always from the main write queue via + // PreReleaseCallback, the prepared_txns_.top() indicates the smallest + // prepared data in 2pc transactions. For non-2pc transactions that are + // written in two steps, we also update prepared_txns_ at the first step + // (via the same mechanism) so that their uncommitted data is reflected in + // SmallestUnCommittedSeq. + if (!delayed_prepared_empty_.load()) { + ReadLock rl(&prepared_mutex_); + if (!delayed_prepared_.empty()) { + return *delayed_prepared_.begin(); + } + } + bool empty = min_prepare == kMaxSequenceNumber; + if (empty) { + // Since GetLatestSequenceNumber is updated + // after prepared_txns_ are, the value of GetLatestSequenceNumber would + // reflect any uncommitted data that is not added to prepared_txns_ yet. + // Otherwise, if there is no concurrent txn, this value simply reflects + // that latest value in the memtable. + return next_prepare; + } else { + return std::min(min_prepare, next_prepare); + } + } + + // Enhance the snapshot object by recording in it the smallest uncommitted seq + inline void EnhanceSnapshot(SnapshotImpl* snapshot, + SequenceNumber min_uncommitted) { + assert(snapshot); + assert(min_uncommitted <= snapshot->number_ + 1); + snapshot->min_uncommitted_ = min_uncommitted; + } + + virtual const std::vector GetSnapshotListFromDB( + SequenceNumber max); + + // Will be called by the public ReleaseSnapshot method. Does the maintenance + // internal to WritePreparedTxnDB + void ReleaseSnapshotInternal(const SequenceNumber snap_seq); + + // Update the list of snapshots corresponding to the soon-to-be-updated + // max_evicted_seq_. Thread-safety: this function can be called concurrently. + // The concurrent invocations of this function is equivalent to a serial + // invocation in which the last invocation is the one with the largest + // version value. + void UpdateSnapshots(const std::vector& snapshots, + const SequenceNumber& version); + // Check the new list of new snapshots against the old one to see if any of + // the snapshots are released and to do the cleanup for the released snapshot. + void CleanupReleasedSnapshots( + const std::vector& new_snapshots, + const std::vector& old_snapshots); + + // Check an evicted entry against live snapshots to see if it should be kept + // around or it can be safely discarded (and hence assume committed for all + // snapshots). 
Thread-safety: this function can be called concurrently. If it
+ // is called concurrently with multiple UpdateSnapshots, the result is the
+ // same as checking the intersection of the snapshot list before updates with
+ // the snapshot list of all the concurrent updates.
+ void CheckAgainstSnapshots(const CommitEntry& evicted);
+
+ // Add a new entry to old_commit_map_ if prep_seq <= snapshot_seq <
+ // commit_seq. Return false if checking the next snapshot(s) is not needed.
+ // This is the case if none of the next snapshots could satisfy the condition.
+ // next_is_larger: the next snapshot will be a larger value
+ bool MaybeUpdateOldCommitMap(const uint64_t& prep_seq,
+ const uint64_t& commit_seq,
+ const uint64_t& snapshot_seq,
+ const bool next_is_larger);
+
+ // A trick to increase the last visible sequence number by one and also wait
+ // for the in-flight commits to be visible.
+ void AdvanceSeqByOne();
+
+ // The list of live snapshots at the last time that max_evicted_seq_ advanced.
+ // The list is stored in two data structures: in snapshot_cache_ that is
+ // efficient for concurrent reads, and in snapshots_ if the data does not fit
+ // into snapshot_cache_. The total number of snapshots in the two lists
+ std::atomic<size_t> snapshots_total_ = {};
+ // The list sorted in ascending order. Thread-safety for writes is provided
+ // with snapshots_mutex_ and concurrent reads are safe due to std::atomic for
+ // each entry. In x86_64 architecture such reads are compiled to simple read
+ // instructions.
+ const size_t SNAPSHOT_CACHE_BITS;
+ const size_t SNAPSHOT_CACHE_SIZE;
+ std::unique_ptr<std::atomic<SequenceNumber>[]> snapshot_cache_;
+ // 2nd list for storing snapshots. The list sorted in ascending order.
+ // Thread-safety is provided with snapshots_mutex_.
+ std::vector<SequenceNumber> snapshots_;
+ // The list of all snapshots: snapshots_ + snapshot_cache_. This list, although
+ // redundant, simplifies CleanupOldSnapshots implementation.
+ // Thread-safety is provided with snapshots_mutex_.
+ std::vector<SequenceNumber> snapshots_all_;
+ // The version of the latest list of snapshots. This can be used to avoid
+ // rewriting a list that is concurrently updated with a more recent version.
+ SequenceNumber snapshots_version_ = 0;
+
+ // A heap of prepared transactions. Thread-safety is provided with
+ // prepared_mutex_.
+ PreparedHeap prepared_txns_;
+ const size_t COMMIT_CACHE_BITS;
+ const size_t COMMIT_CACHE_SIZE;
+ const CommitEntry64bFormat FORMAT;
+ // commit_cache_ must be initialized to zero to tell apart an empty index from
+ // a filled one. Thread-safety is provided with commit_cache_mutex_.
+ std::unique_ptr<std::atomic<CommitEntry64b>[]> commit_cache_;
+ // The largest evicted *commit* sequence number from the commit_cache_. If a
+ // seq is smaller than max_evicted_seq_ it might or might not be present in
+ // commit_cache_. So commit_cache_ must first be checked before consulting
+ // with max_evicted_seq_.
+ std::atomic<uint64_t> max_evicted_seq_ = {};
+ // Order: 1) update future_max_evicted_seq_ = new_max, 2)
+ // GetSnapshotListFromDB(new_max), 3) max_evicted_seq_ = new_max. Since
+ // GetSnapshotInternal guarantees that the snapshot seq is larger than
+ // future_max_evicted_seq_, this guarantees that if a snapshot is not larger
+ // than max, it has already been looked at via a GetSnapshotListFromDB(new_max).
+ std::atomic<uint64_t> future_max_evicted_seq_ = {};
+ // Advance max_evicted_seq_ by this value each time it needs an update. The
+ // larger the value, the less frequent advances we would have.
We do not want
+ // it to be too large either as it would cause stalls by doing too much
+ // maintenance work under the lock.
+ size_t INC_STEP_FOR_MAX_EVICTED = 1;
+ // A map from old snapshots (expected to be used by a few read-only txns) to
+ // prepared sequence numbers of the evicted entries from commit_cache_ that
+ // overlap with such snapshot. These are the prepared sequence numbers that
+ // the snapshot, to which they are mapped, cannot assume to be committed just
+ // because it is no longer in the commit_cache_. The vector must be sorted
+ // after each update.
+ // Thread-safety is provided with old_commit_map_mutex_.
+ std::map<SequenceNumber, std::vector<SequenceNumber>> old_commit_map_;
+ // A set of long-running prepared transactions that are not finished by the
+ // time max_evicted_seq_ advances their sequence number. This is expected to
+ // be empty normally. Thread-safety is provided with prepared_mutex_.
+ std::set<uint64_t> delayed_prepared_;
+ // Commit of a delayed prepared: 1) update commit cache, 2) update
+ // delayed_prepared_commits_, 3) publish seq, 4) clean up delayed_prepared_.
+ // delayed_prepared_commits_ will help us tell apart the unprepared txns from
+ // the ones that are committed but not cleaned up yet.
+ std::unordered_map<SequenceNumber, SequenceNumber> delayed_prepared_commits_;
+ // Update when delayed_prepared_.empty() changes. Expected to be true
+ // normally.
+ std::atomic<bool> delayed_prepared_empty_ = {true};
+ // Update when old_commit_map_.empty() changes. Expected to be true normally.
+ std::atomic<bool> old_commit_map_empty_ = {true};
+ mutable port::RWMutex prepared_mutex_;
+ mutable port::RWMutex old_commit_map_mutex_;
+ mutable port::RWMutex commit_cache_mutex_;
+ mutable port::RWMutex snapshots_mutex_;
+ // A cache of the cf comparators
+ // Thread safety: since it is a const it is safe to read it concurrently
+ std::shared_ptr<std::map<uint32_t, const Comparator*>> cf_map_;
+ // A cache of the cf handles
+ // Thread safety: since the handles are read-only objects, it is safe to read
+ // the map concurrently
+ std::shared_ptr<std::map<uint32_t, ColumnFamilyHandle*>> handle_map_;
+ // A dummy snapshot object that refers to kMaxSequenceNumber
+ SnapshotImpl dummy_max_snapshot_;
+};
+
+class WritePreparedTxnReadCallback : public ReadCallback {
+ public:
+ WritePreparedTxnReadCallback(WritePreparedTxnDB* db, SequenceNumber snapshot)
+ : ReadCallback(snapshot),
+ db_(db),
+ backed_by_snapshot_(kBackedByDBSnapshot) {}
+ WritePreparedTxnReadCallback(WritePreparedTxnDB* db, SequenceNumber snapshot,
+ SequenceNumber min_uncommitted,
+ SnapshotBackup backed_by_snapshot)
+ : ReadCallback(snapshot, min_uncommitted),
+ db_(db),
+ backed_by_snapshot_(backed_by_snapshot) {
+ (void)backed_by_snapshot_; // to silence unused private field warning
+ }
+
+ virtual ~WritePreparedTxnReadCallback() {
+ // If it is not backed by snapshot, the caller must check validity
+ assert(valid_checked_ || backed_by_snapshot_ == kBackedByDBSnapshot);
+ }
+
+ // Will be called to see if the seq number is visible; if not it moves on to
+ // the next seq number.
+ inline virtual bool IsVisibleFullCheck(SequenceNumber seq) override { + auto snapshot = max_visible_seq_; + bool snap_released = false; + auto ret = + db_->IsInSnapshot(seq, snapshot, min_uncommitted_, &snap_released); + assert(!snap_released || backed_by_snapshot_ == kUnbackedByDBSnapshot); + snap_released_ |= snap_released; + return ret; + } + + inline bool valid() { + valid_checked_ = true; + return snap_released_ == false; + } + + // TODO(myabandeh): override Refresh when Iterator::Refresh is supported + private: + WritePreparedTxnDB* db_; + // Whether max_visible_seq_ is backed by a snapshot + const SnapshotBackup backed_by_snapshot_; + bool snap_released_ = false; + // Safety check to ensure that the caller has checked invalid statuses + bool valid_checked_ = false; +}; + +class AddPreparedCallback : public PreReleaseCallback { + public: + AddPreparedCallback(WritePreparedTxnDB* db, DBImpl* db_impl, + size_t sub_batch_cnt, bool two_write_queues, + bool first_prepare_batch) + : db_(db), + db_impl_(db_impl), + sub_batch_cnt_(sub_batch_cnt), + two_write_queues_(two_write_queues), + first_prepare_batch_(first_prepare_batch) { + (void)two_write_queues_; // to silence unused private field warning + } + virtual Status Callback(SequenceNumber prepare_seq, + bool is_mem_disabled __attribute__((__unused__)), + uint64_t log_number, size_t index, + size_t total) override { + assert(index < total); + // To reduce the cost of lock acquisition competing with the concurrent + // prepare requests, lock on the first callback and unlock on the last. + const bool do_lock = !two_write_queues_ || index == 0; + const bool do_unlock = !two_write_queues_ || index + 1 == total; + // Always Prepare from the main queue + assert(!two_write_queues_ || !is_mem_disabled); // implies the 1st queue + TEST_SYNC_POINT("AddPreparedCallback::AddPrepared::begin:pause"); + TEST_SYNC_POINT("AddPreparedCallback::AddPrepared::begin:resume"); + if (do_lock) { + db_->prepared_txns_.push_pop_mutex()->Lock(); + } + const bool kLocked = true; + for (size_t i = 0; i < sub_batch_cnt_; i++) { + db_->AddPrepared(prepare_seq + i, kLocked); + } + if (do_unlock) { + db_->prepared_txns_.push_pop_mutex()->Unlock(); + } + TEST_SYNC_POINT("AddPreparedCallback::AddPrepared::end"); + if (first_prepare_batch_) { + assert(log_number != 0); + db_impl_->logs_with_prep_tracker()->MarkLogAsContainingPrepSection( + log_number); + } + return Status::OK(); + } + + private: + WritePreparedTxnDB* db_; + DBImpl* db_impl_; + size_t sub_batch_cnt_; + bool two_write_queues_; + // It is 2PC and this is the first prepare batch. Always the case in 2PC + // unless it is WriteUnPrepared. + bool first_prepare_batch_; +}; + +class WritePreparedCommitEntryPreReleaseCallback : public PreReleaseCallback { + public: + // includes_data indicates that the commit also writes non-empty + // CommitTimeWriteBatch to memtable, which needs to be committed separately. 
+ WritePreparedCommitEntryPreReleaseCallback( + WritePreparedTxnDB* db, DBImpl* db_impl, SequenceNumber prep_seq, + size_t prep_batch_cnt, size_t data_batch_cnt = 0, + SequenceNumber aux_seq = kMaxSequenceNumber, size_t aux_batch_cnt = 0) + : db_(db), + db_impl_(db_impl), + prep_seq_(prep_seq), + prep_batch_cnt_(prep_batch_cnt), + data_batch_cnt_(data_batch_cnt), + includes_data_(data_batch_cnt_ > 0), + aux_seq_(aux_seq), + aux_batch_cnt_(aux_batch_cnt), + includes_aux_batch_(aux_batch_cnt > 0) { + assert((prep_batch_cnt_ > 0) != (prep_seq == kMaxSequenceNumber)); // xor + assert(prep_batch_cnt_ > 0 || data_batch_cnt_ > 0); + assert((aux_batch_cnt_ > 0) != (aux_seq == kMaxSequenceNumber)); // xor + } + + virtual Status Callback(SequenceNumber commit_seq, + bool is_mem_disabled __attribute__((__unused__)), + uint64_t, size_t /*index*/, + size_t /*total*/) override { + // Always commit from the 2nd queue + assert(!db_impl_->immutable_db_options().two_write_queues || + is_mem_disabled); + assert(includes_data_ || prep_seq_ != kMaxSequenceNumber); + // Data batch is what accompanied with the commit marker and affects the + // last seq in the commit batch. + const uint64_t last_commit_seq = LIKELY(data_batch_cnt_ <= 1) + ? commit_seq + : commit_seq + data_batch_cnt_ - 1; + if (prep_seq_ != kMaxSequenceNumber) { + for (size_t i = 0; i < prep_batch_cnt_; i++) { + db_->AddCommitted(prep_seq_ + i, last_commit_seq); + } + } // else there was no prepare phase + if (includes_aux_batch_) { + for (size_t i = 0; i < aux_batch_cnt_; i++) { + db_->AddCommitted(aux_seq_ + i, last_commit_seq); + } + } + if (includes_data_) { + assert(data_batch_cnt_); + // Commit the data that is accompanied with the commit request + for (size_t i = 0; i < data_batch_cnt_; i++) { + // For commit seq of each batch use the commit seq of the last batch. + // This would make debugging easier by having all the batches having + // the same sequence number. + db_->AddCommitted(commit_seq + i, last_commit_seq); + } + } + if (db_impl_->immutable_db_options().two_write_queues) { + assert(is_mem_disabled); // implies the 2nd queue + // Publish the sequence number. We can do that here assuming the callback + // is invoked only from one write queue, which would guarantee that the + // publish sequence numbers will be in order, i.e., once a seq is + // published all the seq prior to that are also publishable. + db_impl_->SetLastPublishedSequence(last_commit_seq); + // Note RemovePrepared should be called after publishing the seq. + // Otherwise SmallestUnCommittedSeq optimization breaks. + if (prep_seq_ != kMaxSequenceNumber) { + db_->RemovePrepared(prep_seq_, prep_batch_cnt_); + } // else there was no prepare phase + if (includes_aux_batch_) { + db_->RemovePrepared(aux_seq_, aux_batch_cnt_); + } + } + // else SequenceNumber that is updated as part of the write already does the + // publishing + return Status::OK(); + } + + private: + WritePreparedTxnDB* db_; + DBImpl* db_impl_; + // kMaxSequenceNumber if there was no prepare phase + SequenceNumber prep_seq_; + size_t prep_batch_cnt_; + size_t data_batch_cnt_; + // Data here is the batch that is written with the commit marker, either + // because it is commit without prepare or commit has a CommitTimeWriteBatch. + bool includes_data_; + // Auxiliary batch (if there is any) is a batch that is written before, but + // gets the same commit seq as prepare batch or data batch. 
This is used in + // two write queues where the CommitTimeWriteBatch becomes the aux batch and + // we do a separate write to actually commit everything. + SequenceNumber aux_seq_; + size_t aux_batch_cnt_; + bool includes_aux_batch_; +}; + +// For two_write_queues, commit both the aborted batch and the cleanup batch and +// then publish the seq +class WritePreparedRollbackPreReleaseCallback : public PreReleaseCallback { + public: + WritePreparedRollbackPreReleaseCallback(WritePreparedTxnDB* db, + DBImpl* db_impl, + SequenceNumber prep_seq, + SequenceNumber rollback_seq, + size_t prep_batch_cnt) + : db_(db), + db_impl_(db_impl), + prep_seq_(prep_seq), + rollback_seq_(rollback_seq), + prep_batch_cnt_(prep_batch_cnt) { + assert(prep_seq != kMaxSequenceNumber); + assert(rollback_seq != kMaxSequenceNumber); + assert(prep_batch_cnt_ > 0); + } + + Status Callback(SequenceNumber commit_seq, bool is_mem_disabled, uint64_t, + size_t /*index*/, size_t /*total*/) override { + // Always commit from the 2nd queue + assert(is_mem_disabled); // implies the 2nd queue + assert(db_impl_->immutable_db_options().two_write_queues); +#ifdef NDEBUG + (void)is_mem_disabled; +#endif + const uint64_t last_commit_seq = commit_seq; + db_->AddCommitted(rollback_seq_, last_commit_seq); + for (size_t i = 0; i < prep_batch_cnt_; i++) { + db_->AddCommitted(prep_seq_ + i, last_commit_seq); + } + db_impl_->SetLastPublishedSequence(last_commit_seq); + return Status::OK(); + } + + private: + WritePreparedTxnDB* db_; + DBImpl* db_impl_; + SequenceNumber prep_seq_; + SequenceNumber rollback_seq_; + size_t prep_batch_cnt_; +}; + +// Count the number of sub-batches inside a batch. A sub-batch does not have +// duplicate keys. +struct SubBatchCounter : public WriteBatch::Handler { + explicit SubBatchCounter(std::map<uint32_t, const Comparator*>& comparators) + : comparators_(comparators), batches_(1) {} + std::map<uint32_t, const Comparator*>& comparators_; + using CFKeys = std::set<Slice, SetComparator>; + std::map<uint32_t, CFKeys> keys_; + size_t batches_; + size_t BatchCount() { return batches_; } + void AddKey(const uint32_t cf, const Slice& key); + void InitWithComp(const uint32_t cf); + Status MarkNoop(bool) override { return Status::OK(); } + Status MarkEndPrepare(const Slice&) override { return Status::OK(); } + Status MarkCommit(const Slice&) override { return Status::OK(); } + Status PutCF(uint32_t cf, const Slice& key, const Slice&) override { + AddKey(cf, key); + return Status::OK(); + } + Status DeleteCF(uint32_t cf, const Slice& key) override { + AddKey(cf, key); + return Status::OK(); + } + Status SingleDeleteCF(uint32_t cf, const Slice& key) override { + AddKey(cf, key); + return Status::OK(); + } + Status MergeCF(uint32_t cf, const Slice& key, const Slice&) override { + AddKey(cf, key); + return Status::OK(); + } + Status MarkBeginPrepare(bool) override { return Status::OK(); } + Status MarkRollback(const Slice&) override { return Status::OK(); } + Handler::OptionState WriteAfterCommit() const override { + return Handler::OptionState::kDisabled; + } +}; + +SnapshotBackup WritePreparedTxnDB::AssignMinMaxSeqs(const Snapshot* snapshot, + SequenceNumber* min, + SequenceNumber* max) { + if (snapshot != nullptr) { + *min = + static_cast_with_check<const SnapshotImpl>(snapshot)->min_uncommitted_; + *max = static_cast_with_check<const SnapshotImpl>(snapshot)->number_; + // A duplicate of the check in EnhanceSnapshot(). + assert(*min <= *max + 1); + return kBackedByDBSnapshot; + } else { + *min = SmallestUnCommittedSeq(); + *max = 0; // to be assigned later after sv is referenced.
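+    //
+    // [Illustrative sketch, not from the RocksDB sources] SubBatchCounter
+    // above splits a WriteBatch into sub-batches so that no sub-batch sees
+    // the same key twice. The disabled block below restates that counting
+    // rule on plain std containers; all names in it are hypothetical.
+#if 0  // illustrative sketch, never compiled
+    #include <cstddef>
+    #include <cstdint>
+    #include <set>
+    #include <string>
+    #include <utility>
+    #include <vector>
+    // Number of sub-batches needed for an ordered list of (cf, key) writes:
+    // a new sub-batch starts whenever a key repeats within the current one.
+    inline size_t CountSubBatches(
+        const std::vector<std::pair<uint32_t, std::string>>& writes) {
+      std::set<std::pair<uint32_t, std::string>> seen;
+      size_t batches = 1;  // mirrors SubBatchCounter's batches_(1)
+      for (const auto& w : writes) {
+        if (!seen.insert(w).second) {  // duplicate within the current sub-batch
+          batches++;
+          seen.clear();
+          seen.insert(w);
+        }
+      }
+      return batches;
+    }
+#endif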
+ return kUnbackedByDBSnapshot; + } +} + +bool WritePreparedTxnDB::ValidateSnapshot( + const SequenceNumber snap_seq, const SnapshotBackup backed_by_snapshot, + std::memory_order order) { + if (backed_by_snapshot == kBackedByDBSnapshot) { + return true; + } else { + SequenceNumber max = max_evicted_seq_.load(order); + // Validate that max has not advanced beyond the snapshot seq that is not + // backed by a real snapshot. This is a very rare case that should not + // happen in real workloads. + if (UNLIKELY(snap_seq <= max && snap_seq != 0)) { + return false; + } + } + return true; +} + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc b/src/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc new file mode 100644 index 000000000..6c8c62e0e --- /dev/null +++ b/src/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc @@ -0,0 +1,790 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "utilities/transactions/transaction_test.h" +#include "utilities/transactions/write_unprepared_txn.h" +#include "utilities/transactions/write_unprepared_txn_db.h" + +namespace ROCKSDB_NAMESPACE { + +class WriteUnpreparedTransactionTestBase : public TransactionTestBase { + public: + WriteUnpreparedTransactionTestBase(bool use_stackable_db, + bool two_write_queue, + TxnDBWritePolicy write_policy) + : TransactionTestBase(use_stackable_db, two_write_queue, write_policy, + kOrderedWrite) {} +}; + +class WriteUnpreparedTransactionTest + : public WriteUnpreparedTransactionTestBase, + virtual public ::testing::WithParamInterface< + std::tuple<bool, bool, TxnDBWritePolicy>> { + public: + WriteUnpreparedTransactionTest() + : WriteUnpreparedTransactionTestBase(std::get<0>(GetParam()), + std::get<1>(GetParam()), + std::get<2>(GetParam())) {} +}; + +INSTANTIATE_TEST_CASE_P( + WriteUnpreparedTransactionTest, WriteUnpreparedTransactionTest, + ::testing::Values(std::make_tuple(false, false, WRITE_UNPREPARED), + std::make_tuple(false, true, WRITE_UNPREPARED))); + +enum StressAction { NO_SNAPSHOT, RO_SNAPSHOT, REFRESH_SNAPSHOT }; +class WriteUnpreparedStressTest : public WriteUnpreparedTransactionTestBase, + virtual public ::testing::WithParamInterface< + std::tuple<bool, StressAction>> { + public: + WriteUnpreparedStressTest() + : WriteUnpreparedTransactionTestBase(false, std::get<0>(GetParam()), + WRITE_UNPREPARED), + action_(std::get<1>(GetParam())) {} + StressAction action_; +}; + +INSTANTIATE_TEST_CASE_P( + WriteUnpreparedStressTest, WriteUnpreparedStressTest, + ::testing::Values(std::make_tuple(false, NO_SNAPSHOT), + std::make_tuple(false, RO_SNAPSHOT), + std::make_tuple(false, REFRESH_SNAPSHOT), + std::make_tuple(true, NO_SNAPSHOT), + std::make_tuple(true, RO_SNAPSHOT), + std::make_tuple(true, REFRESH_SNAPSHOT))); + +TEST_P(WriteUnpreparedTransactionTest, ReadYourOwnWrite) { + // The following test checks whether reading your own writes for + // a transaction works for write unprepared, when there are uncommitted + // values written into the DB.
+ auto verify_state = [](Iterator* iter, const std::string& key, + const std::string& value) { + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(key, iter->key().ToString()); + ASSERT_EQ(value, iter->value().ToString()); + }; + + // Test always reseeking vs never reseeking. + for (uint64_t max_skip : {0, std::numeric_limits::max()}) { + options.max_sequential_skip_in_iterations = max_skip; + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + + TransactionOptions txn_options; + WriteOptions woptions; + ReadOptions roptions; + + ASSERT_OK(db->Put(woptions, "a", "")); + ASSERT_OK(db->Put(woptions, "b", "")); + + Transaction* txn = db->BeginTransaction(woptions, txn_options); + WriteUnpreparedTxn* wup_txn = dynamic_cast(txn); + txn->SetSnapshot(); + + for (int i = 0; i < 5; i++) { + std::string stored_value = "v" + std::to_string(i); + ASSERT_OK(txn->Put("a", stored_value)); + ASSERT_OK(txn->Put("b", stored_value)); + ASSERT_OK(wup_txn->FlushWriteBatchToDB(false)); + + // Test Get() + std::string value; + ASSERT_OK(txn->Get(roptions, "a", &value)); + ASSERT_EQ(value, stored_value); + ASSERT_OK(txn->Get(roptions, "b", &value)); + ASSERT_EQ(value, stored_value); + + // Test Next() + auto iter = txn->GetIterator(roptions); + iter->Seek("a"); + verify_state(iter, "a", stored_value); + + iter->Next(); + verify_state(iter, "b", stored_value); + + iter->SeekToFirst(); + verify_state(iter, "a", stored_value); + + iter->Next(); + verify_state(iter, "b", stored_value); + + delete iter; + + // Test Prev() + iter = txn->GetIterator(roptions); + iter->SeekForPrev("b"); + verify_state(iter, "b", stored_value); + + iter->Prev(); + verify_state(iter, "a", stored_value); + + iter->SeekToLast(); + verify_state(iter, "b", stored_value); + + iter->Prev(); + verify_state(iter, "a", stored_value); + + delete iter; + } + + delete txn; + } +} + +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +TEST_P(WriteUnpreparedStressTest, ReadYourOwnWriteStress) { + // This is a stress test where different threads are writing random keys, and + // then before committing or aborting the transaction, it validates to see + // that it can read the keys it wrote, and the keys it did not write respect + // the snapshot. To avoid row lock contention (and simply stressing the + // locking system), each thread is mostly only writing to its own set of keys. + const uint32_t kNumIter = 1000; + const uint32_t kNumThreads = 10; + const uint32_t kNumKeys = 5; + + // Test with + // 1. no snapshots set + // 2. snapshot set on ReadOptions + // 3. snapshot set, and refreshing after every write. + StressAction a = action_; + WriteOptions write_options; + txn_db_options.transaction_lock_timeout = -1; + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + + std::vector keys; + for (uint32_t k = 0; k < kNumKeys * kNumThreads; k++) { + keys.push_back("k" + std::to_string(k)); + } + RandomShuffle(keys.begin(), keys.end()); + + // This counter will act as a "sequence number" to help us validate + // visibility logic with snapshots. If we had direct access to the seqno of + // snapshots and key/values, then we should directly compare those instead. + std::atomic counter(0); + + std::function stress_thread = [&](int id) { + size_t tid = std::hash()(std::this_thread::get_id()); + Random64 rnd(static_cast(tid)); + + Transaction* txn; + TransactionOptions txn_options; + // batch_size of 1 causes writes to DB for every marker. 
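+    //
+    // [Illustrative sketch, not from the RocksDB sources] A threshold of 1
+    // makes WriteUnprepared flush the pending write batch to the DB before
+    // almost every write, so each marker ends up in its own unprepared batch.
+    // The disabled block below models that flush-before-append policy with
+    // hypothetical names and plain std types.
+#if 0  // illustrative sketch, never compiled
+    #include <cstddef>
+    #include <string>
+    #include <vector>
+    struct BufferedWriter {
+      size_t flush_threshold;            // ~ write_batch_flush_threshold
+      std::string pending;               // ~ the in-memory write batch
+      std::vector<std::string> flushed;  // ~ unprepared batches already in the DB
+      void Put(const std::string& record) {
+        // Flush *before* appending once the pending batch exceeds the
+        // threshold; with threshold 1, k Puts leave k-1 flushed batches.
+        if (!pending.empty() && pending.size() > flush_threshold) {
+          flushed.push_back(pending);
+          pending.clear();
+        }
+        pending += record;
+      }
+    };
+#endif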
+ txn_options.write_batch_flush_threshold = 1; + ReadOptions read_options; + + for (uint32_t i = 0; i < kNumIter; i++) { + std::set owned_keys(keys.begin() + id * kNumKeys, + keys.begin() + (id + 1) * kNumKeys); + // Add unowned keys to make the workload more interesting, but this + // increases row lock contention, so just do it sometimes. + if (rnd.OneIn(2)) { + owned_keys.insert(keys[rnd.Uniform(kNumKeys * kNumThreads)]); + } + + txn = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn->SetName(std::to_string(id))); + txn->SetSnapshot(); + if (a >= RO_SNAPSHOT) { + read_options.snapshot = txn->GetSnapshot(); + ASSERT_TRUE(read_options.snapshot != nullptr); + } + + uint64_t buf[2]; + buf[0] = id; + + // When scanning through the database, make sure that all unprepared + // keys have value >= snapshot and all other keys have value < snapshot. + int64_t snapshot_num = counter.fetch_add(1); + + Status s; + for (const auto& key : owned_keys) { + buf[1] = counter.fetch_add(1); + s = txn->Put(key, Slice((const char*)buf, sizeof(buf))); + if (!s.ok()) { + break; + } + if (a == REFRESH_SNAPSHOT) { + txn->SetSnapshot(); + read_options.snapshot = txn->GetSnapshot(); + snapshot_num = counter.fetch_add(1); + } + } + + // Failure is possible due to snapshot validation. In this case, + // rollback and move onto next iteration. + if (!s.ok()) { + ASSERT_TRUE(s.IsBusy()); + ASSERT_OK(txn->Rollback()); + delete txn; + continue; + } + + auto verify_key = [&owned_keys, &a, &id, &snapshot_num]( + const std::string& key, const std::string& value) { + if (owned_keys.count(key) > 0) { + ASSERT_EQ(value.size(), 16); + + // Since this key is part of owned_keys, then this key must be + // unprepared by this transaction identified by 'id' + ASSERT_EQ(((int64_t*)value.c_str())[0], id); + if (a == REFRESH_SNAPSHOT) { + // If refresh snapshot is true, then the snapshot is refreshed + // after every Put(), meaning that the current snapshot in + // snapshot_num must be greater than the "seqno" of any keys + // written by the current transaction. + ASSERT_LT(((int64_t*)value.c_str())[1], snapshot_num); + } else { + // If refresh snapshot is not on, then the snapshot was taken at + // the beginning of the transaction, meaning all writes must come + // after snapshot_num + ASSERT_GT(((int64_t*)value.c_str())[1], snapshot_num); + } + } else if (a >= RO_SNAPSHOT) { + // If this is not an unprepared key, just assert that the key + // "seqno" is smaller than the snapshot seqno. + ASSERT_EQ(value.size(), 16); + ASSERT_LT(((int64_t*)value.c_str())[1], snapshot_num); + } + }; + + // Validate Get()/Next()/Prev(). Do only one of them to save time, and + // reduce lock contention. 
+ switch (rnd.Uniform(3)) { + case 0: // Validate Get() + { + for (const auto& key : keys) { + std::string value; + s = txn->Get(read_options, Slice(key), &value); + if (!s.ok()) { + ASSERT_TRUE(s.IsNotFound()); + ASSERT_EQ(owned_keys.count(key), 0); + } else { + verify_key(key, value); + } + } + break; + } + case 1: // Validate Next() + { + Iterator* iter = txn->GetIterator(read_options); + ASSERT_OK(iter->status()); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + verify_key(iter->key().ToString(), iter->value().ToString()); + } + ASSERT_OK(iter->status()); + delete iter; + break; + } + case 2: // Validate Prev() + { + Iterator* iter = txn->GetIterator(read_options); + ASSERT_OK(iter->status()); + for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { + verify_key(iter->key().ToString(), iter->value().ToString()); + } + ASSERT_OK(iter->status()); + delete iter; + break; + } + default: + FAIL(); + } + + if (rnd.OneIn(2)) { + ASSERT_OK(txn->Commit()); + } else { + ASSERT_OK(txn->Rollback()); + } + delete txn; + } + }; + + std::vector threads; + for (uint32_t i = 0; i < kNumThreads; i++) { + threads.emplace_back(stress_thread, i); + } + + for (auto& t : threads) { + t.join(); + } +} +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +// This tests how write unprepared behaves during recovery when the DB crashes +// after a transaction has either been unprepared or prepared, and tests if +// the changes are correctly applied for prepared transactions if we decide to +// rollback/commit. +TEST_P(WriteUnpreparedTransactionTest, RecoveryTest) { + WriteOptions write_options; + write_options.disableWAL = false; + TransactionOptions txn_options; + std::vector prepared_trans; + WriteUnpreparedTxnDB* wup_db; + options.disable_auto_compactions = true; + + enum Action { UNPREPARED, ROLLBACK, COMMIT }; + + // batch_size of 1 causes writes to DB for every marker. + for (size_t batch_size : {1, 1000000}) { + txn_options.write_batch_flush_threshold = batch_size; + for (bool empty : {true, false}) { + for (Action a : {UNPREPARED, ROLLBACK, COMMIT}) { + for (int num_batches = 1; num_batches < 10; num_batches++) { + // Reset database. + prepared_trans.clear(); + ASSERT_OK(ReOpen()); + wup_db = dynamic_cast(db); + if (!empty) { + for (int i = 0; i < num_batches; i++) { + ASSERT_OK(db->Put(WriteOptions(), "k" + std::to_string(i), + "before value" + std::to_string(i))); + } + } + + // Write num_batches unprepared batches. + Transaction* txn = db->BeginTransaction(write_options, txn_options); + WriteUnpreparedTxn* wup_txn = dynamic_cast(txn); + ASSERT_OK(txn->SetName("xid")); + for (int i = 0; i < num_batches; i++) { + ASSERT_OK( + txn->Put("k" + std::to_string(i), "value" + std::to_string(i))); + if (txn_options.write_batch_flush_threshold == 1) { + // WriteUnprepared will check write_batch_flush_threshold and + // possibly flush before appending to the write batch. No flush + // will happen at the first write because the batch is still + // empty, so after k puts, there should be k-1 flushed batches. + ASSERT_EQ(wup_txn->GetUnpreparedSequenceNumbers().size(), i); + } else { + ASSERT_EQ(wup_txn->GetUnpreparedSequenceNumbers().size(), 0); + } + } + if (a == UNPREPARED) { + // This is done to prevent the destructor from rolling back the + // transaction for us, since we want to pretend we crashed and + // test that recovery does the rollback. 
+ wup_txn->unprep_seqs_.clear(); + } else { + ASSERT_OK(txn->Prepare()); + } + delete txn; + + // Crash and run recovery code paths. + ASSERT_OK(wup_db->db_impl_->FlushWAL(true)); + wup_db->TEST_Crash(); + ASSERT_OK(ReOpenNoDelete()); + assert(db != nullptr); + + db->GetAllPreparedTransactions(&prepared_trans); + ASSERT_EQ(prepared_trans.size(), a == UNPREPARED ? 0 : 1); + if (a == ROLLBACK) { + ASSERT_OK(prepared_trans[0]->Rollback()); + delete prepared_trans[0]; + } else if (a == COMMIT) { + ASSERT_OK(prepared_trans[0]->Commit()); + delete prepared_trans[0]; + } + + Iterator* iter = db->NewIterator(ReadOptions()); + ASSERT_OK(iter->status()); + iter->SeekToFirst(); + // Check that DB has before values. + if (!empty || a == COMMIT) { + for (int i = 0; i < num_batches; i++) { + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), "k" + std::to_string(i)); + if (a == COMMIT) { + ASSERT_EQ(iter->value().ToString(), + "value" + std::to_string(i)); + } else { + ASSERT_EQ(iter->value().ToString(), + "before value" + std::to_string(i)); + } + iter->Next(); + } + } + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + delete iter; + } + } + } + } +} + +// Basic test to see that unprepared batch gets written to DB when batch size +// is exceeded. It also does some basic checks to see if commit/rollback works +// as expected for write unprepared. +TEST_P(WriteUnpreparedTransactionTest, UnpreparedBatch) { + WriteOptions write_options; + TransactionOptions txn_options; + const int kNumKeys = 10; + + // batch_size of 1 causes writes to DB for every marker. + for (size_t batch_size : {1, 1000000}) { + txn_options.write_batch_flush_threshold = batch_size; + for (bool prepare : {false, true}) { + for (bool commit : {false, true}) { + ASSERT_OK(ReOpen()); + Transaction* txn = db->BeginTransaction(write_options, txn_options); + WriteUnpreparedTxn* wup_txn = dynamic_cast(txn); + ASSERT_OK(txn->SetName("xid")); + + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(txn->Put("k" + std::to_string(i), "v" + std::to_string(i))); + if (txn_options.write_batch_flush_threshold == 1) { + // WriteUnprepared will check write_batch_flush_threshold and + // possibly flush before appending to the write batch. No flush will + // happen at the first write because the batch is still empty, so + // after k puts, there should be k-1 flushed batches. + ASSERT_EQ(wup_txn->GetUnpreparedSequenceNumbers().size(), i); + } else { + ASSERT_EQ(wup_txn->GetUnpreparedSequenceNumbers().size(), 0); + } + } + + if (prepare) { + ASSERT_OK(txn->Prepare()); + } + + Iterator* iter = db->NewIterator(ReadOptions()); + ASSERT_OK(iter->status()); + iter->SeekToFirst(); + assert(!iter->Valid()); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + delete iter; + + if (commit) { + ASSERT_OK(txn->Commit()); + } else { + ASSERT_OK(txn->Rollback()); + } + delete txn; + + iter = db->NewIterator(ReadOptions()); + ASSERT_OK(iter->status()); + iter->SeekToFirst(); + + for (int i = 0; i < (commit ? kNumKeys : 0); i++) { + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), "k" + std::to_string(i)); + ASSERT_EQ(iter->value().ToString(), "v" + std::to_string(i)); + iter->Next(); + } + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + delete iter; + } + } + } +} + +// Test whether logs containing unprepared/prepared batches are kept even +// after memtable finishes flushing, and whether they are removed when +// transaction commits/aborts. 
+// +// TODO(lth): Merge with TransactionTest/TwoPhaseLogRollingTest tests. +TEST_P(WriteUnpreparedTransactionTest, MarkLogWithPrepSection) { + WriteOptions write_options; + TransactionOptions txn_options; + // batch_size of 1 causes writes to DB for every marker. + txn_options.write_batch_flush_threshold = 1; + const int kNumKeys = 10; + + WriteOptions wopts; + wopts.sync = true; + + for (bool prepare : {false, true}) { + for (bool commit : {false, true}) { + ASSERT_OK(ReOpen()); + auto wup_db = dynamic_cast(db); + auto db_impl = wup_db->db_impl_; + + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn1->SetName("xid1")); + + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn2->SetName("xid2")); + + // Spread this transaction across multiple log files. + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(txn1->Put("k1" + std::to_string(i), "v" + std::to_string(i))); + if (i >= kNumKeys / 2) { + ASSERT_OK( + txn2->Put("k2" + std::to_string(i), "v" + std::to_string(i))); + } + + if (i > 0) { + ASSERT_OK(db_impl->TEST_SwitchWAL()); + } + } + + ASSERT_GT(txn1->GetLogNumber(), 0); + ASSERT_GT(txn2->GetLogNumber(), 0); + + ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), + txn1->GetLogNumber()); + ASSERT_GT(db_impl->TEST_LogfileNumber(), txn1->GetLogNumber()); + + if (prepare) { + ASSERT_OK(txn1->Prepare()); + ASSERT_OK(txn2->Prepare()); + } + + ASSERT_GE(db_impl->TEST_LogfileNumber(), txn1->GetLogNumber()); + ASSERT_GE(db_impl->TEST_LogfileNumber(), txn2->GetLogNumber()); + + ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), + txn1->GetLogNumber()); + if (commit) { + ASSERT_OK(txn1->Commit()); + } else { + ASSERT_OK(txn1->Rollback()); + } + + ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), + txn2->GetLogNumber()); + + if (commit) { + ASSERT_OK(txn2->Commit()); + } else { + ASSERT_OK(txn2->Rollback()); + } + + ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0); + + delete txn1; + delete txn2; + } + } +} + +TEST_P(WriteUnpreparedTransactionTest, NoSnapshotWrite) { + WriteOptions woptions; + TransactionOptions txn_options; + txn_options.write_batch_flush_threshold = 1; + + Transaction* txn = db->BeginTransaction(woptions, txn_options); + + // Do some writes with no snapshot + ASSERT_OK(txn->Put("a", "a")); + ASSERT_OK(txn->Put("b", "b")); + ASSERT_OK(txn->Put("c", "c")); + + // Test that it is still possible to create iterators after writes with no + // snapshot, if iterator snapshot is fresh enough. + ReadOptions roptions; + auto iter = txn->GetIterator(roptions); + ASSERT_OK(iter->status()); + int keys = 0; + for (iter->SeekToLast(); iter->Valid(); iter->Prev(), keys++) { + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key().ToString(), iter->value().ToString()); + } + ASSERT_EQ(keys, 3); + ASSERT_OK(iter->status()); + + delete iter; + delete txn; +} + +// Test whether write to a transaction while iterating is supported. +TEST_P(WriteUnpreparedTransactionTest, IterateAndWrite) { + WriteOptions woptions; + TransactionOptions txn_options; + txn_options.write_batch_flush_threshold = 1; + + enum Action { DO_DELETE, DO_UPDATE }; + + for (Action a : {DO_DELETE, DO_UPDATE}) { + for (int i = 0; i < 100; i++) { + ASSERT_OK(db->Put(woptions, std::to_string(i), std::to_string(i))); + } + + Transaction* txn = db->BeginTransaction(woptions, txn_options); + // write_batch_ now contains 1 key. 
+ ASSERT_OK(txn->Put("9", "a")); + + ReadOptions roptions; + auto iter = txn->GetIterator(roptions); + ASSERT_OK(iter->status()); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + if (iter->key() == "9") { + ASSERT_EQ(iter->value().ToString(), "a"); + } else { + ASSERT_EQ(iter->key().ToString(), iter->value().ToString()); + } + + if (a == DO_DELETE) { + ASSERT_OK(txn->Delete(iter->key())); + } else { + ASSERT_OK(txn->Put(iter->key(), "b")); + } + } + ASSERT_OK(iter->status()); + + delete iter; + ASSERT_OK(txn->Commit()); + + iter = db->NewIterator(roptions); + ASSERT_OK(iter->status()); + if (a == DO_DELETE) { + // Check that db is empty. + iter->SeekToFirst(); + ASSERT_FALSE(iter->Valid()); + } else { + int keys = 0; + // Check that all values are updated to b. + for (iter->SeekToFirst(); iter->Valid(); iter->Next(), keys++) { + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->value().ToString(), "b"); + } + ASSERT_EQ(keys, 100); + } + ASSERT_OK(iter->status()); + + delete iter; + delete txn; + } +} + +// Test that using an iterator after transaction clear is not supported +TEST_P(WriteUnpreparedTransactionTest, IterateAfterClear) { + WriteOptions woptions; + TransactionOptions txn_options; + txn_options.write_batch_flush_threshold = 1; + + enum Action { kCommit, kRollback }; + + for (Action a : {kCommit, kRollback}) { + for (int i = 0; i < 100; i++) { + ASSERT_OK(db->Put(woptions, std::to_string(i), std::to_string(i))); + } + + Transaction* txn = db->BeginTransaction(woptions, txn_options); + ASSERT_OK(txn->Put("9", "a")); + + ReadOptions roptions; + auto iter1 = txn->GetIterator(roptions); + auto iter2 = txn->GetIterator(roptions); + iter1->SeekToFirst(); + iter2->Seek("9"); + + // Check that iterators are valid before transaction finishes. + ASSERT_TRUE(iter1->Valid()); + ASSERT_TRUE(iter2->Valid()); + ASSERT_OK(iter1->status()); + ASSERT_OK(iter2->status()); + + if (a == kCommit) { + ASSERT_OK(txn->Commit()); + } else { + ASSERT_OK(txn->Rollback()); + } + + // Check that iterators are invalidated after transaction finishes. 
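+    //
+    // [Illustrative sketch, not from the RocksDB sources] WriteUnpreparedTxn
+    // keeps the iterators it hands out in active_iterators_ and, when the
+    // transaction finishes, marks each one invalid with an InvalidArgument
+    // status (see WriteUnpreparedTxn::Clear() later in this patch). The
+    // disabled block below is a toy model of that registry; all names in it
+    // are hypothetical.
+#if 0  // illustrative sketch, never compiled
+    #include <string>
+    #include <vector>
+    struct ToyIterator {
+      bool valid = true;
+      std::string status = "OK";
+      void Invalidate(const std::string& why) {
+        valid = false;
+        status = why;
+      }
+    };
+    struct ToyTxn {
+      std::vector<ToyIterator*> active_iterators;
+      void Finish() {  // ~ Commit()/Rollback() ending in Clear()
+        for (ToyIterator* it : active_iterators) {
+          it->Invalidate("InvalidArgument: transaction has finished");
+        }
+        active_iterators.clear();
+      }
+    };
+#endif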
+ ASSERT_FALSE(iter1->Valid()); + ASSERT_FALSE(iter2->Valid()); + ASSERT_TRUE(iter1->status().IsInvalidArgument()); + ASSERT_TRUE(iter2->status().IsInvalidArgument()); + + delete iter1; + delete iter2; + delete txn; + } +} + +TEST_P(WriteUnpreparedTransactionTest, SavePoint) { + WriteOptions woptions; + TransactionOptions txn_options; + txn_options.write_batch_flush_threshold = 1; + + Transaction* txn = db->BeginTransaction(woptions, txn_options); + txn->SetSavePoint(); + ASSERT_OK(txn->Put("a", "a")); + ASSERT_OK(txn->Put("b", "b")); + ASSERT_OK(txn->Commit()); + + ReadOptions roptions; + std::string value; + ASSERT_OK(txn->Get(roptions, "a", &value)); + ASSERT_EQ(value, "a"); + ASSERT_OK(txn->Get(roptions, "b", &value)); + ASSERT_EQ(value, "b"); + delete txn; +} + +TEST_P(WriteUnpreparedTransactionTest, UntrackedKeys) { + WriteOptions woptions; + TransactionOptions txn_options; + txn_options.write_batch_flush_threshold = 1; + + Transaction* txn = db->BeginTransaction(woptions, txn_options); + auto wb = txn->GetWriteBatch()->GetWriteBatch(); + ASSERT_OK(txn->Put("a", "a")); + ASSERT_OK(wb->Put("a_untrack", "a_untrack")); + txn->SetSavePoint(); + ASSERT_OK(txn->Put("b", "b")); + ASSERT_OK(txn->Put("b_untrack", "b_untrack")); + + ReadOptions roptions; + std::string value; + ASSERT_OK(txn->Get(roptions, "a", &value)); + ASSERT_EQ(value, "a"); + ASSERT_OK(txn->Get(roptions, "a_untrack", &value)); + ASSERT_EQ(value, "a_untrack"); + ASSERT_OK(txn->Get(roptions, "b", &value)); + ASSERT_EQ(value, "b"); + ASSERT_OK(txn->Get(roptions, "b_untrack", &value)); + ASSERT_EQ(value, "b_untrack"); + + // b and b_untrack should be rolled back. + ASSERT_OK(txn->RollbackToSavePoint()); + ASSERT_OK(txn->Get(roptions, "a", &value)); + ASSERT_EQ(value, "a"); + ASSERT_OK(txn->Get(roptions, "a_untrack", &value)); + ASSERT_EQ(value, "a_untrack"); + auto s = txn->Get(roptions, "b", &value); + ASSERT_TRUE(s.IsNotFound()); + s = txn->Get(roptions, "b_untrack", &value); + ASSERT_TRUE(s.IsNotFound()); + + // Everything should be rolled back. + ASSERT_OK(txn->Rollback()); + s = txn->Get(roptions, "a", &value); + ASSERT_TRUE(s.IsNotFound()); + s = txn->Get(roptions, "a_untrack", &value); + ASSERT_TRUE(s.IsNotFound()); + s = txn->Get(roptions, "b", &value); + ASSERT_TRUE(s.IsNotFound()); + s = txn->Get(roptions, "b_untrack", &value); + ASSERT_TRUE(s.IsNotFound()); + + delete txn; +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "SKIPPED as Transactions are not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/write_unprepared_txn.cc b/src/rocksdb/utilities/transactions/write_unprepared_txn.cc new file mode 100644 index 000000000..6e04d3344 --- /dev/null +++ b/src/rocksdb/utilities/transactions/write_unprepared_txn.cc @@ -0,0 +1,1053 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
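+
+// [Illustrative sketch, not from the RocksDB sources] In WriteUnprepared a
+// transaction may already have several batches in the DB before it commits;
+// unprep_seqs_ maps each such batch's prepare sequence to its sub-batch
+// count. IsVisibleFullCheck below first asks "is this one of my own
+// unprepared writes?" before falling back to the WritePrepared snapshot
+// check. The disabled block restates that own-write test on plain std types;
+// all names in it are hypothetical.
+#if 0  // illustrative sketch, never compiled
+#include <cstddef>
+#include <cstdint>
+#include <map>
+// unprep_seqs: prepare_seq -> number of sub-batches written at that prepare.
+inline bool IsOwnUnpreparedWrite(
+    uint64_t seq, const std::map<uint64_t, size_t>& unprep_seqs) {
+  for (const auto& entry : unprep_seqs) {
+    if (entry.first <= seq && seq < entry.first + entry.second) {
+      return true;  // seq falls inside one of our own unprepared batches
+    }
+  }
+  return false;
+}
+#endif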
+ +#ifndef ROCKSDB_LITE + +#include "utilities/transactions/write_unprepared_txn.h" + +#include "db/db_impl/db_impl.h" +#include "util/cast_util.h" +#include "utilities/transactions/write_unprepared_txn_db.h" +#include "utilities/write_batch_with_index/write_batch_with_index_internal.h" + +namespace ROCKSDB_NAMESPACE { + +bool WriteUnpreparedTxnReadCallback::IsVisibleFullCheck(SequenceNumber seq) { + // Since unprep_seqs maps prep_seq => prepare_batch_cnt, to check if seq is + // in unprep_seqs, we have to check if seq is equal to prep_seq or any of + // the prepare_batch_cnt seq nums after it. + // + // TODO(lth): Can be optimized with std::lower_bound if unprep_seqs is + // large. + for (const auto& it : unprep_seqs_) { + if (it.first <= seq && seq < it.first + it.second) { + return true; + } + } + + bool snap_released = false; + auto ret = + db_->IsInSnapshot(seq, wup_snapshot_, min_uncommitted_, &snap_released); + assert(!snap_released || backed_by_snapshot_ == kUnbackedByDBSnapshot); + snap_released_ |= snap_released; + return ret; +} + +WriteUnpreparedTxn::WriteUnpreparedTxn(WriteUnpreparedTxnDB* txn_db, + const WriteOptions& write_options, + const TransactionOptions& txn_options) + : WritePreparedTxn(txn_db, write_options, txn_options), + wupt_db_(txn_db), + last_log_number_(0), + recovered_txn_(false), + largest_validated_seq_(0) { + if (txn_options.write_batch_flush_threshold < 0) { + write_batch_flush_threshold_ = + txn_db_impl_->GetTxnDBOptions().default_write_batch_flush_threshold; + } else { + write_batch_flush_threshold_ = txn_options.write_batch_flush_threshold; + } +} + +WriteUnpreparedTxn::~WriteUnpreparedTxn() { + if (!unprep_seqs_.empty()) { + assert(log_number_ > 0); + assert(GetId() > 0); + assert(!name_.empty()); + + // We should rollback regardless of GetState, but some unit tests that + // test crash recovery run the destructor assuming that rollback does not + // happen, so that rollback during recovery can be exercised. + if (GetState() == STARTED || GetState() == LOCKS_STOLEN) { + auto s = RollbackInternal(); + assert(s.ok()); + if (!s.ok()) { + ROCKS_LOG_FATAL( + wupt_db_->info_log_, + "Rollback of WriteUnprepared transaction failed in destructor: %s", + s.ToString().c_str()); + } + dbimpl_->logs_with_prep_tracker()->MarkLogAsHavingPrepSectionFlushed( + log_number_); + } + } + + // Clear the tracked locks so that ~PessimisticTransaction does not + // try to unlock keys for recovered transactions. 
+ if (recovered_txn_) { + tracked_locks_->Clear(); + } +} + +void WriteUnpreparedTxn::Initialize(const TransactionOptions& txn_options) { + PessimisticTransaction::Initialize(txn_options); + if (txn_options.write_batch_flush_threshold < 0) { + write_batch_flush_threshold_ = + txn_db_impl_->GetTxnDBOptions().default_write_batch_flush_threshold; + } else { + write_batch_flush_threshold_ = txn_options.write_batch_flush_threshold; + } + + unprep_seqs_.clear(); + flushed_save_points_.reset(nullptr); + unflushed_save_points_.reset(nullptr); + recovered_txn_ = false; + largest_validated_seq_ = 0; + assert(active_iterators_.empty()); + active_iterators_.clear(); + untracked_keys_.clear(); +} + +Status WriteUnpreparedTxn::HandleWrite(std::function do_write) { + Status s; + if (active_iterators_.empty()) { + s = MaybeFlushWriteBatchToDB(); + if (!s.ok()) { + return s; + } + } + s = do_write(); + if (s.ok()) { + if (snapshot_) { + largest_validated_seq_ = + std::max(largest_validated_seq_, snapshot_->GetSequenceNumber()); + } else { + // TODO(lth): We should use the same number as tracked_at_seq in TryLock, + // because what is actually being tracked is the sequence number at which + // this key was locked at. + largest_validated_seq_ = db_impl_->GetLastPublishedSequence(); + } + } + return s; +} + +Status WriteUnpreparedTxn::Put(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value, + const bool assume_tracked) { + return HandleWrite([&]() { + return TransactionBaseImpl::Put(column_family, key, value, assume_tracked); + }); +} + +Status WriteUnpreparedTxn::Put(ColumnFamilyHandle* column_family, + const SliceParts& key, const SliceParts& value, + const bool assume_tracked) { + return HandleWrite([&]() { + return TransactionBaseImpl::Put(column_family, key, value, assume_tracked); + }); +} + +Status WriteUnpreparedTxn::Merge(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value, + const bool assume_tracked) { + return HandleWrite([&]() { + return TransactionBaseImpl::Merge(column_family, key, value, + assume_tracked); + }); +} + +Status WriteUnpreparedTxn::Delete(ColumnFamilyHandle* column_family, + const Slice& key, const bool assume_tracked) { + return HandleWrite([&]() { + return TransactionBaseImpl::Delete(column_family, key, assume_tracked); + }); +} + +Status WriteUnpreparedTxn::Delete(ColumnFamilyHandle* column_family, + const SliceParts& key, + const bool assume_tracked) { + return HandleWrite([&]() { + return TransactionBaseImpl::Delete(column_family, key, assume_tracked); + }); +} + +Status WriteUnpreparedTxn::SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key, + const bool assume_tracked) { + return HandleWrite([&]() { + return TransactionBaseImpl::SingleDelete(column_family, key, + assume_tracked); + }); +} + +Status WriteUnpreparedTxn::SingleDelete(ColumnFamilyHandle* column_family, + const SliceParts& key, + const bool assume_tracked) { + return HandleWrite([&]() { + return TransactionBaseImpl::SingleDelete(column_family, key, + assume_tracked); + }); +} + +// WriteUnpreparedTxn::RebuildFromWriteBatch is only called on recovery. For +// WriteUnprepared, the write batches have already been written into the +// database during WAL replay, so all we have to do is just to "retrack" the key +// so that rollbacks are possible. +// +// Calling TryLock instead of TrackKey is also possible, but as an optimization, +// recovered transactions do not hold locks on their keys. 
This follows the +// implementation in PessimisticTransactionDB::Initialize where we set +// skip_concurrency_control to true. +Status WriteUnpreparedTxn::RebuildFromWriteBatch(WriteBatch* wb) { + struct TrackKeyHandler : public WriteBatch::Handler { + WriteUnpreparedTxn* txn_; + bool rollback_merge_operands_; + + TrackKeyHandler(WriteUnpreparedTxn* txn, bool rollback_merge_operands) + : txn_(txn), rollback_merge_operands_(rollback_merge_operands) {} + + Status PutCF(uint32_t cf, const Slice& key, const Slice&) override { + txn_->TrackKey(cf, key.ToString(), kMaxSequenceNumber, + false /* read_only */, true /* exclusive */); + return Status::OK(); + } + + Status DeleteCF(uint32_t cf, const Slice& key) override { + txn_->TrackKey(cf, key.ToString(), kMaxSequenceNumber, + false /* read_only */, true /* exclusive */); + return Status::OK(); + } + + Status SingleDeleteCF(uint32_t cf, const Slice& key) override { + txn_->TrackKey(cf, key.ToString(), kMaxSequenceNumber, + false /* read_only */, true /* exclusive */); + return Status::OK(); + } + + Status MergeCF(uint32_t cf, const Slice& key, const Slice&) override { + if (rollback_merge_operands_) { + txn_->TrackKey(cf, key.ToString(), kMaxSequenceNumber, + false /* read_only */, true /* exclusive */); + } + return Status::OK(); + } + + // Recovered batches do not contain 2PC markers. + Status MarkBeginPrepare(bool) override { return Status::InvalidArgument(); } + + Status MarkEndPrepare(const Slice&) override { + return Status::InvalidArgument(); + } + + Status MarkNoop(bool) override { return Status::InvalidArgument(); } + + Status MarkCommit(const Slice&) override { + return Status::InvalidArgument(); + } + + Status MarkRollback(const Slice&) override { + return Status::InvalidArgument(); + } + }; + + TrackKeyHandler handler(this, + wupt_db_->txn_db_options_.rollback_merge_operands); + return wb->Iterate(&handler); +} + +Status WriteUnpreparedTxn::MaybeFlushWriteBatchToDB() { + const bool kPrepared = true; + Status s; + if (write_batch_flush_threshold_ > 0 && + write_batch_.GetWriteBatch()->Count() > 0 && + write_batch_.GetDataSize() > + static_cast(write_batch_flush_threshold_)) { + assert(GetState() != PREPARED); + s = FlushWriteBatchToDB(!kPrepared); + } + return s; +} + +Status WriteUnpreparedTxn::FlushWriteBatchToDB(bool prepared) { + // If the current write batch contains savepoints, then some special handling + // is required so that RollbackToSavepoint can work. + // + // RollbackToSavepoint is not supported after Prepare() is called, so only do + // this for unprepared batches. + if (!prepared && unflushed_save_points_ != nullptr && + !unflushed_save_points_->empty()) { + return FlushWriteBatchWithSavePointToDB(); + } + + return FlushWriteBatchToDBInternal(prepared); +} + +Status WriteUnpreparedTxn::FlushWriteBatchToDBInternal(bool prepared) { + if (name_.empty()) { + assert(!prepared); +#ifndef NDEBUG + static std::atomic_ullong autogen_id{0}; + // To avoid changing all tests to call SetName, just autogenerate one. 
+ if (wupt_db_->txn_db_options_.autogenerate_name) { + auto s = SetName(std::string("autoxid") + + std::to_string(autogen_id.fetch_add(1))); + assert(s.ok()); + } else +#endif + { + return Status::InvalidArgument("Cannot write to DB without SetName."); + } + } + + struct UntrackedKeyHandler : public WriteBatch::Handler { + WriteUnpreparedTxn* txn_; + bool rollback_merge_operands_; + + UntrackedKeyHandler(WriteUnpreparedTxn* txn, bool rollback_merge_operands) + : txn_(txn), rollback_merge_operands_(rollback_merge_operands) {} + + Status AddUntrackedKey(uint32_t cf, const Slice& key) { + auto str = key.ToString(); + PointLockStatus lock_status = + txn_->tracked_locks_->GetPointLockStatus(cf, str); + if (!lock_status.locked) { + txn_->untracked_keys_[cf].push_back(str); + } + return Status::OK(); + } + + Status PutCF(uint32_t cf, const Slice& key, const Slice&) override { + return AddUntrackedKey(cf, key); + } + + Status DeleteCF(uint32_t cf, const Slice& key) override { + return AddUntrackedKey(cf, key); + } + + Status SingleDeleteCF(uint32_t cf, const Slice& key) override { + return AddUntrackedKey(cf, key); + } + + Status MergeCF(uint32_t cf, const Slice& key, const Slice&) override { + if (rollback_merge_operands_) { + return AddUntrackedKey(cf, key); + } + return Status::OK(); + } + + // The only expected 2PC marker is the initial Noop marker. + Status MarkNoop(bool empty_batch) override { + return empty_batch ? Status::OK() : Status::InvalidArgument(); + } + + Status MarkBeginPrepare(bool) override { return Status::InvalidArgument(); } + + Status MarkEndPrepare(const Slice&) override { + return Status::InvalidArgument(); + } + + Status MarkCommit(const Slice&) override { + return Status::InvalidArgument(); + } + + Status MarkRollback(const Slice&) override { + return Status::InvalidArgument(); + } + }; + + UntrackedKeyHandler handler( + this, wupt_db_->txn_db_options_.rollback_merge_operands); + auto s = GetWriteBatch()->GetWriteBatch()->Iterate(&handler); + assert(s.ok()); + + // TODO(lth): Reduce duplicate code with WritePrepared prepare logic. + WriteOptions write_options = write_options_; + write_options.disableWAL = false; + const bool WRITE_AFTER_COMMIT = true; + const bool first_prepare_batch = log_number_ == 0; + // MarkEndPrepare will change Noop marker to the appropriate marker. + s = WriteBatchInternal::MarkEndPrepare(GetWriteBatch()->GetWriteBatch(), + name_, !WRITE_AFTER_COMMIT, !prepared); + assert(s.ok()); + // For each duplicate key we account for a new sub-batch + prepare_batch_cnt_ = GetWriteBatch()->SubBatchCnt(); + // AddPrepared better to be called in the pre-release callback otherwise there + // is a non-zero chance of max advancing prepare_seq and readers assume the + // data as committed. + // Also having it in the PreReleaseCallback allows in-order addition of + // prepared entries to PreparedHeap and hence enables an optimization. Refer + // to SmallestUnCommittedSeq for more details. + AddPreparedCallback add_prepared_callback( + wpt_db_, db_impl_, prepare_batch_cnt_, + db_impl_->immutable_db_options().two_write_queues, first_prepare_batch); + const bool DISABLE_MEMTABLE = true; + uint64_t seq_used = kMaxSequenceNumber; + // log_number_ should refer to the oldest log containing uncommitted data + // from the current transaction. This means that if log_number_ is set, + // WriteImpl should not overwrite that value, so set log_used to nullptr if + // log_number_ is already set. 
+ s = db_impl_->WriteImpl(write_options, GetWriteBatch()->GetWriteBatch(), + /*callback*/ nullptr, &last_log_number_, + /*log ref*/ 0, !DISABLE_MEMTABLE, &seq_used, + prepare_batch_cnt_, &add_prepared_callback); + if (log_number_ == 0) { + log_number_ = last_log_number_; + } + assert(!s.ok() || seq_used != kMaxSequenceNumber); + auto prepare_seq = seq_used; + + // Only call SetId if it hasn't been set yet. + if (GetId() == 0) { + SetId(prepare_seq); + } + // unprep_seqs_ will also contain prepared seqnos since they are treated in + // the same way in the prepare/commit callbacks. See the comment on the + // definition of unprep_seqs_. + unprep_seqs_[prepare_seq] = prepare_batch_cnt_; + + // Reset transaction state. + if (!prepared) { + prepare_batch_cnt_ = 0; + const bool kClear = true; + TransactionBaseImpl::InitWriteBatch(kClear); + } + + return s; +} + +Status WriteUnpreparedTxn::FlushWriteBatchWithSavePointToDB() { + assert(unflushed_save_points_ != nullptr && + unflushed_save_points_->size() > 0); + assert(save_points_ != nullptr && save_points_->size() > 0); + assert(save_points_->size() >= unflushed_save_points_->size()); + + // Handler class for creating an unprepared batch from a savepoint. + struct SavePointBatchHandler : public WriteBatch::Handler { + WriteBatchWithIndex* wb_; + const std::map& handles_; + + SavePointBatchHandler( + WriteBatchWithIndex* wb, + const std::map& handles) + : wb_(wb), handles_(handles) {} + + Status PutCF(uint32_t cf, const Slice& key, const Slice& value) override { + return wb_->Put(handles_.at(cf), key, value); + } + + Status DeleteCF(uint32_t cf, const Slice& key) override { + return wb_->Delete(handles_.at(cf), key); + } + + Status SingleDeleteCF(uint32_t cf, const Slice& key) override { + return wb_->SingleDelete(handles_.at(cf), key); + } + + Status MergeCF(uint32_t cf, const Slice& key, const Slice& value) override { + return wb_->Merge(handles_.at(cf), key, value); + } + + // The only expected 2PC marker is the initial Noop marker. + Status MarkNoop(bool empty_batch) override { + return empty_batch ? Status::OK() : Status::InvalidArgument(); + } + + Status MarkBeginPrepare(bool) override { return Status::InvalidArgument(); } + + Status MarkEndPrepare(const Slice&) override { + return Status::InvalidArgument(); + } + + Status MarkCommit(const Slice&) override { + return Status::InvalidArgument(); + } + + Status MarkRollback(const Slice&) override { + return Status::InvalidArgument(); + } + }; + + // The comparator of the default cf is passed in, similar to the + // initialization of TransactionBaseImpl::write_batch_. This comparator is + // only used if the write batch encounters an invalid cf id, and falls back to + // this comparator. + WriteBatchWithIndex wb(wpt_db_->DefaultColumnFamily()->GetComparator(), 0, + true, 0, write_options_.protection_bytes_per_key); + // Swap with write_batch_ so that wb contains the complete write batch. The + // actual write batch that will be flushed to DB will be built in + // write_batch_, and will be read by FlushWriteBatchToDBInternal. + std::swap(wb, write_batch_); + TransactionBaseImpl::InitWriteBatch(); + + size_t prev_boundary = WriteBatchInternal::kHeader; + const bool kPrepared = true; + for (size_t i = 0; i < unflushed_save_points_->size() + 1; i++) { + bool trailing_batch = i == unflushed_save_points_->size(); + SavePointBatchHandler sp_handler(&write_batch_, + *wupt_db_->GetCFHandleMap().get()); + size_t curr_boundary = trailing_batch ? 
wb.GetWriteBatch()->GetDataSize() + : (*unflushed_save_points_)[i]; + + // Construct the partial write batch up to the savepoint. + // + // Theoretically, a memcpy between the write batches should be sufficient + // since the rewriting into the batch should produce the exact same byte + // representation. Rebuilding the WriteBatchWithIndex index is still + // necessary though, and would imply doing two passes over the batch though. + Status s = WriteBatchInternal::Iterate(wb.GetWriteBatch(), &sp_handler, + prev_boundary, curr_boundary); + if (!s.ok()) { + return s; + } + + if (write_batch_.GetWriteBatch()->Count() > 0) { + // Flush the write batch. + s = FlushWriteBatchToDBInternal(!kPrepared); + if (!s.ok()) { + return s; + } + } + + if (!trailing_batch) { + if (flushed_save_points_ == nullptr) { + flushed_save_points_.reset( + new autovector()); + } + flushed_save_points_->emplace_back( + unprep_seqs_, new ManagedSnapshot(db_impl_, wupt_db_->GetSnapshot())); + } + + prev_boundary = curr_boundary; + const bool kClear = true; + TransactionBaseImpl::InitWriteBatch(kClear); + } + + unflushed_save_points_->clear(); + return Status::OK(); +} + +Status WriteUnpreparedTxn::PrepareInternal() { + const bool kPrepared = true; + return FlushWriteBatchToDB(kPrepared); +} + +Status WriteUnpreparedTxn::CommitWithoutPrepareInternal() { + if (unprep_seqs_.empty()) { + assert(log_number_ == 0); + assert(GetId() == 0); + return WritePreparedTxn::CommitWithoutPrepareInternal(); + } + + // TODO(lth): We should optimize commit without prepare to not perform + // a prepare under the hood. + auto s = PrepareInternal(); + if (!s.ok()) { + return s; + } + return CommitInternal(); +} + +Status WriteUnpreparedTxn::CommitInternal() { + // TODO(lth): Reduce duplicate code with WritePrepared commit logic. + + // We take the commit-time batch and append the Commit marker. The Memtable + // will ignore the Commit marker in non-recovery mode + WriteBatch* working_batch = GetCommitTimeWriteBatch(); + const bool empty = working_batch->Count() == 0; + auto s = WriteBatchInternal::MarkCommit(working_batch, name_); + assert(s.ok()); + + const bool for_recovery = use_only_the_last_commit_time_batch_for_recovery_; + if (!empty) { + // When not writing to memtable, we can still cache the latest write batch. 
+ // The cached batch will be written to memtable in WriteRecoverableState + // during FlushMemTable + if (for_recovery) { + WriteBatchInternal::SetAsLatestPersistentState(working_batch); + } else { + return Status::InvalidArgument( + "Commit-time-batch can only be used if " + "use_only_the_last_commit_time_batch_for_recovery is true"); + } + } + + const bool includes_data = !empty && !for_recovery; + size_t commit_batch_cnt = 0; + if (UNLIKELY(includes_data)) { + ROCKS_LOG_WARN(db_impl_->immutable_db_options().info_log, + "Duplicate key overhead"); + SubBatchCounter counter(*wpt_db_->GetCFComparatorMap()); + s = working_batch->Iterate(&counter); + assert(s.ok()); + commit_batch_cnt = counter.BatchCount(); + } + const bool disable_memtable = !includes_data; + const bool do_one_write = + !db_impl_->immutable_db_options().two_write_queues || disable_memtable; + + WriteUnpreparedCommitEntryPreReleaseCallback update_commit_map( + wpt_db_, db_impl_, unprep_seqs_, commit_batch_cnt); + const bool kFirstPrepareBatch = true; + AddPreparedCallback add_prepared_callback( + wpt_db_, db_impl_, commit_batch_cnt, + db_impl_->immutable_db_options().two_write_queues, !kFirstPrepareBatch); + PreReleaseCallback* pre_release_callback; + if (do_one_write) { + pre_release_callback = &update_commit_map; + } else { + pre_release_callback = &add_prepared_callback; + } + uint64_t seq_used = kMaxSequenceNumber; + // Since the prepared batch is directly written to memtable, there is + // already a connection between the memtable and its WAL, so there is no + // need to redundantly reference the log that contains the prepared data. + const uint64_t zero_log_number = 0ull; + size_t batch_cnt = UNLIKELY(commit_batch_cnt) ? commit_batch_cnt : 1; + s = db_impl_->WriteImpl(write_options_, working_batch, nullptr, nullptr, + zero_log_number, disable_memtable, &seq_used, + batch_cnt, pre_release_callback); + assert(!s.ok() || seq_used != kMaxSequenceNumber); + const SequenceNumber commit_batch_seq = seq_used; + if (LIKELY(do_one_write || !s.ok())) { + if (LIKELY(s.ok())) { + // Note RemovePrepared should be called after WriteImpl that publishsed + // the seq. Otherwise SmallestUnCommittedSeq optimization breaks. + for (const auto& seq : unprep_seqs_) { + wpt_db_->RemovePrepared(seq.first, seq.second); + } + } + if (UNLIKELY(!do_one_write)) { + wpt_db_->RemovePrepared(commit_batch_seq, commit_batch_cnt); + } + unprep_seqs_.clear(); + flushed_save_points_.reset(nullptr); + unflushed_save_points_.reset(nullptr); + return s; + } // else do the 2nd write to publish seq + + // Populate unprep_seqs_ with commit_batch_seq, since we treat data in the + // commit write batch as just another "unprepared" batch. This will also + // update the unprep_seqs_ in the update_commit_map callback. + unprep_seqs_[commit_batch_seq] = commit_batch_cnt; + WriteUnpreparedCommitEntryPreReleaseCallback + update_commit_map_with_commit_batch(wpt_db_, db_impl_, unprep_seqs_, 0); + + // Note: the 2nd write comes with a performance penality. So if we have too + // many of commits accompanied with ComitTimeWriteBatch and yet we cannot + // enable use_only_the_last_commit_time_batch_for_recovery_ optimization, + // two_write_queues should be disabled to avoid many additional writes here. 
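+  //
+  // [Illustrative sketch, not from the RocksDB sources] When the second write
+  // is needed, ordering matters: the commit entries are added and the
+  // sequence is published by the empty batch below, and only afterwards is
+  // RemovePrepared called, so that SmallestUnCommittedSeq never skips past a
+  // commit that has not been published yet. The disabled block below is a toy
+  // model of that ordering; all names in it are hypothetical.
+#if 0  // illustrative sketch, never compiled
+  #include <cstdint>
+  #include <map>
+  #include <set>
+  struct ToyCommitState {
+    std::map<uint64_t, uint64_t> commit_map;  // prepare_seq -> commit_seq
+    std::set<uint64_t> prepared;              // outstanding prepared seqs
+    uint64_t last_published = 0;
+    void CommitBatch(uint64_t prepare_seq, uint64_t commit_seq) {
+      commit_map[prepare_seq] = commit_seq;  // 1) record the commit
+      last_published = commit_seq;           // 2) publish the sequence
+      prepared.erase(prepare_seq);           // 3) only now drop the prepared entry
+    }
+  };
+#endif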
+ + // Update commit map only from the 2nd queue + WriteBatch empty_batch; + s = empty_batch.PutLogData(Slice()); + assert(s.ok()); + // In the absence of Prepare markers, use Noop as a batch separator + s = WriteBatchInternal::InsertNoop(&empty_batch); + assert(s.ok()); + const bool DISABLE_MEMTABLE = true; + const size_t ONE_BATCH = 1; + const uint64_t NO_REF_LOG = 0; + s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr, + NO_REF_LOG, DISABLE_MEMTABLE, &seq_used, ONE_BATCH, + &update_commit_map_with_commit_batch); + assert(!s.ok() || seq_used != kMaxSequenceNumber); + // Note RemovePrepared should be called after WriteImpl that publishsed the + // seq. Otherwise SmallestUnCommittedSeq optimization breaks. + for (const auto& seq : unprep_seqs_) { + wpt_db_->RemovePrepared(seq.first, seq.second); + } + unprep_seqs_.clear(); + flushed_save_points_.reset(nullptr); + unflushed_save_points_.reset(nullptr); + return s; +} + +Status WriteUnpreparedTxn::WriteRollbackKeys( + const LockTracker& lock_tracker, WriteBatchWithIndex* rollback_batch, + ReadCallback* callback, const ReadOptions& roptions) { + // This assertion can be removed when range lock is supported. + assert(lock_tracker.IsPointLockSupported()); + const auto& cf_map = *wupt_db_->GetCFHandleMap(); + auto WriteRollbackKey = [&](const std::string& key, uint32_t cfid) { + const auto& cf_handle = cf_map.at(cfid); + PinnableSlice pinnable_val; + bool not_used; + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = cf_handle; + get_impl_options.value = &pinnable_val; + get_impl_options.value_found = ¬_used; + get_impl_options.callback = callback; + auto s = db_impl_->GetImpl(roptions, key, get_impl_options); + + if (s.ok()) { + s = rollback_batch->Put(cf_handle, key, pinnable_val); + assert(s.ok()); + } else if (s.IsNotFound()) { + if (wupt_db_->ShouldRollbackWithSingleDelete(cf_handle, key)) { + s = rollback_batch->SingleDelete(cf_handle, key); + } else { + s = rollback_batch->Delete(cf_handle, key); + } + assert(s.ok()); + } else { + return s; + } + + return Status::OK(); + }; + + std::unique_ptr cf_it( + lock_tracker.GetColumnFamilyIterator()); + assert(cf_it != nullptr); + while (cf_it->HasNext()) { + ColumnFamilyId cf = cf_it->Next(); + std::unique_ptr key_it( + lock_tracker.GetKeyIterator(cf)); + assert(key_it != nullptr); + while (key_it->HasNext()) { + const std::string& key = key_it->Next(); + auto s = WriteRollbackKey(key, cf); + if (!s.ok()) { + return s; + } + } + } + + for (const auto& cfkey : untracked_keys_) { + const auto cfid = cfkey.first; + const auto& keys = cfkey.second; + for (const auto& key : keys) { + auto s = WriteRollbackKey(key, cfid); + if (!s.ok()) { + return s; + } + } + } + + return Status::OK(); +} + +Status WriteUnpreparedTxn::RollbackInternal() { + // TODO(lth): Reduce duplicate code with WritePrepared rollback logic. + WriteBatchWithIndex rollback_batch( + wpt_db_->DefaultColumnFamily()->GetComparator(), 0, true, 0, + write_options_.protection_bytes_per_key); + assert(GetId() != kMaxSequenceNumber); + assert(GetId() > 0); + Status s; + auto read_at_seq = kMaxSequenceNumber; + ReadOptions roptions; + // to prevent callback's seq to be overrriden inside DBImpk::Get + roptions.snapshot = wpt_db_->GetMaxSnapshot(); + // Note that we do not use WriteUnpreparedTxnReadCallback because we do not + // need to read our own writes when reading prior versions of the key for + // rollback. 
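+  //
+  // [Illustrative sketch, not from the RocksDB sources] WriteRollbackKeys
+  // above builds the rollback batch from "before images": for every key the
+  // transaction wrote, the pre-transaction value is written back, or the key
+  // is deleted if it did not exist before. The disabled block below restates
+  // that idea on a plain std map; all names in it are hypothetical.
+#if 0  // illustrative sketch, never compiled
+  #include <map>
+  #include <string>
+  #include <utility>
+  #include <vector>
+  // Returns key -> (had a prior value?, prior value). A "false" entry means
+  // the rollback must delete the key instead of restoring a value.
+  inline std::map<std::string, std::pair<bool, std::string>> BuildRollbackBatch(
+      const std::map<std::string, std::string>& committed_db,
+      const std::vector<std::string>& written_keys) {
+    std::map<std::string, std::pair<bool, std::string>> rollback;
+    for (const auto& key : written_keys) {
+      auto it = committed_db.find(key);
+      if (it != committed_db.end()) {
+        rollback[key] = {true, it->second};
+      } else {
+        rollback[key] = {false, std::string()};
+      }
+    }
+    return rollback;
+  }
+#endif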
+ WritePreparedTxnReadCallback callback(wpt_db_, read_at_seq); + // TODO(lth): We write rollback batch all in a single batch here, but this + // should be subdivded into multiple batches as well. In phase 2, when key + // sets are read from WAL, this will happen naturally. + s = WriteRollbackKeys(*tracked_locks_, &rollback_batch, &callback, roptions); + if (!s.ok()) { + return s; + } + + // The Rollback marker will be used as a batch separator + s = WriteBatchInternal::MarkRollback(rollback_batch.GetWriteBatch(), name_); + assert(s.ok()); + bool do_one_write = !db_impl_->immutable_db_options().two_write_queues; + const bool DISABLE_MEMTABLE = true; + const uint64_t NO_REF_LOG = 0; + uint64_t seq_used = kMaxSequenceNumber; + // Rollback batch may contain duplicate keys, because tracked_keys_ is not + // comparator aware. + auto rollback_batch_cnt = rollback_batch.SubBatchCnt(); + // We commit the rolled back prepared batches. Although this is + // counter-intuitive, i) it is safe to do so, since the prepared batches are + // already canceled out by the rollback batch, ii) adding the commit entry to + // CommitCache will allow us to benefit from the existing mechanism in + // CommitCache that keeps an entry evicted due to max advance and yet overlaps + // with a live snapshot around so that the live snapshot properly skips the + // entry even if its prepare seq is lower than max_evicted_seq_. + // + // TODO(lth): RollbackInternal is conceptually very similar to + // CommitInternal, with the rollback batch simply taking on the role of + // CommitTimeWriteBatch. We should be able to merge the two code paths. + WriteUnpreparedCommitEntryPreReleaseCallback update_commit_map( + wpt_db_, db_impl_, unprep_seqs_, rollback_batch_cnt); + // Note: the rollback batch does not need AddPrepared since it is written to + // DB in one shot. min_uncommitted still works since it requires capturing + // data that is written to DB but not yet committed, while the rollback + // batch commits with PreReleaseCallback. + s = db_impl_->WriteImpl(write_options_, rollback_batch.GetWriteBatch(), + nullptr, nullptr, NO_REF_LOG, !DISABLE_MEMTABLE, + &seq_used, rollback_batch_cnt, + do_one_write ? &update_commit_map : nullptr); + assert(!s.ok() || seq_used != kMaxSequenceNumber); + if (!s.ok()) { + return s; + } + if (do_one_write) { + for (const auto& seq : unprep_seqs_) { + wpt_db_->RemovePrepared(seq.first, seq.second); + } + unprep_seqs_.clear(); + flushed_save_points_.reset(nullptr); + unflushed_save_points_.reset(nullptr); + return s; + } // else do the 2nd write for commit + + uint64_t& prepare_seq = seq_used; + // Populate unprep_seqs_ with rollback_batch_cnt, since we treat data in the + // rollback write batch as just another "unprepared" batch. This will also + // update the unprep_seqs_ in the update_commit_map callback. 
+ unprep_seqs_[prepare_seq] = rollback_batch_cnt; + WriteUnpreparedCommitEntryPreReleaseCallback + update_commit_map_with_rollback_batch(wpt_db_, db_impl_, unprep_seqs_, 0); + + ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log, + "RollbackInternal 2nd write prepare_seq: %" PRIu64, + prepare_seq); + WriteBatch empty_batch; + const size_t ONE_BATCH = 1; + s = empty_batch.PutLogData(Slice()); + assert(s.ok()); + // In the absence of Prepare markers, use Noop as a batch separator + s = WriteBatchInternal::InsertNoop(&empty_batch); + assert(s.ok()); + s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr, + NO_REF_LOG, DISABLE_MEMTABLE, &seq_used, ONE_BATCH, + &update_commit_map_with_rollback_batch); + assert(!s.ok() || seq_used != kMaxSequenceNumber); + // Mark the txn as rolled back + if (s.ok()) { + for (const auto& seq : unprep_seqs_) { + wpt_db_->RemovePrepared(seq.first, seq.second); + } + } + + unprep_seqs_.clear(); + flushed_save_points_.reset(nullptr); + unflushed_save_points_.reset(nullptr); + return s; +} + +void WriteUnpreparedTxn::Clear() { + if (!recovered_txn_) { + txn_db_impl_->UnLock(this, *tracked_locks_); + } + unprep_seqs_.clear(); + flushed_save_points_.reset(nullptr); + unflushed_save_points_.reset(nullptr); + recovered_txn_ = false; + largest_validated_seq_ = 0; + for (auto& it : active_iterators_) { + auto bdit = static_cast(it); + bdit->Invalidate(Status::InvalidArgument( + "Cannot use iterator after transaction has finished")); + } + active_iterators_.clear(); + untracked_keys_.clear(); + TransactionBaseImpl::Clear(); +} + +void WriteUnpreparedTxn::SetSavePoint() { + assert((unflushed_save_points_ ? unflushed_save_points_->size() : 0) + + (flushed_save_points_ ? flushed_save_points_->size() : 0) == + (save_points_ ? save_points_->size() : 0)); + PessimisticTransaction::SetSavePoint(); + if (unflushed_save_points_ == nullptr) { + unflushed_save_points_.reset(new autovector()); + } + unflushed_save_points_->push_back(write_batch_.GetDataSize()); +} + +Status WriteUnpreparedTxn::RollbackToSavePoint() { + assert((unflushed_save_points_ ? unflushed_save_points_->size() : 0) + + (flushed_save_points_ ? flushed_save_points_->size() : 0) == + (save_points_ ? 
save_points_->size() : 0)); + if (unflushed_save_points_ != nullptr && unflushed_save_points_->size() > 0) { + Status s = PessimisticTransaction::RollbackToSavePoint(); + assert(!s.IsNotFound()); + unflushed_save_points_->pop_back(); + return s; + } + + if (flushed_save_points_ != nullptr && !flushed_save_points_->empty()) { + return RollbackToSavePointInternal(); + } + + return Status::NotFound(); +} + +Status WriteUnpreparedTxn::RollbackToSavePointInternal() { + Status s; + + const bool kClear = true; + TransactionBaseImpl::InitWriteBatch(kClear); + + assert(flushed_save_points_->size() > 0); + WriteUnpreparedTxn::SavePoint& top = flushed_save_points_->back(); + + assert(save_points_ != nullptr && save_points_->size() > 0); + const LockTracker& tracked_keys = *save_points_->top().new_locks_; + + ReadOptions roptions; + roptions.snapshot = top.snapshot_->snapshot(); + SequenceNumber min_uncommitted = + static_cast_with_check(roptions.snapshot) + ->min_uncommitted_; + SequenceNumber snap_seq = roptions.snapshot->GetSequenceNumber(); + WriteUnpreparedTxnReadCallback callback(wupt_db_, snap_seq, min_uncommitted, + top.unprep_seqs_, + kBackedByDBSnapshot); + s = WriteRollbackKeys(tracked_keys, &write_batch_, &callback, roptions); + if (!s.ok()) { + return s; + } + + const bool kPrepared = true; + s = FlushWriteBatchToDBInternal(!kPrepared); + if (!s.ok()) { + return s; + } + + // PessimisticTransaction::RollbackToSavePoint will call also call + // RollbackToSavepoint on write_batch_. However, write_batch_ is empty and has + // no savepoints because this savepoint has already been flushed. Work around + // this by setting a fake savepoint. + write_batch_.SetSavePoint(); + s = PessimisticTransaction::RollbackToSavePoint(); + assert(s.ok()); + if (!s.ok()) { + return s; + } + + flushed_save_points_->pop_back(); + return s; +} + +Status WriteUnpreparedTxn::PopSavePoint() { + assert((unflushed_save_points_ ? unflushed_save_points_->size() : 0) + + (flushed_save_points_ ? flushed_save_points_->size() : 0) == + (save_points_ ? save_points_->size() : 0)); + if (unflushed_save_points_ != nullptr && unflushed_save_points_->size() > 0) { + Status s = PessimisticTransaction::PopSavePoint(); + assert(!s.IsNotFound()); + unflushed_save_points_->pop_back(); + return s; + } + + if (flushed_save_points_ != nullptr && !flushed_save_points_->empty()) { + // PessimisticTransaction::PopSavePoint will call also call PopSavePoint on + // write_batch_. However, write_batch_ is empty and has no savepoints + // because this savepoint has already been flushed. Work around this by + // setting a fake savepoint. 
+ write_batch_.SetSavePoint(); + Status s = PessimisticTransaction::PopSavePoint(); + assert(!s.IsNotFound()); + flushed_save_points_->pop_back(); + return s; + } + + return Status::NotFound(); +} + +void WriteUnpreparedTxn::MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool sorted_input) { + SequenceNumber min_uncommitted, snap_seq; + const SnapshotBackup backed_by_snapshot = + wupt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); + WriteUnpreparedTxnReadCallback callback(wupt_db_, snap_seq, min_uncommitted, + unprep_seqs_, backed_by_snapshot); + write_batch_.MultiGetFromBatchAndDB(db_, options, column_family, num_keys, + keys, values, statuses, sorted_input, + &callback); + if (UNLIKELY(!callback.valid() || + !wupt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) { + wupt_db_->WPRecordTick(TXN_GET_TRY_AGAIN); + for (size_t i = 0; i < num_keys; i++) { + statuses[i] = Status::TryAgain(); + } + } +} + +Status WriteUnpreparedTxn::Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value) { + SequenceNumber min_uncommitted, snap_seq; + const SnapshotBackup backed_by_snapshot = + wupt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); + WriteUnpreparedTxnReadCallback callback(wupt_db_, snap_seq, min_uncommitted, + unprep_seqs_, backed_by_snapshot); + auto res = write_batch_.GetFromBatchAndDB(db_, options, column_family, key, + value, &callback); + if (LIKELY(callback.valid() && + wupt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) { + return res; + } else { + res.PermitUncheckedError(); + wupt_db_->WPRecordTick(TXN_GET_TRY_AGAIN); + return Status::TryAgain(); + } +} + +namespace { +static void CleanupWriteUnpreparedWBWIIterator(void* arg1, void* arg2) { + auto txn = reinterpret_cast(arg1); + auto iter = reinterpret_cast(arg2); + txn->RemoveActiveIterator(iter); +} +} // anonymous namespace + +Iterator* WriteUnpreparedTxn::GetIterator(const ReadOptions& options) { + return GetIterator(options, wupt_db_->DefaultColumnFamily()); +} + +Iterator* WriteUnpreparedTxn::GetIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family) { + // Make sure to get iterator from WriteUnprepareTxnDB, not the root db. + Iterator* db_iter = wupt_db_->NewIterator(options, column_family, this); + assert(db_iter); + + auto iter = write_batch_.NewIteratorWithBase(column_family, db_iter); + active_iterators_.push_back(iter); + iter->RegisterCleanup(CleanupWriteUnpreparedWBWIIterator, this, iter); + return iter; +} + +Status WriteUnpreparedTxn::ValidateSnapshot(ColumnFamilyHandle* column_family, + const Slice& key, + SequenceNumber* tracked_at_seq) { + // TODO(lth): Reduce duplicate code with WritePrepared ValidateSnapshot logic. + assert(snapshot_); + + SequenceNumber min_uncommitted = + static_cast_with_check(snapshot_.get()) + ->min_uncommitted_; + SequenceNumber snap_seq = snapshot_->GetSequenceNumber(); + // tracked_at_seq is either max or the last snapshot with which this key was + // trackeed so there is no need to apply the IsInSnapshot to this comparison + // here as tracked_at_seq is not a prepare seq. + if (*tracked_at_seq <= snap_seq) { + // If the key has been previous validated at a sequence number earlier + // than the curent snapshot's sequence number, we already know it has not + // been modified. 
+    return Status::OK();
+  }
+
+  *tracked_at_seq = snap_seq;
+
+  ColumnFamilyHandle* cfh =
+      column_family ? column_family : db_impl_->DefaultColumnFamily();
+
+  WriteUnpreparedTxnReadCallback snap_checker(
+      wupt_db_, snap_seq, min_uncommitted, unprep_seqs_, kBackedByDBSnapshot);
+  // TODO(yanqin): Support user-defined timestamp.
+  return TransactionUtil::CheckKeyForConflicts(
+      db_impl_, cfh, key.ToString(), snap_seq, /*ts=*/nullptr,
+      false /* cache_only */, &snap_checker, min_uncommitted);
+}
+
+const std::map<SequenceNumber, size_t>&
+WriteUnpreparedTxn::GetUnpreparedSequenceNumbers() {
+  return unprep_seqs_;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_unprepared_txn.h b/src/rocksdb/utilities/transactions/write_unprepared_txn.h
new file mode 100644
index 000000000..5a3227f4e
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/write_unprepared_txn.h
@@ -0,0 +1,341 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include
+
+#include "utilities/transactions/write_prepared_txn.h"
+#include "utilities/transactions/write_unprepared_txn_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteUnpreparedTxnDB;
+class WriteUnpreparedTxn;
+
+// WriteUnprepared transactions need to be able to read their own uncommitted
+// writes, and supporting this requires some careful consideration. Because
+// writes in the current transaction may already be flushed to the DB, we
+// cannot rely on the contents of WriteBatchWithIndex to determine whether a
+// key should be visible or not, so we have to remember to check the DB for
+// any uncommitted keys that should be visible to us. First, we will need to
+// change the seek-to-snapshot logic to seek to
+// max_visible_seq = max(snap_seq, max_unprep_seq). Any key greater than
+// max_visible_seq should not be visible, because it cannot be unprepared by
+// the current transaction and it is not in its snapshot.
+//
+// When we seek to max_visible_seq, one of these cases will happen:
+// 1. We hit an unprepared key from the current transaction.
+// 2. We hit an unprepared key from another transaction.
+// 3. We hit a committed key with snap_seq < seq < max_unprep_seq.
+// 4. We hit a committed key with seq <= snap_seq.
+//
+// IsVisibleFullCheck handles all cases correctly.
+//
+// Other notes:
+// Note that max_visible_seq is only calculated once at iterator construction
+// time, meaning that if the same transaction is adding more unprep seqs
+// through writes during iteration, these newer writes may not be visible.
+// This is not a problem for MySQL though, because it avoids modifying the
+// index as it is scanning through it, to avoid the Halloween Problem.
+// Instead, it scans the index once up front, and modifies based on a
+// temporary copy.
+//
+// In DBIter, there is a "reseek" optimization if the iterator skips over too
+// many keys. However, this assumes that the reseek seeks exactly to the
+// required key. In write unprepared, even after seeking directly to
+// max_visible_seq, some iteration may be required before hitting a visible
+// key, and special precautions must be taken to avoid performing another
+// reseek, leading to an infinite loop.
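+//
+// Worked example (illustrative only; the numbers are not from the upstream
+// sources): suppose the snapshot is at snap_seq = 10 and this transaction
+// has already flushed one unprepared batch of 3 sub-batches starting at
+// seq 12, i.e. unprep_seqs_ = {{12, 3}}. The largest unprepared seq is then
+// 12 + 3 - 1 = 14, so CalcMaxVisibleSeq returns max(10, 14) = 14 and the
+// iterator seeks at max_visible_seq = 14. Keys at seqs 12..14 are visible as
+// this transaction's own unprepared writes (case 1 above), while a committed
+// key at seq 11 from another transaction falls under case 3 and is correctly
+// rejected because it is above snap_seq.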
+// +class WriteUnpreparedTxnReadCallback : public ReadCallback { + public: + WriteUnpreparedTxnReadCallback( + WritePreparedTxnDB* db, SequenceNumber snapshot, + SequenceNumber min_uncommitted, + const std::map& unprep_seqs, + SnapshotBackup backed_by_snapshot) + // Pass our last uncommitted seq as the snapshot to the parent class to + // ensure that the parent will not prematurely filter out own writes. We + // will do the exact comparison against snapshots in IsVisibleFullCheck + // override. + : ReadCallback(CalcMaxVisibleSeq(unprep_seqs, snapshot), min_uncommitted), + db_(db), + unprep_seqs_(unprep_seqs), + wup_snapshot_(snapshot), + backed_by_snapshot_(backed_by_snapshot) { + (void)backed_by_snapshot_; // to silence unused private field warning + } + + virtual ~WriteUnpreparedTxnReadCallback() { + // If it is not backed by snapshot, the caller must check validity + assert(valid_checked_ || backed_by_snapshot_ == kBackedByDBSnapshot); + } + + virtual bool IsVisibleFullCheck(SequenceNumber seq) override; + + inline bool valid() { + valid_checked_ = true; + return snap_released_ == false; + } + + void Refresh(SequenceNumber seq) override { + max_visible_seq_ = std::max(max_visible_seq_, seq); + wup_snapshot_ = seq; + } + + static SequenceNumber CalcMaxVisibleSeq( + const std::map& unprep_seqs, + SequenceNumber snapshot_seq) { + SequenceNumber max_unprepared = 0; + if (unprep_seqs.size()) { + max_unprepared = + unprep_seqs.rbegin()->first + unprep_seqs.rbegin()->second - 1; + } + return std::max(max_unprepared, snapshot_seq); + } + + private: + WritePreparedTxnDB* db_; + const std::map& unprep_seqs_; + SequenceNumber wup_snapshot_; + // Whether max_visible_seq_ is backed by a snapshot + const SnapshotBackup backed_by_snapshot_; + bool snap_released_ = false; + // Safety check to ensure that the caller has checked invalid statuses + bool valid_checked_ = false; +}; + +class WriteUnpreparedTxn : public WritePreparedTxn { + public: + WriteUnpreparedTxn(WriteUnpreparedTxnDB* db, + const WriteOptions& write_options, + const TransactionOptions& txn_options); + + virtual ~WriteUnpreparedTxn(); + + using TransactionBaseImpl::Put; + virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, + const bool assume_tracked = false) override; + virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value, + const bool assume_tracked = false) override; + + using TransactionBaseImpl::Merge; + virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, + const bool assume_tracked = false) override; + + using TransactionBaseImpl::Delete; + virtual Status Delete(ColumnFamilyHandle* column_family, const Slice& key, + const bool assume_tracked = false) override; + virtual Status Delete(ColumnFamilyHandle* column_family, + const SliceParts& key, + const bool assume_tracked = false) override; + + using TransactionBaseImpl::SingleDelete; + virtual Status SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key, + const bool assume_tracked = false) override; + virtual Status SingleDelete(ColumnFamilyHandle* column_family, + const SliceParts& key, + const bool assume_tracked = false) override; + + // In WriteUnprepared, untracked writes will break snapshot validation logic. + // Snapshot validation will only check the largest sequence number of a key to + // see if it was committed or not. However, an untracked unprepared write will + // hide smaller committed sequence numbers. 
+ // + // TODO(lth): Investigate whether it is worth having snapshot validation + // validate all values larger than snap_seq. Otherwise, we should return + // Status::NotSupported for untracked writes. + + virtual Status RebuildFromWriteBatch(WriteBatch*) override; + + virtual uint64_t GetLastLogNumber() const override { + return last_log_number_; + } + + void RemoveActiveIterator(Iterator* iter) { + active_iterators_.erase( + std::remove(active_iterators_.begin(), active_iterators_.end(), iter), + active_iterators_.end()); + } + + protected: + void Initialize(const TransactionOptions& txn_options) override; + + Status PrepareInternal() override; + + Status CommitWithoutPrepareInternal() override; + Status CommitInternal() override; + + Status RollbackInternal() override; + + void Clear() override; + + void SetSavePoint() override; + Status RollbackToSavePoint() override; + Status PopSavePoint() override; + + // Get and GetIterator needs to be overridden so that a ReadCallback to + // handle read-your-own-write is used. + using Transaction::Get; + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) override; + + using Transaction::MultiGet; + virtual void MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool sorted_input = false) override; + + using Transaction::GetIterator; + virtual Iterator* GetIterator(const ReadOptions& options) override; + virtual Iterator* GetIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family) override; + + virtual Status ValidateSnapshot(ColumnFamilyHandle* column_family, + const Slice& key, + SequenceNumber* tracked_at_seq) override; + + private: + friend class WriteUnpreparedTransactionTest_ReadYourOwnWrite_Test; + friend class WriteUnpreparedTransactionTest_RecoveryTest_Test; + friend class WriteUnpreparedTransactionTest_UnpreparedBatch_Test; + friend class WriteUnpreparedTxnDB; + + const std::map& GetUnpreparedSequenceNumbers(); + Status WriteRollbackKeys(const LockTracker& tracked_keys, + WriteBatchWithIndex* rollback_batch, + ReadCallback* callback, const ReadOptions& roptions); + + Status MaybeFlushWriteBatchToDB(); + Status FlushWriteBatchToDB(bool prepared); + Status FlushWriteBatchToDBInternal(bool prepared); + Status FlushWriteBatchWithSavePointToDB(); + Status RollbackToSavePointInternal(); + Status HandleWrite(std::function do_write); + + // For write unprepared, we check on every writebatch append to see if + // write_batch_flush_threshold_ has been exceeded, and then call + // FlushWriteBatchToDB if so. This logic is encapsulated in + // MaybeFlushWriteBatchToDB. + int64_t write_batch_flush_threshold_; + WriteUnpreparedTxnDB* wupt_db_; + + // Ordered list of unprep_seq sequence numbers that we have already written + // to DB. + // + // This maps unprep_seq => prepare_batch_cnt for each unprepared batch + // written by this transaction. + // + // Note that this contains both prepared and unprepared batches, since they + // are treated similarily in prepare heap/commit map, so it simplifies the + // commit callbacks. + std::map unprep_seqs_; + + uint64_t last_log_number_; + + // Recovered transactions have tracked_keys_ populated, but are not actually + // locked for efficiency reasons. For recovered transactions, skip unlocking + // keys when transaction ends. 
+  bool recovered_txn_;
+
+  // Track the largest sequence number at which we performed snapshot
+  // validation. If snapshot validation was skipped because no snapshot was
+  // set, then this is set to GetLastPublishedSequence. This value is useful
+  // because it means that for keys that have unprepared seqnos, we can
+  // guarantee that no committed keys by other transactions can exist between
+  // largest_validated_seq_ and max_unprep_seq. See
+  // WriteUnpreparedTxnDB::NewIterator for an explanation for why this is
+  // necessary for iterator Prev().
+  //
+  // Currently this value only increases during the lifetime of a transaction,
+  // but in some cases, we should be able to restore the previously largest
+  // value when calling RollbackToSavepoint.
+  SequenceNumber largest_validated_seq_;
+
+  struct SavePoint {
+    // Record of unprep_seqs_ at this savepoint. The set of unprep_seq is
+    // used during RollbackToSavepoint to determine visibility when restoring
+    // old values.
+    //
+    // TODO(lth): Since all unprep_seqs_ sets further down the stack must be
+    // subsets, this can potentially be deduplicated by just storing set
+    // difference. Investigate if this is worth it.
+    std::map<SequenceNumber, size_t> unprep_seqs_;
+
+    // This snapshot will be used to read keys at this savepoint if we call
+    // RollbackToSavePoint.
+    std::unique_ptr<ManagedSnapshot> snapshot_;
+
+    SavePoint(const std::map<SequenceNumber, size_t>& seqs,
+              ManagedSnapshot* snapshot)
+        : unprep_seqs_(seqs), snapshot_(snapshot){};
+  };
+
+  // We have 3 data structures holding savepoint information:
+  // 1. TransactionBaseImpl::save_points_
+  // 2. WriteUnpreparedTxn::flushed_save_points_
+  // 3. WriteUnpreparedTxn::unflushed_save_points_
+  //
+  // TransactionBaseImpl::save_points_ holds information about all write
+  // batches, including the current in-memory write_batch_, or unprepared
+  // batches that have been written out. Its responsibility is just to track
+  // which keys have been modified in every savepoint.
+  //
+  // WriteUnpreparedTxn::flushed_save_points_ holds information about
+  // savepoints set on unprepared batches that have already been flushed. It
+  // holds the snapshot and unprep_seqs at that savepoint, so that the
+  // rollback process can determine which keys were visible at that point in
+  // time.
+  //
+  // WriteUnpreparedTxn::unflushed_save_points_ holds information about
+  // savepoints on the current in-memory write_batch_. It simply records the
+  // size of the write batch at every savepoint.
+  //
+  // TODO(lth): Remove the redundancy between save_point_boundaries_ and
+  // write_batch_.save_points_.
+  //
+  // Based on this information, here are some invariants:
+  // size(unflushed_save_points_) = size(write_batch_.save_points_)
+  // size(flushed_save_points_) + size(unflushed_save_points_)
+  //   = size(save_points_)
+  //
+  std::unique_ptr<autovector<WriteUnpreparedTxn::SavePoint>>
+      flushed_save_points_;
+  std::unique_ptr<autovector<size_t>> unflushed_save_points_;
+
+  // It is currently unsafe to flush a write batch if there are active
+  // iterators created from this transaction. This is because we use
+  // WriteBatchWithIndex to do merging reads from the DB and the write batch.
+  // If we flush the write batch, it is possible that the delta iterator on
+  // the iterator will point to invalid memory.
+  std::vector<Iterator*> active_iterators_;
+
+  // Untracked keys that we have to rollback.
+  //
+  // TODO(lth): Currently we do not record untracked keys per-savepoint.
+  // This means that when rolling back to savepoints, we have to check all
+  // keys in the current transaction for rollback.
Note that this is only + // inefficient, but still correct because we take a snapshot at every + // savepoint, and we will use that snapshot to construct the rollback batch. + // The rollback batch will then contain a reissue of the same marker. + // + // A more optimal solution would be to only check keys changed since the + // last savepoint. Also, it may make sense to merge this into tracked_keys_ + // and differentiate between tracked but not locked keys to avoid having two + // very similar data structures. + using KeySet = std::unordered_map>; + KeySet untracked_keys_; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/write_unprepared_txn_db.cc b/src/rocksdb/utilities/transactions/write_unprepared_txn_db.cc new file mode 100644 index 000000000..2ed2d5c59 --- /dev/null +++ b/src/rocksdb/utilities/transactions/write_unprepared_txn_db.cc @@ -0,0 +1,473 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "utilities/transactions/write_unprepared_txn_db.h" + +#include "db/arena_wrapped_db_iter.h" +#include "rocksdb/utilities/transaction_db.h" +#include "util/cast_util.h" + +namespace ROCKSDB_NAMESPACE { + +// Instead of reconstructing a Transaction object, and calling rollback on it, +// we can be more efficient with RollbackRecoveredTransaction by skipping +// unnecessary steps (eg. updating CommitMap, reconstructing keyset) +Status WriteUnpreparedTxnDB::RollbackRecoveredTransaction( + const DBImpl::RecoveredTransaction* rtxn) { + // TODO(lth): Reduce duplicate code with WritePrepared rollback logic. + assert(rtxn->unprepared_); + auto cf_map_shared_ptr = WritePreparedTxnDB::GetCFHandleMap(); + auto cf_comp_map_shared_ptr = WritePreparedTxnDB::GetCFComparatorMap(); + // In theory we could write with disableWAL = true during recovery, and + // assume that if we crash again during recovery, we can just replay from + // the very beginning. Unfortunately, the XIDs from the application may not + // necessarily be unique across restarts, potentially leading to situations + // like this: + // + // BEGIN_PREPARE(unprepared) Put(a) END_PREPARE(xid = 1) + // -- crash and recover with Put(a) rolled back as it was not prepared + // BEGIN_PREPARE(prepared) Put(b) END_PREPARE(xid = 1) + // COMMIT(xid = 1) + // -- crash and recover with both a, b + // + // We could just write the rollback marker, but then we would have to extend + // MemTableInserter during recovery to actually do writes into the DB + // instead of just dropping the in-memory write batch. + // + WriteOptions w_options; + + class InvalidSnapshotReadCallback : public ReadCallback { + public: + InvalidSnapshotReadCallback(SequenceNumber snapshot) + : ReadCallback(snapshot) {} + + inline bool IsVisibleFullCheck(SequenceNumber) override { + // The seq provided as snapshot is the seq right before we have locked and + // wrote to it, so whatever is there, it is committed. + return true; + } + + // Ignore the refresh request since we are confident that our snapshot seq + // is not going to be affected by concurrent compactions (not enabled yet.) + void Refresh(SequenceNumber) override {} + }; + + // Iterate starting with largest sequence number. 
+ for (auto it = rtxn->batches_.rbegin(); it != rtxn->batches_.rend(); ++it) { + auto last_visible_txn = it->first - 1; + const auto& batch = it->second.batch_; + WriteBatch rollback_batch(0 /* reserved_bytes */, 0 /* max_bytes */, + w_options.protection_bytes_per_key, + 0 /* default_cf_ts_sz */); + + struct RollbackWriteBatchBuilder : public WriteBatch::Handler { + DBImpl* db_; + ReadOptions roptions; + InvalidSnapshotReadCallback callback; + WriteBatch* rollback_batch_; + std::map& comparators_; + std::map& handles_; + using CFKeys = std::set; + std::map keys_; + bool rollback_merge_operands_; + RollbackWriteBatchBuilder( + DBImpl* db, SequenceNumber snap_seq, WriteBatch* dst_batch, + std::map& comparators, + std::map& handles, + bool rollback_merge_operands) + : db_(db), + callback(snap_seq), + // disable min_uncommitted optimization + rollback_batch_(dst_batch), + comparators_(comparators), + handles_(handles), + rollback_merge_operands_(rollback_merge_operands) {} + + Status Rollback(uint32_t cf, const Slice& key) { + Status s; + CFKeys& cf_keys = keys_[cf]; + if (cf_keys.size() == 0) { // just inserted + auto cmp = comparators_[cf]; + keys_[cf] = CFKeys(SetComparator(cmp)); + } + auto res = cf_keys.insert(key); + if (res.second == + false) { // second is false if a element already existed. + return s; + } + + PinnableSlice pinnable_val; + bool not_used; + auto cf_handle = handles_[cf]; + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = cf_handle; + get_impl_options.value = &pinnable_val; + get_impl_options.value_found = ¬_used; + get_impl_options.callback = &callback; + s = db_->GetImpl(roptions, key, get_impl_options); + assert(s.ok() || s.IsNotFound()); + if (s.ok()) { + s = rollback_batch_->Put(cf_handle, key, pinnable_val); + assert(s.ok()); + } else if (s.IsNotFound()) { + // There has been no readable value before txn. By adding a delete we + // make sure that there will be none afterwards either. + s = rollback_batch_->Delete(cf_handle, key); + assert(s.ok()); + } else { + // Unexpected status. Return it to the user. + } + return s; + } + + Status PutCF(uint32_t cf, const Slice& key, + const Slice& /*val*/) override { + return Rollback(cf, key); + } + + Status DeleteCF(uint32_t cf, const Slice& key) override { + return Rollback(cf, key); + } + + Status SingleDeleteCF(uint32_t cf, const Slice& key) override { + return Rollback(cf, key); + } + + Status MergeCF(uint32_t cf, const Slice& key, + const Slice& /*val*/) override { + if (rollback_merge_operands_) { + return Rollback(cf, key); + } else { + return Status::OK(); + } + } + + // Recovered batches do not contain 2PC markers. 
+ Status MarkNoop(bool) override { return Status::InvalidArgument(); } + Status MarkBeginPrepare(bool) override { + return Status::InvalidArgument(); + } + Status MarkEndPrepare(const Slice&) override { + return Status::InvalidArgument(); + } + Status MarkCommit(const Slice&) override { + return Status::InvalidArgument(); + } + Status MarkRollback(const Slice&) override { + return Status::InvalidArgument(); + } + } rollback_handler(db_impl_, last_visible_txn, &rollback_batch, + *cf_comp_map_shared_ptr.get(), *cf_map_shared_ptr.get(), + txn_db_options_.rollback_merge_operands); + + auto s = batch->Iterate(&rollback_handler); + if (!s.ok()) { + return s; + } + + // The Rollback marker will be used as a batch separator + s = WriteBatchInternal::MarkRollback(&rollback_batch, rtxn->name_); + if (!s.ok()) { + return s; + } + + const uint64_t kNoLogRef = 0; + const bool kDisableMemtable = true; + const size_t kOneBatch = 1; + uint64_t seq_used = kMaxSequenceNumber; + s = db_impl_->WriteImpl(w_options, &rollback_batch, nullptr, nullptr, + kNoLogRef, !kDisableMemtable, &seq_used, kOneBatch); + if (!s.ok()) { + return s; + } + + // If two_write_queues, we must manually release the sequence number to + // readers. + if (db_impl_->immutable_db_options().two_write_queues) { + db_impl_->SetLastPublishedSequence(seq_used); + } + } + + return Status::OK(); +} + +Status WriteUnpreparedTxnDB::Initialize( + const std::vector& compaction_enabled_cf_indices, + const std::vector& handles) { + // TODO(lth): Reduce code duplication in this function. + auto dbimpl = static_cast_with_check(GetRootDB()); + assert(dbimpl != nullptr); + + db_impl_->SetSnapshotChecker(new WritePreparedSnapshotChecker(this)); + // A callback to commit a single sub-batch + class CommitSubBatchPreReleaseCallback : public PreReleaseCallback { + public: + explicit CommitSubBatchPreReleaseCallback(WritePreparedTxnDB* db) + : db_(db) {} + Status Callback(SequenceNumber commit_seq, + bool is_mem_disabled __attribute__((__unused__)), uint64_t, + size_t /*index*/, size_t /*total*/) override { + assert(!is_mem_disabled); + db_->AddCommitted(commit_seq, commit_seq); + return Status::OK(); + } + + private: + WritePreparedTxnDB* db_; + }; + db_impl_->SetRecoverableStatePreReleaseCallback( + new CommitSubBatchPreReleaseCallback(this)); + + // PessimisticTransactionDB::Initialize + for (auto cf_ptr : handles) { + AddColumnFamily(cf_ptr); + } + // Verify cf options + for (auto handle : handles) { + ColumnFamilyDescriptor cfd; + Status s = handle->GetDescriptor(&cfd); + if (!s.ok()) { + return s; + } + s = VerifyCFOptions(cfd.options); + if (!s.ok()) { + return s; + } + } + + // Re-enable compaction for the column families that initially had + // compaction enabled. + std::vector compaction_enabled_cf_handles; + compaction_enabled_cf_handles.reserve(compaction_enabled_cf_indices.size()); + for (auto index : compaction_enabled_cf_indices) { + compaction_enabled_cf_handles.push_back(handles[index]); + } + + // create 'real' transactions from recovered shell transactions + auto rtxns = dbimpl->recovered_transactions(); + std::map ordered_seq_cnt; + for (auto rtxn : rtxns) { + auto recovered_trx = rtxn.second; + assert(recovered_trx); + assert(recovered_trx->batches_.size() >= 1); + assert(recovered_trx->name_.length()); + + // We can only rollback transactions after AdvanceMaxEvictedSeq is called, + // but AddPrepared must occur before AdvanceMaxEvictedSeq, which is why + // two iterations is required. 
+ if (recovered_trx->unprepared_) { + continue; + } + + WriteOptions w_options; + w_options.sync = true; + TransactionOptions t_options; + + auto first_log_number = recovered_trx->batches_.begin()->second.log_number_; + auto first_seq = recovered_trx->batches_.begin()->first; + auto last_prepare_batch_cnt = + recovered_trx->batches_.begin()->second.batch_cnt_; + + Transaction* real_trx = BeginTransaction(w_options, t_options, nullptr); + assert(real_trx); + auto wupt = static_cast_with_check(real_trx); + wupt->recovered_txn_ = true; + + real_trx->SetLogNumber(first_log_number); + real_trx->SetId(first_seq); + Status s = real_trx->SetName(recovered_trx->name_); + if (!s.ok()) { + return s; + } + wupt->prepare_batch_cnt_ = last_prepare_batch_cnt; + + for (auto batch : recovered_trx->batches_) { + const auto& seq = batch.first; + const auto& batch_info = batch.second; + auto cnt = batch_info.batch_cnt_ ? batch_info.batch_cnt_ : 1; + assert(batch_info.log_number_); + + ordered_seq_cnt[seq] = cnt; + assert(wupt->unprep_seqs_.count(seq) == 0); + wupt->unprep_seqs_[seq] = cnt; + + s = wupt->RebuildFromWriteBatch(batch_info.batch_); + assert(s.ok()); + if (!s.ok()) { + return s; + } + } + + const bool kClear = true; + wupt->InitWriteBatch(kClear); + + real_trx->SetState(Transaction::PREPARED); + if (!s.ok()) { + return s; + } + } + // AddPrepared must be called in order + for (auto seq_cnt : ordered_seq_cnt) { + auto seq = seq_cnt.first; + auto cnt = seq_cnt.second; + for (size_t i = 0; i < cnt; i++) { + AddPrepared(seq + i); + } + } + + SequenceNumber prev_max = max_evicted_seq_; + SequenceNumber last_seq = db_impl_->GetLatestSequenceNumber(); + AdvanceMaxEvictedSeq(prev_max, last_seq); + // Create a gap between max and the next snapshot. This simplifies the logic + // in IsInSnapshot by not having to consider the special case of max == + // snapshot after recovery. This is tested in IsInSnapshotEmptyMapTest. + if (last_seq) { + db_impl_->versions_->SetLastAllocatedSequence(last_seq + 1); + db_impl_->versions_->SetLastSequence(last_seq + 1); + db_impl_->versions_->SetLastPublishedSequence(last_seq + 1); + } + + Status s; + // Rollback unprepared transactions. + for (auto rtxn : rtxns) { + auto recovered_trx = rtxn.second; + if (recovered_trx->unprepared_) { + s = RollbackRecoveredTransaction(recovered_trx); + if (!s.ok()) { + return s; + } + continue; + } + } + + if (s.ok()) { + dbimpl->DeleteAllRecoveredTransactions(); + + // Compaction should start only after max_evicted_seq_ is set AND recovered + // transactions are either added to PrepareHeap or rolled back. + s = EnableAutoCompaction(compaction_enabled_cf_handles); + } + + return s; +} + +Transaction* WriteUnpreparedTxnDB::BeginTransaction( + const WriteOptions& write_options, const TransactionOptions& txn_options, + Transaction* old_txn) { + if (old_txn != nullptr) { + ReinitializeTransaction(old_txn, write_options, txn_options); + return old_txn; + } else { + return new WriteUnpreparedTxn(this, write_options, txn_options); + } +} + +// Struct to hold ownership of snapshot and read callback for iterator cleanup. 
+struct WriteUnpreparedTxnDB::IteratorState { + IteratorState(WritePreparedTxnDB* txn_db, SequenceNumber sequence, + std::shared_ptr s, + SequenceNumber min_uncommitted, WriteUnpreparedTxn* txn) + : callback(txn_db, sequence, min_uncommitted, txn->unprep_seqs_, + kBackedByDBSnapshot), + snapshot(s) {} + SequenceNumber MaxVisibleSeq() { return callback.max_visible_seq(); } + + WriteUnpreparedTxnReadCallback callback; + std::shared_ptr snapshot; +}; + +namespace { +static void CleanupWriteUnpreparedTxnDBIterator(void* arg1, void* /*arg2*/) { + delete reinterpret_cast(arg1); +} +} // anonymous namespace + +Iterator* WriteUnpreparedTxnDB::NewIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family, + WriteUnpreparedTxn* txn) { + // TODO(lth): Refactor so that this logic is shared with WritePrepared. + constexpr bool expose_blob_index = false; + constexpr bool allow_refresh = false; + std::shared_ptr own_snapshot = nullptr; + SequenceNumber snapshot_seq = kMaxSequenceNumber; + SequenceNumber min_uncommitted = 0; + + // Currently, the Prev() iterator logic does not work well without snapshot + // validation. The logic simply iterates through values of a key in + // ascending seqno order, stopping at the first non-visible value and + // returning the last visible value. + // + // For example, if snapshot sequence is 3, and we have the following keys: + // foo: v1 1 + // foo: v2 2 + // foo: v3 3 + // foo: v4 4 + // foo: v5 5 + // + // Then 1, 2, 3 will be visible, but 4 will be non-visible, so we return v3, + // which is the last visible value. + // + // For unprepared transactions, if we have snap_seq = 3, but the current + // transaction has unprep_seq 5, then returning the first non-visible value + // would be incorrect, as we should return v5, and not v3. The problem is that + // there are committed values at snapshot_seq < commit_seq < unprep_seq. + // + // Snapshot validation can prevent this problem by ensuring that no committed + // values exist at snapshot_seq < commit_seq, and thus any value with a + // sequence number greater than snapshot_seq must be unprepared values. For + // example, if the transaction had a snapshot at 3, then snapshot validation + // would be performed during the Put(v5) call. It would find v4, and the Put + // would fail with snapshot validation failure. + // + // TODO(lth): Improve Prev() logic to continue iterating until + // max_visible_seq, and then return the last visible value, so that this + // restriction can be lifted. + const Snapshot* snapshot = nullptr; + if (options.snapshot == nullptr) { + snapshot = GetSnapshot(); + own_snapshot = std::make_shared(db_impl_, snapshot); + } else { + snapshot = options.snapshot; + } + + snapshot_seq = snapshot->GetSequenceNumber(); + assert(snapshot_seq != kMaxSequenceNumber); + // Iteration is safe as long as largest_validated_seq <= snapshot_seq. We are + // guaranteed that for keys that were modified by this transaction (and thus + // might have unprepared values), no committed values exist at + // largest_validated_seq < commit_seq (or the contrapositive: any committed + // value must exist at commit_seq <= largest_validated_seq). This implies + // that commit_seq <= largest_validated_seq <= snapshot_seq or commit_seq <= + // snapshot_seq. As explained above, the problem with Prev() only happens when + // snapshot_seq < commit_seq. 
+ // + // For keys that were not modified by this transaction, largest_validated_seq_ + // is meaningless, and Prev() should just work with the existing visibility + // logic. + if (txn->largest_validated_seq_ > snapshot->GetSequenceNumber() && + !txn->unprep_seqs_.empty()) { + ROCKS_LOG_ERROR(info_log_, + "WriteUnprepared iterator creation failed since the " + "transaction has performed unvalidated writes"); + return nullptr; + } + min_uncommitted = + static_cast_with_check(snapshot)->min_uncommitted_; + + auto* cfd = + static_cast_with_check(column_family)->cfd(); + auto* state = + new IteratorState(this, snapshot_seq, own_snapshot, min_uncommitted, txn); + auto* db_iter = db_impl_->NewIteratorImpl( + options, cfd, state->MaxVisibleSeq(), &state->callback, expose_blob_index, + allow_refresh); + db_iter->RegisterCleanup(CleanupWriteUnpreparedTxnDBIterator, state, nullptr); + return db_iter; +} + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/write_unprepared_txn_db.h b/src/rocksdb/utilities/transactions/write_unprepared_txn_db.h new file mode 100644 index 000000000..c40e96d49 --- /dev/null +++ b/src/rocksdb/utilities/transactions/write_unprepared_txn_db.h @@ -0,0 +1,108 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include "utilities/transactions/write_prepared_txn_db.h" +#include "utilities/transactions/write_unprepared_txn.h" + +namespace ROCKSDB_NAMESPACE { + +class WriteUnpreparedTxn; + +class WriteUnpreparedTxnDB : public WritePreparedTxnDB { + public: + using WritePreparedTxnDB::WritePreparedTxnDB; + + Status Initialize(const std::vector& compaction_enabled_cf_indices, + const std::vector& handles) override; + + Transaction* BeginTransaction(const WriteOptions& write_options, + const TransactionOptions& txn_options, + Transaction* old_txn) override; + + // Struct to hold ownership of snapshot and read callback for cleanup. + struct IteratorState; + + using WritePreparedTxnDB::NewIterator; + Iterator* NewIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family, + WriteUnpreparedTxn* txn); + + private: + Status RollbackRecoveredTransaction(const DBImpl::RecoveredTransaction* rtxn); +}; + +class WriteUnpreparedCommitEntryPreReleaseCallback : public PreReleaseCallback { + // TODO(lth): Reduce code duplication with + // WritePreparedCommitEntryPreReleaseCallback + public: + // includes_data indicates that the commit also writes non-empty + // CommitTimeWriteBatch to memtable, which needs to be committed separately. + WriteUnpreparedCommitEntryPreReleaseCallback( + WritePreparedTxnDB* db, DBImpl* db_impl, + const std::map& unprep_seqs, + size_t data_batch_cnt = 0, bool publish_seq = true) + : db_(db), + db_impl_(db_impl), + unprep_seqs_(unprep_seqs), + data_batch_cnt_(data_batch_cnt), + includes_data_(data_batch_cnt_ > 0), + publish_seq_(publish_seq) { + assert(unprep_seqs.size() > 0); + } + + virtual Status Callback(SequenceNumber commit_seq, + bool is_mem_disabled __attribute__((__unused__)), + uint64_t, size_t /*index*/, + size_t /*total*/) override { + const uint64_t last_commit_seq = LIKELY(data_batch_cnt_ <= 1) + ? commit_seq + : commit_seq + data_batch_cnt_ - 1; + // Recall that unprep_seqs maps (un)prepared_seq => prepare_batch_cnt. 
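+    // Hypothetical worked example (for illustration; the numbers are not
+    // from the upstream sources): with unprep_seqs_ = {{5, 2}, {9, 1}} and
+    // last_commit_seq = 12, the loop below records AddCommitted(5, 12),
+    // AddCommitted(6, 12) and AddCommitted(9, 12), i.e. every sub-batch of
+    // every unprepared batch is marked as committed at seq 12.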
+    for (const auto& s : unprep_seqs_) {
+      for (size_t i = 0; i < s.second; i++) {
+        db_->AddCommitted(s.first + i, last_commit_seq);
+      }
+    }
+
+    if (includes_data_) {
+      assert(data_batch_cnt_);
+      // Commit the data that is accompanied with the commit request
+      for (size_t i = 0; i < data_batch_cnt_; i++) {
+        // For commit seq of each batch use the commit seq of the last batch.
+        // This would make debugging easier by having all the batches having
+        // the same sequence number.
+        db_->AddCommitted(commit_seq + i, last_commit_seq);
+      }
+    }
+    if (db_impl_->immutable_db_options().two_write_queues && publish_seq_) {
+      assert(is_mem_disabled);  // implies the 2nd queue
+      // Publish the sequence number. We can do that here assuming the
+      // callback is invoked only from one write queue, which would guarantee
+      // that the publish sequence numbers will be in order, i.e., once a seq
+      // is published all the seqs prior to that are also publishable.
+      db_impl_->SetLastPublishedSequence(last_commit_seq);
+    }
+    // else the SequenceNumber that is updated as part of the write already
+    // does the publishing
+    return Status::OK();
+  }
+
+ private:
+  WritePreparedTxnDB* db_;
+  DBImpl* db_impl_;
+  const std::map<SequenceNumber, size_t>& unprep_seqs_;
+  size_t data_batch_cnt_;
+  // Either because it is commit without prepare or it has a
+  // CommitTimeWriteBatch
+  bool includes_data_;
+  // Whether the callback should also publish the commit seq number
+  bool publish_seq_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/ttl/db_ttl_impl.cc b/src/rocksdb/utilities/ttl/db_ttl_impl.cc
new file mode 100644
index 000000000..6ec9d87b0
--- /dev/null
+++ b/src/rocksdb/utilities/ttl/db_ttl_impl.cc
@@ -0,0 +1,609 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#ifndef ROCKSDB_LITE + +#include "utilities/ttl/db_ttl_impl.h" + +#include "db/write_batch_internal.h" +#include "file/filename.h" +#include "logging/logging.h" +#include "rocksdb/convenience.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/utilities/db_ttl.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { +static std::unordered_map ttl_merge_op_type_info = + {{"user_operator", + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kByName, OptionTypeFlags::kNone)}}; + +TtlMergeOperator::TtlMergeOperator( + const std::shared_ptr& merge_op, SystemClock* clock) + : user_merge_op_(merge_op), clock_(clock) { + RegisterOptions("TtlMergeOptions", &user_merge_op_, &ttl_merge_op_type_info); +} + +bool TtlMergeOperator::FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const { + const uint32_t ts_len = DBWithTTLImpl::kTSLength; + if (merge_in.existing_value && merge_in.existing_value->size() < ts_len) { + ROCKS_LOG_ERROR(merge_in.logger, + "Error: Could not remove timestamp from existing value."); + return false; + } + + // Extract time-stamp from each operand to be passed to user_merge_op_ + std::vector operands_without_ts; + for (const auto& operand : merge_in.operand_list) { + if (operand.size() < ts_len) { + ROCKS_LOG_ERROR(merge_in.logger, + "Error: Could not remove timestamp from operand value."); + return false; + } + operands_without_ts.push_back(operand); + operands_without_ts.back().remove_suffix(ts_len); + } + + // Apply the user merge operator (store result in *new_value) + bool good = true; + MergeOperationOutput user_merge_out(merge_out->new_value, + merge_out->existing_operand); + if (merge_in.existing_value) { + Slice existing_value_without_ts(merge_in.existing_value->data(), + merge_in.existing_value->size() - ts_len); + good = user_merge_op_->FullMergeV2( + MergeOperationInput(merge_in.key, &existing_value_without_ts, + operands_without_ts, merge_in.logger), + &user_merge_out); + } else { + good = user_merge_op_->FullMergeV2( + MergeOperationInput(merge_in.key, nullptr, operands_without_ts, + merge_in.logger), + &user_merge_out); + } + + // Return false if the user merge operator returned false + if (!good) { + return false; + } + + if (merge_out->existing_operand.data()) { + merge_out->new_value.assign(merge_out->existing_operand.data(), + merge_out->existing_operand.size()); + merge_out->existing_operand = Slice(nullptr, 0); + } + + // Augment the *new_value with the ttl time-stamp + int64_t curtime; + if (!clock_->GetCurrentTime(&curtime).ok()) { + ROCKS_LOG_ERROR( + merge_in.logger, + "Error: Could not get current time to be attached internally " + "to the new value."); + return false; + } else { + char ts_string[ts_len]; + EncodeFixed32(ts_string, (int32_t)curtime); + merge_out->new_value.append(ts_string, ts_len); + return true; + } +} + +bool TtlMergeOperator::PartialMergeMulti(const Slice& key, + const std::deque& operand_list, + std::string* new_value, + Logger* logger) const { + const uint32_t ts_len = DBWithTTLImpl::kTSLength; + std::deque operands_without_ts; + + for (const auto& operand : operand_list) { + if (operand.size() < ts_len) { + ROCKS_LOG_ERROR(logger, "Error: Could not remove timestamp from value."); + return false; + } + + operands_without_ts.push_back( + Slice(operand.data(), operand.size() - ts_len)); + } + + // Apply the user partial-merge 
operator (store result in *new_value) + assert(new_value); + if (!user_merge_op_->PartialMergeMulti(key, operands_without_ts, new_value, + logger)) { + return false; + } + + // Augment the *new_value with the ttl time-stamp + int64_t curtime; + if (!clock_->GetCurrentTime(&curtime).ok()) { + ROCKS_LOG_ERROR( + logger, + "Error: Could not get current time to be attached internally " + "to the new value."); + return false; + } else { + char ts_string[ts_len]; + EncodeFixed32(ts_string, (int32_t)curtime); + new_value->append(ts_string, ts_len); + return true; + } +} + +Status TtlMergeOperator::PrepareOptions(const ConfigOptions& config_options) { + if (clock_ == nullptr) { + clock_ = config_options.env->GetSystemClock().get(); + } + return MergeOperator::PrepareOptions(config_options); +} + +Status TtlMergeOperator::ValidateOptions( + const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const { + if (user_merge_op_ == nullptr) { + return Status::InvalidArgument( + "UserMergeOperator required by TtlMergeOperator"); + } else if (clock_ == nullptr) { + return Status::InvalidArgument("SystemClock required by TtlMergeOperator"); + } else { + return MergeOperator::ValidateOptions(db_opts, cf_opts); + } +} + +void DBWithTTLImpl::SanitizeOptions(int32_t ttl, ColumnFamilyOptions* options, + SystemClock* clock) { + if (options->compaction_filter) { + options->compaction_filter = + new TtlCompactionFilter(ttl, clock, options->compaction_filter); + } else { + options->compaction_filter_factory = + std::shared_ptr(new TtlCompactionFilterFactory( + ttl, clock, options->compaction_filter_factory)); + } + + if (options->merge_operator) { + options->merge_operator.reset( + new TtlMergeOperator(options->merge_operator, clock)); + } +} + +static std::unordered_map ttl_type_info = { + {"ttl", {0, OptionType::kInt32T}}, +}; + +static std::unordered_map ttl_cff_type_info = { + {"user_filter_factory", + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kByNameAllowFromNull, + OptionTypeFlags::kNone)}}; +static std::unordered_map user_cf_type_info = { + {"user_filter", + OptionTypeInfo::AsCustomRawPtr( + 0, OptionVerificationType::kByName, OptionTypeFlags::kAllowNull)}}; + +TtlCompactionFilter::TtlCompactionFilter( + int32_t ttl, SystemClock* clock, const CompactionFilter* _user_comp_filter, + std::unique_ptr _user_comp_filter_from_factory) + : LayeredCompactionFilterBase(_user_comp_filter, + std::move(_user_comp_filter_from_factory)), + ttl_(ttl), + clock_(clock) { + RegisterOptions("TTL", &ttl_, &ttl_type_info); + RegisterOptions("UserFilter", &user_comp_filter_, &user_cf_type_info); +} + +bool TtlCompactionFilter::Filter(int level, const Slice& key, + const Slice& old_val, std::string* new_val, + bool* value_changed) const { + if (DBWithTTLImpl::IsStale(old_val, ttl_, clock_)) { + return true; + } + if (user_comp_filter() == nullptr) { + return false; + } + assert(old_val.size() >= DBWithTTLImpl::kTSLength); + Slice old_val_without_ts(old_val.data(), + old_val.size() - DBWithTTLImpl::kTSLength); + if (user_comp_filter()->Filter(level, key, old_val_without_ts, new_val, + value_changed)) { + return true; + } + if (*value_changed) { + new_val->append(old_val.data() + old_val.size() - DBWithTTLImpl::kTSLength, + DBWithTTLImpl::kTSLength); + } + return false; +} + +Status TtlCompactionFilter::PrepareOptions( + const ConfigOptions& config_options) { + if (clock_ == nullptr) { + clock_ = config_options.env->GetSystemClock().get(); + } + return 
LayeredCompactionFilterBase::PrepareOptions(config_options); +} + +Status TtlCompactionFilter::ValidateOptions( + const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const { + if (clock_ == nullptr) { + return Status::InvalidArgument( + "SystemClock required by TtlCompactionFilter"); + } else { + return LayeredCompactionFilterBase::ValidateOptions(db_opts, cf_opts); + } +} + +TtlCompactionFilterFactory::TtlCompactionFilterFactory( + int32_t ttl, SystemClock* clock, + std::shared_ptr comp_filter_factory) + : ttl_(ttl), clock_(clock), user_comp_filter_factory_(comp_filter_factory) { + RegisterOptions("UserOptions", &user_comp_filter_factory_, + &ttl_cff_type_info); + RegisterOptions("TTL", &ttl_, &ttl_type_info); +} + +std::unique_ptr +TtlCompactionFilterFactory::CreateCompactionFilter( + const CompactionFilter::Context& context) { + std::unique_ptr user_comp_filter_from_factory = + nullptr; + if (user_comp_filter_factory_) { + user_comp_filter_from_factory = + user_comp_filter_factory_->CreateCompactionFilter(context); + } + + return std::unique_ptr(new TtlCompactionFilter( + ttl_, clock_, nullptr, std::move(user_comp_filter_from_factory))); +} + +Status TtlCompactionFilterFactory::PrepareOptions( + const ConfigOptions& config_options) { + if (clock_ == nullptr) { + clock_ = config_options.env->GetSystemClock().get(); + } + return CompactionFilterFactory::PrepareOptions(config_options); +} + +Status TtlCompactionFilterFactory::ValidateOptions( + const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const { + if (clock_ == nullptr) { + return Status::InvalidArgument( + "SystemClock required by TtlCompactionFilterFactory"); + } else { + return CompactionFilterFactory::ValidateOptions(db_opts, cf_opts); + } +} + +int RegisterTtlObjects(ObjectLibrary& library, const std::string& /*arg*/) { + library.AddFactory( + TtlMergeOperator::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new TtlMergeOperator(nullptr, nullptr)); + return guard->get(); + }); + library.AddFactory( + TtlCompactionFilterFactory::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new TtlCompactionFilterFactory(0, nullptr, nullptr)); + return guard->get(); + }); + library.AddFactory( + TtlCompactionFilter::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* /*guard*/, + std::string* /* errmsg */) { + return new TtlCompactionFilter(0, nullptr, nullptr); + }); + size_t num_types; + return static_cast(library.GetFactoryCount(&num_types)); +} +// Open the db inside DBWithTTLImpl because options needs pointer to its ttl +DBWithTTLImpl::DBWithTTLImpl(DB* db) : DBWithTTL(db), closed_(false) {} + +DBWithTTLImpl::~DBWithTTLImpl() { + if (!closed_) { + Close().PermitUncheckedError(); + } +} + +Status DBWithTTLImpl::Close() { + Status ret = Status::OK(); + if (!closed_) { + Options default_options = GetOptions(); + // Need to stop background compaction before getting rid of the filter + CancelAllBackgroundWork(db_, /* wait = */ true); + ret = db_->Close(); + delete default_options.compaction_filter; + closed_ = true; + } + return ret; +} + +void DBWithTTLImpl::RegisterTtlClasses() { + static std::once_flag once; + std::call_once(once, [&]() { + ObjectRegistry::Default()->AddLibrary("TTL", RegisterTtlObjects, ""); + }); +} + +Status DBWithTTL::Open(const Options& options, const std::string& dbname, + DBWithTTL** dbptr, int32_t ttl, bool read_only) { + DBOptions 
db_options(options); + ColumnFamilyOptions cf_options(options); + std::vector column_families; + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + std::vector handles; + Status s = DBWithTTL::Open(db_options, dbname, column_families, &handles, + dbptr, {ttl}, read_only); + if (s.ok()) { + assert(handles.size() == 1); + // i can delete the handle since DBImpl is always holding a reference to + // default column family + delete handles[0]; + } + return s; +} + +Status DBWithTTL::Open( + const DBOptions& db_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, DBWithTTL** dbptr, + const std::vector& ttls, bool read_only) { + DBWithTTLImpl::RegisterTtlClasses(); + if (ttls.size() != column_families.size()) { + return Status::InvalidArgument( + "ttls size has to be the same as number of column families"); + } + + SystemClock* clock = (db_options.env == nullptr) + ? SystemClock::Default().get() + : db_options.env->GetSystemClock().get(); + + std::vector column_families_sanitized = + column_families; + for (size_t i = 0; i < column_families_sanitized.size(); ++i) { + DBWithTTLImpl::SanitizeOptions( + ttls[i], &column_families_sanitized[i].options, clock); + } + DB* db; + + Status st; + if (read_only) { + st = DB::OpenForReadOnly(db_options, dbname, column_families_sanitized, + handles, &db); + } else { + st = DB::Open(db_options, dbname, column_families_sanitized, handles, &db); + } + if (st.ok()) { + *dbptr = new DBWithTTLImpl(db); + } else { + *dbptr = nullptr; + } + return st; +} + +Status DBWithTTLImpl::CreateColumnFamilyWithTtl( + const ColumnFamilyOptions& options, const std::string& column_family_name, + ColumnFamilyHandle** handle, int ttl) { + RegisterTtlClasses(); + ColumnFamilyOptions sanitized_options = options; + DBWithTTLImpl::SanitizeOptions(ttl, &sanitized_options, + GetEnv()->GetSystemClock().get()); + + return DBWithTTL::CreateColumnFamily(sanitized_options, column_family_name, + handle); +} + +Status DBWithTTLImpl::CreateColumnFamily(const ColumnFamilyOptions& options, + const std::string& column_family_name, + ColumnFamilyHandle** handle) { + return CreateColumnFamilyWithTtl(options, column_family_name, handle, 0); +} + +// Appends the current timestamp to the string. 
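As a point of reference for the Open() overloads above, here is a minimal usage sketch of the public DBWithTTL API they implement. It is not part of the patch; the database path and TTL value are hypothetical, and error handling is trimmed to the essentials.

#include <string>

#include "rocksdb/utilities/db_ttl.h"

// Sketch only: open a TTL database whose entries expire roughly one hour
// after they were last written, then access it through the usual DB API.
void OpenTtlDbSketch() {
  ROCKSDB_NAMESPACE::Options options;
  options.create_if_missing = true;

  ROCKSDB_NAMESPACE::DBWithTTL* db = nullptr;
  ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::DBWithTTL::Open(
      options, "/tmp/ttl_db_example", &db, /*ttl=*/3600);
  if (!s.ok()) {
    return;
  }
  // Writes get a 4-byte timestamp appended internally; reads strip it again,
  // and compaction drops entries once they are older than the TTL.
  s = db->Put(ROCKSDB_NAMESPACE::WriteOptions(), "key", "value");
  std::string value;
  if (s.ok()) {
    s = db->Get(ROCKSDB_NAMESPACE::ReadOptions(), "key", &value);
  }
  db->Close().PermitUncheckedError();
  delete db;
}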
+// Returns false if could not get the current_time, true if append succeeds +Status DBWithTTLImpl::AppendTS(const Slice& val, std::string* val_with_ts, + SystemClock* clock) { + val_with_ts->reserve(kTSLength + val.size()); + char ts_string[kTSLength]; + int64_t curtime; + Status st = clock->GetCurrentTime(&curtime); + if (!st.ok()) { + return st; + } + EncodeFixed32(ts_string, (int32_t)curtime); + val_with_ts->append(val.data(), val.size()); + val_with_ts->append(ts_string, kTSLength); + return st; +} + +// Returns corruption if the length of the string is lesser than timestamp, or +// timestamp refers to a time lesser than ttl-feature release time +Status DBWithTTLImpl::SanityCheckTimestamp(const Slice& str) { + if (str.size() < kTSLength) { + return Status::Corruption("Error: value's length less than timestamp's\n"); + } + // Checks that TS is not lesser than kMinTimestamp + // Gaurds against corruption & normal database opened incorrectly in ttl mode + int32_t timestamp_value = DecodeFixed32(str.data() + str.size() - kTSLength); + if (timestamp_value < kMinTimestamp) { + return Status::Corruption("Error: Timestamp < ttl feature release time!\n"); + } + return Status::OK(); +} + +// Checks if the string is stale or not according to TTl provided +bool DBWithTTLImpl::IsStale(const Slice& value, int32_t ttl, + SystemClock* clock) { + if (ttl <= 0) { // Data is fresh if TTL is non-positive + return false; + } + int64_t curtime; + if (!clock->GetCurrentTime(&curtime).ok()) { + return false; // Treat the data as fresh if could not get current time + } + int32_t timestamp_value = + DecodeFixed32(value.data() + value.size() - kTSLength); + return (timestamp_value + ttl) < curtime; +} + +// Strips the TS from the end of the slice +Status DBWithTTLImpl::StripTS(PinnableSlice* pinnable_val) { + if (pinnable_val->size() < kTSLength) { + return Status::Corruption("Bad timestamp in key-value"); + } + // Erasing characters which hold the TS + pinnable_val->remove_suffix(kTSLength); + return Status::OK(); +} + +// Strips the TS from the end of the string +Status DBWithTTLImpl::StripTS(std::string* str) { + if (str->length() < kTSLength) { + return Status::Corruption("Bad timestamp in key-value"); + } + // Erasing characters which hold the TS + str->erase(str->length() - kTSLength, kTSLength); + return Status::OK(); +} + +Status DBWithTTLImpl::Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& val) { + WriteBatch batch; + Status st = batch.Put(column_family, key, val); + if (st.ok()) { + st = Write(options, &batch); + } + return st; +} + +Status DBWithTTLImpl::Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) { + Status st = db_->Get(options, column_family, key, value); + if (!st.ok()) { + return st; + } + st = SanityCheckTimestamp(*value); + if (!st.ok()) { + return st; + } + return StripTS(value); +} + +std::vector DBWithTTLImpl::MultiGet( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, std::vector* values) { + auto statuses = db_->MultiGet(options, column_family, keys, values); + for (size_t i = 0; i < keys.size(); ++i) { + if (!statuses[i].ok()) { + continue; + } + statuses[i] = SanityCheckTimestamp((*values)[i]); + if (!statuses[i].ok()) { + continue; + } + statuses[i] = StripTS(&(*values)[i]); + } + return statuses; +} + +bool DBWithTTLImpl::KeyMayExist(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, 
std::string* value, + bool* value_found) { + bool ret = db_->KeyMayExist(options, column_family, key, value, value_found); + if (ret && value != nullptr && value_found != nullptr && *value_found) { + if (!SanityCheckTimestamp(*value).ok() || !StripTS(value).ok()) { + return false; + } + } + return ret; +} + +Status DBWithTTLImpl::Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) { + WriteBatch batch; + Status st = batch.Merge(column_family, key, value); + if (st.ok()) { + st = Write(options, &batch); + } + return st; +} + +Status DBWithTTLImpl::Write(const WriteOptions& opts, WriteBatch* updates) { + class Handler : public WriteBatch::Handler { + public: + explicit Handler(SystemClock* clock) : clock_(clock) {} + WriteBatch updates_ttl; + Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + std::string value_with_ts; + Status st = AppendTS(value, &value_with_ts, clock_); + if (!st.ok()) { + return st; + } + return WriteBatchInternal::Put(&updates_ttl, column_family_id, key, + value_with_ts); + } + Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + std::string value_with_ts; + Status st = AppendTS(value, &value_with_ts, clock_); + if (!st.ok()) { + return st; + } + return WriteBatchInternal::Merge(&updates_ttl, column_family_id, key, + value_with_ts); + } + Status DeleteCF(uint32_t column_family_id, const Slice& key) override { + return WriteBatchInternal::Delete(&updates_ttl, column_family_id, key); + } + Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key, + const Slice& end_key) override { + return WriteBatchInternal::DeleteRange(&updates_ttl, column_family_id, + begin_key, end_key); + } + void LogData(const Slice& blob) override { updates_ttl.PutLogData(blob); } + + private: + SystemClock* clock_; + }; + Handler handler(GetEnv()->GetSystemClock().get()); + Status st = updates->Iterate(&handler); + if (!st.ok()) { + return st; + } else { + return db_->Write(opts, &(handler.updates_ttl)); + } +} + +Iterator* DBWithTTLImpl::NewIterator(const ReadOptions& opts, + ColumnFamilyHandle* column_family) { + return new TtlIterator(db_->NewIterator(opts, column_family)); +} + +void DBWithTTLImpl::SetTtl(ColumnFamilyHandle* h, int32_t ttl) { + std::shared_ptr filter; + Options opts; + opts = GetOptions(h); + filter = std::static_pointer_cast( + opts.compaction_filter_factory); + if (!filter) return; + filter->SetTtl(ttl); +} + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/ttl/db_ttl_impl.h b/src/rocksdb/utilities/ttl/db_ttl_impl.h new file mode 100644 index 000000000..dd67a6ddc --- /dev/null +++ b/src/rocksdb/utilities/ttl/db_ttl_impl.h @@ -0,0 +1,245 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
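Before the header below, a short recap of the value layout that AppendTS, IsStale and StripTS above agree on: the stored value is the user value followed by a 4-byte write timestamp. The helpers here are a standalone sketch rather than code from the patch, and they assume a little-endian build so that a plain memcpy matches EncodeFixed32.

#include <cstdint>
#include <cstring>
#include <string>

// Sketch only: mirrors the layout used by DBWithTTLImpl::AppendTS / IsStale,
// i.e. <user value><4-byte little-endian write time>.
std::string AppendTimestampSketch(const std::string& user_value, int32_t now) {
  std::string with_ts = user_value;
  char ts[sizeof(int32_t)];
  std::memcpy(ts, &now, sizeof(ts));  // EncodeFixed32 equivalent on little-endian
  with_ts.append(ts, sizeof(ts));
  return with_ts;
}

bool IsStaleSketch(const std::string& stored, int32_t ttl, int32_t now) {
  if (ttl <= 0 || stored.size() < sizeof(int32_t)) {
    return false;  // non-positive TTL (or a short value) is treated as fresh here
  }
  int32_t write_time = 0;
  std::memcpy(&write_time, stored.data() + stored.size() - sizeof(write_time),
              sizeof(write_time));
  return write_time + ttl < now;
}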
+ +#pragma once + +#ifndef ROCKSDB_LITE +#include +#include +#include + +#include "db/db_impl/db_impl.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/db.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/utilities/db_ttl.h" +#include "utilities/compaction_filters/layered_compaction_filter_base.h" + +#ifdef _WIN32 +// Windows API macro interference +#undef GetCurrentTime +#endif + +namespace ROCKSDB_NAMESPACE { +struct ConfigOptions; +class ObjectLibrary; +class ObjectRegistry; +class DBWithTTLImpl : public DBWithTTL { + public: + static void SanitizeOptions(int32_t ttl, ColumnFamilyOptions* options, + SystemClock* clock); + + static void RegisterTtlClasses(); + explicit DBWithTTLImpl(DB* db); + + virtual ~DBWithTTLImpl(); + + virtual Status Close() override; + + Status CreateColumnFamilyWithTtl(const ColumnFamilyOptions& options, + const std::string& column_family_name, + ColumnFamilyHandle** handle, + int ttl) override; + + Status CreateColumnFamily(const ColumnFamilyOptions& options, + const std::string& column_family_name, + ColumnFamilyHandle** handle) override; + + using StackableDB::Put; + virtual Status Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& val) override; + + using StackableDB::Get; + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) override; + + using StackableDB::MultiGet; + virtual std::vector MultiGet( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, + std::vector* values) override; + + using StackableDB::KeyMayExist; + virtual bool KeyMayExist(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, + bool* value_found = nullptr) override; + + using StackableDB::Merge; + virtual Status Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + + virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; + + using StackableDB::NewIterator; + virtual Iterator* NewIterator(const ReadOptions& opts, + ColumnFamilyHandle* column_family) override; + + virtual DB* GetBaseDB() override { return db_; } + + static bool IsStale(const Slice& value, int32_t ttl, SystemClock* clock); + + static Status AppendTS(const Slice& val, std::string* val_with_ts, + SystemClock* clock); + + static Status SanityCheckTimestamp(const Slice& str); + + static Status StripTS(std::string* str); + + static Status StripTS(PinnableSlice* str); + + static const uint32_t kTSLength = sizeof(int32_t); // size of timestamp + + static const int32_t kMinTimestamp = 1368146402; // 05/09/2013:5:40PM GMT-8 + + static const int32_t kMaxTimestamp = 2147483647; // 01/18/2038:7:14PM GMT-8 + + void SetTtl(int32_t ttl) override { SetTtl(DefaultColumnFamily(), ttl); } + + void SetTtl(ColumnFamilyHandle* h, int32_t ttl) override; + + private: + // remember whether the Close completes or not + bool closed_; +}; + +class TtlIterator : public Iterator { + public: + explicit TtlIterator(Iterator* iter) : iter_(iter) { assert(iter_); } + + ~TtlIterator() { delete iter_; } + + bool Valid() const override { return iter_->Valid(); } + + void SeekToFirst() override { iter_->SeekToFirst(); } + + void SeekToLast() override { iter_->SeekToLast(); } + + void Seek(const Slice& target) override { iter_->Seek(target); } + + void SeekForPrev(const Slice& target) override { 
iter_->SeekForPrev(target); } + + void Next() override { iter_->Next(); } + + void Prev() override { iter_->Prev(); } + + Slice key() const override { return iter_->key(); } + + int32_t ttl_timestamp() const { + return DecodeFixed32(iter_->value().data() + iter_->value().size() - + DBWithTTLImpl::kTSLength); + } + + Slice value() const override { + // TODO: handle timestamp corruption like in general iterator semantics + assert(DBWithTTLImpl::SanityCheckTimestamp(iter_->value()).ok()); + Slice trimmed_value = iter_->value(); + trimmed_value.size_ -= DBWithTTLImpl::kTSLength; + return trimmed_value; + } + + Status status() const override { return iter_->status(); } + + private: + Iterator* iter_; +}; + +class TtlCompactionFilter : public LayeredCompactionFilterBase { + public: + TtlCompactionFilter(int32_t ttl, SystemClock* clock, + const CompactionFilter* _user_comp_filter, + std::unique_ptr + _user_comp_filter_from_factory = nullptr); + + virtual bool Filter(int level, const Slice& key, const Slice& old_val, + std::string* new_val, bool* value_changed) const override; + + const char* Name() const override { return kClassName(); } + static const char* kClassName() { return "TtlCompactionFilter"; } + bool IsInstanceOf(const std::string& name) const override { + if (name == "Delete By TTL") { + return true; + } else { + return LayeredCompactionFilterBase::IsInstanceOf(name); + } + } + + Status PrepareOptions(const ConfigOptions& config_options) override; + Status ValidateOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const override; + + private: + int32_t ttl_; + SystemClock* clock_; +}; + +class TtlCompactionFilterFactory : public CompactionFilterFactory { + public: + TtlCompactionFilterFactory( + int32_t ttl, SystemClock* clock, + std::shared_ptr comp_filter_factory); + + std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) override; + void SetTtl(int32_t ttl) { ttl_ = ttl; } + + const char* Name() const override { return kClassName(); } + static const char* kClassName() { return "TtlCompactionFilterFactory"; } + Status PrepareOptions(const ConfigOptions& config_options) override; + Status ValidateOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const override; + const Customizable* Inner() const override { + return user_comp_filter_factory_.get(); + } + + private: + int32_t ttl_; + SystemClock* clock_; + std::shared_ptr user_comp_filter_factory_; +}; + +class TtlMergeOperator : public MergeOperator { + public: + explicit TtlMergeOperator(const std::shared_ptr& merge_op, + SystemClock* clock); + + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override; + + bool PartialMergeMulti(const Slice& key, + const std::deque& operand_list, + std::string* new_value, Logger* logger) const override; + + static const char* kClassName() { return "TtlMergeOperator"; } + + const char* Name() const override { return kClassName(); } + bool IsInstanceOf(const std::string& name) const override { + if (name == "Merge By TTL") { + return true; + } else { + return MergeOperator::IsInstanceOf(name); + } + } + + Status PrepareOptions(const ConfigOptions& config_options) override; + Status ValidateOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const override; + const Customizable* Inner() const override { return user_merge_op_.get(); } + + private: + std::shared_ptr user_merge_op_; + SystemClock* clock_; +}; +extern "C" { +int RegisterTtlObjects(ObjectLibrary& 
library, const std::string& /*arg*/); +} // extern "C" + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/ttl/ttl_test.cc b/src/rocksdb/utilities/ttl/ttl_test.cc new file mode 100644 index 000000000..a42e0acb4 --- /dev/null +++ b/src/rocksdb/utilities/ttl/ttl_test.cc @@ -0,0 +1,912 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef ROCKSDB_LITE + +#include +#include + +#include "rocksdb/compaction_filter.h" +#include "rocksdb/convenience.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/utilities/db_ttl.h" +#include "rocksdb/utilities/object_registry.h" +#include "test_util/testharness.h" +#include "util/string_util.h" +#include "utilities/merge_operators/bytesxor.h" +#include "utilities/ttl/db_ttl_impl.h" +#ifndef OS_WIN +#include +#endif + +namespace ROCKSDB_NAMESPACE { + +namespace { + +using KVMap = std::map; + +enum BatchOperation { OP_PUT = 0, OP_DELETE = 1 }; +} // namespace + +class SpecialTimeEnv : public EnvWrapper { + public: + explicit SpecialTimeEnv(Env* base) : EnvWrapper(base) { + EXPECT_OK(base->GetCurrentTime(¤t_time_)); + } + const char* Name() const override { return "SpecialTimeEnv"; } + void Sleep(int64_t sleep_time) { current_time_ += sleep_time; } + Status GetCurrentTime(int64_t* current_time) override { + *current_time = current_time_; + return Status::OK(); + } + + private: + int64_t current_time_ = 0; +}; + +class TtlTest : public testing::Test { + public: + TtlTest() { + env_.reset(new SpecialTimeEnv(Env::Default())); + dbname_ = test::PerThreadDBPath("db_ttl"); + options_.create_if_missing = true; + options_.env = env_.get(); + // ensure that compaction is kicked in to always strip timestamp from kvs + options_.max_compaction_bytes = 1; + // compaction should take place always from level0 for determinism + db_ttl_ = nullptr; + EXPECT_OK(DestroyDB(dbname_, Options())); + } + + ~TtlTest() override { + CloseTtl(); + EXPECT_OK(DestroyDB(dbname_, Options())); + } + + // Open database with TTL support when TTL not provided with db_ttl_ pointer + void OpenTtl() { + ASSERT_TRUE(db_ttl_ == + nullptr); // db should be closed before opening again + ASSERT_OK(DBWithTTL::Open(options_, dbname_, &db_ttl_)); + } + + // Open database with TTL support when TTL provided with db_ttl_ pointer + void OpenTtl(int32_t ttl) { + ASSERT_TRUE(db_ttl_ == nullptr); + ASSERT_OK(DBWithTTL::Open(options_, dbname_, &db_ttl_, ttl)); + } + + // Open with TestFilter compaction filter + void OpenTtlWithTestCompaction(int32_t ttl) { + options_.compaction_filter_factory = + std::shared_ptr( + new TestFilterFactory(kSampleSize_, kNewValue_)); + OpenTtl(ttl); + } + + // Open database with TTL support in read_only mode + void OpenReadOnlyTtl(int32_t ttl) { + ASSERT_TRUE(db_ttl_ == nullptr); + ASSERT_OK(DBWithTTL::Open(options_, dbname_, &db_ttl_, ttl, true)); + } + + // Call db_ttl_->Close() before delete db_ttl_ + void CloseTtl() { CloseTtlHelper(true); } + + // No db_ttl_->Close() before delete db_ttl_ + void CloseTtlNoDBClose() { CloseTtlHelper(false); } + + void CloseTtlHelper(bool close_db) { + if (db_ttl_ != nullptr) { + if (close_db) { + EXPECT_OK(db_ttl_->Close()); + } + delete db_ttl_; + db_ttl_ = nullptr; + } + } + + // Populates and returns a kv-map + void MakeKVMap(int64_t num_entries) { 
+ kvmap_.clear(); + int digits = 1; + for (int64_t dummy = num_entries; dummy /= 10; ++digits) { + } + int digits_in_i = 1; + for (int64_t i = 0; i < num_entries; i++) { + std::string key = "key"; + std::string value = "value"; + if (i % 10 == 0) { + digits_in_i++; + } + for (int j = digits_in_i; j < digits; j++) { + key.append("0"); + value.append("0"); + } + AppendNumberTo(&key, i); + AppendNumberTo(&value, i); + kvmap_[key] = value; + } + ASSERT_EQ(static_cast(kvmap_.size()), + num_entries); // check all insertions done + } + + // Makes a write-batch with key-vals from kvmap_ and 'Write''s it + void MakePutWriteBatch(const BatchOperation* batch_ops, int64_t num_ops) { + ASSERT_LE(num_ops, static_cast(kvmap_.size())); + static WriteOptions wopts; + static FlushOptions flush_opts; + WriteBatch batch; + kv_it_ = kvmap_.begin(); + for (int64_t i = 0; i < num_ops && kv_it_ != kvmap_.end(); i++, ++kv_it_) { + switch (batch_ops[i]) { + case OP_PUT: + ASSERT_OK(batch.Put(kv_it_->first, kv_it_->second)); + break; + case OP_DELETE: + ASSERT_OK(batch.Delete(kv_it_->first)); + break; + default: + FAIL(); + } + } + ASSERT_OK(db_ttl_->Write(wopts, &batch)); + ASSERT_OK(db_ttl_->Flush(flush_opts)); + } + + // Puts num_entries starting from start_pos_map from kvmap_ into the database + void PutValues(int64_t start_pos_map, int64_t num_entries, bool flush = true, + ColumnFamilyHandle* cf = nullptr) { + ASSERT_TRUE(db_ttl_); + ASSERT_LE(start_pos_map + num_entries, static_cast(kvmap_.size())); + static WriteOptions wopts; + static FlushOptions flush_opts; + kv_it_ = kvmap_.begin(); + advance(kv_it_, start_pos_map); + for (int64_t i = 0; kv_it_ != kvmap_.end() && i < num_entries; + i++, ++kv_it_) { + ASSERT_OK(cf == nullptr + ? db_ttl_->Put(wopts, kv_it_->first, kv_it_->second) + : db_ttl_->Put(wopts, cf, kv_it_->first, kv_it_->second)); + } + // Put a mock kv at the end because CompactionFilter doesn't delete last key + ASSERT_OK(cf == nullptr ? db_ttl_->Put(wopts, "keymock", "valuemock") + : db_ttl_->Put(wopts, cf, "keymock", "valuemock")); + if (flush) { + if (cf == nullptr) { + ASSERT_OK(db_ttl_->Flush(flush_opts)); + } else { + ASSERT_OK(db_ttl_->Flush(flush_opts, cf)); + } + } + } + + // Runs a manual compaction + Status ManualCompact(ColumnFamilyHandle* cf = nullptr) { + assert(db_ttl_); + if (cf == nullptr) { + return db_ttl_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + } else { + return db_ttl_->CompactRange(CompactRangeOptions(), cf, nullptr, nullptr); + } + } + + // Runs a DeleteRange + void MakeDeleteRange(std::string start, std::string end, + ColumnFamilyHandle* cf = nullptr) { + ASSERT_TRUE(db_ttl_); + static WriteOptions wops; + WriteBatch wb; + ASSERT_OK(cf == nullptr + ? 
wb.DeleteRange(db_ttl_->DefaultColumnFamily(), start, end) + : wb.DeleteRange(cf, start, end)); + ASSERT_OK(db_ttl_->Write(wops, &wb)); + } + + // checks the whole kvmap_ to return correct values using KeyMayExist + void SimpleKeyMayExistCheck() { + static ReadOptions ropts; + bool value_found; + std::string val; + for (auto& kv : kvmap_) { + bool ret = db_ttl_->KeyMayExist(ropts, kv.first, &val, &value_found); + if (ret == false || value_found == false) { + fprintf(stderr, + "KeyMayExist could not find key=%s in the database but" + " should have\n", + kv.first.c_str()); + FAIL(); + } else if (val.compare(kv.second) != 0) { + fprintf(stderr, + " value for key=%s present in database is %s but" + " should be %s\n", + kv.first.c_str(), val.c_str(), kv.second.c_str()); + FAIL(); + } + } + } + + // checks the whole kvmap_ to return correct values using MultiGet + void SimpleMultiGetTest() { + static ReadOptions ropts; + std::vector keys; + std::vector values; + + for (auto& kv : kvmap_) { + keys.emplace_back(kv.first); + } + + auto statuses = db_ttl_->MultiGet(ropts, keys, &values); + size_t i = 0; + for (auto& kv : kvmap_) { + ASSERT_OK(statuses[i]); + ASSERT_EQ(values[i], kv.second); + ++i; + } + } + + void CompactCheck(int64_t st_pos, int64_t span, bool check = true, + bool test_compaction_change = false, + ColumnFamilyHandle* cf = nullptr) { + static ReadOptions ropts; + kv_it_ = kvmap_.begin(); + advance(kv_it_, st_pos); + std::string v; + for (int64_t i = 0; kv_it_ != kvmap_.end() && i < span; i++, ++kv_it_) { + Status s = (cf == nullptr) ? db_ttl_->Get(ropts, kv_it_->first, &v) + : db_ttl_->Get(ropts, cf, kv_it_->first, &v); + if (s.ok() != check) { + fprintf(stderr, "key=%s ", kv_it_->first.c_str()); + if (!s.ok()) { + fprintf(stderr, "is absent from db but was expected to be present\n"); + } else { + fprintf(stderr, "is present in db but was expected to be absent\n"); + } + FAIL(); + } else if (s.ok()) { + if (test_compaction_change && v.compare(kNewValue_) != 0) { + fprintf(stderr, + " value for key=%s present in database is %s but " + " should be %s\n", + kv_it_->first.c_str(), v.c_str(), kNewValue_.c_str()); + FAIL(); + } else if (!test_compaction_change && v.compare(kv_it_->second) != 0) { + fprintf(stderr, + " value for key=%s present in database is %s but " + " should be %s\n", + kv_it_->first.c_str(), v.c_str(), kv_it_->second.c_str()); + FAIL(); + } + } + } + } + // Sleeps for slp_tim then runs a manual compaction + // Checks span starting from st_pos from kvmap_ in the db and + // Gets should return true if check is true and false otherwise + // Also checks that value that we got is the same as inserted; and =kNewValue + // if test_compaction_change is true + void SleepCompactCheck(int slp_tim, int64_t st_pos, int64_t span, + bool check = true, bool test_compaction_change = false, + ColumnFamilyHandle* cf = nullptr) { + ASSERT_TRUE(db_ttl_); + + env_->Sleep(slp_tim); + ASSERT_OK(ManualCompact(cf)); + CompactCheck(st_pos, span, check, test_compaction_change, cf); + } + + // Similar as SleepCompactCheck but uses TtlIterator to read from db + void SleepCompactCheckIter(int slp, int st_pos, int64_t span, + bool check = true) { + ASSERT_TRUE(db_ttl_); + env_->Sleep(slp); + ASSERT_OK(ManualCompact()); + static ReadOptions ropts; + Iterator* dbiter = db_ttl_->NewIterator(ropts); + kv_it_ = kvmap_.begin(); + advance(kv_it_, st_pos); + + dbiter->Seek(kv_it_->first); + if (!check) { + if (dbiter->Valid()) { + ASSERT_NE(dbiter->value().compare(kv_it_->second), 0); + } + } else { // 
dbiter should have found out kvmap_[st_pos] + for (int64_t i = st_pos; kv_it_ != kvmap_.end() && i < st_pos + span; + i++, ++kv_it_) { + ASSERT_TRUE(dbiter->Valid()); + ASSERT_EQ(dbiter->value().compare(kv_it_->second), 0); + dbiter->Next(); + } + } + ASSERT_OK(dbiter->status()); + delete dbiter; + } + + // Set ttl on open db + void SetTtl(int32_t ttl, ColumnFamilyHandle* cf = nullptr) { + ASSERT_TRUE(db_ttl_); + cf == nullptr ? db_ttl_->SetTtl(ttl) : db_ttl_->SetTtl(cf, ttl); + } + + class TestFilter : public CompactionFilter { + public: + TestFilter(const int64_t kSampleSize, const std::string& kNewValue) + : kSampleSize_(kSampleSize), kNewValue_(kNewValue) {} + + // Works on keys of the form "key" + // Drops key if number at the end of key is in [0, kSampleSize_/3), + // Keeps key if it is in [kSampleSize_/3, 2*kSampleSize_/3), + // Change value if it is in [2*kSampleSize_/3, kSampleSize_) + // Eg. kSampleSize_=6. Drop:key0-1...Keep:key2-3...Change:key4-5... + bool Filter(int /*level*/, const Slice& key, const Slice& /*value*/, + std::string* new_value, bool* value_changed) const override { + assert(new_value != nullptr); + + std::string search_str = "0123456789"; + std::string key_string = key.ToString(); + size_t pos = key_string.find_first_of(search_str); + int num_key_end; + if (pos != std::string::npos) { + auto key_substr = key_string.substr(pos, key.size() - pos); +#ifndef CYGWIN + num_key_end = std::stoi(key_substr); +#else + num_key_end = std::strtol(key_substr.c_str(), 0, 10); +#endif + + } else { + return false; // Keep keys not matching the format "key" + } + + int64_t partition = kSampleSize_ / 3; + if (num_key_end < partition) { + return true; + } else if (num_key_end < partition * 2) { + return false; + } else { + *new_value = kNewValue_; + *value_changed = true; + return false; + } + } + + const char* Name() const override { return "TestFilter"; } + + private: + const int64_t kSampleSize_; + const std::string kNewValue_; + }; + + class TestFilterFactory : public CompactionFilterFactory { + public: + TestFilterFactory(const int64_t kSampleSize, const std::string& kNewValue) + : kSampleSize_(kSampleSize), kNewValue_(kNewValue) {} + + std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& /*context*/) override { + return std::unique_ptr( + new TestFilter(kSampleSize_, kNewValue_)); + } + + const char* Name() const override { return "TestFilterFactory"; } + + private: + const int64_t kSampleSize_; + const std::string kNewValue_; + }; + + // Choose carefully so that Put, Gets & Compaction complete in 1 second buffer + static const int64_t kSampleSize_ = 100; + std::string dbname_; + DBWithTTL* db_ttl_; + std::unique_ptr env_; + + private: + Options options_; + KVMap kvmap_; + KVMap::iterator kv_it_; + const std::string kNewValue_ = "new_value"; + std::unique_ptr test_comp_filter_; +}; // class TtlTest + +// If TTL is non positive or not provided, the behaviour is TTL = infinity +// This test opens the db 3 times with such default behavior and inserts a +// bunch of kvs each time. 
All kvs should accumulate in the db till the end +// Partitions the sample-size provided into 3 sets over boundary1 and boundary2 +TEST_F(TtlTest, NoEffect) { + MakeKVMap(kSampleSize_); + int64_t boundary1 = kSampleSize_ / 3; + int64_t boundary2 = 2 * boundary1; + + OpenTtl(); + PutValues(0, boundary1); // T=0: Set1 never deleted + SleepCompactCheck(1, 0, boundary1); // T=1: Set1 still there + CloseTtl(); + + OpenTtl(0); + PutValues(boundary1, boundary2 - boundary1); // T=1: Set2 never deleted + SleepCompactCheck(1, 0, boundary2); // T=2: Sets1 & 2 still there + CloseTtl(); + + OpenTtl(-1); + PutValues(boundary2, kSampleSize_ - boundary2); // T=3: Set3 never deleted + SleepCompactCheck(1, 0, kSampleSize_, true); // T=4: Sets 1,2,3 still there + CloseTtl(); +} + +// Rerun the NoEffect test with a different version of CloseTtl +// function, where db is directly deleted without close. +TEST_F(TtlTest, DestructWithoutClose) { + MakeKVMap(kSampleSize_); + int64_t boundary1 = kSampleSize_ / 3; + int64_t boundary2 = 2 * boundary1; + + OpenTtl(); + PutValues(0, boundary1); // T=0: Set1 never deleted + SleepCompactCheck(1, 0, boundary1); // T=1: Set1 still there + CloseTtlNoDBClose(); + + OpenTtl(0); + PutValues(boundary1, boundary2 - boundary1); // T=1: Set2 never deleted + SleepCompactCheck(1, 0, boundary2); // T=2: Sets1 & 2 still there + CloseTtlNoDBClose(); + + OpenTtl(-1); + PutValues(boundary2, kSampleSize_ - boundary2); // T=3: Set3 never deleted + SleepCompactCheck(1, 0, kSampleSize_, true); // T=4: Sets 1,2,3 still there + CloseTtlNoDBClose(); +} + +// Puts a set of values and checks its presence using Get during ttl +TEST_F(TtlTest, PresentDuringTTL) { + MakeKVMap(kSampleSize_); + + OpenTtl(2); // T=0:Open the db with ttl = 2 + PutValues(0, kSampleSize_); // T=0:Insert Set1. Delete at t=2 + SleepCompactCheck(1, 0, kSampleSize_, + true); // T=1:Set1 should still be there + CloseTtl(); +} + +// Puts a set of values and checks its absence using Get after ttl +TEST_F(TtlTest, AbsentAfterTTL) { + MakeKVMap(kSampleSize_); + + OpenTtl(1); // T=0:Open the db with ttl = 2 + PutValues(0, kSampleSize_); // T=0:Insert Set1. Delete at t=2 + SleepCompactCheck(2, 0, kSampleSize_, false); // T=2:Set1 should not be there + CloseTtl(); +} + +// Resets the timestamp of a set of kvs by updating them and checks that they +// are not deleted according to the old timestamp +TEST_F(TtlTest, ResetTimestamp) { + MakeKVMap(kSampleSize_); + + OpenTtl(3); + PutValues(0, kSampleSize_); // T=0: Insert Set1. Delete at t=3 + env_->Sleep(2); // T=2 + PutValues(0, kSampleSize_); // T=2: Insert Set1. Delete at t=5 + SleepCompactCheck(2, 0, kSampleSize_); // T=4: Set1 should still be there + CloseTtl(); +} + +// Similar to PresentDuringTTL but uses Iterator +TEST_F(TtlTest, IterPresentDuringTTL) { + MakeKVMap(kSampleSize_); + + OpenTtl(2); + PutValues(0, kSampleSize_); // T=0: Insert. Delete at t=2 + SleepCompactCheckIter(1, 0, kSampleSize_); // T=1: Set should be there + CloseTtl(); +} + +// Similar to AbsentAfterTTL but uses Iterator +TEST_F(TtlTest, IterAbsentAfterTTL) { + MakeKVMap(kSampleSize_); + + OpenTtl(1); + PutValues(0, kSampleSize_); // T=0: Insert. Delete at t=1 + SleepCompactCheckIter(2, 0, kSampleSize_, false); // T=2: Should not be there + CloseTtl(); +} + +// Checks presence while opening the same db more than once with the same ttl +// Note: The second open will open the same db +TEST_F(TtlTest, MultiOpenSamePresent) { + MakeKVMap(kSampleSize_); + + OpenTtl(2); + PutValues(0, kSampleSize_); // T=0: Insert. 
Delete at t=2 + CloseTtl(); + + OpenTtl(2); // T=0. Delete at t=2 + SleepCompactCheck(1, 0, kSampleSize_); // T=1: Set should be there + CloseTtl(); +} + +// Checks absence while opening the same db more than once with the same ttl +// Note: The second open will open the same db +TEST_F(TtlTest, MultiOpenSameAbsent) { + MakeKVMap(kSampleSize_); + + OpenTtl(1); + PutValues(0, kSampleSize_); // T=0: Insert. Delete at t=1 + CloseTtl(); + + OpenTtl(1); // T=0.Delete at t=1 + SleepCompactCheck(2, 0, kSampleSize_, false); // T=2: Set should not be there + CloseTtl(); +} + +// Checks presence while opening the same db more than once with bigger ttl +TEST_F(TtlTest, MultiOpenDifferent) { + MakeKVMap(kSampleSize_); + + OpenTtl(1); + PutValues(0, kSampleSize_); // T=0: Insert. Delete at t=1 + CloseTtl(); + + OpenTtl(3); // T=0: Set deleted at t=3 + SleepCompactCheck(2, 0, kSampleSize_); // T=2: Set should be there + CloseTtl(); +} + +// Checks presence during ttl in read_only mode +TEST_F(TtlTest, ReadOnlyPresentForever) { + MakeKVMap(kSampleSize_); + + OpenTtl(1); // T=0:Open the db normally + PutValues(0, kSampleSize_); // T=0:Insert Set1. Delete at t=1 + CloseTtl(); + + OpenReadOnlyTtl(1); + ASSERT_TRUE(db_ttl_); + + env_->Sleep(2); + Status s = ManualCompact(); // T=2:Set1 should still be there + ASSERT_TRUE(s.IsNotSupported()); + CompactCheck(0, kSampleSize_); + CloseTtl(); +} + +// Checks whether WriteBatch works well with TTL +// Puts all kvs in kvmap_ in a batch and writes first, then deletes first half +TEST_F(TtlTest, WriteBatchTest) { + MakeKVMap(kSampleSize_); + BatchOperation batch_ops[kSampleSize_]; + for (int i = 0; i < kSampleSize_; i++) { + batch_ops[i] = OP_PUT; + } + + OpenTtl(2); + MakePutWriteBatch(batch_ops, kSampleSize_); + for (int i = 0; i < kSampleSize_ / 2; i++) { + batch_ops[i] = OP_DELETE; + } + MakePutWriteBatch(batch_ops, kSampleSize_ / 2); + SleepCompactCheck(0, 0, kSampleSize_ / 2, false); + SleepCompactCheck(0, kSampleSize_ / 2, kSampleSize_ - kSampleSize_ / 2); + CloseTtl(); +} + +// Checks user's compaction filter for correctness with TTL logic +TEST_F(TtlTest, CompactionFilter) { + MakeKVMap(kSampleSize_); + + OpenTtlWithTestCompaction(1); + PutValues(0, kSampleSize_); // T=0:Insert Set1. Delete at t=1 + // T=2: TTL logic takes precedence over TestFilter:-Set1 should not be there + SleepCompactCheck(2, 0, kSampleSize_, false); + CloseTtl(); + + OpenTtlWithTestCompaction(3); + PutValues(0, kSampleSize_); // T=0:Insert Set1. 
+ int64_t partition = kSampleSize_ / 3; + SleepCompactCheck(1, 0, partition, false); // Part dropped + SleepCompactCheck(0, partition, partition); // Part kept + SleepCompactCheck(0, 2 * partition, partition, true, true); // Part changed + CloseTtl(); +} + +// Insert some key-values which KeyMayExist should be able to get and check that +// values returned are fine +TEST_F(TtlTest, KeyMayExist) { + MakeKVMap(kSampleSize_); + + OpenTtl(); + PutValues(0, kSampleSize_, false); + + SimpleKeyMayExistCheck(); + + CloseTtl(); +} + +TEST_F(TtlTest, MultiGetTest) { + MakeKVMap(kSampleSize_); + + OpenTtl(); + PutValues(0, kSampleSize_, false); + + SimpleMultiGetTest(); + + CloseTtl(); +} + +TEST_F(TtlTest, ColumnFamiliesTest) { + DB* db; + Options options; + options.create_if_missing = true; + options.env = env_.get(); + + DB::Open(options, dbname_, &db); + ColumnFamilyHandle* handle; + ASSERT_OK(db->CreateColumnFamily(ColumnFamilyOptions(options), + "ttl_column_family", &handle)); + + delete handle; + delete db; + + std::vector column_families; + column_families.push_back(ColumnFamilyDescriptor( + kDefaultColumnFamilyName, ColumnFamilyOptions(options))); + column_families.push_back(ColumnFamilyDescriptor( + "ttl_column_family", ColumnFamilyOptions(options))); + + std::vector handles; + + ASSERT_OK(DBWithTTL::Open(DBOptions(options), dbname_, column_families, + &handles, &db_ttl_, {3, 5}, false)); + ASSERT_EQ(handles.size(), 2U); + ColumnFamilyHandle* new_handle; + ASSERT_OK(db_ttl_->CreateColumnFamilyWithTtl(options, "ttl_column_family_2", + &new_handle, 2)); + handles.push_back(new_handle); + + MakeKVMap(kSampleSize_); + PutValues(0, kSampleSize_, false, handles[0]); + PutValues(0, kSampleSize_, false, handles[1]); + PutValues(0, kSampleSize_, false, handles[2]); + + // everything should be there after 1 second + SleepCompactCheck(1, 0, kSampleSize_, true, false, handles[0]); + SleepCompactCheck(0, 0, kSampleSize_, true, false, handles[1]); + SleepCompactCheck(0, 0, kSampleSize_, true, false, handles[2]); + + // only column family 1 should be alive after 4 seconds + SleepCompactCheck(3, 0, kSampleSize_, false, false, handles[0]); + SleepCompactCheck(0, 0, kSampleSize_, true, false, handles[1]); + SleepCompactCheck(0, 0, kSampleSize_, false, false, handles[2]); + + // nothing should be there after 6 seconds + SleepCompactCheck(2, 0, kSampleSize_, false, false, handles[0]); + SleepCompactCheck(0, 0, kSampleSize_, false, false, handles[1]); + SleepCompactCheck(0, 0, kSampleSize_, false, false, handles[2]); + + for (auto h : handles) { + delete h; + } + delete db_ttl_; + db_ttl_ = nullptr; +} + +// Puts a set of values and checks its absence using Get after ttl +TEST_F(TtlTest, ChangeTtlOnOpenDb) { + MakeKVMap(kSampleSize_); + + OpenTtl(1); // T=0:Open the db with ttl = 2 + SetTtl(3); + PutValues(0, kSampleSize_); // T=0:Insert Set1. 
Delete at t=2 + SleepCompactCheck(2, 0, kSampleSize_, true); // T=2:Set1 should be there + CloseTtl(); +} + +// Test DeleteRange for DBWithTtl +TEST_F(TtlTest, DeleteRangeTest) { + OpenTtl(); + ASSERT_OK(db_ttl_->Put(WriteOptions(), "a", "val")); + MakeDeleteRange("a", "b"); + ASSERT_OK(db_ttl_->Put(WriteOptions(), "c", "val")); + MakeDeleteRange("b", "d"); + ASSERT_OK(db_ttl_->Put(WriteOptions(), "e", "val")); + MakeDeleteRange("d", "e"); + // first iteration verifies query correctness in memtable, second verifies + // query correctness for a single SST file + for (int i = 0; i < 2; i++) { + if (i > 0) { + ASSERT_OK(db_ttl_->Flush(FlushOptions())); + } + std::string value; + ASSERT_TRUE(db_ttl_->Get(ReadOptions(), "a", &value).IsNotFound()); + ASSERT_TRUE(db_ttl_->Get(ReadOptions(), "c", &value).IsNotFound()); + ASSERT_OK(db_ttl_->Get(ReadOptions(), "e", &value)); + } + CloseTtl(); +} + +class DummyFilter : public CompactionFilter { + public: + bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/, + std::string* /*new_value*/, + bool* /*value_changed*/) const override { + return false; + } + + const char* Name() const override { return kClassName(); } + static const char* kClassName() { return "DummyFilter"; } +}; + +class DummyFilterFactory : public CompactionFilterFactory { + public: + const char* Name() const override { return kClassName(); } + static const char* kClassName() { return "DummyFilterFactory"; } + + std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context&) override { + std::unique_ptr f(new DummyFilter()); + return f; + } +}; + +static int RegisterTestObjects(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory( + "DummyFilter", [](const std::string& /*uri*/, + std::unique_ptr* /*guard*/, + std::string* /* errmsg */) { + static DummyFilter dummy; + return &dummy; + }); + library.AddFactory( + "DummyFilterFactory", [](const std::string& /*uri*/, + std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new DummyFilterFactory()); + return guard->get(); + }); + return 2; +} + +class TtlOptionsTest : public testing::Test { + public: + TtlOptionsTest() { + config_options_.registry->AddLibrary("RegisterTtlObjects", + RegisterTtlObjects, ""); + config_options_.registry->AddLibrary("RegisterTtlTestObjects", + RegisterTestObjects, ""); + } + ConfigOptions config_options_; +}; + +TEST_F(TtlOptionsTest, LoadTtlCompactionFilter) { + const CompactionFilter* filter = nullptr; + + ASSERT_OK(CompactionFilter::CreateFromString( + config_options_, TtlCompactionFilter::kClassName(), &filter)); + ASSERT_NE(filter, nullptr); + ASSERT_STREQ(filter->Name(), TtlCompactionFilter::kClassName()); + auto ttl = filter->GetOptions("TTL"); + ASSERT_NE(ttl, nullptr); + ASSERT_EQ(*ttl, 0); + ASSERT_OK(filter->ValidateOptions(DBOptions(), ColumnFamilyOptions())); + delete filter; + filter = nullptr; + + ASSERT_OK(CompactionFilter::CreateFromString( + config_options_, "id=TtlCompactionFilter; ttl=123", &filter)); + ASSERT_NE(filter, nullptr); + ttl = filter->GetOptions("TTL"); + ASSERT_NE(ttl, nullptr); + ASSERT_EQ(*ttl, 123); + ASSERT_OK(filter->ValidateOptions(DBOptions(), ColumnFamilyOptions())); + delete filter; + filter = nullptr; + + ASSERT_OK(CompactionFilter::CreateFromString( + config_options_, + "id=TtlCompactionFilter; ttl=456; user_filter=DummyFilter;", &filter)); + ASSERT_NE(filter, nullptr); + auto inner = filter->CheckedCast(); + ASSERT_NE(inner, nullptr); + ASSERT_OK(filter->ValidateOptions(DBOptions(), 
ColumnFamilyOptions())); + std::string mismatch; + std::string opts_str = filter->ToString(config_options_); + const CompactionFilter* copy = nullptr; + ASSERT_OK( + CompactionFilter::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(filter->AreEquivalent(config_options_, copy, &mismatch)); + delete filter; + delete copy; +} + +TEST_F(TtlOptionsTest, LoadTtlCompactionFilterFactory) { + std::shared_ptr cff; + + ASSERT_OK(CompactionFilterFactory::CreateFromString( + config_options_, TtlCompactionFilterFactory::kClassName(), &cff)); + ASSERT_NE(cff.get(), nullptr); + ASSERT_STREQ(cff->Name(), TtlCompactionFilterFactory::kClassName()); + auto ttl = cff->GetOptions("TTL"); + ASSERT_NE(ttl, nullptr); + ASSERT_EQ(*ttl, 0); + ASSERT_OK(cff->ValidateOptions(DBOptions(), ColumnFamilyOptions())); + + ASSERT_OK(CompactionFilterFactory::CreateFromString( + config_options_, "id=TtlCompactionFilterFactory; ttl=123", &cff)); + ASSERT_NE(cff.get(), nullptr); + ASSERT_STREQ(cff->Name(), TtlCompactionFilterFactory::kClassName()); + ttl = cff->GetOptions("TTL"); + ASSERT_NE(ttl, nullptr); + ASSERT_EQ(*ttl, 123); + ASSERT_OK(cff->ValidateOptions(DBOptions(), ColumnFamilyOptions())); + + ASSERT_OK(CompactionFilterFactory::CreateFromString( + config_options_, + "id=TtlCompactionFilterFactory; ttl=456; " + "user_filter_factory=DummyFilterFactory;", + &cff)); + ASSERT_NE(cff.get(), nullptr); + auto filter = cff->CreateCompactionFilter(CompactionFilter::Context()); + ASSERT_NE(filter.get(), nullptr); + auto ttlf = filter->CheckedCast(); + ASSERT_EQ(filter.get(), ttlf); + auto user = filter->CheckedCast(); + ASSERT_NE(user, nullptr); + ASSERT_OK(cff->ValidateOptions(DBOptions(), ColumnFamilyOptions())); + + std::string opts_str = cff->ToString(config_options_); + std::string mismatch; + std::shared_ptr copy; + ASSERT_OK(CompactionFilterFactory::CreateFromString(config_options_, opts_str, + ©)); + ASSERT_TRUE(cff->AreEquivalent(config_options_, copy.get(), &mismatch)); +} + +TEST_F(TtlOptionsTest, LoadTtlMergeOperator) { + std::shared_ptr mo; + + config_options_.invoke_prepare_options = false; + ASSERT_OK(MergeOperator::CreateFromString( + config_options_, TtlMergeOperator::kClassName(), &mo)); + ASSERT_NE(mo.get(), nullptr); + ASSERT_STREQ(mo->Name(), TtlMergeOperator::kClassName()); + ASSERT_NOK(mo->ValidateOptions(DBOptions(), ColumnFamilyOptions())); + + config_options_.invoke_prepare_options = true; + ASSERT_OK(MergeOperator::CreateFromString( + config_options_, "id=TtlMergeOperator; user_operator=bytesxor", &mo)); + ASSERT_NE(mo.get(), nullptr); + ASSERT_STREQ(mo->Name(), TtlMergeOperator::kClassName()); + ASSERT_OK(mo->ValidateOptions(DBOptions(), ColumnFamilyOptions())); + auto ttl_mo = mo->CheckedCast(); + ASSERT_EQ(mo.get(), ttl_mo); + auto user = ttl_mo->CheckedCast(); + ASSERT_NE(user, nullptr); + + std::string mismatch; + std::string opts_str = mo->ToString(config_options_); + std::shared_ptr copy; + ASSERT_OK(MergeOperator::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(mo->AreEquivalent(config_options_, copy.get(), &mismatch)); +} +} // namespace ROCKSDB_NAMESPACE + +// A black-box test for the ttl wrapper around rocksdb +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED as DBWithTTL is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // !ROCKSDB_LITE diff --git 
a/src/rocksdb/utilities/util_merge_operators_test.cc b/src/rocksdb/utilities/util_merge_operators_test.cc new file mode 100644 index 000000000..fed6f1a75 --- /dev/null +++ b/src/rocksdb/utilities/util_merge_operators_test.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { + +class UtilMergeOperatorTest : public testing::Test { + public: + UtilMergeOperatorTest() {} + + std::string FullMergeV2(std::string existing_value, + std::vector operands, + std::string key = "") { + std::string result; + Slice result_operand(nullptr, 0); + + Slice existing_value_slice(existing_value); + std::vector operands_slice(operands.begin(), operands.end()); + + const MergeOperator::MergeOperationInput merge_in( + key, &existing_value_slice, operands_slice, nullptr); + MergeOperator::MergeOperationOutput merge_out(result, result_operand); + merge_operator_->FullMergeV2(merge_in, &merge_out); + + if (result_operand.data()) { + result.assign(result_operand.data(), result_operand.size()); + } + return result; + } + + std::string FullMergeV2(std::vector operands, + std::string key = "") { + std::string result; + Slice result_operand(nullptr, 0); + + std::vector operands_slice(operands.begin(), operands.end()); + + const MergeOperator::MergeOperationInput merge_in(key, nullptr, + operands_slice, nullptr); + MergeOperator::MergeOperationOutput merge_out(result, result_operand); + merge_operator_->FullMergeV2(merge_in, &merge_out); + + if (result_operand.data()) { + result.assign(result_operand.data(), result_operand.size()); + } + return result; + } + + std::string PartialMerge(std::string left, std::string right, + std::string key = "") { + std::string result; + + merge_operator_->PartialMerge(key, left, right, &result, nullptr); + return result; + } + + std::string PartialMergeMulti(std::deque operands, + std::string key = "") { + std::string result; + std::deque operands_slice(operands.begin(), operands.end()); + + merge_operator_->PartialMergeMulti(key, operands_slice, &result, nullptr); + return result; + } + + protected: + std::shared_ptr merge_operator_; +}; + +TEST_F(UtilMergeOperatorTest, MaxMergeOperator) { + merge_operator_ = MergeOperators::CreateMaxOperator(); + + EXPECT_EQ("B", FullMergeV2("B", {"A"})); + EXPECT_EQ("B", FullMergeV2("A", {"B"})); + EXPECT_EQ("", FullMergeV2({"", "", ""})); + EXPECT_EQ("A", FullMergeV2({"A"})); + EXPECT_EQ("ABC", FullMergeV2({"ABC"})); + EXPECT_EQ("Z", FullMergeV2({"ABC", "Z", "C", "AXX"})); + EXPECT_EQ("ZZZ", FullMergeV2({"ABC", "CC", "Z", "ZZZ"})); + EXPECT_EQ("a", FullMergeV2("a", {"ABC", "CC", "Z", "ZZZ"})); + + EXPECT_EQ("z", PartialMergeMulti({"a", "z", "efqfqwgwew", "aaz", "hhhhh"})); + + EXPECT_EQ("b", PartialMerge("a", "b")); + EXPECT_EQ("z", PartialMerge("z", "azzz")); + EXPECT_EQ("a", PartialMerge("a", "")); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/utilities/wal_filter.cc b/src/rocksdb/utilities/wal_filter.cc new file mode 100644 index 000000000..98bba3610 --- /dev/null +++ b/src/rocksdb/utilities/wal_filter.cc 
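The MaxMergeOperator expectations above exercise the operator directly; the sketch below shows the same operator plugged into a database, which is how it is normally consumed. It is illustrative only: the path and keys are made up, and it is not part of the patch.

#include <cassert>
#include <string>

#include "rocksdb/db.h"
#include "utilities/merge_operators.h"

// Sketch only: successive Merge() calls keep the largest operand, as the
// FullMergeV2/PartialMerge checks above verify.
void MaxOperatorDbSketch() {
  ROCKSDB_NAMESPACE::Options options;
  options.create_if_missing = true;
  options.merge_operator =
      ROCKSDB_NAMESPACE::MergeOperators::CreateMaxOperator();

  ROCKSDB_NAMESPACE::DB* db = nullptr;
  if (!ROCKSDB_NAMESPACE::DB::Open(options, "/tmp/max_merge_example", &db)
           .ok()) {
    return;
  }
  db->Merge(ROCKSDB_NAMESPACE::WriteOptions(), "k", "ABC")
      .PermitUncheckedError();
  db->Merge(ROCKSDB_NAMESPACE::WriteOptions(), "k", "Z")
      .PermitUncheckedError();
  std::string v;
  if (db->Get(ROCKSDB_NAMESPACE::ReadOptions(), "k", &v).ok()) {
    assert(v == "Z");  // the lexicographically largest operand wins
  }
  delete db;
}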
@@ -0,0 +1,23 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/wal_filter.h" + +#include + +#include "rocksdb/convenience.h" +#include "rocksdb/options.h" +#include "rocksdb/utilities/customizable_util.h" + +namespace ROCKSDB_NAMESPACE { +Status WalFilter::CreateFromString(const ConfigOptions& config_options, + const std::string& value, + WalFilter** filter) { + Status s = + LoadStaticObject(config_options, value, nullptr, filter); + return s; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc new file mode 100644 index 000000000..408243b3f --- /dev/null +++ b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc @@ -0,0 +1,695 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "rocksdb/utilities/write_batch_with_index.h" + +#include + +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" +#include "db/merge_context.h" +#include "db/merge_helper.h" +#include "memory/arena.h" +#include "memtable/skiplist.h" +#include "options/db_options.h" +#include "rocksdb/comparator.h" +#include "rocksdb/iterator.h" +#include "util/cast_util.h" +#include "util/string_util.h" +#include "utilities/write_batch_with_index/write_batch_with_index_internal.h" + +namespace ROCKSDB_NAMESPACE { +struct WriteBatchWithIndex::Rep { + explicit Rep(const Comparator* index_comparator, size_t reserved_bytes = 0, + size_t max_bytes = 0, bool _overwrite_key = false, + size_t protection_bytes_per_key = 0) + : write_batch(reserved_bytes, max_bytes, protection_bytes_per_key, + index_comparator ? index_comparator->timestamp_size() : 0), + comparator(index_comparator, &write_batch), + skip_list(comparator, &arena), + overwrite_key(_overwrite_key), + last_entry_offset(0), + last_sub_batch_offset(0), + sub_batch_cnt(1) {} + ReadableWriteBatch write_batch; + WriteBatchEntryComparator comparator; + Arena arena; + WriteBatchEntrySkipList skip_list; + bool overwrite_key; + size_t last_entry_offset; + // The starting offset of the last sub-batch. A sub-batch starts right before + // inserting a key that is a duplicate of a key in the last sub-batch. Zero, + // the default, means that no duplicate key is detected so far. + size_t last_sub_batch_offset; + // Total number of sub-batches in the write batch. Default is 1. + size_t sub_batch_cnt; + + // Remember current offset of internal write batch, which is used as + // the starting offset of the next record. + void SetLastEntryOffset() { last_entry_offset = write_batch.GetDataSize(); } + + // In overwrite mode, find the existing entry for the same key and update it + // to point to the current entry. + // Return true if the key is found and updated. + bool UpdateExistingEntry(ColumnFamilyHandle* column_family, const Slice& key, + WriteType type); + bool UpdateExistingEntryWithCfId(uint32_t column_family_id, const Slice& key, + WriteType type); + + // Add the recent entry to the update. 
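To complement the Rep bookkeeping described above (the skip-list index, overwrite mode and sub-batch counting), here is a small usage sketch of the public WriteBatchWithIndex API it backs. It is not part of the patch; the keys, values and the overwrite_key choice are illustrative.

#include <memory>

#include "rocksdb/comparator.h"
#include "rocksdb/utilities/write_batch_with_index.h"

// Sketch only: buffered updates are indexed as they are added, so they can be
// iterated (or looked up) before the batch is ever written to a DB.
void WriteBatchWithIndexSketch() {
  ROCKSDB_NAMESPACE::WriteBatchWithIndex batch(
      ROCKSDB_NAMESPACE::BytewiseComparator(), /*reserved_bytes=*/0,
      /*overwrite_key=*/true);
  batch.Put("k1", "v1").PermitUncheckedError();
  batch.Put("k1", "v2").PermitUncheckedError();  // updates the existing index entry
  batch.Put("k2", "w").PermitUncheckedError();

  std::unique_ptr<ROCKSDB_NAMESPACE::WBWIIterator> it(batch.NewIterator());
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    ROCKSDB_NAMESPACE::WriteEntry entry = it->Entry();
    // entry.type / entry.key / entry.value describe each buffered record;
    // with overwrite_key=true, "k1" shows up once, pointing at "v2".
    (void)entry;
  }
}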
+ // In overwrite mode, if key already exists in the index, update it. + void AddOrUpdateIndex(ColumnFamilyHandle* column_family, const Slice& key, + WriteType type); + void AddOrUpdateIndex(const Slice& key, WriteType type); + + // Allocate an index entry pointing to the last entry in the write batch and + // put it to skip list. + void AddNewEntry(uint32_t column_family_id); + + // Clear all updates buffered in this batch. + void Clear(); + void ClearIndex(); + + // Rebuild index by reading all records from the batch. + // Returns non-ok status on corruption. + Status ReBuildIndex(); +}; + +bool WriteBatchWithIndex::Rep::UpdateExistingEntry( + ColumnFamilyHandle* column_family, const Slice& key, WriteType type) { + uint32_t cf_id = GetColumnFamilyID(column_family); + return UpdateExistingEntryWithCfId(cf_id, key, type); +} + +bool WriteBatchWithIndex::Rep::UpdateExistingEntryWithCfId( + uint32_t column_family_id, const Slice& key, WriteType type) { + if (!overwrite_key) { + return false; + } + + WBWIIteratorImpl iter(column_family_id, &skip_list, &write_batch, + &comparator); + iter.Seek(key); + if (!iter.Valid()) { + return false; + } else if (!iter.MatchesKey(column_family_id, key)) { + return false; + } else { + // Move to the end of this key (NextKey-Prev) + iter.NextKey(); // Move to the next key + if (iter.Valid()) { + iter.Prev(); // Move back one entry + } else { + iter.SeekToLast(); + } + } + WriteBatchIndexEntry* non_const_entry = + const_cast(iter.GetRawEntry()); + if (LIKELY(last_sub_batch_offset <= non_const_entry->offset)) { + last_sub_batch_offset = last_entry_offset; + sub_batch_cnt++; + } + if (type == kMergeRecord) { + return false; + } else { + non_const_entry->offset = last_entry_offset; + return true; + } +} + +void WriteBatchWithIndex::Rep::AddOrUpdateIndex( + ColumnFamilyHandle* column_family, const Slice& key, WriteType type) { + if (!UpdateExistingEntry(column_family, key, type)) { + uint32_t cf_id = GetColumnFamilyID(column_family); + const auto* cf_cmp = GetColumnFamilyUserComparator(column_family); + if (cf_cmp != nullptr) { + comparator.SetComparatorForCF(cf_id, cf_cmp); + } + AddNewEntry(cf_id); + } +} + +void WriteBatchWithIndex::Rep::AddOrUpdateIndex(const Slice& key, + WriteType type) { + if (!UpdateExistingEntryWithCfId(0, key, type)) { + AddNewEntry(0); + } +} + +void WriteBatchWithIndex::Rep::AddNewEntry(uint32_t column_family_id) { + const std::string& wb_data = write_batch.Data(); + Slice entry_ptr = Slice(wb_data.data() + last_entry_offset, + wb_data.size() - last_entry_offset); + // Extract key + Slice key; + bool success = + ReadKeyFromWriteBatchEntry(&entry_ptr, &key, column_family_id != 0); +#ifdef NDEBUG + (void)success; +#endif + assert(success); + + const Comparator* const ucmp = comparator.GetComparator(column_family_id); + size_t ts_sz = ucmp ? 
ucmp->timestamp_size() : 0; + + if (ts_sz > 0) { + key.remove_suffix(ts_sz); + } + + auto* mem = arena.Allocate(sizeof(WriteBatchIndexEntry)); + auto* index_entry = + new (mem) WriteBatchIndexEntry(last_entry_offset, column_family_id, + key.data() - wb_data.data(), key.size()); + skip_list.Insert(index_entry); +} + +void WriteBatchWithIndex::Rep::Clear() { + write_batch.Clear(); + ClearIndex(); +} + +void WriteBatchWithIndex::Rep::ClearIndex() { + skip_list.~WriteBatchEntrySkipList(); + arena.~Arena(); + new (&arena) Arena(); + new (&skip_list) WriteBatchEntrySkipList(comparator, &arena); + last_entry_offset = 0; + last_sub_batch_offset = 0; + sub_batch_cnt = 1; +} + +Status WriteBatchWithIndex::Rep::ReBuildIndex() { + Status s; + + ClearIndex(); + + if (write_batch.Count() == 0) { + // Nothing to re-index + return s; + } + + size_t offset = WriteBatchInternal::GetFirstOffset(&write_batch); + + Slice input(write_batch.Data()); + input.remove_prefix(offset); + + // Loop through all entries in Rep and add each one to the index + uint32_t found = 0; + while (s.ok() && !input.empty()) { + Slice key, value, blob, xid; + uint32_t column_family_id = 0; // default + char tag = 0; + + // set offset of current entry for call to AddNewEntry() + last_entry_offset = input.data() - write_batch.Data().data(); + + s = ReadRecordFromWriteBatch(&input, &tag, &column_family_id, &key, &value, + &blob, &xid); + if (!s.ok()) { + break; + } + + switch (tag) { + case kTypeColumnFamilyValue: + case kTypeValue: + found++; + if (!UpdateExistingEntryWithCfId(column_family_id, key, kPutRecord)) { + AddNewEntry(column_family_id); + } + break; + case kTypeColumnFamilyDeletion: + case kTypeDeletion: + found++; + if (!UpdateExistingEntryWithCfId(column_family_id, key, + kDeleteRecord)) { + AddNewEntry(column_family_id); + } + break; + case kTypeColumnFamilySingleDeletion: + case kTypeSingleDeletion: + found++; + if (!UpdateExistingEntryWithCfId(column_family_id, key, + kSingleDeleteRecord)) { + AddNewEntry(column_family_id); + } + break; + case kTypeColumnFamilyMerge: + case kTypeMerge: + found++; + if (!UpdateExistingEntryWithCfId(column_family_id, key, kMergeRecord)) { + AddNewEntry(column_family_id); + } + break; + case kTypeLogData: + case kTypeBeginPrepareXID: + case kTypeBeginPersistedPrepareXID: + case kTypeBeginUnprepareXID: + case kTypeEndPrepareXID: + case kTypeCommitXID: + case kTypeCommitXIDAndTimestamp: + case kTypeRollbackXID: + case kTypeNoop: + break; + default: + return Status::Corruption( + "unknown WriteBatch tag in ReBuildIndex", + std::to_string(static_cast(tag))); + } + } + + if (s.ok() && found != write_batch.Count()) { + s = Status::Corruption("WriteBatch has wrong count"); + } + + return s; +} + +WriteBatchWithIndex::WriteBatchWithIndex( + const Comparator* default_index_comparator, size_t reserved_bytes, + bool overwrite_key, size_t max_bytes, size_t protection_bytes_per_key) + : rep(new Rep(default_index_comparator, reserved_bytes, max_bytes, + overwrite_key, protection_bytes_per_key)) {} + +WriteBatchWithIndex::~WriteBatchWithIndex() {} + +WriteBatchWithIndex::WriteBatchWithIndex(WriteBatchWithIndex&&) = default; + +WriteBatchWithIndex& WriteBatchWithIndex::operator=(WriteBatchWithIndex&&) = + default; + +WriteBatch* WriteBatchWithIndex::GetWriteBatch() { return &rep->write_batch; } + +size_t WriteBatchWithIndex::SubBatchCnt() { return rep->sub_batch_cnt; } + +WBWIIterator* WriteBatchWithIndex::NewIterator() { + return new WBWIIteratorImpl(0, &(rep->skip_list), &rep->write_batch, + 
&(rep->comparator)); +} + +WBWIIterator* WriteBatchWithIndex::NewIterator( + ColumnFamilyHandle* column_family) { + return new WBWIIteratorImpl(GetColumnFamilyID(column_family), + &(rep->skip_list), &rep->write_batch, + &(rep->comparator)); +} + +Iterator* WriteBatchWithIndex::NewIteratorWithBase( + ColumnFamilyHandle* column_family, Iterator* base_iterator, + const ReadOptions* read_options) { + auto wbwiii = + new WBWIIteratorImpl(GetColumnFamilyID(column_family), &(rep->skip_list), + &rep->write_batch, &rep->comparator); + return new BaseDeltaIterator(column_family, base_iterator, wbwiii, + GetColumnFamilyUserComparator(column_family), + read_options); +} + +Iterator* WriteBatchWithIndex::NewIteratorWithBase(Iterator* base_iterator) { + // default column family's comparator + auto wbwiii = new WBWIIteratorImpl(0, &(rep->skip_list), &rep->write_batch, + &rep->comparator); + return new BaseDeltaIterator(nullptr, base_iterator, wbwiii, + rep->comparator.default_comparator()); +} + +Status WriteBatchWithIndex::Put(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) { + rep->SetLastEntryOffset(); + auto s = rep->write_batch.Put(column_family, key, value); + if (s.ok()) { + rep->AddOrUpdateIndex(column_family, key, kPutRecord); + } + return s; +} + +Status WriteBatchWithIndex::Put(const Slice& key, const Slice& value) { + rep->SetLastEntryOffset(); + auto s = rep->write_batch.Put(key, value); + if (s.ok()) { + rep->AddOrUpdateIndex(key, kPutRecord); + } + return s; +} + +Status WriteBatchWithIndex::Put(ColumnFamilyHandle* column_family, + const Slice& /*key*/, const Slice& /*ts*/, + const Slice& /*value*/) { + if (!column_family) { + return Status::InvalidArgument("column family handle cannot be nullptr"); + } + // TODO: support WBWI::Put() with timestamp. + return Status::NotSupported(); +} + +Status WriteBatchWithIndex::Delete(ColumnFamilyHandle* column_family, + const Slice& key) { + rep->SetLastEntryOffset(); + auto s = rep->write_batch.Delete(column_family, key); + if (s.ok()) { + rep->AddOrUpdateIndex(column_family, key, kDeleteRecord); + } + return s; +} + +Status WriteBatchWithIndex::Delete(const Slice& key) { + rep->SetLastEntryOffset(); + auto s = rep->write_batch.Delete(key); + if (s.ok()) { + rep->AddOrUpdateIndex(key, kDeleteRecord); + } + return s; +} + +Status WriteBatchWithIndex::Delete(ColumnFamilyHandle* column_family, + const Slice& /*key*/, const Slice& /*ts*/) { + if (!column_family) { + return Status::InvalidArgument("column family handle cannot be nullptr"); + } + // TODO: support WBWI::Delete() with timestamp. + return Status::NotSupported(); +} + +Status WriteBatchWithIndex::SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key) { + rep->SetLastEntryOffset(); + auto s = rep->write_batch.SingleDelete(column_family, key); + if (s.ok()) { + rep->AddOrUpdateIndex(column_family, key, kSingleDeleteRecord); + } + return s; +} + +Status WriteBatchWithIndex::SingleDelete(const Slice& key) { + rep->SetLastEntryOffset(); + auto s = rep->write_batch.SingleDelete(key); + if (s.ok()) { + rep->AddOrUpdateIndex(key, kSingleDeleteRecord); + } + return s; +} + +Status WriteBatchWithIndex::SingleDelete(ColumnFamilyHandle* column_family, + const Slice& /*key*/, + const Slice& /*ts*/) { + if (!column_family) { + return Status::InvalidArgument("column family handle cannot be nullptr"); + } + // TODO: support WBWI::SingleDelete() with timestamp. 
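+  // Editor's note (illustrative, not upstream text): only the overloads
+  // without a user-defined timestamp are indexed today, e.g.
+  //   WriteBatchWithIndex wbwi(BytewiseComparator(), 20, /*overwrite_key=*/false);
+  //   Status s1 = wbwi.SingleDelete(cf, "key");       // indexed, normally OK
+  //   Status s2 = wbwi.SingleDelete(cf, "key", ts);   // returns NotSupported
+  // where `cf` and `ts` are a caller-provided handle and timestamp slice.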
+ return Status::NotSupported(); +} + +Status WriteBatchWithIndex::Merge(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) { + rep->SetLastEntryOffset(); + auto s = rep->write_batch.Merge(column_family, key, value); + if (s.ok()) { + rep->AddOrUpdateIndex(column_family, key, kMergeRecord); + } + return s; +} + +Status WriteBatchWithIndex::Merge(const Slice& key, const Slice& value) { + rep->SetLastEntryOffset(); + auto s = rep->write_batch.Merge(key, value); + if (s.ok()) { + rep->AddOrUpdateIndex(key, kMergeRecord); + } + return s; +} + +Status WriteBatchWithIndex::PutLogData(const Slice& blob) { + return rep->write_batch.PutLogData(blob); +} + +void WriteBatchWithIndex::Clear() { rep->Clear(); } + +Status WriteBatchWithIndex::GetFromBatch(ColumnFamilyHandle* column_family, + const DBOptions& options, + const Slice& key, std::string* value) { + Status s; + WriteBatchWithIndexInternal wbwii(&options, column_family); + auto result = wbwii.GetFromBatch(this, key, value, &s); + + switch (result) { + case WBWIIteratorImpl::kFound: + case WBWIIteratorImpl::kError: + // use returned status + break; + case WBWIIteratorImpl::kDeleted: + case WBWIIteratorImpl::kNotFound: + s = Status::NotFound(); + break; + case WBWIIteratorImpl::kMergeInProgress: + s = Status::MergeInProgress(); + break; + default: + assert(false); + } + + return s; +} + +Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db, + const ReadOptions& read_options, + const Slice& key, + std::string* value) { + assert(value != nullptr); + PinnableSlice pinnable_val(value); + assert(!pinnable_val.IsPinned()); + auto s = GetFromBatchAndDB(db, read_options, db->DefaultColumnFamily(), key, + &pinnable_val); + if (s.ok() && pinnable_val.IsPinned()) { + value->assign(pinnable_val.data(), pinnable_val.size()); + } // else value is already assigned + return s; +} + +Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db, + const ReadOptions& read_options, + const Slice& key, + PinnableSlice* pinnable_val) { + return GetFromBatchAndDB(db, read_options, db->DefaultColumnFamily(), key, + pinnable_val); +} + +Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db, + const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, + std::string* value) { + assert(value != nullptr); + PinnableSlice pinnable_val(value); + assert(!pinnable_val.IsPinned()); + auto s = + GetFromBatchAndDB(db, read_options, column_family, key, &pinnable_val); + if (s.ok() && pinnable_val.IsPinned()) { + value->assign(pinnable_val.data(), pinnable_val.size()); + } // else value is already assigned + return s; +} + +Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db, + const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, + PinnableSlice* pinnable_val) { + return GetFromBatchAndDB(db, read_options, column_family, key, pinnable_val, + nullptr); +} + +Status WriteBatchWithIndex::GetFromBatchAndDB( + DB* db, const ReadOptions& read_options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* pinnable_val, ReadCallback* callback) { + const Comparator* const ucmp = rep->comparator.GetComparator(column_family); + size_t ts_sz = ucmp ? 
ucmp->timestamp_size() : 0; + if (ts_sz > 0 && !read_options.timestamp) { + return Status::InvalidArgument("Must specify timestamp"); + } + + Status s; + WriteBatchWithIndexInternal wbwii(db, column_family); + + // Since the lifetime of the WriteBatch is the same as that of the transaction + // we cannot pin it as otherwise the returned value will not be available + // after the transaction finishes. + std::string& batch_value = *pinnable_val->GetSelf(); + auto result = wbwii.GetFromBatch(this, key, &batch_value, &s); + + if (result == WBWIIteratorImpl::kFound) { + pinnable_val->PinSelf(); + return s; + } else if (!s.ok() || result == WBWIIteratorImpl::kError) { + return s; + } else if (result == WBWIIteratorImpl::kDeleted) { + return Status::NotFound(); + } + assert(result == WBWIIteratorImpl::kMergeInProgress || + result == WBWIIteratorImpl::kNotFound); + + // Did not find key in batch OR could not resolve Merges. Try DB. + if (!callback) { + s = db->Get(read_options, column_family, key, pinnable_val); + } else { + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = column_family; + get_impl_options.value = pinnable_val; + get_impl_options.callback = callback; + s = static_cast_with_check(db->GetRootDB()) + ->GetImpl(read_options, key, get_impl_options); + } + + if (s.ok() || s.IsNotFound()) { // DB Get Succeeded + if (result == WBWIIteratorImpl::kMergeInProgress) { + // Merge result from DB with merges in Batch + std::string merge_result; + if (s.ok()) { + s = wbwii.MergeKey(key, pinnable_val, &merge_result); + } else { // Key not present in db (s.IsNotFound()) + s = wbwii.MergeKey(key, nullptr, &merge_result); + } + if (s.ok()) { + pinnable_val->Reset(); + *pinnable_val->GetSelf() = std::move(merge_result); + pinnable_val->PinSelf(); + } + } + } + + return s; +} + +void WriteBatchWithIndex::MultiGetFromBatchAndDB( + DB* db, const ReadOptions& read_options, ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, PinnableSlice* values, + Status* statuses, bool sorted_input) { + MultiGetFromBatchAndDB(db, read_options, column_family, num_keys, keys, + values, statuses, sorted_input, nullptr); +} + +void WriteBatchWithIndex::MultiGetFromBatchAndDB( + DB* db, const ReadOptions& read_options, ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, PinnableSlice* values, + Status* statuses, bool sorted_input, ReadCallback* callback) { + const Comparator* const ucmp = rep->comparator.GetComparator(column_family); + size_t ts_sz = ucmp ? ucmp->timestamp_size() : 0; + if (ts_sz > 0 && !read_options.timestamp) { + for (size_t i = 0; i < num_keys; ++i) { + statuses[i] = Status::InvalidArgument("Must specify timestamp"); + } + return; + } + + WriteBatchWithIndexInternal wbwii(db, column_family); + + autovector key_context; + autovector sorted_keys; + // To hold merges from the write batch + autovector, + MultiGetContext::MAX_BATCH_SIZE> + merges; + // Since the lifetime of the WriteBatch is the same as that of the transaction + // we cannot pin it as otherwise the returned value will not be available + // after the transaction finishes. 
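+  //
+  // Illustrative call pattern (editor's addition, not upstream code); assumes
+  // a caller-provided `wbwi`, `db` and `cf`:
+  //   std::vector<Slice> keys = {"k1", "k2"};
+  //   std::vector<PinnableSlice> vals(keys.size());
+  //   std::vector<Status> stats(keys.size());
+  //   wbwi.MultiGetFromBatchAndDB(db, ReadOptions(), cf, keys.size(),
+  //                               keys.data(), vals.data(), stats.data(),
+  //                               /*sorted_input=*/false);
+  // Keys fully resolved by the batch are answered in the loop below; the rest
+  // fall through to MultiGetWithCallback() and are merged afterwards.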
+ for (size_t i = 0; i < num_keys; ++i) { + MergeContext merge_context; + std::string batch_value; + Status* s = &statuses[i]; + PinnableSlice* pinnable_val = &values[i]; + pinnable_val->Reset(); + auto result = + wbwii.GetFromBatch(this, keys[i], &merge_context, &batch_value, s); + + if (result == WBWIIteratorImpl::kFound) { + *pinnable_val->GetSelf() = std::move(batch_value); + pinnable_val->PinSelf(); + continue; + } + if (result == WBWIIteratorImpl::kDeleted) { + *s = Status::NotFound(); + continue; + } + if (result == WBWIIteratorImpl::kError) { + continue; + } + assert(result == WBWIIteratorImpl::kMergeInProgress || + result == WBWIIteratorImpl::kNotFound); + key_context.emplace_back(column_family, keys[i], &values[i], + /*timestamp*/ nullptr, &statuses[i]); + merges.emplace_back(result, std::move(merge_context)); + } + + for (KeyContext& key : key_context) { + sorted_keys.emplace_back(&key); + } + + // Did not find key in batch OR could not resolve Merges. Try DB. + static_cast_with_check(db->GetRootDB()) + ->PrepareMultiGetKeys(key_context.size(), sorted_input, &sorted_keys); + static_cast_with_check(db->GetRootDB()) + ->MultiGetWithCallback(read_options, column_family, callback, + &sorted_keys); + + for (auto iter = key_context.begin(); iter != key_context.end(); ++iter) { + KeyContext& key = *iter; + if (key.s->ok() || key.s->IsNotFound()) { // DB Get Succeeded + size_t index = iter - key_context.begin(); + std::pair& merge_result = + merges[index]; + if (merge_result.first == WBWIIteratorImpl::kMergeInProgress) { + std::string merged_value; + // Merge result from DB with merges in Batch + if (key.s->ok()) { + *key.s = wbwii.MergeKey(*key.key, iter->value, merge_result.second, + &merged_value); + } else { // Key not present in db (s.IsNotFound()) + *key.s = wbwii.MergeKey(*key.key, nullptr, merge_result.second, + &merged_value); + } + if (key.s->ok()) { + key.value->Reset(); + *key.value->GetSelf() = std::move(merged_value); + key.value->PinSelf(); + } + } + } + } +} + +void WriteBatchWithIndex::SetSavePoint() { rep->write_batch.SetSavePoint(); } + +Status WriteBatchWithIndex::RollbackToSavePoint() { + Status s = rep->write_batch.RollbackToSavePoint(); + + if (s.ok()) { + rep->sub_batch_cnt = 1; + rep->last_sub_batch_offset = 0; + s = rep->ReBuildIndex(); + } + + return s; +} + +Status WriteBatchWithIndex::PopSavePoint() { + return rep->write_batch.PopSavePoint(); +} + +void WriteBatchWithIndex::SetMaxBytes(size_t max_bytes) { + rep->write_batch.SetMaxBytes(max_bytes); +} + +size_t WriteBatchWithIndex::GetDataSize() const { + return rep->write_batch.GetDataSize(); +} + +const Comparator* WriteBatchWithIndexInternal::GetUserComparator( + const WriteBatchWithIndex& wbwi, uint32_t cf_id) { + const WriteBatchEntryComparator& ucmps = wbwi.rep->comparator; + return ucmps.GetComparator(cf_id); +} + +} // namespace ROCKSDB_NAMESPACE +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc new file mode 100644 index 000000000..3c9205bf7 --- /dev/null +++ b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc @@ -0,0 +1,735 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
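+//
+// Illustrative sketch (editor's addition, not part of the upstream sources):
+// BaseDeltaIterator, implemented below, layers the batch's indexed updates
+// ("delta") on top of a database iterator ("base") so a reader sees the DB as
+// if the batch had already been applied, e.g.
+//   WriteBatchWithIndex wbwi(BytewiseComparator(), 20, /*overwrite_key=*/true);
+//   Status s = wbwi.Put("k2", "from-batch");
+//   std::unique_ptr<Iterator> it(
+//       wbwi.NewIteratorWithBase(db->NewIterator(ReadOptions())));
+//   for (it->SeekToFirst(); it->Valid(); it->Next()) {
+//     // "k2" yields "from-batch" even though it is not yet written to `db`.
+//   }
+// where `db` is a caller-provided DB*.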
+ +#ifndef ROCKSDB_LITE + +#include "utilities/write_batch_with_index/write_batch_with_index_internal.h" + +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" +#include "db/merge_context.h" +#include "db/merge_helper.h" +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/utilities/write_batch_with_index.h" +#include "util/cast_util.h" +#include "util/coding.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +BaseDeltaIterator::BaseDeltaIterator(ColumnFamilyHandle* column_family, + Iterator* base_iterator, + WBWIIteratorImpl* delta_iterator, + const Comparator* comparator, + const ReadOptions* read_options) + : forward_(true), + current_at_base_(true), + equal_keys_(false), + status_(Status::OK()), + base_iterator_(base_iterator), + delta_iterator_(delta_iterator), + comparator_(comparator), + iterate_upper_bound_(read_options ? read_options->iterate_upper_bound + : nullptr) { + assert(comparator_); + wbwii_.reset(new WriteBatchWithIndexInternal(column_family)); +} + +bool BaseDeltaIterator::Valid() const { + return status_.ok() ? (current_at_base_ ? BaseValid() : DeltaValid()) : false; +} + +void BaseDeltaIterator::SeekToFirst() { + forward_ = true; + base_iterator_->SeekToFirst(); + delta_iterator_->SeekToFirst(); + UpdateCurrent(); +} + +void BaseDeltaIterator::SeekToLast() { + forward_ = false; + base_iterator_->SeekToLast(); + delta_iterator_->SeekToLast(); + UpdateCurrent(); +} + +void BaseDeltaIterator::Seek(const Slice& k) { + forward_ = true; + base_iterator_->Seek(k); + delta_iterator_->Seek(k); + UpdateCurrent(); +} + +void BaseDeltaIterator::SeekForPrev(const Slice& k) { + forward_ = false; + base_iterator_->SeekForPrev(k); + delta_iterator_->SeekForPrev(k); + UpdateCurrent(); +} + +void BaseDeltaIterator::Next() { + if (!Valid()) { + status_ = Status::NotSupported("Next() on invalid iterator"); + return; + } + + if (!forward_) { + // Need to change direction + // if our direction was backward and we're not equal, we have two states: + // * both iterators are valid: we're already in a good state (current + // shows to smaller) + // * only one iterator is valid: we need to advance that iterator + forward_ = true; + equal_keys_ = false; + if (!BaseValid()) { + assert(DeltaValid()); + base_iterator_->SeekToFirst(); + } else if (!DeltaValid()) { + delta_iterator_->SeekToFirst(); + } else if (current_at_base_) { + // Change delta from larger than base to smaller + AdvanceDelta(); + } else { + // Change base from larger than delta to smaller + AdvanceBase(); + } + if (DeltaValid() && BaseValid()) { + if (0 == comparator_->CompareWithoutTimestamp( + delta_iterator_->Entry().key, /*a_has_ts=*/false, + base_iterator_->key(), /*b_has_ts=*/false)) { + equal_keys_ = true; + } + } + } + Advance(); +} + +void BaseDeltaIterator::Prev() { + if (!Valid()) { + status_ = Status::NotSupported("Prev() on invalid iterator"); + return; + } + + if (forward_) { + // Need to change direction + // if our direction was backward and we're not equal, we have two states: + // * both iterators are valid: we're already in a good state (current + // shows to smaller) + // * only one iterator is valid: we need to advance that iterator + forward_ = false; + equal_keys_ = false; + if (!BaseValid()) { + assert(DeltaValid()); + base_iterator_->SeekToLast(); + } else if (!DeltaValid()) { + delta_iterator_->SeekToLast(); + } else if (current_at_base_) { + // Change delta from less advanced than base to more advanced + AdvanceDelta(); + } else { + // Change base from 
less advanced than delta to more advanced + AdvanceBase(); + } + if (DeltaValid() && BaseValid()) { + if (0 == comparator_->CompareWithoutTimestamp( + delta_iterator_->Entry().key, /*a_has_ts=*/false, + base_iterator_->key(), /*b_has_ts=*/false)) { + equal_keys_ = true; + } + } + } + + Advance(); +} + +Slice BaseDeltaIterator::key() const { + return current_at_base_ ? base_iterator_->key() + : delta_iterator_->Entry().key; +} + +Slice BaseDeltaIterator::value() const { + if (current_at_base_) { + return base_iterator_->value(); + } else { + WriteEntry delta_entry = delta_iterator_->Entry(); + if (wbwii_->GetNumOperands() == 0) { + return delta_entry.value; + } else if (delta_entry.type == kDeleteRecord || + delta_entry.type == kSingleDeleteRecord) { + status_ = + wbwii_->MergeKey(delta_entry.key, nullptr, merge_result_.GetSelf()); + } else if (delta_entry.type == kPutRecord) { + status_ = wbwii_->MergeKey(delta_entry.key, &delta_entry.value, + merge_result_.GetSelf()); + } else if (delta_entry.type == kMergeRecord) { + if (equal_keys_) { + Slice base_value = base_iterator_->value(); + status_ = wbwii_->MergeKey(delta_entry.key, &base_value, + merge_result_.GetSelf()); + } else { + status_ = + wbwii_->MergeKey(delta_entry.key, nullptr, merge_result_.GetSelf()); + } + } + merge_result_.PinSelf(); + return merge_result_; + } +} + +Status BaseDeltaIterator::status() const { + if (!status_.ok()) { + return status_; + } + if (!base_iterator_->status().ok()) { + return base_iterator_->status(); + } + return delta_iterator_->status(); +} + +void BaseDeltaIterator::Invalidate(Status s) { status_ = s; } + +void BaseDeltaIterator::AssertInvariants() { +#ifndef NDEBUG + bool not_ok = false; + if (!base_iterator_->status().ok()) { + assert(!base_iterator_->Valid()); + not_ok = true; + } + if (!delta_iterator_->status().ok()) { + assert(!delta_iterator_->Valid()); + not_ok = true; + } + if (not_ok) { + assert(!Valid()); + assert(!status().ok()); + return; + } + + if (!Valid()) { + return; + } + if (!BaseValid()) { + assert(!current_at_base_ && delta_iterator_->Valid()); + return; + } + if (!DeltaValid()) { + assert(current_at_base_ && base_iterator_->Valid()); + return; + } + // we don't support those yet + assert(delta_iterator_->Entry().type != kMergeRecord && + delta_iterator_->Entry().type != kLogDataRecord); + int compare = comparator_->CompareWithoutTimestamp( + delta_iterator_->Entry().key, /*a_has_ts=*/false, base_iterator_->key(), + /*b_has_ts=*/false); + if (forward_) { + // current_at_base -> compare < 0 + assert(!current_at_base_ || compare < 0); + // !current_at_base -> compare <= 0 + assert(current_at_base_ && compare >= 0); + } else { + // current_at_base -> compare > 0 + assert(!current_at_base_ || compare > 0); + // !current_at_base -> compare <= 0 + assert(current_at_base_ && compare <= 0); + } + // equal_keys_ <=> compare == 0 + assert((equal_keys_ || compare != 0) && (!equal_keys_ || compare == 0)); +#endif +} + +void BaseDeltaIterator::Advance() { + if (equal_keys_) { + assert(BaseValid() && DeltaValid()); + AdvanceBase(); + AdvanceDelta(); + } else { + if (current_at_base_) { + assert(BaseValid()); + AdvanceBase(); + } else { + assert(DeltaValid()); + AdvanceDelta(); + } + } + UpdateCurrent(); +} + +void BaseDeltaIterator::AdvanceDelta() { + if (forward_) { + delta_iterator_->NextKey(); + } else { + delta_iterator_->PrevKey(); + } +} +void BaseDeltaIterator::AdvanceBase() { + if (forward_) { + base_iterator_->Next(); + } else { + base_iterator_->Prev(); + } +} + +bool 
BaseDeltaIterator::BaseValid() const { return base_iterator_->Valid(); } +bool BaseDeltaIterator::DeltaValid() const { return delta_iterator_->Valid(); } +void BaseDeltaIterator::UpdateCurrent() { +// Suppress false positive clang analyzer warnings. +#ifndef __clang_analyzer__ + status_ = Status::OK(); + while (true) { + auto delta_result = WBWIIteratorImpl::kNotFound; + WriteEntry delta_entry; + if (DeltaValid()) { + assert(delta_iterator_->status().ok()); + delta_result = + delta_iterator_->FindLatestUpdate(wbwii_->GetMergeContext()); + delta_entry = delta_iterator_->Entry(); + } else if (!delta_iterator_->status().ok()) { + // Expose the error status and stop. + current_at_base_ = false; + return; + } + equal_keys_ = false; + if (!BaseValid()) { + if (!base_iterator_->status().ok()) { + // Expose the error status and stop. + current_at_base_ = true; + return; + } + + // Base has finished. + if (!DeltaValid()) { + // Finished + return; + } + if (iterate_upper_bound_) { + if (comparator_->CompareWithoutTimestamp( + delta_entry.key, /*a_has_ts=*/false, *iterate_upper_bound_, + /*b_has_ts=*/false) >= 0) { + // out of upper bound -> finished. + return; + } + } + if (delta_result == WBWIIteratorImpl::kDeleted && + wbwii_->GetNumOperands() == 0) { + AdvanceDelta(); + } else { + current_at_base_ = false; + return; + } + } else if (!DeltaValid()) { + // Delta has finished. + current_at_base_ = true; + return; + } else { + int compare = + (forward_ ? 1 : -1) * comparator_->CompareWithoutTimestamp( + delta_entry.key, /*a_has_ts=*/false, + base_iterator_->key(), /*b_has_ts=*/false); + if (compare <= 0) { // delta bigger or equal + if (compare == 0) { + equal_keys_ = true; + } + if (delta_result != WBWIIteratorImpl::kDeleted || + wbwii_->GetNumOperands() > 0) { + current_at_base_ = false; + return; + } + // Delta is less advanced and is delete. + AdvanceDelta(); + if (equal_keys_) { + AdvanceBase(); + } + } else { + current_at_base_ = true; + return; + } + } + } + + AssertInvariants(); +#endif // __clang_analyzer__ +} + +void WBWIIteratorImpl::AdvanceKey(bool forward) { + if (Valid()) { + Slice key = Entry().key; + do { + if (forward) { + Next(); + } else { + Prev(); + } + } while (MatchesKey(column_family_id_, key)); + } +} + +void WBWIIteratorImpl::NextKey() { AdvanceKey(true); } + +void WBWIIteratorImpl::PrevKey() { + AdvanceKey(false); // Move to the tail of the previous key + if (Valid()) { + AdvanceKey(false); // Move back another key. Now we are at the start of + // the previous key + if (Valid()) { // Still a valid + Next(); // Move forward one onto this key + } else { + SeekToFirst(); // Not valid, move to the start + } + } +} + +WBWIIteratorImpl::Result WBWIIteratorImpl::FindLatestUpdate( + MergeContext* merge_context) { + if (Valid()) { + Slice key = Entry().key; + return FindLatestUpdate(key, merge_context); + } else { + merge_context->Clear(); // Clear any entries in the MergeContext + return WBWIIteratorImpl::kNotFound; + } +} + +WBWIIteratorImpl::Result WBWIIteratorImpl::FindLatestUpdate( + const Slice& key, MergeContext* merge_context) { + Result result = WBWIIteratorImpl::kNotFound; + merge_context->Clear(); // Clear any entries in the MergeContext + // TODO(agiardullo): consider adding support for reverse iteration + if (!Valid()) { + return result; + } else if (comparator_->CompareKey(column_family_id_, Entry().key, key) != + 0) { + return result; + } else { + // We want to iterate in the reverse order that the writes were added to the + // batch. 
Since we don't have a reverse iterator, we must seek past the + // end. We do this by seeking to the next key, and then back one step + NextKey(); + if (Valid()) { + Prev(); + } else { + SeekToLast(); + } + + // We are at the end of the iterator for this key. Search backwards for the + // last Put or Delete, accumulating merges along the way. + while (Valid()) { + const WriteEntry entry = Entry(); + if (comparator_->CompareKey(column_family_id_, entry.key, key) != 0) { + break; // Unexpected error or we've reached a different next key + } + + switch (entry.type) { + case kPutRecord: + return WBWIIteratorImpl::kFound; + case kDeleteRecord: + return WBWIIteratorImpl::kDeleted; + case kSingleDeleteRecord: + return WBWIIteratorImpl::kDeleted; + case kMergeRecord: + result = WBWIIteratorImpl::kMergeInProgress; + merge_context->PushOperand(entry.value); + break; + case kLogDataRecord: + break; // ignore + case kXIDRecord: + break; // ignore + default: + return WBWIIteratorImpl::kError; + } // end switch statement + Prev(); + } // End while Valid() + // At this point, we have been through the whole list and found no Puts or + // Deletes. The iterator points to the previous key. Move the iterator back + // onto this one. + if (Valid()) { + Next(); + } else { + SeekToFirst(); + } + } + return result; +} + +Status ReadableWriteBatch::GetEntryFromDataOffset(size_t data_offset, + WriteType* type, Slice* Key, + Slice* value, Slice* blob, + Slice* xid) const { + if (type == nullptr || Key == nullptr || value == nullptr || + blob == nullptr || xid == nullptr) { + return Status::InvalidArgument("Output parameters cannot be null"); + } + + if (data_offset == GetDataSize()) { + // reached end of batch. + return Status::NotFound(); + } + + if (data_offset > GetDataSize()) { + return Status::InvalidArgument("data offset exceed write batch size"); + } + Slice input = Slice(rep_.data() + data_offset, rep_.size() - data_offset); + char tag; + uint32_t column_family; + Status s = ReadRecordFromWriteBatch(&input, &tag, &column_family, Key, value, + blob, xid); + if (!s.ok()) { + return s; + } + + switch (tag) { + case kTypeColumnFamilyValue: + case kTypeValue: + *type = kPutRecord; + break; + case kTypeColumnFamilyDeletion: + case kTypeDeletion: + *type = kDeleteRecord; + break; + case kTypeColumnFamilySingleDeletion: + case kTypeSingleDeletion: + *type = kSingleDeleteRecord; + break; + case kTypeColumnFamilyRangeDeletion: + case kTypeRangeDeletion: + *type = kDeleteRangeRecord; + break; + case kTypeColumnFamilyMerge: + case kTypeMerge: + *type = kMergeRecord; + break; + case kTypeLogData: + *type = kLogDataRecord; + break; + case kTypeNoop: + case kTypeBeginPrepareXID: + case kTypeBeginPersistedPrepareXID: + case kTypeBeginUnprepareXID: + case kTypeEndPrepareXID: + case kTypeCommitXID: + case kTypeRollbackXID: + *type = kXIDRecord; + break; + default: + return Status::Corruption("unknown WriteBatch tag ", + std::to_string(static_cast(tag))); + } + return Status::OK(); +} + +// If both of `entry1` and `entry2` point to real entry in write batch, we +// compare the entries as following: +// 1. first compare the column family, the one with larger CF will be larger; +// 2. Inside the same CF, we first decode the entry to find the key of the entry +// and the entry with larger key will be larger; +// 3. If two entries are of the same CF and key, the one with larger offset +// will be larger. +// Some times either `entry1` or `entry2` is dummy entry, which is actually +// a search key. 
In this case, in step 2, we don't go ahead and decode the +// entry but use the value in WriteBatchIndexEntry::search_key. +// One special case is WriteBatchIndexEntry::key_size is kFlagMinInCf. +// This indicate that we are going to seek to the first of the column family. +// Once we see this, this entry will be smaller than all the real entries of +// the column family. +int WriteBatchEntryComparator::operator()( + const WriteBatchIndexEntry* entry1, + const WriteBatchIndexEntry* entry2) const { + if (entry1->column_family > entry2->column_family) { + return 1; + } else if (entry1->column_family < entry2->column_family) { + return -1; + } + + // Deal with special case of seeking to the beginning of a column family + if (entry1->is_min_in_cf()) { + return -1; + } else if (entry2->is_min_in_cf()) { + return 1; + } + + Slice key1, key2; + if (entry1->search_key == nullptr) { + key1 = Slice(write_batch_->Data().data() + entry1->key_offset, + entry1->key_size); + } else { + key1 = *(entry1->search_key); + } + if (entry2->search_key == nullptr) { + key2 = Slice(write_batch_->Data().data() + entry2->key_offset, + entry2->key_size); + } else { + key2 = *(entry2->search_key); + } + + int cmp = CompareKey(entry1->column_family, key1, key2); + if (cmp != 0) { + return cmp; + } else if (entry1->offset > entry2->offset) { + return 1; + } else if (entry1->offset < entry2->offset) { + return -1; + } + return 0; +} + +int WriteBatchEntryComparator::CompareKey(uint32_t column_family, + const Slice& key1, + const Slice& key2) const { + if (column_family < cf_comparators_.size() && + cf_comparators_[column_family] != nullptr) { + return cf_comparators_[column_family]->CompareWithoutTimestamp( + key1, /*a_has_ts=*/false, key2, /*b_has_ts=*/false); + } else { + return default_comparator_->CompareWithoutTimestamp( + key1, /*a_has_ts=*/false, key2, /*b_has_ts=*/false); + } +} + +const Comparator* WriteBatchEntryComparator::GetComparator( + const ColumnFamilyHandle* column_family) const { + return column_family ? column_family->GetComparator() : default_comparator_; +} + +const Comparator* WriteBatchEntryComparator::GetComparator( + uint32_t column_family) const { + if (column_family < cf_comparators_.size() && + cf_comparators_[column_family]) { + return cf_comparators_[column_family]; + } + return default_comparator_; +} + +WriteEntry WBWIIteratorImpl::Entry() const { + WriteEntry ret; + Slice blob, xid; + const WriteBatchIndexEntry* iter_entry = skip_list_iter_.key(); + // this is guaranteed with Valid() + assert(iter_entry != nullptr && + iter_entry->column_family == column_family_id_); + auto s = write_batch_->GetEntryFromDataOffset( + iter_entry->offset, &ret.type, &ret.key, &ret.value, &blob, &xid); + assert(s.ok()); + assert(ret.type == kPutRecord || ret.type == kDeleteRecord || + ret.type == kSingleDeleteRecord || ret.type == kDeleteRangeRecord || + ret.type == kMergeRecord); + // Make sure entry.key does not include user-defined timestamp. 
+ const Comparator* const ucmp = comparator_->GetComparator(column_family_id_); + size_t ts_sz = ucmp->timestamp_size(); + if (ts_sz > 0) { + ret.key = StripTimestampFromUserKey(ret.key, ts_sz); + } + return ret; +} + +bool WBWIIteratorImpl::MatchesKey(uint32_t cf_id, const Slice& key) { + if (Valid()) { + return comparator_->CompareKey(cf_id, key, Entry().key) == 0; + } else { + return false; + } +} + +WriteBatchWithIndexInternal::WriteBatchWithIndexInternal( + ColumnFamilyHandle* column_family) + : db_(nullptr), db_options_(nullptr), column_family_(column_family) {} + +WriteBatchWithIndexInternal::WriteBatchWithIndexInternal( + DB* db, ColumnFamilyHandle* column_family) + : db_(db), db_options_(nullptr), column_family_(column_family) { + if (db_ != nullptr && column_family_ == nullptr) { + column_family_ = db_->DefaultColumnFamily(); + } +} + +WriteBatchWithIndexInternal::WriteBatchWithIndexInternal( + const DBOptions* db_options, ColumnFamilyHandle* column_family) + : db_(nullptr), db_options_(db_options), column_family_(column_family) {} + +Status WriteBatchWithIndexInternal::MergeKey(const Slice& key, + const Slice* value, + const MergeContext& context, + std::string* result) const { + if (column_family_ != nullptr) { + auto cfh = static_cast_with_check(column_family_); + const auto merge_operator = cfh->cfd()->ioptions()->merge_operator.get(); + if (merge_operator == nullptr) { + return Status::InvalidArgument( + "Merge_operator must be set for column_family"); + } else if (db_ != nullptr) { + const ImmutableDBOptions& immutable_db_options = + static_cast_with_check(db_->GetRootDB()) + ->immutable_db_options(); + Statistics* statistics = immutable_db_options.statistics.get(); + Logger* logger = immutable_db_options.info_log.get(); + SystemClock* clock = immutable_db_options.clock; + return MergeHelper::TimedFullMerge( + merge_operator, key, value, context.GetOperands(), result, logger, + statistics, clock, /* result_operand */ nullptr, + /* update_num_ops_stats */ false); + } else if (db_options_ != nullptr) { + Statistics* statistics = db_options_->statistics.get(); + Env* env = db_options_->env; + Logger* logger = db_options_->info_log.get(); + SystemClock* clock = env->GetSystemClock().get(); + return MergeHelper::TimedFullMerge( + merge_operator, key, value, context.GetOperands(), result, logger, + statistics, clock, /* result_operand */ nullptr, + /* update_num_ops_stats */ false); + } else { + const auto cf_opts = cfh->cfd()->ioptions(); + return MergeHelper::TimedFullMerge( + merge_operator, key, value, context.GetOperands(), result, + cf_opts->logger, cf_opts->stats, cf_opts->clock, + /* result_operand */ nullptr, /* update_num_ops_stats */ false); + } + } else { + return Status::InvalidArgument("Must provide a column_family"); + } +} + +WBWIIteratorImpl::Result WriteBatchWithIndexInternal::GetFromBatch( + WriteBatchWithIndex* batch, const Slice& key, MergeContext* context, + std::string* value, Status* s) { + *s = Status::OK(); + + std::unique_ptr iter( + static_cast_with_check( + batch->NewIterator(column_family_))); + + // Search the iterator for this key, and updates/merges to it. 
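+  //
+  // Worked example (editor's addition, not upstream text): for a batch that
+  // contains Put(key, "a"), Merge(key, "b"), Merge(key, "c"),
+  // FindLatestUpdate() walks the entries for `key` newest-to-oldest, collects
+  // the merge operands "c" and "b" into `context`, stops at the Put and
+  // returns kFound; MergeKey() below then combines "a" with those operands.
+  // With no Put/Delete present the result would be kMergeInProgress instead.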
+ iter->Seek(key); + auto result = iter->FindLatestUpdate(key, context); + if (result == WBWIIteratorImpl::kError) { + (*s) = Status::Corruption("Unexpected entry in WriteBatchWithIndex:", + std::to_string(iter->Entry().type)); + return result; + } else if (result == WBWIIteratorImpl::kNotFound) { + return result; + } else if (result == WBWIIteratorImpl::Result::kFound) { // PUT + Slice entry_value = iter->Entry().value; + if (context->GetNumOperands() > 0) { + *s = MergeKey(key, &entry_value, *context, value); + if (!s->ok()) { + result = WBWIIteratorImpl::Result::kError; + } + } else { + value->assign(entry_value.data(), entry_value.size()); + } + } else if (result == WBWIIteratorImpl::kDeleted) { + if (context->GetNumOperands() > 0) { + *s = MergeKey(key, nullptr, *context, value); + if (s->ok()) { + result = WBWIIteratorImpl::Result::kFound; + } else { + result = WBWIIteratorImpl::Result::kError; + } + } + } + return result; +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h new file mode 100644 index 000000000..edabc95bc --- /dev/null +++ b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h @@ -0,0 +1,344 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include + +#include "db/merge_context.h" +#include "memtable/skiplist.h" +#include "options/db_options.h" +#include "port/port.h" +#include "rocksdb/comparator.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/utilities/write_batch_with_index.h" + +namespace ROCKSDB_NAMESPACE { + +class MergeContext; +class WBWIIteratorImpl; +class WriteBatchWithIndexInternal; +struct Options; + +// when direction == forward +// * current_at_base_ <=> base_iterator > delta_iterator +// when direction == backwards +// * current_at_base_ <=> base_iterator < delta_iterator +// always: +// * equal_keys_ <=> base_iterator == delta_iterator +class BaseDeltaIterator : public Iterator { + public: + BaseDeltaIterator(ColumnFamilyHandle* column_family, Iterator* base_iterator, + WBWIIteratorImpl* delta_iterator, + const Comparator* comparator, + const ReadOptions* read_options = nullptr); + + ~BaseDeltaIterator() override {} + + bool Valid() const override; + void SeekToFirst() override; + void SeekToLast() override; + void Seek(const Slice& k) override; + void SeekForPrev(const Slice& k) override; + void Next() override; + void Prev() override; + Slice key() const override; + Slice value() const override; + Status status() const override; + void Invalidate(Status s); + + private: + void AssertInvariants(); + void Advance(); + void AdvanceDelta(); + void AdvanceBase(); + bool BaseValid() const; + bool DeltaValid() const; + void UpdateCurrent(); + + std::unique_ptr wbwii_; + bool forward_; + bool current_at_base_; + bool equal_keys_; + mutable Status status_; + std::unique_ptr base_iterator_; + std::unique_ptr delta_iterator_; + const Comparator* comparator_; // not owned + const Slice* iterate_upper_bound_; + mutable PinnableSlice merge_result_; +}; + +// Key used by skip list, as the binary searchable index of 
WriteBatchWithIndex. +struct WriteBatchIndexEntry { + WriteBatchIndexEntry(size_t o, uint32_t c, size_t ko, size_t ksz) + : offset(o), + column_family(c), + key_offset(ko), + key_size(ksz), + search_key(nullptr) {} + // Create a dummy entry as the search key. This index entry won't be backed + // by an entry from the write batch, but a pointer to the search key. Or a + // special flag of offset can indicate we are seek to first. + // @_search_key: the search key + // @_column_family: column family + // @is_forward_direction: true for Seek(). False for SeekForPrev() + // @is_seek_to_first: true if we seek to the beginning of the column family + // _search_key should be null in this case. + WriteBatchIndexEntry(const Slice* _search_key, uint32_t _column_family, + bool is_forward_direction, bool is_seek_to_first) + // For SeekForPrev(), we need to make the dummy entry larger than any + // entry who has the same search key. Otherwise, we'll miss those entries. + : offset(is_forward_direction ? 0 : std::numeric_limits::max()), + column_family(_column_family), + key_offset(0), + key_size(is_seek_to_first ? kFlagMinInCf : 0), + search_key(_search_key) { + assert(_search_key != nullptr || is_seek_to_first); + } + + // If this flag appears in the key_size, it indicates a + // key that is smaller than any other entry for the same column family. + static const size_t kFlagMinInCf = std::numeric_limits::max(); + + bool is_min_in_cf() const { + assert(key_size != kFlagMinInCf || + (key_offset == 0 && search_key == nullptr)); + return key_size == kFlagMinInCf; + } + + // offset of an entry in write batch's string buffer. If this is a dummy + // lookup key, in which case search_key != nullptr, offset is set to either + // 0 or max, only for comparison purpose. Because when entries have the same + // key, the entry with larger offset is larger, offset = 0 will make a seek + // key small or equal than all the entries with the seek key, so that Seek() + // will find all the entries of the same key. Similarly, offset = MAX will + // make the entry just larger than all entries with the search key so + // SeekForPrev() will see all the keys with the same key. + size_t offset; + uint32_t column_family; // column family of the entry. + size_t key_offset; // offset of the key in write batch's string buffer. + size_t key_size; // size of the key. kFlagMinInCf indicates + // that this is a dummy look up entry for + // SeekToFirst() to the beginning of the column + // family. We use the flag here to save a boolean + // in the struct. + + const Slice* search_key; // if not null, instead of reading keys from + // write batch, use it to compare. This is used + // for lookup key. +}; + +class ReadableWriteBatch : public WriteBatch { + public: + explicit ReadableWriteBatch(size_t reserved_bytes = 0, size_t max_bytes = 0, + size_t protection_bytes_per_key = 0, + size_t default_cf_ts_sz = 0) + : WriteBatch(reserved_bytes, max_bytes, protection_bytes_per_key, + default_cf_ts_sz) {} + // Retrieve some information from a write entry in the write batch, given + // the start offset of the write entry. + Status GetEntryFromDataOffset(size_t data_offset, WriteType* type, Slice* Key, + Slice* value, Slice* blob, Slice* xid) const; +}; + +class WriteBatchEntryComparator { + public: + WriteBatchEntryComparator(const Comparator* _default_comparator, + const ReadableWriteBatch* write_batch) + : default_comparator_(_default_comparator), write_batch_(write_batch) {} + // Compare a and b. 
Return a negative value if a is less than b, 0 if they + // are equal, and a positive value if a is greater than b + int operator()(const WriteBatchIndexEntry* entry1, + const WriteBatchIndexEntry* entry2) const; + + int CompareKey(uint32_t column_family, const Slice& key1, + const Slice& key2) const; + + void SetComparatorForCF(uint32_t column_family_id, + const Comparator* comparator) { + if (column_family_id >= cf_comparators_.size()) { + cf_comparators_.resize(column_family_id + 1, nullptr); + } + cf_comparators_[column_family_id] = comparator; + } + + const Comparator* default_comparator() { return default_comparator_; } + + const Comparator* GetComparator( + const ColumnFamilyHandle* column_family) const; + + const Comparator* GetComparator(uint32_t column_family) const; + + private: + const Comparator* const default_comparator_; + std::vector cf_comparators_; + const ReadableWriteBatch* const write_batch_; +}; + +using WriteBatchEntrySkipList = + SkipList; + +class WBWIIteratorImpl : public WBWIIterator { + public: + enum Result : uint8_t { + kFound, + kDeleted, + kNotFound, + kMergeInProgress, + kError + }; + WBWIIteratorImpl(uint32_t column_family_id, + WriteBatchEntrySkipList* skip_list, + const ReadableWriteBatch* write_batch, + WriteBatchEntryComparator* comparator) + : column_family_id_(column_family_id), + skip_list_iter_(skip_list), + write_batch_(write_batch), + comparator_(comparator) {} + + ~WBWIIteratorImpl() override {} + + bool Valid() const override { + if (!skip_list_iter_.Valid()) { + return false; + } + const WriteBatchIndexEntry* iter_entry = skip_list_iter_.key(); + return (iter_entry != nullptr && + iter_entry->column_family == column_family_id_); + } + + void SeekToFirst() override { + WriteBatchIndexEntry search_entry( + nullptr /* search_key */, column_family_id_, + true /* is_forward_direction */, true /* is_seek_to_first */); + skip_list_iter_.Seek(&search_entry); + } + + void SeekToLast() override { + WriteBatchIndexEntry search_entry( + nullptr /* search_key */, column_family_id_ + 1, + true /* is_forward_direction */, true /* is_seek_to_first */); + skip_list_iter_.Seek(&search_entry); + if (!skip_list_iter_.Valid()) { + skip_list_iter_.SeekToLast(); + } else { + skip_list_iter_.Prev(); + } + } + + void Seek(const Slice& key) override { + WriteBatchIndexEntry search_entry(&key, column_family_id_, + true /* is_forward_direction */, + false /* is_seek_to_first */); + skip_list_iter_.Seek(&search_entry); + } + + void SeekForPrev(const Slice& key) override { + WriteBatchIndexEntry search_entry(&key, column_family_id_, + false /* is_forward_direction */, + false /* is_seek_to_first */); + skip_list_iter_.SeekForPrev(&search_entry); + } + + void Next() override { skip_list_iter_.Next(); } + + void Prev() override { skip_list_iter_.Prev(); } + + WriteEntry Entry() const override; + + Status status() const override { + // this is in-memory data structure, so the only way status can be non-ok is + // through memory corruption + return Status::OK(); + } + + const WriteBatchIndexEntry* GetRawEntry() const { + return skip_list_iter_.key(); + } + + bool MatchesKey(uint32_t cf_id, const Slice& key); + + // Moves the iterator to first entry of the previous key. + void PrevKey(); + // Moves the iterator to first entry of the next key. 
+ void NextKey(); + + // Moves the iterator to the Update (Put or Delete) for the current key + // If there are no Put/Delete, the Iterator will point to the first entry for + // this key + // @return kFound if a Put was found for the key + // @return kDeleted if a delete was found for the key + // @return kMergeInProgress if only merges were fouund for the key + // @return kError if an unsupported operation was found for the key + // @return kNotFound if no operations were found for this key + // + Result FindLatestUpdate(const Slice& key, MergeContext* merge_context); + Result FindLatestUpdate(MergeContext* merge_context); + + protected: + void AdvanceKey(bool forward); + + private: + uint32_t column_family_id_; + WriteBatchEntrySkipList::Iterator skip_list_iter_; + const ReadableWriteBatch* write_batch_; + WriteBatchEntryComparator* comparator_; +}; + +class WriteBatchWithIndexInternal { + public: + static const Comparator* GetUserComparator(const WriteBatchWithIndex& wbwi, + uint32_t cf_id); + + // For GetFromBatchAndDB or similar + explicit WriteBatchWithIndexInternal(DB* db, + ColumnFamilyHandle* column_family); + // For GetFromBatchAndDB or similar + explicit WriteBatchWithIndexInternal(ColumnFamilyHandle* column_family); + // For GetFromBatch or similar + explicit WriteBatchWithIndexInternal(const DBOptions* db_options, + ColumnFamilyHandle* column_family); + + // If batch contains a value for key, store it in *value and return kFound. + // If batch contains a deletion for key, return Deleted. + // If batch contains Merge operations as the most recent entry for a key, + // and the merge process does not stop (not reaching a value or delete), + // prepend the current merge operands to *operands, + // and return kMergeInProgress + // If batch does not contain this key, return kNotFound + // Else, return kError on error with error Status stored in *s. + WBWIIteratorImpl::Result GetFromBatch(WriteBatchWithIndex* batch, + const Slice& key, std::string* value, + Status* s) { + return GetFromBatch(batch, key, &merge_context_, value, s); + } + WBWIIteratorImpl::Result GetFromBatch(WriteBatchWithIndex* batch, + const Slice& key, + MergeContext* merge_context, + std::string* value, Status* s); + Status MergeKey(const Slice& key, const Slice* value, + std::string* result) const { + return MergeKey(key, value, merge_context_, result); + } + Status MergeKey(const Slice& key, const Slice* value, + const MergeContext& context, std::string* result) const; + size_t GetNumOperands() const { return merge_context_.GetNumOperands(); } + MergeContext* GetMergeContext() { return &merge_context_; } + Slice GetOperand(int index) const { return merge_context_.GetOperand(index); } + + private: + DB* db_; + const DBOptions* db_options_; + ColumnFamilyHandle* column_family_; + MergeContext merge_context_; +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc new file mode 100644 index 000000000..350dcc881 --- /dev/null +++ b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc @@ -0,0 +1,2419 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. 
All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef ROCKSDB_LITE + +#include "rocksdb/utilities/write_batch_with_index.h" + +#include +#include + +#include "db/column_family.h" +#include "port/stack_trace.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/random.h" +#include "util/string_util.h" +#include "utilities/merge_operators.h" +#include "utilities/merge_operators/string_append/stringappend.h" +#include "utilities/write_batch_with_index/write_batch_with_index_internal.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { +class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl { + public: + explicit ColumnFamilyHandleImplDummy(int id, const Comparator* comparator) + : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), + id_(id), + comparator_(comparator) {} + uint32_t GetID() const override { return id_; } + const Comparator* GetComparator() const override { return comparator_; } + + private: + uint32_t id_; + const Comparator* comparator_; +}; + +struct Entry { + std::string key; + std::string value; + WriteType type; +}; + +struct TestHandler : public WriteBatch::Handler { + std::map> seen; + Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + Entry e; + e.key = key.ToString(); + e.value = value.ToString(); + e.type = kPutRecord; + seen[column_family_id].push_back(e); + return Status::OK(); + } + Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + Entry e; + e.key = key.ToString(); + e.value = value.ToString(); + e.type = kMergeRecord; + seen[column_family_id].push_back(e); + return Status::OK(); + } + void LogData(const Slice& /*blob*/) override {} + Status DeleteCF(uint32_t column_family_id, const Slice& key) override { + Entry e; + e.key = key.ToString(); + e.value = ""; + e.type = kDeleteRecord; + seen[column_family_id].push_back(e); + return Status::OK(); + } +}; + +using KVMap = std::map; + +class KVIter : public Iterator { + public: + explicit KVIter(const KVMap* map) : map_(map), iter_(map_->end()) {} + bool Valid() const override { return iter_ != map_->end(); } + void SeekToFirst() override { iter_ = map_->begin(); } + void SeekToLast() override { + if (map_->empty()) { + iter_ = map_->end(); + } else { + iter_ = map_->find(map_->rbegin()->first); + } + } + void Seek(const Slice& k) override { + iter_ = map_->lower_bound(k.ToString()); + } + void SeekForPrev(const Slice& k) override { + iter_ = map_->upper_bound(k.ToString()); + Prev(); + } + void Next() override { ++iter_; } + void Prev() override { + if (iter_ == map_->begin()) { + iter_ = map_->end(); + return; + } + --iter_; + } + Slice key() const override { return iter_->first; } + Slice value() const override { return iter_->second; } + Status status() const override { return Status::OK(); } + + private: + const KVMap* const map_; + KVMap::const_iterator iter_; +}; + +static std::string PrintContents(WriteBatchWithIndex* batch, + ColumnFamilyHandle* column_family, + bool hex = false) { + std::string result; + + WBWIIterator* iter; + if (column_family == nullptr) { + iter = batch->NewIterator(); + } else { + iter = batch->NewIterator(column_family); + } + + iter->SeekToFirst(); + while (iter->Valid()) { + WriteEntry e = iter->Entry(); + + if (e.type == kPutRecord) { + result.append("PUT("); + result.append(e.key.ToString(hex)); + result.append("):"); + 
result.append(e.value.ToString(hex)); + } else if (e.type == kMergeRecord) { + result.append("MERGE("); + result.append(e.key.ToString(hex)); + result.append("):"); + result.append(e.value.ToString(hex)); + } else if (e.type == kSingleDeleteRecord) { + result.append("SINGLE-DEL("); + result.append(e.key.ToString(hex)); + result.append(")"); + } else { + assert(e.type == kDeleteRecord); + result.append("DEL("); + result.append(e.key.ToString(hex)); + result.append(")"); + } + + result.append(","); + iter->Next(); + } + + delete iter; + return result; +} + +static std::string PrintContents(WriteBatchWithIndex* batch, KVMap* base_map, + ColumnFamilyHandle* column_family) { + std::string result; + + Iterator* iter; + if (column_family == nullptr) { + iter = batch->NewIteratorWithBase(new KVIter(base_map)); + } else { + iter = batch->NewIteratorWithBase(column_family, new KVIter(base_map)); + } + + iter->SeekToFirst(); + while (iter->Valid()) { + assert(iter->status().ok()); + + Slice key = iter->key(); + Slice value = iter->value(); + + result.append(key.ToString()); + result.append(":"); + result.append(value.ToString()); + result.append(","); + + iter->Next(); + } + + delete iter; + return result; +} + +void AssertIter(Iterator* iter, const std::string& key, + const std::string& value) { + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(key, iter->key().ToString()); + ASSERT_EQ(value, iter->value().ToString()); +} + +void AssertItersMatch(Iterator* iter1, Iterator* iter2) { + ASSERT_EQ(iter1->Valid(), iter2->Valid()); + if (iter1->Valid()) { + ASSERT_EQ(iter1->key().ToString(), iter2->key().ToString()); + ASSERT_EQ(iter1->value().ToString(), iter2->value().ToString()); + } +} + +void AssertItersEqual(Iterator* iter1, Iterator* iter2) { + iter1->SeekToFirst(); + iter2->SeekToFirst(); + while (iter1->Valid()) { + ASSERT_EQ(iter1->Valid(), iter2->Valid()); + ASSERT_EQ(iter1->key().ToString(), iter2->key().ToString()); + ASSERT_EQ(iter1->value().ToString(), iter2->value().ToString()); + iter1->Next(); + iter2->Next(); + } + ASSERT_EQ(iter1->Valid(), iter2->Valid()); +} + +void AssertIterEqual(WBWIIteratorImpl* wbwii, + const std::vector& keys) { + wbwii->SeekToFirst(); + for (auto k : keys) { + ASSERT_TRUE(wbwii->Valid()); + ASSERT_EQ(wbwii->Entry().key, k); + wbwii->NextKey(); + } + ASSERT_FALSE(wbwii->Valid()); + wbwii->SeekToLast(); + for (auto kit = keys.rbegin(); kit != keys.rend(); ++kit) { + ASSERT_TRUE(wbwii->Valid()); + ASSERT_EQ(wbwii->Entry().key, *kit); + wbwii->PrevKey(); + } + ASSERT_FALSE(wbwii->Valid()); +} +} // namespace + +class WBWIBaseTest : public testing::Test { + public: + explicit WBWIBaseTest(bool overwrite) : db_(nullptr) { + options_.merge_operator = + MergeOperators::CreateFromStringId("stringappend"); + options_.create_if_missing = true; + dbname_ = test::PerThreadDBPath("write_batch_with_index_test"); + EXPECT_OK(DestroyDB(dbname_, options_)); + batch_.reset(new WriteBatchWithIndex(BytewiseComparator(), 20, overwrite)); + } + + virtual ~WBWIBaseTest() { + if (db_ != nullptr) { + ReleaseSnapshot(); + delete db_; + EXPECT_OK(DestroyDB(dbname_, options_)); + } + } + + std::string AddToBatch(ColumnFamilyHandle* cf, const std::string& key) { + std::string result; + for (size_t i = 0; i < key.size(); i++) { + if (key[i] == 'd') { + batch_->Delete(cf, key); + result = ""; + } else if (key[i] == 'p') { + result = key + std::to_string(i); + batch_->Put(cf, key, result); + } else if (key[i] == 'm') { + std::string value = key + std::to_string(i); + 
batch_->Merge(cf, key, value); + if (result.empty()) { + result = value; + } else { + result = result + "," + value; + } + } + } + return result; + } + + virtual Status OpenDB() { return DB::Open(options_, dbname_, &db_); } + + void ReleaseSnapshot() { + if (read_opts_.snapshot != nullptr) { + EXPECT_NE(db_, nullptr); + db_->ReleaseSnapshot(read_opts_.snapshot); + read_opts_.snapshot = nullptr; + } + } + + public: + DB* db_; + std::string dbname_; + Options options_; + WriteOptions write_opts_; + ReadOptions read_opts_; + std::unique_ptr batch_; +}; + +class WBWIKeepTest : public WBWIBaseTest { + public: + WBWIKeepTest() : WBWIBaseTest(false) {} +}; + +class WBWIOverwriteTest : public WBWIBaseTest { + public: + WBWIOverwriteTest() : WBWIBaseTest(true) {} +}; +class WriteBatchWithIndexTest : public WBWIBaseTest, + public testing::WithParamInterface { + public: + WriteBatchWithIndexTest() : WBWIBaseTest(GetParam()) {} +}; + +void TestValueAsSecondaryIndexHelper(std::vector entries, + WriteBatchWithIndex* batch) { + // In this test, we insert to column family `data`, and + // to column family `index`. Then iterator them in order + // and seek them by key. + + // Sort entries by key + std::map> data_map; + // Sort entries by value + std::map> index_map; + for (auto& e : entries) { + data_map[e.key].push_back(&e); + index_map[e.value].push_back(&e); + } + + ColumnFamilyHandleImplDummy data(6, BytewiseComparator()); + ColumnFamilyHandleImplDummy index(8, BytewiseComparator()); + for (auto& e : entries) { + if (e.type == kPutRecord) { + ASSERT_OK(batch->Put(&data, e.key, e.value)); + ASSERT_OK(batch->Put(&index, e.value, e.key)); + } else if (e.type == kMergeRecord) { + ASSERT_OK(batch->Merge(&data, e.key, e.value)); + ASSERT_OK(batch->Put(&index, e.value, e.key)); + } else { + assert(e.type == kDeleteRecord); + std::unique_ptr iter(batch->NewIterator(&data)); + iter->Seek(e.key); + ASSERT_OK(iter->status()); + auto write_entry = iter->Entry(); + ASSERT_EQ(e.key, write_entry.key.ToString()); + ASSERT_EQ(e.value, write_entry.value.ToString()); + ASSERT_OK(batch->Delete(&data, e.key)); + ASSERT_OK(batch->Put(&index, e.value, "")); + } + } + + // Iterator all keys + { + std::unique_ptr iter(batch->NewIterator(&data)); + for (int seek_to_first : {0, 1}) { + if (seek_to_first) { + iter->SeekToFirst(); + } else { + iter->Seek(""); + } + for (auto pair : data_map) { + for (auto v : pair.second) { + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + auto write_entry = iter->Entry(); + ASSERT_EQ(pair.first, write_entry.key.ToString()); + ASSERT_EQ(v->type, write_entry.type); + if (write_entry.type != kDeleteRecord) { + ASSERT_EQ(v->value, write_entry.value.ToString()); + } + iter->Next(); + } + } + ASSERT_TRUE(!iter->Valid()); + } + iter->SeekToLast(); + for (auto pair = data_map.rbegin(); pair != data_map.rend(); ++pair) { + for (auto v = pair->second.rbegin(); v != pair->second.rend(); v++) { + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + auto write_entry = iter->Entry(); + ASSERT_EQ(pair->first, write_entry.key.ToString()); + ASSERT_EQ((*v)->type, write_entry.type); + if (write_entry.type != kDeleteRecord) { + ASSERT_EQ((*v)->value, write_entry.value.ToString()); + } + iter->Prev(); + } + } + ASSERT_TRUE(!iter->Valid()); + } + + // Iterator all indexes + { + std::unique_ptr iter(batch->NewIterator(&index)); + for (int seek_to_first : {0, 1}) { + if (seek_to_first) { + iter->SeekToFirst(); + } else { + iter->Seek(""); + } + for (auto pair : index_map) { + for (auto v : pair.second) 
{ + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + auto write_entry = iter->Entry(); + ASSERT_EQ(pair.first, write_entry.key.ToString()); + if (v->type != kDeleteRecord) { + ASSERT_EQ(v->key, write_entry.value.ToString()); + ASSERT_EQ(v->value, write_entry.key.ToString()); + } + iter->Next(); + } + } + ASSERT_TRUE(!iter->Valid()); + } + + iter->SeekToLast(); + for (auto pair = index_map.rbegin(); pair != index_map.rend(); ++pair) { + for (auto v = pair->second.rbegin(); v != pair->second.rend(); v++) { + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + auto write_entry = iter->Entry(); + ASSERT_EQ(pair->first, write_entry.key.ToString()); + if ((*v)->type != kDeleteRecord) { + ASSERT_EQ((*v)->key, write_entry.value.ToString()); + ASSERT_EQ((*v)->value, write_entry.key.ToString()); + } + iter->Prev(); + } + } + ASSERT_TRUE(!iter->Valid()); + } + + // Seek to every key + { + std::unique_ptr iter(batch->NewIterator(&data)); + + // Seek the keys one by one in reverse order + for (auto pair = data_map.rbegin(); pair != data_map.rend(); ++pair) { + iter->Seek(pair->first); + ASSERT_OK(iter->status()); + for (auto v : pair->second) { + ASSERT_TRUE(iter->Valid()); + auto write_entry = iter->Entry(); + ASSERT_EQ(pair->first, write_entry.key.ToString()); + ASSERT_EQ(v->type, write_entry.type); + if (write_entry.type != kDeleteRecord) { + ASSERT_EQ(v->value, write_entry.value.ToString()); + } + iter->Next(); + ASSERT_OK(iter->status()); + } + } + } + + // Seek to every index + { + std::unique_ptr iter(batch->NewIterator(&index)); + + // Seek the keys one by one in reverse order + for (auto pair = index_map.rbegin(); pair != index_map.rend(); ++pair) { + iter->Seek(pair->first); + ASSERT_OK(iter->status()); + for (auto v : pair->second) { + ASSERT_TRUE(iter->Valid()); + auto write_entry = iter->Entry(); + ASSERT_EQ(pair->first, write_entry.key.ToString()); + ASSERT_EQ(v->value, write_entry.key.ToString()); + if (v->type != kDeleteRecord) { + ASSERT_EQ(v->key, write_entry.value.ToString()); + } + iter->Next(); + ASSERT_OK(iter->status()); + } + } + } + + // Verify WriteBatch can be iterated + TestHandler handler; + ASSERT_OK(batch->GetWriteBatch()->Iterate(&handler)); + + // Verify data column family + { + ASSERT_EQ(entries.size(), handler.seen[data.GetID()].size()); + size_t i = 0; + for (auto e : handler.seen[data.GetID()]) { + auto write_entry = entries[i++]; + ASSERT_EQ(e.type, write_entry.type); + ASSERT_EQ(e.key, write_entry.key); + if (e.type != kDeleteRecord) { + ASSERT_EQ(e.value, write_entry.value); + } + } + } + + // Verify index column family + { + ASSERT_EQ(entries.size(), handler.seen[index.GetID()].size()); + size_t i = 0; + for (auto e : handler.seen[index.GetID()]) { + auto write_entry = entries[i++]; + ASSERT_EQ(e.key, write_entry.value); + if (write_entry.type != kDeleteRecord) { + ASSERT_EQ(e.value, write_entry.key); + } + } + } +} + +TEST_F(WBWIKeepTest, TestValueAsSecondaryIndex) { + Entry entries[] = { + {"aaa", "0005", kPutRecord}, {"b", "0002", kPutRecord}, + {"cdd", "0002", kMergeRecord}, {"aab", "00001", kPutRecord}, + {"cc", "00005", kPutRecord}, {"cdd", "0002", kPutRecord}, + {"aab", "0003", kPutRecord}, {"cc", "00005", kDeleteRecord}, + }; + std::vector entries_list(entries, entries + 8); + + batch_.reset(new WriteBatchWithIndex(nullptr, 20, false)); + + TestValueAsSecondaryIndexHelper(entries_list, batch_.get()); + + // Clear batch and re-run test with new values + batch_->Clear(); + + Entry new_entries[] = { + {"aaa", "0005", kPutRecord}, {"e", 
"0002", kPutRecord}, + {"add", "0002", kMergeRecord}, {"aab", "00001", kPutRecord}, + {"zz", "00005", kPutRecord}, {"add", "0002", kPutRecord}, + {"aab", "0003", kPutRecord}, {"zz", "00005", kDeleteRecord}, + }; + + entries_list = std::vector(new_entries, new_entries + 8); + + TestValueAsSecondaryIndexHelper(entries_list, batch_.get()); +} + +TEST_P(WriteBatchWithIndexTest, TestComparatorForCF) { + ColumnFamilyHandleImplDummy cf1(6, nullptr); + ColumnFamilyHandleImplDummy reverse_cf(66, ReverseBytewiseComparator()); + ColumnFamilyHandleImplDummy cf2(88, BytewiseComparator()); + + ASSERT_OK(batch_->Put(&cf1, "ddd", "")); + ASSERT_OK(batch_->Put(&cf2, "aaa", "")); + ASSERT_OK(batch_->Put(&cf2, "eee", "")); + ASSERT_OK(batch_->Put(&cf1, "ccc", "")); + ASSERT_OK(batch_->Put(&reverse_cf, "a11", "")); + ASSERT_OK(batch_->Put(&cf1, "bbb", "")); + + Slice key_slices[] = {"a", "3", "3"}; + Slice value_slice = ""; + ASSERT_OK(batch_->Put(&reverse_cf, SliceParts(key_slices, 3), + SliceParts(&value_slice, 1))); + ASSERT_OK(batch_->Put(&reverse_cf, "a22", "")); + + { + std::unique_ptr iter(batch_->NewIterator(&cf1)); + iter->Seek(""); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bbb", iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("ccc", iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("ddd", iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + } + + { + std::unique_ptr iter(batch_->NewIterator(&cf2)); + iter->Seek(""); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("aaa", iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("eee", iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + } + + { + std::unique_ptr iter(batch_->NewIterator(&reverse_cf)); + iter->Seek(""); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("z"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a33", iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a22", iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a11", iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("a22"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a22", iter->Entry().key.ToString()); + + iter->Seek("a13"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a11", iter->Entry().key.ToString()); + } +} + +TEST_F(WBWIOverwriteTest, TestOverwriteKey) { + ColumnFamilyHandleImplDummy cf1(6, nullptr); + ColumnFamilyHandleImplDummy reverse_cf(66, ReverseBytewiseComparator()); + ColumnFamilyHandleImplDummy cf2(88, BytewiseComparator()); + + ASSERT_OK(batch_->Merge(&cf1, "ddd", "")); + ASSERT_OK(batch_->Put(&cf1, "ddd", "")); + ASSERT_OK(batch_->Delete(&cf1, "ddd")); + ASSERT_OK(batch_->Put(&cf2, "aaa", "")); + ASSERT_OK(batch_->Delete(&cf2, "aaa")); + ASSERT_OK(batch_->Put(&cf2, "aaa", "aaa")); + ASSERT_OK(batch_->Put(&cf2, "eee", "eee")); + ASSERT_OK(batch_->Put(&cf1, "ccc", "")); + ASSERT_OK(batch_->Put(&reverse_cf, "a11", "")); + ASSERT_OK(batch_->Delete(&cf1, "ccc")); + ASSERT_OK(batch_->Put(&reverse_cf, 
"a33", "a33")); + ASSERT_OK(batch_->Put(&reverse_cf, "a11", "a11")); + Slice slices[] = {"a", "3", "3"}; + ASSERT_OK(batch_->Delete(&reverse_cf, SliceParts(slices, 3))); + + { + std::unique_ptr iter(batch_->NewIterator(&cf1)); + iter->Seek(""); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("ccc", iter->Entry().key.ToString()); + ASSERT_TRUE(iter->Entry().type == WriteType::kDeleteRecord); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("ddd", iter->Entry().key.ToString()); + ASSERT_TRUE(iter->Entry().type == WriteType::kDeleteRecord); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + } + + { + std::unique_ptr iter(batch_->NewIterator(&cf2)); + iter->SeekToLast(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("eee", iter->Entry().key.ToString()); + ASSERT_EQ("eee", iter->Entry().value.ToString()); + iter->Prev(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("aaa", iter->Entry().key.ToString()); + ASSERT_EQ("aaa", iter->Entry().value.ToString()); + iter->Prev(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("aaa", iter->Entry().key.ToString()); + ASSERT_EQ("aaa", iter->Entry().value.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("eee", iter->Entry().key.ToString()); + ASSERT_EQ("eee", iter->Entry().value.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + } + + { + std::unique_ptr iter(batch_->NewIterator(&reverse_cf)); + iter->Seek(""); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("z"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a33", iter->Entry().key.ToString()); + ASSERT_TRUE(iter->Entry().type == WriteType::kDeleteRecord); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a11", iter->Entry().key.ToString()); + ASSERT_EQ("a11", iter->Entry().value.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->SeekToLast(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a11", iter->Entry().key.ToString()); + ASSERT_EQ("a11", iter->Entry().value.ToString()); + iter->Prev(); + + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a33", iter->Entry().key.ToString()); + ASSERT_TRUE(iter->Entry().type == WriteType::kDeleteRecord); + iter->Prev(); + ASSERT_TRUE(!iter->Valid()); + } +} + +TEST_P(WriteBatchWithIndexTest, TestWBWIIterator) { + ColumnFamilyHandleImplDummy cf1(1, BytewiseComparator()); + ColumnFamilyHandleImplDummy cf2(2, BytewiseComparator()); + ASSERT_OK(batch_->Put(&cf1, "a", "a1")); + ASSERT_OK(batch_->Put(&cf1, "c", "c1")); + ASSERT_OK(batch_->Put(&cf1, "c", "c2")); + ASSERT_OK(batch_->Put(&cf1, "e", "e1")); + ASSERT_OK(batch_->Put(&cf1, "e", "e2")); + ASSERT_OK(batch_->Put(&cf1, "e", "e3")); + std::unique_ptr iter1( + static_cast(batch_->NewIterator(&cf1))); + std::unique_ptr iter2( + static_cast(batch_->NewIterator(&cf2))); + AssertIterEqual(iter1.get(), {"a", "c", "e"}); + AssertIterEqual(iter2.get(), {}); + ASSERT_OK(batch_->Put(&cf2, "a", "a2")); + ASSERT_OK(batch_->Merge(&cf2, "b", "b1")); + ASSERT_OK(batch_->Merge(&cf2, "b", "b2")); + ASSERT_OK(batch_->Delete(&cf2, "d")); + ASSERT_OK(batch_->Merge(&cf2, "d", "d2")); + ASSERT_OK(batch_->Merge(&cf2, "d", "d3")); + 
ASSERT_OK(batch_->Delete(&cf2, "f")); + AssertIterEqual(iter1.get(), {"a", "c", "e"}); + AssertIterEqual(iter2.get(), {"a", "b", "d", "f"}); +} + +TEST_P(WriteBatchWithIndexTest, TestRandomIteraratorWithBase) { + std::vector source_strings = {"a", "b", "c", "d", "e", + "f", "g", "h", "i", "j"}; + for (int rand_seed = 301; rand_seed < 366; rand_seed++) { + Random rnd(rand_seed); + + ColumnFamilyHandleImplDummy cf1(6, BytewiseComparator()); + ColumnFamilyHandleImplDummy cf2(2, BytewiseComparator()); + ColumnFamilyHandleImplDummy cf3(8, BytewiseComparator()); + batch_->Clear(); + + if (rand_seed % 2 == 0) { + ASSERT_OK(batch_->Put(&cf2, "zoo", "bar")); + } + if (rand_seed % 4 == 1) { + ASSERT_OK(batch_->Put(&cf3, "zoo", "bar")); + } + + KVMap map; + KVMap merged_map; + for (auto key : source_strings) { + std::string value = key + key; + int type = rnd.Uniform(6); + switch (type) { + case 0: + // only base has it + map[key] = value; + merged_map[key] = value; + break; + case 1: + // only delta has it + ASSERT_OK(batch_->Put(&cf1, key, value)); + map[key] = value; + merged_map[key] = value; + break; + case 2: + // both has it. Delta should win + ASSERT_OK(batch_->Put(&cf1, key, value)); + map[key] = "wrong_value"; + merged_map[key] = value; + break; + case 3: + // both has it. Delta is delete + ASSERT_OK(batch_->Delete(&cf1, key)); + map[key] = "wrong_value"; + break; + case 4: + // only delta has it. Delta is delete + ASSERT_OK(batch_->Delete(&cf1, key)); + map[key] = "wrong_value"; + break; + default: + // Neither iterator has it. + break; + } + } + + std::unique_ptr iter( + batch_->NewIteratorWithBase(&cf1, new KVIter(&map))); + std::unique_ptr result_iter(new KVIter(&merged_map)); + + bool is_valid = false; + for (int i = 0; i < 128; i++) { + // Random walk and make sure iter and result_iter returns the + // same key and value + int type = rnd.Uniform(6); + ASSERT_OK(iter->status()); + switch (type) { + case 0: + // Seek to First + iter->SeekToFirst(); + result_iter->SeekToFirst(); + break; + case 1: + // Seek to last + iter->SeekToLast(); + result_iter->SeekToLast(); + break; + case 2: { + // Seek to random key + auto key_idx = rnd.Uniform(static_cast(source_strings.size())); + auto key = source_strings[key_idx]; + iter->Seek(key); + result_iter->Seek(key); + break; + } + case 3: { + // SeekForPrev to random key + auto key_idx = rnd.Uniform(static_cast(source_strings.size())); + auto key = source_strings[key_idx]; + iter->SeekForPrev(key); + result_iter->SeekForPrev(key); + break; + } + case 4: + // Next + if (is_valid) { + iter->Next(); + result_iter->Next(); + } else { + continue; + } + break; + default: + assert(type == 5); + // Prev + if (is_valid) { + iter->Prev(); + result_iter->Prev(); + } else { + continue; + } + break; + } + AssertItersMatch(iter.get(), result_iter.get()); + is_valid = iter->Valid(); + } + + ASSERT_OK(iter->status()); + } +} + +TEST_P(WriteBatchWithIndexTest, TestIteraratorWithBase) { + ColumnFamilyHandleImplDummy cf1(6, BytewiseComparator()); + ColumnFamilyHandleImplDummy cf2(2, BytewiseComparator()); + { + KVMap map; + map["a"] = "aa"; + map["c"] = "cc"; + map["e"] = "ee"; + std::unique_ptr iter( + batch_->NewIteratorWithBase(&cf1, new KVIter(&map))); + + iter->SeekToFirst(); + AssertIter(iter.get(), "a", "aa"); + iter->Next(); + AssertIter(iter.get(), "c", "cc"); + iter->Next(); + AssertIter(iter.get(), "e", "ee"); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->SeekToLast(); + AssertIter(iter.get(), "e", "ee"); + 
iter->Prev(); + AssertIter(iter.get(), "c", "cc"); + iter->Prev(); + AssertIter(iter.get(), "a", "aa"); + iter->Prev(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("b"); + AssertIter(iter.get(), "c", "cc"); + + iter->Prev(); + AssertIter(iter.get(), "a", "aa"); + + iter->Seek("a"); + AssertIter(iter.get(), "a", "aa"); + } + + // Test the case that there is one element in the write batch + ASSERT_OK(batch_->Put(&cf2, "zoo", "bar")); + ASSERT_OK(batch_->Put(&cf1, "a", "aa")); + { + KVMap empty_map; + std::unique_ptr iter( + batch_->NewIteratorWithBase(&cf1, new KVIter(&empty_map))); + + iter->SeekToFirst(); + AssertIter(iter.get(), "a", "aa"); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + } + + ASSERT_OK(batch_->Delete(&cf1, "b")); + ASSERT_OK(batch_->Put(&cf1, "c", "cc")); + ASSERT_OK(batch_->Put(&cf1, "d", "dd")); + ASSERT_OK(batch_->Delete(&cf1, "e")); + + { + KVMap map; + map["b"] = ""; + map["cc"] = "cccc"; + map["f"] = "ff"; + std::unique_ptr iter( + batch_->NewIteratorWithBase(&cf1, new KVIter(&map))); + + iter->SeekToFirst(); + AssertIter(iter.get(), "a", "aa"); + iter->Next(); + AssertIter(iter.get(), "c", "cc"); + iter->Next(); + AssertIter(iter.get(), "cc", "cccc"); + iter->Next(); + AssertIter(iter.get(), "d", "dd"); + iter->Next(); + AssertIter(iter.get(), "f", "ff"); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->SeekToLast(); + AssertIter(iter.get(), "f", "ff"); + iter->Prev(); + AssertIter(iter.get(), "d", "dd"); + iter->Prev(); + AssertIter(iter.get(), "cc", "cccc"); + iter->Prev(); + AssertIter(iter.get(), "c", "cc"); + iter->Next(); + AssertIter(iter.get(), "cc", "cccc"); + iter->Prev(); + AssertIter(iter.get(), "c", "cc"); + iter->Prev(); + AssertIter(iter.get(), "a", "aa"); + iter->Prev(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("c"); + AssertIter(iter.get(), "c", "cc"); + + iter->Seek("cb"); + AssertIter(iter.get(), "cc", "cccc"); + + iter->Seek("cc"); + AssertIter(iter.get(), "cc", "cccc"); + iter->Next(); + AssertIter(iter.get(), "d", "dd"); + + iter->Seek("e"); + AssertIter(iter.get(), "f", "ff"); + + iter->Prev(); + AssertIter(iter.get(), "d", "dd"); + + iter->Next(); + AssertIter(iter.get(), "f", "ff"); + } + + { + KVMap empty_map; + std::unique_ptr iter( + batch_->NewIteratorWithBase(&cf1, new KVIter(&empty_map))); + + iter->SeekToFirst(); + AssertIter(iter.get(), "a", "aa"); + iter->Next(); + AssertIter(iter.get(), "c", "cc"); + iter->Next(); + AssertIter(iter.get(), "d", "dd"); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->SeekToLast(); + AssertIter(iter.get(), "d", "dd"); + iter->Prev(); + AssertIter(iter.get(), "c", "cc"); + iter->Prev(); + AssertIter(iter.get(), "a", "aa"); + + iter->Prev(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("aa"); + AssertIter(iter.get(), "c", "cc"); + iter->Next(); + AssertIter(iter.get(), "d", "dd"); + + iter->Seek("ca"); + AssertIter(iter.get(), "d", "dd"); + + iter->Prev(); + AssertIter(iter.get(), "c", "cc"); + } +} + +TEST_P(WriteBatchWithIndexTest, TestIteraratorWithBaseReverseCmp) { + ColumnFamilyHandleImplDummy cf1(6, ReverseBytewiseComparator()); + ColumnFamilyHandleImplDummy cf2(2, ReverseBytewiseComparator()); + + // Test the case that there is one element in the write batch + ASSERT_OK(batch_->Put(&cf2, "zoo", "bar")); + ASSERT_OK(batch_->Put(&cf1, "a", "aa")); + { + KVMap empty_map; + std::unique_ptr iter( + 
batch_->NewIteratorWithBase(&cf1, new KVIter(&empty_map))); + + iter->SeekToFirst(); + AssertIter(iter.get(), "a", "aa"); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + } + + ASSERT_OK(batch_->Put(&cf1, "c", "cc")); + { + KVMap map; + std::unique_ptr iter( + batch_->NewIteratorWithBase(&cf1, new KVIter(&map))); + + iter->SeekToFirst(); + AssertIter(iter.get(), "c", "cc"); + iter->Next(); + AssertIter(iter.get(), "a", "aa"); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->SeekToLast(); + AssertIter(iter.get(), "a", "aa"); + iter->Prev(); + AssertIter(iter.get(), "c", "cc"); + iter->Prev(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("b"); + AssertIter(iter.get(), "a", "aa"); + + iter->Prev(); + AssertIter(iter.get(), "c", "cc"); + + iter->Seek("a"); + AssertIter(iter.get(), "a", "aa"); + } + + // default column family + ASSERT_OK(batch_->Put("a", "b")); + { + KVMap map; + map["b"] = ""; + std::unique_ptr iter( + batch_->NewIteratorWithBase(new KVIter(&map))); + + iter->SeekToFirst(); + AssertIter(iter.get(), "a", "b"); + iter->Next(); + AssertIter(iter.get(), "b", ""); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->SeekToLast(); + AssertIter(iter.get(), "b", ""); + iter->Prev(); + AssertIter(iter.get(), "a", "b"); + iter->Prev(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("b"); + AssertIter(iter.get(), "b", ""); + + iter->Prev(); + AssertIter(iter.get(), "a", "b"); + + iter->Seek("0"); + AssertIter(iter.get(), "a", "b"); + } +} + +TEST_P(WriteBatchWithIndexTest, TestGetFromBatch) { + Options options; + Status s; + std::string value; + + s = batch_->GetFromBatch(options_, "b", &value); + ASSERT_TRUE(s.IsNotFound()); + + ASSERT_OK(batch_->Put("a", "a")); + ASSERT_OK(batch_->Put("b", "b")); + ASSERT_OK(batch_->Put("c", "c")); + ASSERT_OK(batch_->Put("a", "z")); + ASSERT_OK(batch_->Delete("c")); + ASSERT_OK(batch_->Delete("d")); + ASSERT_OK(batch_->Delete("e")); + ASSERT_OK(batch_->Put("e", "e")); + + s = batch_->GetFromBatch(options_, "b", &value); + ASSERT_OK(s); + ASSERT_EQ("b", value); + + s = batch_->GetFromBatch(options_, "a", &value); + ASSERT_OK(s); + ASSERT_EQ("z", value); + + s = batch_->GetFromBatch(options_, "c", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = batch_->GetFromBatch(options_, "d", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = batch_->GetFromBatch(options_, "x", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = batch_->GetFromBatch(options_, "e", &value); + ASSERT_OK(s); + ASSERT_EQ("e", value); + + ASSERT_OK(batch_->Merge("z", "z")); + + s = batch_->GetFromBatch(options_, "z", &value); + ASSERT_NOK(s); // No merge operator specified. 
+ + s = batch_->GetFromBatch(options_, "b", &value); + ASSERT_OK(s); + ASSERT_EQ("b", value); +} + +TEST_P(WriteBatchWithIndexTest, TestGetFromBatchMerge) { + Status s = OpenDB(); + ASSERT_OK(s); + + ColumnFamilyHandle* column_family = db_->DefaultColumnFamily(); + std::string value; + + s = batch_->GetFromBatch(options_, "x", &value); + ASSERT_TRUE(s.IsNotFound()); + + ASSERT_OK(batch_->Put("x", "X")); + std::string expected = "X"; + + for (int i = 0; i < 5; i++) { + ASSERT_OK(batch_->Merge("x", std::to_string(i))); + expected = expected + "," + std::to_string(i); + + if (i % 2 == 0) { + ASSERT_OK(batch_->Put("y", std::to_string(i / 2))); + } + + ASSERT_OK(batch_->Merge("z", "z")); + + s = batch_->GetFromBatch(column_family, options_, "x", &value); + ASSERT_OK(s); + ASSERT_EQ(expected, value); + + s = batch_->GetFromBatch(column_family, options_, "y", &value); + ASSERT_OK(s); + ASSERT_EQ(std::to_string(i / 2), value); + + s = batch_->GetFromBatch(column_family, options_, "z", &value); + ASSERT_TRUE(s.IsMergeInProgress()); + } +} + +TEST_F(WBWIOverwriteTest, TestGetFromBatchMerge2) { + Status s = OpenDB(); + ASSERT_OK(s); + + ColumnFamilyHandle* column_family = db_->DefaultColumnFamily(); + std::string value; + + s = batch_->GetFromBatch(column_family, options_, "X", &value); + ASSERT_TRUE(s.IsNotFound()); + + ASSERT_OK(batch_->Put(column_family, "X", "x")); + ASSERT_OK(batch_->GetFromBatch(column_family, options_, "X", &value)); + ASSERT_EQ("x", value); + + ASSERT_OK(batch_->Put(column_family, "X", "x2")); + ASSERT_OK(batch_->GetFromBatch(column_family, options_, "X", &value)); + ASSERT_EQ("x2", value); + + ASSERT_OK(batch_->Merge(column_family, "X", "aaa")); + ASSERT_OK(batch_->GetFromBatch(column_family, options_, "X", &value)); + ASSERT_EQ("x2,aaa", value); + + ASSERT_OK(batch_->Merge(column_family, "X", "bbb")); + ASSERT_OK(batch_->GetFromBatch(column_family, options_, "X", &value)); + ASSERT_EQ("x2,aaa,bbb", value); + + ASSERT_OK(batch_->Put(column_family, "X", "x3")); + ASSERT_OK(batch_->GetFromBatch(column_family, options_, "X", &value)); + ASSERT_EQ("x3", value); + + ASSERT_OK(batch_->Merge(column_family, "X", "ccc")); + ASSERT_OK(batch_->GetFromBatch(column_family, options_, "X", &value)); + ASSERT_EQ("x3,ccc", value); + + ASSERT_OK(batch_->Delete(column_family, "X")); + s = batch_->GetFromBatch(column_family, options_, "X", &value); + ASSERT_TRUE(s.IsNotFound()); + + batch_->Merge(column_family, "X", "ddd"); + ASSERT_OK(batch_->GetFromBatch(column_family, options_, "X", &value)); + ASSERT_EQ("ddd", value); +} + +TEST_P(WriteBatchWithIndexTest, TestGetFromBatchAndDB) { + ASSERT_OK(OpenDB()); + + std::string value; + + ASSERT_OK(db_->Put(write_opts_, "a", "a")); + ASSERT_OK(db_->Put(write_opts_, "b", "b")); + ASSERT_OK(db_->Put(write_opts_, "c", "c")); + + ASSERT_OK(batch_->Put("a", "batch_->a")); + ASSERT_OK(batch_->Delete("b")); + + ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "a", &value)); + ASSERT_EQ("batch_->a", value); + + Status s = batch_->GetFromBatchAndDB(db_, read_opts_, "b", &value); + ASSERT_TRUE(s.IsNotFound()); + + ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "c", &value)); + ASSERT_EQ("c", value); + + s = batch_->GetFromBatchAndDB(db_, read_opts_, "x", &value); + ASSERT_TRUE(s.IsNotFound()); + + ASSERT_OK(db_->Delete(write_opts_, "x")); + + s = batch_->GetFromBatchAndDB(db_, read_opts_, "x", &value); + ASSERT_TRUE(s.IsNotFound()); +} + +TEST_P(WriteBatchWithIndexTest, TestGetFromBatchAndDBMerge) { + Status s = OpenDB(); + ASSERT_OK(s); + + 
std::string value; + + ASSERT_OK(db_->Put(write_opts_, "a", "a0")); + ASSERT_OK(db_->Put(write_opts_, "b", "b0")); + ASSERT_OK(db_->Merge(write_opts_, "b", "b1")); + ASSERT_OK(db_->Merge(write_opts_, "c", "c0")); + ASSERT_OK(db_->Merge(write_opts_, "d", "d0")); + + ASSERT_OK(batch_->Merge("a", "a1")); + ASSERT_OK(batch_->Merge("a", "a2")); + ASSERT_OK(batch_->Merge("b", "b2")); + ASSERT_OK(batch_->Merge("d", "d1")); + ASSERT_OK(batch_->Merge("e", "e0")); + + ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "a", &value)); + ASSERT_EQ("a0,a1,a2", value); + + ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "b", &value)); + ASSERT_EQ("b0,b1,b2", value); + + ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "c", &value)); + ASSERT_EQ("c0", value); + + ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "d", &value)); + ASSERT_EQ("d0,d1", value); + + ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "e", &value)); + ASSERT_EQ("e0", value); + + ASSERT_OK(db_->Delete(write_opts_, "x")); + + s = batch_->GetFromBatchAndDB(db_, read_opts_, "x", &value); + ASSERT_TRUE(s.IsNotFound()); + + const Snapshot* snapshot = db_->GetSnapshot(); + ReadOptions snapshot_read_options; + snapshot_read_options.snapshot = snapshot; + + ASSERT_OK(db_->Delete(write_opts_, "a")); + + ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "a", &value)); + ASSERT_EQ("a1,a2", value); + + ASSERT_OK( + s = batch_->GetFromBatchAndDB(db_, snapshot_read_options, "a", &value)); + ASSERT_EQ("a0,a1,a2", value); + + ASSERT_OK(batch_->Delete("a")); + + s = batch_->GetFromBatchAndDB(db_, read_opts_, "a", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = batch_->GetFromBatchAndDB(db_, snapshot_read_options, "a", &value); + ASSERT_TRUE(s.IsNotFound()); + + ASSERT_OK(s = db_->Merge(write_opts_, "c", "c1")); + + ASSERT_OK(s = batch_->GetFromBatchAndDB(db_, read_opts_, "c", &value)); + ASSERT_EQ("c0,c1", value); + + ASSERT_OK( + s = batch_->GetFromBatchAndDB(db_, snapshot_read_options, "c", &value)); + ASSERT_EQ("c0", value); + + ASSERT_OK(db_->Put(write_opts_, "e", "e1")); + ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "e", &value)); + ASSERT_EQ("e1,e0", value); + + ASSERT_OK(batch_->GetFromBatchAndDB(db_, snapshot_read_options, "e", &value)); + ASSERT_EQ("e0", value); + + ASSERT_OK(s = db_->Delete(write_opts_, "e")); + ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "e", &value)); + ASSERT_EQ("e0", value); + + ASSERT_OK(batch_->GetFromBatchAndDB(db_, snapshot_read_options, "e", &value)); + ASSERT_EQ("e0", value); + + db_->ReleaseSnapshot(snapshot); +} + +TEST_F(WBWIOverwriteTest, TestGetFromBatchAndDBMerge2) { + Status s = OpenDB(); + ASSERT_OK(s); + + std::string value; + + s = batch_->GetFromBatchAndDB(db_, read_opts_, "A", &value); + ASSERT_TRUE(s.IsNotFound()); + + ASSERT_OK(batch_->Merge("A", "xxx")); + ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "A", &value)); + ASSERT_EQ(value, "xxx"); + + ASSERT_OK(batch_->Merge("A", "yyy")); + ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "A", &value)); + ASSERT_EQ(value, "xxx,yyy"); + + ASSERT_OK(db_->Put(write_opts_, "A", "a0")); + ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "A", &value)); + ASSERT_EQ(value, "a0,xxx,yyy"); + + ASSERT_OK(batch_->Delete("A")); + + s = batch_->GetFromBatchAndDB(db_, read_opts_, "A", &value); + ASSERT_TRUE(s.IsNotFound()); +} + +TEST_P(WriteBatchWithIndexTest, TestGetFromBatchAndDBMerge3) { + Status s = OpenDB(); + ASSERT_OK(s); + + FlushOptions flush_options; + std::string value; + + 
ASSERT_OK(db_->Put(write_opts_, "A", "1")); + ASSERT_OK(db_->Flush(flush_options, db_->DefaultColumnFamily())); + ASSERT_OK(batch_->Merge("A", "2")); + + ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "A", &value)); + ASSERT_EQ(value, "1,2"); +} + +TEST_P(WriteBatchWithIndexTest, TestPinnedGetFromBatchAndDB) { + Status s = OpenDB(); + ASSERT_OK(s); + + PinnableSlice value; + + ASSERT_OK(db_->Put(write_opts_, "a", "a0")); + ASSERT_OK(db_->Put(write_opts_, "b", "b0")); + ASSERT_OK(db_->Merge(write_opts_, "b", "b1")); + ASSERT_OK(db_->Merge(write_opts_, "c", "c0")); + ASSERT_OK(db_->Merge(write_opts_, "d", "d0")); + ASSERT_OK(batch_->Merge("a", "a1")); + ASSERT_OK(batch_->Merge("a", "a2")); + ASSERT_OK(batch_->Merge("b", "b2")); + ASSERT_OK(batch_->Merge("d", "d1")); + ASSERT_OK(batch_->Merge("e", "e0")); + + for (int i = 0; i < 2; i++) { + if (i == 1) { + // Do it again with a flushed DB... + ASSERT_OK(db_->Flush(FlushOptions(), db_->DefaultColumnFamily())); + } + ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "a", &value)); + ASSERT_EQ("a0,a1,a2", value.ToString()); + + ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "b", &value)); + ASSERT_EQ("b0,b1,b2", value.ToString()); + + ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "c", &value)); + ASSERT_EQ("c0", value.ToString()); + + ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "d", &value)); + ASSERT_EQ("d0,d1", value.ToString()); + + ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "e", &value)); + ASSERT_EQ("e0", value.ToString()); + ASSERT_OK(db_->Delete(write_opts_, "x")); + + s = batch_->GetFromBatchAndDB(db_, read_opts_, "x", &value); + ASSERT_TRUE(s.IsNotFound()); + } +} + +void AssertKey(std::string key, WBWIIterator* iter) { + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(key, iter->Entry().key.ToString()); +} + +void AssertValue(std::string value, WBWIIterator* iter) { + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(value, iter->Entry().value.ToString()); +} + +// Tests that we can write to the WBWI while we iterate (from a single thread). 
+// iteration should see the newest writes +TEST_F(WBWIOverwriteTest, MutateWhileIteratingCorrectnessTest) { + for (char c = 'a'; c <= 'z'; ++c) { + ASSERT_OK(batch_->Put(std::string(1, c), std::string(1, c))); + } + + std::unique_ptr iter(batch_->NewIterator()); + iter->Seek("k"); + AssertKey("k", iter.get()); + iter->Next(); + AssertKey("l", iter.get()); + ASSERT_OK(batch_->Put("ab", "cc")); + iter->Next(); + AssertKey("m", iter.get()); + ASSERT_OK(batch_->Put("mm", "kk")); + iter->Next(); + AssertKey("mm", iter.get()); + AssertValue("kk", iter.get()); + ASSERT_OK(batch_->Delete("mm")); + + iter->Next(); + AssertKey("n", iter.get()); + iter->Prev(); + AssertKey("mm", iter.get()); + ASSERT_EQ(kDeleteRecord, iter->Entry().type); + + iter->Seek("ab"); + AssertKey("ab", iter.get()); + ASSERT_OK(batch_->Delete("x")); + iter->Seek("x"); + AssertKey("x", iter.get()); + ASSERT_EQ(kDeleteRecord, iter->Entry().type); + iter->Prev(); + AssertKey("w", iter.get()); +} + +void AssertIterKey(std::string key, Iterator* iter) { + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(key, iter->key().ToString()); +} + +void AssertIterValue(std::string value, Iterator* iter) { + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(value, iter->value().ToString()); +} + +// same thing as above, but testing IteratorWithBase +TEST_F(WBWIOverwriteTest, MutateWhileIteratingBaseCorrectnessTest) { + WriteBatchWithIndex batch(BytewiseComparator(), 0, true); + for (char c = 'a'; c <= 'z'; ++c) { + ASSERT_OK(batch_->Put(std::string(1, c), std::string(1, c))); + } + + KVMap map; + map["aa"] = "aa"; + map["cc"] = "cc"; + map["ee"] = "ee"; + map["em"] = "me"; + + std::unique_ptr iter(batch_->NewIteratorWithBase(new KVIter(&map))); + iter->Seek("k"); + AssertIterKey("k", iter.get()); + iter->Next(); + AssertIterKey("l", iter.get()); + ASSERT_OK(batch_->Put("ab", "cc")); + iter->Next(); + AssertIterKey("m", iter.get()); + ASSERT_OK(batch_->Put("mm", "kk")); + iter->Next(); + AssertIterKey("mm", iter.get()); + AssertIterValue("kk", iter.get()); + ASSERT_OK(batch_->Delete("mm")); + iter->Next(); + AssertIterKey("n", iter.get()); + iter->Prev(); + // "mm" is deleted, so we're back at "m" + AssertIterKey("m", iter.get()); + + iter->Seek("ab"); + AssertIterKey("ab", iter.get()); + iter->Prev(); + AssertIterKey("aa", iter.get()); + iter->Prev(); + AssertIterKey("a", iter.get()); + ASSERT_OK(batch_->Delete("aa")); + iter->Next(); + AssertIterKey("ab", iter.get()); + iter->Prev(); + AssertIterKey("a", iter.get()); + + ASSERT_OK(batch_->Delete("x")); + iter->Seek("x"); + AssertIterKey("y", iter.get()); + iter->Next(); + AssertIterKey("z", iter.get()); + iter->Prev(); + iter->Prev(); + AssertIterKey("w", iter.get()); + + ASSERT_OK(batch_->Delete("e")); + iter->Seek("e"); + AssertIterKey("ee", iter.get()); + AssertIterValue("ee", iter.get()); + ASSERT_OK(batch_->Put("ee", "xx")); + // still the same value + AssertIterValue("ee", iter.get()); + iter->Next(); + AssertIterKey("em", iter.get()); + iter->Prev(); + // new value + AssertIterValue("xx", iter.get()); + + ASSERT_OK(iter->status()); +} + +// stress testing mutations with IteratorWithBase +TEST_F(WBWIOverwriteTest, MutateWhileIteratingBaseStressTest) { + for (char c = 'a'; c <= 'z'; ++c) { + ASSERT_OK(batch_->Put(std::string(1, c), std::string(1, c))); + } + + KVMap map; + for (char c = 'a'; c <= 'z'; ++c) { + map[std::string(2, c)] = std::string(2, c); + } + + std::unique_ptr iter(batch_->NewIteratorWithBase(new KVIter(&map))); + + Random rnd(301); + for (int i = 0; i < 1000000; ++i) { + int random = 
rnd.Uniform(8); + char c = static_cast(rnd.Uniform(26) + 'a'); + switch (random) { + case 0: + ASSERT_OK(batch_->Put(std::string(1, c), "xxx")); + break; + case 1: + ASSERT_OK(batch_->Put(std::string(2, c), "xxx")); + break; + case 2: + ASSERT_OK(batch_->Delete(std::string(1, c))); + break; + case 3: + ASSERT_OK(batch_->Delete(std::string(2, c))); + break; + case 4: + iter->Seek(std::string(1, c)); + break; + case 5: + iter->Seek(std::string(2, c)); + break; + case 6: + if (iter->Valid()) { + iter->Next(); + } + break; + case 7: + if (iter->Valid()) { + iter->Prev(); + } + break; + default: + assert(false); + } + } + ASSERT_OK(iter->status()); +} + +TEST_P(WriteBatchWithIndexTest, TestNewIteratorWithBaseFromWbwi) { + ColumnFamilyHandleImplDummy cf1(6, BytewiseComparator()); + KVMap map; + map["a"] = "aa"; + map["c"] = "cc"; + map["e"] = "ee"; + std::unique_ptr iter( + batch_->NewIteratorWithBase(&cf1, new KVIter(&map))); + ASSERT_NE(nullptr, iter); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); +} + +TEST_P(WriteBatchWithIndexTest, SavePointTest) { + ColumnFamilyHandleImplDummy cf1(1, BytewiseComparator()); + KVMap empty_map; + std::unique_ptr cf0_iter( + batch_->NewIteratorWithBase(new KVIter(&empty_map))); + std::unique_ptr cf1_iter( + batch_->NewIteratorWithBase(&cf1, new KVIter(&empty_map))); + Status s; + KVMap kvm_cf0_0 = {{"A", "aa"}, {"B", "b"}}; + KVMap kvm_cf1_0 = {{"A", "a1"}, {"C", "c1"}, {"E", "e1"}}; + KVIter kvi_cf0_0(&kvm_cf0_0); + KVIter kvi_cf1_0(&kvm_cf1_0); + + ASSERT_OK(batch_->Put("A", "a")); + ASSERT_OK(batch_->Put("B", "b")); + ASSERT_OK(batch_->Put("A", "aa")); + ASSERT_OK(batch_->Put(&cf1, "A", "a1")); + ASSERT_OK(batch_->Delete(&cf1, "B")); + ASSERT_OK(batch_->Put(&cf1, "C", "c1")); + ASSERT_OK(batch_->Put(&cf1, "E", "e1")); + + AssertItersEqual(cf0_iter.get(), &kvi_cf0_0); + AssertItersEqual(cf1_iter.get(), &kvi_cf1_0); + batch_->SetSavePoint(); // 1 + + KVMap kvm_cf0_1 = {{"B", "bb"}, {"C", "cc"}}; + KVMap kvm_cf1_1 = {{"B", "b1"}, {"C", "c1"}}; + KVIter kvi_cf0_1(&kvm_cf0_1); + KVIter kvi_cf1_1(&kvm_cf1_1); + + ASSERT_OK(batch_->Put("C", "cc")); + ASSERT_OK(batch_->Put("B", "bb")); + ASSERT_OK(batch_->Delete("A")); + ASSERT_OK(batch_->Put(&cf1, "B", "b1")); + ASSERT_OK(batch_->Delete(&cf1, "A")); + ASSERT_OK(batch_->SingleDelete(&cf1, "E")); + batch_->SetSavePoint(); // 2 + AssertItersEqual(cf0_iter.get(), &kvi_cf0_1); + AssertItersEqual(cf1_iter.get(), &kvi_cf1_1); + + KVMap kvm_cf0_2 = {{"A", "xxx"}, {"C", "cc"}}; + KVMap kvm_cf1_2 = {{"B", "b2"}}; + KVIter kvi_cf0_2(&kvm_cf0_2); + KVIter kvi_cf1_2(&kvm_cf1_2); + + ASSERT_OK(batch_->Put("A", "aaa")); + ASSERT_OK(batch_->Put("A", "xxx")); + ASSERT_OK(batch_->Delete("B")); + ASSERT_OK(batch_->Put(&cf1, "B", "b2")); + ASSERT_OK(batch_->Delete(&cf1, "C")); + batch_->SetSavePoint(); // 3 + batch_->SetSavePoint(); // 4 + AssertItersEqual(cf0_iter.get(), &kvi_cf0_2); + AssertItersEqual(cf1_iter.get(), &kvi_cf1_2); + + KVMap kvm_cf0_4 = {{"A", "xxx"}, {"C", "cc"}}; + KVMap kvm_cf1_4 = {{"B", "b2"}}; + KVIter kvi_cf0_4(&kvm_cf0_4); + KVIter kvi_cf1_4(&kvm_cf1_4); + ASSERT_OK(batch_->SingleDelete("D")); + ASSERT_OK(batch_->Delete(&cf1, "D")); + ASSERT_OK(batch_->Delete(&cf1, "E")); + AssertItersEqual(cf0_iter.get(), &kvi_cf0_4); + AssertItersEqual(cf1_iter.get(), &kvi_cf1_4); + + ASSERT_OK(batch_->RollbackToSavePoint()); // rollback to 4 + AssertItersEqual(cf0_iter.get(), &kvi_cf0_2); + AssertItersEqual(cf1_iter.get(), &kvi_cf1_2); + + ASSERT_OK(batch_->RollbackToSavePoint()); // 
rollback to 3 + AssertItersEqual(cf0_iter.get(), &kvi_cf0_2); + AssertItersEqual(cf1_iter.get(), &kvi_cf1_2); + + ASSERT_OK(batch_->RollbackToSavePoint()); // rollback to 2 + AssertItersEqual(cf0_iter.get(), &kvi_cf0_1); + AssertItersEqual(cf1_iter.get(), &kvi_cf1_1); + + batch_->SetSavePoint(); // 5 + ASSERT_OK(batch_->Put("X", "x")); + + KVMap kvm_cf0_5 = {{"B", "bb"}, {"C", "cc"}, {"X", "x"}}; + KVIter kvi_cf0_5(&kvm_cf0_5); + KVIter kvi_cf1_5(&kvm_cf1_1); + AssertItersEqual(cf0_iter.get(), &kvi_cf0_5); + AssertItersEqual(cf1_iter.get(), &kvi_cf1_5); + + ASSERT_OK(batch_->RollbackToSavePoint()); // rollback to 5 + AssertItersEqual(cf0_iter.get(), &kvi_cf0_1); + AssertItersEqual(cf1_iter.get(), &kvi_cf1_1); + + ASSERT_OK(batch_->RollbackToSavePoint()); // rollback to 1 + AssertItersEqual(cf0_iter.get(), &kvi_cf0_0); + AssertItersEqual(cf1_iter.get(), &kvi_cf1_0); + + s = batch_->RollbackToSavePoint(); // no savepoint found + ASSERT_TRUE(s.IsNotFound()); + AssertItersEqual(cf0_iter.get(), &kvi_cf0_0); + AssertItersEqual(cf1_iter.get(), &kvi_cf1_0); + + batch_->SetSavePoint(); // 6 + + batch_->Clear(); + ASSERT_EQ("", PrintContents(batch_.get(), nullptr)); + ASSERT_EQ("", PrintContents(batch_.get(), &cf1)); + + s = batch_->RollbackToSavePoint(); // rollback to 6 + ASSERT_TRUE(s.IsNotFound()); +} + +TEST_P(WriteBatchWithIndexTest, SingleDeleteTest) { + Status s; + std::string value; + + ASSERT_OK(batch_->SingleDelete("A")); + + s = batch_->GetFromBatch(options_, "A", &value); + ASSERT_TRUE(s.IsNotFound()); + s = batch_->GetFromBatch(options_, "B", &value); + ASSERT_TRUE(s.IsNotFound()); + + batch_->Clear(); + ASSERT_OK(batch_->Put("A", "a")); + ASSERT_OK(batch_->Put("A", "a2")); + ASSERT_OK(batch_->Put("B", "b")); + ASSERT_OK(batch_->SingleDelete("A")); + + s = batch_->GetFromBatch(options_, "A", &value); + ASSERT_TRUE(s.IsNotFound()); + s = batch_->GetFromBatch(options_, "B", &value); + ASSERT_OK(s); + ASSERT_EQ("b", value); + + ASSERT_OK(batch_->Put("C", "c")); + ASSERT_OK(batch_->Put("A", "a3")); + ASSERT_OK(batch_->Delete("B")); + ASSERT_OK(batch_->SingleDelete("B")); + ASSERT_OK(batch_->SingleDelete("C")); + + s = batch_->GetFromBatch(options_, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("a3", value); + s = batch_->GetFromBatch(options_, "B", &value); + ASSERT_TRUE(s.IsNotFound()); + s = batch_->GetFromBatch(options_, "C", &value); + ASSERT_TRUE(s.IsNotFound()); + s = batch_->GetFromBatch(options_, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + + ASSERT_OK(batch_->Put("B", "b4")); + ASSERT_OK(batch_->Put("C", "c4")); + ASSERT_OK(batch_->Put("D", "d4")); + ASSERT_OK(batch_->SingleDelete("D")); + ASSERT_OK(batch_->SingleDelete("D")); + ASSERT_OK(batch_->Delete("A")); + + s = batch_->GetFromBatch(options_, "A", &value); + ASSERT_TRUE(s.IsNotFound()); + s = batch_->GetFromBatch(options_, "B", &value); + ASSERT_OK(s); + ASSERT_EQ("b4", value); + s = batch_->GetFromBatch(options_, "C", &value); + ASSERT_OK(s); + ASSERT_EQ("c4", value); + s = batch_->GetFromBatch(options_, "D", &value); + ASSERT_TRUE(s.IsNotFound()); +} + +TEST_P(WriteBatchWithIndexTest, SingleDeleteDeltaIterTest) { + std::string value; + ASSERT_OK(batch_->Put("A", "a")); + ASSERT_OK(batch_->Put("A", "a2")); + ASSERT_OK(batch_->Put("B", "b")); + ASSERT_OK(batch_->SingleDelete("A")); + ASSERT_OK(batch_->Delete("B")); + + KVMap map; + value = PrintContents(batch_.get(), &map, nullptr); + ASSERT_EQ("", value); + + map["A"] = "aa"; + map["C"] = "cc"; + map["D"] = "dd"; + + ASSERT_OK(batch_->SingleDelete("B")); + 
ASSERT_OK(batch_->SingleDelete("C")); + ASSERT_OK(batch_->SingleDelete("Z")); + + value = PrintContents(batch_.get(), &map, nullptr); + ASSERT_EQ("D:dd,", value); + + ASSERT_OK(batch_->Put("A", "a3")); + ASSERT_OK(batch_->Put("B", "b3")); + ASSERT_OK(batch_->SingleDelete("A")); + ASSERT_OK(batch_->SingleDelete("A")); + ASSERT_OK(batch_->SingleDelete("D")); + ASSERT_OK(batch_->SingleDelete("D")); + ASSERT_OK(batch_->Delete("D")); + + map["E"] = "ee"; + + value = PrintContents(batch_.get(), &map, nullptr); + ASSERT_EQ("B:b3,E:ee,", value); +} + +TEST_P(WriteBatchWithIndexTest, MultiGetTest) { + // MultiGet a lot of keys in order to force std::vector reallocations + std::vector keys; + for (int i = 0; i < 100; ++i) { + keys.emplace_back(std::to_string(i)); + } + + ASSERT_OK(OpenDB()); + ColumnFamilyHandle* cf0 = db_->DefaultColumnFamily(); + + // Write some data to the db for the even numbered keys + { + WriteBatch wb; + for (size_t i = 0; i < keys.size(); i += 2) { + std::string val = "val" + std::to_string(i); + ASSERT_OK(wb.Put(cf0, keys[i], val)); + } + ASSERT_OK(db_->Write(write_opts_, &wb)); + for (size_t i = 0; i < keys.size(); i += 2) { + std::string value; + ASSERT_OK(db_->Get(read_opts_, cf0, keys[i], &value)); + } + } + + // Write some data to the batch + for (size_t i = 0; i < keys.size(); ++i) { + if ((i % 5) == 0) { + ASSERT_OK(batch_->Delete(cf0, keys[i])); + } else if ((i % 7) == 0) { + std::string val = "new" + std::to_string(i); + ASSERT_OK(batch_->Put(cf0, keys[i], val)); + } + if (i > 0 && (i % 3) == 0) { + ASSERT_OK(batch_->Merge(cf0, keys[i], "merge")); + } + } + + std::vector key_slices; + for (size_t i = 0; i < keys.size(); ++i) { + key_slices.emplace_back(keys[i]); + } + std::vector values(keys.size()); + std::vector statuses(keys.size()); + + batch_->MultiGetFromBatchAndDB(db_, read_opts_, cf0, key_slices.size(), + key_slices.data(), values.data(), + statuses.data(), false); + for (size_t i = 0; i < keys.size(); ++i) { + if (i == 0) { + ASSERT_TRUE(statuses[i].IsNotFound()); + } else if ((i % 3) == 0) { + ASSERT_OK(statuses[i]); + if ((i % 5) == 0) { // Merge after Delete + ASSERT_EQ(values[i], "merge"); + } else if ((i % 7) == 0) { // Merge after Put + std::string val = "new" + std::to_string(i); + ASSERT_EQ(values[i], val + ",merge"); + } else if ((i % 2) == 0) { + std::string val = "val" + std::to_string(i); + ASSERT_EQ(values[i], val + ",merge"); + } else { + ASSERT_EQ(values[i], "merge"); + } + } else if ((i % 5) == 0) { + ASSERT_TRUE(statuses[i].IsNotFound()); + } else if ((i % 7) == 0) { + ASSERT_OK(statuses[i]); + ASSERT_EQ(values[i], "new" + std::to_string(i)); + } else if ((i % 2) == 0) { + ASSERT_OK(statuses[i]); + ASSERT_EQ(values[i], "val" + std::to_string(i)); + } else { + ASSERT_TRUE(statuses[i].IsNotFound()); + } + } +} +TEST_P(WriteBatchWithIndexTest, MultiGetTest2) { + // MultiGet a lot of keys in order to force std::vector reallocations + const int num_keys = 700; + const int keys_per_pass = 100; + std::vector keys; + for (size_t i = 0; i < num_keys; ++i) { + keys.emplace_back(std::to_string(i)); + } + ASSERT_OK(OpenDB()); + ColumnFamilyHandle* cf0 = db_->DefaultColumnFamily(); + + // Keys 0- 99 have a PUT in the batch but not DB + // Keys 100-199 have a PUT in the DB + // Keys 200-299 Have a PUT/DELETE + // Keys 300-399 Have a PUT/DELETE/MERGE + // Keys 400-499 have a PUT/MERGE + // Keys 500-599 have a MERGE only + // Keys 600-699 were never written + { + WriteBatch wb; + for (size_t i = 100; i < 500; i++) { + std::string val = std::to_string(i); 
+ ASSERT_OK(wb.Put(cf0, keys[i], val)); + } + ASSERT_OK(db_->Write(write_opts_, &wb)); + } + ASSERT_OK(db_->Flush(FlushOptions(), cf0)); + for (size_t i = 0; i < 100; i++) { + ASSERT_OK(batch_->Put(cf0, keys[i], keys[i])); + } + for (size_t i = 200; i < 400; i++) { + ASSERT_OK(batch_->Delete(cf0, keys[i])); + } + for (size_t i = 300; i < 600; i++) { + std::string val = std::to_string(i) + "m"; + ASSERT_OK(batch_->Merge(cf0, keys[i], val)); + } + + Random rnd(301); + std::vector values(keys_per_pass); + std::vector statuses(keys_per_pass); + for (int pass = 0; pass < 40; pass++) { + std::vector key_slices; + for (size_t i = 0; i < keys_per_pass; i++) { + int random = rnd.Uniform(num_keys); + key_slices.emplace_back(keys[random]); + } + batch_->MultiGetFromBatchAndDB(db_, read_opts_, cf0, keys_per_pass, + key_slices.data(), values.data(), + statuses.data(), false); + for (size_t i = 0; i < keys_per_pass; i++) { + int key = ParseInt(key_slices[i].ToString()); + switch (key / 100) { + case 0: // 0-99 PUT only + ASSERT_OK(statuses[i]); + ASSERT_EQ(values[i], key_slices[i].ToString()); + break; + case 1: // 100-199 PUT only + ASSERT_OK(statuses[i]); + ASSERT_EQ(values[i], key_slices[i].ToString()); + break; + case 2: // 200-299 Deleted + ASSERT_TRUE(statuses[i].IsNotFound()); + break; + case 3: // 300-399 Delete+Merge + ASSERT_OK(statuses[i]); + ASSERT_EQ(values[i], key_slices[i].ToString() + "m"); + break; + case 4: // 400-400 Put+ Merge + ASSERT_OK(statuses[i]); + ASSERT_EQ(values[i], key_slices[i].ToString() + "," + + key_slices[i].ToString() + "m"); + break; + case 5: // Merge only + ASSERT_OK(statuses[i]); + ASSERT_EQ(values[i], key_slices[i].ToString() + "m"); + break; + case 6: // Never written + ASSERT_TRUE(statuses[i].IsNotFound()); + break; + default: + assert(false); + } // end switch + } // End for each key + } // end for passes +} + +// This test has merges, but the merge does not play into the final result +TEST_P(WriteBatchWithIndexTest, FakeMergeWithIteratorTest) { + ASSERT_OK(OpenDB()); + ColumnFamilyHandle* cf0 = db_->DefaultColumnFamily(); + + // The map we are starting with + KVMap input = { + {"odm", "odm0"}, + {"omd", "omd0"}, + {"omp", "omp0"}, + }; + KVMap result = { + {"odm", "odm2"}, // Orig, Delete, Merge + {"mp", "mp1"}, // Merge, Put + {"omp", "omp2"}, // Origi, Merge, Put + {"mmp", "mmp2"} // Merge, Merge, Put + }; + + for (auto& iter : result) { + EXPECT_EQ(AddToBatch(cf0, iter.first), iter.second); + } + AddToBatch(cf0, "md"); // Merge, Delete + AddToBatch(cf0, "mmd"); // Merge, Merge, Delete + AddToBatch(cf0, "omd"); // Orig, Merge, Delete + + KVIter kvi(&result); + // First try just the batch + std::unique_ptr iter( + batch_->NewIteratorWithBase(cf0, new KVIter(&input))); + AssertItersEqual(iter.get(), &kvi); +} + +TEST_P(WriteBatchWithIndexTest, IteratorMergeTest) { + ASSERT_OK(OpenDB()); + ColumnFamilyHandle* cf0 = db_->DefaultColumnFamily(); + + KVMap result = { + {"m", "m0"}, // Merge + {"mm", "mm0,mm1"}, // Merge, Merge + {"dm", "dm1"}, // Delete, Merge + {"dmm", "dmm1,dmm2"}, // Delete, Merge, Merge + {"mdm", "mdm2"}, // Merge, Delete, Merge + {"mpm", "mpm1,mpm2"}, // Merge, Put, Merge + {"pm", "pm0,pm1"}, // Put, Merge + {"pmm", "pmm0,pmm1,pmm2"}, // Put, Merge, Merge + }; + + for (auto& iter : result) { + EXPECT_EQ(AddToBatch(cf0, iter.first), iter.second); + } + + KVIter kvi(&result); + // First try just the batch + KVMap empty_map; + std::unique_ptr iter( + batch_->NewIteratorWithBase(cf0, new KVIter(&empty_map))); + AssertItersEqual(iter.get(), &kvi); 
+} + +TEST_P(WriteBatchWithIndexTest, IteratorMergeTestWithOrig) { + ASSERT_OK(OpenDB()); + ColumnFamilyHandle* cf0 = db_->DefaultColumnFamily(); + KVMap original; + KVMap results = { + {"m", "om,m0"}, // Merge + {"mm", "omm,mm0,mm1"}, // Merge, Merge + {"dm", "dm1"}, // Delete, Merge + {"dmm", "dmm1,dmm2"}, // Delete, Merge, Merge + {"mdm", "mdm2"}, // Merge, Delete, Merge + {"mpm", "mpm1,mpm2"}, // Merge, Put, Merge + {"pm", "pm0,pm1"}, // Put, Merge + {"pmm", "pmm0,pmm1,pmm2"}, // Put, Merge, Merge + }; + + for (auto& iter : results) { + AddToBatch(cf0, iter.first); + original[iter.first] = "o" + iter.first; + } + + KVIter kvi(&results); + // First try just the batch + std::unique_ptr iter( + batch_->NewIteratorWithBase(cf0, new KVIter(&original))); + AssertItersEqual(iter.get(), &kvi); +} + +TEST_P(WriteBatchWithIndexTest, GetFromBatchAfterMerge) { + std::string value; + Status s; + + ASSERT_OK(OpenDB()); + ASSERT_OK(db_->Put(write_opts_, "o", "aa")); + batch_->Merge("o", "bb"); // Merging bb under key "o" + batch_->Merge("m", "cc"); // Merging bc under key "m" + s = batch_->GetFromBatch(options_, "m", &value); + ASSERT_EQ(s.code(), Status::Code::kMergeInProgress); + s = batch_->GetFromBatch(options_, "o", &value); + ASSERT_EQ(s.code(), Status::Code::kMergeInProgress); + + ASSERT_OK(db_->Write(write_opts_, batch_->GetWriteBatch())); + ASSERT_OK(db_->Get(read_opts_, "o", &value)); + ASSERT_EQ(value, "aa,bb"); + ASSERT_OK(db_->Get(read_opts_, "m", &value)); + ASSERT_EQ(value, "cc"); +} + +TEST_P(WriteBatchWithIndexTest, GetFromBatchAndDBAfterMerge) { + std::string value; + + ASSERT_OK(OpenDB()); + ASSERT_OK(db_->Put(write_opts_, "o", "aa")); + ASSERT_OK(batch_->Merge("o", "bb")); // Merging bb under key "o" + ASSERT_OK(batch_->Merge("m", "cc")); // Merging bc under key "m" + ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "o", &value)); + ASSERT_EQ(value, "aa,bb"); + ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "m", &value)); + ASSERT_EQ(value, "cc"); +} + +TEST_F(WBWIKeepTest, GetAfterPut) { + std::string value; + ASSERT_OK(OpenDB()); + ColumnFamilyHandle* cf0 = db_->DefaultColumnFamily(); + + ASSERT_OK(db_->Put(write_opts_, "key", "orig")); + + ASSERT_OK(batch_->Put("key", "aa")); // Writing aa under key + ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value)); + ASSERT_EQ(value, "aa"); + ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value)); + ASSERT_EQ(value, "aa"); + + ASSERT_OK(batch_->Merge("key", "bb")); // Merging bb under key + ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value)); + ASSERT_EQ(value, "aa,bb"); + ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value)); + ASSERT_EQ(value, "aa,bb"); + + ASSERT_OK(batch_->Merge("key", "cc")); // Merging cc under key + ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value)); + ASSERT_EQ(value, "aa,bb,cc"); + ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value)); + ASSERT_EQ(value, "aa,bb,cc"); +} + +TEST_P(WriteBatchWithIndexTest, GetAfterMergePut) { + std::string value; + ASSERT_OK(OpenDB()); + ColumnFamilyHandle* cf0 = db_->DefaultColumnFamily(); + ASSERT_OK(db_->Put(write_opts_, "key", "orig")); + + ASSERT_OK(batch_->Merge("key", "aa")); // Merging aa under key + Status s = batch_->GetFromBatch(cf0, options_, "key", &value); + ASSERT_EQ(s.code(), Status::Code::kMergeInProgress); + ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value)); + ASSERT_EQ(value, "orig,aa"); + + ASSERT_OK(batch_->Merge("key", "bb")); // Merging bb under key + s = 
batch_->GetFromBatch(cf0, options_, "key", &value);
+  ASSERT_EQ(s.code(), Status::Code::kMergeInProgress);
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_EQ(value, "orig,aa,bb");
+
+  ASSERT_OK(batch_->Put("key", "cc"));  // Writing cc under key
+  ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
+  ASSERT_EQ(value, "cc");
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_EQ(value, "cc");
+
+  ASSERT_OK(batch_->Merge("key", "dd"));  // Merging dd under key
+  ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
+  ASSERT_EQ(value, "cc,dd");
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_EQ(value, "cc,dd");
+}
+
+TEST_P(WriteBatchWithIndexTest, GetAfterMergeDelete) {
+  std::string value;
+  ASSERT_OK(OpenDB());
+  ColumnFamilyHandle* cf0 = db_->DefaultColumnFamily();
+
+  ASSERT_OK(batch_->Merge("key", "aa"));  // Merging aa under key
+  Status s = batch_->GetFromBatch(cf0, options_, "key", &value);
+  ASSERT_EQ(s.code(), Status::Code::kMergeInProgress);
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_EQ(value, "aa");
+
+  ASSERT_OK(batch_->Merge("key", "bb"));  // Merging bb under key
+  s = batch_->GetFromBatch(cf0, options_, "key", &value);
+  ASSERT_EQ(s.code(), Status::Code::kMergeInProgress);
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_EQ(value, "aa,bb");
+
+  ASSERT_OK(batch_->Delete("key"));  // Delete key from batch
+  s = batch_->GetFromBatch(cf0, options_, "key", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  s = batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  ASSERT_OK(batch_->Merge("key", "cc"));  // Merging cc under key
+  ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
+  ASSERT_EQ(value, "cc");
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_EQ(value, "cc");
+  ASSERT_OK(batch_->Merge("key", "dd"));  // Merging dd under key
+  ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
+  ASSERT_EQ(value, "cc,dd");
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_EQ(value, "cc,dd");
+}
+
+TEST_F(WBWIOverwriteTest, TestBadMergeOperator) {
+  class FailingMergeOperator : public MergeOperator {
+   public:
+    FailingMergeOperator() {}
+
+    bool FullMergeV2(const MergeOperationInput& /*merge_in*/,
+                     MergeOperationOutput* /*merge_out*/) const override {
+      return false;
+    }
+
+    const char* Name() const override { return "Failing"; }
+  };
+  options_.merge_operator.reset(new FailingMergeOperator());
+  ASSERT_OK(OpenDB());
+
+  ColumnFamilyHandle* column_family = db_->DefaultColumnFamily();
+  std::string value;
+
+  ASSERT_OK(db_->Put(write_opts_, "a", "a0"));
+  ASSERT_OK(batch_->Put("b", "b0"));
+
+  ASSERT_OK(batch_->Merge("a", "a1"));
+  ASSERT_NOK(batch_->GetFromBatchAndDB(db_, read_opts_, "a", &value));
+  ASSERT_NOK(batch_->GetFromBatch(column_family, options_, "a", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "b", &value));
+  ASSERT_OK(batch_->GetFromBatch(column_family, options_, "b", &value));
+}
+
+TEST_P(WriteBatchWithIndexTest, ColumnFamilyWithTimestamp) {
+  ColumnFamilyHandleImplDummy cf2(2,
+                                  test::BytewiseComparatorWithU64TsWrapper());
+
+  // Sanity checks
+  ASSERT_TRUE(batch_->Put(&cf2, "key", "ts", "value").IsNotSupported());
+  ASSERT_TRUE(batch_->Put(/*column_family=*/nullptr, "key", "ts", "value")
+                  .IsInvalidArgument());
+  ASSERT_TRUE(batch_->Delete(&cf2, "key", "ts").IsNotSupported());
+  ASSERT_TRUE(batch_->Delete(/*column_family=*/nullptr, "key", "ts")
+                  .IsInvalidArgument());
+  ASSERT_TRUE(batch_->SingleDelete(&cf2, "key", "ts").IsNotSupported());
+  ASSERT_TRUE(batch_->SingleDelete(/*column_family=*/nullptr, "key", "ts")
+                  .IsInvalidArgument());
+  {
+    std::string value;
+    ASSERT_TRUE(batch_
+                    ->GetFromBatchAndDB(
+                        /*db=*/nullptr, ReadOptions(), &cf2, "key", &value)
+                    .IsInvalidArgument());
+  }
+  {
+    constexpr size_t num_keys = 2;
+    std::array<Slice, num_keys> keys{{Slice(), Slice()}};
+    std::array<PinnableSlice, num_keys> pinnable_vals{
+        {PinnableSlice(), PinnableSlice()}};
+    std::array<Status, num_keys> statuses{{Status(), Status()}};
+    constexpr bool sorted_input = false;
+    batch_->MultiGetFromBatchAndDB(/*db=*/nullptr, ReadOptions(), &cf2,
+                                   num_keys, keys.data(), pinnable_vals.data(),
+                                   statuses.data(), sorted_input);
+    for (const auto& s : statuses) {
+      ASSERT_TRUE(s.IsInvalidArgument());
+    }
+  }
+
+  constexpr uint32_t kMaxKey = 10;
+
+  const auto ts_sz_lookup = [&cf2](uint32_t id) {
+    if (cf2.GetID() == id) {
+      return sizeof(uint64_t);
+    } else {
+      return std::numeric_limits<size_t>::max();
+    }
+  };
+
+  // Put keys
+  for (uint32_t i = 0; i < kMaxKey; ++i) {
+    std::string key;
+    PutFixed32(&key, i);
+    Status s = batch_->Put(&cf2, key, "value" + std::to_string(i));
+    ASSERT_OK(s);
+  }
+
+  WriteBatch* wb = batch_->GetWriteBatch();
+  assert(wb);
+  ASSERT_OK(
+      wb->UpdateTimestamps(std::string(sizeof(uint64_t), '\0'), ts_sz_lookup));
+
+  // Point lookup
+  for (uint32_t i = 0; i < kMaxKey; ++i) {
+    std::string value;
+    std::string key;
+    PutFixed32(&key, i);
+    Status s = batch_->GetFromBatch(&cf2, Options(), key, &value);
+    ASSERT_OK(s);
+    ASSERT_EQ("value" + std::to_string(i), value);
+  }
+
+  // Iterator
+  {
+    std::unique_ptr<WBWIIterator> it(batch_->NewIterator(&cf2));
+    uint32_t start = 0;
+    for (it->SeekToFirst(); it->Valid(); it->Next(), ++start) {
+      std::string key;
+      PutFixed32(&key, start);
+      ASSERT_OK(it->status());
+      ASSERT_EQ(key, it->Entry().key);
+      ASSERT_EQ("value" + std::to_string(start), it->Entry().value);
+      ASSERT_EQ(WriteType::kPutRecord, it->Entry().type);
+    }
+    ASSERT_EQ(kMaxKey, start);
+  }
+
+  // Delete the keys with Delete() or SingleDelete()
+  for (uint32_t i = 0; i < kMaxKey; ++i) {
+    std::string key;
+    PutFixed32(&key, i);
+    Status s;
+    if (0 == (i % 2)) {
+      s = batch_->Delete(&cf2, key);
+    } else {
+      s = batch_->SingleDelete(&cf2, key);
+    }
+    ASSERT_OK(s);
+  }
+
+  ASSERT_OK(wb->UpdateTimestamps(std::string(sizeof(uint64_t), '\xfe'),
+                                 ts_sz_lookup));
+
+  for (uint32_t i = 0; i < kMaxKey; ++i) {
+    std::string value;
+    std::string key;
+    PutFixed32(&key, i);
+    Status s = batch_->GetFromBatch(&cf2, Options(), key, &value);
+    ASSERT_TRUE(s.IsNotFound());
+  }
+
+  // Iterator
+  {
+    const bool overwrite = GetParam();
+    std::unique_ptr<WBWIIterator> it(batch_->NewIterator(&cf2));
+    uint32_t start = 0;
+    for (it->SeekToFirst(); it->Valid(); it->Next(), ++start) {
+      std::string key;
+      PutFixed32(&key, start);
+      ASSERT_EQ(key, it->Entry().key);
+      if (!overwrite) {
+        ASSERT_EQ(WriteType::kPutRecord, it->Entry().type);
+        it->Next();
+        ASSERT_TRUE(it->Valid());
+      }
+      if (0 == (start % 2)) {
+        ASSERT_EQ(WriteType::kDeleteRecord, it->Entry().type);
+      } else {
+        ASSERT_EQ(WriteType::kSingleDeleteRecord, it->Entry().type);
+      }
+    }
+  }
+}
+
+TEST_P(WriteBatchWithIndexTest, IndexNoTs) {
+  const Comparator* const ucmp = test::BytewiseComparatorWithU64TsWrapper();
+  ColumnFamilyHandleImplDummy cf(1, ucmp);
+  WriteBatchWithIndex wbwi;
+  ASSERT_OK(wbwi.Put(&cf, "a", "a0"));
+  ASSERT_OK(wbwi.Put(&cf, "a", "a1"));
+  {
+    std::string ts;
+    PutFixed64(&ts, 10000);
+    ASSERT_OK(wbwi.GetWriteBatch()->UpdateTimestamps(
+        ts, [](uint32_t cf_id) { return cf_id == 1 ? 8 : 0; }));
+  }
+  {
+    std::string value;
+    Status s = wbwi.GetFromBatch(&cf, options_, "a", &value);
+    ASSERT_OK(s);
+    ASSERT_EQ("a1", value);
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(WBWI, WriteBatchWithIndexTest, testing::Bool());
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main() {
+  fprintf(stderr, "SKIPPED\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
-- 
cgit v1.2.3