path: root/src/rocksdb/utilities
author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:54:28 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:54:28 +0000
commit     e6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree       64f88b554b444a49f656b6c656111a145cbbaa28 /src/rocksdb/utilities
parent     Initial commit. (diff)
download   ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz
           ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip
Adding upstream version 18.2.2. (upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/rocksdb/utilities')
-rw-r--r--  src/rocksdb/utilities/agg_merge/agg_merge.cc  238
-rw-r--r--  src/rocksdb/utilities/agg_merge/agg_merge.h  49
-rw-r--r--  src/rocksdb/utilities/agg_merge/agg_merge_test.cc  135
-rw-r--r--  src/rocksdb/utilities/agg_merge/test_agg_merge.cc  104
-rw-r--r--  src/rocksdb/utilities/agg_merge/test_agg_merge.h  47
-rw-r--r--  src/rocksdb/utilities/backup/backup_engine.cc  3181
-rw-r--r--  src/rocksdb/utilities/backup/backup_engine_impl.h  36
-rw-r--r--  src/rocksdb/utilities/backup/backup_engine_test.cc  4219
-rw-r--r--  src/rocksdb/utilities/blob_db/blob_compaction_filter.cc  490
-rw-r--r--  src/rocksdb/utilities/blob_db/blob_compaction_filter.h  204
-rw-r--r--  src/rocksdb/utilities/blob_db/blob_db.cc  114
-rw-r--r--  src/rocksdb/utilities/blob_db/blob_db.h  266
-rw-r--r--  src/rocksdb/utilities/blob_db/blob_db_gc_stats.h  56
-rw-r--r--  src/rocksdb/utilities/blob_db/blob_db_impl.cc  2177
-rw-r--r--  src/rocksdb/utilities/blob_db/blob_db_impl.h  503
-rw-r--r--  src/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc  113
-rw-r--r--  src/rocksdb/utilities/blob_db/blob_db_iterator.h  150
-rw-r--r--  src/rocksdb/utilities/blob_db/blob_db_listener.h  71
-rw-r--r--  src/rocksdb/utilities/blob_db/blob_db_test.cc  2407
-rw-r--r--  src/rocksdb/utilities/blob_db/blob_dump_tool.cc  282
-rw-r--r--  src/rocksdb/utilities/blob_db/blob_dump_tool.h  58
-rw-r--r--  src/rocksdb/utilities/blob_db/blob_file.cc  318
-rw-r--r--  src/rocksdb/utilities/blob_db/blob_file.h  246
-rw-r--r--  src/rocksdb/utilities/cache_dump_load.cc  69
-rw-r--r--  src/rocksdb/utilities/cache_dump_load_impl.cc  393
-rw-r--r--  src/rocksdb/utilities/cache_dump_load_impl.h  359
-rw-r--r--  src/rocksdb/utilities/cassandra/cassandra_compaction_filter.cc  110
-rw-r--r--  src/rocksdb/utilities/cassandra/cassandra_compaction_filter.h  57
-rw-r--r--  src/rocksdb/utilities/cassandra/cassandra_format_test.cc  377
-rw-r--r--  src/rocksdb/utilities/cassandra/cassandra_functional_test.cc  446
-rw-r--r--  src/rocksdb/utilities/cassandra/cassandra_options.h  43
-rw-r--r--  src/rocksdb/utilities/cassandra/cassandra_row_merge_test.cc  98
-rw-r--r--  src/rocksdb/utilities/cassandra/cassandra_serialize_test.cc  164
-rw-r--r--  src/rocksdb/utilities/cassandra/format.cc  367
-rw-r--r--  src/rocksdb/utilities/cassandra/format.h  183
-rw-r--r--  src/rocksdb/utilities/cassandra/merge_operator.cc  82
-rw-r--r--  src/rocksdb/utilities/cassandra/merge_operator.h  44
-rw-r--r--  src/rocksdb/utilities/cassandra/serialize.h  81
-rw-r--r--  src/rocksdb/utilities/cassandra/test_utils.cc  69
-rw-r--r--  src/rocksdb/utilities/cassandra/test_utils.h  42
-rw-r--r--  src/rocksdb/utilities/checkpoint/checkpoint_impl.cc  469
-rw-r--r--  src/rocksdb/utilities/checkpoint/checkpoint_impl.h  66
-rw-r--r--  src/rocksdb/utilities/checkpoint/checkpoint_test.cc  974
-rw-r--r--  src/rocksdb/utilities/compaction_filters.cc  56
-rw-r--r--  src/rocksdb/utilities/compaction_filters/layered_compaction_filter_base.h  41
-rw-r--r--  src/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc  26
-rw-r--r--  src/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h  28
-rw-r--r--  src/rocksdb/utilities/convenience/info_log_finder.cc  26
-rw-r--r--  src/rocksdb/utilities/counted_fs.cc  379
-rw-r--r--  src/rocksdb/utilities/counted_fs.h  158
-rw-r--r--  src/rocksdb/utilities/debug.cc  120
-rw-r--r--  src/rocksdb/utilities/env_mirror.cc  275
-rw-r--r--  src/rocksdb/utilities/env_mirror_test.cc  226
-rw-r--r--  src/rocksdb/utilities/env_timed.cc  187
-rw-r--r--  src/rocksdb/utilities/env_timed.h  97
-rw-r--r--  src/rocksdb/utilities/env_timed_test.cc  44
-rw-r--r--  src/rocksdb/utilities/fault_injection_env.cc  555
-rw-r--r--  src/rocksdb/utilities/fault_injection_env.h  258
-rw-r--r--  src/rocksdb/utilities/fault_injection_fs.cc  1032
-rw-r--r--  src/rocksdb/utilities/fault_injection_fs.h  584
-rw-r--r--  src/rocksdb/utilities/fault_injection_secondary_cache.cc  131
-rw-r--r--  src/rocksdb/utilities/fault_injection_secondary_cache.h  108
-rw-r--r--  src/rocksdb/utilities/leveldb_options/leveldb_options.cc  57
-rw-r--r--  src/rocksdb/utilities/memory/memory_test.cc  279
-rw-r--r--  src/rocksdb/utilities/memory/memory_util.cc  52
-rw-r--r--  src/rocksdb/utilities/memory_allocators.h  104
-rw-r--r--  src/rocksdb/utilities/merge_operators.cc  120
-rw-r--r--  src/rocksdb/utilities/merge_operators.h  36
-rw-r--r--  src/rocksdb/utilities/merge_operators/bytesxor.cc  57
-rw-r--r--  src/rocksdb/utilities/merge_operators/bytesxor.h  40
-rw-r--r--  src/rocksdb/utilities/merge_operators/max.cc  80
-rw-r--r--  src/rocksdb/utilities/merge_operators/put.cc  92
-rw-r--r--  src/rocksdb/utilities/merge_operators/sortlist.cc  95
-rw-r--r--  src/rocksdb/utilities/merge_operators/sortlist.h  42
-rw-r--r--  src/rocksdb/utilities/merge_operators/string_append/stringappend.cc  78
-rw-r--r--  src/rocksdb/utilities/merge_operators/string_append/stringappend.h  32
-rw-r--r--  src/rocksdb/utilities/merge_operators/string_append/stringappend2.cc  132
-rw-r--r--  src/rocksdb/utilities/merge_operators/string_append/stringappend2.h  52
-rw-r--r--  src/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc  640
-rw-r--r--  src/rocksdb/utilities/merge_operators/uint64add.cc  75
-rw-r--r--  src/rocksdb/utilities/object_registry.cc  383
-rw-r--r--  src/rocksdb/utilities/object_registry_test.cc  872
-rw-r--r--  src/rocksdb/utilities/option_change_migration/option_change_migration.cc  186
-rw-r--r--  src/rocksdb/utilities/option_change_migration/option_change_migration_test.cc  550
-rw-r--r--  src/rocksdb/utilities/options/options_util.cc  159
-rw-r--r--  src/rocksdb/utilities/options/options_util_test.cc  779
-rw-r--r--  src/rocksdb/utilities/persistent_cache/block_cache_tier.cc  422
-rw-r--r--  src/rocksdb/utilities/persistent_cache/block_cache_tier.h  156
-rw-r--r--  src/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc  610
-rw-r--r--  src/rocksdb/utilities/persistent_cache/block_cache_tier_file.h  293
-rw-r--r--  src/rocksdb/utilities/persistent_cache/block_cache_tier_file_buffer.h  127
-rw-r--r--  src/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.cc  86
-rw-r--r--  src/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.h  124
-rw-r--r--  src/rocksdb/utilities/persistent_cache/hash_table.h  239
-rw-r--r--  src/rocksdb/utilities/persistent_cache/hash_table_bench.cc  310
-rw-r--r--  src/rocksdb/utilities/persistent_cache/hash_table_evictable.h  168
-rw-r--r--  src/rocksdb/utilities/persistent_cache/hash_table_test.cc  163
-rw-r--r--  src/rocksdb/utilities/persistent_cache/lrulist.h  174
-rw-r--r--  src/rocksdb/utilities/persistent_cache/persistent_cache_bench.cc  359
-rw-r--r--  src/rocksdb/utilities/persistent_cache/persistent_cache_test.cc  462
-rw-r--r--  src/rocksdb/utilities/persistent_cache/persistent_cache_test.h  286
-rw-r--r--  src/rocksdb/utilities/persistent_cache/persistent_cache_tier.cc  167
-rw-r--r--  src/rocksdb/utilities/persistent_cache/persistent_cache_tier.h  342
-rw-r--r--  src/rocksdb/utilities/persistent_cache/persistent_cache_util.h  67
-rw-r--r--  src/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc  140
-rw-r--r--  src/rocksdb/utilities/persistent_cache/volatile_tier_impl.h  141
-rw-r--r--  src/rocksdb/utilities/simulator_cache/cache_simulator.cc  288
-rw-r--r--  src/rocksdb/utilities/simulator_cache/cache_simulator.h  231
-rw-r--r--  src/rocksdb/utilities/simulator_cache/cache_simulator_test.cc  497
-rw-r--r--  src/rocksdb/utilities/simulator_cache/sim_cache.cc  364
-rw-r--r--  src/rocksdb/utilities/simulator_cache/sim_cache_test.cc  226
-rw-r--r--  src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc  227
-rw-r--r--  src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.h  70
-rw-r--r--  src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc  245
-rw-r--r--  src/rocksdb/utilities/trace/file_trace_reader_writer.cc  133
-rw-r--r--  src/rocksdb/utilities/trace/file_trace_reader_writer.h  48
-rw-r--r--  src/rocksdb/utilities/trace/replayer_impl.cc  316
-rw-r--r--  src/rocksdb/utilities/trace/replayer_impl.h  86
-rw-r--r--  src/rocksdb/utilities/transactions/lock/lock_manager.cc  29
-rw-r--r--  src/rocksdb/utilities/transactions/lock/lock_manager.h  82
-rw-r--r--  src/rocksdb/utilities/transactions/lock/lock_tracker.h  209
-rw-r--r--  src/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc  721
-rw-r--r--  src/rocksdb/utilities/transactions/lock/point/point_lock_manager.h  224
-rw-r--r--  src/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc  181
-rw-r--r--  src/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h  324
-rw-r--r--  src/rocksdb/utilities/transactions/lock/point/point_lock_tracker.cc  257
-rw-r--r--  src/rocksdb/utilities/transactions/lock/point/point_lock_tracker.h  99
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_lock_manager.h  36
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_locking_test.cc  459
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.AGPLv3  661
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.APACHEv2  174
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.GPLv2  339
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/README  13
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/db.h  76
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/comparator.h  138
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/ft-status.h  102
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc  139
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.h  174
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc  222
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.h  141
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc  527
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h  255
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc  1023
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h  580
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc  527
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc  265
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.h  178
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc  520
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.h  302
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc  120
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.h  92
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc  213
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.h  124
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/memory.h  215
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_assert_subst.h  39
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_atomic.h  130
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_external_pthread.h  83
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_instrumentation.h  286
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_portability.h  87
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_pthread.h  520
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_race_tools.h  179
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h  193
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/txn_subst.h  27
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/standalone_port.cc  132
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc  153
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.h  98
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/growable_array.h  144
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.cc  201
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.h  141
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt.h  794
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt_impl.h  1295
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/partitioned_counter.h  165
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/status.h  76
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc  503
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h  137
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc  156
-rw-r--r--  src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h  146
-rw-r--r--  src/rocksdb/utilities/transactions/optimistic_transaction.cc  196
-rw-r--r--  src/rocksdb/utilities/transactions/optimistic_transaction.h  101
-rw-r--r--  src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc  111
-rw-r--r--  src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h  88
-rw-r--r--  src/rocksdb/utilities/transactions/optimistic_transaction_test.cc  1491
-rw-r--r--  src/rocksdb/utilities/transactions/pessimistic_transaction.cc  1175
-rw-r--r--  src/rocksdb/utilities/transactions/pessimistic_transaction.h  313
-rw-r--r--  src/rocksdb/utilities/transactions/pessimistic_transaction_db.cc  782
-rw-r--r--  src/rocksdb/utilities/transactions/pessimistic_transaction_db.h  318
-rw-r--r--  src/rocksdb/utilities/transactions/snapshot_checker.cc  53
-rw-r--r--  src/rocksdb/utilities/transactions/timestamped_snapshot_test.cc  466
-rw-r--r--  src/rocksdb/utilities/transactions/transaction_base.cc  731
-rw-r--r--  src/rocksdb/utilities/transactions/transaction_base.h  384
-rw-r--r--  src/rocksdb/utilities/transactions/transaction_db_mutex_impl.cc  135
-rw-r--r--  src/rocksdb/utilities/transactions/transaction_db_mutex_impl.h  26
-rw-r--r--  src/rocksdb/utilities/transactions/transaction_test.cc  6550
-rw-r--r--  src/rocksdb/utilities/transactions/transaction_test.h  578
-rw-r--r--  src/rocksdb/utilities/transactions/transaction_util.cc  206
-rw-r--r--  src/rocksdb/utilities/transactions/transaction_util.h  85
-rw-r--r--  src/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc  588
-rw-r--r--  src/rocksdb/utilities/transactions/write_prepared_transaction_test.cc  4078
-rw-r--r--  src/rocksdb/utilities/transactions/write_prepared_txn.cc  512
-rw-r--r--  src/rocksdb/utilities/transactions/write_prepared_txn.h  119
-rw-r--r--  src/rocksdb/utilities/transactions/write_prepared_txn_db.cc  1030
-rw-r--r--  src/rocksdb/utilities/transactions/write_prepared_txn_db.h  1125
-rw-r--r--  src/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc  790
-rw-r--r--  src/rocksdb/utilities/transactions/write_unprepared_txn.cc  1053
-rw-r--r--  src/rocksdb/utilities/transactions/write_unprepared_txn.h  341
-rw-r--r--  src/rocksdb/utilities/transactions/write_unprepared_txn_db.cc  473
-rw-r--r--  src/rocksdb/utilities/transactions/write_unprepared_txn_db.h  108
-rw-r--r--  src/rocksdb/utilities/ttl/db_ttl_impl.cc  609
-rw-r--r--  src/rocksdb/utilities/ttl/db_ttl_impl.h  245
-rw-r--r--  src/rocksdb/utilities/ttl/ttl_test.cc  912
-rw-r--r--  src/rocksdb/utilities/util_merge_operators_test.cc  100
-rw-r--r--  src/rocksdb/utilities/wal_filter.cc  23
-rw-r--r--  src/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc  695
-rw-r--r--  src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc  735
-rw-r--r--  src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h  344
-rw-r--r--  src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc  2419
216 files changed, 82361 insertions, 0 deletions
diff --git a/src/rocksdb/utilities/agg_merge/agg_merge.cc b/src/rocksdb/utilities/agg_merge/agg_merge.cc
new file mode 100644
index 000000000..a7eab1f12
--- /dev/null
+++ b/src/rocksdb/utilities/agg_merge/agg_merge.cc
@@ -0,0 +1,238 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "utilities/agg_merge/agg_merge.h"
+
+#include <assert.h>
+
+#include <deque>
+#include <memory>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "port/lang.h"
+#include "port/likely.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/agg_merge.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/coding.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+static std::unordered_map<std::string, std::unique_ptr<Aggregator>> func_map;
+const std::string kUnnamedFuncName = "";
+const std::string kErrorFuncName = "kErrorFuncName";
+
+Status AddAggregator(const std::string& function_name,
+ std::unique_ptr<Aggregator>&& agg) {
+ if (function_name == kErrorFuncName) {
+ return Status::InvalidArgument(
+ "Cannot register function name kErrorFuncName");
+ }
+ func_map.emplace(function_name, std::move(agg));
+ return Status::OK();
+}
+
+AggMergeOperator::AggMergeOperator() {}
+
+std::string EncodeAggFuncAndPayloadNoCheck(const Slice& function_name,
+ const Slice& value) {
+ std::string result;
+ PutLengthPrefixedSlice(&result, function_name);
+ result += value.ToString();
+ return result;
+}
+
+Status EncodeAggFuncAndPayload(const Slice& function_name, const Slice& payload,
+ std::string& output) {
+ if (function_name == kErrorFuncName) {
+ return Status::InvalidArgument("Cannot use error function name");
+ }
+ if (function_name != kUnnamedFuncName &&
+ func_map.find(function_name.ToString()) == func_map.end()) {
+ return Status::InvalidArgument("Function name not registered");
+ }
+ output = EncodeAggFuncAndPayloadNoCheck(function_name, payload);
+ return Status::OK();
+}
+
+bool ExtractAggFuncAndValue(const Slice& op, Slice& func, Slice& value) {
+ value = op;
+ return GetLengthPrefixedSlice(&value, &func);
+}
+
+bool ExtractList(const Slice& encoded_list, std::vector<Slice>& decoded_list) {
+ decoded_list.clear();
+ Slice list_slice = encoded_list;
+ Slice item;
+ while (GetLengthPrefixedSlice(&list_slice, &item)) {
+ decoded_list.push_back(item);
+ }
+ return list_slice.empty();
+}
+
+class AggMergeOperator::Accumulator {
+ public:
+ bool Add(const Slice& op, bool is_partial_aggregation) {
+ if (ignore_operands_) {
+ return true;
+ }
+ Slice my_func;
+ Slice my_value;
+ bool ret = ExtractAggFuncAndValue(op, my_func, my_value);
+ if (!ret) {
+ ignore_operands_ = true;
+ return true;
+ }
+
+ // Determine whether we need to do partial merge.
+ if (is_partial_aggregation && !my_func.empty()) {
+ auto f = func_map.find(my_func.ToString());
+ if (f == func_map.end() || !f->second->DoPartialAggregate()) {
+ return false;
+ }
+ }
+
+ if (!func_valid_) {
+ if (my_func != kUnnamedFuncName) {
+ func_ = my_func;
+ func_valid_ = true;
+ }
+ } else if (func_ != my_func) {
+ // User switched aggregation function. Need to aggregate the older
+ // one first.
+
+      // Previous aggregation can't be done in partial merge
+ if (is_partial_aggregation) {
+ func_valid_ = false;
+ ignore_operands_ = true;
+ return false;
+ }
+
+ // We could consider stashing an iterator into the hash of aggregators
+ // to avoid repeated lookups when the aggregator doesn't change.
+ auto f = func_map.find(func_.ToString());
+ if (f == func_map.end() || !f->second->Aggregate(values_, scratch_)) {
+ func_valid_ = false;
+ ignore_operands_ = true;
+ return true;
+ }
+ std::swap(scratch_, aggregated_);
+ values_.clear();
+ values_.push_back(aggregated_);
+ func_ = my_func;
+ }
+ values_.push_back(my_value);
+ return true;
+ }
+
+  // Return false if aggregation fails, for example because the current
+  // aggregation function is not registered or its Aggregate() call fails.
+ bool GetResult(std::string& result) {
+ if (!func_valid_) {
+ return false;
+ }
+ auto f = func_map.find(func_.ToString());
+ if (f == func_map.end()) {
+ return false;
+ }
+ if (!f->second->Aggregate(values_, scratch_)) {
+ return false;
+ }
+ result = EncodeAggFuncAndPayloadNoCheck(func_, scratch_);
+ return true;
+ }
+
+ void Clear() {
+ func_.clear();
+ values_.clear();
+ aggregated_.clear();
+ scratch_.clear();
+ ignore_operands_ = false;
+ func_valid_ = false;
+ }
+
+ private:
+ Slice func_;
+ std::vector<Slice> values_;
+ std::string aggregated_;
+ std::string scratch_;
+ bool ignore_operands_ = false;
+ bool func_valid_ = false;
+};
+
+// Creating and using a new Accumulator might trigger multiple memory
+// allocations, which is expensive if it has to happen while processing each
+// merge operation. AggMergeOperator's merge functions can be invoked
+// concurrently by multiple threads, so we cannot simply create one
+// Accumulator and reuse it. We use thread-local instances instead.
+AggMergeOperator::Accumulator& AggMergeOperator::GetTLSAccumulator() {
+ static thread_local Accumulator tls_acc;
+ tls_acc.Clear();
+ return tls_acc;
+}
+
+void AggMergeOperator::PackAllMergeOperands(const MergeOperationInput& merge_in,
+ MergeOperationOutput& merge_out) {
+ merge_out.new_value = "";
+ PutLengthPrefixedSlice(&merge_out.new_value, kErrorFuncName);
+ if (merge_in.existing_value != nullptr) {
+ PutLengthPrefixedSlice(&merge_out.new_value, *merge_in.existing_value);
+ }
+ for (const Slice& op : merge_in.operand_list) {
+ PutLengthPrefixedSlice(&merge_out.new_value, op);
+ }
+}
+
+bool AggMergeOperator::FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const {
+ Accumulator& agg = GetTLSAccumulator();
+ if (merge_in.existing_value != nullptr) {
+ agg.Add(*merge_in.existing_value, /*is_partial_aggregation=*/false);
+ }
+ for (const Slice& e : merge_in.operand_list) {
+ agg.Add(e, /*is_partial_aggregation=*/false);
+ }
+
+ bool succ = agg.GetResult(merge_out->new_value);
+ if (!succ) {
+    // If aggregation can't happen, pack all merge operands instead. Unlike a
+    // typical merge operator, we don't want to fail the DB: if users insert
+    // data in the wrong format or use an unregistered aggregation function,
+    // we still hope the DB can continue serving other keys.
+ PackAllMergeOperands(merge_in, *merge_out);
+ }
+ agg.Clear();
+ return true;
+}
+
+bool AggMergeOperator::PartialMergeMulti(const Slice& /*key*/,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value,
+ Logger* /*logger*/) const {
+ Accumulator& agg = GetTLSAccumulator();
+ bool do_aggregation = true;
+ for (const Slice& item : operand_list) {
+ do_aggregation = agg.Add(item, /*is_partial_aggregation=*/true);
+ if (!do_aggregation) {
+ break;
+ }
+ }
+ if (do_aggregation) {
+ do_aggregation = agg.GetResult(*new_value);
+ }
+ agg.Clear();
+ return do_aggregation;
+}
+
+std::shared_ptr<MergeOperator> GetAggMergeOperator() {
+ STATIC_AVOID_DESTRUCTION(std::shared_ptr<MergeOperator>, instance)
+ (std::make_shared<AggMergeOperator>());
+ assert(instance);
+ return instance;
+}
+} // namespace ROCKSDB_NAMESPACE
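
Editor's sketch (not part of the commit): the operator above resolves aggregation functions by name from the process-global func_map, so using it takes three steps: register an Aggregator, install GetAggMergeOperator() as the merge operator, and encode every operand with EncodeAggFuncAndPayload(). The snippet below assumes these helpers and the Aggregator base class are exposed by rocksdb/utilities/agg_merge.h, as the include above suggests; MaxAggregator and the DB path are illustrative only.

#include <cassert>
#include <memory>
#include <string>
#include <vector>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/slice.h"
#include "rocksdb/utilities/agg_merge.h"

using namespace ROCKSDB_NAMESPACE;

// Keeps the lexicographically largest payload. The result of aggregating a
// subset is itself a valid operand, so partial aggregation is safe.
class MaxAggregator : public Aggregator {
 public:
  bool Aggregate(const std::vector<Slice>& item_list,
                 std::string& result) const override {
    if (item_list.empty()) {
      return false;
    }
    Slice best = item_list[0];
    for (const Slice& item : item_list) {
      if (item.compare(best) > 0) {
        best = item;
      }
    }
    result = best.ToString();
    return true;
  }
  bool DoPartialAggregate() const override { return true; }
};

int main() {
  // Registration is process-global and keyed by function name.
  Status s = AddAggregator("max", std::make_unique<MaxAggregator>());
  assert(s.ok());

  Options options;
  options.create_if_missing = true;
  options.merge_operator = GetAggMergeOperator();

  DB* db = nullptr;
  s = DB::Open(options, "/tmp/agg_merge_example", &db);
  assert(s.ok());

  std::string op;
  s = EncodeAggFuncAndPayload("max", "apple", op);
  assert(s.ok());
  s = db->Merge(WriteOptions(), "fruit", op);
  assert(s.ok());
  s = EncodeAggFuncAndPayload("max", "banana", op);
  assert(s.ok());
  s = db->Merge(WriteOptions(), "fruit", op);
  assert(s.ok());

  // The stored value keeps the "max" tag so later merges know how to combine.
  std::string stored;
  s = db->Get(ReadOptions(), "fruit", &stored);
  assert(s.ok());
  Slice func, payload;
  bool ok = ExtractAggFuncAndValue(stored, func, payload);
  assert(ok && func == "max" && payload == "banana");

  delete db;
  return 0;
}
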
diff --git a/src/rocksdb/utilities/agg_merge/agg_merge.h b/src/rocksdb/utilities/agg_merge/agg_merge.h
new file mode 100644
index 000000000..00e58de08
--- /dev/null
+++ b/src/rocksdb/utilities/agg_merge/agg_merge.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <algorithm>
+#include <cstddef>
+#include <memory>
+#include <unordered_map>
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/agg_merge.h"
+#include "utilities/cassandra/cassandra_options.h"
+
+namespace ROCKSDB_NAMESPACE {
+class AggMergeOperator : public MergeOperator {
+ public:
+ explicit AggMergeOperator();
+
+ bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override;
+
+ bool PartialMergeMulti(const Slice& key,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value, Logger* logger) const override;
+
+ const char* Name() const override { return kClassName(); }
+ static const char* kClassName() { return "AggMergeOperator.v1"; }
+
+ bool AllowSingleOperand() const override { return true; }
+
+ bool ShouldMerge(const std::vector<Slice>&) const override { return false; }
+
+ private:
+ class Accumulator;
+
+ // Pack all merge operands into one value. This is called when aggregation
+ // fails. The existing values are preserved and returned so that users can
+ // debug the problem.
+ static void PackAllMergeOperands(const MergeOperationInput& merge_in,
+ MergeOperationOutput& merge_out);
+ static Accumulator& GetTLSAccumulator();
+};
+
+extern std::string EncodeAggFuncAndPayloadNoCheck(const Slice& function_name,
+ const Slice& value);
+} // namespace ROCKSDB_NAMESPACE
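
Editor's sketch (not part of the commit): when FullMergeV2 cannot aggregate, it falls back to PackAllMergeOperands, which stores the operands length-prefixed under the reserved kErrorFuncName tag, as documented above. A minimal reader for such values, assuming ExtractAggFuncAndValue, ExtractList, and kErrorFuncName are available from the public header the way the test below uses them:

#include <string>
#include <vector>

#include "rocksdb/slice.h"
#include "rocksdb/utilities/agg_merge.h"

using namespace ROCKSDB_NAMESPACE;

// Returns the original, un-aggregated operands if `stored` (a value read back
// from the DB) is a packed error value; returns an empty vector otherwise.
// The returned Slices point into `stored`, so it must outlive them.
std::vector<Slice> RecoverPackedOperands(const std::string& stored) {
  Slice func, payload;
  std::vector<Slice> operands;
  if (ExtractAggFuncAndValue(stored, func, payload) &&
      func == kErrorFuncName && ExtractList(payload, operands)) {
    return operands;
  }
  return {};
}
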
diff --git a/src/rocksdb/utilities/agg_merge/agg_merge_test.cc b/src/rocksdb/utilities/agg_merge/agg_merge_test.cc
new file mode 100644
index 000000000..a65441cd0
--- /dev/null
+++ b/src/rocksdb/utilities/agg_merge/agg_merge_test.cc
@@ -0,0 +1,135 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/utilities/agg_merge.h"
+
+#include <gtest/gtest.h>
+
+#include <memory>
+
+#include "db/db_test_util.h"
+#include "rocksdb/options.h"
+#include "test_util/testharness.h"
+#include "utilities/agg_merge/agg_merge.h"
+#include "utilities/agg_merge/test_agg_merge.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class AggMergeTest : public DBTestBase {
+ public:
+ AggMergeTest() : DBTestBase("agg_merge_db_test", /*env_do_fsync=*/true) {}
+};
+
+TEST_F(AggMergeTest, TestUsingMergeOperator) {
+ ASSERT_OK(AddAggregator("sum", std::make_unique<SumAggregator>()));
+ ASSERT_OK(AddAggregator("last3", std::make_unique<Last3Aggregator>()));
+ ASSERT_OK(AddAggregator("mul", std::make_unique<MultipleAggregator>()));
+
+ Options options = CurrentOptions();
+ options.merge_operator = GetAggMergeOperator();
+ Reopen(options);
+ std::string v = EncodeHelper::EncodeFuncAndInt("sum", 10);
+ ASSERT_OK(Merge("foo", v));
+ v = EncodeHelper::EncodeFuncAndInt("sum", 20);
+ ASSERT_OK(Merge("foo", v));
+ v = EncodeHelper::EncodeFuncAndInt("sum", 15);
+ ASSERT_OK(Merge("foo", v));
+
+ v = EncodeHelper::EncodeFuncAndList("last3", {"a", "b"});
+ ASSERT_OK(Merge("bar", v));
+ v = EncodeHelper::EncodeFuncAndList("last3", {"c", "d", "e"});
+ ASSERT_OK(Merge("bar", v));
+ ASSERT_OK(Flush());
+ v = EncodeHelper::EncodeFuncAndList("last3", {"f"});
+ ASSERT_OK(Merge("bar", v));
+
+ // Test Put() without aggregation type.
+ v = EncodeHelper::EncodeFuncAndInt(kUnnamedFuncName, 30);
+ ASSERT_OK(Put("foo2", v));
+ v = EncodeHelper::EncodeFuncAndInt("sum", 10);
+ ASSERT_OK(Merge("foo2", v));
+ v = EncodeHelper::EncodeFuncAndInt("sum", 20);
+ ASSERT_OK(Merge("foo2", v));
+
+ EXPECT_EQ(EncodeHelper::EncodeFuncAndInt("sum", 45), Get("foo"));
+ EXPECT_EQ(EncodeHelper::EncodeFuncAndList("last3", {"f", "c", "d"}),
+ Get("bar"));
+ EXPECT_EQ(EncodeHelper::EncodeFuncAndInt("sum", 60), Get("foo2"));
+
+ // Test changing aggregation type
+ v = EncodeHelper::EncodeFuncAndInt("mul", 10);
+ ASSERT_OK(Put("bar2", v));
+ v = EncodeHelper::EncodeFuncAndInt("mul", 20);
+ ASSERT_OK(Merge("bar2", v));
+ v = EncodeHelper::EncodeFuncAndInt("sum", 30);
+ ASSERT_OK(Merge("bar2", v));
+ v = EncodeHelper::EncodeFuncAndInt("sum", 40);
+ ASSERT_OK(Merge("bar2", v));
+ EXPECT_EQ(EncodeHelper::EncodeFuncAndInt("sum", 10 * 20 + 30 + 40),
+ Get("bar2"));
+
+ // Changing aggregation type with partial merge
+ v = EncodeHelper::EncodeFuncAndInt("mul", 10);
+ ASSERT_OK(Merge("foo3", v));
+ ASSERT_OK(Flush());
+ v = EncodeHelper::EncodeFuncAndInt("mul", 10);
+ ASSERT_OK(Merge("foo3", v));
+ v = EncodeHelper::EncodeFuncAndInt("mul", 10);
+ ASSERT_OK(Merge("foo3", v));
+ v = EncodeHelper::EncodeFuncAndInt("sum", 10);
+ ASSERT_OK(Merge("foo3", v));
+ ASSERT_OK(Flush());
+ EXPECT_EQ(EncodeHelper::EncodeFuncAndInt("sum", 10 * 10 * 10 + 10),
+ Get("foo3"));
+
+ // Merge after full merge
+ v = EncodeHelper::EncodeFuncAndInt("sum", 1);
+ ASSERT_OK(Merge("foo4", v));
+ v = EncodeHelper::EncodeFuncAndInt("sum", 2);
+ ASSERT_OK(Merge("foo4", v));
+ ASSERT_OK(Flush());
+ v = EncodeHelper::EncodeFuncAndInt("sum", 3);
+ ASSERT_OK(Merge("foo4", v));
+ v = EncodeHelper::EncodeFuncAndInt("sum", 4);
+ ASSERT_OK(Merge("foo4", v));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ v = EncodeHelper::EncodeFuncAndInt("sum", 5);
+ ASSERT_OK(Merge("foo4", v));
+ EXPECT_EQ(EncodeHelper::EncodeFuncAndInt("sum", 15), Get("foo4"));
+
+ // Test unregistered function name
+ v = EncodeAggFuncAndPayloadNoCheck("non_existing", "1");
+ ASSERT_OK(Merge("bar3", v));
+  std::string v1 = EncodeAggFuncAndPayloadNoCheck("non_existing", "invalid");
+ ASSERT_OK(Merge("bar3", v1));
+ EXPECT_EQ(EncodeAggFuncAndPayloadNoCheck(kErrorFuncName,
+ EncodeHelper::EncodeList({v, v1})),
+ Get("bar3"));
+
+  // Test input that the registered aggregator cannot parse
+ ASSERT_OK(EncodeAggFuncAndPayload("sum", "invalid", v));
+ ASSERT_OK(Merge("bar4", v));
+ v1 = EncodeHelper::EncodeFuncAndInt("sum", 20);
+ ASSERT_OK(Merge("bar4", v1));
+ std::string aggregated_value = Get("bar4");
+ Slice func, payload;
+ ASSERT_TRUE(ExtractAggFuncAndValue(aggregated_value, func, payload));
+ EXPECT_EQ(kErrorFuncName, func);
+ std::vector<Slice> decoded_list;
+ ASSERT_TRUE(ExtractList(payload, decoded_list));
+ ASSERT_EQ(2, decoded_list.size());
+ ASSERT_EQ(v, decoded_list[0]);
+ ASSERT_EQ(v1, decoded_list[1]);
+}
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
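
Editor's sketch (not part of the commit): the assertions above compare whole encoded strings; to pull the numeric result back out of a value such as Get("foo"), the payload has to be decoded with the same signed-varint helper the test encoder uses. This assumes the internal util/coding.h helpers (PutVarsignedint64 / GetVarsignedint64) and the public agg_merge declarations used above:

#include <cstdint>
#include <string>

#include "rocksdb/slice.h"
#include "rocksdb/utilities/agg_merge.h"
#include "util/coding.h"

using namespace ROCKSDB_NAMESPACE;

// Decodes a value produced by the "sum" (or "mul") aggregator back into an
// int64_t. Returns false if the value carries a different function tag or the
// payload is not a single signed varint.
bool DecodeIntResult(const std::string& stored, const Slice& expected_func,
                     int64_t* out) {
  Slice func, payload;
  if (!ExtractAggFuncAndValue(stored, func, payload) ||
      func != expected_func) {
    return false;
  }
  return GetVarsignedint64(&payload, out) && payload.empty();
}

// Example: after the merges on "foo" above, DecodeIntResult(Get("foo"), "sum",
// &n) would yield n == 45.
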
diff --git a/src/rocksdb/utilities/agg_merge/test_agg_merge.cc b/src/rocksdb/utilities/agg_merge/test_agg_merge.cc
new file mode 100644
index 000000000..06e5b5697
--- /dev/null
+++ b/src/rocksdb/utilities/agg_merge/test_agg_merge.cc
@@ -0,0 +1,104 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "test_agg_merge.h"
+
+#include <assert.h>
+
+#include <deque>
+#include <vector>
+
+#include "util/coding.h"
+#include "utilities/agg_merge/agg_merge.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+std::string EncodeHelper::EncodeFuncAndInt(const Slice& function_name,
+ int64_t value) {
+ std::string encoded_value;
+ PutVarsignedint64(&encoded_value, value);
+ std::string ret;
+ Status s = EncodeAggFuncAndPayload(function_name, encoded_value, ret);
+ assert(s.ok());
+ return ret;
+}
+
+std::string EncodeHelper::EncodeInt(int64_t value) {
+ std::string encoded_value;
+ PutVarsignedint64(&encoded_value, value);
+ return encoded_value;
+}
+
+std::string EncodeHelper::EncodeFuncAndList(const Slice& function_name,
+ const std::vector<Slice>& list) {
+ std::string ret;
+ Status s = EncodeAggFuncAndPayload(function_name, EncodeList(list), ret);
+ assert(s.ok());
+ return ret;
+}
+
+std::string EncodeHelper::EncodeList(const std::vector<Slice>& list) {
+ std::string result;
+ for (const Slice& entity : list) {
+ PutLengthPrefixedSlice(&result, entity);
+ }
+ return result;
+}
+
+bool SumAggregator::Aggregate(const std::vector<Slice>& item_list,
+ std::string& result) const {
+ int64_t sum = 0;
+ for (const Slice& item : item_list) {
+ int64_t ivalue;
+ Slice v = item;
+ if (!GetVarsignedint64(&v, &ivalue) || !v.empty()) {
+ return false;
+ }
+ sum += ivalue;
+ }
+ result = EncodeHelper::EncodeInt(sum);
+ return true;
+}
+
+bool MultipleAggregator::Aggregate(const std::vector<Slice>& item_list,
+ std::string& result) const {
+ int64_t mresult = 1;
+ for (const Slice& item : item_list) {
+ int64_t ivalue;
+ Slice v = item;
+ if (!GetVarsignedint64(&v, &ivalue) || !v.empty()) {
+ return false;
+ }
+ mresult *= ivalue;
+ }
+ result = EncodeHelper::EncodeInt(mresult);
+ return true;
+}
+
+bool Last3Aggregator::Aggregate(const std::vector<Slice>& item_list,
+ std::string& result) const {
+ std::vector<Slice> last3;
+ last3.reserve(3);
+ for (auto it = item_list.rbegin(); it != item_list.rend(); ++it) {
+ Slice input = *it;
+ Slice entity;
+ bool ret;
+ while ((ret = GetLengthPrefixedSlice(&input, &entity)) == true) {
+ last3.push_back(entity);
+ if (last3.size() >= 3) {
+ break;
+ }
+ }
+ if (last3.size() >= 3) {
+ break;
+ }
+ if (!ret) {
+ continue;
+ }
+ }
+ result = EncodeHelper::EncodeList(last3);
+ return true;
+}
+} // namespace ROCKSDB_NAMESPACE
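
Editor's sketch (not part of the commit): the three aggregators above all follow the same pattern of decoding each operand, folding, and re-encoding with EncodeHelper. One more aggregator in that style, shown only to illustrate the pattern; MinAggregator is hypothetical. Partial aggregation is enabled because the minimum of partial minimums equals the overall minimum.

#include <algorithm>
#include <cstdint>
#include <limits>
#include <string>
#include <vector>

#include "rocksdb/slice.h"
#include "util/coding.h"
#include "utilities/agg_merge/test_agg_merge.h"

namespace ROCKSDB_NAMESPACE {

class MinAggregator : public Aggregator {
 public:
  bool Aggregate(const std::vector<Slice>& item_list,
                 std::string& result) const override {
    if (item_list.empty()) {
      return false;
    }
    int64_t min_value = std::numeric_limits<int64_t>::max();
    for (const Slice& item : item_list) {
      int64_t ivalue;
      Slice v = item;
      // Reject malformed operands so the operator falls back to packing them.
      if (!GetVarsignedint64(&v, &ivalue) || !v.empty()) {
        return false;
      }
      min_value = std::min(min_value, ivalue);
    }
    result = EncodeHelper::EncodeInt(min_value);
    return true;
  }
  // min(min(a, b), c) == min(a, b, c), so partial aggregation is safe.
  bool DoPartialAggregate() const override { return true; }
};

}  // namespace ROCKSDB_NAMESPACE
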
diff --git a/src/rocksdb/utilities/agg_merge/test_agg_merge.h b/src/rocksdb/utilities/agg_merge/test_agg_merge.h
new file mode 100644
index 000000000..5bdf8b9cc
--- /dev/null
+++ b/src/rocksdb/utilities/agg_merge/test_agg_merge.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <algorithm>
+#include <cstddef>
+#include <memory>
+#include <unordered_map>
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/agg_merge.h"
+#include "utilities/cassandra/cassandra_options.h"
+
+namespace ROCKSDB_NAMESPACE {
+class SumAggregator : public Aggregator {
+ public:
+ ~SumAggregator() override {}
+ bool Aggregate(const std::vector<Slice>&, std::string& result) const override;
+ bool DoPartialAggregate() const override { return true; }
+};
+
+class MultipleAggregator : public Aggregator {
+ public:
+ ~MultipleAggregator() override {}
+ bool Aggregate(const std::vector<Slice>&, std::string& result) const override;
+ bool DoPartialAggregate() const override { return true; }
+};
+
+class Last3Aggregator : public Aggregator {
+ public:
+ ~Last3Aggregator() override {}
+ bool Aggregate(const std::vector<Slice>&, std::string& result) const override;
+};
+
+class EncodeHelper {
+ public:
+ static std::string EncodeFuncAndInt(const Slice& function_name,
+ int64_t value);
+ static std::string EncodeInt(int64_t value);
+ static std::string EncodeList(const std::vector<Slice>& list);
+ static std::string EncodeFuncAndList(const Slice& function_name,
+ const std::vector<Slice>& list);
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/backup/backup_engine.cc b/src/rocksdb/utilities/backup/backup_engine.cc
new file mode 100644
index 000000000..81b4a6629
--- /dev/null
+++ b/src/rocksdb/utilities/backup/backup_engine.cc
@@ -0,0 +1,3181 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <atomic>
+#include <cinttypes>
+#include <cstdlib>
+#include <functional>
+#include <future>
+#include <limits>
+#include <map>
+#include <mutex>
+#include <sstream>
+#include <string>
+#include <thread>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "env/composite_env_wrapper.h"
+#include "env/fs_readonly.h"
+#include "env/fs_remap.h"
+#include "file/filename.h"
+#include "file/line_file_reader.h"
+#include "file/sequence_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "logging/logging.h"
+#include "monitoring/iostats_context_imp.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/transaction_log.h"
+#include "table/sst_file_dumper.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/channel.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/math.h"
+#include "util/rate_limiter.h"
+#include "util/string_util.h"
+#include "utilities/backup/backup_engine_impl.h"
+#include "utilities/checkpoint/checkpoint_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+using ShareFilesNaming = BackupEngineOptions::ShareFilesNaming;
+
+constexpr BackupID kLatestBackupIDMarker = static_cast<BackupID>(-2);
+
+inline uint32_t ChecksumHexToInt32(const std::string& checksum_hex) {
+ std::string checksum_str;
+ Slice(checksum_hex).DecodeHex(&checksum_str);
+ return EndianSwapValue(DecodeFixed32(checksum_str.c_str()));
+}
+inline std::string ChecksumStrToHex(const std::string& checksum_str) {
+ return Slice(checksum_str).ToString(true);
+}
+inline std::string ChecksumInt32ToHex(const uint32_t& checksum_value) {
+ std::string checksum_str;
+ PutFixed32(&checksum_str, EndianSwapValue(checksum_value));
+ return ChecksumStrToHex(checksum_str);
+}
+
+const std::string kPrivateDirName = "private";
+const std::string kMetaDirName = "meta";
+const std::string kSharedDirName = "shared";
+const std::string kSharedChecksumDirName = "shared_checksum";
+const std::string kPrivateDirSlash = kPrivateDirName + "/";
+const std::string kMetaDirSlash = kMetaDirName + "/";
+const std::string kSharedDirSlash = kSharedDirName + "/";
+const std::string kSharedChecksumDirSlash = kSharedChecksumDirName + "/";
+
+} // namespace
+
+void BackupStatistics::IncrementNumberSuccessBackup() {
+ number_success_backup++;
+}
+void BackupStatistics::IncrementNumberFailBackup() { number_fail_backup++; }
+
+uint32_t BackupStatistics::GetNumberSuccessBackup() const {
+ return number_success_backup;
+}
+uint32_t BackupStatistics::GetNumberFailBackup() const {
+ return number_fail_backup;
+}
+
+std::string BackupStatistics::ToString() const {
+ char result[50];
+ snprintf(result, sizeof(result), "# success backup: %u, # fail backup: %u",
+ GetNumberSuccessBackup(), GetNumberFailBackup());
+ return result;
+}
+
+void BackupEngineOptions::Dump(Logger* logger) const {
+ ROCKS_LOG_INFO(logger, " Options.backup_dir: %s",
+ backup_dir.c_str());
+ ROCKS_LOG_INFO(logger, " Options.backup_env: %p", backup_env);
+ ROCKS_LOG_INFO(logger, " Options.share_table_files: %d",
+ static_cast<int>(share_table_files));
+ ROCKS_LOG_INFO(logger, " Options.info_log: %p", info_log);
+ ROCKS_LOG_INFO(logger, " Options.sync: %d",
+ static_cast<int>(sync));
+ ROCKS_LOG_INFO(logger, " Options.destroy_old_data: %d",
+ static_cast<int>(destroy_old_data));
+ ROCKS_LOG_INFO(logger, " Options.backup_log_files: %d",
+ static_cast<int>(backup_log_files));
+ ROCKS_LOG_INFO(logger, " Options.backup_rate_limit: %" PRIu64,
+ backup_rate_limit);
+ ROCKS_LOG_INFO(logger, " Options.restore_rate_limit: %" PRIu64,
+ restore_rate_limit);
+ ROCKS_LOG_INFO(logger, "Options.max_background_operations: %d",
+ max_background_operations);
+}
+
+namespace {
+// -------- BackupEngineImpl class ---------
+class BackupEngineImpl {
+ public:
+ BackupEngineImpl(const BackupEngineOptions& options, Env* db_env,
+ bool read_only = false);
+ ~BackupEngineImpl();
+
+ IOStatus CreateNewBackupWithMetadata(const CreateBackupOptions& options,
+ DB* db, const std::string& app_metadata,
+ BackupID* new_backup_id_ptr);
+
+ IOStatus PurgeOldBackups(uint32_t num_backups_to_keep);
+
+ IOStatus DeleteBackup(BackupID backup_id);
+
+ void StopBackup() { stop_backup_.store(true, std::memory_order_release); }
+
+ IOStatus GarbageCollect();
+
+ // The returned BackupInfos are in chronological order, which means the
+ // latest backup comes last.
+ void GetBackupInfo(std::vector<BackupInfo>* backup_info,
+ bool include_file_details) const;
+
+ Status GetBackupInfo(BackupID backup_id, BackupInfo* backup_info,
+ bool include_file_details = false) const;
+
+ void GetCorruptedBackups(std::vector<BackupID>* corrupt_backup_ids) const;
+
+ IOStatus RestoreDBFromBackup(const RestoreOptions& options,
+ BackupID backup_id, const std::string& db_dir,
+ const std::string& wal_dir) const;
+
+ IOStatus RestoreDBFromLatestBackup(const RestoreOptions& options,
+ const std::string& db_dir,
+ const std::string& wal_dir) const {
+ // Note: don't read latest_valid_backup_id_ outside of lock
+ return RestoreDBFromBackup(options, kLatestBackupIDMarker, db_dir, wal_dir);
+ }
+
+ IOStatus VerifyBackup(BackupID backup_id,
+ bool verify_with_checksum = false) const;
+
+ IOStatus Initialize();
+
+ ShareFilesNaming GetNamingNoFlags() const {
+ return options_.share_files_with_checksum_naming &
+ BackupEngineOptions::kMaskNoNamingFlags;
+ }
+ ShareFilesNaming GetNamingFlags() const {
+ return options_.share_files_with_checksum_naming &
+ BackupEngineOptions::kMaskNamingFlags;
+ }
+
+ void TEST_SetDefaultRateLimitersClock(
+ const std::shared_ptr<SystemClock>& backup_rate_limiter_clock,
+ const std::shared_ptr<SystemClock>& restore_rate_limiter_clock) {
+ if (backup_rate_limiter_clock) {
+ static_cast<GenericRateLimiter*>(options_.backup_rate_limiter.get())
+ ->TEST_SetClock(backup_rate_limiter_clock);
+ }
+
+ if (restore_rate_limiter_clock) {
+ static_cast<GenericRateLimiter*>(options_.restore_rate_limiter.get())
+ ->TEST_SetClock(restore_rate_limiter_clock);
+ }
+ }
+
+ private:
+ void DeleteChildren(const std::string& dir,
+ uint32_t file_type_filter = 0) const;
+ IOStatus DeleteBackupNoGC(BackupID backup_id);
+
+ // Extends the "result" map with pathname->size mappings for the contents of
+ // "dir" in "env". Pathnames are prefixed with "dir".
+ IOStatus ReadChildFileCurrentSizes(
+ const std::string& dir, const std::shared_ptr<FileSystem>&,
+ std::unordered_map<std::string, uint64_t>* result) const;
+
+ struct FileInfo {
+ FileInfo(const std::string& fname, uint64_t sz, const std::string& checksum,
+ const std::string& id, const std::string& sid, Temperature _temp)
+ : refs(0),
+ filename(fname),
+ size(sz),
+ checksum_hex(checksum),
+ db_id(id),
+ db_session_id(sid),
+ temp(_temp) {}
+
+ FileInfo(const FileInfo&) = delete;
+ FileInfo& operator=(const FileInfo&) = delete;
+
+ int refs;
+ const std::string filename;
+ const uint64_t size;
+ // crc32c checksum as hex. empty == unknown / unavailable
+ std::string checksum_hex;
+ // DB identities
+ // db_id is obtained for potential usage in the future but not used
+ // currently
+ const std::string db_id;
+ // db_session_id appears in the backup SST filename if the table naming
+ // option is kUseDbSessionId
+ const std::string db_session_id;
+ Temperature temp;
+
+ std::string GetDbFileName() {
+ std::string rv;
+ // extract the filename part
+ size_t slash = filename.find_last_of('/');
+ // file will either be shared/<file>, shared_checksum/<file_crc32c_size>,
+ // shared_checksum/<file_session>, shared_checksum/<file_crc32c_session>,
+ // or private/<number>/<file>
+ assert(slash != std::string::npos);
+ rv = filename.substr(slash + 1);
+
+ // if the file was in shared_checksum, extract the real file name
+ // in this case the file is <number>_<checksum>_<size>.<type>,
+ // <number>_<session>.<type>, or <number>_<checksum>_<session>.<type>
+ if (filename.substr(0, slash) == kSharedChecksumDirName) {
+ rv = GetFileFromChecksumFile(rv);
+ }
+ return rv;
+ }
+ };
+
+  // TODO: deprecate this function once we migrate all of BackupEngine's rate
+  // limiting to lower-level mechanisms (i.e., ones at the file access wrapper
+  // level, like `WritableFileWriter`)
+ static void LoopRateLimitRequestHelper(const size_t total_bytes_to_request,
+ RateLimiter* rate_limiter,
+ const Env::IOPriority pri,
+ Statistics* stats,
+ const RateLimiter::OpType op_type);
+
+ static inline std::string WithoutTrailingSlash(const std::string& path) {
+ if (path.empty() || path.back() != '/') {
+ return path;
+ } else {
+      // Strip the trailing '/'.
+      return path.substr(0, path.size() - 1);
+ }
+ }
+
+ static inline std::string WithTrailingSlash(const std::string& path) {
+ if (path.empty() || path.back() != '/') {
+ return path + '/';
+ } else {
+ return path;
+ }
+ }
+
+ // A filesystem wrapper that makes shared backup files appear to be in the
+ // private backup directory (dst_dir), so that the private backup dir can
+ // be opened as a read-only DB.
+ class RemapSharedFileSystem : public RemapFileSystem {
+ public:
+ RemapSharedFileSystem(const std::shared_ptr<FileSystem>& base,
+ const std::string& dst_dir,
+ const std::string& src_base_dir,
+ const std::vector<std::shared_ptr<FileInfo>>& files)
+ : RemapFileSystem(base),
+ dst_dir_(WithoutTrailingSlash(dst_dir)),
+ dst_dir_slash_(WithTrailingSlash(dst_dir)),
+ src_base_dir_(WithTrailingSlash(src_base_dir)) {
+ for (auto& info : files) {
+ if (!StartsWith(info->filename, kPrivateDirSlash)) {
+ assert(StartsWith(info->filename, kSharedDirSlash) ||
+ StartsWith(info->filename, kSharedChecksumDirSlash));
+ remaps_[info->GetDbFileName()] = info;
+ }
+ }
+ }
+
+ const char* Name() const override {
+ return "BackupEngineImpl::RemapSharedFileSystem";
+ }
+
+ // Sometimes a directory listing is required in opening a DB
+ IOStatus GetChildren(const std::string& dir, const IOOptions& options,
+ std::vector<std::string>* result,
+ IODebugContext* dbg) override {
+ IOStatus s = RemapFileSystem::GetChildren(dir, options, result, dbg);
+ if (s.ok() && (dir == dst_dir_ || dir == dst_dir_slash_)) {
+ // Assume remapped files exist
+ for (auto& r : remaps_) {
+ result->push_back(r.first);
+ }
+ }
+ return s;
+ }
+
+ // Sometimes a directory listing is required in opening a DB
+ IOStatus GetChildrenFileAttributes(const std::string& dir,
+ const IOOptions& options,
+ std::vector<FileAttributes>* result,
+ IODebugContext* dbg) override {
+ IOStatus s =
+ RemapFileSystem::GetChildrenFileAttributes(dir, options, result, dbg);
+ if (s.ok() && (dir == dst_dir_ || dir == dst_dir_slash_)) {
+ // Assume remapped files exist with recorded size
+ for (auto& r : remaps_) {
+ result->emplace_back(); // clean up with C++20
+ FileAttributes& attr = result->back();
+ attr.name = r.first;
+ attr.size_bytes = r.second->size;
+ }
+ }
+ return s;
+ }
+
+ protected:
+ // When a file in dst_dir is requested, see if we need to remap to shared
+ // file path.
+ std::pair<IOStatus, std::string> EncodePath(
+ const std::string& path) override {
+ if (path.empty() || path[0] != '/') {
+ return {IOStatus::InvalidArgument(path, "Not an absolute path"), ""};
+ }
+ std::pair<IOStatus, std::string> rv{IOStatus(), path};
+ if (StartsWith(path, dst_dir_slash_)) {
+ std::string relative = path.substr(dst_dir_slash_.size());
+ auto it = remaps_.find(relative);
+ if (it != remaps_.end()) {
+ rv.second = src_base_dir_ + it->second->filename;
+ }
+ }
+ return rv;
+ }
+
+ private:
+ // Absolute path to a directory that some extra files will be mapped into.
+ const std::string dst_dir_;
+ // Includes a trailing slash.
+ const std::string dst_dir_slash_;
+ // Absolute path to a directory containing some files to be mapped into
+ // dst_dir_. Includes a trailing slash.
+ const std::string src_base_dir_;
+ // If remaps_[x] exists, attempt to read dst_dir_ / x should instead read
+ // src_base_dir_ / remaps_[x]->filename. FileInfo is used to maximize
+ // sharing with other backup data in memory.
+ std::unordered_map<std::string, std::shared_ptr<FileInfo>> remaps_;
+ };
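  // Editor's illustration (not part of the commit; file names are made up):
  // suppose backup 5 consists of
  //     private/5/MANIFEST-000008, private/5/CURRENT
  //     shared_checksum/000012_s<SESSION>.sst
  // When private/5 is opened as a read-only DB via GetEnvForOpen() below,
  // RemapSharedFileSystem makes GetChildren("<backup_dir>/private/5") also
  // report "000012.sst", and EncodePath() redirects reads of
  // "<backup_dir>/private/5/000012.sst" to
  // "<backup_dir>/shared_checksum/000012_s<SESSION>.sst", so the backup looks
  // like a self-contained DB directory without copying the shared files.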
+
+ class BackupMeta {
+ public:
+ BackupMeta(
+ const std::string& meta_filename, const std::string& meta_tmp_filename,
+ std::unordered_map<std::string, std::shared_ptr<FileInfo>>* file_infos,
+ Env* env, const std::shared_ptr<FileSystem>& fs)
+ : timestamp_(0),
+ sequence_number_(0),
+ size_(0),
+ meta_filename_(meta_filename),
+ meta_tmp_filename_(meta_tmp_filename),
+ file_infos_(file_infos),
+ env_(env),
+ fs_(fs) {}
+
+ BackupMeta(const BackupMeta&) = delete;
+ BackupMeta& operator=(const BackupMeta&) = delete;
+
+ ~BackupMeta() {}
+
+ void RecordTimestamp() {
+ // Best effort
+ Status s = env_->GetCurrentTime(&timestamp_);
+ if (!s.ok()) {
+ timestamp_ = /* something clearly fabricated */ 1;
+ }
+ }
+ int64_t GetTimestamp() const { return timestamp_; }
+ uint64_t GetSize() const { return size_; }
+ uint32_t GetNumberFiles() const {
+ return static_cast<uint32_t>(files_.size());
+ }
+ void SetSequenceNumber(uint64_t sequence_number) {
+ sequence_number_ = sequence_number;
+ }
+ uint64_t GetSequenceNumber() const { return sequence_number_; }
+
+ const std::string& GetAppMetadata() const { return app_metadata_; }
+
+ void SetAppMetadata(const std::string& app_metadata) {
+ app_metadata_ = app_metadata;
+ }
+
+ IOStatus AddFile(std::shared_ptr<FileInfo> file_info);
+
+ IOStatus Delete(bool delete_meta = true);
+
+ bool Empty() const { return files_.empty(); }
+
+ std::shared_ptr<FileInfo> GetFile(const std::string& filename) const {
+ auto it = file_infos_->find(filename);
+ if (it == file_infos_->end()) {
+ return nullptr;
+ }
+ return it->second;
+ }
+
+ const std::vector<std::shared_ptr<FileInfo>>& GetFiles() const {
+ return files_;
+ }
+
+ // @param abs_path_to_size Pre-fetched file sizes (bytes).
+ IOStatus LoadFromFile(
+ const std::string& backup_dir,
+ const std::unordered_map<std::string, uint64_t>& abs_path_to_size,
+ RateLimiter* rate_limiter, Logger* info_log,
+ std::unordered_set<std::string>* reported_ignored_fields);
+ IOStatus StoreToFile(
+ bool sync, int schema_version,
+ const TEST_BackupMetaSchemaOptions* schema_test_options);
+
+ std::string GetInfoString() {
+ std::ostringstream ss;
+ ss << "Timestamp: " << timestamp_ << std::endl;
+ char human_size[16];
+ AppendHumanBytes(size_, human_size, sizeof(human_size));
+ ss << "Size: " << human_size << std::endl;
+ ss << "Files:" << std::endl;
+ for (const auto& file : files_) {
+ AppendHumanBytes(file->size, human_size, sizeof(human_size));
+ ss << file->filename << ", size " << human_size << ", refs "
+ << file->refs << std::endl;
+ }
+ return ss.str();
+ }
+
+ const std::shared_ptr<Env>& GetEnvForOpen() const {
+ if (!env_for_open_) {
+ // Lazy initialize
+ // Find directories
+ std::string dst_dir = meta_filename_;
+ auto i = dst_dir.rfind(kMetaDirSlash);
+ assert(i != std::string::npos);
+ std::string src_base_dir = dst_dir.substr(0, i);
+ dst_dir.replace(i, kMetaDirSlash.size(), kPrivateDirSlash);
+ // Make the RemapSharedFileSystem
+ std::shared_ptr<FileSystem> remap_fs =
+ std::make_shared<RemapSharedFileSystem>(fs_, dst_dir, src_base_dir,
+ files_);
+ // Make it read-only for safety
+ remap_fs = std::make_shared<ReadOnlyFileSystem>(remap_fs);
+ // Make an Env wrapper
+ env_for_open_ = std::make_shared<CompositeEnvWrapper>(env_, remap_fs);
+ }
+ return env_for_open_;
+ }
+
+ private:
+ int64_t timestamp_;
+ // sequence number is only approximate, should not be used
+ // by clients
+ uint64_t sequence_number_;
+ uint64_t size_;
+ std::string app_metadata_;
+ std::string const meta_filename_;
+ std::string const meta_tmp_filename_;
+ // files with relative paths (without "/" prefix!!)
+ std::vector<std::shared_ptr<FileInfo>> files_;
+ std::unordered_map<std::string, std::shared_ptr<FileInfo>>* file_infos_;
+ Env* env_;
+ mutable std::shared_ptr<Env> env_for_open_;
+ std::shared_ptr<FileSystem> fs_;
+ IOOptions iooptions_ = IOOptions();
+ }; // BackupMeta
+
+ void SetBackupInfoFromBackupMeta(BackupID id, const BackupMeta& meta,
+ BackupInfo* backup_info,
+ bool include_file_details) const;
+
+ inline std::string GetAbsolutePath(
+ const std::string& relative_path = "") const {
+ assert(relative_path.size() == 0 || relative_path[0] != '/');
+ return options_.backup_dir + "/" + relative_path;
+ }
+ inline std::string GetPrivateFileRel(BackupID backup_id, bool tmp = false,
+ const std::string& file = "") const {
+ assert(file.size() == 0 || file[0] != '/');
+ return kPrivateDirSlash + std::to_string(backup_id) + (tmp ? ".tmp" : "") +
+ "/" + file;
+ }
+ inline std::string GetSharedFileRel(const std::string& file = "",
+ bool tmp = false) const {
+ assert(file.size() == 0 || file[0] != '/');
+ return kSharedDirSlash + std::string(tmp ? "." : "") + file +
+ (tmp ? ".tmp" : "");
+ }
+ inline std::string GetSharedFileWithChecksumRel(const std::string& file = "",
+ bool tmp = false) const {
+ assert(file.size() == 0 || file[0] != '/');
+ return kSharedChecksumDirSlash + std::string(tmp ? "." : "") + file +
+ (tmp ? ".tmp" : "");
+ }
+ inline bool UseLegacyNaming(const std::string& sid) const {
+ return GetNamingNoFlags() ==
+ BackupEngineOptions::kLegacyCrc32cAndFileSize ||
+ sid.empty();
+ }
+ inline std::string GetSharedFileWithChecksum(
+ const std::string& file, const std::string& checksum_hex,
+ const uint64_t file_size, const std::string& db_session_id) const {
+ assert(file.size() == 0 || file[0] != '/');
+ std::string file_copy = file;
+ if (UseLegacyNaming(db_session_id)) {
+ assert(!checksum_hex.empty());
+ file_copy.insert(file_copy.find_last_of('.'),
+ "_" + std::to_string(ChecksumHexToInt32(checksum_hex)) +
+ "_" + std::to_string(file_size));
+ } else {
+ file_copy.insert(file_copy.find_last_of('.'), "_s" + db_session_id);
+ if (GetNamingFlags() & BackupEngineOptions::kFlagIncludeFileSize) {
+ file_copy.insert(file_copy.find_last_of('.'),
+ "_" + std::to_string(file_size));
+ }
+ }
+ return file_copy;
+ }
+ static inline std::string GetFileFromChecksumFile(const std::string& file) {
+ assert(file.size() == 0 || file[0] != '/');
+ std::string file_copy = file;
+ size_t first_underscore = file_copy.find_first_of('_');
+ return file_copy.erase(first_underscore,
+ file_copy.find_last_of('.') - first_underscore);
+ }
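  // Editor's worked example (not part of the commit) of the naming scheme
  // implemented by GetSharedFileWithChecksum() / GetFileFromChecksumFile()
  // above, for an SST file "000010.sst" of 1234 bytes with crc32c 0x1234ABCD
  // (305441741 decimal) and db_session_id "ABCDEF" (all values illustrative):
  //
  //   kLegacyCrc32cAndFileSize, or empty session id:
  //       000010_305441741_1234.sst
  //   session-id naming (default):
  //       000010_sABCDEF.sst
  //   session-id naming with kFlagIncludeFileSize:
  //       000010_sABCDEF_1234.sst
  //
  // GetFileFromChecksumFile() inverts all three by erasing everything from
  // the first '_' through the character before the last '.', recovering
  // "000010.sst".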
+ inline std::string GetBackupMetaFile(BackupID backup_id, bool tmp) const {
+ return GetAbsolutePath(kMetaDirName) + "/" + (tmp ? "." : "") +
+ std::to_string(backup_id) + (tmp ? ".tmp" : "");
+ }
+
+ // If size_limit == 0, there is no size limit, copy everything.
+ //
+ // Exactly one of src and contents must be non-empty.
+ //
+ // @param src If non-empty, the file is copied from this pathname.
+ // @param contents If non-empty, the file will be created with these contents.
+ // @param src_temperature Pass in expected temperature of src, return back
+ // temperature reported by FileSystem
+ IOStatus CopyOrCreateFile(const std::string& src, const std::string& dst,
+ const std::string& contents, uint64_t size_limit,
+ Env* src_env, Env* dst_env,
+ const EnvOptions& src_env_options, bool sync,
+ RateLimiter* rate_limiter,
+ std::function<void()> progress_callback,
+ Temperature* src_temperature,
+ Temperature dst_temperature,
+ uint64_t* bytes_toward_next_callback,
+ uint64_t* size, std::string* checksum_hex);
+
+ IOStatus ReadFileAndComputeChecksum(const std::string& src,
+ const std::shared_ptr<FileSystem>& src_fs,
+ const EnvOptions& src_env_options,
+ uint64_t size_limit,
+ std::string* checksum_hex,
+ const Temperature src_temperature) const;
+
+ // Obtain db_id and db_session_id from the table properties of file_path
+ Status GetFileDbIdentities(Env* src_env, const EnvOptions& src_env_options,
+ const std::string& file_path,
+ Temperature file_temp, RateLimiter* rate_limiter,
+ std::string* db_id, std::string* db_session_id);
+
+ struct CopyOrCreateResult {
+ ~CopyOrCreateResult() {
+      // The Status needs to be ignored here for two reasons.
+      // First, if the BackupEngineImpl shuts down with jobs outstanding, then
+      // it is possible that the Status in the future/promise is never read,
+      // resulting in an unchecked Status. Second, if there are items in the
+      // channel when the BackupEngineImpl is shut down, these will also have
+      // Statuses that have not been checked.
+      // TODO: Fix those issues so that the Status does not need to be
+      // ignored here.
+ io_status.PermitUncheckedError();
+ }
+ uint64_t size;
+ std::string checksum_hex;
+ std::string db_id;
+ std::string db_session_id;
+ IOStatus io_status;
+ Temperature expected_src_temperature = Temperature::kUnknown;
+ Temperature current_src_temperature = Temperature::kUnknown;
+ };
+
+ // Exactly one of src_path and contents must be non-empty. If src_path is
+ // non-empty, the file is copied from this pathname. Otherwise, if contents is
+ // non-empty, the file will be created at dst_path with these contents.
+ struct CopyOrCreateWorkItem {
+ std::string src_path;
+ std::string dst_path;
+ Temperature src_temperature;
+ Temperature dst_temperature;
+ std::string contents;
+ Env* src_env;
+ Env* dst_env;
+ EnvOptions src_env_options;
+ bool sync;
+ RateLimiter* rate_limiter;
+ uint64_t size_limit;
+ Statistics* stats;
+ std::promise<CopyOrCreateResult> result;
+ std::function<void()> progress_callback;
+ std::string src_checksum_func_name;
+ std::string src_checksum_hex;
+ std::string db_id;
+ std::string db_session_id;
+
+ CopyOrCreateWorkItem()
+ : src_path(""),
+ dst_path(""),
+ src_temperature(Temperature::kUnknown),
+ dst_temperature(Temperature::kUnknown),
+ contents(""),
+ src_env(nullptr),
+ dst_env(nullptr),
+ src_env_options(),
+ sync(false),
+ rate_limiter(nullptr),
+ size_limit(0),
+ stats(nullptr),
+ src_checksum_func_name(kUnknownFileChecksumFuncName),
+ src_checksum_hex(""),
+ db_id(""),
+ db_session_id("") {}
+
+ CopyOrCreateWorkItem(const CopyOrCreateWorkItem&) = delete;
+ CopyOrCreateWorkItem& operator=(const CopyOrCreateWorkItem&) = delete;
+
+ CopyOrCreateWorkItem(CopyOrCreateWorkItem&& o) noexcept {
+ *this = std::move(o);
+ }
+
+ CopyOrCreateWorkItem& operator=(CopyOrCreateWorkItem&& o) noexcept {
+ src_path = std::move(o.src_path);
+ dst_path = std::move(o.dst_path);
+ src_temperature = std::move(o.src_temperature);
+ dst_temperature = std::move(o.dst_temperature);
+ contents = std::move(o.contents);
+ src_env = o.src_env;
+ dst_env = o.dst_env;
+ src_env_options = std::move(o.src_env_options);
+ sync = o.sync;
+ rate_limiter = o.rate_limiter;
+ size_limit = o.size_limit;
+ stats = o.stats;
+ result = std::move(o.result);
+ progress_callback = std::move(o.progress_callback);
+ src_checksum_func_name = std::move(o.src_checksum_func_name);
+ src_checksum_hex = std::move(o.src_checksum_hex);
+ db_id = std::move(o.db_id);
+ db_session_id = std::move(o.db_session_id);
+ return *this;
+ }
+
+ CopyOrCreateWorkItem(
+ std::string _src_path, std::string _dst_path,
+ const Temperature _src_temperature, const Temperature _dst_temperature,
+ std::string _contents, Env* _src_env, Env* _dst_env,
+ EnvOptions _src_env_options, bool _sync, RateLimiter* _rate_limiter,
+ uint64_t _size_limit, Statistics* _stats,
+ std::function<void()> _progress_callback = []() {},
+ const std::string& _src_checksum_func_name =
+ kUnknownFileChecksumFuncName,
+ const std::string& _src_checksum_hex = "",
+ const std::string& _db_id = "", const std::string& _db_session_id = "")
+ : src_path(std::move(_src_path)),
+ dst_path(std::move(_dst_path)),
+ src_temperature(_src_temperature),
+ dst_temperature(_dst_temperature),
+ contents(std::move(_contents)),
+ src_env(_src_env),
+ dst_env(_dst_env),
+ src_env_options(std::move(_src_env_options)),
+ sync(_sync),
+ rate_limiter(_rate_limiter),
+ size_limit(_size_limit),
+ stats(_stats),
+ progress_callback(_progress_callback),
+ src_checksum_func_name(_src_checksum_func_name),
+ src_checksum_hex(_src_checksum_hex),
+ db_id(_db_id),
+ db_session_id(_db_session_id) {}
+ };
+
+ struct BackupAfterCopyOrCreateWorkItem {
+ std::future<CopyOrCreateResult> result;
+ bool shared;
+ bool needed_to_copy;
+ Env* backup_env;
+ std::string dst_path_tmp;
+ std::string dst_path;
+ std::string dst_relative;
+ BackupAfterCopyOrCreateWorkItem()
+ : shared(false),
+ needed_to_copy(false),
+ backup_env(nullptr),
+ dst_path_tmp(""),
+ dst_path(""),
+ dst_relative("") {}
+
+ BackupAfterCopyOrCreateWorkItem(
+ BackupAfterCopyOrCreateWorkItem&& o) noexcept {
+ *this = std::move(o);
+ }
+
+ BackupAfterCopyOrCreateWorkItem& operator=(
+ BackupAfterCopyOrCreateWorkItem&& o) noexcept {
+ result = std::move(o.result);
+ shared = o.shared;
+ needed_to_copy = o.needed_to_copy;
+ backup_env = o.backup_env;
+ dst_path_tmp = std::move(o.dst_path_tmp);
+ dst_path = std::move(o.dst_path);
+ dst_relative = std::move(o.dst_relative);
+ return *this;
+ }
+
+ BackupAfterCopyOrCreateWorkItem(std::future<CopyOrCreateResult>&& _result,
+ bool _shared, bool _needed_to_copy,
+ Env* _backup_env, std::string _dst_path_tmp,
+ std::string _dst_path,
+ std::string _dst_relative)
+ : result(std::move(_result)),
+ shared(_shared),
+ needed_to_copy(_needed_to_copy),
+ backup_env(_backup_env),
+ dst_path_tmp(std::move(_dst_path_tmp)),
+ dst_path(std::move(_dst_path)),
+ dst_relative(std::move(_dst_relative)) {}
+ };
+
+ struct RestoreAfterCopyOrCreateWorkItem {
+ std::future<CopyOrCreateResult> result;
+ std::string from_file;
+ std::string to_file;
+ std::string checksum_hex;
+ RestoreAfterCopyOrCreateWorkItem() : checksum_hex("") {}
+ RestoreAfterCopyOrCreateWorkItem(std::future<CopyOrCreateResult>&& _result,
+ const std::string& _from_file,
+ const std::string& _to_file,
+ const std::string& _checksum_hex)
+ : result(std::move(_result)),
+ from_file(_from_file),
+ to_file(_to_file),
+ checksum_hex(_checksum_hex) {}
+ RestoreAfterCopyOrCreateWorkItem(
+ RestoreAfterCopyOrCreateWorkItem&& o) noexcept {
+ *this = std::move(o);
+ }
+
+ RestoreAfterCopyOrCreateWorkItem& operator=(
+ RestoreAfterCopyOrCreateWorkItem&& o) noexcept {
+ result = std::move(o.result);
+ checksum_hex = std::move(o.checksum_hex);
+ return *this;
+ }
+ };
+
+ bool initialized_;
+ std::mutex byte_report_mutex_;
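+ // Work queue of copy/create jobs. Producers (backup and restore paths)
+ // enqueue CopyOrCreateWorkItems here, and the background threads created in
+ // Initialize() consume them until the destructor sends EOF.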
+ mutable channel<CopyOrCreateWorkItem> files_to_copy_or_create_;
+ std::vector<port::Thread> threads_;
+ std::atomic<CpuPriority> threads_cpu_priority_;
+
+ // Certain operations like PurgeOldBackups and DeleteBackup will trigger
+ // automatic GarbageCollect (true) unless we've already done one in this
+ // session and have not failed to delete backup files since then (false).
+ bool might_need_garbage_collect_ = true;
+
+ // Adds a file to the backup work queue to be copied or created if it doesn't
+ // already exist.
+ //
+ // Exactly one of src_dir and contents must be non-empty.
+ //
+ // @param src_dir If non-empty, the file in this directory named fname will be
+ // copied.
+ // @param fname Name of destination file and, in case of copy, source file.
+ // @param contents If non-empty, the file will be created with these contents.
+ IOStatus AddBackupFileWorkItem(
+ std::unordered_set<std::string>& live_dst_paths,
+ std::vector<BackupAfterCopyOrCreateWorkItem>& backup_items_to_finish,
+ BackupID backup_id, bool shared, const std::string& src_dir,
+ const std::string& fname, // starts with "/"
+ const EnvOptions& src_env_options, RateLimiter* rate_limiter,
+ FileType file_type, uint64_t size_bytes, Statistics* stats,
+ uint64_t size_limit = 0, bool shared_checksum = false,
+ std::function<void()> progress_callback = []() {},
+ const std::string& contents = std::string(),
+ const std::string& src_checksum_func_name = kUnknownFileChecksumFuncName,
+ const std::string& src_checksum_str = kUnknownFileChecksum,
+ const Temperature src_temperature = Temperature::kUnknown);
+
+ // backup state data
+ BackupID latest_backup_id_;
+ BackupID latest_valid_backup_id_;
+ std::map<BackupID, std::unique_ptr<BackupMeta>> backups_;
+ std::map<BackupID, std::pair<IOStatus, std::unique_ptr<BackupMeta>>>
+ corrupt_backups_;
+ std::unordered_map<std::string, std::shared_ptr<FileInfo>>
+ backuped_file_infos_;
+ std::atomic<bool> stop_backup_;
+
+ // options data
+ BackupEngineOptions options_;
+ Env* db_env_;
+ Env* backup_env_;
+
+ // directories
+ std::unique_ptr<FSDirectory> backup_directory_;
+ std::unique_ptr<FSDirectory> shared_directory_;
+ std::unique_ptr<FSDirectory> meta_directory_;
+ std::unique_ptr<FSDirectory> private_directory_;
+
+ static const size_t kDefaultCopyFileBufferSize = 5 * 1024 * 1024LL; // 5MB
+ bool read_only_;
+ BackupStatistics backup_statistics_;
+ std::unordered_set<std::string> reported_ignored_fields_;
+ static const size_t kMaxAppMetaSize = 1024 * 1024; // 1MB
+ std::shared_ptr<FileSystem> db_fs_;
+ std::shared_ptr<FileSystem> backup_fs_;
+ IOOptions io_options_ = IOOptions();
+
+ public:
+ std::unique_ptr<TEST_BackupMetaSchemaOptions> schema_test_options_;
+};
+
+// -------- BackupEngineImplThreadSafe class ---------
+// This class adds a locking layer for thread safety in the public API,
+// layered on top to prevent accidental recursive locking with RWMutex,
+// which is undefined behavior.
+// Note: BackupEngineReadOnlyBase is inherited twice, but it has no fields.
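+// Mutating operations (CreateNewBackupWithMetadata, PurgeOldBackups,
+// DeleteBackup, GarbageCollect) take the write lock; read-only operations
+// (GetBackupInfo, RestoreDBFromBackup, VerifyBackup, etc.) take the read
+// lock.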
+class BackupEngineImplThreadSafe : public BackupEngine,
+ public BackupEngineReadOnly {
+ public:
+ BackupEngineImplThreadSafe(const BackupEngineOptions& options, Env* db_env,
+ bool read_only = false)
+ : impl_(options, db_env, read_only) {}
+ ~BackupEngineImplThreadSafe() override {}
+
+ using BackupEngine::CreateNewBackupWithMetadata;
+ IOStatus CreateNewBackupWithMetadata(const CreateBackupOptions& options,
+ DB* db, const std::string& app_metadata,
+ BackupID* new_backup_id) override {
+ WriteLock lock(&mutex_);
+ return impl_.CreateNewBackupWithMetadata(options, db, app_metadata,
+ new_backup_id);
+ }
+
+ IOStatus PurgeOldBackups(uint32_t num_backups_to_keep) override {
+ WriteLock lock(&mutex_);
+ return impl_.PurgeOldBackups(num_backups_to_keep);
+ }
+
+ IOStatus DeleteBackup(BackupID backup_id) override {
+ WriteLock lock(&mutex_);
+ return impl_.DeleteBackup(backup_id);
+ }
+
+ void StopBackup() override {
+ // No locking needed
+ impl_.StopBackup();
+ }
+
+ IOStatus GarbageCollect() override {
+ WriteLock lock(&mutex_);
+ return impl_.GarbageCollect();
+ }
+
+ Status GetLatestBackupInfo(BackupInfo* backup_info,
+ bool include_file_details = false) const override {
+ ReadLock lock(&mutex_);
+ return impl_.GetBackupInfo(kLatestBackupIDMarker, backup_info,
+ include_file_details);
+ }
+
+ Status GetBackupInfo(BackupID backup_id, BackupInfo* backup_info,
+ bool include_file_details = false) const override {
+ ReadLock lock(&mutex_);
+ return impl_.GetBackupInfo(backup_id, backup_info, include_file_details);
+ }
+
+ void GetBackupInfo(std::vector<BackupInfo>* backup_info,
+ bool include_file_details) const override {
+ ReadLock lock(&mutex_);
+ impl_.GetBackupInfo(backup_info, include_file_details);
+ }
+
+ void GetCorruptedBackups(
+ std::vector<BackupID>* corrupt_backup_ids) const override {
+ ReadLock lock(&mutex_);
+ impl_.GetCorruptedBackups(corrupt_backup_ids);
+ }
+
+ using BackupEngine::RestoreDBFromBackup;
+ IOStatus RestoreDBFromBackup(const RestoreOptions& options,
+ BackupID backup_id, const std::string& db_dir,
+ const std::string& wal_dir) const override {
+ ReadLock lock(&mutex_);
+ return impl_.RestoreDBFromBackup(options, backup_id, db_dir, wal_dir);
+ }
+
+ using BackupEngine::RestoreDBFromLatestBackup;
+ IOStatus RestoreDBFromLatestBackup(
+ const RestoreOptions& options, const std::string& db_dir,
+ const std::string& wal_dir) const override {
+ // Defer to above function, which locks
+ return RestoreDBFromBackup(options, kLatestBackupIDMarker, db_dir, wal_dir);
+ }
+
+ IOStatus VerifyBackup(BackupID backup_id,
+ bool verify_with_checksum = false) const override {
+ ReadLock lock(&mutex_);
+ return impl_.VerifyBackup(backup_id, verify_with_checksum);
+ }
+
+ // Not public API but needed
+ IOStatus Initialize() {
+ // No locking needed
+ return impl_.Initialize();
+ }
+
+ // Not public API but used in testing
+ void TEST_SetBackupMetaSchemaOptions(
+ const TEST_BackupMetaSchemaOptions& options) {
+ impl_.schema_test_options_.reset(new TEST_BackupMetaSchemaOptions(options));
+ }
+
+ // Not public API but used in testing
+ void TEST_SetDefaultRateLimitersClock(
+ const std::shared_ptr<SystemClock>& backup_rate_limiter_clock = nullptr,
+ const std::shared_ptr<SystemClock>& restore_rate_limiter_clock =
+ nullptr) {
+ impl_.TEST_SetDefaultRateLimitersClock(backup_rate_limiter_clock,
+ restore_rate_limiter_clock);
+ }
+
+ private:
+ mutable port::RWMutex mutex_;
+ BackupEngineImpl impl_;
+};
+} // namespace
+
+IOStatus BackupEngine::Open(const BackupEngineOptions& options, Env* env,
+ BackupEngine** backup_engine_ptr) {
+ std::unique_ptr<BackupEngineImplThreadSafe> backup_engine(
+ new BackupEngineImplThreadSafe(options, env));
+ auto s = backup_engine->Initialize();
+ if (!s.ok()) {
+ *backup_engine_ptr = nullptr;
+ return s;
+ }
+ *backup_engine_ptr = backup_engine.release();
+ return IOStatus::OK();
+}
+
+namespace {
+BackupEngineImpl::BackupEngineImpl(const BackupEngineOptions& options,
+ Env* db_env, bool read_only)
+ : initialized_(false),
+ threads_cpu_priority_(),
+ latest_backup_id_(0),
+ latest_valid_backup_id_(0),
+ stop_backup_(false),
+ options_(options),
+ db_env_(db_env),
+ backup_env_(options.backup_env != nullptr ? options.backup_env : db_env_),
+ read_only_(read_only) {
+ if (options_.backup_rate_limiter == nullptr &&
+ options_.backup_rate_limit > 0) {
+ options_.backup_rate_limiter.reset(
+ NewGenericRateLimiter(options_.backup_rate_limit));
+ }
+ if (options_.restore_rate_limiter == nullptr &&
+ options_.restore_rate_limit > 0) {
+ options_.restore_rate_limiter.reset(
+ NewGenericRateLimiter(options_.restore_rate_limit));
+ }
+ db_fs_ = db_env_->GetFileSystem();
+ backup_fs_ = backup_env_->GetFileSystem();
+}
+
+BackupEngineImpl::~BackupEngineImpl() {
+ files_to_copy_or_create_.sendEof();
+ for (auto& t : threads_) {
+ t.join();
+ }
+ LogFlush(options_.info_log);
+ for (const auto& it : corrupt_backups_) {
+ it.second.first.PermitUncheckedError();
+ }
+}
+
+IOStatus BackupEngineImpl::Initialize() {
+ assert(!initialized_);
+ initialized_ = true;
+ if (read_only_) {
+ ROCKS_LOG_INFO(options_.info_log, "Starting read_only backup engine");
+ }
+ options_.Dump(options_.info_log);
+
+ auto meta_path = GetAbsolutePath(kMetaDirName);
+
+ if (!read_only_) {
+ // we might need to clean up from previous crash or I/O errors
+ might_need_garbage_collect_ = true;
+
+ if (options_.max_valid_backups_to_open !=
+ std::numeric_limits<int32_t>::max()) {
+ options_.max_valid_backups_to_open = std::numeric_limits<int32_t>::max();
+ ROCKS_LOG_WARN(
+ options_.info_log,
+ "`max_valid_backups_to_open` is not set to the default value. "
+ "Ignoring its value since BackupEngine is not read-only.");
+ }
+
+ // gather the list of directories that we need to create
+ std::vector<std::pair<std::string, std::unique_ptr<FSDirectory>*>>
+ directories;
+ directories.emplace_back(GetAbsolutePath(), &backup_directory_);
+ if (options_.share_table_files) {
+ if (options_.share_files_with_checksum) {
+ directories.emplace_back(
+ GetAbsolutePath(GetSharedFileWithChecksumRel()),
+ &shared_directory_);
+ } else {
+ directories.emplace_back(GetAbsolutePath(GetSharedFileRel()),
+ &shared_directory_);
+ }
+ }
+ directories.emplace_back(GetAbsolutePath(kPrivateDirName),
+ &private_directory_);
+ directories.emplace_back(meta_path, &meta_directory_);
+ // create all the dirs we need
+ for (const auto& d : directories) {
+ IOStatus io_s =
+ backup_fs_->CreateDirIfMissing(d.first, io_options_, nullptr);
+ if (io_s.ok()) {
+ io_s =
+ backup_fs_->NewDirectory(d.first, io_options_, d.second, nullptr);
+ }
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ }
+ }
+
+ std::vector<std::string> backup_meta_files;
+ {
+ IOStatus io_s = backup_fs_->GetChildren(meta_path, io_options_,
+ &backup_meta_files, nullptr);
+ if (io_s.IsNotFound()) {
+ return IOStatus::NotFound(meta_path + " is missing");
+ } else if (!io_s.ok()) {
+ return io_s;
+ }
+ }
+ // create backups_ structure
+ for (auto& file : backup_meta_files) {
+ ROCKS_LOG_INFO(options_.info_log, "Detected backup %s", file.c_str());
+ BackupID backup_id = 0;
+ sscanf(file.c_str(), "%u", &backup_id);
+ if (backup_id == 0 || file != std::to_string(backup_id)) {
+ // Invalid file name; it will be deleted with auto-GC when the user
+ // initiates an append or write operation. (Behave as read-only until
+ // then.)
+ ROCKS_LOG_INFO(options_.info_log, "Skipping unrecognized meta file %s",
+ file.c_str());
+ continue;
+ }
+ assert(backups_.find(backup_id) == backups_.end());
+ // Insert all the (backup_id, BackupMeta) that will be loaded later
+ // The loading performed later will check whether there are corrupt backups
+ // and move the corrupt backups to corrupt_backups_
+ backups_.insert(std::make_pair(
+ backup_id, std::unique_ptr<BackupMeta>(new BackupMeta(
+ GetBackupMetaFile(backup_id, false /* tmp */),
+ GetBackupMetaFile(backup_id, true /* tmp */),
+ &backuped_file_infos_, backup_env_, backup_fs_))));
+ }
+
+ latest_backup_id_ = 0;
+ latest_valid_backup_id_ = 0;
+ if (options_.destroy_old_data) { // Destroy old data
+ assert(!read_only_);
+ ROCKS_LOG_INFO(
+ options_.info_log,
+ "Backup Engine started with destroy_old_data == true, deleting all "
+ "backups");
+ IOStatus io_s = PurgeOldBackups(0);
+ if (io_s.ok()) {
+ io_s = GarbageCollect();
+ }
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ } else { // Load data from storage
+ // abs_path_to_size: maps absolute paths of files in backup directory to
+ // their corresponding sizes
+ std::unordered_map<std::string, uint64_t> abs_path_to_size;
+ // Insert files and their sizes in backup sub-directories (shared and
+ // shared_checksum) to abs_path_to_size
+ for (const auto& rel_dir :
+ {GetSharedFileRel(), GetSharedFileWithChecksumRel()}) {
+ const auto abs_dir = GetAbsolutePath(rel_dir);
+ IOStatus io_s =
+ ReadChildFileCurrentSizes(abs_dir, backup_fs_, &abs_path_to_size);
+ if (!io_s.ok()) {
+ // I/O error likely impacting all backups
+ return io_s;
+ }
+ }
+ // load the backups if any, until valid_backups_to_open of the latest
+ // non-corrupted backups have been successfully opened.
+ int valid_backups_to_open = options_.max_valid_backups_to_open;
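+ // backups_ is a std::map ordered by ascending BackupID, so iterating in
+ // reverse visits the newest backups first.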
+ for (auto backup_iter = backups_.rbegin(); backup_iter != backups_.rend();
+ ++backup_iter) {
+ assert(latest_backup_id_ == 0 || latest_backup_id_ > backup_iter->first);
+ if (latest_backup_id_ == 0) {
+ latest_backup_id_ = backup_iter->first;
+ }
+ if (valid_backups_to_open == 0) {
+ break;
+ }
+
+ // Insert files and their sizes in backup sub-directories
+ // (private/backup_id) to abs_path_to_size
+ IOStatus io_s = ReadChildFileCurrentSizes(
+ GetAbsolutePath(GetPrivateFileRel(backup_iter->first)), backup_fs_,
+ &abs_path_to_size);
+ if (io_s.ok()) {
+ io_s = backup_iter->second->LoadFromFile(
+ options_.backup_dir, abs_path_to_size,
+ options_.backup_rate_limiter.get(), options_.info_log,
+ &reported_ignored_fields_);
+ }
+ if (io_s.IsCorruption() || io_s.IsNotSupported()) {
+ ROCKS_LOG_INFO(options_.info_log, "Backup %u corrupted -- %s",
+ backup_iter->first, io_s.ToString().c_str());
+ corrupt_backups_.insert(std::make_pair(
+ backup_iter->first,
+ std::make_pair(io_s, std::move(backup_iter->second))));
+ } else if (!io_s.ok()) {
+ // Distinguish corruption errors from errors in the backup Env.
+ // Errors in the backup Env (i.e., this code path) will cause Open() to
+ // fail, whereas corruption errors would not cause Open() failures.
+ return io_s;
+ } else {
+ ROCKS_LOG_INFO(options_.info_log, "Loading backup %" PRIu32 " OK:\n%s",
+ backup_iter->first,
+ backup_iter->second->GetInfoString().c_str());
+ assert(latest_valid_backup_id_ == 0 ||
+ latest_valid_backup_id_ > backup_iter->first);
+ if (latest_valid_backup_id_ == 0) {
+ latest_valid_backup_id_ = backup_iter->first;
+ }
+ --valid_backups_to_open;
+ }
+ }
+
+ for (const auto& corrupt : corrupt_backups_) {
+ backups_.erase(backups_.find(corrupt.first));
+ }
+ // erase the backups before max_valid_backups_to_open
+ int num_unopened_backups;
+ if (options_.max_valid_backups_to_open == 0) {
+ num_unopened_backups = 0;
+ } else {
+ num_unopened_backups =
+ std::max(0, static_cast<int>(backups_.size()) -
+ options_.max_valid_backups_to_open);
+ }
+ for (int i = 0; i < num_unopened_backups; ++i) {
+ assert(backups_.begin()->second->Empty());
+ backups_.erase(backups_.begin());
+ }
+ }
+
+ ROCKS_LOG_INFO(options_.info_log, "Latest backup is %u", latest_backup_id_);
+ ROCKS_LOG_INFO(options_.info_log, "Latest valid backup is %u",
+ latest_valid_backup_id_);
+
+ // set up threads to perform copies from files_to_copy_or_create_ in the
+ // background
+ threads_cpu_priority_ = CpuPriority::kNormal;
+ threads_.reserve(options_.max_background_operations);
+ for (int t = 0; t < options_.max_background_operations; t++) {
+ threads_.emplace_back([this]() {
+#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ)
+#if __GLIBC_PREREQ(2, 12)
+ pthread_setname_np(pthread_self(), "backup_engine");
+#endif
+#endif
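+ // Each worker drains the shared files_to_copy_or_create_ channel; the
+ // read() loop below exits once the destructor calls sendEof().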
+ CpuPriority current_priority = CpuPriority::kNormal;
+ CopyOrCreateWorkItem work_item;
+ uint64_t bytes_toward_next_callback = 0;
+ while (files_to_copy_or_create_.read(work_item)) {
+ CpuPriority priority = threads_cpu_priority_;
+ if (current_priority != priority) {
+ TEST_SYNC_POINT_CALLBACK(
+ "BackupEngineImpl::Initialize:SetCpuPriority", &priority);
+ port::SetCpuPriority(0, priority);
+ current_priority = priority;
+ }
+ // `bytes_read` and `bytes_written` stats are enabled based on
+ // compile-time support and cannot be dynamically toggled. So we do not
+ // need to worry about `PerfLevel` here, unlike many other
+ // `IOStatsContext` / `PerfContext` stats.
+ uint64_t prev_bytes_read = IOSTATS(bytes_read);
+ uint64_t prev_bytes_written = IOSTATS(bytes_written);
+
+ CopyOrCreateResult result;
+ Temperature temp = work_item.src_temperature;
+ result.io_status = CopyOrCreateFile(
+ work_item.src_path, work_item.dst_path, work_item.contents,
+ work_item.size_limit, work_item.src_env, work_item.dst_env,
+ work_item.src_env_options, work_item.sync, work_item.rate_limiter,
+ work_item.progress_callback, &temp, work_item.dst_temperature,
+ &bytes_toward_next_callback, &result.size, &result.checksum_hex);
+
+ RecordTick(work_item.stats, BACKUP_READ_BYTES,
+ IOSTATS(bytes_read) - prev_bytes_read);
+ RecordTick(work_item.stats, BACKUP_WRITE_BYTES,
+ IOSTATS(bytes_written) - prev_bytes_written);
+
+ result.db_id = work_item.db_id;
+ result.db_session_id = work_item.db_session_id;
+ result.expected_src_temperature = work_item.src_temperature;
+ result.current_src_temperature = temp;
+ if (result.io_status.ok() && !work_item.src_checksum_hex.empty()) {
+ // An unknown checksum function name implies there is no db table file
+ // checksum in the db manifest; a non-empty work_item.src_checksum_hex
+ // means the backup engine has calculated its crc32c checksum for the
+ // table file; therefore, we are able to compare the checksums.
+ if (work_item.src_checksum_func_name ==
+ kUnknownFileChecksumFuncName ||
+ work_item.src_checksum_func_name == kDbFileChecksumFuncName) {
+ if (work_item.src_checksum_hex != result.checksum_hex) {
+ std::string checksum_info(
+ "Expected checksum is " + work_item.src_checksum_hex +
+ " while computed checksum is " + result.checksum_hex);
+ result.io_status = IOStatus::Corruption(
+ "Checksum mismatch after copying to " + work_item.dst_path +
+ ": " + checksum_info);
+ }
+ } else {
+ // FIXME(peterd): dead code?
+ std::string checksum_function_info(
+ "Existing checksum function is " +
+ work_item.src_checksum_func_name +
+ " while provided checksum function is " +
+ kBackupFileChecksumFuncName);
+ ROCKS_LOG_INFO(
+ options_.info_log,
+ "Unable to verify checksum after copying to %s: %s\n",
+ work_item.dst_path.c_str(), checksum_function_info.c_str());
+ }
+ }
+ work_item.result.set_value(std::move(result));
+ }
+ });
+ }
+ ROCKS_LOG_INFO(options_.info_log, "Initialized BackupEngine");
+ return IOStatus::OK();
+}
+
+IOStatus BackupEngineImpl::CreateNewBackupWithMetadata(
+ const CreateBackupOptions& options, DB* db, const std::string& app_metadata,
+ BackupID* new_backup_id_ptr) {
+ assert(initialized_);
+ assert(!read_only_);
+ if (app_metadata.size() > kMaxAppMetaSize) {
+ return IOStatus::InvalidArgument("App metadata too large");
+ }
+
+ if (options.decrease_background_thread_cpu_priority) {
+ if (options.background_thread_cpu_priority < threads_cpu_priority_) {
+ threads_cpu_priority_.store(options.background_thread_cpu_priority);
+ }
+ }
+
+ BackupID new_backup_id = latest_backup_id_ + 1;
+
+ // `bytes_read` and `bytes_written` stats are enabled based on compile-time
+ // support and cannot be dynamically toggled. So we do not need to worry about
+ // `PerfLevel` here, unlike many other `IOStatsContext` / `PerfContext` stats.
+ uint64_t prev_bytes_read = IOSTATS(bytes_read);
+ uint64_t prev_bytes_written = IOSTATS(bytes_written);
+
+ assert(backups_.find(new_backup_id) == backups_.end());
+
+ auto private_dir = GetAbsolutePath(GetPrivateFileRel(new_backup_id));
+ IOStatus io_s = backup_fs_->FileExists(private_dir, io_options_, nullptr);
+ if (io_s.ok()) {
+ // Maybe the last backup failed and left partial state behind; clean it up.
+ // This needs to happen before updating backups_ so that a private dir
+ // named after new_backup_id will be cleaned up.
+ // (If an incomplete new backup is followed by an incomplete delete
+ // of the latest full backup, then there could be more than one next
+ // id with a private dir, the last thing to be deleted in delete
+ // backup, but all will be cleaned up with a GarbageCollect.)
+ io_s = GarbageCollect();
+ } else if (io_s.IsNotFound()) {
+ // normal case, the new backup's private dir doesn't exist yet
+ io_s = IOStatus::OK();
+ }
+
+ auto ret = backups_.insert(std::make_pair(
+ new_backup_id, std::unique_ptr<BackupMeta>(new BackupMeta(
+ GetBackupMetaFile(new_backup_id, false /* tmp */),
+ GetBackupMetaFile(new_backup_id, true /* tmp */),
+ &backuped_file_infos_, backup_env_, backup_fs_))));
+ assert(ret.second == true);
+ auto& new_backup = ret.first->second;
+ new_backup->RecordTimestamp();
+ new_backup->SetAppMetadata(app_metadata);
+
+ auto start_backup = backup_env_->NowMicros();
+
+ ROCKS_LOG_INFO(options_.info_log,
+ "Started the backup process -- creating backup %u",
+ new_backup_id);
+
+ if (options_.share_table_files && !options_.share_files_with_checksum) {
+ ROCKS_LOG_WARN(options_.info_log,
+ "BackupEngineOptions::share_files_with_checksum=false is "
+ "DEPRECATED and could lead to data loss.");
+ }
+
+ if (io_s.ok()) {
+ io_s = backup_fs_->CreateDir(private_dir, io_options_, nullptr);
+ }
+
+ // A set into which we will insert the dst_paths that are calculated for live
+ // files and live WAL files.
+ // This is used to check whether a live file shares a dst_path with another
+ // live file.
+ std::unordered_set<std::string> live_dst_paths;
+
+ std::vector<BackupAfterCopyOrCreateWorkItem> backup_items_to_finish;
+ // Add a CopyOrCreateWorkItem to the channel for each live file
+ Status disabled = db->DisableFileDeletions();
+ DBOptions db_options = db->GetDBOptions();
+ Statistics* stats = db_options.statistics.get();
+ if (io_s.ok()) {
+ CheckpointImpl checkpoint(db);
+ uint64_t sequence_number = 0;
+ FileChecksumGenFactory* db_checksum_factory =
+ db_options.file_checksum_gen_factory.get();
+ const std::string kFileChecksumGenFactoryName =
+ "FileChecksumGenCrc32cFactory";
+ bool compare_checksum =
+ db_checksum_factory != nullptr &&
+ db_checksum_factory->Name() == kFileChecksumGenFactoryName;
+ EnvOptions src_raw_env_options(db_options);
+ RateLimiter* rate_limiter = options_.backup_rate_limiter.get();
+ io_s = status_to_io_status(checkpoint.CreateCustomCheckpoint(
+ [&](const std::string& /*src_dirname*/, const std::string& /*fname*/,
+ FileType) {
+ // custom checkpoint will switch to calling copy_file_cb after it sees
+ // NotSupported returned from link_file_cb.
+ return IOStatus::NotSupported();
+ } /* link_file_cb */,
+ [&](const std::string& src_dirname, const std::string& fname,
+ uint64_t size_limit_bytes, FileType type,
+ const std::string& checksum_func_name,
+ const std::string& checksum_val,
+ const Temperature src_temperature) {
+ if (type == kWalFile && !options_.backup_log_files) {
+ return IOStatus::OK();
+ }
+ Log(options_.info_log, "add file for backup %s", fname.c_str());
+ uint64_t size_bytes = 0;
+ IOStatus io_st;
+ if (type == kTableFile || type == kBlobFile) {
+ io_st = db_fs_->GetFileSize(src_dirname + "/" + fname, io_options_,
+ &size_bytes, nullptr);
+ if (!io_st.ok()) {
+ Log(options_.info_log, "GetFileSize is failed: %s",
+ io_st.ToString().c_str());
+ return io_st;
+ }
+ }
+ EnvOptions src_env_options;
+ switch (type) {
+ case kWalFile:
+ src_env_options =
+ db_env_->OptimizeForLogRead(src_raw_env_options);
+ break;
+ case kTableFile:
+ src_env_options = db_env_->OptimizeForCompactionTableRead(
+ src_raw_env_options, ImmutableDBOptions(db_options));
+ break;
+ case kDescriptorFile:
+ src_env_options =
+ db_env_->OptimizeForManifestRead(src_raw_env_options);
+ break;
+ case kBlobFile:
+ src_env_options = db_env_->OptimizeForBlobFileRead(
+ src_raw_env_options, ImmutableDBOptions(db_options));
+ break;
+ default:
+ // Other backed up files (like the options file) are not read by a live
+ // DB, so we don't need to worry about avoiding mixing buffered and
+ // direct I/O. Just use plain defaults.
+ src_env_options = src_raw_env_options;
+ break;
+ }
+ io_st = AddBackupFileWorkItem(
+ live_dst_paths, backup_items_to_finish, new_backup_id,
+ options_.share_table_files &&
+ (type == kTableFile || type == kBlobFile),
+ src_dirname, fname, src_env_options, rate_limiter, type,
+ size_bytes, db_options.statistics.get(), size_limit_bytes,
+ options_.share_files_with_checksum &&
+ (type == kTableFile || type == kBlobFile),
+ options.progress_callback, "" /* contents */, checksum_func_name,
+ checksum_val, src_temperature);
+ return io_st;
+ } /* copy_file_cb */,
+ [&](const std::string& fname, const std::string& contents,
+ FileType type) {
+ Log(options_.info_log, "add file for backup %s", fname.c_str());
+ return AddBackupFileWorkItem(
+ live_dst_paths, backup_items_to_finish, new_backup_id,
+ false /* shared */, "" /* src_dir */, fname,
+ EnvOptions() /* src_env_options */, rate_limiter, type,
+ contents.size(), db_options.statistics.get(), 0 /* size_limit */,
+ false /* shared_checksum */, options.progress_callback, contents);
+ } /* create_file_cb */,
+ &sequence_number,
+ options.flush_before_backup ? 0 : std::numeric_limits<uint64_t>::max(),
+ compare_checksum));
+ if (io_s.ok()) {
+ new_backup->SetSequenceNumber(sequence_number);
+ }
+ }
+ ROCKS_LOG_INFO(options_.info_log, "add files for backup done, wait finish.");
+ IOStatus item_io_status;
+ for (auto& item : backup_items_to_finish) {
+ item.result.wait();
+ auto result = item.result.get();
+ item_io_status = result.io_status;
+ Temperature temp = result.expected_src_temperature;
+ if (result.current_src_temperature != Temperature::kUnknown &&
+ (temp == Temperature::kUnknown ||
+ options_.current_temperatures_override_manifest)) {
+ temp = result.current_src_temperature;
+ }
+ if (item_io_status.ok() && item.shared && item.needed_to_copy) {
+ item_io_status = item.backup_env->GetFileSystem()->RenameFile(
+ item.dst_path_tmp, item.dst_path, io_options_, nullptr);
+ }
+ if (item_io_status.ok()) {
+ item_io_status = new_backup.get()->AddFile(std::make_shared<FileInfo>(
+ item.dst_relative, result.size, result.checksum_hex, result.db_id,
+ result.db_session_id, temp));
+ }
+ if (!item_io_status.ok()) {
+ io_s = item_io_status;
+ }
+ }
+
+ // we copied all the files, enable file deletions
+ if (disabled.ok()) { // If we successfully disabled file deletions
+ db->EnableFileDeletions(false).PermitUncheckedError();
+ }
+ auto backup_time = backup_env_->NowMicros() - start_backup;
+
+ if (io_s.ok()) {
+ // persist the backup metadata on the disk
+ io_s = new_backup->StoreToFile(options_.sync, options_.schema_version,
+ schema_test_options_.get());
+ }
+ if (io_s.ok() && options_.sync) {
+ std::unique_ptr<FSDirectory> backup_private_directory;
+ backup_fs_
+ ->NewDirectory(GetAbsolutePath(GetPrivateFileRel(new_backup_id, false)),
+ io_options_, &backup_private_directory, nullptr)
+ .PermitUncheckedError();
+ if (backup_private_directory != nullptr) {
+ io_s = backup_private_directory->FsyncWithDirOptions(io_options_, nullptr,
+ DirFsyncOptions());
+ }
+ if (io_s.ok() && private_directory_ != nullptr) {
+ io_s = private_directory_->FsyncWithDirOptions(io_options_, nullptr,
+ DirFsyncOptions());
+ }
+ if (io_s.ok() && meta_directory_ != nullptr) {
+ io_s = meta_directory_->FsyncWithDirOptions(io_options_, nullptr,
+ DirFsyncOptions());
+ }
+ if (io_s.ok() && shared_directory_ != nullptr) {
+ io_s = shared_directory_->FsyncWithDirOptions(io_options_, nullptr,
+ DirFsyncOptions());
+ }
+ if (io_s.ok() && backup_directory_ != nullptr) {
+ io_s = backup_directory_->FsyncWithDirOptions(io_options_, nullptr,
+ DirFsyncOptions());
+ }
+ }
+
+ if (io_s.ok()) {
+ backup_statistics_.IncrementNumberSuccessBackup();
+ // here we know that we succeeded and installed the new backup
+ latest_backup_id_ = new_backup_id;
+ latest_valid_backup_id_ = new_backup_id;
+ if (new_backup_id_ptr) {
+ *new_backup_id_ptr = new_backup_id;
+ }
+ ROCKS_LOG_INFO(options_.info_log, "Backup DONE. All is good");
+
+ // backup_speed is in MB/s: bytes / 2^20 divided by seconds (backup_time is
+ // in microseconds), i.e. bytes / (1.048576 * backup_time)
+ double backup_speed = new_backup->GetSize() / (1.048576 * backup_time);
+ ROCKS_LOG_INFO(options_.info_log, "Backup number of files: %u",
+ new_backup->GetNumberFiles());
+ char human_size[16];
+ AppendHumanBytes(new_backup->GetSize(), human_size, sizeof(human_size));
+ ROCKS_LOG_INFO(options_.info_log, "Backup size: %s", human_size);
+ ROCKS_LOG_INFO(options_.info_log, "Backup time: %" PRIu64 " microseconds",
+ backup_time);
+ ROCKS_LOG_INFO(options_.info_log, "Backup speed: %.3f MB/s", backup_speed);
+ ROCKS_LOG_INFO(options_.info_log, "Backup Statistics %s",
+ backup_statistics_.ToString().c_str());
+ } else {
+ backup_statistics_.IncrementNumberFailBackup();
+ // clean all the files we might have created
+ ROCKS_LOG_INFO(options_.info_log, "Backup failed -- %s",
+ io_s.ToString().c_str());
+ ROCKS_LOG_INFO(options_.info_log, "Backup Statistics %s\n",
+ backup_statistics_.ToString().c_str());
+ // delete files that we might have already written
+ might_need_garbage_collect_ = true;
+ DeleteBackup(new_backup_id).PermitUncheckedError();
+ }
+
+ RecordTick(stats, BACKUP_READ_BYTES, IOSTATS(bytes_read) - prev_bytes_read);
+ RecordTick(stats, BACKUP_WRITE_BYTES,
+ IOSTATS(bytes_written) - prev_bytes_written);
+ return io_s;
+}
+
+IOStatus BackupEngineImpl::PurgeOldBackups(uint32_t num_backups_to_keep) {
+ assert(initialized_);
+ assert(!read_only_);
+
+ // Best effort deletion even with errors
+ IOStatus overall_status = IOStatus::OK();
+
+ ROCKS_LOG_INFO(options_.info_log, "Purging old backups, keeping %u",
+ num_backups_to_keep);
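+ // backups_ is ordered by ascending BackupID, so starting from begin()
+ // selects the oldest backups for deletion.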
+ std::vector<BackupID> to_delete;
+ auto itr = backups_.begin();
+ while ((backups_.size() - to_delete.size()) > num_backups_to_keep) {
+ to_delete.push_back(itr->first);
+ itr++;
+ }
+ for (auto backup_id : to_delete) {
+ // Do not GC until end
+ IOStatus io_s = DeleteBackupNoGC(backup_id);
+ if (!io_s.ok()) {
+ overall_status = io_s;
+ }
+ }
+ // Clean up after any incomplete backup deletion, potentially from
+ // earlier session.
+ if (might_need_garbage_collect_) {
+ IOStatus io_s = GarbageCollect();
+ if (!io_s.ok() && overall_status.ok()) {
+ overall_status = io_s;
+ }
+ }
+ return overall_status;
+}
+
+IOStatus BackupEngineImpl::DeleteBackup(BackupID backup_id) {
+ IOStatus s1 = DeleteBackupNoGC(backup_id);
+ IOStatus s2 = IOStatus::OK();
+
+ // Clean up after any incomplete backup deletion, potentially from
+ // earlier session.
+ if (might_need_garbage_collect_) {
+ s2 = GarbageCollect();
+ }
+
+ if (!s1.ok()) {
+ // Any failure in the primary objective trumps any failure in the
+ // secondary objective.
+ s2.PermitUncheckedError();
+ return s1;
+ } else {
+ return s2;
+ }
+}
+
+// Does not auto-GarbageCollect nor lock
+IOStatus BackupEngineImpl::DeleteBackupNoGC(BackupID backup_id) {
+ assert(initialized_);
+ assert(!read_only_);
+
+ ROCKS_LOG_INFO(options_.info_log, "Deleting backup %u", backup_id);
+ auto backup = backups_.find(backup_id);
+ if (backup != backups_.end()) {
+ IOStatus io_s = backup->second->Delete();
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ backups_.erase(backup);
+ } else {
+ auto corrupt = corrupt_backups_.find(backup_id);
+ if (corrupt == corrupt_backups_.end()) {
+ return IOStatus::NotFound("Backup not found");
+ }
+ IOStatus io_s = corrupt->second.second->Delete();
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ corrupt->second.first.PermitUncheckedError();
+ corrupt_backups_.erase(corrupt);
+ }
+
+ // After removing the meta file, continue with best-effort deletion even
+ // with errors. (Don't delete other files if we can't delete the meta file
+ // right now.)
+ std::vector<std::string> to_delete;
+ for (auto& itr : backuped_file_infos_) {
+ if (itr.second->refs == 0) {
+ IOStatus io_s = backup_fs_->DeleteFile(GetAbsolutePath(itr.first),
+ io_options_, nullptr);
+ ROCKS_LOG_INFO(options_.info_log, "Deleting %s -- %s", itr.first.c_str(),
+ io_s.ToString().c_str());
+ to_delete.push_back(itr.first);
+ if (!io_s.ok()) {
+ // Trying again later might work
+ might_need_garbage_collect_ = true;
+ }
+ }
+ }
+ for (auto& td : to_delete) {
+ backuped_file_infos_.erase(td);
+ }
+
+ // take care of private dirs -- GarbageCollect() will take care of them
+ // if they are not empty
+ std::string private_dir = GetPrivateFileRel(backup_id);
+ IOStatus io_s =
+ backup_fs_->DeleteDir(GetAbsolutePath(private_dir), io_options_, nullptr);
+ ROCKS_LOG_INFO(options_.info_log, "Deleting private dir %s -- %s",
+ private_dir.c_str(), io_s.ToString().c_str());
+ if (!io_s.ok()) {
+ // Full gc or trying again later might work
+ might_need_garbage_collect_ = true;
+ }
+ return IOStatus::OK();
+}
+
+void BackupEngineImpl::SetBackupInfoFromBackupMeta(
+ BackupID id, const BackupMeta& meta, BackupInfo* backup_info,
+ bool include_file_details) const {
+ *backup_info = BackupInfo(id, meta.GetTimestamp(), meta.GetSize(),
+ meta.GetNumberFiles(), meta.GetAppMetadata());
+ std::string dir =
+ options_.backup_dir + "/" + kPrivateDirSlash + std::to_string(id);
+ if (include_file_details) {
+ auto& file_details = backup_info->file_details;
+ file_details.reserve(meta.GetFiles().size());
+ for (auto& file_ptr : meta.GetFiles()) {
+ BackupFileInfo& finfo = *file_details.emplace(file_details.end());
+ finfo.relative_filename = file_ptr->filename;
+ finfo.size = file_ptr->size;
+ finfo.directory = dir;
+ uint64_t number;
+ FileType type;
+ bool ok = ParseFileName(file_ptr->filename, &number, &type);
+ if (ok) {
+ finfo.file_number = number;
+ finfo.file_type = type;
+ }
+ // TODO: temperature, file_checksum, file_checksum_func_name
+ }
+ backup_info->name_for_open = GetAbsolutePath(GetPrivateFileRel(id));
+ backup_info->name_for_open.pop_back(); // remove trailing '/'
+ backup_info->env_for_open = meta.GetEnvForOpen();
+ }
+}
+
+Status BackupEngineImpl::GetBackupInfo(BackupID backup_id,
+ BackupInfo* backup_info,
+ bool include_file_details) const {
+ assert(initialized_);
+ if (backup_id == kLatestBackupIDMarker) {
+ // Note: Read latest_valid_backup_id_ inside of lock
+ backup_id = latest_valid_backup_id_;
+ }
+ auto corrupt_itr = corrupt_backups_.find(backup_id);
+ if (corrupt_itr != corrupt_backups_.end()) {
+ return Status::Corruption(corrupt_itr->second.first.ToString());
+ }
+ auto backup_itr = backups_.find(backup_id);
+ if (backup_itr == backups_.end()) {
+ return Status::NotFound("Backup not found");
+ }
+ auto& backup = backup_itr->second;
+ if (backup->Empty()) {
+ return Status::NotFound("Backup not found");
+ }
+
+ SetBackupInfoFromBackupMeta(backup_id, *backup, backup_info,
+ include_file_details);
+ return Status::OK();
+}
+
+void BackupEngineImpl::GetBackupInfo(std::vector<BackupInfo>* backup_info,
+ bool include_file_details) const {
+ assert(initialized_);
+ backup_info->resize(backups_.size());
+ size_t i = 0;
+ for (auto& backup : backups_) {
+ const BackupMeta& meta = *backup.second;
+ if (!meta.Empty()) {
+ SetBackupInfoFromBackupMeta(backup.first, meta, &backup_info->at(i++),
+ include_file_details);
+ }
+ }
+}
+
+void BackupEngineImpl::GetCorruptedBackups(
+ std::vector<BackupID>* corrupt_backup_ids) const {
+ assert(initialized_);
+ corrupt_backup_ids->reserve(corrupt_backups_.size());
+ for (auto& backup : corrupt_backups_) {
+ corrupt_backup_ids->push_back(backup.first);
+ }
+}
+
+IOStatus BackupEngineImpl::RestoreDBFromBackup(
+ const RestoreOptions& options, BackupID backup_id,
+ const std::string& db_dir, const std::string& wal_dir) const {
+ assert(initialized_);
+ if (backup_id == kLatestBackupIDMarker) {
+ // Note: Read latest_valid_backup_id_ inside of lock
+ backup_id = latest_valid_backup_id_;
+ }
+ auto corrupt_itr = corrupt_backups_.find(backup_id);
+ if (corrupt_itr != corrupt_backups_.end()) {
+ return corrupt_itr->second.first;
+ }
+ auto backup_itr = backups_.find(backup_id);
+ if (backup_itr == backups_.end()) {
+ return IOStatus::NotFound("Backup not found");
+ }
+ auto& backup = backup_itr->second;
+ if (backup->Empty()) {
+ return IOStatus::NotFound("Backup not found");
+ }
+
+ ROCKS_LOG_INFO(options_.info_log, "Restoring backup id %u\n", backup_id);
+ ROCKS_LOG_INFO(options_.info_log, "keep_log_files: %d\n",
+ static_cast<int>(options.keep_log_files));
+
+ // just in case. Ignore errors
+ db_fs_->CreateDirIfMissing(db_dir, io_options_, nullptr)
+ .PermitUncheckedError();
+ db_fs_->CreateDirIfMissing(wal_dir, io_options_, nullptr)
+ .PermitUncheckedError();
+
+ if (options.keep_log_files) {
+ // delete files in db_dir, but keep all the log files
+ DeleteChildren(db_dir, 1 << kWalFile);
+ // move all the files from archive dir to wal_dir
+ std::string archive_dir = ArchivalDirectory(wal_dir);
+ std::vector<std::string> archive_files;
+ db_fs_->GetChildren(archive_dir, io_options_, &archive_files, nullptr)
+ .PermitUncheckedError(); // ignore errors
+ for (const auto& f : archive_files) {
+ uint64_t number;
+ FileType type;
+ bool ok = ParseFileName(f, &number, &type);
+ if (ok && type == kWalFile) {
+ ROCKS_LOG_INFO(options_.info_log,
+ "Moving log file from archive/ to wal_dir: %s",
+ f.c_str());
+ IOStatus io_s = db_fs_->RenameFile(
+ archive_dir + "/" + f, wal_dir + "/" + f, io_options_, nullptr);
+ if (!io_s.ok()) {
+ // if we can't move log file from archive_dir to wal_dir,
+ // we should fail, since it might mean data loss
+ return io_s;
+ }
+ }
+ }
+ } else {
+ DeleteChildren(wal_dir);
+ DeleteChildren(ArchivalDirectory(wal_dir));
+ DeleteChildren(db_dir);
+ }
+
+ IOStatus io_s;
+ std::vector<RestoreAfterCopyOrCreateWorkItem> restore_items_to_finish;
+ std::string temporary_current_file;
+ std::string final_current_file;
+ std::unique_ptr<FSDirectory> db_dir_for_fsync;
+ std::unique_ptr<FSDirectory> wal_dir_for_fsync;
+
+ for (const auto& file_info : backup->GetFiles()) {
+ const std::string& file = file_info->filename;
+ // 1. get DB filename
+ std::string dst = file_info->GetDbFileName();
+
+ // 2. find the filetype
+ uint64_t number;
+ FileType type;
+ bool ok = ParseFileName(dst, &number, &type);
+ if (!ok) {
+ return IOStatus::Corruption("Backup corrupted: Fail to parse filename " +
+ dst);
+ }
+ // 3. Construct the final path
+ // kWalFile lives in wal_dir and all the rest live in db_dir
+ if (type == kWalFile) {
+ dst = wal_dir + "/" + dst;
+ if (options_.sync && !wal_dir_for_fsync) {
+ io_s = db_fs_->NewDirectory(wal_dir, io_options_, &wal_dir_for_fsync,
+ nullptr);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ }
+ } else {
+ dst = db_dir + "/" + dst;
+ if (options_.sync && !db_dir_for_fsync) {
+ io_s = db_fs_->NewDirectory(db_dir, io_options_, &db_dir_for_fsync,
+ nullptr);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ }
+ }
+ // For atomicity, initially restore CURRENT file to a temporary name.
+ // This is useful even without options_.sync e.g. in case the restore
+ // process is interrupted.
+ if (type == kCurrentFile) {
+ final_current_file = dst;
+ dst = temporary_current_file = dst + ".tmp";
+ }
+
+ ROCKS_LOG_INFO(options_.info_log, "Restoring %s to %s\n", file.c_str(),
+ dst.c_str());
+ CopyOrCreateWorkItem copy_or_create_work_item(
+ GetAbsolutePath(file), dst, Temperature::kUnknown /* src_temp */,
+ file_info->temp, "" /* contents */, backup_env_, db_env_,
+ EnvOptions() /* src_env_options */, options_.sync,
+ options_.restore_rate_limiter.get(), file_info->size,
+ nullptr /* stats */);
+ RestoreAfterCopyOrCreateWorkItem after_copy_or_create_work_item(
+ copy_or_create_work_item.result.get_future(), file, dst,
+ file_info->checksum_hex);
+ files_to_copy_or_create_.write(std::move(copy_or_create_work_item));
+ restore_items_to_finish.push_back(
+ std::move(after_copy_or_create_work_item));
+ }
+ IOStatus item_io_status;
+ for (auto& item : restore_items_to_finish) {
+ item.result.wait();
+ auto result = item.result.get();
+ item_io_status = result.io_status;
+ // Note: It is possible that both of the following bad-status cases occur
+ // during copying. But, we only return one status.
+ if (!item_io_status.ok()) {
+ io_s = item_io_status;
+ break;
+ } else if (!item.checksum_hex.empty() &&
+ item.checksum_hex != result.checksum_hex) {
+ io_s = IOStatus::Corruption(
+ "While restoring " + item.from_file + " -> " + item.to_file +
+ ": expected checksum is " + item.checksum_hex +
+ " while computed checksum is " + result.checksum_hex);
+ break;
+ }
+ }
+
+ // When enabled, the first FsyncWithDirOptions is to ensure all files are
+ // fully persisted before renaming CURRENT.tmp
+ if (io_s.ok() && db_dir_for_fsync) {
+ ROCKS_LOG_INFO(options_.info_log, "Restore: fsync\n");
+ io_s = db_dir_for_fsync->FsyncWithDirOptions(io_options_, nullptr,
+ DirFsyncOptions());
+ }
+
+ if (io_s.ok() && wal_dir_for_fsync) {
+ io_s = wal_dir_for_fsync->FsyncWithDirOptions(io_options_, nullptr,
+ DirFsyncOptions());
+ }
+
+ if (io_s.ok() && !temporary_current_file.empty()) {
+ ROCKS_LOG_INFO(options_.info_log, "Restore: atomic rename CURRENT.tmp\n");
+ assert(!final_current_file.empty());
+ io_s = db_fs_->RenameFile(temporary_current_file, final_current_file,
+ io_options_, nullptr);
+ }
+
+ if (io_s.ok() && db_dir_for_fsync && !temporary_current_file.empty()) {
+ // Second FsyncWithDirOptions is to ensure the final atomic rename of DB
+ // restore is fully persisted even if power goes out right after restore
+ // operation returns success
+ assert(db_dir_for_fsync);
+ io_s = db_dir_for_fsync->FsyncWithDirOptions(
+ io_options_, nullptr, DirFsyncOptions(final_current_file));
+ }
+
+ ROCKS_LOG_INFO(options_.info_log, "Restoring done -- %s\n",
+ io_s.ToString().c_str());
+ return io_s;
+}
+
+IOStatus BackupEngineImpl::VerifyBackup(BackupID backup_id,
+ bool verify_with_checksum) const {
+ assert(initialized_);
+ // Check if backup_id is corrupted, or valid and registered
+ auto corrupt_itr = corrupt_backups_.find(backup_id);
+ if (corrupt_itr != corrupt_backups_.end()) {
+ return corrupt_itr->second.first;
+ }
+
+ auto backup_itr = backups_.find(backup_id);
+ if (backup_itr == backups_.end()) {
+ return IOStatus::NotFound();
+ }
+
+ auto& backup = backup_itr->second;
+ if (backup->Empty()) {
+ return IOStatus::NotFound();
+ }
+
+ ROCKS_LOG_INFO(options_.info_log, "Verifying backup id %u\n", backup_id);
+
+ // Find all existing backup files belonging to backup_id
+ std::unordered_map<std::string, uint64_t> curr_abs_path_to_size;
+ for (const auto& rel_dir : {GetPrivateFileRel(backup_id), GetSharedFileRel(),
+ GetSharedFileWithChecksumRel()}) {
+ const auto abs_dir = GetAbsolutePath(rel_dir);
+ // Shared directories are allowed to be missing in some cases. Expected but
+ // missing files will be reported a few lines down.
+ ReadChildFileCurrentSizes(abs_dir, backup_fs_, &curr_abs_path_to_size)
+ .PermitUncheckedError();
+ }
+
+ // For all files registered in backup
+ for (const auto& file_info : backup->GetFiles()) {
+ const auto abs_path = GetAbsolutePath(file_info->filename);
+ // check existence of the file
+ if (curr_abs_path_to_size.find(abs_path) == curr_abs_path_to_size.end()) {
+ return IOStatus::NotFound("File missing: " + abs_path);
+ }
+ // verify file size
+ if (file_info->size != curr_abs_path_to_size[abs_path]) {
+ std::string size_info("Expected file size is " +
+ std::to_string(file_info->size) +
+ " while found file size is " +
+ std::to_string(curr_abs_path_to_size[abs_path]));
+ return IOStatus::Corruption("File corrupted: File size mismatch for " +
+ abs_path + ": " + size_info);
+ }
+ if (verify_with_checksum && !file_info->checksum_hex.empty()) {
+ // verify file checksum
+ std::string checksum_hex;
+ ROCKS_LOG_INFO(options_.info_log, "Verifying %s checksum...\n",
+ abs_path.c_str());
+ IOStatus io_s = ReadFileAndComputeChecksum(
+ abs_path, backup_fs_, EnvOptions(), 0 /* size_limit */, &checksum_hex,
+ Temperature::kUnknown);
+ if (!io_s.ok()) {
+ return io_s;
+ } else if (file_info->checksum_hex != checksum_hex) {
+ std::string checksum_info(
+ "Expected checksum is " + file_info->checksum_hex +
+ " while computed checksum is " + checksum_hex);
+ return IOStatus::Corruption("File corrupted: Checksum mismatch for " +
+ abs_path + ": " + checksum_info);
+ }
+ }
+ }
+ return IOStatus::OK();
+}
+
+IOStatus BackupEngineImpl::CopyOrCreateFile(
+ const std::string& src, const std::string& dst, const std::string& contents,
+ uint64_t size_limit, Env* src_env, Env* dst_env,
+ const EnvOptions& src_env_options, bool sync, RateLimiter* rate_limiter,
+ std::function<void()> progress_callback, Temperature* src_temperature,
+ Temperature dst_temperature, uint64_t* bytes_toward_next_callback,
+ uint64_t* size, std::string* checksum_hex) {
+ assert(src.empty() != contents.empty());
+ IOStatus io_s;
+ std::unique_ptr<FSWritableFile> dst_file;
+ std::unique_ptr<FSSequentialFile> src_file;
+ FileOptions dst_file_options;
+ dst_file_options.use_mmap_writes = false;
+ dst_file_options.temperature = dst_temperature;
+ // TODO:(gzh) maybe use direct reads/writes here if possible
+ if (size != nullptr) {
+ *size = 0;
+ }
+ uint32_t checksum_value = 0;
+
+ // Check if a size limit is set; if not, set it to the maximum value so the
+ // whole file is copied
+ if (size_limit == 0) {
+ size_limit = std::numeric_limits<uint64_t>::max();
+ }
+
+ io_s = dst_env->GetFileSystem()->NewWritableFile(dst, dst_file_options,
+ &dst_file, nullptr);
+ if (io_s.ok() && !src.empty()) {
+ auto src_file_options = FileOptions(src_env_options);
+ src_file_options.temperature = *src_temperature;
+ io_s = src_env->GetFileSystem()->NewSequentialFile(src, src_file_options,
+ &src_file, nullptr);
+ }
+ if (io_s.IsPathNotFound() && *src_temperature != Temperature::kUnknown) {
+ // Retry without temperature hint in case the FileSystem is strict with
+ // non-kUnknown temperature option
+ io_s = src_env->GetFileSystem()->NewSequentialFile(
+ src, FileOptions(src_env_options), &src_file, nullptr);
+ }
+ if (!io_s.ok()) {
+ return io_s;
+ }
+
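+ // Size the copy buffer to the rate limiter's single-burst size (when a rate
+ // limiter is configured) so each read below can be satisfied within one
+ // burst; otherwise fall back to the 5MB default.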
+ size_t buf_size =
+ rate_limiter ? static_cast<size_t>(rate_limiter->GetSingleBurstBytes())
+ : kDefaultCopyFileBufferSize;
+
+ std::unique_ptr<WritableFileWriter> dest_writer(
+ new WritableFileWriter(std::move(dst_file), dst, dst_file_options));
+ std::unique_ptr<SequentialFileReader> src_reader;
+ std::unique_ptr<char[]> buf;
+ if (!src.empty()) {
+ // Report back the current temperature observed by the FileSystem
+ *src_temperature = src_file->GetTemperature();
+
+ src_reader.reset(new SequentialFileReader(
+ std::move(src_file), src, nullptr /* io_tracer */, {}, rate_limiter));
+ buf.reset(new char[buf_size]);
+ }
+
+ Slice data;
+ do {
+ if (stop_backup_.load(std::memory_order_acquire)) {
+ return status_to_io_status(Status::Incomplete("Backup stopped"));
+ }
+ if (!src.empty()) {
+ size_t buffer_to_read =
+ (buf_size < size_limit) ? buf_size : static_cast<size_t>(size_limit);
+ io_s = src_reader->Read(buffer_to_read, &data, buf.get(),
+ Env::IO_LOW /* rate_limiter_priority */);
+ *bytes_toward_next_callback += data.size();
+ } else {
+ data = contents;
+ }
+ size_limit -= data.size();
+ TEST_SYNC_POINT_CALLBACK(
+ "BackupEngineImpl::CopyOrCreateFile:CorruptionDuringBackup",
+ (src.length() > 4 && src.rfind(".sst") == src.length() - 4) ? &data
+ : nullptr);
+
+ if (!io_s.ok()) {
+ return io_s;
+ }
+
+ if (size != nullptr) {
+ *size += data.size();
+ }
+ if (checksum_hex != nullptr) {
+ checksum_value = crc32c::Extend(checksum_value, data.data(), data.size());
+ }
+ io_s = dest_writer->Append(data);
+
+ if (rate_limiter != nullptr) {
+ if (!src.empty()) {
+ rate_limiter->Request(data.size(), Env::IO_LOW, nullptr /* stats */,
+ RateLimiter::OpType::kWrite);
+ } else {
+ LoopRateLimitRequestHelper(data.size(), rate_limiter, Env::IO_LOW,
+ nullptr /* stats */,
+ RateLimiter::OpType::kWrite);
+ }
+ }
+ while (*bytes_toward_next_callback >=
+ options_.callback_trigger_interval_size) {
+ *bytes_toward_next_callback -= options_.callback_trigger_interval_size;
+ std::lock_guard<std::mutex> lock(byte_report_mutex_);
+ progress_callback();
+ }
+ } while (io_s.ok() && contents.empty() && data.size() > 0 && size_limit > 0);
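+ // Note: when creating from `contents` (src is empty), the loop body above
+ // runs exactly once because the loop condition requires contents.empty().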
+
+ // Convert uint32_t checksum to hex checksum
+ if (checksum_hex != nullptr) {
+ checksum_hex->assign(ChecksumInt32ToHex(checksum_value));
+ }
+
+ if (io_s.ok() && sync) {
+ io_s = dest_writer->Sync(false);
+ }
+ if (io_s.ok()) {
+ io_s = dest_writer->Close();
+ }
+ return io_s;
+}
+
+// fname will always start with "/"
+IOStatus BackupEngineImpl::AddBackupFileWorkItem(
+ std::unordered_set<std::string>& live_dst_paths,
+ std::vector<BackupAfterCopyOrCreateWorkItem>& backup_items_to_finish,
+ BackupID backup_id, bool shared, const std::string& src_dir,
+ const std::string& fname, const EnvOptions& src_env_options,
+ RateLimiter* rate_limiter, FileType file_type, uint64_t size_bytes,
+ Statistics* stats, uint64_t size_limit, bool shared_checksum,
+ std::function<void()> progress_callback, const std::string& contents,
+ const std::string& src_checksum_func_name,
+ const std::string& src_checksum_str, const Temperature src_temperature) {
+ assert(contents.empty() != src_dir.empty());
+
+ std::string src_path = src_dir + "/" + fname;
+ std::string dst_relative;
+ std::string dst_relative_tmp;
+ std::string db_id;
+ std::string db_session_id;
+ // crc32c checksum in hex. empty == unavailable / unknown
+ std::string checksum_hex;
+
+ // Whenever a default checksum function name is passed in, we will compare
+ // the corresponding checksum values after copying. Note that only table and
+ // blob files may have a known checksum function name passed in.
+ //
+ // If no default checksum function name is passed in and the db session id is
+ // not available, we will calculate the checksum *before* copying in two cases
+ // (we always calculate checksums when copying or creating, for any file type):
+ // a) share_files_with_checksum is true and file type is table;
+ // b) share_table_files is true and the file exists already.
+ //
+ // Step 0: Check if default checksum function name is passed in
+ if (kDbFileChecksumFuncName == src_checksum_func_name) {
+ if (src_checksum_str == kUnknownFileChecksum) {
+ return status_to_io_status(
+ Status::Aborted("Unknown checksum value for " + fname));
+ }
+ checksum_hex = ChecksumStrToHex(src_checksum_str);
+ }
+
+ // Step 1: Prepare the relative path to destination
+ if (shared && shared_checksum) {
+ if (GetNamingNoFlags() != BackupEngineOptions::kLegacyCrc32cAndFileSize &&
+ file_type != kBlobFile) {
+ // Prepare db_session_id to add to the file name
+ // Ignore the returned status
+ // In the failed cases, db_id and db_session_id will be empty
+ GetFileDbIdentities(db_env_, src_env_options, src_path, src_temperature,
+ rate_limiter, &db_id, &db_session_id)
+ .PermitUncheckedError();
+ }
+ // Calculate checksum if checksum and db session id are not available.
+ // If db session id is available, we will not calculate the checksum
+ // since the session id should suffice to avoid file name collision in
+ // the shared_checksum directory.
+ if (checksum_hex.empty() && db_session_id.empty()) {
+ IOStatus io_s = ReadFileAndComputeChecksum(
+ src_path, db_fs_, src_env_options, size_limit, &checksum_hex,
+ src_temperature);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ }
+ if (size_bytes == std::numeric_limits<uint64_t>::max()) {
+ return IOStatus::NotFound("File missing: " + src_path);
+ }
+ // dst_relative depends on the following conditions:
+ // 1) the naming scheme is kUseDbSessionId,
+ // 2) db_session_id is not empty,
+ // 3) checksum is available in the DB manifest.
+ // If 1,2,3) are satisfied, then dst_relative will be of the form:
+ // shared_checksum/<file_number>_<checksum>_<db_session_id>.sst
+ // If 1,2) are satisfied, then dst_relative will be of the form:
+ // shared_checksum/<file_number>_<db_session_id>.sst
+ // Otherwise, dst_relative is of the form
+ // shared_checksum/<file_number>_<checksum>_<size>.sst
+ //
+    // For blob files, db_session_id is not supported by the blob file format,
+    // so they use the original/legacy naming scheme.
+ // dst_relative will be of the form:
+ // shared_checksum/<file_number>_<checksum>_<size>.blob
+ dst_relative = GetSharedFileWithChecksum(fname, checksum_hex, size_bytes,
+ db_session_id);
+ dst_relative_tmp = GetSharedFileWithChecksumRel(dst_relative, true);
+ dst_relative = GetSharedFileWithChecksumRel(dst_relative, false);
+ } else if (shared) {
+ dst_relative_tmp = GetSharedFileRel(fname, true);
+ dst_relative = GetSharedFileRel(fname, false);
+ } else {
+ dst_relative = GetPrivateFileRel(backup_id, false, fname);
+ }
+
+ // We copy into `temp_dest_path` and, once finished, rename it to
+ // `final_dest_path`. This allows files to atomically appear at
+ // `final_dest_path`. We can copy directly to the final path when atomicity
+ // is unnecessary, like for files in private backup directories.
+ const std::string* copy_dest_path;
+ std::string temp_dest_path;
+ std::string final_dest_path = GetAbsolutePath(dst_relative);
+ if (!dst_relative_tmp.empty()) {
+ temp_dest_path = GetAbsolutePath(dst_relative_tmp);
+ copy_dest_path = &temp_dest_path;
+ } else {
+ copy_dest_path = &final_dest_path;
+ }
+
+ // Step 2: Determine whether to copy or not
+ // if it's shared, we also need to check if it exists -- if it does, no need
+ // to copy it again.
+ bool need_to_copy = true;
+ // true if final_dest_path is the same path as another live file
+ const bool same_path =
+ live_dst_paths.find(final_dest_path) != live_dst_paths.end();
+
+ bool file_exists = false;
+ if (shared && !same_path) {
+ // Should be in shared directory but not a live path, check existence in
+ // shared directory
+ IOStatus exist =
+ backup_fs_->FileExists(final_dest_path, io_options_, nullptr);
+ if (exist.ok()) {
+ file_exists = true;
+ } else if (exist.IsNotFound()) {
+ file_exists = false;
+ } else {
+ return exist;
+ }
+ }
+
+ if (!contents.empty()) {
+ need_to_copy = false;
+ } else if (shared && (same_path || file_exists)) {
+ need_to_copy = false;
+ auto find_result = backuped_file_infos_.find(dst_relative);
+ if (find_result == backuped_file_infos_.end() && !same_path) {
+ // file exists but not referenced
+ ROCKS_LOG_INFO(
+ options_.info_log,
+ "%s already present, but not referenced by any backup. We will "
+ "overwrite the file.",
+ fname.c_str());
+ need_to_copy = true;
+ // Defer any failure reporting to when we try to write the file
+ backup_fs_->DeleteFile(final_dest_path, io_options_, nullptr)
+ .PermitUncheckedError();
+ } else {
+ // file exists and referenced
+ if (checksum_hex.empty()) {
+        // same_path should not happen for a standard DB, so it is OK to
+        // read file contents to check for a checksum mismatch between two
+        // files from the same DB that got the same name.
+        // For compatibility with a future meta file format that might not
+        // have a crc32c checksum available, allow for it being empty, even
+        // though we don't currently generate meta files without crc32c
+        // checksums. Therefore we have to read & compute the checksum if we
+        // don't have it.
+ if (!same_path && !find_result->second->checksum_hex.empty()) {
+ assert(find_result != backuped_file_infos_.end());
+ // Note: to save I/O on incremental backups, we copy prior known
+ // checksum of the file instead of reading entire file contents
+ // to recompute it.
+ checksum_hex = find_result->second->checksum_hex;
+ // Regarding corruption detection, consider:
+ // (a) the DB file is corrupt (since previous backup) and the backup
+ // file is OK: we failed to detect, but the backup is safe. DB can
+ // be repaired/restored once its corruption is detected.
+ // (b) the backup file is corrupt (since previous backup) and the
+ // db file is OK: we failed to detect, but the backup is corrupt.
+ // CreateNewBackup should support fast incremental backups and
+ // there's no way to support that without reading all the files.
+ // We might add an option for extra checks on incremental backup,
+ // but until then, use VerifyBackups to check existing backup data.
+ // (c) file name collision with legitimately different content.
+ // This is almost inconceivable with a well-generated DB session
+ // ID, but even in that case, we double check the file sizes in
+ // BackupMeta::AddFile.
+ } else {
+ IOStatus io_s = ReadFileAndComputeChecksum(
+ src_path, db_fs_, src_env_options, size_limit, &checksum_hex,
+ src_temperature);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ }
+ }
+ if (!db_session_id.empty()) {
+ ROCKS_LOG_INFO(options_.info_log,
+ "%s already present, with checksum %s, size %" PRIu64
+ " and DB session identity %s",
+ fname.c_str(), checksum_hex.c_str(), size_bytes,
+ db_session_id.c_str());
+ } else {
+ ROCKS_LOG_INFO(options_.info_log,
+ "%s already present, with checksum %s and size %" PRIu64,
+ fname.c_str(), checksum_hex.c_str(), size_bytes);
+ }
+ }
+ }
+ live_dst_paths.insert(final_dest_path);
+
+ // Step 3: Add work item
+ if (!contents.empty() || need_to_copy) {
+ ROCKS_LOG_INFO(options_.info_log, "Copying %s to %s", fname.c_str(),
+ copy_dest_path->c_str());
+ CopyOrCreateWorkItem copy_or_create_work_item(
+ src_dir.empty() ? "" : src_path, *copy_dest_path, src_temperature,
+ Temperature::kUnknown /*dst_temp*/, contents, db_env_, backup_env_,
+ src_env_options, options_.sync, rate_limiter, size_limit, stats,
+ progress_callback, src_checksum_func_name, checksum_hex, db_id,
+ db_session_id);
+ BackupAfterCopyOrCreateWorkItem after_copy_or_create_work_item(
+ copy_or_create_work_item.result.get_future(), shared, need_to_copy,
+ backup_env_, temp_dest_path, final_dest_path, dst_relative);
+ files_to_copy_or_create_.write(std::move(copy_or_create_work_item));
+ backup_items_to_finish.push_back(std::move(after_copy_or_create_work_item));
+ } else {
+ std::promise<CopyOrCreateResult> promise_result;
+ BackupAfterCopyOrCreateWorkItem after_copy_or_create_work_item(
+ promise_result.get_future(), shared, need_to_copy, backup_env_,
+ temp_dest_path, final_dest_path, dst_relative);
+ backup_items_to_finish.push_back(std::move(after_copy_or_create_work_item));
+ CopyOrCreateResult result;
+ result.io_status = IOStatus::OK();
+ result.size = size_bytes;
+ result.checksum_hex = std::move(checksum_hex);
+ result.db_id = std::move(db_id);
+ result.db_session_id = std::move(db_session_id);
+ promise_result.set_value(std::move(result));
+ }
+ return IOStatus::OK();
+}
+
+IOStatus BackupEngineImpl::ReadFileAndComputeChecksum(
+ const std::string& src, const std::shared_ptr<FileSystem>& src_fs,
+ const EnvOptions& src_env_options, uint64_t size_limit,
+ std::string* checksum_hex, const Temperature src_temperature) const {
+ if (checksum_hex == nullptr) {
+ return status_to_io_status(Status::Aborted("Checksum pointer is null"));
+ }
+ uint32_t checksum_value = 0;
+ if (size_limit == 0) {
+ size_limit = std::numeric_limits<uint64_t>::max();
+ }
+
+ std::unique_ptr<SequentialFileReader> src_reader;
+ auto file_options = FileOptions(src_env_options);
+ file_options.temperature = src_temperature;
+ RateLimiter* rate_limiter = options_.backup_rate_limiter.get();
+ IOStatus io_s = SequentialFileReader::Create(
+ src_fs, src, file_options, &src_reader, nullptr /* dbg */, rate_limiter);
+ if (io_s.IsPathNotFound() && src_temperature != Temperature::kUnknown) {
+ // Retry without temperature hint in case the FileSystem is strict with
+ // non-kUnknown temperature option
+ file_options.temperature = Temperature::kUnknown;
+ io_s = SequentialFileReader::Create(src_fs, src, file_options, &src_reader,
+ nullptr /* dbg */, rate_limiter);
+ }
+ if (!io_s.ok()) {
+ return io_s;
+ }
+
+ size_t buf_size = kDefaultCopyFileBufferSize;
+ std::unique_ptr<char[]> buf(new char[buf_size]);
+ Slice data;
+
+ do {
+ if (stop_backup_.load(std::memory_order_acquire)) {
+ return status_to_io_status(Status::Incomplete("Backup stopped"));
+ }
+ size_t buffer_to_read =
+ (buf_size < size_limit) ? buf_size : static_cast<size_t>(size_limit);
+ io_s = src_reader->Read(buffer_to_read, &data, buf.get(),
+ Env::IO_LOW /* rate_limiter_priority */);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+
+ size_limit -= data.size();
+ checksum_value = crc32c::Extend(checksum_value, data.data(), data.size());
+ } while (data.size() > 0 && size_limit > 0);
+
+ checksum_hex->assign(ChecksumInt32ToHex(checksum_value));
+
+ return io_s;
+}
+
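+// Best-effort extraction of the DB id and DB session id from the table
+// properties of the file at `file_path`. On success the values are stored in
+// `db_id` / `db_session_id` (either pointer may be null if not wanted);
+// returns non-OK if the properties cannot be read or the requested session
+// id is missing.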
+Status BackupEngineImpl::GetFileDbIdentities(
+ Env* src_env, const EnvOptions& src_env_options,
+ const std::string& file_path, Temperature file_temp,
+ RateLimiter* rate_limiter, std::string* db_id, std::string* db_session_id) {
+ assert(db_id != nullptr || db_session_id != nullptr);
+
+ Options options;
+ options.env = src_env;
+ SstFileDumper sst_reader(options, file_path, file_temp,
+                           2 * 1024 * 1024 /* readahead_size */,
+ false /* verify_checksum */, false /* output_hex */,
+ false /* decode_blob_index */, src_env_options,
+ true /* silent */);
+
+ const TableProperties* table_properties = nullptr;
+ std::shared_ptr<const TableProperties> tp;
+ Status s = sst_reader.getStatus();
+
+ if (s.ok()) {
+ // Try to get table properties from the table reader of sst_reader
+ if (!sst_reader.ReadTableProperties(&tp).ok()) {
+      // Try to use table properties from the initialization of sst_reader
+ table_properties = sst_reader.GetInitTableProperties();
+ } else {
+ table_properties = tp.get();
+ if (table_properties != nullptr && rate_limiter != nullptr) {
+        // sizeof(*table_properties) is a sufficient but far-from-exact
+ // approximation of read bytes due to metaindex block, std::string
+ // properties and varint compression
+ LoopRateLimitRequestHelper(sizeof(*table_properties), rate_limiter,
+ Env::IO_LOW, nullptr /* stats */,
+ RateLimiter::OpType::kRead);
+ }
+ }
+ } else {
+ ROCKS_LOG_INFO(options_.info_log, "Failed to read %s: %s",
+ file_path.c_str(), s.ToString().c_str());
+ return s;
+ }
+
+ if (table_properties != nullptr) {
+ if (db_id != nullptr) {
+ db_id->assign(table_properties->db_id);
+ }
+ if (db_session_id != nullptr) {
+ db_session_id->assign(table_properties->db_session_id);
+ if (db_session_id->empty()) {
+ s = Status::NotFound("DB session identity not found in " + file_path);
+ ROCKS_LOG_INFO(options_.info_log, "%s", s.ToString().c_str());
+ return s;
+ }
+ }
+ return Status::OK();
+ } else {
+ s = Status::Corruption("Table properties missing in " + file_path);
+ ROCKS_LOG_INFO(options_.info_log, "%s", s.ToString().c_str());
+ return s;
+ }
+}
+
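+// Issues rate limiter requests in chunks no larger than the limiter's single
+// burst size until `total_bytes_to_request` has been requested. For example
+// (illustrative numbers only): with a single-burst size of 1 MB and a total
+// of 2.5 MB to request, this issues requests of 1 MB, 1 MB, and 0.5 MB.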
+void BackupEngineImpl::LoopRateLimitRequestHelper(
+ const size_t total_bytes_to_request, RateLimiter* rate_limiter,
+ const Env::IOPriority pri, Statistics* stats,
+ const RateLimiter::OpType op_type) {
+ assert(rate_limiter != nullptr);
+ size_t remaining_bytes = total_bytes_to_request;
+ size_t request_bytes = 0;
+ while (remaining_bytes > 0) {
+ request_bytes =
+ std::min(static_cast<size_t>(rate_limiter->GetSingleBurstBytes()),
+ remaining_bytes);
+ rate_limiter->Request(request_bytes, pri, stats, op_type);
+ remaining_bytes -= request_bytes;
+ }
+}
+
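+// Best-effort deletion of the children of `dir`, skipping any file whose
+// FileType bit is set in `file_type_filter`; listing and deletion errors are
+// ignored.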
+void BackupEngineImpl::DeleteChildren(const std::string& dir,
+ uint32_t file_type_filter) const {
+ std::vector<std::string> children;
+ db_fs_->GetChildren(dir, io_options_, &children, nullptr)
+ .PermitUncheckedError(); // ignore errors
+
+ for (const auto& f : children) {
+ uint64_t number;
+ FileType type;
+ bool ok = ParseFileName(f, &number, &type);
+ if (ok && (file_type_filter & (1 << type))) {
+ // don't delete this file
+ continue;
+ }
+ db_fs_->DeleteFile(dir + "/" + f, io_options_, nullptr)
+ .PermitUncheckedError(); // ignore errors
+ }
+}
+
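+// Fills `result` with a mapping from each child's path (dir + "/" + name) to
+// its current size in bytes. A missing `dir` is treated as success with no
+// entries inserted.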
+IOStatus BackupEngineImpl::ReadChildFileCurrentSizes(
+ const std::string& dir, const std::shared_ptr<FileSystem>& fs,
+ std::unordered_map<std::string, uint64_t>* result) const {
+ assert(result != nullptr);
+ std::vector<Env::FileAttributes> files_attrs;
+ IOStatus io_status = fs->FileExists(dir, io_options_, nullptr);
+ if (io_status.ok()) {
+ io_status =
+ fs->GetChildrenFileAttributes(dir, io_options_, &files_attrs, nullptr);
+ } else if (io_status.IsNotFound()) {
+    // Inserting no entries can be considered a success
+ io_status = IOStatus::OK();
+ }
+ const bool slash_needed = dir.empty() || dir.back() != '/';
+ for (const auto& file_attrs : files_attrs) {
+ result->emplace(dir + (slash_needed ? "/" : "") + file_attrs.name,
+ file_attrs.size_bytes);
+ }
+ return io_status;
+}
+
+IOStatus BackupEngineImpl::GarbageCollect() {
+ assert(!read_only_);
+
+ // We will make a best effort to remove all garbage even in the presence
+ // of inconsistencies or I/O failures that inhibit finding garbage.
+ IOStatus overall_status = IOStatus::OK();
+ // If all goes well, we don't need another auto-GC this session
+ might_need_garbage_collect_ = false;
+
+ ROCKS_LOG_INFO(options_.info_log, "Starting garbage collection");
+
+ // delete obsolete shared files
+ for (bool with_checksum : {false, true}) {
+ std::vector<std::string> shared_children;
+ {
+ std::string shared_path;
+ if (with_checksum) {
+ shared_path = GetAbsolutePath(GetSharedFileWithChecksumRel());
+ } else {
+ shared_path = GetAbsolutePath(GetSharedFileRel());
+ }
+ IOStatus io_s = backup_fs_->FileExists(shared_path, io_options_, nullptr);
+ if (io_s.ok()) {
+ io_s = backup_fs_->GetChildren(shared_path, io_options_,
+ &shared_children, nullptr);
+ } else if (io_s.IsNotFound()) {
+ io_s = IOStatus::OK();
+ }
+ if (!io_s.ok()) {
+ overall_status = io_s;
+ // Trying again later might work
+ might_need_garbage_collect_ = true;
+ }
+ }
+ for (auto& child : shared_children) {
+ std::string rel_fname;
+ if (with_checksum) {
+ rel_fname = GetSharedFileWithChecksumRel(child);
+ } else {
+ rel_fname = GetSharedFileRel(child);
+ }
+ auto child_itr = backuped_file_infos_.find(rel_fname);
+ // if it's not refcounted, delete it
+ if (child_itr == backuped_file_infos_.end() ||
+ child_itr->second->refs == 0) {
+ // this might be a directory, but DeleteFile will just fail in that
+ // case, so we're good
+ IOStatus io_s = backup_fs_->DeleteFile(GetAbsolutePath(rel_fname),
+ io_options_, nullptr);
+ ROCKS_LOG_INFO(options_.info_log, "Deleting %s -- %s",
+ rel_fname.c_str(), io_s.ToString().c_str());
+ backuped_file_infos_.erase(rel_fname);
+ if (!io_s.ok()) {
+ // Trying again later might work
+ might_need_garbage_collect_ = true;
+ }
+ }
+ }
+ }
+
+ // delete obsolete private files
+ std::vector<std::string> private_children;
+ {
+ IOStatus io_s =
+ backup_fs_->GetChildren(GetAbsolutePath(kPrivateDirName), io_options_,
+ &private_children, nullptr);
+ if (!io_s.ok()) {
+ overall_status = io_s;
+ // Trying again later might work
+ might_need_garbage_collect_ = true;
+ }
+ }
+ for (auto& child : private_children) {
+ BackupID backup_id = 0;
+ bool tmp_dir = child.find(".tmp") != std::string::npos;
+ sscanf(child.c_str(), "%u", &backup_id);
+ if (!tmp_dir && // if it's tmp_dir, delete it
+ (backup_id == 0 || backups_.find(backup_id) != backups_.end())) {
+ // it's either not a number or it's still alive. continue
+ continue;
+ }
+ // here we have to delete the dir and all its children
+ std::string full_private_path =
+ GetAbsolutePath(GetPrivateFileRel(backup_id));
+ std::vector<std::string> subchildren;
+ if (backup_fs_
+ ->GetChildren(full_private_path, io_options_, &subchildren, nullptr)
+ .ok()) {
+ for (auto& subchild : subchildren) {
+ IOStatus io_s = backup_fs_->DeleteFile(full_private_path + subchild,
+ io_options_, nullptr);
+ ROCKS_LOG_INFO(options_.info_log, "Deleting %s -- %s",
+ (full_private_path + subchild).c_str(),
+ io_s.ToString().c_str());
+ if (!io_s.ok()) {
+ // Trying again later might work
+ might_need_garbage_collect_ = true;
+ }
+ }
+ }
+ // finally delete the private dir
+ IOStatus io_s =
+ backup_fs_->DeleteDir(full_private_path, io_options_, nullptr);
+ ROCKS_LOG_INFO(options_.info_log, "Deleting dir %s -- %s",
+ full_private_path.c_str(), io_s.ToString().c_str());
+ if (!io_s.ok()) {
+ // Trying again later might work
+ might_need_garbage_collect_ = true;
+ }
+ }
+
+ assert(overall_status.ok() || might_need_garbage_collect_);
+ return overall_status;
+}
+
+// ------- BackupMeta class --------
+
+IOStatus BackupEngineImpl::BackupMeta::AddFile(
+ std::shared_ptr<FileInfo> file_info) {
+ auto itr = file_infos_->find(file_info->filename);
+ if (itr == file_infos_->end()) {
+ auto ret = file_infos_->insert({file_info->filename, file_info});
+ if (ret.second) {
+ itr = ret.first;
+ itr->second->refs = 1;
+ } else {
+ // if this happens, something is seriously wrong
+ return IOStatus::Corruption("In memory metadata insertion error");
+ }
+ } else {
+ // Compare sizes, because we scanned that off the filesystem on both
+ // ends. This is like a check in VerifyBackup.
+ if (itr->second->size != file_info->size) {
+ std::string msg = "Size mismatch for existing backup file: ";
+ msg.append(file_info->filename);
+ msg.append(" Size in backup is " + std::to_string(itr->second->size) +
+ " while size in DB is " + std::to_string(file_info->size));
+ msg.append(
+ " If this DB file checks as not corrupt, try deleting old"
+ " backups or backing up to a different backup directory.");
+ return IOStatus::Corruption(msg);
+ }
+ if (file_info->checksum_hex.empty()) {
+ // No checksum available to check
+ } else if (itr->second->checksum_hex.empty()) {
+ // Remember checksum if newly acquired
+ itr->second->checksum_hex = file_info->checksum_hex;
+ } else if (itr->second->checksum_hex != file_info->checksum_hex) {
+      // Note: to save I/O, these will trivially be equal for already
+      // backed-up files that don't have the checksum in their name. And this
+      // check should never fail for files that do have the checksum in their
+      // name.
+
+ // Should never reach here, but produce an appropriate corruption
+ // message in case we do in a release build.
+ assert(false);
+ std::string msg = "Checksum mismatch for existing backup file: ";
+ msg.append(file_info->filename);
+ msg.append(" Expected checksum is " + itr->second->checksum_hex +
+ " while computed checksum is " + file_info->checksum_hex);
+ msg.append(
+ " If this DB file checks as not corrupt, try deleting old"
+ " backups or backing up to a different backup directory.");
+ return IOStatus::Corruption(msg);
+ }
+ ++itr->second->refs; // increase refcount if already present
+ }
+
+ size_ += file_info->size;
+ files_.push_back(itr->second);
+
+ return IOStatus::OK();
+}
+
+IOStatus BackupEngineImpl::BackupMeta::Delete(bool delete_meta) {
+ IOStatus io_s;
+ for (const auto& file : files_) {
+ --file->refs; // decrease refcount
+ }
+ files_.clear();
+ // delete meta file
+ if (delete_meta) {
+ io_s = fs_->FileExists(meta_filename_, iooptions_, nullptr);
+ if (io_s.ok()) {
+ io_s = fs_->DeleteFile(meta_filename_, iooptions_, nullptr);
+ } else if (io_s.IsNotFound()) {
+ io_s = IOStatus::OK(); // nothing to delete
+ }
+ }
+ timestamp_ = 0;
+ return io_s;
+}
+
+// Constants for backup meta file schema (see LoadFromFile)
+const std::string kSchemaVersionPrefix{"schema_version "};
+const std::string kFooterMarker{"// FOOTER"};
+
+const std::string kAppMetaDataFieldName{"metadata"};
+
+// WART: The checksums are crc32c but named "crc32"
+const std::string kFileCrc32cFieldName{"crc32"};
+const std::string kFileSizeFieldName{"size"};
+const std::string kTemperatureFieldName{"temp"};
+
+// Marks a (future) field that should cause failure if not recognized.
+// Other fields are assumed to be ignorable. For example, in the future
+// we might add
+// ni::file_name_escape uri_percent
+// to indicate all file names have had spaces and special characters
+// escaped using a URI percent encoding.
+const std::string kNonIgnorableFieldPrefix{"ni::"};
+
+// Each backup meta file is of the format (schema version 1):
+//----------------------------------------------------------
+// <timestamp>
+// <seq number>
+// metadata <metadata> (optional)
+// <number of files>
+// <file1> crc32 <crc32c_as_unsigned_decimal>
+// <file2> crc32 <crc32c_as_unsigned_decimal>
+// ...
+//----------------------------------------------------------
+//
+// For schema version 2.x (not in public APIs, but
+// forward-compatibility started):
+//----------------------------------------------------------
+// schema_version <ver>
+// <timestamp>
+// <seq number>
+// [<field name> <field data>]
+// ...
+// <number of files>
+// <file1>( <field name> <field data no spaces>)*
+// <file2>( <field name> <field data no spaces>)*
+// ...
+// [// FOOTER]
+// [<field name> <field data>]
+// ...
+//----------------------------------------------------------
+// where
+// <ver> ::= [0-9]+([.][0-9]+)
+// <field name> ::= [A-Za-z_][A-Za-z_0-9.]+
+// <field data> is anything but newline
+// <field data no spaces> is anything but space and newline
+// Although "// FOOTER" wouldn't strictly be required as a delimiter
+// given the number of files is included, it is there for parsing
+// sanity in case of corruption. It is only required if followed
+// by footer fields, such as a checksum of the meta file (so far).
+// Unrecognized fields are ignored, to support schema evolution on
+// non-critical features with forward compatibility. Update schema
+// major version for breaking changes. Schema minor versions are indicated
+// only for diagnostic/debugging purposes.
+//
+// Fields in schema version 2.0:
+// * Top-level meta fields:
+// * Only "metadata" as in schema version 1
+// * File meta fields:
+// * "crc32" - a crc32c checksum as in schema version 1
+// * "size" - the size of the file (new)
+// * Footer meta fields:
+// * None yet (future use for meta file checksum anticipated)
+//
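+// For illustration only, a hypothetical schema version 2 meta file (all
+// values below are made up, not taken from any real backup) could look like:
+//----------------------------------------------------------
+// schema_version 2.0
+// 1640000000
+// 12345
+// metadata 6D79206170702064617461
+// 2
+// shared_checksum/000010_1234567890_16384.sst crc32 1234567890 size 16384
+// private/1/MANIFEST-000008 crc32 987654321 size 2048
+//----------------------------------------------------------
+//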
+IOStatus BackupEngineImpl::BackupMeta::LoadFromFile(
+ const std::string& backup_dir,
+ const std::unordered_map<std::string, uint64_t>& abs_path_to_size,
+ RateLimiter* rate_limiter, Logger* info_log,
+ std::unordered_set<std::string>* reported_ignored_fields) {
+ assert(reported_ignored_fields);
+ assert(Empty());
+
+ std::unique_ptr<LineFileReader> backup_meta_reader;
+ {
+ IOStatus io_s = LineFileReader::Create(fs_, meta_filename_, FileOptions(),
+ &backup_meta_reader,
+ nullptr /* dbg */, rate_limiter);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ }
+
+ // If we don't read an explicit schema_version, that implies version 1,
+ // which is what we call the original backup meta schema.
+ int schema_major_version = 1;
+
+ // Failures handled at the end
+ std::string line;
+ if (backup_meta_reader->ReadLine(&line,
+ Env::IO_LOW /* rate_limiter_priority */)) {
+ if (StartsWith(line, kSchemaVersionPrefix)) {
+ std::string ver = line.substr(kSchemaVersionPrefix.size());
+ if (ver == "2" || StartsWith(ver, "2.")) {
+ schema_major_version = 2;
+ } else {
+ return IOStatus::NotSupported(
+ "Unsupported/unrecognized schema version: " + ver);
+ }
+ line.clear();
+ } else if (line.empty()) {
+ return IOStatus::Corruption("Unexpected empty line");
+ }
+ }
+ if (!line.empty()) {
+ timestamp_ = std::strtoull(line.c_str(), nullptr, /*base*/ 10);
+ } else if (backup_meta_reader->ReadLine(
+ &line, Env::IO_LOW /* rate_limiter_priority */)) {
+ timestamp_ = std::strtoull(line.c_str(), nullptr, /*base*/ 10);
+ }
+ if (backup_meta_reader->ReadLine(&line,
+ Env::IO_LOW /* rate_limiter_priority */)) {
+ sequence_number_ = std::strtoull(line.c_str(), nullptr, /*base*/ 10);
+ }
+ uint32_t num_files = UINT32_MAX;
+ while (backup_meta_reader->ReadLine(
+ &line, Env::IO_LOW /* rate_limiter_priority */)) {
+ if (line.empty()) {
+ return IOStatus::Corruption("Unexpected empty line");
+ }
+    // A line starting with a digit gives the number of files; that ends this
+    // loop over the optional meta fields.
+ if (line[0] >= '0' && line[0] <= '9') {
+ num_files = static_cast<uint32_t>(strtoul(line.c_str(), nullptr, 10));
+ break;
+ }
+ // else, must be a meta field assignment
+ auto space_pos = line.find_first_of(' ');
+ if (space_pos == std::string::npos) {
+ return IOStatus::Corruption("Expected number of files or meta field");
+ }
+ std::string field_name = line.substr(0, space_pos);
+ std::string field_data = line.substr(space_pos + 1);
+ if (field_name == kAppMetaDataFieldName) {
+ // app metadata present
+ bool decode_success = Slice(field_data).DecodeHex(&app_metadata_);
+ if (!decode_success) {
+ return IOStatus::Corruption(
+ "Failed to decode stored hex encoded app metadata");
+ }
+ } else if (schema_major_version < 2) {
+ return IOStatus::Corruption("Expected number of files or \"" +
+ kAppMetaDataFieldName + "\" field");
+ } else if (StartsWith(field_name, kNonIgnorableFieldPrefix)) {
+ return IOStatus::NotSupported("Unrecognized non-ignorable meta field " +
+ field_name + " (from future version?)");
+ } else {
+ // Warn the first time we see any particular unrecognized meta field
+ if (reported_ignored_fields->insert("meta:" + field_name).second) {
+ ROCKS_LOG_WARN(info_log, "Ignoring unrecognized backup meta field %s",
+ field_name.c_str());
+ }
+ }
+ }
+ std::vector<std::shared_ptr<FileInfo>> files;
+ bool footer_present = false;
+ while (backup_meta_reader->ReadLine(
+ &line, Env::IO_LOW /* rate_limiter_priority */)) {
+ std::vector<std::string> components = StringSplit(line, ' ');
+
+ if (components.size() < 1) {
+ return IOStatus::Corruption("Empty line instead of file entry.");
+ }
+ if (schema_major_version >= 2 && components.size() == 2 &&
+ line == kFooterMarker) {
+ footer_present = true;
+ break;
+ }
+
+ const std::string& filename = components[0];
+
+ uint64_t actual_size;
+ const std::shared_ptr<FileInfo> file_info = GetFile(filename);
+ if (file_info) {
+ actual_size = file_info->size;
+ } else {
+ std::string abs_path = backup_dir + "/" + filename;
+ auto e = abs_path_to_size.find(abs_path);
+ if (e == abs_path_to_size.end()) {
+ return IOStatus::Corruption(
+ "Pathname in meta file not found on disk: " + abs_path);
+ }
+ actual_size = e->second;
+ }
+
+ if (schema_major_version >= 2) {
+ if (components.size() % 2 != 1) {
+ return IOStatus::Corruption(
+ "Bad number of line components for file entry.");
+ }
+ } else {
+ // Check restricted original schema
+ if (components.size() < 3) {
+ return IOStatus::Corruption("File checksum is missing for " + filename +
+ " in " + meta_filename_);
+ }
+ if (components[1] != kFileCrc32cFieldName) {
+ return IOStatus::Corruption("Unknown checksum type for " + filename +
+ " in " + meta_filename_);
+ }
+ if (components.size() > 3) {
+ return IOStatus::Corruption("Extra data for entry " + filename +
+ " in " + meta_filename_);
+ }
+ }
+
+ std::string checksum_hex;
+ Temperature temp = Temperature::kUnknown;
+ for (unsigned i = 1; i < components.size(); i += 2) {
+ const std::string& field_name = components[i];
+ const std::string& field_data = components[i + 1];
+
+ if (field_name == kFileCrc32cFieldName) {
+ uint32_t checksum_value =
+ static_cast<uint32_t>(strtoul(field_data.c_str(), nullptr, 10));
+ if (field_data != std::to_string(checksum_value)) {
+ return IOStatus::Corruption("Invalid checksum value for " + filename +
+ " in " + meta_filename_);
+ }
+ checksum_hex = ChecksumInt32ToHex(checksum_value);
+ } else if (field_name == kFileSizeFieldName) {
+ uint64_t ex_size =
+ std::strtoull(field_data.c_str(), nullptr, /*base*/ 10);
+ if (ex_size != actual_size) {
+ return IOStatus::Corruption(
+ "For file " + filename + " expected size " +
+                std::to_string(ex_size) + " but found size " +
+ std::to_string(actual_size));
+ }
+ } else if (field_name == kTemperatureFieldName) {
+ auto iter = temperature_string_map.find(field_data);
+ if (iter != temperature_string_map.end()) {
+ temp = iter->second;
+ } else {
+          // Could report corruption, but in case new temperatures are added
+          // in the future, we let those map to kUnknown, which should
+          // generally be safe.
+ temp = Temperature::kUnknown;
+ }
+ } else if (StartsWith(field_name, kNonIgnorableFieldPrefix)) {
+ return IOStatus::NotSupported("Unrecognized non-ignorable file field " +
+ field_name + " (from future version?)");
+ } else {
+ // Warn the first time we see any particular unrecognized file field
+ if (reported_ignored_fields->insert("file:" + field_name).second) {
+ ROCKS_LOG_WARN(info_log, "Ignoring unrecognized backup file field %s",
+ field_name.c_str());
+ }
+ }
+ }
+
+ files.emplace_back(new FileInfo(filename, actual_size, checksum_hex,
+ /*id*/ "", /*sid*/ "", temp));
+ }
+
+ if (footer_present) {
+ assert(schema_major_version >= 2);
+ while (backup_meta_reader->ReadLine(
+ &line, Env::IO_LOW /* rate_limiter_priority */)) {
+ if (line.empty()) {
+ return IOStatus::Corruption("Unexpected empty line");
+ }
+ auto space_pos = line.find_first_of(' ');
+ if (space_pos == std::string::npos) {
+ return IOStatus::Corruption("Expected footer field");
+ }
+ std::string field_name = line.substr(0, space_pos);
+ std::string field_data = line.substr(space_pos + 1);
+ if (StartsWith(field_name, kNonIgnorableFieldPrefix)) {
+ return IOStatus::NotSupported("Unrecognized non-ignorable field " +
+ field_name + " (from future version?)");
+ } else if (reported_ignored_fields->insert("footer:" + field_name)
+ .second) {
+ // Warn the first time we see any particular unrecognized footer field
+ ROCKS_LOG_WARN(info_log,
+ "Ignoring unrecognized backup meta footer field %s",
+ field_name.c_str());
+ }
+ }
+ }
+
+ {
+ IOStatus io_s = backup_meta_reader->GetStatus();
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ }
+
+ if (num_files != files.size()) {
+ return IOStatus::Corruption(
+ "Inconsistent number of files or missing/incomplete header in " +
+ meta_filename_);
+ }
+
+ files_.reserve(files.size());
+ for (const auto& file_info : files) {
+ IOStatus io_s = AddFile(file_info);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ }
+
+ return IOStatus::OK();
+}
+
+const std::vector<std::string> minor_version_strings{
+ "", // invalid major version 0
+ "", // implicit major version 1
+ "2.0",
+};
+
+IOStatus BackupEngineImpl::BackupMeta::StoreToFile(
+ bool sync, int schema_version,
+ const TEST_BackupMetaSchemaOptions* schema_test_options) {
+ if (schema_version < 1) {
+ return IOStatus::InvalidArgument(
+ "BackupEngineOptions::schema_version must be >= 1");
+ }
+ if (schema_version > static_cast<int>(minor_version_strings.size() - 1)) {
+ return IOStatus::NotSupported(
+ "Only BackupEngineOptions::schema_version <= " +
+ std::to_string(minor_version_strings.size() - 1) + " is supported");
+ }
+ std::string ver = minor_version_strings[schema_version];
+
+ // Need schema_version >= 2 for TEST_BackupMetaSchemaOptions
+ assert(schema_version >= 2 || schema_test_options == nullptr);
+
+ IOStatus io_s;
+ std::unique_ptr<FSWritableFile> backup_meta_file;
+ FileOptions file_options;
+ file_options.use_mmap_writes = false;
+ file_options.use_direct_writes = false;
+ io_s = fs_->NewWritableFile(meta_tmp_filename_, file_options,
+ &backup_meta_file, nullptr);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+
+ std::ostringstream buf;
+ if (schema_test_options) {
+ // override for testing
+ ver = schema_test_options->version;
+ }
+ if (!ver.empty()) {
+ assert(schema_version >= 2);
+ buf << kSchemaVersionPrefix << ver << "\n";
+ }
+ buf << static_cast<unsigned long long>(timestamp_) << "\n";
+ buf << sequence_number_ << "\n";
+
+ if (!app_metadata_.empty()) {
+ std::string hex_encoded_metadata =
+ Slice(app_metadata_).ToString(/* hex */ true);
+ buf << kAppMetaDataFieldName << " " << hex_encoded_metadata << "\n";
+ }
+ if (schema_test_options) {
+ for (auto& e : schema_test_options->meta_fields) {
+ buf << e.first << " " << e.second << "\n";
+ }
+ }
+ buf << files_.size() << "\n";
+
+ for (const auto& file : files_) {
+ buf << file->filename;
+ if (schema_test_options == nullptr ||
+ schema_test_options->crc32c_checksums) {
+ // use crc32c for now, switch to something else if needed
+ buf << " " << kFileCrc32cFieldName << " "
+ << ChecksumHexToInt32(file->checksum_hex);
+ }
+ if (schema_version >= 2 && file->temp != Temperature::kUnknown) {
+ buf << " " << kTemperatureFieldName << " "
+ << temperature_to_string[file->temp];
+ }
+ if (schema_test_options && schema_test_options->file_sizes) {
+ buf << " " << kFileSizeFieldName << " " << std::to_string(file->size);
+ }
+ if (schema_test_options) {
+ for (auto& e : schema_test_options->file_fields) {
+ buf << " " << e.first << " " << e.second;
+ }
+ }
+ buf << "\n";
+ }
+
+ if (schema_test_options && !schema_test_options->footer_fields.empty()) {
+ buf << kFooterMarker << "\n";
+ for (auto& e : schema_test_options->footer_fields) {
+ buf << e.first << " " << e.second << "\n";
+ }
+ }
+
+ io_s = backup_meta_file->Append(Slice(buf.str()), iooptions_, nullptr);
+ IOSTATS_ADD(bytes_written, buf.str().size());
+ if (io_s.ok() && sync) {
+ io_s = backup_meta_file->Sync(iooptions_, nullptr);
+ }
+ if (io_s.ok()) {
+ io_s = backup_meta_file->Close(iooptions_, nullptr);
+ }
+ if (io_s.ok()) {
+ io_s = fs_->RenameFile(meta_tmp_filename_, meta_filename_, iooptions_,
+ nullptr);
+ }
+ return io_s;
+}
+} // namespace
+
+IOStatus BackupEngineReadOnly::Open(const BackupEngineOptions& options,
+ Env* env,
+ BackupEngineReadOnly** backup_engine_ptr) {
+ if (options.destroy_old_data) {
+ return IOStatus::InvalidArgument(
+ "Can't destroy old data with ReadOnly BackupEngine");
+ }
+ std::unique_ptr<BackupEngineImplThreadSafe> backup_engine(
+ new BackupEngineImplThreadSafe(options, env, true /*read_only*/));
+ auto s = backup_engine->Initialize();
+ if (!s.ok()) {
+ *backup_engine_ptr = nullptr;
+ return s;
+ }
+ *backup_engine_ptr = backup_engine.release();
+ return IOStatus::OK();
+}
+
+void TEST_SetBackupMetaSchemaOptions(
+ BackupEngine* engine, const TEST_BackupMetaSchemaOptions& options) {
+ BackupEngineImplThreadSafe* impl =
+ static_cast_with_check<BackupEngineImplThreadSafe>(engine);
+ impl->TEST_SetBackupMetaSchemaOptions(options);
+}
+
+void TEST_SetDefaultRateLimitersClock(
+ BackupEngine* engine,
+ const std::shared_ptr<SystemClock>& backup_rate_limiter_clock,
+ const std::shared_ptr<SystemClock>& restore_rate_limiter_clock) {
+ BackupEngineImplThreadSafe* impl =
+ static_cast_with_check<BackupEngineImplThreadSafe>(engine);
+ impl->TEST_SetDefaultRateLimitersClock(backup_rate_limiter_clock,
+ restore_rate_limiter_clock);
+}
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/backup/backup_engine_impl.h b/src/rocksdb/utilities/backup/backup_engine_impl.h
new file mode 100644
index 000000000..398f47f27
--- /dev/null
+++ b/src/rocksdb/utilities/backup/backup_engine_impl.h
@@ -0,0 +1,36 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/backup_engine.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct TEST_BackupMetaSchemaOptions {
+ std::string version = "2";
+ bool crc32c_checksums = false;
+ bool file_sizes = true;
+ std::map<std::string, std::string> meta_fields;
+ std::map<std::string, std::string> file_fields;
+ std::map<std::string, std::string> footer_fields;
+};
+
+// Modifies the BackupEngine(Impl) to write backup meta files using the
+// unpublished schema version 2, for the life of this object (not backup_dir).
+// TEST_BackupMetaSchemaOptions offers some customization for testing.
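+//
+// A minimal usage sketch (the `engine` pointer and the field values below
+// are hypothetical):
+//
+//   TEST_BackupMetaSchemaOptions schema_options;
+//   schema_options.crc32c_checksums = true;
+//   schema_options.meta_fields["my_custom_field"] = "my_data";
+//   TEST_SetBackupMetaSchemaOptions(engine, schema_options);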
+void TEST_SetBackupMetaSchemaOptions(
+ BackupEngine* engine, const TEST_BackupMetaSchemaOptions& options);
+
+// Modifies the BackupEngine(Impl) to use the specified clocks for the backup
+// and restore rate limiters that are created by default (i.e. when not
+// specified by the user), to speed up tests.
+void TEST_SetDefaultRateLimitersClock(
+ BackupEngine* engine,
+ const std::shared_ptr<SystemClock>& backup_rate_limiter_clock = nullptr,
+ const std::shared_ptr<SystemClock>& restore_rate_limiter_clock = nullptr);
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/backup/backup_engine_test.cc b/src/rocksdb/utilities/backup/backup_engine_test.cc
new file mode 100644
index 000000000..d1f74f769
--- /dev/null
+++ b/src/rocksdb/utilities/backup/backup_engine_test.cc
@@ -0,0 +1,4219 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#if !defined(ROCKSDB_LITE) && !defined(OS_WIN)
+
+#include "rocksdb/utilities/backup_engine.h"
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <random>
+#include <string>
+#include <utility>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "env/composite_env_wrapper.h"
+#include "env/env_chroot.h"
+#include "file/filename.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_checksum.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/transaction_log.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/options_util.h"
+#include "rocksdb/utilities/stackable_db.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/cast_util.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/rate_limiter.h"
+#include "util/stderr_logger.h"
+#include "util/string_util.h"
+#include "utilities/backup/backup_engine_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+using ShareFilesNaming = BackupEngineOptions::ShareFilesNaming;
+const auto kLegacyCrc32cAndFileSize =
+ BackupEngineOptions::kLegacyCrc32cAndFileSize;
+const auto kUseDbSessionId = BackupEngineOptions::kUseDbSessionId;
+const auto kFlagIncludeFileSize = BackupEngineOptions::kFlagIncludeFileSize;
+const auto kNamingDefault = kUseDbSessionId | kFlagIncludeFileSize;
+
+class DummyDB : public StackableDB {
+ public:
+ /* implicit */
+ DummyDB(const Options& options, const std::string& dbname)
+ : StackableDB(nullptr),
+ options_(options),
+ dbname_(dbname),
+ deletions_enabled_(true),
+ sequence_number_(0) {}
+
+ SequenceNumber GetLatestSequenceNumber() const override {
+ return ++sequence_number_;
+ }
+
+ const std::string& GetName() const override { return dbname_; }
+
+ Env* GetEnv() const override { return options_.env; }
+
+ using DB::GetOptions;
+ Options GetOptions(ColumnFamilyHandle* /*column_family*/) const override {
+ return options_;
+ }
+
+ DBOptions GetDBOptions() const override { return DBOptions(options_); }
+
+ Status EnableFileDeletions(bool /*force*/) override {
+ EXPECT_TRUE(!deletions_enabled_);
+ deletions_enabled_ = true;
+ return Status::OK();
+ }
+
+ Status DisableFileDeletions() override {
+ EXPECT_TRUE(deletions_enabled_);
+ deletions_enabled_ = false;
+ return Status::OK();
+ }
+
+ ColumnFamilyHandle* DefaultColumnFamily() const override { return nullptr; }
+
+ Status GetLiveFilesStorageInfo(
+ const LiveFilesStorageInfoOptions& opts,
+ std::vector<LiveFileStorageInfo>* files) override {
+ uint64_t number;
+ FileType type;
+ files->clear();
+ for (auto& f : live_files_) {
+ bool success = ParseFileName(f, &number, &type);
+ if (!success) {
+ return Status::InvalidArgument("Bad file name: " + f);
+ }
+ files->emplace_back();
+ LiveFileStorageInfo& info = files->back();
+ info.relative_filename = f;
+ info.directory = dbname_;
+ info.file_number = number;
+ info.file_type = type;
+ if (type == kDescriptorFile) {
+ info.size = 100; // See TestFs::GetChildrenFileAttributes below
+ info.trim_to_size = true;
+ } else if (type == kCurrentFile) {
+ info.size = 0;
+ info.trim_to_size = true;
+ } else {
+ info.size = 200; // See TestFs::GetChildrenFileAttributes below
+ }
+ if (opts.include_checksum_info) {
+ info.file_checksum = kUnknownFileChecksum;
+ info.file_checksum_func_name = kUnknownFileChecksumFuncName;
+ }
+ }
+ return Status::OK();
+ }
+
+  // To avoid FlushWAL being called on the stacked db, which is nullptr
+ Status FlushWAL(bool /*sync*/) override { return Status::OK(); }
+
+ std::vector<std::string> live_files_;
+
+ private:
+ Options options_;
+ std::string dbname_;
+ bool deletions_enabled_;
+ mutable SequenceNumber sequence_number_;
+}; // DummyDB
+
+class TestFs : public FileSystemWrapper {
+ public:
+ explicit TestFs(const std::shared_ptr<FileSystem>& t)
+ : FileSystemWrapper(t) {}
+ const char* Name() const override { return "TestFs"; }
+
+ class DummySequentialFile : public FSSequentialFile {
+ public:
+ explicit DummySequentialFile(bool fail_reads)
+ : FSSequentialFile(), rnd_(5), fail_reads_(fail_reads) {}
+ IOStatus Read(size_t n, const IOOptions&, Slice* result, char* scratch,
+ IODebugContext*) override {
+ if (fail_reads_) {
+ return IOStatus::IOError();
+ }
+ size_t read_size = (n > size_left) ? size_left : n;
+ for (size_t i = 0; i < read_size; ++i) {
+ scratch[i] = rnd_.Next() & 255;
+ }
+ *result = Slice(scratch, read_size);
+ size_left -= read_size;
+ return IOStatus::OK();
+ }
+
+ IOStatus Skip(uint64_t n) override {
+      size_left = (n > size_left) ? 0 : size_left - n;
+ return IOStatus::OK();
+ }
+
+ private:
+ size_t size_left = 200;
+ Random rnd_;
+ bool fail_reads_;
+ };
+
+ IOStatus NewSequentialFile(const std::string& f, const FileOptions& file_opts,
+ std::unique_ptr<FSSequentialFile>* r,
+ IODebugContext* dbg) override {
+ MutexLock l(&mutex_);
+ if (dummy_sequential_file_) {
+ r->reset(
+ new TestFs::DummySequentialFile(dummy_sequential_file_fail_reads_));
+ return IOStatus::OK();
+ } else {
+ IOStatus s = FileSystemWrapper::NewSequentialFile(f, file_opts, r, dbg);
+ if (s.ok()) {
+ if ((*r)->use_direct_io()) {
+ ++num_direct_seq_readers_;
+ }
+ ++num_seq_readers_;
+ }
+ return s;
+ }
+ }
+
+ IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts,
+ std::unique_ptr<FSWritableFile>* r,
+ IODebugContext* dbg) override {
+ MutexLock l(&mutex_);
+ written_files_.push_back(f);
+ if (limit_written_files_ == 0) {
+ return IOStatus::NotSupported("Limit on written files reached");
+ }
+ limit_written_files_--;
+ IOStatus s = FileSystemWrapper::NewWritableFile(f, file_opts, r, dbg);
+ if (s.ok()) {
+ if ((*r)->use_direct_io()) {
+ ++num_direct_writers_;
+ }
+ ++num_writers_;
+ }
+ return s;
+ }
+
+ IOStatus NewRandomAccessFile(const std::string& f,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSRandomAccessFile>* r,
+ IODebugContext* dbg) override {
+ MutexLock l(&mutex_);
+ IOStatus s = FileSystemWrapper::NewRandomAccessFile(f, file_opts, r, dbg);
+ if (s.ok()) {
+ if ((*r)->use_direct_io()) {
+ ++num_direct_rand_readers_;
+ }
+ ++num_rand_readers_;
+ }
+ return s;
+ }
+
+ IOStatus DeleteFile(const std::string& f, const IOOptions& options,
+ IODebugContext* dbg) override {
+ MutexLock l(&mutex_);
+ if (fail_delete_files_) {
+ return IOStatus::IOError();
+ }
+ EXPECT_GT(limit_delete_files_, 0U);
+ limit_delete_files_--;
+ return FileSystemWrapper::DeleteFile(f, options, dbg);
+ }
+
+ IOStatus DeleteDir(const std::string& d, const IOOptions& options,
+ IODebugContext* dbg) override {
+ MutexLock l(&mutex_);
+ if (fail_delete_files_) {
+ return IOStatus::IOError();
+ }
+ return FileSystemWrapper::DeleteDir(d, options, dbg);
+ }
+
+ void AssertWrittenFiles(std::vector<std::string>& should_have_written) {
+ MutexLock l(&mutex_);
+ std::sort(should_have_written.begin(), should_have_written.end());
+ std::sort(written_files_.begin(), written_files_.end());
+
+ ASSERT_EQ(should_have_written, written_files_);
+ }
+
+ void ClearWrittenFiles() {
+ MutexLock l(&mutex_);
+ written_files_.clear();
+ }
+
+ void SetLimitWrittenFiles(uint64_t limit) {
+ MutexLock l(&mutex_);
+ limit_written_files_ = limit;
+ }
+
+ void SetLimitDeleteFiles(uint64_t limit) {
+ MutexLock l(&mutex_);
+ limit_delete_files_ = limit;
+ }
+
+ void SetDeleteFileFailure(bool fail) {
+ MutexLock l(&mutex_);
+ fail_delete_files_ = fail;
+ }
+
+ void SetDummySequentialFile(bool dummy_sequential_file) {
+ MutexLock l(&mutex_);
+ dummy_sequential_file_ = dummy_sequential_file;
+ }
+ void SetDummySequentialFileFailReads(bool dummy_sequential_file_fail_reads) {
+ MutexLock l(&mutex_);
+ dummy_sequential_file_fail_reads_ = dummy_sequential_file_fail_reads;
+ }
+
+ void SetGetChildrenFailure(bool fail) { get_children_failure_ = fail; }
+ IOStatus GetChildren(const std::string& dir, const IOOptions& io_opts,
+ std::vector<std::string>* r,
+ IODebugContext* dbg) override {
+ if (get_children_failure_) {
+ return IOStatus::IOError("SimulatedFailure");
+ }
+ return FileSystemWrapper::GetChildren(dir, io_opts, r, dbg);
+ }
+
+  // Some test cases do not actually create the test files (e.g., see
+  // DummyDB::live_files_) - for those cases, we mock the files' attributes
+  // so CreateNewBackup() can read them.
+ void SetFilenamesForMockedAttrs(const std::vector<std::string>& filenames) {
+ filenames_for_mocked_attrs_ = filenames;
+ }
+ IOStatus GetChildrenFileAttributes(const std::string& dir,
+ const IOOptions& options,
+ std::vector<FileAttributes>* result,
+ IODebugContext* dbg) override {
+ if (filenames_for_mocked_attrs_.size() > 0) {
+ for (const auto& filename : filenames_for_mocked_attrs_) {
+ uint64_t size_bytes = 200; // Match TestFs
+ if (filename.find("MANIFEST") == 0) {
+          size_bytes = 100;  // Match DummyDB::GetLiveFilesStorageInfo
+ }
+ result->push_back({dir + "/" + filename, size_bytes});
+ }
+ return IOStatus::OK();
+ }
+ return FileSystemWrapper::GetChildrenFileAttributes(dir, options, result,
+ dbg);
+ }
+
+ IOStatus GetFileSize(const std::string& f, const IOOptions& options,
+ uint64_t* s, IODebugContext* dbg) override {
+ if (filenames_for_mocked_attrs_.size() > 0) {
+ auto fname = f.substr(f.find_last_of('/') + 1);
+ auto filename_iter = std::find(filenames_for_mocked_attrs_.begin(),
+ filenames_for_mocked_attrs_.end(), fname);
+ if (filename_iter != filenames_for_mocked_attrs_.end()) {
+ *s = 200; // Match TestFs
+ if (fname.find("MANIFEST") == 0) {
+          *s = 100;  // Match DummyDB::GetLiveFilesStorageInfo
+ }
+ return IOStatus::OK();
+ }
+ return IOStatus::NotFound(fname);
+ }
+ return FileSystemWrapper::GetFileSize(f, options, s, dbg);
+ }
+
+ void SetCreateDirIfMissingFailure(bool fail) {
+ create_dir_if_missing_failure_ = fail;
+ }
+ IOStatus CreateDirIfMissing(const std::string& d, const IOOptions& options,
+ IODebugContext* dbg) override {
+ if (create_dir_if_missing_failure_) {
+ return IOStatus::IOError("SimulatedFailure");
+ }
+ return FileSystemWrapper::CreateDirIfMissing(d, options, dbg);
+ }
+
+ void SetNewDirectoryFailure(bool fail) { new_directory_failure_ = fail; }
+ IOStatus NewDirectory(const std::string& name, const IOOptions& io_opts,
+ std::unique_ptr<FSDirectory>* result,
+ IODebugContext* dbg) override {
+ if (new_directory_failure_) {
+ return IOStatus::IOError("SimulatedFailure");
+ }
+ return FileSystemWrapper::NewDirectory(name, io_opts, result, dbg);
+ }
+
+ void ClearFileOpenCounters() {
+ MutexLock l(&mutex_);
+ num_rand_readers_ = 0;
+ num_direct_rand_readers_ = 0;
+ num_seq_readers_ = 0;
+ num_direct_seq_readers_ = 0;
+ num_writers_ = 0;
+ num_direct_writers_ = 0;
+ }
+
+ int num_rand_readers() { return num_rand_readers_; }
+ int num_direct_rand_readers() { return num_direct_rand_readers_; }
+ int num_seq_readers() { return num_seq_readers_; }
+ int num_direct_seq_readers() { return num_direct_seq_readers_; }
+ int num_writers() { return num_writers_; }
+ // FIXME(?): unused
+ int num_direct_writers() { return num_direct_writers_; }
+
+ private:
+ port::Mutex mutex_;
+ bool dummy_sequential_file_ = false;
+ bool dummy_sequential_file_fail_reads_ = false;
+ std::vector<std::string> written_files_;
+ std::vector<std::string> filenames_for_mocked_attrs_;
+ uint64_t limit_written_files_ = 1000000;
+ uint64_t limit_delete_files_ = 1000000;
+ bool fail_delete_files_ = false;
+
+ bool get_children_failure_ = false;
+ bool create_dir_if_missing_failure_ = false;
+ bool new_directory_failure_ = false;
+
+ // Keeps track of how many files of each type were successfully opened, and
+ // out of those, how many were opened with direct I/O.
+ std::atomic<int> num_rand_readers_{};
+ std::atomic<int> num_direct_rand_readers_{};
+ std::atomic<int> num_seq_readers_{};
+ std::atomic<int> num_direct_seq_readers_{};
+ std::atomic<int> num_writers_{};
+ std::atomic<int> num_direct_writers_{};
+}; // TestFs
+
+class FileManager : public EnvWrapper {
+ public:
+ explicit FileManager(Env* t) : EnvWrapper(t), rnd_(5) {}
+ const char* Name() const override { return "FileManager"; }
+
+ Status GetRandomFileInDir(const std::string& dir, std::string* fname,
+ uint64_t* fsize) {
+ std::vector<FileAttributes> children;
+ auto s = GetChildrenFileAttributes(dir, &children);
+ if (!s.ok()) {
+ return s;
+ } else if (children.size() <= 2) { // . and ..
+ return Status::NotFound("Empty directory: " + dir);
+ }
+ assert(fname != nullptr);
+ while (true) {
+ int i = rnd_.Next() % children.size();
+ fname->assign(dir + "/" + children[i].name);
+ *fsize = children[i].size_bytes;
+ return Status::OK();
+ }
+ // should never get here
+ assert(false);
+ return Status::NotFound("");
+ }
+
+ Status DeleteRandomFileInDir(const std::string& dir) {
+ std::vector<std::string> children;
+ Status s = GetChildren(dir, &children);
+ if (!s.ok()) {
+ return s;
+ }
+ while (true) {
+ int i = rnd_.Next() % children.size();
+ return DeleteFile(dir + "/" + children[i]);
+ }
+ // should never get here
+ assert(false);
+ return Status::NotFound("");
+ }
+
+ Status AppendToRandomFileInDir(const std::string& dir,
+ const std::string& data) {
+ std::vector<std::string> children;
+ Status s = GetChildren(dir, &children);
+ if (!s.ok()) {
+ return s;
+ }
+ while (true) {
+ int i = rnd_.Next() % children.size();
+ return WriteToFile(dir + "/" + children[i], data);
+ }
+ // should never get here
+ assert(false);
+ return Status::NotFound("");
+ }
+
+ Status CorruptFile(const std::string& fname, uint64_t bytes_to_corrupt) {
+ std::string file_contents;
+ Status s = ReadFileToString(this, fname, &file_contents);
+ if (!s.ok()) {
+ return s;
+ }
+ s = DeleteFile(fname);
+ if (!s.ok()) {
+ return s;
+ }
+
+ for (uint64_t i = 0; i < bytes_to_corrupt; ++i) {
+ std::string tmp = rnd_.RandomString(1);
+ file_contents[rnd_.Next() % file_contents.size()] = tmp[0];
+ }
+ return WriteToFile(fname, file_contents);
+ }
+
+ Status CorruptFileStart(const std::string& fname) {
+ std::string to_xor = "blah";
+ std::string file_contents;
+ Status s = ReadFileToString(this, fname, &file_contents);
+ if (!s.ok()) {
+ return s;
+ }
+ s = DeleteFile(fname);
+ if (!s.ok()) {
+ return s;
+ }
+ for (size_t i = 0; i < to_xor.size(); ++i) {
+ file_contents[i] ^= to_xor[i];
+ }
+ return WriteToFile(fname, file_contents);
+ }
+
+ Status CorruptChecksum(const std::string& fname, bool appear_valid) {
+ std::string metadata;
+ Status s = ReadFileToString(this, fname, &metadata);
+ if (!s.ok()) {
+ return s;
+ }
+ s = DeleteFile(fname);
+ if (!s.ok()) {
+ return s;
+ }
+
+ auto pos = metadata.find("private");
+ if (pos == std::string::npos) {
+ return Status::Corruption("private file is expected");
+ }
+ pos = metadata.find(" crc32 ", pos + 6);
+ if (pos == std::string::npos) {
+ return Status::Corruption("checksum not found");
+ }
+
+ if (metadata.size() < pos + 7) {
+ return Status::Corruption("bad CRC32 checksum value");
+ }
+
+ if (appear_valid) {
+ if (metadata[pos + 8] == '\n') {
+ // single digit value, safe to insert one more digit
+ metadata.insert(pos + 8, 1, '0');
+ } else {
+ metadata.erase(pos + 8, 1);
+ }
+ } else {
+ metadata[pos + 7] = 'a';
+ }
+
+ return WriteToFile(fname, metadata);
+ }
+
+ Status WriteToFile(const std::string& fname, const std::string& data) {
+ std::unique_ptr<WritableFile> file;
+ EnvOptions env_options;
+ env_options.use_mmap_writes = false;
+ Status s = EnvWrapper::NewWritableFile(fname, &file, env_options);
+ if (!s.ok()) {
+ return s;
+ }
+ return file->Append(Slice(data));
+ }
+
+ private:
+ Random rnd_;
+}; // FileManager
+
+// utility functions
+namespace {
+
+enum FillDBFlushAction {
+ kFlushMost,
+ kFlushAll,
+ kAutoFlushOnly,
+};
+
+// Many tests in this file expect FillDB to write at least one sst file,
+// so the default behavior (if not kAutoFlushOnly) of FillDB is to force
+// a flush. But to ensure coverage of the WAL file case, we also (by default)
+// do one Put after the Flush (kFlushMost).
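+//
+// For example, FillDB(db, 0, 100) puts testkey0 .. testkey99, flushing right
+// after testkey98 so that the last Put is not yet flushed to an SST file.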
+size_t FillDB(DB* db, int from, int to,
+ FillDBFlushAction flush_action = kFlushMost) {
+ size_t bytes_written = 0;
+ for (int i = from; i < to; ++i) {
+ std::string key = "testkey" + std::to_string(i);
+ std::string value = "testvalue" + std::to_string(i);
+ bytes_written += key.size() + value.size();
+
+ EXPECT_OK(db->Put(WriteOptions(), Slice(key), Slice(value)));
+
+ if (flush_action == kFlushMost && i == to - 2) {
+ EXPECT_OK(db->Flush(FlushOptions()));
+ }
+ }
+ if (flush_action == kFlushAll) {
+ EXPECT_OK(db->Flush(FlushOptions()));
+ }
+ return bytes_written;
+}
+
+void AssertExists(DB* db, int from, int to) {
+ for (int i = from; i < to; ++i) {
+ std::string key = "testkey" + std::to_string(i);
+ std::string value;
+ Status s = db->Get(ReadOptions(), Slice(key), &value);
+ ASSERT_EQ(value, "testvalue" + std::to_string(i));
+ }
+}
+
+void AssertEmpty(DB* db, int from, int to) {
+ for (int i = from; i < to; ++i) {
+ std::string key = "testkey" + std::to_string(i);
+ std::string value = "testvalue" + std::to_string(i);
+
+ Status s = db->Get(ReadOptions(), Slice(key), &value);
+ ASSERT_TRUE(s.IsNotFound());
+ }
+}
+} // namespace
+
+class BackupEngineTest : public testing::Test {
+ public:
+ enum ShareOption {
+ kNoShare,
+ kShareNoChecksum,
+ kShareWithChecksum,
+ };
+
+ const std::vector<ShareOption> kAllShareOptions = {kNoShare, kShareNoChecksum,
+ kShareWithChecksum};
+
+ BackupEngineTest() {
+ // set up files
+ std::string db_chroot = test::PerThreadDBPath("db_for_backup");
+ std::string backup_chroot = test::PerThreadDBPath("db_backups");
+ EXPECT_OK(Env::Default()->CreateDirIfMissing(db_chroot));
+ EXPECT_OK(Env::Default()->CreateDirIfMissing(backup_chroot));
+ dbname_ = "/tempdb";
+ backupdir_ = "/tempbk";
+ latest_backup_ = backupdir_ + "/LATEST_BACKUP";
+
+ // set up FileSystem & Envs
+ db_chroot_fs_ = NewChrootFileSystem(FileSystem::Default(), db_chroot);
+ backup_chroot_fs_ =
+ NewChrootFileSystem(FileSystem::Default(), backup_chroot);
+ test_db_fs_ = std::make_shared<TestFs>(db_chroot_fs_);
+ test_backup_fs_ = std::make_shared<TestFs>(backup_chroot_fs_);
+ SetEnvsFromFileSystems();
+
+ // set up db options
+ options_.create_if_missing = true;
+ options_.paranoid_checks = true;
+ options_.write_buffer_size = 1 << 17; // 128KB
+ options_.wal_dir = dbname_;
+ options_.enable_blob_files = true;
+
+ // The sync option is not easily testable in unit tests, but should be
+ // smoke tested across all the other backup tests. However, it is
+ // certainly not worth doubling the runtime of backup tests for it.
+ // Thus, we can enable sync for one of our alternate testing
+ // configurations.
+ constexpr bool kUseSync =
+#ifdef ROCKSDB_MODIFY_NPHASH
+ true;
+#else
+ false;
+#endif // ROCKSDB_MODIFY_NPHASH
+
+ // set up backup db options
+ engine_options_.reset(new BackupEngineOptions(
+ backupdir_, test_backup_env_.get(), /*share_table_files*/ true,
+ logger_.get(), kUseSync));
+
+ // most tests will use multi-threaded backups
+ engine_options_->max_background_operations = 7;
+
+ // delete old files in db
+ DestroyDBWithoutCheck(dbname_, options_);
+
+ // delete old LATEST_BACKUP file, which some tests create for compatibility
+ // testing.
+ backup_chroot_env_->DeleteFile(latest_backup_).PermitUncheckedError();
+ }
+
+ void SetEnvsFromFileSystems() {
+ db_chroot_env_.reset(
+ new CompositeEnvWrapper(Env::Default(), db_chroot_fs_));
+ backup_chroot_env_.reset(
+ new CompositeEnvWrapper(Env::Default(), backup_chroot_fs_));
+ test_db_env_.reset(new CompositeEnvWrapper(Env::Default(), test_db_fs_));
+ options_.env = test_db_env_.get();
+ test_backup_env_.reset(
+ new CompositeEnvWrapper(Env::Default(), test_backup_fs_));
+ if (engine_options_) {
+ engine_options_->backup_env = test_backup_env_.get();
+ }
+ file_manager_.reset(new FileManager(backup_chroot_env_.get()));
+ db_file_manager_.reset(new FileManager(db_chroot_env_.get()));
+
+ // Create logger
+ DBOptions logger_options;
+ logger_options.env = db_chroot_env_.get();
+ ASSERT_OK(CreateLoggerFromOptions(dbname_, logger_options, &logger_));
+ }
+
+ DB* OpenDB() {
+ DB* db;
+ EXPECT_OK(DB::Open(options_, dbname_, &db));
+ return db;
+ }
+
+ void CloseAndReopenDB(bool read_only = false) {
+ // Close DB
+ db_.reset();
+
+ // Open DB
+ test_db_fs_->SetLimitWrittenFiles(1000000);
+ DB* db;
+ if (read_only) {
+ ASSERT_OK(DB::OpenForReadOnly(options_, dbname_, &db));
+ } else {
+ ASSERT_OK(DB::Open(options_, dbname_, &db));
+ }
+ db_.reset(db);
+ }
+
+ void InitializeDBAndBackupEngine(bool dummy = false) {
+ // reset all the db env defaults
+ test_db_fs_->SetLimitWrittenFiles(1000000);
+ test_db_fs_->SetDummySequentialFile(dummy);
+
+ DB* db;
+ if (dummy) {
+ dummy_db_ = new DummyDB(options_, dbname_);
+ db = dummy_db_;
+ } else {
+ ASSERT_OK(DB::Open(options_, dbname_, &db));
+ }
+ db_.reset(db);
+ }
+
+ virtual void OpenDBAndBackupEngine(
+ bool destroy_old_data = false, bool dummy = false,
+ ShareOption shared_option = kShareNoChecksum) {
+ InitializeDBAndBackupEngine(dummy);
+ // reset backup env defaults
+ test_backup_fs_->SetLimitWrittenFiles(1000000);
+ engine_options_->destroy_old_data = destroy_old_data;
+ engine_options_->share_table_files = shared_option != kNoShare;
+ engine_options_->share_files_with_checksum =
+ shared_option == kShareWithChecksum;
+ OpenBackupEngine(destroy_old_data);
+ }
+
+ void CloseDBAndBackupEngine() {
+ db_.reset();
+ backup_engine_.reset();
+ }
+
+ void OpenBackupEngine(bool destroy_old_data = false) {
+ engine_options_->destroy_old_data = destroy_old_data;
+ engine_options_->info_log = logger_.get();
+ BackupEngine* backup_engine;
+ ASSERT_OK(BackupEngine::Open(test_db_env_.get(), *engine_options_,
+ &backup_engine));
+ backup_engine_.reset(backup_engine);
+ }
+
+ void CloseBackupEngine() { backup_engine_.reset(nullptr); }
+
+ // cross-cutting test of GetBackupInfo
+ void AssertBackupInfoConsistency() {
+ std::vector<BackupInfo> backup_info;
+ backup_engine_->GetBackupInfo(&backup_info, /*with file details*/ true);
+ std::map<std::string, uint64_t> file_sizes;
+
+ // Find the files that are supposed to be there
+ for (auto& backup : backup_info) {
+ uint64_t sum_for_backup = 0;
+ for (auto& file : backup.file_details) {
+ auto e = file_sizes.find(file.relative_filename);
+ if (e == file_sizes.end()) {
+ // fprintf(stderr, "Adding %s -> %u\n",
+ // file.relative_filename.c_str(), (unsigned)file.size);
+ file_sizes[file.relative_filename] = file.size;
+ } else {
+ ASSERT_EQ(file_sizes[file.relative_filename], file.size);
+ }
+ sum_for_backup += file.size;
+ }
+ ASSERT_EQ(backup.size, sum_for_backup);
+ }
+
+ std::vector<BackupID> corrupt_backup_ids;
+ backup_engine_->GetCorruptedBackups(&corrupt_backup_ids);
+ bool has_corrupt = corrupt_backup_ids.size() > 0;
+
+ // Compare with what's in backup dir
+ std::vector<std::string> child_dirs;
+ ASSERT_OK(
+ test_backup_env_->GetChildren(backupdir_ + "/private", &child_dirs));
+ for (auto& dir : child_dirs) {
+ dir = "private/" + dir;
+ }
+ child_dirs.push_back("shared"); // might not exist
+ child_dirs.push_back("shared_checksum"); // might not exist
+ for (auto& dir : child_dirs) {
+ std::vector<std::string> children;
+ test_backup_env_->GetChildren(backupdir_ + "/" + dir, &children)
+ .PermitUncheckedError();
+ // fprintf(stderr, "ls %s\n", (backupdir_ + "/" + dir).c_str());
+ for (auto& file : children) {
+ uint64_t size;
+ size = UINT64_MAX; // appease clang-analyze
+ std::string rel_file = dir + "/" + file;
+ // fprintf(stderr, "stat %s\n", (backupdir_ + "/" + rel_file).c_str());
+ ASSERT_OK(
+ test_backup_env_->GetFileSize(backupdir_ + "/" + rel_file, &size));
+ auto e = file_sizes.find(rel_file);
+ if (e == file_sizes.end()) {
+ // The only case in which we should find files not reported
+ ASSERT_TRUE(has_corrupt);
+ } else {
+ ASSERT_EQ(e->second, size);
+ file_sizes.erase(e);
+ }
+ }
+ }
+
+ // Everything should have been matched
+ ASSERT_EQ(file_sizes.size(), 0);
+ }
+
+ // restores backup backup_id and asserts that keys in
+ // [start_exist, end_exist) exist and that keys in
+ // [end_exist, end) do not
+ //
+ // if backup_id == 0, restore from the latest backup
+ // if end == 0, skip the AssertEmpty check
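+ //
+ // For example (illustrative values): AssertBackupConsistency(3, 0, 300, 500)
+ // restores backup 3, then checks that keys 0..299 exist and keys 300..499
+ // do not.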
+ void AssertBackupConsistency(BackupID backup_id, uint32_t start_exist,
+ uint32_t end_exist, uint32_t end = 0,
+ bool keep_log_files = false) {
+ RestoreOptions restore_options(keep_log_files);
+ bool opened_backup_engine = false;
+ if (backup_engine_.get() == nullptr) {
+ opened_backup_engine = true;
+ OpenBackupEngine();
+ }
+ AssertBackupInfoConsistency();
+
+ // Now perform restore
+ if (backup_id > 0) {
+ ASSERT_OK(backup_engine_->RestoreDBFromBackup(backup_id, dbname_, dbname_,
+ restore_options));
+ } else {
+ ASSERT_OK(backup_engine_->RestoreDBFromLatestBackup(dbname_, dbname_,
+ restore_options));
+ }
+ DB* db = OpenDB();
+ // Check DB contents
+ AssertExists(db, start_exist, end_exist);
+ if (end != 0) {
+ AssertEmpty(db, end_exist, end);
+ }
+ delete db;
+ if (opened_backup_engine) {
+ CloseBackupEngine();
+ }
+ }
+
+ void DeleteLogFiles() {
+ std::vector<std::string> delete_logs;
+ ASSERT_OK(db_chroot_env_->GetChildren(dbname_, &delete_logs));
+ for (auto f : delete_logs) {
+ uint64_t number;
+ FileType type;
+ bool ok = ParseFileName(f, &number, &type);
+ if (ok && type == kWalFile) {
+ ASSERT_OK(db_chroot_env_->DeleteFile(dbname_ + "/" + f));
+ }
+ }
+ }
+
+ Status GetDataFilesInDB(const FileType& file_type,
+ std::vector<FileAttributes>* files) {
+ std::vector<std::string> live;
+ uint64_t ignore_manifest_size;
+ Status s = db_->GetLiveFiles(live, &ignore_manifest_size, /*flush*/ false);
+ if (!s.ok()) {
+ return s;
+ }
+ std::vector<FileAttributes> children;
+ s = test_db_env_->GetChildrenFileAttributes(dbname_, &children);
+ for (const auto& child : children) {
+ FileType type;
+ uint64_t number = 0;
+ if (ParseFileName(child.name, &number, &type) && type == file_type &&
+ std::find(live.begin(), live.end(), "/" + child.name) != live.end()) {
+ files->push_back(child);
+ }
+ }
+ return s;
+ }
+
+ Status GetRandomDataFileInDB(const FileType& file_type,
+ std::string* fname_out,
+ uint64_t* fsize_out = nullptr) {
+ Random rnd(6); // NB: hardly "random"
+ std::vector<FileAttributes> files;
+ Status s = GetDataFilesInDB(file_type, &files);
+ if (!s.ok()) {
+ return s;
+ }
+ if (files.empty()) {
+ return Status::NotFound("");
+ }
+ size_t i = rnd.Uniform(static_cast<int>(files.size()));
+ *fname_out = dbname_ + "/" + files[i].name;
+ if (fsize_out) {
+ *fsize_out = files[i].size_bytes;
+ }
+ return Status::OK();
+ }
+
+ Status CorruptRandomDataFileInDB(const FileType& file_type) {
+ std::string fname;
+ uint64_t fsize = 0;
+ Status s = GetRandomDataFileInDB(file_type, &fname, &fsize);
+ if (!s.ok()) {
+ return s;
+ }
+
+ std::string file_contents;
+ s = ReadFileToString(test_db_env_.get(), fname, &file_contents);
+ if (!s.ok()) {
+ return s;
+ }
+ s = test_db_env_->DeleteFile(fname);
+ if (!s.ok()) {
+ return s;
+ }
+
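+ // Change the first byte: adding 257 and taking mod 256 bumps the byte by
+ // one (mod 256), guaranteeing the content differs from the original.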
+ file_contents[0] = (file_contents[0] + 257) % 256;
+ return WriteStringToFile(test_db_env_.get(), file_contents, fname);
+ }
+
+ void AssertDirectoryFilesMatchRegex(const std::string& dir,
+ const TestRegex& pattern,
+ const std::string& file_type,
+ int minimum_count) {
+ std::vector<FileAttributes> children;
+ ASSERT_OK(file_manager_->GetChildrenFileAttributes(dir, &children));
+ int found_count = 0;
+ for (const auto& child : children) {
+ if (EndsWith(child.name, file_type)) {
+ ASSERT_MATCHES_REGEX(child.name, pattern);
+ ++found_count;
+ }
+ }
+ ASSERT_GE(found_count, minimum_count);
+ }
+
+ void AssertDirectoryFilesSizeIndicators(const std::string& dir,
+ int minimum_count) {
+ std::vector<FileAttributes> children;
+ ASSERT_OK(file_manager_->GetChildrenFileAttributes(dir, &children));
+ int found_count = 0;
+ for (const auto& child : children) {
+ auto last_underscore = child.name.find_last_of('_');
+ auto last_dot = child.name.find_last_of('.');
+ ASSERT_NE(child.name, child.name.substr(0, last_underscore));
+ ASSERT_NE(child.name, child.name.substr(0, last_dot));
+ ASSERT_LT(last_underscore, last_dot);
+ std::string s = child.name.substr(last_underscore + 1,
+ last_dot - (last_underscore + 1));
+ ASSERT_EQ(s, std::to_string(child.size_bytes));
+ ++found_count;
+ }
+ ASSERT_GE(found_count, minimum_count);
+ }
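+ // For example, under kUseDbSessionId | kFlagIncludeFileSize naming, an
+ // (illustrative) file name like 12345_sABCDEFGHIJKLMNOPQRST_6789.sst must
+ // end in "_6789", where 6789 is the file's size in bytes; the helper above
+ // checks exactly that trailing component.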
+
+ // files
+ std::string dbname_;
+ std::string backupdir_;
+ std::string latest_backup_;
+
+ // logger_ must be above backup_engine_ such that the engine's destructor,
+ // which uses a raw pointer to the logger, executes first.
+ std::shared_ptr<Logger> logger_;
+
+ // FileSystems
+ std::shared_ptr<FileSystem> db_chroot_fs_;
+ std::shared_ptr<FileSystem> backup_chroot_fs_;
+ std::shared_ptr<TestFs> test_db_fs_;
+ std::shared_ptr<TestFs> test_backup_fs_;
+
+ // Env wrappers
+ std::unique_ptr<Env> db_chroot_env_;
+ std::unique_ptr<Env> backup_chroot_env_;
+ std::unique_ptr<Env> test_db_env_;
+ std::unique_ptr<Env> test_backup_env_;
+ std::unique_ptr<FileManager> file_manager_;
+ std::unique_ptr<FileManager> db_file_manager_;
+
+ // all the dbs!
+ DummyDB* dummy_db_; // owned as db_ when present
+ std::unique_ptr<DB> db_;
+ std::unique_ptr<BackupEngine> backup_engine_;
+
+ // options
+ Options options_;
+
+ protected:
+ void DestroyDBWithoutCheck(const std::string& dbname,
+ const Options& options) {
+ // DestroyDB may fail because the db might not exist for some tests
+ DestroyDB(dbname, options).PermitUncheckedError();
+ }
+
+ std::unique_ptr<BackupEngineOptions> engine_options_;
+}; // BackupEngineTest
+
+void AppendPath(const std::string& path, std::vector<std::string>& v) {
+ for (auto& f : v) {
+ f = path + f;
+ }
+}
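+// e.g., with path == "/tempbk", AppendPath rewrites "/meta/.1.tmp" to
+// "/tempbk/meta/.1.tmp" for every entry in v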
+
+class BackupEngineTestWithParam : public BackupEngineTest,
+ public testing::WithParamInterface<bool> {
+ public:
+ BackupEngineTestWithParam() {
+ engine_options_->share_files_with_checksum = GetParam();
+ }
+ void OpenDBAndBackupEngine(
+ bool destroy_old_data = false, bool dummy = false,
+ ShareOption shared_option = kShareNoChecksum) override {
+ BackupEngineTest::InitializeDBAndBackupEngine(dummy);
+ // reset backup env defaults
+ test_backup_fs_->SetLimitWrittenFiles(1000000);
+ engine_options_->destroy_old_data = destroy_old_data;
+ engine_options_->share_table_files = shared_option != kNoShare;
+ // NOTE: keep share_files_with_checksum setting from constructor
+ OpenBackupEngine(destroy_old_data);
+ }
+};
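+// A minimal usage sketch of the public API exercised by these tests (paths
+// are hypothetical and error handling is elided):
+//
+//   BackupEngineOptions opts("/path/to/backups");
+//   BackupEngine* be = nullptr;
+//   Status s = BackupEngine::Open(Env::Default(), opts, &be);
+//   if (s.ok()) s = be->CreateNewBackup(db, /*flush_before_backup*/ true);
+//   if (s.ok()) s = be->RestoreDBFromLatestBackup("/path/to/db", "/path/to/db");
+//   delete be;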
+
+TEST_F(BackupEngineTest, FileCollision) {
+ const int keys_iteration = 100;
+ for (const auto& sopt : kAllShareOptions) {
+ OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */, sopt);
+ FillDB(db_.get(), 0, keys_iteration);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+ FillDB(db_.get(), keys_iteration, keys_iteration * 2);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+ CloseDBAndBackupEngine();
+
+ // If the db directory has been cleaned up, it is sensitive to file
+ // collision.
+ DestroyDBWithoutCheck(dbname_, options_);
+
+ // open fresh DB, but old backups present
+ OpenDBAndBackupEngine(false /* destroy_old_data */, false /* dummy */,
+ sopt);
+ FillDB(db_.get(), 0, keys_iteration);
+ ASSERT_OK(db_->Flush(FlushOptions())); // like backup would do
+ FillDB(db_.get(), keys_iteration, keys_iteration * 2);
+ if (sopt != kShareNoChecksum) {
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+ } else {
+ // The new table files created in FillDB() will clash with the old
+ // backup, and sharing table files without checksums runs into the
+ // file collision problem.
+ ASSERT_NOK(backup_engine_->CreateNewBackup(db_.get()));
+ ASSERT_OK(backup_engine_->PurgeOldBackups(0));
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+ }
+ CloseDBAndBackupEngine();
+
+ // delete old data
+ DestroyDBWithoutCheck(dbname_, options_);
+ }
+}
+
+ // This test verifies that the VerifyBackup method correctly identifies
+ // invalid backups
+TEST_P(BackupEngineTestWithParam, VerifyBackup) {
+ const int keys_iteration = 5000;
+ OpenDBAndBackupEngine(true);
+ // create five backups
+ for (int i = 0; i < 5; ++i) {
+ FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+ }
+ CloseDBAndBackupEngine();
+
+ OpenDBAndBackupEngine();
+ // ---------- case 1. - valid backup -----------
+ ASSERT_TRUE(backup_engine_->VerifyBackup(1).ok());
+
+ // ---------- case 2. - delete a file -----------
+ ASSERT_OK(file_manager_->DeleteRandomFileInDir(backupdir_ + "/private/1"));
+ ASSERT_TRUE(backup_engine_->VerifyBackup(1).IsNotFound());
+
+ // ---------- case 3. - corrupt a file -----------
+ std::string append_data = "Corrupting a random file";
+ ASSERT_OK(file_manager_->AppendToRandomFileInDir(backupdir_ + "/private/2",
+ append_data));
+ ASSERT_TRUE(backup_engine_->VerifyBackup(2).IsCorruption());
+
+ // ---------- case 4. - invalid backup -----------
+ ASSERT_TRUE(backup_engine_->VerifyBackup(6).IsNotFound());
+ CloseDBAndBackupEngine();
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+// open DB, write, close DB, backup, restore, repeat
+TEST_P(BackupEngineTestWithParam, OfflineIntegrationTest) {
+ // has to be a big number, so that it triggers the memtable flush
+ const int keys_iteration = 5000;
+ const int max_key = keys_iteration * 4 + 10;
+ // first iter -- flush before backup
+ // second iter -- don't flush before backup
+ for (int iter = 0; iter < 2; ++iter) {
+ // delete old data
+ DestroyDBWithoutCheck(dbname_, options_);
+ bool destroy_data = true;
+
+ // every iteration --
+ // 1. insert new data in the DB
+ // 2. backup the DB
+ // 3. destroy the db
+ // 4. restore the db, check everything is still there
+ for (int i = 0; i < 5; ++i) {
+ // in the last iteration, put a smaller amount of data
+ int fill_up_to = std::min(keys_iteration * (i + 1), max_key);
+ // ---- insert new data and back up ----
+ OpenDBAndBackupEngine(destroy_data);
+ destroy_data = false;
+ // kAutoFlushOnly to preserve legacy test behavior (consider updating)
+ FillDB(db_.get(), keys_iteration * i, fill_up_to, kAutoFlushOnly);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), iter == 0))
+ << "iter: " << iter << ", idx: " << i;
+ CloseDBAndBackupEngine();
+ DestroyDBWithoutCheck(dbname_, options_);
+
+ // ---- make sure it's empty ----
+ DB* db = OpenDB();
+ AssertEmpty(db, 0, fill_up_to);
+ delete db;
+
+ // ---- restore the DB ----
+ OpenBackupEngine();
+ if (i >= 3) { // test purge old backups
+ // when i == 4, purge to only 1 backup
+ // when i == 3, purge to 2 backups
+ ASSERT_OK(backup_engine_->PurgeOldBackups(5 - i));
+ }
+ // ---- make sure the data is there ---
+ AssertBackupConsistency(0, 0, fill_up_to, max_key);
+ CloseBackupEngine();
+ }
+ }
+}
+
+// open DB, write, backup, write, backup, close, restore
+TEST_P(BackupEngineTestWithParam, OnlineIntegrationTest) {
+ // has to be a big number, so that it triggers the memtable flush
+ const int keys_iteration = 5000;
+ const int max_key = keys_iteration * 4 + 10;
+ Random rnd(7);
+ // delete old data
+ DestroyDBWithoutCheck(dbname_, options_);
+
+ // TODO: Implement & test db_paths support in backup (not supported in
+ // restore)
+ // options_.db_paths.emplace_back(dbname_, 500 * 1024);
+ // options_.db_paths.emplace_back(dbname_ + "_2", 1024 * 1024 * 1024);
+
+ OpenDBAndBackupEngine(true);
+ // write some data, backup, repeat
+ for (int i = 0; i < 5; ++i) {
+ if (i == 4) {
+ // delete backup number 2, online delete!
+ ASSERT_OK(backup_engine_->DeleteBackup(2));
+ }
+ // in the last iteration, put a smaller amount of data,
+ // so that backups can share sst files
+ int fill_up_to = std::min(keys_iteration * (i + 1), max_key);
+ // kAutoFlushOnly to preserve legacy test behavior (consider updating)
+ FillDB(db_.get(), keys_iteration * i, fill_up_to, kAutoFlushOnly);
+ // we should get consistent results with flush_before_backup
+ // set to both true and false
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), !!(rnd.Next() % 2)));
+ }
+ // close and destroy
+ CloseDBAndBackupEngine();
+ DestroyDBWithoutCheck(dbname_, options_);
+
+ // ---- make sure it's empty ----
+ DB* db = OpenDB();
+ AssertEmpty(db, 0, max_key);
+ delete db;
+
+ // ---- restore every backup and verify all the data is there ----
+ OpenBackupEngine();
+ for (int i = 1; i <= 5; ++i) {
+ if (i == 2) {
+ // we deleted backup 2
+ Status s = backup_engine_->RestoreDBFromBackup(2, dbname_, dbname_);
+ ASSERT_TRUE(!s.ok());
+ } else {
+ int fill_up_to = std::min(keys_iteration * i, max_key);
+ AssertBackupConsistency(i, 0, fill_up_to, max_key);
+ }
+ }
+
+ // delete some backups -- this should leave only backups 3 and 5 alive
+ ASSERT_OK(backup_engine_->DeleteBackup(4));
+ ASSERT_OK(backup_engine_->PurgeOldBackups(2));
+
+ std::vector<BackupInfo> backup_info;
+ backup_engine_->GetBackupInfo(&backup_info);
+ ASSERT_EQ(2UL, backup_info.size());
+
+ // check backup 3
+ AssertBackupConsistency(3, 0, 3 * keys_iteration, max_key);
+ // check backup 5
+ AssertBackupConsistency(5, 0, max_key);
+
+ CloseBackupEngine();
+}
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+INSTANTIATE_TEST_CASE_P(BackupEngineTestWithParam, BackupEngineTestWithParam,
+ ::testing::Bool());
+
+// this will make sure that backup does not copy the same file twice
+TEST_F(BackupEngineTest, NoDoubleCopy_And_AutoGC) {
+ OpenDBAndBackupEngine(true, true);
+
+ // should write 5 DB files + one meta file
+ test_backup_fs_->SetLimitWrittenFiles(7);
+ test_backup_fs_->ClearWrittenFiles();
+ test_db_fs_->SetLimitWrittenFiles(0);
+ dummy_db_->live_files_ = {"00010.sst", "00011.sst", "CURRENT", "MANIFEST-01",
+ "00011.log"};
+ test_db_fs_->SetFilenamesForMockedAttrs(dummy_db_->live_files_);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), false));
+ std::vector<std::string> should_have_written = {
+ "/shared/.00010.sst.tmp", "/shared/.00011.sst.tmp", "/private/1/CURRENT",
+ "/private/1/MANIFEST-01", "/private/1/00011.log", "/meta/.1.tmp"};
+ AppendPath(backupdir_, should_have_written);
+ test_backup_fs_->AssertWrittenFiles(should_have_written);
+
+ char db_number = '1';
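+ // db_number is incremented to '2', '3', '4' below, matching the ids of the
+ // next three backups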
+
+ for (std::string other_sst : {"00015.sst", "00017.sst", "00019.sst"}) {
+ // should write 4 new DB files + one meta file
+ // should not write/copy 00010.sst, since it's already there!
+ test_backup_fs_->SetLimitWrittenFiles(6);
+ test_backup_fs_->ClearWrittenFiles();
+
+ dummy_db_->live_files_ = {"00010.sst", other_sst, "CURRENT", "MANIFEST-01",
+ "00011.log"};
+ test_db_fs_->SetFilenamesForMockedAttrs(dummy_db_->live_files_);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), false));
+ // should not open 00010.sst - it's already there
+
+ ++db_number;
+ std::string private_dir = std::string("/private/") + db_number;
+ should_have_written = {
+ "/shared/." + other_sst + ".tmp", private_dir + "/CURRENT",
+ private_dir + "/MANIFEST-01", private_dir + "/00011.log",
+ std::string("/meta/.") + db_number + ".tmp"};
+ AppendPath(backupdir_, should_have_written);
+ test_backup_fs_->AssertWrittenFiles(should_have_written);
+ }
+
+ ASSERT_OK(backup_engine_->DeleteBackup(1));
+ ASSERT_OK(test_backup_env_->FileExists(backupdir_ + "/shared/00010.sst"));
+
+ // 00011.sst was only in backup 1, should be deleted
+ ASSERT_EQ(Status::NotFound(),
+ test_backup_env_->FileExists(backupdir_ + "/shared/00011.sst"));
+ ASSERT_OK(test_backup_env_->FileExists(backupdir_ + "/shared/00015.sst"));
+
+ // MANIFEST file size should be only 100
+ uint64_t size = 0;
+ ASSERT_OK(test_backup_env_->GetFileSize(backupdir_ + "/private/2/MANIFEST-01",
+ &size));
+ ASSERT_EQ(100UL, size);
+ ASSERT_OK(
+ test_backup_env_->GetFileSize(backupdir_ + "/shared/00015.sst", &size));
+ ASSERT_EQ(200UL, size);
+
+ CloseBackupEngine();
+
+ //
+ // Now simulate incomplete delete by removing just meta
+ //
+ ASSERT_OK(test_backup_env_->DeleteFile(backupdir_ + "/meta/2"));
+
+ OpenBackupEngine();
+
+ // Backup 1 appears to be removed, so we expect to see
+ // 2 non-corrupt and 0 corrupt backups
+ std::vector<BackupInfo> backup_info;
+ std::vector<BackupID> corrupt_backup_ids;
+ backup_engine_->GetBackupInfo(&backup_info);
+ backup_engine_->GetCorruptedBackups(&corrupt_backup_ids);
+ ASSERT_EQ(2UL, backup_info.size());
+ ASSERT_EQ(0UL, corrupt_backup_ids.size());
+
+ // Keep the two we see, but this should suffice to purge unreferenced
+ // shared files from incomplete delete.
+ ASSERT_OK(backup_engine_->PurgeOldBackups(2));
+
+ // Make sure dangling sst file has been removed (somewhere along this
+ // process). GarbageCollect should not be needed.
+ ASSERT_EQ(Status::NotFound(),
+ test_backup_env_->FileExists(backupdir_ + "/shared/00015.sst"));
+ ASSERT_OK(test_backup_env_->FileExists(backupdir_ + "/shared/00017.sst"));
+ ASSERT_OK(test_backup_env_->FileExists(backupdir_ + "/shared/00019.sst"));
+
+ // Now actually purge a good one
+ ASSERT_OK(backup_engine_->PurgeOldBackups(1));
+
+ ASSERT_EQ(Status::NotFound(),
+ test_backup_env_->FileExists(backupdir_ + "/shared/00017.sst"));
+ ASSERT_OK(test_backup_env_->FileExists(backupdir_ + "/shared/00019.sst"));
+
+ CloseDBAndBackupEngine();
+}
+
+// test various kind of corruptions that may happen:
+// 1. Not able to write a file for backup - that backup should fail,
+// everything else should work
+ // 2. Corrupted backup meta file or missing backed-up file - we should
+// not be able to open that backup, but all other backups should be
+// fine
+// 3. Corrupted checksum value - if the checksum is not a valid uint32_t,
+// db open should fail, otherwise, it aborts during the restore process.
+TEST_F(BackupEngineTest, CorruptionsTest) {
+ const int keys_iteration = 5000;
+ Random rnd(6);
+ Status s;
+
+ OpenDBAndBackupEngine(true);
+ // create five backups
+ for (int i = 0; i < 5; ++i) {
+ FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), !!(rnd.Next() % 2)));
+ }
+
+ // ---------- case 1. - fail a write -----------
+ // try creating backup 6, but fail a write
+ FillDB(db_.get(), keys_iteration * 5, keys_iteration * 6);
+ test_backup_fs_->SetLimitWrittenFiles(2);
+ // should fail
+ s = backup_engine_->CreateNewBackup(db_.get(), !!(rnd.Next() % 2));
+ ASSERT_NOK(s);
+ test_backup_fs_->SetLimitWrittenFiles(1000000);
+ // latest backup should have all the keys
+ CloseDBAndBackupEngine();
+ AssertBackupConsistency(0, 0, keys_iteration * 5, keys_iteration * 6);
+
+ // --------- case 2. corrupted backup meta or missing backed-up file ----
+ ASSERT_OK(file_manager_->CorruptFile(backupdir_ + "/meta/5", 3));
+ // since 5 meta is now corrupted, latest backup should be 4
+ AssertBackupConsistency(0, 0, keys_iteration * 4, keys_iteration * 5);
+ OpenBackupEngine();
+ s = backup_engine_->RestoreDBFromBackup(5, dbname_, dbname_);
+ ASSERT_NOK(s);
+ CloseBackupEngine();
+ ASSERT_OK(file_manager_->DeleteRandomFileInDir(backupdir_ + "/private/4"));
+ // 4 is corrupted, 3 is the latest backup now
+ AssertBackupConsistency(0, 0, keys_iteration * 3, keys_iteration * 5);
+ OpenBackupEngine();
+ s = backup_engine_->RestoreDBFromBackup(4, dbname_, dbname_);
+ CloseBackupEngine();
+ ASSERT_NOK(s);
+
+ // --------- case 3. corrupted checksum value ----
+ ASSERT_OK(file_manager_->CorruptChecksum(backupdir_ + "/meta/3", false));
+ // the checksum of backup 3 is an invalid value; this is detected at
+ // open time, and the engine reverts to the previous backup automatically
+ AssertBackupConsistency(0, 0, keys_iteration * 2, keys_iteration * 5);
+ // the checksum of backup 2 appears to be valid, so the checksum
+ // mismatch is only caught during restore, which then aborts
+ ASSERT_OK(file_manager_->CorruptChecksum(backupdir_ + "/meta/2", true));
+ ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/2"));
+ OpenBackupEngine();
+ ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/2"));
+ s = backup_engine_->RestoreDBFromBackup(2, dbname_, dbname_);
+ ASSERT_NOK(s);
+
+ // make sure that no corrupt backups have actually been deleted!
+ ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/1"));
+ ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/2"));
+ ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/3"));
+ ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/4"));
+ ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/5"));
+ ASSERT_OK(file_manager_->FileExists(backupdir_ + "/private/1"));
+ ASSERT_OK(file_manager_->FileExists(backupdir_ + "/private/2"));
+ ASSERT_OK(file_manager_->FileExists(backupdir_ + "/private/3"));
+ ASSERT_OK(file_manager_->FileExists(backupdir_ + "/private/4"));
+ ASSERT_OK(file_manager_->FileExists(backupdir_ + "/private/5"));
+
+ // delete the corrupt backups and then make sure they're actually deleted
+ ASSERT_OK(backup_engine_->DeleteBackup(5));
+ ASSERT_OK(backup_engine_->DeleteBackup(4));
+ ASSERT_OK(backup_engine_->DeleteBackup(3));
+ ASSERT_OK(backup_engine_->DeleteBackup(2));
+ // Should not be needed anymore with auto-GC on DeleteBackup
+ //(void)backup_engine_->GarbageCollect();
+ ASSERT_EQ(Status::NotFound(),
+ file_manager_->FileExists(backupdir_ + "/meta/5"));
+ ASSERT_EQ(Status::NotFound(),
+ file_manager_->FileExists(backupdir_ + "/private/5"));
+ ASSERT_EQ(Status::NotFound(),
+ file_manager_->FileExists(backupdir_ + "/meta/4"));
+ ASSERT_EQ(Status::NotFound(),
+ file_manager_->FileExists(backupdir_ + "/private/4"));
+ ASSERT_EQ(Status::NotFound(),
+ file_manager_->FileExists(backupdir_ + "/meta/3"));
+ ASSERT_EQ(Status::NotFound(),
+ file_manager_->FileExists(backupdir_ + "/private/3"));
+ ASSERT_EQ(Status::NotFound(),
+ file_manager_->FileExists(backupdir_ + "/meta/2"));
+ ASSERT_EQ(Status::NotFound(),
+ file_manager_->FileExists(backupdir_ + "/private/2"));
+ CloseBackupEngine();
+ AssertBackupConsistency(0, 0, keys_iteration * 1, keys_iteration * 5);
+
+ // new backup should be 2!
+ OpenDBAndBackupEngine();
+ FillDB(db_.get(), keys_iteration * 1, keys_iteration * 2);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), !!(rnd.Next() % 2)));
+ CloseDBAndBackupEngine();
+ AssertBackupConsistency(2, 0, keys_iteration * 2, keys_iteration * 5);
+}
+
+// Corrupt a file but maintain its size
+TEST_F(BackupEngineTest, CorruptFileMaintainSize) {
+ const int keys_iteration = 5000;
+ OpenDBAndBackupEngine(true);
+ // create a backup
+ FillDB(db_.get(), 0, keys_iteration);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+ CloseDBAndBackupEngine();
+
+ OpenDBAndBackupEngine();
+ // verify with file size
+ ASSERT_OK(backup_engine_->VerifyBackup(1, false));
+ // verify with file checksum
+ ASSERT_OK(backup_engine_->VerifyBackup(1, true));
+
+ std::string file_to_corrupt;
+ uint64_t file_size = 0;
+ // under normal circumstances, there should be at least one nonempty file
+ while (file_size == 0) {
+ // get a random file in /private/1
+ assert(file_manager_
+ ->GetRandomFileInDir(backupdir_ + "/private/1", &file_to_corrupt,
+ &file_size)
+ .ok());
+ // corrupt the file by replacing its content with file_size random bytes
+ ASSERT_OK(file_manager_->CorruptFile(file_to_corrupt, file_size));
+ }
+ // file sizes match
+ ASSERT_OK(backup_engine_->VerifyBackup(1, false));
+ // file checksums mismatch
+ ASSERT_NOK(backup_engine_->VerifyBackup(1, true));
+ // sanity check, use default second argument
+ ASSERT_OK(backup_engine_->VerifyBackup(1));
+ CloseDBAndBackupEngine();
+
+ // an extra challenge
+ // set share_files_with_checksum to true and do two more backups
+ // corrupt all the table files in shared_checksum but maintain their sizes
+ OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */,
+ kShareWithChecksum);
+ // create two backups
+ for (int i = 1; i < 3; ++i) {
+ FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+ }
+ CloseDBAndBackupEngine();
+
+ OpenDBAndBackupEngine();
+ std::vector<FileAttributes> children;
+ const std::string dir = backupdir_ + "/shared_checksum";
+ ASSERT_OK(file_manager_->GetChildrenFileAttributes(dir, &children));
+ for (const auto& child : children) {
+ if (child.size_bytes == 0) {
+ continue;
+ }
+ // corrupt the file by replacing its content with file_size random bytes
+ ASSERT_OK(
+ file_manager_->CorruptFile(dir + "/" + child.name, child.size_bytes));
+ }
+ // file sizes match
+ ASSERT_OK(backup_engine_->VerifyBackup(1, false));
+ ASSERT_OK(backup_engine_->VerifyBackup(2, false));
+ // file checksums mismatch
+ ASSERT_NOK(backup_engine_->VerifyBackup(1, true));
+ ASSERT_NOK(backup_engine_->VerifyBackup(2, true));
+ CloseDBAndBackupEngine();
+}
+
+// Corrupt a blob file but maintain its size
+TEST_P(BackupEngineTestWithParam, CorruptBlobFileMaintainSize) {
+ const int keys_iteration = 5000;
+ OpenDBAndBackupEngine(true);
+ // create a backup
+ FillDB(db_.get(), 0, keys_iteration);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+ CloseDBAndBackupEngine();
+
+ OpenDBAndBackupEngine();
+ // verify with file size
+ ASSERT_OK(backup_engine_->VerifyBackup(1, false));
+ // verify with file checksum
+ ASSERT_OK(backup_engine_->VerifyBackup(1, true));
+
+ std::string file_to_corrupt;
+ std::vector<FileAttributes> children;
+
+ std::string dir = backupdir_;
+ if (engine_options_->share_files_with_checksum) {
+ dir += "/shared_checksum";
+ } else {
+ dir += "/shared";
+ }
+
+ ASSERT_OK(file_manager_->GetChildrenFileAttributes(dir, &children));
+
+ for (const auto& child : children) {
+ if (EndsWith(child.name, ".blob") && child.size_bytes != 0) {
+ // corrupt the blob file by replacing its content with the same number
+ // of random bytes
+ ASSERT_OK(
+ file_manager_->CorruptFile(dir + "/" + child.name, child.size_bytes));
+ }
+ }
+
+ // file sizes match
+ ASSERT_OK(backup_engine_->VerifyBackup(1, false));
+ // file checksums mismatch
+ ASSERT_NOK(backup_engine_->VerifyBackup(1, true));
+ // sanity check, use default second argument
+ ASSERT_OK(backup_engine_->VerifyBackup(1));
+ CloseDBAndBackupEngine();
+}
+
+ // Test if BackupEngine will fail to create a new backup if a table file has
+ // been corrupted and the table file checksum is stored in the DB manifest
+TEST_F(BackupEngineTest, TableFileCorruptedBeforeBackup) {
+ const int keys_iteration = 50000;
+
+ OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */,
+ kNoShare);
+ FillDB(db_.get(), 0, keys_iteration);
+ CloseAndReopenDB(/*read_only*/ true);
+ // corrupt a random table file in the DB directory
+ ASSERT_OK(CorruptRandomDataFileInDB(kTableFile));
+ // file_checksum_gen_factory is null, and thus table checksums are not
+ // verified when creating a new backup; no corruption is detected
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+ CloseDBAndBackupEngine();
+
+ // delete old files in db
+ DestroyDBWithoutCheck(dbname_, options_);
+
+ // Enable table file checksum in DB manifest
+ options_.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+ OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */,
+ kNoShare);
+ FillDB(db_.get(), 0, keys_iteration);
+ CloseAndReopenDB(/*read_only*/ true);
+ // corrupt a random table file in the DB directory
+ ASSERT_OK(CorruptRandomDataFileInDB(kTableFile));
+ // table file checksum is enabled so we should be able to detect any
+ // corruption
+ ASSERT_NOK(backup_engine_->CreateNewBackup(db_.get()));
+ CloseDBAndBackupEngine();
+}
+
+ // Test if BackupEngine will fail to create a new backup if some blob files
+ // have been corrupted and the blob file checksum is stored in the DB manifest
+TEST_F(BackupEngineTest, BlobFileCorruptedBeforeBackup) {
+ const int keys_iteration = 50000;
+
+ OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */,
+ kNoShare);
+ FillDB(db_.get(), 0, keys_iteration);
+ CloseAndReopenDB(/*read_only*/ true);
+ // corrupt a random blob file in the DB directory
+ ASSERT_OK(CorruptRandomDataFileInDB(kBlobFile));
+ // file_checksum_gen_factory is null, and thus blob file checksums are not
+ // verified when creating a new backup; no corruption is detected
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+ CloseDBAndBackupEngine();
+
+ // delete old files in db
+ DestroyDBWithoutCheck(dbname_, options_);
+
+ // Enable file checksum in DB manifest
+ options_.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+ OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */,
+ kNoShare);
+ FillDB(db_.get(), 0, keys_iteration);
+ CloseAndReopenDB(/*read_only*/ true);
+ // corrupt a random blob file in the DB directory
+ ASSERT_OK(CorruptRandomDataFileInDB(kBlobFile));
+
+ // file checksum is enabled so we should be able to detect any
+ // corruption
+ ASSERT_NOK(backup_engine_->CreateNewBackup(db_.get()));
+ CloseDBAndBackupEngine();
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+ // Test if BackupEngine will fail to create a new backup if a table file has
+ // been corrupted and the table file checksum is stored in the DB manifest,
+ // for the case when backup table files are stored in a shared directory
+TEST_P(BackupEngineTestWithParam, TableFileCorruptedBeforeBackup) {
+ const int keys_iteration = 50000;
+
+ OpenDBAndBackupEngine(true /* destroy_old_data */);
+ FillDB(db_.get(), 0, keys_iteration);
+ CloseAndReopenDB(/*read_only*/ true);
+ // corrupt a random table file in the DB directory
+ ASSERT_OK(CorruptRandomDataFileInDB(kTableFile));
+ // cannot detect corruption since DB manifest has no table checksums
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+ CloseDBAndBackupEngine();
+
+ // delete old files in db
+ DestroyDBWithoutCheck(dbname_, options_);
+
+ // Enable table checksums in DB manifest
+ options_.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+ OpenDBAndBackupEngine(true /* destroy_old_data */);
+ FillDB(db_.get(), 0, keys_iteration);
+ CloseAndReopenDB(/*read_only*/ true);
+ // corrupt a random table file in the DB directory
+ ASSERT_OK(CorruptRandomDataFileInDB(kTableFile));
+ // corruption is detected
+ ASSERT_NOK(backup_engine_->CreateNewBackup(db_.get()));
+ CloseDBAndBackupEngine();
+}
+
+// Test if BackupEngine will fail to create new backup if some blob files have
+// been corrupted and the blob file checksum is stored in the DB manifest for
+// the case when backup blob files will be stored in a shared directory
+TEST_P(BackupEngineTestWithParam, BlobFileCorruptedBeforeBackup) {
+ const int keys_iteration = 50000;
+ OpenDBAndBackupEngine(true /* destroy_old_data */);
+ FillDB(db_.get(), 0, keys_iteration);
+ CloseAndReopenDB(/*read_only*/ true);
+ // corrupt a random blob file in the DB directory
+ ASSERT_OK(CorruptRandomDataFileInDB(kBlobFile));
+ // cannot detect corruption since DB manifest has no blob file checksums
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+ CloseDBAndBackupEngine();
+
+ // delete old files in db
+ DestroyDBWithoutCheck(dbname_, options_);
+
+ // Enable blob file checksums in DB manifest
+ options_.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+ OpenDBAndBackupEngine(true /* destroy_old_data */);
+ FillDB(db_.get(), 0, keys_iteration);
+ CloseAndReopenDB(/*read_only*/ true);
+ // corrupt a random blob file in the DB directory
+ ASSERT_OK(CorruptRandomDataFileInDB(kBlobFile));
+ // corruption is detected
+ ASSERT_NOK(backup_engine_->CreateNewBackup(db_.get()));
+ CloseDBAndBackupEngine();
+}
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_F(BackupEngineTest, TableFileWithoutDbChecksumCorruptedDuringBackup) {
+ const int keys_iteration = 50000;
+ engine_options_->share_files_with_checksum_naming = kLegacyCrc32cAndFileSize;
+ // When share_files_with_checksum is on, we calculate checksums of table
+ // files before and after copying. So we can test whether a corruption has
+ // happened while the file is being copied to the backup directory.
+ OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */,
+ kShareWithChecksum);
+
+ FillDB(db_.get(), 0, keys_iteration);
+ std::atomic<bool> corrupted{false};
+ // corrupt files when copying to the backup directory
+ SyncPoint::GetInstance()->SetCallBack(
+ "BackupEngineImpl::CopyOrCreateFile:CorruptionDuringBackup",
+ [&](void* data) {
+ if (data != nullptr) {
+ Slice* d = reinterpret_cast<Slice*>(data);
+ if (!d->empty()) {
+ d->remove_suffix(1);
+ corrupted = true;
+ }
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ Status s = backup_engine_->CreateNewBackup(db_.get());
+ if (corrupted) {
+ ASSERT_NOK(s);
+ } else {
+ // we should not reach this path in normal cases
+ ASSERT_OK(s);
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ CloseDBAndBackupEngine();
+ // delete old files in db
+ DestroyDBWithoutCheck(dbname_, options_);
+}
+
+TEST_F(BackupEngineTest, TableFileWithDbChecksumCorruptedDuringBackup) {
+ const int keys_iteration = 50000;
+ options_.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+ for (auto& sopt : kAllShareOptions) {
+ // Since the default DB table file checksum is on, we obtain checksums of
+ // table files from the DB manifest before copying and verify it with the
+ // one calculated during copying.
+ // Therefore, we can test whether a corruption has happened while the file
+ // is being copied to the backup directory.
+ OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */, sopt);
+
+ FillDB(db_.get(), 0, keys_iteration);
+
+ // corrupt files when copying to the backup directory
+ SyncPoint::GetInstance()->SetCallBack(
+ "BackupEngineImpl::CopyOrCreateFile:CorruptionDuringBackup",
+ [&](void* data) {
+ if (data != nullptr) {
+ Slice* d = reinterpret_cast<Slice*>(data);
+ if (!d->empty()) {
+ d->remove_suffix(1);
+ }
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ // The only case in which we can't detect a corruption is when the file
+ // being backed up is empty. But as keys_iteration is large, such
+ // a case shouldn't happen, and we should be able to detect
+ // the corruption.
+ ASSERT_NOK(backup_engine_->CreateNewBackup(db_.get()));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ CloseDBAndBackupEngine();
+ // delete old files in db
+ DestroyDBWithoutCheck(dbname_, options_);
+ }
+}
+
+TEST_F(BackupEngineTest, InterruptCreationTest) {
+ // Interrupt backup creation by failing new writes and failing cleanup of the
+ // partial state. Then verify a subsequent backup can still succeed.
+ const int keys_iteration = 5000;
+ Random rnd(6);
+
+ OpenDBAndBackupEngine(true /* destroy_old_data */);
+ FillDB(db_.get(), 0, keys_iteration);
+ test_backup_fs_->SetLimitWrittenFiles(2);
+ test_backup_fs_->SetDeleteFileFailure(true);
+ // should fail creation
+ ASSERT_NOK(backup_engine_->CreateNewBackup(db_.get(), !!(rnd.Next() % 2)));
+ CloseDBAndBackupEngine();
+ // should also fail cleanup so the tmp directory stays behind
+ ASSERT_OK(backup_chroot_env_->FileExists(backupdir_ + "/private/1/"));
+
+ OpenDBAndBackupEngine(false /* destroy_old_data */);
+ test_backup_fs_->SetLimitWrittenFiles(1000000);
+ test_backup_fs_->SetDeleteFileFailure(false);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), !!(rnd.Next() % 2)));
+ // latest backup should have all the keys
+ CloseDBAndBackupEngine();
+ AssertBackupConsistency(0, 0, keys_iteration);
+}
+
+TEST_F(BackupEngineTest, FlushCompactDuringBackupCheckpoint) {
+ const int keys_iteration = 5000;
+ options_.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+ for (const auto& sopt : kAllShareOptions) {
+ OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */, sopt);
+ FillDB(db_.get(), 0, keys_iteration);
+ // That FillDB leaves a mix of flushed and unflushed data
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"CheckpointImpl::CreateCustomCheckpoint:AfterGetLive1",
+ "BackupEngineTest::FlushCompactDuringBackupCheckpoint:Before"},
+ {"BackupEngineTest::FlushCompactDuringBackupCheckpoint:After",
+ "CheckpointImpl::CreateCustomCheckpoint:AfterGetLive2"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+ ROCKSDB_NAMESPACE::port::Thread flush_thread{[this]() {
+ TEST_SYNC_POINT(
+ "BackupEngineTest::FlushCompactDuringBackupCheckpoint:Before");
+ FillDB(db_.get(), keys_iteration, 2 * keys_iteration);
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ DBImpl* dbi = static_cast<DBImpl*>(db_.get());
+ ASSERT_OK(dbi->TEST_WaitForFlushMemTable());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbi->TEST_WaitForCompact());
+ TEST_SYNC_POINT(
+ "BackupEngineTest::FlushCompactDuringBackupCheckpoint:After");
+ }};
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+ flush_thread.join();
+ CloseDBAndBackupEngine();
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ /* FIXME(peterd): reinstate with option for checksum in file names
+ if (sopt == kShareWithChecksum) {
+ // Ensure we actually got DB manifest checksums by inspecting
+ // shared_checksum file names for hex checksum component
+ TestRegex expected("[^_]+_[0-9A-F]{8}_[^_]+.sst");
+ std::vector<FileAttributes> children;
+ const std::string dir = backupdir_ + "/shared_checksum";
+ ASSERT_OK(file_manager_->GetChildrenFileAttributes(dir, &children));
+ for (const auto& child : children) {
+ if (child.size_bytes == 0) {
+ continue;
+ }
+ EXPECT_MATCHES_REGEX(child.name, expected);
+ }
+ }
+ */
+ AssertBackupConsistency(0, 0, keys_iteration);
+ }
+}
+
+inline std::string OptionsPath(std::string ret, int backupID) {
+ ret += "/private/";
+ ret += std::to_string(backupID);
+ ret += "/";
+ return ret;
+}
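+// e.g., OptionsPath("/tempbk", 2) returns "/tempbk/private/2/"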
+
+// Backup the LATEST options file to
+// "<backup_dir>/private/<backup_id>/OPTIONS<number>"
+
+TEST_F(BackupEngineTest, BackupOptions) {
+ OpenDBAndBackupEngine(true);
+ for (int i = 1; i < 5; i++) {
+ std::string name;
+ std::vector<std::string> filenames;
+ // Must reset() before reset(OpenDB()) again.
+ // Calling OpenDB() while *db_ still exists would cause a LOCK issue
+ db_.reset();
+ db_.reset(OpenDB());
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+ ASSERT_OK(ROCKSDB_NAMESPACE::GetLatestOptionsFileName(db_->GetName(),
+ options_.env, &name));
+ ASSERT_OK(file_manager_->FileExists(OptionsPath(backupdir_, i) + name));
+ ASSERT_OK(backup_chroot_env_->GetChildren(OptionsPath(backupdir_, i),
+ &filenames));
+ for (auto fn : filenames) {
+ if (fn.compare(0, 7, "OPTIONS") == 0) {
+ ASSERT_EQ(name, fn);
+ }
+ }
+ }
+
+ CloseDBAndBackupEngine();
+}
+
+TEST_F(BackupEngineTest, SetOptionsBackupRaceCondition) {
+ OpenDBAndBackupEngine(true);
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"CheckpointImpl::CreateCheckpoint:SavedLiveFiles1",
+ "BackupEngineTest::SetOptionsBackupRaceCondition:BeforeSetOptions"},
+ {"BackupEngineTest::SetOptionsBackupRaceCondition:AfterSetOptions",
+ "CheckpointImpl::CreateCheckpoint:SavedLiveFiles2"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+ ROCKSDB_NAMESPACE::port::Thread setoptions_thread{[this]() {
+ TEST_SYNC_POINT(
+ "BackupEngineTest::SetOptionsBackupRaceCondition:BeforeSetOptions");
+ DBImpl* dbi = static_cast<DBImpl*>(db_.get());
+ // Change arbitrary option to trigger OPTIONS file deletion
+ ASSERT_OK(dbi->SetOptions(dbi->DefaultColumnFamily(),
+ {{"paranoid_file_checks", "false"}}));
+ ASSERT_OK(dbi->SetOptions(dbi->DefaultColumnFamily(),
+ {{"paranoid_file_checks", "true"}}));
+ ASSERT_OK(dbi->SetOptions(dbi->DefaultColumnFamily(),
+ {{"paranoid_file_checks", "false"}}));
+ TEST_SYNC_POINT(
+ "BackupEngineTest::SetOptionsBackupRaceCondition:AfterSetOptions");
+ }};
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+ setoptions_thread.join();
+ CloseDBAndBackupEngine();
+}
+
+// This test verifies we don't delete the latest backup when read-only option is
+// set
+TEST_F(BackupEngineTest, NoDeleteWithReadOnly) {
+ const int keys_iteration = 5000;
+ Random rnd(6);
+
+ OpenDBAndBackupEngine(true);
+ // create five backups
+ for (int i = 0; i < 5; ++i) {
+ FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), !!(rnd.Next() % 2)));
+ }
+ CloseDBAndBackupEngine();
+ ASSERT_OK(file_manager_->WriteToFile(latest_backup_, "4"));
+
+ engine_options_->destroy_old_data = false;
+ BackupEngineReadOnly* read_only_backup_engine;
+ ASSERT_OK(BackupEngineReadOnly::Open(
+ backup_chroot_env_.get(), *engine_options_, &read_only_backup_engine));
+
+ // assert that data from backup 5 is still here (even though LATEST_BACKUP
+ // says 4 is latest)
+ ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/5"));
+ ASSERT_OK(file_manager_->FileExists(backupdir_ + "/private/5"));
+
+ // Behavior change: We now ignore LATEST_BACKUP contents. This means that
+ // we should have 5 backups, even if LATEST_BACKUP says 4.
+ std::vector<BackupInfo> backup_info;
+ read_only_backup_engine->GetBackupInfo(&backup_info);
+ ASSERT_EQ(5UL, backup_info.size());
+ delete read_only_backup_engine;
+}
+
+TEST_F(BackupEngineTest, FailOverwritingBackups) {
+ options_.write_buffer_size = 1024 * 1024 * 1024; // 1GB
+ options_.disable_auto_compactions = true;
+
+ // create backups 1, 2, 3, 4, 5
+ OpenDBAndBackupEngine(true);
+ for (int i = 0; i < 5; ++i) {
+ CloseDBAndBackupEngine();
+ DeleteLogFiles();
+ OpenDBAndBackupEngine(false);
+ FillDB(db_.get(), 100 * i, 100 * (i + 1), kFlushAll);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+ }
+ CloseDBAndBackupEngine();
+
+ // restore 3
+ OpenBackupEngine();
+ ASSERT_OK(backup_engine_->RestoreDBFromBackup(3, dbname_, dbname_));
+ CloseBackupEngine();
+
+ OpenDBAndBackupEngine(false);
+ // More data, bigger SST
+ FillDB(db_.get(), 1000, 1300, kFlushAll);
+ Status s = backup_engine_->CreateNewBackup(db_.get());
+ // the new backup fails because new table files
+ // clash with old table files from backups 4 and 5
+ // (since write_buffer_size is huge, we can be sure that
+ // each backup will generate only one sst file and that
+ // a file generated here would have the same name as an
+ // sst file generated by backup 4, and will be bigger)
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_OK(backup_engine_->DeleteBackup(4));
+ ASSERT_OK(backup_engine_->DeleteBackup(5));
+ // now, the backup can succeed
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+ CloseDBAndBackupEngine();
+}
+
+TEST_F(BackupEngineTest, NoShareTableFiles) {
+ const int keys_iteration = 5000;
+ OpenDBAndBackupEngine(true, false, kNoShare);
+ for (int i = 0; i < 5; ++i) {
+ FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), !!(i % 2)));
+ }
+ CloseDBAndBackupEngine();
+
+ for (int i = 0; i < 5; ++i) {
+ AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1),
+ keys_iteration * 6);
+ }
+}
+
+// Verify that you can backup and restore with share_files_with_checksum on
+TEST_F(BackupEngineTest, ShareTableFilesWithChecksums) {
+ const int keys_iteration = 5000;
+ OpenDBAndBackupEngine(true, false, kShareWithChecksum);
+ for (int i = 0; i < 5; ++i) {
+ FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), !!(i % 2)));
+ }
+ CloseDBAndBackupEngine();
+
+ for (int i = 0; i < 5; ++i) {
+ AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1),
+ keys_iteration * 6);
+ }
+}
+
+// Verify that you can backup and restore using share_files_with_checksum set to
+// false and then transition this option to true
+TEST_F(BackupEngineTest, ShareTableFilesWithChecksumsTransition) {
+ const int keys_iteration = 5000;
+ // set share_files_with_checksum to false
+ OpenDBAndBackupEngine(true, false, kShareNoChecksum);
+ for (int i = 0; i < 5; ++i) {
+ FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+ }
+ CloseDBAndBackupEngine();
+
+ for (int i = 0; i < 5; ++i) {
+ AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1),
+ keys_iteration * 6);
+ }
+
+ // set share_files_with_checksum to true and do some more backups
+ OpenDBAndBackupEngine(false /* destroy_old_data */, false,
+ kShareWithChecksum);
+ for (int i = 5; i < 10; ++i) {
+ FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+ }
+ CloseDBAndBackupEngine();
+
+ // Verify first (about to delete)
+ AssertBackupConsistency(1, 0, keys_iteration, keys_iteration * 11);
+
+ // For an extra challenge, make sure that GarbageCollect / DeleteBackup
+ // is OK even if we open without share_table_files
+ OpenDBAndBackupEngine(false /* destroy_old_data */, false, kNoShare);
+ ASSERT_OK(backup_engine_->DeleteBackup(1));
+ ASSERT_OK(backup_engine_->GarbageCollect());
+ CloseDBAndBackupEngine();
+
+ // Verify rest (not deleted)
+ for (int i = 1; i < 10; ++i) {
+ AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1),
+ keys_iteration * 11);
+ }
+}
+
+// Verify backup and restore with various naming options, check names
+TEST_F(BackupEngineTest, ShareTableFilesWithChecksumsNewNaming) {
+ ASSERT_TRUE(engine_options_->share_files_with_checksum_naming ==
+ kNamingDefault);
+
+ const int keys_iteration = 5000;
+
+ OpenDBAndBackupEngine(true, false, kShareWithChecksum);
+ FillDB(db_.get(), 0, keys_iteration);
+ CloseDBAndBackupEngine();
+
+ static const std::map<ShareFilesNaming, TestRegex> option_to_expected = {
+ {kLegacyCrc32cAndFileSize, "[0-9]+_[0-9]+_[0-9]+[.]sst"},
+ // kFlagIncludeFileSize redundant here
+ {kLegacyCrc32cAndFileSize | kFlagIncludeFileSize,
+ "[0-9]+_[0-9]+_[0-9]+[.]sst"},
+ {kUseDbSessionId, "[0-9]+_s[0-9A-Z]{20}[.]sst"},
+ {kUseDbSessionId | kFlagIncludeFileSize,
+ "[0-9]+_s[0-9A-Z]{20}_[0-9]+[.]sst"},
+ };
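+ // In these patterns the leading number is the SST file number. The legacy
+ // scheme then encodes a crc32c checksum and the file size, while the
+ // db-session-id scheme encodes a 20-character session id, optionally
+ // followed by the file size when kFlagIncludeFileSize is set.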
+
+ const TestRegex blobfile_pattern = "[0-9]+_[0-9]+_[0-9]+[.]blob";
+
+ for (const auto& pair : option_to_expected) {
+ CloseAndReopenDB();
+ engine_options_->share_files_with_checksum_naming = pair.first;
+ OpenBackupEngine(true /*destroy_old_data*/);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+ CloseDBAndBackupEngine();
+ AssertBackupConsistency(1, 0, keys_iteration, keys_iteration * 2);
+ AssertDirectoryFilesMatchRegex(backupdir_ + "/shared_checksum", pair.second,
+ ".sst", 1 /* minimum_count */);
+ if (std::string::npos != pair.second.GetPattern().find("_[0-9]+[.]sst")) {
+ AssertDirectoryFilesSizeIndicators(backupdir_ + "/shared_checksum",
+ 1 /* minimum_count */);
+ }
+
+ AssertDirectoryFilesMatchRegex(backupdir_ + "/shared_checksum",
+ blobfile_pattern, ".blob",
+ 1 /* minimum_count */);
+ }
+}
+
+// Mimic SST file generated by pre-6.12 releases and verify that
+// old names are always used regardless of naming option.
+TEST_F(BackupEngineTest, ShareTableFilesWithChecksumsOldFileNaming) {
+ const int keys_iteration = 5000;
+
+ // Pre-6.12 release did not include db id and db session id properties.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "PropertyBlockBuilder::AddTableProperty:Start", [&](void* props_vs) {
+ auto props = static_cast<TableProperties*>(props_vs);
+ props->db_id = "";
+ props->db_session_id = "";
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Corrupting the table properties corrupts the unique id.
+ // Ignore the unique id recorded in the manifest.
+ options_.verify_sst_unique_id_in_manifest = false;
+
+ OpenDBAndBackupEngine(true, false, kShareWithChecksum);
+ FillDB(db_.get(), 0, keys_iteration);
+ CloseDBAndBackupEngine();
+
+ // Old names should always be used on old files
+ const TestRegex sstfile_pattern("[0-9]+_[0-9]+_[0-9]+[.]sst");
+
+ const TestRegex blobfile_pattern = "[0-9]+_[0-9]+_[0-9]+[.]blob";
+
+ for (ShareFilesNaming option : {kNamingDefault, kUseDbSessionId}) {
+ CloseAndReopenDB();
+ engine_options_->share_files_with_checksum_naming = option;
+ OpenBackupEngine(true /*destroy_old_data*/);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+ CloseDBAndBackupEngine();
+ AssertBackupConsistency(1, 0, keys_iteration, keys_iteration * 2);
+ AssertDirectoryFilesMatchRegex(backupdir_ + "/shared_checksum",
+ sstfile_pattern, ".sst",
+ 1 /* minimum_count */);
+ AssertDirectoryFilesMatchRegex(backupdir_ + "/shared_checksum",
+ blobfile_pattern, ".blob",
+ 1 /* minimum_count */);
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// Test how naming options interact with detecting DB corruption
+// between incremental backups
+TEST_F(BackupEngineTest, TableFileCorruptionBeforeIncremental) {
+ const auto share_no_checksum = static_cast<ShareFilesNaming>(0);
+
+ for (bool corrupt_before_first_backup : {false, true}) {
+ for (ShareFilesNaming option :
+ {share_no_checksum, kLegacyCrc32cAndFileSize, kNamingDefault}) {
+ auto share =
+ option == share_no_checksum ? kShareNoChecksum : kShareWithChecksum;
+ if (option != share_no_checksum) {
+ engine_options_->share_files_with_checksum_naming = option;
+ }
+ OpenDBAndBackupEngine(true, false, share);
+ DBImpl* dbi = static_cast<DBImpl*>(db_.get());
+ // A small SST file
+ ASSERT_OK(dbi->Put(WriteOptions(), "x", "y"));
+ ASSERT_OK(dbi->Flush(FlushOptions()));
+ // And a bigger one
+ ASSERT_OK(dbi->Put(WriteOptions(), "y", Random(42).RandomString(500)));
+ ASSERT_OK(dbi->Flush(FlushOptions()));
+ ASSERT_OK(dbi->TEST_WaitForFlushMemTable());
+ CloseAndReopenDB(/*read_only*/ true);
+
+ std::vector<FileAttributes> table_files;
+ ASSERT_OK(GetDataFilesInDB(kTableFile, &table_files));
+ ASSERT_EQ(table_files.size(), 2);
+ std::string tf0 = dbname_ + "/" + table_files[0].name;
+ std::string tf1 = dbname_ + "/" + table_files[1].name;
+
+ CloseDBAndBackupEngine();
+
+ if (corrupt_before_first_backup) {
+ // This corrupts a data block, which does not cause DB open
+ // failure, only failure on accessing the block.
+ ASSERT_OK(db_file_manager_->CorruptFileStart(tf0));
+ }
+
+ OpenDBAndBackupEngine(false, false, share);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+ CloseDBAndBackupEngine();
+
+ // if corrupt_before_first_backup, this undoes the initial corruption
+ ASSERT_OK(db_file_manager_->CorruptFileStart(tf0));
+
+ OpenDBAndBackupEngine(false, false, share);
+ Status s = backup_engine_->CreateNewBackup(db_.get());
+
+ // Even though none of the naming options catch the inconsistency
+ // between the first and second time backing up fname, in the case
+ // of kUseDbSessionId (kNamingDefault), this is an intentional
+ // trade-off to avoid full scan of files from the DB that are
+ // already backed up. If we did the scan, kUseDbSessionId could catch
+ // the corruption. kLegacyCrc32cAndFileSize does the scan (to
+ // compute checksum for name) without catching the corruption,
+ // because the corruption means the names don't merge.
+ EXPECT_OK(s);
+
+ // VerifyBackup doesn't check DB integrity or table file internal
+ // checksums
+ EXPECT_OK(backup_engine_->VerifyBackup(1, true));
+ EXPECT_OK(backup_engine_->VerifyBackup(2, true));
+
+ db_.reset();
+ ASSERT_OK(backup_engine_->RestoreDBFromBackup(2, dbname_, dbname_));
+ {
+ DB* db = OpenDB();
+ s = db->VerifyChecksum();
+ delete db;
+ }
+ if (option != kLegacyCrc32cAndFileSize && !corrupt_before_first_backup) {
+ // Second backup is OK because it used (uncorrupt) file from first
+ // backup instead of (corrupt) file from DB.
+ // This is arguably a good trade-off vs. treating the file as distinct
+ // from the old version, because a file should be more likely to be
+ // corrupt as it ages. Although the backed-up file might also corrupt
+ // with age, the alternative approach (checksum in file name computed
+ // from current DB file contents) wouldn't detect that case at backup
+ // time either. Although you would have both copies of the file with
+ // the alternative approach, that would only last until the older
+ // backup is deleted.
+ ASSERT_OK(s);
+ } else if (option == kLegacyCrc32cAndFileSize &&
+ corrupt_before_first_backup) {
+ // Second backup is OK because it saved the updated (uncorrupt)
+ // file from DB, instead of the sharing with first backup.
+ // Recall: if corrupt_before_first_backup, [second CorruptFileStart]
+ // undoes the initial corruption.
+ // This is arguably a bad trade-off vs. sharing the old version of the
+ // file because a file should be more likely to corrupt as it ages.
+ // (Not likely that the previously backed-up version was already
+ // corrupt and the new version is non-corrupt. This approach doesn't
+ // help if backed-up version is corrupted after taking the backup.)
+ ASSERT_OK(s);
+ } else {
+ // Something is legitimately corrupted, but we can't be sure what
+ // with information available (TODO? unless one passes block checksum
+ // test and other doesn't. Probably better to use end-to-end full file
+ // checksum anyway.)
+ ASSERT_TRUE(s.IsCorruption());
+ }
+
+ CloseDBAndBackupEngine();
+ DestroyDBWithoutCheck(dbname_, options_);
+ }
+ }
+}
+
+// Test how naming options interact with detecting file size corruption
+// between incremental backups
+TEST_F(BackupEngineTest, FileSizeForIncremental) {
+ const auto share_no_checksum = static_cast<ShareFilesNaming>(0);
+ // TODO: enable blob files once Integrated BlobDB supports DB session id.
+ options_.enable_blob_files = false;
+
+ for (ShareFilesNaming option : {share_no_checksum, kLegacyCrc32cAndFileSize,
+ kNamingDefault, kUseDbSessionId}) {
+ auto share =
+ option == share_no_checksum ? kShareNoChecksum : kShareWithChecksum;
+ if (option != share_no_checksum) {
+ engine_options_->share_files_with_checksum_naming = option;
+ }
+ OpenDBAndBackupEngine(true, false, share);
+
+ std::vector<FileAttributes> children;
+ const std::string shared_dir =
+ backupdir_ +
+ (option == share_no_checksum ? "/shared" : "/shared_checksum");
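+    // The shared-files directory to inspect depends on the share option
+    // chosen above: kShareNoChecksum uses <backup_dir>/shared, while the
+    // checksum-naming options use <backup_dir>/shared_checksum.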
+
+ // A single small SST file
+ ASSERT_OK(db_->Put(WriteOptions(), "x", "y"));
+
+    // First, test that we always detect file size corruption on the shared
+    // backup side on an incremental backup. (Since sizes aren't really part
+    // of the backup meta file, this works by querying the filesystem for the
+    // sizes.)
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true /*flush*/));
+ CloseDBAndBackupEngine();
+
+ // Corrupt backup SST file
+ ASSERT_OK(file_manager_->GetChildrenFileAttributes(shared_dir, &children));
+ ASSERT_EQ(children.size(), 1U); // one sst
+ for (const auto& child : children) {
+ if (child.name.size() > 4 && child.size_bytes > 0) {
+ ASSERT_OK(
+ file_manager_->WriteToFile(shared_dir + "/" + child.name, "asdf"));
+ break;
+ }
+ }
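+    // Note: overwriting the backed-up SST above changes its size on disk.
+    // Since the engine compares sizes by querying the filesystem (see the
+    // comment before the first backup), the next incremental backup should
+    // flag the mismatch as corruption regardless of naming option.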
+
+ OpenDBAndBackupEngine(false, false, share);
+ Status s = backup_engine_->CreateNewBackup(db_.get());
+ EXPECT_TRUE(s.IsCorruption());
+
+ ASSERT_OK(backup_engine_->PurgeOldBackups(0));
+ CloseDBAndBackupEngine();
+
+    // Second, test that a hypothetical db session id collision would likely
+    // not suffice to corrupt a backup, because there's a good chance of a
+    // file size difference (guaranteed in this test), so either there is no
+    // name collision or the collision is detected.
+
+ // Create backup 1
+ OpenDBAndBackupEngine(false, false, share);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+
+    // Even though we have "the same" DB state as backup 1, we need
+    // to restore to recreate the same conditions as the later restore.
+ db_.reset();
+ DestroyDBWithoutCheck(dbname_, options_);
+ ASSERT_OK(backup_engine_->RestoreDBFromBackup(1, dbname_, dbname_));
+ CloseDBAndBackupEngine();
+
+ // Forge session id
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::SetDbSessionId", [](void* sid_void_star) {
+ std::string* sid = static_cast<std::string*>(sid_void_star);
+ *sid = "01234567890123456789";
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
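+    // With the session id pinned to a constant, the SST created before
+    // backup 2 and the bigger SST created before backup 3 can end up with
+    // the same file number and session id (thanks to the intervening
+    // restore of backup 1), so their backup names may collide under
+    // session-id-based naming; the checks after backup 3 distinguish
+    // whether that collision is detected or avoided.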
+
+ // Create another SST file
+ OpenDBAndBackupEngine(false, false, share);
+ ASSERT_OK(db_->Put(WriteOptions(), "y", "x"));
+
+ // Create backup 2
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true /*flush*/));
+
+ // Restore backup 1 (again)
+ db_.reset();
+ DestroyDBWithoutCheck(dbname_, options_);
+ ASSERT_OK(backup_engine_->RestoreDBFromBackup(1, dbname_, dbname_));
+ CloseDBAndBackupEngine();
+
+ // Create another SST file with same number and db session id, only bigger
+ OpenDBAndBackupEngine(false, false, share);
+ ASSERT_OK(db_->Put(WriteOptions(), "y", Random(42).RandomString(500)));
+
+    // Count backup SST files.
+ children.clear();
+ ASSERT_OK(file_manager_->GetChildrenFileAttributes(shared_dir, &children));
+ ASSERT_EQ(children.size(), 2U); // two sst files
+
+ // Try create backup 3
+ s = backup_engine_->CreateNewBackup(db_.get(), true /*flush*/);
+
+ // Re-count backup SSTs
+ children.clear();
+ ASSERT_OK(file_manager_->GetChildrenFileAttributes(shared_dir, &children));
+
+ if (option == kUseDbSessionId) {
+ // Acceptable to call it corruption if size is not in name and
+ // db session id collision is practically impossible.
+ EXPECT_TRUE(s.IsCorruption());
+ EXPECT_EQ(children.size(), 2U); // no SST file added
+ } else if (option == share_no_checksum) {
+ // Good to call it corruption if both backups cannot be
+ // accommodated.
+ EXPECT_TRUE(s.IsCorruption());
+ EXPECT_EQ(children.size(), 2U); // no SST file added
+ } else {
+ // Since opening a DB seems sufficient for detecting size corruption
+ // on the DB side, this should be a good thing, ...
+ EXPECT_OK(s);
+ // ... as long as we did actually treat it as a distinct SST file.
+ EXPECT_EQ(children.size(), 3U); // Another SST added
+ }
+ CloseDBAndBackupEngine();
+ DestroyDBWithoutCheck(dbname_, options_);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ }
+}
+
+// Verify backup and restore with share_files_with_checksum off and then
+// transition this option to on and share_files_with_checksum_naming to be
+// based on kUseDbSessionId
+TEST_F(BackupEngineTest, ShareTableFilesWithChecksumsNewNamingTransition) {
+ const int keys_iteration = 5000;
+ // We may set share_files_with_checksum_naming to kLegacyCrc32cAndFileSize
+ // here but even if we don't, it should have no effect when
+ // share_files_with_checksum is false
+ ASSERT_TRUE(engine_options_->share_files_with_checksum_naming ==
+ kNamingDefault);
+ // set share_files_with_checksum to false
+ OpenDBAndBackupEngine(true, false, kShareNoChecksum);
+ int j = 3;
+ for (int i = 0; i < j; ++i) {
+ FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+ }
+ CloseDBAndBackupEngine();
+
+ for (int i = 0; i < j; ++i) {
+ AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1),
+ keys_iteration * (j + 1));
+ }
+
+  // Set share_files_with_checksum to true, do some more backups,
+  // and use the session id in the backup SST file names
+ ASSERT_TRUE(engine_options_->share_files_with_checksum_naming ==
+ kNamingDefault);
+ OpenDBAndBackupEngine(false /* destroy_old_data */, false,
+ kShareWithChecksum);
+ FillDB(db_.get(), keys_iteration * j, keys_iteration * (j + 1));
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+ CloseDBAndBackupEngine();
+ // Use checksum in the name as well
+ ++j;
+ options_.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+ OpenDBAndBackupEngine(false /* destroy_old_data */, false,
+ kShareWithChecksum);
+ FillDB(db_.get(), keys_iteration * j, keys_iteration * (j + 1));
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+ CloseDBAndBackupEngine();
+
+ // Verify first (about to delete)
+ AssertBackupConsistency(1, 0, keys_iteration, keys_iteration * (j + 1));
+
+ // For an extra challenge, make sure that GarbageCollect / DeleteBackup
+ // is OK even if we open without share_table_files but with
+ // share_files_with_checksum_naming based on kUseDbSessionId
+ ASSERT_TRUE(engine_options_->share_files_with_checksum_naming ==
+ kNamingDefault);
+ OpenDBAndBackupEngine(false /* destroy_old_data */, false, kNoShare);
+ ASSERT_OK(backup_engine_->DeleteBackup(1));
+ ASSERT_OK(backup_engine_->GarbageCollect());
+ CloseDBAndBackupEngine();
+
+ // Verify second (about to delete)
+ AssertBackupConsistency(2, 0, keys_iteration * 2, keys_iteration * (j + 1));
+
+ // Use checksum and file size for backup table file names and open without
+ // share_table_files
+ // Again, make sure that GarbageCollect / DeleteBackup is OK
+ engine_options_->share_files_with_checksum_naming = kLegacyCrc32cAndFileSize;
+ OpenDBAndBackupEngine(false /* destroy_old_data */, false, kNoShare);
+ ASSERT_OK(backup_engine_->DeleteBackup(2));
+ ASSERT_OK(backup_engine_->GarbageCollect());
+ CloseDBAndBackupEngine();
+
+ // Verify rest (not deleted)
+ for (int i = 2; i < j; ++i) {
+ AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1),
+ keys_iteration * (j + 1));
+ }
+}
+
+// Verify backup and restore with share_files_with_checksum on and transition
+// from kLegacyCrc32cAndFileSize to kUseDbSessionId
+TEST_F(BackupEngineTest, ShareTableFilesWithChecksumsNewNamingUpgrade) {
+ engine_options_->share_files_with_checksum_naming = kLegacyCrc32cAndFileSize;
+ const int keys_iteration = 5000;
+ // set share_files_with_checksum to true
+ OpenDBAndBackupEngine(true, false, kShareWithChecksum);
+ int j = 3;
+ for (int i = 0; i < j; ++i) {
+ FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+ }
+ CloseDBAndBackupEngine();
+
+ for (int i = 0; i < j; ++i) {
+ AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1),
+ keys_iteration * (j + 1));
+ }
+
+ engine_options_->share_files_with_checksum_naming = kUseDbSessionId;
+ OpenDBAndBackupEngine(false /* destroy_old_data */, false,
+ kShareWithChecksum);
+ FillDB(db_.get(), keys_iteration * j, keys_iteration * (j + 1));
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+ CloseDBAndBackupEngine();
+
+ ++j;
+ options_.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+ OpenDBAndBackupEngine(false /* destroy_old_data */, false,
+ kShareWithChecksum);
+ FillDB(db_.get(), keys_iteration * j, keys_iteration * (j + 1));
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+ CloseDBAndBackupEngine();
+
+ // Verify first (about to delete)
+ AssertBackupConsistency(1, 0, keys_iteration, keys_iteration * (j + 1));
+
+ // For an extra challenge, make sure that GarbageCollect / DeleteBackup
+ // is OK even if we open without share_table_files
+ OpenDBAndBackupEngine(false /* destroy_old_data */, false, kNoShare);
+ ASSERT_OK(backup_engine_->DeleteBackup(1));
+ ASSERT_OK(backup_engine_->GarbageCollect());
+ CloseDBAndBackupEngine();
+
+ // Verify second (about to delete)
+ AssertBackupConsistency(2, 0, keys_iteration * 2, keys_iteration * (j + 1));
+
+ // Use checksum and file size for backup table file names and open without
+ // share_table_files
+ // Again, make sure that GarbageCollect / DeleteBackup is OK
+ engine_options_->share_files_with_checksum_naming = kLegacyCrc32cAndFileSize;
+ OpenDBAndBackupEngine(false /* destroy_old_data */, false, kNoShare);
+ ASSERT_OK(backup_engine_->DeleteBackup(2));
+ ASSERT_OK(backup_engine_->GarbageCollect());
+ CloseDBAndBackupEngine();
+
+ // Verify rest (not deleted)
+ for (int i = 2; i < j; ++i) {
+ AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1),
+ keys_iteration * (j + 1));
+ }
+}
+
+// This test simulates cleaning up after aborted or incomplete creation
+// of a new backup.
+TEST_F(BackupEngineTest, DeleteTmpFiles) {
+ for (int cleanup_fn : {1, 2, 3, 4}) {
+ for (ShareOption shared_option : kAllShareOptions) {
+ OpenDBAndBackupEngine(false /* destroy_old_data */, false /* dummy */,
+ shared_option);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+ BackupID next_id = 1;
+ BackupID oldest_id = std::numeric_limits<BackupID>::max();
+ {
+ std::vector<BackupInfo> backup_info;
+ backup_engine_->GetBackupInfo(&backup_info);
+ for (const auto& bi : backup_info) {
+ next_id = std::max(next_id, bi.backup_id + 1);
+ oldest_id = std::min(oldest_id, bi.backup_id);
+ }
+ }
+ CloseDBAndBackupEngine();
+
+      // An aborted or incomplete new backup will always use the next
+      // id (and possibly later ids)
+ std::string next_private = "private/" + std::to_string(next_id);
+
+ // NOTE: both shared and shared_checksum should be cleaned up
+ // regardless of how the backup engine is opened.
+ std::vector<std::string> tmp_files_and_dirs;
+ for (const auto& dir_and_file : {
+ std::make_pair(std::string("shared"),
+ std::string(".00006.sst.tmp")),
+ std::make_pair(std::string("shared_checksum"),
+ std::string(".00007.sst.tmp")),
+ std::make_pair(next_private, std::string("00003.sst")),
+ }) {
+ std::string dir = backupdir_ + "/" + dir_and_file.first;
+ ASSERT_OK(file_manager_->CreateDirIfMissing(dir));
+ ASSERT_OK(file_manager_->FileExists(dir));
+
+ std::string file = dir + "/" + dir_and_file.second;
+ ASSERT_OK(file_manager_->WriteToFile(file, "tmp"));
+ ASSERT_OK(file_manager_->FileExists(file));
+
+ tmp_files_and_dirs.push_back(file);
+ }
+ if (cleanup_fn != /*CreateNewBackup*/ 4) {
+ // This exists after CreateNewBackup because it's deleted then
+ // re-created.
+ tmp_files_and_dirs.push_back(backupdir_ + "/" + next_private);
+ }
+
+ OpenDBAndBackupEngine(false /* destroy_old_data */, false /* dummy */,
+ shared_option);
+ // Need to call one of these explicitly to delete tmp files
+ switch (cleanup_fn) {
+ case 1:
+ ASSERT_OK(backup_engine_->GarbageCollect());
+ break;
+ case 2:
+ ASSERT_OK(backup_engine_->DeleteBackup(oldest_id));
+ break;
+ case 3:
+ ASSERT_OK(backup_engine_->PurgeOldBackups(1));
+ break;
+ case 4:
+ // Does a garbage collect if it sees that next private dir exists
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+ break;
+ default:
+ assert(false);
+ }
+ CloseDBAndBackupEngine();
+ for (std::string file_or_dir : tmp_files_and_dirs) {
+ if (file_manager_->FileExists(file_or_dir) != Status::NotFound()) {
+          FAIL() << file_or_dir
+                 << " was expected to be deleted. cleanup_fn: " << cleanup_fn;
+ }
+ }
+ }
+ }
+}
+
+TEST_F(BackupEngineTest, KeepLogFiles) {
+ engine_options_->backup_log_files = false;
+ // basically infinite
+ options_.WAL_ttl_seconds = 24 * 60 * 60;
+ OpenDBAndBackupEngine(true);
+ FillDB(db_.get(), 0, 100, kFlushAll);
+ FillDB(db_.get(), 100, 200, kFlushAll);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), false));
+ FillDB(db_.get(), 200, 300, kFlushAll);
+ FillDB(db_.get(), 300, 400, kFlushAll);
+ FillDB(db_.get(), 400, 500, kFlushAll);
+ CloseDBAndBackupEngine();
+
+ // all data should be there if we call with keep_log_files = true
+ AssertBackupConsistency(0, 0, 500, 600, true);
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+class BackupEngineRateLimitingTestWithParam
+ : public BackupEngineTest,
+ public testing::WithParamInterface<
+ std::tuple<bool /* make throttle */,
+ int /* 0 = single threaded, 1 = multi threaded*/,
+ std::pair<uint64_t, uint64_t> /* limits */>> {
+ public:
+ BackupEngineRateLimitingTestWithParam() {}
+};
+
+uint64_t const MB = 1024 * 1024;
+
+INSTANTIATE_TEST_CASE_P(
+ RateLimiting, BackupEngineRateLimitingTestWithParam,
+ ::testing::Values(std::make_tuple(false, 0, std::make_pair(1 * MB, 5 * MB)),
+ std::make_tuple(false, 0, std::make_pair(2 * MB, 3 * MB)),
+ std::make_tuple(false, 1, std::make_pair(1 * MB, 5 * MB)),
+ std::make_tuple(false, 1, std::make_pair(2 * MB, 3 * MB)),
+ std::make_tuple(true, 0, std::make_pair(1 * MB, 5 * MB)),
+ std::make_tuple(true, 0, std::make_pair(2 * MB, 3 * MB)),
+ std::make_tuple(true, 1, std::make_pair(1 * MB, 5 * MB)),
+ std::make_tuple(true, 1,
+ std::make_pair(2 * MB, 3 * MB))));
+
+TEST_P(BackupEngineRateLimitingTestWithParam, RateLimiting) {
+ size_t const kMicrosPerSec = 1000 * 1000LL;
+ const bool custom_rate_limiter = std::get<0>(GetParam());
+ // iter 0 -- single threaded
+ // iter 1 -- multi threaded
+ const int iter = std::get<1>(GetParam());
+ const std::pair<uint64_t, uint64_t> limit = std::get<2>(GetParam());
+ std::unique_ptr<Env> special_env(
+ new SpecialEnv(db_chroot_env_.get(), /*time_elapse_only_sleep*/ true));
+ // destroy old data
+ Options options;
+ options.env = special_env.get();
+ DestroyDBWithoutCheck(dbname_, options);
+
+ if (custom_rate_limiter) {
+ std::shared_ptr<RateLimiter> backup_rate_limiter =
+ std::make_shared<GenericRateLimiter>(
+ limit.first, 100 * 1000 /* refill_period_us */, 10 /* fairness */,
+ RateLimiter::Mode::kWritesOnly /* mode */,
+ special_env->GetSystemClock(), false /* auto_tuned */);
+ std::shared_ptr<RateLimiter> restore_rate_limiter =
+ std::make_shared<GenericRateLimiter>(
+ limit.second, 100 * 1000 /* refill_period_us */, 10 /* fairness */,
+ RateLimiter::Mode::kWritesOnly /* mode */,
+ special_env->GetSystemClock(), false /* auto_tuned */);
+ engine_options_->backup_rate_limiter = backup_rate_limiter;
+ engine_options_->restore_rate_limiter = restore_rate_limiter;
+ } else {
+ engine_options_->backup_rate_limit = limit.first;
+ engine_options_->restore_rate_limit = limit.second;
+ }
+
+ engine_options_->max_background_operations = (iter == 0) ? 1 : 10;
+ options_.compression = kNoCompression;
+
+  // The rate limiter uses `CondVar::TimedWait()`, which does not have access
+  // to the `Env` to advance its time according to the fake wait duration. The
+  // workaround is to install a callback that advances the `Env`'s mock time.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "GenericRateLimiter::Request:PostTimedWait", [&](void* arg) {
+ int64_t time_waited_us = *static_cast<int64_t*>(arg);
+ special_env->SleepForMicroseconds(static_cast<int>(time_waited_us));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ OpenDBAndBackupEngine(true);
+ TEST_SetDefaultRateLimitersClock(backup_engine_.get(),
+ special_env->GetSystemClock());
+
+ size_t bytes_written = FillDB(db_.get(), 0, 10000);
+
+ auto start_backup = special_env->NowMicros();
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), false));
+ auto backup_time = special_env->NowMicros() - start_backup;
+ CloseDBAndBackupEngine();
+ auto rate_limited_backup_time = (bytes_written * kMicrosPerSec) / limit.first;
+ ASSERT_GT(backup_time, 0.8 * rate_limited_backup_time);
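+  // Sanity of this bound: at limit.first bytes/sec, copying bytes_written
+  // bytes takes at least bytes_written / limit.first seconds, i.e.
+  // (bytes_written * kMicrosPerSec) / limit.first microseconds, which is
+  // rate_limited_backup_time above; the 0.8 factor leaves slack for timing
+  // imprecision in the mock clock.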
+
+ OpenBackupEngine();
+ TEST_SetDefaultRateLimitersClock(
+ backup_engine_.get(),
+ special_env->GetSystemClock() /* backup_rate_limiter_clock */,
+ special_env->GetSystemClock() /* restore_rate_limiter_clock */);
+
+ auto start_restore = special_env->NowMicros();
+ ASSERT_OK(backup_engine_->RestoreDBFromLatestBackup(dbname_, dbname_));
+ auto restore_time = special_env->NowMicros() - start_restore;
+ CloseBackupEngine();
+ auto rate_limited_restore_time =
+ (bytes_written * kMicrosPerSec) / limit.second;
+ ASSERT_GT(restore_time, 0.8 * rate_limited_restore_time);
+
+ AssertBackupConsistency(0, 0, 10000, 10100);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "GenericRateLimiter::Request:PostTimedWait");
+}
+
+TEST_P(BackupEngineRateLimitingTestWithParam, RateLimitingVerifyBackup) {
+ const std::size_t kMicrosPerSec = 1000 * 1000LL;
+ const bool custom_rate_limiter = std::get<0>(GetParam());
+ const std::uint64_t backup_rate_limiter_limit = std::get<2>(GetParam()).first;
+  const bool is_single_threaded = std::get<1>(GetParam()) == 0;
+ std::unique_ptr<Env> special_env(
+ new SpecialEnv(db_chroot_env_.get(), /*time_elapse_only_sleep*/ true));
+
+ if (custom_rate_limiter) {
+ std::shared_ptr<RateLimiter> backup_rate_limiter =
+ std::make_shared<GenericRateLimiter>(
+ backup_rate_limiter_limit, 100 * 1000 /* refill_period_us */,
+ 10 /* fairness */, RateLimiter::Mode::kAllIo /* mode */,
+ special_env->GetSystemClock(), false /* auto_tuned */);
+ engine_options_->backup_rate_limiter = backup_rate_limiter;
+ } else {
+ engine_options_->backup_rate_limit = backup_rate_limiter_limit;
+ }
+
+ engine_options_->max_background_operations = is_single_threaded ? 1 : 10;
+
+ Options options;
+ options.env = special_env.get();
+ DestroyDBWithoutCheck(dbname_, options);
+  // The rate limiter uses `CondVar::TimedWait()`, which does not have access
+  // to the `Env` to advance its time according to the fake wait duration. The
+  // workaround is to install a callback that advances the `Env`'s mock time.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "GenericRateLimiter::Request:PostTimedWait", [&](void* arg) {
+ int64_t time_waited_us = *static_cast<int64_t*>(arg);
+ special_env->SleepForMicroseconds(static_cast<int>(time_waited_us));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ OpenDBAndBackupEngine(true /* destroy_old_data */);
+ TEST_SetDefaultRateLimitersClock(backup_engine_.get(),
+ special_env->GetSystemClock(), nullptr);
+ FillDB(db_.get(), 0, 10000);
+
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(),
+ false /* flush_before_backup */));
+
+ std::vector<BackupInfo> backup_infos;
+ BackupInfo backup_info;
+ backup_engine_->GetBackupInfo(&backup_infos);
+ ASSERT_EQ(1, backup_infos.size());
+ const int backup_id = 1;
+ ASSERT_EQ(backup_id, backup_infos[0].backup_id);
+ ASSERT_OK(backup_engine_->GetBackupInfo(backup_id, &backup_info,
+ true /* include_file_details */));
+
+ std::uint64_t bytes_read_during_verify_backup = 0;
+ for (BackupFileInfo backup_file_info : backup_info.file_details) {
+ bytes_read_during_verify_backup += backup_file_info.size;
+ }
+ auto start_verify_backup = special_env->NowMicros();
+ ASSERT_OK(
+ backup_engine_->VerifyBackup(backup_id, true /* verify_with_checksum */));
+ auto verify_backup_time = special_env->NowMicros() - start_verify_backup;
+ auto rate_limited_verify_backup_time =
+ (bytes_read_during_verify_backup * kMicrosPerSec) /
+ backup_rate_limiter_limit;
+ if (custom_rate_limiter) {
+ EXPECT_GE(verify_backup_time, 0.8 * rate_limited_verify_backup_time);
+ }
+
+ CloseDBAndBackupEngine();
+ AssertBackupConsistency(backup_id, 0, 10000, 10010);
+ DestroyDBWithoutCheck(dbname_, options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "GenericRateLimiter::Request:PostTimedWait");
+}
+
+TEST_P(BackupEngineRateLimitingTestWithParam, RateLimitingChargeReadInBackup) {
+  bool is_single_threaded = std::get<1>(GetParam()) == 0;
+ engine_options_->max_background_operations = is_single_threaded ? 1 : 10;
+
+ const std::uint64_t backup_rate_limiter_limit = std::get<2>(GetParam()).first;
+ std::shared_ptr<RateLimiter> backup_rate_limiter(NewGenericRateLimiter(
+ backup_rate_limiter_limit, 100 * 1000 /* refill_period_us */,
+ 10 /* fairness */, RateLimiter::Mode::kWritesOnly /* mode */));
+ engine_options_->backup_rate_limiter = backup_rate_limiter;
+
+ DestroyDBWithoutCheck(dbname_, Options());
+ OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */,
+ kShareWithChecksum /* shared_option */);
+ FillDB(db_.get(), 0, 10);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(),
+ false /* flush_before_backup */));
+ std::int64_t total_bytes_through_with_no_read_charged =
+ backup_rate_limiter->GetTotalBytesThrough();
+ CloseBackupEngine();
+
+ backup_rate_limiter.reset(NewGenericRateLimiter(
+ backup_rate_limiter_limit, 100 * 1000 /* refill_period_us */,
+ 10 /* fairness */, RateLimiter::Mode::kAllIo /* mode */));
+ engine_options_->backup_rate_limiter = backup_rate_limiter;
+
+ OpenBackupEngine(true);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(),
+ false /* flush_before_backup */));
+ std::int64_t total_bytes_through_with_read_charged =
+ backup_rate_limiter->GetTotalBytesThrough();
+ EXPECT_GT(total_bytes_through_with_read_charged,
+ total_bytes_through_with_no_read_charged);
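+  // (The second backup charges reads as well as writes (kAllIo), so its
+  // total bytes through should exceed the writes-only total recorded for
+  // the first backup above.)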
+ CloseDBAndBackupEngine();
+ AssertBackupConsistency(1, 0, 10, 20);
+ DestroyDBWithoutCheck(dbname_, Options());
+}
+
+TEST_P(BackupEngineRateLimitingTestWithParam, RateLimitingChargeReadInRestore) {
+  bool is_single_threaded = std::get<1>(GetParam()) == 0;
+ engine_options_->max_background_operations = is_single_threaded ? 1 : 10;
+
+ const std::uint64_t restore_rate_limiter_limit =
+ std::get<2>(GetParam()).second;
+ std::shared_ptr<RateLimiter> restore_rate_limiter(NewGenericRateLimiter(
+ restore_rate_limiter_limit, 100 * 1000 /* refill_period_us */,
+ 10 /* fairness */, RateLimiter::Mode::kWritesOnly /* mode */));
+ engine_options_->restore_rate_limiter = restore_rate_limiter;
+
+ DestroyDBWithoutCheck(dbname_, Options());
+ OpenDBAndBackupEngine(true /* destroy_old_data */);
+ FillDB(db_.get(), 0, 10);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(),
+ false /* flush_before_backup */));
+ CloseDBAndBackupEngine();
+ DestroyDBWithoutCheck(dbname_, Options());
+
+ OpenBackupEngine(false /* destroy_old_data */);
+ ASSERT_OK(backup_engine_->RestoreDBFromLatestBackup(dbname_, dbname_));
+ std::int64_t total_bytes_through_with_no_read_charged =
+ restore_rate_limiter->GetTotalBytesThrough();
+ CloseBackupEngine();
+ DestroyDBWithoutCheck(dbname_, Options());
+
+ restore_rate_limiter.reset(NewGenericRateLimiter(
+ restore_rate_limiter_limit, 100 * 1000 /* refill_period_us */,
+ 10 /* fairness */, RateLimiter::Mode::kAllIo /* mode */));
+ engine_options_->restore_rate_limiter = restore_rate_limiter;
+
+ OpenBackupEngine(false /* destroy_old_data */);
+ ASSERT_OK(backup_engine_->RestoreDBFromLatestBackup(dbname_, dbname_));
+ std::int64_t total_bytes_through_with_read_charged =
+ restore_rate_limiter->GetTotalBytesThrough();
+ EXPECT_EQ(total_bytes_through_with_read_charged,
+ total_bytes_through_with_no_read_charged * 2);
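+  // Restore copies each backed-up byte once: read from the backup, then
+  // written into the DB directory. Charging reads as well (kAllIo) therefore
+  // doubles GetTotalBytesThrough() relative to kWritesOnly, which is what
+  // the exact factor of 2 above checks.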
+ CloseBackupEngine();
+ AssertBackupConsistency(1, 0, 10, 20);
+ DestroyDBWithoutCheck(dbname_, Options());
+}
+
+TEST_P(BackupEngineRateLimitingTestWithParam,
+ RateLimitingChargeReadInInitialize) {
+  bool is_single_threaded = std::get<1>(GetParam()) == 0;
+ engine_options_->max_background_operations = is_single_threaded ? 1 : 10;
+
+ const std::uint64_t backup_rate_limiter_limit = std::get<2>(GetParam()).first;
+ std::shared_ptr<RateLimiter> backup_rate_limiter(NewGenericRateLimiter(
+ backup_rate_limiter_limit, 100 * 1000 /* refill_period_us */,
+ 10 /* fairness */, RateLimiter::Mode::kAllIo /* mode */));
+ engine_options_->backup_rate_limiter = backup_rate_limiter;
+
+ DestroyDBWithoutCheck(dbname_, Options());
+ OpenDBAndBackupEngine(true /* destroy_old_data */);
+ FillDB(db_.get(), 0, 10);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(),
+ false /* flush_before_backup */));
+ CloseDBAndBackupEngine();
+ AssertBackupConsistency(1, 0, 10, 20);
+
+ std::int64_t total_bytes_through_before_initialize =
+ engine_options_->backup_rate_limiter->GetTotalBytesThrough();
+ OpenDBAndBackupEngine(false /* destroy_old_data */);
+ // We charge read in BackupEngineImpl::BackupMeta::LoadFromFile,
+ // which is called in BackupEngineImpl::Initialize() during
+ // OpenBackupEngine(false)
+ EXPECT_GT(engine_options_->backup_rate_limiter->GetTotalBytesThrough(),
+ total_bytes_through_before_initialize);
+ CloseDBAndBackupEngine();
+ DestroyDBWithoutCheck(dbname_, Options());
+}
+
+class BackupEngineRateLimitingTestWithParam2
+ : public BackupEngineTest,
+ public testing::WithParamInterface<
+ std::tuple<std::pair<uint64_t, uint64_t> /* limits */>> {
+ public:
+ BackupEngineRateLimitingTestWithParam2() {}
+};
+
+INSTANTIATE_TEST_CASE_P(
+ LowRefillBytesPerPeriod, BackupEngineRateLimitingTestWithParam2,
+ ::testing::Values(std::make_tuple(std::make_pair(1, 1))));
+// Verify that BackupEngine never requests more bytes than
+// refill_bytes_per_period_ in any single RateLimiter::Request() call, by
+// checking that we don't trigger the assertion failure on over-sized
+// requests in GenericRateLimiter in debug builds.
+TEST_P(BackupEngineRateLimitingTestWithParam2,
+ RateLimitingWithLowRefillBytesPerPeriod) {
+ SpecialEnv special_env(Env::Default(), /*time_elapse_only_sleep*/ true);
+
+ engine_options_->max_background_operations = 1;
+ const uint64_t backup_rate_limiter_limit = std::get<0>(GetParam()).first;
+ std::shared_ptr<RateLimiter> backup_rate_limiter(
+ std::make_shared<GenericRateLimiter>(
+ backup_rate_limiter_limit, 1000 * 1000 /* refill_period_us */,
+ 10 /* fairness */, RateLimiter::Mode::kAllIo /* mode */,
+ special_env.GetSystemClock(), false /* auto_tuned */));
+
+ engine_options_->backup_rate_limiter = backup_rate_limiter;
+
+ const uint64_t restore_rate_limiter_limit = std::get<0>(GetParam()).second;
+ std::shared_ptr<RateLimiter> restore_rate_limiter(
+ std::make_shared<GenericRateLimiter>(
+ restore_rate_limiter_limit, 1000 * 1000 /* refill_period_us */,
+ 10 /* fairness */, RateLimiter::Mode::kAllIo /* mode */,
+ special_env.GetSystemClock(), false /* auto_tuned */));
+
+ engine_options_->restore_rate_limiter = restore_rate_limiter;
+
+  // The rate limiter uses `CondVar::TimedWait()`, which does not have access
+  // to the `Env` to advance its time according to the fake wait duration. The
+  // workaround is to install a callback that advances the `Env`'s mock time.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "GenericRateLimiter::Request:PostTimedWait", [&](void* arg) {
+ int64_t time_waited_us = *static_cast<int64_t*>(arg);
+ special_env.SleepForMicroseconds(static_cast<int>(time_waited_us));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ DestroyDBWithoutCheck(dbname_, Options());
+ OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */,
+ kShareWithChecksum /* shared_option */);
+
+ FillDB(db_.get(), 0, 100);
+ int64_t total_bytes_through_before_backup =
+ engine_options_->backup_rate_limiter->GetTotalBytesThrough();
+ EXPECT_OK(backup_engine_->CreateNewBackup(db_.get(),
+ false /* flush_before_backup */));
+ int64_t total_bytes_through_after_backup =
+ engine_options_->backup_rate_limiter->GetTotalBytesThrough();
+ ASSERT_GT(total_bytes_through_after_backup,
+ total_bytes_through_before_backup);
+
+ std::vector<BackupInfo> backup_infos;
+ BackupInfo backup_info;
+ backup_engine_->GetBackupInfo(&backup_infos);
+ ASSERT_EQ(1, backup_infos.size());
+ const int backup_id = 1;
+ ASSERT_EQ(backup_id, backup_infos[0].backup_id);
+ ASSERT_OK(backup_engine_->GetBackupInfo(backup_id, &backup_info,
+ true /* include_file_details */));
+ int64_t total_bytes_through_before_verify_backup =
+ engine_options_->backup_rate_limiter->GetTotalBytesThrough();
+ EXPECT_OK(
+ backup_engine_->VerifyBackup(backup_id, true /* verify_with_checksum */));
+ int64_t total_bytes_through_after_verify_backup =
+ engine_options_->backup_rate_limiter->GetTotalBytesThrough();
+ ASSERT_GT(total_bytes_through_after_verify_backup,
+ total_bytes_through_before_verify_backup);
+
+ CloseDBAndBackupEngine();
+ AssertBackupConsistency(backup_id, 0, 100, 101);
+
+ int64_t total_bytes_through_before_initialize =
+ engine_options_->backup_rate_limiter->GetTotalBytesThrough();
+ OpenDBAndBackupEngine(false /* destroy_old_data */);
+ // We charge read in BackupEngineImpl::BackupMeta::LoadFromFile,
+ // which is called in BackupEngineImpl::Initialize() during
+ // OpenBackupEngine(false)
+ int64_t total_bytes_through_after_initialize =
+ engine_options_->backup_rate_limiter->GetTotalBytesThrough();
+ ASSERT_GT(total_bytes_through_after_initialize,
+ total_bytes_through_before_initialize);
+ CloseDBAndBackupEngine();
+
+ DestroyDBWithoutCheck(dbname_, Options());
+ OpenBackupEngine(false /* destroy_old_data */);
+ int64_t total_bytes_through_before_restore =
+ engine_options_->restore_rate_limiter->GetTotalBytesThrough();
+ EXPECT_OK(backup_engine_->RestoreDBFromLatestBackup(dbname_, dbname_));
+ int64_t total_bytes_through_after_restore =
+ engine_options_->restore_rate_limiter->GetTotalBytesThrough();
+ ASSERT_GT(total_bytes_through_after_restore,
+ total_bytes_through_before_restore);
+ CloseBackupEngine();
+
+ DestroyDBWithoutCheck(dbname_, Options());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "GenericRateLimiter::Request:PostTimedWait");
+}
+
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_F(BackupEngineTest, ReadOnlyBackupEngine) {
+ DestroyDBWithoutCheck(dbname_, options_);
+ OpenDBAndBackupEngine(true);
+ FillDB(db_.get(), 0, 100);
+ // Also test read-only DB with CreateNewBackup and flush=true (no flush)
+ CloseAndReopenDB(/*read_only*/ true);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), /*flush*/ true));
+ CloseAndReopenDB(/*read_only*/ false);
+ FillDB(db_.get(), 100, 200);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), /*flush*/ true));
+ CloseDBAndBackupEngine();
+ DestroyDBWithoutCheck(dbname_, options_);
+
+ engine_options_->destroy_old_data = false;
+ test_backup_fs_->ClearWrittenFiles();
+ test_backup_fs_->SetLimitDeleteFiles(0);
+ BackupEngineReadOnly* read_only_backup_engine;
+ ASSERT_OK(BackupEngineReadOnly::Open(db_chroot_env_.get(), *engine_options_,
+ &read_only_backup_engine));
+ std::vector<BackupInfo> backup_info;
+ read_only_backup_engine->GetBackupInfo(&backup_info);
+ ASSERT_EQ(backup_info.size(), 2U);
+
+ RestoreOptions restore_options(false);
+ ASSERT_OK(read_only_backup_engine->RestoreDBFromLatestBackup(
+ dbname_, dbname_, restore_options));
+ delete read_only_backup_engine;
+ std::vector<std::string> should_have_written;
+ test_backup_fs_->AssertWrittenFiles(should_have_written);
+
+ DB* db = OpenDB();
+ AssertExists(db, 0, 200);
+ delete db;
+}
+
+TEST_F(BackupEngineTest, OpenBackupAsReadOnlyDB) {
+ DestroyDBWithoutCheck(dbname_, options_);
+ options_.write_dbid_to_manifest = false;
+
+ OpenDBAndBackupEngine(true);
+ FillDB(db_.get(), 0, 100);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), /*flush*/ false));
+
+ options_.write_dbid_to_manifest = true; // exercises some read-only DB code
+ CloseAndReopenDB();
+
+ FillDB(db_.get(), 100, 200);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), /*flush*/ false));
+ db_.reset(); // CloseDB
+ DestroyDBWithoutCheck(dbname_, options_);
+ BackupInfo backup_info;
+ // First, check that we get empty fields without include_file_details
+ ASSERT_OK(backup_engine_->GetBackupInfo(/*id*/ 1U, &backup_info,
+ /*with file details*/ false));
+ ASSERT_EQ(backup_info.name_for_open, "");
+ ASSERT_FALSE(backup_info.env_for_open);
+
+ // Now for the real test
+ backup_info = BackupInfo();
+ ASSERT_OK(backup_engine_->GetBackupInfo(/*id*/ 1U, &backup_info,
+ /*with file details*/ true));
+
+ // Caution: DBOptions only holds a raw pointer to Env, so something else
+ // must keep it alive.
+ // Case 1: Keeping BackupEngine open suffices to keep Env alive
+ DB* db = nullptr;
+ Options opts = options_;
+ // Ensure some key defaults are set
+ opts.wal_dir = "";
+ opts.create_if_missing = false;
+ opts.info_log.reset();
+
+ opts.env = backup_info.env_for_open.get();
+ std::string name = backup_info.name_for_open;
+ backup_info = BackupInfo();
+ ASSERT_OK(DB::OpenForReadOnly(opts, name, &db));
+
+ AssertExists(db, 0, 100);
+ AssertEmpty(db, 100, 200);
+
+ delete db;
+ db = nullptr;
+
+ // Case 2: Keeping BackupInfo alive rather than BackupEngine also suffices
+ ASSERT_OK(backup_engine_->GetBackupInfo(/*id*/ 2U, &backup_info,
+ /*with file details*/ true));
+ CloseBackupEngine();
+ opts.create_if_missing = true; // check also OK (though pointless)
+ opts.env = backup_info.env_for_open.get();
+ name = backup_info.name_for_open;
+ // Note: keeping backup_info alive
+ ASSERT_OK(DB::OpenForReadOnly(opts, name, &db));
+
+ AssertExists(db, 0, 200);
+ delete db;
+ db = nullptr;
+
+ // Now try opening read-write and make sure it fails, for safety.
+ ASSERT_TRUE(DB::Open(opts, name, &db).IsIOError());
+}
+
+TEST_F(BackupEngineTest, ProgressCallbackDuringBackup) {
+ DestroyDBWithoutCheck(dbname_, options_);
+ // Too big for this small DB
+ engine_options_->callback_trigger_interval_size = 100000;
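+  // (callback_trigger_interval_size is roughly the number of bytes copied
+  // between progress-callback invocations, so an interval larger than the
+  // whole backup means the callback should never fire.)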
+ OpenDBAndBackupEngine(true);
+ FillDB(db_.get(), 0, 100);
+ bool is_callback_invoked = false;
+ ASSERT_OK(backup_engine_->CreateNewBackup(
+ db_.get(), true,
+ [&is_callback_invoked]() { is_callback_invoked = true; }));
+ ASSERT_FALSE(is_callback_invoked);
+ CloseBackupEngine();
+
+ // Easily small enough for this small DB
+ engine_options_->callback_trigger_interval_size = 1000;
+ OpenBackupEngine();
+ ASSERT_OK(backup_engine_->CreateNewBackup(
+ db_.get(), true,
+ [&is_callback_invoked]() { is_callback_invoked = true; }));
+ ASSERT_TRUE(is_callback_invoked);
+ CloseDBAndBackupEngine();
+ DestroyDBWithoutCheck(dbname_, options_);
+}
+
+TEST_F(BackupEngineTest, GarbageCollectionBeforeBackup) {
+ DestroyDBWithoutCheck(dbname_, options_);
+ OpenDBAndBackupEngine(true);
+
+ ASSERT_OK(backup_chroot_env_->CreateDirIfMissing(backupdir_ + "/shared"));
+ std::string file_five = backupdir_ + "/shared/000009.sst";
+ std::string file_five_contents = "I'm not really a sst file";
+  // This depends on the fact that 000009.sst is the first file created by the DB.
+ ASSERT_OK(file_manager_->WriteToFile(file_five, file_five_contents));
+
+ FillDB(db_.get(), 0, 100);
+ // backup overwrites file 000009.sst
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+
+ std::string new_file_five_contents;
+ ASSERT_OK(ReadFileToString(backup_chroot_env_.get(), file_five,
+ &new_file_five_contents));
+ // file 000009.sst was overwritten
+ ASSERT_TRUE(new_file_five_contents != file_five_contents);
+
+ CloseDBAndBackupEngine();
+
+ AssertBackupConsistency(0, 0, 100);
+}
+
+// Test that we properly propagate Env failures
+TEST_F(BackupEngineTest, EnvFailures) {
+ BackupEngine* backup_engine;
+
+ // get children failure
+ {
+ test_backup_fs_->SetGetChildrenFailure(true);
+ ASSERT_NOK(BackupEngine::Open(test_db_env_.get(), *engine_options_,
+ &backup_engine));
+ test_backup_fs_->SetGetChildrenFailure(false);
+ }
+
+ // created dir failure
+ {
+ test_backup_fs_->SetCreateDirIfMissingFailure(true);
+ ASSERT_NOK(BackupEngine::Open(test_db_env_.get(), *engine_options_,
+ &backup_engine));
+ test_backup_fs_->SetCreateDirIfMissingFailure(false);
+ }
+
+ // new directory failure
+ {
+ test_backup_fs_->SetNewDirectoryFailure(true);
+ ASSERT_NOK(BackupEngine::Open(test_db_env_.get(), *engine_options_,
+ &backup_engine));
+ test_backup_fs_->SetNewDirectoryFailure(false);
+ }
+
+ // Read from meta-file failure
+ {
+ DestroyDBWithoutCheck(dbname_, options_);
+ OpenDBAndBackupEngine(true);
+ FillDB(db_.get(), 0, 100);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+ CloseDBAndBackupEngine();
+ test_backup_fs_->SetDummySequentialFile(true);
+ test_backup_fs_->SetDummySequentialFileFailReads(true);
+ engine_options_->destroy_old_data = false;
+ ASSERT_NOK(BackupEngine::Open(test_db_env_.get(), *engine_options_,
+ &backup_engine));
+ test_backup_fs_->SetDummySequentialFile(false);
+ test_backup_fs_->SetDummySequentialFileFailReads(false);
+ }
+
+ // no failure
+ {
+ ASSERT_OK(BackupEngine::Open(test_db_env_.get(), *engine_options_,
+ &backup_engine));
+ delete backup_engine;
+ }
+}
+
+// Verify manifest can roll while a backup is being created with the old
+// manifest.
+TEST_F(BackupEngineTest, ChangeManifestDuringBackupCreation) {
+ DestroyDBWithoutCheck(dbname_, options_);
+ options_.max_manifest_file_size = 0; // always rollover manifest for file add
+ OpenDBAndBackupEngine(true);
+ FillDB(db_.get(), 0, 100, kAutoFlushOnly);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"CheckpointImpl::CreateCheckpoint:SavedLiveFiles1",
+ "VersionSet::LogAndApply:WriteManifest"},
+ {"VersionSet::LogAndApply:WriteManifestDone",
+ "CheckpointImpl::CreateCheckpoint:SavedLiveFiles2"},
+ });
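+  // The dependency above forces the flush thread's manifest roll
+  // (VersionSet::LogAndApply writing a new MANIFEST) to happen between the
+  // two SavedLiveFiles sync points, i.e. while the checkpoint behind the
+  // backup has already captured the old manifest.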
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread flush_thread{
+ [this]() { ASSERT_OK(db_->Flush(FlushOptions())); }};
+
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), false));
+
+ flush_thread.join();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ // The last manifest roll would've already been cleaned up by the full scan
+ // that happens when CreateNewBackup invokes EnableFileDeletions. We need to
+ // trigger another roll to verify non-full scan purges stale manifests.
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db_.get());
+ std::string prev_manifest_path =
+ DescriptorFileName(dbname_, db_impl->TEST_Current_Manifest_FileNo());
+ FillDB(db_.get(), 0, 100, kAutoFlushOnly);
+ ASSERT_OK(db_chroot_env_->FileExists(prev_manifest_path));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ // Even though manual flush completed above, the background thread may not
+ // have finished its cleanup work. `TEST_WaitForBackgroundWork()` will wait
+ // until all the background thread's work has completed, including cleanup.
+ ASSERT_OK(db_impl->TEST_WaitForBackgroundWork());
+ ASSERT_TRUE(db_chroot_env_->FileExists(prev_manifest_path).IsNotFound());
+
+ CloseDBAndBackupEngine();
+ DestroyDBWithoutCheck(dbname_, options_);
+ AssertBackupConsistency(0, 0, 100);
+}
+
+// see https://github.com/facebook/rocksdb/issues/921
+TEST_F(BackupEngineTest, Issue921Test) {
+ BackupEngine* backup_engine;
+ engine_options_->share_table_files = false;
+ ASSERT_OK(
+ backup_chroot_env_->CreateDirIfMissing(engine_options_->backup_dir));
+ engine_options_->backup_dir += "/new_dir";
+ ASSERT_OK(BackupEngine::Open(backup_chroot_env_.get(), *engine_options_,
+ &backup_engine));
+
+ delete backup_engine;
+}
+
+TEST_F(BackupEngineTest, BackupWithMetadata) {
+ const int keys_iteration = 5000;
+ OpenDBAndBackupEngine(true);
+ // create five backups
+ for (int i = 0; i < 5; ++i) {
+ const std::string metadata = std::to_string(i);
+ FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+ // Here also test CreateNewBackupWithMetadata with CreateBackupOptions
+ // and outputting saved BackupID.
+ CreateBackupOptions opts;
+ opts.flush_before_backup = true;
+ BackupID new_id = 0;
+ ASSERT_OK(backup_engine_->CreateNewBackupWithMetadata(opts, db_.get(),
+ metadata, &new_id));
+ ASSERT_EQ(new_id, static_cast<BackupID>(i + 1));
+ }
+ CloseDBAndBackupEngine();
+
+ OpenDBAndBackupEngine();
+ { // Verify in bulk BackupInfo
+ std::vector<BackupInfo> backup_infos;
+ backup_engine_->GetBackupInfo(&backup_infos);
+ ASSERT_EQ(5, backup_infos.size());
+ for (int i = 0; i < 5; i++) {
+ ASSERT_EQ(std::to_string(i), backup_infos[i].app_metadata);
+ }
+ }
+ // Also verify in individual BackupInfo
+ for (int i = 0; i < 5; i++) {
+ BackupInfo backup_info;
+ ASSERT_OK(backup_engine_->GetBackupInfo(static_cast<BackupID>(i + 1),
+ &backup_info));
+ ASSERT_EQ(std::to_string(i), backup_info.app_metadata);
+ }
+ CloseDBAndBackupEngine();
+ DestroyDBWithoutCheck(dbname_, options_);
+}
+
+TEST_F(BackupEngineTest, BinaryMetadata) {
+ OpenDBAndBackupEngine(true);
+ std::string binaryMetadata = "abc\ndef";
+ binaryMetadata.push_back('\0');
+ binaryMetadata.append("ghi");
+ ASSERT_OK(
+ backup_engine_->CreateNewBackupWithMetadata(db_.get(), binaryMetadata));
+ CloseDBAndBackupEngine();
+
+ OpenDBAndBackupEngine();
+ std::vector<BackupInfo> backup_infos;
+ backup_engine_->GetBackupInfo(&backup_infos);
+ ASSERT_EQ(1, backup_infos.size());
+ ASSERT_EQ(binaryMetadata, backup_infos[0].app_metadata);
+ CloseDBAndBackupEngine();
+ DestroyDBWithoutCheck(dbname_, options_);
+}
+
+TEST_F(BackupEngineTest, MetadataTooLarge) {
+ OpenDBAndBackupEngine(true);
+ std::string largeMetadata(1024 * 1024 + 1, 0);
+ ASSERT_NOK(
+ backup_engine_->CreateNewBackupWithMetadata(db_.get(), largeMetadata));
+ CloseDBAndBackupEngine();
+ DestroyDBWithoutCheck(dbname_, options_);
+}
+
+TEST_F(BackupEngineTest, MetaSchemaVersion2_SizeCorruption) {
+ engine_options_->schema_version = 1;
+ OpenDBAndBackupEngine(/*destroy_old_data*/ true);
+
+ // Backup 1: no future schema, no sizes, with checksums
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+
+ CloseDBAndBackupEngine();
+ engine_options_->schema_version = 2;
+ OpenDBAndBackupEngine(/*destroy_old_data*/ false);
+
+ // Backup 2: no checksums, no sizes
+ TEST_BackupMetaSchemaOptions test_opts;
+ test_opts.crc32c_checksums = false;
+ test_opts.file_sizes = false;
+ TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+
+ // Backup 3: no checksums, with sizes
+ test_opts.file_sizes = true;
+ TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+
+ // Backup 4: with checksums and sizes
+ test_opts.crc32c_checksums = true;
+ TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+
+ CloseDBAndBackupEngine();
+
+ // Corrupt all the CURRENT files with the wrong size
+ const std::string private_dir = backupdir_ + "/private";
+
+ for (int id = 1; id <= 3; ++id) {
+ ASSERT_OK(file_manager_->WriteToFile(
+ private_dir + "/" + std::to_string(id) + "/CURRENT", "x"));
+ }
+  // Except corrupt Backup 4 with a same-size CURRENT file
+ {
+ uint64_t size = 0;
+ ASSERT_OK(test_backup_env_->GetFileSize(private_dir + "/4/CURRENT", &size));
+ ASSERT_OK(file_manager_->WriteToFile(private_dir + "/4/CURRENT",
+ std::string(size, 'x')));
+ }
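+  // So backups 1-3 now have a wrong-sized CURRENT file (a size-only
+  // corruption), while backup 4 has a same-sized but content-corrupted
+  // CURRENT file, which only a checksum can catch.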
+
+ OpenBackupEngine();
+
+ // Only the one with sizes in metadata will be immediately detected
+ // as corrupt
+ std::vector<BackupID> corrupted;
+ backup_engine_->GetCorruptedBackups(&corrupted);
+ ASSERT_EQ(corrupted.size(), 1);
+ ASSERT_EQ(corrupted[0], 3);
+
+ // Size corruption detected on Restore with checksum
+ ASSERT_TRUE(backup_engine_->RestoreDBFromBackup(1 /*id*/, dbname_, dbname_)
+ .IsCorruption());
+
+ // Size corruption not detected without checksums nor sizes
+ ASSERT_OK(backup_engine_->RestoreDBFromBackup(2 /*id*/, dbname_, dbname_));
+
+ // Non-size corruption detected on Restore with checksum
+ ASSERT_TRUE(backup_engine_->RestoreDBFromBackup(4 /*id*/, dbname_, dbname_)
+ .IsCorruption());
+
+ CloseBackupEngine();
+}
+
+TEST_F(BackupEngineTest, MetaSchemaVersion2_NotSupported) {
+ engine_options_->schema_version = 2;
+ TEST_BackupMetaSchemaOptions test_opts;
+ std::string app_metadata = "abc\ndef";
+
+ OpenDBAndBackupEngine(true);
+ // Start with supported
+ TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts);
+ ASSERT_OK(
+ backup_engine_->CreateNewBackupWithMetadata(db_.get(), app_metadata));
+
+ // Because we are injecting badness with a TEST API, the badness is only
+ // detected on attempt to restore.
+ // Not supported versions
+ test_opts.version = "3";
+ TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts);
+ ASSERT_OK(
+ backup_engine_->CreateNewBackupWithMetadata(db_.get(), app_metadata));
+ test_opts.version = "23.45.67";
+ TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts);
+ ASSERT_OK(
+ backup_engine_->CreateNewBackupWithMetadata(db_.get(), app_metadata));
+ test_opts.version = "2";
+
+ // Non-ignorable fields
+ test_opts.meta_fields["ni::blah"] = "123";
+ TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts);
+ ASSERT_OK(
+ backup_engine_->CreateNewBackupWithMetadata(db_.get(), app_metadata));
+ test_opts.meta_fields.clear();
+
+ test_opts.file_fields["ni::123"] = "xyz";
+ TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts);
+ ASSERT_OK(
+ backup_engine_->CreateNewBackupWithMetadata(db_.get(), app_metadata));
+ test_opts.file_fields.clear();
+
+ test_opts.footer_fields["ni::123"] = "xyz";
+ TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts);
+ ASSERT_OK(
+ backup_engine_->CreateNewBackupWithMetadata(db_.get(), app_metadata));
+ test_opts.footer_fields.clear();
+ CloseDBAndBackupEngine();
+
+ OpenBackupEngine();
+ std::vector<BackupID> corrupted;
+ backup_engine_->GetCorruptedBackups(&corrupted);
+ ASSERT_EQ(corrupted.size(), 5);
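+  // Five of the six backups above were written with injected badness (two
+  // unsupported versions plus three non-ignorable fields), leaving only the
+  // first, fully supported backup valid; RestoreDBFromLatestBackup below
+  // should therefore fall back to that one.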
+
+ ASSERT_OK(backup_engine_->RestoreDBFromLatestBackup(dbname_, dbname_));
+ CloseBackupEngine();
+}
+
+TEST_F(BackupEngineTest, MetaSchemaVersion2_Restore) {
+ engine_options_->schema_version = 2;
+ TEST_BackupMetaSchemaOptions test_opts;
+ const int keys_iteration = 5000;
+
+ OpenDBAndBackupEngine(true, false, kShareWithChecksum);
+ FillDB(db_.get(), 0, keys_iteration);
+  // Start with minimal metadata to ensure it works without the metadata
+  // being filled in from shared files that also appear in other backups
+  // carrying that metadata.
+ test_opts.crc32c_checksums = false;
+ test_opts.file_sizes = false;
+ TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+ CloseDBAndBackupEngine();
+
+ AssertBackupConsistency(1 /* id */, 0, keys_iteration, keys_iteration * 2);
+
+ OpenDBAndBackupEngine(false /* destroy_old_data */, false,
+ kShareWithChecksum);
+ test_opts.file_sizes = true;
+ TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+ CloseDBAndBackupEngine();
+
+ for (int id = 1; id <= 2; ++id) {
+ AssertBackupConsistency(id, 0, keys_iteration, keys_iteration * 2);
+ }
+
+ OpenDBAndBackupEngine(false /* destroy_old_data */, false,
+ kShareWithChecksum);
+ test_opts.crc32c_checksums = true;
+ TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+ CloseDBAndBackupEngine();
+
+ for (int id = 1; id <= 3; ++id) {
+ AssertBackupConsistency(id, 0, keys_iteration, keys_iteration * 2);
+ }
+
+ OpenDBAndBackupEngine(false /* destroy_old_data */, false,
+ kShareWithChecksum);
+ // No TEST_EnableWriteFutureSchemaVersion2
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+ CloseDBAndBackupEngine();
+
+ for (int id = 1; id <= 4; ++id) {
+ AssertBackupConsistency(id, 0, keys_iteration, keys_iteration * 2);
+ }
+
+ OpenDBAndBackupEngine(false /* destroy_old_data */, false,
+ kShareWithChecksum);
+ // Minor version updates should be forward-compatible
+ test_opts.version = "2.5.70";
+ test_opts.meta_fields["asdf.3456"] = "-42";
+ test_opts.meta_fields["__QRST"] = " 1 $ %%& ";
+ test_opts.file_fields["z94._"] = "^\\";
+ test_opts.file_fields["_7yyyyyyyyy"] = "111111111111";
+ test_opts.footer_fields["Qwzn.tz89"] = "ASDF!!@# ##=\t ";
+ test_opts.footer_fields["yes"] = "no!";
+ TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+ CloseDBAndBackupEngine();
+
+ for (int id = 1; id <= 5; ++id) {
+ AssertBackupConsistency(id, 0, keys_iteration, keys_iteration * 2);
+ }
+}
+
+TEST_F(BackupEngineTest, Concurrency) {
+ // Check that we can simultaneously:
+ // * Run several read operations in different threads on a single
+ // BackupEngine object, and
+ // * With another BackupEngine object on the same
+ // backup_dir, run the same read operations in another thread, and
+ // * With yet another BackupEngine object on the same
+ // backup_dir, create two new backups in parallel threads.
+ //
+ // Because of the challenges of integrating this into db_stress,
+ // this is a non-deterministic mini-stress test here instead.
+
+ // To check for a race condition in handling buffer size based on byte
+ // burst limit, we need a (generous) rate limiter
+ std::shared_ptr<RateLimiter> limiter{NewGenericRateLimiter(1000000000)};
+ engine_options_->backup_rate_limiter = limiter;
+ engine_options_->restore_rate_limiter = limiter;
+
+ OpenDBAndBackupEngine(true, false, kShareWithChecksum);
+
+ static constexpr int keys_iteration = 5000;
+ FillDB(db_.get(), 0, keys_iteration);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+
+ FillDB(db_.get(), keys_iteration, 2 * keys_iteration);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+
+ static constexpr int max_factor = 3;
+ FillDB(db_.get(), 2 * keys_iteration, max_factor * keys_iteration);
+ // will create another backup soon...
+
+ Options db_opts = options_;
+ db_opts.wal_dir = "";
+ db_opts.create_if_missing = false;
+ BackupEngineOptions be_opts = *engine_options_;
+ be_opts.destroy_old_data = false;
+
+ std::mt19937 rng{std::random_device()()};
+
+ std::array<std::thread, 4> read_threads;
+ std::array<std::thread, 4> restore_verify_threads;
+ for (uint32_t i = 0; i < read_threads.size(); ++i) {
+ uint32_t sleep_micros = rng() % 100000;
+ read_threads[i] = std::thread([this, i, sleep_micros, &db_opts, &be_opts,
+ &restore_verify_threads, &limiter] {
+ test_db_env_->SleepForMicroseconds(sleep_micros);
+
+ // Whether to also re-open the BackupEngine, potentially seeing
+ // additional backups
+ bool reopen = i == 3;
+ // Whether we are going to restore "latest"
+ bool latest = i > 1;
+
+ BackupEngine* my_be;
+ if (reopen) {
+ ASSERT_OK(BackupEngine::Open(test_db_env_.get(), be_opts, &my_be));
+ } else {
+ my_be = backup_engine_.get();
+ }
+
+ // Verify metadata (we don't receive updates from concurrently
+ // creating a new backup)
+ std::vector<BackupInfo> infos;
+ my_be->GetBackupInfo(&infos);
+ const uint32_t count = static_cast<uint32_t>(infos.size());
+ infos.clear();
+ if (reopen) {
+ ASSERT_GE(count, 2U);
+ ASSERT_LE(count, 4U);
+ fprintf(stderr, "Reopen saw %u backups\n", count);
+ } else {
+ ASSERT_EQ(count, 2U);
+ }
+ std::vector<BackupID> ids;
+ my_be->GetCorruptedBackups(&ids);
+ ASSERT_EQ(ids.size(), 0U);
+
+ // (Eventually, see below) Restore one of the backups, or "latest"
+ std::string restore_db_dir = dbname_ + "/restore" + std::to_string(i);
+ DestroyDir(test_db_env_.get(), restore_db_dir).PermitUncheckedError();
+ BackupID to_restore;
+ if (latest) {
+ to_restore = count;
+ } else {
+ to_restore = i + 1;
+ }
+
+ // Open restored DB to verify its contents, but test atomic restore
+ // by doing it async and ensuring we either get OK or InvalidArgument
+ restore_verify_threads[i] =
+ std::thread([this, &db_opts, restore_db_dir, to_restore] {
+ DB* restored;
+ Status s;
+ for (;;) {
+ s = DB::Open(db_opts, restore_db_dir, &restored);
+ if (s.IsInvalidArgument()) {
+ // Restore hasn't finished
+ test_db_env_->SleepForMicroseconds(1000);
+ continue;
+ } else {
+ // We should only get InvalidArgument if restore is
+ // incomplete, or OK if complete
+ ASSERT_OK(s);
+ break;
+ }
+ }
+ int factor = std::min(static_cast<int>(to_restore), max_factor);
+ AssertExists(restored, 0, factor * keys_iteration);
+ AssertEmpty(restored, factor * keys_iteration,
+ (factor + 1) * keys_iteration);
+ delete restored;
+ });
+
+ // (Ok now) Restore one of the backups, or "latest"
+ if (latest) {
+ ASSERT_OK(
+ my_be->RestoreDBFromLatestBackup(restore_db_dir, restore_db_dir));
+ } else {
+ ASSERT_OK(my_be->VerifyBackup(to_restore, true));
+ ASSERT_OK(my_be->RestoreDBFromBackup(to_restore, restore_db_dir,
+ restore_db_dir));
+ }
+
+ // Test for race condition in reconfiguring limiter
+ // FIXME: this could set to a different value in all threads, except
+ // GenericRateLimiter::SetBytesPerSecond has a write-write race
+ // reported by TSAN
+ if (i == 0) {
+ limiter->SetBytesPerSecond(2000000000);
+ }
+
+ // Re-verify metadata (we don't receive updates from concurrently
+ // creating a new backup)
+ my_be->GetBackupInfo(&infos);
+ ASSERT_EQ(infos.size(), count);
+ my_be->GetCorruptedBackups(&ids);
+ ASSERT_EQ(ids.size(), 0);
+ // fprintf(stderr, "Finished read thread\n");
+
+ if (reopen) {
+ delete my_be;
+ }
+ });
+ }
+
+ BackupEngine* alt_be;
+ ASSERT_OK(BackupEngine::Open(test_db_env_.get(), be_opts, &alt_be));
+
+ std::array<std::thread, 2> append_threads;
+ for (unsigned i = 0; i < append_threads.size(); ++i) {
+ uint32_t sleep_micros = rng() % 100000;
+ append_threads[i] = std::thread([this, sleep_micros, alt_be] {
+ test_db_env_->SleepForMicroseconds(sleep_micros);
+      // WART: CreateNewBackup doesn't tell you the BackupID it just created,
+      // which is ugly in a multithreaded setting.
+ // TODO: add delete backup also when that is added
+ ASSERT_OK(alt_be->CreateNewBackup(db_.get()));
+ // fprintf(stderr, "Finished append thread\n");
+ });
+ }
+
+ for (auto& t : append_threads) {
+ t.join();
+ }
+ // Verify metadata
+ std::vector<BackupInfo> infos;
+ alt_be->GetBackupInfo(&infos);
+ ASSERT_EQ(infos.size(), 2 + append_threads.size());
+
+ for (auto& t : read_threads) {
+ t.join();
+ }
+
+ delete alt_be;
+
+ for (auto& t : restore_verify_threads) {
+ t.join();
+ }
+
+ CloseDBAndBackupEngine();
+}
+
+TEST_F(BackupEngineTest, LimitBackupsOpened) {
+ // Verify the specified max backups are opened, including skipping over
+ // corrupted backups.
+ //
+ // Setup:
+ // - backups 1, 2, and 4 are valid
+ // - backup 3 is corrupt
+ // - max_valid_backups_to_open == 2
+ //
+  // Expectation: the engine opens backups 4 and 2 since those are the latest
+  // two non-corrupt backups.
+ const int kNumKeys = 5000;
+ OpenDBAndBackupEngine(true);
+ for (int i = 1; i <= 4; ++i) {
+ FillDB(db_.get(), kNumKeys * i, kNumKeys * (i + 1));
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+ if (i == 3) {
+ ASSERT_OK(file_manager_->CorruptFile(backupdir_ + "/meta/3", 3));
+ }
+ }
+ CloseDBAndBackupEngine();
+
+ engine_options_->max_valid_backups_to_open = 2;
+ engine_options_->destroy_old_data = false;
+ BackupEngineReadOnly* read_only_backup_engine;
+ ASSERT_OK(BackupEngineReadOnly::Open(
+ backup_chroot_env_.get(), *engine_options_, &read_only_backup_engine));
+
+ std::vector<BackupInfo> backup_infos;
+ read_only_backup_engine->GetBackupInfo(&backup_infos);
+ ASSERT_EQ(2, backup_infos.size());
+ ASSERT_EQ(2, backup_infos[0].backup_id);
+ ASSERT_EQ(4, backup_infos[1].backup_id);
+ delete read_only_backup_engine;
+}
+
+TEST_F(BackupEngineTest, IgnoreLimitBackupsOpenedWhenNotReadOnly) {
+ // Verify the specified max_valid_backups_to_open is ignored if the engine
+ // is not read-only.
+ //
+ // Setup:
+ // - backups 1, 2, and 4 are valid
+ // - backup 3 is corrupt
+ // - max_valid_backups_to_open == 2
+ //
+  // Expectation: the engine opens backups 4, 2, and 1 since those are the
+  // latest non-corrupt backups, ignoring max_valid_backups_to_open == 2.
+ const int kNumKeys = 5000;
+ OpenDBAndBackupEngine(true);
+ for (int i = 1; i <= 4; ++i) {
+ FillDB(db_.get(), kNumKeys * i, kNumKeys * (i + 1));
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+ if (i == 3) {
+ ASSERT_OK(file_manager_->CorruptFile(backupdir_ + "/meta/3", 3));
+ }
+ }
+ CloseDBAndBackupEngine();
+
+ engine_options_->max_valid_backups_to_open = 2;
+ OpenDBAndBackupEngine();
+ std::vector<BackupInfo> backup_infos;
+ backup_engine_->GetBackupInfo(&backup_infos);
+ ASSERT_EQ(3, backup_infos.size());
+ ASSERT_EQ(1, backup_infos[0].backup_id);
+ ASSERT_EQ(2, backup_infos[1].backup_id);
+ ASSERT_EQ(4, backup_infos[2].backup_id);
+ CloseDBAndBackupEngine();
+ DestroyDBWithoutCheck(dbname_, options_);
+}
+
+TEST_F(BackupEngineTest, CreateWhenLatestBackupCorrupted) {
+  // We should pick an ID greater than the corrupted backups' IDs so creation
+  // can succeed even when the latest backup is corrupted.
+ const int kNumKeys = 5000;
+ OpenDBAndBackupEngine(true /* destroy_old_data */);
+ BackupInfo backup_info;
+ ASSERT_TRUE(backup_engine_->GetLatestBackupInfo(&backup_info).IsNotFound());
+ FillDB(db_.get(), 0 /* from */, kNumKeys);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(),
+ true /* flush_before_backup */));
+ ASSERT_OK(file_manager_->CorruptFile(backupdir_ + "/meta/1",
+ 3 /* bytes_to_corrupt */));
+ CloseDBAndBackupEngine();
+
+ OpenDBAndBackupEngine();
+ ASSERT_TRUE(backup_engine_->GetLatestBackupInfo(&backup_info).IsNotFound());
+
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(),
+ true /* flush_before_backup */));
+
+ ASSERT_TRUE(backup_engine_->GetLatestBackupInfo(&backup_info).ok());
+ ASSERT_EQ(2, backup_info.backup_id);
+
+ std::vector<BackupInfo> backup_infos;
+ backup_engine_->GetBackupInfo(&backup_infos);
+ ASSERT_EQ(1, backup_infos.size());
+ ASSERT_EQ(2, backup_infos[0].backup_id);
+
+ // Verify individual GetBackupInfo by ID
+ ASSERT_TRUE(backup_engine_->GetBackupInfo(0U, &backup_info).IsNotFound());
+ ASSERT_TRUE(backup_engine_->GetBackupInfo(1U, &backup_info).IsCorruption());
+ ASSERT_TRUE(backup_engine_->GetBackupInfo(2U, &backup_info).ok());
+ ASSERT_TRUE(backup_engine_->GetBackupInfo(3U, &backup_info).IsNotFound());
+ ASSERT_TRUE(
+ backup_engine_->GetBackupInfo(999999U, &backup_info).IsNotFound());
+}
+
+TEST_F(BackupEngineTest, WriteOnlyEngineNoSharedFileDeletion) {
+ // Verifies a write-only BackupEngine does not delete files belonging to valid
+ // backups when GarbageCollect, PurgeOldBackups, or DeleteBackup are called.
+ const int kNumKeys = 5000;
+ for (int i = 0; i < 3; ++i) {
+ OpenDBAndBackupEngine(i == 0 /* destroy_old_data */);
+ FillDB(db_.get(), i * kNumKeys, (i + 1) * kNumKeys);
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+ CloseDBAndBackupEngine();
+
+ engine_options_->max_valid_backups_to_open = 0;
+ OpenDBAndBackupEngine();
+ switch (i) {
+ case 0:
+ ASSERT_OK(backup_engine_->GarbageCollect());
+ break;
+ case 1:
+ ASSERT_OK(backup_engine_->PurgeOldBackups(1 /* num_backups_to_keep */));
+ break;
+ case 2:
+ ASSERT_OK(backup_engine_->DeleteBackup(2 /* backup_id */));
+ break;
+ default:
+ assert(false);
+ }
+ CloseDBAndBackupEngine();
+
+ engine_options_->max_valid_backups_to_open =
+ std::numeric_limits<int32_t>::max();
+ AssertBackupConsistency(i + 1, 0, (i + 1) * kNumKeys);
+ }
+}
+
+TEST_P(BackupEngineTestWithParam, BackupUsingDirectIO) {
+ // Tests direct I/O on the backup engine's reads and writes on the DB env and
+ // backup env
+ // We use ChrootEnv underneath so the below line checks for direct I/O support
+ // in the chroot directory, not the true filesystem root.
+ if (!test::IsDirectIOSupported(test_db_env_.get(), "/")) {
+ ROCKSDB_GTEST_SKIP("Test requires Direct I/O Support");
+ return;
+ }
+ const int kNumKeysPerBackup = 100;
+ const int kNumBackups = 3;
+ options_.use_direct_reads = true;
+ OpenDBAndBackupEngine(true /* destroy_old_data */);
+ for (int i = 0; i < kNumBackups; ++i) {
+ FillDB(db_.get(), i * kNumKeysPerBackup /* from */,
+ (i + 1) * kNumKeysPerBackup /* to */, kFlushAll);
+
+ // Clear the file open counters and then do a bunch of backup engine ops.
+ // For all ops, files should be opened in direct mode.
+ test_backup_fs_->ClearFileOpenCounters();
+ test_db_fs_->ClearFileOpenCounters();
+ CloseBackupEngine();
+ OpenBackupEngine();
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(),
+ false /* flush_before_backup */));
+ ASSERT_OK(backup_engine_->VerifyBackup(i + 1));
+ CloseBackupEngine();
+ OpenBackupEngine();
+ std::vector<BackupInfo> backup_infos;
+ backup_engine_->GetBackupInfo(&backup_infos);
+ ASSERT_EQ(static_cast<size_t>(i + 1), backup_infos.size());
+
+ // Verify backup engine always opened files with direct I/O
+ ASSERT_EQ(0, test_db_fs_->num_writers());
+ ASSERT_GE(test_db_fs_->num_direct_rand_readers(), 0);
+ ASSERT_GT(test_db_fs_->num_direct_seq_readers(), 0);
+ // Currently the DB doesn't support reading WALs or manifest with direct
+ // I/O, so subtract two.
+ ASSERT_EQ(test_db_fs_->num_seq_readers() - 2,
+ test_db_fs_->num_direct_seq_readers());
+ ASSERT_EQ(test_db_fs_->num_rand_readers(),
+ test_db_fs_->num_direct_rand_readers());
+ }
+ CloseDBAndBackupEngine();
+
+ for (int i = 0; i < kNumBackups; ++i) {
+ AssertBackupConsistency(i + 1 /* backup_id */,
+ i * kNumKeysPerBackup /* start_exist */,
+ (i + 1) * kNumKeysPerBackup /* end_exist */,
+ (i + 2) * kNumKeysPerBackup /* end */);
+ }
+}
+
+TEST_F(BackupEngineTest, BackgroundThreadCpuPriority) {
+ std::atomic<CpuPriority> priority(CpuPriority::kNormal);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackupEngineImpl::Initialize:SetCpuPriority", [&](void* new_priority) {
+ priority.store(*reinterpret_cast<CpuPriority*>(new_priority));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // 1 thread is easier to test with; otherwise, we may not be sure which
+  // thread actually does the work during CreateNewBackup.
+ engine_options_->max_background_operations = 1;
+ OpenDBAndBackupEngine(true);
+
+ {
+ FillDB(db_.get(), 0, 100);
+
+ // by default, cpu priority is not changed.
+ CreateBackupOptions options;
+ ASSERT_OK(backup_engine_->CreateNewBackup(options, db_.get()));
+
+ ASSERT_EQ(priority, CpuPriority::kNormal);
+ }
+
+ {
+ FillDB(db_.get(), 101, 200);
+
+ // decrease cpu priority from normal to low.
+ CreateBackupOptions options;
+ options.decrease_background_thread_cpu_priority = true;
+ options.background_thread_cpu_priority = CpuPriority::kLow;
+ ASSERT_OK(backup_engine_->CreateNewBackup(options, db_.get()));
+
+ ASSERT_EQ(priority, CpuPriority::kLow);
+ }
+
+ {
+ FillDB(db_.get(), 201, 300);
+
+    // try to upgrade cpu priority back to normal;
+    // the priority should still be low.
+ CreateBackupOptions options;
+ options.decrease_background_thread_cpu_priority = true;
+ options.background_thread_cpu_priority = CpuPriority::kNormal;
+ ASSERT_OK(backup_engine_->CreateNewBackup(options, db_.get()));
+
+ ASSERT_EQ(priority, CpuPriority::kLow);
+ }
+
+ {
+ FillDB(db_.get(), 301, 400);
+
+ // decrease cpu priority from low to idle.
+ CreateBackupOptions options;
+ options.decrease_background_thread_cpu_priority = true;
+ options.background_thread_cpu_priority = CpuPriority::kIdle;
+ ASSERT_OK(backup_engine_->CreateNewBackup(options, db_.get()));
+
+ ASSERT_EQ(priority, CpuPriority::kIdle);
+ }
+
+ {
+ FillDB(db_.get(), 301, 400);
+
+ // reset priority to later verify that it's not updated by SetCpuPriority.
+ priority = CpuPriority::kNormal;
+
+ // setting the same cpu priority won't call SetCpuPriority.
+ CreateBackupOptions options;
+ options.decrease_background_thread_cpu_priority = true;
+ options.background_thread_cpu_priority = CpuPriority::kIdle;
+
+ // Also check output backup_id with CreateNewBackup
+ BackupID new_id = 0;
+ ASSERT_OK(backup_engine_->CreateNewBackup(options, db_.get(), &new_id));
+ ASSERT_EQ(new_id, 5U);
+
+ ASSERT_EQ(priority, CpuPriority::kNormal);
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ CloseDBAndBackupEngine();
+ DestroyDBWithoutCheck(dbname_, options_);
+}
+
+// Populates `*total_size` with the size of all files under `backup_dir`.
+// We don't go through `BackupEngine` currently because it's hard to figure out
+// the metadata file size.
+Status GetSizeOfBackupFiles(FileSystem* backup_fs,
+ const std::string& backup_dir, size_t* total_size) {
+ *total_size = 0;
+ std::vector<std::string> dir_stack = {backup_dir};
+ Status s;
+ while (s.ok() && !dir_stack.empty()) {
+ std::string dir = std::move(dir_stack.back());
+ dir_stack.pop_back();
+ std::vector<std::string> children;
+ s = backup_fs->GetChildren(dir, IOOptions(), &children, nullptr /* dbg */);
+ for (size_t i = 0; s.ok() && i < children.size(); ++i) {
+ std::string path = dir + "/" + children[i];
+ bool is_dir;
+ s = backup_fs->IsDirectory(path, IOOptions(), &is_dir, nullptr /* dbg */);
+ uint64_t file_size = 0;
+ if (s.ok()) {
+ if (is_dir) {
+ dir_stack.emplace_back(std::move(path));
+ } else {
+ s = backup_fs->GetFileSize(path, IOOptions(), &file_size,
+ nullptr /* dbg */);
+ }
+ }
+ if (s.ok()) {
+ *total_size += file_size;
+ }
+ }
+ }
+ return s;
+}
+
+TEST_F(BackupEngineTest, IOStats) {
+  // Tests that the `BACKUP_READ_BYTES` and `BACKUP_WRITE_BYTES` ticker stats
+  // have the expected values according to the files in the backups.
+
+ // These ticker stats are expected to be populated regardless of `PerfLevel`
+ // in user thread
+ SetPerfLevel(kDisable);
+
+ options_.statistics = CreateDBStatistics();
+ OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */,
+ kShareWithChecksum);
+
+ FillDB(db_.get(), 0 /* from */, 100 /* to */, kFlushMost);
+
+ ASSERT_EQ(0, options_.statistics->getTickerCount(BACKUP_READ_BYTES));
+ ASSERT_EQ(0, options_.statistics->getTickerCount(BACKUP_WRITE_BYTES));
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(),
+ false /* flush_before_backup */));
+
+ size_t orig_backup_files_size;
+ ASSERT_OK(GetSizeOfBackupFiles(test_backup_env_->GetFileSystem().get(),
+ backupdir_, &orig_backup_files_size));
+ size_t expected_bytes_written = orig_backup_files_size;
+ ASSERT_EQ(expected_bytes_written,
+ options_.statistics->getTickerCount(BACKUP_WRITE_BYTES));
+  // Bytes read is more difficult to pin down since there are reads for many
+  // purposes other than creating the file, like `GetSortedWalFiles()` to find
+  // the first sequence number, or the `CreateNewBackup()` thread finding the
+  // SST file session ID. So we loosely require at least as many reads as
+  // needed for copying, but not as many as twice that.
+ ASSERT_GE(options_.statistics->getTickerCount(BACKUP_READ_BYTES),
+ expected_bytes_written);
+ ASSERT_LT(expected_bytes_written,
+ 2 * options_.statistics->getTickerCount(BACKUP_READ_BYTES));
+
+ FillDB(db_.get(), 100 /* from */, 200 /* to */, kFlushMost);
+
+ ASSERT_OK(options_.statistics->Reset());
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(),
+ false /* flush_before_backup */));
+ size_t final_backup_files_size;
+ ASSERT_OK(GetSizeOfBackupFiles(test_backup_env_->GetFileSystem().get(),
+ backupdir_, &final_backup_files_size));
+ expected_bytes_written = final_backup_files_size - orig_backup_files_size;
+ ASSERT_EQ(expected_bytes_written,
+ options_.statistics->getTickerCount(BACKUP_WRITE_BYTES));
+ // See above for why these bounds were chosen.
+ ASSERT_GE(options_.statistics->getTickerCount(BACKUP_READ_BYTES),
+ expected_bytes_written);
+ ASSERT_LT(expected_bytes_written,
+ 2 * options_.statistics->getTickerCount(BACKUP_READ_BYTES));
+}
+
+TEST_F(BackupEngineTest, FileTemperatures) {
+ CloseDBAndBackupEngine();
+
+ // Required for recording+restoring temperatures
+ engine_options_->schema_version = 2;
+
+ // More file IO instrumentation
+ auto my_db_fs = std::make_shared<FileTemperatureTestFS>(db_chroot_fs_);
+ test_db_fs_ = std::make_shared<TestFs>(my_db_fs);
+ SetEnvsFromFileSystems();
+
+ // Use temperatures
+ options_.bottommost_temperature = Temperature::kWarm;
+ options_.level0_file_num_compaction_trigger = 2;
+  // Set level_compaction_dynamic_level_bytes to true so compaction compacts
+  // the data directly to the last level, which will have the
+  // last_level_temperature
+ options_.level_compaction_dynamic_level_bytes = true;
+
+ OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */,
+ kShareWithChecksum);
+
+ // generate a bottommost file (combined from 2) and a non-bottommost file
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_.get());
+ ASSERT_OK(db_->Put(WriteOptions(), "a", "val"));
+ ASSERT_OK(db_->Put(WriteOptions(), "c", "val"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_OK(db_->Put(WriteOptions(), "b", "val"));
+ ASSERT_OK(db_->Put(WriteOptions(), "d", "val"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_OK(dbi->TEST_WaitForCompact());
+ ASSERT_OK(db_->Put(WriteOptions(), "e", "val"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ // Get temperatures from manifest
+ std::map<uint64_t, Temperature> manifest_temps;
+ std::map<Temperature, int> manifest_temp_counts;
+ {
+ std::vector<LiveFileStorageInfo> infos;
+ ASSERT_OK(
+ db_->GetLiveFilesStorageInfo(LiveFilesStorageInfoOptions(), &infos));
+ for (auto info : infos) {
+ if (info.file_type == kTableFile) {
+ manifest_temps.emplace(info.file_number, info.temperature);
+ manifest_temp_counts[info.temperature]++;
+ }
+ }
+ }
+
+ // Verify expected manifest temperatures
+ ASSERT_EQ(manifest_temp_counts.size(), 2);
+ ASSERT_EQ(manifest_temp_counts[Temperature::kWarm], 1);
+ ASSERT_EQ(manifest_temp_counts[Temperature::kUnknown], 1);
+
+ // Verify manifest temperatures match FS temperatures
+ std::map<uint64_t, Temperature> current_temps;
+ my_db_fs->CopyCurrentSstFileTemperatures(&current_temps);
+ for (const auto& manifest_temp : manifest_temps) {
+ ASSERT_EQ(current_temps[manifest_temp.first], manifest_temp.second);
+ }
+
+ // Try a few different things
+ for (int i = 1; i <= 5; ++i) {
+ // Expected temperatures after restore are based on manifest temperatures
+ std::map<uint64_t, Temperature> expected_temps = manifest_temps;
+
+ if (i >= 2) {
+      // For iterations 2 & 3, override the current temperature of one file
+      // and vary which temperature is authoritative (current or manifest).
+      // For iterations 4 & 5, override the current temperature of both files,
+      // but make sure a known current temperature always takes precedence
+      // over unknown, regardless of the
+      // current_temperatures_override_manifest setting.
+ bool use_current = ((i % 2) == 1);
+ engine_options_->current_temperatures_override_manifest = use_current;
+ CloseBackupEngine();
+ OpenBackupEngine();
+ for (const auto& manifest_temp : manifest_temps) {
+ if (i <= 3) {
+ if (manifest_temp.second == Temperature::kWarm) {
+ my_db_fs->OverrideSstFileTemperature(manifest_temp.first,
+ Temperature::kCold);
+ if (use_current) {
+ expected_temps[manifest_temp.first] = Temperature::kCold;
+ }
+ }
+ } else {
+ assert(i <= 5);
+ if (manifest_temp.second == Temperature::kWarm) {
+ my_db_fs->OverrideSstFileTemperature(manifest_temp.first,
+ Temperature::kUnknown);
+ } else {
+ ASSERT_EQ(manifest_temp.second, Temperature::kUnknown);
+ my_db_fs->OverrideSstFileTemperature(manifest_temp.first,
+ Temperature::kHot);
+ // regardless of use_current
+ expected_temps[manifest_temp.first] = Temperature::kHot;
+ }
+ }
+ }
+ }
+
+ // Sample requested temperatures in opening files for backup
+ my_db_fs->PopRequestedSstFileTemperatures();
+ ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+
+ // Verify requested temperatures against manifest temperatures (before
+ // retry with kUnknown if needed, and before backup finds out current
+ // temperatures in FileSystem)
+ std::vector<std::pair<uint64_t, Temperature>> requested_temps;
+ my_db_fs->PopRequestedSstFileTemperatures(&requested_temps);
+ std::set<uint64_t> distinct_requests;
+ for (const auto& requested_temp : requested_temps) {
+ // Matching manifest temperatures, except allow retry request with
+ // kUnknown
+ auto manifest_temp = manifest_temps.at(requested_temp.first);
+ if (manifest_temp == Temperature::kUnknown ||
+ requested_temp.second != Temperature::kUnknown) {
+ ASSERT_EQ(manifest_temp, requested_temp.second);
+ }
+ distinct_requests.insert(requested_temp.first);
+ }
+ // Two distinct requests
+ ASSERT_EQ(distinct_requests.size(), 2);
+
+ // Verify against backup info file details API
+ BackupInfo info;
+ ASSERT_OK(backup_engine_->GetLatestBackupInfo(
+ &info, /*include_file_details*/ true));
+ ASSERT_GT(info.file_details.size(), 2);
+ for (auto& e : info.file_details) {
+ ASSERT_EQ(expected_temps[e.file_number], e.temperature);
+ }
+
+ // Restore backup to another virtual (tiered) dir
+ const std::string restore_dir = "/restore" + std::to_string(i);
+ ASSERT_OK(backup_engine_->RestoreDBFromLatestBackup(
+ RestoreOptions(), restore_dir, restore_dir));
+
+ // Verify restored FS temperatures match expectation
+ // (FileTemperatureTestFS doesn't distinguish directories when reporting
+ // current temperatures, just whatever SST was written or overridden last
+ // with that file number.)
+ my_db_fs->CopyCurrentSstFileTemperatures(&current_temps);
+ for (const auto& expected_temp : expected_temps) {
+ ASSERT_EQ(current_temps[expected_temp.first], expected_temp.second);
+ }
+
+ // Delete backup to force next backup to copy files
+ ASSERT_OK(backup_engine_->PurgeOldBackups(0));
+ }
+}
+
+} // namespace
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as BackupEngine is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !defined(ROCKSDB_LITE) && !defined(OS_WIN)
diff --git a/src/rocksdb/utilities/blob_db/blob_compaction_filter.cc b/src/rocksdb/utilities/blob_db/blob_compaction_filter.cc
new file mode 100644
index 000000000..86907e979
--- /dev/null
+++ b/src/rocksdb/utilities/blob_db/blob_compaction_filter.cc
@@ -0,0 +1,490 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/blob_db/blob_compaction_filter.h"
+
+#include <cinttypes>
+
+#include "db/dbformat.h"
+#include "logging/logging.h"
+#include "rocksdb/system_clock.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace blob_db {
+
+BlobIndexCompactionFilterBase::~BlobIndexCompactionFilterBase() {
+ if (blob_file_) {
+ CloseAndRegisterNewBlobFile();
+ }
+ RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EXPIRED_COUNT, expired_count_);
+ RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EXPIRED_SIZE, expired_size_);
+ RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EVICTED_COUNT, evicted_count_);
+ RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EVICTED_SIZE, evicted_size_);
+}
+
+CompactionFilter::Decision BlobIndexCompactionFilterBase::FilterV2(
+ int level, const Slice& key, ValueType value_type, const Slice& value,
+ std::string* new_value, std::string* skip_until) const {
+ const CompactionFilter* ucf = user_comp_filter();
+ if (value_type != kBlobIndex) {
+ if (ucf == nullptr) {
+ return Decision::kKeep;
+ }
+ // Apply user compaction filter for inlined data.
+ CompactionFilter::Decision decision =
+ ucf->FilterV2(level, key, value_type, value, new_value, skip_until);
+ if (decision == Decision::kChangeValue) {
+ return HandleValueChange(key, new_value);
+ }
+ return decision;
+ }
+ BlobIndex blob_index;
+ Status s = blob_index.DecodeFrom(value);
+ if (!s.ok()) {
+ // Unable to decode blob index. Keeping the value.
+ return Decision::kKeep;
+ }
+ if (blob_index.HasTTL() && blob_index.expiration() <= current_time_) {
+ // Expired
+ expired_count_++;
+ expired_size_ += key.size() + value.size();
+ return Decision::kRemove;
+ }
+ if (!blob_index.IsInlined() &&
+ blob_index.file_number() < context_.next_file_number &&
+ context_.current_blob_files.count(blob_index.file_number()) == 0) {
+ // Corresponding blob file gone (most likely, evicted by FIFO eviction).
+ evicted_count_++;
+ evicted_size_ += key.size() + value.size();
+ return Decision::kRemove;
+ }
+ if (context_.fifo_eviction_seq > 0 && blob_index.HasTTL() &&
+ blob_index.expiration() < context_.evict_expiration_up_to) {
+ // Hack: Internal key is passed to BlobIndexCompactionFilter for it to
+ // get sequence number.
+ ParsedInternalKey ikey;
+ if (!ParseInternalKey(
+ key, &ikey,
+ context_.blob_db_impl->db_options_.allow_data_in_errors)
+ .ok()) {
+ assert(false);
+ return Decision::kKeep;
+ }
+    // Remove keys that could have been removed by the last FIFO eviction.
+    // If we get an error while parsing the key, ignore it and continue.
+ if (ikey.sequence < context_.fifo_eviction_seq) {
+ evicted_count_++;
+ evicted_size_ += key.size() + value.size();
+ return Decision::kRemove;
+ }
+ }
+ // Apply user compaction filter for all non-TTL blob data.
+ if (ucf != nullptr && !blob_index.HasTTL()) {
+ // Hack: Internal key is passed to BlobIndexCompactionFilter for it to
+ // get sequence number.
+ ParsedInternalKey ikey;
+ if (!ParseInternalKey(
+ key, &ikey,
+ context_.blob_db_impl->db_options_.allow_data_in_errors)
+ .ok()) {
+ assert(false);
+ return Decision::kKeep;
+ }
+ // Read value from blob file.
+ PinnableSlice blob;
+ CompressionType compression_type = kNoCompression;
+ constexpr bool need_decompress = true;
+ if (!ReadBlobFromOldFile(ikey.user_key, blob_index, &blob, need_decompress,
+ &compression_type)) {
+ return Decision::kIOError;
+ }
+ CompactionFilter::Decision decision = ucf->FilterV2(
+ level, ikey.user_key, kValue, blob, new_value, skip_until);
+ if (decision == Decision::kChangeValue) {
+ return HandleValueChange(ikey.user_key, new_value);
+ }
+ return decision;
+ }
+ return Decision::kKeep;
+}
+
+CompactionFilter::Decision BlobIndexCompactionFilterBase::HandleValueChange(
+ const Slice& key, std::string* new_value) const {
+ BlobDBImpl* const blob_db_impl = context_.blob_db_impl;
+ assert(blob_db_impl);
+
+ if (new_value->size() < blob_db_impl->bdb_options_.min_blob_size) {
+ // Keep new_value inlined.
+ return Decision::kChangeValue;
+ }
+ if (!OpenNewBlobFileIfNeeded()) {
+ return Decision::kIOError;
+ }
+ Slice new_blob_value(*new_value);
+ std::string compression_output;
+ if (blob_db_impl->bdb_options_.compression != kNoCompression) {
+ new_blob_value =
+ blob_db_impl->GetCompressedSlice(new_blob_value, &compression_output);
+ }
+ uint64_t new_blob_file_number = 0;
+ uint64_t new_blob_offset = 0;
+ if (!WriteBlobToNewFile(key, new_blob_value, &new_blob_file_number,
+ &new_blob_offset)) {
+ return Decision::kIOError;
+ }
+ if (!CloseAndRegisterNewBlobFileIfNeeded()) {
+ return Decision::kIOError;
+ }
+ BlobIndex::EncodeBlob(new_value, new_blob_file_number, new_blob_offset,
+ new_blob_value.size(),
+ blob_db_impl->bdb_options_.compression);
+ return Decision::kChangeBlobIndex;
+}
+
+BlobIndexCompactionFilterGC::~BlobIndexCompactionFilterGC() {
+ assert(context().blob_db_impl);
+
+ ROCKS_LOG_INFO(context().blob_db_impl->db_options_.info_log,
+ "GC pass finished %s: encountered %" PRIu64 " blobs (%" PRIu64
+ " bytes), relocated %" PRIu64 " blobs (%" PRIu64
+ " bytes), created %" PRIu64 " new blob file(s)",
+ !gc_stats_.HasError() ? "successfully" : "with failure",
+ gc_stats_.AllBlobs(), gc_stats_.AllBytes(),
+ gc_stats_.RelocatedBlobs(), gc_stats_.RelocatedBytes(),
+ gc_stats_.NewFiles());
+
+ RecordTick(statistics(), BLOB_DB_GC_NUM_KEYS_RELOCATED,
+ gc_stats_.RelocatedBlobs());
+ RecordTick(statistics(), BLOB_DB_GC_BYTES_RELOCATED,
+ gc_stats_.RelocatedBytes());
+ RecordTick(statistics(), BLOB_DB_GC_NUM_NEW_FILES, gc_stats_.NewFiles());
+ RecordTick(statistics(), BLOB_DB_GC_FAILURES, gc_stats_.HasError());
+}
+
+bool BlobIndexCompactionFilterBase::IsBlobFileOpened() const {
+ if (blob_file_) {
+ assert(writer_);
+ return true;
+ }
+ return false;
+}
+
+bool BlobIndexCompactionFilterBase::OpenNewBlobFileIfNeeded() const {
+ if (IsBlobFileOpened()) {
+ return true;
+ }
+
+ BlobDBImpl* const blob_db_impl = context_.blob_db_impl;
+ assert(blob_db_impl);
+
+ const Status s = blob_db_impl->CreateBlobFileAndWriter(
+ /* has_ttl */ false, ExpirationRange(), "compaction/GC", &blob_file_,
+ &writer_);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(
+ blob_db_impl->db_options_.info_log,
+ "Error opening new blob file during compaction/GC, status: %s",
+ s.ToString().c_str());
+ blob_file_.reset();
+ writer_.reset();
+ return false;
+ }
+
+ assert(blob_file_);
+ assert(writer_);
+
+ return true;
+}
+
+bool BlobIndexCompactionFilterBase::ReadBlobFromOldFile(
+ const Slice& key, const BlobIndex& blob_index, PinnableSlice* blob,
+ bool need_decompress, CompressionType* compression_type) const {
+ BlobDBImpl* const blob_db_impl = context_.blob_db_impl;
+ assert(blob_db_impl);
+
+ Status s = blob_db_impl->GetRawBlobFromFile(
+ key, blob_index.file_number(), blob_index.offset(), blob_index.size(),
+ blob, compression_type);
+
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(
+ blob_db_impl->db_options_.info_log,
+ "Error reading blob during compaction/GC, key: %s (%s), status: %s",
+ key.ToString(/* output_hex */ true).c_str(),
+ blob_index.DebugString(/* output_hex */ true).c_str(),
+ s.ToString().c_str());
+
+ return false;
+ }
+
+ if (need_decompress && *compression_type != kNoCompression) {
+ s = blob_db_impl->DecompressSlice(*blob, *compression_type, blob);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(
+ blob_db_impl->db_options_.info_log,
+ "Uncompression error during blob read from file: %" PRIu64
+ " blob_offset: %" PRIu64 " blob_size: %" PRIu64
+ " key: %s status: '%s'",
+ blob_index.file_number(), blob_index.offset(), blob_index.size(),
+ key.ToString(/* output_hex */ true).c_str(), s.ToString().c_str());
+
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool BlobIndexCompactionFilterBase::WriteBlobToNewFile(
+ const Slice& key, const Slice& blob, uint64_t* new_blob_file_number,
+ uint64_t* new_blob_offset) const {
+ TEST_SYNC_POINT("BlobIndexCompactionFilterBase::WriteBlobToNewFile");
+ assert(new_blob_file_number);
+ assert(new_blob_offset);
+
+ assert(blob_file_);
+ *new_blob_file_number = blob_file_->BlobFileNumber();
+
+ assert(writer_);
+ uint64_t new_key_offset = 0;
+ const Status s = writer_->AddRecord(key, blob, kNoExpiration, &new_key_offset,
+ new_blob_offset);
+
+ if (!s.ok()) {
+ const BlobDBImpl* const blob_db_impl = context_.blob_db_impl;
+ assert(blob_db_impl);
+
+ ROCKS_LOG_ERROR(blob_db_impl->db_options_.info_log,
+ "Error writing blob to new file %s during compaction/GC, "
+ "key: %s, status: %s",
+ blob_file_->PathName().c_str(),
+ key.ToString(/* output_hex */ true).c_str(),
+ s.ToString().c_str());
+ return false;
+ }
+
+ const uint64_t new_size =
+ BlobLogRecord::kHeaderSize + key.size() + blob.size();
+ blob_file_->BlobRecordAdded(new_size);
+
+ BlobDBImpl* const blob_db_impl = context_.blob_db_impl;
+ assert(blob_db_impl);
+
+ blob_db_impl->total_blob_size_ += new_size;
+
+ return true;
+}
+
+bool BlobIndexCompactionFilterBase::CloseAndRegisterNewBlobFileIfNeeded()
+ const {
+ const BlobDBImpl* const blob_db_impl = context_.blob_db_impl;
+ assert(blob_db_impl);
+
+ assert(blob_file_);
+ if (blob_file_->GetFileSize() < blob_db_impl->bdb_options_.blob_file_size) {
+ return true;
+ }
+
+ return CloseAndRegisterNewBlobFile();
+}
+
+bool BlobIndexCompactionFilterBase::CloseAndRegisterNewBlobFile() const {
+ BlobDBImpl* const blob_db_impl = context_.blob_db_impl;
+ assert(blob_db_impl);
+ assert(blob_file_);
+
+ Status s;
+
+ {
+ WriteLock wl(&blob_db_impl->mutex_);
+
+ s = blob_db_impl->CloseBlobFile(blob_file_);
+
+ // Note: we delay registering the new blob file until it's closed to
+ // prevent FIFO eviction from processing it during compaction/GC.
+ blob_db_impl->RegisterBlobFile(blob_file_);
+ }
+
+ assert(blob_file_->Immutable());
+
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(
+ blob_db_impl->db_options_.info_log,
+ "Error closing new blob file %s during compaction/GC, status: %s",
+ blob_file_->PathName().c_str(), s.ToString().c_str());
+ }
+
+ blob_file_.reset();
+ return s.ok();
+}
+
+CompactionFilter::BlobDecision BlobIndexCompactionFilterGC::PrepareBlobOutput(
+ const Slice& key, const Slice& existing_value,
+ std::string* new_value) const {
+ assert(new_value);
+
+ const BlobDBImpl* const blob_db_impl = context().blob_db_impl;
+ (void)blob_db_impl;
+
+ assert(blob_db_impl);
+ assert(blob_db_impl->bdb_options_.enable_garbage_collection);
+
+ BlobIndex blob_index;
+ const Status s = blob_index.DecodeFrom(existing_value);
+ if (!s.ok()) {
+ gc_stats_.SetError();
+ return BlobDecision::kCorruption;
+ }
+
+ if (blob_index.IsInlined()) {
+ gc_stats_.AddBlob(blob_index.value().size());
+
+ return BlobDecision::kKeep;
+ }
+
+ gc_stats_.AddBlob(blob_index.size());
+
+ if (blob_index.HasTTL()) {
+ return BlobDecision::kKeep;
+ }
+
+ if (blob_index.file_number() >= context_gc_.cutoff_file_number) {
+ return BlobDecision::kKeep;
+ }
+
+ // Note: each compaction generates its own blob files, which, depending on the
+ // workload, might result in many small blob files. The total number of files
+ // is bounded though (determined by the number of compactions and the blob
+ // file size option).
+ if (!OpenNewBlobFileIfNeeded()) {
+ gc_stats_.SetError();
+ return BlobDecision::kIOError;
+ }
+
+ PinnableSlice blob;
+ CompressionType compression_type = kNoCompression;
+ std::string compression_output;
+ if (!ReadBlobFromOldFile(key, blob_index, &blob, false, &compression_type)) {
+ gc_stats_.SetError();
+ return BlobDecision::kIOError;
+ }
+
+  // If the compression type has changed, re-compress the blob with the new
+  // compression type.
+ if (compression_type != blob_db_impl->bdb_options_.compression) {
+ if (compression_type != kNoCompression) {
+ const Status status =
+ blob_db_impl->DecompressSlice(blob, compression_type, &blob);
+ if (!status.ok()) {
+ gc_stats_.SetError();
+ return BlobDecision::kCorruption;
+ }
+ }
+ if (blob_db_impl->bdb_options_.compression != kNoCompression) {
+ blob_db_impl->GetCompressedSlice(blob, &compression_output);
+ blob = PinnableSlice(&compression_output);
+ blob.PinSelf();
+ }
+ }
+
+ uint64_t new_blob_file_number = 0;
+ uint64_t new_blob_offset = 0;
+ if (!WriteBlobToNewFile(key, blob, &new_blob_file_number, &new_blob_offset)) {
+ gc_stats_.SetError();
+ return BlobDecision::kIOError;
+ }
+
+ if (!CloseAndRegisterNewBlobFileIfNeeded()) {
+ gc_stats_.SetError();
+ return BlobDecision::kIOError;
+ }
+
+ BlobIndex::EncodeBlob(new_value, new_blob_file_number, new_blob_offset,
+ blob.size(), compression_type);
+
+ gc_stats_.AddRelocatedBlob(blob_index.size());
+
+ return BlobDecision::kChangeValue;
+}
+
+bool BlobIndexCompactionFilterGC::OpenNewBlobFileIfNeeded() const {
+ if (IsBlobFileOpened()) {
+ return true;
+ }
+ bool result = BlobIndexCompactionFilterBase::OpenNewBlobFileIfNeeded();
+ if (result) {
+ gc_stats_.AddNewFile();
+ }
+ return result;
+}
+
+std::unique_ptr<CompactionFilter>
+BlobIndexCompactionFilterFactoryBase::CreateUserCompactionFilterFromFactory(
+ const CompactionFilter::Context& context) const {
+ std::unique_ptr<CompactionFilter> user_comp_filter_from_factory;
+ if (user_comp_filter_factory_) {
+ user_comp_filter_from_factory =
+ user_comp_filter_factory_->CreateCompactionFilter(context);
+ }
+ return user_comp_filter_from_factory;
+}
+
+std::unique_ptr<CompactionFilter>
+BlobIndexCompactionFilterFactory::CreateCompactionFilter(
+ const CompactionFilter::Context& _context) {
+ assert(clock());
+
+ int64_t current_time = 0;
+ Status s = clock()->GetCurrentTime(&current_time);
+ if (!s.ok()) {
+ return nullptr;
+ }
+ assert(current_time >= 0);
+
+ assert(blob_db_impl());
+
+ BlobCompactionContext context;
+ blob_db_impl()->GetCompactionContext(&context);
+
+ std::unique_ptr<CompactionFilter> user_comp_filter_from_factory =
+ CreateUserCompactionFilterFromFactory(_context);
+
+ return std::unique_ptr<CompactionFilter>(new BlobIndexCompactionFilter(
+ std::move(context), user_comp_filter(),
+ std::move(user_comp_filter_from_factory), current_time, statistics()));
+}
+
+std::unique_ptr<CompactionFilter>
+BlobIndexCompactionFilterFactoryGC::CreateCompactionFilter(
+ const CompactionFilter::Context& _context) {
+ assert(clock());
+
+ int64_t current_time = 0;
+ Status s = clock()->GetCurrentTime(&current_time);
+ if (!s.ok()) {
+ return nullptr;
+ }
+ assert(current_time >= 0);
+
+ assert(blob_db_impl());
+
+ BlobCompactionContext context;
+ BlobCompactionContextGC context_gc;
+ blob_db_impl()->GetCompactionContext(&context, &context_gc);
+
+ std::unique_ptr<CompactionFilter> user_comp_filter_from_factory =
+ CreateUserCompactionFilterFromFactory(_context);
+
+ return std::unique_ptr<CompactionFilter>(new BlobIndexCompactionFilterGC(
+ std::move(context), std::move(context_gc), user_comp_filter(),
+ std::move(user_comp_filter_from_factory), current_time, statistics()));
+}
+
+} // namespace blob_db
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/blob_db/blob_compaction_filter.h b/src/rocksdb/utilities/blob_db/blob_compaction_filter.h
new file mode 100644
index 000000000..1493cfc1a
--- /dev/null
+++ b/src/rocksdb/utilities/blob_db/blob_compaction_filter.h
@@ -0,0 +1,204 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <unordered_set>
+
+#include "db/blob/blob_index.h"
+#include "monitoring/statistics.h"
+#include "rocksdb/compaction_filter.h"
+#include "utilities/blob_db/blob_db_gc_stats.h"
+#include "utilities/blob_db/blob_db_impl.h"
+#include "utilities/compaction_filters/layered_compaction_filter_base.h"
+
+namespace ROCKSDB_NAMESPACE {
+class SystemClock;
+namespace blob_db {
+
+struct BlobCompactionContext {
+ BlobDBImpl* blob_db_impl = nullptr;
+ uint64_t next_file_number = 0;
+ std::unordered_set<uint64_t> current_blob_files;
+ SequenceNumber fifo_eviction_seq = 0;
+ uint64_t evict_expiration_up_to = 0;
+};
+
+struct BlobCompactionContextGC {
+ uint64_t cutoff_file_number = 0;
+};
+
+// Compaction filter that deletes expired blob indexes from the base DB.
+// Comes in two varieties, one for the non-GC case and one for the GC case.
+class BlobIndexCompactionFilterBase : public LayeredCompactionFilterBase {
+ public:
+ BlobIndexCompactionFilterBase(
+ BlobCompactionContext&& _context,
+ const CompactionFilter* _user_comp_filter,
+ std::unique_ptr<const CompactionFilter> _user_comp_filter_from_factory,
+ uint64_t current_time, Statistics* stats)
+ : LayeredCompactionFilterBase(_user_comp_filter,
+ std::move(_user_comp_filter_from_factory)),
+ context_(std::move(_context)),
+ current_time_(current_time),
+ statistics_(stats) {}
+
+ ~BlobIndexCompactionFilterBase() override;
+
+ // Filter expired blob indexes regardless of snapshots.
+ bool IgnoreSnapshots() const override { return true; }
+
+ Decision FilterV2(int level, const Slice& key, ValueType value_type,
+ const Slice& value, std::string* new_value,
+ std::string* skip_until) const override;
+
+ bool IsStackedBlobDbInternalCompactionFilter() const override { return true; }
+
+ protected:
+ bool IsBlobFileOpened() const;
+ virtual bool OpenNewBlobFileIfNeeded() const;
+ bool ReadBlobFromOldFile(const Slice& key, const BlobIndex& blob_index,
+ PinnableSlice* blob, bool need_decompress,
+ CompressionType* compression_type) const;
+ bool WriteBlobToNewFile(const Slice& key, const Slice& blob,
+ uint64_t* new_blob_file_number,
+ uint64_t* new_blob_offset) const;
+ bool CloseAndRegisterNewBlobFileIfNeeded() const;
+ bool CloseAndRegisterNewBlobFile() const;
+
+ Statistics* statistics() const { return statistics_; }
+ const BlobCompactionContext& context() const { return context_; }
+
+ private:
+ Decision HandleValueChange(const Slice& key, std::string* new_value) const;
+
+ private:
+ BlobCompactionContext context_;
+ const uint64_t current_time_;
+ Statistics* statistics_;
+
+ mutable std::shared_ptr<BlobFile> blob_file_;
+ mutable std::shared_ptr<BlobLogWriter> writer_;
+
+  // It is safe not to use std::atomic since the compaction filter, created
+  // from a compaction filter factory, will not be called from multiple
+  // threads.
+ mutable uint64_t expired_count_ = 0;
+ mutable uint64_t expired_size_ = 0;
+ mutable uint64_t evicted_count_ = 0;
+ mutable uint64_t evicted_size_ = 0;
+};
+
+class BlobIndexCompactionFilter : public BlobIndexCompactionFilterBase {
+ public:
+ BlobIndexCompactionFilter(
+ BlobCompactionContext&& _context,
+ const CompactionFilter* _user_comp_filter,
+ std::unique_ptr<const CompactionFilter> _user_comp_filter_from_factory,
+ uint64_t current_time, Statistics* stats)
+ : BlobIndexCompactionFilterBase(std::move(_context), _user_comp_filter,
+ std::move(_user_comp_filter_from_factory),
+ current_time, stats) {}
+
+ const char* Name() const override { return "BlobIndexCompactionFilter"; }
+};
+
+class BlobIndexCompactionFilterGC : public BlobIndexCompactionFilterBase {
+ public:
+ BlobIndexCompactionFilterGC(
+ BlobCompactionContext&& _context, BlobCompactionContextGC&& context_gc,
+ const CompactionFilter* _user_comp_filter,
+ std::unique_ptr<const CompactionFilter> _user_comp_filter_from_factory,
+ uint64_t current_time, Statistics* stats)
+ : BlobIndexCompactionFilterBase(std::move(_context), _user_comp_filter,
+ std::move(_user_comp_filter_from_factory),
+ current_time, stats),
+ context_gc_(std::move(context_gc)) {}
+
+ ~BlobIndexCompactionFilterGC() override;
+
+ const char* Name() const override { return "BlobIndexCompactionFilterGC"; }
+
+ BlobDecision PrepareBlobOutput(const Slice& key, const Slice& existing_value,
+ std::string* new_value) const override;
+
+ private:
+ bool OpenNewBlobFileIfNeeded() const override;
+
+ private:
+ BlobCompactionContextGC context_gc_;
+ mutable BlobDBGarbageCollectionStats gc_stats_;
+};
+
+// Compaction filter factory; similarly to the filters above, it comes
+// in two flavors, one that creates filters that support GC, and one
+// that creates non-GC filters.
+class BlobIndexCompactionFilterFactoryBase : public CompactionFilterFactory {
+ public:
+ BlobIndexCompactionFilterFactoryBase(BlobDBImpl* _blob_db_impl,
+ SystemClock* _clock,
+ const ColumnFamilyOptions& _cf_options,
+ Statistics* _statistics)
+ : blob_db_impl_(_blob_db_impl),
+ clock_(_clock),
+ statistics_(_statistics),
+ user_comp_filter_(_cf_options.compaction_filter),
+ user_comp_filter_factory_(_cf_options.compaction_filter_factory) {}
+
+ protected:
+ std::unique_ptr<CompactionFilter> CreateUserCompactionFilterFromFactory(
+ const CompactionFilter::Context& context) const;
+
+ BlobDBImpl* blob_db_impl() const { return blob_db_impl_; }
+ SystemClock* clock() const { return clock_; }
+ Statistics* statistics() const { return statistics_; }
+ const CompactionFilter* user_comp_filter() const { return user_comp_filter_; }
+
+ private:
+ BlobDBImpl* blob_db_impl_;
+ SystemClock* clock_;
+ Statistics* statistics_;
+ const CompactionFilter* user_comp_filter_;
+ std::shared_ptr<CompactionFilterFactory> user_comp_filter_factory_;
+};
+
+class BlobIndexCompactionFilterFactory
+ : public BlobIndexCompactionFilterFactoryBase {
+ public:
+ BlobIndexCompactionFilterFactory(BlobDBImpl* _blob_db_impl,
+ SystemClock* _clock,
+ const ColumnFamilyOptions& _cf_options,
+ Statistics* _statistics)
+ : BlobIndexCompactionFilterFactoryBase(_blob_db_impl, _clock, _cf_options,
+ _statistics) {}
+
+ const char* Name() const override {
+ return "BlobIndexCompactionFilterFactory";
+ }
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override;
+};
+
+class BlobIndexCompactionFilterFactoryGC
+ : public BlobIndexCompactionFilterFactoryBase {
+ public:
+ BlobIndexCompactionFilterFactoryGC(BlobDBImpl* _blob_db_impl,
+ SystemClock* _clock,
+ const ColumnFamilyOptions& _cf_options,
+ Statistics* _statistics)
+ : BlobIndexCompactionFilterFactoryBase(_blob_db_impl, _clock, _cf_options,
+ _statistics) {}
+
+ const char* Name() const override {
+ return "BlobIndexCompactionFilterFactoryGC";
+ }
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override;
+};
+
+} // namespace blob_db
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/blob_db/blob_db.cc b/src/rocksdb/utilities/blob_db/blob_db.cc
new file mode 100644
index 000000000..cbd02e68e
--- /dev/null
+++ b/src/rocksdb/utilities/blob_db/blob_db.cc
@@ -0,0 +1,114 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#ifndef ROCKSDB_LITE
+
+#include "utilities/blob_db/blob_db.h"
+
+#include <cinttypes>
+
+#include "logging/logging.h"
+#include "utilities/blob_db/blob_db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace blob_db {
+
+Status BlobDB::Open(const Options& options, const BlobDBOptions& bdb_options,
+ const std::string& dbname, BlobDB** blob_db) {
+ *blob_db = nullptr;
+ DBOptions db_options(options);
+ ColumnFamilyOptions cf_options(options);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+ std::vector<ColumnFamilyHandle*> handles;
+ Status s = BlobDB::Open(db_options, bdb_options, dbname, column_families,
+ &handles, blob_db);
+ if (s.ok()) {
+ assert(handles.size() == 1);
+    // We can delete the handle since DBImpl always holds a reference to the
+    // default column family
+ delete handles[0];
+ }
+ return s;
+}
+
+Status BlobDB::Open(const DBOptions& db_options,
+ const BlobDBOptions& bdb_options, const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles,
+ BlobDB** blob_db) {
+ assert(handles);
+
+ if (column_families.size() != 1 ||
+ column_families[0].name != kDefaultColumnFamilyName) {
+ return Status::NotSupported(
+ "Blob DB doesn't support non-default column family.");
+ }
+
+ BlobDBImpl* blob_db_impl = new BlobDBImpl(dbname, bdb_options, db_options,
+ column_families[0].options);
+ Status s = blob_db_impl->Open(handles);
+ if (s.ok()) {
+ *blob_db = static_cast<BlobDB*>(blob_db_impl);
+ } else {
+ if (!handles->empty()) {
+ for (ColumnFamilyHandle* cfh : *handles) {
+ blob_db_impl->DestroyColumnFamilyHandle(cfh);
+ }
+
+ handles->clear();
+ }
+
+ delete blob_db_impl;
+ *blob_db = nullptr;
+ }
+ return s;
+}
+
+BlobDB::BlobDB() : StackableDB(nullptr) {}
+
+void BlobDBOptions::Dump(Logger* log) const {
+ ROCKS_LOG_HEADER(
+ log, " BlobDBOptions.blob_dir: %s",
+ blob_dir.c_str());
+ ROCKS_LOG_HEADER(
+ log, " BlobDBOptions.path_relative: %d",
+ path_relative);
+ ROCKS_LOG_HEADER(
+ log, " BlobDBOptions.is_fifo: %d",
+ is_fifo);
+ ROCKS_LOG_HEADER(
+ log, " BlobDBOptions.max_db_size: %" PRIu64,
+ max_db_size);
+ ROCKS_LOG_HEADER(
+ log, " BlobDBOptions.ttl_range_secs: %" PRIu64,
+ ttl_range_secs);
+ ROCKS_LOG_HEADER(
+ log, " BlobDBOptions.min_blob_size: %" PRIu64,
+ min_blob_size);
+ ROCKS_LOG_HEADER(
+ log, " BlobDBOptions.bytes_per_sync: %" PRIu64,
+ bytes_per_sync);
+ ROCKS_LOG_HEADER(
+ log, " BlobDBOptions.blob_file_size: %" PRIu64,
+ blob_file_size);
+ ROCKS_LOG_HEADER(
+ log, " BlobDBOptions.compression: %d",
+ static_cast<int>(compression));
+ ROCKS_LOG_HEADER(
+ log, " BlobDBOptions.enable_garbage_collection: %d",
+ enable_garbage_collection);
+ ROCKS_LOG_HEADER(
+ log, " BlobDBOptions.garbage_collection_cutoff: %f",
+ garbage_collection_cutoff);
+ ROCKS_LOG_HEADER(
+ log, " BlobDBOptions.disable_background_tasks: %d",
+ disable_background_tasks);
+}
+
+} // namespace blob_db
+} // namespace ROCKSDB_NAMESPACE
+#endif
diff --git a/src/rocksdb/utilities/blob_db/blob_db.h b/src/rocksdb/utilities/blob_db/blob_db.h
new file mode 100644
index 000000000..e9d92486f
--- /dev/null
+++ b/src/rocksdb/utilities/blob_db/blob_db.h
@@ -0,0 +1,266 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <functional>
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/stackable_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace blob_db {
+
+// A wrapped database which puts the values of KV pairs in a separate log
+// and stores the locations within the log in the underlying DB.
+//
+// The factory needs to be moved to include/rocksdb/utilities to allow
+// users to use blob DB.
+
+constexpr uint64_t kNoExpiration = std::numeric_limits<uint64_t>::max();
+
+struct BlobDBOptions {
+ // Name of the directory under the base DB where blobs will be stored. Using
+ // a directory where the base DB stores its SST files is not supported.
+ // Default is "blob_dir"
+ std::string blob_dir = "blob_dir";
+
+ // whether the blob_dir path is relative or absolute.
+ bool path_relative = true;
+
+ // When max_db_size is reached, evict blob files to free up space
+  // instead of returning a NoSpace error on write. Blob files will be
+ // evicted from oldest to newest, based on file creation time.
+ bool is_fifo = false;
+
+ // Maximum size of the database (including SST files and blob files).
+ //
+ // Default: 0 (no limits)
+ uint64_t max_db_size = 0;
+
+  // TTL blobs are grouped into buckets (blob files) of width ttl_range_secs
+  // based on their expiration time. So if ttl_range_secs is 600 seconds
+  // (10 minutes) and the first bucket starts at 1471542000,
+  // then the blob buckets will be:
+  // first bucket:  1471542000 - 1471542600
+  // second bucket: 1471542600 - 1471543200
+  // and so on
+ uint64_t ttl_range_secs = 3600;
+
+ // The smallest value to store in blob log. Values smaller than this threshold
+ // will be inlined in base DB together with the key.
+ uint64_t min_blob_size = 0;
+
+  // Allows the OS to incrementally sync blob files to disk for every
+  // bytes_per_sync bytes written. Users shouldn't rely on it for
+  // persistence guarantees.
+ uint64_t bytes_per_sync = 512 * 1024;
+
+  // The target size of each blob file. A file becomes immutable
+  // after it exceeds that size.
+ uint64_t blob_file_size = 256 * 1024 * 1024;
+
+  // What compression to use for blobs.
+ CompressionType compression = kNoCompression;
+
+ // If enabled, BlobDB cleans up stale blobs in non-TTL files during compaction
+ // by rewriting the remaining live blobs to new files.
+ bool enable_garbage_collection = false;
+
+ // The cutoff in terms of blob file age for garbage collection. Blobs in
+ // the oldest N non-TTL blob files will be rewritten when encountered during
+ // compaction, where N = garbage_collection_cutoff * number_of_non_TTL_files.
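+  // For example (illustrative arithmetic only): with the default cutoff of
+  // 0.25 and 40 non-TTL blob files, blobs in the oldest 10 files become
+  // eligible for relocation.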
+ double garbage_collection_cutoff = 0.25;
+
+  // Disable all background jobs. Used for testing only.
+ bool disable_background_tasks = false;
+
+ void Dump(Logger* log) const;
+};
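+
+// Illustrative usage sketch (comment only, not compiled): opening a BlobDB
+// with custom options. The path and the option values below are placeholder
+// choices for the example, not recommended defaults.
+//
+//   Options options;
+//   options.create_if_missing = true;
+//   blob_db::BlobDBOptions bdb_options;
+//   bdb_options.min_blob_size = 4096;  // inline values smaller than 4 KB
+//   bdb_options.enable_garbage_collection = true;
+//   blob_db::BlobDB* blob_db = nullptr;
+//   Status s = blob_db::BlobDB::Open(options, bdb_options,
+//                                    "/tmp/blobdb_example", &blob_db);
+//   // ... read/write through blob_db, then:
+//   delete blob_db;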
+
+class BlobDB : public StackableDB {
+ public:
+ using ROCKSDB_NAMESPACE::StackableDB::Put;
+ virtual Status Put(const WriteOptions& options, const Slice& key,
+ const Slice& value) override = 0;
+ virtual Status Put(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) override {
+ if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
+ return Status::NotSupported(
+ "Blob DB doesn't support non-default column family.");
+ }
+ return Put(options, key, value);
+ }
+
+ using ROCKSDB_NAMESPACE::StackableDB::Delete;
+ virtual Status Delete(const WriteOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key) override {
+ if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
+ return Status::NotSupported(
+ "Blob DB doesn't support non-default column family.");
+ }
+ assert(db_ != nullptr);
+ return db_->Delete(options, column_family, key);
+ }
+
+ virtual Status PutWithTTL(const WriteOptions& options, const Slice& key,
+ const Slice& value, uint64_t ttl) = 0;
+ virtual Status PutWithTTL(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value, uint64_t ttl) {
+ if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
+ return Status::NotSupported(
+ "Blob DB doesn't support non-default column family.");
+ }
+ return PutWithTTL(options, key, value, ttl);
+ }
+
+  // Put with expiration. An expiration time equal to
+  // std::numeric_limits<uint64_t>::max() means the key doesn't expire.
+ virtual Status PutUntil(const WriteOptions& options, const Slice& key,
+ const Slice& value, uint64_t expiration) = 0;
+ virtual Status PutUntil(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value, uint64_t expiration) {
+ if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
+ return Status::NotSupported(
+ "Blob DB doesn't support non-default column family.");
+ }
+ return PutUntil(options, key, value, expiration);
+ }
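+
+  // Illustrative sketch (comment only): writing values with a lifetime. The
+  // keys, values, and the 3600-second TTL below are example inputs, and
+  // `now_in_seconds` stands for the caller's current time in seconds since
+  // the epoch.
+  //
+  //   blob_db->PutWithTTL(WriteOptions(), "session:42", "payload", 3600);
+  //   // Similar intent expressed as an absolute expiration timestamp:
+  //   blob_db->PutUntil(WriteOptions(), "session:43", "payload",
+  //                     now_in_seconds + 3600);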
+
+ using ROCKSDB_NAMESPACE::StackableDB::Get;
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) override = 0;
+
+ // Get value and expiration.
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value, uint64_t* expiration) = 0;
+ virtual Status Get(const ReadOptions& options, const Slice& key,
+ PinnableSlice* value, uint64_t* expiration) {
+ return Get(options, DefaultColumnFamily(), key, value, expiration);
+ }
+
+ using ROCKSDB_NAMESPACE::StackableDB::MultiGet;
+ virtual std::vector<Status> MultiGet(
+ const ReadOptions& options, const std::vector<Slice>& keys,
+ std::vector<std::string>* values) override = 0;
+ virtual std::vector<Status> MultiGet(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ const std::vector<Slice>& keys,
+ std::vector<std::string>* values) override {
+ for (auto column_family : column_families) {
+ if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
+ return std::vector<Status>(
+ column_families.size(),
+ Status::NotSupported(
+ "Blob DB doesn't support non-default column family."));
+ }
+ }
+ return MultiGet(options, keys, values);
+ }
+ virtual void MultiGet(const ReadOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const size_t num_keys, const Slice* /*keys*/,
+ PinnableSlice* /*values*/, Status* statuses,
+ const bool /*sorted_input*/ = false) override {
+ for (size_t i = 0; i < num_keys; ++i) {
+ statuses[i] =
+ Status::NotSupported("Blob DB doesn't support batched MultiGet");
+ }
+ }
+
+ using ROCKSDB_NAMESPACE::StackableDB::SingleDelete;
+ virtual Status SingleDelete(const WriteOptions& /*wopts*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/) override {
+ return Status::NotSupported("Not supported operation in blob db.");
+ }
+
+ using ROCKSDB_NAMESPACE::StackableDB::Merge;
+ virtual Status Merge(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/, const Slice& /*value*/) override {
+ return Status::NotSupported("Not supported operation in blob db.");
+ }
+
+ virtual Status Write(const WriteOptions& opts,
+ WriteBatch* updates) override = 0;
+
+ using ROCKSDB_NAMESPACE::StackableDB::NewIterator;
+ virtual Iterator* NewIterator(const ReadOptions& options) override = 0;
+ virtual Iterator* NewIterator(const ReadOptions& options,
+ ColumnFamilyHandle* column_family) override {
+ if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
+ // Blob DB doesn't support non-default column family.
+ return nullptr;
+ }
+ return NewIterator(options);
+ }
+
+ Status CompactFiles(
+ const CompactionOptions& compact_options,
+ const std::vector<std::string>& input_file_names, const int output_level,
+ const int output_path_id = -1,
+ std::vector<std::string>* const output_file_names = nullptr,
+ CompactionJobInfo* compaction_job_info = nullptr) override = 0;
+ Status CompactFiles(
+ const CompactionOptions& compact_options,
+ ColumnFamilyHandle* column_family,
+ const std::vector<std::string>& input_file_names, const int output_level,
+ const int output_path_id = -1,
+ std::vector<std::string>* const output_file_names = nullptr,
+ CompactionJobInfo* compaction_job_info = nullptr) override {
+ if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
+ return Status::NotSupported(
+ "Blob DB doesn't support non-default column family.");
+ }
+
+ return CompactFiles(compact_options, input_file_names, output_level,
+ output_path_id, output_file_names, compaction_job_info);
+ }
+
+ using ROCKSDB_NAMESPACE::StackableDB::Close;
+ virtual Status Close() override = 0;
+
+ // Opening blob db.
+ static Status Open(const Options& options, const BlobDBOptions& bdb_options,
+ const std::string& dbname, BlobDB** blob_db);
+
+ static Status Open(const DBOptions& db_options,
+ const BlobDBOptions& bdb_options,
+ const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles,
+ BlobDB** blob_db);
+
+ virtual BlobDBOptions GetBlobDBOptions() const = 0;
+
+ virtual Status SyncBlobFiles() = 0;
+
+ virtual ~BlobDB() {}
+
+ protected:
+ explicit BlobDB();
+};
+
+// Destroy the content of the database.
+Status DestroyBlobDB(const std::string& dbname, const Options& options,
+ const BlobDBOptions& bdb_options);
+
+} // namespace blob_db
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/blob_db/blob_db_gc_stats.h b/src/rocksdb/utilities/blob_db/blob_db_gc_stats.h
new file mode 100644
index 000000000..fea6b0032
--- /dev/null
+++ b/src/rocksdb/utilities/blob_db/blob_db_gc_stats.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include <cstdint>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+#ifndef ROCKSDB_LITE
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace blob_db {
+
+/**
+ * Statistics related to a single garbage collection pass (i.e. a single
+ * (sub)compaction).
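+ *
+ * Usage sketch (informational note): a GC pass calls AddBlob() for every
+ * blob it encounters and AddRelocatedBlob() for the subset it rewrites, so
+ * by the end of the pass RelocatedBlobs() <= AllBlobs() and
+ * RelocatedBytes() <= AllBytes().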
+ */
+class BlobDBGarbageCollectionStats {
+ public:
+ uint64_t AllBlobs() const { return all_blobs_; }
+ uint64_t AllBytes() const { return all_bytes_; }
+ uint64_t RelocatedBlobs() const { return relocated_blobs_; }
+ uint64_t RelocatedBytes() const { return relocated_bytes_; }
+ uint64_t NewFiles() const { return new_files_; }
+ bool HasError() const { return error_; }
+
+ void AddBlob(uint64_t size) {
+ ++all_blobs_;
+ all_bytes_ += size;
+ }
+
+ void AddRelocatedBlob(uint64_t size) {
+ ++relocated_blobs_;
+ relocated_bytes_ += size;
+ }
+
+ void AddNewFile() { ++new_files_; }
+
+ void SetError() { error_ = true; }
+
+ private:
+ uint64_t all_blobs_ = 0;
+ uint64_t all_bytes_ = 0;
+ uint64_t relocated_blobs_ = 0;
+ uint64_t relocated_bytes_ = 0;
+ uint64_t new_files_ = 0;
+ bool error_ = false;
+};
+
+} // namespace blob_db
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/blob_db/blob_db_impl.cc b/src/rocksdb/utilities/blob_db/blob_db_impl.cc
new file mode 100644
index 000000000..87e294c5c
--- /dev/null
+++ b/src/rocksdb/utilities/blob_db/blob_db_impl.cc
@@ -0,0 +1,2177 @@
+
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#ifndef ROCKSDB_LITE
+
+#include "utilities/blob_db/blob_db_impl.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <iomanip>
+#include <memory>
+#include <sstream>
+
+#include "db/blob/blob_index.h"
+#include "db/db_impl/db_impl.h"
+#include "db/write_batch_internal.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "file/sst_file_manager_impl.h"
+#include "file/writable_file_writer.h"
+#include "logging/logging.h"
+#include "monitoring/instrumented_mutex.h"
+#include "monitoring/statistics.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/utilities/stackable_db.h"
+#include "rocksdb/utilities/transaction.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/block_based/block_builder.h"
+#include "table/meta_blocks.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/crc32c.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/stop_watch.h"
+#include "util/timer_queue.h"
+#include "utilities/blob_db/blob_compaction_filter.h"
+#include "utilities/blob_db/blob_db_iterator.h"
+#include "utilities/blob_db/blob_db_listener.h"
+
+namespace {
+int kBlockBasedTableVersionFormat = 2;
+} // end namespace
+
+namespace ROCKSDB_NAMESPACE {
+namespace blob_db {
+
+bool BlobFileComparator::operator()(
+ const std::shared_ptr<BlobFile>& lhs,
+ const std::shared_ptr<BlobFile>& rhs) const {
+ return lhs->BlobFileNumber() > rhs->BlobFileNumber();
+}
+
+bool BlobFileComparatorTTL::operator()(
+ const std::shared_ptr<BlobFile>& lhs,
+ const std::shared_ptr<BlobFile>& rhs) const {
+ assert(lhs->HasTTL() && rhs->HasTTL());
+ if (lhs->expiration_range_.first < rhs->expiration_range_.first) {
+ return true;
+ }
+ if (lhs->expiration_range_.first > rhs->expiration_range_.first) {
+ return false;
+ }
+ return lhs->BlobFileNumber() < rhs->BlobFileNumber();
+}
+
+BlobDBImpl::BlobDBImpl(const std::string& dbname,
+ const BlobDBOptions& blob_db_options,
+ const DBOptions& db_options,
+ const ColumnFamilyOptions& cf_options)
+ : BlobDB(),
+ dbname_(dbname),
+ db_impl_(nullptr),
+ env_(db_options.env),
+ bdb_options_(blob_db_options),
+ db_options_(db_options),
+ cf_options_(cf_options),
+ file_options_(db_options),
+ statistics_(db_options_.statistics.get()),
+ next_file_number_(1),
+ flush_sequence_(0),
+ closed_(true),
+ open_file_count_(0),
+ total_blob_size_(0),
+ live_sst_size_(0),
+ fifo_eviction_seq_(0),
+ evict_expiration_up_to_(0),
+ debug_level_(0) {
+ clock_ = env_->GetSystemClock().get();
+ blob_dir_ = (bdb_options_.path_relative)
+ ? dbname + "/" + bdb_options_.blob_dir
+ : bdb_options_.blob_dir;
+ file_options_.bytes_per_sync = blob_db_options.bytes_per_sync;
+}
+
+BlobDBImpl::~BlobDBImpl() {
+ tqueue_.shutdown();
+ // CancelAllBackgroundWork(db_, true);
+ Status s __attribute__((__unused__)) = Close();
+ assert(s.ok());
+}
+
+Status BlobDBImpl::Close() {
+ if (closed_) {
+ return Status::OK();
+ }
+ closed_ = true;
+
+ // Close the base DB before BlobDBImpl is destroyed to stop the event
+ // listener and compaction filter callbacks.
+ Status s = db_->Close();
+ // Delete db_ even if Close() failed.
+ delete db_;
+ // Reset the pointers so that StackableDB does not delete them again.
+ db_ = nullptr;
+ db_impl_ = nullptr;
+ if (!s.ok()) {
+ return s;
+ }
+
+ s = SyncBlobFiles();
+ return s;
+}
+
+BlobDBOptions BlobDBImpl::GetBlobDBOptions() const { return bdb_options_; }
+
+Status BlobDBImpl::Open(std::vector<ColumnFamilyHandle*>* handles) {
+ assert(handles != nullptr);
+ assert(db_ == nullptr);
+
+ if (blob_dir_.empty()) {
+ return Status::NotSupported("No blob directory in options");
+ }
+
+ if (bdb_options_.garbage_collection_cutoff < 0.0 ||
+ bdb_options_.garbage_collection_cutoff > 1.0) {
+ return Status::InvalidArgument(
+ "Garbage collection cutoff must be in the interval [0.0, 1.0]");
+ }
+
+ // Temporarily disable compactions in the base DB during open; save the
+ // user-defined value beforehand so we can restore it once BlobDB is
+ // initialized.
+ // Note: this is only needed if garbage collection is enabled.
+ const bool disable_auto_compactions = cf_options_.disable_auto_compactions;
+
+ if (bdb_options_.enable_garbage_collection) {
+ cf_options_.disable_auto_compactions = true;
+ }
+
+ Status s;
+
+ // Create info log.
+ if (db_options_.info_log == nullptr) {
+ s = CreateLoggerFromOptions(dbname_, db_options_, &db_options_.info_log);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ ROCKS_LOG_INFO(db_options_.info_log, "Opening BlobDB...");
+
+ if ((cf_options_.compaction_filter != nullptr ||
+ cf_options_.compaction_filter_factory != nullptr)) {
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "BlobDB only support compaction filter on non-TTL values.");
+ }
+
+ // Open blob directory.
+ s = env_->CreateDirIfMissing(blob_dir_);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log,
+ "Failed to create blob_dir %s, status: %s",
+ blob_dir_.c_str(), s.ToString().c_str());
+ }
+ s = env_->GetFileSystem()->NewDirectory(blob_dir_, IOOptions(), &dir_ent_,
+ nullptr);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log,
+ "Failed to open blob_dir %s, status: %s", blob_dir_.c_str(),
+ s.ToString().c_str());
+ return s;
+ }
+
+ // Open blob files.
+ s = OpenAllBlobFiles();
+ if (!s.ok()) {
+ return s;
+ }
+
+ // Update options
+ if (bdb_options_.enable_garbage_collection) {
+ db_options_.listeners.push_back(std::make_shared<BlobDBListenerGC>(this));
+ cf_options_.compaction_filter_factory =
+ std::make_shared<BlobIndexCompactionFilterFactoryGC>(
+ this, clock_, cf_options_, statistics_);
+ } else {
+ db_options_.listeners.push_back(std::make_shared<BlobDBListener>(this));
+ cf_options_.compaction_filter_factory =
+ std::make_shared<BlobIndexCompactionFilterFactory>(
+ this, clock_, cf_options_, statistics_);
+ }
+
+ // Reset user compaction filter after building into compaction factory.
+ cf_options_.compaction_filter = nullptr;
+
+ // Open base db.
+ ColumnFamilyDescriptor cf_descriptor(kDefaultColumnFamilyName, cf_options_);
+ s = DB::Open(db_options_, dbname_, {cf_descriptor}, handles, &db_);
+ if (!s.ok()) {
+ return s;
+ }
+ db_impl_ = static_cast_with_check<DBImpl>(db_->GetRootDB());
+
+ // Sanitize the blob_dir provided. Using a directory where the
+ // base DB stores its files for the default CF is not supported.
+ const ColumnFamilyData* const cfd =
+ static_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily())->cfd();
+ assert(cfd);
+
+ const ImmutableCFOptions* const ioptions = cfd->ioptions();
+ assert(ioptions);
+
+ assert(env_);
+
+ for (const auto& cf_path : ioptions->cf_paths) {
+ bool blob_dir_same_as_cf_dir = false;
+ s = env_->AreFilesSame(blob_dir_, cf_path.path, &blob_dir_same_as_cf_dir);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log,
+ "Error while sanitizing blob_dir %s, status: %s",
+ blob_dir_.c_str(), s.ToString().c_str());
+ return s;
+ }
+
+ if (blob_dir_same_as_cf_dir) {
+ return Status::NotSupported(
+ "Using the base DB's storage directories for BlobDB files is not "
+ "supported.");
+ }
+ }
+
+ // Initialize SST file <-> oldest blob file mapping if garbage collection
+ // is enabled.
+ if (bdb_options_.enable_garbage_collection) {
+ std::vector<LiveFileMetaData> live_files;
+ db_->GetLiveFilesMetaData(&live_files);
+
+ InitializeBlobFileToSstMapping(live_files);
+
+ MarkUnreferencedBlobFilesObsoleteDuringOpen();
+
+ if (!disable_auto_compactions) {
+ s = db_->EnableAutoCompaction(*handles);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(
+ db_options_.info_log,
+ "Failed to enable automatic compactions during open, status: %s",
+ s.ToString().c_str());
+ return s;
+ }
+ }
+ }
+
+ // Add trash files in blob dir to file delete scheduler.
+ SstFileManagerImpl* sfm = static_cast<SstFileManagerImpl*>(
+ db_impl_->immutable_db_options().sst_file_manager.get());
+ DeleteScheduler::CleanupDirectory(env_, sfm, blob_dir_);
+
+ UpdateLiveSSTSize();
+
+ // Start background jobs.
+ if (!bdb_options_.disable_background_tasks) {
+ StartBackgroundTasks();
+ }
+
+ ROCKS_LOG_INFO(db_options_.info_log, "BlobDB pointer %p", this);
+ bdb_options_.Dump(db_options_.info_log.get());
+ closed_ = false;
+ return s;
+}
+
+void BlobDBImpl::StartBackgroundTasks() {
+ // store a call to a member function and object
+ tqueue_.add(
+ kReclaimOpenFilesPeriodMillisecs,
+ std::bind(&BlobDBImpl::ReclaimOpenFiles, this, std::placeholders::_1));
+ tqueue_.add(
+ kDeleteObsoleteFilesPeriodMillisecs,
+ std::bind(&BlobDBImpl::DeleteObsoleteFiles, this, std::placeholders::_1));
+ tqueue_.add(kSanityCheckPeriodMillisecs,
+ std::bind(&BlobDBImpl::SanityCheck, this, std::placeholders::_1));
+ tqueue_.add(
+ kEvictExpiredFilesPeriodMillisecs,
+ std::bind(&BlobDBImpl::EvictExpiredFiles, this, std::placeholders::_1));
+}
+
+Status BlobDBImpl::GetAllBlobFiles(std::set<uint64_t>* file_numbers) {
+ assert(file_numbers != nullptr);
+ std::vector<std::string> all_files;
+ Status s = env_->GetChildren(blob_dir_, &all_files);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log,
+ "Failed to get list of blob files, status: %s",
+ s.ToString().c_str());
+ return s;
+ }
+
+ for (const auto& file_name : all_files) {
+ uint64_t file_number;
+ FileType type;
+ bool success = ParseFileName(file_name, &file_number, &type);
+ if (success && type == kBlobFile) {
+ file_numbers->insert(file_number);
+ } else {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Skipping file in blob directory: %s", file_name.c_str());
+ }
+ }
+
+ return s;
+}
+
+Status BlobDBImpl::OpenAllBlobFiles() {
+ std::set<uint64_t> file_numbers;
+ Status s = GetAllBlobFiles(&file_numbers);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (!file_numbers.empty()) {
+ next_file_number_.store(*file_numbers.rbegin() + 1);
+ }
+
+ std::ostringstream blob_file_oss;
+ std::ostringstream live_imm_oss;
+ std::ostringstream obsolete_file_oss;
+
+ for (auto& file_number : file_numbers) {
+ std::shared_ptr<BlobFile> blob_file = std::make_shared<BlobFile>(
+ this, blob_dir_, file_number, db_options_.info_log.get());
+ blob_file->MarkImmutable(/* sequence */ 0);
+
+ // Read file header and footer
+ Status read_metadata_status =
+ blob_file->ReadMetadata(env_->GetFileSystem(), file_options_);
+ if (read_metadata_status.IsCorruption()) {
+ // Remove incomplete file.
+ if (!obsolete_files_.empty()) {
+ obsolete_file_oss << ", ";
+ }
+ obsolete_file_oss << file_number;
+
+ ObsoleteBlobFile(blob_file, 0 /*obsolete_seq*/, false /*update_size*/);
+ continue;
+ } else if (!read_metadata_status.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log,
+ "Unable to read metadata of blob file %" PRIu64
+ ", status: '%s'",
+ file_number, read_metadata_status.ToString().c_str());
+ return read_metadata_status;
+ }
+
+ total_blob_size_ += blob_file->GetFileSize();
+
+ if (!blob_files_.empty()) {
+ blob_file_oss << ", ";
+ }
+ blob_file_oss << file_number;
+
+ blob_files_[file_number] = blob_file;
+
+ if (!blob_file->HasTTL()) {
+ if (!live_imm_non_ttl_blob_files_.empty()) {
+ live_imm_oss << ", ";
+ }
+ live_imm_oss << file_number;
+
+ live_imm_non_ttl_blob_files_[file_number] = blob_file;
+ }
+ }
+
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "Found %" ROCKSDB_PRIszt " blob files: %s", blob_files_.size(),
+ blob_file_oss.str().c_str());
+ ROCKS_LOG_INFO(
+ db_options_.info_log, "Found %" ROCKSDB_PRIszt " non-TTL blob files: %s",
+ live_imm_non_ttl_blob_files_.size(), live_imm_oss.str().c_str());
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "Found %" ROCKSDB_PRIszt
+ " incomplete or corrupted blob files: %s",
+ obsolete_files_.size(), obsolete_file_oss.str().c_str());
+ return s;
+}
+
+template <typename Linker>
+void BlobDBImpl::LinkSstToBlobFileImpl(uint64_t sst_file_number,
+ uint64_t blob_file_number,
+ Linker linker) {
+ assert(bdb_options_.enable_garbage_collection);
+ assert(blob_file_number != kInvalidBlobFileNumber);
+
+ auto it = blob_files_.find(blob_file_number);
+ if (it == blob_files_.end()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Blob file %" PRIu64
+ " not found while trying to link "
+ "SST file %" PRIu64,
+ blob_file_number, sst_file_number);
+ return;
+ }
+
+ BlobFile* const blob_file = it->second.get();
+ assert(blob_file);
+
+ linker(blob_file, sst_file_number);
+
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "Blob file %" PRIu64 " linked to SST file %" PRIu64,
+ blob_file_number, sst_file_number);
+}
+
+void BlobDBImpl::LinkSstToBlobFile(uint64_t sst_file_number,
+ uint64_t blob_file_number) {
+ auto linker = [](BlobFile* blob_file, uint64_t sst_file) {
+ WriteLock file_lock(&blob_file->mutex_);
+ blob_file->LinkSstFile(sst_file);
+ };
+
+ LinkSstToBlobFileImpl(sst_file_number, blob_file_number, linker);
+}
+
+void BlobDBImpl::LinkSstToBlobFileNoLock(uint64_t sst_file_number,
+ uint64_t blob_file_number) {
+ auto linker = [](BlobFile* blob_file, uint64_t sst_file) {
+ blob_file->LinkSstFile(sst_file);
+ };
+
+ LinkSstToBlobFileImpl(sst_file_number, blob_file_number, linker);
+}
+
+void BlobDBImpl::UnlinkSstFromBlobFile(uint64_t sst_file_number,
+ uint64_t blob_file_number) {
+ assert(bdb_options_.enable_garbage_collection);
+ assert(blob_file_number != kInvalidBlobFileNumber);
+
+ auto it = blob_files_.find(blob_file_number);
+ if (it == blob_files_.end()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Blob file %" PRIu64
+ " not found while trying to unlink "
+ "SST file %" PRIu64,
+ blob_file_number, sst_file_number);
+ return;
+ }
+
+ BlobFile* const blob_file = it->second.get();
+ assert(blob_file);
+
+ {
+ WriteLock file_lock(&blob_file->mutex_);
+ blob_file->UnlinkSstFile(sst_file_number);
+ }
+
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "Blob file %" PRIu64 " unlinked from SST file %" PRIu64,
+ blob_file_number, sst_file_number);
+}
+
+void BlobDBImpl::InitializeBlobFileToSstMapping(
+ const std::vector<LiveFileMetaData>& live_files) {
+ assert(bdb_options_.enable_garbage_collection);
+
+ for (const auto& live_file : live_files) {
+ const uint64_t sst_file_number = live_file.file_number;
+ const uint64_t blob_file_number = live_file.oldest_blob_file_number;
+
+ if (blob_file_number == kInvalidBlobFileNumber) {
+ continue;
+ }
+
+ LinkSstToBlobFileNoLock(sst_file_number, blob_file_number);
+ }
+}
+
+void BlobDBImpl::ProcessFlushJobInfo(const FlushJobInfo& info) {
+ assert(bdb_options_.enable_garbage_collection);
+
+ WriteLock lock(&mutex_);
+
+ if (info.oldest_blob_file_number != kInvalidBlobFileNumber) {
+ LinkSstToBlobFile(info.file_number, info.oldest_blob_file_number);
+ }
+
+ assert(flush_sequence_ < info.largest_seqno);
+ flush_sequence_ = info.largest_seqno;
+
+ MarkUnreferencedBlobFilesObsolete();
+}
+
+void BlobDBImpl::ProcessCompactionJobInfo(const CompactionJobInfo& info) {
+ assert(bdb_options_.enable_garbage_collection);
+
+ if (!info.status.ok()) {
+ return;
+ }
+
+ // Note: the same SST file may appear in both the input and the output
+ // file list in case of a trivial move. We walk through the two lists
+ // below in a fashion that's similar to merge sort to detect this.
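+ // For example, with sorted inputs {10, 12} and sorted outputs {12, 15}
+ // (SST 12 was trivially moved; the file numbers are illustrative), SST 12
+ // is left untouched, SST 10 is unlinked from its oldest blob file, and
+ // SST 15 is linked to its oldest blob file.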
+
+ auto cmp = [](const CompactionFileInfo& lhs, const CompactionFileInfo& rhs) {
+ return lhs.file_number < rhs.file_number;
+ };
+
+ auto inputs = info.input_file_infos;
+ auto iit = inputs.begin();
+ const auto iit_end = inputs.end();
+
+ std::sort(iit, iit_end, cmp);
+
+ auto outputs = info.output_file_infos;
+ auto oit = outputs.begin();
+ const auto oit_end = outputs.end();
+
+ std::sort(oit, oit_end, cmp);
+
+ WriteLock lock(&mutex_);
+
+ while (iit != iit_end && oit != oit_end) {
+ const auto& input = *iit;
+ const auto& output = *oit;
+
+ if (input.file_number == output.file_number) {
+ ++iit;
+ ++oit;
+ } else if (input.file_number < output.file_number) {
+ if (input.oldest_blob_file_number != kInvalidBlobFileNumber) {
+ UnlinkSstFromBlobFile(input.file_number, input.oldest_blob_file_number);
+ }
+
+ ++iit;
+ } else {
+ assert(output.file_number < input.file_number);
+
+ if (output.oldest_blob_file_number != kInvalidBlobFileNumber) {
+ LinkSstToBlobFile(output.file_number, output.oldest_blob_file_number);
+ }
+
+ ++oit;
+ }
+ }
+
+ while (iit != iit_end) {
+ const auto& input = *iit;
+
+ if (input.oldest_blob_file_number != kInvalidBlobFileNumber) {
+ UnlinkSstFromBlobFile(input.file_number, input.oldest_blob_file_number);
+ }
+
+ ++iit;
+ }
+
+ while (oit != oit_end) {
+ const auto& output = *oit;
+
+ if (output.oldest_blob_file_number != kInvalidBlobFileNumber) {
+ LinkSstToBlobFile(output.file_number, output.oldest_blob_file_number);
+ }
+
+ ++oit;
+ }
+
+ MarkUnreferencedBlobFilesObsolete();
+}
+
+bool BlobDBImpl::MarkBlobFileObsoleteIfNeeded(
+ const std::shared_ptr<BlobFile>& blob_file, SequenceNumber obsolete_seq) {
+ assert(blob_file);
+ assert(!blob_file->HasTTL());
+ assert(blob_file->Immutable());
+ assert(bdb_options_.enable_garbage_collection);
+
+ // Note: FIFO eviction could have marked this file obsolete already.
+ if (blob_file->Obsolete()) {
+ return true;
+ }
+
+ // We cannot mark this file (or any higher-numbered files for that matter)
+ // obsolete if it is referenced by any memtables or SSTs. We keep track of
+ // the SSTs explicitly. To account for memtables, we keep track of the highest
+ // sequence number received in flush notifications, and we do not mark the
+ // blob file obsolete if there are still unflushed memtables from before
+ // the time the blob file was closed.
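+ // For example (illustrative numbers), a blob file that became immutable at
+ // sequence 150 cannot be marked obsolete while the highest flushed sequence
+ // is still 120, even if it no longer has any linked SST files.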
+ if (blob_file->GetImmutableSequence() > flush_sequence_ ||
+ !blob_file->GetLinkedSstFiles().empty()) {
+ return false;
+ }
+
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "Blob file %" PRIu64 " is no longer needed, marking obsolete",
+ blob_file->BlobFileNumber());
+
+ ObsoleteBlobFile(blob_file, obsolete_seq, /* update_size */ true);
+ return true;
+}
+
+template <class Functor>
+void BlobDBImpl::MarkUnreferencedBlobFilesObsoleteImpl(Functor mark_if_needed) {
+ assert(bdb_options_.enable_garbage_collection);
+
+ // Iterate through all live immutable non-TTL blob files, and mark them
+ // obsolete assuming no SST files or memtables rely on the blobs in them.
+ // Note: we need to stop as soon as we find a blob file that has any
+ // linked SSTs (or one potentially referenced by memtables).
+
+ uint64_t obsoleted_files = 0;
+
+ auto it = live_imm_non_ttl_blob_files_.begin();
+ while (it != live_imm_non_ttl_blob_files_.end()) {
+ const auto& blob_file = it->second;
+ assert(blob_file);
+ assert(blob_file->BlobFileNumber() == it->first);
+ assert(!blob_file->HasTTL());
+ assert(blob_file->Immutable());
+
+ // Small optimization: Obsolete() does an atomic read, so we can do
+ // this check without taking a lock on the blob file's mutex.
+ if (blob_file->Obsolete()) {
+ it = live_imm_non_ttl_blob_files_.erase(it);
+ continue;
+ }
+
+ if (!mark_if_needed(blob_file)) {
+ break;
+ }
+
+ it = live_imm_non_ttl_blob_files_.erase(it);
+
+ ++obsoleted_files;
+ }
+
+ if (obsoleted_files > 0) {
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "%" PRIu64 " blob file(s) marked obsolete by GC",
+ obsoleted_files);
+ RecordTick(statistics_, BLOB_DB_GC_NUM_FILES, obsoleted_files);
+ }
+}
+
+void BlobDBImpl::MarkUnreferencedBlobFilesObsolete() {
+ const SequenceNumber obsolete_seq = GetLatestSequenceNumber();
+
+ MarkUnreferencedBlobFilesObsoleteImpl(
+ [this, obsolete_seq](const std::shared_ptr<BlobFile>& blob_file) {
+ WriteLock file_lock(&blob_file->mutex_);
+ return MarkBlobFileObsoleteIfNeeded(blob_file, obsolete_seq);
+ });
+}
+
+void BlobDBImpl::MarkUnreferencedBlobFilesObsoleteDuringOpen() {
+ MarkUnreferencedBlobFilesObsoleteImpl(
+ [this](const std::shared_ptr<BlobFile>& blob_file) {
+ return MarkBlobFileObsoleteIfNeeded(blob_file, /* obsolete_seq */ 0);
+ });
+}
+
+void BlobDBImpl::CloseRandomAccessLocked(
+ const std::shared_ptr<BlobFile>& bfile) {
+ bfile->CloseRandomAccessLocked();
+ open_file_count_--;
+}
+
+Status BlobDBImpl::GetBlobFileReader(
+ const std::shared_ptr<BlobFile>& blob_file,
+ std::shared_ptr<RandomAccessFileReader>* reader) {
+ assert(reader != nullptr);
+ bool fresh_open = false;
+ Status s = blob_file->GetReader(env_, file_options_, reader, &fresh_open);
+ if (s.ok() && fresh_open) {
+ assert(*reader != nullptr);
+ open_file_count_++;
+ }
+ return s;
+}
+
+std::shared_ptr<BlobFile> BlobDBImpl::NewBlobFile(
+ bool has_ttl, const ExpirationRange& expiration_range,
+ const std::string& reason) {
+ assert(has_ttl == (expiration_range.first || expiration_range.second));
+
+ uint64_t file_num = next_file_number_++;
+
+ const uint32_t column_family_id =
+ static_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily())->GetID();
+ auto blob_file = std::make_shared<BlobFile>(
+ this, blob_dir_, file_num, db_options_.info_log.get(), column_family_id,
+ bdb_options_.compression, has_ttl, expiration_range);
+
+ ROCKS_LOG_DEBUG(db_options_.info_log, "New blob file created: %s reason='%s'",
+ blob_file->PathName().c_str(), reason.c_str());
+ LogFlush(db_options_.info_log);
+
+ return blob_file;
+}
+
+void BlobDBImpl::RegisterBlobFile(std::shared_ptr<BlobFile> blob_file) {
+ const uint64_t blob_file_number = blob_file->BlobFileNumber();
+
+ auto it = blob_files_.lower_bound(blob_file_number);
+ assert(it == blob_files_.end() || it->first != blob_file_number);
+
+ blob_files_.insert(it,
+ std::map<uint64_t, std::shared_ptr<BlobFile>>::value_type(
+ blob_file_number, std::move(blob_file)));
+}
+
+Status BlobDBImpl::CreateWriterLocked(const std::shared_ptr<BlobFile>& bfile) {
+ std::string fpath(bfile->PathName());
+ std::unique_ptr<FSWritableFile> wfile;
+ const auto& fs = env_->GetFileSystem();
+
+ Status s = fs->ReopenWritableFile(fpath, file_options_, &wfile, nullptr);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log,
+ "Failed to open blob file for write: %s status: '%s'"
+ " exists: '%s'",
+ fpath.c_str(), s.ToString().c_str(),
+ fs->FileExists(fpath, file_options_.io_options, nullptr)
+ .ToString()
+ .c_str());
+ return s;
+ }
+
+ std::unique_ptr<WritableFileWriter> fwriter;
+ fwriter.reset(new WritableFileWriter(std::move(wfile), fpath, file_options_));
+
+ uint64_t boffset = bfile->GetFileSize();
+ if (debug_level_ >= 2 && boffset) {
+ ROCKS_LOG_DEBUG(db_options_.info_log,
+ "Open blob file: %s with offset: %" PRIu64, fpath.c_str(),
+ boffset);
+ }
+
+ BlobLogWriter::ElemType et = BlobLogWriter::kEtNone;
+ if (bfile->file_size_ == BlobLogHeader::kSize) {
+ et = BlobLogWriter::kEtFileHdr;
+ } else if (bfile->file_size_ > BlobLogHeader::kSize) {
+ et = BlobLogWriter::kEtRecord;
+ } else if (bfile->file_size_) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Open blob file: %s with wrong size: %" PRIu64,
+ fpath.c_str(), boffset);
+ return Status::Corruption("Invalid blob file size");
+ }
+
+ constexpr bool do_flush = true;
+
+ bfile->log_writer_ = std::make_shared<BlobLogWriter>(
+ std::move(fwriter), clock_, statistics_, bfile->file_number_,
+ db_options_.use_fsync, do_flush, boffset);
+ bfile->log_writer_->last_elem_type_ = et;
+
+ return s;
+}
+
+std::shared_ptr<BlobFile> BlobDBImpl::FindBlobFileLocked(
+ uint64_t expiration) const {
+ if (open_ttl_files_.empty()) {
+ return nullptr;
+ }
+
+ std::shared_ptr<BlobFile> tmp = std::make_shared<BlobFile>();
+ tmp->SetHasTTL(true);
+ tmp->expiration_range_ = std::make_pair(expiration, 0);
+ tmp->file_number_ = std::numeric_limits<uint64_t>::max();
+
+ auto citr = open_ttl_files_.equal_range(tmp);
+ if (citr.first == open_ttl_files_.end()) {
+ assert(citr.second == open_ttl_files_.end());
+
+ std::shared_ptr<BlobFile> check = *(open_ttl_files_.rbegin());
+ return (check->expiration_range_.second <= expiration) ? nullptr : check;
+ }
+
+ if (citr.first != citr.second) {
+ return *(citr.first);
+ }
+
+ auto finditr = citr.second;
+ if (finditr != open_ttl_files_.begin()) {
+ --finditr;
+ }
+
+ bool b2 = (*finditr)->expiration_range_.second <= expiration;
+ bool b1 = (*finditr)->expiration_range_.first > expiration;
+
+ return (b1 || b2) ? nullptr : (*finditr);
+}
+
+Status BlobDBImpl::CheckOrCreateWriterLocked(
+ const std::shared_ptr<BlobFile>& blob_file,
+ std::shared_ptr<BlobLogWriter>* writer) {
+ assert(writer != nullptr);
+ *writer = blob_file->GetWriter();
+ if (*writer != nullptr) {
+ return Status::OK();
+ }
+ Status s = CreateWriterLocked(blob_file);
+ if (s.ok()) {
+ *writer = blob_file->GetWriter();
+ }
+ return s;
+}
+
+Status BlobDBImpl::CreateBlobFileAndWriter(
+ bool has_ttl, const ExpirationRange& expiration_range,
+ const std::string& reason, std::shared_ptr<BlobFile>* blob_file,
+ std::shared_ptr<BlobLogWriter>* writer) {
+ TEST_SYNC_POINT("BlobDBImpl::CreateBlobFileAndWriter");
+ assert(has_ttl == (expiration_range.first || expiration_range.second));
+ assert(blob_file);
+ assert(writer);
+
+ *blob_file = NewBlobFile(has_ttl, expiration_range, reason);
+ assert(*blob_file);
+
+ // The file is not yet visible to other threads, hence no lock is needed.
+ Status s = CheckOrCreateWriterLocked(*blob_file, writer);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log,
+ "Failed to get writer for blob file: %s, error: %s",
+ (*blob_file)->PathName().c_str(), s.ToString().c_str());
+ return s;
+ }
+
+ assert(*writer);
+
+ s = (*writer)->WriteHeader((*blob_file)->header_);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log,
+ "Failed to write header to new blob file: %s"
+ " status: '%s'",
+ (*blob_file)->PathName().c_str(), s.ToString().c_str());
+ return s;
+ }
+
+ (*blob_file)->SetFileSize(BlobLogHeader::kSize);
+ total_blob_size_ += BlobLogHeader::kSize;
+
+ return s;
+}
+
+Status BlobDBImpl::SelectBlobFile(std::shared_ptr<BlobFile>* blob_file) {
+ assert(blob_file);
+
+ {
+ ReadLock rl(&mutex_);
+
+ if (open_non_ttl_file_) {
+ assert(!open_non_ttl_file_->Immutable());
+ *blob_file = open_non_ttl_file_;
+ return Status::OK();
+ }
+ }
+
+ // Check again
+ WriteLock wl(&mutex_);
+
+ if (open_non_ttl_file_) {
+ assert(!open_non_ttl_file_->Immutable());
+ *blob_file = open_non_ttl_file_;
+ return Status::OK();
+ }
+
+ std::shared_ptr<BlobLogWriter> writer;
+ const Status s = CreateBlobFileAndWriter(
+ /* has_ttl */ false, ExpirationRange(),
+ /* reason */ "SelectBlobFile", blob_file, &writer);
+ if (!s.ok()) {
+ return s;
+ }
+
+ RegisterBlobFile(*blob_file);
+ open_non_ttl_file_ = *blob_file;
+
+ return s;
+}
+
+Status BlobDBImpl::SelectBlobFileTTL(uint64_t expiration,
+ std::shared_ptr<BlobFile>* blob_file) {
+ assert(blob_file);
+ assert(expiration != kNoExpiration);
+
+ {
+ ReadLock rl(&mutex_);
+
+ *blob_file = FindBlobFileLocked(expiration);
+ if (*blob_file != nullptr) {
+ assert(!(*blob_file)->Immutable());
+ return Status::OK();
+ }
+ }
+
+ // Check again
+ WriteLock wl(&mutex_);
+
+ *blob_file = FindBlobFileLocked(expiration);
+ if (*blob_file != nullptr) {
+ assert(!(*blob_file)->Immutable());
+ return Status::OK();
+ }
+
+ const uint64_t exp_low =
+ (expiration / bdb_options_.ttl_range_secs) * bdb_options_.ttl_range_secs;
+ const uint64_t exp_high = exp_low + bdb_options_.ttl_range_secs;
+ const ExpirationRange expiration_range(exp_low, exp_high);
+
+ std::ostringstream oss;
+ oss << "SelectBlobFileTTL range: [" << exp_low << ',' << exp_high << ')';
+
+ std::shared_ptr<BlobLogWriter> writer;
+ const Status s =
+ CreateBlobFileAndWriter(/* has_ttl */ true, expiration_range,
+ /* reason */ oss.str(), blob_file, &writer);
+ if (!s.ok()) {
+ return s;
+ }
+
+ RegisterBlobFile(*blob_file);
+ open_ttl_files_.insert(*blob_file);
+
+ return s;
+}
+
+class BlobDBImpl::BlobInserter : public WriteBatch::Handler {
+ private:
+ const WriteOptions& options_;
+ BlobDBImpl* blob_db_impl_;
+ uint32_t default_cf_id_;
+ WriteBatch batch_;
+
+ public:
+ BlobInserter(const WriteOptions& options, BlobDBImpl* blob_db_impl,
+ uint32_t default_cf_id)
+ : options_(options),
+ blob_db_impl_(blob_db_impl),
+ default_cf_id_(default_cf_id) {}
+
+ WriteBatch* batch() { return &batch_; }
+
+ Status PutCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ if (column_family_id != default_cf_id_) {
+ return Status::NotSupported(
+ "Blob DB doesn't support non-default column family.");
+ }
+ Status s = blob_db_impl_->PutBlobValue(options_, key, value, kNoExpiration,
+ &batch_);
+ return s;
+ }
+
+ Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
+ if (column_family_id != default_cf_id_) {
+ return Status::NotSupported(
+ "Blob DB doesn't support non-default column family.");
+ }
+ Status s = WriteBatchInternal::Delete(&batch_, column_family_id, key);
+ return s;
+ }
+
+ virtual Status DeleteRange(uint32_t column_family_id, const Slice& begin_key,
+ const Slice& end_key) {
+ if (column_family_id != default_cf_id_) {
+ return Status::NotSupported(
+ "Blob DB doesn't support non-default column family.");
+ }
+ Status s = WriteBatchInternal::DeleteRange(&batch_, column_family_id,
+ begin_key, end_key);
+ return s;
+ }
+
+ Status SingleDeleteCF(uint32_t /*column_family_id*/,
+ const Slice& /*key*/) override {
+ return Status::NotSupported("Not supported operation in blob db.");
+ }
+
+ Status MergeCF(uint32_t /*column_family_id*/, const Slice& /*key*/,
+ const Slice& /*value*/) override {
+ return Status::NotSupported("Not supported operation in blob db.");
+ }
+
+ void LogData(const Slice& blob) override { batch_.PutLogData(blob); }
+};
+
+Status BlobDBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
+ StopWatch write_sw(clock_, statistics_, BLOB_DB_WRITE_MICROS);
+ RecordTick(statistics_, BLOB_DB_NUM_WRITE);
+ uint32_t default_cf_id =
+ static_cast_with_check<ColumnFamilyHandleImpl>(DefaultColumnFamily())
+ ->GetID();
+ Status s;
+ BlobInserter blob_inserter(options, this, default_cf_id);
+ {
+ // Release write_mutex_ before the DB write to avoid a race condition with
+ // the flush-begin listener, which also requires write_mutex_ to sync blob
+ // files.
+ MutexLock l(&write_mutex_);
+ s = updates->Iterate(&blob_inserter);
+ }
+ if (!s.ok()) {
+ return s;
+ }
+ return db_->Write(options, blob_inserter.batch());
+}
+
+Status BlobDBImpl::Put(const WriteOptions& options, const Slice& key,
+ const Slice& value) {
+ return PutUntil(options, key, value, kNoExpiration);
+}
+
+Status BlobDBImpl::PutWithTTL(const WriteOptions& options, const Slice& key,
+ const Slice& value, uint64_t ttl) {
+ uint64_t now = EpochNow();
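+ // If now + ttl would reach or exceed kNoExpiration, clamp the expiration to
+ // kNoExpiration, i.e. treat the entry as non-expiring.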
+ uint64_t expiration = kNoExpiration - now > ttl ? now + ttl : kNoExpiration;
+ return PutUntil(options, key, value, expiration);
+}
+
+Status BlobDBImpl::PutUntil(const WriteOptions& options, const Slice& key,
+ const Slice& value, uint64_t expiration) {
+ StopWatch write_sw(clock_, statistics_, BLOB_DB_WRITE_MICROS);
+ RecordTick(statistics_, BLOB_DB_NUM_PUT);
+ Status s;
+ WriteBatch batch;
+ {
+ // Release write_mutex_ before the DB write to avoid a race condition with
+ // the flush-begin listener, which also requires write_mutex_ to sync blob
+ // files.
+ MutexLock l(&write_mutex_);
+ s = PutBlobValue(options, key, value, expiration, &batch);
+ }
+ if (s.ok()) {
+ s = db_->Write(options, &batch);
+ }
+ return s;
+}
+
+Status BlobDBImpl::PutBlobValue(const WriteOptions& /*options*/,
+ const Slice& key, const Slice& value,
+ uint64_t expiration, WriteBatch* batch) {
+ write_mutex_.AssertHeld();
+ Status s;
+ std::string index_entry;
+ uint32_t column_family_id =
+ static_cast_with_check<ColumnFamilyHandleImpl>(DefaultColumnFamily())
+ ->GetID();
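+ // Three cases below: (1) a small value without TTL is stored inline as a
+ // plain value; (2) a small value with TTL is stored inline as a blob index
+ // carrying the expiration; (3) a large value is (optionally compressed and)
+ // appended to a blob file, and a blob index pointing to it is written to
+ // the base DB.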
+ if (value.size() < bdb_options_.min_blob_size) {
+ if (expiration == kNoExpiration) {
+ // Put as normal value
+ s = batch->Put(key, value);
+ RecordTick(statistics_, BLOB_DB_WRITE_INLINED);
+ } else {
+ // Inlined with TTL
+ BlobIndex::EncodeInlinedTTL(&index_entry, expiration, value);
+ s = WriteBatchInternal::PutBlobIndex(batch, column_family_id, key,
+ index_entry);
+ RecordTick(statistics_, BLOB_DB_WRITE_INLINED_TTL);
+ }
+ } else {
+ std::string compression_output;
+ Slice value_compressed = GetCompressedSlice(value, &compression_output);
+
+ std::string headerbuf;
+ BlobLogWriter::ConstructBlobHeader(&headerbuf, key, value_compressed,
+ expiration);
+
+ // Check the DB size limit before selecting a blob file. Since
+ // CheckSizeAndEvictBlobFiles() can close blob files, it needs to be done
+ // before calling SelectBlobFile().
+ s = CheckSizeAndEvictBlobFiles(headerbuf.size() + key.size() +
+ value_compressed.size());
+ if (!s.ok()) {
+ return s;
+ }
+
+ std::shared_ptr<BlobFile> blob_file;
+ if (expiration != kNoExpiration) {
+ s = SelectBlobFileTTL(expiration, &blob_file);
+ } else {
+ s = SelectBlobFile(&blob_file);
+ }
+ if (s.ok()) {
+ assert(blob_file != nullptr);
+ assert(blob_file->GetCompressionType() == bdb_options_.compression);
+ s = AppendBlob(blob_file, headerbuf, key, value_compressed, expiration,
+ &index_entry);
+ }
+ if (s.ok()) {
+ if (expiration != kNoExpiration) {
+ WriteLock file_lock(&blob_file->mutex_);
+ blob_file->ExtendExpirationRange(expiration);
+ }
+ s = CloseBlobFileIfNeeded(blob_file);
+ }
+ if (s.ok()) {
+ s = WriteBatchInternal::PutBlobIndex(batch, column_family_id, key,
+ index_entry);
+ }
+ if (s.ok()) {
+ if (expiration == kNoExpiration) {
+ RecordTick(statistics_, BLOB_DB_WRITE_BLOB);
+ } else {
+ RecordTick(statistics_, BLOB_DB_WRITE_BLOB_TTL);
+ }
+ } else {
+ ROCKS_LOG_ERROR(
+ db_options_.info_log,
+ "Failed to append blob to FILE: %s: KEY: %s VALSZ: %" ROCKSDB_PRIszt
+ " status: '%s' blob_file: '%s'",
+ blob_file->PathName().c_str(), key.ToString().c_str(), value.size(),
+ s.ToString().c_str(), blob_file->DumpState().c_str());
+ }
+ }
+
+ RecordTick(statistics_, BLOB_DB_NUM_KEYS_WRITTEN);
+ RecordTick(statistics_, BLOB_DB_BYTES_WRITTEN, key.size() + value.size());
+ RecordInHistogram(statistics_, BLOB_DB_KEY_SIZE, key.size());
+ RecordInHistogram(statistics_, BLOB_DB_VALUE_SIZE, value.size());
+
+ return s;
+}
+
+Slice BlobDBImpl::GetCompressedSlice(const Slice& raw,
+ std::string* compression_output) const {
+ if (bdb_options_.compression == kNoCompression) {
+ return raw;
+ }
+ StopWatch compression_sw(clock_, statistics_, BLOB_DB_COMPRESSION_MICROS);
+ CompressionType type = bdb_options_.compression;
+ CompressionOptions opts;
+ CompressionContext context(type);
+ CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), type,
+ 0 /* sample_for_compression */);
+ CompressBlock(raw, info, &type, kBlockBasedTableVersionFormat, false,
+ compression_output, nullptr, nullptr);
+ return *compression_output;
+}
+
+Status BlobDBImpl::DecompressSlice(const Slice& compressed_value,
+ CompressionType compression_type,
+ PinnableSlice* value_output) const {
+ assert(compression_type != kNoCompression);
+
+ BlockContents contents;
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily());
+
+ {
+ StopWatch decompression_sw(clock_, statistics_,
+ BLOB_DB_DECOMPRESSION_MICROS);
+ UncompressionContext context(compression_type);
+ UncompressionInfo info(context, UncompressionDict::GetEmptyDict(),
+ compression_type);
+ Status s = UncompressBlockData(
+ info, compressed_value.data(), compressed_value.size(), &contents,
+ kBlockBasedTableVersionFormat, *(cfh->cfd()->ioptions()));
+ if (!s.ok()) {
+ return Status::Corruption("Unable to decompress blob.");
+ }
+ }
+
+ value_output->PinSelf(contents.data);
+
+ return Status::OK();
+}
+
+Status BlobDBImpl::CompactFiles(
+ const CompactionOptions& compact_options,
+ const std::vector<std::string>& input_file_names, const int output_level,
+ const int output_path_id, std::vector<std::string>* const output_file_names,
+ CompactionJobInfo* compaction_job_info) {
+ // Note: we need CompactionJobInfo to be able to track updates to the
+ // blob file <-> SST mappings, so we provide one if the user hasn't,
+ // assuming that GC is enabled.
+ CompactionJobInfo info{};
+ if (bdb_options_.enable_garbage_collection && !compaction_job_info) {
+ compaction_job_info = &info;
+ }
+
+ const Status s =
+ db_->CompactFiles(compact_options, input_file_names, output_level,
+ output_path_id, output_file_names, compaction_job_info);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (bdb_options_.enable_garbage_collection) {
+ assert(compaction_job_info);
+ ProcessCompactionJobInfo(*compaction_job_info);
+ }
+
+ return s;
+}
+
+void BlobDBImpl::GetCompactionContextCommon(BlobCompactionContext* context) {
+ assert(context);
+
+ context->blob_db_impl = this;
+ context->next_file_number = next_file_number_.load();
+ context->current_blob_files.clear();
+ for (auto& p : blob_files_) {
+ context->current_blob_files.insert(p.first);
+ }
+ context->fifo_eviction_seq = fifo_eviction_seq_;
+ context->evict_expiration_up_to = evict_expiration_up_to_;
+}
+
+void BlobDBImpl::GetCompactionContext(BlobCompactionContext* context) {
+ assert(context);
+
+ ReadLock l(&mutex_);
+ GetCompactionContextCommon(context);
+}
+
+void BlobDBImpl::GetCompactionContext(BlobCompactionContext* context,
+ BlobCompactionContextGC* context_gc) {
+ assert(context);
+ assert(context_gc);
+
+ ReadLock l(&mutex_);
+ GetCompactionContextCommon(context);
+
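+ // Determine the GC cutoff file. For example (illustrative numbers), with a
+ // garbage_collection_cutoff of 0.25 and 8 live immutable non-TTL blob
+ // files, the iterator is advanced past the 2 oldest files, so blobs in
+ // files older than the resulting cutoff file number become eligible for
+ // relocation.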
+ if (!live_imm_non_ttl_blob_files_.empty()) {
+ auto it = live_imm_non_ttl_blob_files_.begin();
+ std::advance(it, bdb_options_.garbage_collection_cutoff *
+ live_imm_non_ttl_blob_files_.size());
+ context_gc->cutoff_file_number = it != live_imm_non_ttl_blob_files_.end()
+ ? it->first
+ : std::numeric_limits<uint64_t>::max();
+ }
+}
+
+void BlobDBImpl::UpdateLiveSSTSize() {
+ uint64_t live_sst_size = 0;
+ bool ok = GetIntProperty(DB::Properties::kLiveSstFilesSize, &live_sst_size);
+ if (ok) {
+ live_sst_size_.store(live_sst_size);
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "Updated total SST file size: %" PRIu64 " bytes.",
+ live_sst_size);
+ } else {
+ ROCKS_LOG_ERROR(
+ db_options_.info_log,
+ "Failed to update total SST file size after flush or compaction.");
+ }
+ {
+ // Trigger FIFO eviction if needed.
+ MutexLock l(&write_mutex_);
+ Status s = CheckSizeAndEvictBlobFiles(0, true /*force*/);
+ if (s.IsNoSpace()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "DB grow out-of-space after SST size updated. Current live"
+ " SST size: %" PRIu64
+ " , current blob files size: %" PRIu64 ".",
+ live_sst_size_.load(), total_blob_size_.load());
+ }
+ }
+}
+
+Status BlobDBImpl::CheckSizeAndEvictBlobFiles(uint64_t blob_size,
+ bool force_evict) {
+ write_mutex_.AssertHeld();
+
+ uint64_t live_sst_size = live_sst_size_.load();
+ if (bdb_options_.max_db_size == 0 ||
+ live_sst_size + total_blob_size_.load() + blob_size <=
+ bdb_options_.max_db_size) {
+ return Status::OK();
+ }
+
+ if (bdb_options_.is_fifo == false ||
+ (!force_evict && live_sst_size + blob_size > bdb_options_.max_db_size)) {
+ // FIFO eviction is disabled, or there is no space to insert the new blob
+ // even if we evict all blob files.
+ return Status::NoSpace(
+ "Write failed, as writing it would exceed max_db_size limit.");
+ }
+
+ std::vector<std::shared_ptr<BlobFile>> candidate_files;
+ CopyBlobFiles(&candidate_files);
+ std::sort(candidate_files.begin(), candidate_files.end(),
+ BlobFileComparator());
+ fifo_eviction_seq_ = GetLatestSequenceNumber();
+
+ WriteLock l(&mutex_);
+
+ while (!candidate_files.empty() &&
+ live_sst_size + total_blob_size_.load() + blob_size >
+ bdb_options_.max_db_size) {
+ std::shared_ptr<BlobFile> blob_file = candidate_files.back();
+ candidate_files.pop_back();
+ WriteLock file_lock(&blob_file->mutex_);
+ if (blob_file->Obsolete()) {
+ // File already obsoleted by someone else.
+ assert(blob_file->Immutable());
+ continue;
+ }
+ // FIFO eviction can evict open blob files.
+ if (!blob_file->Immutable()) {
+ Status s = CloseBlobFile(blob_file);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ assert(blob_file->Immutable());
+ auto expiration_range = blob_file->GetExpirationRange();
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "Evict oldest blob file since DB out of space. Current "
+ "live SST file size: %" PRIu64 ", total blob size: %" PRIu64
+ ", max db size: %" PRIu64 ", evicted blob file #%" PRIu64
+ ".",
+ live_sst_size, total_blob_size_.load(),
+ bdb_options_.max_db_size, blob_file->BlobFileNumber());
+ ObsoleteBlobFile(blob_file, fifo_eviction_seq_, true /*update_size*/);
+ evict_expiration_up_to_ = expiration_range.first;
+ RecordTick(statistics_, BLOB_DB_FIFO_NUM_FILES_EVICTED);
+ RecordTick(statistics_, BLOB_DB_FIFO_NUM_KEYS_EVICTED,
+ blob_file->BlobCount());
+ RecordTick(statistics_, BLOB_DB_FIFO_BYTES_EVICTED,
+ blob_file->GetFileSize());
+ TEST_SYNC_POINT("BlobDBImpl::EvictOldestBlobFile:Evicted");
+ }
+ if (live_sst_size + total_blob_size_.load() + blob_size >
+ bdb_options_.max_db_size) {
+ return Status::NoSpace(
+ "Write failed, as writing it would exceed max_db_size limit.");
+ }
+ return Status::OK();
+}
+
+Status BlobDBImpl::AppendBlob(const std::shared_ptr<BlobFile>& bfile,
+ const std::string& headerbuf, const Slice& key,
+ const Slice& value, uint64_t expiration,
+ std::string* index_entry) {
+ Status s;
+ uint64_t blob_offset = 0;
+ uint64_t key_offset = 0;
+ {
+ WriteLock lockbfile_w(&bfile->mutex_);
+ std::shared_ptr<BlobLogWriter> writer;
+ s = CheckOrCreateWriterLocked(bfile, &writer);
+ if (!s.ok()) {
+ return s;
+ }
+
+ // write the blob to the blob log.
+ s = writer->EmitPhysicalRecord(headerbuf, key, value, &key_offset,
+ &blob_offset);
+ }
+
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log,
+ "Invalid status in AppendBlob: %s status: '%s'",
+ bfile->PathName().c_str(), s.ToString().c_str());
+ return s;
+ }
+
+ uint64_t size_put = headerbuf.size() + key.size() + value.size();
+ bfile->BlobRecordAdded(size_put);
+ total_blob_size_ += size_put;
+
+ if (expiration == kNoExpiration) {
+ BlobIndex::EncodeBlob(index_entry, bfile->BlobFileNumber(), blob_offset,
+ value.size(), bdb_options_.compression);
+ } else {
+ BlobIndex::EncodeBlobTTL(index_entry, expiration, bfile->BlobFileNumber(),
+ blob_offset, value.size(),
+ bdb_options_.compression);
+ }
+
+ return s;
+}
+
+std::vector<Status> BlobDBImpl::MultiGet(const ReadOptions& read_options,
+ const std::vector<Slice>& keys,
+ std::vector<std::string>* values) {
+ StopWatch multiget_sw(clock_, statistics_, BLOB_DB_MULTIGET_MICROS);
+ RecordTick(statistics_, BLOB_DB_NUM_MULTIGET);
+ // Get a snapshot to avoid the blob file getting deleted between fetching
+ // the index entry and reading from the file.
+ ReadOptions ro(read_options);
+ bool snapshot_created = SetSnapshotIfNeeded(&ro);
+
+ std::vector<Status> statuses;
+ statuses.reserve(keys.size());
+ values->clear();
+ values->reserve(keys.size());
+ PinnableSlice value;
+ for (size_t i = 0; i < keys.size(); i++) {
+ statuses.push_back(Get(ro, DefaultColumnFamily(), keys[i], &value));
+ values->push_back(value.ToString());
+ value.Reset();
+ }
+ if (snapshot_created) {
+ db_->ReleaseSnapshot(ro.snapshot);
+ }
+ return statuses;
+}
+
+bool BlobDBImpl::SetSnapshotIfNeeded(ReadOptions* read_options) {
+ assert(read_options != nullptr);
+ if (read_options->snapshot != nullptr) {
+ return false;
+ }
+ read_options->snapshot = db_->GetSnapshot();
+ return true;
+}
+
+Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry,
+ PinnableSlice* value, uint64_t* expiration) {
+ assert(value);
+
+ BlobIndex blob_index;
+ Status s = blob_index.DecodeFrom(index_entry);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (blob_index.HasTTL() && blob_index.expiration() <= EpochNow()) {
+ return Status::NotFound("Key expired");
+ }
+
+ if (expiration != nullptr) {
+ if (blob_index.HasTTL()) {
+ *expiration = blob_index.expiration();
+ } else {
+ *expiration = kNoExpiration;
+ }
+ }
+
+ if (blob_index.IsInlined()) {
+ // TODO(yiwu): If index_entry is a PinnableSlice, we can also pin the same
+ // memory buffer to avoid extra copy.
+ value->PinSelf(blob_index.value());
+ return Status::OK();
+ }
+
+ CompressionType compression_type = kNoCompression;
+ s = GetRawBlobFromFile(key, blob_index.file_number(), blob_index.offset(),
+ blob_index.size(), value, &compression_type);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (compression_type != kNoCompression) {
+ s = DecompressSlice(*value, compression_type, value);
+ if (!s.ok()) {
+ if (debug_level_ >= 2) {
+ ROCKS_LOG_ERROR(
+ db_options_.info_log,
+ "Uncompression error during blob read from file: %" PRIu64
+ " blob_offset: %" PRIu64 " blob_size: %" PRIu64
+ " key: %s status: '%s'",
+ blob_index.file_number(), blob_index.offset(), blob_index.size(),
+ key.ToString(/* output_hex */ true).c_str(), s.ToString().c_str());
+ }
+ return s;
+ }
+ }
+
+ return Status::OK();
+}
+
+Status BlobDBImpl::GetRawBlobFromFile(const Slice& key, uint64_t file_number,
+ uint64_t offset, uint64_t size,
+ PinnableSlice* value,
+ CompressionType* compression_type) {
+ assert(value);
+ assert(compression_type);
+ assert(*compression_type == kNoCompression);
+
+ if (!size) {
+ value->PinSelf("");
+ return Status::OK();
+ }
+
+ // The offset has to have a certain minimum, as we will later read the CRC
+ // from the blob header, which also needs to be at a valid offset.
+ if (offset <
+ (BlobLogHeader::kSize + BlobLogRecord::kHeaderSize + key.size())) {
+ if (debug_level_ >= 2) {
+ ROCKS_LOG_ERROR(db_options_.info_log,
+ "Invalid blob index file_number: %" PRIu64
+ " blob_offset: %" PRIu64 " blob_size: %" PRIu64
+ " key: %s",
+ file_number, offset, size,
+ key.ToString(/* output_hex */ true).c_str());
+ }
+
+ return Status::NotFound("Invalid blob offset");
+ }
+
+ std::shared_ptr<BlobFile> blob_file;
+
+ {
+ ReadLock rl(&mutex_);
+ auto it = blob_files_.find(file_number);
+
+ // file was deleted
+ if (it == blob_files_.end()) {
+ return Status::NotFound("Blob Not Found as blob file missing");
+ }
+
+ blob_file = it->second;
+ }
+
+ *compression_type = blob_file->GetCompressionType();
+
+ // takes locks when called
+ std::shared_ptr<RandomAccessFileReader> reader;
+ Status s = GetBlobFileReader(blob_file, &reader);
+ if (!s.ok()) {
+ return s;
+ }
+
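+ // The blob index stores the offset of the value; on disk, the 32-bit
+ // checksum and the key immediately precede it. Back up by
+ // key.size() + sizeof(uint32_t) so that [checksum | key | value] can be
+ // read in a single request.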
+ assert(offset >= key.size() + sizeof(uint32_t));
+ const uint64_t record_offset = offset - key.size() - sizeof(uint32_t);
+ const uint64_t record_size = sizeof(uint32_t) + key.size() + size;
+
+ // Allocate the buffer. This is safe in C++11
+ std::string buf;
+ AlignedBuf aligned_buf;
+
+ // A partial blob record contains the checksum, key, and value.
+ Slice blob_record;
+
+ {
+ StopWatch read_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS);
+ // TODO: rate limit old blob DB file reads.
+ if (reader->use_direct_io()) {
+ s = reader->Read(IOOptions(), record_offset,
+ static_cast<size_t>(record_size), &blob_record, nullptr,
+ &aligned_buf, Env::IO_TOTAL /* rate_limiter_priority */);
+ } else {
+ buf.reserve(static_cast<size_t>(record_size));
+ s = reader->Read(IOOptions(), record_offset,
+ static_cast<size_t>(record_size), &blob_record, &buf[0],
+ nullptr, Env::IO_TOTAL /* rate_limiter_priority */);
+ }
+ RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, blob_record.size());
+ }
+
+ if (!s.ok()) {
+ ROCKS_LOG_DEBUG(
+ db_options_.info_log,
+ "Failed to read blob from blob file %" PRIu64 ", blob_offset: %" PRIu64
+ ", blob_size: %" PRIu64 ", key_size: %" ROCKSDB_PRIszt ", status: '%s'",
+ file_number, offset, size, key.size(), s.ToString().c_str());
+ return s;
+ }
+
+ if (blob_record.size() != record_size) {
+ ROCKS_LOG_DEBUG(
+ db_options_.info_log,
+ "Failed to read blob from blob file %" PRIu64 ", blob_offset: %" PRIu64
+ ", blob_size: %" PRIu64 ", key_size: %" ROCKSDB_PRIszt
+ ", read %" ROCKSDB_PRIszt " bytes, expected %" PRIu64 " bytes",
+ file_number, offset, size, key.size(), blob_record.size(), record_size);
+
+ return Status::Corruption("Failed to retrieve blob from blob index.");
+ }
+
+ Slice crc_slice(blob_record.data(), sizeof(uint32_t));
+ Slice blob_value(blob_record.data() + sizeof(uint32_t) + key.size(),
+ static_cast<size_t>(size));
+
+ uint32_t crc_exp = 0;
+ if (!GetFixed32(&crc_slice, &crc_exp)) {
+ ROCKS_LOG_DEBUG(
+ db_options_.info_log,
+ "Unable to decode CRC from blob file %" PRIu64 ", blob_offset: %" PRIu64
+ ", blob_size: %" PRIu64 ", key size: %" ROCKSDB_PRIszt ", status: '%s'",
+ file_number, offset, size, key.size(), s.ToString().c_str());
+ return Status::Corruption("Unable to decode checksum.");
+ }
+
+ uint32_t crc = crc32c::Value(blob_record.data() + sizeof(uint32_t),
+ blob_record.size() - sizeof(uint32_t));
+ crc = crc32c::Mask(crc); // Adjust for storage
+ if (crc != crc_exp) {
+ if (debug_level_ >= 2) {
+ ROCKS_LOG_ERROR(
+ db_options_.info_log,
+ "Blob crc mismatch file: %" PRIu64 " blob_offset: %" PRIu64
+ " blob_size: %" PRIu64 " key: %s status: '%s'",
+ file_number, offset, size,
+ key.ToString(/* output_hex */ true).c_str(), s.ToString().c_str());
+ }
+
+ return Status::Corruption("Corruption. Blob CRC mismatch");
+ }
+
+ value->PinSelf(blob_value);
+
+ return Status::OK();
+}
+
+Status BlobDBImpl::Get(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) {
+ return Get(read_options, column_family, key, value,
+ static_cast<uint64_t*>(nullptr) /*expiration*/);
+}
+
+Status BlobDBImpl::Get(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value, uint64_t* expiration) {
+ StopWatch get_sw(clock_, statistics_, BLOB_DB_GET_MICROS);
+ RecordTick(statistics_, BLOB_DB_NUM_GET);
+ return GetImpl(read_options, column_family, key, value, expiration);
+}
+
+Status BlobDBImpl::GetImpl(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value, uint64_t* expiration) {
+ if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
+ return Status::NotSupported(
+ "Blob DB doesn't support non-default column family.");
+ }
+ // Get a snapshot to avoid the blob file getting deleted between fetching
+ // the index entry and reading from the file.
+ // TODO(yiwu): For Get() retry if file not found would be a simpler strategy.
+ ReadOptions ro(read_options);
+ bool snapshot_created = SetSnapshotIfNeeded(&ro);
+
+ PinnableSlice index_entry;
+ Status s;
+ bool is_blob_index = false;
+ DBImpl::GetImplOptions get_impl_options;
+ get_impl_options.column_family = column_family;
+ get_impl_options.value = &index_entry;
+ get_impl_options.is_blob_index = &is_blob_index;
+ s = db_impl_->GetImpl(ro, key, get_impl_options);
+ if (expiration != nullptr) {
+ *expiration = kNoExpiration;
+ }
+ RecordTick(statistics_, BLOB_DB_NUM_KEYS_READ);
+ if (s.ok()) {
+ if (is_blob_index) {
+ s = GetBlobValue(key, index_entry, value, expiration);
+ } else {
+ // The index entry is the value itself in this case.
+ value->PinSelf(index_entry);
+ }
+ RecordTick(statistics_, BLOB_DB_BYTES_READ, value->size());
+ }
+ if (snapshot_created) {
+ db_->ReleaseSnapshot(ro.snapshot);
+ }
+ return s;
+}
+
+std::pair<bool, int64_t> BlobDBImpl::SanityCheck(bool aborted) {
+ if (aborted) {
+ return std::make_pair(false, -1);
+ }
+
+ ReadLock rl(&mutex_);
+
+ ROCKS_LOG_INFO(db_options_.info_log, "Starting Sanity Check");
+ ROCKS_LOG_INFO(db_options_.info_log, "Number of files %" ROCKSDB_PRIszt,
+ blob_files_.size());
+ ROCKS_LOG_INFO(db_options_.info_log, "Number of open files %" ROCKSDB_PRIszt,
+ open_ttl_files_.size());
+
+ for (const auto& blob_file : open_ttl_files_) {
+ (void)blob_file;
+ assert(!blob_file->Immutable());
+ }
+
+ for (const auto& pair : live_imm_non_ttl_blob_files_) {
+ const auto& blob_file = pair.second;
+ (void)blob_file;
+ assert(!blob_file->HasTTL());
+ assert(blob_file->Immutable());
+ }
+
+ uint64_t now = EpochNow();
+
+ for (auto blob_file_pair : blob_files_) {
+ auto blob_file = blob_file_pair.second;
+ std::ostringstream buf;
+
+ buf << "Blob file " << blob_file->BlobFileNumber() << ", size "
+ << blob_file->GetFileSize() << ", blob count " << blob_file->BlobCount()
+ << ", immutable " << blob_file->Immutable();
+
+ if (blob_file->HasTTL()) {
+ ExpirationRange expiration_range;
+ {
+ ReadLock file_lock(&blob_file->mutex_);
+ expiration_range = blob_file->GetExpirationRange();
+ }
+ buf << ", expiration range (" << expiration_range.first << ", "
+ << expiration_range.second << ")";
+
+ if (!blob_file->Obsolete()) {
+ buf << ", expire in " << (expiration_range.second - now) << "seconds";
+ }
+ }
+ if (blob_file->Obsolete()) {
+ buf << ", obsolete at " << blob_file->GetObsoleteSequence();
+ }
+ buf << ".";
+ ROCKS_LOG_INFO(db_options_.info_log, "%s", buf.str().c_str());
+ }
+
+ // reschedule
+ return std::make_pair(true, -1);
+}
+
+Status BlobDBImpl::CloseBlobFile(std::shared_ptr<BlobFile> bfile) {
+ TEST_SYNC_POINT("BlobDBImpl::CloseBlobFile");
+ assert(bfile);
+ assert(!bfile->Immutable());
+ assert(!bfile->Obsolete());
+
+ if (bfile->HasTTL() || bfile == open_non_ttl_file_) {
+ write_mutex_.AssertHeld();
+ }
+
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "Closing blob file %" PRIu64 ". Path: %s",
+ bfile->BlobFileNumber(), bfile->PathName().c_str());
+
+ const SequenceNumber sequence = GetLatestSequenceNumber();
+
+ const Status s = bfile->WriteFooterAndCloseLocked(sequence);
+
+ if (s.ok()) {
+ total_blob_size_ += BlobLogFooter::kSize;
+ } else {
+ bfile->MarkImmutable(sequence);
+
+ ROCKS_LOG_ERROR(db_options_.info_log,
+ "Failed to close blob file %" PRIu64 "with error: %s",
+ bfile->BlobFileNumber(), s.ToString().c_str());
+ }
+
+ if (bfile->HasTTL()) {
+ size_t erased __attribute__((__unused__));
+ erased = open_ttl_files_.erase(bfile);
+ } else {
+ if (bfile == open_non_ttl_file_) {
+ open_non_ttl_file_ = nullptr;
+ }
+
+ const uint64_t blob_file_number = bfile->BlobFileNumber();
+ auto it = live_imm_non_ttl_blob_files_.lower_bound(blob_file_number);
+ assert(it == live_imm_non_ttl_blob_files_.end() ||
+ it->first != blob_file_number);
+ live_imm_non_ttl_blob_files_.insert(
+ it, std::map<uint64_t, std::shared_ptr<BlobFile>>::value_type(
+ blob_file_number, bfile));
+ }
+
+ return s;
+}
+
+Status BlobDBImpl::CloseBlobFileIfNeeded(std::shared_ptr<BlobFile>& bfile) {
+ write_mutex_.AssertHeld();
+
+ // atomic read
+ if (bfile->GetFileSize() < bdb_options_.blob_file_size) {
+ return Status::OK();
+ }
+
+ WriteLock lock(&mutex_);
+ WriteLock file_lock(&bfile->mutex_);
+
+ assert(!bfile->Obsolete() || bfile->Immutable());
+ if (bfile->Immutable()) {
+ return Status::OK();
+ }
+
+ return CloseBlobFile(bfile);
+}
+
+void BlobDBImpl::ObsoleteBlobFile(std::shared_ptr<BlobFile> blob_file,
+ SequenceNumber obsolete_seq,
+ bool update_size) {
+ assert(blob_file->Immutable());
+ assert(!blob_file->Obsolete());
+
+ // Should hold write lock of mutex_ or during DB open.
+ blob_file->MarkObsolete(obsolete_seq);
+ obsolete_files_.push_back(blob_file);
+ assert(total_blob_size_.load() >= blob_file->GetFileSize());
+ if (update_size) {
+ total_blob_size_ -= blob_file->GetFileSize();
+ }
+}
+
+bool BlobDBImpl::VisibleToActiveSnapshot(
+ const std::shared_ptr<BlobFile>& bfile) {
+ assert(bfile->Obsolete());
+
+ // We check whether the oldest snapshot is no less than the sequence number
+ // at which the blob file became obsolete. If so, the blob file is not
+ // visible to any existing snapshot.
+ //
+ // If we kept track of the earliest sequence number of the keys in the blob
+ // file, we could instead check whether a snapshot falls in the range
+ // [earliest_sequence, obsolete_sequence). But doing so would make the
+ // implementation more complicated.
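+ //
+ // For example (illustrative sequence numbers), a blob file marked obsolete
+ // at sequence 100 is still visible if the oldest live snapshot is at
+ // sequence 90, and therefore cannot be deleted until that snapshot is
+ // released.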
+ SequenceNumber obsolete_sequence = bfile->GetObsoleteSequence();
+ SequenceNumber oldest_snapshot = kMaxSequenceNumber;
+ {
+ // Need to lock the DBImpl mutex before accessing the snapshot list.
+ InstrumentedMutexLock l(db_impl_->mutex());
+ auto& snapshots = db_impl_->snapshots();
+ if (!snapshots.empty()) {
+ oldest_snapshot = snapshots.oldest()->GetSequenceNumber();
+ }
+ }
+ bool visible = oldest_snapshot < obsolete_sequence;
+ if (visible) {
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "Obsolete blob file %" PRIu64 " (obsolete at %" PRIu64
+ ") visible to oldest snapshot %" PRIu64 ".",
+ bfile->BlobFileNumber(), obsolete_sequence, oldest_snapshot);
+ }
+ return visible;
+}
+
+std::pair<bool, int64_t> BlobDBImpl::EvictExpiredFiles(bool aborted) {
+ if (aborted) {
+ return std::make_pair(false, -1);
+ }
+
+ TEST_SYNC_POINT("BlobDBImpl::EvictExpiredFiles:0");
+ TEST_SYNC_POINT("BlobDBImpl::EvictExpiredFiles:1");
+
+ std::vector<std::shared_ptr<BlobFile>> process_files;
+ uint64_t now = EpochNow();
+ {
+ ReadLock rl(&mutex_);
+ for (auto p : blob_files_) {
+ auto& blob_file = p.second;
+ ReadLock file_lock(&blob_file->mutex_);
+ if (blob_file->HasTTL() && !blob_file->Obsolete() &&
+ blob_file->GetExpirationRange().second <= now) {
+ process_files.push_back(blob_file);
+ }
+ }
+ }
+
+ TEST_SYNC_POINT("BlobDBImpl::EvictExpiredFiles:2");
+ TEST_SYNC_POINT("BlobDBImpl::EvictExpiredFiles:3");
+ TEST_SYNC_POINT_CALLBACK("BlobDBImpl::EvictExpiredFiles:cb", nullptr);
+
+ SequenceNumber seq = GetLatestSequenceNumber();
+ {
+ MutexLock l(&write_mutex_);
+ WriteLock lock(&mutex_);
+ for (auto& blob_file : process_files) {
+ WriteLock file_lock(&blob_file->mutex_);
+
+ // Need to double check if the file is obsolete.
+ if (blob_file->Obsolete()) {
+ assert(blob_file->Immutable());
+ continue;
+ }
+
+ if (!blob_file->Immutable()) {
+ CloseBlobFile(blob_file);
+ }
+
+ assert(blob_file->Immutable());
+
+ ObsoleteBlobFile(blob_file, seq, true /*update_size*/);
+ }
+ }
+
+ return std::make_pair(true, -1);
+}
+
+Status BlobDBImpl::SyncBlobFiles() {
+ MutexLock l(&write_mutex_);
+
+ std::vector<std::shared_ptr<BlobFile>> process_files;
+ {
+ ReadLock rl(&mutex_);
+ for (auto fitr : open_ttl_files_) {
+ process_files.push_back(fitr);
+ }
+ if (open_non_ttl_file_ != nullptr) {
+ process_files.push_back(open_non_ttl_file_);
+ }
+ }
+
+ Status s;
+ for (auto& blob_file : process_files) {
+ s = blob_file->Fsync();
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log,
+ "Failed to sync blob file %" PRIu64 ", status: %s",
+ blob_file->BlobFileNumber(), s.ToString().c_str());
+ return s;
+ }
+ }
+
+ s = dir_ent_->FsyncWithDirOptions(IOOptions(), nullptr, DirFsyncOptions());
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log,
+ "Failed to sync blob directory, status: %s",
+ s.ToString().c_str());
+ }
+ return s;
+}
+
+std::pair<bool, int64_t> BlobDBImpl::ReclaimOpenFiles(bool aborted) {
+ if (aborted) return std::make_pair(false, -1);
+
+ if (open_file_count_.load() < kOpenFilesTrigger) {
+ return std::make_pair(true, -1);
+ }
+
+ // in the future, we should sort by last_access_
+ // instead of closing every file
+ ReadLock rl(&mutex_);
+ for (auto const& ent : blob_files_) {
+ auto bfile = ent.second;
+ if (bfile->last_access_.load() == -1) continue;
+
+ WriteLock lockbfile_w(&bfile->mutex_);
+ CloseRandomAccessLocked(bfile);
+ }
+
+ return std::make_pair(true, -1);
+}
+
+std::pair<bool, int64_t> BlobDBImpl::DeleteObsoleteFiles(bool aborted) {
+ if (aborted) {
+ return std::make_pair(false, -1);
+ }
+
+ MutexLock delete_file_lock(&delete_file_mutex_);
+ if (disable_file_deletions_ > 0) {
+ return std::make_pair(true, -1);
+ }
+
+ std::list<std::shared_ptr<BlobFile>> tobsolete;
+ {
+ WriteLock wl(&mutex_);
+ if (obsolete_files_.empty()) {
+ return std::make_pair(true, -1);
+ }
+ tobsolete.swap(obsolete_files_);
+ }
+
+ bool file_deleted = false;
+ for (auto iter = tobsolete.begin(); iter != tobsolete.end();) {
+ auto bfile = *iter;
+ {
+ ReadLock lockbfile_r(&bfile->mutex_);
+ if (VisibleToActiveSnapshot(bfile)) {
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "Could not delete file due to snapshot failure %s",
+ bfile->PathName().c_str());
+ ++iter;
+ continue;
+ }
+ }
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "Will delete file due to snapshot success %s",
+ bfile->PathName().c_str());
+
+ {
+ WriteLock wl(&mutex_);
+ blob_files_.erase(bfile->BlobFileNumber());
+ }
+
+ Status s = DeleteDBFile(&(db_impl_->immutable_db_options()),
+ bfile->PathName(), blob_dir_, true,
+ /*force_fg=*/false);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log,
+ "File failed to be deleted as obsolete %s",
+ bfile->PathName().c_str());
+ ++iter;
+ continue;
+ }
+
+ file_deleted = true;
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "File deleted as obsolete from blob dir %s",
+ bfile->PathName().c_str());
+
+ iter = tobsolete.erase(iter);
+ }
+
+  // The directory contents changed; fsync it.
+ if (file_deleted) {
+ Status s = dir_ent_->FsyncWithDirOptions(
+ IOOptions(), nullptr,
+ DirFsyncOptions(DirFsyncOptions::FsyncReason::kFileDeleted));
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log, "Failed to sync dir %s: %s",
+ blob_dir_.c_str(), s.ToString().c_str());
+ }
+ }
+
+  // Put files back into the obsolete list if, for some reason, deletion
+  // failed.
+ if (!tobsolete.empty()) {
+ WriteLock wl(&mutex_);
+ for (auto bfile : tobsolete) {
+ blob_files_.insert(std::make_pair(bfile->BlobFileNumber(), bfile));
+ obsolete_files_.push_front(bfile);
+ }
+ }
+
+ return std::make_pair(!aborted, -1);
+}
+
+void BlobDBImpl::CopyBlobFiles(
+ std::vector<std::shared_ptr<BlobFile>>* bfiles_copy) {
+ ReadLock rl(&mutex_);
+ for (auto const& p : blob_files_) {
+ bfiles_copy->push_back(p.second);
+ }
+}
+
+Iterator* BlobDBImpl::NewIterator(const ReadOptions& read_options) {
+ auto* cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(DefaultColumnFamily())
+ ->cfd();
+  // Get a snapshot to prevent the blob file from being deleted between
+  // fetching the index entry and reading from the file.
+ ManagedSnapshot* own_snapshot = nullptr;
+ const Snapshot* snapshot = read_options.snapshot;
+ if (snapshot == nullptr) {
+ own_snapshot = new ManagedSnapshot(db_);
+ snapshot = own_snapshot->snapshot();
+ }
+ auto* iter = db_impl_->NewIteratorImpl(
+ read_options, cfd, snapshot->GetSequenceNumber(),
+ nullptr /*read_callback*/, true /*expose_blob_index*/);
+ return new BlobDBIterator(own_snapshot, iter, this, clock_, statistics_);
+}
+
+Status DestroyBlobDB(const std::string& dbname, const Options& options,
+ const BlobDBOptions& bdb_options) {
+ const ImmutableDBOptions soptions(SanitizeOptions(dbname, options));
+ Env* env = soptions.env;
+
+ Status status;
+ std::string blobdir;
+ blobdir = (bdb_options.path_relative) ? dbname + "/" + bdb_options.blob_dir
+ : bdb_options.blob_dir;
+
+ std::vector<std::string> filenames;
+ if (env->GetChildren(blobdir, &filenames).ok()) {
+ for (const auto& f : filenames) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(f, &number, &type) && type == kBlobFile) {
+ Status del = DeleteDBFile(&soptions, blobdir + "/" + f, blobdir, true,
+ /*force_fg=*/false);
+ if (status.ok() && !del.ok()) {
+ status = del;
+ }
+ }
+ }
+ // TODO: What to do if we cannot delete the directory?
+ env->DeleteDir(blobdir).PermitUncheckedError();
+ }
+ Status destroy = DestroyDB(dbname, options);
+ if (status.ok() && !destroy.ok()) {
+ status = destroy;
+ }
+
+ return status;
+}
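As a point of reference, a hedged sketch of how DestroyBlobDB might be called from application code; the path and the blob_dir value are illustrative only:

    #include "rocksdb/options.h"
    #include "utilities/blob_db/blob_db.h"

    int CleanUpExampleDb() {
      ROCKSDB_NAMESPACE::Options options;
      ROCKSDB_NAMESPACE::blob_db::BlobDBOptions bdb_options;
      bdb_options.blob_dir = "blob_dir";  // relative to the DB directory
      bdb_options.path_relative = true;
      // Removes the blob files, the blob directory, and then the base DB.
      ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::blob_db::DestroyBlobDB(
          "/tmp/blobdb_example", options, bdb_options);
      return s.ok() ? 0 : 1;
    }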
+
+#ifndef NDEBUG
+Status BlobDBImpl::TEST_GetBlobValue(const Slice& key, const Slice& index_entry,
+ PinnableSlice* value) {
+ return GetBlobValue(key, index_entry, value);
+}
+
+void BlobDBImpl::TEST_AddDummyBlobFile(uint64_t blob_file_number,
+ SequenceNumber immutable_sequence) {
+ auto blob_file = std::make_shared<BlobFile>(this, blob_dir_, blob_file_number,
+ db_options_.info_log.get());
+ blob_file->MarkImmutable(immutable_sequence);
+
+ blob_files_[blob_file_number] = blob_file;
+ live_imm_non_ttl_blob_files_[blob_file_number] = blob_file;
+}
+
+std::vector<std::shared_ptr<BlobFile>> BlobDBImpl::TEST_GetBlobFiles() const {
+ ReadLock l(&mutex_);
+ std::vector<std::shared_ptr<BlobFile>> blob_files;
+ for (auto& p : blob_files_) {
+ blob_files.emplace_back(p.second);
+ }
+ return blob_files;
+}
+
+std::vector<std::shared_ptr<BlobFile>> BlobDBImpl::TEST_GetLiveImmNonTTLFiles()
+ const {
+ ReadLock l(&mutex_);
+ std::vector<std::shared_ptr<BlobFile>> live_imm_non_ttl_files;
+ for (const auto& pair : live_imm_non_ttl_blob_files_) {
+ live_imm_non_ttl_files.emplace_back(pair.second);
+ }
+ return live_imm_non_ttl_files;
+}
+
+std::vector<std::shared_ptr<BlobFile>> BlobDBImpl::TEST_GetObsoleteFiles()
+ const {
+ ReadLock l(&mutex_);
+ std::vector<std::shared_ptr<BlobFile>> obsolete_files;
+ for (auto& bfile : obsolete_files_) {
+ obsolete_files.emplace_back(bfile);
+ }
+ return obsolete_files;
+}
+
+void BlobDBImpl::TEST_DeleteObsoleteFiles() {
+ DeleteObsoleteFiles(false /*abort*/);
+}
+
+Status BlobDBImpl::TEST_CloseBlobFile(std::shared_ptr<BlobFile>& bfile) {
+ MutexLock l(&write_mutex_);
+ WriteLock lock(&mutex_);
+ WriteLock file_lock(&bfile->mutex_);
+
+ return CloseBlobFile(bfile);
+}
+
+void BlobDBImpl::TEST_ObsoleteBlobFile(std::shared_ptr<BlobFile>& blob_file,
+ SequenceNumber obsolete_seq,
+ bool update_size) {
+ return ObsoleteBlobFile(blob_file, obsolete_seq, update_size);
+}
+
+void BlobDBImpl::TEST_EvictExpiredFiles() {
+ EvictExpiredFiles(false /*abort*/);
+}
+
+uint64_t BlobDBImpl::TEST_live_sst_size() { return live_sst_size_.load(); }
+
+void BlobDBImpl::TEST_InitializeBlobFileToSstMapping(
+ const std::vector<LiveFileMetaData>& live_files) {
+ InitializeBlobFileToSstMapping(live_files);
+}
+
+void BlobDBImpl::TEST_ProcessFlushJobInfo(const FlushJobInfo& info) {
+ ProcessFlushJobInfo(info);
+}
+
+void BlobDBImpl::TEST_ProcessCompactionJobInfo(const CompactionJobInfo& info) {
+ ProcessCompactionJobInfo(info);
+}
+
+#endif // !NDEBUG
+
+} // namespace blob_db
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
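Before the header below, a brief sketch of how the public BlobDB interface implemented above is typically driven, mirroring the unit tests later in this patch; the path, keys and TTL are illustrative:

    #include <cassert>
    #include <cstdint>

    #include "utilities/blob_db/blob_db.h"

    void BlobDbUsageSketch() {
      using namespace ROCKSDB_NAMESPACE;
      using blob_db::BlobDB;
      using blob_db::BlobDBOptions;

      Options options;
      options.create_if_missing = true;
      BlobDBOptions bdb_options;
      bdb_options.min_blob_size = 0;  // store every value in a blob file

      BlobDB* db = nullptr;
      Status s = BlobDB::Open(options, bdb_options, "/tmp/blobdb_example", &db);
      assert(s.ok());

      // Non-expiring and TTL'd writes go through the same handle.
      s = db->Put(WriteOptions(), "key1", "value1");
      assert(s.ok());
      s = db->PutWithTTL(WriteOptions(), "key2", "value2", /*ttl=*/3600);
      assert(s.ok());

      // Reads can also report the expiration recorded for the key.
      PinnableSlice value;
      uint64_t expiration = 0;
      s = db->Get(ReadOptions(), db->DefaultColumnFamily(), "key2", &value,
                  &expiration);
      assert(s.ok());

      delete db;
    }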
diff --git a/src/rocksdb/utilities/blob_db/blob_db_impl.h b/src/rocksdb/utilities/blob_db/blob_db_impl.h
new file mode 100644
index 000000000..0b4dbf5e5
--- /dev/null
+++ b/src/rocksdb/utilities/blob_db/blob_db_impl.h
@@ -0,0 +1,503 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <atomic>
+#include <condition_variable>
+#include <limits>
+#include <list>
+#include <memory>
+#include <set>
+#include <string>
+#include <thread>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_log_writer.h"
+#include "db/db_iter.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/wal_filter.h"
+#include "util/mutexlock.h"
+#include "util/timer_queue.h"
+#include "utilities/blob_db/blob_db.h"
+#include "utilities/blob_db/blob_file.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+class ColumnFamilyHandle;
+class ColumnFamilyData;
+class SystemClock;
+
+struct FlushJobInfo;
+
+namespace blob_db {
+
+struct BlobCompactionContext;
+struct BlobCompactionContextGC;
+class BlobDBImpl;
+class BlobFile;
+
+// Comparator to sort TTL-aware blob files based on the lower bound of their
+// expiration (TTL) range.
+struct BlobFileComparatorTTL {
+ bool operator()(const std::shared_ptr<BlobFile>& lhs,
+ const std::shared_ptr<BlobFile>& rhs) const;
+};
+
+struct BlobFileComparator {
+ bool operator()(const std::shared_ptr<BlobFile>& lhs,
+ const std::shared_ptr<BlobFile>& rhs) const;
+};
+
+/**
+ * The implementation class for BlobDB. It manages the blob logs, which
+ * are sequentially written files. Blob logs can be of the TTL or non-TTL
+ * varieties; the former are cleaned up when they expire, while the latter
+ * are (optionally) garbage collected.
+ */
+class BlobDBImpl : public BlobDB {
+ friend class BlobFile;
+ friend class BlobDBIterator;
+ friend class BlobDBListener;
+ friend class BlobDBListenerGC;
+ friend class BlobIndexCompactionFilterBase;
+ friend class BlobIndexCompactionFilterGC;
+
+ public:
+ // deletions check period
+ static constexpr uint32_t kDeleteCheckPeriodMillisecs = 2 * 1000;
+
+ // sanity check task
+ static constexpr uint32_t kSanityCheckPeriodMillisecs = 20 * 60 * 1000;
+
+ // how many random access open files can we tolerate
+ static constexpr uint32_t kOpenFilesTrigger = 100;
+
+ // how often to schedule reclaim open files.
+ static constexpr uint32_t kReclaimOpenFilesPeriodMillisecs = 1 * 1000;
+
+  // how often to schedule the delete-obsolete-files task
+ static constexpr uint32_t kDeleteObsoleteFilesPeriodMillisecs = 10 * 1000;
+
+ // how often to schedule expired files eviction.
+ static constexpr uint32_t kEvictExpiredFilesPeriodMillisecs = 10 * 1000;
+
+ // when should oldest file be evicted:
+ // on reaching 90% of blob_dir_size
+ static constexpr double kEvictOldestFileAtSize = 0.9;
+
+ using BlobDB::Put;
+ Status Put(const WriteOptions& options, const Slice& key,
+ const Slice& value) override;
+
+ using BlobDB::Get;
+ Status Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* value) override;
+
+ Status Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* value,
+ uint64_t* expiration) override;
+
+ using BlobDB::NewIterator;
+ virtual Iterator* NewIterator(const ReadOptions& read_options) override;
+
+ using BlobDB::NewIterators;
+ virtual Status NewIterators(
+ const ReadOptions& /*read_options*/,
+ const std::vector<ColumnFamilyHandle*>& /*column_families*/,
+ std::vector<Iterator*>* /*iterators*/) override {
+ return Status::NotSupported("Not implemented");
+ }
+
+ using BlobDB::MultiGet;
+ virtual std::vector<Status> MultiGet(
+ const ReadOptions& read_options, const std::vector<Slice>& keys,
+ std::vector<std::string>* values) override;
+
+ using BlobDB::Write;
+ virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override;
+
+ virtual Status Close() override;
+
+ using BlobDB::PutWithTTL;
+ Status PutWithTTL(const WriteOptions& options, const Slice& key,
+ const Slice& value, uint64_t ttl) override;
+
+ using BlobDB::PutUntil;
+ Status PutUntil(const WriteOptions& options, const Slice& key,
+ const Slice& value, uint64_t expiration) override;
+
+ using BlobDB::CompactFiles;
+ Status CompactFiles(
+ const CompactionOptions& compact_options,
+ const std::vector<std::string>& input_file_names, const int output_level,
+ const int output_path_id = -1,
+ std::vector<std::string>* const output_file_names = nullptr,
+ CompactionJobInfo* compaction_job_info = nullptr) override;
+
+ BlobDBOptions GetBlobDBOptions() const override;
+
+ BlobDBImpl(const std::string& dbname, const BlobDBOptions& bdb_options,
+ const DBOptions& db_options,
+ const ColumnFamilyOptions& cf_options);
+
+ virtual Status DisableFileDeletions() override;
+
+ virtual Status EnableFileDeletions(bool force) override;
+
+ virtual Status GetLiveFiles(std::vector<std::string>&,
+ uint64_t* manifest_file_size,
+ bool flush_memtable = true) override;
+ virtual void GetLiveFilesMetaData(std::vector<LiveFileMetaData>*) override;
+
+ ~BlobDBImpl();
+
+ Status Open(std::vector<ColumnFamilyHandle*>* handles);
+
+ Status SyncBlobFiles() override;
+
+ // Common part of the two GetCompactionContext methods below.
+ // REQUIRES: read lock on mutex_
+ void GetCompactionContextCommon(BlobCompactionContext* context);
+
+ void GetCompactionContext(BlobCompactionContext* context);
+ void GetCompactionContext(BlobCompactionContext* context,
+ BlobCompactionContextGC* context_gc);
+
+#ifndef NDEBUG
+ Status TEST_GetBlobValue(const Slice& key, const Slice& index_entry,
+ PinnableSlice* value);
+
+ void TEST_AddDummyBlobFile(uint64_t blob_file_number,
+ SequenceNumber immutable_sequence);
+
+ std::vector<std::shared_ptr<BlobFile>> TEST_GetBlobFiles() const;
+
+ std::vector<std::shared_ptr<BlobFile>> TEST_GetLiveImmNonTTLFiles() const;
+
+ std::vector<std::shared_ptr<BlobFile>> TEST_GetObsoleteFiles() const;
+
+ Status TEST_CloseBlobFile(std::shared_ptr<BlobFile>& bfile);
+
+ void TEST_ObsoleteBlobFile(std::shared_ptr<BlobFile>& blob_file,
+ SequenceNumber obsolete_seq = 0,
+ bool update_size = true);
+
+ void TEST_EvictExpiredFiles();
+
+ void TEST_DeleteObsoleteFiles();
+
+ uint64_t TEST_live_sst_size();
+
+ const std::string& TEST_blob_dir() const { return blob_dir_; }
+
+ void TEST_InitializeBlobFileToSstMapping(
+ const std::vector<LiveFileMetaData>& live_files);
+
+ void TEST_ProcessFlushJobInfo(const FlushJobInfo& info);
+
+ void TEST_ProcessCompactionJobInfo(const CompactionJobInfo& info);
+
+#endif // !NDEBUG
+
+ private:
+ class BlobInserter;
+
+ // Create a snapshot if there isn't one in read options.
+ // Return true if a snapshot is created.
+ bool SetSnapshotIfNeeded(ReadOptions* read_options);
+
+ Status GetImpl(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value, uint64_t* expiration = nullptr);
+
+ Status GetBlobValue(const Slice& key, const Slice& index_entry,
+ PinnableSlice* value, uint64_t* expiration = nullptr);
+
+ Status GetRawBlobFromFile(const Slice& key, uint64_t file_number,
+ uint64_t offset, uint64_t size,
+ PinnableSlice* value,
+ CompressionType* compression_type);
+
+ Slice GetCompressedSlice(const Slice& raw,
+ std::string* compression_output) const;
+
+ Status DecompressSlice(const Slice& compressed_value,
+ CompressionType compression_type,
+ PinnableSlice* value_output) const;
+
+  // Close a file by appending a footer and remove it from the open files
+  // list.
+ // REQUIRES: lock held on write_mutex_, write lock held on both the db mutex_
+ // and the blob file's mutex_. If called on a blob file which is visible only
+ // to a single thread (like in the case of new files written during
+ // compaction/GC), the locks on write_mutex_ and the blob file's mutex_ can be
+ // avoided.
+ Status CloseBlobFile(std::shared_ptr<BlobFile> bfile);
+
+ // Close a file if its size exceeds blob_file_size
+ // REQUIRES: lock held on write_mutex_.
+ Status CloseBlobFileIfNeeded(std::shared_ptr<BlobFile>& bfile);
+
+  // Mark the file as obsolete and move it to the obsolete file list.
+  //
+  // REQUIRES: write lock held on mutex_, or called during DB open.
+ void ObsoleteBlobFile(std::shared_ptr<BlobFile> blob_file,
+ SequenceNumber obsolete_seq, bool update_size);
+
+ Status PutBlobValue(const WriteOptions& options, const Slice& key,
+ const Slice& value, uint64_t expiration,
+ WriteBatch* batch);
+
+ Status AppendBlob(const std::shared_ptr<BlobFile>& bfile,
+ const std::string& headerbuf, const Slice& key,
+ const Slice& value, uint64_t expiration,
+ std::string* index_entry);
+
+ // Create a new blob file and associated writer.
+ Status CreateBlobFileAndWriter(bool has_ttl,
+ const ExpirationRange& expiration_range,
+ const std::string& reason,
+ std::shared_ptr<BlobFile>* blob_file,
+ std::shared_ptr<BlobLogWriter>* writer);
+
+ // Get the open non-TTL blob log file, or create a new one if no such file
+ // exists.
+ Status SelectBlobFile(std::shared_ptr<BlobFile>* blob_file);
+
+ // Get the open TTL blob log file for a certain expiration, or create a new
+ // one if no such file exists.
+ Status SelectBlobFileTTL(uint64_t expiration,
+ std::shared_ptr<BlobFile>* blob_file);
+
+ std::shared_ptr<BlobFile> FindBlobFileLocked(uint64_t expiration) const;
+
+  // Periodic sanity check; runs a batch of consistency checks.
+ std::pair<bool, int64_t> SanityCheck(bool aborted);
+
+  // Delete files that have been marked obsolete (either because of TTL
+  // or GC). Check whether any snapshots still refer to them.
+ std::pair<bool, int64_t> DeleteObsoleteFiles(bool aborted);
+
+  // Periodically check whether open blob files' TTLs have expired; if so,
+  // close the sequential writer and make the file immutable.
+ std::pair<bool, int64_t> EvictExpiredFiles(bool aborted);
+
+  // If the number of open files approaches the ULIMIT, this task will close
+  // random-access readers, which are otherwise kept around for efficiency.
+ std::pair<bool, int64_t> ReclaimOpenFiles(bool aborted);
+
+ std::pair<bool, int64_t> RemoveTimerQ(TimerQueue* tq, bool aborted);
+
+ // Adds the background tasks to the timer queue
+ void StartBackgroundTasks();
+
+ // add a new Blob File
+ std::shared_ptr<BlobFile> NewBlobFile(bool has_ttl,
+ const ExpirationRange& expiration_range,
+ const std::string& reason);
+
+ // Register a new blob file.
+ // REQUIRES: write lock on mutex_.
+ void RegisterBlobFile(std::shared_ptr<BlobFile> blob_file);
+
+ // collect all the blob log files from the blob directory
+ Status GetAllBlobFiles(std::set<uint64_t>* file_numbers);
+
+ // Open all blob files found in blob_dir.
+ Status OpenAllBlobFiles();
+
+ // Link an SST to a blob file. Comes in locking and non-locking varieties
+ // (the latter is used during Open).
+ template <typename Linker>
+ void LinkSstToBlobFileImpl(uint64_t sst_file_number,
+ uint64_t blob_file_number, Linker linker);
+
+ void LinkSstToBlobFile(uint64_t sst_file_number, uint64_t blob_file_number);
+
+ void LinkSstToBlobFileNoLock(uint64_t sst_file_number,
+ uint64_t blob_file_number);
+
+ // Unlink an SST from a blob file.
+ void UnlinkSstFromBlobFile(uint64_t sst_file_number,
+ uint64_t blob_file_number);
+
+ // Initialize the mapping between blob files and SSTs during Open.
+ void InitializeBlobFileToSstMapping(
+ const std::vector<LiveFileMetaData>& live_files);
+
+ // Update the mapping between blob files and SSTs after a flush and mark
+ // any unneeded blob files obsolete.
+ void ProcessFlushJobInfo(const FlushJobInfo& info);
+
+ // Update the mapping between blob files and SSTs after a compaction and
+ // mark any unneeded blob files obsolete.
+ void ProcessCompactionJobInfo(const CompactionJobInfo& info);
+
+ // Mark an immutable non-TTL blob file obsolete assuming it has no more SSTs
+ // linked to it, and all memtables from before the blob file became immutable
+ // have been flushed. Note: should only be called if the condition holds for
+ // all lower-numbered non-TTL blob files as well.
+ bool MarkBlobFileObsoleteIfNeeded(const std::shared_ptr<BlobFile>& blob_file,
+ SequenceNumber obsolete_seq);
+
+ // Mark all immutable non-TTL blob files that aren't needed by any SSTs as
+ // obsolete. Comes in two varieties; the version used during Open need not
+ // worry about locking or snapshots.
+ template <class Functor>
+ void MarkUnreferencedBlobFilesObsoleteImpl(Functor mark_if_needed);
+
+ void MarkUnreferencedBlobFilesObsolete();
+ void MarkUnreferencedBlobFilesObsoleteDuringOpen();
+
+ void UpdateLiveSSTSize();
+
+ Status GetBlobFileReader(const std::shared_ptr<BlobFile>& blob_file,
+ std::shared_ptr<RandomAccessFileReader>* reader);
+
+  // Hold the file's write mutex before calling.
+  // Closes the random-access reader opened above.
+ void CloseRandomAccessLocked(const std::shared_ptr<BlobFile>& bfile);
+
+  // Hold the file's write mutex before calling.
+  // Creates a sequential (append) writer for this blob file.
+ Status CreateWriterLocked(const std::shared_ptr<BlobFile>& bfile);
+
+  // Returns a BlobLogWriter object for the file, creating one if it is not
+  // already present. Needs the write mutex to be held.
+ Status CheckOrCreateWriterLocked(const std::shared_ptr<BlobFile>& blob_file,
+ std::shared_ptr<BlobLogWriter>* writer);
+
+  // Check whether any active snapshot still references the blobs in the
+  // file, i.e. whether it is safe to delete the file.
+ bool VisibleToActiveSnapshot(const std::shared_ptr<BlobFile>& file);
+ bool FileDeleteOk_SnapshotCheckLocked(const std::shared_ptr<BlobFile>& bfile);
+
+ void CopyBlobFiles(std::vector<std::shared_ptr<BlobFile>>* bfiles_copy);
+
+ uint64_t EpochNow() { return clock_->NowMicros() / 1000000; }
+
+  // Check whether inserting a new blob will make the DB grow out of space.
+  // If is_fifo = true, FIFO eviction will be triggered to make room for the
+  // new blob. If force_evict = true, FIFO eviction will evict blob files
+  // even if doing so will not make enough room for the new blob.
+ Status CheckSizeAndEvictBlobFiles(uint64_t blob_size,
+ bool force_evict = false);
+
+ // name of the database directory
+ std::string dbname_;
+
+ // the base DB
+ DBImpl* db_impl_;
+ Env* env_;
+ SystemClock* clock_;
+ // the options that govern the behavior of Blob Storage
+ BlobDBOptions bdb_options_;
+ DBOptions db_options_;
+ ColumnFamilyOptions cf_options_;
+ FileOptions file_options_;
+
+  // Raw pointer to the statistics object; db_options_ holds a
+  // std::shared_ptr that owns it.
+ Statistics* statistics_;
+
+ // by default this is "blob_dir" under dbname_
+ // but can be configured
+ std::string blob_dir_;
+
+ // pointer to directory
+ std::unique_ptr<FSDirectory> dir_ent_;
+
+ // Read Write Mutex, which protects all the data structures
+ // HEAVILY TRAFFICKED
+ mutable port::RWMutex mutex_;
+
+  // Writers have to hold write_mutex_ before writing.
+ mutable port::Mutex write_mutex_;
+
+ // counter for blob file number
+ std::atomic<uint64_t> next_file_number_;
+
+  // In-memory metadata of all the blob files.
+ std::map<uint64_t, std::shared_ptr<BlobFile>> blob_files_;
+
+ // All live immutable non-TTL blob files.
+ std::map<uint64_t, std::shared_ptr<BlobFile>> live_imm_non_ttl_blob_files_;
+
+ // The largest sequence number that has been flushed.
+ SequenceNumber flush_sequence_;
+
+ // opened non-TTL blob file.
+ std::shared_ptr<BlobFile> open_non_ttl_file_;
+
+  // All the blob files which are currently being appended to, bucketed by
+  // the range of incoming TTLs.
+ std::set<std::shared_ptr<BlobFile>, BlobFileComparatorTTL> open_ttl_files_;
+
+ // Flag to check whether Close() has been called on this DB
+ bool closed_;
+
+ // timer based queue to execute tasks
+ TimerQueue tqueue_;
+
+  // Number of files opened for random access/GET; the counter is used to
+  // monitor and close excess random-access files.
+ std::atomic<uint32_t> open_file_count_;
+
+ // Total size of all live blob files (i.e. exclude obsolete files).
+ std::atomic<uint64_t> total_blob_size_;
+
+ // total size of SST files.
+ std::atomic<uint64_t> live_sst_size_;
+
+ // Latest FIFO eviction timestamp
+ //
+  // REQUIRES: access with mutex_ lock held.
+ uint64_t fifo_eviction_seq_;
+
+ // The expiration up to which latest FIFO eviction evicts.
+ //
+  // REQUIRES: access with mutex_ lock held.
+ uint64_t evict_expiration_up_to_;
+
+ std::list<std::shared_ptr<BlobFile>> obsolete_files_;
+
+  // DeleteObsoleteFiles, DisableFileDeletions and EnableFileDeletions block
+  // on this mutex to avoid contention.
+  //
+  // While DeleteObsoleteFiles holds both mutex_ and delete_file_mutex_, note
+  // the difference: mutex_ only needs to be held when accessing the data
+  // structures, whereas delete_file_mutex_ needs to be held for the whole
+  // duration of DeleteObsoleteFiles to avoid running simultaneously with
+  // DisableFileDeletions.
+  //
+  // If both mutex_ and delete_file_mutex_ need to be held, acquire
+  // delete_file_mutex_ first to avoid deadlock.
+ mutable port::Mutex delete_file_mutex_;
+
+  // Each call to DisableFileDeletions will increase disable_file_deletions_
+  // by 1. EnableFileDeletions will either decrease the count by 1 or reset
+  // it to zero, depending on the force flag.
+ //
+ // REQUIRES: access with delete_file_mutex_ held.
+ int disable_file_deletions_ = 0;
+
+ uint32_t debug_level_;
+};
+
+} // namespace blob_db
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
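The lock-ordering rule documented above for delete_file_mutex_ (acquire it before mutex_ when both are needed) can be summarized in a small sketch; the template and its Impl parameter are purely illustrative and assume both members are accessible:

    #include "util/mutexlock.h"

    template <typename Impl>
    void LockedMaintenanceSketch(Impl* impl) {
      // Held for the whole task so it cannot race DisableFileDeletions.
      ROCKSDB_NAMESPACE::MutexLock delete_file_lock(&impl->delete_file_mutex_);
      {
        // Held only while the shared data structures are being touched.
        ROCKSDB_NAMESPACE::WriteLock wl(&impl->mutex_);
        // ... mutate obsolete_files_, blob_files_, etc. ...
      }
    }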
diff --git a/src/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc b/src/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc
new file mode 100644
index 000000000..87e3f33cc
--- /dev/null
+++ b/src/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc
@@ -0,0 +1,113 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "util/cast_util.h"
+#include "util/mutexlock.h"
+#include "utilities/blob_db/blob_db_impl.h"
+
+// BlobDBImpl methods to get a snapshot of the files, e.g. for replication.
+
+namespace ROCKSDB_NAMESPACE {
+namespace blob_db {
+
+Status BlobDBImpl::DisableFileDeletions() {
+ // Disable base DB file deletions.
+ Status s = db_impl_->DisableFileDeletions();
+ if (!s.ok()) {
+ return s;
+ }
+
+ int count = 0;
+ {
+ // Hold delete_file_mutex_ to make sure no DeleteObsoleteFiles job
+ // is running.
+ MutexLock l(&delete_file_mutex_);
+ count = ++disable_file_deletions_;
+ }
+
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "Disabled blob file deletions. count: %d", count);
+ return Status::OK();
+}
+
+Status BlobDBImpl::EnableFileDeletions(bool force) {
+ // Enable base DB file deletions.
+ Status s = db_impl_->EnableFileDeletions(force);
+ if (!s.ok()) {
+ return s;
+ }
+
+ int count = 0;
+ {
+ MutexLock l(&delete_file_mutex_);
+ if (force) {
+ disable_file_deletions_ = 0;
+ } else if (disable_file_deletions_ > 0) {
+ count = --disable_file_deletions_;
+ }
+ assert(count >= 0);
+ }
+
+ ROCKS_LOG_INFO(db_options_.info_log, "Enabled blob file deletions. count: %d",
+ count);
+  // Consider triggering DeleteObsoleteFiles once after deletions are
+  // re-enabled, if we ever make the DeleteObsoleteFiles re-run interval
+  // configurable.
+ return Status::OK();
+}
+
+Status BlobDBImpl::GetLiveFiles(std::vector<std::string>& ret,
+ uint64_t* manifest_file_size,
+ bool flush_memtable) {
+ if (!bdb_options_.path_relative) {
+ return Status::NotSupported(
+ "Not able to get relative blob file path from absolute blob_dir.");
+ }
+ // Hold a lock in the beginning to avoid updates to base DB during the call
+ ReadLock rl(&mutex_);
+ Status s = db_->GetLiveFiles(ret, manifest_file_size, flush_memtable);
+ if (!s.ok()) {
+ return s;
+ }
+ ret.reserve(ret.size() + blob_files_.size());
+ for (auto bfile_pair : blob_files_) {
+ auto blob_file = bfile_pair.second;
+    // The path should be relative to db_name but begin with a slash.
+ ret.emplace_back(
+ BlobFileName("", bdb_options_.blob_dir, blob_file->BlobFileNumber()));
+ }
+ return Status::OK();
+}
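These overrides are meant to be used together when taking a file-level snapshot, e.g. for backup or replication. A hedged sketch of that flow, with the copy step elided and an illustrative helper name:

    #include <cstdint>
    #include <string>
    #include <vector>

    #include "utilities/blob_db/blob_db.h"

    // Pin the current file set, list it for copying, then re-enable deletions.
    ROCKSDB_NAMESPACE::Status SnapshotFilesSketch(
        ROCKSDB_NAMESPACE::blob_db::BlobDB* blob_db,
        std::vector<std::string>* files) {
      ROCKSDB_NAMESPACE::Status s = blob_db->DisableFileDeletions();
      if (!s.ok()) {
        return s;
      }
      uint64_t manifest_file_size = 0;
      s = blob_db->GetLiveFiles(*files, &manifest_file_size,
                                /*flush_memtable=*/true);
      // ... copy the listed SST, MANIFEST and blob files elsewhere ...
      ROCKSDB_NAMESPACE::Status enable_status =
          blob_db->EnableFileDeletions(/*force=*/false);
      return s.ok() ? enable_status : s;
    }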
+
+void BlobDBImpl::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
+ // Path should be relative to db_name.
+ assert(bdb_options_.path_relative);
+ // Hold a lock in the beginning to avoid updates to base DB during the call
+ ReadLock rl(&mutex_);
+ db_->GetLiveFilesMetaData(metadata);
+ for (auto bfile_pair : blob_files_) {
+ auto blob_file = bfile_pair.second;
+ LiveFileMetaData filemetadata;
+ filemetadata.size = blob_file->GetFileSize();
+ const uint64_t file_number = blob_file->BlobFileNumber();
+    // The path should be relative to db_name but begin with a slash.
+ filemetadata.name = BlobFileName("", bdb_options_.blob_dir, file_number);
+ filemetadata.file_number = file_number;
+ if (blob_file->HasTTL()) {
+ filemetadata.oldest_ancester_time = blob_file->GetExpirationRange().first;
+ }
+ auto cfh =
+ static_cast_with_check<ColumnFamilyHandleImpl>(DefaultColumnFamily());
+ filemetadata.column_family_name = cfh->GetName();
+ metadata->emplace_back(filemetadata);
+ }
+}
+
+} // namespace blob_db
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/blob_db/blob_db_iterator.h b/src/rocksdb/utilities/blob_db/blob_db_iterator.h
new file mode 100644
index 000000000..fd2b2f8f5
--- /dev/null
+++ b/src/rocksdb/utilities/blob_db/blob_db_iterator.h
@@ -0,0 +1,150 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "db/arena_wrapped_db_iter.h"
+#include "rocksdb/iterator.h"
+#include "util/stop_watch.h"
+#include "utilities/blob_db/blob_db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Statistics;
+class SystemClock;
+
+namespace blob_db {
+
+using ROCKSDB_NAMESPACE::ManagedSnapshot;
+
+class BlobDBIterator : public Iterator {
+ public:
+ BlobDBIterator(ManagedSnapshot* snapshot, ArenaWrappedDBIter* iter,
+ BlobDBImpl* blob_db, SystemClock* clock,
+ Statistics* statistics)
+ : snapshot_(snapshot),
+ iter_(iter),
+ blob_db_(blob_db),
+ clock_(clock),
+ statistics_(statistics) {}
+
+ virtual ~BlobDBIterator() = default;
+
+ bool Valid() const override {
+ if (!iter_->Valid()) {
+ return false;
+ }
+ return status_.ok();
+ }
+
+ Status status() const override {
+ if (!iter_->status().ok()) {
+ return iter_->status();
+ }
+ return status_;
+ }
+
+ void SeekToFirst() override {
+ StopWatch seek_sw(clock_, statistics_, BLOB_DB_SEEK_MICROS);
+ RecordTick(statistics_, BLOB_DB_NUM_SEEK);
+ iter_->SeekToFirst();
+ while (UpdateBlobValue()) {
+ iter_->Next();
+ }
+ }
+
+ void SeekToLast() override {
+ StopWatch seek_sw(clock_, statistics_, BLOB_DB_SEEK_MICROS);
+ RecordTick(statistics_, BLOB_DB_NUM_SEEK);
+ iter_->SeekToLast();
+ while (UpdateBlobValue()) {
+ iter_->Prev();
+ }
+ }
+
+ void Seek(const Slice& target) override {
+ StopWatch seek_sw(clock_, statistics_, BLOB_DB_SEEK_MICROS);
+ RecordTick(statistics_, BLOB_DB_NUM_SEEK);
+ iter_->Seek(target);
+ while (UpdateBlobValue()) {
+ iter_->Next();
+ }
+ }
+
+ void SeekForPrev(const Slice& target) override {
+ StopWatch seek_sw(clock_, statistics_, BLOB_DB_SEEK_MICROS);
+ RecordTick(statistics_, BLOB_DB_NUM_SEEK);
+ iter_->SeekForPrev(target);
+ while (UpdateBlobValue()) {
+ iter_->Prev();
+ }
+ }
+
+ void Next() override {
+ assert(Valid());
+ StopWatch next_sw(clock_, statistics_, BLOB_DB_NEXT_MICROS);
+ RecordTick(statistics_, BLOB_DB_NUM_NEXT);
+ iter_->Next();
+ while (UpdateBlobValue()) {
+ iter_->Next();
+ }
+ }
+
+ void Prev() override {
+ assert(Valid());
+ StopWatch prev_sw(clock_, statistics_, BLOB_DB_PREV_MICROS);
+ RecordTick(statistics_, BLOB_DB_NUM_PREV);
+ iter_->Prev();
+ while (UpdateBlobValue()) {
+ iter_->Prev();
+ }
+ }
+
+ Slice key() const override {
+ assert(Valid());
+ return iter_->key();
+ }
+
+ Slice value() const override {
+ assert(Valid());
+ if (!iter_->IsBlob()) {
+ return iter_->value();
+ }
+ return value_;
+ }
+
+ // Iterator::Refresh() not supported.
+
+ private:
+ // Return true if caller should continue to next value.
+ bool UpdateBlobValue() {
+ value_.Reset();
+ status_ = Status::OK();
+ if (iter_->Valid() && iter_->status().ok() && iter_->IsBlob()) {
+ Status s = blob_db_->GetBlobValue(iter_->key(), iter_->value(), &value_);
+ if (s.IsNotFound()) {
+ return true;
+ } else {
+ if (!s.ok()) {
+ status_ = s;
+ }
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+
+ std::unique_ptr<ManagedSnapshot> snapshot_;
+ std::unique_ptr<ArenaWrappedDBIter> iter_;
+ BlobDBImpl* blob_db_;
+ SystemClock* clock_;
+ Statistics* statistics_;
+ Status status_;
+ PinnableSlice value_;
+};
+} // namespace blob_db
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
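A hedged sketch of how this iterator is typically consumed through the public BlobDB handle; HandleEntry is a hypothetical callback and error handling is trimmed:

    #include <cassert>
    #include <memory>

    #include "utilities/blob_db/blob_db.h"

    void IterateSketch(ROCKSDB_NAMESPACE::blob_db::BlobDB* blob_db,
                       void (*HandleEntry)(const ROCKSDB_NAMESPACE::Slice&,
                                           const ROCKSDB_NAMESPACE::Slice&)) {
      std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it(
          blob_db->NewIterator(ROCKSDB_NAMESPACE::ReadOptions()));
      for (it->SeekToFirst(); it->Valid(); it->Next()) {
        // Expired or missing blobs are skipped transparently by
        // UpdateBlobValue(); read errors surface through status().
        HandleEntry(it->key(), it->value());
      }
      assert(it->status().ok());
    }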
diff --git a/src/rocksdb/utilities/blob_db/blob_db_listener.h b/src/rocksdb/utilities/blob_db/blob_db_listener.h
new file mode 100644
index 000000000..d17d29853
--- /dev/null
+++ b/src/rocksdb/utilities/blob_db/blob_db_listener.h
@@ -0,0 +1,71 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <atomic>
+
+#include "rocksdb/listener.h"
+#include "util/mutexlock.h"
+#include "utilities/blob_db/blob_db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace blob_db {
+
+class BlobDBListener : public EventListener {
+ public:
+ explicit BlobDBListener(BlobDBImpl* blob_db_impl)
+ : blob_db_impl_(blob_db_impl) {}
+
+ void OnFlushBegin(DB* /*db*/, const FlushJobInfo& /*info*/) override {
+ assert(blob_db_impl_ != nullptr);
+ blob_db_impl_->SyncBlobFiles();
+ }
+
+ void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& /*info*/) override {
+ assert(blob_db_impl_ != nullptr);
+ blob_db_impl_->UpdateLiveSSTSize();
+ }
+
+ void OnCompactionCompleted(DB* /*db*/,
+ const CompactionJobInfo& /*info*/) override {
+ assert(blob_db_impl_ != nullptr);
+ blob_db_impl_->UpdateLiveSSTSize();
+ }
+
+ const char* Name() const override { return kClassName(); }
+ static const char* kClassName() { return "BlobDBListener"; }
+
+ protected:
+ BlobDBImpl* blob_db_impl_;
+};
+
+class BlobDBListenerGC : public BlobDBListener {
+ public:
+ explicit BlobDBListenerGC(BlobDBImpl* blob_db_impl)
+ : BlobDBListener(blob_db_impl) {}
+
+ const char* Name() const override { return kClassName(); }
+ static const char* kClassName() { return "BlobDBListenerGC"; }
+ void OnFlushCompleted(DB* db, const FlushJobInfo& info) override {
+ BlobDBListener::OnFlushCompleted(db, info);
+
+ assert(blob_db_impl_);
+ blob_db_impl_->ProcessFlushJobInfo(info);
+ }
+
+ void OnCompactionCompleted(DB* db, const CompactionJobInfo& info) override {
+ BlobDBListener::OnCompactionCompleted(db, info);
+
+ assert(blob_db_impl_);
+ blob_db_impl_->ProcessCompactionJobInfo(info);
+ }
+};
+
+} // namespace blob_db
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
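These listeners observe the base DB, so they have to be registered in the base DB's options before it is opened; that wiring presumably happens inside BlobDBImpl::Open, which is not part of this hunk. A minimal sketch of the registration, with illustrative names:

    #include <memory>

    #include "rocksdb/options.h"
    #include "utilities/blob_db/blob_db_listener.h"

    void AttachListenerSketch(
        ROCKSDB_NAMESPACE::DBOptions* db_options,
        ROCKSDB_NAMESPACE::blob_db::BlobDBImpl* blob_db_impl) {
      // The listener rides on the base DB's flush/compaction notifications.
      db_options->listeners.push_back(
          std::make_shared<ROCKSDB_NAMESPACE::blob_db::BlobDBListener>(
              blob_db_impl));
    }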
diff --git a/src/rocksdb/utilities/blob_db/blob_db_test.cc b/src/rocksdb/utilities/blob_db/blob_db_test.cc
new file mode 100644
index 000000000..e392962b2
--- /dev/null
+++ b/src/rocksdb/utilities/blob_db/blob_db_test.cc
@@ -0,0 +1,2407 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/blob_db/blob_db.h"
+
+#include <algorithm>
+#include <chrono>
+#include <cstdlib>
+#include <iomanip>
+#include <map>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "db/blob/blob_index.h"
+#include "db/db_test_util.h"
+#include "env/composite_env_wrapper.h"
+#include "file/file_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "port/port.h"
+#include "rocksdb/utilities/debug.h"
+#include "test_util/mock_time_env.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/blob_db/blob_db_impl.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace blob_db {
+
+class BlobDBTest : public testing::Test {
+ public:
+ const int kMaxBlobSize = 1 << 14;
+
+ struct BlobIndexVersion {
+ BlobIndexVersion() = default;
+ BlobIndexVersion(std::string _user_key, uint64_t _file_number,
+ uint64_t _expiration, SequenceNumber _sequence,
+ ValueType _type)
+ : user_key(std::move(_user_key)),
+ file_number(_file_number),
+ expiration(_expiration),
+ sequence(_sequence),
+ type(_type) {}
+
+ std::string user_key;
+ uint64_t file_number = kInvalidBlobFileNumber;
+ uint64_t expiration = kNoExpiration;
+ SequenceNumber sequence = 0;
+ ValueType type = kTypeValue;
+ };
+
+ BlobDBTest()
+ : dbname_(test::PerThreadDBPath("blob_db_test")), blob_db_(nullptr) {
+ mock_clock_ = std::make_shared<MockSystemClock>(SystemClock::Default());
+ mock_env_.reset(new CompositeEnvWrapper(Env::Default(), mock_clock_));
+ fault_injection_env_.reset(new FaultInjectionTestEnv(Env::Default()));
+
+ Status s = DestroyBlobDB(dbname_, Options(), BlobDBOptions());
+ assert(s.ok());
+ }
+
+ ~BlobDBTest() override {
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ Destroy();
+ }
+
+ Status TryOpen(BlobDBOptions bdb_options = BlobDBOptions(),
+ Options options = Options()) {
+ options.create_if_missing = true;
+ if (options.env == mock_env_.get()) {
+ // Need to disable stats dumping and persisting which also use
+ // RepeatableThread, which uses InstrumentedCondVar::TimedWaitInternal.
+ // With mocked time, this can hang on some platforms (MacOS)
+ // because (a) on some platforms, pthread_cond_timedwait does not appear
+ // to release the lock for other threads to operate if the deadline time
+ // is already passed, and (b) TimedWait calls are currently a bad
+ // abstraction because the deadline parameter is usually computed from
+ // Env time, but is interpreted in real clock time.
+ options.stats_dump_period_sec = 0;
+ options.stats_persist_period_sec = 0;
+ }
+ return BlobDB::Open(options, bdb_options, dbname_, &blob_db_);
+ }
+
+ void Open(BlobDBOptions bdb_options = BlobDBOptions(),
+ Options options = Options()) {
+ ASSERT_OK(TryOpen(bdb_options, options));
+ }
+
+ void Reopen(BlobDBOptions bdb_options = BlobDBOptions(),
+ Options options = Options()) {
+ assert(blob_db_ != nullptr);
+ delete blob_db_;
+ blob_db_ = nullptr;
+ Open(bdb_options, options);
+ }
+
+ void Close() {
+ assert(blob_db_ != nullptr);
+ delete blob_db_;
+ blob_db_ = nullptr;
+ }
+
+ void Destroy() {
+ if (blob_db_) {
+ Options options = blob_db_->GetOptions();
+ BlobDBOptions bdb_options = blob_db_->GetBlobDBOptions();
+ delete blob_db_;
+ blob_db_ = nullptr;
+ ASSERT_OK(DestroyBlobDB(dbname_, options, bdb_options));
+ }
+ }
+
+ BlobDBImpl *blob_db_impl() {
+ return reinterpret_cast<BlobDBImpl *>(blob_db_);
+ }
+
+ Status Put(const Slice &key, const Slice &value,
+ std::map<std::string, std::string> *data = nullptr) {
+ Status s = blob_db_->Put(WriteOptions(), key, value);
+ if (data != nullptr) {
+ (*data)[key.ToString()] = value.ToString();
+ }
+ return s;
+ }
+
+ void Delete(const std::string &key,
+ std::map<std::string, std::string> *data = nullptr) {
+ ASSERT_OK(blob_db_->Delete(WriteOptions(), key));
+ if (data != nullptr) {
+ data->erase(key);
+ }
+ }
+
+ Status PutWithTTL(const Slice &key, const Slice &value, uint64_t ttl,
+ std::map<std::string, std::string> *data = nullptr) {
+ Status s = blob_db_->PutWithTTL(WriteOptions(), key, value, ttl);
+ if (data != nullptr) {
+ (*data)[key.ToString()] = value.ToString();
+ }
+ return s;
+ }
+
+ Status PutUntil(const Slice &key, const Slice &value, uint64_t expiration) {
+ return blob_db_->PutUntil(WriteOptions(), key, value, expiration);
+ }
+
+ void PutRandomWithTTL(const std::string &key, uint64_t ttl, Random *rnd,
+ std::map<std::string, std::string> *data = nullptr) {
+ int len = rnd->Next() % kMaxBlobSize + 1;
+ std::string value = rnd->HumanReadableString(len);
+ ASSERT_OK(
+ blob_db_->PutWithTTL(WriteOptions(), Slice(key), Slice(value), ttl));
+ if (data != nullptr) {
+ (*data)[key] = value;
+ }
+ }
+
+ void PutRandomUntil(const std::string &key, uint64_t expiration, Random *rnd,
+ std::map<std::string, std::string> *data = nullptr) {
+ int len = rnd->Next() % kMaxBlobSize + 1;
+ std::string value = rnd->HumanReadableString(len);
+ ASSERT_OK(blob_db_->PutUntil(WriteOptions(), Slice(key), Slice(value),
+ expiration));
+ if (data != nullptr) {
+ (*data)[key] = value;
+ }
+ }
+
+ void PutRandom(const std::string &key, Random *rnd,
+ std::map<std::string, std::string> *data = nullptr) {
+ PutRandom(blob_db_, key, rnd, data);
+ }
+
+ void PutRandom(DB *db, const std::string &key, Random *rnd,
+ std::map<std::string, std::string> *data = nullptr) {
+ int len = rnd->Next() % kMaxBlobSize + 1;
+ std::string value = rnd->HumanReadableString(len);
+ ASSERT_OK(db->Put(WriteOptions(), Slice(key), Slice(value)));
+ if (data != nullptr) {
+ (*data)[key] = value;
+ }
+ }
+
+ void PutRandomToWriteBatch(
+ const std::string &key, Random *rnd, WriteBatch *batch,
+ std::map<std::string, std::string> *data = nullptr) {
+ int len = rnd->Next() % kMaxBlobSize + 1;
+ std::string value = rnd->HumanReadableString(len);
+ ASSERT_OK(batch->Put(key, value));
+ if (data != nullptr) {
+ (*data)[key] = value;
+ }
+ }
+
+  // Verify that the blob DB contains the expected data and nothing more.
+ void VerifyDB(const std::map<std::string, std::string> &data) {
+ VerifyDB(blob_db_, data);
+ }
+
+ void VerifyDB(DB *db, const std::map<std::string, std::string> &data) {
+ // Verify normal Get
+ auto *cfh = db->DefaultColumnFamily();
+ for (auto &p : data) {
+ PinnableSlice value_slice;
+ ASSERT_OK(db->Get(ReadOptions(), cfh, p.first, &value_slice));
+ ASSERT_EQ(p.second, value_slice.ToString());
+ std::string value;
+ ASSERT_OK(db->Get(ReadOptions(), cfh, p.first, &value));
+ ASSERT_EQ(p.second, value);
+ }
+
+ // Verify iterators
+ Iterator *iter = db->NewIterator(ReadOptions());
+ iter->SeekToFirst();
+ for (auto &p : data) {
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(p.first, iter->key().ToString());
+ ASSERT_EQ(p.second, iter->value().ToString());
+ iter->Next();
+ }
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ delete iter;
+ }
+
+ void VerifyBaseDB(
+ const std::map<std::string, KeyVersion> &expected_versions) {
+ auto *bdb_impl = static_cast<BlobDBImpl *>(blob_db_);
+ DB *db = blob_db_->GetRootDB();
+ const size_t kMaxKeys = 10000;
+ std::vector<KeyVersion> versions;
+ ASSERT_OK(GetAllKeyVersions(db, "", "", kMaxKeys, &versions));
+ ASSERT_EQ(expected_versions.size(), versions.size());
+ size_t i = 0;
+ for (auto &key_version : expected_versions) {
+ const KeyVersion &expected_version = key_version.second;
+ ASSERT_EQ(expected_version.user_key, versions[i].user_key);
+ ASSERT_EQ(expected_version.sequence, versions[i].sequence);
+ ASSERT_EQ(expected_version.type, versions[i].type);
+ if (versions[i].type == kTypeValue) {
+ ASSERT_EQ(expected_version.value, versions[i].value);
+ } else {
+ ASSERT_EQ(kTypeBlobIndex, versions[i].type);
+ PinnableSlice value;
+ ASSERT_OK(bdb_impl->TEST_GetBlobValue(versions[i].user_key,
+ versions[i].value, &value));
+ ASSERT_EQ(expected_version.value, value.ToString());
+ }
+ i++;
+ }
+ }
+
+ void VerifyBaseDBBlobIndex(
+ const std::map<std::string, BlobIndexVersion> &expected_versions) {
+ const size_t kMaxKeys = 10000;
+ std::vector<KeyVersion> versions;
+ ASSERT_OK(
+ GetAllKeyVersions(blob_db_->GetRootDB(), "", "", kMaxKeys, &versions));
+ ASSERT_EQ(versions.size(), expected_versions.size());
+
+ size_t i = 0;
+ for (const auto &expected_pair : expected_versions) {
+ const BlobIndexVersion &expected_version = expected_pair.second;
+
+ ASSERT_EQ(versions[i].user_key, expected_version.user_key);
+ ASSERT_EQ(versions[i].sequence, expected_version.sequence);
+ ASSERT_EQ(versions[i].type, expected_version.type);
+ if (versions[i].type != kTypeBlobIndex) {
+ ASSERT_EQ(kInvalidBlobFileNumber, expected_version.file_number);
+ ASSERT_EQ(kNoExpiration, expected_version.expiration);
+
+ ++i;
+ continue;
+ }
+
+ BlobIndex blob_index;
+ ASSERT_OK(blob_index.DecodeFrom(versions[i].value));
+
+ const uint64_t file_number = !blob_index.IsInlined()
+ ? blob_index.file_number()
+ : kInvalidBlobFileNumber;
+ ASSERT_EQ(file_number, expected_version.file_number);
+
+ const uint64_t expiration =
+ blob_index.HasTTL() ? blob_index.expiration() : kNoExpiration;
+ ASSERT_EQ(expiration, expected_version.expiration);
+
+ ++i;
+ }
+ }
+
+ void InsertBlobs() {
+ WriteOptions wo;
+ std::string value;
+
+ Random rnd(301);
+ for (size_t i = 0; i < 100000; i++) {
+ uint64_t ttl = rnd.Next() % 86400;
+ PutRandomWithTTL("key" + std::to_string(i % 500), ttl, &rnd, nullptr);
+ }
+
+ for (size_t i = 0; i < 10; i++) {
+ Delete("key" + std::to_string(i % 500));
+ }
+ }
+
+ const std::string dbname_;
+ std::shared_ptr<MockSystemClock> mock_clock_;
+ std::unique_ptr<Env> mock_env_;
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env_;
+ BlobDB *blob_db_;
+}; // class BlobDBTest
+
+TEST_F(BlobDBTest, Put) {
+ Random rnd(301);
+ BlobDBOptions bdb_options;
+ bdb_options.min_blob_size = 0;
+ bdb_options.disable_background_tasks = true;
+ Open(bdb_options);
+ std::map<std::string, std::string> data;
+ for (size_t i = 0; i < 100; i++) {
+ PutRandom("key" + std::to_string(i), &rnd, &data);
+ }
+ VerifyDB(data);
+}
+
+TEST_F(BlobDBTest, PutWithTTL) {
+ Random rnd(301);
+ Options options;
+ options.env = mock_env_.get();
+ BlobDBOptions bdb_options;
+ bdb_options.ttl_range_secs = 1000;
+ bdb_options.min_blob_size = 0;
+ bdb_options.blob_file_size = 256 * 1000 * 1000;
+ bdb_options.disable_background_tasks = true;
+ Open(bdb_options, options);
+ std::map<std::string, std::string> data;
+ mock_clock_->SetCurrentTime(50);
+ for (size_t i = 0; i < 100; i++) {
+ uint64_t ttl = rnd.Next() % 100;
+ PutRandomWithTTL("key" + std::to_string(i), ttl, &rnd,
+ (ttl <= 50 ? nullptr : &data));
+ }
+ mock_clock_->SetCurrentTime(100);
+ auto *bdb_impl = static_cast<BlobDBImpl *>(blob_db_);
+ auto blob_files = bdb_impl->TEST_GetBlobFiles();
+ ASSERT_EQ(1, blob_files.size());
+ ASSERT_TRUE(blob_files[0]->HasTTL());
+ ASSERT_OK(bdb_impl->TEST_CloseBlobFile(blob_files[0]));
+ VerifyDB(data);
+}
+
+TEST_F(BlobDBTest, PutUntil) {
+ Random rnd(301);
+ Options options;
+ options.env = mock_env_.get();
+ BlobDBOptions bdb_options;
+ bdb_options.ttl_range_secs = 1000;
+ bdb_options.min_blob_size = 0;
+ bdb_options.blob_file_size = 256 * 1000 * 1000;
+ bdb_options.disable_background_tasks = true;
+ Open(bdb_options, options);
+ std::map<std::string, std::string> data;
+ mock_clock_->SetCurrentTime(50);
+ for (size_t i = 0; i < 100; i++) {
+ uint64_t expiration = rnd.Next() % 100 + 50;
+ PutRandomUntil("key" + std::to_string(i), expiration, &rnd,
+ (expiration <= 100 ? nullptr : &data));
+ }
+ mock_clock_->SetCurrentTime(100);
+ auto *bdb_impl = static_cast<BlobDBImpl *>(blob_db_);
+ auto blob_files = bdb_impl->TEST_GetBlobFiles();
+ ASSERT_EQ(1, blob_files.size());
+ ASSERT_TRUE(blob_files[0]->HasTTL());
+ ASSERT_OK(bdb_impl->TEST_CloseBlobFile(blob_files[0]));
+ VerifyDB(data);
+}
+
+TEST_F(BlobDBTest, StackableDBGet) {
+ Random rnd(301);
+ BlobDBOptions bdb_options;
+ bdb_options.min_blob_size = 0;
+ bdb_options.disable_background_tasks = true;
+ Open(bdb_options);
+ std::map<std::string, std::string> data;
+ for (size_t i = 0; i < 100; i++) {
+ PutRandom("key" + std::to_string(i), &rnd, &data);
+ }
+ for (size_t i = 0; i < 100; i++) {
+ StackableDB *db = blob_db_;
+ ColumnFamilyHandle *column_family = db->DefaultColumnFamily();
+ std::string key = "key" + std::to_string(i);
+ PinnableSlice pinnable_value;
+ ASSERT_OK(db->Get(ReadOptions(), column_family, key, &pinnable_value));
+ std::string string_value;
+ ASSERT_OK(db->Get(ReadOptions(), column_family, key, &string_value));
+ ASSERT_EQ(string_value, pinnable_value.ToString());
+ ASSERT_EQ(string_value, data[key]);
+ }
+}
+
+TEST_F(BlobDBTest, GetExpiration) {
+ Options options;
+ options.env = mock_env_.get();
+ BlobDBOptions bdb_options;
+ bdb_options.disable_background_tasks = true;
+ mock_clock_->SetCurrentTime(100);
+ Open(bdb_options, options);
+ ASSERT_OK(Put("key1", "value1"));
+ ASSERT_OK(PutWithTTL("key2", "value2", 200));
+ PinnableSlice value;
+ uint64_t expiration;
+ ASSERT_OK(blob_db_->Get(ReadOptions(), "key1", &value, &expiration));
+ ASSERT_EQ("value1", value.ToString());
+ ASSERT_EQ(kNoExpiration, expiration);
+ ASSERT_OK(blob_db_->Get(ReadOptions(), "key2", &value, &expiration));
+ ASSERT_EQ("value2", value.ToString());
+ ASSERT_EQ(300 /* = 100 + 200 */, expiration);
+}
+
+TEST_F(BlobDBTest, GetIOError) {
+ Options options;
+ options.env = fault_injection_env_.get();
+ BlobDBOptions bdb_options;
+  bdb_options.min_blob_size = 0;  // Make sure values are written to blob files
+ bdb_options.disable_background_tasks = true;
+ Open(bdb_options, options);
+ ColumnFamilyHandle *column_family = blob_db_->DefaultColumnFamily();
+ PinnableSlice value;
+ ASSERT_OK(Put("foo", "bar"));
+ fault_injection_env_->SetFilesystemActive(false, Status::IOError());
+ Status s = blob_db_->Get(ReadOptions(), column_family, "foo", &value);
+ ASSERT_TRUE(s.IsIOError());
+ // Reactivate file system to allow test to close DB.
+ fault_injection_env_->SetFilesystemActive(true);
+}
+
+TEST_F(BlobDBTest, PutIOError) {
+ Options options;
+ options.env = fault_injection_env_.get();
+ BlobDBOptions bdb_options;
+  bdb_options.min_blob_size = 0;  // Make sure values are written to blob files
+ bdb_options.disable_background_tasks = true;
+ Open(bdb_options, options);
+ fault_injection_env_->SetFilesystemActive(false, Status::IOError());
+ ASSERT_TRUE(Put("foo", "v1").IsIOError());
+ fault_injection_env_->SetFilesystemActive(true, Status::IOError());
+ ASSERT_OK(Put("bar", "v1"));
+}
+
+TEST_F(BlobDBTest, WriteBatch) {
+ Random rnd(301);
+ BlobDBOptions bdb_options;
+ bdb_options.min_blob_size = 0;
+ bdb_options.disable_background_tasks = true;
+ Open(bdb_options);
+ std::map<std::string, std::string> data;
+ for (size_t i = 0; i < 100; i++) {
+ WriteBatch batch;
+ for (size_t j = 0; j < 10; j++) {
+ PutRandomToWriteBatch("key" + std::to_string(j * 100 + i), &rnd, &batch,
+ &data);
+ }
+
+ ASSERT_OK(blob_db_->Write(WriteOptions(), &batch));
+ }
+ VerifyDB(data);
+}
+
+TEST_F(BlobDBTest, Delete) {
+ Random rnd(301);
+ BlobDBOptions bdb_options;
+ bdb_options.min_blob_size = 0;
+ bdb_options.disable_background_tasks = true;
+ Open(bdb_options);
+ std::map<std::string, std::string> data;
+ for (size_t i = 0; i < 100; i++) {
+ PutRandom("key" + std::to_string(i), &rnd, &data);
+ }
+ for (size_t i = 0; i < 100; i += 5) {
+ Delete("key" + std::to_string(i), &data);
+ }
+ VerifyDB(data);
+}
+
+TEST_F(BlobDBTest, DeleteBatch) {
+ Random rnd(301);
+ BlobDBOptions bdb_options;
+ bdb_options.min_blob_size = 0;
+ bdb_options.disable_background_tasks = true;
+ Open(bdb_options);
+ for (size_t i = 0; i < 100; i++) {
+ PutRandom("key" + std::to_string(i), &rnd);
+ }
+ WriteBatch batch;
+ for (size_t i = 0; i < 100; i++) {
+ ASSERT_OK(batch.Delete("key" + std::to_string(i)));
+ }
+ ASSERT_OK(blob_db_->Write(WriteOptions(), &batch));
+ // DB should be empty.
+ VerifyDB({});
+}
+
+TEST_F(BlobDBTest, Override) {
+ Random rnd(301);
+ BlobDBOptions bdb_options;
+ bdb_options.min_blob_size = 0;
+ bdb_options.disable_background_tasks = true;
+ Open(bdb_options);
+ std::map<std::string, std::string> data;
+ for (int i = 0; i < 10000; i++) {
+ PutRandom("key" + std::to_string(i), &rnd, nullptr);
+ }
+ // override all the keys
+ for (int i = 0; i < 10000; i++) {
+ PutRandom("key" + std::to_string(i), &rnd, &data);
+ }
+ VerifyDB(data);
+}
+
+#ifdef SNAPPY
+TEST_F(BlobDBTest, Compression) {
+ Random rnd(301);
+ BlobDBOptions bdb_options;
+ bdb_options.min_blob_size = 0;
+ bdb_options.disable_background_tasks = true;
+ bdb_options.compression = CompressionType::kSnappyCompression;
+ Open(bdb_options);
+ std::map<std::string, std::string> data;
+ for (size_t i = 0; i < 100; i++) {
+ PutRandom("put-key" + std::to_string(i), &rnd, &data);
+ }
+ for (int i = 0; i < 100; i++) {
+ WriteBatch batch;
+ for (size_t j = 0; j < 10; j++) {
+ PutRandomToWriteBatch("write-batch-key" + std::to_string(j * 100 + i),
+ &rnd, &batch, &data);
+ }
+ ASSERT_OK(blob_db_->Write(WriteOptions(), &batch));
+ }
+ VerifyDB(data);
+}
+
+TEST_F(BlobDBTest, DecompressAfterReopen) {
+ Random rnd(301);
+ BlobDBOptions bdb_options;
+ bdb_options.min_blob_size = 0;
+ bdb_options.disable_background_tasks = true;
+ bdb_options.compression = CompressionType::kSnappyCompression;
+ Open(bdb_options);
+ std::map<std::string, std::string> data;
+ for (size_t i = 0; i < 100; i++) {
+ PutRandom("put-key" + std::to_string(i), &rnd, &data);
+ }
+ VerifyDB(data);
+ bdb_options.compression = CompressionType::kNoCompression;
+ Reopen(bdb_options);
+ VerifyDB(data);
+}
+
+TEST_F(BlobDBTest, EnableDisableCompressionGC) {
+ Random rnd(301);
+ BlobDBOptions bdb_options;
+ bdb_options.min_blob_size = 0;
+ bdb_options.garbage_collection_cutoff = 1.0;
+ bdb_options.disable_background_tasks = true;
+ bdb_options.compression = kSnappyCompression;
+ Open(bdb_options);
+ std::map<std::string, std::string> data;
+ size_t data_idx = 0;
+ for (; data_idx < 100; data_idx++) {
+ PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
+ }
+ VerifyDB(data);
+ auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
+ ASSERT_EQ(1, blob_files.size());
+ ASSERT_EQ(kSnappyCompression, blob_files[0]->GetCompressionType());
+
+ // disable compression
+ bdb_options.compression = kNoCompression;
+ Reopen(bdb_options);
+
+ // Add more data with new compression type
+ for (; data_idx < 200; data_idx++) {
+ PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
+ }
+ VerifyDB(data);
+
+ blob_files = blob_db_impl()->TEST_GetBlobFiles();
+ ASSERT_EQ(2, blob_files.size());
+ ASSERT_EQ(kNoCompression, blob_files[1]->GetCompressionType());
+
+ // Enable GC. If we do it earlier the snapshot release triggered compaction
+ // may compact files and trigger GC before we can verify there are two files.
+ bdb_options.enable_garbage_collection = true;
+ Reopen(bdb_options);
+
+ // Trigger compaction
+ ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ blob_db_impl()->TEST_DeleteObsoleteFiles();
+ VerifyDB(data);
+
+ blob_files = blob_db_impl()->TEST_GetBlobFiles();
+ for (auto bfile : blob_files) {
+ ASSERT_EQ(kNoCompression, bfile->GetCompressionType());
+ }
+
+ // enabling the compression again
+ bdb_options.compression = kSnappyCompression;
+ Reopen(bdb_options);
+
+ // Add more data with new compression type
+ for (; data_idx < 300; data_idx++) {
+ PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
+ }
+ VerifyDB(data);
+
+ // Trigger compaction
+ ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ blob_db_impl()->TEST_DeleteObsoleteFiles();
+ VerifyDB(data);
+
+ blob_files = blob_db_impl()->TEST_GetBlobFiles();
+ for (auto bfile : blob_files) {
+ ASSERT_EQ(kSnappyCompression, bfile->GetCompressionType());
+ }
+}
+
+#ifdef LZ4
+// Test switch compression types and run GC, it needs both Snappy and LZ4
+// support.
+TEST_F(BlobDBTest, ChangeCompressionGC) {
+ Random rnd(301);
+ BlobDBOptions bdb_options;
+ bdb_options.min_blob_size = 0;
+ bdb_options.garbage_collection_cutoff = 1.0;
+ bdb_options.disable_background_tasks = true;
+ bdb_options.compression = kLZ4Compression;
+ Open(bdb_options);
+ std::map<std::string, std::string> data;
+ size_t data_idx = 0;
+ for (; data_idx < 100; data_idx++) {
+ PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
+ }
+ VerifyDB(data);
+ auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
+ ASSERT_EQ(1, blob_files.size());
+ ASSERT_EQ(kLZ4Compression, blob_files[0]->GetCompressionType());
+
+ // Change compression type
+ bdb_options.compression = kSnappyCompression;
+ Reopen(bdb_options);
+
+ // Add more data with Snappy compression type
+ for (; data_idx < 200; data_idx++) {
+ PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
+ }
+ VerifyDB(data);
+
+ // Verify blob file compression type
+ blob_files = blob_db_impl()->TEST_GetBlobFiles();
+ ASSERT_EQ(2, blob_files.size());
+ ASSERT_EQ(kSnappyCompression, blob_files[1]->GetCompressionType());
+
+ // Enable GC. If we do it earlier, the compaction triggered by the snapshot
+ // release may compact files and trigger GC before we can verify there are
+ // two files.
+ bdb_options.enable_garbage_collection = true;
+ Reopen(bdb_options);
+
+ ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ VerifyDB(data);
+
+ blob_db_impl()->TEST_DeleteObsoleteFiles();
+ blob_files = blob_db_impl()->TEST_GetBlobFiles();
+ for (auto bfile : blob_files) {
+ ASSERT_EQ(kSnappyCompression, bfile->GetCompressionType());
+ }
+
+ // Disable compression
+ bdb_options.compression = kNoCompression;
+ Reopen(bdb_options);
+ for (; data_idx < 300; data_idx++) {
+ PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
+ }
+ VerifyDB(data);
+
+ ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ VerifyDB(data);
+
+ blob_db_impl()->TEST_DeleteObsoleteFiles();
+ blob_files = blob_db_impl()->TEST_GetBlobFiles();
+ for (auto bfile : blob_files) {
+ ASSERT_EQ(kNoCompression, bfile->GetCompressionType());
+ }
+
+ // Switch between compression types to generate blob files with mixed
+ // compression types.
+ bdb_options.compression = kSnappyCompression;
+ Reopen(bdb_options);
+ for (; data_idx < 400; data_idx++) {
+ PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
+ }
+ VerifyDB(data);
+
+ bdb_options.compression = kLZ4Compression;
+ Reopen(bdb_options);
+ for (; data_idx < 500; data_idx++) {
+ PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
+ }
+ VerifyDB(data);
+
+ ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ VerifyDB(data);
+
+ blob_db_impl()->TEST_DeleteObsoleteFiles();
+ blob_files = blob_db_impl()->TEST_GetBlobFiles();
+ for (auto bfile : blob_files) {
+ ASSERT_EQ(kLZ4Compression, bfile->GetCompressionType());
+ }
+}
+#endif // LZ4
+#endif // SNAPPY
+
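+// Exercise concurrent writes: ten threads write disjoint key ranges, half of
+// them through Put() and half through WriteBatch, and the union of their data
+// is verified at the end.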
+TEST_F(BlobDBTest, MultipleWriters) {
+ Open(BlobDBOptions());
+
+ std::vector<port::Thread> workers;
+ std::vector<std::map<std::string, std::string>> data_set(10);
+ for (uint32_t i = 0; i < 10; i++)
+ workers.push_back(port::Thread(
+ [&](uint32_t id) {
+ Random rnd(301 + id);
+ for (int j = 0; j < 100; j++) {
+ std::string key =
+ "key" + std::to_string(id) + "_" + std::to_string(j);
+ if (id < 5) {
+ PutRandom(key, &rnd, &data_set[id]);
+ } else {
+ WriteBatch batch;
+ PutRandomToWriteBatch(key, &rnd, &batch, &data_set[id]);
+ ASSERT_OK(blob_db_->Write(WriteOptions(), &batch));
+ }
+ }
+ },
+ i));
+ std::map<std::string, std::string> data;
+ for (size_t i = 0; i < 10; i++) {
+ workers[i].join();
+ data.insert(data_set[i].begin(), data_set[i].end());
+ }
+ VerifyDB(data);
+}
+
+TEST_F(BlobDBTest, SstFileManager) {
+ // run the same test for Get(), MultiGet() and Iterator each.
+ std::shared_ptr<SstFileManager> sst_file_manager(
+ NewSstFileManager(mock_env_.get()));
+ sst_file_manager->SetDeleteRateBytesPerSecond(1);
+ SstFileManagerImpl *sfm =
+ static_cast<SstFileManagerImpl *>(sst_file_manager.get());
+
+ BlobDBOptions bdb_options;
+ bdb_options.min_blob_size = 0;
+ bdb_options.enable_garbage_collection = true;
+ bdb_options.garbage_collection_cutoff = 1.0;
+ Options db_options;
+
+ int files_scheduled_to_delete = 0;
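+ // Count blob files that are routed through the SstFileManager's delete
+ // scheduler; blob files are identified by the ".blob" extension in the
+ // scheduled path.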
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::ScheduleFileDeletion", [&](void *arg) {
+ assert(arg);
+ const std::string *const file_path =
+ static_cast<const std::string *>(arg);
+ if (file_path->find(".blob") != std::string::npos) {
+ ++files_scheduled_to_delete;
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ db_options.sst_file_manager = sst_file_manager;
+
+ Open(bdb_options, db_options);
+
+ // Create one obsolete file and clean it up.
+ ASSERT_OK(blob_db_->Put(WriteOptions(), "foo", "bar"));
+ auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
+ ASSERT_EQ(1, blob_files.size());
+ std::shared_ptr<BlobFile> bfile = blob_files[0];
+ ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(bfile));
+ ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ blob_db_impl()->TEST_DeleteObsoleteFiles();
+
+ // Even if SstFileManager is not set, the DB creates a dummy one.
+ ASSERT_EQ(1, files_scheduled_to_delete);
+ Destroy();
+ // Make sure that DestroyBlobDB() also goes through delete scheduler.
+ ASSERT_EQ(2, files_scheduled_to_delete);
+ SyncPoint::GetInstance()->DisableProcessing();
+ sfm->WaitForEmptyTrash();
+}
+
+TEST_F(BlobDBTest, SstFileManagerRestart) {
+ int files_scheduled_to_delete = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::ScheduleFileDeletion", [&](void *arg) {
+ assert(arg);
+ const std::string *const file_path =
+ static_cast<const std::string *>(arg);
+ if (file_path->find(".blob") != std::string::npos) {
+ ++files_scheduled_to_delete;
+ }
+ });
+
+ // run the same test for Get(), MultiGet() and Iterator each.
+ std::shared_ptr<SstFileManager> sst_file_manager(
+ NewSstFileManager(mock_env_.get()));
+ sst_file_manager->SetDeleteRateBytesPerSecond(1);
+ SstFileManagerImpl *sfm =
+ static_cast<SstFileManagerImpl *>(sst_file_manager.get());
+
+ BlobDBOptions bdb_options;
+ bdb_options.min_blob_size = 0;
+ Options db_options;
+
+ SyncPoint::GetInstance()->EnableProcessing();
+ db_options.sst_file_manager = sst_file_manager;
+
+ Open(bdb_options, db_options);
+ std::string blob_dir = blob_db_impl()->TEST_blob_dir();
+ ASSERT_OK(blob_db_->Put(WriteOptions(), "foo", "bar"));
+ Close();
+
+ // Create 3 dummy trash files under the blob_dir
+ const auto &fs = db_options.env->GetFileSystem();
+ ASSERT_OK(CreateFile(fs, blob_dir + "/000666.blob.trash", "", false));
+ ASSERT_OK(CreateFile(fs, blob_dir + "/000888.blob.trash", "", true));
+ ASSERT_OK(CreateFile(fs, blob_dir + "/something_not_match.trash", "", false));
+
+ // Make sure that reopening the DB rescans the existing trash files.
+ Open(bdb_options, db_options);
+ ASSERT_EQ(files_scheduled_to_delete, 2);
+
+ sfm->WaitForEmptyTrash();
+
+ // There should be exactly one file under the blob dir now.
+ std::vector<std::string> all_files;
+ ASSERT_OK(db_options.env->GetChildren(blob_dir, &all_files));
+ int nfiles = 0;
+ for (const auto &f : all_files) {
+ assert(!f.empty());
+ if (f[0] == '.') {
+ continue;
+ }
+ nfiles++;
+ }
+ ASSERT_EQ(nfiles, 1);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
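+// Vary the point at which a snapshot is taken (before/after each blob file is
+// written and after compaction) and check whether obsolete blob files can be
+// deleted while the snapshot is still held. Deletion is only blocked when the
+// snapshot is old enough to still reference data in the obsolete file.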
+TEST_F(BlobDBTest, SnapshotAndGarbageCollection) {
+ BlobDBOptions bdb_options;
+ bdb_options.min_blob_size = 0;
+ bdb_options.enable_garbage_collection = true;
+ bdb_options.garbage_collection_cutoff = 1.0;
+ bdb_options.disable_background_tasks = true;
+
+ Options options;
+ options.disable_auto_compactions = true;
+
+ // i = when to take snapshot
+ for (int i = 0; i < 4; i++) {
+ Destroy();
+ Open(bdb_options, options);
+
+ const Snapshot *snapshot = nullptr;
+
+ // First file
+ ASSERT_OK(Put("key1", "value"));
+ if (i == 0) {
+ snapshot = blob_db_->GetSnapshot();
+ }
+
+ auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
+ ASSERT_EQ(1, blob_files.size());
+ ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[0]));
+
+ // Second file
+ ASSERT_OK(Put("key2", "value"));
+ if (i == 1) {
+ snapshot = blob_db_->GetSnapshot();
+ }
+
+ blob_files = blob_db_impl()->TEST_GetBlobFiles();
+ ASSERT_EQ(2, blob_files.size());
+ auto bfile = blob_files[1];
+ ASSERT_FALSE(bfile->Immutable());
+ ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(bfile));
+
+ // Third file
+ ASSERT_OK(Put("key3", "value"));
+ if (i == 2) {
+ snapshot = blob_db_->GetSnapshot();
+ }
+
+ ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_TRUE(bfile->Obsolete());
+ ASSERT_EQ(blob_db_->GetLatestSequenceNumber(),
+ bfile->GetObsoleteSequence());
+
+ Delete("key2");
+ if (i == 3) {
+ snapshot = blob_db_->GetSnapshot();
+ }
+
+ ASSERT_EQ(4, blob_db_impl()->TEST_GetBlobFiles().size());
+ blob_db_impl()->TEST_DeleteObsoleteFiles();
+
+ if (i >= 2) {
+ // The snapshot shouldn't see data in bfile
+ ASSERT_EQ(2, blob_db_impl()->TEST_GetBlobFiles().size());
+ blob_db_->ReleaseSnapshot(snapshot);
+ } else {
+ // The snapshot will see data in bfile, so the file shouldn't be deleted
+ ASSERT_EQ(4, blob_db_impl()->TEST_GetBlobFiles().size());
+ blob_db_->ReleaseSnapshot(snapshot);
+ blob_db_impl()->TEST_DeleteObsoleteFiles();
+ ASSERT_EQ(2, blob_db_impl()->TEST_GetBlobFiles().size());
+ }
+ }
+}
+
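+// BlobDB only supports the default column family: CreateColumnFamily() passes
+// through to the base DB, but reads and writes against a non-default handle
+// are expected to return Status::NotSupported().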
+TEST_F(BlobDBTest, ColumnFamilyNotSupported) {
+ Options options;
+ options.env = mock_env_.get();
+ mock_clock_->SetCurrentTime(0);
+ Open(BlobDBOptions(), options);
+ ColumnFamilyHandle *default_handle = blob_db_->DefaultColumnFamily();
+ ColumnFamilyHandle *handle = nullptr;
+ std::string value;
+ std::vector<std::string> values;
+ // The call simply passes through to the base db. It should succeed.
+ ASSERT_OK(
+ blob_db_->CreateColumnFamily(ColumnFamilyOptions(), "foo", &handle));
+ ASSERT_TRUE(blob_db_->Put(WriteOptions(), handle, "k", "v").IsNotSupported());
+ ASSERT_TRUE(blob_db_->PutWithTTL(WriteOptions(), handle, "k", "v", 60)
+ .IsNotSupported());
+ ASSERT_TRUE(blob_db_->PutUntil(WriteOptions(), handle, "k", "v", 100)
+ .IsNotSupported());
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("k1", "v1"));
+ ASSERT_OK(batch.Put(handle, "k2", "v2"));
+ ASSERT_TRUE(blob_db_->Write(WriteOptions(), &batch).IsNotSupported());
+ ASSERT_TRUE(blob_db_->Get(ReadOptions(), "k1", &value).IsNotFound());
+ ASSERT_TRUE(
+ blob_db_->Get(ReadOptions(), handle, "k", &value).IsNotSupported());
+ auto statuses = blob_db_->MultiGet(ReadOptions(), {default_handle, handle},
+ {"k1", "k2"}, &values);
+ ASSERT_EQ(2, statuses.size());
+ ASSERT_TRUE(statuses[0].IsNotSupported());
+ ASSERT_TRUE(statuses[1].IsNotSupported());
+ ASSERT_EQ(nullptr, blob_db_->NewIterator(ReadOptions(), handle));
+ delete handle;
+}
+
+TEST_F(BlobDBTest, GetLiveFilesMetaData) {
+ Random rnd(301);
+
+ BlobDBOptions bdb_options;
+ bdb_options.blob_dir = "blob_dir";
+ bdb_options.path_relative = true;
+ bdb_options.ttl_range_secs = 10;
+ bdb_options.min_blob_size = 0;
+ bdb_options.disable_background_tasks = true;
+
+ Options options;
+ options.env = mock_env_.get();
+
+ Open(bdb_options, options);
+
+ std::map<std::string, std::string> data;
+ for (size_t i = 0; i < 100; i++) {
+ PutRandom("key" + std::to_string(i), &rnd, &data);
+ }
+
+ constexpr uint64_t expiration = 1000ULL;
+ PutRandomUntil("key100", expiration, &rnd, &data);
+
+ std::vector<LiveFileMetaData> metadata;
+ blob_db_->GetLiveFilesMetaData(&metadata);
+
+ ASSERT_EQ(2U, metadata.size());
+ // Paths should be relative to db_name but begin with a slash.
+ const std::string filename1("/blob_dir/000001.blob");
+ ASSERT_EQ(filename1, metadata[0].name);
+ ASSERT_EQ(1, metadata[0].file_number);
+ ASSERT_EQ(0, metadata[0].oldest_ancester_time);
+ ASSERT_EQ(kDefaultColumnFamilyName, metadata[0].column_family_name);
+
+ const std::string filename2("/blob_dir/000002.blob");
+ ASSERT_EQ(filename2, metadata[1].name);
+ ASSERT_EQ(2, metadata[1].file_number);
+ ASSERT_EQ(expiration, metadata[1].oldest_ancester_time);
+ ASSERT_EQ(kDefaultColumnFamilyName, metadata[1].column_family_name);
+
+ std::vector<std::string> livefile;
+ uint64_t mfs;
+ ASSERT_OK(blob_db_->GetLiveFiles(livefile, &mfs, false));
+ ASSERT_EQ(5U, livefile.size());
+ ASSERT_EQ(filename1, livefile[3]);
+ ASSERT_EQ(filename2, livefile[4]);
+ VerifyDB(data);
+}
+
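+// Open an existing plain RocksDB instance as BlobDB and verify the old data is
+// still readable. Keys subsequently rewritten through BlobDB become blob
+// indexes, which a plain RocksDB reopen then reports as corruption.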
+TEST_F(BlobDBTest, MigrateFromPlainRocksDB) {
+ constexpr size_t kNumKey = 20;
+ constexpr size_t kNumIteration = 10;
+ Random rnd(301);
+ std::map<std::string, std::string> data;
+ std::vector<bool> is_blob(kNumKey, false);
+
+ // Write to plain rocksdb.
+ Options options;
+ options.create_if_missing = true;
+ DB *db = nullptr;
+ ASSERT_OK(DB::Open(options, dbname_, &db));
+ for (size_t i = 0; i < kNumIteration; i++) {
+ auto key_index = rnd.Next() % kNumKey;
+ std::string key = "key" + std::to_string(key_index);
+ PutRandom(db, key, &rnd, &data);
+ }
+ VerifyDB(db, data);
+ delete db;
+ db = nullptr;
+
+ // Open as blob db. Verify it can read existing data.
+ Open();
+ VerifyDB(blob_db_, data);
+ for (size_t i = 0; i < kNumIteration; i++) {
+ auto key_index = rnd.Next() % kNumKey;
+ std::string key = "key" + std::to_string(key_index);
+ is_blob[key_index] = true;
+ PutRandom(blob_db_, key, &rnd, &data);
+ }
+ VerifyDB(blob_db_, data);
+ delete blob_db_;
+ blob_db_ = nullptr;
+
+ // Verify that the plain db returns an error for keys written by the blob db.
+ ASSERT_OK(DB::Open(options, dbname_, &db));
+ std::string value;
+ for (size_t i = 0; i < kNumKey; i++) {
+ std::string key = "key" + std::to_string(i);
+ Status s = db->Get(ReadOptions(), key, &value);
+ if (data.count(key) == 0) {
+ ASSERT_TRUE(s.IsNotFound());
+ } else if (is_blob[i]) {
+ ASSERT_TRUE(s.IsCorruption());
+ } else {
+ ASSERT_OK(s);
+ ASSERT_EQ(data[key], value);
+ }
+ }
+ delete db;
+}
+
+// Test that a NoSpace IOError Status is returned when the max_db_size limit is
+// reached.
+TEST_F(BlobDBTest, OutOfSpace) {
+ // Use mock env to stop wall clock.
+ Options options;
+ options.env = mock_env_.get();
+ BlobDBOptions bdb_options;
+ bdb_options.max_db_size = 200;
+ bdb_options.is_fifo = false;
+ bdb_options.disable_background_tasks = true;
+ Open(bdb_options);
+
+ // Each stored blob has an overhead of about 42 bytes currently.
+ // So a small key + a 100 byte blob should take up ~150 bytes in the db.
+ std::string value(100, 'v');
+ ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), "key1", value, 60));
+
+ // Putting another blob should fail as adding it would exceed the max_db_size
+ // limit.
+ Status s = blob_db_->PutWithTTL(WriteOptions(), "key2", value, 60);
+ ASSERT_TRUE(s.IsIOError());
+ ASSERT_TRUE(s.IsNoSpace());
+}
+
+TEST_F(BlobDBTest, FIFOEviction) {
+ BlobDBOptions bdb_options;
+ bdb_options.max_db_size = 200;
+ bdb_options.blob_file_size = 100;
+ bdb_options.is_fifo = true;
+ bdb_options.disable_background_tasks = true;
+ Open(bdb_options);
+
+ std::atomic<int> evict_count{0};
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobDBImpl::EvictOldestBlobFile:Evicted",
+ [&](void *) { evict_count++; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // Each stored blob has an overhead of 32 bytes currently.
+ // So a 100 byte blob should take up 132 bytes.
+ std::string value(100, 'v');
+ ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), "key1", value, 10));
+ VerifyDB({{"key1", value}});
+
+ ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size());
+
+ // Adding another 100 byte blob would take the total size to 264 bytes
+ // (2 * 132), which exceeds max_db_size and triggers FIFO eviction.
+ ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), "key2", value, 60));
+ ASSERT_EQ(1, evict_count);
+ // key1 will exist until the corresponding file is deleted.
+ VerifyDB({{"key1", value}, {"key2", value}});
+
+ // Adding another 100 bytes blob without TTL.
+ ASSERT_OK(blob_db_->Put(WriteOptions(), "key3", value));
+ ASSERT_EQ(2, evict_count);
+ // key1 and key2 will exist until the corresponding files are deleted.
+ VerifyDB({{"key1", value}, {"key2", value}, {"key3", value}});
+
+ // The fourth blob file, without TTL.
+ ASSERT_OK(blob_db_->Put(WriteOptions(), "key4", value));
+ ASSERT_EQ(3, evict_count);
+ VerifyDB(
+ {{"key1", value}, {"key2", value}, {"key3", value}, {"key4", value}});
+
+ auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
+ ASSERT_EQ(4, blob_files.size());
+ ASSERT_TRUE(blob_files[0]->Obsolete());
+ ASSERT_TRUE(blob_files[1]->Obsolete());
+ ASSERT_TRUE(blob_files[2]->Obsolete());
+ ASSERT_FALSE(blob_files[3]->Obsolete());
+ auto obsolete_files = blob_db_impl()->TEST_GetObsoleteFiles();
+ ASSERT_EQ(3, obsolete_files.size());
+ ASSERT_EQ(blob_files[0], obsolete_files[0]);
+ ASSERT_EQ(blob_files[1], obsolete_files[1]);
+ ASSERT_EQ(blob_files[2], obsolete_files[2]);
+
+ blob_db_impl()->TEST_DeleteObsoleteFiles();
+ obsolete_files = blob_db_impl()->TEST_GetObsoleteFiles();
+ ASSERT_TRUE(obsolete_files.empty());
+ VerifyDB({{"key4", value}});
+}
+
+TEST_F(BlobDBTest, FIFOEviction_NoOldestFileToEvict) {
+ Options options;
+ BlobDBOptions bdb_options;
+ bdb_options.max_db_size = 1000;
+ bdb_options.blob_file_size = 5000;
+ bdb_options.is_fifo = true;
+ bdb_options.disable_background_tasks = true;
+ Open(bdb_options);
+
+ std::atomic<int> evict_count{0};
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobDBImpl::EvictOldestBlobFile:Evicted",
+ [&](void *) { evict_count++; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ std::string value(2000, 'v');
+ ASSERT_TRUE(Put("foo", std::string(2000, 'v')).IsNoSpace());
+ ASSERT_EQ(0, evict_count);
+}
+
+TEST_F(BlobDBTest, FIFOEviction_NoEnoughBlobFilesToEvict) {
+ BlobDBOptions bdb_options;
+ bdb_options.is_fifo = true;
+ bdb_options.min_blob_size = 100;
+ bdb_options.disable_background_tasks = true;
+ Options options;
+ // Use mock env to stop wall clock.
+ options.env = mock_env_.get();
+ options.disable_auto_compactions = true;
+ auto statistics = CreateDBStatistics();
+ options.statistics = statistics;
+ Open(bdb_options, options);
+
+ ASSERT_EQ(0, blob_db_impl()->TEST_live_sst_size());
+ std::string small_value(50, 'v');
+ std::map<std::string, std::string> data;
+ // Insert some data into the LSM tree to make sure FIFO eviction takes SST
+ // file size into account.
+ for (int i = 0; i < 1000; i++) {
+ ASSERT_OK(Put("key" + std::to_string(i), small_value, &data));
+ }
+ ASSERT_OK(blob_db_->Flush(FlushOptions()));
+ uint64_t live_sst_size = 0;
+ ASSERT_TRUE(blob_db_->GetIntProperty(DB::Properties::kTotalSstFilesSize,
+ &live_sst_size));
+ ASSERT_TRUE(live_sst_size > 0);
+ ASSERT_EQ(live_sst_size, blob_db_impl()->TEST_live_sst_size());
+
+ bdb_options.max_db_size = live_sst_size + 2000;
+ Reopen(bdb_options, options);
+ ASSERT_EQ(live_sst_size, blob_db_impl()->TEST_live_sst_size());
+
+ std::string value_1k(1000, 'v');
+ ASSERT_OK(PutWithTTL("large_key1", value_1k, 60, &data));
+ ASSERT_EQ(0, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
+ VerifyDB(data);
+ // large_key2 evicts large_key1
+ ASSERT_OK(PutWithTTL("large_key2", value_1k, 60, &data));
+ ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
+ blob_db_impl()->TEST_DeleteObsoleteFiles();
+ data.erase("large_key1");
+ VerifyDB(data);
+ // large_key3 cannot get enough space even after evicting large_key2, so the
+ // write returns a no-space error instead.
+ std::string value_2k(2000, 'v');
+ ASSERT_TRUE(PutWithTTL("large_key3", value_2k, 60).IsNoSpace());
+ ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
+ // Verify large_key2 still exists.
+ VerifyDB(data);
+}
+
+// Test that flush or compaction triggers FIFO eviction, since both update the
+// total SST file size.
+TEST_F(BlobDBTest, FIFOEviction_TriggerOnSSTSizeChange) {
+ BlobDBOptions bdb_options;
+ bdb_options.max_db_size = 1000;
+ bdb_options.is_fifo = true;
+ bdb_options.min_blob_size = 100;
+ bdb_options.disable_background_tasks = true;
+ Options options;
+ // Use mock env to stop wall clock.
+ options.env = mock_env_.get();
+ auto statistics = CreateDBStatistics();
+ options.statistics = statistics;
+ options.compression = kNoCompression;
+ Open(bdb_options, options);
+
+ std::string value(800, 'v');
+ ASSERT_OK(PutWithTTL("large_key", value, 60));
+ ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size());
+ ASSERT_EQ(0, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
+ VerifyDB({{"large_key", value}});
+
+ // Insert some small keys and flush to bring the DB out of space.
+ std::map<std::string, std::string> data;
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put("key" + std::to_string(i), "v", &data));
+ }
+ ASSERT_OK(blob_db_->Flush(FlushOptions()));
+
+ // Verify large_key is deleted by FIFO eviction.
+ blob_db_impl()->TEST_DeleteObsoleteFiles();
+ ASSERT_EQ(0, blob_db_impl()->TEST_GetBlobFiles().size());
+ ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
+ VerifyDB(data);
+}
+
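+// With min_blob_size = 100, small non-TTL values are expected to be stored
+// inline in the LSM tree as kTypeValue, while large values and all TTL values
+// go to blob files and leave a kTypeBlobIndex entry behind; TTL and non-TTL
+// blobs end up in separate blob files.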
+TEST_F(BlobDBTest, InlineSmallValues) {
+ constexpr uint64_t kMaxExpiration = 1000;
+ Random rnd(301);
+ BlobDBOptions bdb_options;
+ bdb_options.ttl_range_secs = kMaxExpiration;
+ bdb_options.min_blob_size = 100;
+ bdb_options.blob_file_size = 256 * 1000 * 1000;
+ bdb_options.disable_background_tasks = true;
+ Options options;
+ options.env = mock_env_.get();
+ mock_clock_->SetCurrentTime(0);
+ Open(bdb_options, options);
+ std::map<std::string, std::string> data;
+ std::map<std::string, KeyVersion> versions;
+ for (size_t i = 0; i < 1000; i++) {
+ bool is_small_value = rnd.Next() % 2;
+ bool has_ttl = rnd.Next() % 2;
+ uint64_t expiration = rnd.Next() % kMaxExpiration;
+ int len = is_small_value ? 50 : 200;
+ std::string key = "key" + std::to_string(i);
+ std::string value = rnd.HumanReadableString(len);
+ std::string blob_index;
+ data[key] = value;
+ SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1;
+ if (!has_ttl) {
+ ASSERT_OK(blob_db_->Put(WriteOptions(), key, value));
+ } else {
+ ASSERT_OK(blob_db_->PutUntil(WriteOptions(), key, value, expiration));
+ }
+ ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence);
+ versions[key] =
+ KeyVersion(key, value, sequence,
+ (is_small_value && !has_ttl) ? kTypeValue : kTypeBlobIndex);
+ }
+ VerifyDB(data);
+ VerifyBaseDB(versions);
+ auto *bdb_impl = static_cast<BlobDBImpl *>(blob_db_);
+ auto blob_files = bdb_impl->TEST_GetBlobFiles();
+ ASSERT_EQ(2, blob_files.size());
+ std::shared_ptr<BlobFile> non_ttl_file;
+ std::shared_ptr<BlobFile> ttl_file;
+ if (blob_files[0]->HasTTL()) {
+ ttl_file = blob_files[0];
+ non_ttl_file = blob_files[1];
+ } else {
+ non_ttl_file = blob_files[0];
+ ttl_file = blob_files[1];
+ }
+ ASSERT_FALSE(non_ttl_file->HasTTL());
+ ASSERT_TRUE(ttl_file->HasTTL());
+}
+
+TEST_F(BlobDBTest, UserCompactionFilter) {
+ class CustomerFilter : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice & /*key*/, const Slice &value,
+ std::string *new_value, bool *value_changed) const override {
+ *value_changed = false;
+ // changing value size to test value transitions between inlined data
+ // and stored-in-blob data
+ if (value.size() % 4 == 1) {
+ *new_value = value.ToString();
+ // double size by duplicating value
+ *new_value += *new_value;
+ *value_changed = true;
+ return false;
+ } else if (value.size() % 3 == 1) {
+ *new_value = value.ToString();
+ // truncate the value to half its size
+ *new_value = new_value->substr(0, new_value->size() / 2);
+ *value_changed = true;
+ return false;
+ } else if (value.size() % 2 == 1) {
+ return true;
+ }
+ return false;
+ }
+ bool IgnoreSnapshots() const override { return true; }
+ const char *Name() const override { return "CustomerFilter"; }
+ };
+ class CustomerFilterFactory : public CompactionFilterFactory {
+ const char *Name() const override { return "CustomerFilterFactory"; }
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context & /*context*/) override {
+ return std::unique_ptr<CompactionFilter>(new CustomerFilter());
+ }
+ };
+
+ constexpr size_t kNumPuts = 1 << 10;
+ // Generate both inlined and blob value
+ constexpr uint64_t kMinValueSize = 1 << 6;
+ constexpr uint64_t kMaxValueSize = 1 << 8;
+ constexpr uint64_t kMinBlobSize = 1 << 7;
+ static_assert(kMinValueSize < kMinBlobSize, "");
+ static_assert(kMaxValueSize > kMinBlobSize, "");
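+ // Value sizes cycle through [kMinValueSize, kMaxValueSize], which straddles
+ // min_blob_size, so the filter sees both inlined values and values read back
+ // from blob files; the size-dependent transformations above can also move a
+ // value across the inline/blob boundary.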
+
+ BlobDBOptions bdb_options;
+ bdb_options.min_blob_size = kMinBlobSize;
+ bdb_options.blob_file_size = kMaxValueSize * 10;
+ bdb_options.disable_background_tasks = true;
+ if (Snappy_Supported()) {
+ bdb_options.compression = CompressionType::kSnappyCompression;
+ }
+ // case_num == 0: Test user defined compaction filter
+ // case_num == 1: Test user defined compaction filter factory
+ for (int case_num = 0; case_num < 2; case_num++) {
+ Options options;
+ if (case_num == 0) {
+ options.compaction_filter = new CustomerFilter();
+ } else {
+ options.compaction_filter_factory.reset(new CustomerFilterFactory());
+ }
+ options.disable_auto_compactions = true;
+ options.env = mock_env_.get();
+ options.statistics = CreateDBStatistics();
+ Open(bdb_options, options);
+
+ std::map<std::string, std::string> data;
+ std::map<std::string, std::string> data_after_compact;
+ Random rnd(301);
+ uint64_t value_size = kMinValueSize;
+ int drop_record = 0;
+ for (size_t i = 0; i < kNumPuts; ++i) {
+ std::ostringstream oss;
+ oss << "key" << std::setw(4) << std::setfill('0') << i;
+
+ const std::string key(oss.str());
+ const std::string value = rnd.HumanReadableString((int)value_size);
+ const SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1;
+
+ ASSERT_OK(Put(key, value));
+ ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence);
+
+ data[key] = value;
+ if (value.length() % 4 == 1) {
+ data_after_compact[key] = value + value;
+ } else if (value.length() % 3 == 1) {
+ data_after_compact[key] = value.substr(0, value.size() / 2);
+ } else if (value.length() % 2 == 1) {
+ ++drop_record;
+ } else {
+ data_after_compact[key] = value;
+ }
+
+ if (++value_size > kMaxValueSize) {
+ value_size = kMinValueSize;
+ }
+ }
+ // Verify full data set
+ VerifyDB(data);
+ // Apply the compaction filter to all records via manual compaction.
+ ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ // Verify data after compaction: records dropped by the filter are gone and
+ // modified values are updated.
+ VerifyDB(data_after_compact);
+ ASSERT_EQ(drop_record,
+ options.statistics->getTickerCount(COMPACTION_KEY_DROP_USER));
+ delete options.compaction_filter;
+ Destroy();
+ }
+}
+
+// Test the user compaction filter when there is an IO error on blob data.
+TEST_F(BlobDBTest, UserCompactionFilter_BlobIOError) {
+ class CustomerFilter : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice & /*key*/, const Slice &value,
+ std::string *new_value, bool *value_changed) const override {
+ *new_value = value.ToString() + "_new";
+ *value_changed = true;
+ return false;
+ }
+ bool IgnoreSnapshots() const override { return true; }
+ const char *Name() const override { return "CustomerFilter"; }
+ };
+
+ constexpr size_t kNumPuts = 100;
+ constexpr int kValueSize = 100;
+
+ BlobDBOptions bdb_options;
+ bdb_options.min_blob_size = 0;
+ bdb_options.blob_file_size = kValueSize * 10;
+ bdb_options.disable_background_tasks = true;
+ bdb_options.compression = CompressionType::kNoCompression;
+
+ std::vector<std::string> io_failure_cases = {
+ "BlobDBImpl::CreateBlobFileAndWriter",
+ "BlobIndexCompactionFilterBase::WriteBlobToNewFile",
+ "BlobDBImpl::CloseBlobFile"};
+
+ for (size_t case_num = 0; case_num < io_failure_cases.size(); case_num++) {
+ Options options;
+ options.compaction_filter = new CustomerFilter();
+ options.disable_auto_compactions = true;
+ options.env = fault_injection_env_.get();
+ options.statistics = CreateDBStatistics();
+ Open(bdb_options, options);
+
+ std::map<std::string, std::string> data;
+ Random rnd(301);
+ for (size_t i = 0; i < kNumPuts; ++i) {
+ std::ostringstream oss;
+ oss << "key" << std::setw(4) << std::setfill('0') << i;
+
+ const std::string key(oss.str());
+ const std::string value = rnd.HumanReadableString(kValueSize);
+ const SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1;
+
+ ASSERT_OK(Put(key, value));
+ ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence);
+ data[key] = value;
+ }
+
+ // Verify full data set
+ VerifyDB(data);
+
+ SyncPoint::GetInstance()->SetCallBack(
+ io_failure_cases[case_num], [&](void * /*arg*/) {
+ fault_injection_env_->SetFilesystemActive(false, Status::IOError());
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ auto s = blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_TRUE(s.IsIOError());
+
+ // Reactivate file system to allow test to verify and close DB.
+ fault_injection_env_->SetFilesystemActive(true);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ // Verify full data set after compaction failure
+ VerifyDB(data);
+
+ delete options.compaction_filter;
+ Destroy();
+ }
+}
+
+// Test that the compaction filter removes any expired blob index.
+TEST_F(BlobDBTest, FilterExpiredBlobIndex) {
+ constexpr size_t kNumKeys = 100;
+ constexpr size_t kNumPuts = 1000;
+ constexpr uint64_t kMaxExpiration = 1000;
+ constexpr uint64_t kCompactTime = 500;
+ constexpr uint64_t kMinBlobSize = 100;
+ Random rnd(301);
+ mock_clock_->SetCurrentTime(0);
+ BlobDBOptions bdb_options;
+ bdb_options.min_blob_size = kMinBlobSize;
+ bdb_options.disable_background_tasks = true;
+ Options options;
+ options.env = mock_env_.get();
+ Open(bdb_options, options);
+
+ std::map<std::string, std::string> data;
+ std::map<std::string, std::string> data_after_compact;
+ for (size_t i = 0; i < kNumPuts; i++) {
+ bool is_small_value = rnd.Next() % 2;
+ bool has_ttl = rnd.Next() % 2;
+ uint64_t expiration = rnd.Next() % kMaxExpiration;
+ int len = is_small_value ? 10 : 200;
+ std::string key = "key" + std::to_string(rnd.Next() % kNumKeys);
+ std::string value = rnd.HumanReadableString(len);
+ if (!has_ttl) {
+ if (is_small_value) {
+ std::string blob_entry;
+ BlobIndex::EncodeInlinedTTL(&blob_entry, expiration, value);
+ // Fake a blob index with TTL and check how it is handled.
+ ASSERT_GT(kMinBlobSize, blob_entry.size());
+ value = blob_entry;
+ }
+ ASSERT_OK(Put(key, value));
+ data_after_compact[key] = value;
+ } else {
+ ASSERT_OK(PutUntil(key, value, expiration));
+ if (expiration <= kCompactTime) {
+ data_after_compact.erase(key);
+ } else {
+ data_after_compact[key] = value;
+ }
+ }
+ data[key] = value;
+ }
+ VerifyDB(data);
+
+ mock_clock_->SetCurrentTime(kCompactTime);
+ // Take a snapshot before compaction. Make sure expired blob indexes are
+ // filtered regardless of the snapshot.
+ const Snapshot *snapshot = blob_db_->GetSnapshot();
+ // Issue manual compaction to trigger compaction filter.
+ ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ blob_db_->ReleaseSnapshot(snapshot);
+ // Verify expired blob indexes are filtered.
+ std::vector<KeyVersion> versions;
+ const size_t kMaxKeys = 10000;
+ ASSERT_OK(GetAllKeyVersions(blob_db_, "", "", kMaxKeys, &versions));
+ ASSERT_EQ(data_after_compact.size(), versions.size());
+ for (auto &version : versions) {
+ ASSERT_TRUE(data_after_compact.count(version.user_key) > 0);
+ }
+ VerifyDB(data_after_compact);
+}
+
+// Test that the compaction filter removes any blob index whose corresponding
+// blob file has been removed.
+TEST_F(BlobDBTest, FilterFileNotAvailable) {
+ BlobDBOptions bdb_options;
+ bdb_options.min_blob_size = 0;
+ bdb_options.disable_background_tasks = true;
+ Options options;
+ options.disable_auto_compactions = true;
+ Open(bdb_options, options);
+
+ ASSERT_OK(Put("foo", "v1"));
+ auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
+ ASSERT_EQ(1, blob_files.size());
+ ASSERT_EQ(1, blob_files[0]->BlobFileNumber());
+ ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[0]));
+
+ ASSERT_OK(Put("bar", "v2"));
+ blob_files = blob_db_impl()->TEST_GetBlobFiles();
+ ASSERT_EQ(2, blob_files.size());
+ ASSERT_EQ(2, blob_files[1]->BlobFileNumber());
+ ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[1]));
+
+ const size_t kMaxKeys = 10000;
+
+ DB *base_db = blob_db_->GetRootDB();
+ std::vector<KeyVersion> versions;
+ ASSERT_OK(GetAllKeyVersions(base_db, "", "", kMaxKeys, &versions));
+ ASSERT_EQ(2, versions.size());
+ ASSERT_EQ("bar", versions[0].user_key);
+ ASSERT_EQ("foo", versions[1].user_key);
+ VerifyDB({{"bar", "v2"}, {"foo", "v1"}});
+
+ ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(GetAllKeyVersions(base_db, "", "", kMaxKeys, &versions));
+ ASSERT_EQ(2, versions.size());
+ ASSERT_EQ("bar", versions[0].user_key);
+ ASSERT_EQ("foo", versions[1].user_key);
+ VerifyDB({{"bar", "v2"}, {"foo", "v1"}});
+
+ // Remove the first blob file and compact. foo should be removed from the
+ // base db.
+ blob_db_impl()->TEST_ObsoleteBlobFile(blob_files[0]);
+ blob_db_impl()->TEST_DeleteObsoleteFiles();
+ ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(GetAllKeyVersions(base_db, "", "", kMaxKeys, &versions));
+ ASSERT_EQ(1, versions.size());
+ ASSERT_EQ("bar", versions[0].user_key);
+ VerifyDB({{"bar", "v2"}});
+
+ // Remove the second blob file and compact. bar should be removed from the
+ // base db.
+ blob_db_impl()->TEST_ObsoleteBlobFile(blob_files[1]);
+ blob_db_impl()->TEST_DeleteObsoleteFiles();
+ ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(GetAllKeyVersions(base_db, "", "", kMaxKeys, &versions));
+ ASSERT_EQ(0, versions.size());
+ VerifyDB({});
+}
+
+// Test that the compaction filter filters any inlined TTL keys that would have
+// been dropped by the last FIFO eviction if they were stored out-of-line.
+TEST_F(BlobDBTest, FilterForFIFOEviction) {
+ Random rnd(215);
+ BlobDBOptions bdb_options;
+ bdb_options.min_blob_size = 100;
+ bdb_options.ttl_range_secs = 60;
+ bdb_options.max_db_size = 0;
+ bdb_options.disable_background_tasks = true;
+ Options options;
+ // Use mock env to stop wall clock.
+ mock_clock_->SetCurrentTime(0);
+ options.env = mock_env_.get();
+ auto statistics = CreateDBStatistics();
+ options.statistics = statistics;
+ options.disable_auto_compactions = true;
+ Open(bdb_options, options);
+
+ std::map<std::string, std::string> data;
+ std::map<std::string, std::string> data_after_compact;
+ // Insert some small values that will be inlined.
+ for (int i = 0; i < 1000; i++) {
+ std::string key = "key" + std::to_string(i);
+ std::string value = rnd.HumanReadableString(50);
+ uint64_t ttl = rnd.Next() % 120 + 1;
+ ASSERT_OK(PutWithTTL(key, value, ttl, &data));
+ if (ttl >= 60) {
+ data_after_compact[key] = value;
+ }
+ }
+ uint64_t num_keys_to_evict = data.size() - data_after_compact.size();
+ ASSERT_OK(blob_db_->Flush(FlushOptions()));
+ uint64_t live_sst_size = blob_db_impl()->TEST_live_sst_size();
+ ASSERT_GT(live_sst_size, 0);
+ VerifyDB(data);
+
+ bdb_options.max_db_size = live_sst_size + 30000;
+ bdb_options.is_fifo = true;
+ Reopen(bdb_options, options);
+ VerifyDB(data);
+
+ // Put two large values, each on a different blob file.
+ std::string large_value(10000, 'v');
+ ASSERT_OK(PutWithTTL("large_key1", large_value, 90));
+ ASSERT_OK(PutWithTTL("large_key2", large_value, 150));
+ ASSERT_EQ(2, blob_db_impl()->TEST_GetBlobFiles().size());
+ ASSERT_EQ(0, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
+ data["large_key1"] = large_value;
+ data["large_key2"] = large_value;
+ VerifyDB(data);
+
+ // Put a third large value, which will bring the DB out of space.
+ // FIFO eviction will evict the file containing large_key1.
+ ASSERT_OK(PutWithTTL("large_key3", large_value, 150));
+ ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
+ ASSERT_EQ(2, blob_db_impl()->TEST_GetBlobFiles().size());
+ blob_db_impl()->TEST_DeleteObsoleteFiles();
+ ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size());
+ data.erase("large_key1");
+ data["large_key3"] = large_value;
+ VerifyDB(data);
+
+ // Put some more small values. These values shouldn't be evicted by the
+ // compaction filter since they were inserted after the FIFO eviction.
+ ASSERT_OK(PutWithTTL("foo", "v", 30, &data_after_compact));
+ ASSERT_OK(PutWithTTL("bar", "v", 30, &data_after_compact));
+
+ // FIFO eviction doesn't trigger again since there is enough room for the
+ // flush.
+ ASSERT_OK(blob_db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
+
+ // Manually compact and check that the compaction filter evicts those keys
+ // with expiration < 60.
+ ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ // All keys with expiration < 60, plus large_key1, are filtered by the
+ // compaction filter.
+ ASSERT_EQ(num_keys_to_evict + 1,
+ statistics->getTickerCount(BLOB_DB_BLOB_INDEX_EVICTED_COUNT));
+ ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
+ ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size());
+ data_after_compact["large_key2"] = large_value;
+ data_after_compact["large_key3"] = large_value;
+ VerifyDB(data_after_compact);
+}
+
+TEST_F(BlobDBTest, GarbageCollection) {
+ constexpr size_t kNumPuts = 1 << 10;
+
+ constexpr uint64_t kExpiration = 1000;
+ constexpr uint64_t kCompactTime = 500;
+
+ constexpr uint64_t kKeySize = 7; // "key" + 4 digits
+
+ constexpr uint64_t kSmallValueSize = 1 << 6;
+ constexpr uint64_t kLargeValueSize = 1 << 8;
+ constexpr uint64_t kMinBlobSize = 1 << 7;
+ static_assert(kSmallValueSize < kMinBlobSize, "");
+ static_assert(kLargeValueSize > kMinBlobSize, "");
+
+ constexpr size_t kBlobsPerFile = 8;
+ constexpr size_t kNumBlobFiles = kNumPuts / kBlobsPerFile;
+ constexpr uint64_t kBlobFileSize =
+ BlobLogHeader::kSize +
+ (BlobLogRecord::kHeaderSize + kKeySize + kLargeValueSize) * kBlobsPerFile;
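+ // kBlobFileSize is sized so that each non-TTL blob file holds exactly
+ // kBlobsPerFile large records: the log header plus kBlobsPerFile records of
+ // (record header + 7-byte key + 256-byte value). The assertions below also
+ // account for the footer written when a file is closed.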
+
+ BlobDBOptions bdb_options;
+ bdb_options.min_blob_size = kMinBlobSize;
+ bdb_options.blob_file_size = kBlobFileSize;
+ bdb_options.enable_garbage_collection = true;
+ bdb_options.garbage_collection_cutoff = 0.25;
+ bdb_options.disable_background_tasks = true;
+
+ Options options;
+ options.env = mock_env_.get();
+ options.statistics = CreateDBStatistics();
+
+ Open(bdb_options, options);
+
+ std::map<std::string, std::string> data;
+ std::map<std::string, KeyVersion> blob_value_versions;
+ std::map<std::string, BlobIndexVersion> blob_index_versions;
+
+ Random rnd(301);
+
+ // Add a bunch of large non-TTL values. These will be written to non-TTL
+ // blob files and will be subject to GC.
+ for (size_t i = 0; i < kNumPuts; ++i) {
+ std::ostringstream oss;
+ oss << "key" << std::setw(4) << std::setfill('0') << i;
+
+ const std::string key(oss.str());
+ const std::string value = rnd.HumanReadableString(kLargeValueSize);
+ const SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1;
+
+ ASSERT_OK(Put(key, value));
+ ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence);
+
+ data[key] = value;
+ blob_value_versions[key] = KeyVersion(key, value, sequence, kTypeBlobIndex);
+ blob_index_versions[key] =
+ BlobIndexVersion(key, /* file_number */ (i >> 3) + 1, kNoExpiration,
+ sequence, kTypeBlobIndex);
+ }
+
+ // Add some small and/or TTL values that will be ignored during GC. First,
+ // add a large TTL value, which will be written to its own TTL blob file.
+ {
+ const std::string key("key2000");
+ const std::string value = rnd.HumanReadableString(kLargeValueSize);
+ const SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1;
+
+ ASSERT_OK(PutUntil(key, value, kExpiration));
+ ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence);
+
+ data[key] = value;
+ blob_value_versions[key] = KeyVersion(key, value, sequence, kTypeBlobIndex);
+ blob_index_versions[key] =
+ BlobIndexVersion(key, /* file_number */ kNumBlobFiles + 1, kExpiration,
+ sequence, kTypeBlobIndex);
+ }
+
+ // Now add a small TTL value (which will be inlined).
+ {
+ const std::string key("key3000");
+ const std::string value = rnd.HumanReadableString(kSmallValueSize);
+ const SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1;
+
+ ASSERT_OK(PutUntil(key, value, kExpiration));
+ ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence);
+
+ data[key] = value;
+ blob_value_versions[key] = KeyVersion(key, value, sequence, kTypeBlobIndex);
+ blob_index_versions[key] = BlobIndexVersion(
+ key, kInvalidBlobFileNumber, kExpiration, sequence, kTypeBlobIndex);
+ }
+
+ // Finally, add a small non-TTL value (which will be stored as a regular
+ // value).
+ {
+ const std::string key("key4000");
+ const std::string value = rnd.HumanReadableString(kSmallValueSize);
+ const SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1;
+
+ ASSERT_OK(Put(key, value));
+ ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence);
+
+ data[key] = value;
+ blob_value_versions[key] = KeyVersion(key, value, sequence, kTypeValue);
+ blob_index_versions[key] = BlobIndexVersion(
+ key, kInvalidBlobFileNumber, kNoExpiration, sequence, kTypeValue);
+ }
+
+ VerifyDB(data);
+ VerifyBaseDB(blob_value_versions);
+ VerifyBaseDBBlobIndex(blob_index_versions);
+
+ // At this point, we should have 128 immutable non-TTL files with file numbers
+ // 1..128.
+ {
+ auto live_imm_files = blob_db_impl()->TEST_GetLiveImmNonTTLFiles();
+ ASSERT_EQ(live_imm_files.size(), kNumBlobFiles);
+ for (size_t i = 0; i < kNumBlobFiles; ++i) {
+ ASSERT_EQ(live_imm_files[i]->BlobFileNumber(), i + 1);
+ ASSERT_EQ(live_imm_files[i]->GetFileSize(),
+ kBlobFileSize + BlobLogFooter::kSize);
+ }
+ }
+
+ mock_clock_->SetCurrentTime(kCompactTime);
+
+ ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // We expect the data to remain the same and the blobs from the oldest N files
+ // to be moved to new files. Sequence numbers get zeroed out during the
+ // compaction.
+ VerifyDB(data);
+
+ for (auto &pair : blob_value_versions) {
+ KeyVersion &version = pair.second;
+ version.sequence = 0;
+ }
+
+ VerifyBaseDB(blob_value_versions);
+
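+ // With garbage_collection_cutoff = 0.25 and 128 non-TTL files, GC relocates
+ // the blobs from files 1..32. Their new copies land in files created after
+ // the TTL file (number 129), which is why relocated entries get their file
+ // number shifted by kNumBlobFiles + 1.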
+ const uint64_t cutoff = static_cast<uint64_t>(
+ bdb_options.garbage_collection_cutoff * kNumBlobFiles);
+ for (auto &pair : blob_index_versions) {
+ BlobIndexVersion &version = pair.second;
+
+ version.sequence = 0;
+
+ if (version.file_number == kInvalidBlobFileNumber) {
+ continue;
+ }
+
+ if (version.file_number > cutoff) {
+ continue;
+ }
+
+ version.file_number += kNumBlobFiles + 1;
+ }
+
+ VerifyBaseDBBlobIndex(blob_index_versions);
+
+ const Statistics *const statistics = options.statistics.get();
+ assert(statistics);
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_NUM_FILES), cutoff);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_NUM_NEW_FILES), cutoff);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_FAILURES), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_NUM_KEYS_RELOCATED),
+ cutoff * kBlobsPerFile);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_BYTES_RELOCATED),
+ cutoff * kBlobsPerFile * kLargeValueSize);
+
+ // At this point, we should have 128 immutable non-TTL files with file numbers
+ // 33..128 and 130..161. (129 was taken by the TTL blob file.)
+ {
+ auto live_imm_files = blob_db_impl()->TEST_GetLiveImmNonTTLFiles();
+ ASSERT_EQ(live_imm_files.size(), kNumBlobFiles);
+ for (size_t i = 0; i < kNumBlobFiles; ++i) {
+ uint64_t expected_file_number = i + cutoff + 1;
+ if (expected_file_number > kNumBlobFiles) {
+ ++expected_file_number;
+ }
+
+ ASSERT_EQ(live_imm_files[i]->BlobFileNumber(), expected_file_number);
+ ASSERT_EQ(live_imm_files[i]->GetFileSize(),
+ kBlobFileSize + BlobLogFooter::kSize);
+ }
+ }
+}
+
+TEST_F(BlobDBTest, GarbageCollectionFailure) {
+ BlobDBOptions bdb_options;
+ bdb_options.min_blob_size = 0;
+ bdb_options.enable_garbage_collection = true;
+ bdb_options.garbage_collection_cutoff = 1.0;
+ bdb_options.disable_background_tasks = true;
+
+ Options db_options;
+ db_options.statistics = CreateDBStatistics();
+
+ Open(bdb_options, db_options);
+
+ // Write a couple of valid blobs.
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("dead", "beef"));
+
+ // Write a fake blob reference into the base DB that points to a non-existing
+ // blob file.
+ std::string blob_index;
+ BlobIndex::EncodeBlob(&blob_index, /* file_number */ 1000, /* offset */ 1234,
+ /* size */ 5678, kNoCompression);
+
+ WriteBatch batch;
+ ASSERT_OK(WriteBatchInternal::PutBlobIndex(
+ &batch, blob_db_->DefaultColumnFamily()->GetID(), "key", blob_index));
+ ASSERT_OK(blob_db_->GetRootDB()->Write(WriteOptions(), &batch));
+
+ auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
+ ASSERT_EQ(blob_files.size(), 1);
+ auto blob_file = blob_files[0];
+ ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_file));
+
+ ASSERT_TRUE(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)
+ .IsIOError());
+
+ const Statistics *const statistics = db_options.statistics.get();
+ assert(statistics);
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_NUM_FILES), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_NUM_NEW_FILES), 1);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_FAILURES), 1);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_NUM_KEYS_RELOCATED), 2);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_BYTES_RELOCATED), 7);
+}
+
+// File should be evicted after expiration.
+TEST_F(BlobDBTest, EvictExpiredFile) {
+ BlobDBOptions bdb_options;
+ bdb_options.ttl_range_secs = 100;
+ bdb_options.min_blob_size = 0;
+ bdb_options.disable_background_tasks = true;
+ Options options;
+ options.env = mock_env_.get();
+ Open(bdb_options, options);
+ mock_clock_->SetCurrentTime(50);
+ std::map<std::string, std::string> data;
+ ASSERT_OK(PutWithTTL("foo", "bar", 100, &data));
+ auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
+ ASSERT_EQ(1, blob_files.size());
+ auto blob_file = blob_files[0];
+ ASSERT_FALSE(blob_file->Immutable());
+ ASSERT_FALSE(blob_file->Obsolete());
+ VerifyDB(data);
+ mock_clock_->SetCurrentTime(250);
+ // The key should be expired now.
+ blob_db_impl()->TEST_EvictExpiredFiles();
+ ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size());
+ ASSERT_EQ(1, blob_db_impl()->TEST_GetObsoleteFiles().size());
+ ASSERT_TRUE(blob_file->Immutable());
+ ASSERT_TRUE(blob_file->Obsolete());
+ blob_db_impl()->TEST_DeleteObsoleteFiles();
+ ASSERT_EQ(0, blob_db_impl()->TEST_GetBlobFiles().size());
+ ASSERT_EQ(0, blob_db_impl()->TEST_GetObsoleteFiles().size());
+ // Make sure we don't return a garbage value after the blob file has been
+ // evicted while the blob index still exists in the LSM tree.
+ std::string val = "";
+ ASSERT_TRUE(blob_db_->Get(ReadOptions(), "foo", &val).IsNotFound());
+ ASSERT_EQ("", val);
+}
+
+TEST_F(BlobDBTest, DisableFileDeletions) {
+ BlobDBOptions bdb_options;
+ bdb_options.disable_background_tasks = true;
+ Open(bdb_options);
+ std::map<std::string, std::string> data;
+ for (bool force : {true, false}) {
+ ASSERT_OK(Put("foo", "v", &data));
+ auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
+ ASSERT_EQ(1, blob_files.size());
+ auto blob_file = blob_files[0];
+ ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_file));
+ blob_db_impl()->TEST_ObsoleteBlobFile(blob_file);
+ ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size());
+ ASSERT_EQ(1, blob_db_impl()->TEST_GetObsoleteFiles().size());
+ // Call DisableFileDeletions twice.
+ ASSERT_OK(blob_db_->DisableFileDeletions());
+ ASSERT_OK(blob_db_->DisableFileDeletions());
+ // File deletions should be disabled.
+ blob_db_impl()->TEST_DeleteObsoleteFiles();
+ ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size());
+ ASSERT_EQ(1, blob_db_impl()->TEST_GetObsoleteFiles().size());
+ VerifyDB(data);
+ // Enable file deletions once. If force=true, file deletions are re-enabled.
+ // Otherwise, they need to be enabled a second time.
+ ASSERT_OK(blob_db_->EnableFileDeletions(force));
+ blob_db_impl()->TEST_DeleteObsoleteFiles();
+ if (!force) {
+ ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size());
+ ASSERT_EQ(1, blob_db_impl()->TEST_GetObsoleteFiles().size());
+ VerifyDB(data);
+ // Call EnableFileDeletions a second time.
+ ASSERT_OK(blob_db_->EnableFileDeletions(false));
+ blob_db_impl()->TEST_DeleteObsoleteFiles();
+ }
+ // Regardless of the value of `force`, the file should be deleted by now.
+ ASSERT_EQ(0, blob_db_impl()->TEST_GetBlobFiles().size());
+ ASSERT_EQ(0, blob_db_impl()->TEST_GetObsoleteFiles().size());
+ VerifyDB({});
+ }
+}
+
+TEST_F(BlobDBTest, MaintainBlobFileToSstMapping) {
+ BlobDBOptions bdb_options;
+ bdb_options.enable_garbage_collection = true;
+ bdb_options.disable_background_tasks = true;
+ Open(bdb_options);
+
+ // Register some dummy blob files.
+ blob_db_impl()->TEST_AddDummyBlobFile(1, /* immutable_sequence */ 200);
+ blob_db_impl()->TEST_AddDummyBlobFile(2, /* immutable_sequence */ 300);
+ blob_db_impl()->TEST_AddDummyBlobFile(3, /* immutable_sequence */ 400);
+ blob_db_impl()->TEST_AddDummyBlobFile(4, /* immutable_sequence */ 500);
+ blob_db_impl()->TEST_AddDummyBlobFile(5, /* immutable_sequence */ 600);
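+ // The immutable_sequence is the sequence number at which each dummy file
+ // became immutable. A blob file that has lost all of its linked SSTs can only
+ // be marked obsolete once a flush has covered its immutable_sequence, as the
+ // flush/compaction simulations below demonstrate.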
+
+ // Initialize the blob <-> SST file mapping. First, add some SST files with
+ // blob file references, then some without.
+ std::vector<LiveFileMetaData> live_files;
+
+ for (uint64_t i = 1; i <= 10; ++i) {
+ LiveFileMetaData live_file;
+ live_file.file_number = i;
+ live_file.oldest_blob_file_number = ((i - 1) % 5) + 1;
+
+ live_files.emplace_back(live_file);
+ }
+
+ for (uint64_t i = 11; i <= 20; ++i) {
+ LiveFileMetaData live_file;
+ live_file.file_number = i;
+
+ live_files.emplace_back(live_file);
+ }
+
+ blob_db_impl()->TEST_InitializeBlobFileToSstMapping(live_files);
+
+ // Check that the blob <-> SST mappings have been correctly initialized.
+ auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
+
+ ASSERT_EQ(blob_files.size(), 5);
+
+ {
+ auto live_imm_files = blob_db_impl()->TEST_GetLiveImmNonTTLFiles();
+ ASSERT_EQ(live_imm_files.size(), 5);
+ for (size_t i = 0; i < 5; ++i) {
+ ASSERT_EQ(live_imm_files[i]->BlobFileNumber(), i + 1);
+ }
+
+ ASSERT_TRUE(blob_db_impl()->TEST_GetObsoleteFiles().empty());
+ }
+
+ {
+ const std::vector<std::unordered_set<uint64_t>> expected_sst_files{
+ {1, 6}, {2, 7}, {3, 8}, {4, 9}, {5, 10}};
+ const std::vector<bool> expected_obsolete{false, false, false, false,
+ false};
+ for (size_t i = 0; i < 5; ++i) {
+ const auto &blob_file = blob_files[i];
+ ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]);
+ ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]);
+ }
+
+ auto live_imm_files = blob_db_impl()->TEST_GetLiveImmNonTTLFiles();
+ ASSERT_EQ(live_imm_files.size(), 5);
+ for (size_t i = 0; i < 5; ++i) {
+ ASSERT_EQ(live_imm_files[i]->BlobFileNumber(), i + 1);
+ }
+
+ ASSERT_TRUE(blob_db_impl()->TEST_GetObsoleteFiles().empty());
+ }
+
+ // Simulate a flush where the SST does not reference any blob files.
+ {
+ FlushJobInfo info{};
+ info.file_number = 21;
+ info.smallest_seqno = 1;
+ info.largest_seqno = 100;
+
+ blob_db_impl()->TEST_ProcessFlushJobInfo(info);
+
+ const std::vector<std::unordered_set<uint64_t>> expected_sst_files{
+ {1, 6}, {2, 7}, {3, 8}, {4, 9}, {5, 10}};
+ const std::vector<bool> expected_obsolete{false, false, false, false,
+ false};
+ for (size_t i = 0; i < 5; ++i) {
+ const auto &blob_file = blob_files[i];
+ ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]);
+ ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]);
+ }
+
+ auto live_imm_files = blob_db_impl()->TEST_GetLiveImmNonTTLFiles();
+ ASSERT_EQ(live_imm_files.size(), 5);
+ for (size_t i = 0; i < 5; ++i) {
+ ASSERT_EQ(live_imm_files[i]->BlobFileNumber(), i + 1);
+ }
+
+ ASSERT_TRUE(blob_db_impl()->TEST_GetObsoleteFiles().empty());
+ }
+
+ // Simulate a flush where the SST references a blob file.
+ {
+ FlushJobInfo info{};
+ info.file_number = 22;
+ info.oldest_blob_file_number = 5;
+ info.smallest_seqno = 101;
+ info.largest_seqno = 200;
+
+ blob_db_impl()->TEST_ProcessFlushJobInfo(info);
+
+ const std::vector<std::unordered_set<uint64_t>> expected_sst_files{
+ {1, 6}, {2, 7}, {3, 8}, {4, 9}, {5, 10, 22}};
+ const std::vector<bool> expected_obsolete{false, false, false, false,
+ false};
+ for (size_t i = 0; i < 5; ++i) {
+ const auto &blob_file = blob_files[i];
+ ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]);
+ ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]);
+ }
+
+ auto live_imm_files = blob_db_impl()->TEST_GetLiveImmNonTTLFiles();
+ ASSERT_EQ(live_imm_files.size(), 5);
+ for (size_t i = 0; i < 5; ++i) {
+ ASSERT_EQ(live_imm_files[i]->BlobFileNumber(), i + 1);
+ }
+
+ ASSERT_TRUE(blob_db_impl()->TEST_GetObsoleteFiles().empty());
+ }
+
+ // Simulate a compaction. Some inputs and outputs have blob file references,
+ // some don't. There is also a trivial move (which means the SST appears on
+ // both the input and the output list). Blob file 1 loses all its linked SSTs,
+ // and since it got marked immutable at sequence number 200 which has already
+ // been flushed, it can be marked obsolete.
+ {
+ CompactionJobInfo info{};
+ info.input_file_infos.emplace_back(CompactionFileInfo{1, 1, 1});
+ info.input_file_infos.emplace_back(CompactionFileInfo{1, 2, 2});
+ info.input_file_infos.emplace_back(CompactionFileInfo{1, 6, 1});
+ info.input_file_infos.emplace_back(
+ CompactionFileInfo{1, 11, kInvalidBlobFileNumber});
+ info.input_file_infos.emplace_back(CompactionFileInfo{1, 22, 5});
+ info.output_file_infos.emplace_back(CompactionFileInfo{2, 22, 5});
+ info.output_file_infos.emplace_back(CompactionFileInfo{2, 23, 3});
+ info.output_file_infos.emplace_back(
+ CompactionFileInfo{2, 24, kInvalidBlobFileNumber});
+
+ blob_db_impl()->TEST_ProcessCompactionJobInfo(info);
+
+ const std::vector<std::unordered_set<uint64_t>> expected_sst_files{
+ {}, {7}, {3, 8, 23}, {4, 9}, {5, 10, 22}};
+ const std::vector<bool> expected_obsolete{true, false, false, false, false};
+ for (size_t i = 0; i < 5; ++i) {
+ const auto &blob_file = blob_files[i];
+ ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]);
+ ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]);
+ }
+
+ auto live_imm_files = blob_db_impl()->TEST_GetLiveImmNonTTLFiles();
+ ASSERT_EQ(live_imm_files.size(), 4);
+ for (size_t i = 0; i < 4; ++i) {
+ ASSERT_EQ(live_imm_files[i]->BlobFileNumber(), i + 2);
+ }
+
+ auto obsolete_files = blob_db_impl()->TEST_GetObsoleteFiles();
+ ASSERT_EQ(obsolete_files.size(), 1);
+ ASSERT_EQ(obsolete_files[0]->BlobFileNumber(), 1);
+ }
+
+ // Simulate a failed compaction. No mappings should be updated.
+ {
+ CompactionJobInfo info{};
+ info.input_file_infos.emplace_back(CompactionFileInfo{1, 7, 2});
+ info.input_file_infos.emplace_back(CompactionFileInfo{2, 22, 5});
+ info.output_file_infos.emplace_back(CompactionFileInfo{2, 25, 3});
+ info.status = Status::Corruption();
+
+ blob_db_impl()->TEST_ProcessCompactionJobInfo(info);
+
+ const std::vector<std::unordered_set<uint64_t>> expected_sst_files{
+ {}, {7}, {3, 8, 23}, {4, 9}, {5, 10, 22}};
+ const std::vector<bool> expected_obsolete{true, false, false, false, false};
+ for (size_t i = 0; i < 5; ++i) {
+ const auto &blob_file = blob_files[i];
+ ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]);
+ ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]);
+ }
+
+ auto live_imm_files = blob_db_impl()->TEST_GetLiveImmNonTTLFiles();
+ ASSERT_EQ(live_imm_files.size(), 4);
+ for (size_t i = 0; i < 4; ++i) {
+ ASSERT_EQ(live_imm_files[i]->BlobFileNumber(), i + 2);
+ }
+
+ auto obsolete_files = blob_db_impl()->TEST_GetObsoleteFiles();
+ ASSERT_EQ(obsolete_files.size(), 1);
+ ASSERT_EQ(obsolete_files[0]->BlobFileNumber(), 1);
+ }
+
+ // Simulate another compaction. Blob file 2 loses all its linked SSTs
+ // but since it got marked immutable at sequence number 300 which hasn't
+ // been flushed yet, it cannot be marked obsolete at this point.
+ {
+ CompactionJobInfo info{};
+ info.input_file_infos.emplace_back(CompactionFileInfo{1, 7, 2});
+ info.input_file_infos.emplace_back(CompactionFileInfo{2, 22, 5});
+ info.output_file_infos.emplace_back(CompactionFileInfo{2, 25, 3});
+
+ blob_db_impl()->TEST_ProcessCompactionJobInfo(info);
+
+ const std::vector<std::unordered_set<uint64_t>> expected_sst_files{
+ {}, {}, {3, 8, 23, 25}, {4, 9}, {5, 10}};
+ const std::vector<bool> expected_obsolete{true, false, false, false, false};
+ for (size_t i = 0; i < 5; ++i) {
+ const auto &blob_file = blob_files[i];
+ ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]);
+ ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]);
+ }
+
+ auto live_imm_files = blob_db_impl()->TEST_GetLiveImmNonTTLFiles();
+ ASSERT_EQ(live_imm_files.size(), 4);
+ for (size_t i = 0; i < 4; ++i) {
+ ASSERT_EQ(live_imm_files[i]->BlobFileNumber(), i + 2);
+ }
+
+ auto obsolete_files = blob_db_impl()->TEST_GetObsoleteFiles();
+ ASSERT_EQ(obsolete_files.size(), 1);
+ ASSERT_EQ(obsolete_files[0]->BlobFileNumber(), 1);
+ }
+
+ // Simulate a flush with largest sequence number 300. This will make it
+ // possible to mark blob file 2 obsolete.
+ {
+ FlushJobInfo info{};
+ info.file_number = 26;
+ info.smallest_seqno = 201;
+ info.largest_seqno = 300;
+
+ blob_db_impl()->TEST_ProcessFlushJobInfo(info);
+
+ const std::vector<std::unordered_set<uint64_t>> expected_sst_files{
+ {}, {}, {3, 8, 23, 25}, {4, 9}, {5, 10}};
+ const std::vector<bool> expected_obsolete{true, true, false, false, false};
+ for (size_t i = 0; i < 5; ++i) {
+ const auto &blob_file = blob_files[i];
+ ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]);
+ ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]);
+ }
+
+ auto live_imm_files = blob_db_impl()->TEST_GetLiveImmNonTTLFiles();
+ ASSERT_EQ(live_imm_files.size(), 3);
+ for (size_t i = 0; i < 3; ++i) {
+ ASSERT_EQ(live_imm_files[i]->BlobFileNumber(), i + 3);
+ }
+
+ auto obsolete_files = blob_db_impl()->TEST_GetObsoleteFiles();
+ ASSERT_EQ(obsolete_files.size(), 2);
+ ASSERT_EQ(obsolete_files[0]->BlobFileNumber(), 1);
+ ASSERT_EQ(obsolete_files[1]->BlobFileNumber(), 2);
+ }
+}
+
+TEST_F(BlobDBTest, ShutdownWait) {
+ BlobDBOptions bdb_options;
+ bdb_options.ttl_range_secs = 100;
+ bdb_options.min_blob_size = 0;
+ bdb_options.disable_background_tasks = false;
+ Options options;
+ options.env = mock_env_.get();
+
+ SyncPoint::GetInstance()->LoadDependency({
+ {"BlobDBImpl::EvictExpiredFiles:0", "BlobDBTest.ShutdownWait:0"},
+ {"BlobDBTest.ShutdownWait:1", "BlobDBImpl::EvictExpiredFiles:1"},
+ {"BlobDBImpl::EvictExpiredFiles:2", "BlobDBTest.ShutdownWait:2"},
+ {"BlobDBTest.ShutdownWait:3", "BlobDBImpl::EvictExpiredFiles:3"},
+ });
+ // Force all tasks to be scheduled immediately.
+ SyncPoint::GetInstance()->SetCallBack(
+ "TimeQueue::Add:item.end", [&](void *arg) {
+ std::chrono::steady_clock::time_point *tp =
+ static_cast<std::chrono::steady_clock::time_point *>(arg);
+ *tp =
+ std::chrono::steady_clock::now() - std::chrono::milliseconds(10000);
+ });
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobDBImpl::EvictExpiredFiles:cb", [&](void * /*arg*/) {
+ // Sleep 3 ms to increase the chance of a data race.
+ // We've synced up the code so that EvictExpiredFiles()
+ // is called concurrently with ~BlobDBImpl().
+ // ~BlobDBImpl() is supposed to wait for all background
+ // tasks to shut down before doing anything else. In order
+ // to use the same test to reproduce a bug in the waiting
+ // logic, we wait a little bit here, so that TSAN can
+ // catch the data race.
+ // We should improve the test if we find a better way.
+ Env::Default()->SleepForMicroseconds(3000);
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Open(bdb_options, options);
+ mock_clock_->SetCurrentTime(50);
+ std::map<std::string, std::string> data;
+ ASSERT_OK(PutWithTTL("foo", "bar", 100, &data));
+ auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
+ ASSERT_EQ(1, blob_files.size());
+ auto blob_file = blob_files[0];
+ ASSERT_FALSE(blob_file->Immutable());
+ ASSERT_FALSE(blob_file->Obsolete());
+ VerifyDB(data);
+
+ TEST_SYNC_POINT("BlobDBTest.ShutdownWait:0");
+ mock_clock_->SetCurrentTime(250);
+ // The key should have expired by now.
+ TEST_SYNC_POINT("BlobDBTest.ShutdownWait:1");
+
+ TEST_SYNC_POINT("BlobDBTest.ShutdownWait:2");
+ TEST_SYNC_POINT("BlobDBTest.ShutdownWait:3");
+ Close();
+
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(BlobDBTest, SyncBlobFileBeforeClose) {
+ Options options;
+ options.statistics = CreateDBStatistics();
+
+ BlobDBOptions blob_options;
+ blob_options.min_blob_size = 0;
+ blob_options.bytes_per_sync = 1 << 20;
+ blob_options.disable_background_tasks = true;
+
+ Open(blob_options, options);
+
+ ASSERT_OK(Put("foo", "bar"));
+
+ auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
+ ASSERT_EQ(blob_files.size(), 1);
+
+ ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[0]));
+ ASSERT_EQ(options.statistics->getTickerCount(BLOB_DB_BLOB_FILE_SYNCED), 1);
+}
+
+TEST_F(BlobDBTest, SyncBlobFileBeforeCloseIOError) {
+ Options options;
+ options.env = fault_injection_env_.get();
+
+ BlobDBOptions blob_options;
+ blob_options.min_blob_size = 0;
+ blob_options.bytes_per_sync = 1 << 20;
+ blob_options.disable_background_tasks = true;
+
+ Open(blob_options, options);
+
+ ASSERT_OK(Put("foo", "bar"));
+
+ auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
+ ASSERT_EQ(blob_files.size(), 1);
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobLogWriter::Sync", [this](void * /* arg */) {
+ fault_injection_env_->SetFilesystemActive(false, Status::IOError());
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ const Status s = blob_db_impl()->TEST_CloseBlobFile(blob_files[0]);
+
+ fault_injection_env_->SetFilesystemActive(true);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_TRUE(s.IsIOError());
+}
+
+} // namespace blob_db
+} // namespace ROCKSDB_NAMESPACE
+
+// A black-box test for the TTL wrapper around RocksDB
+int main(int argc, char **argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as BlobDB is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
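The ShutdownWait test above uses SyncPoint::LoadDependency to impose a happens-before order between named points: each {predecessor, successor} pair means execution may not pass the successor point until the predecessor point has been reached. The following is a rough standalone illustration of that rendezvous idea using a condition variable; the Rendezvous class and the point names are hypothetical stand-ins, not RocksDB's actual SyncPoint implementation.

#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <set>
#include <string>
#include <thread>

// Hypothetical stand-in for the predecessor/successor pairs that
// SyncPoint::LoadDependency establishes: Wait("A") blocks until Reached("A").
class Rendezvous {
 public:
  void Reached(const std::string& point) {
    std::lock_guard<std::mutex> lock(mu_);
    reached_.insert(point);
    cv_.notify_all();
  }
  void Wait(const std::string& point) {
    std::unique_lock<std::mutex> lock(mu_);
    cv_.wait(lock, [&] { return reached_.count(point) > 0; });
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  std::set<std::string> reached_;
};

int main() {
  Rendezvous sync;
  // Background task: plays the role of EvictExpiredFiles in the test.
  std::thread background([&] {
    sync.Reached("EvictExpiredFiles:0");  // lets the test proceed
    sync.Wait("ShutdownWait:1");          // wait until the test expires the key
    std::printf("background: evicting expired files\n");
    sync.Reached("EvictExpiredFiles:2");
  });
  // Foreground "test" thread.
  sync.Wait("EvictExpiredFiles:0");
  std::printf("test: key expired, releasing background task\n");
  sync.Reached("ShutdownWait:1");
  sync.Wait("EvictExpiredFiles:2");
  background.join();
  return 0;
}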
diff --git a/src/rocksdb/utilities/blob_db/blob_dump_tool.cc b/src/rocksdb/utilities/blob_db/blob_dump_tool.cc
new file mode 100644
index 000000000..1e0632990
--- /dev/null
+++ b/src/rocksdb/utilities/blob_db/blob_dump_tool.cc
@@ -0,0 +1,282 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#ifndef ROCKSDB_LITE
+
+#include "utilities/blob_db/blob_dump_tool.h"
+
+#include <stdio.h>
+
+#include <cinttypes>
+#include <iostream>
+#include <memory>
+#include <string>
+
+#include "file/random_access_file_reader.h"
+#include "file/readahead_raf.h"
+#include "port/port.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/file_system.h"
+#include "table/format.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace blob_db {
+
+BlobDumpTool::BlobDumpTool()
+ : reader_(nullptr), buffer_(nullptr), buffer_size_(0) {}
+
+Status BlobDumpTool::Run(const std::string& filename, DisplayType show_key,
+ DisplayType show_blob,
+ DisplayType show_uncompressed_blob,
+ bool show_summary) {
+ constexpr size_t kReadaheadSize = 2 * 1024 * 1024;
+ Status s;
+ const auto fs = FileSystem::Default();
+ IOOptions io_opts;
+ s = fs->FileExists(filename, io_opts, nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ uint64_t file_size = 0;
+ s = fs->GetFileSize(filename, io_opts, &file_size, nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ std::unique_ptr<FSRandomAccessFile> file;
+ s = fs->NewRandomAccessFile(filename, FileOptions(), &file, nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ file = NewReadaheadRandomAccessFile(std::move(file), kReadaheadSize);
+ if (file_size == 0) {
+ return Status::Corruption("File is empty.");
+ }
+ reader_.reset(new RandomAccessFileReader(std::move(file), filename));
+ uint64_t offset = 0;
+ uint64_t footer_offset = 0;
+ CompressionType compression = kNoCompression;
+ s = DumpBlobLogHeader(&offset, &compression);
+ if (!s.ok()) {
+ return s;
+ }
+ s = DumpBlobLogFooter(file_size, &footer_offset);
+ if (!s.ok()) {
+ return s;
+ }
+ uint64_t total_records = 0;
+ uint64_t total_key_size = 0;
+ uint64_t total_blob_size = 0;
+ uint64_t total_uncompressed_blob_size = 0;
+ if (show_key != DisplayType::kNone || show_summary) {
+ while (offset < footer_offset) {
+ s = DumpRecord(show_key, show_blob, show_uncompressed_blob, show_summary,
+ compression, &offset, &total_records, &total_key_size,
+ &total_blob_size, &total_uncompressed_blob_size);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ if (show_summary) {
+ fprintf(stdout, "Summary:\n");
+ fprintf(stdout, " total records: %" PRIu64 "\n", total_records);
+ fprintf(stdout, " total key size: %" PRIu64 "\n", total_key_size);
+ fprintf(stdout, " total blob size: %" PRIu64 "\n", total_blob_size);
+ if (compression != kNoCompression) {
+ fprintf(stdout, " total raw blob size: %" PRIu64 "\n",
+ total_uncompressed_blob_size);
+ }
+ }
+ return s;
+}
+
+Status BlobDumpTool::Read(uint64_t offset, size_t size, Slice* result) {
+ if (buffer_size_ < size) {
+ if (buffer_size_ == 0) {
+ buffer_size_ = 4096;
+ }
+ while (buffer_size_ < size) {
+ buffer_size_ *= 2;
+ }
+ buffer_.reset(new char[buffer_size_]);
+ }
+ Status s = reader_->Read(IOOptions(), offset, size, result, buffer_.get(),
+ nullptr, Env::IO_TOTAL /* rate_limiter_priority */);
+ if (!s.ok()) {
+ return s;
+ }
+ if (result->size() != size) {
+ return Status::Corruption("Reach the end of the file unexpectedly.");
+ }
+ return s;
+}
+
+Status BlobDumpTool::DumpBlobLogHeader(uint64_t* offset,
+ CompressionType* compression) {
+ Slice slice;
+ Status s = Read(0, BlobLogHeader::kSize, &slice);
+ if (!s.ok()) {
+ return s;
+ }
+ BlobLogHeader header;
+ s = header.DecodeFrom(slice);
+ if (!s.ok()) {
+ return s;
+ }
+ fprintf(stdout, "Blob log header:\n");
+ fprintf(stdout, " Version : %" PRIu32 "\n", header.version);
+ fprintf(stdout, " Column Family ID : %" PRIu32 "\n",
+ header.column_family_id);
+ std::string compression_str;
+ if (!GetStringFromCompressionType(&compression_str, header.compression)
+ .ok()) {
+ compression_str = "Unrecongnized compression type (" +
+ std::to_string((int)header.compression) + ")";
+ }
+ fprintf(stdout, " Compression : %s\n", compression_str.c_str());
+ fprintf(stdout, " Expiration range : %s\n",
+ GetString(header.expiration_range).c_str());
+ *offset = BlobLogHeader::kSize;
+ *compression = header.compression;
+ return s;
+}
+
+Status BlobDumpTool::DumpBlobLogFooter(uint64_t file_size,
+ uint64_t* footer_offset) {
+ auto no_footer = [&]() {
+ *footer_offset = file_size;
+ fprintf(stdout, "No blob log footer.\n");
+ return Status::OK();
+ };
+ if (file_size < BlobLogHeader::kSize + BlobLogFooter::kSize) {
+ return no_footer();
+ }
+ Slice slice;
+ *footer_offset = file_size - BlobLogFooter::kSize;
+ Status s = Read(*footer_offset, BlobLogFooter::kSize, &slice);
+ if (!s.ok()) {
+ return s;
+ }
+ BlobLogFooter footer;
+ s = footer.DecodeFrom(slice);
+ if (!s.ok()) {
+ return no_footer();
+ }
+ fprintf(stdout, "Blob log footer:\n");
+ fprintf(stdout, " Blob count : %" PRIu64 "\n", footer.blob_count);
+ fprintf(stdout, " Expiration Range : %s\n",
+ GetString(footer.expiration_range).c_str());
+ return s;
+}
+
+Status BlobDumpTool::DumpRecord(DisplayType show_key, DisplayType show_blob,
+ DisplayType show_uncompressed_blob,
+ bool show_summary, CompressionType compression,
+ uint64_t* offset, uint64_t* total_records,
+ uint64_t* total_key_size,
+ uint64_t* total_blob_size,
+ uint64_t* total_uncompressed_blob_size) {
+ if (show_key != DisplayType::kNone) {
+ fprintf(stdout, "Read record with offset 0x%" PRIx64 " (%" PRIu64 "):\n",
+ *offset, *offset);
+ }
+ Slice slice;
+ Status s = Read(*offset, BlobLogRecord::kHeaderSize, &slice);
+ if (!s.ok()) {
+ return s;
+ }
+ BlobLogRecord record;
+ s = record.DecodeHeaderFrom(slice);
+ if (!s.ok()) {
+ return s;
+ }
+ uint64_t key_size = record.key_size;
+ uint64_t value_size = record.value_size;
+ if (show_key != DisplayType::kNone) {
+ fprintf(stdout, " key size : %" PRIu64 "\n", key_size);
+ fprintf(stdout, " value size : %" PRIu64 "\n", value_size);
+ fprintf(stdout, " expiration : %" PRIu64 "\n", record.expiration);
+ }
+ *offset += BlobLogRecord::kHeaderSize;
+ s = Read(*offset, static_cast<size_t>(key_size + value_size), &slice);
+ if (!s.ok()) {
+ return s;
+ }
+ // Decompress value
+ std::string uncompressed_value;
+ if (compression != kNoCompression &&
+ (show_uncompressed_blob != DisplayType::kNone || show_summary)) {
+ BlockContents contents;
+ UncompressionContext context(compression);
+ UncompressionInfo info(context, UncompressionDict::GetEmptyDict(),
+ compression);
+ s = UncompressBlockData(
+ info, slice.data() + key_size, static_cast<size_t>(value_size),
+ &contents, 2 /*compress_format_version*/, ImmutableOptions(Options()));
+ if (!s.ok()) {
+ return s;
+ }
+ uncompressed_value = contents.data.ToString();
+ }
+ if (show_key != DisplayType::kNone) {
+ fprintf(stdout, " key : ");
+ DumpSlice(Slice(slice.data(), static_cast<size_t>(key_size)), show_key);
+ if (show_blob != DisplayType::kNone) {
+ fprintf(stdout, " blob : ");
+ DumpSlice(Slice(slice.data() + static_cast<size_t>(key_size),
+ static_cast<size_t>(value_size)),
+ show_blob);
+ }
+ if (show_uncompressed_blob != DisplayType::kNone) {
+ fprintf(stdout, " raw blob : ");
+ DumpSlice(Slice(uncompressed_value), show_uncompressed_blob);
+ }
+ }
+ *offset += key_size + value_size;
+ *total_records += 1;
+ *total_key_size += key_size;
+ *total_blob_size += value_size;
+ *total_uncompressed_blob_size += uncompressed_value.size();
+ return s;
+}
+
+void BlobDumpTool::DumpSlice(const Slice s, DisplayType type) {
+ if (type == DisplayType::kRaw) {
+ fprintf(stdout, "%s\n", s.ToString().c_str());
+ } else if (type == DisplayType::kHex) {
+ fprintf(stdout, "%s\n", s.ToString(true /*hex*/).c_str());
+ } else if (type == DisplayType::kDetail) {
+ char buf[100];
+ for (size_t i = 0; i < s.size(); i += 16) {
+ memset(buf, 0, sizeof(buf));
+ for (size_t j = 0; j < 16 && i + j < s.size(); j++) {
+ unsigned char c = s[i + j];
+ snprintf(buf + j * 3 + 15, 2, "%x", c >> 4);
+ snprintf(buf + j * 3 + 16, 2, "%x", c & 0xf);
+ snprintf(buf + j + 65, 2, "%c", (0x20 <= c && c <= 0x7e) ? c : '.');
+ }
+ for (size_t p = 0; p + 1 < sizeof(buf); p++) {
+ if (buf[p] == 0) {
+ buf[p] = ' ';
+ }
+ }
+ fprintf(stdout, "%s\n", i == 0 ? buf + 15 : buf);
+ }
+ }
+}
+
+template <class T>
+std::string BlobDumpTool::GetString(std::pair<T, T> p) {
+ if (p.first == 0 && p.second == 0) {
+ return "nil";
+ }
+ return "(" + std::to_string(p.first) + ", " + std::to_string(p.second) + ")";
+}
+
+} // namespace blob_db
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
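The kDetail branch of DumpSlice above prints 16 bytes per line into a 100-byte scratch buffer: the two hex digits of byte j land at offsets j*3+15 and j*3+16, the printable character lands at offset j+65, and the NUL bytes left behind by snprintf are turned into spaces before printing. A minimal standalone sketch of the same hex+ASCII layout (not the tool itself) may make the offset arithmetic easier to follow:

#include <cstdio>
#include <cstring>
#include <string>

// Standalone sketch: print a buffer 16 bytes per line, hex columns followed
// by an ASCII column, mirroring the layout of BlobDumpTool::DumpSlice(kDetail).
void HexDumpDetail(const std::string& s) {
  char buf[100];
  for (size_t i = 0; i < s.size(); i += 16) {
    std::memset(buf, ' ', sizeof(buf));
    for (size_t j = 0; j < 16 && i + j < s.size(); j++) {
      unsigned char c = static_cast<unsigned char>(s[i + j]);
      static const char* kHex = "0123456789abcdef";
      buf[j * 3 + 15] = kHex[c >> 4];                    // high nibble
      buf[j * 3 + 16] = kHex[c & 0xf];                   // low nibble
      buf[j + 65] = (0x20 <= c && c <= 0x7e) ? c : '.';  // printable column
    }
    buf[81] = '\0';  // terminate after the ASCII column
    // The first line skips the 15 leading spaces, as in the original.
    std::printf("%s\n", i == 0 ? buf + 15 : buf);
  }
}

int main() {
  HexDumpDetail("hello, blob dump tool\x01\x02\x03");
  return 0;
}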
diff --git a/src/rocksdb/utilities/blob_db/blob_dump_tool.h b/src/rocksdb/utilities/blob_db/blob_dump_tool.h
new file mode 100644
index 000000000..bece564e1
--- /dev/null
+++ b/src/rocksdb/utilities/blob_db/blob_dump_tool.h
@@ -0,0 +1,58 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "db/blob/blob_log_format.h"
+#include "file/random_access_file_reader.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace blob_db {
+
+class BlobDumpTool {
+ public:
+ enum class DisplayType {
+ kNone,
+ kRaw,
+ kHex,
+ kDetail,
+ };
+
+ BlobDumpTool();
+
+ Status Run(const std::string& filename, DisplayType show_key,
+ DisplayType show_blob, DisplayType show_uncompressed_blob,
+ bool show_summary);
+
+ private:
+ std::unique_ptr<RandomAccessFileReader> reader_;
+ std::unique_ptr<char[]> buffer_;
+ size_t buffer_size_;
+
+ Status Read(uint64_t offset, size_t size, Slice* result);
+ Status DumpBlobLogHeader(uint64_t* offset, CompressionType* compression);
+ Status DumpBlobLogFooter(uint64_t file_size, uint64_t* footer_offset);
+ Status DumpRecord(DisplayType show_key, DisplayType show_blob,
+ DisplayType show_uncompressed_blob, bool show_summary,
+ CompressionType compression, uint64_t* offset,
+ uint64_t* total_records, uint64_t* total_key_size,
+ uint64_t* total_blob_size,
+ uint64_t* total_uncompressed_blob_size);
+ void DumpSlice(const Slice s, DisplayType type);
+
+ template <class T>
+ std::string GetString(std::pair<T, T> p);
+};
+
+} // namespace blob_db
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
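A minimal sketch of how the tool declared above might be driven, using only the Run() signature and the DisplayType enum shown in this header. It assumes a non-ROCKSDB_LITE build inside the RocksDB source tree; the driver itself is illustrative and is not the project's actual blob dump CLI.

#include <cstdio>

#include "utilities/blob_db/blob_dump_tool.h"

// Sketch driver for BlobDumpTool; not the actual blob dump CLI.
int main(int argc, char** argv) {
  if (argc < 2) {
    std::fprintf(stderr, "usage: %s <blob_file>\n", argv[0]);
    return 1;
  }
  using DisplayType = ROCKSDB_NAMESPACE::blob_db::BlobDumpTool::DisplayType;
  ROCKSDB_NAMESPACE::blob_db::BlobDumpTool tool;
  // Print keys in hex, skip blob payloads, and emit the summary section.
  ROCKSDB_NAMESPACE::Status s =
      tool.Run(argv[1], /*show_key=*/DisplayType::kHex,
               /*show_blob=*/DisplayType::kNone,
               /*show_uncompressed_blob=*/DisplayType::kNone,
               /*show_summary=*/true);
  if (!s.ok()) {
    std::fprintf(stderr, "dump failed: %s\n", s.ToString().c_str());
    return 1;
  }
  return 0;
}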
diff --git a/src/rocksdb/utilities/blob_db/blob_file.cc b/src/rocksdb/utilities/blob_db/blob_file.cc
new file mode 100644
index 000000000..c68e557c6
--- /dev/null
+++ b/src/rocksdb/utilities/blob_db/blob_file.cc
@@ -0,0 +1,318 @@
+
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#ifndef ROCKSDB_LITE
+#include "utilities/blob_db/blob_file.h"
+
+#include <stdio.h>
+
+#include <algorithm>
+#include <cinttypes>
+#include <memory>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "file/filename.h"
+#include "file/readahead_raf.h"
+#include "logging/logging.h"
+#include "utilities/blob_db/blob_db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace blob_db {
+
+BlobFile::BlobFile(const BlobDBImpl* p, const std::string& bdir, uint64_t fn,
+ Logger* info_log)
+ : parent_(p), path_to_dir_(bdir), file_number_(fn), info_log_(info_log) {}
+
+BlobFile::BlobFile(const BlobDBImpl* p, const std::string& bdir, uint64_t fn,
+ Logger* info_log, uint32_t column_family_id,
+ CompressionType compression, bool has_ttl,
+ const ExpirationRange& expiration_range)
+ : parent_(p),
+ path_to_dir_(bdir),
+ file_number_(fn),
+ info_log_(info_log),
+ column_family_id_(column_family_id),
+ compression_(compression),
+ has_ttl_(has_ttl),
+ expiration_range_(expiration_range),
+ header_(column_family_id, compression, has_ttl, expiration_range),
+ header_valid_(true) {}
+
+BlobFile::~BlobFile() {
+ if (obsolete_) {
+ std::string pn(PathName());
+ Status s = Env::Default()->DeleteFile(PathName());
+ if (!s.ok()) {
+ // ROCKS_LOG_INFO(db_options_.info_log,
+ // "File could not be deleted %s", pn.c_str());
+ }
+ }
+}
+
+uint32_t BlobFile::GetColumnFamilyId() const { return column_family_id_; }
+
+std::string BlobFile::PathName() const {
+ return BlobFileName(path_to_dir_, file_number_);
+}
+
+std::string BlobFile::DumpState() const {
+ char str[1000];
+ snprintf(
+ str, sizeof(str),
+ "path: %s fn: %" PRIu64 " blob_count: %" PRIu64 " file_size: %" PRIu64
+ " closed: %d obsolete: %d expiration_range: (%" PRIu64 ", %" PRIu64
+ "), writer: %d reader: %d",
+ path_to_dir_.c_str(), file_number_, blob_count_.load(), file_size_.load(),
+ closed_.load(), obsolete_.load(), expiration_range_.first,
+ expiration_range_.second, (!!log_writer_), (!!ra_file_reader_));
+ return str;
+}
+
+void BlobFile::MarkObsolete(SequenceNumber sequence) {
+ assert(Immutable());
+ obsolete_sequence_ = sequence;
+ obsolete_.store(true);
+}
+
+Status BlobFile::WriteFooterAndCloseLocked(SequenceNumber sequence) {
+ BlobLogFooter footer;
+ footer.blob_count = blob_count_;
+ if (HasTTL()) {
+ footer.expiration_range = expiration_range_;
+ }
+
+ // this will close the file and reset the Writable File Pointer.
+ Status s = log_writer_->AppendFooter(footer, /* checksum_method */ nullptr,
+ /* checksum_value */ nullptr);
+ if (s.ok()) {
+ closed_ = true;
+ immutable_sequence_ = sequence;
+ file_size_ += BlobLogFooter::kSize;
+ }
+ // delete the sequential writer
+ log_writer_.reset();
+ return s;
+}
+
+Status BlobFile::ReadFooter(BlobLogFooter* bf) {
+ if (file_size_ < (BlobLogHeader::kSize + BlobLogFooter::kSize)) {
+ return Status::IOError("File does not have footer", PathName());
+ }
+
+ uint64_t footer_offset = file_size_ - BlobLogFooter::kSize;
+ // assume that ra_file_reader_ is valid before we enter this
+ assert(ra_file_reader_);
+
+ Slice result;
+ std::string buf;
+ AlignedBuf aligned_buf;
+ Status s;
+ // TODO: rate limit reading footers from blob files.
+ if (ra_file_reader_->use_direct_io()) {
+ s = ra_file_reader_->Read(IOOptions(), footer_offset, BlobLogFooter::kSize,
+ &result, nullptr, &aligned_buf,
+ Env::IO_TOTAL /* rate_limiter_priority */);
+ } else {
+ buf.reserve(BlobLogFooter::kSize + 10);
+ s = ra_file_reader_->Read(IOOptions(), footer_offset, BlobLogFooter::kSize,
+ &result, &buf[0], nullptr,
+ Env::IO_TOTAL /* rate_limiter_priority */);
+ }
+ if (!s.ok()) return s;
+ if (result.size() != BlobLogFooter::kSize) {
+ // should not happen
+ return Status::IOError("EOF reached before footer");
+ }
+
+ s = bf->DecodeFrom(result);
+ return s;
+}
+
+Status BlobFile::SetFromFooterLocked(const BlobLogFooter& footer) {
+ blob_count_ = footer.blob_count;
+ expiration_range_ = footer.expiration_range;
+ closed_ = true;
+ return Status::OK();
+}
+
+Status BlobFile::Fsync() {
+ Status s;
+ if (log_writer_.get()) {
+ s = log_writer_->Sync();
+ }
+ return s;
+}
+
+void BlobFile::CloseRandomAccessLocked() {
+ ra_file_reader_.reset();
+ last_access_ = -1;
+}
+
+Status BlobFile::GetReader(Env* env, const FileOptions& file_options,
+ std::shared_ptr<RandomAccessFileReader>* reader,
+ bool* fresh_open) {
+ assert(reader != nullptr);
+ assert(fresh_open != nullptr);
+ *fresh_open = false;
+ int64_t current_time = 0;
+ if (env->GetCurrentTime(&current_time).ok()) {
+ last_access_.store(current_time);
+ }
+ Status s;
+
+ {
+ ReadLock lockbfile_r(&mutex_);
+ if (ra_file_reader_) {
+ *reader = ra_file_reader_;
+ return s;
+ }
+ }
+
+ WriteLock lockbfile_w(&mutex_);
+ // Double check.
+ if (ra_file_reader_) {
+ *reader = ra_file_reader_;
+ return s;
+ }
+
+ std::unique_ptr<FSRandomAccessFile> rfile;
+ s = env->GetFileSystem()->NewRandomAccessFile(PathName(), file_options,
+ &rfile, nullptr);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(info_log_,
+ "Failed to open blob file for random-read: %s status: '%s'"
+ " exists: '%s'",
+ PathName().c_str(), s.ToString().c_str(),
+ env->FileExists(PathName()).ToString().c_str());
+ return s;
+ }
+
+ ra_file_reader_ =
+ std::make_shared<RandomAccessFileReader>(std::move(rfile), PathName());
+ *reader = ra_file_reader_;
+ *fresh_open = true;
+ return s;
+}
+
+Status BlobFile::ReadMetadata(const std::shared_ptr<FileSystem>& fs,
+ const FileOptions& file_options) {
+ assert(Immutable());
+ // Get file size.
+ uint64_t file_size = 0;
+ Status s =
+ fs->GetFileSize(PathName(), file_options.io_options, &file_size, nullptr);
+ if (s.ok()) {
+ file_size_ = file_size;
+ } else {
+ ROCKS_LOG_ERROR(info_log_,
+ "Failed to get size of blob file %" PRIu64 ", status: %s",
+ file_number_, s.ToString().c_str());
+ return s;
+ }
+ if (file_size < BlobLogHeader::kSize) {
+ ROCKS_LOG_ERROR(
+ info_log_, "Incomplete blob file blob file %" PRIu64 ", size: %" PRIu64,
+ file_number_, file_size);
+ return Status::Corruption("Incomplete blob file header.");
+ }
+
+ // Create file reader.
+ std::unique_ptr<RandomAccessFileReader> file_reader;
+ s = RandomAccessFileReader::Create(fs, PathName(), file_options, &file_reader,
+ nullptr);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(info_log_,
+ "Failed to open blob file %" PRIu64 ", status: %s",
+ file_number_, s.ToString().c_str());
+ return s;
+ }
+
+ // Read file header.
+ std::string header_buf;
+ AlignedBuf aligned_buf;
+ Slice header_slice;
+ // TODO: rate limit reading headers from blob files.
+ if (file_reader->use_direct_io()) {
+ s = file_reader->Read(IOOptions(), 0, BlobLogHeader::kSize, &header_slice,
+ nullptr, &aligned_buf,
+ Env::IO_TOTAL /* rate_limiter_priority */);
+ } else {
+ header_buf.reserve(BlobLogHeader::kSize);
+ s = file_reader->Read(IOOptions(), 0, BlobLogHeader::kSize, &header_slice,
+ &header_buf[0], nullptr,
+ Env::IO_TOTAL /* rate_limiter_priority */);
+ }
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(
+ info_log_, "Failed to read header of blob file %" PRIu64 ", status: %s",
+ file_number_, s.ToString().c_str());
+ return s;
+ }
+ BlobLogHeader header;
+ s = header.DecodeFrom(header_slice);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(info_log_,
+ "Failed to decode header of blob file %" PRIu64
+ ", status: %s",
+ file_number_, s.ToString().c_str());
+ return s;
+ }
+ column_family_id_ = header.column_family_id;
+ compression_ = header.compression;
+ has_ttl_ = header.has_ttl;
+ if (has_ttl_) {
+ expiration_range_ = header.expiration_range;
+ }
+ header_valid_ = true;
+
+ // Read file footer.
+ if (file_size_ < BlobLogHeader::kSize + BlobLogFooter::kSize) {
+ // OK not to have footer.
+ assert(!footer_valid_);
+ return Status::OK();
+ }
+ std::string footer_buf;
+ Slice footer_slice;
+ // TODO: rate limit reading footers from blob files.
+ if (file_reader->use_direct_io()) {
+ s = file_reader->Read(IOOptions(), file_size - BlobLogFooter::kSize,
+ BlobLogFooter::kSize, &footer_slice, nullptr,
+ &aligned_buf,
+ Env::IO_TOTAL /* rate_limiter_priority */);
+ } else {
+ footer_buf.reserve(BlobLogFooter::kSize);
+ s = file_reader->Read(IOOptions(), file_size - BlobLogFooter::kSize,
+ BlobLogFooter::kSize, &footer_slice, &footer_buf[0],
+ nullptr, Env::IO_TOTAL /* rate_limiter_priority */);
+ }
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(
+ info_log_, "Failed to read footer of blob file %" PRIu64 ", status: %s",
+ file_number_, s.ToString().c_str());
+ return s;
+ }
+ BlobLogFooter footer;
+ s = footer.DecodeFrom(footer_slice);
+ if (!s.ok()) {
+ // OK not to have footer.
+ assert(!footer_valid_);
+ return Status::OK();
+ }
+ blob_count_ = footer.blob_count;
+ if (has_ttl_) {
+ assert(header.expiration_range.first <= footer.expiration_range.first);
+ assert(header.expiration_range.second >= footer.expiration_range.second);
+ expiration_range_ = footer.expiration_range;
+ }
+ footer_valid_ = true;
+ return Status::OK();
+}
+
+} // namespace blob_db
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
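ReadFooter above (and DumpBlobLogFooter in the dump tool) follow the same convention: the footer occupies the last BlobLogFooter::kSize bytes of the file, and a file shorter than header plus footer is treated as having no footer rather than as corrupt. A self-contained sketch of that "fixed-size footer at the end" read pattern, using plain standard-library I/O and placeholder sizes rather than the real BlobLogHeader/BlobLogFooter layout:

#include <cstdint>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

// Placeholder sizes standing in for BlobLogHeader::kSize / BlobLogFooter::kSize.
constexpr std::uint64_t kHeaderSize = 30;
constexpr std::uint64_t kFooterSize = 32;

// Returns true and fills `footer` if the file is large enough to contain a
// footer; returns false (not treated as an error) if it has no footer.
bool ReadFixedSizeFooter(const std::string& path, std::vector<char>* footer) {
  std::ifstream in(path, std::ios::binary | std::ios::ate);
  if (!in) {
    return false;
  }
  const std::uint64_t file_size = static_cast<std::uint64_t>(in.tellg());
  if (file_size < kHeaderSize + kFooterSize) {
    // Too small to hold header + footer: "no footer", like the code above.
    return false;
  }
  const std::uint64_t footer_offset = file_size - kFooterSize;
  footer->resize(kFooterSize);
  in.seekg(static_cast<std::streamoff>(footer_offset));
  in.read(footer->data(), static_cast<std::streamsize>(kFooterSize));
  return static_cast<std::uint64_t>(in.gcount()) == kFooterSize;
}

int main() {
  std::vector<char> footer;
  if (ReadFixedSizeFooter("example.blob", &footer)) {
    std::cout << "read " << footer.size() << " footer bytes\n";
  } else {
    std::cout << "no footer (file missing or too small)\n";
  }
  return 0;
}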
diff --git a/src/rocksdb/utilities/blob_db/blob_file.h b/src/rocksdb/utilities/blob_db/blob_file.h
new file mode 100644
index 000000000..6f3f2bea7
--- /dev/null
+++ b/src/rocksdb/utilities/blob_db/blob_file.h
@@ -0,0 +1,246 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <atomic>
+#include <limits>
+#include <memory>
+#include <unordered_set>
+
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_log_writer.h"
+#include "file/random_access_file_reader.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/options.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace blob_db {
+
+class BlobDBImpl;
+
+class BlobFile {
+ friend class BlobDBImpl;
+ friend struct BlobFileComparator;
+ friend struct BlobFileComparatorTTL;
+ friend class BlobIndexCompactionFilterBase;
+ friend class BlobIndexCompactionFilterGC;
+
+ private:
+ // access to parent
+ const BlobDBImpl* parent_{nullptr};
+
+ // path to blob directory
+ std::string path_to_dir_;
+
+ // the id of the file.
+ // the above 2 are created during file creation and never changed
+ // after that
+ uint64_t file_number_{0};
+
+ // The file numbers of the SST files whose oldest blob file reference
+ // points to this blob file.
+ std::unordered_set<uint64_t> linked_sst_files_;
+
+ // Info log.
+ Logger* info_log_{nullptr};
+
+ // Column family id.
+ uint32_t column_family_id_{std::numeric_limits<uint32_t>::max()};
+
+ // Compression type of blobs in the file
+ CompressionType compression_{kNoCompression};
+
+ // If true, all keys in this file have a TTL. Otherwise, no key in this
+ // file has a TTL.
+ bool has_ttl_{false};
+
+ // TTL range of blobs in the file.
+ ExpirationRange expiration_range_;
+
+ // number of blobs in the file
+ std::atomic<uint64_t> blob_count_{0};
+
+ // size of the file
+ std::atomic<uint64_t> file_size_{0};
+
+ BlobLogHeader header_;
+
+ // closed_ == true implies the file is no longer mutable:
+ // no more blobs will be appended and the footer has been written out.
+ std::atomic<bool> closed_{false};
+
+ // The latest sequence number when the file was closed/made immutable.
+ SequenceNumber immutable_sequence_{0};
+
+ // Whether the file was marked obsolete (due to either TTL or GC).
+ // obsolete_ still needs to do iterator/snapshot checks
+ std::atomic<bool> obsolete_{false};
+
+ // The last sequence number by the time the file marked as obsolete.
+ // Data in this file is visible to a snapshot taken before the sequence.
+ SequenceNumber obsolete_sequence_{0};
+
+ // Sequential/Append writer for blobs
+ std::shared_ptr<BlobLogWriter> log_writer_;
+
+ // random access file reader for GET calls
+ std::shared_ptr<RandomAccessFileReader> ra_file_reader_;
+
+ // This read-write mutex is per-file and protects
+ // all the data structures.
+ mutable port::RWMutex mutex_;
+
+ // time when the random access reader was last created.
+ std::atomic<std::int64_t> last_access_{-1};
+
+ bool header_valid_{false};
+
+ bool footer_valid_{false};
+
+ public:
+ BlobFile() = default;
+
+ BlobFile(const BlobDBImpl* parent, const std::string& bdir, uint64_t fnum,
+ Logger* info_log);
+
+ BlobFile(const BlobDBImpl* parent, const std::string& bdir, uint64_t fnum,
+ Logger* info_log, uint32_t column_family_id,
+ CompressionType compression, bool has_ttl,
+ const ExpirationRange& expiration_range);
+
+ ~BlobFile();
+
+ uint32_t GetColumnFamilyId() const;
+
+ // Returns log file's absolute pathname.
+ std::string PathName() const;
+
+ // Primary identifier for blob file.
+ // once the file is created, this never changes
+ uint64_t BlobFileNumber() const { return file_number_; }
+
+ // Get the set of SST files whose oldest blob file reference points to
+ // this file.
+ const std::unordered_set<uint64_t>& GetLinkedSstFiles() const {
+ return linked_sst_files_;
+ }
+
+ // Link an SST file whose oldest blob file reference points to this file.
+ void LinkSstFile(uint64_t sst_file_number) {
+ assert(linked_sst_files_.find(sst_file_number) == linked_sst_files_.end());
+ linked_sst_files_.insert(sst_file_number);
+ }
+
+ // Unlink an SST file whose oldest blob file reference points to this file.
+ void UnlinkSstFile(uint64_t sst_file_number) {
+ auto it = linked_sst_files_.find(sst_file_number);
+ assert(it != linked_sst_files_.end());
+ linked_sst_files_.erase(it);
+ }
+
+ // the following functions are atomic, and don't need
+ // read lock
+ uint64_t BlobCount() const {
+ return blob_count_.load(std::memory_order_acquire);
+ }
+
+ std::string DumpState() const;
+
+ // if the file is not taking any more appends.
+ bool Immutable() const { return closed_.load(); }
+
+ // Mark the file as immutable.
+ // REQUIRES: write lock held, or access from single thread (on DB open).
+ void MarkImmutable(SequenceNumber sequence) {
+ closed_ = true;
+ immutable_sequence_ = sequence;
+ }
+
+ SequenceNumber GetImmutableSequence() const {
+ assert(Immutable());
+ return immutable_sequence_;
+ }
+
+ // Whether the file was marked obsolete (due to either TTL or GC).
+ bool Obsolete() const {
+ assert(Immutable() || !obsolete_.load());
+ return obsolete_.load();
+ }
+
+ // Mark file as obsolete (due to either TTL or GC). The file is not visible to
+ // snapshots with sequence greater than or equal to the given sequence.
+ void MarkObsolete(SequenceNumber sequence);
+
+ SequenceNumber GetObsoleteSequence() const {
+ assert(Obsolete());
+ return obsolete_sequence_;
+ }
+
+ Status Fsync();
+
+ uint64_t GetFileSize() const {
+ return file_size_.load(std::memory_order_acquire);
+ }
+
+ // All Get functions that are not atomic will need a ReadLock on the mutex.
+
+ const ExpirationRange& GetExpirationRange() const {
+ return expiration_range_;
+ }
+
+ void ExtendExpirationRange(uint64_t expiration) {
+ expiration_range_.first = std::min(expiration_range_.first, expiration);
+ expiration_range_.second = std::max(expiration_range_.second, expiration);
+ }
+
+ bool HasTTL() const { return has_ttl_; }
+
+ void SetHasTTL(bool has_ttl) { has_ttl_ = has_ttl; }
+
+ CompressionType GetCompressionType() const { return compression_; }
+
+ std::shared_ptr<BlobLogWriter> GetWriter() const { return log_writer_; }
+
+ // Read blob file header and footer. Return Corruption if the file header is
+ // malformed or incomplete. If the footer is malformed or incomplete, set
+ // footer_valid_ to false and return Status::OK.
+ Status ReadMetadata(const std::shared_ptr<FileSystem>& fs,
+ const FileOptions& file_options);
+
+ Status GetReader(Env* env, const FileOptions& file_options,
+ std::shared_ptr<RandomAccessFileReader>* reader,
+ bool* fresh_open);
+
+ private:
+ Status ReadFooter(BlobLogFooter* footer);
+
+ Status WriteFooterAndCloseLocked(SequenceNumber sequence);
+
+ void CloseRandomAccessLocked();
+
+ // This is used when reading only the footer of a
+ // previously closed file.
+ Status SetFromFooterLocked(const BlobLogFooter& footer);
+
+ void set_expiration_range(const ExpirationRange& expiration_range) {
+ expiration_range_ = expiration_range;
+ }
+
+ // The following functions are atomic, and don't need locks
+ void SetFileSize(uint64_t fs) { file_size_ = fs; }
+
+ void SetBlobCount(uint64_t bc) { blob_count_ = bc; }
+
+ void BlobRecordAdded(uint64_t record_size) {
+ ++blob_count_;
+ file_size_ += record_size;
+ }
+};
+} // namespace blob_db
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
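Tying this header back to the compaction/flush test earlier in the patch: a blob file becomes a candidate for obsolescence only when its set of linked SST files is empty and the sequence number at which it was made immutable has already been covered by a flush. The following toy model is my own simplification for illustration, not BlobDBImpl's actual bookkeeping; it replays the "blob file 2, immutable at sequence 300" scenario from the test.

#include <cstdint>
#include <iostream>
#include <unordered_set>

// Toy model of the obsolescence rule exercised by the test earlier in this
// patch. Illustration only; BlobDBImpl tracks this state differently.
struct ToyBlobFile {
  std::unordered_set<std::uint64_t> linked_ssts;
  std::uint64_t immutable_sequence = 0;
  bool obsolete = false;
};

// A file may be marked obsolete once it has no linked SSTs and its immutable
// sequence has been covered by a flush.
void MaybeMarkObsolete(ToyBlobFile* f, std::uint64_t largest_flushed_seqno) {
  if (!f->obsolete && f->linked_ssts.empty() &&
      f->immutable_sequence <= largest_flushed_seqno) {
    f->obsolete = true;
  }
}

int main() {
  ToyBlobFile blob2;
  blob2.immutable_sequence = 300;  // made immutable at sequence number 300
  blob2.linked_ssts = {7, 22};

  // A compaction drops the last SST references to blob file 2...
  blob2.linked_ssts.clear();
  // ...but sequence 300 has not been flushed yet, so it stays live.
  MaybeMarkObsolete(&blob2, /*largest_flushed_seqno=*/200);
  std::cout << "after compaction: obsolete=" << blob2.obsolete << "\n";  // 0

  // A flush whose largest sequence number is 300 allows the transition.
  MaybeMarkObsolete(&blob2, /*largest_flushed_seqno=*/300);
  std::cout << "after flush:      obsolete=" << blob2.obsolete << "\n";  // 1
  return 0;
}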
diff --git a/src/rocksdb/utilities/cache_dump_load.cc b/src/rocksdb/utilities/cache_dump_load.cc
new file mode 100644
index 000000000..9a7c76798
--- /dev/null
+++ b/src/rocksdb/utilities/cache_dump_load.cc
@@ -0,0 +1,69 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/cache_dump_load.h"
+
+#include "file/writable_file_writer.h"
+#include "port/lang.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "table/format.h"
+#include "util/crc32c.h"
+#include "utilities/cache_dump_load_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+IOStatus NewToFileCacheDumpWriter(const std::shared_ptr<FileSystem>& fs,
+ const FileOptions& file_opts,
+ const std::string& file_name,
+ std::unique_ptr<CacheDumpWriter>* writer) {
+ std::unique_ptr<WritableFileWriter> file_writer;
+ IOStatus io_s = WritableFileWriter::Create(fs, file_name, file_opts,
+ &file_writer, nullptr);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ writer->reset(new ToFileCacheDumpWriter(std::move(file_writer)));
+ return io_s;
+}
+
+IOStatus NewFromFileCacheDumpReader(const std::shared_ptr<FileSystem>& fs,
+ const FileOptions& file_opts,
+ const std::string& file_name,
+ std::unique_ptr<CacheDumpReader>* reader) {
+ std::unique_ptr<RandomAccessFileReader> file_reader;
+ IOStatus io_s = RandomAccessFileReader::Create(fs, file_name, file_opts,
+ &file_reader, nullptr);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ reader->reset(new FromFileCacheDumpReader(std::move(file_reader)));
+ return io_s;
+}
+
+Status NewDefaultCacheDumper(const CacheDumpOptions& dump_options,
+ const std::shared_ptr<Cache>& cache,
+ std::unique_ptr<CacheDumpWriter>&& writer,
+ std::unique_ptr<CacheDumper>* cache_dumper) {
+ cache_dumper->reset(
+ new CacheDumperImpl(dump_options, cache, std::move(writer)));
+ return Status::OK();
+}
+
+Status NewDefaultCacheDumpedLoader(
+ const CacheDumpOptions& dump_options,
+ const BlockBasedTableOptions& toptions,
+ const std::shared_ptr<SecondaryCache>& secondary_cache,
+ std::unique_ptr<CacheDumpReader>&& reader,
+ std::unique_ptr<CacheDumpedLoader>* cache_dump_loader) {
+ cache_dump_loader->reset(new CacheDumpedLoaderImpl(
+ dump_options, toptions, secondary_cache, std::move(reader)));
+ return Status::OK();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
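A hedged sketch of how the factory functions above might be wired together to dump a block cache to a file. The factory signatures and the SetDumpFilter/DumpCacheEntriesToWriter calls are the ones declared in this utility, but the CacheDumpOptions::clock assignment (required to be non-null by the dumper implementation) and the exact includes are assumptions for illustration rather than a definitive recipe.

#include <memory>
#include <string>
#include <vector>

#include "rocksdb/cache.h"
#include "rocksdb/db.h"
#include "rocksdb/file_system.h"
#include "rocksdb/system_clock.h"
#include "rocksdb/utilities/cache_dump_load.h"

namespace rdb = ROCKSDB_NAMESPACE;

// Sketch: dump the entries of `cache` that belong to `db` into `dump_path`.
rdb::Status DumpBlockCacheToFile(rdb::DB* db,
                                 const std::shared_ptr<rdb::Cache>& cache,
                                 const std::string& dump_path) {
  std::unique_ptr<rdb::CacheDumpWriter> writer;
  rdb::IOStatus io_s = rdb::NewToFileCacheDumpWriter(
      rdb::FileSystem::Default(), rdb::FileOptions(), dump_path, &writer);
  if (!io_s.ok()) {
    return io_s;
  }

  rdb::CacheDumpOptions dump_opts;
  dump_opts.clock = rdb::SystemClock::Default().get();  // assumed field/type

  std::unique_ptr<rdb::CacheDumper> dumper;
  rdb::Status s =
      rdb::NewDefaultCacheDumper(dump_opts, cache, std::move(writer), &dumper);
  if (!s.ok()) {
    return s;
  }
  // Restrict the dump to blocks belonging to this DB.
  s = dumper->SetDumpFilter({db});
  if (!s.ok()) {
    return s;
  }
  return dumper->DumpCacheEntriesToWriter();
}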
diff --git a/src/rocksdb/utilities/cache_dump_load_impl.cc b/src/rocksdb/utilities/cache_dump_load_impl.cc
new file mode 100644
index 000000000..2b9f2a29d
--- /dev/null
+++ b/src/rocksdb/utilities/cache_dump_load_impl.cc
@@ -0,0 +1,393 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "cache/cache_key.h"
+#include "table/block_based/block_based_table_reader.h"
+#ifndef ROCKSDB_LITE
+
+#include "cache/cache_entry_roles.h"
+#include "file/writable_file_writer.h"
+#include "port/lang.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/utilities/ldb_cmd.h"
+#include "table/format.h"
+#include "util/crc32c.h"
+#include "utilities/cache_dump_load_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Set the dump filter with a list of DBs. The block cache may be shared by
+// multiple DBs, and we may only want to dump out the blocks belonging to
+// certain DB(s). Therefore, a filter is needed to decide whether the key of a
+// block satisfies the requirement.
+Status CacheDumperImpl::SetDumpFilter(std::vector<DB*> db_list) {
+ Status s = Status::OK();
+ for (size_t i = 0; i < db_list.size(); i++) {
+ assert(i < db_list.size());
+ TablePropertiesCollection ptc;
+ assert(db_list[i] != nullptr);
+ s = db_list[i]->GetPropertiesOfAllTables(&ptc);
+ if (!s.ok()) {
+ return s;
+ }
+ for (auto id = ptc.begin(); id != ptc.end(); id++) {
+ OffsetableCacheKey base;
+ // We only want to save cache entries that are portable to another
+ // DB::Open, so only save entries with stable keys.
+ bool is_stable;
+ BlockBasedTable::SetupBaseCacheKey(id->second.get(),
+ /*cur_db_session_id*/ "",
+ /*cur_file_num*/ 0, &base, &is_stable);
+ if (is_stable) {
+ Slice prefix_slice = base.CommonPrefixSlice();
+ assert(prefix_slice.size() == OffsetableCacheKey::kCommonPrefixSize);
+ prefix_filter_.insert(prefix_slice.ToString());
+ }
+ }
+ }
+ return s;
+}
+
+// This is the main function to dump out the cache block entries to the writer.
+// The writer may create a file or write to other systems. Currently, we will
+// iterate the whole block cache, get the blocks, and write them to the writer
+IOStatus CacheDumperImpl::DumpCacheEntriesToWriter() {
+ // Prepare stage, check the parameters.
+ if (cache_ == nullptr) {
+ return IOStatus::InvalidArgument("Cache is null");
+ }
+ if (writer_ == nullptr) {
+ return IOStatus::InvalidArgument("CacheDumpWriter is null");
+ }
+ // Set the system clock
+ if (options_.clock == nullptr) {
+ return IOStatus::InvalidArgument("System clock is null");
+ }
+ clock_ = options_.clock;
+ // We copy the Cache Deleter Role Map as its member.
+ role_map_ = CopyCacheDeleterRoleMap();
+ // Set the sequence number
+ sequence_num_ = 0;
+
+ // Dump stage: first, we write the header.
+ IOStatus io_s = WriteHeader();
+ if (!io_s.ok()) {
+ return io_s;
+ }
+
+ // Then, we iterate the block cache and dump out the blocks that are not
+ // filtered out.
+ cache_->ApplyToAllEntries(DumpOneBlockCallBack(), {});
+
+ // Finally, write the footer
+ io_s = WriteFooter();
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ io_s = writer_->Close();
+ return io_s;
+}
+
+// Check if we need to filter out the block based on its key
+bool CacheDumperImpl::ShouldFilterOut(const Slice& key) {
+ if (key.size() < OffsetableCacheKey::kCommonPrefixSize) {
+ return /*filter out*/ true;
+ }
+ Slice key_prefix(key.data(), OffsetableCacheKey::kCommonPrefixSize);
+ std::string prefix = key_prefix.ToString();
+ // Filter out if not found
+ return prefix_filter_.find(prefix) == prefix_filter_.end();
+}
+
+// This is the callback function which will be applied to
+// Cache::ApplyToAllEntries. In this callback function, we will get the block
+// type, decide if the block needs to be dumped based on the filter, and write
+// the block through the provided writer.
+std::function<void(const Slice&, void*, size_t, Cache::DeleterFn)>
+CacheDumperImpl::DumpOneBlockCallBack() {
+ return [&](const Slice& key, void* value, size_t /*charge*/,
+ Cache::DeleterFn deleter) {
+ // Step 1: get the type of the block from role_map_
+ auto e = role_map_.find(deleter);
+ CacheEntryRole role;
+ CacheDumpUnitType type = CacheDumpUnitType::kBlockTypeMax;
+ if (e == role_map_.end()) {
+ role = CacheEntryRole::kMisc;
+ } else {
+ role = e->second;
+ }
+ bool filter_out = false;
+
+ // Step 2: based on the key prefix, check if the block should be filtered out.
+ if (ShouldFilterOut(key)) {
+ filter_out = true;
+ }
+
+ // Step 3: based on the block type, get the block raw pointer and length.
+ const char* block_start = nullptr;
+ size_t block_len = 0;
+ switch (role) {
+ case CacheEntryRole::kDataBlock:
+ type = CacheDumpUnitType::kData;
+ block_start = (static_cast<Block*>(value))->data();
+ block_len = (static_cast<Block*>(value))->size();
+ break;
+ case CacheEntryRole::kFilterBlock:
+ type = CacheDumpUnitType::kFilter;
+ block_start = (static_cast<ParsedFullFilterBlock*>(value))
+ ->GetBlockContentsData()
+ .data();
+ block_len = (static_cast<ParsedFullFilterBlock*>(value))
+ ->GetBlockContentsData()
+ .size();
+ break;
+ case CacheEntryRole::kFilterMetaBlock:
+ type = CacheDumpUnitType::kFilterMetaBlock;
+ block_start = (static_cast<Block*>(value))->data();
+ block_len = (static_cast<Block*>(value))->size();
+ break;
+ case CacheEntryRole::kIndexBlock:
+ type = CacheDumpUnitType::kIndex;
+ block_start = (static_cast<Block*>(value))->data();
+ block_len = (static_cast<Block*>(value))->size();
+ break;
+ case CacheEntryRole::kDeprecatedFilterBlock:
+ // Obsolete
+ filter_out = true;
+ break;
+ case CacheEntryRole::kMisc:
+ filter_out = true;
+ break;
+ case CacheEntryRole::kOtherBlock:
+ filter_out = true;
+ break;
+ case CacheEntryRole::kWriteBuffer:
+ filter_out = true;
+ break;
+ default:
+ filter_out = true;
+ }
+
+ // Step 4: if the block should not be filtered out, write the block to the
+ // CacheDumpWriter.
+ if (!filter_out && block_start != nullptr) {
+ WriteBlock(type, key, Slice(block_start, block_len))
+ .PermitUncheckedError();
+ }
+ };
+}
+
+// Write the block to the writer. It takes the timestamp of the
+// block being copied from the block cache, the block type, key, block pointer,
+// block size and block checksum as input. When writing the dumped raw
+// block, we first create the dump unit and encode it to a string. Then,
+// we calculate the checksum of the whole dump unit string and store it in
+// the dump unit metadata.
+// We write the metadata first, which is a fixed-size string, and then
+// append the dump unit string to the writer.
+IOStatus CacheDumperImpl::WriteBlock(CacheDumpUnitType type, const Slice& key,
+ const Slice& value) {
+ uint64_t timestamp = clock_->NowMicros();
+ uint32_t value_checksum = crc32c::Value(value.data(), value.size());
+
+ // First, serialize the block information in a string
+ DumpUnit dump_unit;
+ dump_unit.timestamp = timestamp;
+ dump_unit.key = key;
+ dump_unit.type = type;
+ dump_unit.value_len = value.size();
+ dump_unit.value = const_cast<char*>(value.data());
+ dump_unit.value_checksum = value_checksum;
+ std::string encoded_data;
+ CacheDumperHelper::EncodeDumpUnit(dump_unit, &encoded_data);
+
+ // Second, create the metadata, which contains a sequence number, the dump
+ // unit string checksum and the string size. The sequence number monotonically
+ // increases from 0.
+ DumpUnitMeta unit_meta;
+ unit_meta.sequence_num = sequence_num_;
+ sequence_num_++;
+ unit_meta.dump_unit_checksum =
+ crc32c::Value(encoded_data.data(), encoded_data.size());
+ unit_meta.dump_unit_size = encoded_data.size();
+ std::string encoded_meta;
+ CacheDumperHelper::EncodeDumpUnitMeta(unit_meta, &encoded_meta);
+
+ // We write the metadata first.
+ assert(writer_ != nullptr);
+ IOStatus io_s = writer_->WriteMetadata(encoded_meta);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ // followed by the dump unit.
+ return writer_->WritePacket(encoded_data);
+}
+
+// Before we write any block, we write the header first to store the cache dump
+// format version, the RocksDB version, and a brief intro.
+IOStatus CacheDumperImpl::WriteHeader() {
+ std::string header_key = "header";
+ std::ostringstream s;
+ s << kTraceMagic << "\t"
+ << "Cache dump format version: " << kCacheDumpMajorVersion << "."
+ << kCacheDumpMinorVersion << "\t"
+ << "RocksDB Version: " << kMajorVersion << "." << kMinorVersion << "\t"
+ << "Format: dump_unit_metadata <sequence_number, dump_unit_checksum, "
+ "dump_unit_size>, dump_unit <timestamp, key, block_type, "
+ "block_size, block_data, block_checksum> cache_value\n";
+ std::string header_value(s.str());
+ CacheDumpUnitType type = CacheDumpUnitType::kHeader;
+ return WriteBlock(type, header_key, header_value);
+}
+
+// Write the footer after all the blocks are stored to indicate the ending.
+IOStatus CacheDumperImpl::WriteFooter() {
+ std::string footer_key = "footer";
+ std::string footer_value("cache dump completed");
+ CacheDumpUnitType type = CacheDumpUnitType::kFooter;
+ return WriteBlock(type, footer_key, footer_value);
+}
+
+// This is the main function to restore the cache entries to the secondary
+// cache. First, we check if all the arguments are valid. Then, we read the
+// blocks sequentially from the reader and insert them into the secondary cache.
+IOStatus CacheDumpedLoaderImpl::RestoreCacheEntriesToSecondaryCache() {
+ // TODO: remove this line when options are used in the loader
+ (void)options_;
+ // Step 1: we check if all the arguments are valid
+ if (secondary_cache_ == nullptr) {
+ return IOStatus::InvalidArgument("Secondary Cache is null");
+ }
+ if (reader_ == nullptr) {
+ return IOStatus::InvalidArgument("CacheDumpReader is null");
+ }
+ // we copy the Cache Deleter Role Map as its member.
+ role_map_ = CopyCacheDeleterRoleMap();
+
+ // Step 2: read the header
+ // TODO: we need to check the cache dump format version and RocksDB version
+ // after the header is read out.
+ IOStatus io_s;
+ DumpUnit dump_unit;
+ std::string data;
+ io_s = ReadHeader(&data, &dump_unit);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+
+ // Step 3: read out the rest of the blocks from the reader. The loop will stop
+ // either when the I/O status is not OK or when we reach the end.
+ while (io_s.ok()) {
+ dump_unit.reset();
+ data.clear();
+ // read the content and store in the dump_unit
+ io_s = ReadCacheBlock(&data, &dump_unit);
+ if (!io_s.ok()) {
+ break;
+ }
+ if (dump_unit.type == CacheDumpUnitType::kFooter) {
+ break;
+ }
+ // Create the uncompressed_block based on the information in the dump_unit
+ // (There is no block trailer here compatible with block-based SST file.)
+ Slice content =
+ Slice(static_cast<char*>(dump_unit.value), dump_unit.value_len);
+ Status s = secondary_cache_->InsertSaved(dump_unit.key, content);
+ if (!s.ok()) {
+ io_s = status_to_io_status(std::move(s));
+ }
+ }
+ if (dump_unit.type == CacheDumpUnitType::kFooter) {
+ return IOStatus::OK();
+ } else {
+ return io_s;
+ }
+}
+
+// Read and copy the dump unit metadata to std::string data, decode and create
+// the unit metadata based on the string
+IOStatus CacheDumpedLoaderImpl::ReadDumpUnitMeta(std::string* data,
+ DumpUnitMeta* unit_meta) {
+ assert(reader_ != nullptr);
+ assert(data != nullptr);
+ assert(unit_meta != nullptr);
+ IOStatus io_s = reader_->ReadMetadata(data);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ return status_to_io_status(
+ CacheDumperHelper::DecodeDumpUnitMeta(*data, unit_meta));
+}
+
+// Read and copy the dump unit to std::string data, decode and create the unit
+// based on the string
+IOStatus CacheDumpedLoaderImpl::ReadDumpUnit(size_t len, std::string* data,
+ DumpUnit* unit) {
+ assert(reader_ != nullptr);
+ assert(data != nullptr);
+ assert(unit != nullptr);
+ IOStatus io_s = reader_->ReadPacket(data);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ if (data->size() != len) {
+ return IOStatus::Corruption(
+ "The data being read out does not match the size stored in metadata!");
+ }
+ Slice block;
+ return status_to_io_status(CacheDumperHelper::DecodeDumpUnit(*data, unit));
+}
+
+// Read the header
+IOStatus CacheDumpedLoaderImpl::ReadHeader(std::string* data,
+ DumpUnit* dump_unit) {
+ DumpUnitMeta header_meta;
+ header_meta.reset();
+ std::string meta_string;
+ IOStatus io_s = ReadDumpUnitMeta(&meta_string, &header_meta);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+
+ io_s = ReadDumpUnit(header_meta.dump_unit_size, data, dump_unit);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ uint32_t unit_checksum = crc32c::Value(data->data(), data->size());
+ if (unit_checksum != header_meta.dump_unit_checksum) {
+ return IOStatus::Corruption("Read header unit corrupted!");
+ }
+ return io_s;
+}
+
+// Read the blocks after header is read out
+IOStatus CacheDumpedLoaderImpl::ReadCacheBlock(std::string* data,
+ DumpUnit* dump_unit) {
+ // According to the write process, we read the dump_unit_metadata first
+ DumpUnitMeta unit_meta;
+ unit_meta.reset();
+ std::string unit_string;
+ IOStatus io_s = ReadDumpUnitMeta(&unit_string, &unit_meta);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+
+ // Based on the information in the dump_unit_metadata, we read the dump_unit
+ // and verify if its content is correct.
+ io_s = ReadDumpUnit(unit_meta.dump_unit_size, data, dump_unit);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ uint32_t unit_checksum = crc32c::Value(data->data(), data->size());
+ if (unit_checksum != unit_meta.dump_unit_checksum) {
+ return IOStatus::Corruption(
+ "Checksum does not match! Read dumped unit corrupted!");
+ }
+ return io_s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
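As the WriteBlock comment above notes, every dump unit is preceded by a fixed-size metadata record: a 32-bit sequence number, a 32-bit crc32c of the encoded dump unit, and a 64-bit size, which is 16 bytes on the wire (the DumpUnitMeta struct in the companion header below). A self-contained round-trip sketch of that 16-byte layout, assuming RocksDB's little-endian fixed-width convention for PutFixed32/PutFixed64 and using hand-rolled helpers instead of the real coding routines:

#include <cstdint>
#include <cstdio>
#include <string>

// Hand-rolled little-endian helpers standing in for PutFixed32/PutFixed64,
// assuming RocksDB's little-endian fixed-width encoding convention.
void PutLE32(std::string* dst, std::uint32_t v) {
  for (int i = 0; i < 4; i++) dst->push_back(static_cast<char>((v >> (8 * i)) & 0xff));
}
void PutLE64(std::string* dst, std::uint64_t v) {
  for (int i = 0; i < 8; i++) dst->push_back(static_cast<char>((v >> (8 * i)) & 0xff));
}
std::uint32_t GetLE32(const char* p) {
  std::uint32_t v = 0;
  for (int i = 0; i < 4; i++)
    v |= static_cast<std::uint32_t>(static_cast<unsigned char>(p[i])) << (8 * i);
  return v;
}
std::uint64_t GetLE64(const char* p) {
  std::uint64_t v = 0;
  for (int i = 0; i < 8; i++)
    v |= static_cast<std::uint64_t>(static_cast<unsigned char>(p[i])) << (8 * i);
  return v;
}

struct Meta {  // mirrors DumpUnitMeta: 4 + 4 + 8 = 16 bytes on the wire
  std::uint32_t sequence_num;
  std::uint32_t dump_unit_checksum;  // crc32c in the real code; opaque here
  std::uint64_t dump_unit_size;
};

std::string EncodeMeta(const Meta& m) {
  std::string out;
  PutLE32(&out, m.sequence_num);
  PutLE32(&out, m.dump_unit_checksum);
  PutLE64(&out, m.dump_unit_size);
  return out;  // always 16 bytes
}

bool DecodeMeta(const std::string& in, Meta* m) {
  if (in.size() != 16) return false;
  m->sequence_num = GetLE32(in.data());
  m->dump_unit_checksum = GetLE32(in.data() + 4);
  m->dump_unit_size = GetLE64(in.data() + 8);
  return true;
}

int main() {
  Meta m{7, 0xdeadbeef, 4096};
  std::string wire = EncodeMeta(m);
  Meta back{};
  DecodeMeta(wire, &back);
  std::printf("encoded %zu bytes, seq=%u size=%llu\n", wire.size(),
              static_cast<unsigned>(back.sequence_num),
              static_cast<unsigned long long>(back.dump_unit_size));
  return 0;
}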
diff --git a/src/rocksdb/utilities/cache_dump_load_impl.h b/src/rocksdb/utilities/cache_dump_load_impl.h
new file mode 100644
index 000000000..9ca1ff45a
--- /dev/null
+++ b/src/rocksdb/utilities/cache_dump_load_impl.h
@@ -0,0 +1,359 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <unordered_map>
+
+#include "file/random_access_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/utilities/cache_dump_load.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_like_traits.h"
+#include "table/block_based/block_type.h"
+#include "table/block_based/cachable_entry.h"
+#include "table/block_based/parsed_full_filter_block.h"
+#include "table/block_based/reader_common.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// The read buffer size for the default CacheDumpReader.
+const unsigned int kDumpReaderBufferSize = 1024; // 1KB
+static const unsigned int kSizePrefixLen = 4;
+
+enum CacheDumpUnitType : unsigned char {
+ kHeader = 1,
+ kFooter = 2,
+ kData = 3,
+ kFilter = 4,
+ kProperties = 5,
+ kCompressionDictionary = 6,
+ kRangeDeletion = 7,
+ kHashIndexPrefixes = 8,
+ kHashIndexMetadata = 9,
+ kMetaIndex = 10,
+ kIndex = 11,
+ kDeprecatedFilterBlock = 12, // OBSOLETE / DEPRECATED
+ kFilterMetaBlock = 13,
+ kBlockTypeMax,
+};
+
+// The metadata of a dump unit. After it is serialized, its size is a fixed 16
+// bytes.
+struct DumpUnitMeta {
+ // sequence number is a monotonically increasing number to indicate the order
+ // of the blocks being written. Header is 0.
+ uint32_t sequence_num;
+ // The Crc32c checksum of its dump unit.
+ uint32_t dump_unit_checksum;
+ // The dump unit size after the dump unit is serialized to a string.
+ uint64_t dump_unit_size;
+
+ void reset() {
+ sequence_num = 0;
+ dump_unit_checksum = 0;
+ dump_unit_size = 0;
+ }
+};
+
+// The data structure to hold a block and its information.
+struct DumpUnit {
+ // The timestamp when the block is identified, copied, and dumped from block
+ // cache
+ uint64_t timestamp;
+ // The type of the block
+ CacheDumpUnitType type;
+ // The key of this block when the block is referenced by this Cache
+ Slice key;
+ // The block size
+ size_t value_len;
+ // The Crc32c checksum of the block
+ uint32_t value_checksum;
+ // Pointer to the block. Note that, in the dump process, it points to a memory
+ // buffer copied from the cache block. The buffer is freed when we process the
+ // next block. In the load process, we use a std::string to store the
+ // serialized dump_unit read from the reader, so it points to the memory
+ // address of the beginning of the block in that string.
+ void* value;
+
+ DumpUnit() { reset(); }
+
+ void reset() {
+ timestamp = 0;
+ type = CacheDumpUnitType::kBlockTypeMax;
+ key.clear();
+ value_len = 0;
+ value_checksum = 0;
+ value = nullptr;
+ }
+};
+
+// The default implementation of the Cache Dumper
+class CacheDumperImpl : public CacheDumper {
+ public:
+ CacheDumperImpl(const CacheDumpOptions& dump_options,
+ const std::shared_ptr<Cache>& cache,
+ std::unique_ptr<CacheDumpWriter>&& writer)
+ : options_(dump_options), cache_(cache), writer_(std::move(writer)) {}
+ ~CacheDumperImpl() { writer_.reset(); }
+ Status SetDumpFilter(std::vector<DB*> db_list) override;
+ IOStatus DumpCacheEntriesToWriter() override;
+
+ private:
+ IOStatus WriteBlock(CacheDumpUnitType type, const Slice& key,
+ const Slice& value);
+ IOStatus WriteHeader();
+ IOStatus WriteFooter();
+ bool ShouldFilterOut(const Slice& key);
+ std::function<void(const Slice&, void*, size_t, Cache::DeleterFn)>
+ DumpOneBlockCallBack();
+
+ CacheDumpOptions options_;
+ std::shared_ptr<Cache> cache_;
+ std::unique_ptr<CacheDumpWriter> writer_;
+ UnorderedMap<Cache::DeleterFn, CacheEntryRole> role_map_;
+ SystemClock* clock_;
+ uint32_t sequence_num_;
+ // The cache key prefix filter. Currently, we use db_session_id as the prefix,
+ // so using a std::set to store the prefixes as a filter is enough. Further
+ // improvements, such as a Bloom filter, could be applied to speed up the
+ // filtering.
+ std::set<std::string> prefix_filter_;
+};
+
+// The default implementation of CacheDumpedLoader
+class CacheDumpedLoaderImpl : public CacheDumpedLoader {
+ public:
+ CacheDumpedLoaderImpl(const CacheDumpOptions& dump_options,
+ const BlockBasedTableOptions& /*toptions*/,
+ const std::shared_ptr<SecondaryCache>& secondary_cache,
+ std::unique_ptr<CacheDumpReader>&& reader)
+ : options_(dump_options),
+ secondary_cache_(secondary_cache),
+ reader_(std::move(reader)) {}
+ ~CacheDumpedLoaderImpl() {}
+ IOStatus RestoreCacheEntriesToSecondaryCache() override;
+
+ private:
+ IOStatus ReadDumpUnitMeta(std::string* data, DumpUnitMeta* unit_meta);
+ IOStatus ReadDumpUnit(size_t len, std::string* data, DumpUnit* unit);
+ IOStatus ReadHeader(std::string* data, DumpUnit* dump_unit);
+ IOStatus ReadCacheBlock(std::string* data, DumpUnit* dump_unit);
+
+ CacheDumpOptions options_;
+ std::shared_ptr<SecondaryCache> secondary_cache_;
+ std::unique_ptr<CacheDumpReader> reader_;
+ UnorderedMap<Cache::DeleterFn, CacheEntryRole> role_map_;
+};
+
+// The default implementation of CacheDumpWriter. We write the blocks to a file
+// sequentially.
+class ToFileCacheDumpWriter : public CacheDumpWriter {
+ public:
+ explicit ToFileCacheDumpWriter(
+ std::unique_ptr<WritableFileWriter>&& file_writer)
+ : file_writer_(std::move(file_writer)) {}
+
+ ~ToFileCacheDumpWriter() { Close().PermitUncheckedError(); }
+
+ // Write the serialized metadata to the file
+ virtual IOStatus WriteMetadata(const Slice& metadata) override {
+ assert(file_writer_ != nullptr);
+ std::string prefix;
+ PutFixed32(&prefix, static_cast<uint32_t>(metadata.size()));
+ IOStatus io_s = file_writer_->Append(Slice(prefix));
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ io_s = file_writer_->Append(metadata);
+ return io_s;
+ }
+
+ // Write the serialized data to the file
+ virtual IOStatus WritePacket(const Slice& data) override {
+ assert(file_writer_ != nullptr);
+ std::string prefix;
+ PutFixed32(&prefix, static_cast<uint32_t>(data.size()));
+ IOStatus io_s = file_writer_->Append(Slice(prefix));
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ io_s = file_writer_->Append(data);
+ return io_s;
+ }
+
+ // Reset the writer
+ virtual IOStatus Close() override {
+ file_writer_.reset();
+ return IOStatus::OK();
+ }
+
+ private:
+ std::unique_ptr<WritableFileWriter> file_writer_;
+};
+
+// The default implementation of CacheDumpReader. It is implemented based on
+// RandomAccessFileReader. Note that we keep an internal variable to remember
+// the current offset.
+class FromFileCacheDumpReader : public CacheDumpReader {
+ public:
+ explicit FromFileCacheDumpReader(
+ std::unique_ptr<RandomAccessFileReader>&& reader)
+ : file_reader_(std::move(reader)),
+ offset_(0),
+ buffer_(new char[kDumpReaderBufferSize]) {}
+
+ ~FromFileCacheDumpReader() { delete[] buffer_; }
+
+ virtual IOStatus ReadMetadata(std::string* metadata) override {
+ uint32_t metadata_len = 0;
+ IOStatus io_s = ReadSizePrefix(&metadata_len);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ return Read(metadata_len, metadata);
+ }
+
+ virtual IOStatus ReadPacket(std::string* data) override {
+ uint32_t data_len = 0;
+ IOStatus io_s = ReadSizePrefix(&data_len);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ return Read(data_len, data);
+ }
+
+ private:
+ IOStatus ReadSizePrefix(uint32_t* len) {
+ std::string prefix;
+ IOStatus io_s = Read(kSizePrefixLen, &prefix);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ Slice encoded_slice(prefix);
+ if (!GetFixed32(&encoded_slice, len)) {
+ return IOStatus::Corruption("Decode size prefix string failed");
+ }
+ return IOStatus::OK();
+ }
+
+ IOStatus Read(size_t len, std::string* data) {
+ assert(file_reader_ != nullptr);
+ IOStatus io_s;
+
+ unsigned int bytes_to_read = static_cast<unsigned int>(len);
+ unsigned int to_read = bytes_to_read > kDumpReaderBufferSize
+ ? kDumpReaderBufferSize
+ : bytes_to_read;
+
+ while (to_read > 0) {
+ io_s = file_reader_->Read(IOOptions(), offset_, to_read, &result_,
+ buffer_, nullptr,
+ Env::IO_TOTAL /* rate_limiter_priority */);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ if (result_.size() < to_read) {
+ return IOStatus::Corruption("Corrupted cache dump file.");
+ }
+ data->append(result_.data(), result_.size());
+
+ offset_ += to_read;
+ bytes_to_read -= to_read;
+ to_read = bytes_to_read > kDumpReaderBufferSize ? kDumpReaderBufferSize
+ : bytes_to_read;
+ }
+ return io_s;
+ }
+ std::unique_ptr<RandomAccessFileReader> file_reader_;
+ Slice result_;
+ size_t offset_;
+ char* buffer_;
+};
+
+// The cache dump and load helper class
+class CacheDumperHelper {
+ public:
+ // Serialize the dump_unit_meta to a string; the encoding is a fixed 16 bytes.
+ static void EncodeDumpUnitMeta(const DumpUnitMeta& meta, std::string* data) {
+ assert(data);
+ PutFixed32(data, static_cast<uint32_t>(meta.sequence_num));
+ PutFixed32(data, static_cast<uint32_t>(meta.dump_unit_checksum));
+ PutFixed64(data, meta.dump_unit_size);
+ }
+
+ // Serialize the dump_unit to a string.
+ static void EncodeDumpUnit(const DumpUnit& dump_unit, std::string* data) {
+ assert(data);
+ PutFixed64(data, dump_unit.timestamp);
+ data->push_back(dump_unit.type);
+ PutLengthPrefixedSlice(data, dump_unit.key);
+ PutFixed32(data, static_cast<uint32_t>(dump_unit.value_len));
+ PutFixed32(data, dump_unit.value_checksum);
+ PutLengthPrefixedSlice(data,
+ Slice((char*)dump_unit.value, dump_unit.value_len));
+ }
+
+ // Deserialize the dump_unit_meta from a string
+ static Status DecodeDumpUnitMeta(const std::string& encoded_data,
+ DumpUnitMeta* unit_meta) {
+ assert(unit_meta != nullptr);
+ Slice encoded_slice = Slice(encoded_data);
+ if (!GetFixed32(&encoded_slice, &(unit_meta->sequence_num))) {
+ return Status::Incomplete("Decode dumped unit meta sequence_num failed");
+ }
+ if (!GetFixed32(&encoded_slice, &(unit_meta->dump_unit_checksum))) {
+ return Status::Incomplete(
+ "Decode dumped unit meta dump_unit_checksum failed");
+ }
+ if (!GetFixed64(&encoded_slice, &(unit_meta->dump_unit_size))) {
+ return Status::Incomplete(
+ "Decode dumped unit meta dump_unit_size failed");
+ }
+ return Status::OK();
+ }
+
+ // Deserialize the dump_unit from a string.
+ static Status DecodeDumpUnit(const std::string& encoded_data,
+ DumpUnit* dump_unit) {
+ assert(dump_unit != nullptr);
+ Slice encoded_slice = Slice(encoded_data);
+
+ // Decode timestamp
+ if (!GetFixed64(&encoded_slice, &dump_unit->timestamp)) {
+ return Status::Incomplete("Decode dumped unit string failed");
+ }
+ // Decode the block type
+ dump_unit->type = static_cast<CacheDumpUnitType>(encoded_slice[0]);
+ encoded_slice.remove_prefix(1);
+ // Decode the key
+ if (!GetLengthPrefixedSlice(&encoded_slice, &(dump_unit->key))) {
+ return Status::Incomplete("Decode dumped unit string failed");
+ }
+ // Decode the value size
+ uint32_t value_len;
+ if (!GetFixed32(&encoded_slice, &value_len)) {
+ return Status::Incomplete("Decode dumped unit string failed");
+ }
+ dump_unit->value_len = static_cast<size_t>(value_len);
+ // Decode the value checksum
+ if (!GetFixed32(&encoded_slice, &(dump_unit->value_checksum))) {
+ return Status::Incomplete("Decode dumped unit string failed");
+ }
+ // Decode the block content. The pointer to this data will eventually be
+ // handed over to, and managed by, the cache.
+ Slice block;
+ if (!GetLengthPrefixedSlice(&encoded_slice, &block)) {
+ return Status::Incomplete("Decode dumped unit string failed");
+ }
+ dump_unit->value = (void*)block.data();
+ assert(block.size() == dump_unit->value_len);
+ return Status::OK();
+ }
+};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
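The classes above all share one on-disk framing: each metadata blob and each serialized dump unit is written as a fixed 32-bit length prefix followed by the payload, and EncodeDumpUnitMeta() always produces exactly 16 bytes (two fixed32 fields plus one fixed64). The standalone sketch below illustrates that [length][payload] framing outside of RocksDB; AppendRecord/ReadRecord are hypothetical names, and the host-order uint32_t prefix is an assumption of the sketch (the real code uses RocksDB's PutFixed32/GetFixed32 encoding).

    #include <cstdint>
    #include <cstring>
    #include <string>

    // Append one [size prefix][payload] record, mirroring WritePacket().
    void AppendRecord(std::string* out, const std::string& payload) {
      uint32_t len = static_cast<uint32_t>(payload.size());
      char prefix[sizeof(len)];
      std::memcpy(prefix, &len, sizeof(len));  // fixed-width size prefix
      out->append(prefix, sizeof(prefix));
      out->append(payload);                    // serialized dump unit bytes
    }

    // Read one record back, mirroring ReadSizePrefix() + Read() in ReadPacket().
    bool ReadRecord(const std::string& in, size_t* offset, std::string* payload) {
      if (*offset + sizeof(uint32_t) > in.size()) return false;  // truncated prefix
      uint32_t len = 0;
      std::memcpy(&len, in.data() + *offset, sizeof(len));
      *offset += sizeof(len);
      if (*offset + len > in.size()) return false;               // truncated payload
      payload->assign(in.data() + *offset, len);
      *offset += len;
      return true;
    }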
diff --git a/src/rocksdb/utilities/cassandra/cassandra_compaction_filter.cc b/src/rocksdb/utilities/cassandra/cassandra_compaction_filter.cc
new file mode 100644
index 000000000..4e48d63aa
--- /dev/null
+++ b/src/rocksdb/utilities/cassandra/cassandra_compaction_filter.cc
@@ -0,0 +1,110 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "utilities/cassandra/cassandra_compaction_filter.h"
+
+#include <string>
+
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/options_type.h"
+#include "utilities/cassandra/format.h"
+#include "utilities/cassandra/merge_operator.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+static std::unordered_map<std::string, OptionTypeInfo>
+ cassandra_filter_type_info = {
+#ifndef ROCKSDB_LITE
+ {"purge_ttl_on_expiration",
+ {offsetof(struct CassandraOptions, purge_ttl_on_expiration),
+ OptionType::kBoolean, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"gc_grace_period_in_seconds",
+ {offsetof(struct CassandraOptions, gc_grace_period_in_seconds),
+ OptionType::kUInt32T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+#endif // ROCKSDB_LITE
+};
+
+CassandraCompactionFilter::CassandraCompactionFilter(
+ bool purge_ttl_on_expiration, int32_t gc_grace_period_in_seconds)
+ : options_(gc_grace_period_in_seconds, 0, purge_ttl_on_expiration) {
+ RegisterOptions(&options_, &cassandra_filter_type_info);
+}
+
+CompactionFilter::Decision CassandraCompactionFilter::FilterV2(
+ int /*level*/, const Slice& /*key*/, ValueType value_type,
+ const Slice& existing_value, std::string* new_value,
+ std::string* /*skip_until*/) const {
+ bool value_changed = false;
+ RowValue row_value =
+ RowValue::Deserialize(existing_value.data(), existing_value.size());
+ RowValue compacted =
+ options_.purge_ttl_on_expiration
+ ? row_value.RemoveExpiredColumns(&value_changed)
+ : row_value.ConvertExpiredColumnsToTombstones(&value_changed);
+
+ if (value_type == ValueType::kValue) {
+ compacted = compacted.RemoveTombstones(options_.gc_grace_period_in_seconds);
+ }
+
+ if (compacted.Empty()) {
+ return Decision::kRemove;
+ }
+
+ if (value_changed) {
+ compacted.Serialize(new_value);
+ return Decision::kChangeValue;
+ }
+
+ return Decision::kKeep;
+}
+
+CassandraCompactionFilterFactory::CassandraCompactionFilterFactory(
+ bool purge_ttl_on_expiration, int32_t gc_grace_period_in_seconds)
+ : options_(gc_grace_period_in_seconds, 0, purge_ttl_on_expiration) {
+ RegisterOptions(&options_, &cassandra_filter_type_info);
+}
+
+std::unique_ptr<CompactionFilter>
+CassandraCompactionFilterFactory::CreateCompactionFilter(
+ const CompactionFilter::Context&) {
+ std::unique_ptr<CompactionFilter> result(new CassandraCompactionFilter(
+ options_.purge_ttl_on_expiration, options_.gc_grace_period_in_seconds));
+ return result;
+}
+
+#ifndef ROCKSDB_LITE
+int RegisterCassandraObjects(ObjectLibrary& library,
+ const std::string& /*arg*/) {
+ library.AddFactory<MergeOperator>(
+ CassandraValueMergeOperator::kClassName(),
+ [](const std::string& /*uri*/, std::unique_ptr<MergeOperator>* guard,
+ std::string* /* errmsg */) {
+ guard->reset(new CassandraValueMergeOperator(0));
+ return guard->get();
+ });
+ library.AddFactory<CompactionFilter>(
+ CassandraCompactionFilter::kClassName(),
+ [](const std::string& /*uri*/,
+ std::unique_ptr<CompactionFilter>* /*guard */,
+ std::string* /* errmsg */) {
+ return new CassandraCompactionFilter(false, 0);
+ });
+ library.AddFactory<CompactionFilterFactory>(
+ CassandraCompactionFilterFactory::kClassName(),
+ [](const std::string& /*uri*/,
+ std::unique_ptr<CompactionFilterFactory>* guard,
+ std::string* /* errmsg */) {
+ guard->reset(new CassandraCompactionFilterFactory(false, 0));
+ return guard->get();
+ });
+ size_t num_types;
+ return static_cast<int>(library.GetFactoryCount(&num_types));
+}
+#endif // ROCKSDB_LITE
+} // namespace cassandra
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/cassandra/cassandra_compaction_filter.h b/src/rocksdb/utilities/cassandra/cassandra_compaction_filter.h
new file mode 100644
index 000000000..0325a4c39
--- /dev/null
+++ b/src/rocksdb/utilities/cassandra/cassandra_compaction_filter.h
@@ -0,0 +1,57 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <string>
+
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/slice.h"
+#include "utilities/cassandra/cassandra_options.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+
+/**
+ * Compaction filter for removing expired Cassandra data with TTL.
+ * If the option `purge_ttl_on_expiration` is set to true, expired data
+ * is purged directly. Otherwise expired data is first converted to
+ * tombstones and then eventually removed after the gc grace period.
+ * `purge_ttl_on_expiration` should only be enabled when all writes use
+ * the same TTL setting; otherwise it could bring old data back.
+ *
+ * The compaction filter is also in charge of removing tombstones that have
+ * been promoted to the kValue type after a series of merges in compaction.
+ */
+class CassandraCompactionFilter : public CompactionFilter {
+ public:
+ explicit CassandraCompactionFilter(bool purge_ttl_on_expiration,
+ int32_t gc_grace_period_in_seconds);
+ static const char* kClassName() { return "CassandraCompactionFilter"; }
+ const char* Name() const override { return kClassName(); }
+
+ virtual Decision FilterV2(int level, const Slice& key, ValueType value_type,
+ const Slice& existing_value, std::string* new_value,
+ std::string* skip_until) const override;
+
+ private:
+ CassandraOptions options_;
+};
+
+class CassandraCompactionFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit CassandraCompactionFilterFactory(bool purge_ttl_on_expiration,
+ int32_t gc_grace_period_in_seconds);
+ ~CassandraCompactionFilterFactory() override {}
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override;
+ static const char* kClassName() { return "CassandraCompactionFilterFactory"; }
+ const char* Name() const override { return kClassName(); }
+
+ private:
+ CassandraOptions options_;
+};
+} // namespace cassandra
+} // namespace ROCKSDB_NAMESPACE
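A minimal wiring sketch for this filter, mirroring the OpenDb() helper in cassandra_functional_test.cc further down in this patch: the merge operator reconciles row fragments, and the filter factory purges or tombstones expired columns during compaction. OpenCassandraStyleDb and the literal 100-second grace period are illustrative only.

    #include <string>
    #include "rocksdb/db.h"
    #include "rocksdb/options.h"
    #include "utilities/cassandra/cassandra_compaction_filter.h"
    #include "utilities/cassandra/merge_operator.h"

    using namespace ROCKSDB_NAMESPACE;
    using namespace ROCKSDB_NAMESPACE::cassandra;

    DB* OpenCassandraStyleDb(const std::string& path) {
      Options options;
      options.create_if_missing = true;
      // Merge row fragments with Cassandra semantics before filtering.
      options.merge_operator.reset(
          new CassandraValueMergeOperator(/*gc_grace_period_in_seconds=*/100));
      // Drop or tombstone expired columns and collect old tombstones during
      // compaction.
      options.compaction_filter_factory.reset(new CassandraCompactionFilterFactory(
          /*purge_ttl_on_expiration=*/false, /*gc_grace_period_in_seconds=*/100));
      DB* db = nullptr;
      Status s = DB::Open(options, path, &db);
      return s.ok() ? db : nullptr;
    }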
diff --git a/src/rocksdb/utilities/cassandra/cassandra_format_test.cc b/src/rocksdb/utilities/cassandra/cassandra_format_test.cc
new file mode 100644
index 000000000..4f12947ad
--- /dev/null
+++ b/src/rocksdb/utilities/cassandra/cassandra_format_test.cc
@@ -0,0 +1,377 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <cstring>
+#include <memory>
+
+#include "test_util/testharness.h"
+#include "utilities/cassandra/format.h"
+#include "utilities/cassandra/serialize.h"
+#include "utilities/cassandra/test_utils.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+
+TEST(ColumnTest, Column) {
+ char data[4] = {'d', 'a', 't', 'a'};
+ int8_t mask = 0;
+ int8_t index = 1;
+ int64_t timestamp = 1494022807044;
+ Column c = Column(mask, index, timestamp, sizeof(data), data);
+
+ EXPECT_EQ(c.Index(), index);
+ EXPECT_EQ(c.Timestamp(), timestamp);
+ EXPECT_EQ(c.Size(), 14 + sizeof(data));
+
+ // Verify the serialization.
+ std::string dest;
+ dest.reserve(c.Size() * 2);
+ c.Serialize(&dest);
+
+ EXPECT_EQ(dest.size(), c.Size());
+ std::size_t offset = 0;
+ EXPECT_EQ(Deserialize<int8_t>(dest.c_str(), offset), mask);
+ offset += sizeof(int8_t);
+ EXPECT_EQ(Deserialize<int8_t>(dest.c_str(), offset), index);
+ offset += sizeof(int8_t);
+ EXPECT_EQ(Deserialize<int64_t>(dest.c_str(), offset), timestamp);
+ offset += sizeof(int64_t);
+ EXPECT_EQ(Deserialize<int32_t>(dest.c_str(), offset), sizeof(data));
+ offset += sizeof(int32_t);
+ EXPECT_TRUE(std::memcmp(data, dest.c_str() + offset, sizeof(data)) == 0);
+
+ // Verify the deserialization.
+ std::string saved_dest = dest;
+ std::shared_ptr<Column> c1 = Column::Deserialize(saved_dest.c_str(), 0);
+ EXPECT_EQ(c1->Index(), index);
+ EXPECT_EQ(c1->Timestamp(), timestamp);
+ EXPECT_EQ(c1->Size(), 14 + sizeof(data));
+
+ c1->Serialize(&dest);
+ EXPECT_EQ(dest.size(), 2 * c.Size());
+ EXPECT_TRUE(std::memcmp(dest.c_str(), dest.c_str() + c.Size(), c.Size()) ==
+ 0);
+
+ // Verify the ColumnBase::Deserialization.
+ saved_dest = dest;
+ std::shared_ptr<ColumnBase> c2 =
+ ColumnBase::Deserialize(saved_dest.c_str(), c.Size());
+ c2->Serialize(&dest);
+ EXPECT_EQ(dest.size(), 3 * c.Size());
+ EXPECT_TRUE(std::memcmp(dest.c_str() + c.Size(), dest.c_str() + c.Size() * 2,
+ c.Size()) == 0);
+}
+
+TEST(ExpiringColumnTest, ExpiringColumn) {
+ char data[4] = {'d', 'a', 't', 'a'};
+ int8_t mask = ColumnTypeMask::EXPIRATION_MASK;
+ int8_t index = 3;
+ int64_t timestamp = 1494022807044;
+ int32_t ttl = 3600;
+ ExpiringColumn c =
+ ExpiringColumn(mask, index, timestamp, sizeof(data), data, ttl);
+
+ EXPECT_EQ(c.Index(), index);
+ EXPECT_EQ(c.Timestamp(), timestamp);
+ EXPECT_EQ(c.Size(), 18 + sizeof(data));
+
+ // Verify the serialization.
+ std::string dest;
+ dest.reserve(c.Size() * 2);
+ c.Serialize(&dest);
+
+ EXPECT_EQ(dest.size(), c.Size());
+ std::size_t offset = 0;
+ EXPECT_EQ(Deserialize<int8_t>(dest.c_str(), offset), mask);
+ offset += sizeof(int8_t);
+ EXPECT_EQ(Deserialize<int8_t>(dest.c_str(), offset), index);
+ offset += sizeof(int8_t);
+ EXPECT_EQ(Deserialize<int64_t>(dest.c_str(), offset), timestamp);
+ offset += sizeof(int64_t);
+ EXPECT_EQ(Deserialize<int32_t>(dest.c_str(), offset), sizeof(data));
+ offset += sizeof(int32_t);
+ EXPECT_TRUE(std::memcmp(data, dest.c_str() + offset, sizeof(data)) == 0);
+ offset += sizeof(data);
+ EXPECT_EQ(Deserialize<int32_t>(dest.c_str(), offset), ttl);
+
+ // Verify the deserialization.
+ std::string saved_dest = dest;
+ std::shared_ptr<ExpiringColumn> c1 =
+ ExpiringColumn::Deserialize(saved_dest.c_str(), 0);
+ EXPECT_EQ(c1->Index(), index);
+ EXPECT_EQ(c1->Timestamp(), timestamp);
+ EXPECT_EQ(c1->Size(), 18 + sizeof(data));
+
+ c1->Serialize(&dest);
+ EXPECT_EQ(dest.size(), 2 * c.Size());
+ EXPECT_TRUE(std::memcmp(dest.c_str(), dest.c_str() + c.Size(), c.Size()) ==
+ 0);
+
+ // Verify the ColumnBase::Deserialization.
+ saved_dest = dest;
+ std::shared_ptr<ColumnBase> c2 =
+ ColumnBase::Deserialize(saved_dest.c_str(), c.Size());
+ c2->Serialize(&dest);
+ EXPECT_EQ(dest.size(), 3 * c.Size());
+ EXPECT_TRUE(std::memcmp(dest.c_str() + c.Size(), dest.c_str() + c.Size() * 2,
+ c.Size()) == 0);
+}
+
+TEST(TombstoneTest, TombstoneCollectable) {
+ int32_t now = (int32_t)time(nullptr);
+ int32_t gc_grace_seconds = 16440;
+ int32_t time_delta_seconds = 10;
+ EXPECT_TRUE(
+ Tombstone(ColumnTypeMask::DELETION_MASK, 0,
+ now - gc_grace_seconds - time_delta_seconds,
+ ToMicroSeconds(now - gc_grace_seconds - time_delta_seconds))
+ .Collectable(gc_grace_seconds));
+ EXPECT_FALSE(
+ Tombstone(ColumnTypeMask::DELETION_MASK, 0,
+ now - gc_grace_seconds + time_delta_seconds,
+ ToMicroSeconds(now - gc_grace_seconds + time_delta_seconds))
+ .Collectable(gc_grace_seconds));
+}
+
+TEST(TombstoneTest, Tombstone) {
+ int8_t mask = ColumnTypeMask::DELETION_MASK;
+ int8_t index = 2;
+ int32_t local_deletion_time = 1494022807;
+ int64_t marked_for_delete_at = 1494022807044;
+ Tombstone c =
+ Tombstone(mask, index, local_deletion_time, marked_for_delete_at);
+
+ EXPECT_EQ(c.Index(), index);
+ EXPECT_EQ(c.Timestamp(), marked_for_delete_at);
+ EXPECT_EQ(c.Size(), 14);
+
+ // Verify the serialization.
+ std::string dest;
+ dest.reserve(c.Size() * 2);
+ c.Serialize(&dest);
+
+ EXPECT_EQ(dest.size(), c.Size());
+ std::size_t offset = 0;
+ EXPECT_EQ(Deserialize<int8_t>(dest.c_str(), offset), mask);
+ offset += sizeof(int8_t);
+ EXPECT_EQ(Deserialize<int8_t>(dest.c_str(), offset), index);
+ offset += sizeof(int8_t);
+ EXPECT_EQ(Deserialize<int32_t>(dest.c_str(), offset), local_deletion_time);
+ offset += sizeof(int32_t);
+ EXPECT_EQ(Deserialize<int64_t>(dest.c_str(), offset), marked_for_delete_at);
+
+ // Verify the deserialization.
+ std::shared_ptr<Tombstone> c1 = Tombstone::Deserialize(dest.c_str(), 0);
+ EXPECT_EQ(c1->Index(), index);
+ EXPECT_EQ(c1->Timestamp(), marked_for_delete_at);
+ EXPECT_EQ(c1->Size(), 14);
+
+ c1->Serialize(&dest);
+ EXPECT_EQ(dest.size(), 2 * c.Size());
+ EXPECT_TRUE(std::memcmp(dest.c_str(), dest.c_str() + c.Size(), c.Size()) ==
+ 0);
+
+ // Verify the ColumnBase::Deserialization.
+ std::shared_ptr<ColumnBase> c2 =
+ ColumnBase::Deserialize(dest.c_str(), c.Size());
+ c2->Serialize(&dest);
+ EXPECT_EQ(dest.size(), 3 * c.Size());
+ EXPECT_TRUE(std::memcmp(dest.c_str() + c.Size(), dest.c_str() + c.Size() * 2,
+ c.Size()) == 0);
+}
+
+class RowValueTest : public testing::Test {};
+
+TEST(RowValueTest, RowTombstone) {
+ int32_t local_deletion_time = 1494022807;
+ int64_t marked_for_delete_at = 1494022807044;
+ RowValue r = RowValue(local_deletion_time, marked_for_delete_at);
+
+ EXPECT_EQ(r.Size(), 12);
+ EXPECT_EQ(r.IsTombstone(), true);
+ EXPECT_EQ(r.LastModifiedTime(), marked_for_delete_at);
+
+ // Verify the serialization.
+ std::string dest;
+ dest.reserve(r.Size() * 2);
+ r.Serialize(&dest);
+
+ EXPECT_EQ(dest.size(), r.Size());
+ std::size_t offset = 0;
+ EXPECT_EQ(Deserialize<int32_t>(dest.c_str(), offset), local_deletion_time);
+ offset += sizeof(int32_t);
+ EXPECT_EQ(Deserialize<int64_t>(dest.c_str(), offset), marked_for_delete_at);
+
+ // Verify the deserialization.
+ RowValue r1 = RowValue::Deserialize(dest.c_str(), r.Size());
+ EXPECT_EQ(r1.Size(), 12);
+ EXPECT_EQ(r1.IsTombstone(), true);
+ EXPECT_EQ(r1.LastModifiedTime(), marked_for_delete_at);
+
+ r1.Serialize(&dest);
+ EXPECT_EQ(dest.size(), 2 * r.Size());
+ EXPECT_TRUE(std::memcmp(dest.c_str(), dest.c_str() + r.Size(), r.Size()) ==
+ 0);
+}
+
+TEST(RowValueTest, RowWithColumns) {
+ std::vector<std::shared_ptr<ColumnBase>> columns;
+ int64_t last_modified_time = 1494022807048;
+ std::size_t columns_data_size = 0;
+
+ char e_data[5] = {'e', 'd', 'a', 't', 'a'};
+ int8_t e_index = 0;
+ int64_t e_timestamp = 1494022807044;
+ int32_t e_ttl = 3600;
+ columns.push_back(std::shared_ptr<ExpiringColumn>(
+ new ExpiringColumn(ColumnTypeMask::EXPIRATION_MASK, e_index, e_timestamp,
+ sizeof(e_data), e_data, e_ttl)));
+ columns_data_size += columns[0]->Size();
+
+ char c_data[4] = {'d', 'a', 't', 'a'};
+ int8_t c_index = 1;
+ int64_t c_timestamp = 1494022807048;
+ columns.push_back(std::shared_ptr<Column>(
+ new Column(0, c_index, c_timestamp, sizeof(c_data), c_data)));
+ columns_data_size += columns[1]->Size();
+
+ int8_t t_index = 2;
+ int32_t t_local_deletion_time = 1494022801;
+ int64_t t_marked_for_delete_at = 1494022807043;
+ columns.push_back(std::shared_ptr<Tombstone>(
+ new Tombstone(ColumnTypeMask::DELETION_MASK, t_index,
+ t_local_deletion_time, t_marked_for_delete_at)));
+ columns_data_size += columns[2]->Size();
+
+ RowValue r = RowValue(std::move(columns), last_modified_time);
+
+ EXPECT_EQ(r.Size(), columns_data_size + 12);
+ EXPECT_EQ(r.IsTombstone(), false);
+ EXPECT_EQ(r.LastModifiedTime(), last_modified_time);
+
+ // Verify the serialization.
+ std::string dest;
+ dest.reserve(r.Size() * 2);
+ r.Serialize(&dest);
+
+ EXPECT_EQ(dest.size(), r.Size());
+ std::size_t offset = 0;
+ EXPECT_EQ(Deserialize<int32_t>(dest.c_str(), offset),
+ std::numeric_limits<int32_t>::max());
+ offset += sizeof(int32_t);
+ EXPECT_EQ(Deserialize<int64_t>(dest.c_str(), offset),
+ std::numeric_limits<int64_t>::min());
+ offset += sizeof(int64_t);
+
+ // Column0: ExpiringColumn
+ EXPECT_EQ(Deserialize<int8_t>(dest.c_str(), offset),
+ ColumnTypeMask::EXPIRATION_MASK);
+ offset += sizeof(int8_t);
+ EXPECT_EQ(Deserialize<int8_t>(dest.c_str(), offset), e_index);
+ offset += sizeof(int8_t);
+ EXPECT_EQ(Deserialize<int64_t>(dest.c_str(), offset), e_timestamp);
+ offset += sizeof(int64_t);
+ EXPECT_EQ(Deserialize<int32_t>(dest.c_str(), offset), sizeof(e_data));
+ offset += sizeof(int32_t);
+ EXPECT_TRUE(std::memcmp(e_data, dest.c_str() + offset, sizeof(e_data)) == 0);
+ offset += sizeof(e_data);
+ EXPECT_EQ(Deserialize<int32_t>(dest.c_str(), offset), e_ttl);
+ offset += sizeof(int32_t);
+
+ // Column1: Column
+ EXPECT_EQ(Deserialize<int8_t>(dest.c_str(), offset), 0);
+ offset += sizeof(int8_t);
+ EXPECT_EQ(Deserialize<int8_t>(dest.c_str(), offset), c_index);
+ offset += sizeof(int8_t);
+ EXPECT_EQ(Deserialize<int64_t>(dest.c_str(), offset), c_timestamp);
+ offset += sizeof(int64_t);
+ EXPECT_EQ(Deserialize<int32_t>(dest.c_str(), offset), sizeof(c_data));
+ offset += sizeof(int32_t);
+ EXPECT_TRUE(std::memcmp(c_data, dest.c_str() + offset, sizeof(c_data)) == 0);
+ offset += sizeof(c_data);
+
+ // Column2: Tombstone
+ EXPECT_EQ(Deserialize<int8_t>(dest.c_str(), offset),
+ ColumnTypeMask::DELETION_MASK);
+ offset += sizeof(int8_t);
+ EXPECT_EQ(Deserialize<int8_t>(dest.c_str(), offset), t_index);
+ offset += sizeof(int8_t);
+ EXPECT_EQ(Deserialize<int32_t>(dest.c_str(), offset), t_local_deletion_time);
+ offset += sizeof(int32_t);
+ EXPECT_EQ(Deserialize<int64_t>(dest.c_str(), offset), t_marked_for_delete_at);
+
+ // Verify the deserialization.
+ RowValue r1 = RowValue::Deserialize(dest.c_str(), r.Size());
+ EXPECT_EQ(r1.Size(), columns_data_size + 12);
+ EXPECT_EQ(r1.IsTombstone(), false);
+ EXPECT_EQ(r1.LastModifiedTime(), last_modified_time);
+
+ r1.Serialize(&dest);
+ EXPECT_EQ(dest.size(), 2 * r.Size());
+ EXPECT_TRUE(std::memcmp(dest.c_str(), dest.c_str() + r.Size(), r.Size()) ==
+ 0);
+}
+
+TEST(RowValueTest, PurgeTtlShouldRemoveAllColumnsExpired) {
+ int64_t now = time(nullptr);
+
+ auto row_value = CreateTestRowValue(
+ {CreateTestColumnSpec(kColumn, 0, ToMicroSeconds(now)),
+ CreateTestColumnSpec(kExpiringColumn, 1,
+ ToMicroSeconds(now - kTtl - 10)), // expired
+ CreateTestColumnSpec(kExpiringColumn, 2,
+ ToMicroSeconds(now)), // not expired
+ CreateTestColumnSpec(kTombstone, 3, ToMicroSeconds(now))});
+
+ bool changed = false;
+ auto purged = row_value.RemoveExpiredColumns(&changed);
+ EXPECT_TRUE(changed);
+ EXPECT_EQ(purged.get_columns().size(), 3);
+ VerifyRowValueColumns(purged.get_columns(), 0, kColumn, 0,
+ ToMicroSeconds(now));
+ VerifyRowValueColumns(purged.get_columns(), 1, kExpiringColumn, 2,
+ ToMicroSeconds(now));
+ VerifyRowValueColumns(purged.get_columns(), 2, kTombstone, 3,
+ ToMicroSeconds(now));
+
+ purged.RemoveExpiredColumns(&changed);
+ EXPECT_FALSE(changed);
+}
+
+TEST(RowValueTest, ExpireTtlShouldConvertExpiredColumnsToTombstones) {
+ int64_t now = time(nullptr);
+
+ auto row_value = CreateTestRowValue(
+ {CreateTestColumnSpec(kColumn, 0, ToMicroSeconds(now)),
+ CreateTestColumnSpec(kExpiringColumn, 1,
+ ToMicroSeconds(now - kTtl - 10)), // expired
+ CreateTestColumnSpec(kExpiringColumn, 2,
+ ToMicroSeconds(now)), // not expired
+ CreateTestColumnSpec(kTombstone, 3, ToMicroSeconds(now))});
+
+ bool changed = false;
+ auto compacted = row_value.ConvertExpiredColumnsToTombstones(&changed);
+ EXPECT_TRUE(changed);
+ EXPECT_EQ(compacted.get_columns().size(), 4);
+ VerifyRowValueColumns(compacted.get_columns(), 0, kColumn, 0,
+ ToMicroSeconds(now));
+ VerifyRowValueColumns(compacted.get_columns(), 1, kTombstone, 1,
+ ToMicroSeconds(now - 10));
+ VerifyRowValueColumns(compacted.get_columns(), 2, kExpiringColumn, 2,
+ ToMicroSeconds(now));
+ VerifyRowValueColumns(compacted.get_columns(), 3, kTombstone, 3,
+ ToMicroSeconds(now));
+
+ compacted.ConvertExpiredColumnsToTombstones(&changed);
+ EXPECT_FALSE(changed);
+}
+} // namespace cassandra
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/utilities/cassandra/cassandra_functional_test.cc b/src/rocksdb/utilities/cassandra/cassandra_functional_test.cc
new file mode 100644
index 000000000..c5be836e8
--- /dev/null
+++ b/src/rocksdb/utilities/cassandra/cassandra_functional_test.cc
@@ -0,0 +1,446 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <iostream>
+
+#include "db/db_impl/db_impl.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "test_util/testharness.h"
+#include "util/cast_util.h"
+#include "util/random.h"
+#include "utilities/cassandra/cassandra_compaction_filter.h"
+#include "utilities/cassandra/merge_operator.h"
+#include "utilities/cassandra/test_utils.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+
+// Path to the database on file system
+const std::string kDbName = test::PerThreadDBPath("cassandra_functional_test");
+
+class CassandraStore {
+ public:
+ explicit CassandraStore(std::shared_ptr<DB> db)
+ : db_(db), write_option_(), get_option_() {
+ assert(db);
+ }
+
+ bool Append(const std::string& key, const RowValue& val) {
+ std::string result;
+ val.Serialize(&result);
+ Slice valSlice(result.data(), result.size());
+ auto s = db_->Merge(write_option_, key, valSlice);
+
+ if (s.ok()) {
+ return true;
+ } else {
+ std::cerr << "ERROR " << s.ToString() << std::endl;
+ return false;
+ }
+ }
+
+ bool Put(const std::string& key, const RowValue& val) {
+ std::string result;
+ val.Serialize(&result);
+ Slice valSlice(result.data(), result.size());
+ auto s = db_->Put(write_option_, key, valSlice);
+ if (s.ok()) {
+ return true;
+ } else {
+ std::cerr << "ERROR " << s.ToString() << std::endl;
+ return false;
+ }
+ }
+
+ Status Flush() {
+ Status s = dbfull()->TEST_FlushMemTable();
+ if (s.ok()) {
+ s = dbfull()->TEST_WaitForCompact();
+ }
+ return s;
+ }
+
+ Status Compact() {
+ return dbfull()->TEST_CompactRange(0, nullptr, nullptr,
+ db_->DefaultColumnFamily());
+ }
+
+ std::tuple<bool, RowValue> Get(const std::string& key) {
+ std::string result;
+ auto s = db_->Get(get_option_, key, &result);
+
+ if (s.ok()) {
+ return std::make_tuple(
+ true, RowValue::Deserialize(result.data(), result.size()));
+ }
+
+ if (!s.IsNotFound()) {
+ std::cerr << "ERROR " << s.ToString() << std::endl;
+ }
+
+ return std::make_tuple(false, RowValue(0, 0));
+ }
+
+ private:
+ std::shared_ptr<DB> db_;
+ WriteOptions write_option_;
+ ReadOptions get_option_;
+
+ DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_.get()); }
+};
+
+class TestCompactionFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit TestCompactionFilterFactory(bool purge_ttl_on_expiration,
+ int32_t gc_grace_period_in_seconds)
+ : purge_ttl_on_expiration_(purge_ttl_on_expiration),
+ gc_grace_period_in_seconds_(gc_grace_period_in_seconds) {}
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& /*context*/) override {
+ return std::unique_ptr<CompactionFilter>(new CassandraCompactionFilter(
+ purge_ttl_on_expiration_, gc_grace_period_in_seconds_));
+ }
+
+ const char* Name() const override { return "TestCompactionFilterFactory"; }
+
+ private:
+ bool purge_ttl_on_expiration_;
+ int32_t gc_grace_period_in_seconds_;
+};
+
+// The class for unit-testing
+class CassandraFunctionalTest : public testing::Test {
+ public:
+ CassandraFunctionalTest() {
+ EXPECT_OK(
+ DestroyDB(kDbName, Options())); // Start each test with a fresh DB
+ }
+
+ std::shared_ptr<DB> OpenDb() {
+ DB* db;
+ Options options;
+ options.create_if_missing = true;
+ options.merge_operator.reset(
+ new CassandraValueMergeOperator(gc_grace_period_in_seconds_));
+ auto* cf_factory = new TestCompactionFilterFactory(
+ purge_ttl_on_expiration_, gc_grace_period_in_seconds_);
+ options.compaction_filter_factory.reset(cf_factory);
+ EXPECT_OK(DB::Open(options, kDbName, &db));
+ return std::shared_ptr<DB>(db);
+ }
+
+ bool purge_ttl_on_expiration_ = false;
+ int32_t gc_grace_period_in_seconds_ = 100;
+};
+
+// THE TEST CASES BEGIN HERE
+
+TEST_F(CassandraFunctionalTest, SimpleMergeTest) {
+ CassandraStore store(OpenDb());
+ int64_t now = time(nullptr);
+
+ store.Append(
+ "k1",
+ CreateTestRowValue({
+ CreateTestColumnSpec(kTombstone, 0, ToMicroSeconds(now + 5)),
+ CreateTestColumnSpec(kColumn, 1, ToMicroSeconds(now + 8)),
+ CreateTestColumnSpec(kExpiringColumn, 2, ToMicroSeconds(now + 5)),
+ }));
+ store.Append(
+ "k1",
+ CreateTestRowValue({
+ CreateTestColumnSpec(kColumn, 0, ToMicroSeconds(now + 2)),
+ CreateTestColumnSpec(kExpiringColumn, 1, ToMicroSeconds(now + 5)),
+ CreateTestColumnSpec(kTombstone, 2, ToMicroSeconds(now + 7)),
+ CreateTestColumnSpec(kExpiringColumn, 7, ToMicroSeconds(now + 17)),
+ }));
+ store.Append(
+ "k1",
+ CreateTestRowValue({
+ CreateTestColumnSpec(kExpiringColumn, 0, ToMicroSeconds(now + 6)),
+ CreateTestColumnSpec(kTombstone, 1, ToMicroSeconds(now + 5)),
+ CreateTestColumnSpec(kColumn, 2, ToMicroSeconds(now + 4)),
+ CreateTestColumnSpec(kTombstone, 11, ToMicroSeconds(now + 11)),
+ }));
+
+ auto ret = store.Get("k1");
+
+ ASSERT_TRUE(std::get<0>(ret));
+ RowValue& merged = std::get<1>(ret);
+ EXPECT_EQ(merged.get_columns().size(), 5);
+ VerifyRowValueColumns(merged.get_columns(), 0, kExpiringColumn, 0,
+ ToMicroSeconds(now + 6));
+ VerifyRowValueColumns(merged.get_columns(), 1, kColumn, 1,
+ ToMicroSeconds(now + 8));
+ VerifyRowValueColumns(merged.get_columns(), 2, kTombstone, 2,
+ ToMicroSeconds(now + 7));
+ VerifyRowValueColumns(merged.get_columns(), 3, kExpiringColumn, 7,
+ ToMicroSeconds(now + 17));
+ VerifyRowValueColumns(merged.get_columns(), 4, kTombstone, 11,
+ ToMicroSeconds(now + 11));
+}
+
+constexpr int64_t kTestTimeoutSecs = 600;
+
+TEST_F(CassandraFunctionalTest,
+ CompactionShouldConvertExpiredColumnsToTombstone) {
+ CassandraStore store(OpenDb());
+ int64_t now = time(nullptr);
+
+ store.Append(
+ "k1",
+ CreateTestRowValue(
+ {CreateTestColumnSpec(kExpiringColumn, 0,
+ ToMicroSeconds(now - kTtl - 20)), // expired
+ CreateTestColumnSpec(
+ kExpiringColumn, 1,
+ ToMicroSeconds(now - kTtl + kTestTimeoutSecs)), // not expired
+ CreateTestColumnSpec(kTombstone, 3, ToMicroSeconds(now))}));
+
+ ASSERT_OK(store.Flush());
+
+ store.Append(
+ "k1",
+ CreateTestRowValue(
+ {CreateTestColumnSpec(kExpiringColumn, 0,
+ ToMicroSeconds(now - kTtl - 10)), // expired
+ CreateTestColumnSpec(kColumn, 2, ToMicroSeconds(now))}));
+
+ ASSERT_OK(store.Flush());
+ ASSERT_OK(store.Compact());
+
+ auto ret = store.Get("k1");
+ ASSERT_TRUE(std::get<0>(ret));
+ RowValue& merged = std::get<1>(ret);
+ EXPECT_EQ(merged.get_columns().size(), 4);
+ VerifyRowValueColumns(merged.get_columns(), 0, kTombstone, 0,
+ ToMicroSeconds(now - 10));
+ VerifyRowValueColumns(merged.get_columns(), 1, kExpiringColumn, 1,
+ ToMicroSeconds(now - kTtl + kTestTimeoutSecs));
+ VerifyRowValueColumns(merged.get_columns(), 2, kColumn, 2,
+ ToMicroSeconds(now));
+ VerifyRowValueColumns(merged.get_columns(), 3, kTombstone, 3,
+ ToMicroSeconds(now));
+}
+
+TEST_F(CassandraFunctionalTest,
+ CompactionShouldPurgeExpiredColumnsIfPurgeTtlIsOn) {
+ purge_ttl_on_expiration_ = true;
+ CassandraStore store(OpenDb());
+ int64_t now = time(nullptr);
+
+ store.Append(
+ "k1",
+ CreateTestRowValue(
+ {CreateTestColumnSpec(kExpiringColumn, 0,
+ ToMicroSeconds(now - kTtl - 20)), // expired
+ CreateTestColumnSpec(kExpiringColumn, 1,
+ ToMicroSeconds(now)), // not expired
+ CreateTestColumnSpec(kTombstone, 3, ToMicroSeconds(now))}));
+
+ ASSERT_OK(store.Flush());
+
+ store.Append(
+ "k1",
+ CreateTestRowValue(
+ {CreateTestColumnSpec(kExpiringColumn, 0,
+ ToMicroSeconds(now - kTtl - 10)), // expired
+ CreateTestColumnSpec(kColumn, 2, ToMicroSeconds(now))}));
+
+ ASSERT_OK(store.Flush());
+ ASSERT_OK(store.Compact());
+
+ auto ret = store.Get("k1");
+ ASSERT_TRUE(std::get<0>(ret));
+ RowValue& merged = std::get<1>(ret);
+ EXPECT_EQ(merged.get_columns().size(), 3);
+ VerifyRowValueColumns(merged.get_columns(), 0, kExpiringColumn, 1,
+ ToMicroSeconds(now));
+ VerifyRowValueColumns(merged.get_columns(), 1, kColumn, 2,
+ ToMicroSeconds(now));
+ VerifyRowValueColumns(merged.get_columns(), 2, kTombstone, 3,
+ ToMicroSeconds(now));
+}
+
+TEST_F(CassandraFunctionalTest,
+ CompactionShouldRemoveRowWhenAllColumnsExpiredIfPurgeTtlIsOn) {
+ purge_ttl_on_expiration_ = true;
+ CassandraStore store(OpenDb());
+ int64_t now = time(nullptr);
+
+ store.Append("k1", CreateTestRowValue({
+ CreateTestColumnSpec(kExpiringColumn, 0,
+ ToMicroSeconds(now - kTtl - 20)),
+ CreateTestColumnSpec(kExpiringColumn, 1,
+ ToMicroSeconds(now - kTtl - 20)),
+ }));
+
+ ASSERT_OK(store.Flush());
+
+ store.Append("k1", CreateTestRowValue({
+ CreateTestColumnSpec(kExpiringColumn, 0,
+ ToMicroSeconds(now - kTtl - 10)),
+ }));
+
+ ASSERT_OK(store.Flush());
+ ASSERT_OK(store.Compact());
+ ASSERT_FALSE(std::get<0>(store.Get("k1")));
+}
+
+TEST_F(CassandraFunctionalTest,
+ CompactionShouldRemoveTombstoneExceedingGCGracePeriod) {
+ purge_ttl_on_expiration_ = true;
+ CassandraStore store(OpenDb());
+ int64_t now = time(nullptr);
+
+ store.Append("k1",
+ CreateTestRowValue(
+ {CreateTestColumnSpec(
+ kTombstone, 0,
+ ToMicroSeconds(now - gc_grace_period_in_seconds_ - 1)),
+ CreateTestColumnSpec(kColumn, 1, ToMicroSeconds(now))}));
+
+ store.Append("k2", CreateTestRowValue({CreateTestColumnSpec(
+ kColumn, 0, ToMicroSeconds(now))}));
+
+ ASSERT_OK(store.Flush());
+
+ store.Append("k1", CreateTestRowValue({
+ CreateTestColumnSpec(kColumn, 1, ToMicroSeconds(now)),
+ }));
+
+ ASSERT_OK(store.Flush());
+ ASSERT_OK(store.Compact());
+
+ auto ret = store.Get("k1");
+ ASSERT_TRUE(std::get<0>(ret));
+ RowValue& gced = std::get<1>(ret);
+ EXPECT_EQ(gced.get_columns().size(), 1);
+ VerifyRowValueColumns(gced.get_columns(), 0, kColumn, 1, ToMicroSeconds(now));
+}
+
+TEST_F(CassandraFunctionalTest, CompactionShouldRemoveTombstoneFromPut) {
+ purge_ttl_on_expiration_ = true;
+ CassandraStore store(OpenDb());
+ int64_t now = time(nullptr);
+
+ store.Put("k1",
+ CreateTestRowValue({
+ CreateTestColumnSpec(
+ kTombstone, 0,
+ ToMicroSeconds(now - gc_grace_period_in_seconds_ - 1)),
+ }));
+
+ ASSERT_OK(store.Flush());
+ ASSERT_OK(store.Compact());
+ ASSERT_FALSE(std::get<0>(store.Get("k1")));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(CassandraFunctionalTest, LoadMergeOperator) {
+ ConfigOptions config_options;
+ std::shared_ptr<MergeOperator> mo;
+ config_options.ignore_unsupported_options = false;
+
+ ASSERT_NOK(MergeOperator::CreateFromString(
+ config_options, CassandraValueMergeOperator::kClassName(), &mo));
+
+ config_options.registry->AddLibrary("cassandra", RegisterCassandraObjects,
+ "cassandra");
+
+ ASSERT_OK(MergeOperator::CreateFromString(
+ config_options, CassandraValueMergeOperator::kClassName(), &mo));
+ ASSERT_NE(mo, nullptr);
+ ASSERT_STREQ(mo->Name(), CassandraValueMergeOperator::kClassName());
+ mo.reset();
+ ASSERT_OK(MergeOperator::CreateFromString(
+ config_options,
+ std::string("operands_limit=20;gc_grace_period_in_seconds=42;id=") +
+ CassandraValueMergeOperator::kClassName(),
+ &mo));
+ ASSERT_NE(mo, nullptr);
+ ASSERT_STREQ(mo->Name(), CassandraValueMergeOperator::kClassName());
+ const auto* opts = mo->GetOptions<CassandraOptions>();
+ ASSERT_NE(opts, nullptr);
+ ASSERT_EQ(opts->gc_grace_period_in_seconds, 42);
+ ASSERT_EQ(opts->operands_limit, 20);
+}
+
+TEST_F(CassandraFunctionalTest, LoadCompactionFilter) {
+ ConfigOptions config_options;
+ const CompactionFilter* filter = nullptr;
+ config_options.ignore_unsupported_options = false;
+
+ ASSERT_NOK(CompactionFilter::CreateFromString(
+ config_options, CassandraCompactionFilter::kClassName(), &filter));
+ config_options.registry->AddLibrary("cassandra", RegisterCassandraObjects,
+ "cassandra");
+
+ ASSERT_OK(CompactionFilter::CreateFromString(
+ config_options, CassandraCompactionFilter::kClassName(), &filter));
+ ASSERT_NE(filter, nullptr);
+ ASSERT_STREQ(filter->Name(), CassandraCompactionFilter::kClassName());
+ delete filter;
+ filter = nullptr;
+ ASSERT_OK(CompactionFilter::CreateFromString(
+ config_options,
+ std::string(
+ "purge_ttl_on_expiration=true;gc_grace_period_in_seconds=42;id=") +
+ CassandraCompactionFilter::kClassName(),
+ &filter));
+ ASSERT_NE(filter, nullptr);
+ ASSERT_STREQ(filter->Name(), CassandraCompactionFilter::kClassName());
+ const auto* opts = filter->GetOptions<CassandraOptions>();
+ ASSERT_NE(opts, nullptr);
+ ASSERT_EQ(opts->gc_grace_period_in_seconds, 42);
+ ASSERT_TRUE(opts->purge_ttl_on_expiration);
+ delete filter;
+}
+
+TEST_F(CassandraFunctionalTest, LoadCompactionFilterFactory) {
+ ConfigOptions config_options;
+ std::shared_ptr<CompactionFilterFactory> factory;
+
+ config_options.ignore_unsupported_options = false;
+ ASSERT_NOK(CompactionFilterFactory::CreateFromString(
+ config_options, CassandraCompactionFilterFactory::kClassName(),
+ &factory));
+ config_options.registry->AddLibrary("cassandra", RegisterCassandraObjects,
+ "cassandra");
+
+ ASSERT_OK(CompactionFilterFactory::CreateFromString(
+ config_options, CassandraCompactionFilterFactory::kClassName(),
+ &factory));
+ ASSERT_NE(factory, nullptr);
+ ASSERT_STREQ(factory->Name(), CassandraCompactionFilterFactory::kClassName());
+ factory.reset();
+ ASSERT_OK(CompactionFilterFactory::CreateFromString(
+ config_options,
+ std::string(
+ "purge_ttl_on_expiration=true;gc_grace_period_in_seconds=42;id=") +
+ CassandraCompactionFilterFactory::kClassName(),
+ &factory));
+ ASSERT_NE(factory, nullptr);
+ ASSERT_STREQ(factory->Name(), CassandraCompactionFilterFactory::kClassName());
+ const auto* opts = factory->GetOptions<CassandraOptions>();
+ ASSERT_NE(opts, nullptr);
+ ASSERT_EQ(opts->gc_grace_period_in_seconds, 42);
+ ASSERT_TRUE(opts->purge_ttl_on_expiration);
+}
+#endif // ROCKSDB_LITE
+
+} // namespace cassandra
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/utilities/cassandra/cassandra_options.h b/src/rocksdb/utilities/cassandra/cassandra_options.h
new file mode 100644
index 000000000..efa73a308
--- /dev/null
+++ b/src/rocksdb/utilities/cassandra/cassandra_options.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+class ObjectLibrary;
+namespace cassandra {
+struct CassandraOptions {
+ static const char* kName() { return "CassandraOptions"; }
+ CassandraOptions(int32_t _gc_grace_period_in_seconds, size_t _operands_limit,
+ bool _purge_ttl_on_expiration = false)
+ : operands_limit(_operands_limit),
+ gc_grace_period_in_seconds(_gc_grace_period_in_seconds),
+ purge_ttl_on_expiration(_purge_ttl_on_expiration) {}
+ // Limit on the number of merge operands.
+ size_t operands_limit;
+
+ // How long (in seconds) tombstoned data remains before it is purged
+ int32_t gc_grace_period_in_seconds;
+
+ // If set to true, expired data is purged directly.
+ // Otherwise expired data is first converted to tombstones and then
+ // eventually removed after the gc grace period. This value should only
+ // be true if all writes use the same ttl setting; otherwise it could
+ // bring old data back.
+ bool purge_ttl_on_expiration;
+};
+#ifndef ROCKSDB_LITE
+extern "C" {
+int RegisterCassandraObjects(ObjectLibrary& library, const std::string& arg);
+} // extern "C"
+#endif // ROCKSDB_LITE
+} // namespace cassandra
+} // namespace ROCKSDB_NAMESPACE
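Because the compaction filter and its factory register these options via RegisterOptions() (see cassandra_compaction_filter.cc above), they can also be configured from an option string once RegisterCassandraObjects() has been added to the ObjectRegistry, exactly as the Load* tests in cassandra_functional_test.cc above do. A condensed sketch (MakeConfiguredFactory is a hypothetical helper name):

    #include <memory>
    #include <string>
    #include "rocksdb/compaction_filter.h"
    #include "rocksdb/convenience.h"
    #include "rocksdb/utilities/object_registry.h"
    #include "utilities/cassandra/cassandra_compaction_filter.h"

    using namespace ROCKSDB_NAMESPACE;
    using namespace ROCKSDB_NAMESPACE::cassandra;

    Status MakeConfiguredFactory(std::shared_ptr<CompactionFilterFactory>* factory) {
      ConfigOptions config_options;
      config_options.ignore_unsupported_options = false;
      // Make the cassandra factories visible to CreateFromString().
      config_options.registry->AddLibrary("cassandra", RegisterCassandraObjects,
                                          "cassandra");
      return CompactionFilterFactory::CreateFromString(
          config_options,
          "purge_ttl_on_expiration=true;gc_grace_period_in_seconds=42;id=" +
              std::string(CassandraCompactionFilterFactory::kClassName()),
          factory);
    }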
diff --git a/src/rocksdb/utilities/cassandra/cassandra_row_merge_test.cc b/src/rocksdb/utilities/cassandra/cassandra_row_merge_test.cc
new file mode 100644
index 000000000..0b4a89287
--- /dev/null
+++ b/src/rocksdb/utilities/cassandra/cassandra_row_merge_test.cc
@@ -0,0 +1,98 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <memory>
+
+#include "test_util/testharness.h"
+#include "utilities/cassandra/format.h"
+#include "utilities/cassandra/test_utils.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+
+class RowValueMergeTest : public testing::Test {};
+
+TEST(RowValueMergeTest, Merge) {
+ std::vector<RowValue> row_values;
+ row_values.push_back(CreateTestRowValue({
+ CreateTestColumnSpec(kTombstone, 0, 5),
+ CreateTestColumnSpec(kColumn, 1, 8),
+ CreateTestColumnSpec(kExpiringColumn, 2, 5),
+ }));
+
+ row_values.push_back(CreateTestRowValue({
+ CreateTestColumnSpec(kColumn, 0, 2),
+ CreateTestColumnSpec(kExpiringColumn, 1, 5),
+ CreateTestColumnSpec(kTombstone, 2, 7),
+ CreateTestColumnSpec(kExpiringColumn, 7, 17),
+ }));
+
+ row_values.push_back(CreateTestRowValue({
+ CreateTestColumnSpec(kExpiringColumn, 0, 6),
+ CreateTestColumnSpec(kTombstone, 1, 5),
+ CreateTestColumnSpec(kColumn, 2, 4),
+ CreateTestColumnSpec(kTombstone, 11, 11),
+ }));
+
+ RowValue merged = RowValue::Merge(std::move(row_values));
+ EXPECT_FALSE(merged.IsTombstone());
+ EXPECT_EQ(merged.get_columns().size(), 5);
+ VerifyRowValueColumns(merged.get_columns(), 0, kExpiringColumn, 0, 6);
+ VerifyRowValueColumns(merged.get_columns(), 1, kColumn, 1, 8);
+ VerifyRowValueColumns(merged.get_columns(), 2, kTombstone, 2, 7);
+ VerifyRowValueColumns(merged.get_columns(), 3, kExpiringColumn, 7, 17);
+ VerifyRowValueColumns(merged.get_columns(), 4, kTombstone, 11, 11);
+}
+
+TEST(RowValueMergeTest, MergeWithRowTombstone) {
+ std::vector<RowValue> row_values;
+
+ // A row tombstone.
+ row_values.push_back(CreateRowTombstone(11));
+
+ // This row's timestamp is smaller than the tombstone's.
+ row_values.push_back(CreateTestRowValue({
+ CreateTestColumnSpec(kColumn, 0, 5),
+ CreateTestColumnSpec(kColumn, 1, 6),
+ }));
+
+ // Some of this row's columns are older than the tombstone, some are newer.
+ row_values.push_back(CreateTestRowValue({
+ CreateTestColumnSpec(kColumn, 2, 10),
+ CreateTestColumnSpec(kColumn, 3, 12),
+ }));
+
+ // All of this row's columns are newer than the tombstone.
+ row_values.push_back(CreateTestRowValue({
+ CreateTestColumnSpec(kColumn, 4, 13),
+ CreateTestColumnSpec(kColumn, 5, 14),
+ }));
+
+ RowValue merged = RowValue::Merge(std::move(row_values));
+ EXPECT_FALSE(merged.IsTombstone());
+ EXPECT_EQ(merged.get_columns().size(), 3);
+ VerifyRowValueColumns(merged.get_columns(), 0, kColumn, 3, 12);
+ VerifyRowValueColumns(merged.get_columns(), 1, kColumn, 4, 13);
+ VerifyRowValueColumns(merged.get_columns(), 2, kColumn, 5, 14);
+
+ // If the tombstone's timestamp is the latest, then it returns a
+ // row tombstone.
+ row_values.push_back(CreateRowTombstone(15));
+
+ row_values.push_back(CreateRowTombstone(17));
+
+ merged = RowValue::Merge(std::move(row_values));
+ EXPECT_TRUE(merged.IsTombstone());
+ EXPECT_EQ(merged.LastModifiedTime(), 17);
+}
+
+} // namespace cassandra
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/utilities/cassandra/cassandra_serialize_test.cc b/src/rocksdb/utilities/cassandra/cassandra_serialize_test.cc
new file mode 100644
index 000000000..c14d8fd80
--- /dev/null
+++ b/src/rocksdb/utilities/cassandra/cassandra_serialize_test.cc
@@ -0,0 +1,164 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "test_util/testharness.h"
+#include "utilities/cassandra/serialize.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+
+TEST(SerializeTest, SerializeI64) {
+ std::string dest;
+ Serialize<int64_t>(0, &dest);
+ EXPECT_EQ(std::string({'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00',
+ '\x00'}),
+ dest);
+
+ dest.clear();
+ Serialize<int64_t>(1, &dest);
+ EXPECT_EQ(std::string({'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00',
+ '\x01'}),
+ dest);
+
+ dest.clear();
+ Serialize<int64_t>(-1, &dest);
+ EXPECT_EQ(std::string({'\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff',
+ '\xff'}),
+ dest);
+
+ dest.clear();
+ Serialize<int64_t>(9223372036854775807, &dest);
+ EXPECT_EQ(std::string({'\x7f', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff',
+ '\xff'}),
+ dest);
+
+ dest.clear();
+ Serialize<int64_t>(-9223372036854775807, &dest);
+ EXPECT_EQ(std::string({'\x80', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00',
+ '\x01'}),
+ dest);
+}
+
+TEST(SerializeTest, DeserializeI64) {
+ std::string dest;
+ std::size_t offset = dest.size();
+ Serialize<int64_t>(0, &dest);
+ EXPECT_EQ(0, Deserialize<int64_t>(dest.c_str(), offset));
+
+ offset = dest.size();
+ Serialize<int64_t>(1, &dest);
+ EXPECT_EQ(1, Deserialize<int64_t>(dest.c_str(), offset));
+
+ offset = dest.size();
+ Serialize<int64_t>(-1, &dest);
+ EXPECT_EQ(-1, Deserialize<int64_t>(dest.c_str(), offset));
+
+ offset = dest.size();
+ Serialize<int64_t>(-9223372036854775807, &dest);
+ EXPECT_EQ(-9223372036854775807, Deserialize<int64_t>(dest.c_str(), offset));
+
+ offset = dest.size();
+ Serialize<int64_t>(9223372036854775807, &dest);
+ EXPECT_EQ(9223372036854775807, Deserialize<int64_t>(dest.c_str(), offset));
+}
+
+TEST(SerializeTest, SerializeI32) {
+ std::string dest;
+ Serialize<int32_t>(0, &dest);
+ EXPECT_EQ(std::string({'\x00', '\x00', '\x00', '\x00'}), dest);
+
+ dest.clear();
+ Serialize<int32_t>(1, &dest);
+ EXPECT_EQ(std::string({'\x00', '\x00', '\x00', '\x01'}), dest);
+
+ dest.clear();
+ Serialize<int32_t>(-1, &dest);
+ EXPECT_EQ(std::string({'\xff', '\xff', '\xff', '\xff'}), dest);
+
+ dest.clear();
+ Serialize<int32_t>(2147483647, &dest);
+ EXPECT_EQ(std::string({'\x7f', '\xff', '\xff', '\xff'}), dest);
+
+ dest.clear();
+ Serialize<int32_t>(-2147483648LL, &dest);
+ EXPECT_EQ(std::string({'\x80', '\x00', '\x00', '\x00'}), dest);
+}
+
+TEST(SerializeTest, DeserializeI32) {
+ std::string dest;
+ std::size_t offset = dest.size();
+ Serialize<int32_t>(0, &dest);
+ EXPECT_EQ(0, Deserialize<int32_t>(dest.c_str(), offset));
+
+ offset = dest.size();
+ Serialize<int32_t>(1, &dest);
+ EXPECT_EQ(1, Deserialize<int32_t>(dest.c_str(), offset));
+
+ offset = dest.size();
+ Serialize<int32_t>(-1, &dest);
+ EXPECT_EQ(-1, Deserialize<int32_t>(dest.c_str(), offset));
+
+ offset = dest.size();
+ Serialize<int32_t>(2147483647, &dest);
+ EXPECT_EQ(2147483647, Deserialize<int32_t>(dest.c_str(), offset));
+
+ offset = dest.size();
+ Serialize<int32_t>(-2147483648LL, &dest);
+ EXPECT_EQ(-2147483648LL, Deserialize<int32_t>(dest.c_str(), offset));
+}
+
+TEST(SerializeTest, SerializeI8) {
+ std::string dest;
+ Serialize<int8_t>(0, &dest);
+ EXPECT_EQ(std::string({'\x00'}), dest);
+
+ dest.clear();
+ Serialize<int8_t>(1, &dest);
+ EXPECT_EQ(std::string({'\x01'}), dest);
+
+ dest.clear();
+ Serialize<int8_t>(-1, &dest);
+ EXPECT_EQ(std::string({'\xff'}), dest);
+
+ dest.clear();
+ Serialize<int8_t>(127, &dest);
+ EXPECT_EQ(std::string({'\x7f'}), dest);
+
+ dest.clear();
+ Serialize<int8_t>(-128, &dest);
+ EXPECT_EQ(std::string({'\x80'}), dest);
+}
+
+TEST(SerializeTest, DeserializeI8) {
+ std::string dest;
+ std::size_t offset = dest.size();
+ Serialize<int8_t>(0, &dest);
+ EXPECT_EQ(0, Deserialize<int8_t>(dest.c_str(), offset));
+
+ offset = dest.size();
+ Serialize<int8_t>(1, &dest);
+ EXPECT_EQ(1, Deserialize<int8_t>(dest.c_str(), offset));
+
+ offset = dest.size();
+ Serialize<int8_t>(-1, &dest);
+ EXPECT_EQ(-1, Deserialize<int8_t>(dest.c_str(), offset));
+
+ offset = dest.size();
+ Serialize<int8_t>(127, &dest);
+ EXPECT_EQ(127, Deserialize<int8_t>(dest.c_str(), offset));
+
+ offset = dest.size();
+ Serialize<int8_t>(-128, &dest);
+ EXPECT_EQ(-128, Deserialize<int8_t>(dest.c_str(), offset));
+}
+
+} // namespace cassandra
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
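Taken together with the format tests earlier in this patch, these serialize tests pin down the wire layout: every integer is big-endian, a Column is a 14-byte header (mask, index, timestamp, value length) followed by the value, an ExpiringColumn appends a 4-byte ttl, a Tombstone is 14 bytes, and a RowValue starts with a 12-byte header. A small compile-time sketch of that size arithmetic (the constant names are illustrative only):

    #include <cstddef>
    #include <cstdint>

    // Header sizes implied by ColumnTest, ExpiringColumnTest, TombstoneTest and
    // RowValueTest above; all fields are serialized as big-endian integers.
    constexpr std::size_t kColumnBaseSize = sizeof(int8_t) + sizeof(int8_t);  // mask + index
    constexpr std::size_t kColumnHeader =
        kColumnBaseSize + sizeof(int64_t) + sizeof(int32_t);  // + timestamp + value length
    constexpr std::size_t kExpiringColumnHeader = kColumnHeader + sizeof(int32_t);  // + ttl
    constexpr std::size_t kTombstoneSize =
        kColumnBaseSize + sizeof(int32_t) + sizeof(int64_t);  // + deletion time + delete-at
    constexpr std::size_t kRowValueHeader = sizeof(int32_t) + sizeof(int64_t);

    static_assert(kColumnHeader == 14, "ColumnTest expects 14 + value bytes");
    static_assert(kExpiringColumnHeader == 18, "ExpiringColumnTest expects 18 + value bytes");
    static_assert(kTombstoneSize == 14, "TombstoneTest expects 14 bytes");
    static_assert(kRowValueHeader == 12, "RowValueTest expects a 12-byte header");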
diff --git a/src/rocksdb/utilities/cassandra/format.cc b/src/rocksdb/utilities/cassandra/format.cc
new file mode 100644
index 000000000..cc1dd2f28
--- /dev/null
+++ b/src/rocksdb/utilities/cassandra/format.cc
@@ -0,0 +1,367 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "format.h"
+
+#include <algorithm>
+#include <map>
+#include <memory>
+
+#include "utilities/cassandra/serialize.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+namespace {
+const int32_t kDefaultLocalDeletionTime = std::numeric_limits<int32_t>::max();
+const int64_t kDefaultMarkedForDeleteAt = std::numeric_limits<int64_t>::min();
+} // namespace
+
+ColumnBase::ColumnBase(int8_t mask, int8_t index)
+ : mask_(mask), index_(index) {}
+
+std::size_t ColumnBase::Size() const { return sizeof(mask_) + sizeof(index_); }
+
+int8_t ColumnBase::Mask() const { return mask_; }
+
+int8_t ColumnBase::Index() const { return index_; }
+
+void ColumnBase::Serialize(std::string* dest) const {
+ ROCKSDB_NAMESPACE::cassandra::Serialize<int8_t>(mask_, dest);
+ ROCKSDB_NAMESPACE::cassandra::Serialize<int8_t>(index_, dest);
+}
+
+std::shared_ptr<ColumnBase> ColumnBase::Deserialize(const char* src,
+ std::size_t offset) {
+ int8_t mask = ROCKSDB_NAMESPACE::cassandra::Deserialize<int8_t>(src, offset);
+ if ((mask & ColumnTypeMask::DELETION_MASK) != 0) {
+ return Tombstone::Deserialize(src, offset);
+ } else if ((mask & ColumnTypeMask::EXPIRATION_MASK) != 0) {
+ return ExpiringColumn::Deserialize(src, offset);
+ } else {
+ return Column::Deserialize(src, offset);
+ }
+}
+
+Column::Column(int8_t mask, int8_t index, int64_t timestamp, int32_t value_size,
+ const char* value)
+ : ColumnBase(mask, index),
+ timestamp_(timestamp),
+ value_size_(value_size),
+ value_(value) {}
+
+int64_t Column::Timestamp() const { return timestamp_; }
+
+std::size_t Column::Size() const {
+ return ColumnBase::Size() + sizeof(timestamp_) + sizeof(value_size_) +
+ value_size_;
+}
+
+void Column::Serialize(std::string* dest) const {
+ ColumnBase::Serialize(dest);
+ ROCKSDB_NAMESPACE::cassandra::Serialize<int64_t>(timestamp_, dest);
+ ROCKSDB_NAMESPACE::cassandra::Serialize<int32_t>(value_size_, dest);
+ dest->append(value_, value_size_);
+}
+
+std::shared_ptr<Column> Column::Deserialize(const char* src,
+ std::size_t offset) {
+ int8_t mask = ROCKSDB_NAMESPACE::cassandra::Deserialize<int8_t>(src, offset);
+ offset += sizeof(mask);
+ int8_t index = ROCKSDB_NAMESPACE::cassandra::Deserialize<int8_t>(src, offset);
+ offset += sizeof(index);
+ int64_t timestamp =
+ ROCKSDB_NAMESPACE::cassandra::Deserialize<int64_t>(src, offset);
+ offset += sizeof(timestamp);
+ int32_t value_size =
+ ROCKSDB_NAMESPACE::cassandra::Deserialize<int32_t>(src, offset);
+ offset += sizeof(value_size);
+ return std::make_shared<Column>(mask, index, timestamp, value_size,
+ src + offset);
+}
+
+ExpiringColumn::ExpiringColumn(int8_t mask, int8_t index, int64_t timestamp,
+ int32_t value_size, const char* value,
+ int32_t ttl)
+ : Column(mask, index, timestamp, value_size, value), ttl_(ttl) {}
+
+std::size_t ExpiringColumn::Size() const {
+ return Column::Size() + sizeof(ttl_);
+}
+
+void ExpiringColumn::Serialize(std::string* dest) const {
+ Column::Serialize(dest);
+ ROCKSDB_NAMESPACE::cassandra::Serialize<int32_t>(ttl_, dest);
+}
+
+std::chrono::time_point<std::chrono::system_clock> ExpiringColumn::TimePoint()
+ const {
+ return std::chrono::time_point<std::chrono::system_clock>(
+ std::chrono::microseconds(Timestamp()));
+}
+
+std::chrono::seconds ExpiringColumn::Ttl() const {
+ return std::chrono::seconds(ttl_);
+}
+
+bool ExpiringColumn::Expired() const {
+ return TimePoint() + Ttl() < std::chrono::system_clock::now();
+}
+
+std::shared_ptr<Tombstone> ExpiringColumn::ToTombstone() const {
+ auto expired_at = (TimePoint() + Ttl()).time_since_epoch();
+ int32_t local_deletion_time = static_cast<int32_t>(
+ std::chrono::duration_cast<std::chrono::seconds>(expired_at).count());
+ int64_t marked_for_delete_at =
+ std::chrono::duration_cast<std::chrono::microseconds>(expired_at).count();
+ return std::make_shared<Tombstone>(
+ static_cast<int8_t>(ColumnTypeMask::DELETION_MASK), Index(),
+ local_deletion_time, marked_for_delete_at);
+}
+
+std::shared_ptr<ExpiringColumn> ExpiringColumn::Deserialize(
+ const char* src, std::size_t offset) {
+ int8_t mask = ROCKSDB_NAMESPACE::cassandra::Deserialize<int8_t>(src, offset);
+ offset += sizeof(mask);
+ int8_t index = ROCKSDB_NAMESPACE::cassandra::Deserialize<int8_t>(src, offset);
+ offset += sizeof(index);
+ int64_t timestamp =
+ ROCKSDB_NAMESPACE::cassandra::Deserialize<int64_t>(src, offset);
+ offset += sizeof(timestamp);
+ int32_t value_size =
+ ROCKSDB_NAMESPACE::cassandra::Deserialize<int32_t>(src, offset);
+ offset += sizeof(value_size);
+ const char* value = src + offset;
+ offset += value_size;
+ int32_t ttl = ROCKSDB_NAMESPACE::cassandra::Deserialize<int32_t>(src, offset);
+ return std::make_shared<ExpiringColumn>(mask, index, timestamp, value_size,
+ value, ttl);
+}
+
+Tombstone::Tombstone(int8_t mask, int8_t index, int32_t local_deletion_time,
+ int64_t marked_for_delete_at)
+ : ColumnBase(mask, index),
+ local_deletion_time_(local_deletion_time),
+ marked_for_delete_at_(marked_for_delete_at) {}
+
+int64_t Tombstone::Timestamp() const { return marked_for_delete_at_; }
+
+std::size_t Tombstone::Size() const {
+ return ColumnBase::Size() + sizeof(local_deletion_time_) +
+ sizeof(marked_for_delete_at_);
+}
+
+void Tombstone::Serialize(std::string* dest) const {
+ ColumnBase::Serialize(dest);
+ ROCKSDB_NAMESPACE::cassandra::Serialize<int32_t>(local_deletion_time_, dest);
+ ROCKSDB_NAMESPACE::cassandra::Serialize<int64_t>(marked_for_delete_at_, dest);
+}
+
+bool Tombstone::Collectable(int32_t gc_grace_period_in_seconds) const {
+ auto local_deleted_at = std::chrono::time_point<std::chrono::system_clock>(
+ std::chrono::seconds(local_deletion_time_));
+ auto gc_grace_period = std::chrono::seconds(gc_grace_period_in_seconds);
+ return local_deleted_at + gc_grace_period < std::chrono::system_clock::now();
+}
+
+std::shared_ptr<Tombstone> Tombstone::Deserialize(const char* src,
+ std::size_t offset) {
+ int8_t mask = ROCKSDB_NAMESPACE::cassandra::Deserialize<int8_t>(src, offset);
+ offset += sizeof(mask);
+ int8_t index = ROCKSDB_NAMESPACE::cassandra::Deserialize<int8_t>(src, offset);
+ offset += sizeof(index);
+ int32_t local_deletion_time =
+ ROCKSDB_NAMESPACE::cassandra::Deserialize<int32_t>(src, offset);
+ offset += sizeof(int32_t);
+ int64_t marked_for_delete_at =
+ ROCKSDB_NAMESPACE::cassandra::Deserialize<int64_t>(src, offset);
+ return std::make_shared<Tombstone>(mask, index, local_deletion_time,
+ marked_for_delete_at);
+}
+
+RowValue::RowValue(int32_t local_deletion_time, int64_t marked_for_delete_at)
+ : local_deletion_time_(local_deletion_time),
+ marked_for_delete_at_(marked_for_delete_at),
+ columns_(),
+ last_modified_time_(0) {}
+
+RowValue::RowValue(Columns columns, int64_t last_modified_time)
+ : local_deletion_time_(kDefaultLocalDeletionTime),
+ marked_for_delete_at_(kDefaultMarkedForDeleteAt),
+ columns_(std::move(columns)),
+ last_modified_time_(last_modified_time) {}
+
+std::size_t RowValue::Size() const {
+ std::size_t size =
+ sizeof(local_deletion_time_) + sizeof(marked_for_delete_at_);
+ for (const auto& column : columns_) {
+ size += column->Size();
+ }
+ return size;
+}
+
+int64_t RowValue::LastModifiedTime() const {
+ if (IsTombstone()) {
+ return marked_for_delete_at_;
+ } else {
+ return last_modified_time_;
+ }
+}
+
+bool RowValue::IsTombstone() const {
+ return marked_for_delete_at_ > kDefaultMarkedForDeleteAt;
+}
+
+void RowValue::Serialize(std::string* dest) const {
+ ROCKSDB_NAMESPACE::cassandra::Serialize<int32_t>(local_deletion_time_, dest);
+ ROCKSDB_NAMESPACE::cassandra::Serialize<int64_t>(marked_for_delete_at_, dest);
+ for (const auto& column : columns_) {
+ column->Serialize(dest);
+ }
+}
+
+RowValue RowValue::RemoveExpiredColumns(bool* changed) const {
+ *changed = false;
+ Columns new_columns;
+ for (auto& column : columns_) {
+ if (column->Mask() == ColumnTypeMask::EXPIRATION_MASK) {
+ std::shared_ptr<ExpiringColumn> expiring_column =
+ std::static_pointer_cast<ExpiringColumn>(column);
+
+ if (expiring_column->Expired()) {
+ *changed = true;
+ continue;
+ }
+ }
+
+ new_columns.push_back(column);
+ }
+ return RowValue(std::move(new_columns), last_modified_time_);
+}
+
+RowValue RowValue::ConvertExpiredColumnsToTombstones(bool* changed) const {
+ *changed = false;
+ Columns new_columns;
+ for (auto& column : columns_) {
+ if (column->Mask() == ColumnTypeMask::EXPIRATION_MASK) {
+ std::shared_ptr<ExpiringColumn> expiring_column =
+ std::static_pointer_cast<ExpiringColumn>(column);
+
+ if (expiring_column->Expired()) {
+ std::shared_ptr<Tombstone> tombstone = expiring_column->ToTombstone();
+ new_columns.push_back(tombstone);
+ *changed = true;
+ continue;
+ }
+ }
+ new_columns.push_back(column);
+ }
+ return RowValue(std::move(new_columns), last_modified_time_);
+}
+
+RowValue RowValue::RemoveTombstones(int32_t gc_grace_period) const {
+ Columns new_columns;
+ for (auto& column : columns_) {
+ if (column->Mask() == ColumnTypeMask::DELETION_MASK) {
+ std::shared_ptr<Tombstone> tombstone =
+ std::static_pointer_cast<Tombstone>(column);
+
+ if (tombstone->Collectable(gc_grace_period)) {
+ continue;
+ }
+ }
+
+ new_columns.push_back(column);
+ }
+ return RowValue(std::move(new_columns), last_modified_time_);
+}
+
+bool RowValue::Empty() const { return columns_.empty(); }
+
+RowValue RowValue::Deserialize(const char* src, std::size_t size) {
+ std::size_t offset = 0;
+ assert(size >= sizeof(local_deletion_time_) + sizeof(marked_for_delete_at_));
+ int32_t local_deletion_time =
+ ROCKSDB_NAMESPACE::cassandra::Deserialize<int32_t>(src, offset);
+ offset += sizeof(int32_t);
+ int64_t marked_for_delete_at =
+ ROCKSDB_NAMESPACE::cassandra::Deserialize<int64_t>(src, offset);
+ offset += sizeof(int64_t);
+ if (offset == size) {
+ return RowValue(local_deletion_time, marked_for_delete_at);
+ }
+
+ assert(local_deletion_time == kDefaultLocalDeletionTime);
+ assert(marked_for_delete_at == kDefaultMarkedForDeleteAt);
+ Columns columns;
+ int64_t last_modified_time = 0;
+ while (offset < size) {
+ auto c = ColumnBase::Deserialize(src, offset);
+ offset += c->Size();
+ assert(offset <= size);
+ last_modified_time = std::max(last_modified_time, c->Timestamp());
+ columns.push_back(std::move(c));
+ }
+
+ return RowValue(std::move(columns), last_modified_time);
+}
+
+// Merge multiple row values into one.
+// For each column index that appears in the rows, we keep the column with the
+// latest timestamp. Row tombstones are also taken into account: the rows are
+// iterated in descending timestamp order, and iteration stops once the first
+// row tombstone is hit.
+RowValue RowValue::Merge(std::vector<RowValue>&& values) {
+ assert(values.size() > 0);
+ if (values.size() == 1) {
+ return std::move(values[0]);
+ }
+
+ // Merge columns by their last modified time, and skip once we hit
+ // a row tombstone.
+ std::sort(values.begin(), values.end(),
+ [](const RowValue& r1, const RowValue& r2) {
+ return r1.LastModifiedTime() > r2.LastModifiedTime();
+ });
+
+ std::map<int8_t, std::shared_ptr<ColumnBase>> merged_columns;
+ int64_t tombstone_timestamp = 0;
+
+ for (auto& value : values) {
+ if (value.IsTombstone()) {
+ if (merged_columns.size() == 0) {
+ return std::move(value);
+ }
+ tombstone_timestamp = value.LastModifiedTime();
+ break;
+ }
+ for (auto& column : value.columns_) {
+ int8_t index = column->Index();
+ if (merged_columns.find(index) == merged_columns.end()) {
+ merged_columns[index] = column;
+ } else {
+ if (column->Timestamp() > merged_columns[index]->Timestamp()) {
+ merged_columns[index] = column;
+ }
+ }
+ }
+ }
+
+ int64_t last_modified_time = 0;
+ Columns columns;
+ for (auto& pair : merged_columns) {
+    // A row's last_modified_time may be greater than the row tombstone's
+    // timestamp, yet some of its columns can still be older than the
+    // tombstone, so those columns need to be filtered out here.
+ if (pair.second->Timestamp() <= tombstone_timestamp) {
+ continue;
+ }
+ last_modified_time = std::max(last_modified_time, pair.second->Timestamp());
+ columns.push_back(std::move(pair.second));
+ }
+ return RowValue(std::move(columns), last_modified_time);
+}
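+
+// Editor's note: the sketch below is illustrative only and is not part of the
+// upstream source. It shows the last-writer-wins behaviour of RowValue::Merge:
+// both rows carry a column with index 0, and the column with the larger
+// timestamp (200) survives in the merged row.
+//
+//   const char v1[] = "v1";
+//   const char v2[] = "v2";
+//   Columns older, newer;
+//   older.push_back(std::make_shared<Column>(0 /*mask*/, 0 /*index*/,
+//                                            100 /*timestamp*/, sizeof(v1), v1));
+//   newer.push_back(std::make_shared<Column>(0, 0, 200, sizeof(v2), v2));
+//   std::vector<RowValue> rows;
+//   rows.push_back(RowValue(std::move(older), 100));
+//   rows.push_back(RowValue(std::move(newer), 200));
+//   RowValue merged = RowValue::Merge(std::move(rows));
+//   assert(merged.LastModifiedTime() == 200);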
+
+} // namespace cassandra
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/cassandra/format.h b/src/rocksdb/utilities/cassandra/format.h
new file mode 100644
index 000000000..1b2714735
--- /dev/null
+++ b/src/rocksdb/utilities/cassandra/format.h
@@ -0,0 +1,183 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+/**
+ * The encoding of Cassandra Row Value.
+ *
+ * A Cassandra Row Value is either a row tombstone or a collection of
+ * columns. It has the following fields:
+ *
+ * struct row_value {
+ *   int32_t local_deletion_time; // Time in seconds when the row was deleted,
+ * // only used for Cassandra tombstone gc.
+ *   int64_t marked_for_delete_at; // Timestamp at which this row was marked deleted.
+ * struct column_base columns[]; // For non tombstone row, all columns
+ * // are stored here.
+ * }
+ *
+ * If local_deletion_time and marked_for_delete_at are set, this is
+ * a tombstone; otherwise the row contains multiple columns.
+ *
+ * There are three types of columns: Normal Column, Expiring Column and Column
+ * Tombstone, which have the following fields:
+ *
+ * // Identify the type of the column.
+ * enum mask {
+ * DELETION_MASK = 0x01,
+ * EXPIRATION_MASK = 0x02,
+ * };
+ *
+ * struct column {
+ * int8_t mask = 0;
+ * int8_t index;
+ * int64_t timestamp;
+ * int32_t value_length;
+ * char value[value_length];
+ * }
+ *
+ * struct expiring_column {
+ * int8_t mask = mask.EXPIRATION_MASK;
+ * int8_t index;
+ * int64_t timestamp;
+ * int32_t value_length;
+ * char value[value_length];
+ * int32_t ttl;
+ * }
+ *
+ * struct tombstone_column {
+ * int8_t mask = mask.DELETION_MASK;
+ * int8_t index;
+ * int32_t local_deletion_time; // Similar to row_value's field.
+ * int64_t marked_for_delete_at;
+ * }
+ */
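+
+/**
+ * Editor's note: worked example (not part of the upstream source).
+ * Using the big-endian encoding from serialize.h, a normal column with
+ * index 3, timestamp 0x10 and the two-byte value "ab" serializes to
+ * 16 bytes:
+ *
+ *   00                        mask (normal column)
+ *   03                        index
+ *   00 00 00 00 00 00 00 10   timestamp (int64_t)
+ *   00 00 00 02               value_length (int32_t)
+ *   61 62                     value ("ab")
+ */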
+
+#pragma once
+#include <chrono>
+#include <memory>
+#include <vector>
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+
+// Identify the type of the column.
+enum ColumnTypeMask {
+ DELETION_MASK = 0x01,
+ EXPIRATION_MASK = 0x02,
+};
+
+class ColumnBase {
+ public:
+ ColumnBase(int8_t mask, int8_t index);
+ virtual ~ColumnBase() = default;
+
+ virtual int64_t Timestamp() const = 0;
+ virtual int8_t Mask() const;
+ virtual int8_t Index() const;
+ virtual std::size_t Size() const;
+ virtual void Serialize(std::string* dest) const;
+ static std::shared_ptr<ColumnBase> Deserialize(const char* src,
+ std::size_t offset);
+
+ private:
+ int8_t mask_;
+ int8_t index_;
+};
+
+class Column : public ColumnBase {
+ public:
+ Column(int8_t mask, int8_t index, int64_t timestamp, int32_t value_size,
+ const char* value);
+
+ virtual int64_t Timestamp() const override;
+ virtual std::size_t Size() const override;
+ virtual void Serialize(std::string* dest) const override;
+ static std::shared_ptr<Column> Deserialize(const char* src,
+ std::size_t offset);
+
+ private:
+ int64_t timestamp_;
+ int32_t value_size_;
+ const char* value_;
+};
+
+class Tombstone : public ColumnBase {
+ public:
+ Tombstone(int8_t mask, int8_t index, int32_t local_deletion_time,
+ int64_t marked_for_delete_at);
+
+ virtual int64_t Timestamp() const override;
+ virtual std::size_t Size() const override;
+ virtual void Serialize(std::string* dest) const override;
+ bool Collectable(int32_t gc_grace_period) const;
+ static std::shared_ptr<Tombstone> Deserialize(const char* src,
+ std::size_t offset);
+
+ private:
+ int32_t local_deletion_time_;
+ int64_t marked_for_delete_at_;
+};
+
+class ExpiringColumn : public Column {
+ public:
+ ExpiringColumn(int8_t mask, int8_t index, int64_t timestamp,
+ int32_t value_size, const char* value, int32_t ttl);
+
+ virtual std::size_t Size() const override;
+ virtual void Serialize(std::string* dest) const override;
+ bool Expired() const;
+ std::shared_ptr<Tombstone> ToTombstone() const;
+
+ static std::shared_ptr<ExpiringColumn> Deserialize(const char* src,
+ std::size_t offset);
+
+ private:
+ int32_t ttl_;
+ std::chrono::time_point<std::chrono::system_clock> TimePoint() const;
+ std::chrono::seconds Ttl() const;
+};
+
+using Columns = std::vector<std::shared_ptr<ColumnBase>>;
+
+class RowValue {
+ public:
+ // Create a Row Tombstone.
+ RowValue(int32_t local_deletion_time, int64_t marked_for_delete_at);
+ // Create a Row containing columns.
+ RowValue(Columns columns, int64_t last_modified_time);
+ RowValue(const RowValue& /*that*/) = delete;
+ RowValue(RowValue&& /*that*/) noexcept = default;
+ RowValue& operator=(const RowValue& /*that*/) = delete;
+ RowValue& operator=(RowValue&& /*that*/) = default;
+
+ std::size_t Size() const;
+ bool IsTombstone() const;
+  // For a tombstone this returns marked_for_delete_at_;
+  // otherwise it returns the max timestamp of the contained columns.
+ int64_t LastModifiedTime() const;
+ void Serialize(std::string* dest) const;
+ RowValue RemoveExpiredColumns(bool* changed) const;
+ RowValue ConvertExpiredColumnsToTombstones(bool* changed) const;
+ RowValue RemoveTombstones(int32_t gc_grace_period) const;
+ bool Empty() const;
+
+ static RowValue Deserialize(const char* src, std::size_t size);
+ // Merge multiple rows according to their timestamp.
+ static RowValue Merge(std::vector<RowValue>&& values);
+
+ const Columns& get_columns() { return columns_; }
+
+ private:
+ int32_t local_deletion_time_;
+ int64_t marked_for_delete_at_;
+ Columns columns_;
+ int64_t last_modified_time_;
+};
+
+} // namespace cassandra
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/cassandra/merge_operator.cc b/src/rocksdb/utilities/cassandra/merge_operator.cc
new file mode 100644
index 000000000..bde5dcbad
--- /dev/null
+++ b/src/rocksdb/utilities/cassandra/merge_operator.cc
@@ -0,0 +1,82 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "merge_operator.h"
+
+#include <assert.h>
+
+#include <memory>
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/options_type.h"
+#include "utilities/cassandra/format.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+static std::unordered_map<std::string, OptionTypeInfo>
+ merge_operator_options_info = {
+#ifndef ROCKSDB_LITE
+ {"gc_grace_period_in_seconds",
+ {offsetof(struct CassandraOptions, gc_grace_period_in_seconds),
+ OptionType::kUInt32T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"operands_limit",
+ {offsetof(struct CassandraOptions, operands_limit), OptionType::kSizeT,
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+#endif // ROCKSDB_LITE
+};
+
+CassandraValueMergeOperator::CassandraValueMergeOperator(
+ int32_t gc_grace_period_in_seconds, size_t operands_limit)
+ : options_(gc_grace_period_in_seconds, operands_limit) {
+ RegisterOptions(&options_, &merge_operator_options_info);
+}
+
+// Implementation of the merge operation (merges the existing value with all
+// Cassandra row value operands).
+bool CassandraValueMergeOperator::FullMergeV2(
+ const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const {
+ // Clear the *new_value for writing.
+ merge_out->new_value.clear();
+ std::vector<RowValue> row_values;
+ if (merge_in.existing_value) {
+ row_values.push_back(RowValue::Deserialize(
+ merge_in.existing_value->data(), merge_in.existing_value->size()));
+ }
+
+ for (auto& operand : merge_in.operand_list) {
+ row_values.push_back(RowValue::Deserialize(operand.data(), operand.size()));
+ }
+
+ RowValue merged = RowValue::Merge(std::move(row_values));
+ merged = merged.RemoveTombstones(options_.gc_grace_period_in_seconds);
+ merge_out->new_value.reserve(merged.Size());
+ merged.Serialize(&(merge_out->new_value));
+
+ return true;
+}
+
+bool CassandraValueMergeOperator::PartialMergeMulti(
+ const Slice& /*key*/, const std::deque<Slice>& operand_list,
+ std::string* new_value, Logger* /*logger*/) const {
+ // Clear the *new_value for writing.
+ assert(new_value);
+ new_value->clear();
+
+ std::vector<RowValue> row_values;
+ for (auto& operand : operand_list) {
+ row_values.push_back(RowValue::Deserialize(operand.data(), operand.size()));
+ }
+ RowValue merged = RowValue::Merge(std::move(row_values));
+ new_value->reserve(merged.Size());
+ merged.Serialize(new_value);
+ return true;
+}
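+
+// Editor's note: illustrative usage sketch only, not part of the upstream
+// source. The merge operator is installed through Options, and serialized
+// RowValue payloads are written with DB::Merge(); RocksDB then collapses the
+// operands using FullMergeV2/PartialMergeMulti above.
+//
+//   Options options;
+//   options.create_if_missing = true;
+//   options.merge_operator.reset(
+//       new CassandraValueMergeOperator(/*gc_grace_period_in_seconds=*/86400));
+//   DB* db = nullptr;
+//   Status s = DB::Open(options, "/tmp/cassandra_rows", &db);
+//   // `row_value` is an assumed cassandra::RowValue built elsewhere.
+//   std::string serialized_row;
+//   row_value.Serialize(&serialized_row);
+//   s = db->Merge(WriteOptions(), "partition_key", serialized_row);
+//   delete db;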
+
+} // namespace cassandra
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/cassandra/merge_operator.h b/src/rocksdb/utilities/cassandra/merge_operator.h
new file mode 100644
index 000000000..af8725db7
--- /dev/null
+++ b/src/rocksdb/utilities/cassandra/merge_operator.h
@@ -0,0 +1,44 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "utilities/cassandra/cassandra_options.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+
+/**
+ * A MergeOperator for rocksdb that implements Cassandra row value merge.
+ */
+class CassandraValueMergeOperator : public MergeOperator {
+ public:
+ explicit CassandraValueMergeOperator(int32_t gc_grace_period_in_seconds,
+ size_t operands_limit = 0);
+
+ virtual bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override;
+
+ virtual bool PartialMergeMulti(const Slice& key,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value,
+ Logger* logger) const override;
+
+ const char* Name() const override { return kClassName(); }
+ static const char* kClassName() { return "CassandraValueMergeOperator"; }
+
+ virtual bool AllowSingleOperand() const override { return true; }
+
+ virtual bool ShouldMerge(const std::vector<Slice>& operands) const override {
+ return options_.operands_limit > 0 &&
+ operands.size() >= options_.operands_limit;
+ }
+
+ private:
+ CassandraOptions options_;
+};
+} // namespace cassandra
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/cassandra/serialize.h b/src/rocksdb/utilities/cassandra/serialize.h
new file mode 100644
index 000000000..4bd552bfc
--- /dev/null
+++ b/src/rocksdb/utilities/cassandra/serialize.h
@@ -0,0 +1,81 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+/**
+ * Helper functions that serialize and deserialize integers
+ * into bytes in big-endian byte order.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+namespace {
+const int64_t kCharMask = 0xFFLL;
+const int32_t kBitsPerByte = 8;
+} // namespace
+
+template <typename T>
+void Serialize(T val, std::string* dest);
+
+template <typename T>
+T Deserialize(const char* src, std::size_t offset = 0);
+
+// Specializations
+template <>
+inline void Serialize<int8_t>(int8_t t, std::string* dest) {
+ dest->append(1, static_cast<char>(t & kCharMask));
+}
+
+template <>
+inline void Serialize<int32_t>(int32_t t, std::string* dest) {
+ for (unsigned long i = 0; i < sizeof(int32_t); i++) {
+ dest->append(
+ 1, static_cast<char>((t >> (sizeof(int32_t) - 1 - i) * kBitsPerByte) &
+ kCharMask));
+ }
+}
+
+template <>
+inline void Serialize<int64_t>(int64_t t, std::string* dest) {
+ for (unsigned long i = 0; i < sizeof(int64_t); i++) {
+ dest->append(
+ 1, static_cast<char>((t >> (sizeof(int64_t) - 1 - i) * kBitsPerByte) &
+ kCharMask));
+ }
+}
+
+template <>
+inline int8_t Deserialize<int8_t>(const char* src, std::size_t offset) {
+ return static_cast<int8_t>(src[offset]);
+}
+
+template <>
+inline int32_t Deserialize<int32_t>(const char* src, std::size_t offset) {
+ int32_t result = 0;
+ for (unsigned long i = 0; i < sizeof(int32_t); i++) {
+ result |= static_cast<int32_t>(static_cast<unsigned char>(src[offset + i]))
+ << ((sizeof(int32_t) - 1 - i) * kBitsPerByte);
+ }
+ return result;
+}
+
+template <>
+inline int64_t Deserialize<int64_t>(const char* src, std::size_t offset) {
+ int64_t result = 0;
+ for (unsigned long i = 0; i < sizeof(int64_t); i++) {
+ result |= static_cast<int64_t>(static_cast<unsigned char>(src[offset + i]))
+ << ((sizeof(int64_t) - 1 - i) * kBitsPerByte);
+ }
+ return result;
+}
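+
+// Editor's note: illustrative round-trip example, not part of the upstream
+// source. Serialization is big endian, so the most significant byte is
+// appended first:
+//
+//   std::string buf;
+//   Serialize<int32_t>(0x01020304, &buf);
+//   // buf now holds the bytes 0x01 0x02 0x03 0x04
+//   assert(Deserialize<int32_t>(buf.data()) == 0x01020304);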
+
+} // namespace cassandra
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/cassandra/test_utils.cc b/src/rocksdb/utilities/cassandra/test_utils.cc
new file mode 100644
index 000000000..ec6e5752d
--- /dev/null
+++ b/src/rocksdb/utilities/cassandra/test_utils.cc
@@ -0,0 +1,69 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "test_utils.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+const char kData[] = {'d', 'a', 't', 'a'};
+const char kExpiringData[] = {'e', 'd', 'a', 't', 'a'};
+const int32_t kTtl = 86400;
+const int8_t kColumn = 0;
+const int8_t kTombstone = 1;
+const int8_t kExpiringColumn = 2;
+
+std::shared_ptr<ColumnBase> CreateTestColumn(int8_t mask, int8_t index,
+ int64_t timestamp) {
+ if ((mask & ColumnTypeMask::DELETION_MASK) != 0) {
+ return std::shared_ptr<Tombstone>(
+ new Tombstone(mask, index, ToSeconds(timestamp), timestamp));
+ } else if ((mask & ColumnTypeMask::EXPIRATION_MASK) != 0) {
+ return std::shared_ptr<ExpiringColumn>(new ExpiringColumn(
+ mask, index, timestamp, sizeof(kExpiringData), kExpiringData, kTtl));
+ } else {
+ return std::shared_ptr<Column>(
+ new Column(mask, index, timestamp, sizeof(kData), kData));
+ }
+}
+
+std::tuple<int8_t, int8_t, int64_t> CreateTestColumnSpec(int8_t mask,
+ int8_t index,
+ int64_t timestamp) {
+ return std::make_tuple(mask, index, timestamp);
+}
+
+RowValue CreateTestRowValue(
+ std::vector<std::tuple<int8_t, int8_t, int64_t>> column_specs) {
+ std::vector<std::shared_ptr<ColumnBase>> columns;
+ int64_t last_modified_time = 0;
+ for (auto spec : column_specs) {
+ auto c = CreateTestColumn(std::get<0>(spec), std::get<1>(spec),
+ std::get<2>(spec));
+ last_modified_time = std::max(last_modified_time, c->Timestamp());
+ columns.push_back(std::move(c));
+ }
+ return RowValue(std::move(columns), last_modified_time);
+}
+
+RowValue CreateRowTombstone(int64_t timestamp) {
+ return RowValue(ToSeconds(timestamp), timestamp);
+}
+
+void VerifyRowValueColumns(
+ const std::vector<std::shared_ptr<ColumnBase>> &columns,
+ std::size_t index_of_vector, int8_t expected_mask, int8_t expected_index,
+ int64_t expected_timestamp) {
+ EXPECT_EQ(expected_timestamp, columns[index_of_vector]->Timestamp());
+ EXPECT_EQ(expected_mask, columns[index_of_vector]->Mask());
+ EXPECT_EQ(expected_index, columns[index_of_vector]->Index());
+}
+
+int64_t ToMicroSeconds(int64_t seconds) { return seconds * (int64_t)1000000; }
+
+int32_t ToSeconds(int64_t microseconds) {
+ return (int32_t)(microseconds / (int64_t)1000000);
+}
+} // namespace cassandra
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/cassandra/test_utils.h b/src/rocksdb/utilities/cassandra/test_utils.h
new file mode 100644
index 000000000..be23f7076
--- /dev/null
+++ b/src/rocksdb/utilities/cassandra/test_utils.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <memory>
+
+#include "test_util/testharness.h"
+#include "utilities/cassandra/format.h"
+#include "utilities/cassandra/serialize.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+extern const char kData[];
+extern const char kExpiringData[];
+extern const int32_t kTtl;
+extern const int8_t kColumn;
+extern const int8_t kTombstone;
+extern const int8_t kExpiringColumn;
+
+std::shared_ptr<ColumnBase> CreateTestColumn(int8_t mask, int8_t index,
+ int64_t timestamp);
+
+std::tuple<int8_t, int8_t, int64_t> CreateTestColumnSpec(int8_t mask,
+ int8_t index,
+ int64_t timestamp);
+
+RowValue CreateTestRowValue(
+ std::vector<std::tuple<int8_t, int8_t, int64_t>> column_specs);
+
+RowValue CreateRowTombstone(int64_t timestamp);
+
+void VerifyRowValueColumns(
+ const std::vector<std::shared_ptr<ColumnBase>> &columns,
+ std::size_t index_of_vector, int8_t expected_mask, int8_t expected_index,
+ int64_t expected_timestamp);
+
+int64_t ToMicroSeconds(int64_t seconds);
+int32_t ToSeconds(int64_t microseconds);
+} // namespace cassandra
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/checkpoint/checkpoint_impl.cc b/src/rocksdb/utilities/checkpoint/checkpoint_impl.cc
new file mode 100644
index 000000000..44ce70b1b
--- /dev/null
+++ b/src/rocksdb/utilities/checkpoint/checkpoint_impl.cc
@@ -0,0 +1,469 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 Facebook.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/checkpoint/checkpoint_impl.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <string>
+#include <tuple>
+#include <unordered_set>
+#include <vector>
+
+#include "db/wal_manager.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/metadata.h"
+#include "rocksdb/options.h"
+#include "rocksdb/transaction_log.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/file_checksum_helper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status Checkpoint::Create(DB* db, Checkpoint** checkpoint_ptr) {
+ *checkpoint_ptr = new CheckpointImpl(db);
+ return Status::OK();
+}
+
+Status Checkpoint::CreateCheckpoint(const std::string& /*checkpoint_dir*/,
+ uint64_t /*log_size_for_flush*/,
+ uint64_t* /*sequence_number_ptr*/) {
+ return Status::NotSupported("");
+}
+
+void CheckpointImpl::CleanStagingDirectory(const std::string& full_private_path,
+ Logger* info_log) {
+ std::vector<std::string> subchildren;
+ Status s = db_->GetEnv()->FileExists(full_private_path);
+ if (s.IsNotFound()) {
+ return;
+ }
+ ROCKS_LOG_INFO(info_log, "File exists %s -- %s", full_private_path.c_str(),
+ s.ToString().c_str());
+ s = db_->GetEnv()->GetChildren(full_private_path, &subchildren);
+ if (s.ok()) {
+ for (auto& subchild : subchildren) {
+ std::string subchild_path = full_private_path + "/" + subchild;
+ s = db_->GetEnv()->DeleteFile(subchild_path);
+ ROCKS_LOG_INFO(info_log, "Delete file %s -- %s", subchild_path.c_str(),
+ s.ToString().c_str());
+ }
+ }
+ // finally delete the private dir
+ s = db_->GetEnv()->DeleteDir(full_private_path);
+ ROCKS_LOG_INFO(info_log, "Delete dir %s -- %s", full_private_path.c_str(),
+ s.ToString().c_str());
+}
+
+Status Checkpoint::ExportColumnFamily(
+ ColumnFamilyHandle* /*handle*/, const std::string& /*export_dir*/,
+ ExportImportFilesMetaData** /*metadata*/) {
+ return Status::NotSupported("");
+}
+
+// Builds an openable snapshot of RocksDB
+Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir,
+ uint64_t log_size_for_flush,
+ uint64_t* sequence_number_ptr) {
+ DBOptions db_options = db_->GetDBOptions();
+
+ Status s = db_->GetEnv()->FileExists(checkpoint_dir);
+ if (s.ok()) {
+ return Status::InvalidArgument("Directory exists");
+ } else if (!s.IsNotFound()) {
+ assert(s.IsIOError());
+ return s;
+ }
+
+ ROCKS_LOG_INFO(
+ db_options.info_log,
+ "Started the snapshot process -- creating snapshot in directory %s",
+ checkpoint_dir.c_str());
+
+ size_t final_nonslash_idx = checkpoint_dir.find_last_not_of('/');
+ if (final_nonslash_idx == std::string::npos) {
+    // npos means the path is empty or contains only slashes. A non-empty
+    // all-slash path would be the root directory, which cannot be the case
+    // here because we verified above that the directory does not exist.
+ assert(checkpoint_dir.empty());
+ return Status::InvalidArgument("invalid checkpoint directory name");
+ }
+
+ std::string full_private_path =
+ checkpoint_dir.substr(0, final_nonslash_idx + 1) + ".tmp";
+ ROCKS_LOG_INFO(db_options.info_log,
+ "Snapshot process -- using temporary directory %s",
+ full_private_path.c_str());
+ CleanStagingDirectory(full_private_path, db_options.info_log.get());
+ // create snapshot directory
+ s = db_->GetEnv()->CreateDir(full_private_path);
+ uint64_t sequence_number = 0;
+ if (s.ok()) {
+    // disable file deletions
+ s = db_->DisableFileDeletions();
+ const bool disabled_file_deletions = s.ok();
+
+ if (s.ok() || s.IsNotSupported()) {
+ s = CreateCustomCheckpoint(
+ [&](const std::string& src_dirname, const std::string& fname,
+ FileType) {
+ ROCKS_LOG_INFO(db_options.info_log, "Hard Linking %s",
+ fname.c_str());
+ return db_->GetFileSystem()->LinkFile(
+ src_dirname + "/" + fname, full_private_path + "/" + fname,
+ IOOptions(), nullptr);
+ } /* link_file_cb */,
+ [&](const std::string& src_dirname, const std::string& fname,
+ uint64_t size_limit_bytes, FileType,
+ const std::string& /* checksum_func_name */,
+ const std::string& /* checksum_val */,
+ const Temperature temperature) {
+ ROCKS_LOG_INFO(db_options.info_log, "Copying %s", fname.c_str());
+ return CopyFile(db_->GetFileSystem(), src_dirname + "/" + fname,
+ full_private_path + "/" + fname, size_limit_bytes,
+ db_options.use_fsync, nullptr, temperature);
+ } /* copy_file_cb */,
+ [&](const std::string& fname, const std::string& contents, FileType) {
+ ROCKS_LOG_INFO(db_options.info_log, "Creating %s", fname.c_str());
+ return CreateFile(db_->GetFileSystem(),
+ full_private_path + "/" + fname, contents,
+ db_options.use_fsync);
+ } /* create_file_cb */,
+ &sequence_number, log_size_for_flush);
+
+ // we copied all the files, enable file deletions
+ if (disabled_file_deletions) {
+ Status ss = db_->EnableFileDeletions(false);
+ assert(ss.ok());
+ ss.PermitUncheckedError();
+ }
+ }
+ }
+
+ if (s.ok()) {
+ // move tmp private backup to real snapshot directory
+ s = db_->GetEnv()->RenameFile(full_private_path, checkpoint_dir);
+ }
+ if (s.ok()) {
+ std::unique_ptr<FSDirectory> checkpoint_directory;
+ s = db_->GetFileSystem()->NewDirectory(checkpoint_dir, IOOptions(),
+ &checkpoint_directory, nullptr);
+ if (s.ok() && checkpoint_directory != nullptr) {
+ s = checkpoint_directory->FsyncWithDirOptions(
+ IOOptions(), nullptr,
+ DirFsyncOptions(DirFsyncOptions::FsyncReason::kDirRenamed));
+ }
+ }
+
+ if (s.ok()) {
+ if (sequence_number_ptr != nullptr) {
+ *sequence_number_ptr = sequence_number;
+ }
+ // here we know that we succeeded and installed the new snapshot
+ ROCKS_LOG_INFO(db_options.info_log, "Snapshot DONE. All is good");
+ ROCKS_LOG_INFO(db_options.info_log, "Snapshot sequence number: %" PRIu64,
+ sequence_number);
+ } else {
+ // clean all the files we might have created
+ ROCKS_LOG_INFO(db_options.info_log, "Snapshot failed -- %s",
+ s.ToString().c_str());
+ CleanStagingDirectory(full_private_path, db_options.info_log.get());
+ }
+ return s;
+}
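+
+// Editor's note: minimal usage sketch only, not part of the upstream source.
+// A checkpoint is created through the public Checkpoint factory; the resulting
+// directory can later be opened as a normal RocksDB database.
+//
+//   DB* db = nullptr;
+//   Options options;
+//   Status s = DB::Open(options, "/path/to/db", &db);
+//   Checkpoint* checkpoint = nullptr;
+//   if (s.ok()) {
+//     s = Checkpoint::Create(db, &checkpoint);
+//   }
+//   if (s.ok()) {
+//     s = checkpoint->CreateCheckpoint("/path/to/checkpoint_dir");
+//   }
+//   delete checkpoint;
+//   delete db;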
+
+Status CheckpointImpl::CreateCustomCheckpoint(
+ std::function<Status(const std::string& src_dirname,
+ const std::string& src_fname, FileType type)>
+ link_file_cb,
+ std::function<
+ Status(const std::string& src_dirname, const std::string& src_fname,
+ uint64_t size_limit_bytes, FileType type,
+ const std::string& checksum_func_name,
+ const std::string& checksum_val, const Temperature temperature)>
+ copy_file_cb,
+ std::function<Status(const std::string& fname, const std::string& contents,
+ FileType type)>
+ create_file_cb,
+ uint64_t* sequence_number, uint64_t log_size_for_flush,
+ bool get_live_table_checksum) {
+ *sequence_number = db_->GetLatestSequenceNumber();
+
+ LiveFilesStorageInfoOptions opts;
+ opts.include_checksum_info = get_live_table_checksum;
+ opts.wal_size_for_flush = log_size_for_flush;
+
+ std::vector<LiveFileStorageInfo> infos;
+ {
+ Status s = db_->GetLiveFilesStorageInfo(opts, &infos);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+  // Verify that everything except WAL files is in the same directory
+ // (db_paths / cf_paths not supported)
+ std::unordered_set<std::string> dirs;
+ for (auto& info : infos) {
+ if (info.file_type != kWalFile) {
+ dirs.insert(info.directory);
+ }
+ }
+ if (dirs.size() > 1) {
+ return Status::NotSupported(
+ "db_paths / cf_paths not supported for Checkpoint nor BackupEngine");
+ }
+
+ bool same_fs = true;
+
+ for (auto& info : infos) {
+ Status s;
+ if (!info.replacement_contents.empty()) {
+ // Currently should only be used for CURRENT file.
+ assert(info.file_type == kCurrentFile);
+
+ if (info.size != info.replacement_contents.size()) {
+ s = Status::Corruption("Inconsistent size metadata for " +
+ info.relative_filename);
+ } else {
+ s = create_file_cb(info.relative_filename, info.replacement_contents,
+ info.file_type);
+ }
+ } else {
+ if (same_fs && !info.trim_to_size) {
+ s = link_file_cb(info.directory, info.relative_filename,
+ info.file_type);
+ if (s.IsNotSupported()) {
+ same_fs = false;
+ s = Status::OK();
+ }
+ s.MustCheck();
+ }
+ if (!same_fs || info.trim_to_size) {
+ assert(info.file_checksum_func_name.empty() ==
+ !opts.include_checksum_info);
+ // no assertion on file_checksum because empty is used for both "not
+ // set" and "unknown"
+ if (opts.include_checksum_info) {
+ s = copy_file_cb(info.directory, info.relative_filename, info.size,
+ info.file_type, info.file_checksum_func_name,
+ info.file_checksum, info.temperature);
+ } else {
+ s = copy_file_cb(info.directory, info.relative_filename, info.size,
+ info.file_type, kUnknownFileChecksumFuncName,
+ kUnknownFileChecksum, info.temperature);
+ }
+ }
+ }
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ return Status::OK();
+}
+
+// Exports all live SST files of the specified column family into export_dir,
+// returning the SST file information in metadata.
+Status CheckpointImpl::ExportColumnFamily(
+ ColumnFamilyHandle* handle, const std::string& export_dir,
+ ExportImportFilesMetaData** metadata) {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(handle);
+ const auto cf_name = cfh->GetName();
+ const auto db_options = db_->GetDBOptions();
+
+ assert(metadata != nullptr);
+ assert(*metadata == nullptr);
+ auto s = db_->GetEnv()->FileExists(export_dir);
+ if (s.ok()) {
+ return Status::InvalidArgument("Specified export_dir exists");
+ } else if (!s.IsNotFound()) {
+ assert(s.IsIOError());
+ return s;
+ }
+
+ const auto final_nonslash_idx = export_dir.find_last_not_of('/');
+ if (final_nonslash_idx == std::string::npos) {
+ return Status::InvalidArgument("Specified export_dir invalid");
+ }
+ ROCKS_LOG_INFO(db_options.info_log,
+ "[%s] export column family onto export directory %s",
+ cf_name.c_str(), export_dir.c_str());
+
+ // Create a temporary export directory.
+ const auto tmp_export_dir =
+ export_dir.substr(0, final_nonslash_idx + 1) + ".tmp";
+ s = db_->GetEnv()->CreateDir(tmp_export_dir);
+
+ if (s.ok()) {
+ s = db_->Flush(ROCKSDB_NAMESPACE::FlushOptions(), handle);
+ }
+
+ ColumnFamilyMetaData db_metadata;
+ if (s.ok()) {
+ // Export live sst files with file deletions disabled.
+ s = db_->DisableFileDeletions();
+ if (s.ok()) {
+ db_->GetColumnFamilyMetaData(handle, &db_metadata);
+
+ s = ExportFilesInMetaData(
+ db_options, db_metadata,
+ [&](const std::string& src_dirname, const std::string& fname) {
+ ROCKS_LOG_INFO(db_options.info_log, "[%s] HardLinking %s",
+ cf_name.c_str(), fname.c_str());
+ return db_->GetEnv()->LinkFile(src_dirname + fname,
+ tmp_export_dir + fname);
+ } /*link_file_cb*/,
+ [&](const std::string& src_dirname, const std::string& fname) {
+ ROCKS_LOG_INFO(db_options.info_log, "[%s] Copying %s",
+ cf_name.c_str(), fname.c_str());
+ return CopyFile(db_->GetFileSystem(), src_dirname + fname,
+ tmp_export_dir + fname, 0, db_options.use_fsync,
+ nullptr, Temperature::kUnknown);
+ } /*copy_file_cb*/);
+
+ const auto enable_status = db_->EnableFileDeletions(false /*force*/);
+ if (s.ok()) {
+ s = enable_status;
+ }
+ }
+ }
+
+ auto moved_to_user_specified_dir = false;
+ if (s.ok()) {
+ // Move temporary export directory to the actual export directory.
+ s = db_->GetEnv()->RenameFile(tmp_export_dir, export_dir);
+ }
+
+ if (s.ok()) {
+ // Fsync export directory.
+ moved_to_user_specified_dir = true;
+ std::unique_ptr<FSDirectory> dir_ptr;
+ s = db_->GetFileSystem()->NewDirectory(export_dir, IOOptions(), &dir_ptr,
+ nullptr);
+ if (s.ok()) {
+ assert(dir_ptr != nullptr);
+ s = dir_ptr->FsyncWithDirOptions(
+ IOOptions(), nullptr,
+ DirFsyncOptions(DirFsyncOptions::FsyncReason::kDirRenamed));
+ }
+ }
+
+ if (s.ok()) {
+ // Export of files succeeded. Fill in the metadata information.
+ auto result_metadata = new ExportImportFilesMetaData();
+ result_metadata->db_comparator_name = handle->GetComparator()->Name();
+ for (const auto& level_metadata : db_metadata.levels) {
+ for (const auto& file_metadata : level_metadata.files) {
+ LiveFileMetaData live_file_metadata;
+ live_file_metadata.size = file_metadata.size;
+ live_file_metadata.name = std::move(file_metadata.name);
+ live_file_metadata.file_number = file_metadata.file_number;
+ live_file_metadata.db_path = export_dir;
+ live_file_metadata.smallest_seqno = file_metadata.smallest_seqno;
+ live_file_metadata.largest_seqno = file_metadata.largest_seqno;
+ live_file_metadata.smallestkey = std::move(file_metadata.smallestkey);
+ live_file_metadata.largestkey = std::move(file_metadata.largestkey);
+ live_file_metadata.oldest_blob_file_number =
+ file_metadata.oldest_blob_file_number;
+ live_file_metadata.level = level_metadata.level;
+ result_metadata->files.push_back(live_file_metadata);
+ }
+ *metadata = result_metadata;
+ }
+ ROCKS_LOG_INFO(db_options.info_log, "[%s] Export succeeded.",
+ cf_name.c_str());
+ } else {
+ // Failure: Clean up all the files/directories created.
+ ROCKS_LOG_INFO(db_options.info_log, "[%s] Export failed. %s",
+ cf_name.c_str(), s.ToString().c_str());
+ std::vector<std::string> subchildren;
+ const auto cleanup_dir =
+ moved_to_user_specified_dir ? export_dir : tmp_export_dir;
+ db_->GetEnv()->GetChildren(cleanup_dir, &subchildren);
+ for (const auto& subchild : subchildren) {
+ const auto subchild_path = cleanup_dir + "/" + subchild;
+ const auto status = db_->GetEnv()->DeleteFile(subchild_path);
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(db_options.info_log, "Failed to cleanup file %s: %s",
+ subchild_path.c_str(), status.ToString().c_str());
+ }
+ }
+ const auto status = db_->GetEnv()->DeleteDir(cleanup_dir);
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(db_options.info_log, "Failed to cleanup dir %s: %s",
+ cleanup_dir.c_str(), status.ToString().c_str());
+ }
+ }
+ return s;
+}
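+
+// Editor's note: illustrative usage sketch only, not part of the upstream
+// source. Column family export is driven through the same Checkpoint object as
+// in the sketch above; the returned metadata describes the exported SST files
+// and can later be fed to DB::CreateColumnFamilyWithImport() to import the
+// column family elsewhere.
+//
+//   ExportImportFilesMetaData* metadata = nullptr;
+//   Status s = checkpoint->ExportColumnFamily(db->DefaultColumnFamily(),
+//                                             "/path/to/export_dir", &metadata);
+//   // ... use or import the exported files ...
+//   delete metadata;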
+
+Status CheckpointImpl::ExportFilesInMetaData(
+ const DBOptions& db_options, const ColumnFamilyMetaData& metadata,
+ std::function<Status(const std::string& src_dirname,
+ const std::string& src_fname)>
+ link_file_cb,
+ std::function<Status(const std::string& src_dirname,
+ const std::string& src_fname)>
+ copy_file_cb) {
+ Status s;
+ auto hardlink_file = true;
+
+ // Copy/hard link files in metadata.
+ size_t num_files = 0;
+ for (const auto& level_metadata : metadata.levels) {
+ for (const auto& file_metadata : level_metadata.files) {
+ uint64_t number;
+ FileType type;
+ const auto ok = ParseFileName(file_metadata.name, &number, &type);
+ if (!ok) {
+ s = Status::Corruption("Could not parse file name");
+ break;
+ }
+
+ // We should only get sst files here.
+ assert(type == kTableFile);
+ assert(file_metadata.size > 0 && file_metadata.name[0] == '/');
+ const auto src_fname = file_metadata.name;
+ ++num_files;
+
+ if (hardlink_file) {
+ s = link_file_cb(db_->GetName(), src_fname);
+ if (num_files == 1 && s.IsNotSupported()) {
+ // Fallback to copy if link failed due to cross-device directories.
+ hardlink_file = false;
+ s = Status::OK();
+ }
+ }
+ if (!hardlink_file) {
+ s = copy_file_cb(db_->GetName(), src_fname);
+ }
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ ROCKS_LOG_INFO(db_options.info_log, "Number of table files %" ROCKSDB_PRIszt,
+ num_files);
+
+ return s;
+}
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/checkpoint/checkpoint_impl.h b/src/rocksdb/utilities/checkpoint/checkpoint_impl.h
new file mode 100644
index 000000000..2947330cc
--- /dev/null
+++ b/src/rocksdb/utilities/checkpoint/checkpoint_impl.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+
+#include "file/filename.h"
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/checkpoint.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CheckpointImpl : public Checkpoint {
+ public:
+ explicit CheckpointImpl(DB* db) : db_(db) {}
+
+ Status CreateCheckpoint(const std::string& checkpoint_dir,
+ uint64_t log_size_for_flush,
+ uint64_t* sequence_number_ptr) override;
+
+ Status ExportColumnFamily(ColumnFamilyHandle* handle,
+ const std::string& export_dir,
+ ExportImportFilesMetaData** metadata) override;
+
+ // Checkpoint logic can be customized by providing callbacks for link, copy,
+ // or create.
+ Status CreateCustomCheckpoint(
+ std::function<Status(const std::string& src_dirname,
+ const std::string& fname, FileType type)>
+ link_file_cb,
+ std::function<Status(const std::string& src_dirname,
+ const std::string& fname, uint64_t size_limit_bytes,
+ FileType type, const std::string& checksum_func_name,
+ const std::string& checksum_val,
+ const Temperature src_temperature)>
+ copy_file_cb,
+ std::function<Status(const std::string& fname,
+ const std::string& contents, FileType type)>
+ create_file_cb,
+ uint64_t* sequence_number, uint64_t log_size_for_flush,
+ bool get_live_table_checksum = false);
+
+ private:
+ void CleanStagingDirectory(const std::string& path, Logger* info_log);
+
+ // Export logic customization by providing callbacks for link or copy.
+ Status ExportFilesInMetaData(
+ const DBOptions& db_options, const ColumnFamilyMetaData& metadata,
+ std::function<Status(const std::string& src_dirname,
+ const std::string& fname)>
+ link_file_cb,
+ std::function<Status(const std::string& src_dirname,
+ const std::string& fname)>
+ copy_file_cb);
+
+ private:
+ DB* db_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/checkpoint/checkpoint_test.cc b/src/rocksdb/utilities/checkpoint/checkpoint_test.cc
new file mode 100644
index 000000000..3da753d5f
--- /dev/null
+++ b/src/rocksdb/utilities/checkpoint/checkpoint_test.cc
@@ -0,0 +1,974 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// Sync points prevent us from building and running these tests in release mode
+#ifndef ROCKSDB_LITE
+#include "rocksdb/utilities/checkpoint.h"
+
+#ifndef OS_WIN
+#include <unistd.h>
+#endif
+#include <iostream>
+#include <thread>
+#include <utility>
+
+#include "db/db_impl/db_impl.h"
+#include "file/file_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+class CheckpointTest : public testing::Test {
+ protected:
+ // Sequence of option configurations to try
+ enum OptionConfig {
+ kDefault = 0,
+ };
+ int option_config_;
+
+ public:
+ std::string dbname_;
+ std::string alternative_wal_dir_;
+ Env* env_;
+ DB* db_;
+ Options last_options_;
+ std::vector<ColumnFamilyHandle*> handles_;
+ std::string snapshot_name_;
+ std::string export_path_;
+ ColumnFamilyHandle* cfh_reverse_comp_;
+ ExportImportFilesMetaData* metadata_;
+
+ CheckpointTest() : env_(Env::Default()) {
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ dbname_ = test::PerThreadDBPath(env_, "checkpoint_test");
+ alternative_wal_dir_ = dbname_ + "/wal";
+ auto options = CurrentOptions();
+ auto delete_options = options;
+ delete_options.wal_dir = alternative_wal_dir_;
+ EXPECT_OK(DestroyDB(dbname_, delete_options));
+    // Destroy it again in case no alternative WAL dir was used.
+ EXPECT_OK(DestroyDB(dbname_, options));
+ db_ = nullptr;
+ snapshot_name_ = test::PerThreadDBPath(env_, "snapshot");
+ std::string snapshot_tmp_name = snapshot_name_ + ".tmp";
+ EXPECT_OK(DestroyDB(snapshot_name_, options));
+ test::DeleteDir(env_, snapshot_name_);
+ EXPECT_OK(DestroyDB(snapshot_tmp_name, options));
+ test::DeleteDir(env_, snapshot_tmp_name);
+ Reopen(options);
+ export_path_ = test::PerThreadDBPath("/export");
+ DestroyDir(env_, export_path_).PermitUncheckedError();
+ cfh_reverse_comp_ = nullptr;
+ metadata_ = nullptr;
+ }
+
+ ~CheckpointTest() override {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ if (cfh_reverse_comp_) {
+ EXPECT_OK(db_->DestroyColumnFamilyHandle(cfh_reverse_comp_));
+ cfh_reverse_comp_ = nullptr;
+ }
+ if (metadata_) {
+ delete metadata_;
+ metadata_ = nullptr;
+ }
+ Close();
+ Options options;
+ options.db_paths.emplace_back(dbname_, 0);
+ options.db_paths.emplace_back(dbname_ + "_2", 0);
+ options.db_paths.emplace_back(dbname_ + "_3", 0);
+ options.db_paths.emplace_back(dbname_ + "_4", 0);
+ EXPECT_OK(DestroyDB(dbname_, options));
+ EXPECT_OK(DestroyDB(snapshot_name_, options));
+ DestroyDir(env_, export_path_).PermitUncheckedError();
+ }
+
+ // Return the current option configuration.
+ Options CurrentOptions() {
+ Options options;
+ options.env = env_;
+ options.create_if_missing = true;
+ return options;
+ }
+
+ void CreateColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options) {
+ ColumnFamilyOptions cf_opts(options);
+ size_t cfi = handles_.size();
+ handles_.resize(cfi + cfs.size());
+ for (auto cf : cfs) {
+ ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++]));
+ }
+ }
+
+ void CreateAndReopenWithCF(const std::vector<std::string>& cfs,
+ const Options& options) {
+ CreateColumnFamilies(cfs, options);
+ std::vector<std::string> cfs_plus_default = cfs;
+ cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName);
+ ReopenWithColumnFamilies(cfs_plus_default, options);
+ }
+
+ void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const std::vector<Options>& options) {
+ ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+ }
+
+ void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options) {
+ ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+ }
+
+ Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const std::vector<Options>& options) {
+ Close();
+ EXPECT_EQ(cfs.size(), options.size());
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (size_t i = 0; i < cfs.size(); ++i) {
+ column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i]));
+ }
+ DBOptions db_opts = DBOptions(options[0]);
+ return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+ }
+
+ Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options) {
+ Close();
+ std::vector<Options> v_opts(cfs.size(), options);
+ return TryReopenWithColumnFamilies(cfs, v_opts);
+ }
+
+ void Reopen(const Options& options) { ASSERT_OK(TryReopen(options)); }
+
+ void CompactAll() {
+ for (auto h : handles_) {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), h, nullptr, nullptr));
+ }
+ }
+
+ void Close() {
+ for (auto h : handles_) {
+ delete h;
+ }
+ handles_.clear();
+ delete db_;
+ db_ = nullptr;
+ }
+
+ void DestroyAndReopen(const Options& options) {
+ // Destroy using last options
+ Destroy(last_options_);
+ ASSERT_OK(TryReopen(options));
+ }
+
+ void Destroy(const Options& options) {
+ Close();
+ ASSERT_OK(DestroyDB(dbname_, options));
+ }
+
+ Status ReadOnlyReopen(const Options& options) {
+ return DB::OpenForReadOnly(options, dbname_, &db_);
+ }
+
+ Status ReadOnlyReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (const auto& cf : cfs) {
+ column_families.emplace_back(cf, options);
+ }
+ return DB::OpenForReadOnly(options, dbname_, column_families, &handles_,
+ &db_);
+ }
+
+ Status TryReopen(const Options& options) {
+ Close();
+ last_options_ = options;
+ return DB::Open(options, dbname_, &db_);
+ }
+
+ Status Flush(int cf = 0) {
+ if (cf == 0) {
+ return db_->Flush(FlushOptions());
+ } else {
+ return db_->Flush(FlushOptions(), handles_[cf]);
+ }
+ }
+
+ Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions()) {
+ return db_->Put(wo, k, v);
+ }
+
+ Status Put(int cf, const Slice& k, const Slice& v,
+ WriteOptions wo = WriteOptions()) {
+ return db_->Put(wo, handles_[cf], k, v);
+ }
+
+ Status Delete(const std::string& k) { return db_->Delete(WriteOptions(), k); }
+
+ Status Delete(int cf, const std::string& k) {
+ return db_->Delete(WriteOptions(), handles_[cf], k);
+ }
+
+ std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ options.snapshot = snapshot;
+ std::string result;
+ Status s = db_->Get(options, k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+ }
+
+ std::string Get(int cf, const std::string& k,
+ const Snapshot* snapshot = nullptr) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ options.snapshot = snapshot;
+ std::string result;
+ Status s = db_->Get(options, handles_[cf], k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+ }
+};
+
+TEST_F(CheckpointTest, GetSnapshotLink) {
+ for (uint64_t log_size_for_flush : {0, 1000000}) {
+ Options options;
+ DB* snapshotDB;
+ ReadOptions roptions;
+ std::string result;
+ Checkpoint* checkpoint;
+
+ options = CurrentOptions();
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(DestroyDB(dbname_, options));
+
+ // Create a database
+ options.create_if_missing = true;
+ ASSERT_OK(DB::Open(options, dbname_, &db_));
+ std::string key = std::string("foo");
+ ASSERT_OK(Put(key, "v1"));
+ // Take a snapshot
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+ ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_, log_size_for_flush));
+ ASSERT_OK(Put(key, "v2"));
+ ASSERT_EQ("v2", Get(key));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("v2", Get(key));
+ // Open snapshot and verify contents while DB is running
+ options.create_if_missing = false;
+ ASSERT_OK(DB::Open(options, snapshot_name_, &snapshotDB));
+ ASSERT_OK(snapshotDB->Get(roptions, key, &result));
+ ASSERT_EQ("v1", result);
+ delete snapshotDB;
+ snapshotDB = nullptr;
+ delete db_;
+ db_ = nullptr;
+
+ // Destroy original DB
+ ASSERT_OK(DestroyDB(dbname_, options));
+
+ // Open snapshot and verify contents
+ options.create_if_missing = false;
+ dbname_ = snapshot_name_;
+ ASSERT_OK(DB::Open(options, dbname_, &db_));
+ ASSERT_EQ("v1", Get(key));
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(DestroyDB(dbname_, options));
+ delete checkpoint;
+
+ // Restore DB name
+ dbname_ = test::PerThreadDBPath(env_, "db_test");
+ }
+}
+
+TEST_F(CheckpointTest, CheckpointWithBlob) {
+ // Create a database with a blob file
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ ASSERT_OK(Put(key, blob));
+ ASSERT_OK(Flush());
+
+ // Create a checkpoint
+ Checkpoint* checkpoint = nullptr;
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+
+ std::unique_ptr<Checkpoint> checkpoint_guard(checkpoint);
+
+ ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
+
+ // Make sure it contains the blob file
+ std::vector<std::string> files;
+ ASSERT_OK(env_->GetChildren(snapshot_name_, &files));
+
+ bool blob_file_found = false;
+ for (const auto& file : files) {
+ uint64_t number = 0;
+ FileType type = kWalFile;
+
+ if (ParseFileName(file, &number, &type) && type == kBlobFile) {
+ blob_file_found = true;
+ break;
+ }
+ }
+
+ ASSERT_TRUE(blob_file_found);
+
+ // Make sure the checkpoint can be opened and the blob value read
+ options.create_if_missing = false;
+ DB* checkpoint_db = nullptr;
+ ASSERT_OK(DB::Open(options, snapshot_name_, &checkpoint_db));
+
+ std::unique_ptr<DB> checkpoint_db_guard(checkpoint_db);
+
+ PinnableSlice value;
+ ASSERT_OK(checkpoint_db->Get(
+ ReadOptions(), checkpoint_db->DefaultColumnFamily(), key, &value));
+
+ ASSERT_EQ(value, blob);
+}
+
+TEST_F(CheckpointTest, ExportColumnFamilyWithLinks) {
+ // Create a database
+ auto options = CurrentOptions();
+ options.create_if_missing = true;
+ CreateAndReopenWithCF({}, options);
+
+ // Helper to verify the number of files in metadata and export dir
+ auto verify_files_exported = [&](const ExportImportFilesMetaData& metadata,
+ int num_files_expected) {
+ ASSERT_EQ(metadata.files.size(), num_files_expected);
+ std::vector<std::string> subchildren;
+ ASSERT_OK(env_->GetChildren(export_path_, &subchildren));
+ ASSERT_EQ(subchildren.size(), num_files_expected);
+ };
+
+ // Test DefaultColumnFamily
+ {
+ const auto key = std::string("foo");
+ ASSERT_OK(Put(key, "v1"));
+
+ Checkpoint* checkpoint;
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+
+ // Export the Tables and verify
+ ASSERT_OK(checkpoint->ExportColumnFamily(db_->DefaultColumnFamily(),
+ export_path_, &metadata_));
+ verify_files_exported(*metadata_, 1);
+ ASSERT_EQ(metadata_->db_comparator_name, options.comparator->Name());
+ ASSERT_OK(DestroyDir(env_, export_path_));
+ delete metadata_;
+ metadata_ = nullptr;
+
+ // Check again after compaction
+ CompactAll();
+ ASSERT_OK(Put(key, "v2"));
+ ASSERT_OK(checkpoint->ExportColumnFamily(db_->DefaultColumnFamily(),
+ export_path_, &metadata_));
+ verify_files_exported(*metadata_, 2);
+ ASSERT_EQ(metadata_->db_comparator_name, options.comparator->Name());
+ ASSERT_OK(DestroyDir(env_, export_path_));
+ delete metadata_;
+ metadata_ = nullptr;
+ delete checkpoint;
+ }
+
+ // Test non default column family with non default comparator
+ {
+ auto cf_options = CurrentOptions();
+ cf_options.comparator = ReverseBytewiseComparator();
+ ASSERT_OK(db_->CreateColumnFamily(cf_options, "yoyo", &cfh_reverse_comp_));
+
+ const auto key = std::string("foo");
+ ASSERT_OK(db_->Put(WriteOptions(), cfh_reverse_comp_, key, "v1"));
+
+ Checkpoint* checkpoint;
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+
+ // Export the Tables and verify
+ ASSERT_OK(checkpoint->ExportColumnFamily(cfh_reverse_comp_, export_path_,
+ &metadata_));
+ verify_files_exported(*metadata_, 1);
+ ASSERT_EQ(metadata_->db_comparator_name,
+ ReverseBytewiseComparator()->Name());
+ delete checkpoint;
+ }
+}
+
+TEST_F(CheckpointTest, ExportColumnFamilyNegativeTest) {
+ // Create a database
+ auto options = CurrentOptions();
+ options.create_if_missing = true;
+ CreateAndReopenWithCF({}, options);
+
+ const auto key = std::string("foo");
+ ASSERT_OK(Put(key, "v1"));
+
+ Checkpoint* checkpoint;
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+
+ // Export onto existing directory
+ ASSERT_OK(env_->CreateDirIfMissing(export_path_));
+ ASSERT_EQ(checkpoint->ExportColumnFamily(db_->DefaultColumnFamily(),
+ export_path_, &metadata_),
+ Status::InvalidArgument("Specified export_dir exists"));
+ ASSERT_OK(DestroyDir(env_, export_path_));
+
+ // Export with invalid directory specification
+ export_path_ = "";
+ ASSERT_EQ(checkpoint->ExportColumnFamily(db_->DefaultColumnFamily(),
+ export_path_, &metadata_),
+ Status::InvalidArgument("Specified export_dir invalid"));
+ delete checkpoint;
+}
+
+TEST_F(CheckpointTest, CheckpointCF) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"one", "two", "three", "four", "five"}, options);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"CheckpointTest::CheckpointCF:2", "DBImpl::GetLiveFiles:2"},
+ {"DBImpl::GetLiveFiles:1", "CheckpointTest::CheckpointCF:1"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(0, "Default", "Default"));
+ ASSERT_OK(Put(1, "one", "one"));
+ ASSERT_OK(Put(2, "two", "two"));
+ ASSERT_OK(Put(3, "three", "three"));
+ ASSERT_OK(Put(4, "four", "four"));
+ ASSERT_OK(Put(5, "five", "five"));
+
+ DB* snapshotDB;
+ ReadOptions roptions;
+ std::string result;
+ std::vector<ColumnFamilyHandle*> cphandles;
+
+ // Take a snapshot
+ ROCKSDB_NAMESPACE::port::Thread t([&]() {
+ Checkpoint* checkpoint;
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+ ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
+ delete checkpoint;
+ });
+ TEST_SYNC_POINT("CheckpointTest::CheckpointCF:1");
+ ASSERT_OK(Put(0, "Default", "Default1"));
+ ASSERT_OK(Put(1, "one", "eleven"));
+ ASSERT_OK(Put(2, "two", "twelve"));
+ ASSERT_OK(Put(3, "three", "thirteen"));
+ ASSERT_OK(Put(4, "four", "fourteen"));
+ ASSERT_OK(Put(5, "five", "fifteen"));
+ TEST_SYNC_POINT("CheckpointTest::CheckpointCF:2");
+ t.join();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_OK(Put(1, "one", "twentyone"));
+ ASSERT_OK(Put(2, "two", "twentytwo"));
+ ASSERT_OK(Put(3, "three", "twentythree"));
+ ASSERT_OK(Put(4, "four", "twentyfour"));
+ ASSERT_OK(Put(5, "five", "twentyfive"));
+ ASSERT_OK(Flush());
+
+ // Open snapshot and verify contents while DB is running
+ options.create_if_missing = false;
+ std::vector<std::string> cfs;
+ cfs = {kDefaultColumnFamilyName, "one", "two", "three", "four", "five"};
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (size_t i = 0; i < cfs.size(); ++i) {
+ column_families.push_back(ColumnFamilyDescriptor(cfs[i], options));
+ }
+ ASSERT_OK(DB::Open(options, snapshot_name_, column_families, &cphandles,
+ &snapshotDB));
+ ASSERT_OK(snapshotDB->Get(roptions, cphandles[0], "Default", &result));
+ ASSERT_EQ("Default1", result);
+ ASSERT_OK(snapshotDB->Get(roptions, cphandles[1], "one", &result));
+ ASSERT_EQ("eleven", result);
+ ASSERT_OK(snapshotDB->Get(roptions, cphandles[2], "two", &result));
+ for (auto h : cphandles) {
+ delete h;
+ }
+ cphandles.clear();
+ delete snapshotDB;
+ snapshotDB = nullptr;
+}
+
+TEST_F(CheckpointTest, CheckpointCFNoFlush) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"one", "two", "three", "four", "five"}, options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(0, "Default", "Default"));
+ ASSERT_OK(Put(1, "one", "one"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(2, "two", "two"));
+
+ DB* snapshotDB;
+ ReadOptions roptions;
+ std::string result;
+ std::vector<ColumnFamilyHandle*> cphandles;
+
+ // Take a snapshot
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCallFlush:start", [&](void* /*arg*/) {
+ // Flush should never trigger.
+ FAIL();
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ Checkpoint* checkpoint;
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+ ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_, 1000000));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ delete checkpoint;
+ ASSERT_OK(Put(1, "one", "two"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(2, "two", "twentytwo"));
+ Close();
+ EXPECT_OK(DestroyDB(dbname_, options));
+
+ // Open snapshot and verify contents while DB is running
+ options.create_if_missing = false;
+ std::vector<std::string> cfs;
+ cfs = {kDefaultColumnFamilyName, "one", "two", "three", "four", "five"};
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (size_t i = 0; i < cfs.size(); ++i) {
+ column_families.push_back(ColumnFamilyDescriptor(cfs[i], options));
+ }
+ ASSERT_OK(DB::Open(options, snapshot_name_, column_families, &cphandles,
+ &snapshotDB));
+ ASSERT_OK(snapshotDB->Get(roptions, cphandles[0], "Default", &result));
+ ASSERT_EQ("Default", result);
+ ASSERT_OK(snapshotDB->Get(roptions, cphandles[1], "one", &result));
+ ASSERT_EQ("one", result);
+ ASSERT_OK(snapshotDB->Get(roptions, cphandles[2], "two", &result));
+ ASSERT_EQ("two", result);
+ for (auto h : cphandles) {
+ delete h;
+ }
+ cphandles.clear();
+ delete snapshotDB;
+ snapshotDB = nullptr;
+}
+
+TEST_F(CheckpointTest, CurrentFileModifiedWhileCheckpointing) {
+ Options options = CurrentOptions();
+ options.max_manifest_file_size = 0; // always rollover manifest for file add
+ Reopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {// Get past the flush in the checkpoint thread before adding any keys to
+ // the db so the checkpoint thread won't hit the WriteManifest
+ // syncpoints.
+ {"CheckpointImpl::CreateCheckpoint:FlushDone",
+ "CheckpointTest::CurrentFileModifiedWhileCheckpointing:PrePut"},
+ // Roll the manifest during checkpointing right after live files are
+ // snapshotted.
+ {"CheckpointImpl::CreateCheckpoint:SavedLiveFiles1",
+ "VersionSet::LogAndApply:WriteManifest"},
+ {"VersionSet::LogAndApply:WriteManifestDone",
+ "CheckpointImpl::CreateCheckpoint:SavedLiveFiles2"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread t([&]() {
+ Checkpoint* checkpoint;
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+ ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
+ delete checkpoint;
+ });
+ TEST_SYNC_POINT(
+ "CheckpointTest::CurrentFileModifiedWhileCheckpointing:PrePut");
+ ASSERT_OK(Put("Default", "Default1"));
+ ASSERT_OK(Flush());
+ t.join();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ DB* snapshotDB;
+ // Successful Open() implies that CURRENT pointed to the manifest in the
+ // checkpoint.
+ ASSERT_OK(DB::Open(options, snapshot_name_, &snapshotDB));
+ delete snapshotDB;
+ snapshotDB = nullptr;
+}
+
+TEST_F(CheckpointTest, CurrentFileModifiedWhileCheckpointing2PC) {
+ Close();
+ const std::string dbname = test::PerThreadDBPath("transaction_testdb");
+ ASSERT_OK(DestroyDB(dbname, CurrentOptions()));
+ test::DeleteDir(env_, dbname);
+
+ Options options = CurrentOptions();
+ options.allow_2pc = true;
+ // allow_2pc is also set implicitly when a transaction is prepared, so the
+ // explicit assignment above is not strictly required.
+ TransactionDBOptions txn_db_options;
+ TransactionDB* txdb;
+ Status s = TransactionDB::Open(options, txn_db_options, dbname, &txdb);
+ ASSERT_OK(s);
+ ColumnFamilyHandle* cfa;
+ ColumnFamilyHandle* cfb;
+ ColumnFamilyOptions cf_options;
+ ASSERT_OK(txdb->CreateColumnFamily(cf_options, "CFA", &cfa));
+
+ WriteOptions write_options;
+ // Insert something into CFB so lots of log files will be kept
+ // before creating the checkpoint.
+ ASSERT_OK(txdb->CreateColumnFamily(cf_options, "CFB", &cfb));
+ ASSERT_OK(txdb->Put(write_options, cfb, "", ""));
+
+ ReadOptions read_options;
+ std::string value;
+ TransactionOptions txn_options;
+ Transaction* txn = txdb->BeginTransaction(write_options, txn_options);
+ s = txn->SetName("xid");
+ ASSERT_OK(s);
+ ASSERT_EQ(txdb->GetTransactionByName("xid"), txn);
+
+ s = txn->Put(Slice("foo"), Slice("bar"));
+ ASSERT_OK(s);
+ s = txn->Put(cfa, Slice("foocfa"), Slice("barcfa"));
+ ASSERT_OK(s);
+ // Write a prepare into the middle of the first WAL, then flush the WALs
+ // many times.
+ for (int i = 1; i <= 100000; i++) {
+ Transaction* tx = txdb->BeginTransaction(write_options, txn_options);
+ ASSERT_OK(tx->SetName("x"));
+ ASSERT_OK(tx->Put(Slice(std::to_string(i)), Slice("val")));
+ ASSERT_OK(tx->Put(cfa, Slice("aaa"), Slice("111")));
+ ASSERT_OK(tx->Prepare());
+ ASSERT_OK(tx->Commit());
+ if (i % 10000 == 0) {
+ ASSERT_OK(txdb->Flush(FlushOptions()));
+ }
+ if (i == 88888) {
+ ASSERT_OK(txn->Prepare());
+ }
+ delete tx;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"CheckpointImpl::CreateCheckpoint:SavedLiveFiles1",
+ "CheckpointTest::CurrentFileModifiedWhileCheckpointing2PC:PreCommit"},
+ {"CheckpointTest::CurrentFileModifiedWhileCheckpointing2PC:PostCommit",
+ "CheckpointImpl::CreateCheckpoint:SavedLiveFiles2"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ROCKSDB_NAMESPACE::port::Thread t([&]() {
+ Checkpoint* checkpoint;
+ ASSERT_OK(Checkpoint::Create(txdb, &checkpoint));
+ ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
+ delete checkpoint;
+ });
+ TEST_SYNC_POINT(
+ "CheckpointTest::CurrentFileModifiedWhileCheckpointing2PC:PreCommit");
+ ASSERT_OK(txn->Commit());
+ delete txn;
+ TEST_SYNC_POINT(
+ "CheckpointTest::CurrentFileModifiedWhileCheckpointing2PC:PostCommit");
+ t.join();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ // No more than three log files should exist in the checkpoint.
+ std::vector<std::string> files;
+ ASSERT_OK(env_->GetChildren(snapshot_name_, &files));
+ int num_log_files = 0;
+ for (auto& file : files) {
+ uint64_t num;
+ FileType type;
+ WalFileType log_type;
+ if (ParseFileName(file, &num, &type, &log_type) && type == kWalFile) {
+ num_log_files++;
+ }
+ }
+ // One flush after prepare + one outstanding file before checkpoint + one log
+ // file generated after checkpoint.
+ ASSERT_LE(num_log_files, 3);
+
+ TransactionDB* snapshotDB;
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions()));
+ column_families.push_back(
+ ColumnFamilyDescriptor("CFA", ColumnFamilyOptions()));
+ column_families.push_back(
+ ColumnFamilyDescriptor("CFB", ColumnFamilyOptions()));
+ std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*> cf_handles;
+ ASSERT_OK(TransactionDB::Open(options, txn_db_options, snapshot_name_,
+ column_families, &cf_handles, &snapshotDB));
+ ASSERT_OK(snapshotDB->Get(read_options, "foo", &value));
+ ASSERT_EQ(value, "bar");
+ ASSERT_OK(snapshotDB->Get(read_options, cf_handles[1], "foocfa", &value));
+ ASSERT_EQ(value, "barcfa");
+
+ delete cfa;
+ delete cfb;
+ delete cf_handles[0];
+ delete cf_handles[1];
+ delete cf_handles[2];
+ delete snapshotDB;
+ snapshotDB = nullptr;
+ delete txdb;
+}
+
+TEST_F(CheckpointTest, CheckpointInvalidDirectoryName) {
+ for (std::string checkpoint_dir : {"", "/", "////"}) {
+ Checkpoint* checkpoint;
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+ ASSERT_TRUE(
+ checkpoint->CreateCheckpoint(checkpoint_dir).IsInvalidArgument());
+ delete checkpoint;
+ }
+}
+
+TEST_F(CheckpointTest, CheckpointWithParallelWrites) {
+ // When run with TSAN, this exposes the data race fixed in
+ // https://github.com/facebook/rocksdb/pull/3603
+ ASSERT_OK(Put("key1", "val1"));
+ port::Thread thread([this]() { ASSERT_OK(Put("key2", "val2")); });
+ Checkpoint* checkpoint;
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+ ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
+ delete checkpoint;
+ thread.join();
+}
+
+TEST_F(CheckpointTest, CheckpointWithUnsyncedDataDropped) {
+ Options options = CurrentOptions();
+ std::unique_ptr<FaultInjectionTestEnv> env(new FaultInjectionTestEnv(env_));
+ options.env = env.get();
+ Reopen(options);
+ ASSERT_OK(Put("key1", "val1"));
+ Checkpoint* checkpoint;
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+ ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
+ delete checkpoint;
+ ASSERT_OK(env->DropUnsyncedFileData());
+
+ // Make sure the checkpoint is still openable even though any unsynced data
+ // was dropped.
+ options.env = env_;
+ DB* snapshot_db;
+ ASSERT_OK(DB::Open(options, snapshot_name_, &snapshot_db));
+ ReadOptions read_opts;
+ std::string get_result;
+ ASSERT_OK(snapshot_db->Get(read_opts, "key1", &get_result));
+ ASSERT_EQ("val1", get_result);
+ delete snapshot_db;
+ delete db_;
+ db_ = nullptr;
+}
+
+TEST_F(CheckpointTest, CheckpointOptionsFileFailedToPersist) {
+ // Regression test for a bug where checkpoint failed on a DB where persisting
+ // OPTIONS file failed and the DB was opened with
+ // `fail_if_options_file_error == false`.
+ Options options = CurrentOptions();
+ options.fail_if_options_file_error = false;
+ auto fault_fs = std::make_shared<FaultInjectionTestFS>(FileSystem::Default());
+
+ // Setup `FaultInjectionTestFS` and `SyncPoint` callbacks to fail one
+ // operation when inside the OPTIONS file persisting code.
+ std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+ fault_fs->SetRandomMetadataWriteError(1 /* one_in */);
+ SyncPoint::GetInstance()->SetCallBack(
+ "PersistRocksDBOptions:start", [fault_fs](void* /* arg */) {
+ fault_fs->EnableMetadataWriteErrorInjection();
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "FaultInjectionTestFS::InjectMetadataWriteError:Injected",
+ [fault_fs](void* /* arg */) {
+ fault_fs->DisableMetadataWriteErrorInjection();
+ });
+ options.env = fault_fs_env.get();
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Reopen(options);
+ ASSERT_OK(Put("key1", "val1"));
+ Checkpoint* checkpoint;
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+ ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
+ delete checkpoint;
+
+ // Make sure it's usable.
+ options.env = env_;
+ DB* snapshot_db;
+ ASSERT_OK(DB::Open(options, snapshot_name_, &snapshot_db));
+ ReadOptions read_opts;
+ std::string get_result;
+ ASSERT_OK(snapshot_db->Get(read_opts, "key1", &get_result));
+ ASSERT_EQ("val1", get_result);
+ delete snapshot_db;
+ delete db_;
+ db_ = nullptr;
+}
+
+TEST_F(CheckpointTest, CheckpointReadOnlyDB) {
+ ASSERT_OK(Put("foo", "foo_value"));
+ ASSERT_OK(Flush());
+ Close();
+ Options options = CurrentOptions();
+ ASSERT_OK(ReadOnlyReopen(options));
+ Checkpoint* checkpoint = nullptr;
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+ ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
+ delete checkpoint;
+ checkpoint = nullptr;
+ Close();
+ DB* snapshot_db = nullptr;
+ ASSERT_OK(DB::Open(options, snapshot_name_, &snapshot_db));
+ ReadOptions read_opts;
+ std::string get_result;
+ ASSERT_OK(snapshot_db->Get(read_opts, "foo", &get_result));
+ ASSERT_EQ("foo_value", get_result);
+ delete snapshot_db;
+}
+
+TEST_F(CheckpointTest, CheckpointReadOnlyDBWithMultipleColumnFamilies) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ for (int i = 0; i != 3; ++i) {
+ ASSERT_OK(Put(i, "foo", "foo_value"));
+ ASSERT_OK(Flush(i));
+ }
+ Close();
+ Status s = ReadOnlyReopenWithColumnFamilies(
+ {kDefaultColumnFamilyName, "pikachu", "eevee"}, options);
+ ASSERT_OK(s);
+ Checkpoint* checkpoint = nullptr;
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+ ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
+ delete checkpoint;
+ checkpoint = nullptr;
+ Close();
+
+ std::vector<ColumnFamilyDescriptor> column_families{
+ {kDefaultColumnFamilyName, options},
+ {"pikachu", options},
+ {"eevee", options}};
+ DB* snapshot_db = nullptr;
+ std::vector<ColumnFamilyHandle*> snapshot_handles;
+ s = DB::Open(options, snapshot_name_, column_families, &snapshot_handles,
+ &snapshot_db);
+ ASSERT_OK(s);
+ ReadOptions read_opts;
+ for (int i = 0; i != 3; ++i) {
+ std::string get_result;
+ s = snapshot_db->Get(read_opts, snapshot_handles[i], "foo", &get_result);
+ ASSERT_OK(s);
+ ASSERT_EQ("foo_value", get_result);
+ }
+
+ for (auto snapshot_h : snapshot_handles) {
+ delete snapshot_h;
+ }
+ snapshot_handles.clear();
+ delete snapshot_db;
+}
+
+TEST_F(CheckpointTest, CheckpointWithDbPath) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_ + "_2", 0);
+ Reopen(options);
+ ASSERT_OK(Put("key1", "val1"));
+ ASSERT_OK(Flush());
+ Checkpoint* checkpoint;
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+ // Currently not supported
+ ASSERT_TRUE(checkpoint->CreateCheckpoint(snapshot_name_).IsNotSupported());
+ delete checkpoint;
+}
+
+TEST_F(CheckpointTest, PutRaceWithCheckpointTrackedWalSync) {
+ // Repro for a race condition where a user write comes in after the checkpoint
+ // syncs WAL for `track_and_verify_wals_in_manifest` but before the
+ // corresponding MANIFEST update. With the bug, that scenario resulted in an
+ // unopenable DB with error "Corruption: Size mismatch: WAL ...".
+ Options options = CurrentOptions();
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(env_));
+ options.env = fault_env.get();
+ options.track_and_verify_wals_in_manifest = true;
+ Reopen(options);
+
+ ASSERT_OK(Put("key1", "val1"));
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::SyncWAL:BeforeMarkLogsSynced:1",
+ [this](void* /* arg */) { ASSERT_OK(Put("key2", "val2")); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::unique_ptr<Checkpoint> checkpoint;
+ {
+ Checkpoint* checkpoint_ptr;
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint_ptr));
+ checkpoint.reset(checkpoint_ptr);
+ }
+
+ ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
+
+ // Ensure callback ran.
+ ASSERT_EQ("val2", Get("key2"));
+
+ Close();
+
+ // Simulate full loss of unsynced data. This drops "key2" -> "val2" from the
+ // DB WAL.
+ ASSERT_OK(fault_env->DropUnsyncedFileData());
+
+ // Before the bug fix, reopening the DB would fail because the MANIFEST's
+ // AddWal entry indicated the WAL should be synced through "key2" -> "val2".
+ Reopen(options);
+
+ // Need to close before `fault_env` goes out of scope.
+ Close();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as Checkpoint is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/compaction_filters.cc b/src/rocksdb/utilities/compaction_filters.cc
new file mode 100644
index 000000000..8763901c3
--- /dev/null
+++ b/src/rocksdb/utilities/compaction_filters.cc
@@ -0,0 +1,56 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <memory>
+
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/options_type.h"
+#include "utilities/compaction_filters/layered_compaction_filter_base.h"
+#include "utilities/compaction_filters/remove_emptyvalue_compactionfilter.h"
+
+namespace ROCKSDB_NAMESPACE {
+#ifndef ROCKSDB_LITE
+static int RegisterBuiltinCompactionFilters(ObjectLibrary& library,
+ const std::string& /*arg*/) {
+ library.AddFactory<CompactionFilter>(
+ RemoveEmptyValueCompactionFilter::kClassName(),
+ [](const std::string& /*uri*/,
+ std::unique_ptr<CompactionFilter>* /*guard*/,
+ std::string* /*errmsg*/) {
+ return new RemoveEmptyValueCompactionFilter();
+ });
+ return 1;
+}
+#endif // ROCKSDB_LITE
+Status CompactionFilter::CreateFromString(const ConfigOptions& config_options,
+ const std::string& value,
+ const CompactionFilter** result) {
+#ifndef ROCKSDB_LITE
+ static std::once_flag once;
+ std::call_once(once, [&]() {
+ RegisterBuiltinCompactionFilters(*(ObjectLibrary::Default().get()), "");
+ });
+#endif // ROCKSDB_LITE
+ CompactionFilter* filter = const_cast<CompactionFilter*>(*result);
+ Status status = LoadStaticObject<CompactionFilter>(config_options, value,
+ nullptr, &filter);
+ if (status.ok()) {
+ *result = const_cast<CompactionFilter*>(filter);
+ }
+ return status;
+}
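+
+// Illustrative sketch (not part of the upstream source): how a caller might
+// instantiate the builtin RemoveEmptyValueCompactionFilter through
+// CreateFromString. Variable names here are hypothetical.
+//
+//   ConfigOptions config_options;
+//   const CompactionFilter* filter = nullptr;
+//   Status s = CompactionFilter::CreateFromString(
+//       config_options, RemoveEmptyValueCompactionFilter::kClassName(),
+//       &filter);
+//   if (s.ok()) {
+//     ColumnFamilyOptions cf_opts;
+//     cf_opts.compaction_filter = filter;
+//   }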
+
+Status CompactionFilterFactory::CreateFromString(
+ const ConfigOptions& config_options, const std::string& value,
+ std::shared_ptr<CompactionFilterFactory>* result) {
+ // Currently there are no builtin CompactionFilterFactories.
+ // If any are introduced, they need to be registered here.
+ Status status = LoadSharedObject<CompactionFilterFactory>(
+ config_options, value, nullptr, result);
+ return status;
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/compaction_filters/layered_compaction_filter_base.h b/src/rocksdb/utilities/compaction_filters/layered_compaction_filter_base.h
new file mode 100644
index 000000000..803fa94ae
--- /dev/null
+++ b/src/rocksdb/utilities/compaction_filters/layered_compaction_filter_base.h
@@ -0,0 +1,41 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <memory>
+
+#include "rocksdb/compaction_filter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Abstract base class for building a layered compaction filter on top of a
+// user compaction filter.
+// See BlobIndexCompactionFilter or TtlCompactionFilter for basic usage, or
+// the illustrative sketch after the class definition below.
+class LayeredCompactionFilterBase : public CompactionFilter {
+ public:
+ LayeredCompactionFilterBase(
+ const CompactionFilter* _user_comp_filter,
+ std::unique_ptr<const CompactionFilter> _user_comp_filter_from_factory)
+ : user_comp_filter_(_user_comp_filter),
+ user_comp_filter_from_factory_(
+ std::move(_user_comp_filter_from_factory)) {
+ if (!user_comp_filter_) {
+ user_comp_filter_ = user_comp_filter_from_factory_.get();
+ }
+ }
+
+ // Returns a pointer to the user compaction filter.
+ const CompactionFilter* user_comp_filter() const { return user_comp_filter_; }
+
+ const Customizable* Inner() const override { return user_comp_filter_; }
+
+ protected:
+ const CompactionFilter* user_comp_filter_;
+
+ private:
+ std::unique_ptr<const CompactionFilter> user_comp_filter_from_factory_;
+};
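+
+// Illustrative sketch (not part of the upstream header): a trivial subclass
+// that defers to the wrapped user filter when one is present. The class name
+// is hypothetical.
+//
+//   class PassThroughLayeredFilter : public LayeredCompactionFilterBase {
+//    public:
+//     using LayeredCompactionFilterBase::LayeredCompactionFilterBase;
+//     const char* Name() const override { return "PassThroughLayeredFilter"; }
+//     bool Filter(int level, const Slice& key, const Slice& value,
+//                 std::string* new_value, bool* value_changed) const override {
+//       const CompactionFilter* user = user_comp_filter();
+//       return user != nullptr &&
+//              user->Filter(level, key, value, new_value, value_changed);
+//     }
+//   };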
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc b/src/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc
new file mode 100644
index 000000000..b788dbf9b
--- /dev/null
+++ b/src/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc
@@ -0,0 +1,26 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/compaction_filters/remove_emptyvalue_compactionfilter.h"
+
+#include <string>
+
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool RemoveEmptyValueCompactionFilter::Filter(int /*level*/,
+ const Slice& /*key*/,
+ const Slice& existing_value,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const {
+ // remove kv pairs that have empty values
+ return existing_value.empty();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h b/src/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h
new file mode 100644
index 000000000..864ad15ff
--- /dev/null
+++ b/src/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class RemoveEmptyValueCompactionFilter : public CompactionFilter {
+ public:
+ static const char* kClassName() { return "RemoveEmptyValueCompactionFilter"; }
+
+ const char* Name() const override { return kClassName(); }
+
+ bool Filter(int level, const Slice& key, const Slice& existing_value,
+ std::string* new_value, bool* value_changed) const override;
+};
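+
+// Illustrative sketch (not part of the upstream header): installing the
+// filter on a column family. The filter must outlive the DB that uses it.
+//
+//   RemoveEmptyValueCompactionFilter remove_empty;
+//   ColumnFamilyOptions cf_opts;
+//   cf_opts.compaction_filter = &remove_empty;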
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/convenience/info_log_finder.cc b/src/rocksdb/utilities/convenience/info_log_finder.cc
new file mode 100644
index 000000000..fe62fd561
--- /dev/null
+++ b/src/rocksdb/utilities/convenience/info_log_finder.cc
@@ -0,0 +1,26 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 Facebook.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "rocksdb/utilities/info_log_finder.h"
+
+#include "file/filename.h"
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status GetInfoLogList(DB* db, std::vector<std::string>* info_log_list) {
+ if (!db) {
+ return Status::InvalidArgument("DB pointer is not valid");
+ }
+ std::string parent_path;
+ const Options& options = db->GetOptions();
+ return GetInfoLogFiles(options.env->GetFileSystem(), options.db_log_dir,
+ db->GetName(), &parent_path, info_log_list);
+}
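+
+// Illustrative sketch (not part of the upstream source): collecting the info
+// LOG file names of an open DB. Variable names are hypothetical.
+//
+//   std::vector<std::string> info_logs;
+//   Status s = GetInfoLogList(db, &info_logs);
+//   if (s.ok()) {
+//     for (const std::string& log_name : info_logs) {
+//       // inspect or archive each LOG file
+//     }
+//   }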
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/counted_fs.cc b/src/rocksdb/utilities/counted_fs.cc
new file mode 100644
index 000000000..e43f3a191
--- /dev/null
+++ b/src/rocksdb/utilities/counted_fs.cc
@@ -0,0 +1,379 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "utilities/counted_fs.h"
+
+#include <sstream>
+
+#include "rocksdb/file_system.h"
+#include "rocksdb/utilities/options_type.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+class CountedSequentialFile : public FSSequentialFileOwnerWrapper {
+ private:
+ CountedFileSystem* fs_;
+
+ public:
+ CountedSequentialFile(std::unique_ptr<FSSequentialFile>&& f,
+ CountedFileSystem* fs)
+ : FSSequentialFileOwnerWrapper(std::move(f)), fs_(fs) {}
+
+ ~CountedSequentialFile() override { fs_->counters()->closes++; }
+
+ IOStatus Read(size_t n, const IOOptions& options, Slice* result,
+ char* scratch, IODebugContext* dbg) override {
+ IOStatus rv = target()->Read(n, options, result, scratch, dbg);
+ fs_->counters()->reads.RecordOp(rv, result->size());
+ return rv;
+ }
+
+ IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) override {
+ IOStatus rv =
+ target()->PositionedRead(offset, n, options, result, scratch, dbg);
+ fs_->counters()->reads.RecordOp(rv, result->size());
+ return rv;
+ }
+};
+
+class CountedRandomAccessFile : public FSRandomAccessFileOwnerWrapper {
+ private:
+ CountedFileSystem* fs_;
+
+ public:
+ CountedRandomAccessFile(std::unique_ptr<FSRandomAccessFile>&& f,
+ CountedFileSystem* fs)
+ : FSRandomAccessFileOwnerWrapper(std::move(f)), fs_(fs) {}
+
+ ~CountedRandomAccessFile() override { fs_->counters()->closes++; }
+
+ IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) const override {
+ IOStatus rv = target()->Read(offset, n, options, result, scratch, dbg);
+ fs_->counters()->reads.RecordOp(rv, result->size());
+ return rv;
+ }
+
+ IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
+ const IOOptions& options, IODebugContext* dbg) override {
+ IOStatus rv = target()->MultiRead(reqs, num_reqs, options, dbg);
+ for (size_t r = 0; r < num_reqs; r++) {
+ fs_->counters()->reads.RecordOp(reqs[r].status, reqs[r].result.size());
+ }
+ return rv;
+ }
+};
+
+class CountedWritableFile : public FSWritableFileOwnerWrapper {
+ private:
+ CountedFileSystem* fs_;
+
+ public:
+ CountedWritableFile(std::unique_ptr<FSWritableFile>&& f,
+ CountedFileSystem* fs)
+ : FSWritableFileOwnerWrapper(std::move(f)), fs_(fs) {}
+
+ IOStatus Append(const Slice& data, const IOOptions& options,
+ IODebugContext* dbg) override {
+ IOStatus rv = target()->Append(data, options, dbg);
+ fs_->counters()->writes.RecordOp(rv, data.size());
+ return rv;
+ }
+
+ IOStatus Append(const Slice& data, const IOOptions& options,
+ const DataVerificationInfo& info,
+ IODebugContext* dbg) override {
+ IOStatus rv = target()->Append(data, options, info, dbg);
+ fs_->counters()->writes.RecordOp(rv, data.size());
+ return rv;
+ }
+
+ IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+ const IOOptions& options,
+ IODebugContext* dbg) override {
+ IOStatus rv = target()->PositionedAppend(data, offset, options, dbg);
+ fs_->counters()->writes.RecordOp(rv, data.size());
+ return rv;
+ }
+
+ IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+ const IOOptions& options,
+ const DataVerificationInfo& info,
+ IODebugContext* dbg) override {
+ IOStatus rv = target()->PositionedAppend(data, offset, options, info, dbg);
+ fs_->counters()->writes.RecordOp(rv, data.size());
+ return rv;
+ }
+
+ IOStatus Close(const IOOptions& options, IODebugContext* dbg) override {
+ IOStatus rv = target()->Close(options, dbg);
+ if (rv.ok()) {
+ fs_->counters()->closes++;
+ }
+ return rv;
+ }
+
+ IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override {
+ IOStatus rv = target()->Flush(options, dbg);
+ if (rv.ok()) {
+ fs_->counters()->flushes++;
+ }
+ return rv;
+ }
+
+ IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override {
+ IOStatus rv = target()->Sync(options, dbg);
+ if (rv.ok()) {
+ fs_->counters()->syncs++;
+ }
+ return rv;
+ }
+
+ IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override {
+ IOStatus rv = target()->Fsync(options, dbg);
+ if (rv.ok()) {
+ fs_->counters()->fsyncs++;
+ }
+ return rv;
+ }
+
+ IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& options,
+ IODebugContext* dbg) override {
+ IOStatus rv = target()->RangeSync(offset, nbytes, options, dbg);
+ if (rv.ok()) {
+ fs_->counters()->syncs++;
+ }
+ return rv;
+ }
+};
+
+class CountedRandomRWFile : public FSRandomRWFileOwnerWrapper {
+ private:
+ mutable CountedFileSystem* fs_;
+
+ public:
+ CountedRandomRWFile(std::unique_ptr<FSRandomRWFile>&& f,
+ CountedFileSystem* fs)
+ : FSRandomRWFileOwnerWrapper(std::move(f)), fs_(fs) {}
+ IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options,
+ IODebugContext* dbg) override {
+ IOStatus rv = target()->Write(offset, data, options, dbg);
+ fs_->counters()->writes.RecordOp(rv, data.size());
+ return rv;
+ }
+
+ IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) const override {
+ IOStatus rv = target()->Read(offset, n, options, result, scratch, dbg);
+ fs_->counters()->reads.RecordOp(rv, result->size());
+ return rv;
+ }
+
+ IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override {
+ IOStatus rv = target()->Flush(options, dbg);
+ if (rv.ok()) {
+ fs_->counters()->flushes++;
+ }
+ return rv;
+ }
+
+ IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override {
+ IOStatus rv = target()->Sync(options, dbg);
+ if (rv.ok()) {
+ fs_->counters()->syncs++;
+ }
+ return rv;
+ }
+
+ IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override {
+ IOStatus rv = target()->Fsync(options, dbg);
+ if (rv.ok()) {
+ fs_->counters()->fsyncs++;
+ }
+ return rv;
+ }
+
+ IOStatus Close(const IOOptions& options, IODebugContext* dbg) override {
+ IOStatus rv = target()->Close(options, dbg);
+ if (rv.ok()) {
+ fs_->counters()->closes++;
+ }
+ return rv;
+ }
+};
+
+class CountedDirectory : public FSDirectoryWrapper {
+ private:
+ mutable CountedFileSystem* fs_;
+ bool closed_ = false;
+
+ public:
+ CountedDirectory(std::unique_ptr<FSDirectory>&& f, CountedFileSystem* fs)
+ : FSDirectoryWrapper(std::move(f)), fs_(fs) {}
+
+ IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override {
+ IOStatus rv = FSDirectoryWrapper::Fsync(options, dbg);
+ if (rv.ok()) {
+ fs_->counters()->dsyncs++;
+ }
+ return rv;
+ }
+
+ IOStatus Close(const IOOptions& options, IODebugContext* dbg) override {
+ IOStatus rv = FSDirectoryWrapper::Close(options, dbg);
+ if (rv.ok()) {
+ fs_->counters()->closes++;
+ fs_->counters()->dir_closes++;
+ closed_ = true;
+ }
+ return rv;
+ }
+
+ IOStatus FsyncWithDirOptions(const IOOptions& options, IODebugContext* dbg,
+ const DirFsyncOptions& dir_options) override {
+ IOStatus rv =
+ FSDirectoryWrapper::FsyncWithDirOptions(options, dbg, dir_options);
+ if (rv.ok()) {
+ fs_->counters()->dsyncs++;
+ }
+ return rv;
+ }
+
+ ~CountedDirectory() {
+ if (!closed_) {
+ // TODO: fix DB+CF code to use explicit Close, not rely on destructor
+ fs_->counters()->closes++;
+ fs_->counters()->dir_closes++;
+ }
+ }
+};
+} // anonymous namespace
+
+std::string FileOpCounters::PrintCounters() const {
+ std::stringstream ss;
+ ss << "Num files opened: " << opens.load(std::memory_order_relaxed)
+ << std::endl;
+ ss << "Num files deleted: " << deletes.load(std::memory_order_relaxed)
+ << std::endl;
+ ss << "Num files renamed: " << renames.load(std::memory_order_relaxed)
+ << std::endl;
+ ss << "Num Flush(): " << flushes.load(std::memory_order_relaxed) << std::endl;
+ ss << "Num Sync(): " << syncs.load(std::memory_order_relaxed) << std::endl;
+ ss << "Num Fsync(): " << fsyncs.load(std::memory_order_relaxed) << std::endl;
+ ss << "Num Dir Fsync(): " << dsyncs.load(std::memory_order_relaxed)
+ << std::endl;
+ ss << "Num Close(): " << closes.load(std::memory_order_relaxed) << std::endl;
+ ss << "Num Dir Open(): " << dir_opens.load(std::memory_order_relaxed)
+ << std::endl;
+ ss << "Num Dir Close(): " << dir_closes.load(std::memory_order_relaxed)
+ << std::endl;
+ ss << "Num Read(): " << reads.ops.load(std::memory_order_relaxed)
+ << std::endl;
+ ss << "Num Append(): " << writes.ops.load(std::memory_order_relaxed)
+ << std::endl;
+ ss << "Num bytes read: " << reads.bytes.load(std::memory_order_relaxed)
+ << std::endl;
+ ss << "Num bytes written: " << writes.bytes.load(std::memory_order_relaxed)
+ << std::endl;
+ return ss.str();
+}
+
+CountedFileSystem::CountedFileSystem(const std::shared_ptr<FileSystem>& base)
+ : FileSystemWrapper(base) {}
+
+IOStatus CountedFileSystem::NewSequentialFile(
+ const std::string& f, const FileOptions& options,
+ std::unique_ptr<FSSequentialFile>* r, IODebugContext* dbg) {
+ std::unique_ptr<FSSequentialFile> base;
+ IOStatus s = target()->NewSequentialFile(f, options, &base, dbg);
+ if (s.ok()) {
+ counters_.opens++;
+ r->reset(new CountedSequentialFile(std::move(base), this));
+ }
+ return s;
+}
+
+IOStatus CountedFileSystem::NewRandomAccessFile(
+ const std::string& f, const FileOptions& options,
+ std::unique_ptr<FSRandomAccessFile>* r, IODebugContext* dbg) {
+ std::unique_ptr<FSRandomAccessFile> base;
+ IOStatus s = target()->NewRandomAccessFile(f, options, &base, dbg);
+ if (s.ok()) {
+ counters_.opens++;
+ r->reset(new CountedRandomAccessFile(std::move(base), this));
+ }
+ return s;
+}
+
+IOStatus CountedFileSystem::NewWritableFile(const std::string& f,
+ const FileOptions& options,
+ std::unique_ptr<FSWritableFile>* r,
+ IODebugContext* dbg) {
+ std::unique_ptr<FSWritableFile> base;
+ IOStatus s = target()->NewWritableFile(f, options, &base, dbg);
+ if (s.ok()) {
+ counters_.opens++;
+ r->reset(new CountedWritableFile(std::move(base), this));
+ }
+ return s;
+}
+
+IOStatus CountedFileSystem::ReopenWritableFile(
+ const std::string& fname, const FileOptions& options,
+ std::unique_ptr<FSWritableFile>* result, IODebugContext* dbg) {
+ std::unique_ptr<FSWritableFile> base;
+ IOStatus s = target()->ReopenWritableFile(fname, options, &base, dbg);
+ if (s.ok()) {
+ counters_.opens++;
+ result->reset(new CountedWritableFile(std::move(base), this));
+ }
+ return s;
+}
+
+IOStatus CountedFileSystem::ReuseWritableFile(
+ const std::string& fname, const std::string& old_fname,
+ const FileOptions& options, std::unique_ptr<FSWritableFile>* result,
+ IODebugContext* dbg) {
+ std::unique_ptr<FSWritableFile> base;
+ IOStatus s =
+ target()->ReuseWritableFile(fname, old_fname, options, &base, dbg);
+ if (s.ok()) {
+ counters_.opens++;
+ result->reset(new CountedWritableFile(std::move(base), this));
+ }
+ return s;
+}
+
+IOStatus CountedFileSystem::NewRandomRWFile(
+ const std::string& name, const FileOptions& options,
+ std::unique_ptr<FSRandomRWFile>* result, IODebugContext* dbg) {
+ std::unique_ptr<FSRandomRWFile> base;
+ IOStatus s = target()->NewRandomRWFile(name, options, &base, dbg);
+ if (s.ok()) {
+ counters_.opens++;
+ result->reset(new CountedRandomRWFile(std::move(base), this));
+ }
+ return s;
+}
+
+IOStatus CountedFileSystem::NewDirectory(const std::string& name,
+ const IOOptions& options,
+ std::unique_ptr<FSDirectory>* result,
+ IODebugContext* dbg) {
+ std::unique_ptr<FSDirectory> base;
+ IOStatus s = target()->NewDirectory(name, options, &base, dbg);
+ if (s.ok()) {
+ counters_.opens++;
+ counters_.dir_opens++;
+ result->reset(new CountedDirectory(std::move(base), this));
+ }
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/counted_fs.h b/src/rocksdb/utilities/counted_fs.h
new file mode 100644
index 000000000..cb8a8968f
--- /dev/null
+++ b/src/rocksdb/utilities/counted_fs.h
@@ -0,0 +1,158 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+#include <memory>
+
+#include "rocksdb/file_system.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Logger;
+
+struct OpCounter {
+ std::atomic<int> ops;
+ std::atomic<uint64_t> bytes;
+
+ OpCounter() : ops(0), bytes(0) {}
+
+ void Reset() {
+ ops = 0;
+ bytes = 0;
+ }
+ void RecordOp(const IOStatus& io_s, size_t added_bytes) {
+ if (!io_s.IsNotSupported()) {
+ ops.fetch_add(1, std::memory_order_relaxed);
+ }
+ if (io_s.ok()) {
+ bytes.fetch_add(added_bytes, std::memory_order_relaxed);
+ }
+ }
+};
+
+struct FileOpCounters {
+ static const char* kName() { return "FileOpCounters"; }
+
+ std::atomic<int> opens;
+ std::atomic<int> closes;
+ std::atomic<int> deletes;
+ std::atomic<int> renames;
+ std::atomic<int> flushes;
+ std::atomic<int> syncs;
+ std::atomic<int> dsyncs;
+ std::atomic<int> fsyncs;
+ std::atomic<int> dir_opens;
+ std::atomic<int> dir_closes;
+ OpCounter reads;
+ OpCounter writes;
+
+ FileOpCounters()
+ : opens(0),
+ closes(0),
+ deletes(0),
+ renames(0),
+ flushes(0),
+ syncs(0),
+ dsyncs(0),
+ fsyncs(0),
+ dir_opens(0),
+ dir_closes(0) {}
+
+ void Reset() {
+ opens = 0;
+ closes = 0;
+ deletes = 0;
+ renames = 0;
+ flushes = 0;
+ syncs = 0;
+ dsyncs = 0;
+ fsyncs = 0;
+ dir_opens = 0;
+ dir_closes = 0;
+ reads.Reset();
+ writes.Reset();
+ }
+ std::string PrintCounters() const;
+};
+
+// A FileSystem wrapper that counts operations (reads, writes, opens, closes,
+// etc.) performed on the underlying file system.
+class CountedFileSystem : public FileSystemWrapper {
+ private:
+ FileOpCounters counters_;
+
+ public:
+ explicit CountedFileSystem(const std::shared_ptr<FileSystem>& base);
+ static const char* kClassName() { return "CountedFileSystem"; }
+ const char* Name() const override { return kClassName(); }
+
+ IOStatus NewSequentialFile(const std::string& f, const FileOptions& options,
+ std::unique_ptr<FSSequentialFile>* r,
+ IODebugContext* dbg) override;
+
+ IOStatus NewRandomAccessFile(const std::string& f,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSRandomAccessFile>* r,
+ IODebugContext* dbg) override;
+
+ IOStatus NewWritableFile(const std::string& f, const FileOptions& options,
+ std::unique_ptr<FSWritableFile>* r,
+ IODebugContext* dbg) override;
+ IOStatus ReopenWritableFile(const std::string& fname,
+ const FileOptions& options,
+ std::unique_ptr<FSWritableFile>* result,
+ IODebugContext* dbg) override;
+
+ IOStatus ReuseWritableFile(const std::string& fname,
+ const std::string& old_fname,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSWritableFile>* result,
+ IODebugContext* dbg) override;
+ IOStatus NewRandomRWFile(const std::string& name, const FileOptions& options,
+ std::unique_ptr<FSRandomRWFile>* result,
+ IODebugContext* dbg) override;
+
+ IOStatus NewDirectory(const std::string& name, const IOOptions& io_opts,
+ std::unique_ptr<FSDirectory>* result,
+ IODebugContext* dbg) override;
+
+ IOStatus DeleteFile(const std::string& fname, const IOOptions& options,
+ IODebugContext* dbg) override {
+ IOStatus s = target()->DeleteFile(fname, options, dbg);
+ if (s.ok()) {
+ counters_.deletes++;
+ }
+ return s;
+ }
+
+ IOStatus RenameFile(const std::string& s, const std::string& t,
+ const IOOptions& options, IODebugContext* dbg) override {
+ IOStatus st = target()->RenameFile(s, t, options, dbg);
+ if (st.ok()) {
+ counters_.renames++;
+ }
+ return st;
+ }
+
+ const FileOpCounters* counters() const { return &counters_; }
+
+ FileOpCounters* counters() { return &counters_; }
+
+ const void* GetOptionsPtr(const std::string& name) const override {
+ if (name == FileOpCounters::kName()) {
+ return counters();
+ } else {
+ return FileSystemWrapper::GetOptionsPtr(name);
+ }
+ }
+
+ // Prints the counters to a string
+ std::string PrintCounters() const { return counters_.PrintCounters(); }
+ void ResetCounters() { counters_.Reset(); }
+};
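+
+// Illustrative sketch (not part of the upstream header): wrapping the default
+// FileSystem so a DB's I/O can be counted and printed. Variable names are
+// hypothetical.
+//
+//   auto counted_fs =
+//       std::make_shared<CountedFileSystem>(FileSystem::Default());
+//   std::unique_ptr<Env> counted_env(NewCompositeEnv(counted_fs));
+//   Options options;
+//   options.env = counted_env.get();
+//   // ... open and exercise the DB ...
+//   fprintf(stdout, "%s", counted_fs->PrintCounters().c_str());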
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/debug.cc b/src/rocksdb/utilities/debug.cc
new file mode 100644
index 000000000..f2c3bb513
--- /dev/null
+++ b/src/rocksdb/utilities/debug.cc
@@ -0,0 +1,120 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/debug.h"
+
+#include "db/db_impl/db_impl.h"
+#include "rocksdb/utilities/options_type.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static std::unordered_map<std::string, ValueType> value_type_string_map = {
+ {"TypeDeletion", ValueType::kTypeDeletion},
+ {"TypeValue", ValueType::kTypeValue},
+ {"TypeMerge", ValueType::kTypeMerge},
+ {"TypeLogData", ValueType::kTypeLogData},
+ {"TypeColumnFamilyDeletion", ValueType::kTypeColumnFamilyDeletion},
+ {"TypeColumnFamilyValue", ValueType::kTypeColumnFamilyValue},
+ {"TypeColumnFamilyMerge", ValueType::kTypeColumnFamilyMerge},
+ {"TypeSingleDeletion", ValueType::kTypeSingleDeletion},
+ {"TypeColumnFamilySingleDeletion",
+ ValueType::kTypeColumnFamilySingleDeletion},
+ {"TypeBeginPrepareXID", ValueType::kTypeBeginPrepareXID},
+ {"TypeEndPrepareXID", ValueType::kTypeEndPrepareXID},
+ {"TypeCommitXID", ValueType::kTypeCommitXID},
+ {"TypeRollbackXID", ValueType::kTypeRollbackXID},
+ {"TypeNoop", ValueType::kTypeNoop},
+ {"TypeColumnFamilyRangeDeletion",
+ ValueType::kTypeColumnFamilyRangeDeletion},
+ {"TypeRangeDeletion", ValueType::kTypeRangeDeletion},
+ {"TypeColumnFamilyBlobIndex", ValueType::kTypeColumnFamilyBlobIndex},
+ {"TypeBlobIndex", ValueType::kTypeBlobIndex},
+ {"TypeBeginPersistedPrepareXID", ValueType::kTypeBeginPersistedPrepareXID},
+ {"TypeBeginUnprepareXID", ValueType::kTypeBeginUnprepareXID},
+ {"TypeDeletionWithTimestamp", ValueType::kTypeDeletionWithTimestamp},
+ {"TypeCommitXIDAndTimestamp", ValueType::kTypeCommitXIDAndTimestamp},
+ {"TypeWideColumnEntity", ValueType::kTypeWideColumnEntity},
+ {"TypeColumnFamilyWideColumnEntity",
+ ValueType::kTypeColumnFamilyWideColumnEntity}};
+
+std::string KeyVersion::GetTypeName() const {
+ std::string type_name;
+ if (SerializeEnum<ValueType>(value_type_string_map,
+ static_cast<ValueType>(type), &type_name)) {
+ return type_name;
+ } else {
+ return "Invalid";
+ }
+}
+
+Status GetAllKeyVersions(DB* db, Slice begin_key, Slice end_key,
+ size_t max_num_ikeys,
+ std::vector<KeyVersion>* key_versions) {
+ if (nullptr == db) {
+ return Status::InvalidArgument("db cannot be null.");
+ }
+ return GetAllKeyVersions(db, db->DefaultColumnFamily(), begin_key, end_key,
+ max_num_ikeys, key_versions);
+}
+
+Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, Slice begin_key,
+ Slice end_key, size_t max_num_ikeys,
+ std::vector<KeyVersion>* key_versions) {
+ if (nullptr == db) {
+ return Status::InvalidArgument("db cannot be null.");
+ }
+ if (nullptr == cfh) {
+ return Status::InvalidArgument("Column family handle cannot be null.");
+ }
+ if (nullptr == key_versions) {
+ return Status::InvalidArgument("key_versions cannot be null.");
+ }
+ key_versions->clear();
+
+ DBImpl* idb = static_cast<DBImpl*>(db->GetRootDB());
+ auto icmp = InternalKeyComparator(idb->GetOptions(cfh).comparator);
+ ReadOptions read_options;
+ Arena arena;
+ ScopedArenaIterator iter(
+ idb->NewInternalIterator(read_options, &arena, kMaxSequenceNumber, cfh));
+
+ if (!begin_key.empty()) {
+ InternalKey ikey;
+ ikey.SetMinPossibleForUserKey(begin_key);
+ iter->Seek(ikey.Encode());
+ } else {
+ iter->SeekToFirst();
+ }
+
+ size_t num_keys = 0;
+ for (; iter->Valid(); iter->Next()) {
+ ParsedInternalKey ikey;
+ Status pik_status =
+ ParseInternalKey(iter->key(), &ikey, true /* log_err_key */); // TODO
+ if (!pik_status.ok()) {
+ return pik_status;
+ }
+
+ if (!end_key.empty() &&
+ icmp.user_comparator()->Compare(ikey.user_key, end_key) > 0) {
+ break;
+ }
+
+ key_versions->emplace_back(ikey.user_key.ToString() /* _user_key */,
+ iter->value().ToString() /* _value */,
+ ikey.sequence /* _sequence */,
+ static_cast<int>(ikey.type) /* _type */);
+ if (++num_keys >= max_num_ikeys) {
+ break;
+ }
+ }
+ return Status::OK();
+}
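+
+// Illustrative sketch (not part of the upstream source): dumping up to 100
+// internal key versions from the default column family. Variable names are
+// hypothetical.
+//
+//   std::vector<KeyVersion> versions;
+//   Status s = GetAllKeyVersions(db, Slice() /* begin_key */,
+//                                Slice() /* end_key */,
+//                                100 /* max_num_ikeys */, &versions);
+//   for (const KeyVersion& kv : versions) {
+//     // kv.user_key, kv.sequence, kv.GetTypeName(), kv.value
+//   }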
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/env_mirror.cc b/src/rocksdb/utilities/env_mirror.cc
new file mode 100644
index 000000000..3ea323b42
--- /dev/null
+++ b/src/rocksdb/utilities/env_mirror.cc
@@ -0,0 +1,275 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2015, Red Hat, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/env_mirror.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// EnvMirror (declared in rocksdb/utilities/env_mirror.h) mirrors all work
+// over two backend Envs; the helper classes below mirror individual files.
+// This is useful for debugging purposes.
+class SequentialFileMirror : public SequentialFile {
+ public:
+ std::unique_ptr<SequentialFile> a_, b_;
+ std::string fname;
+ explicit SequentialFileMirror(std::string f) : fname(f) {}
+
+ Status Read(size_t n, Slice* result, char* scratch) override {
+ Slice aslice;
+ Status as = a_->Read(n, &aslice, scratch);
+ if (as == Status::OK()) {
+ char* bscratch = new char[n];
+ Slice bslice;
+#ifndef NDEBUG
+ size_t off = 0;
+#endif
+ size_t left = aslice.size();
+ while (left) {
+ Status bs = b_->Read(left, &bslice, bscratch);
+#ifndef NDEBUG
+ assert(as == bs);
+ assert(memcmp(bscratch, scratch + off, bslice.size()) == 0);
+ off += bslice.size();
+#endif
+ left -= bslice.size();
+ }
+ delete[] bscratch;
+ *result = aslice;
+ } else {
+ Status bs = b_->Read(n, result, scratch);
+ assert(as == bs);
+ }
+ return as;
+ }
+
+ Status Skip(uint64_t n) override {
+ Status as = a_->Skip(n);
+ Status bs = b_->Skip(n);
+ assert(as == bs);
+ return as;
+ }
+ Status InvalidateCache(size_t offset, size_t length) override {
+ Status as = a_->InvalidateCache(offset, length);
+ Status bs = b_->InvalidateCache(offset, length);
+ assert(as == bs);
+ return as;
+ }
+};
+
+class RandomAccessFileMirror : public RandomAccessFile {
+ public:
+ std::unique_ptr<RandomAccessFile> a_, b_;
+ std::string fname;
+ explicit RandomAccessFileMirror(std::string f) : fname(f) {}
+
+ Status Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const override {
+ Status as = a_->Read(offset, n, result, scratch);
+ if (as == Status::OK()) {
+ char* bscratch = new char[n];
+ Slice bslice;
+ size_t off = 0;
+ size_t left = result->size();
+ while (left) {
+ Status bs = b_->Read(offset + off, left, &bslice, bscratch);
+ assert(as == bs);
+ assert(memcmp(bscratch, scratch + off, bslice.size()) == 0);
+ off += bslice.size();
+ left -= bslice.size();
+ }
+ delete[] bscratch;
+ } else {
+ Status bs = b_->Read(offset, n, result, scratch);
+ assert(as == bs);
+ }
+ return as;
+ }
+
+ size_t GetUniqueId(char* id, size_t max_size) const override {
+ // NOTE: not verified
+ return a_->GetUniqueId(id, max_size);
+ }
+};
+
+class WritableFileMirror : public WritableFile {
+ public:
+ std::unique_ptr<WritableFile> a_, b_;
+ std::string fname;
+ explicit WritableFileMirror(std::string f, const EnvOptions& options)
+ : WritableFile(options), fname(f) {}
+
+ Status Append(const Slice& data) override {
+ Status as = a_->Append(data);
+ Status bs = b_->Append(data);
+ assert(as == bs);
+ return as;
+ }
+ Status Append(const Slice& data,
+ const DataVerificationInfo& /* verification_info */) override {
+ return Append(data);
+ }
+ Status PositionedAppend(const Slice& data, uint64_t offset) override {
+ Status as = a_->PositionedAppend(data, offset);
+ Status bs = b_->PositionedAppend(data, offset);
+ assert(as == bs);
+ return as;
+ }
+ Status PositionedAppend(
+ const Slice& data, uint64_t offset,
+ const DataVerificationInfo& /* verification_info */) override {
+ return PositionedAppend(data, offset);
+ }
+ Status Truncate(uint64_t size) override {
+ Status as = a_->Truncate(size);
+ Status bs = b_->Truncate(size);
+ assert(as == bs);
+ return as;
+ }
+ Status Close() override {
+ Status as = a_->Close();
+ Status bs = b_->Close();
+ assert(as == bs);
+ return as;
+ }
+ Status Flush() override {
+ Status as = a_->Flush();
+ Status bs = b_->Flush();
+ assert(as == bs);
+ return as;
+ }
+ Status Sync() override {
+ Status as = a_->Sync();
+ Status bs = b_->Sync();
+ assert(as == bs);
+ return as;
+ }
+ Status Fsync() override {
+ Status as = a_->Fsync();
+ Status bs = b_->Fsync();
+ assert(as == bs);
+ return as;
+ }
+ bool IsSyncThreadSafe() const override {
+ bool as = a_->IsSyncThreadSafe();
+ assert(as == b_->IsSyncThreadSafe());
+ return as;
+ }
+ void SetIOPriority(Env::IOPriority pri) override {
+ a_->SetIOPriority(pri);
+ b_->SetIOPriority(pri);
+ }
+ Env::IOPriority GetIOPriority() override {
+ // NOTE: we don't verify this one
+ return a_->GetIOPriority();
+ }
+ uint64_t GetFileSize() override {
+ uint64_t as = a_->GetFileSize();
+ assert(as == b_->GetFileSize());
+ return as;
+ }
+ void GetPreallocationStatus(size_t* block_size,
+ size_t* last_allocated_block) override {
+ // NOTE: we don't verify this one
+ return a_->GetPreallocationStatus(block_size, last_allocated_block);
+ }
+ size_t GetUniqueId(char* id, size_t max_size) const override {
+ // NOTE: we don't verify this one
+ return a_->GetUniqueId(id, max_size);
+ }
+ Status InvalidateCache(size_t offset, size_t length) override {
+ Status as = a_->InvalidateCache(offset, length);
+ Status bs = b_->InvalidateCache(offset, length);
+ assert(as == bs);
+ return as;
+ }
+
+ protected:
+ Status Allocate(uint64_t offset, uint64_t length) override {
+ Status as = a_->Allocate(offset, length);
+ Status bs = b_->Allocate(offset, length);
+ assert(as == bs);
+ return as;
+ }
+ Status RangeSync(uint64_t offset, uint64_t nbytes) override {
+ Status as = a_->RangeSync(offset, nbytes);
+ Status bs = b_->RangeSync(offset, nbytes);
+ assert(as == bs);
+ return as;
+ }
+};
+
+Status EnvMirror::NewSequentialFile(const std::string& f,
+ std::unique_ptr<SequentialFile>* r,
+ const EnvOptions& options) {
+ if (f.find("/proc/") == 0) {
+ return a_->NewSequentialFile(f, r, options);
+ }
+ SequentialFileMirror* mf = new SequentialFileMirror(f);
+ Status as = a_->NewSequentialFile(f, &mf->a_, options);
+ Status bs = b_->NewSequentialFile(f, &mf->b_, options);
+ assert(as == bs);
+ if (as.ok())
+ r->reset(mf);
+ else
+ delete mf;
+ return as;
+}
+
+Status EnvMirror::NewRandomAccessFile(const std::string& f,
+ std::unique_ptr<RandomAccessFile>* r,
+ const EnvOptions& options) {
+ if (f.find("/proc/") == 0) {
+ return a_->NewRandomAccessFile(f, r, options);
+ }
+ RandomAccessFileMirror* mf = new RandomAccessFileMirror(f);
+ Status as = a_->NewRandomAccessFile(f, &mf->a_, options);
+ Status bs = b_->NewRandomAccessFile(f, &mf->b_, options);
+ assert(as == bs);
+ if (as.ok())
+ r->reset(mf);
+ else
+ delete mf;
+ return as;
+}
+
+Status EnvMirror::NewWritableFile(const std::string& f,
+ std::unique_ptr<WritableFile>* r,
+ const EnvOptions& options) {
+ if (f.find("/proc/") == 0) return a_->NewWritableFile(f, r, options);
+ WritableFileMirror* mf = new WritableFileMirror(f, options);
+ Status as = a_->NewWritableFile(f, &mf->a_, options);
+ Status bs = b_->NewWritableFile(f, &mf->b_, options);
+ assert(as == bs);
+ if (as.ok())
+ r->reset(mf);
+ else
+ delete mf;
+ return as;
+}
+
+Status EnvMirror::ReuseWritableFile(const std::string& fname,
+ const std::string& old_fname,
+ std::unique_ptr<WritableFile>* r,
+ const EnvOptions& options) {
+ if (fname.find("/proc/") == 0)
+ return a_->ReuseWritableFile(fname, old_fname, r, options);
+ WritableFileMirror* mf = new WritableFileMirror(fname, options);
+ Status as = a_->ReuseWritableFile(fname, old_fname, &mf->a_, options);
+ Status bs = b_->ReuseWritableFile(fname, old_fname, &mf->b_, options);
+ assert(as == bs);
+ if (as.ok())
+ r->reset(mf);
+ else
+ delete mf;
+ return as;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif
diff --git a/src/rocksdb/utilities/env_mirror_test.cc b/src/rocksdb/utilities/env_mirror_test.cc
new file mode 100644
index 000000000..c372de1da
--- /dev/null
+++ b/src/rocksdb/utilities/env_mirror_test.cc
@@ -0,0 +1,226 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2015, Red Hat, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/env_mirror.h"
+
+#include "env/mock_env.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class EnvMirrorTest : public testing::Test {
+ public:
+ Env* default_;
+ MockEnv *a_, *b_;
+ EnvMirror* env_;
+ const EnvOptions soptions_;
+
+ EnvMirrorTest()
+ : default_(Env::Default()),
+ a_(new MockEnv(default_)),
+ b_(new MockEnv(default_)),
+ env_(new EnvMirror(a_, b_)) {}
+ ~EnvMirrorTest() {
+ delete env_;
+ delete a_;
+ delete b_;
+ }
+};
+
+TEST_F(EnvMirrorTest, Basics) {
+ uint64_t file_size;
+ std::unique_ptr<WritableFile> writable_file;
+ std::vector<std::string> children;
+
+ ASSERT_OK(env_->CreateDir("/dir"));
+
+ // Check that the directory is empty.
+ ASSERT_EQ(Status::NotFound(), env_->FileExists("/dir/non_existent"));
+ ASSERT_TRUE(!env_->GetFileSize("/dir/non_existent", &file_size).ok());
+ ASSERT_OK(env_->GetChildren("/dir", &children));
+ ASSERT_EQ(0U, children.size());
+
+ // Create a file.
+ ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+ writable_file.reset();
+
+ // Check that the file exists.
+ ASSERT_OK(env_->FileExists("/dir/f"));
+ ASSERT_OK(a_->FileExists("/dir/f"));
+ ASSERT_OK(b_->FileExists("/dir/f"));
+ ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
+ ASSERT_EQ(0U, file_size);
+ ASSERT_OK(env_->GetChildren("/dir", &children));
+ ASSERT_EQ(1U, children.size());
+ ASSERT_EQ("f", children[0]);
+ ASSERT_OK(a_->GetChildren("/dir", &children));
+ ASSERT_EQ(1U, children.size());
+ ASSERT_EQ("f", children[0]);
+ ASSERT_OK(b_->GetChildren("/dir", &children));
+ ASSERT_EQ(1U, children.size());
+ ASSERT_EQ("f", children[0]);
+
+ // Write to the file.
+ ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+ ASSERT_OK(writable_file->Append("abc"));
+ writable_file.reset();
+
+ // Check for expected size.
+ ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
+ ASSERT_EQ(3U, file_size);
+ ASSERT_OK(a_->GetFileSize("/dir/f", &file_size));
+ ASSERT_EQ(3U, file_size);
+ ASSERT_OK(b_->GetFileSize("/dir/f", &file_size));
+ ASSERT_EQ(3U, file_size);
+
+ // Check that renaming works.
+ ASSERT_TRUE(!env_->RenameFile("/dir/non_existent", "/dir/g").ok());
+ ASSERT_OK(env_->RenameFile("/dir/f", "/dir/g"));
+ ASSERT_EQ(Status::NotFound(), env_->FileExists("/dir/f"));
+ ASSERT_OK(env_->FileExists("/dir/g"));
+ ASSERT_OK(env_->GetFileSize("/dir/g", &file_size));
+ ASSERT_EQ(3U, file_size);
+ ASSERT_OK(a_->FileExists("/dir/g"));
+ ASSERT_OK(a_->GetFileSize("/dir/g", &file_size));
+ ASSERT_EQ(3U, file_size);
+ ASSERT_OK(b_->FileExists("/dir/g"));
+ ASSERT_OK(b_->GetFileSize("/dir/g", &file_size));
+ ASSERT_EQ(3U, file_size);
+
+ // Check that opening non-existent file fails.
+ std::unique_ptr<SequentialFile> seq_file;
+ std::unique_ptr<RandomAccessFile> rand_file;
+ ASSERT_TRUE(
+ !env_->NewSequentialFile("/dir/non_existent", &seq_file, soptions_).ok());
+ ASSERT_TRUE(!seq_file);
+ ASSERT_TRUE(
+ !env_->NewRandomAccessFile("/dir/non_existent", &rand_file, soptions_)
+ .ok());
+ ASSERT_TRUE(!rand_file);
+
+ // Check that deleting works.
+ ASSERT_TRUE(!env_->DeleteFile("/dir/non_existent").ok());
+ ASSERT_OK(env_->DeleteFile("/dir/g"));
+ ASSERT_EQ(Status::NotFound(), env_->FileExists("/dir/g"));
+ ASSERT_OK(env_->GetChildren("/dir", &children));
+ ASSERT_EQ(0U, children.size());
+ ASSERT_OK(env_->DeleteDir("/dir"));
+}
+
+TEST_F(EnvMirrorTest, ReadWrite) {
+ std::unique_ptr<WritableFile> writable_file;
+ std::unique_ptr<SequentialFile> seq_file;
+ std::unique_ptr<RandomAccessFile> rand_file;
+ Slice result;
+ char scratch[100];
+
+ ASSERT_OK(env_->CreateDir("/dir"));
+
+ ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+ ASSERT_OK(writable_file->Append("hello "));
+ ASSERT_OK(writable_file->Append("world"));
+ writable_file.reset();
+
+ // Read sequentially.
+ ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_));
+ ASSERT_OK(seq_file->Read(5, &result, scratch)); // Read "hello".
+ ASSERT_EQ(0, result.compare("hello"));
+ ASSERT_OK(seq_file->Skip(1));
+ ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Read "world".
+ ASSERT_EQ(0, result.compare("world"));
+ ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Try reading past EOF.
+ ASSERT_EQ(0U, result.size());
+ ASSERT_OK(seq_file->Skip(100)); // Try to skip past end of file.
+ ASSERT_OK(seq_file->Read(1000, &result, scratch));
+ ASSERT_EQ(0U, result.size());
+
+ // Random reads.
+ ASSERT_OK(env_->NewRandomAccessFile("/dir/f", &rand_file, soptions_));
+ ASSERT_OK(rand_file->Read(6, 5, &result, scratch)); // Read "world".
+ ASSERT_EQ(0, result.compare("world"));
+ ASSERT_OK(rand_file->Read(0, 5, &result, scratch)); // Read "hello".
+ ASSERT_EQ(0, result.compare("hello"));
+ ASSERT_OK(rand_file->Read(10, 100, &result, scratch)); // Read "d".
+ ASSERT_EQ(0, result.compare("d"));
+
+ // Too high offset.
+ ASSERT_TRUE(!rand_file->Read(1000, 5, &result, scratch).ok());
+}
+
+TEST_F(EnvMirrorTest, Locks) {
+ FileLock* lock;
+
+ // These are no-ops, but we test they return success.
+ ASSERT_OK(env_->LockFile("some file", &lock));
+ ASSERT_OK(env_->UnlockFile(lock));
+}
+
+TEST_F(EnvMirrorTest, Misc) {
+ std::string test_dir;
+ ASSERT_OK(env_->GetTestDirectory(&test_dir));
+ ASSERT_TRUE(!test_dir.empty());
+
+ std::unique_ptr<WritableFile> writable_file;
+ ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file, soptions_));
+
+ // These are no-ops, but we test they return success.
+ ASSERT_OK(writable_file->Sync());
+ ASSERT_OK(writable_file->Flush());
+ ASSERT_OK(writable_file->Close());
+ writable_file.reset();
+}
+
+TEST_F(EnvMirrorTest, LargeWrite) {
+ const size_t kWriteSize = 300 * 1024;
+ char* scratch = new char[kWriteSize * 2];
+
+ std::string write_data;
+ for (size_t i = 0; i < kWriteSize; ++i) {
+ write_data.append(1, static_cast<char>(i));
+ }
+
+ std::unique_ptr<WritableFile> writable_file;
+ ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+ ASSERT_OK(writable_file->Append("foo"));
+ ASSERT_OK(writable_file->Append(write_data));
+ writable_file.reset();
+
+ std::unique_ptr<SequentialFile> seq_file;
+ Slice result;
+ ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_));
+ ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo".
+ ASSERT_EQ(0, result.compare("foo"));
+
+ size_t read = 0;
+ std::string read_data;
+ while (read < kWriteSize) {
+ ASSERT_OK(seq_file->Read(kWriteSize - read, &result, scratch));
+ read_data.append(result.data(), result.size());
+ read += result.size();
+ }
+ ASSERT_TRUE(write_data == read_data);
+ delete[] scratch;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int argc, char** argv) {
+ fprintf(stderr, "SKIPPED as EnvMirror is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/env_timed.cc b/src/rocksdb/utilities/env_timed.cc
new file mode 100644
index 000000000..1eb723146
--- /dev/null
+++ b/src/rocksdb/utilities/env_timed.cc
@@ -0,0 +1,187 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#include "utilities/env_timed.h"
+
+#include "env/composite_env_wrapper.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+TimedFileSystem::TimedFileSystem(const std::shared_ptr<FileSystem>& base)
+ : FileSystemWrapper(base) {}
+IOStatus TimedFileSystem::NewSequentialFile(
+ const std::string& fname, const FileOptions& options,
+ std::unique_ptr<FSSequentialFile>* result, IODebugContext* dbg) {
+ PERF_TIMER_GUARD(env_new_sequential_file_nanos);
+ return FileSystemWrapper::NewSequentialFile(fname, options, result, dbg);
+}
+
+IOStatus TimedFileSystem::NewRandomAccessFile(
+ const std::string& fname, const FileOptions& options,
+ std::unique_ptr<FSRandomAccessFile>* result, IODebugContext* dbg) {
+ PERF_TIMER_GUARD(env_new_random_access_file_nanos);
+ return FileSystemWrapper::NewRandomAccessFile(fname, options, result, dbg);
+}
+
+IOStatus TimedFileSystem::NewWritableFile(
+ const std::string& fname, const FileOptions& options,
+ std::unique_ptr<FSWritableFile>* result, IODebugContext* dbg) {
+ PERF_TIMER_GUARD(env_new_writable_file_nanos);
+ return FileSystemWrapper::NewWritableFile(fname, options, result, dbg);
+}
+
+IOStatus TimedFileSystem::ReuseWritableFile(
+ const std::string& fname, const std::string& old_fname,
+ const FileOptions& options, std::unique_ptr<FSWritableFile>* result,
+ IODebugContext* dbg) {
+ PERF_TIMER_GUARD(env_reuse_writable_file_nanos);
+ return FileSystemWrapper::ReuseWritableFile(fname, old_fname, options, result,
+ dbg);
+}
+
+IOStatus TimedFileSystem::NewRandomRWFile(
+ const std::string& fname, const FileOptions& options,
+ std::unique_ptr<FSRandomRWFile>* result, IODebugContext* dbg) {
+ PERF_TIMER_GUARD(env_new_random_rw_file_nanos);
+ return FileSystemWrapper::NewRandomRWFile(fname, options, result, dbg);
+}
+
+IOStatus TimedFileSystem::NewDirectory(const std::string& name,
+ const IOOptions& options,
+ std::unique_ptr<FSDirectory>* result,
+ IODebugContext* dbg) {
+ PERF_TIMER_GUARD(env_new_directory_nanos);
+ return FileSystemWrapper::NewDirectory(name, options, result, dbg);
+}
+
+IOStatus TimedFileSystem::FileExists(const std::string& fname,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ PERF_TIMER_GUARD(env_file_exists_nanos);
+ return FileSystemWrapper::FileExists(fname, options, dbg);
+}
+
+IOStatus TimedFileSystem::GetChildren(const std::string& dir,
+ const IOOptions& options,
+ std::vector<std::string>* result,
+ IODebugContext* dbg) {
+ PERF_TIMER_GUARD(env_get_children_nanos);
+ return FileSystemWrapper::GetChildren(dir, options, result, dbg);
+}
+
+IOStatus TimedFileSystem::GetChildrenFileAttributes(
+ const std::string& dir, const IOOptions& options,
+ std::vector<FileAttributes>* result, IODebugContext* dbg) {
+ PERF_TIMER_GUARD(env_get_children_file_attributes_nanos);
+ return FileSystemWrapper::GetChildrenFileAttributes(dir, options, result,
+ dbg);
+}
+
+IOStatus TimedFileSystem::DeleteFile(const std::string& fname,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ PERF_TIMER_GUARD(env_delete_file_nanos);
+ return FileSystemWrapper::DeleteFile(fname, options, dbg);
+}
+
+IOStatus TimedFileSystem::CreateDir(const std::string& dirname,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ PERF_TIMER_GUARD(env_create_dir_nanos);
+ return FileSystemWrapper::CreateDir(dirname, options, dbg);
+}
+
+IOStatus TimedFileSystem::CreateDirIfMissing(const std::string& dirname,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ PERF_TIMER_GUARD(env_create_dir_if_missing_nanos);
+ return FileSystemWrapper::CreateDirIfMissing(dirname, options, dbg);
+}
+
+IOStatus TimedFileSystem::DeleteDir(const std::string& dirname,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ PERF_TIMER_GUARD(env_delete_dir_nanos);
+ return FileSystemWrapper::DeleteDir(dirname, options, dbg);
+}
+
+IOStatus TimedFileSystem::GetFileSize(const std::string& fname,
+ const IOOptions& options,
+ uint64_t* file_size,
+ IODebugContext* dbg) {
+ PERF_TIMER_GUARD(env_get_file_size_nanos);
+ return FileSystemWrapper::GetFileSize(fname, options, file_size, dbg);
+}
+
+IOStatus TimedFileSystem::GetFileModificationTime(const std::string& fname,
+ const IOOptions& options,
+ uint64_t* file_mtime,
+ IODebugContext* dbg) {
+ PERF_TIMER_GUARD(env_get_file_modification_time_nanos);
+ return FileSystemWrapper::GetFileModificationTime(fname, options, file_mtime,
+ dbg);
+}
+
+IOStatus TimedFileSystem::RenameFile(const std::string& src,
+ const std::string& dst,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ PERF_TIMER_GUARD(env_rename_file_nanos);
+ return FileSystemWrapper::RenameFile(src, dst, options, dbg);
+}
+
+IOStatus TimedFileSystem::LinkFile(const std::string& src,
+ const std::string& dst,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ PERF_TIMER_GUARD(env_link_file_nanos);
+ return FileSystemWrapper::LinkFile(src, dst, options, dbg);
+}
+
+IOStatus TimedFileSystem::LockFile(const std::string& fname,
+ const IOOptions& options, FileLock** lock,
+ IODebugContext* dbg) {
+ PERF_TIMER_GUARD(env_lock_file_nanos);
+ return FileSystemWrapper::LockFile(fname, options, lock, dbg);
+}
+
+IOStatus TimedFileSystem::UnlockFile(FileLock* lock, const IOOptions& options,
+ IODebugContext* dbg) {
+ PERF_TIMER_GUARD(env_unlock_file_nanos);
+ return FileSystemWrapper::UnlockFile(lock, options, dbg);
+}
+
+IOStatus TimedFileSystem::NewLogger(const std::string& fname,
+ const IOOptions& options,
+ std::shared_ptr<Logger>* result,
+ IODebugContext* dbg) {
+ PERF_TIMER_GUARD(env_new_logger_nanos);
+ return FileSystemWrapper::NewLogger(fname, options, result, dbg);
+}
+
+std::shared_ptr<FileSystem> NewTimedFileSystem(
+ const std::shared_ptr<FileSystem>& base) {
+ return std::make_shared<TimedFileSystem>(base);
+}
+
+// An environment that measures function call times for filesystem
+// operations, reporting results to variables in PerfContext.
+Env* NewTimedEnv(Env* base_env) {
+ std::shared_ptr<FileSystem> timed_fs =
+ NewTimedFileSystem(base_env->GetFileSystem());
+ return new CompositeEnvWrapper(base_env, timed_fs);
+}
+
+#else // ROCKSDB_LITE
+
+Env* NewTimedEnv(Env* /*base_env*/) { return nullptr; }
+
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
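A minimal usage sketch for the timed wrappers defined above, assuming only what this file and the accompanying test show: the TimedFileSystem class from utilities/env_timed.h, the env_*_nanos counters in PerfContext, and the need for PerfLevel::kEnableTime (or higher) before the timers record anything. The path "/tmp/timed_fs_demo" is a placeholder.

#include <cstdio>
#include <memory>

#include "rocksdb/file_system.h"
#include "rocksdb/perf_context.h"
#include "rocksdb/perf_level.h"
#include "utilities/env_timed.h"

using namespace ROCKSDB_NAMESPACE;

int main() {
  // Wrap the default FileSystem so each call is timed into PerfContext.
  auto timed_fs = std::make_shared<TimedFileSystem>(FileSystem::Default());

  // Timers are only collected at PerfLevel::kEnableTime or higher.
  SetPerfLevel(PerfLevel::kEnableTime);

  std::unique_ptr<FSWritableFile> file;
  IOStatus s = timed_fs->NewWritableFile("/tmp/timed_fs_demo", FileOptions(),
                                         &file, /*dbg=*/nullptr);
  if (s.ok()) {
    fprintf(stderr, "NewWritableFile took %llu ns\n",
            static_cast<unsigned long long>(
                get_perf_context()->env_new_writable_file_nanos));
  }
  return 0;
}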
diff --git a/src/rocksdb/utilities/env_timed.h b/src/rocksdb/utilities/env_timed.h
new file mode 100644
index 000000000..2d34fd590
--- /dev/null
+++ b/src/rocksdb/utilities/env_timed.h
@@ -0,0 +1,97 @@
+// Copyright (c) 2019-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+#include "rocksdb/file_system.h"
+namespace ROCKSDB_NAMESPACE {
+#ifndef ROCKSDB_LITE
+class TimedFileSystem : public FileSystemWrapper {
+ public:
+ explicit TimedFileSystem(const std::shared_ptr<FileSystem>& base);
+
+ static const char* kClassName() { return "TimedFS"; }
+ const char* Name() const override { return kClassName(); }
+
+ IOStatus NewSequentialFile(const std::string& fname,
+ const FileOptions& options,
+ std::unique_ptr<FSSequentialFile>* result,
+ IODebugContext* dbg) override;
+
+ IOStatus NewRandomAccessFile(const std::string& fname,
+ const FileOptions& options,
+ std::unique_ptr<FSRandomAccessFile>* result,
+ IODebugContext* dbg) override;
+
+ IOStatus NewWritableFile(const std::string& fname, const FileOptions& options,
+ std::unique_ptr<FSWritableFile>* result,
+ IODebugContext* dbg) override;
+
+ IOStatus ReuseWritableFile(const std::string& fname,
+ const std::string& old_fname,
+ const FileOptions& options,
+ std::unique_ptr<FSWritableFile>* result,
+ IODebugContext* dbg) override;
+
+ IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options,
+ std::unique_ptr<FSRandomRWFile>* result,
+ IODebugContext* dbg) override;
+
+ IOStatus NewDirectory(const std::string& name, const IOOptions& options,
+ std::unique_ptr<FSDirectory>* result,
+ IODebugContext* dbg) override;
+
+ IOStatus FileExists(const std::string& fname, const IOOptions& options,
+ IODebugContext* dbg) override;
+
+ IOStatus GetChildren(const std::string& dir, const IOOptions& options,
+ std::vector<std::string>* result,
+ IODebugContext* dbg) override;
+
+ IOStatus GetChildrenFileAttributes(const std::string& dir,
+ const IOOptions& options,
+ std::vector<FileAttributes>* result,
+ IODebugContext* dbg) override;
+
+ IOStatus DeleteFile(const std::string& fname, const IOOptions& options,
+ IODebugContext* dbg) override;
+
+ IOStatus CreateDir(const std::string& dirname, const IOOptions& options,
+ IODebugContext* dbg) override;
+
+ IOStatus CreateDirIfMissing(const std::string& dirname,
+ const IOOptions& options,
+ IODebugContext* dbg) override;
+
+ IOStatus DeleteDir(const std::string& dirname, const IOOptions& options,
+ IODebugContext* dbg) override;
+
+ IOStatus GetFileSize(const std::string& fname, const IOOptions& options,
+ uint64_t* file_size, IODebugContext* dbg) override;
+
+ IOStatus GetFileModificationTime(const std::string& fname,
+ const IOOptions& options,
+ uint64_t* file_mtime,
+ IODebugContext* dbg) override;
+
+ IOStatus RenameFile(const std::string& src, const std::string& dst,
+ const IOOptions& options, IODebugContext* dbg) override;
+
+ IOStatus LinkFile(const std::string& src, const std::string& dst,
+ const IOOptions& options, IODebugContext* dbg) override;
+
+ IOStatus LockFile(const std::string& fname, const IOOptions& options,
+ FileLock** lock, IODebugContext* dbg) override;
+
+ IOStatus UnlockFile(FileLock* lock, const IOOptions& options,
+ IODebugContext* dbg) override;
+
+ IOStatus NewLogger(const std::string& fname, const IOOptions& options,
+ std::shared_ptr<Logger>* result,
+ IODebugContext* dbg) override;
+};
+
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/env_timed_test.cc b/src/rocksdb/utilities/env_timed_test.cc
new file mode 100644
index 000000000..6e392579d
--- /dev/null
+++ b/src/rocksdb/utilities/env_timed_test.cc
@@ -0,0 +1,44 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/env.h"
+#include "rocksdb/perf_context.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TimedEnvTest : public testing::Test {};
+
+TEST_F(TimedEnvTest, BasicTest) {
+ SetPerfLevel(PerfLevel::kEnableTime);
+ ASSERT_EQ(0, get_perf_context()->env_new_writable_file_nanos);
+
+ std::unique_ptr<Env> mem_env(NewMemEnv(Env::Default()));
+ std::unique_ptr<Env> timed_env(NewTimedEnv(mem_env.get()));
+ std::unique_ptr<WritableFile> writable_file;
+ ASSERT_OK(timed_env->NewWritableFile("f", &writable_file, EnvOptions()));
+
+ ASSERT_GT(get_perf_context()->env_new_writable_file_nanos, 0);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else // ROCKSDB_LITE
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as TimedEnv is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/fault_injection_env.cc b/src/rocksdb/utilities/fault_injection_env.cc
new file mode 100644
index 000000000..b0495a8c1
--- /dev/null
+++ b/src/rocksdb/utilities/fault_injection_env.cc
@@ -0,0 +1,555 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// This test uses a custom Env to keep track of the state of a filesystem as of
+// the last "sync". It then checks for data loss errors by purposely dropping
+// file data (or entire files) not protected by a "sync".
+
+#include "utilities/fault_injection_env.h"
+
+#include <functional>
+#include <utility>
+
+#include "util/random.h"
+namespace ROCKSDB_NAMESPACE {
+
+// Assume a filename, and not a directory name like "/foo/bar/"
+std::string GetDirName(const std::string filename) {
+ size_t found = filename.find_last_of("/\\");
+ if (found == std::string::npos) {
+ return "";
+ } else {
+ return filename.substr(0, found);
+ }
+}
+
+// A basic file truncation function suitable for this test.
+Status Truncate(Env* env, const std::string& filename, uint64_t length) {
+ std::unique_ptr<SequentialFile> orig_file;
+ const EnvOptions options;
+ Status s = env->NewSequentialFile(filename, &orig_file, options);
+ if (!s.ok()) {
+ fprintf(stderr, "Cannot open file %s for truncation: %s\n",
+ filename.c_str(), s.ToString().c_str());
+ return s;
+ }
+
+ std::unique_ptr<char[]> scratch(new char[length]);
+ ROCKSDB_NAMESPACE::Slice result;
+ s = orig_file->Read(length, &result, scratch.get());
+#ifdef OS_WIN
+ orig_file.reset();
+#endif
+ if (s.ok()) {
+ std::string tmp_name = GetDirName(filename) + "/truncate.tmp";
+ std::unique_ptr<WritableFile> tmp_file;
+ s = env->NewWritableFile(tmp_name, &tmp_file, options);
+ if (s.ok()) {
+ s = tmp_file->Append(result);
+ if (s.ok()) {
+ s = env->RenameFile(tmp_name, filename);
+ } else {
+ fprintf(stderr, "Cannot rename file %s to %s: %s\n", tmp_name.c_str(),
+ filename.c_str(), s.ToString().c_str());
+ env->DeleteFile(tmp_name);
+ }
+ }
+ }
+ if (!s.ok()) {
+ fprintf(stderr, "Cannot truncate file %s: %s\n", filename.c_str(),
+ s.ToString().c_str());
+ }
+
+ return s;
+}
+
+// Trim the trailing "/" at the end of `str`
+std::string TrimDirname(const std::string& str) {
+ size_t found = str.find_last_not_of("/");
+ if (found == std::string::npos) {
+ return str;
+ }
+ return str.substr(0, found + 1);
+}
+
+// Return pair <parent directory name, file name> of a full path.
+std::pair<std::string, std::string> GetDirAndName(const std::string& name) {
+ std::string dirname = GetDirName(name);
+ std::string fname = name.substr(dirname.size() + 1);
+ return std::make_pair(dirname, fname);
+}
+
+Status FileState::DropUnsyncedData(Env* env) const {
+ ssize_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_;
+ return Truncate(env, filename_, sync_pos);
+}
+
+Status FileState::DropRandomUnsyncedData(Env* env, Random* rand) const {
+ ssize_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_;
+ assert(pos_ >= sync_pos);
+ int range = static_cast<int>(pos_ - sync_pos);
+ uint64_t truncated_size =
+ static_cast<uint64_t>(sync_pos) + rand->Uniform(range);
+ return Truncate(env, filename_, truncated_size);
+}
+
+Status TestDirectory::Fsync() {
+ if (!env_->IsFilesystemActive()) {
+ return env_->GetError();
+ }
+ env_->SyncDir(dirname_);
+ return dir_->Fsync();
+}
+
+Status TestDirectory::Close() {
+ if (!env_->IsFilesystemActive()) {
+ return env_->GetError();
+ }
+ return dir_->Close();
+}
+
+TestRandomAccessFile::TestRandomAccessFile(
+ std::unique_ptr<RandomAccessFile>&& target, FaultInjectionTestEnv* env)
+ : target_(std::move(target)), env_(env) {
+ assert(target_);
+ assert(env_);
+}
+
+Status TestRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const {
+ assert(env_);
+ if (!env_->IsFilesystemActive()) {
+ return env_->GetError();
+ }
+
+ assert(target_);
+ return target_->Read(offset, n, result, scratch);
+}
+
+Status TestRandomAccessFile::Prefetch(uint64_t offset, size_t n) {
+ assert(env_);
+ if (!env_->IsFilesystemActive()) {
+ return env_->GetError();
+ }
+
+ assert(target_);
+ return target_->Prefetch(offset, n);
+}
+
+Status TestRandomAccessFile::MultiRead(ReadRequest* reqs, size_t num_reqs) {
+ assert(env_);
+ if (!env_->IsFilesystemActive()) {
+ const Status s = env_->GetError();
+
+ assert(reqs);
+ for (size_t i = 0; i < num_reqs; ++i) {
+ reqs[i].status = s;
+ }
+
+ return s;
+ }
+
+ assert(target_);
+ return target_->MultiRead(reqs, num_reqs);
+}
+
+TestWritableFile::TestWritableFile(const std::string& fname,
+ std::unique_ptr<WritableFile>&& f,
+ FaultInjectionTestEnv* env)
+ : state_(fname),
+ target_(std::move(f)),
+ writable_file_opened_(true),
+ env_(env) {
+ assert(target_ != nullptr);
+ state_.pos_ = 0;
+}
+
+TestWritableFile::~TestWritableFile() {
+ if (writable_file_opened_) {
+ Close().PermitUncheckedError();
+ }
+}
+
+Status TestWritableFile::Append(const Slice& data) {
+ if (!env_->IsFilesystemActive()) {
+ return env_->GetError();
+ }
+ Status s = target_->Append(data);
+ if (s.ok()) {
+ state_.pos_ += data.size();
+ env_->WritableFileAppended(state_);
+ }
+ return s;
+}
+
+Status TestWritableFile::Close() {
+ writable_file_opened_ = false;
+ Status s = target_->Close();
+ if (s.ok()) {
+ env_->WritableFileClosed(state_);
+ }
+ return s;
+}
+
+Status TestWritableFile::Flush() {
+ Status s = target_->Flush();
+ if (s.ok() && env_->IsFilesystemActive()) {
+ state_.pos_at_last_flush_ = state_.pos_;
+ }
+ return s;
+}
+
+Status TestWritableFile::Sync() {
+ if (!env_->IsFilesystemActive()) {
+ return Status::IOError("FaultInjectionTestEnv: not active");
+ }
+  // No need to actually sync.
+ state_.pos_at_last_sync_ = state_.pos_;
+ env_->WritableFileSynced(state_);
+ return Status::OK();
+}
+
+TestRandomRWFile::TestRandomRWFile(const std::string& /*fname*/,
+ std::unique_ptr<RandomRWFile>&& f,
+ FaultInjectionTestEnv* env)
+ : target_(std::move(f)), file_opened_(true), env_(env) {
+ assert(target_ != nullptr);
+}
+
+TestRandomRWFile::~TestRandomRWFile() {
+ if (file_opened_) {
+ Close().PermitUncheckedError();
+ }
+}
+
+Status TestRandomRWFile::Write(uint64_t offset, const Slice& data) {
+ if (!env_->IsFilesystemActive()) {
+ return env_->GetError();
+ }
+ return target_->Write(offset, data);
+}
+
+Status TestRandomRWFile::Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const {
+ if (!env_->IsFilesystemActive()) {
+ return env_->GetError();
+ }
+ return target_->Read(offset, n, result, scratch);
+}
+
+Status TestRandomRWFile::Close() {
+ file_opened_ = false;
+ return target_->Close();
+}
+
+Status TestRandomRWFile::Flush() {
+ if (!env_->IsFilesystemActive()) {
+ return env_->GetError();
+ }
+ return target_->Flush();
+}
+
+Status TestRandomRWFile::Sync() {
+ if (!env_->IsFilesystemActive()) {
+ return env_->GetError();
+ }
+ return target_->Sync();
+}
+
+Status FaultInjectionTestEnv::NewDirectory(const std::string& name,
+ std::unique_ptr<Directory>* result) {
+ std::unique_ptr<Directory> r;
+ Status s = target()->NewDirectory(name, &r);
+ assert(s.ok());
+ if (!s.ok()) {
+ return s;
+ }
+ result->reset(new TestDirectory(this, TrimDirname(name), r.release()));
+ return Status::OK();
+}
+
+Status FaultInjectionTestEnv::NewWritableFile(
+ const std::string& fname, std::unique_ptr<WritableFile>* result,
+ const EnvOptions& soptions) {
+ if (!IsFilesystemActive()) {
+ return GetError();
+ }
+  // Do not allow overwriting files.
+ Status s = target()->FileExists(fname);
+ if (s.ok()) {
+ return Status::Corruption("File already exists.");
+ } else if (!s.IsNotFound()) {
+ assert(s.IsIOError());
+ return s;
+ }
+ s = target()->NewWritableFile(fname, result, soptions);
+ if (s.ok()) {
+ result->reset(new TestWritableFile(fname, std::move(*result), this));
+    // If the WritableFileWriter* file is opened
+    // again, it will be truncated - so forget our saved state.
+ UntrackFile(fname);
+ MutexLock l(&mutex_);
+ open_managed_files_.insert(fname);
+ auto dir_and_name = GetDirAndName(fname);
+ auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first];
+ list.insert(dir_and_name.second);
+ }
+ return s;
+}
+
+Status FaultInjectionTestEnv::ReopenWritableFile(
+ const std::string& fname, std::unique_ptr<WritableFile>* result,
+ const EnvOptions& soptions) {
+ if (!IsFilesystemActive()) {
+ return GetError();
+ }
+
+ bool exists;
+ Status s, exists_s = target()->FileExists(fname);
+ if (exists_s.IsNotFound()) {
+ exists = false;
+ } else if (exists_s.ok()) {
+ exists = true;
+ } else {
+ s = exists_s;
+ exists = false;
+ }
+
+ if (s.ok()) {
+ s = target()->ReopenWritableFile(fname, result, soptions);
+ }
+
+ // Only track files we created. Files created outside of this
+ // `FaultInjectionTestEnv` are not eligible for tracking/data dropping
+ // (for example, they may contain data a previous db_stress run expects to
+ // be recovered). This could be extended to track/drop data appended once
+ // the file is under `FaultInjectionTestEnv`'s control.
+ if (s.ok()) {
+ bool should_track;
+ {
+ MutexLock l(&mutex_);
+ if (db_file_state_.find(fname) != db_file_state_.end()) {
+ // It was written by this `Env` earlier.
+ assert(exists);
+ should_track = true;
+ } else if (!exists) {
+ // It was created by this `Env` just now.
+ should_track = true;
+ open_managed_files_.insert(fname);
+ auto dir_and_name = GetDirAndName(fname);
+ auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first];
+ list.insert(dir_and_name.second);
+ } else {
+ should_track = false;
+ }
+ }
+ if (should_track) {
+ result->reset(new TestWritableFile(fname, std::move(*result), this));
+ }
+ }
+ return s;
+}
+
+Status FaultInjectionTestEnv::NewRandomRWFile(
+ const std::string& fname, std::unique_ptr<RandomRWFile>* result,
+ const EnvOptions& soptions) {
+ if (!IsFilesystemActive()) {
+ return GetError();
+ }
+ Status s = target()->NewRandomRWFile(fname, result, soptions);
+ if (s.ok()) {
+ result->reset(new TestRandomRWFile(fname, std::move(*result), this));
+    // If the WritableFileWriter* file is opened
+    // again, it will be truncated - so forget our saved state.
+ UntrackFile(fname);
+ MutexLock l(&mutex_);
+ open_managed_files_.insert(fname);
+ auto dir_and_name = GetDirAndName(fname);
+ auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first];
+ list.insert(dir_and_name.second);
+ }
+ return s;
+}
+
+Status FaultInjectionTestEnv::NewRandomAccessFile(
+ const std::string& fname, std::unique_ptr<RandomAccessFile>* result,
+ const EnvOptions& soptions) {
+ if (!IsFilesystemActive()) {
+ return GetError();
+ }
+
+ assert(target());
+ const Status s = target()->NewRandomAccessFile(fname, result, soptions);
+ if (!s.ok()) {
+ return s;
+ }
+
+ assert(result);
+ result->reset(new TestRandomAccessFile(std::move(*result), this));
+
+ return Status::OK();
+}
+
+Status FaultInjectionTestEnv::DeleteFile(const std::string& f) {
+ if (!IsFilesystemActive()) {
+ return GetError();
+ }
+ Status s = EnvWrapper::DeleteFile(f);
+ if (s.ok()) {
+ UntrackFile(f);
+ }
+ return s;
+}
+
+Status FaultInjectionTestEnv::RenameFile(const std::string& s,
+ const std::string& t) {
+ if (!IsFilesystemActive()) {
+ return GetError();
+ }
+ Status ret = EnvWrapper::RenameFile(s, t);
+
+ if (ret.ok()) {
+ MutexLock l(&mutex_);
+ if (db_file_state_.find(s) != db_file_state_.end()) {
+ db_file_state_[t] = db_file_state_[s];
+ db_file_state_.erase(s);
+ }
+
+ auto sdn = GetDirAndName(s);
+ auto tdn = GetDirAndName(t);
+ if (dir_to_new_files_since_last_sync_[sdn.first].erase(sdn.second) != 0) {
+ auto& tlist = dir_to_new_files_since_last_sync_[tdn.first];
+ assert(tlist.find(tdn.second) == tlist.end());
+ tlist.insert(tdn.second);
+ }
+ }
+
+ return ret;
+}
+
+Status FaultInjectionTestEnv::LinkFile(const std::string& s,
+ const std::string& t) {
+ if (!IsFilesystemActive()) {
+ return GetError();
+ }
+ Status ret = EnvWrapper::LinkFile(s, t);
+
+ if (ret.ok()) {
+ MutexLock l(&mutex_);
+ if (db_file_state_.find(s) != db_file_state_.end()) {
+ db_file_state_[t] = db_file_state_[s];
+ }
+
+ auto sdn = GetDirAndName(s);
+ auto tdn = GetDirAndName(t);
+ if (dir_to_new_files_since_last_sync_[sdn.first].find(sdn.second) !=
+ dir_to_new_files_since_last_sync_[sdn.first].end()) {
+ auto& tlist = dir_to_new_files_since_last_sync_[tdn.first];
+ assert(tlist.find(tdn.second) == tlist.end());
+ tlist.insert(tdn.second);
+ }
+ }
+
+ return ret;
+}
+
+void FaultInjectionTestEnv::WritableFileClosed(const FileState& state) {
+ MutexLock l(&mutex_);
+ if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) {
+ db_file_state_[state.filename_] = state;
+ open_managed_files_.erase(state.filename_);
+ }
+}
+
+void FaultInjectionTestEnv::WritableFileSynced(const FileState& state) {
+ MutexLock l(&mutex_);
+ if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) {
+ if (db_file_state_.find(state.filename_) == db_file_state_.end()) {
+ db_file_state_.insert(std::make_pair(state.filename_, state));
+ } else {
+ db_file_state_[state.filename_] = state;
+ }
+ }
+}
+
+void FaultInjectionTestEnv::WritableFileAppended(const FileState& state) {
+ MutexLock l(&mutex_);
+ if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) {
+ if (db_file_state_.find(state.filename_) == db_file_state_.end()) {
+ db_file_state_.insert(std::make_pair(state.filename_, state));
+ } else {
+ db_file_state_[state.filename_] = state;
+ }
+ }
+}
+
+// For every file that is not fully synced, make a call to `func` with
+// FileState of the file as the parameter.
+Status FaultInjectionTestEnv::DropFileData(
+ std::function<Status(Env*, FileState)> func) {
+ Status s;
+ MutexLock l(&mutex_);
+ for (std::map<std::string, FileState>::const_iterator it =
+ db_file_state_.begin();
+ s.ok() && it != db_file_state_.end(); ++it) {
+ const FileState& state = it->second;
+ if (!state.IsFullySynced()) {
+ s = func(target(), state);
+ }
+ }
+ return s;
+}
+
+Status FaultInjectionTestEnv::DropUnsyncedFileData() {
+ return DropFileData([&](Env* env, const FileState& state) {
+ return state.DropUnsyncedData(env);
+ });
+}
+
+Status FaultInjectionTestEnv::DropRandomUnsyncedFileData(Random* rnd) {
+ return DropFileData([&](Env* env, const FileState& state) {
+ return state.DropRandomUnsyncedData(env, rnd);
+ });
+}
+
+Status FaultInjectionTestEnv::DeleteFilesCreatedAfterLastDirSync() {
+  // Because DeleteFile accesses this container, make a copy to avoid deadlock
+ std::map<std::string, std::set<std::string>> map_copy;
+ {
+ MutexLock l(&mutex_);
+ map_copy.insert(dir_to_new_files_since_last_sync_.begin(),
+ dir_to_new_files_since_last_sync_.end());
+ }
+
+ for (auto& pair : map_copy) {
+ for (std::string name : pair.second) {
+ Status s = DeleteFile(pair.first + "/" + name);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ }
+ return Status::OK();
+}
+void FaultInjectionTestEnv::ResetState() {
+ MutexLock l(&mutex_);
+ db_file_state_.clear();
+ dir_to_new_files_since_last_sync_.clear();
+ SetFilesystemActiveNoLock(true);
+}
+
+void FaultInjectionTestEnv::UntrackFile(const std::string& f) {
+ MutexLock l(&mutex_);
+ auto dir_and_name = GetDirAndName(f);
+ dir_to_new_files_since_last_sync_[dir_and_name.first].erase(
+ dir_and_name.second);
+ db_file_state_.erase(f);
+ open_managed_files_.erase(f);
+}
+} // namespace ROCKSDB_NAMESPACE
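The DropFileData() entry point above takes an arbitrary callback, so tests are not limited to the two canned policies (DropUnsyncedFileData / DropRandomUnsyncedFileData). A minimal sketch, assuming only the public FileState fields declared in fault_injection_env.h; ReportUnsyncedFiles is a hypothetical helper name used for illustration.

#include <cstdio>

#include "rocksdb/env.h"
#include "rocksdb/status.h"
#include "utilities/fault_injection_env.h"

using namespace ROCKSDB_NAMESPACE;

// Print every tracked file that still has unsynced bytes; DropFileData()
// only invokes the callback for files that are not fully synced.
Status ReportUnsyncedFiles(FaultInjectionTestEnv* fault_env) {
  return fault_env->DropFileData([](Env* /*env*/, const FileState& state) {
    fprintf(stderr, "%s: wrote %zd bytes, synced up to %zd\n",
            state.filename_.c_str(), state.pos_, state.pos_at_last_sync_);
    return Status::OK();
  });
}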
diff --git a/src/rocksdb/utilities/fault_injection_env.h b/src/rocksdb/utilities/fault_injection_env.h
new file mode 100644
index 000000000..549bfe716
--- /dev/null
+++ b/src/rocksdb/utilities/fault_injection_env.h
@@ -0,0 +1,258 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// This test uses a custom Env to keep track of the state of a filesystem as of
+// the last "sync". It then checks for data loss errors by purposely dropping
+// file data (or entire files) not protected by a "sync".
+
+#pragma once
+
+#include <map>
+#include <set>
+#include <string>
+
+#include "file/filename.h"
+#include "rocksdb/env.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Random;
+class TestWritableFile;
+class FaultInjectionTestEnv;
+
+struct FileState {
+ std::string filename_;
+ ssize_t pos_;
+ ssize_t pos_at_last_sync_;
+ ssize_t pos_at_last_flush_;
+
+ explicit FileState(const std::string& filename)
+ : filename_(filename),
+ pos_(-1),
+ pos_at_last_sync_(-1),
+ pos_at_last_flush_(-1) {}
+
+ FileState() : pos_(-1), pos_at_last_sync_(-1), pos_at_last_flush_(-1) {}
+
+ bool IsFullySynced() const { return pos_ <= 0 || pos_ == pos_at_last_sync_; }
+
+ Status DropUnsyncedData(Env* env) const;
+
+ Status DropRandomUnsyncedData(Env* env, Random* rand) const;
+};
+
+class TestRandomAccessFile : public RandomAccessFile {
+ public:
+ TestRandomAccessFile(std::unique_ptr<RandomAccessFile>&& target,
+ FaultInjectionTestEnv* env);
+
+ Status Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const override;
+
+ Status Prefetch(uint64_t offset, size_t n) override;
+
+ Status MultiRead(ReadRequest* reqs, size_t num_reqs) override;
+
+ private:
+ std::unique_ptr<RandomAccessFile> target_;
+ FaultInjectionTestEnv* env_;
+};
+
+// A wrapper around WritableFile that records how much of the file
+// is written to or sync'ed.
+class TestWritableFile : public WritableFile {
+ public:
+ explicit TestWritableFile(const std::string& fname,
+ std::unique_ptr<WritableFile>&& f,
+ FaultInjectionTestEnv* env);
+ virtual ~TestWritableFile();
+ virtual Status Append(const Slice& data) override;
+ virtual Status Append(
+ const Slice& data,
+ const DataVerificationInfo& /*verification_info*/) override {
+ return Append(data);
+ }
+ virtual Status Truncate(uint64_t size) override {
+ return target_->Truncate(size);
+ }
+ virtual Status Close() override;
+ virtual Status Flush() override;
+ virtual Status Sync() override;
+ virtual bool IsSyncThreadSafe() const override { return true; }
+ virtual Status PositionedAppend(const Slice& data, uint64_t offset) override {
+ return target_->PositionedAppend(data, offset);
+ }
+ virtual Status PositionedAppend(
+ const Slice& data, uint64_t offset,
+ const DataVerificationInfo& /*verification_info*/) override {
+ return PositionedAppend(data, offset);
+ }
+ virtual bool use_direct_io() const override {
+ return target_->use_direct_io();
+ };
+
+ private:
+ FileState state_;
+ std::unique_ptr<WritableFile> target_;
+ bool writable_file_opened_;
+ FaultInjectionTestEnv* env_;
+};
+
+// A wrapper around RandomRWFile that checks the fault injection state
+// before the file is written to or sync'ed.
+class TestRandomRWFile : public RandomRWFile {
+ public:
+ explicit TestRandomRWFile(const std::string& fname,
+ std::unique_ptr<RandomRWFile>&& f,
+ FaultInjectionTestEnv* env);
+ virtual ~TestRandomRWFile();
+ Status Write(uint64_t offset, const Slice& data) override;
+ Status Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const override;
+ Status Close() override;
+ Status Flush() override;
+ Status Sync() override;
+ size_t GetRequiredBufferAlignment() const override {
+ return target_->GetRequiredBufferAlignment();
+ }
+ bool use_direct_io() const override { return target_->use_direct_io(); };
+
+ private:
+ std::unique_ptr<RandomRWFile> target_;
+ bool file_opened_;
+ FaultInjectionTestEnv* env_;
+};
+
+class TestDirectory : public Directory {
+ public:
+ explicit TestDirectory(FaultInjectionTestEnv* env, std::string dirname,
+ Directory* dir)
+ : env_(env), dirname_(dirname), dir_(dir) {}
+ ~TestDirectory() {}
+
+ virtual Status Fsync() override;
+ virtual Status Close() override;
+
+ private:
+ FaultInjectionTestEnv* env_;
+ std::string dirname_;
+ std::unique_ptr<Directory> dir_;
+};
+
+class FaultInjectionTestEnv : public EnvWrapper {
+ public:
+ explicit FaultInjectionTestEnv(Env* base)
+ : EnvWrapper(base), filesystem_active_(true) {}
+ virtual ~FaultInjectionTestEnv() { error_.PermitUncheckedError(); }
+
+ static const char* kClassName() { return "FaultInjectionTestEnv"; }
+ const char* Name() const override { return kClassName(); }
+
+ Status NewDirectory(const std::string& name,
+ std::unique_ptr<Directory>* result) override;
+
+ Status NewWritableFile(const std::string& fname,
+ std::unique_ptr<WritableFile>* result,
+ const EnvOptions& soptions) override;
+
+ Status ReopenWritableFile(const std::string& fname,
+ std::unique_ptr<WritableFile>* result,
+ const EnvOptions& soptions) override;
+
+ Status NewRandomRWFile(const std::string& fname,
+ std::unique_ptr<RandomRWFile>* result,
+ const EnvOptions& soptions) override;
+
+ Status NewRandomAccessFile(const std::string& fname,
+ std::unique_ptr<RandomAccessFile>* result,
+ const EnvOptions& soptions) override;
+
+ virtual Status DeleteFile(const std::string& f) override;
+
+ virtual Status RenameFile(const std::string& s,
+ const std::string& t) override;
+
+ virtual Status LinkFile(const std::string& s, const std::string& t) override;
+
+// Undef to eliminate clash on Windows
+#undef GetFreeSpace
+ virtual Status GetFreeSpace(const std::string& path,
+ uint64_t* disk_free) override {
+ if (!IsFilesystemActive() &&
+ error_.subcode() == IOStatus::SubCode::kNoSpace) {
+ *disk_free = 0;
+ return Status::OK();
+ } else {
+ return target()->GetFreeSpace(path, disk_free);
+ }
+ }
+
+ void WritableFileClosed(const FileState& state);
+
+ void WritableFileSynced(const FileState& state);
+
+ void WritableFileAppended(const FileState& state);
+
+ // For every file that is not fully synced, make a call to `func` with
+ // FileState of the file as the parameter.
+ Status DropFileData(std::function<Status(Env*, FileState)> func);
+
+ Status DropUnsyncedFileData();
+
+ Status DropRandomUnsyncedFileData(Random* rnd);
+
+ Status DeleteFilesCreatedAfterLastDirSync();
+
+ void ResetState();
+
+ void UntrackFile(const std::string& f);
+
+ void SyncDir(const std::string& dirname) {
+ MutexLock l(&mutex_);
+ dir_to_new_files_since_last_sync_.erase(dirname);
+ }
+
+  // Setting the filesystem to inactive is the test equivalent of simulating a
+  // system reset. Setting it to inactive freezes our saved filesystem state so
+  // that it stops being recorded. It can then be reset back to the state at
+  // the time of the simulated reset.
+ bool IsFilesystemActive() {
+ MutexLock l(&mutex_);
+ return filesystem_active_;
+ }
+ void SetFilesystemActiveNoLock(
+ bool active, Status error = Status::Corruption("Not active")) {
+ error.PermitUncheckedError();
+ filesystem_active_ = active;
+ if (!active) {
+ error_ = error;
+ }
+ error.PermitUncheckedError();
+ }
+ void SetFilesystemActive(bool active,
+ Status error = Status::Corruption("Not active")) {
+ error.PermitUncheckedError();
+ MutexLock l(&mutex_);
+ SetFilesystemActiveNoLock(active, error);
+ error.PermitUncheckedError();
+ }
+ void AssertNoOpenFile() { assert(open_managed_files_.empty()); }
+ Status GetError() { return error_; }
+
+ private:
+ port::Mutex mutex_;
+ std::map<std::string, FileState> db_file_state_;
+ std::set<std::string> open_managed_files_;
+ std::unordered_map<std::string, std::set<std::string>>
+ dir_to_new_files_since_last_sync_;
+ bool filesystem_active_; // Record flushes, syncs, writes
+ Status error_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
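A minimal end-to-end sketch of the crash-test workflow the class above supports, assuming only the public API declared in this header plus the base Env interface (GetTestDirectory, NewWritableFile, Append); the file name "unsynced.dat" under the test directory is a placeholder.

#include <memory>
#include <string>

#include "rocksdb/env.h"
#include "utilities/fault_injection_env.h"

using namespace ROCKSDB_NAMESPACE;

int main() {
  FaultInjectionTestEnv fault_env(Env::Default());

  std::string dir;
  Status s = fault_env.GetTestDirectory(&dir);

  // Write through the wrapper; the data is tracked but never Sync()'ed.
  std::unique_ptr<WritableFile> file;
  if (s.ok()) {
    s = fault_env.NewWritableFile(dir + "/unsynced.dat", &file, EnvOptions());
  }
  if (s.ok()) {
    s = file->Append("data that was never synced");
    file.reset();  // TestWritableFile's destructor Close()s the file
  }

  // Simulate a system reset: freeze the recorded state, drop whatever was not
  // protected by a sync, then re-arm the Env for the next round of the test.
  fault_env.SetFilesystemActive(false);
  s = fault_env.DropUnsyncedFileData();
  fault_env.ResetState();
  return s.ok() ? 0 : 1;
}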
diff --git a/src/rocksdb/utilities/fault_injection_fs.cc b/src/rocksdb/utilities/fault_injection_fs.cc
new file mode 100644
index 000000000..549051856
--- /dev/null
+++ b/src/rocksdb/utilities/fault_injection_fs.cc
@@ -0,0 +1,1032 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// This test uses a custom FileSystem to keep track of the state of a file
+// system as of the last "Sync". The data being written is cached in a "buffer";
+// only when "Sync" is called does the data become persistent. It can simulate
+// the loss of file data (or of entire files) not protected by a "Sync". For
+// any FileSystem-related operation, a specific error can be returned by
+// specifying the "IOStatus Error" while the file system is not activated.
+
+#include "utilities/fault_injection_fs.h"
+
+#include <algorithm>
+#include <functional>
+#include <utility>
+
+#include "env/composite_env_wrapper.h"
+#include "port/lang.h"
+#include "port/stack_trace.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "util/xxhash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const std::string kNewFileNoOverwrite = "";
+
+// Assume a filename, and not a directory name like "/foo/bar/"
+std::string TestFSGetDirName(const std::string filename) {
+ size_t found = filename.find_last_of("/\\");
+ if (found == std::string::npos) {
+ return "";
+ } else {
+ return filename.substr(0, found);
+ }
+}
+
+// Trim the trailing "/" at the end of `str`
+std::string TestFSTrimDirname(const std::string& str) {
+ size_t found = str.find_last_not_of("/");
+ if (found == std::string::npos) {
+ return str;
+ }
+ return str.substr(0, found + 1);
+}
+
+// Return pair <parent directory name, file name> of a full path.
+std::pair<std::string, std::string> TestFSGetDirAndName(
+ const std::string& name) {
+ std::string dirname = TestFSGetDirName(name);
+ std::string fname = name.substr(dirname.size() + 1);
+ return std::make_pair(dirname, fname);
+}
+
+// Calculate the checksum of the data with the corresponding checksum
+// type. If the checksum type is not recognized, no checksum is returned.
+void CalculateTypedChecksum(const ChecksumType& checksum_type, const char* data,
+ size_t size, std::string* checksum) {
+ if (checksum_type == ChecksumType::kCRC32c) {
+ uint32_t v_crc32c = crc32c::Extend(0, data, size);
+ PutFixed32(checksum, v_crc32c);
+ return;
+ } else if (checksum_type == ChecksumType::kxxHash) {
+ uint32_t v = XXH32(data, size, 0);
+ PutFixed32(checksum, v);
+ }
+ return;
+}
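// A minimal sketch of how the checksum handoff verified in Append() below is
// exercised from the caller's side, assuming a TestFSWritableFile* `file`
// obtained from FaultInjectionTestFS and a handoff checksum type of kCRC32c
// configured on that FS: the caller computes the checksum of the payload with
// CalculateTypedChecksum() above and passes it through DataVerificationInfo,
// and Append() re-computes and compares it.
//
//   Slice payload("some record");
//   std::string checksum;
//   CalculateTypedChecksum(ChecksumType::kCRC32c, payload.data(),
//                          payload.size(), &checksum);
//   DataVerificationInfo v_info;
//   v_info.checksum = Slice(checksum);
//   IOStatus io_s = file->Append(payload, IOOptions(), v_info, nullptr);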
+
+IOStatus FSFileState::DropUnsyncedData() {
+ buffer_.resize(0);
+ return IOStatus::OK();
+}
+
+IOStatus FSFileState::DropRandomUnsyncedData(Random* rand) {
+ int range = static_cast<int>(buffer_.size());
+ size_t truncated_size = static_cast<size_t>(rand->Uniform(range));
+ buffer_.resize(truncated_size);
+ return IOStatus::OK();
+}
+
+IOStatus TestFSDirectory::Fsync(const IOOptions& options, IODebugContext* dbg) {
+ if (!fs_->IsFilesystemActive()) {
+ return fs_->GetError();
+ }
+ {
+ IOStatus in_s = fs_->InjectMetadataWriteError();
+ if (!in_s.ok()) {
+ return in_s;
+ }
+ }
+ fs_->SyncDir(dirname_);
+ IOStatus s = dir_->Fsync(options, dbg);
+ {
+ IOStatus in_s = fs_->InjectMetadataWriteError();
+ if (!in_s.ok()) {
+ return in_s;
+ }
+ }
+ return s;
+}
+
+IOStatus TestFSDirectory::Close(const IOOptions& options, IODebugContext* dbg) {
+ if (!fs_->IsFilesystemActive()) {
+ return fs_->GetError();
+ }
+ IOStatus s = dir_->Close(options, dbg);
+ return s;
+}
+
+IOStatus TestFSDirectory::FsyncWithDirOptions(
+ const IOOptions& options, IODebugContext* dbg,
+ const DirFsyncOptions& dir_fsync_options) {
+ if (!fs_->IsFilesystemActive()) {
+ return fs_->GetError();
+ }
+ {
+ IOStatus in_s = fs_->InjectMetadataWriteError();
+ if (!in_s.ok()) {
+ return in_s;
+ }
+ }
+ fs_->SyncDir(dirname_);
+ IOStatus s = dir_->FsyncWithDirOptions(options, dbg, dir_fsync_options);
+ {
+ IOStatus in_s = fs_->InjectMetadataWriteError();
+ if (!in_s.ok()) {
+ return in_s;
+ }
+ }
+ return s;
+}
+
+TestFSWritableFile::TestFSWritableFile(const std::string& fname,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSWritableFile>&& f,
+ FaultInjectionTestFS* fs)
+ : state_(fname),
+ file_opts_(file_opts),
+ target_(std::move(f)),
+ writable_file_opened_(true),
+ fs_(fs) {
+ assert(target_ != nullptr);
+ state_.pos_ = 0;
+}
+
+TestFSWritableFile::~TestFSWritableFile() {
+ if (writable_file_opened_) {
+ Close(IOOptions(), nullptr).PermitUncheckedError();
+ }
+}
+
+IOStatus TestFSWritableFile::Append(const Slice& data, const IOOptions& options,
+ IODebugContext* dbg) {
+ MutexLock l(&mutex_);
+ if (!fs_->IsFilesystemActive()) {
+ return fs_->GetError();
+ }
+ if (target_->use_direct_io()) {
+ target_->Append(data, options, dbg).PermitUncheckedError();
+ } else {
+ state_.buffer_.append(data.data(), data.size());
+ state_.pos_ += data.size();
+ fs_->WritableFileAppended(state_);
+ }
+ IOStatus io_s = fs_->InjectWriteError(state_.filename_);
+ return io_s;
+}
+
+// If data corruption has been set up via IngestDataCorruptionBeforeWrite(),
+// the corruption is simulated here.
+IOStatus TestFSWritableFile::Append(
+ const Slice& data, const IOOptions& options,
+ const DataVerificationInfo& verification_info, IODebugContext* dbg) {
+ MutexLock l(&mutex_);
+ if (!fs_->IsFilesystemActive()) {
+ return fs_->GetError();
+ }
+ if (fs_->ShouldDataCorruptionBeforeWrite()) {
+ return IOStatus::Corruption("Data is corrupted!");
+ }
+
+ // Calculate the checksum
+ std::string checksum;
+ CalculateTypedChecksum(fs_->GetChecksumHandoffFuncType(), data.data(),
+ data.size(), &checksum);
+ if (fs_->GetChecksumHandoffFuncType() != ChecksumType::kNoChecksum &&
+ checksum != verification_info.checksum.ToString()) {
+    std::string msg = "Data is corrupted! Origin data checksum: " +
+                      verification_info.checksum.ToString() +
+                      ", current data checksum: " + checksum;
+ return IOStatus::Corruption(msg);
+ }
+ if (target_->use_direct_io()) {
+ target_->Append(data, options, dbg).PermitUncheckedError();
+ } else {
+ state_.buffer_.append(data.data(), data.size());
+ state_.pos_ += data.size();
+ fs_->WritableFileAppended(state_);
+ }
+ IOStatus io_s = fs_->InjectWriteError(state_.filename_);
+ return io_s;
+}
+
+IOStatus TestFSWritableFile::PositionedAppend(
+ const Slice& data, uint64_t offset, const IOOptions& options,
+ const DataVerificationInfo& verification_info, IODebugContext* dbg) {
+ MutexLock l(&mutex_);
+ if (!fs_->IsFilesystemActive()) {
+ return fs_->GetError();
+ }
+ if (fs_->ShouldDataCorruptionBeforeWrite()) {
+ return IOStatus::Corruption("Data is corrupted!");
+ }
+
+ // Calculate the checksum
+ std::string checksum;
+ CalculateTypedChecksum(fs_->GetChecksumHandoffFuncType(), data.data(),
+ data.size(), &checksum);
+ if (fs_->GetChecksumHandoffFuncType() != ChecksumType::kNoChecksum &&
+ checksum != verification_info.checksum.ToString()) {
+    std::string msg = "Data is corrupted! Origin data checksum: " +
+                      verification_info.checksum.ToString() +
+                      ", current data checksum: " + checksum;
+ return IOStatus::Corruption(msg);
+ }
+ target_->PositionedAppend(data, offset, options, dbg);
+ IOStatus io_s = fs_->InjectWriteError(state_.filename_);
+ return io_s;
+}
+
+IOStatus TestFSWritableFile::Close(const IOOptions& options,
+ IODebugContext* dbg) {
+ MutexLock l(&mutex_);
+ if (!fs_->IsFilesystemActive()) {
+ return fs_->GetError();
+ }
+ {
+ IOStatus in_s = fs_->InjectMetadataWriteError();
+ if (!in_s.ok()) {
+ return in_s;
+ }
+ }
+ writable_file_opened_ = false;
+ IOStatus io_s;
+ if (!target_->use_direct_io()) {
+ io_s = target_->Append(state_.buffer_, options, dbg);
+ }
+ if (io_s.ok()) {
+ state_.buffer_.resize(0);
+ // Ignore sync errors
+ target_->Sync(options, dbg).PermitUncheckedError();
+ io_s = target_->Close(options, dbg);
+ }
+ if (io_s.ok()) {
+ fs_->WritableFileClosed(state_);
+ IOStatus in_s = fs_->InjectMetadataWriteError();
+ if (!in_s.ok()) {
+ return in_s;
+ }
+ }
+ return io_s;
+}
+
+IOStatus TestFSWritableFile::Flush(const IOOptions&, IODebugContext*) {
+ MutexLock l(&mutex_);
+ if (!fs_->IsFilesystemActive()) {
+ return fs_->GetError();
+ }
+ if (fs_->IsFilesystemActive()) {
+ state_.pos_at_last_flush_ = state_.pos_;
+ }
+ return IOStatus::OK();
+}
+
+IOStatus TestFSWritableFile::Sync(const IOOptions& options,
+ IODebugContext* dbg) {
+ MutexLock l(&mutex_);
+ if (!fs_->IsFilesystemActive()) {
+ return fs_->GetError();
+ }
+ if (target_->use_direct_io()) {
+ // For Direct IO mode, we don't buffer anything in TestFSWritableFile.
+ // So just return
+ return IOStatus::OK();
+ }
+ IOStatus io_s = target_->Append(state_.buffer_, options, dbg);
+ state_.buffer_.resize(0);
+ // Ignore sync errors
+ target_->Sync(options, dbg).PermitUncheckedError();
+ state_.pos_at_last_sync_ = state_.pos_;
+ fs_->WritableFileSynced(state_);
+ return io_s;
+}
+
+IOStatus TestFSWritableFile::RangeSync(uint64_t offset, uint64_t nbytes,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ MutexLock l(&mutex_);
+ if (!fs_->IsFilesystemActive()) {
+ return fs_->GetError();
+ }
+ // Assumes caller passes consecutive byte ranges.
+ uint64_t sync_limit = offset + nbytes;
+ uint64_t buf_begin =
+ state_.pos_at_last_sync_ < 0 ? 0 : state_.pos_at_last_sync_;
+
+ IOStatus io_s;
+ if (sync_limit < buf_begin) {
+ return io_s;
+ }
+ uint64_t num_to_sync = std::min(static_cast<uint64_t>(state_.buffer_.size()),
+ sync_limit - buf_begin);
+ Slice buf_to_sync(state_.buffer_.data(), num_to_sync);
+ io_s = target_->Append(buf_to_sync, options, dbg);
+ state_.buffer_ = state_.buffer_.substr(num_to_sync);
+ // Ignore sync errors
+ target_->RangeSync(offset, nbytes, options, dbg).PermitUncheckedError();
+ state_.pos_at_last_sync_ = offset + num_to_sync;
+ fs_->WritableFileSynced(state_);
+ return io_s;
+}
+
+TestFSRandomRWFile::TestFSRandomRWFile(const std::string& /*fname*/,
+ std::unique_ptr<FSRandomRWFile>&& f,
+ FaultInjectionTestFS* fs)
+ : target_(std::move(f)), file_opened_(true), fs_(fs) {
+ assert(target_ != nullptr);
+}
+
+TestFSRandomRWFile::~TestFSRandomRWFile() {
+ if (file_opened_) {
+ Close(IOOptions(), nullptr).PermitUncheckedError();
+ }
+}
+
+IOStatus TestFSRandomRWFile::Write(uint64_t offset, const Slice& data,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ if (!fs_->IsFilesystemActive()) {
+ return fs_->GetError();
+ }
+ return target_->Write(offset, data, options, dbg);
+}
+
+IOStatus TestFSRandomRWFile::Read(uint64_t offset, size_t n,
+ const IOOptions& options, Slice* result,
+ char* scratch, IODebugContext* dbg) const {
+ if (!fs_->IsFilesystemActive()) {
+ return fs_->GetError();
+ }
+ return target_->Read(offset, n, options, result, scratch, dbg);
+}
+
+IOStatus TestFSRandomRWFile::Close(const IOOptions& options,
+ IODebugContext* dbg) {
+ if (!fs_->IsFilesystemActive()) {
+ return fs_->GetError();
+ }
+ file_opened_ = false;
+ return target_->Close(options, dbg);
+}
+
+IOStatus TestFSRandomRWFile::Flush(const IOOptions& options,
+ IODebugContext* dbg) {
+ if (!fs_->IsFilesystemActive()) {
+ return fs_->GetError();
+ }
+ return target_->Flush(options, dbg);
+}
+
+IOStatus TestFSRandomRWFile::Sync(const IOOptions& options,
+ IODebugContext* dbg) {
+ if (!fs_->IsFilesystemActive()) {
+ return fs_->GetError();
+ }
+ return target_->Sync(options, dbg);
+}
+
+TestFSRandomAccessFile::TestFSRandomAccessFile(
+ const std::string& /*fname*/, std::unique_ptr<FSRandomAccessFile>&& f,
+ FaultInjectionTestFS* fs)
+ : target_(std::move(f)), fs_(fs) {
+ assert(target_ != nullptr);
+}
+
+IOStatus TestFSRandomAccessFile::Read(uint64_t offset, size_t n,
+ const IOOptions& options, Slice* result,
+ char* scratch,
+ IODebugContext* dbg) const {
+ if (!fs_->IsFilesystemActive()) {
+ return fs_->GetError();
+ }
+ IOStatus s = target_->Read(offset, n, options, result, scratch, dbg);
+ if (s.ok()) {
+ s = fs_->InjectThreadSpecificReadError(
+ FaultInjectionTestFS::ErrorOperation::kRead, result, use_direct_io(),
+ scratch, /*need_count_increase=*/true, /*fault_injected=*/nullptr);
+ }
+ if (s.ok() && fs_->ShouldInjectRandomReadError()) {
+ return IOStatus::IOError("Injected read error");
+ }
+ return s;
+}
+
+IOStatus TestFSRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ if (!fs_->IsFilesystemActive()) {
+ return fs_->GetError();
+ }
+ IOStatus s = target_->MultiRead(reqs, num_reqs, options, dbg);
+ bool injected_error = false;
+ for (size_t i = 0; i < num_reqs; i++) {
+ if (!reqs[i].status.ok()) {
+ // Already seeing an error.
+ break;
+ }
+ bool this_injected_error;
+ reqs[i].status = fs_->InjectThreadSpecificReadError(
+ FaultInjectionTestFS::ErrorOperation::kMultiReadSingleReq,
+ &(reqs[i].result), use_direct_io(), reqs[i].scratch,
+ /*need_count_increase=*/true,
+ /*fault_injected=*/&this_injected_error);
+ injected_error |= this_injected_error;
+ }
+ if (s.ok()) {
+ s = fs_->InjectThreadSpecificReadError(
+ FaultInjectionTestFS::ErrorOperation::kMultiRead, nullptr,
+ use_direct_io(), nullptr, /*need_count_increase=*/!injected_error,
+ /*fault_injected=*/nullptr);
+ }
+ if (s.ok() && fs_->ShouldInjectRandomReadError()) {
+ return IOStatus::IOError("Injected read error");
+ }
+ return s;
+}
+
+size_t TestFSRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
+ if (fs_->ShouldFailGetUniqueId()) {
+ return 0;
+ } else {
+ return target_->GetUniqueId(id, max_size);
+ }
+}
+IOStatus TestFSSequentialFile::Read(size_t n, const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) {
+ IOStatus s = target()->Read(n, options, result, scratch, dbg);
+ if (s.ok() && fs_->ShouldInjectRandomReadError()) {
+ return IOStatus::IOError("Injected seq read error");
+ }
+ return s;
+}
+
+IOStatus TestFSSequentialFile::PositionedRead(uint64_t offset, size_t n,
+ const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) {
+ IOStatus s =
+ target()->PositionedRead(offset, n, options, result, scratch, dbg);
+ if (s.ok() && fs_->ShouldInjectRandomReadError()) {
+ return IOStatus::IOError("Injected seq positioned read error");
+ }
+ return s;
+}
+
+IOStatus FaultInjectionTestFS::NewDirectory(
+ const std::string& name, const IOOptions& options,
+ std::unique_ptr<FSDirectory>* result, IODebugContext* dbg) {
+ std::unique_ptr<FSDirectory> r;
+ IOStatus io_s = target()->NewDirectory(name, options, &r, dbg);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ result->reset(
+ new TestFSDirectory(this, TestFSTrimDirname(name), r.release()));
+ return IOStatus::OK();
+}
+
+IOStatus FaultInjectionTestFS::NewWritableFile(
+ const std::string& fname, const FileOptions& file_opts,
+ std::unique_ptr<FSWritableFile>* result, IODebugContext* dbg) {
+ if (!IsFilesystemActive()) {
+ return GetError();
+ }
+ {
+ IOStatus in_s = InjectMetadataWriteError();
+ if (!in_s.ok()) {
+ return in_s;
+ }
+ }
+
+ if (ShouldUseDiretWritable(fname)) {
+ return target()->NewWritableFile(fname, file_opts, result, dbg);
+ }
+
+ IOStatus io_s = target()->NewWritableFile(fname, file_opts, result, dbg);
+ if (io_s.ok()) {
+ result->reset(
+ new TestFSWritableFile(fname, file_opts, std::move(*result), this));
+    // If the WritableFileWriter* file is opened
+    // again, it will be truncated - so forget our saved state.
+ UntrackFile(fname);
+ {
+ MutexLock l(&mutex_);
+ open_managed_files_.insert(fname);
+ auto dir_and_name = TestFSGetDirAndName(fname);
+ auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first];
+      // The new file could overwrite an old one. Here we simplify
+      // the implementation by assuming no file of this name exists
+      // after dropping unsynced files.
+ list[dir_and_name.second] = kNewFileNoOverwrite;
+ }
+ {
+ IOStatus in_s = InjectMetadataWriteError();
+ if (!in_s.ok()) {
+ return in_s;
+ }
+ }
+ }
+ return io_s;
+}
+
+IOStatus FaultInjectionTestFS::ReopenWritableFile(
+ const std::string& fname, const FileOptions& file_opts,
+ std::unique_ptr<FSWritableFile>* result, IODebugContext* dbg) {
+ if (!IsFilesystemActive()) {
+ return GetError();
+ }
+ if (ShouldUseDiretWritable(fname)) {
+ return target()->ReopenWritableFile(fname, file_opts, result, dbg);
+ }
+ {
+ IOStatus in_s = InjectMetadataWriteError();
+ if (!in_s.ok()) {
+ return in_s;
+ }
+ }
+
+ bool exists;
+ IOStatus io_s,
+ exists_s = target()->FileExists(fname, IOOptions(), nullptr /* dbg */);
+ if (exists_s.IsNotFound()) {
+ exists = false;
+ } else if (exists_s.ok()) {
+ exists = true;
+ } else {
+ io_s = exists_s;
+ exists = false;
+ }
+
+ if (io_s.ok()) {
+ io_s = target()->ReopenWritableFile(fname, file_opts, result, dbg);
+ }
+
+ // Only track files we created. Files created outside of this
+ // `FaultInjectionTestFS` are not eligible for tracking/data dropping
+ // (for example, they may contain data a previous db_stress run expects to
+ // be recovered). This could be extended to track/drop data appended once
+ // the file is under `FaultInjectionTestFS`'s control.
+ if (io_s.ok()) {
+ bool should_track;
+ {
+ MutexLock l(&mutex_);
+ if (db_file_state_.find(fname) != db_file_state_.end()) {
+ // It was written by this `FileSystem` earlier.
+ assert(exists);
+ should_track = true;
+ } else if (!exists) {
+ // It was created by this `FileSystem` just now.
+ should_track = true;
+ open_managed_files_.insert(fname);
+ auto dir_and_name = TestFSGetDirAndName(fname);
+ auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first];
+ list[dir_and_name.second] = kNewFileNoOverwrite;
+ } else {
+ should_track = false;
+ }
+ }
+ if (should_track) {
+ result->reset(
+ new TestFSWritableFile(fname, file_opts, std::move(*result), this));
+ }
+ {
+ IOStatus in_s = InjectMetadataWriteError();
+ if (!in_s.ok()) {
+ return in_s;
+ }
+ }
+ }
+ return io_s;
+}
+
+IOStatus FaultInjectionTestFS::NewRandomRWFile(
+ const std::string& fname, const FileOptions& file_opts,
+ std::unique_ptr<FSRandomRWFile>* result, IODebugContext* dbg) {
+ if (!IsFilesystemActive()) {
+ return GetError();
+ }
+ if (ShouldUseDiretWritable(fname)) {
+ return target()->NewRandomRWFile(fname, file_opts, result, dbg);
+ }
+ {
+ IOStatus in_s = InjectMetadataWriteError();
+ if (!in_s.ok()) {
+ return in_s;
+ }
+ }
+ IOStatus io_s = target()->NewRandomRWFile(fname, file_opts, result, dbg);
+ if (io_s.ok()) {
+ result->reset(new TestFSRandomRWFile(fname, std::move(*result), this));
+    // If the WritableFileWriter* file is opened
+    // again, it will be truncated - so forget our saved state.
+ UntrackFile(fname);
+ {
+ MutexLock l(&mutex_);
+ open_managed_files_.insert(fname);
+ auto dir_and_name = TestFSGetDirAndName(fname);
+ auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first];
+ // It could be overwriting an old file, but we simplify the
+ // implementation by ignoring it.
+ list[dir_and_name.second] = kNewFileNoOverwrite;
+ }
+ {
+ IOStatus in_s = InjectMetadataWriteError();
+ if (!in_s.ok()) {
+ return in_s;
+ }
+ }
+ }
+ return io_s;
+}
+
+IOStatus FaultInjectionTestFS::NewRandomAccessFile(
+ const std::string& fname, const FileOptions& file_opts,
+ std::unique_ptr<FSRandomAccessFile>* result, IODebugContext* dbg) {
+ if (!IsFilesystemActive()) {
+ return GetError();
+ }
+ if (ShouldInjectRandomReadError()) {
+ return IOStatus::IOError("Injected error when open random access file");
+ }
+ IOStatus io_s = InjectThreadSpecificReadError(ErrorOperation::kOpen, nullptr,
+ false, nullptr,
+ /*need_count_increase=*/true,
+ /*fault_injected=*/nullptr);
+ if (io_s.ok()) {
+ io_s = target()->NewRandomAccessFile(fname, file_opts, result, dbg);
+ }
+ if (io_s.ok()) {
+ result->reset(new TestFSRandomAccessFile(fname, std::move(*result), this));
+ }
+ return io_s;
+}
+
+IOStatus FaultInjectionTestFS::NewSequentialFile(
+ const std::string& fname, const FileOptions& file_opts,
+ std::unique_ptr<FSSequentialFile>* result, IODebugContext* dbg) {
+ if (!IsFilesystemActive()) {
+ return GetError();
+ }
+
+ if (ShouldInjectRandomReadError()) {
+ return IOStatus::IOError("Injected read error when creating seq file");
+ }
+ IOStatus io_s = target()->NewSequentialFile(fname, file_opts, result, dbg);
+ if (io_s.ok()) {
+ result->reset(new TestFSSequentialFile(std::move(*result), this));
+ }
+ return io_s;
+}
+
+IOStatus FaultInjectionTestFS::DeleteFile(const std::string& f,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ if (!IsFilesystemActive()) {
+ return GetError();
+ }
+ {
+ IOStatus in_s = InjectMetadataWriteError();
+ if (!in_s.ok()) {
+ return in_s;
+ }
+ }
+ IOStatus io_s = FileSystemWrapper::DeleteFile(f, options, dbg);
+ if (io_s.ok()) {
+ UntrackFile(f);
+ {
+ IOStatus in_s = InjectMetadataWriteError();
+ if (!in_s.ok()) {
+ return in_s;
+ }
+ }
+ }
+ return io_s;
+}
+
+IOStatus FaultInjectionTestFS::RenameFile(const std::string& s,
+ const std::string& t,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ if (!IsFilesystemActive()) {
+ return GetError();
+ }
+ {
+ IOStatus in_s = InjectMetadataWriteError();
+ if (!in_s.ok()) {
+ return in_s;
+ }
+ }
+
+  // We preserve the contents of overwritten files up to a size threshold.
+  // We could keep the previous file under another name, but then we would
+  // need to worry about garbage collecting those files. We can do that
+  // later if it turns out to be needed.
+ // We ignore I/O errors here for simplicity.
+ std::string previous_contents = kNewFileNoOverwrite;
+ if (target()->FileExists(t, IOOptions(), nullptr).ok()) {
+ uint64_t file_size;
+ if (target()->GetFileSize(t, IOOptions(), &file_size, nullptr).ok() &&
+ file_size < 1024) {
+ ReadFileToString(target(), t, &previous_contents).PermitUncheckedError();
+ }
+ }
+ IOStatus io_s = FileSystemWrapper::RenameFile(s, t, options, dbg);
+
+ if (io_s.ok()) {
+ {
+ MutexLock l(&mutex_);
+ if (db_file_state_.find(s) != db_file_state_.end()) {
+ db_file_state_[t] = db_file_state_[s];
+ db_file_state_.erase(s);
+ }
+
+ auto sdn = TestFSGetDirAndName(s);
+ auto tdn = TestFSGetDirAndName(t);
+ if (dir_to_new_files_since_last_sync_[sdn.first].erase(sdn.second) != 0) {
+ auto& tlist = dir_to_new_files_since_last_sync_[tdn.first];
+ assert(tlist.find(tdn.second) == tlist.end());
+ tlist[tdn.second] = previous_contents;
+ }
+ }
+ IOStatus in_s = InjectMetadataWriteError();
+ if (!in_s.ok()) {
+ return in_s;
+ }
+ }
+
+ return io_s;
+}
+
+IOStatus FaultInjectionTestFS::LinkFile(const std::string& s,
+ const std::string& t,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ if (!IsFilesystemActive()) {
+ return GetError();
+ }
+ {
+ IOStatus in_s = InjectMetadataWriteError();
+ if (!in_s.ok()) {
+ return in_s;
+ }
+ }
+
+ // Using the value in `dir_to_new_files_since_last_sync_` for the source file
+ // may be a more reasonable choice.
+ std::string previous_contents = kNewFileNoOverwrite;
+
+ IOStatus io_s = FileSystemWrapper::LinkFile(s, t, options, dbg);
+
+ if (io_s.ok()) {
+ {
+ MutexLock l(&mutex_);
+ if (db_file_state_.find(s) != db_file_state_.end()) {
+ db_file_state_[t] = db_file_state_[s];
+ }
+
+ auto sdn = TestFSGetDirAndName(s);
+ auto tdn = TestFSGetDirAndName(t);
+ if (dir_to_new_files_since_last_sync_[sdn.first].find(sdn.second) !=
+ dir_to_new_files_since_last_sync_[sdn.first].end()) {
+ auto& tlist = dir_to_new_files_since_last_sync_[tdn.first];
+ assert(tlist.find(tdn.second) == tlist.end());
+ tlist[tdn.second] = previous_contents;
+ }
+ }
+ IOStatus in_s = InjectMetadataWriteError();
+ if (!in_s.ok()) {
+ return in_s;
+ }
+ }
+
+ return io_s;
+}
+
+void FaultInjectionTestFS::WritableFileClosed(const FSFileState& state) {
+ MutexLock l(&mutex_);
+ if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) {
+ db_file_state_[state.filename_] = state;
+ open_managed_files_.erase(state.filename_);
+ }
+}
+
+void FaultInjectionTestFS::WritableFileSynced(const FSFileState& state) {
+ MutexLock l(&mutex_);
+ if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) {
+ if (db_file_state_.find(state.filename_) == db_file_state_.end()) {
+ db_file_state_.insert(std::make_pair(state.filename_, state));
+ } else {
+ db_file_state_[state.filename_] = state;
+ }
+ }
+}
+
+void FaultInjectionTestFS::WritableFileAppended(const FSFileState& state) {
+ MutexLock l(&mutex_);
+ if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) {
+ if (db_file_state_.find(state.filename_) == db_file_state_.end()) {
+ db_file_state_.insert(std::make_pair(state.filename_, state));
+ } else {
+ db_file_state_[state.filename_] = state;
+ }
+ }
+}
+
+IOStatus FaultInjectionTestFS::DropUnsyncedFileData() {
+ IOStatus io_s;
+ MutexLock l(&mutex_);
+ for (std::map<std::string, FSFileState>::iterator it = db_file_state_.begin();
+ io_s.ok() && it != db_file_state_.end(); ++it) {
+ FSFileState& fs_state = it->second;
+ if (!fs_state.IsFullySynced()) {
+ io_s = fs_state.DropUnsyncedData();
+ }
+ }
+ return io_s;
+}
+
+IOStatus FaultInjectionTestFS::DropRandomUnsyncedFileData(Random* rnd) {
+ IOStatus io_s;
+ MutexLock l(&mutex_);
+ for (std::map<std::string, FSFileState>::iterator it = db_file_state_.begin();
+ io_s.ok() && it != db_file_state_.end(); ++it) {
+ FSFileState& fs_state = it->second;
+ if (!fs_state.IsFullySynced()) {
+ io_s = fs_state.DropRandomUnsyncedData(rnd);
+ }
+ }
+ return io_s;
+}
+
+IOStatus FaultInjectionTestFS::DeleteFilesCreatedAfterLastDirSync(
+ const IOOptions& options, IODebugContext* dbg) {
+  // Because DeleteFile accesses this container, make a copy to avoid deadlock.
+ std::map<std::string, std::map<std::string, std::string>> map_copy;
+ {
+ MutexLock l(&mutex_);
+ map_copy.insert(dir_to_new_files_since_last_sync_.begin(),
+ dir_to_new_files_since_last_sync_.end());
+ }
+
+ for (auto& pair : map_copy) {
+ for (auto& file_pair : pair.second) {
+ if (file_pair.second == kNewFileNoOverwrite) {
+ IOStatus io_s =
+ DeleteFile(pair.first + "/" + file_pair.first, options, dbg);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ } else {
+ IOStatus io_s =
+ WriteStringToFile(target(), file_pair.second,
+ pair.first + "/" + file_pair.first, true);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ }
+ }
+ }
+ return IOStatus::OK();
+}
+
+void FaultInjectionTestFS::ResetState() {
+ MutexLock l(&mutex_);
+ db_file_state_.clear();
+ dir_to_new_files_since_last_sync_.clear();
+ SetFilesystemActiveNoLock(true);
+}
+
+void FaultInjectionTestFS::UntrackFile(const std::string& f) {
+ MutexLock l(&mutex_);
+ auto dir_and_name = TestFSGetDirAndName(f);
+ dir_to_new_files_since_last_sync_[dir_and_name.first].erase(
+ dir_and_name.second);
+ db_file_state_.erase(f);
+ open_managed_files_.erase(f);
+}
+
+IOStatus FaultInjectionTestFS::InjectThreadSpecificReadError(
+ ErrorOperation op, Slice* result, bool direct_io, char* scratch,
+ bool need_count_increase, bool* fault_injected) {
+ bool dummy_bool;
+ bool& ret_fault_injected = fault_injected ? *fault_injected : dummy_bool;
+ ret_fault_injected = false;
+ ErrorContext* ctx = static_cast<ErrorContext*>(thread_local_error_->Get());
+ if (ctx == nullptr || !ctx->enable_error_injection || !ctx->one_in) {
+ return IOStatus::OK();
+ }
+
+ if (ctx->rand.OneIn(ctx->one_in)) {
+ if (ctx->count == 0) {
+ ctx->message = "";
+ }
+ if (need_count_increase) {
+ ctx->count++;
+ }
+ if (ctx->callstack) {
+ free(ctx->callstack);
+ }
+ ctx->callstack = port::SaveStack(&ctx->frames);
+
+ if (op != ErrorOperation::kMultiReadSingleReq) {
+      // Likely a non-per-read status code for MultiRead.
+ ctx->message += "error; ";
+ ret_fault_injected = true;
+ return IOStatus::IOError();
+ } else if (Random::GetTLSInstance()->OneIn(8)) {
+ assert(result);
+      // For a small chance, keep the status OK but return an empty
+      // result, which is supposed to be caught by a later check.
+ *result = Slice();
+ ctx->message += "inject empty result; ";
+ ret_fault_injected = true;
+ } else if (!direct_io && Random::GetTLSInstance()->OneIn(7) &&
+ scratch != nullptr && result->data() == scratch) {
+ assert(result);
+      // With direct I/O, many extra bytes might be read, so corrupting
+      // one byte might not cause a checksum mismatch. Skip checksum
+      // corruption injection in that case.
+      // We only corrupt data if the result is filled into `scratch`. In
+      // other cases the data might not be modifiable (e.g., mmapped files),
+      // or modifying it could have unintended side effects.
+      // For a small chance, keep the status OK but corrupt the result in a
+      // way that checksum verification is supposed to fail.
+      // Corrupt the last byte, which is supposed to be a checksum byte.
+      // This works for CRC; not 100% sure about xxhash, and we will adjust
+      // if that turns out not to be the case.
+ const_cast<char*>(result->data())[result->size() - 1]++;
+ ctx->message += "corrupt last byte; ";
+ ret_fault_injected = true;
+ } else {
+ ctx->message += "error result multiget single; ";
+ ret_fault_injected = true;
+ return IOStatus::IOError();
+ }
+ }
+ return IOStatus::OK();
+}
+
+bool FaultInjectionTestFS::TryParseFileName(const std::string& file_name,
+ uint64_t* number, FileType* type) {
+ std::size_t found = file_name.find_last_of("/");
+ std::string file = file_name.substr(found);
+ return ParseFileName(file, number, type);
+}
+
+IOStatus FaultInjectionTestFS::InjectWriteError(const std::string& file_name) {
+ MutexLock l(&mutex_);
+ if (!enable_write_error_injection_ || !write_error_one_in_) {
+ return IOStatus::OK();
+ }
+ bool allowed_type = false;
+
+ if (inject_for_all_file_types_) {
+ allowed_type = true;
+ } else {
+ uint64_t number;
+ FileType cur_type = kTempFile;
+ if (TryParseFileName(file_name, &number, &cur_type)) {
+ for (const auto& type : write_error_allowed_types_) {
+ if (cur_type == type) {
+ allowed_type = true;
+ }
+ }
+ }
+ }
+
+ if (allowed_type) {
+ if (write_error_rand_.OneIn(write_error_one_in_)) {
+ return GetError();
+ }
+ }
+ return IOStatus::OK();
+}
+
+IOStatus FaultInjectionTestFS::InjectMetadataWriteError() {
+ {
+ MutexLock l(&mutex_);
+ if (!enable_metadata_write_error_injection_ ||
+ !metadata_write_error_one_in_ ||
+ !write_error_rand_.OneIn(metadata_write_error_one_in_)) {
+ return IOStatus::OK();
+ }
+ }
+ TEST_SYNC_POINT("FaultInjectionTestFS::InjectMetadataWriteError:Injected");
+ return IOStatus::IOError();
+}
+
+void FaultInjectionTestFS::PrintFaultBacktrace() {
+#if defined(OS_LINUX)
+ ErrorContext* ctx = static_cast<ErrorContext*>(thread_local_error_->Get());
+ if (ctx == nullptr) {
+ return;
+ }
+ fprintf(stderr, "Injected error type = %d\n", ctx->type);
+ fprintf(stderr, "Message: %s\n", ctx->message.c_str());
+ port::PrintAndFreeStack(ctx->callstack, ctx->frames);
+ ctx->callstack = nullptr;
+#endif
+}
+
+} // namespace ROCKSDB_NAMESPACE
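For orientation, here is a minimal sketch of how the helpers defined in this file can be combined to simulate crash-style data loss in a test. It is not part of the patch; the workload in the middle is assumed, and error handling is abbreviated.

    auto fs = std::make_shared<ROCKSDB_NAMESPACE::FaultInjectionTestFS>(
        ROCKSDB_NAMESPACE::FileSystem::Default());

    // ... run a workload that writes files through `fs`, syncing only some ...

    // Freeze the recorded filesystem state, as if the machine reset.
    fs->SetFilesystemActive(false);

    // Roll back data that was never synced and delete files created after the
    // last directory sync, then clear the tracked state for the next round.
    ROCKSDB_NAMESPACE::IOStatus io_s = fs->DropUnsyncedFileData();
    if (io_s.ok()) {
      io_s = fs->DeleteFilesCreatedAfterLastDirSync(
          ROCKSDB_NAMESPACE::IOOptions(), /*dbg=*/nullptr);
    }
    fs->ResetState();  // clears tracked state and re-activates the filesystem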
diff --git a/src/rocksdb/utilities/fault_injection_fs.h b/src/rocksdb/utilities/fault_injection_fs.h
new file mode 100644
index 000000000..53c9ccb6f
--- /dev/null
+++ b/src/rocksdb/utilities/fault_injection_fs.h
@@ -0,0 +1,584 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// This test uses a custom FileSystem to keep track of the state of a file
+// system since the last "Sync". Data being written is cached in a "buffer";
+// only when "Sync" is called does the data become persistent. This makes it
+// possible to simulate the loss of file data (or of entire files) that was
+// not protected by a "Sync". For any of the FileSystem-related operations, a
+// specific "IOStatus Error" can be returned when the file system is not
+// activated.
+
+#pragma once
+
+#include <algorithm>
+#include <map>
+#include <set>
+#include <string>
+
+#include "file/filename.h"
+#include "rocksdb/file_system.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TestFSWritableFile;
+class FaultInjectionTestFS;
+
+struct FSFileState {
+ std::string filename_;
+ ssize_t pos_;
+ ssize_t pos_at_last_sync_;
+ ssize_t pos_at_last_flush_;
+ std::string buffer_;
+
+ explicit FSFileState(const std::string& filename)
+ : filename_(filename),
+ pos_(-1),
+ pos_at_last_sync_(-1),
+ pos_at_last_flush_(-1) {}
+
+ FSFileState() : pos_(-1), pos_at_last_sync_(-1), pos_at_last_flush_(-1) {}
+
+ bool IsFullySynced() const { return pos_ <= 0 || pos_ == pos_at_last_sync_; }
+
+ IOStatus DropUnsyncedData();
+
+ IOStatus DropRandomUnsyncedData(Random* rand);
+};
+
+// A wrapper around an FSWritableFile that records the state of the file
+// as it is written to or sync'ed.
+class TestFSWritableFile : public FSWritableFile {
+ public:
+ explicit TestFSWritableFile(const std::string& fname,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSWritableFile>&& f,
+ FaultInjectionTestFS* fs);
+ virtual ~TestFSWritableFile();
+ virtual IOStatus Append(const Slice& data, const IOOptions&,
+ IODebugContext*) override;
+ virtual IOStatus Append(const Slice& data, const IOOptions& options,
+ const DataVerificationInfo& verification_info,
+ IODebugContext* dbg) override;
+ virtual IOStatus Truncate(uint64_t size, const IOOptions& options,
+ IODebugContext* dbg) override {
+ return target_->Truncate(size, options, dbg);
+ }
+ virtual IOStatus Close(const IOOptions& options,
+ IODebugContext* dbg) override;
+ virtual IOStatus Flush(const IOOptions&, IODebugContext*) override;
+ virtual IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override;
+ virtual IOStatus RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/,
+ const IOOptions& options,
+ IODebugContext* dbg) override;
+ virtual bool IsSyncThreadSafe() const override { return true; }
+ virtual IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+ const IOOptions& options,
+ IODebugContext* dbg) override {
+ return target_->PositionedAppend(data, offset, options, dbg);
+ }
+ IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+ const IOOptions& options,
+ const DataVerificationInfo& verification_info,
+ IODebugContext* dbg) override;
+ virtual size_t GetRequiredBufferAlignment() const override {
+ return target_->GetRequiredBufferAlignment();
+ }
+ virtual bool use_direct_io() const override {
+ return target_->use_direct_io();
+ };
+
+ private:
+ FSFileState state_; // Need protection by mutex_
+ FileOptions file_opts_;
+ std::unique_ptr<FSWritableFile> target_;
+ bool writable_file_opened_;
+ FaultInjectionTestFS* fs_;
+ port::Mutex mutex_;
+};
+
+// A wrapper around an FSRandomRWFile that records the state of the file
+// as it is written to or sync'ed.
+class TestFSRandomRWFile : public FSRandomRWFile {
+ public:
+ explicit TestFSRandomRWFile(const std::string& fname,
+ std::unique_ptr<FSRandomRWFile>&& f,
+ FaultInjectionTestFS* fs);
+ virtual ~TestFSRandomRWFile();
+ IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options,
+ IODebugContext* dbg) override;
+ IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) const override;
+ IOStatus Close(const IOOptions& options, IODebugContext* dbg) override;
+ IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override;
+ IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override;
+ size_t GetRequiredBufferAlignment() const override {
+ return target_->GetRequiredBufferAlignment();
+ }
+ bool use_direct_io() const override { return target_->use_direct_io(); };
+
+ private:
+ std::unique_ptr<FSRandomRWFile> target_;
+ bool file_opened_;
+ FaultInjectionTestFS* fs_;
+};
+
+class TestFSRandomAccessFile : public FSRandomAccessFile {
+ public:
+ explicit TestFSRandomAccessFile(const std::string& fname,
+ std::unique_ptr<FSRandomAccessFile>&& f,
+ FaultInjectionTestFS* fs);
+ ~TestFSRandomAccessFile() override {}
+ IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) const override;
+ IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
+ const IOOptions& options, IODebugContext* dbg) override;
+ size_t GetRequiredBufferAlignment() const override {
+ return target_->GetRequiredBufferAlignment();
+ }
+ bool use_direct_io() const override { return target_->use_direct_io(); }
+
+ size_t GetUniqueId(char* id, size_t max_size) const override;
+
+ private:
+ std::unique_ptr<FSRandomAccessFile> target_;
+ FaultInjectionTestFS* fs_;
+};
+
+class TestFSSequentialFile : public FSSequentialFileOwnerWrapper {
+ public:
+ explicit TestFSSequentialFile(std::unique_ptr<FSSequentialFile>&& f,
+ FaultInjectionTestFS* fs)
+ : FSSequentialFileOwnerWrapper(std::move(f)), fs_(fs) {}
+ IOStatus Read(size_t n, const IOOptions& options, Slice* result,
+ char* scratch, IODebugContext* dbg) override;
+ IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) override;
+
+ private:
+ FaultInjectionTestFS* fs_;
+};
+
+class TestFSDirectory : public FSDirectory {
+ public:
+ explicit TestFSDirectory(FaultInjectionTestFS* fs, std::string dirname,
+ FSDirectory* dir)
+ : fs_(fs), dirname_(dirname), dir_(dir) {}
+ ~TestFSDirectory() {}
+
+ virtual IOStatus Fsync(const IOOptions& options,
+ IODebugContext* dbg) override;
+
+ virtual IOStatus Close(const IOOptions& options,
+ IODebugContext* dbg) override;
+
+ virtual IOStatus FsyncWithDirOptions(
+ const IOOptions& options, IODebugContext* dbg,
+ const DirFsyncOptions& dir_fsync_options) override;
+
+ private:
+ FaultInjectionTestFS* fs_;
+ std::string dirname_;
+ std::unique_ptr<FSDirectory> dir_;
+};
+
+class FaultInjectionTestFS : public FileSystemWrapper {
+ public:
+ explicit FaultInjectionTestFS(const std::shared_ptr<FileSystem>& base)
+ : FileSystemWrapper(base),
+ filesystem_active_(true),
+ filesystem_writable_(false),
+ thread_local_error_(new ThreadLocalPtr(DeleteThreadLocalErrorContext)),
+ enable_write_error_injection_(false),
+ enable_metadata_write_error_injection_(false),
+ write_error_rand_(0),
+ write_error_one_in_(0),
+ metadata_write_error_one_in_(0),
+ read_error_one_in_(0),
+ ingest_data_corruption_before_write_(false),
+ fail_get_file_unique_id_(false) {}
+ virtual ~FaultInjectionTestFS() { error_.PermitUncheckedError(); }
+
+ static const char* kClassName() { return "FaultInjectionTestFS"; }
+ const char* Name() const override { return kClassName(); }
+
+ IOStatus NewDirectory(const std::string& name, const IOOptions& options,
+ std::unique_ptr<FSDirectory>* result,
+ IODebugContext* dbg) override;
+
+ IOStatus NewWritableFile(const std::string& fname,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSWritableFile>* result,
+ IODebugContext* dbg) override;
+
+ IOStatus ReopenWritableFile(const std::string& fname,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSWritableFile>* result,
+ IODebugContext* dbg) override;
+
+ IOStatus NewRandomRWFile(const std::string& fname,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSRandomRWFile>* result,
+ IODebugContext* dbg) override;
+
+ IOStatus NewRandomAccessFile(const std::string& fname,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSRandomAccessFile>* result,
+ IODebugContext* dbg) override;
+ IOStatus NewSequentialFile(const std::string& f, const FileOptions& file_opts,
+ std::unique_ptr<FSSequentialFile>* r,
+ IODebugContext* dbg) override;
+
+ virtual IOStatus DeleteFile(const std::string& f, const IOOptions& options,
+ IODebugContext* dbg) override;
+
+ virtual IOStatus RenameFile(const std::string& s, const std::string& t,
+ const IOOptions& options,
+ IODebugContext* dbg) override;
+
+ virtual IOStatus LinkFile(const std::string& src, const std::string& target,
+ const IOOptions& options,
+ IODebugContext* dbg) override;
+
+// Undef to eliminate clash on Windows
+#undef GetFreeSpace
+ virtual IOStatus GetFreeSpace(const std::string& path,
+ const IOOptions& options, uint64_t* disk_free,
+ IODebugContext* dbg) override {
+ IOStatus io_s;
+ if (!IsFilesystemActive() &&
+ error_.subcode() == IOStatus::SubCode::kNoSpace) {
+ *disk_free = 0;
+ } else {
+ io_s = target()->GetFreeSpace(path, options, disk_free, dbg);
+ }
+ return io_s;
+ }
+
+ void WritableFileClosed(const FSFileState& state);
+
+ void WritableFileSynced(const FSFileState& state);
+
+ void WritableFileAppended(const FSFileState& state);
+
+ IOStatus DropUnsyncedFileData();
+
+ IOStatus DropRandomUnsyncedFileData(Random* rnd);
+
+ IOStatus DeleteFilesCreatedAfterLastDirSync(const IOOptions& options,
+ IODebugContext* dbg);
+
+ void ResetState();
+
+ void UntrackFile(const std::string& f);
+
+ void SyncDir(const std::string& dirname) {
+ MutexLock l(&mutex_);
+ dir_to_new_files_since_last_sync_.erase(dirname);
+ }
+
+  // Setting the filesystem to inactive is the test equivalent of simulating a
+ // system reset. Setting to inactive will freeze our saved filesystem state so
+ // that it will stop being recorded. It can then be reset back to the state at
+ // the time of the reset.
+ bool IsFilesystemActive() {
+ MutexLock l(&mutex_);
+ return filesystem_active_;
+ }
+
+  // Setting filesystem_writable_ makes NewWritableFile, ReopenWritableFile,
+  // and NewRandomRWFile bypass FaultInjectionTestFS and go directly to the
+  // target FS.
+ bool IsFilesystemDirectWritable() {
+ MutexLock l(&mutex_);
+ return filesystem_writable_;
+ }
+ bool ShouldUseDiretWritable(const std::string& file_name) {
+ MutexLock l(&mutex_);
+ if (filesystem_writable_) {
+ return true;
+ }
+ FileType file_type = kTempFile;
+ uint64_t file_number = 0;
+ if (!TryParseFileName(file_name, &file_number, &file_type)) {
+ return false;
+ }
+ return skip_direct_writable_types_.find(file_type) !=
+ skip_direct_writable_types_.end();
+ }
+ void SetFilesystemActiveNoLock(
+ bool active, IOStatus error = IOStatus::Corruption("Not active")) {
+ error.PermitUncheckedError();
+ filesystem_active_ = active;
+ if (!active) {
+ error_ = error;
+ }
+ }
+ void SetFilesystemActive(
+ bool active, IOStatus error = IOStatus::Corruption("Not active")) {
+ MutexLock l(&mutex_);
+ error.PermitUncheckedError();
+ SetFilesystemActiveNoLock(active, error);
+ }
+ void SetFilesystemDirectWritable(bool writable) {
+ MutexLock l(&mutex_);
+ filesystem_writable_ = writable;
+ }
+ void AssertNoOpenFile() { assert(open_managed_files_.empty()); }
+
+ IOStatus GetError() { return error_; }
+
+ void SetFileSystemIOError(IOStatus io_error) {
+ MutexLock l(&mutex_);
+ io_error.PermitUncheckedError();
+ error_ = io_error;
+ }
+
+  // To simulate data corruption before data is written to the FS.
+ void IngestDataCorruptionBeforeWrite() {
+ MutexLock l(&mutex_);
+ ingest_data_corruption_before_write_ = true;
+ }
+
+ void NoDataCorruptionBeforeWrite() {
+ MutexLock l(&mutex_);
+ ingest_data_corruption_before_write_ = false;
+ }
+
+ bool ShouldDataCorruptionBeforeWrite() {
+ MutexLock l(&mutex_);
+ return ingest_data_corruption_before_write_;
+ }
+
+ void SetChecksumHandoffFuncType(const ChecksumType& func_type) {
+ MutexLock l(&mutex_);
+ checksum_handoff_func_tpye_ = func_type;
+ }
+
+ const ChecksumType& GetChecksumHandoffFuncType() {
+ MutexLock l(&mutex_);
+ return checksum_handoff_func_tpye_;
+ }
+
+ void SetFailGetUniqueId(bool flag) {
+ MutexLock l(&mutex_);
+ fail_get_file_unique_id_ = flag;
+ }
+
+ bool ShouldFailGetUniqueId() {
+ MutexLock l(&mutex_);
+ return fail_get_file_unique_id_;
+ }
+
+  // Specify what the operation is, so we can inject the right type of error.
+ enum ErrorOperation : char {
+ kRead = 0,
+ kMultiReadSingleReq = 1,
+ kMultiRead = 2,
+ kOpen,
+ };
+
+  // Set thread-local parameters for error injection. The first argument,
+  // seed, is the seed for the random number generator, and one_in determines
+  // the probability of injecting an error (i.e., an error is injected with
+  // probability 1/one_in).
+ void SetThreadLocalReadErrorContext(uint32_t seed, int one_in) {
+ struct ErrorContext* ctx =
+ static_cast<struct ErrorContext*>(thread_local_error_->Get());
+ if (ctx == nullptr) {
+ ctx = new ErrorContext(seed);
+ thread_local_error_->Reset(ctx);
+ }
+ ctx->one_in = one_in;
+ ctx->count = 0;
+ }
+
+ static void DeleteThreadLocalErrorContext(void* p) {
+ ErrorContext* ctx = static_cast<ErrorContext*>(p);
+ delete ctx;
+ }
+
+  // Set the parameters for write error injection.
+  // seed is the seed for the random number generator, and one_in determines
+  // the probability of injecting an error (i.e., an error is injected with
+  // probability 1/one_in). For write errors, we can specify the error we
+  // want to inject. types decides the file types the error is injected for
+  // (e.g., WAL files, SST files); it is empty by default.
+ void SetRandomWriteError(uint32_t seed, int one_in, IOStatus error,
+ bool inject_for_all_file_types,
+ const std::vector<FileType>& types) {
+ MutexLock l(&mutex_);
+ Random tmp_rand(seed);
+ error.PermitUncheckedError();
+ error_ = error;
+ write_error_rand_ = tmp_rand;
+ write_error_one_in_ = one_in;
+ inject_for_all_file_types_ = inject_for_all_file_types;
+ write_error_allowed_types_ = types;
+ }
+
+ void SetSkipDirectWritableTypes(const std::set<FileType>& types) {
+ MutexLock l(&mutex_);
+ skip_direct_writable_types_ = types;
+ }
+
+ void SetRandomMetadataWriteError(int one_in) {
+ MutexLock l(&mutex_);
+ metadata_write_error_one_in_ = one_in;
+ }
+ // If the value is not 0, it is enabled. Otherwise, it is disabled.
+ void SetRandomReadError(int one_in) { read_error_one_in_ = one_in; }
+
+ bool ShouldInjectRandomReadError() {
+ return read_error_one_in() &&
+ Random::GetTLSInstance()->OneIn(read_error_one_in());
+ }
+
+  // Inject a write error with randomized parameters and the predefined
+  // error type. The write error is only injected for the allowed file types.
+ IOStatus InjectWriteError(const std::string& file_name);
+
+  // Inject errors into metadata operations.
+ IOStatus InjectMetadataWriteError();
+
+  // Inject an error. For a READ operation, a status of IOError(), a
+  // corruption of the contents of scratch, or a truncation of the slice
+  // are the possible error types, with equal probability. For OPEN,
+  // it is always an IOError.
+  // fault_injected returns whether a fault was injected. It is needed
+  // because some faults are injected while the IOStatus is still OK.
+ IOStatus InjectThreadSpecificReadError(ErrorOperation op, Slice* slice,
+ bool direct_io, char* scratch,
+ bool need_count_increase,
+ bool* fault_injected);
+
+ // Get the count of how many times we injected since the previous call
+ int GetAndResetErrorCount() {
+ ErrorContext* ctx = static_cast<ErrorContext*>(thread_local_error_->Get());
+ int count = 0;
+ if (ctx != nullptr) {
+ count = ctx->count;
+ ctx->count = 0;
+ }
+ return count;
+ }
+
+ void EnableErrorInjection() {
+ ErrorContext* ctx = static_cast<ErrorContext*>(thread_local_error_->Get());
+ if (ctx) {
+ ctx->enable_error_injection = true;
+ }
+ }
+
+ void EnableWriteErrorInjection() {
+ MutexLock l(&mutex_);
+ enable_write_error_injection_ = true;
+ }
+ void EnableMetadataWriteErrorInjection() {
+ MutexLock l(&mutex_);
+ enable_metadata_write_error_injection_ = true;
+ }
+
+ void DisableWriteErrorInjection() {
+ MutexLock l(&mutex_);
+ enable_write_error_injection_ = false;
+ }
+
+ void DisableErrorInjection() {
+ ErrorContext* ctx = static_cast<ErrorContext*>(thread_local_error_->Get());
+ if (ctx) {
+ ctx->enable_error_injection = false;
+ }
+ }
+
+ void DisableMetadataWriteErrorInjection() {
+ MutexLock l(&mutex_);
+ enable_metadata_write_error_injection_ = false;
+ }
+
+ int read_error_one_in() const { return read_error_one_in_.load(); }
+
+ int write_error_one_in() const { return write_error_one_in_; }
+
+ // We capture a backtrace every time a fault is injected, for debugging
+ // purposes. This call prints the backtrace to stderr and frees the
+ // saved callstack
+ void PrintFaultBacktrace();
+
+ private:
+ port::Mutex mutex_;
+ std::map<std::string, FSFileState> db_file_state_;
+ std::set<std::string> open_managed_files_;
+ // directory -> (file name -> file contents to recover)
+  // When data is recovered from an unsynced parent directory, files whose
+  // recorded contents are empty are deleted. Those with non-empty recorded
+  // contents are restored to those contents.
+ std::unordered_map<std::string, std::map<std::string, std::string>>
+ dir_to_new_files_since_last_sync_;
+ bool filesystem_active_; // Record flushes, syncs, writes
+ bool filesystem_writable_; // Bypass FaultInjectionTestFS and go directly
+ // to underlying FS for writable files
+ IOStatus error_;
+
+ enum ErrorType : int {
+ kErrorTypeStatus = 0,
+ kErrorTypeCorruption,
+ kErrorTypeTruncated,
+ kErrorTypeMax
+ };
+
+ struct ErrorContext {
+ Random rand;
+ int one_in;
+ int count;
+ bool enable_error_injection;
+ void* callstack;
+ std::string message;
+ int frames;
+ ErrorType type;
+
+ explicit ErrorContext(uint32_t seed)
+ : rand(seed),
+ enable_error_injection(false),
+ callstack(nullptr),
+ frames(0) {}
+ ~ErrorContext() {
+ if (callstack) {
+ free(callstack);
+ }
+ }
+ };
+
+ std::unique_ptr<ThreadLocalPtr> thread_local_error_;
+ bool enable_write_error_injection_;
+ bool enable_metadata_write_error_injection_;
+ Random write_error_rand_;
+ int write_error_one_in_;
+ int metadata_write_error_one_in_;
+ std::atomic<int> read_error_one_in_;
+ bool inject_for_all_file_types_;
+ std::vector<FileType> write_error_allowed_types_;
+ // File types where direct writable is skipped.
+ std::set<FileType> skip_direct_writable_types_;
+ bool ingest_data_corruption_before_write_;
+ ChecksumType checksum_handoff_func_tpye_;
+ bool fail_get_file_unique_id_;
+
+  // Extract the number and type from the file name. Return false if we fail
+  // to find them.
+ bool TryParseFileName(const std::string& file_name, uint64_t* number,
+ FileType* type);
+};
+
+} // namespace ROCKSDB_NAMESPACE
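As a usage sketch of the error-injection knobs declared above: the seeds, probabilities, and file types below are illustrative values, not taken from the patch, and the workload itself is elided.

    using namespace ROCKSDB_NAMESPACE;

    auto fs = std::make_shared<FaultInjectionTestFS>(FileSystem::Default());

    // Fail roughly 1 in 4 writes to WAL and SST files with a "no space" error.
    fs->SetRandomWriteError(/*seed=*/123, /*one_in=*/4,
                            IOStatus::NoSpace("injected"),
                            /*inject_for_all_file_types=*/false,
                            {FileType::kWalFile, FileType::kTableFile});
    fs->EnableWriteErrorInjection();

    // Per-thread read error injection: roughly 1 in 8 reads fail once enabled.
    fs->SetThreadLocalReadErrorContext(/*seed=*/456, /*one_in=*/8);
    fs->EnableErrorInjection();

    // ... run the workload under test through `fs` ...

    int injected = fs->GetAndResetErrorCount();  // read faults on this thread
    fs->DisableWriteErrorInjection();
    fs->DisableErrorInjection();

The read-error context is thread-local, so each workload thread has to call SetThreadLocalReadErrorContext and EnableErrorInjection for itself.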
diff --git a/src/rocksdb/utilities/fault_injection_secondary_cache.cc b/src/rocksdb/utilities/fault_injection_secondary_cache.cc
new file mode 100644
index 000000000..2758c2a19
--- /dev/null
+++ b/src/rocksdb/utilities/fault_injection_secondary_cache.cc
@@ -0,0 +1,131 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+// This class implements a custom SecondaryCache that randomly injects an
+// error status into Inserts/Lookups based on a specified probability.
+
+#include "utilities/fault_injection_secondary_cache.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void FaultInjectionSecondaryCache::ResultHandle::UpdateHandleValue(
+ FaultInjectionSecondaryCache::ResultHandle* handle) {
+ ErrorContext* ctx = handle->cache_->GetErrorContext();
+ if (!ctx->rand.OneIn(handle->cache_->prob_)) {
+ handle->value_ = handle->base_->Value();
+ handle->size_ = handle->base_->Size();
+ }
+ handle->base_.reset();
+}
+
+bool FaultInjectionSecondaryCache::ResultHandle::IsReady() {
+ bool ready = true;
+ if (base_) {
+ ready = base_->IsReady();
+ if (ready) {
+ UpdateHandleValue(this);
+ }
+ }
+ return ready;
+}
+
+void FaultInjectionSecondaryCache::ResultHandle::Wait() {
+ base_->Wait();
+ UpdateHandleValue(this);
+}
+
+void* FaultInjectionSecondaryCache::ResultHandle::Value() { return value_; }
+
+size_t FaultInjectionSecondaryCache::ResultHandle::Size() { return size_; }
+
+void FaultInjectionSecondaryCache::ResultHandle::WaitAll(
+ FaultInjectionSecondaryCache* cache,
+ std::vector<SecondaryCacheResultHandle*> handles) {
+ std::vector<SecondaryCacheResultHandle*> base_handles;
+ for (SecondaryCacheResultHandle* hdl : handles) {
+ FaultInjectionSecondaryCache::ResultHandle* handle =
+ static_cast<FaultInjectionSecondaryCache::ResultHandle*>(hdl);
+ if (!handle->base_) {
+ continue;
+ }
+ base_handles.emplace_back(handle->base_.get());
+ }
+
+ cache->base_->WaitAll(base_handles);
+ for (SecondaryCacheResultHandle* hdl : handles) {
+ FaultInjectionSecondaryCache::ResultHandle* handle =
+ static_cast<FaultInjectionSecondaryCache::ResultHandle*>(hdl);
+ if (handle->base_) {
+ UpdateHandleValue(handle);
+ }
+ }
+}
+
+FaultInjectionSecondaryCache::ErrorContext*
+FaultInjectionSecondaryCache::GetErrorContext() {
+ ErrorContext* ctx = static_cast<ErrorContext*>(thread_local_error_->Get());
+ if (!ctx) {
+ ctx = new ErrorContext(seed_);
+ thread_local_error_->Reset(ctx);
+ }
+
+ return ctx;
+}
+
+Status FaultInjectionSecondaryCache::Insert(
+ const Slice& key, void* value, const Cache::CacheItemHelper* helper) {
+ ErrorContext* ctx = GetErrorContext();
+ if (ctx->rand.OneIn(prob_)) {
+ return Status::IOError();
+ }
+
+ return base_->Insert(key, value, helper);
+}
+
+std::unique_ptr<SecondaryCacheResultHandle>
+FaultInjectionSecondaryCache::Lookup(const Slice& key,
+ const Cache::CreateCallback& create_cb,
+ bool wait, bool advise_erase,
+ bool& is_in_sec_cache) {
+ ErrorContext* ctx = GetErrorContext();
+ if (base_is_compressed_sec_cache_) {
+ if (ctx->rand.OneIn(prob_)) {
+ return nullptr;
+ } else {
+ return base_->Lookup(key, create_cb, wait, advise_erase, is_in_sec_cache);
+ }
+ } else {
+ std::unique_ptr<SecondaryCacheResultHandle> hdl =
+ base_->Lookup(key, create_cb, wait, advise_erase, is_in_sec_cache);
+ if (wait && ctx->rand.OneIn(prob_)) {
+ hdl.reset();
+ }
+ return std::unique_ptr<FaultInjectionSecondaryCache::ResultHandle>(
+ new FaultInjectionSecondaryCache::ResultHandle(this, std::move(hdl)));
+ }
+}
+
+void FaultInjectionSecondaryCache::Erase(const Slice& key) {
+ base_->Erase(key);
+}
+
+void FaultInjectionSecondaryCache::WaitAll(
+ std::vector<SecondaryCacheResultHandle*> handles) {
+ if (base_is_compressed_sec_cache_) {
+ ErrorContext* ctx = GetErrorContext();
+ std::vector<SecondaryCacheResultHandle*> base_handles;
+ for (SecondaryCacheResultHandle* hdl : handles) {
+ if (ctx->rand.OneIn(prob_)) {
+ continue;
+ }
+ base_handles.push_back(hdl);
+ }
+ base_->WaitAll(base_handles);
+ } else {
+ FaultInjectionSecondaryCache::ResultHandle::WaitAll(this, handles);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/fault_injection_secondary_cache.h b/src/rocksdb/utilities/fault_injection_secondary_cache.h
new file mode 100644
index 000000000..5321df626
--- /dev/null
+++ b/src/rocksdb/utilities/fault_injection_secondary_cache.h
@@ -0,0 +1,108 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/secondary_cache.h"
+#include "util/random.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This class implements a custom SecondaryCache that randomly injects an
+// error status into Inserts/Lookups based on a specified probability.
+// It is used by db_stress to verify correctness in the presence of
+// secondary cache errors.
+//
+class FaultInjectionSecondaryCache : public SecondaryCache {
+ public:
+ explicit FaultInjectionSecondaryCache(
+ const std::shared_ptr<SecondaryCache>& base, uint32_t seed, int prob)
+ : base_(base),
+ seed_(seed),
+ prob_(prob),
+ thread_local_error_(new ThreadLocalPtr(DeleteThreadLocalErrorContext)) {
+ if (std::strcmp(base_->Name(), "CompressedSecondaryCache") == 0) {
+ base_is_compressed_sec_cache_ = true;
+ }
+ }
+
+ virtual ~FaultInjectionSecondaryCache() override {}
+
+ const char* Name() const override { return "FaultInjectionSecondaryCache"; }
+
+ Status Insert(const Slice& key, void* value,
+ const Cache::CacheItemHelper* helper) override;
+
+ std::unique_ptr<SecondaryCacheResultHandle> Lookup(
+ const Slice& key, const Cache::CreateCallback& create_cb, bool wait,
+ bool advise_erase, bool& is_in_sec_cache) override;
+
+ bool SupportForceErase() const override { return base_->SupportForceErase(); }
+
+ void Erase(const Slice& key) override;
+
+ void WaitAll(std::vector<SecondaryCacheResultHandle*> handles) override;
+
+ Status SetCapacity(size_t capacity) override {
+ return base_->SetCapacity(capacity);
+ }
+
+ Status GetCapacity(size_t& capacity) override {
+ return base_->GetCapacity(capacity);
+ }
+
+ std::string GetPrintableOptions() const override {
+ return base_->GetPrintableOptions();
+ }
+
+ private:
+ class ResultHandle : public SecondaryCacheResultHandle {
+ public:
+ ResultHandle(FaultInjectionSecondaryCache* cache,
+ std::unique_ptr<SecondaryCacheResultHandle>&& base)
+ : cache_(cache), base_(std::move(base)), value_(nullptr), size_(0) {}
+
+ ~ResultHandle() override {}
+
+ bool IsReady() override;
+
+ void Wait() override;
+
+ void* Value() override;
+
+ size_t Size() override;
+
+ static void WaitAll(FaultInjectionSecondaryCache* cache,
+ std::vector<SecondaryCacheResultHandle*> handles);
+
+ private:
+ static void UpdateHandleValue(ResultHandle* handle);
+
+ FaultInjectionSecondaryCache* cache_;
+ std::unique_ptr<SecondaryCacheResultHandle> base_;
+ void* value_;
+ size_t size_;
+ };
+
+ static void DeleteThreadLocalErrorContext(void* p) {
+ ErrorContext* ctx = static_cast<ErrorContext*>(p);
+ delete ctx;
+ }
+
+ const std::shared_ptr<SecondaryCache> base_;
+ uint32_t seed_;
+ int prob_;
+ bool base_is_compressed_sec_cache_{false};
+
+ struct ErrorContext {
+ Random rand;
+
+ explicit ErrorContext(uint32_t seed) : rand(seed) {}
+ };
+ std::unique_ptr<ThreadLocalPtr> thread_local_error_;
+
+ ErrorContext* GetErrorContext();
+};
+
+} // namespace ROCKSDB_NAMESPACE
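A brief sketch of wiring this into a block cache. The base secondary cache is left as a parameter because any SecondaryCache implementation can be wrapped; the seed, probability, and capacity are illustrative.

    using namespace ROCKSDB_NAMESPACE;

    // Wrap an existing secondary cache so that roughly 1 in `prob`
    // operations fail, then hand the result to an LRU block cache.
    std::shared_ptr<Cache> MakeFaultyBlockCache(
        const std::shared_ptr<SecondaryCache>& base_secondary_cache) {
      auto faulty = std::make_shared<FaultInjectionSecondaryCache>(
          base_secondary_cache, /*seed=*/42, /*prob=*/16);

      LRUCacheOptions cache_opts;
      cache_opts.capacity = 64 << 20;  // illustrative capacity
      cache_opts.secondary_cache = faulty;
      return NewLRUCache(cache_opts);
    }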
diff --git a/src/rocksdb/utilities/leveldb_options/leveldb_options.cc b/src/rocksdb/utilities/leveldb_options/leveldb_options.cc
new file mode 100644
index 000000000..125c3d956
--- /dev/null
+++ b/src/rocksdb/utilities/leveldb_options/leveldb_options.cc
@@ -0,0 +1,57 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/utilities/leveldb_options.h"
+
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+LevelDBOptions::LevelDBOptions()
+ : comparator(BytewiseComparator()),
+ create_if_missing(false),
+ error_if_exists(false),
+ paranoid_checks(false),
+ env(Env::Default()),
+ info_log(nullptr),
+ write_buffer_size(4 << 20),
+ max_open_files(1000),
+ block_cache(nullptr),
+ block_size(4096),
+ block_restart_interval(16),
+ compression(kSnappyCompression),
+ filter_policy(nullptr) {}
+
+Options ConvertOptions(const LevelDBOptions& leveldb_options) {
+ Options options = Options();
+ options.create_if_missing = leveldb_options.create_if_missing;
+ options.error_if_exists = leveldb_options.error_if_exists;
+ options.paranoid_checks = leveldb_options.paranoid_checks;
+ options.env = leveldb_options.env;
+ options.info_log.reset(leveldb_options.info_log);
+ options.write_buffer_size = leveldb_options.write_buffer_size;
+ options.max_open_files = leveldb_options.max_open_files;
+ options.compression = leveldb_options.compression;
+
+ BlockBasedTableOptions table_options;
+ table_options.block_cache.reset(leveldb_options.block_cache);
+ table_options.block_size = leveldb_options.block_size;
+ table_options.block_restart_interval = leveldb_options.block_restart_interval;
+ table_options.filter_policy.reset(leveldb_options.filter_policy);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ return options;
+}
+
+} // namespace ROCKSDB_NAMESPACE
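As a usage sketch (the option values and the path are illustrative), a LevelDB-style configuration can be converted once and passed straight to DB::Open:

    using namespace ROCKSDB_NAMESPACE;

    LevelDBOptions ldb_opts;
    ldb_opts.create_if_missing = true;
    ldb_opts.write_buffer_size = 8 << 20;  // 8 MB
    ldb_opts.block_size = 16 * 1024;

    // ConvertOptions maps the block/cache/filter settings onto a
    // BlockBasedTableFactory and copies the remaining fields into Options.
    Options options = ConvertOptions(ldb_opts);

    DB* db = nullptr;
    Status s = DB::Open(options, "/tmp/converted_db", &db);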
diff --git a/src/rocksdb/utilities/memory/memory_test.cc b/src/rocksdb/utilities/memory/memory_test.cc
new file mode 100644
index 000000000..0b043af0e
--- /dev/null
+++ b/src/rocksdb/utilities/memory/memory_test.cc
@@ -0,0 +1,279 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/db_impl/db_impl.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/memory_util.h"
+#include "rocksdb/utilities/stackable_db.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MemoryTest : public testing::Test {
+ public:
+ MemoryTest() : kDbDir(test::PerThreadDBPath("memory_test")), rnd_(301) {
+ assert(Env::Default()->CreateDirIfMissing(kDbDir).ok());
+ }
+
+ std::string GetDBName(int id) { return kDbDir + "db_" + std::to_string(id); }
+
+ void UpdateUsagesHistory(const std::vector<DB*>& dbs) {
+ std::map<MemoryUtil::UsageType, uint64_t> usage_by_type;
+ ASSERT_OK(GetApproximateMemoryUsageByType(dbs, &usage_by_type));
+ for (int i = 0; i < MemoryUtil::kNumUsageTypes; ++i) {
+ usage_history_[i].push_back(
+ usage_by_type[static_cast<MemoryUtil::UsageType>(i)]);
+ }
+ }
+
+ void GetCachePointersFromTableFactory(
+ const TableFactory* factory,
+ std::unordered_set<const Cache*>* cache_set) {
+ const auto bbto = factory->GetOptions<BlockBasedTableOptions>();
+ if (bbto != nullptr) {
+ cache_set->insert(bbto->block_cache.get());
+ cache_set->insert(bbto->block_cache_compressed.get());
+ }
+ }
+
+ void GetCachePointers(const std::vector<DB*>& dbs,
+ std::unordered_set<const Cache*>* cache_set) {
+ cache_set->clear();
+
+ for (auto* db : dbs) {
+ assert(db);
+
+ // Cache from DBImpl
+ StackableDB* sdb = dynamic_cast<StackableDB*>(db);
+ DBImpl* db_impl = dynamic_cast<DBImpl*>(sdb ? sdb->GetBaseDB() : db);
+ if (db_impl != nullptr) {
+ cache_set->insert(db_impl->TEST_table_cache());
+ }
+
+ // Cache from DBOptions
+ cache_set->insert(db->GetDBOptions().row_cache.get());
+
+ // Cache from table factories
+ std::unordered_map<std::string, const ImmutableCFOptions*> iopts_map;
+ if (db_impl != nullptr) {
+ ASSERT_OK(db_impl->TEST_GetAllImmutableCFOptions(&iopts_map));
+ }
+ for (auto pair : iopts_map) {
+ GetCachePointersFromTableFactory(pair.second->table_factory.get(),
+ cache_set);
+ }
+ }
+ }
+
+ Status GetApproximateMemoryUsageByType(
+ const std::vector<DB*>& dbs,
+ std::map<MemoryUtil::UsageType, uint64_t>* usage_by_type) {
+ std::unordered_set<const Cache*> cache_set;
+ GetCachePointers(dbs, &cache_set);
+
+ return MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set,
+ usage_by_type);
+ }
+
+ const std::string kDbDir;
+ Random rnd_;
+ std::vector<uint64_t> usage_history_[MemoryUtil::kNumUsageTypes];
+};
+
+TEST_F(MemoryTest, SharedBlockCacheTotal) {
+ std::vector<DB*> dbs;
+ std::vector<uint64_t> usage_by_type;
+ const int kNumDBs = 10;
+ const int kKeySize = 100;
+ const int kValueSize = 500;
+ Options opt;
+ opt.create_if_missing = true;
+ opt.write_buffer_size = kKeySize + kValueSize;
+ opt.max_write_buffer_number = 10;
+ opt.min_write_buffer_number_to_merge = 10;
+ opt.disable_auto_compactions = true;
+ BlockBasedTableOptions bbt_opts;
+ bbt_opts.block_cache = NewLRUCache(4096 * 1000 * 10);
+ for (int i = 0; i < kNumDBs; ++i) {
+ ASSERT_OK(DestroyDB(GetDBName(i), opt));
+ DB* db = nullptr;
+ ASSERT_OK(DB::Open(opt, GetDBName(i), &db));
+ dbs.push_back(db);
+ }
+
+ std::vector<std::string> keys_by_db[kNumDBs];
+
+ // Fill one memtable per Put to make memtable use more memory.
+ for (int p = 0; p < opt.min_write_buffer_number_to_merge / 2; ++p) {
+ for (int i = 0; i < kNumDBs; ++i) {
+ for (int j = 0; j < 100; ++j) {
+ keys_by_db[i].emplace_back(rnd_.RandomString(kKeySize));
+ ASSERT_OK(dbs[i]->Put(WriteOptions(), keys_by_db[i].back(),
+ rnd_.RandomString(kValueSize)));
+ }
+ ASSERT_OK(dbs[i]->Flush(FlushOptions()));
+ }
+ }
+ for (int i = 0; i < kNumDBs; ++i) {
+ for (auto& key : keys_by_db[i]) {
+ std::string value;
+ ASSERT_OK(dbs[i]->Get(ReadOptions(), key, &value));
+ }
+ UpdateUsagesHistory(dbs);
+ }
+ for (size_t i = 1; i < usage_history_[MemoryUtil::kMemTableTotal].size();
+ ++i) {
+ // Expect EQ as we didn't flush more memtables.
+ ASSERT_EQ(usage_history_[MemoryUtil::kTableReadersTotal][i],
+ usage_history_[MemoryUtil::kTableReadersTotal][i - 1]);
+ }
+ for (int i = 0; i < kNumDBs; ++i) {
+ delete dbs[i];
+ }
+}
+
+TEST_F(MemoryTest, MemTableAndTableReadersTotal) {
+ std::vector<DB*> dbs;
+ std::vector<uint64_t> usage_by_type;
+ std::vector<std::vector<ColumnFamilyHandle*>> vec_handles;
+ const int kNumDBs = 10;
+ // These key/value sizes ensure each KV has its own memtable. Note that the
+ // minimum write_buffer_size allowed is 64 KB.
+ const int kKeySize = 100;
+ const int kValueSize = 1 << 16;
+ Options opt;
+ opt.create_if_missing = true;
+ opt.create_missing_column_families = true;
+ opt.write_buffer_size = kKeySize + kValueSize;
+ opt.max_write_buffer_number = 10;
+ opt.min_write_buffer_number_to_merge = 10;
+ opt.disable_auto_compactions = true;
+
+ std::vector<ColumnFamilyDescriptor> cf_descs = {
+ {kDefaultColumnFamilyName, ColumnFamilyOptions(opt)},
+ {"one", ColumnFamilyOptions(opt)},
+ {"two", ColumnFamilyOptions(opt)},
+ };
+
+ for (int i = 0; i < kNumDBs; ++i) {
+ ASSERT_OK(DestroyDB(GetDBName(i), opt));
+ std::vector<ColumnFamilyHandle*> handles;
+ dbs.emplace_back();
+ vec_handles.emplace_back();
+ ASSERT_OK(DB::Open(DBOptions(opt), GetDBName(i), cf_descs,
+ &vec_handles.back(), &dbs.back()));
+ }
+
+ // Fill one memtable per Put to make memtable use more memory.
+ for (int p = 0; p < opt.min_write_buffer_number_to_merge / 2; ++p) {
+ for (int i = 0; i < kNumDBs; ++i) {
+ for (auto* handle : vec_handles[i]) {
+ ASSERT_OK(dbs[i]->Put(WriteOptions(), handle,
+ rnd_.RandomString(kKeySize),
+ rnd_.RandomString(kValueSize)));
+ UpdateUsagesHistory(dbs);
+ }
+ }
+ }
+  // Expect the usage history to be monotonically increasing.
+ for (size_t i = 1; i < usage_history_[MemoryUtil::kMemTableTotal].size();
+ ++i) {
+ ASSERT_GT(usage_history_[MemoryUtil::kMemTableTotal][i],
+ usage_history_[MemoryUtil::kMemTableTotal][i - 1]);
+ ASSERT_GT(usage_history_[MemoryUtil::kMemTableUnFlushed][i],
+ usage_history_[MemoryUtil::kMemTableUnFlushed][i - 1]);
+ ASSERT_EQ(usage_history_[MemoryUtil::kTableReadersTotal][i],
+ usage_history_[MemoryUtil::kTableReadersTotal][i - 1]);
+ }
+
+ size_t usage_check_point = usage_history_[MemoryUtil::kMemTableTotal].size();
+ std::vector<Iterator*> iters;
+
+ // Create an iterator and flush all memtables for each db
+ for (int i = 0; i < kNumDBs; ++i) {
+ iters.push_back(dbs[i]->NewIterator(ReadOptions()));
+ ASSERT_OK(dbs[i]->Flush(FlushOptions()));
+
+ for (int j = 0; j < 100; ++j) {
+ std::string value;
+ ASSERT_NOK(
+ dbs[i]->Get(ReadOptions(), rnd_.RandomString(kKeySize), &value));
+ }
+
+ UpdateUsagesHistory(dbs);
+ }
+ for (size_t i = usage_check_point;
+ i < usage_history_[MemoryUtil::kMemTableTotal].size(); ++i) {
+    // Since the memtables are pinned by the iterators, we don't expect
+    // the memory usage of all the memtables to decrease.
+ ASSERT_GE(usage_history_[MemoryUtil::kMemTableTotal][i],
+ usage_history_[MemoryUtil::kMemTableTotal][i - 1]);
+ // Expect the usage history from the "usage_decay_point" is
+ // monotonically decreasing.
+ ASSERT_LT(usage_history_[MemoryUtil::kMemTableUnFlushed][i],
+ usage_history_[MemoryUtil::kMemTableUnFlushed][i - 1]);
+    // Expect the usage history of the table readers to increase
+    // as we flush tables.
+ ASSERT_GT(usage_history_[MemoryUtil::kTableReadersTotal][i],
+ usage_history_[MemoryUtil::kTableReadersTotal][i - 1]);
+ ASSERT_GT(usage_history_[MemoryUtil::kCacheTotal][i],
+ usage_history_[MemoryUtil::kCacheTotal][i - 1]);
+ }
+ usage_check_point = usage_history_[MemoryUtil::kMemTableTotal].size();
+ for (int i = 0; i < kNumDBs; ++i) {
+    // The iterator is not used; just verify its status before deleting it.
+ ASSERT_OK(iters[i]->status());
+ delete iters[i];
+ UpdateUsagesHistory(dbs);
+ }
+ for (size_t i = usage_check_point;
+ i < usage_history_[MemoryUtil::kMemTableTotal].size(); ++i) {
+    // Expect the usage of all memtables to decrease as we delete iterators.
+ ASSERT_LT(usage_history_[MemoryUtil::kMemTableTotal][i],
+ usage_history_[MemoryUtil::kMemTableTotal][i - 1]);
+ // Since the memory usage of un-flushed memtables is only affected
+ // by Put and flush, we expect EQ here as we only delete iterators.
+ ASSERT_EQ(usage_history_[MemoryUtil::kMemTableUnFlushed][i],
+ usage_history_[MemoryUtil::kMemTableUnFlushed][i - 1]);
+ // Expect EQ as we didn't flush more memtables.
+ ASSERT_EQ(usage_history_[MemoryUtil::kTableReadersTotal][i],
+ usage_history_[MemoryUtil::kTableReadersTotal][i - 1]);
+ }
+
+ for (int i = 0; i < kNumDBs; ++i) {
+ for (auto* handle : vec_handles[i]) {
+ delete handle;
+ }
+ delete dbs[i];
+ }
+}
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+#else
+ return 0;
+#endif
+}
+
+#else
+#include <cstdio>
+
+int main(int /*argc*/, char** /*argv*/) {
+ printf("Skipped in RocksDBLite as utilities are not supported.\n");
+ return 0;
+}
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/memory/memory_util.cc b/src/rocksdb/utilities/memory/memory_util.cc
new file mode 100644
index 000000000..13c81aec4
--- /dev/null
+++ b/src/rocksdb/utilities/memory/memory_util.cc
@@ -0,0 +1,52 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/memory_util.h"
+
+#include "db/db_impl/db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status MemoryUtil::GetApproximateMemoryUsageByType(
+ const std::vector<DB*>& dbs,
+ const std::unordered_set<const Cache*> cache_set,
+ std::map<MemoryUtil::UsageType, uint64_t>* usage_by_type) {
+ usage_by_type->clear();
+
+ // MemTable
+ for (auto* db : dbs) {
+ uint64_t usage = 0;
+ if (db->GetAggregatedIntProperty(DB::Properties::kSizeAllMemTables,
+ &usage)) {
+ (*usage_by_type)[MemoryUtil::kMemTableTotal] += usage;
+ }
+ if (db->GetAggregatedIntProperty(DB::Properties::kCurSizeAllMemTables,
+ &usage)) {
+ (*usage_by_type)[MemoryUtil::kMemTableUnFlushed] += usage;
+ }
+ }
+
+ // Table Readers
+ for (auto* db : dbs) {
+ uint64_t usage = 0;
+ if (db->GetAggregatedIntProperty(DB::Properties::kEstimateTableReadersMem,
+ &usage)) {
+ (*usage_by_type)[MemoryUtil::kTableReadersTotal] += usage;
+ }
+ }
+
+ // Cache
+ for (const auto* cache : cache_set) {
+ if (cache != nullptr) {
+ (*usage_by_type)[MemoryUtil::kCacheTotal] += cache->GetUsage();
+ }
+ }
+
+ return Status::OK();
+}
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
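A short sketch of calling this from application code; `db1`, `db2`, and `shared_block_cache` are assumed to have been created elsewhere.

    using namespace ROCKSDB_NAMESPACE;

    std::vector<DB*> dbs = {db1, db2};
    std::unordered_set<const Cache*> cache_set = {shared_block_cache.get()};

    std::map<MemoryUtil::UsageType, uint64_t> usage_by_type;
    Status s = MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set,
                                                           &usage_by_type);
    if (s.ok()) {
      uint64_t memtables = usage_by_type[MemoryUtil::kMemTableTotal];
      uint64_t table_readers = usage_by_type[MemoryUtil::kTableReadersTotal];
      uint64_t cache_total = usage_by_type[MemoryUtil::kCacheTotal];
      // ... report or assert on the numbers ...
    }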
diff --git a/src/rocksdb/utilities/memory_allocators.h b/src/rocksdb/utilities/memory_allocators.h
new file mode 100644
index 000000000..c9e77a5b7
--- /dev/null
+++ b/src/rocksdb/utilities/memory_allocators.h
@@ -0,0 +1,104 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+
+#include "rocksdb/memory_allocator.h"
+
+namespace ROCKSDB_NAMESPACE {
+// A memory allocator using new/delete
+class DefaultMemoryAllocator : public MemoryAllocator {
+ public:
+ static const char* kClassName() { return "DefaultMemoryAllocator"; }
+ const char* Name() const override { return kClassName(); }
+ void* Allocate(size_t size) override {
+ return static_cast<void*>(new char[size]);
+ }
+
+ void Deallocate(void* p) override { delete[] static_cast<char*>(p); }
+};
+
+// Base class for a MemoryAllocator. This implementation does nothing
+// and implements the methods in failure mode (assert if the methods are
+// invoked). Implementations can extend this class and override these methods
+// when they are enabled via compiler switches (e.g., the
+// JeMallocMemoryAllocator can define these methods if ROCKSDB_JEMALLOC is
+// defined at compile time). If compiled in "disabled" mode, this class
+// provides default/failure implementations. If compiled in "enabled" mode,
+// the derived class needs to provide the appropriate "enabled" methods for
+// the "real" implementation. Failure of the "real" implementation to override
+// any of these methods will result in an assert failure.
+class BaseMemoryAllocator : public MemoryAllocator {
+ public:
+ void* Allocate(size_t /*size*/) override {
+ assert(false);
+ return nullptr;
+ }
+
+ void Deallocate(void* /*p*/) override { assert(false); }
+};
+
+// A wrapped MemoryAllocator. Delegates the memory allocator functions to the
+// wrapped one.
+class MemoryAllocatorWrapper : public MemoryAllocator {
+ public:
+  // Initialize a MemoryAllocatorWrapper that delegates all calls to *t.
+ explicit MemoryAllocatorWrapper(const std::shared_ptr<MemoryAllocator>& t);
+ ~MemoryAllocatorWrapper() override {}
+
+ // Return the target to which to forward all calls
+ MemoryAllocator* target() const { return target_.get(); }
+ // Allocate a block of at least size. Has to be thread-safe.
+ void* Allocate(size_t size) override { return target_->Allocate(size); }
+
+ // Deallocate previously allocated block. Has to be thread-safe.
+ void Deallocate(void* p) override { return target_->Deallocate(p); }
+
+ // Returns the memory size of the block allocated at p. The default
+ // implementation that just returns the original allocation_size is fine.
+ size_t UsableSize(void* p, size_t allocation_size) const override {
+ return target_->UsableSize(p, allocation_size);
+ }
+
+ const Customizable* Inner() const override { return target_.get(); }
+
+ protected:
+ std::shared_ptr<MemoryAllocator> target_;
+};
+
+// A memory allocator that counts the number of allocations and deallocations.
+// This class is useful when the number of memory allocations/deallocations
+// is important.
+class CountedMemoryAllocator : public MemoryAllocatorWrapper {
+ public:
+ CountedMemoryAllocator()
+ : MemoryAllocatorWrapper(std::make_shared<DefaultMemoryAllocator>()),
+ allocations_(0),
+ deallocations_(0) {}
+
+ explicit CountedMemoryAllocator(const std::shared_ptr<MemoryAllocator>& t)
+ : MemoryAllocatorWrapper(t), allocations_(0), deallocations_(0) {}
+ static const char* kClassName() { return "CountedMemoryAllocator"; }
+ const char* Name() const override { return kClassName(); }
+ std::string GetId() const override { return std::string(Name()); }
+ void* Allocate(size_t size) override {
+ allocations_++;
+ return MemoryAllocatorWrapper::Allocate(size);
+ }
+
+ void Deallocate(void* p) override {
+ deallocations_++;
+ MemoryAllocatorWrapper::Deallocate(p);
+ }
+ uint64_t GetNumAllocations() const { return allocations_; }
+ uint64_t GetNumDeallocations() const { return deallocations_; }
+
+ private:
+ std::atomic<uint64_t> allocations_;
+ std::atomic<uint64_t> deallocations_;
+};
+} // namespace ROCKSDB_NAMESPACE
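A brief sketch of the counting wrapper in isolation; hooking it into a cache (for example via LRUCacheOptions::memory_allocator) is one possible use, not shown here.

    using namespace ROCKSDB_NAMESPACE;

    auto counted = std::make_shared<CountedMemoryAllocator>();

    void* p = counted->Allocate(1024);
    counted->Deallocate(p);

    assert(counted->GetNumAllocations() == 1);
    assert(counted->GetNumDeallocations() == 1);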
diff --git a/src/rocksdb/utilities/merge_operators.cc b/src/rocksdb/utilities/merge_operators.cc
new file mode 100644
index 000000000..c97e9ce25
--- /dev/null
+++ b/src/rocksdb/utilities/merge_operators.cc
@@ -0,0 +1,120 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "utilities/merge_operators.h"
+
+#include <memory>
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "utilities/merge_operators/bytesxor.h"
+#include "utilities/merge_operators/sortlist.h"
+#include "utilities/merge_operators/string_append/stringappend.h"
+#include "utilities/merge_operators/string_append/stringappend2.h"
+
+namespace ROCKSDB_NAMESPACE {
+static bool LoadMergeOperator(const std::string& id,
+ std::shared_ptr<MergeOperator>* result) {
+ bool success = true;
+ // TODO: Hook the "name" up to the actual Name() of the MergeOperators?
+ // Requires these classes to be moved into a header file...
+ if (id == "put" || id == "PutOperator") {
+ *result = MergeOperators::CreatePutOperator();
+ } else if (id == "put_v1") {
+ *result = MergeOperators::CreateDeprecatedPutOperator();
+ } else if (id == "uint64add" || id == "UInt64AddOperator") {
+ *result = MergeOperators::CreateUInt64AddOperator();
+ } else if (id == "max" || id == "MaxOperator") {
+ *result = MergeOperators::CreateMaxOperator();
+#ifdef ROCKSDB_LITE
+ // The remainder of the classes are handled by the ObjectRegistry in
+ // non-LITE mode
+ } else if (id == StringAppendOperator::kNickName() ||
+ id == StringAppendOperator::kClassName()) {
+ *result = MergeOperators::CreateStringAppendOperator();
+ } else if (id == StringAppendTESTOperator::kNickName() ||
+ id == StringAppendTESTOperator::kClassName()) {
+ *result = MergeOperators::CreateStringAppendTESTOperator();
+ } else if (id == BytesXOROperator::kNickName() ||
+ id == BytesXOROperator::kClassName()) {
+ *result = MergeOperators::CreateBytesXOROperator();
+ } else if (id == SortList::kNickName() || id == SortList::kClassName()) {
+ *result = MergeOperators::CreateSortOperator();
+#endif // ROCKSDB_LITE
+ } else {
+ success = false;
+ }
+ return success;
+}
+
+#ifndef ROCKSDB_LITE
+static int RegisterBuiltinMergeOperators(ObjectLibrary& library,
+ const std::string& /*arg*/) {
+ size_t num_types;
+ library.AddFactory<MergeOperator>(
+ ObjectLibrary::PatternEntry(StringAppendOperator::kClassName())
+ .AnotherName(StringAppendOperator::kNickName()),
+ [](const std::string& /*uri*/, std::unique_ptr<MergeOperator>* guard,
+ std::string* /*errmsg*/) {
+ guard->reset(new StringAppendOperator(","));
+ return guard->get();
+ });
+ library.AddFactory<MergeOperator>(
+ ObjectLibrary::PatternEntry(StringAppendTESTOperator::kClassName())
+ .AnotherName(StringAppendTESTOperator::kNickName()),
+ [](const std::string& /*uri*/, std::unique_ptr<MergeOperator>* guard,
+ std::string* /*errmsg*/) {
+ guard->reset(new StringAppendTESTOperator(","));
+ return guard->get();
+ });
+ library.AddFactory<MergeOperator>(
+ ObjectLibrary::PatternEntry(SortList::kClassName())
+ .AnotherName(SortList::kNickName()),
+ [](const std::string& /*uri*/, std::unique_ptr<MergeOperator>* guard,
+ std::string* /*errmsg*/) {
+ guard->reset(new SortList());
+ return guard->get();
+ });
+ library.AddFactory<MergeOperator>(
+ ObjectLibrary::PatternEntry(BytesXOROperator::kClassName())
+ .AnotherName(BytesXOROperator::kNickName()),
+ [](const std::string& /*uri*/, std::unique_ptr<MergeOperator>* guard,
+ std::string* /*errmsg*/) {
+ guard->reset(new BytesXOROperator());
+ return guard->get();
+ });
+
+ return static_cast<int>(library.GetFactoryCount(&num_types));
+}
+#endif // ROCKSDB_LITE
+
+Status MergeOperator::CreateFromString(const ConfigOptions& config_options,
+ const std::string& value,
+ std::shared_ptr<MergeOperator>* result) {
+#ifndef ROCKSDB_LITE
+ static std::once_flag once;
+ std::call_once(once, [&]() {
+ RegisterBuiltinMergeOperators(*(ObjectLibrary::Default().get()), "");
+ });
+#endif // ROCKSDB_LITE
+ return LoadSharedObject<MergeOperator>(config_options, value,
+ LoadMergeOperator, result);
+}
+
+std::shared_ptr<MergeOperator> MergeOperators::CreateFromStringId(
+ const std::string& id) {
+ std::shared_ptr<MergeOperator> result;
+ Status s = MergeOperator::CreateFromString(ConfigOptions(), id, &result);
+ if (s.ok()) {
+ return result;
+ } else {
+ // Empty or unknown, just return nullptr
+ return nullptr;
+ }
+}
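+
+// Usage sketch (illustrative only, not part of the upstream patch): the ids
+// handled above can be used to configure a column family's merge operator,
+// e.g.
+//
+//   Options options;
+//   options.merge_operator = MergeOperators::CreateFromStringId("uint64add");
+//   assert(options.merge_operator != nullptr);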
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/merge_operators.h b/src/rocksdb/utilities/merge_operators.h
new file mode 100644
index 000000000..9b90107e3
--- /dev/null
+++ b/src/rocksdb/utilities/merge_operators.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include <stdio.h>
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/merge_operator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MergeOperators {
+ public:
+ static std::shared_ptr<MergeOperator> CreatePutOperator();
+ static std::shared_ptr<MergeOperator> CreateDeprecatedPutOperator();
+ static std::shared_ptr<MergeOperator> CreateUInt64AddOperator();
+ static std::shared_ptr<MergeOperator> CreateStringAppendOperator();
+ static std::shared_ptr<MergeOperator> CreateStringAppendOperator(
+ char delim_char);
+ static std::shared_ptr<MergeOperator> CreateStringAppendOperator(
+ const std::string& delim);
+ static std::shared_ptr<MergeOperator> CreateStringAppendTESTOperator();
+ static std::shared_ptr<MergeOperator> CreateMaxOperator();
+ static std::shared_ptr<MergeOperator> CreateBytesXOROperator();
+ static std::shared_ptr<MergeOperator> CreateSortOperator();
+
+ // Will return a different merge operator depending on the string.
+ static std::shared_ptr<MergeOperator> CreateFromStringId(
+ const std::string& name);
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/merge_operators/bytesxor.cc b/src/rocksdb/utilities/merge_operators/bytesxor.cc
new file mode 100644
index 000000000..fa09c18ea
--- /dev/null
+++ b/src/rocksdb/utilities/merge_operators/bytesxor.cc
@@ -0,0 +1,57 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "utilities/merge_operators/bytesxor.h"
+
+#include <algorithm>
+#include <string>
+
+namespace ROCKSDB_NAMESPACE {
+
+std::shared_ptr<MergeOperator> MergeOperators::CreateBytesXOROperator() {
+ return std::make_shared<BytesXOROperator>();
+}
+
+bool BytesXOROperator::Merge(const Slice& /*key*/, const Slice* existing_value,
+ const Slice& value, std::string* new_value,
+ Logger* /*logger*/) const {
+ XOR(existing_value, value, new_value);
+ return true;
+}
+
+void BytesXOROperator::XOR(const Slice* existing_value, const Slice& value,
+ std::string* new_value) const {
+ if (!existing_value) {
+ new_value->clear();
+ new_value->assign(value.data(), value.size());
+ return;
+ }
+
+ size_t min_size = std::min(existing_value->size(), value.size());
+ size_t max_size = std::max(existing_value->size(), value.size());
+
+ new_value->clear();
+ new_value->reserve(max_size);
+
+ const char* existing_value_data = existing_value->data();
+ const char* value_data = value.data();
+
+ for (size_t i = 0; i < min_size; i++) {
+ new_value->push_back(existing_value_data[i] ^ value_data[i]);
+ }
+
+ if (existing_value->size() == max_size) {
+ for (size_t i = min_size; i < max_size; i++) {
+ new_value->push_back(existing_value_data[i]);
+ }
+ } else {
+ assert(value.size() == max_size);
+ for (size_t i = min_size; i < max_size; i++) {
+ new_value->push_back(value_data[i]);
+ }
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/merge_operators/bytesxor.h b/src/rocksdb/utilities/merge_operators/bytesxor.h
new file mode 100644
index 000000000..3c7baacce
--- /dev/null
+++ b/src/rocksdb/utilities/merge_operators/bytesxor.h
@@ -0,0 +1,40 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <algorithm>
+#include <memory>
+#include <string>
+
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "util/coding.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A 'model' merge operator that XORs two (same sized) arrays of bytes.
+// Implemented as an AssociativeMergeOperator for simplicity and as an example.
+class BytesXOROperator : public AssociativeMergeOperator {
+ public:
+ // XORs the two arrays of bytes one byte at a time and stores the result
+ // in new_value. The result is as long as the longer input; trailing bytes
+ // of the longer input are copied through unchanged.
+ virtual bool Merge(const Slice& key, const Slice* existing_value,
+ const Slice& value, std::string* new_value,
+ Logger* logger) const override;
+
+ static const char* kClassName() { return "BytesXOR"; }
+ static const char* kNickName() { return "bytesxor"; }
+
+ const char* NickName() const override { return kNickName(); }
+ const char* Name() const override { return kClassName(); }
+
+ void XOR(const Slice* existing_value, const Slice& value,
+ std::string* new_value) const;
+};
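+
+// Usage sketch (illustrative only, not part of the upstream patch): with
+//   options.merge_operator = MergeOperators::CreateBytesXOROperator();
+// merging the one-byte operand "\x0f" twice onto an existing value "\xf0"
+// restores "\xf0", since 0xf0 ^ 0x0f ^ 0x0f == 0xf0.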
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/merge_operators/max.cc b/src/rocksdb/utilities/merge_operators/max.cc
new file mode 100644
index 000000000..de4abfa6f
--- /dev/null
+++ b/src/rocksdb/utilities/merge_operators/max.cc
@@ -0,0 +1,80 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <memory>
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "utilities/merge_operators.h"
+
+using ROCKSDB_NAMESPACE::Logger;
+using ROCKSDB_NAMESPACE::MergeOperator;
+using ROCKSDB_NAMESPACE::Slice;
+
+namespace { // anonymous namespace
+
+// Merge operator that picks the maximum operand. Comparison is based on
+// Slice::compare.
+class MaxOperator : public MergeOperator {
+ public:
+ bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override {
+ Slice& max = merge_out->existing_operand;
+ if (merge_in.existing_value) {
+ max = Slice(merge_in.existing_value->data(),
+ merge_in.existing_value->size());
+ } else if (max.data() == nullptr) {
+ max = Slice();
+ }
+
+ for (const auto& op : merge_in.operand_list) {
+ if (max.compare(op) < 0) {
+ max = op;
+ }
+ }
+
+ return true;
+ }
+
+ bool PartialMerge(const Slice& /*key*/, const Slice& left_operand,
+ const Slice& right_operand, std::string* new_value,
+ Logger* /*logger*/) const override {
+ if (left_operand.compare(right_operand) >= 0) {
+ new_value->assign(left_operand.data(), left_operand.size());
+ } else {
+ new_value->assign(right_operand.data(), right_operand.size());
+ }
+ return true;
+ }
+
+ bool PartialMergeMulti(const Slice& /*key*/,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value,
+ Logger* /*logger*/) const override {
+ Slice max;
+ for (const auto& operand : operand_list) {
+ if (max.compare(operand) < 0) {
+ max = operand;
+ }
+ }
+
+ new_value->assign(max.data(), max.size());
+ return true;
+ }
+
+ static const char* kClassName() { return "MaxOperator"; }
+ static const char* kNickName() { return "max"; }
+ const char* Name() const override { return kClassName(); }
+ const char* NickName() const override { return kNickName(); }
+};
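+
+// Usage sketch (illustrative only, not part of the upstream patch): installed
+// via MergeOperators::CreateMaxOperator(), merging the operands "apple",
+// "pear" and "banana" onto a key yields "pear", because Slice::compare orders
+// the operands byte-wise (lexicographically).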
+
+} // end of anonymous namespace
+
+namespace ROCKSDB_NAMESPACE {
+
+std::shared_ptr<MergeOperator> MergeOperators::CreateMaxOperator() {
+ return std::make_shared<MaxOperator>();
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/merge_operators/put.cc b/src/rocksdb/utilities/merge_operators/put.cc
new file mode 100644
index 000000000..ccf9ff21f
--- /dev/null
+++ b/src/rocksdb/utilities/merge_operators/put.cc
@@ -0,0 +1,92 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <memory>
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "utilities/merge_operators.h"
+
+namespace { // anonymous namespace
+
+using ROCKSDB_NAMESPACE::Logger;
+using ROCKSDB_NAMESPACE::MergeOperator;
+using ROCKSDB_NAMESPACE::Slice;
+
+// A merge operator that mimics Put semantics.
+// Since this merge operator will not be used in production,
+// it is implemented as a non-associative merge operator to illustrate the
+// new interface and for testing purposes. (That is, we inherit from
+// the MergeOperator class rather than the AssociativeMergeOperator,
+// which would be simpler in this case.)
+//
+// From the client's perspective, the semantics are the same.
+class PutOperator : public MergeOperator {
+ public:
+ bool FullMerge(const Slice& /*key*/, const Slice* /*existing_value*/,
+ const std::deque<std::string>& operand_sequence,
+ std::string* new_value, Logger* /*logger*/) const override {
+ // Put basically only looks at the current/latest value
+ assert(!operand_sequence.empty());
+ assert(new_value != nullptr);
+ new_value->assign(operand_sequence.back());
+ return true;
+ }
+
+ bool PartialMerge(const Slice& /*key*/, const Slice& /*left_operand*/,
+ const Slice& right_operand, std::string* new_value,
+ Logger* /*logger*/) const override {
+ new_value->assign(right_operand.data(), right_operand.size());
+ return true;
+ }
+
+ using MergeOperator::PartialMergeMulti;
+ bool PartialMergeMulti(const Slice& /*key*/,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value,
+ Logger* /*logger*/) const override {
+ new_value->assign(operand_list.back().data(), operand_list.back().size());
+ return true;
+ }
+
+ static const char* kClassName() { return "PutOperator"; }
+ static const char* kNickName() { return "put_v1"; }
+ const char* Name() const override { return kClassName(); }
+ const char* NickName() const override { return kNickName(); }
+};
+
+class PutOperatorV2 : public PutOperator {
+ bool FullMerge(const Slice& /*key*/, const Slice* /*existing_value*/,
+ const std::deque<std::string>& /*operand_sequence*/,
+ std::string* /*new_value*/,
+ Logger* /*logger*/) const override {
+ assert(false);
+ return false;
+ }
+
+ bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override {
+ // Put basically only looks at the current/latest value
+ assert(!merge_in.operand_list.empty());
+ merge_out->existing_operand = merge_in.operand_list.back();
+ return true;
+ }
+
+ static const char* kNickName() { return "put"; }
+ const char* NickName() const override { return kNickName(); }
+};
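+
+// Usage sketch (illustrative only, not part of the upstream patch): with
+//   options.merge_operator = MergeOperators::CreatePutOperator();
+// merging "v1" and then "v2" onto a key behaves like two Puts: a subsequent
+// Get returns "v2", the latest operand.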
+
+} // end of anonymous namespace
+
+namespace ROCKSDB_NAMESPACE {
+
+std::shared_ptr<MergeOperator> MergeOperators::CreateDeprecatedPutOperator() {
+ return std::make_shared<PutOperator>();
+}
+
+std::shared_ptr<MergeOperator> MergeOperators::CreatePutOperator() {
+ return std::make_shared<PutOperatorV2>();
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/merge_operators/sortlist.cc b/src/rocksdb/utilities/merge_operators/sortlist.cc
new file mode 100644
index 000000000..67bfc7e5e
--- /dev/null
+++ b/src/rocksdb/utilities/merge_operators/sortlist.cc
@@ -0,0 +1,95 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#include "utilities/merge_operators/sortlist.h"
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool SortList::FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const {
+ std::vector<int> left;
+ for (Slice slice : merge_in.operand_list) {
+ std::vector<int> right;
+ MakeVector(right, slice);
+ left = Merge(left, right);
+ }
+ for (int i = 0; i < static_cast<int>(left.size()) - 1; i++) {
+ merge_out->new_value.append(std::to_string(left[i])).append(",");
+ }
+ merge_out->new_value.append(std::to_string(left.back()));
+ return true;
+}
+
+bool SortList::PartialMerge(const Slice& /*key*/, const Slice& left_operand,
+ const Slice& right_operand, std::string* new_value,
+ Logger* /*logger*/) const {
+ std::vector<int> left;
+ std::vector<int> right;
+ MakeVector(left, left_operand);
+ MakeVector(right, right_operand);
+ left = Merge(left, right);
+ for (int i = 0; i < static_cast<int>(left.size()) - 1; i++) {
+ new_value->append(std::to_string(left[i])).append(",");
+ }
+ new_value->append(std::to_string(left.back()));
+ return true;
+}
+
+bool SortList::PartialMergeMulti(const Slice& /*key*/,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value,
+ Logger* /*logger*/) const {
+ (void)operand_list;
+ (void)new_value;
+ return true;
+}
+
+void SortList::MakeVector(std::vector<int>& operand, Slice slice) const {
+ do {
+ const char* begin = slice.data_;
+ while (*slice.data_ != ',' && *slice.data_) slice.data_++;
+ operand.push_back(std::stoi(std::string(begin, slice.data_)));
+ } while (0 != *slice.data_++);
+}
+
+std::vector<int> SortList::Merge(std::vector<int>& left,
+ std::vector<int>& right) const {
+ // Fill the resultant vector with sorted results from both vectors
+ std::vector<int> result;
+ unsigned left_it = 0, right_it = 0;
+
+ while (left_it < left.size() && right_it < right.size()) {
+ // If the left value is smaller than the right it goes next
+ // into the resultant vector
+ if (left[left_it] < right[right_it]) {
+ result.push_back(left[left_it]);
+ left_it++;
+ } else {
+ result.push_back(right[right_it]);
+ right_it++;
+ }
+ }
+
+ // Push the remaining data from both vectors onto the resultant
+ while (left_it < left.size()) {
+ result.push_back(left[left_it]);
+ left_it++;
+ }
+
+ while (right_it < right.size()) {
+ result.push_back(right[right_it]);
+ right_it++;
+ }
+
+ return result;
+}
+
+std::shared_ptr<MergeOperator> MergeOperators::CreateSortOperator() {
+ return std::make_shared<SortList>();
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/merge_operators/sortlist.h b/src/rocksdb/utilities/merge_operators/sortlist.h
new file mode 100644
index 000000000..eaa4e76fb
--- /dev/null
+++ b/src/rocksdb/utilities/merge_operators/sortlist.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+// A MergeOperator for RocksDB that implements merge sort.
+// It is built using the MergeOperator interface. The operator takes one or
+// more merge operands, each a comma-separated list of sorted ints, and merges
+// them into a single large sorted list.
+#pragma once
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SortList : public MergeOperator {
+ public:
+ bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override;
+
+ bool PartialMerge(const Slice& /*key*/, const Slice& left_operand,
+ const Slice& right_operand, std::string* new_value,
+ Logger* /*logger*/) const override;
+
+ bool PartialMergeMulti(const Slice& key,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value, Logger* logger) const override;
+
+ static const char* kClassName() { return "MergeSortOperator"; }
+ static const char* kNickName() { return "sortlist"; }
+
+ const char* Name() const override { return kClassName(); }
+ const char* NickName() const override { return kNickName(); }
+
+ void MakeVector(std::vector<int>& operand, Slice slice) const;
+
+ private:
+ std::vector<int> Merge(std::vector<int>& left, std::vector<int>& right) const;
+};
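+
+// Usage sketch (illustrative only, not part of the upstream patch): with
+//   options.merge_operator = MergeOperators::CreateSortOperator();
+// merging the operands "5,10" and "1,7" onto a key yields "1,5,7,10" on the
+// next Get, since each operand is itself a sorted, comma-separated int list.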
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/merge_operators/string_append/stringappend.cc b/src/rocksdb/utilities/merge_operators/string_append/stringappend.cc
new file mode 100644
index 000000000..5092cabcb
--- /dev/null
+++ b/src/rocksdb/utilities/merge_operators/string_append/stringappend.cc
@@ -0,0 +1,78 @@
+/**
+ * A MergeOperator for rocksdb that implements string append.
+ * @author Deon Nicholas (dnicholas@fb.com)
+ * Copyright 2013 Facebook
+ */
+
+#include "stringappend.h"
+
+#include <assert.h>
+
+#include <memory>
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/options_type.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+static std::unordered_map<std::string, OptionTypeInfo>
+ stringappend_merge_type_info = {
+#ifndef ROCKSDB_LITE
+ {"delimiter",
+ {0, OptionType::kString, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+#endif // ROCKSDB_LITE
+};
+} // namespace
+// Constructor: also specify the delimiter character.
+StringAppendOperator::StringAppendOperator(char delim_char)
+ : delim_(1, delim_char) {
+ RegisterOptions("Delimiter", &delim_, &stringappend_merge_type_info);
+}
+
+StringAppendOperator::StringAppendOperator(const std::string& delim)
+ : delim_(delim) {
+ RegisterOptions("Delimiter", &delim_, &stringappend_merge_type_info);
+}
+
+// Implementation for the merge operation (concatenates two strings)
+bool StringAppendOperator::Merge(const Slice& /*key*/,
+ const Slice* existing_value,
+ const Slice& value, std::string* new_value,
+ Logger* /*logger*/) const {
+ // Clear the *new_value for writing.
+ assert(new_value);
+ new_value->clear();
+
+ if (!existing_value) {
+ // No existing_value. Set *new_value = value
+ new_value->assign(value.data(), value.size());
+ } else {
+ // Generic append (existing_value != null).
+ // Reserve *new_value to correct size, and apply concatenation.
+ new_value->reserve(existing_value->size() + delim_.size() + value.size());
+ new_value->assign(existing_value->data(), existing_value->size());
+ new_value->append(delim_);
+ new_value->append(value.data(), value.size());
+ }
+
+ return true;
+}
+
+std::shared_ptr<MergeOperator> MergeOperators::CreateStringAppendOperator() {
+ return std::make_shared<StringAppendOperator>(',');
+}
+
+std::shared_ptr<MergeOperator> MergeOperators::CreateStringAppendOperator(
+ char delim_char) {
+ return std::make_shared<StringAppendOperator>(delim_char);
+}
+
+std::shared_ptr<MergeOperator> MergeOperators::CreateStringAppendOperator(
+ const std::string& delim) {
+ return std::make_shared<StringAppendOperator>(delim);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/merge_operators/string_append/stringappend.h b/src/rocksdb/utilities/merge_operators/string_append/stringappend.h
new file mode 100644
index 000000000..153532382
--- /dev/null
+++ b/src/rocksdb/utilities/merge_operators/string_append/stringappend.h
@@ -0,0 +1,32 @@
+/**
+ * A MergeOperator for rocksdb that implements string append.
+ * @author Deon Nicholas (dnicholas@fb.com)
+ * Copyright 2013 Facebook
+ */
+
+#pragma once
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class StringAppendOperator : public AssociativeMergeOperator {
+ public:
+ // Constructor: specify delimiter
+ explicit StringAppendOperator(char delim_char);
+ explicit StringAppendOperator(const std::string& delim);
+
+ virtual bool Merge(const Slice& key, const Slice* existing_value,
+ const Slice& value, std::string* new_value,
+ Logger* logger) const override;
+
+ static const char* kClassName() { return "StringAppendOperator"; }
+ static const char* kNickName() { return "stringappend"; }
+ virtual const char* Name() const override { return kClassName(); }
+ virtual const char* NickName() const override { return kNickName(); }
+
+ private:
+ std::string delim_; // The delimiter is inserted between elements
+};
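+
+// Usage sketch (illustrative only, not part of the upstream patch): with
+//   options.merge_operator.reset(new StringAppendOperator(','));
+// merging "v1", "v2" and "v3" onto a previously unset key yields "v1,v2,v3"
+// on the next Get.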
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/merge_operators/string_append/stringappend2.cc b/src/rocksdb/utilities/merge_operators/string_append/stringappend2.cc
new file mode 100644
index 000000000..36cb9ee34
--- /dev/null
+++ b/src/rocksdb/utilities/merge_operators/string_append/stringappend2.cc
@@ -0,0 +1,132 @@
+/**
+ * @author Deon Nicholas (dnicholas@fb.com)
+ * Copyright 2013 Facebook
+ */
+
+#include "stringappend2.h"
+
+#include <assert.h>
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/options_type.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+static std::unordered_map<std::string, OptionTypeInfo>
+ stringappend2_merge_type_info = {
+#ifndef ROCKSDB_LITE
+ {"delimiter",
+ {0, OptionType::kString, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+#endif // ROCKSDB_LITE
+};
+} // namespace
+
+// Constructor: also specify the delimiter character.
+StringAppendTESTOperator::StringAppendTESTOperator(char delim_char)
+ : delim_(1, delim_char) {
+ RegisterOptions("Delimiter", &delim_, &stringappend2_merge_type_info);
+}
+
+StringAppendTESTOperator::StringAppendTESTOperator(const std::string& delim)
+ : delim_(delim) {
+ RegisterOptions("Delimiter", &delim_, &stringappend2_merge_type_info);
+}
+
+// Implementation for the merge operation (concatenates two strings)
+bool StringAppendTESTOperator::FullMergeV2(
+ const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const {
+ // Clear the *new_value for writing.
+ merge_out->new_value.clear();
+
+ if (merge_in.existing_value == nullptr && merge_in.operand_list.size() == 1) {
+ // Only one operand
+ merge_out->existing_operand = merge_in.operand_list.back();
+ return true;
+ }
+
+ // Compute the space needed for the final result.
+ size_t numBytes = 0;
+
+ for (auto it = merge_in.operand_list.begin();
+ it != merge_in.operand_list.end(); ++it) {
+ numBytes += it->size() + delim_.size();
+ }
+
+ // Only print the delimiter after the first entry has been printed
+ bool printDelim = false;
+
+ // Prepend the *existing_value if one exists.
+ if (merge_in.existing_value) {
+ merge_out->new_value.reserve(numBytes + merge_in.existing_value->size());
+ merge_out->new_value.append(merge_in.existing_value->data(),
+ merge_in.existing_value->size());
+ printDelim = true;
+ } else if (numBytes) {
+ // Without an existing (initial) value, no delimiter is needed before the
+ // first operand, so reserve slightly less space.
+ merge_out->new_value.reserve(numBytes - delim_.size());
+ }
+
+ // Concatenate the sequence of strings (and add a delimiter between each)
+ for (auto it = merge_in.operand_list.begin();
+ it != merge_in.operand_list.end(); ++it) {
+ if (printDelim) {
+ merge_out->new_value.append(delim_);
+ }
+ merge_out->new_value.append(it->data(), it->size());
+ printDelim = true;
+ }
+
+ return true;
+}
+
+bool StringAppendTESTOperator::PartialMergeMulti(
+ const Slice& /*key*/, const std::deque<Slice>& /*operand_list*/,
+ std::string* /*new_value*/, Logger* /*logger*/) const {
+ return false;
+}
+
+// A version of PartialMerge that actually performs "partial merging".
+// Use this to simulate the exact behaviour of the StringAppendOperator.
+bool StringAppendTESTOperator::_AssocPartialMergeMulti(
+ const Slice& /*key*/, const std::deque<Slice>& operand_list,
+ std::string* new_value, Logger* /*logger*/) const {
+ // Clear the *new_value for writing
+ assert(new_value);
+ new_value->clear();
+ assert(operand_list.size() >= 2);
+
+ // Generic append
+ // Determine and reserve correct size for *new_value.
+ size_t size = 0;
+ for (const auto& operand : operand_list) {
+ size += operand.size();
+ }
+ size += (operand_list.size() - 1) * delim_.length(); // Delimiters
+ new_value->reserve(size);
+
+ // Apply concatenation
+ new_value->assign(operand_list.front().data(), operand_list.front().size());
+
+ for (std::deque<Slice>::const_iterator it = operand_list.begin() + 1;
+ it != operand_list.end(); ++it) {
+ new_value->append(delim_);
+ new_value->append(it->data(), it->size());
+ }
+
+ return true;
+}
+
+std::shared_ptr<MergeOperator>
+MergeOperators::CreateStringAppendTESTOperator() {
+ return std::make_shared<StringAppendTESTOperator>(',');
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/merge_operators/string_append/stringappend2.h b/src/rocksdb/utilities/merge_operators/string_append/stringappend2.h
new file mode 100644
index 000000000..75389e4ae
--- /dev/null
+++ b/src/rocksdb/utilities/merge_operators/string_append/stringappend2.h
@@ -0,0 +1,52 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+/**
+ * A TEST MergeOperator for rocksdb that implements string append.
+ * It is built using the MergeOperator interface rather than the simpler
+ * AssociativeMergeOperator interface. This is useful for testing/benchmarking.
+ * While the two operators are semantically the same, all production code
+ * should use the StringAppendOperator defined in stringappend.{h,cc}. The
+ * operator defined in the present file is primarily for testing.
+ *
+ * @author Deon Nicholas (dnicholas@fb.com)
+ * Copyright 2013 Facebook
+ */
+
+#pragma once
+#include <deque>
+#include <string>
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class StringAppendTESTOperator : public MergeOperator {
+ public:
+ // Constructor with delimiter
+ explicit StringAppendTESTOperator(char delim_char);
+ explicit StringAppendTESTOperator(const std::string& delim);
+
+ virtual bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override;
+
+ virtual bool PartialMergeMulti(const Slice& key,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value,
+ Logger* logger) const override;
+
+ static const char* kClassName() { return "StringAppendTESTOperator"; }
+ static const char* kNickName() { return "stringappendtest"; }
+ const char* Name() const override { return kClassName(); }
+ const char* NickName() const override { return kNickName(); }
+
+ private:
+ // A version of PartialMerge that actually performs "partial merging".
+ // Use this to simulate the exact behaviour of the StringAppendOperator.
+ bool _AssocPartialMergeMulti(const Slice& key,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value, Logger* logger) const;
+
+ std::string delim_; // The delimiter is inserted between elements
+};
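+
+// Usage sketch (illustrative only, not part of the upstream patch): results
+// match StringAppendOperator; with
+//   options.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
+// merging "v1", "v2" and "v3" onto a previously unset key yields "v1,v2,v3",
+// assembled in FullMergeV2 rather than via associative pairwise merges.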
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc b/src/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc
new file mode 100644
index 000000000..22b6144af
--- /dev/null
+++ b/src/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc
@@ -0,0 +1,640 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+/**
+ * A persistent map: key -> (list of strings), using rocksdb merge.
+ * This file is a test-harness / use-case for the StringAppendOperator.
+ *
+ * @author Deon Nicholas (dnicholas@fb.com)
+ * Copyright 2013 Facebook, Inc.
+ */
+
+#include "utilities/merge_operators/string_append/stringappend.h"
+
+#include <iostream>
+#include <map>
+#include <tuple>
+
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "test_util/testharness.h"
+#include "util/random.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/string_append/stringappend2.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Path to the database on file system
+const std::string kDbName = test::PerThreadDBPath("stringappend_test");
+
+namespace {
+// OpenNormalDb opens a (possibly new) rocksdb database with a
+// StringAppendOperator
+std::shared_ptr<DB> OpenNormalDb(const std::string& delim) {
+ DB* db;
+ Options options;
+ options.create_if_missing = true;
+ MergeOperator* mergeOperator;
+ if (delim.size() == 1) {
+ mergeOperator = new StringAppendOperator(delim[0]);
+ } else {
+ mergeOperator = new StringAppendOperator(delim);
+ }
+ options.merge_operator.reset(mergeOperator);
+ EXPECT_OK(DB::Open(options, kDbName, &db));
+ return std::shared_ptr<DB>(db);
+}
+
+#ifndef ROCKSDB_LITE // TtlDb is not supported in Lite
+// Open a TtlDB with a non-associative StringAppendTESTOperator
+std::shared_ptr<DB> OpenTtlDb(const std::string& delim) {
+ DBWithTTL* db;
+ Options options;
+ options.create_if_missing = true;
+ MergeOperator* mergeOperator;
+ if (delim.size() == 1) {
+ mergeOperator = new StringAppendTESTOperator(delim[0]);
+ } else {
+ mergeOperator = new StringAppendTESTOperator(delim);
+ }
+ options.merge_operator.reset(mergeOperator);
+ EXPECT_OK(DBWithTTL::Open(options, kDbName, &db, 123456));
+ return std::shared_ptr<DB>(db);
+}
+#endif // !ROCKSDB_LITE
+} // namespace
+
+/// StringLists represents a set of string-lists, each with a key-index.
+/// Supports Append(list, string) and Get(list)
+class StringLists {
+ public:
+ // Constructor: specifies the rocksdb db
+ /* implicit */
+ StringLists(std::shared_ptr<DB> db)
+ : db_(db), merge_option_(), get_option_() {
+ assert(db);
+ }
+
+ // Append string val onto the list defined by key; return true on success
+ bool Append(const std::string& key, const std::string& val) {
+ Slice valSlice(val.data(), val.size());
+ auto s = db_->Merge(merge_option_, key, valSlice);
+
+ if (s.ok()) {
+ return true;
+ } else {
+ std::cerr << "ERROR " << s.ToString() << std::endl;
+ return false;
+ }
+ }
+
+ // Returns the list of strings associated with key (or "" if the key does
+ // not exist)
+ bool Get(const std::string& key, std::string* const result) {
+ assert(result != nullptr); // we should have a place to store the result
+ auto s = db_->Get(get_option_, key, result);
+
+ if (s.ok()) {
+ return true;
+ }
+
+ // Either key does not exist, or there is some error.
+ *result = ""; // Always return empty string (just for convention)
+
+ // NotFound is okay; just return empty (similar to std::map)
+ // But network or db errors, etc, should fail the test (or at least yell)
+ if (!s.IsNotFound()) {
+ std::cerr << "ERROR " << s.ToString() << std::endl;
+ }
+
+ // Always return false if s.ok() was not true
+ return false;
+ }
+
+ private:
+ std::shared_ptr<DB> db_;
+ WriteOptions merge_option_;
+ ReadOptions get_option_;
+};
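+
+// Usage sketch (illustrative only, not part of the upstream patch; uses only
+// the helpers defined in this file):
+//   StringLists slists(OpenNormalDb(","));
+//   slists.Append("k", "a");
+//   slists.Append("k", "b");
+//   std::string out;
+//   slists.Get("k", &out);  // out == "a,b"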
+
+// The class for unit-testing
+class StringAppendOperatorTest : public testing::Test,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ StringAppendOperatorTest() {
+ EXPECT_OK(
+ DestroyDB(kDbName, Options())); // Start each test with a fresh DB
+ }
+
+ void SetUp() override {
+#ifndef ROCKSDB_LITE // TtlDb is not supported in Lite
+ bool if_use_ttl = GetParam();
+ if (if_use_ttl) {
+ fprintf(stderr, "Running tests with ttl db and generic operator.\n");
+ StringAppendOperatorTest::SetOpenDbFunction(&OpenTtlDb);
+ return;
+ }
+#endif // !ROCKSDB_LITE
+ fprintf(stderr, "Running tests with regular db and operator.\n");
+ StringAppendOperatorTest::SetOpenDbFunction(&OpenNormalDb);
+ }
+
+ using OpenFuncPtr = std::shared_ptr<DB> (*)(const std::string&);
+
+ // Allows user to open databases with different configurations.
+ // e.g.: Can open a DB or a TtlDB, etc.
+ static void SetOpenDbFunction(OpenFuncPtr func) { OpenDb = func; }
+
+ protected:
+ static OpenFuncPtr OpenDb;
+};
+StringAppendOperatorTest::OpenFuncPtr StringAppendOperatorTest::OpenDb =
+ nullptr;
+
+// THE TEST CASES BEGIN HERE
+
+TEST_P(StringAppendOperatorTest, IteratorTest) {
+ auto db_ = OpenDb(",");
+ StringLists slists(db_);
+
+ slists.Append("k1", "v1");
+ slists.Append("k1", "v2");
+ slists.Append("k1", "v3");
+
+ slists.Append("k2", "a1");
+ slists.Append("k2", "a2");
+ slists.Append("k2", "a3");
+
+ std::string res;
+ std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it(
+ db_->NewIterator(ReadOptions()));
+ std::string k1("k1");
+ std::string k2("k2");
+ bool first = true;
+ for (it->Seek(k1); it->Valid(); it->Next()) {
+ res = it->value().ToString();
+ if (first) {
+ ASSERT_EQ(res, "v1,v2,v3");
+ first = false;
+ } else {
+ ASSERT_EQ(res, "a1,a2,a3");
+ }
+ }
+ slists.Append("k2", "a4");
+ slists.Append("k1", "v4");
+
+ // Snapshot should still be the same. Should ignore a4 and v4.
+ first = true;
+ for (it->Seek(k1); it->Valid(); it->Next()) {
+ res = it->value().ToString();
+ if (first) {
+ ASSERT_EQ(res, "v1,v2,v3");
+ first = false;
+ } else {
+ ASSERT_EQ(res, "a1,a2,a3");
+ }
+ }
+
+ // Should release the snapshot and be aware of the new stuff now
+ it.reset(db_->NewIterator(ReadOptions()));
+ first = true;
+ for (it->Seek(k1); it->Valid(); it->Next()) {
+ res = it->value().ToString();
+ if (first) {
+ ASSERT_EQ(res, "v1,v2,v3,v4");
+ first = false;
+ } else {
+ ASSERT_EQ(res, "a1,a2,a3,a4");
+ }
+ }
+
+ // start from k2 this time.
+ for (it->Seek(k2); it->Valid(); it->Next()) {
+ res = it->value().ToString();
+ if (first) {
+ ASSERT_EQ(res, "v1,v2,v3,v4");
+ first = false;
+ } else {
+ ASSERT_EQ(res, "a1,a2,a3,a4");
+ }
+ }
+
+ slists.Append("k3", "g1");
+
+ it.reset(db_->NewIterator(ReadOptions()));
+ first = true;
+ std::string k3("k3");
+ for (it->Seek(k2); it->Valid(); it->Next()) {
+ res = it->value().ToString();
+ if (first) {
+ ASSERT_EQ(res, "a1,a2,a3,a4");
+ first = false;
+ } else {
+ ASSERT_EQ(res, "g1");
+ }
+ }
+ for (it->Seek(k3); it->Valid(); it->Next()) {
+ res = it->value().ToString();
+ if (first) {
+ // should not be hit
+ ASSERT_EQ(res, "a1,a2,a3,a4");
+ first = false;
+ } else {
+ ASSERT_EQ(res, "g1");
+ }
+ }
+}
+
+TEST_P(StringAppendOperatorTest, SimpleTest) {
+ auto db = OpenDb(",");
+ StringLists slists(db);
+
+ slists.Append("k1", "v1");
+ slists.Append("k1", "v2");
+ slists.Append("k1", "v3");
+
+ std::string res;
+ ASSERT_TRUE(slists.Get("k1", &res));
+ ASSERT_EQ(res, "v1,v2,v3");
+}
+
+TEST_P(StringAppendOperatorTest, SimpleDelimiterTest) {
+ auto db = OpenDb("|");
+ StringLists slists(db);
+
+ slists.Append("k1", "v1");
+ slists.Append("k1", "v2");
+ slists.Append("k1", "v3");
+
+ std::string res;
+ ASSERT_TRUE(slists.Get("k1", &res));
+ ASSERT_EQ(res, "v1|v2|v3");
+}
+
+TEST_P(StringAppendOperatorTest, EmptyDelimiterTest) {
+ auto db = OpenDb("");
+ StringLists slists(db);
+
+ slists.Append("k1", "v1");
+ slists.Append("k1", "v2");
+ slists.Append("k1", "v3");
+
+ std::string res;
+ ASSERT_TRUE(slists.Get("k1", &res));
+ ASSERT_EQ(res, "v1v2v3");
+}
+
+TEST_P(StringAppendOperatorTest, MultiCharDelimiterTest) {
+ auto db = OpenDb("<>");
+ StringLists slists(db);
+
+ slists.Append("k1", "v1");
+ slists.Append("k1", "v2");
+ slists.Append("k1", "v3");
+
+ std::string res;
+ ASSERT_TRUE(slists.Get("k1", &res));
+ ASSERT_EQ(res, "v1<>v2<>v3");
+}
+
+TEST_P(StringAppendOperatorTest, DelimiterIsDefensivelyCopiedTest) {
+ std::string delimiter = "<>";
+ auto db = OpenDb(delimiter);
+ StringLists slists(db);
+
+ slists.Append("k1", "v1");
+ slists.Append("k1", "v2");
+ delimiter.clear();
+ slists.Append("k1", "v3");
+
+ std::string res;
+ ASSERT_TRUE(slists.Get("k1", &res));
+ ASSERT_EQ(res, "v1<>v2<>v3");
+}
+
+TEST_P(StringAppendOperatorTest, OneValueNoDelimiterTest) {
+ auto db = OpenDb("!");
+ StringLists slists(db);
+
+ slists.Append("random_key", "single_val");
+
+ std::string res;
+ ASSERT_TRUE(slists.Get("random_key", &res));
+ ASSERT_EQ(res, "single_val");
+}
+
+TEST_P(StringAppendOperatorTest, VariousKeys) {
+ auto db = OpenDb("\n");
+ StringLists slists(db);
+
+ slists.Append("c", "asdasd");
+ slists.Append("a", "x");
+ slists.Append("b", "y");
+ slists.Append("a", "t");
+ slists.Append("a", "r");
+ slists.Append("b", "2");
+ slists.Append("c", "asdasd");
+
+ std::string a, b, c;
+ bool sa, sb, sc;
+ sa = slists.Get("a", &a);
+ sb = slists.Get("b", &b);
+ sc = slists.Get("c", &c);
+
+ ASSERT_TRUE(sa && sb && sc); // All three keys should have been found
+
+ ASSERT_EQ(a, "x\nt\nr");
+ ASSERT_EQ(b, "y\n2");
+ ASSERT_EQ(c, "asdasd\nasdasd");
+}
+
+// Generate semi random keys/words from a small distribution.
+TEST_P(StringAppendOperatorTest, RandomMixGetAppend) {
+ auto db = OpenDb(" ");
+ StringLists slists(db);
+
+ // Generate a list of random keys and values
+ const int kWordCount = 15;
+ std::string words[] = {"sdasd", "triejf", "fnjsdfn", "dfjisdfsf",
+ "342839", "dsuha", "mabuais", "sadajsid",
+ "jf9834hf", "2d9j89", "dj9823jd", "a",
+ "dk02ed2dh", "$(jd4h984$(*", "mabz"};
+ const int kKeyCount = 6;
+ std::string keys[] = {"dhaiusdhu", "denidw", "daisda",
+ "keykey", "muki", "shzassdianmd"};
+
+ // Will store a local copy of all data in order to verify correctness
+ std::map<std::string, std::string> parallel_copy;
+
+ // Generate a bunch of random queries (Append and Get)!
+ enum query_t { APPEND_OP, GET_OP, NUM_OPS };
+ Random randomGen(1337); // deterministic seed; always get same results!
+
+ const int kNumQueries = 30;
+ for (int q = 0; q < kNumQueries; ++q) {
+ // Generate a random query (Append or Get) and random parameters
+ query_t query = (query_t)randomGen.Uniform((int)NUM_OPS);
+ std::string key = keys[randomGen.Uniform((int)kKeyCount)];
+ std::string word = words[randomGen.Uniform((int)kWordCount)];
+
+ // Apply the query and any checks.
+ if (query == APPEND_OP) {
+ // Apply the rocksdb test-harness Append defined above
+ slists.Append(key, word); // apply the rocksdb append
+
+ // Apply the similar "Append" to the parallel copy
+ if (parallel_copy[key].size() > 0) {
+ parallel_copy[key] += " " + word;
+ } else {
+ parallel_copy[key] = word;
+ }
+
+ } else if (query == GET_OP) {
+ // Assumes that a non-existent key just returns <empty>
+ std::string res;
+ slists.Get(key, &res);
+ ASSERT_EQ(res, parallel_copy[key]);
+ }
+ }
+}
+
+TEST_P(StringAppendOperatorTest, BIGRandomMixGetAppend) {
+ auto db = OpenDb(" ");
+ StringLists slists(db);
+
+ // Generate a list of random keys and values
+ const int kWordCount = 15;
+ std::string words[] = {"sdasd", "triejf", "fnjsdfn", "dfjisdfsf",
+ "342839", "dsuha", "mabuais", "sadajsid",
+ "jf9834hf", "2d9j89", "dj9823jd", "a",
+ "dk02ed2dh", "$(jd4h984$(*", "mabz"};
+ const int kKeyCount = 6;
+ std::string keys[] = {"dhaiusdhu", "denidw", "daisda",
+ "keykey", "muki", "shzassdianmd"};
+
+ // Will store a local copy of all data in order to verify correctness
+ std::map<std::string, std::string> parallel_copy;
+
+ // Generate a bunch of random queries (Append and Get)!
+ enum query_t { APPEND_OP, GET_OP, NUM_OPS };
+ Random randomGen(9138204); // deterministic seed
+
+ const int kNumQueries = 1000;
+ for (int q = 0; q < kNumQueries; ++q) {
+ // Generate a random query (Append or Get) and random parameters
+ query_t query = (query_t)randomGen.Uniform((int)NUM_OPS);
+ std::string key = keys[randomGen.Uniform((int)kKeyCount)];
+ std::string word = words[randomGen.Uniform((int)kWordCount)];
+
+ // Apply the query and any checks.
+ if (query == APPEND_OP) {
+ // Apply the rocksdb test-harness Append defined above
+ slists.Append(key, word); // apply the rocksdb append
+
+ // Apply the similar "Append" to the parallel copy
+ if (parallel_copy[key].size() > 0) {
+ parallel_copy[key] += " " + word;
+ } else {
+ parallel_copy[key] = word;
+ }
+
+ } else if (query == GET_OP) {
+ // Assumes that a non-existent key just returns <empty>
+ std::string res;
+ slists.Get(key, &res);
+ ASSERT_EQ(res, parallel_copy[key]);
+ }
+ }
+}
+
+TEST_P(StringAppendOperatorTest, PersistentVariousKeys) {
+ // Perform the following operations in limited scope
+ {
+ auto db = OpenDb("\n");
+ StringLists slists(db);
+
+ slists.Append("c", "asdasd");
+ slists.Append("a", "x");
+ slists.Append("b", "y");
+ slists.Append("a", "t");
+ slists.Append("a", "r");
+ slists.Append("b", "2");
+ slists.Append("c", "asdasd");
+
+ std::string a, b, c;
+ ASSERT_TRUE(slists.Get("a", &a));
+ ASSERT_TRUE(slists.Get("b", &b));
+ ASSERT_TRUE(slists.Get("c", &c));
+
+ ASSERT_EQ(a, "x\nt\nr");
+ ASSERT_EQ(b, "y\n2");
+ ASSERT_EQ(c, "asdasd\nasdasd");
+ }
+
+ // Reopen the database (the previous changes should persist / be remembered)
+ {
+ auto db = OpenDb("\n");
+ StringLists slists(db);
+
+ slists.Append("c", "bbnagnagsx");
+ slists.Append("a", "sa");
+ slists.Append("b", "df");
+ slists.Append("a", "gh");
+ slists.Append("a", "jk");
+ slists.Append("b", "l;");
+ slists.Append("c", "rogosh");
+
+ // The previous changes should be on disk (L0)
+ // The most recent changes should be in memory (MemTable)
+ // Hence, this will test both Get() paths.
+ std::string a, b, c;
+ ASSERT_TRUE(slists.Get("a", &a));
+ ASSERT_TRUE(slists.Get("b", &b));
+ ASSERT_TRUE(slists.Get("c", &c));
+
+ ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk");
+ ASSERT_EQ(b, "y\n2\ndf\nl;");
+ ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh");
+ }
+
+ // Reopen the database (the previous changes should persist / be remembered)
+ {
+ auto db = OpenDb("\n");
+ StringLists slists(db);
+
+ // All changes should be on disk. This will test VersionSet Get()
+ std::string a, b, c;
+ ASSERT_TRUE(slists.Get("a", &a));
+ ASSERT_TRUE(slists.Get("b", &b));
+ ASSERT_TRUE(slists.Get("c", &c));
+
+ ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk");
+ ASSERT_EQ(b, "y\n2\ndf\nl;");
+ ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh");
+ }
+}
+
+TEST_P(StringAppendOperatorTest, PersistentFlushAndCompaction) {
+ // Perform the following operations in limited scope
+ {
+ auto db = OpenDb("\n");
+ StringLists slists(db);
+ std::string a, b, c;
+
+ // Append, Flush, Get
+ slists.Append("c", "asdasd");
+ ASSERT_OK(db->Flush(ROCKSDB_NAMESPACE::FlushOptions()));
+ ASSERT_TRUE(slists.Get("c", &c));
+ ASSERT_EQ(c, "asdasd");
+
+ // Append, Flush, Append, Get
+ slists.Append("a", "x");
+ slists.Append("b", "y");
+ ASSERT_OK(db->Flush(ROCKSDB_NAMESPACE::FlushOptions()));
+ slists.Append("a", "t");
+ slists.Append("a", "r");
+ slists.Append("b", "2");
+
+ ASSERT_TRUE(slists.Get("a", &a));
+ ASSERT_EQ(a, "x\nt\nr");
+
+ ASSERT_TRUE(slists.Get("b", &b));
+ ASSERT_EQ(b, "y\n2");
+
+ // Append, Get
+ ASSERT_TRUE(slists.Append("c", "asdasd"));
+ ASSERT_TRUE(slists.Append("b", "monkey"));
+
+ ASSERT_TRUE(slists.Get("a", &a));
+ ASSERT_TRUE(slists.Get("b", &b));
+ ASSERT_TRUE(slists.Get("c", &c));
+
+ ASSERT_EQ(a, "x\nt\nr");
+ ASSERT_EQ(b, "y\n2\nmonkey");
+ ASSERT_EQ(c, "asdasd\nasdasd");
+ }
+
+ // Reopen the database (the previous changes should persist / be remembered)
+ {
+ auto db = OpenDb("\n");
+ StringLists slists(db);
+ std::string a, b, c;
+
+ // Get (Quick check for persistence of previous database)
+ ASSERT_TRUE(slists.Get("a", &a));
+ ASSERT_EQ(a, "x\nt\nr");
+
+ // Append, Compact, Get
+ slists.Append("c", "bbnagnagsx");
+ slists.Append("a", "sa");
+ slists.Append("b", "df");
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_TRUE(slists.Get("a", &a));
+ ASSERT_TRUE(slists.Get("b", &b));
+ ASSERT_TRUE(slists.Get("c", &c));
+ ASSERT_EQ(a, "x\nt\nr\nsa");
+ ASSERT_EQ(b, "y\n2\nmonkey\ndf");
+ ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx");
+
+ // Append, Get
+ slists.Append("a", "gh");
+ slists.Append("a", "jk");
+ slists.Append("b", "l;");
+ slists.Append("c", "rogosh");
+ ASSERT_TRUE(slists.Get("a", &a));
+ ASSERT_TRUE(slists.Get("b", &b));
+ ASSERT_TRUE(slists.Get("c", &c));
+ ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk");
+ ASSERT_EQ(b, "y\n2\nmonkey\ndf\nl;");
+ ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh");
+
+ // Compact, Get
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk");
+ ASSERT_EQ(b, "y\n2\nmonkey\ndf\nl;");
+ ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh");
+
+ // Append, Flush, Compact, Get
+ slists.Append("b", "afcg");
+ ASSERT_OK(db->Flush(ROCKSDB_NAMESPACE::FlushOptions()));
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_TRUE(slists.Get("b", &b));
+ ASSERT_EQ(b, "y\n2\nmonkey\ndf\nl;\nafcg");
+ }
+}
+
+TEST_P(StringAppendOperatorTest, SimpleTestNullDelimiter) {
+ auto db = OpenDb(std::string(1, '\0'));
+ StringLists slists(db);
+
+ slists.Append("k1", "v1");
+ slists.Append("k1", "v2");
+ slists.Append("k1", "v3");
+
+ std::string res;
+ ASSERT_TRUE(slists.Get("k1", &res));
+
+ // Construct the desired string. Default constructor doesn't like '\0' chars.
+ std::string checker("v1,v2,v3"); // Verify that the string is right size.
+ checker[2] = '\0'; // Use null delimiter instead of comma.
+ checker[5] = '\0';
+ ASSERT_EQ(checker.size(), 8); // Verify it is still the correct size
+
+ // Check that the rocksdb result string matches the desired string
+ ASSERT_EQ(res.size(), checker.size());
+ ASSERT_EQ(res, checker);
+}
+
+INSTANTIATE_TEST_CASE_P(StringAppendOperatorTest, StringAppendOperatorTest,
+ testing::Bool());
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/utilities/merge_operators/uint64add.cc b/src/rocksdb/utilities/merge_operators/uint64add.cc
new file mode 100644
index 000000000..5be2f5641
--- /dev/null
+++ b/src/rocksdb/utilities/merge_operators/uint64add.cc
@@ -0,0 +1,75 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <memory>
+
+#include "logging/logging.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "util/coding.h"
+#include "utilities/merge_operators.h"
+
+namespace { // anonymous namespace
+
+using ROCKSDB_NAMESPACE::AssociativeMergeOperator;
+using ROCKSDB_NAMESPACE::InfoLogLevel;
+using ROCKSDB_NAMESPACE::Logger;
+using ROCKSDB_NAMESPACE::Slice;
+
+// A 'model' merge operator with uint64 addition semantics.
+// Implemented as an AssociativeMergeOperator for simplicity and as an example.
+class UInt64AddOperator : public AssociativeMergeOperator {
+ public:
+ bool Merge(const Slice& /*key*/, const Slice* existing_value,
+ const Slice& value, std::string* new_value,
+ Logger* logger) const override {
+ uint64_t orig_value = 0;
+ if (existing_value) {
+ orig_value = DecodeInteger(*existing_value, logger);
+ }
+ uint64_t operand = DecodeInteger(value, logger);
+
+ assert(new_value);
+ new_value->clear();
+ ROCKSDB_NAMESPACE::PutFixed64(new_value, orig_value + operand);
+
+ return true; // Return true always since corruption will be treated as 0
+ }
+
+ static const char* kClassName() { return "UInt64AddOperator"; }
+ static const char* kNickName() { return "uint64add"; }
+ const char* Name() const override { return kClassName(); }
+ const char* NickName() const override { return kNickName(); }
+
+ private:
+ // Takes the string and decodes it into a uint64_t
+ // On error, prints a message and returns 0
+ uint64_t DecodeInteger(const Slice& value, Logger* logger) const {
+ uint64_t result = 0;
+
+ if (value.size() == sizeof(uint64_t)) {
+ result = ROCKSDB_NAMESPACE::DecodeFixed64(value.data());
+ } else if (logger != nullptr) {
+ // If value is corrupted, treat it as 0
+ ROCKS_LOG_ERROR(logger,
+ "uint64 value corruption, size: %" ROCKSDB_PRIszt
+ " > %" ROCKSDB_PRIszt,
+ value.size(), sizeof(uint64_t));
+ }
+
+ return result;
+ }
+};
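+
+// Usage sketch (illustrative only, not part of the upstream patch): with
+//   options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+// a counter can be bumped without a read-modify-write cycle:
+//
+//   std::string delta;
+//   ROCKSDB_NAMESPACE::PutFixed64(&delta, 1);  // fixed 8-byte operand
+//   db->Merge(WriteOptions(), "hits", delta);
+//
+// A later Get("hits") returns the running total, fixed64-encoded.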
+
+} // anonymous namespace
+
+namespace ROCKSDB_NAMESPACE {
+
+std::shared_ptr<MergeOperator> MergeOperators::CreateUInt64AddOperator() {
+ return std::make_shared<UInt64AddOperator>();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/object_registry.cc b/src/rocksdb/utilities/object_registry.cc
new file mode 100644
index 000000000..18834783d
--- /dev/null
+++ b/src/rocksdb/utilities/object_registry.cc
@@ -0,0 +1,383 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/utilities/object_registry.h"
+
+#include <ctype.h>
+
+#include "logging/logging.h"
+#include "port/lang.h"
+#include "rocksdb/customizable.h"
+#include "rocksdb/env.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+#ifndef ROCKSDB_LITE
+namespace {
+bool MatchesInteger(const std::string &target, size_t start, size_t pos) {
+ // If it is numeric, everything up to the match must be a number
+ int digits = 0;
+ if (target[start] == '-') {
+ start++; // Allow negative numbers
+ }
+ while (start < pos) {
+ if (!isdigit(target[start++])) {
+ return false;
+ } else {
+ digits++;
+ }
+ }
+ return (digits > 0);
+}
+
+bool MatchesDecimal(const std::string &target, size_t start, size_t pos) {
+ int digits = 0;
+ if (target[start] == '-') {
+ start++; // Allow negative numbers
+ }
+ for (bool point = false; start < pos; start++) {
+ if (target[start] == '.') {
+ if (point) {
+ return false;
+ } else {
+ point = true;
+ }
+ } else if (!isdigit(target[start])) {
+ return false;
+ } else {
+ digits++;
+ }
+ }
+ return (digits > 0);
+}
+} // namespace
+
+size_t ObjectLibrary::PatternEntry::MatchSeparatorAt(
+ size_t start, Quantifier mode, const std::string &target, size_t tlen,
+ const std::string &separator) const {
+ size_t slen = separator.size();
+ // See if there is enough space. If so, find the separator
+ if (tlen < start + slen) {
+ return std::string::npos; // not enough space left
+ } else if (mode == kMatchExact) {
+ // Exact mode means the next thing we are looking for is the separator
+ if (target.compare(start, slen, separator) != 0) {
+ return std::string::npos;
+ } else {
+ return start + slen; // Found the separator, return where we found it
+ }
+ } else {
+ auto pos = start + 1;
+ if (!separator.empty()) {
+ pos = target.find(separator, pos);
+ }
+ if (pos == std::string::npos) {
+ return pos;
+ } else if (mode == kMatchInteger) {
+ if (!MatchesInteger(target, start, pos)) {
+ return std::string::npos;
+ }
+ } else if (mode == kMatchDecimal) {
+ if (!MatchesDecimal(target, start, pos)) {
+ return std::string::npos;
+ }
+ }
+ return pos + slen;
+ }
+}
+
+bool ObjectLibrary::PatternEntry::MatchesTarget(const std::string &name,
+ size_t nlen,
+ const std::string &target,
+ size_t tlen) const {
+ if (separators_.empty()) {
+ assert(optional_); // If there are no separators, it must be only a name
+ return nlen == tlen && name == target;
+ } else if (nlen == tlen) { // The lengths are the same
+ return optional_ && name == target;
+ } else if (tlen < nlen + slength_) {
+ // The target is not long enough
+ return false;
+ } else if (target.compare(0, nlen, name) != 0) {
+ return false; // Target does not start with name
+ } else {
+ // Loop through all of the separators one at a time matching them.
+ // Note that we first match the separator and then its quantifiers.
+ // Since we expect the separator first, we start with an exact match
+ // Subsequent matches will use the quantifier of the previous separator
+ size_t start = nlen;
+ auto mode = kMatchExact;
+ for (size_t idx = 0; idx < separators_.size(); ++idx) {
+ const auto &separator = separators_[idx];
+ start = MatchSeparatorAt(start, mode, target, tlen, separator.first);
+ if (start == std::string::npos) {
+ return false;
+ } else {
+ mode = separator.second;
+ }
+ }
+ // We have matched all of the separators. Now check that what is left
+ // unmatched in the target is acceptable.
+ if (mode == kMatchExact) {
+ return (start == tlen);
+ } else if (start > tlen || (start == tlen && mode != kMatchZeroOrMore)) {
+ return false;
+ } else if (mode == kMatchInteger) {
+ return MatchesInteger(target, start, tlen);
+ } else if (mode == kMatchDecimal) {
+ return MatchesDecimal(target, start, tlen);
+ }
+ }
+ return true;
+}
+
+bool ObjectLibrary::PatternEntry::Matches(const std::string &target) const {
+ auto tlen = target.size();
+ if (MatchesTarget(name_, nlength_, target, tlen)) {
+ return true;
+ } else if (!names_.empty()) {
+ for (const auto &alt : names_) {
+ if (MatchesTarget(alt, alt.size(), target, tlen)) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
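+
+// Usage sketch (illustrative only, not part of the upstream patch): a
+//   PatternEntry(StringAppendOperator::kClassName())
+//       .AnotherName(StringAppendOperator::kNickName())
+// entry, as registered in merge_operators.cc, matches both
+// "StringAppendOperator" and "stringappend", since Matches() above tries the
+// primary name and every alternate name in turn.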
+
+size_t ObjectLibrary::GetFactoryCount(size_t *types) const {
+ std::unique_lock<std::mutex> lock(mu_);
+ *types = factories_.size();
+ size_t factories = 0;
+ for (const auto &e : factories_) {
+ factories += e.second.size();
+ }
+ return factories;
+}
+
+size_t ObjectLibrary::GetFactoryCount(const std::string &type) const {
+ std::unique_lock<std::mutex> lock(mu_);
+ auto iter = factories_.find(type);
+ if (iter != factories_.end()) {
+ return iter->second.size();
+ } else {
+ return 0;
+ }
+}
+
+void ObjectLibrary::GetFactoryNames(const std::string &type,
+ std::vector<std::string> *names) const {
+ assert(names);
+ std::unique_lock<std::mutex> lock(mu_);
+ auto iter = factories_.find(type);
+ if (iter != factories_.end()) {
+ for (const auto &f : iter->second) {
+ names->push_back(f->Name());
+ }
+ }
+}
+
+void ObjectLibrary::GetFactoryTypes(
+ std::unordered_set<std::string> *types) const {
+ assert(types);
+ std::unique_lock<std::mutex> lock(mu_);
+ for (const auto &iter : factories_) {
+ types->insert(iter.first);
+ }
+}
+
+void ObjectLibrary::Dump(Logger *logger) const {
+ std::unique_lock<std::mutex> lock(mu_);
+ if (logger != nullptr && !factories_.empty()) {
+ ROCKS_LOG_HEADER(logger, " Registered Library: %s\n", id_.c_str());
+ for (const auto &iter : factories_) {
+ ROCKS_LOG_HEADER(logger, " Registered factories for type[%s] ",
+ iter.first.c_str());
+ bool printed_one = false;
+ for (const auto &e : iter.second) {
+ ROCKS_LOG_HEADER(logger, "%c %s", (printed_one) ? ',' : ':', e->Name());
+ printed_one = true;
+ }
+ }
+ }
+}
+
+// Returns the Default singleton instance of the ObjectLibrary
+// This instance will contain most of the "standard" registered objects
+std::shared_ptr<ObjectLibrary> &ObjectLibrary::Default() {
+ // Use STATIC_AVOID_DESTRUCTION here so the default ObjectLibrary is never
+ // statically destroyed and remains long-lived.
+ STATIC_AVOID_DESTRUCTION(std::shared_ptr<ObjectLibrary>, instance)
+ (std::make_shared<ObjectLibrary>("default"));
+ return instance;
+}
+
+ObjectRegistry::ObjectRegistry(const std::shared_ptr<ObjectLibrary> &library) {
+ libraries_.push_back(library);
+ for (const auto &b : builtins_) {
+ RegisterPlugin(b.first, b.second);
+ }
+}
+
+std::shared_ptr<ObjectRegistry> ObjectRegistry::Default() {
+ // Use STATIC_AVOID_DESTRUCTION here so the default ObjectRegistry is never
+ // statically destroyed and remains long-lived.
+ STATIC_AVOID_DESTRUCTION(std::shared_ptr<ObjectRegistry>, instance)
+ (std::make_shared<ObjectRegistry>(ObjectLibrary::Default()));
+ return instance;
+}
+
+std::shared_ptr<ObjectRegistry> ObjectRegistry::NewInstance() {
+ return std::make_shared<ObjectRegistry>(Default());
+}
+
+std::shared_ptr<ObjectRegistry> ObjectRegistry::NewInstance(
+ const std::shared_ptr<ObjectRegistry> &parent) {
+ return std::make_shared<ObjectRegistry>(parent);
+}
+
+Status ObjectRegistry::SetManagedObject(
+ const std::string &type, const std::string &id,
+ const std::shared_ptr<Customizable> &object) {
+ std::string object_key = ToManagedObjectKey(type, id);
+ std::shared_ptr<Customizable> curr;
+ if (parent_ != nullptr) {
+ curr = parent_->GetManagedObject(type, id);
+ }
+ if (curr == nullptr) {
+ // We did not find the object in any parent. Update it in the current registry
+ std::unique_lock<std::mutex> lock(objects_mutex_);
+ auto iter = managed_objects_.find(object_key);
+ if (iter != managed_objects_.end()) { // The object exists
+ curr = iter->second.lock();
+ if (curr != nullptr && curr != object) {
+ return Status::InvalidArgument("Object already exists: ", object_key);
+ } else {
+ iter->second = object;
+ }
+ } else {
+ // The object does not exist. Add it
+ managed_objects_[object_key] = object;
+ }
+ } else if (curr != object) {
+ return Status::InvalidArgument("Object already exists: ", object_key);
+ }
+ return Status::OK();
+}
+
+std::shared_ptr<Customizable> ObjectRegistry::GetManagedObject(
+ const std::string &type, const std::string &id) const {
+ {
+ std::unique_lock<std::mutex> lock(objects_mutex_);
+ auto iter = managed_objects_.find(ToManagedObjectKey(type, id));
+ if (iter != managed_objects_.end()) {
+ return iter->second.lock();
+ }
+ }
+ if (parent_ != nullptr) {
+ return parent_->GetManagedObject(type, id);
+ } else {
+ return nullptr;
+ }
+}
+
+Status ObjectRegistry::ListManagedObjects(
+ const std::string &type, const std::string &name,
+ std::vector<std::shared_ptr<Customizable>> *results) const {
+ {
+ std::string key = ToManagedObjectKey(type, name);
+ std::unique_lock<std::mutex> lock(objects_mutex_);
+ for (auto iter = managed_objects_.lower_bound(key);
+ iter != managed_objects_.end() && StartsWith(iter->first, key);
+ ++iter) {
+ auto shared = iter->second.lock();
+ if (shared != nullptr) {
+ if (name.empty() || shared->IsInstanceOf(name)) {
+ results->emplace_back(shared);
+ }
+ }
+ }
+ }
+ if (parent_ != nullptr) {
+ return parent_->ListManagedObjects(type, name, results);
+ } else {
+ return Status::OK();
+ }
+}
+
+// Returns the total number of factories registered for the given type,
+// including those registered in the parent registry (if any) and in all of
+// this registry's libraries.
+size_t ObjectRegistry::GetFactoryCount(const std::string &type) const {
+ size_t count = 0;
+ if (parent_ != nullptr) {
+ count = parent_->GetFactoryCount(type);
+ }
+ std::unique_lock<std::mutex> lock(library_mutex_);
+ for (const auto &library : libraries_) {
+ count += library->GetFactoryCount(type);
+ }
+ return count;
+}
+
+void ObjectRegistry::GetFactoryNames(const std::string &type,
+ std::vector<std::string> *names) const {
+ assert(names);
+ names->clear();
+ if (parent_ != nullptr) {
+ parent_->GetFactoryNames(type, names);
+ }
+ std::unique_lock<std::mutex> lock(library_mutex_);
+ for (const auto &library : libraries_) {
+ library->GetFactoryNames(type, names);
+ }
+}
+
+void ObjectRegistry::GetFactoryTypes(
+ std::unordered_set<std::string> *types) const {
+ assert(types);
+ if (parent_ != nullptr) {
+ parent_->GetFactoryTypes(types);
+ }
+ std::unique_lock<std::mutex> lock(library_mutex_);
+ for (const auto &library : libraries_) {
+ library->GetFactoryTypes(types);
+ }
+}
+
+void ObjectRegistry::Dump(Logger *logger) const {
+ if (logger != nullptr) {
+ std::unique_lock<std::mutex> lock(library_mutex_);
+ if (!plugins_.empty()) {
+ ROCKS_LOG_HEADER(logger, " Registered Plugins:");
+ bool printed_one = false;
+ for (const auto &plugin : plugins_) {
+ ROCKS_LOG_HEADER(logger, "%s%s", (printed_one) ? ", " : " ",
+ plugin.c_str());
+ printed_one = true;
+ }
+ ROCKS_LOG_HEADER(logger, "\n");
+ }
+ for (auto iter = libraries_.crbegin(); iter != libraries_.crend(); ++iter) {
+ iter->get()->Dump(logger);
+ }
+ }
+ if (parent_ != nullptr) {
+ parent_->Dump(logger);
+ }
+}
+
+int ObjectRegistry::RegisterPlugin(const std::string &name,
+ const RegistrarFunc &func) {
+ if (!name.empty() && func != nullptr) {
+ plugins_.push_back(name);
+ return AddLibrary(name)->Register(func, name);
+ } else {
+ return -1;
+ }
+}
+
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
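For context, here is a minimal sketch of how the pattern and factory machinery above is typically exercised from application code. It assumes only the public rocksdb/utilities/object_registry.h API shown in this patch; the CountingEnv class and the "counting://" URI scheme are made up purely for illustration.

    #include <memory>
    #include <string>

    #include "rocksdb/env.h"
    #include "rocksdb/utilities/object_registry.h"

    using namespace ROCKSDB_NAMESPACE;

    // Hypothetical Env wrapper used only for this example.
    class CountingEnv : public EnvWrapper {
     public:
      explicit CountingEnv(Env* base) : EnvWrapper(base) {}
      const char* Name() const override { return "CountingEnv"; }
    };

    int main() {
      // "counting://<suffix>" matches; a bare "counting" does not (optional=false).
      auto library = std::make_shared<ObjectLibrary>("example");
      library->AddFactory<Env>(
          ObjectLibrary::PatternEntry("counting", false).AddSeparator("://"),
          [](const std::string& /*uri*/, std::unique_ptr<Env>* guard,
             std::string* /*errmsg*/) {
            guard->reset(new CountingEnv(Env::Default()));
            return guard->get();
          });

      auto registry = ObjectRegistry::NewInstance();
      registry->AddLibrary(library);

      std::unique_ptr<Env> env;
      Status s = registry->NewUniqueObject<Env>("counting://test", &env);
      return (s.ok() && env != nullptr) ? 0 : 1;
    }

Because the factory sets the guard, the same registration also works for NewSharedObject; a factory that returns a raw singleton without setting the guard would only satisfy NewStaticObject, as the tests below demonstrate.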
diff --git a/src/rocksdb/utilities/object_registry_test.cc b/src/rocksdb/utilities/object_registry_test.cc
new file mode 100644
index 000000000..90cd155ee
--- /dev/null
+++ b/src/rocksdb/utilities/object_registry_test.cc
@@ -0,0 +1,872 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/object_registry.h"
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/customizable.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ObjRegistryTest : public testing::Test {
+ public:
+ static int num_a, num_b;
+};
+
+int ObjRegistryTest::num_a = 0;
+int ObjRegistryTest::num_b = 0;
+static FactoryFunc<Env> test_reg_a = ObjectLibrary::Default()->AddFactory<Env>(
+ ObjectLibrary::PatternEntry("a", false).AddSeparator("://"),
+ [](const std::string& /*uri*/, std::unique_ptr<Env>* /*env_guard*/,
+ std::string* /* errmsg */) {
+ ++ObjRegistryTest::num_a;
+ return Env::Default();
+ });
+
+class WrappedEnv : public EnvWrapper {
+ private:
+ std::string id_;
+
+ public:
+ WrappedEnv(Env* t, const std::string& id) : EnvWrapper(t), id_(id) {}
+ const char* Name() const override { return id_.c_str(); }
+ std::string GetId() const override { return id_; }
+};
+static FactoryFunc<Env> test_reg_b = ObjectLibrary::Default()->AddFactory<Env>(
+ ObjectLibrary::PatternEntry("b", false).AddSeparator("://"),
+ [](const std::string& uri, std::unique_ptr<Env>* env_guard,
+ std::string* /* errmsg */) {
+ ++ObjRegistryTest::num_b;
+ // Env::Default() is a singleton so we can't grant ownership directly to
+ // the caller - we must wrap it first.
+ env_guard->reset(new WrappedEnv(Env::Default(), uri));
+ return env_guard->get();
+ });
+
+TEST_F(ObjRegistryTest, Basics) {
+ std::string msg;
+ std::unique_ptr<Env> guard;
+ Env* a_env = nullptr;
+
+ auto registry = ObjectRegistry::NewInstance();
+ ASSERT_NOK(registry->NewStaticObject<Env>("c://test", &a_env));
+ ASSERT_NOK(registry->NewUniqueObject<Env>("c://test", &guard));
+ ASSERT_EQ(a_env, nullptr);
+ ASSERT_EQ(guard, nullptr);
+ ASSERT_EQ(0, num_a);
+ ASSERT_EQ(0, num_b);
+
+ ASSERT_OK(registry->NewStaticObject<Env>("a://test", &a_env));
+ ASSERT_NE(a_env, nullptr);
+ ASSERT_EQ(1, num_a);
+ ASSERT_EQ(0, num_b);
+
+ ASSERT_OK(registry->NewUniqueObject<Env>("b://test", &guard));
+ ASSERT_NE(guard, nullptr);
+ ASSERT_EQ(1, num_a);
+ ASSERT_EQ(1, num_b);
+
+ Env* b_env = nullptr;
+ ASSERT_NOK(registry->NewStaticObject<Env>("b://test", &b_env));
+ ASSERT_EQ(b_env, nullptr);
+ ASSERT_EQ(1, num_a);
+ ASSERT_EQ(2, num_b); // Created but rejected as not static
+
+ b_env = a_env;
+ ASSERT_NOK(registry->NewStaticObject<Env>("b://test", &b_env));
+ ASSERT_EQ(b_env, a_env);
+ ASSERT_EQ(1, num_a);
+ ASSERT_EQ(3, num_b);
+
+ b_env = guard.get();
+ ASSERT_NOK(registry->NewUniqueObject<Env>("a://test", &guard));
+ ASSERT_EQ(guard.get(), b_env); // Unchanged
+ ASSERT_EQ(2, num_a); // Created one but rejected it as not unique
+ ASSERT_EQ(3, num_b);
+}
+
+TEST_F(ObjRegistryTest, LocalRegistry) {
+ Env* env = nullptr;
+ auto registry = ObjectRegistry::NewInstance();
+ std::shared_ptr<ObjectLibrary> library =
+ std::make_shared<ObjectLibrary>("local");
+ registry->AddLibrary(library);
+ library->AddFactory<Env>(
+ "test-local",
+ [](const std::string& /*uri*/, std::unique_ptr<Env>* /*guard */,
+ std::string* /* errmsg */) { return Env::Default(); });
+
+ ObjectLibrary::Default()->AddFactory<Env>(
+ "test-global",
+ [](const std::string& /*uri*/, std::unique_ptr<Env>* /*guard */,
+ std::string* /* errmsg */) { return Env::Default(); });
+
+ ASSERT_NOK(
+ ObjectRegistry::NewInstance()->NewStaticObject<Env>("test-local", &env));
+ ASSERT_EQ(env, nullptr);
+ ASSERT_OK(
+ ObjectRegistry::NewInstance()->NewStaticObject<Env>("test-global", &env));
+ ASSERT_NE(env, nullptr);
+ ASSERT_OK(registry->NewStaticObject<Env>("test-local", &env));
+ ASSERT_NE(env, nullptr);
+ ASSERT_OK(registry->NewStaticObject<Env>("test-global", &env));
+ ASSERT_NE(env, nullptr);
+}
+
+static int RegisterTestUnguarded(ObjectLibrary& library,
+ const std::string& /*arg*/) {
+ library.AddFactory<Env>(
+ "unguarded",
+ [](const std::string& /*uri*/, std::unique_ptr<Env>* /*guard */,
+ std::string* /* errmsg */) { return Env::Default(); });
+ library.AddFactory<Env>(
+ "guarded", [](const std::string& uri, std::unique_ptr<Env>* guard,
+ std::string* /* errmsg */) {
+ guard->reset(new WrappedEnv(Env::Default(), uri));
+ return guard->get();
+ });
+ return 2;
+}
+
+TEST_F(ObjRegistryTest, CheckShared) {
+ std::shared_ptr<Env> shared;
+ std::shared_ptr<ObjectRegistry> registry = ObjectRegistry::NewInstance();
+ registry->AddLibrary("shared", RegisterTestUnguarded, "");
+
+ ASSERT_OK(registry->NewSharedObject<Env>("guarded", &shared));
+ ASSERT_NE(shared, nullptr);
+ shared.reset();
+ ASSERT_NOK(registry->NewSharedObject<Env>("unguarded", &shared));
+ ASSERT_EQ(shared, nullptr);
+}
+
+TEST_F(ObjRegistryTest, CheckStatic) {
+ Env* env = nullptr;
+ std::shared_ptr<ObjectRegistry> registry = ObjectRegistry::NewInstance();
+ registry->AddLibrary("static", RegisterTestUnguarded, "");
+
+ ASSERT_NOK(registry->NewStaticObject<Env>("guarded", &env));
+ ASSERT_EQ(env, nullptr);
+ env = nullptr;
+ ASSERT_OK(registry->NewStaticObject<Env>("unguarded", &env));
+ ASSERT_NE(env, nullptr);
+}
+
+TEST_F(ObjRegistryTest, CheckUnique) {
+ std::unique_ptr<Env> unique;
+ std::shared_ptr<ObjectRegistry> registry = ObjectRegistry::NewInstance();
+ registry->AddLibrary("unique", RegisterTestUnguarded, "");
+
+ ASSERT_OK(registry->NewUniqueObject<Env>("guarded", &unique));
+ ASSERT_NE(unique, nullptr);
+ unique.reset();
+ ASSERT_NOK(registry->NewUniqueObject<Env>("unguarded", &unique));
+ ASSERT_EQ(unique, nullptr);
+}
+
+TEST_F(ObjRegistryTest, FailingFactory) {
+ std::shared_ptr<ObjectRegistry> registry = ObjectRegistry::NewInstance();
+ std::shared_ptr<ObjectLibrary> library =
+ std::make_shared<ObjectLibrary>("failing");
+ registry->AddLibrary(library);
+ library->AddFactory<Env>(
+ "failing", [](const std::string& /*uri*/,
+ std::unique_ptr<Env>* /*guard */, std::string* errmsg) {
+ *errmsg = "Bad Factory";
+ return nullptr;
+ });
+ std::unique_ptr<Env> unique;
+ std::shared_ptr<Env> shared;
+ Env* pointer = nullptr;
+ Status s;
+ s = registry->NewUniqueObject<Env>("failing", &unique);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ s = registry->NewSharedObject<Env>("failing", &shared);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ s = registry->NewStaticObject<Env>("failing", &pointer);
+ ASSERT_TRUE(s.IsInvalidArgument());
+
+ s = registry->NewUniqueObject<Env>("missing", &unique);
+ ASSERT_TRUE(s.IsNotSupported());
+ s = registry->NewSharedObject<Env>("missing", &shared);
+ ASSERT_TRUE(s.IsNotSupported());
+ s = registry->NewStaticObject<Env>("missing", &pointer);
+ ASSERT_TRUE(s.IsNotSupported());
+}
+
+TEST_F(ObjRegistryTest, TestRegistryParents) {
+ auto grand = ObjectRegistry::Default();
+ auto parent = ObjectRegistry::NewInstance(); // parent with a grandparent
+ auto uncle = ObjectRegistry::NewInstance(grand);
+ auto child = ObjectRegistry::NewInstance(parent);
+ auto cousin = ObjectRegistry::NewInstance(uncle);
+
+ auto library = parent->AddLibrary("parent");
+ library->AddFactory<Env>(
+ "parent", [](const std::string& uri, std::unique_ptr<Env>* guard,
+ std::string* /* errmsg */) {
+ guard->reset(new WrappedEnv(Env::Default(), uri));
+ return guard->get();
+ });
+ library = cousin->AddLibrary("cousin");
+ library->AddFactory<Env>(
+ "cousin", [](const std::string& uri, std::unique_ptr<Env>* guard,
+ std::string* /* errmsg */) {
+ guard->reset(new WrappedEnv(Env::Default(), uri));
+ return guard->get();
+ });
+
+ Env* env = nullptr;
+ std::unique_ptr<Env> guard;
+ std::string msg;
+
+ // a://* is registered in Default, so they should all work
+ ASSERT_OK(parent->NewStaticObject<Env>("a://test", &env));
+ ASSERT_OK(child->NewStaticObject<Env>("a://test", &env));
+ ASSERT_OK(uncle->NewStaticObject<Env>("a://test", &env));
+ ASSERT_OK(cousin->NewStaticObject<Env>("a://test", &env));
+
+ // The "parent" env is only registered in the parent registry, not the uncle,
+ // so parent and child should succeed while uncle and cousin should fail
+ ASSERT_OK(parent->NewUniqueObject<Env>("parent", &guard));
+ ASSERT_OK(child->NewUniqueObject<Env>("parent", &guard));
+ ASSERT_NOK(uncle->NewUniqueObject<Env>("parent", &guard));
+ ASSERT_NOK(cousin->NewUniqueObject<Env>("parent", &guard));
+
+ // The "cousin" env is only registered in the cousin registry, so all of the
+ // others should fail
+ ASSERT_OK(cousin->NewUniqueObject<Env>("cousin", &guard));
+ ASSERT_NOK(parent->NewUniqueObject<Env>("cousin", &guard));
+ ASSERT_NOK(child->NewUniqueObject<Env>("cousin", &guard));
+ ASSERT_NOK(uncle->NewUniqueObject<Env>("cousin", &guard));
+}
+
+class MyCustomizable : public Customizable {
+ public:
+ static const char* Type() { return "MyCustomizable"; }
+ MyCustomizable(const char* prefix, const std::string& id) : id_(id) {
+ name_ = id_.substr(0, strlen(prefix) - 1);
+ }
+ const char* Name() const override { return name_.c_str(); }
+ std::string GetId() const override { return id_; }
+
+ private:
+ std::string id_;
+ std::string name_;
+};
+
+TEST_F(ObjRegistryTest, TestFactoryCount) {
+ std::string msg;
+ auto grand = ObjectRegistry::Default();
+ auto local = ObjectRegistry::NewInstance();
+ std::unordered_set<std::string> grand_types, local_types;
+ std::vector<std::string> grand_names, local_names;
+
+ // Check how many types we have on startup.
+ // Grand should equal local
+ grand->GetFactoryTypes(&grand_types);
+ local->GetFactoryTypes(&local_types);
+ ASSERT_EQ(grand_types, local_types);
+ size_t grand_count = grand->GetFactoryCount(Env::Type());
+ size_t local_count = local->GetFactoryCount(Env::Type());
+
+ ASSERT_EQ(grand_count, local_count);
+ grand->GetFactoryNames(Env::Type(), &grand_names);
+ local->GetFactoryNames(Env::Type(), &local_names);
+ ASSERT_EQ(grand_names.size(), grand_count);
+ ASSERT_EQ(local_names.size(), local_count);
+ ASSERT_EQ(grand_names, local_names);
+
+ // Add an Env to the local registry.
+ // This will add one factory.
+ auto library = local->AddLibrary("local");
+ library->AddFactory<Env>(
+ "A", [](const std::string& /*uri*/, std::unique_ptr<Env>* /*guard */,
+ std::string* /* errmsg */) { return nullptr; });
+ ASSERT_EQ(local_count + 1, local->GetFactoryCount(Env::Type()));
+ ASSERT_EQ(grand_count, grand->GetFactoryCount(Env::Type()));
+ local->GetFactoryTypes(&local_types);
+ local->GetFactoryNames(Env::Type(), &local_names);
+ ASSERT_EQ(grand_names.size() + 1, local_names.size());
+ ASSERT_EQ(local_names.size(), local->GetFactoryCount(Env::Type()));
+
+ if (grand_count == 0) {
+ // There were no Env when we started. Should have one more type
+ // than previously
+ ASSERT_NE(grand_types, local_types);
+ ASSERT_EQ(grand_types.size() + 1, local_types.size());
+ } else {
+ // There was an Env type when we started. The types should match
+ ASSERT_EQ(grand_types, local_types);
+ }
+
+ // Add a MyCustomizable to the registry. This should be a new type
+ library->AddFactory<MyCustomizable>(
+ "MY", [](const std::string& /*uri*/,
+ std::unique_ptr<MyCustomizable>* /*guard */,
+ std::string* /* errmsg */) { return nullptr; });
+ ASSERT_EQ(local_count + 1, local->GetFactoryCount(Env::Type()));
+ ASSERT_EQ(grand_count, grand->GetFactoryCount(Env::Type()));
+ ASSERT_EQ(0U, grand->GetFactoryCount(MyCustomizable::Type()));
+ ASSERT_EQ(1U, local->GetFactoryCount(MyCustomizable::Type()));
+
+ local->GetFactoryNames(MyCustomizable::Type(), &local_names);
+ ASSERT_EQ(1U, local_names.size());
+ ASSERT_EQ(local_names[0], "MY");
+
+ local->GetFactoryTypes(&local_types);
+ ASSERT_EQ(grand_count == 0 ? 2 : grand_types.size() + 1, local_types.size());
+
+ // Add the same name again. We should now have 2 factories.
+ library->AddFactory<MyCustomizable>(
+ "MY", [](const std::string& /*uri*/,
+ std::unique_ptr<MyCustomizable>* /*guard */,
+ std::string* /* errmsg */) { return nullptr; });
+ local->GetFactoryNames(MyCustomizable::Type(), &local_names);
+ ASSERT_EQ(2U, local_names.size());
+}
+
+TEST_F(ObjRegistryTest, TestManagedObjects) {
+ auto registry = ObjectRegistry::NewInstance();
+ auto m_a1 = std::make_shared<MyCustomizable>("", "A");
+ auto m_a2 = std::make_shared<MyCustomizable>("", "A");
+
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), nullptr);
+ ASSERT_OK(registry->SetManagedObject<MyCustomizable>(m_a1));
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), m_a1);
+
+ ASSERT_NOK(registry->SetManagedObject<MyCustomizable>(m_a2));
+ ASSERT_OK(registry->SetManagedObject<MyCustomizable>(m_a1));
+ m_a1.reset();
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), nullptr);
+ ASSERT_OK(registry->SetManagedObject<MyCustomizable>(m_a2));
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), m_a2);
+}
+
+TEST_F(ObjRegistryTest, TestTwoManagedObjects) {
+ auto registry = ObjectRegistry::NewInstance();
+ auto m_a = std::make_shared<MyCustomizable>("", "A");
+ auto m_b = std::make_shared<MyCustomizable>("", "B");
+ std::vector<std::shared_ptr<MyCustomizable>> objects;
+
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), nullptr);
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("B"), nullptr);
+ ASSERT_OK(registry->ListManagedObjects(&objects));
+ ASSERT_EQ(objects.size(), 0U);
+ ASSERT_OK(registry->SetManagedObject(m_a));
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("B"), nullptr);
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), m_a);
+ ASSERT_OK(registry->ListManagedObjects(&objects));
+ ASSERT_EQ(objects.size(), 1U);
+ ASSERT_EQ(objects.front(), m_a);
+
+ ASSERT_OK(registry->SetManagedObject(m_b));
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), m_a);
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("B"), m_b);
+ ASSERT_OK(registry->ListManagedObjects(&objects));
+ ASSERT_EQ(objects.size(), 2U);
+ ASSERT_OK(registry->ListManagedObjects("A", &objects));
+ ASSERT_EQ(objects.size(), 1U);
+ ASSERT_EQ(objects.front(), m_a);
+ ASSERT_OK(registry->ListManagedObjects("B", &objects));
+ ASSERT_EQ(objects.size(), 1U);
+ ASSERT_EQ(objects.front(), m_b);
+ ASSERT_OK(registry->ListManagedObjects("C", &objects));
+ ASSERT_EQ(objects.size(), 0U);
+
+ m_a.reset();
+ objects.clear();
+
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("B"), m_b);
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), nullptr);
+ ASSERT_OK(registry->ListManagedObjects(&objects));
+ ASSERT_EQ(objects.size(), 1U);
+ ASSERT_EQ(objects.front(), m_b);
+
+ m_b.reset();
+ objects.clear();
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), nullptr);
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("B"), nullptr);
+}
+
+TEST_F(ObjRegistryTest, TestAlternateNames) {
+ auto registry = ObjectRegistry::NewInstance();
+ auto m_a = std::make_shared<MyCustomizable>("", "A");
+ auto m_b = std::make_shared<MyCustomizable>("", "B");
+ std::vector<std::shared_ptr<MyCustomizable>> objects;
+ // Test no objects exist
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), nullptr);
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("B"), nullptr);
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("TheOne"), nullptr);
+ ASSERT_OK(registry->ListManagedObjects(&objects));
+ ASSERT_EQ(objects.size(), 0U);
+
+ // Mark "TheOne" to be A
+ ASSERT_OK(registry->SetManagedObject("TheOne", m_a));
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("B"), nullptr);
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), nullptr);
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("TheOne"), m_a);
+ ASSERT_OK(registry->ListManagedObjects(&objects));
+ ASSERT_EQ(objects.size(), 1U);
+ ASSERT_EQ(objects.front(), m_a);
+
+ // Try to mark "TheOne" again.
+ ASSERT_NOK(registry->SetManagedObject("TheOne", m_b));
+ ASSERT_OK(registry->SetManagedObject("TheOne", m_a));
+
+ // Add "A" as a managed object. Registered 2x
+ ASSERT_OK(registry->SetManagedObject(m_a));
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("B"), nullptr);
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), m_a);
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("TheOne"), m_a);
+ ASSERT_OK(registry->ListManagedObjects(&objects));
+ ASSERT_EQ(objects.size(), 2U);
+
+ // Delete "A".
+ m_a.reset();
+ objects.clear();
+
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("TheOne"), nullptr);
+ ASSERT_OK(registry->SetManagedObject("TheOne", m_b));
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("TheOne"), m_b);
+ ASSERT_OK(registry->ListManagedObjects(&objects));
+ ASSERT_EQ(objects.size(), 1U);
+ ASSERT_EQ(objects.front(), m_b);
+
+ m_b.reset();
+ objects.clear();
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), nullptr);
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), nullptr);
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("TheOne"), nullptr);
+ ASSERT_OK(registry->ListManagedObjects(&objects));
+ ASSERT_EQ(objects.size(), 0U);
+}
+
+TEST_F(ObjRegistryTest, TestTwoManagedClasses) {
+ class MyCustomizable2 : public MyCustomizable {
+ public:
+ static const char* Type() { return "MyCustomizable2"; }
+ MyCustomizable2(const char* prefix, const std::string& id)
+ : MyCustomizable(prefix, id) {}
+ };
+
+ auto registry = ObjectRegistry::NewInstance();
+ auto m_a1 = std::make_shared<MyCustomizable>("", "A");
+ auto m_a2 = std::make_shared<MyCustomizable2>("", "A");
+ std::vector<std::shared_ptr<MyCustomizable>> obj1s;
+ std::vector<std::shared_ptr<MyCustomizable2>> obj2s;
+
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), nullptr);
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable2>("A"), nullptr);
+
+ ASSERT_OK(registry->SetManagedObject(m_a1));
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), m_a1);
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable2>("A"), nullptr);
+
+ ASSERT_OK(registry->SetManagedObject(m_a2));
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable2>("A"), m_a2);
+ ASSERT_OK(registry->ListManagedObjects(&obj1s));
+ ASSERT_OK(registry->ListManagedObjects(&obj2s));
+ ASSERT_EQ(obj1s.size(), 1U);
+ ASSERT_EQ(obj2s.size(), 1U);
+ ASSERT_EQ(obj1s.front(), m_a1);
+ ASSERT_EQ(obj2s.front(), m_a2);
+ m_a1.reset();
+ obj1s.clear();
+ obj2s.clear();
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), nullptr);
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable2>("A"), m_a2);
+
+ m_a2.reset();
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), nullptr);
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable2>("A"), nullptr);
+}
+
+TEST_F(ObjRegistryTest, TestManagedObjectsWithParent) {
+ auto base = ObjectRegistry::NewInstance();
+ auto registry = ObjectRegistry::NewInstance(base);
+
+ auto m_a = std::make_shared<MyCustomizable>("", "A");
+ auto m_b = std::make_shared<MyCustomizable>("", "A");
+
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), nullptr);
+ ASSERT_OK(base->SetManagedObject(m_a));
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), m_a);
+
+ ASSERT_NOK(registry->SetManagedObject(m_b));
+ ASSERT_OK(registry->SetManagedObject(m_a));
+
+ m_a.reset();
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), nullptr);
+ ASSERT_OK(registry->SetManagedObject(m_b));
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), m_b);
+}
+
+TEST_F(ObjRegistryTest, TestGetOrCreateManagedObject) {
+ auto registry = ObjectRegistry::NewInstance();
+ registry->AddLibrary("test")->AddFactory<MyCustomizable>(
+ ObjectLibrary::PatternEntry::AsIndividualId("MC"),
+ [](const std::string& uri, std::unique_ptr<MyCustomizable>* guard,
+ std::string* /* errmsg */) {
+ guard->reset(new MyCustomizable("MC", uri));
+ return guard->get();
+ });
+ std::shared_ptr<MyCustomizable> m_a, m_b, obj;
+ std::vector<std::shared_ptr<MyCustomizable>> objs;
+
+ std::unordered_map<std::string, std::string> opt_map;
+
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("MC@A#1"), nullptr);
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("MC@B#1"), nullptr);
+ ASSERT_OK(registry->GetOrCreateManagedObject("MC@A#1", &m_a));
+ ASSERT_OK(registry->GetOrCreateManagedObject("MC@B#1", &m_b));
+ ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("MC@A#1"), m_a);
+ ASSERT_OK(registry->GetOrCreateManagedObject("MC@A#1", &obj));
+ ASSERT_EQ(obj, m_a);
+ ASSERT_OK(registry->GetOrCreateManagedObject("MC@B#1", &obj));
+ ASSERT_EQ(obj, m_b);
+ ASSERT_OK(registry->ListManagedObjects(&objs));
+ ASSERT_EQ(objs.size(), 2U);
+
+ objs.clear();
+ m_a.reset();
+ obj.reset();
+ ASSERT_OK(registry->GetOrCreateManagedObject("MC@A#1", &m_a));
+ ASSERT_EQ(1, m_a.use_count());
+ ASSERT_OK(registry->GetOrCreateManagedObject("MC@B#1", &obj));
+ ASSERT_EQ(2, obj.use_count());
+}
+
+TEST_F(ObjRegistryTest, RegisterPlugin) {
+ std::shared_ptr<ObjectRegistry> registry = ObjectRegistry::NewInstance();
+ std::unique_ptr<Env> guard;
+ Env* env = nullptr;
+
+ ASSERT_NOK(registry->NewObject<Env>("unguarded", &env, &guard));
+ ASSERT_EQ(registry->RegisterPlugin("Missing", nullptr), -1);
+ ASSERT_EQ(registry->RegisterPlugin("", RegisterTestUnguarded), -1);
+ ASSERT_GT(registry->RegisterPlugin("Valid", RegisterTestUnguarded), 0);
+ ASSERT_OK(registry->NewObject<Env>("unguarded", &env, &guard));
+ ASSERT_NE(env, nullptr);
+}
+
+class PatternEntryTest : public testing::Test {};
+
+TEST_F(PatternEntryTest, TestSimpleEntry) {
+ ObjectLibrary::PatternEntry entry("ABC", true);
+
+ ASSERT_TRUE(entry.Matches("ABC"));
+ ASSERT_FALSE(entry.Matches("AABC"));
+ ASSERT_FALSE(entry.Matches("ABCA"));
+ ASSERT_FALSE(entry.Matches("AABCA"));
+ ASSERT_FALSE(entry.Matches("AB"));
+ ASSERT_FALSE(entry.Matches("BC"));
+ ASSERT_FALSE(entry.Matches("ABD"));
+ ASSERT_FALSE(entry.Matches("BCA"));
+}
+
+TEST_F(PatternEntryTest, TestPatternEntry) {
+ // Matches A:+
+ ObjectLibrary::PatternEntry entry("A", false);
+ entry.AddSeparator(":");
+ ASSERT_FALSE(entry.Matches("A"));
+ ASSERT_FALSE(entry.Matches("AA"));
+ ASSERT_FALSE(entry.Matches("AB"));
+ ASSERT_FALSE(entry.Matches("B"));
+ ASSERT_FALSE(entry.Matches("A:"));
+ ASSERT_FALSE(entry.Matches("AA:"));
+ ASSERT_FALSE(entry.Matches("AA:B"));
+ ASSERT_FALSE(entry.Matches("AA:BB"));
+ ASSERT_TRUE(entry.Matches("A:B"));
+ ASSERT_TRUE(entry.Matches("A:BB"));
+
+ entry.SetOptional(true); // Now matches "A" or "A:+"
+ ASSERT_TRUE(entry.Matches("A"));
+ ASSERT_FALSE(entry.Matches("AA"));
+ ASSERT_FALSE(entry.Matches("AB"));
+ ASSERT_FALSE(entry.Matches("B"));
+ ASSERT_FALSE(entry.Matches("A:"));
+ ASSERT_FALSE(entry.Matches("AA:"));
+ ASSERT_FALSE(entry.Matches("AA:B"));
+ ASSERT_FALSE(entry.Matches("AA:BB"));
+ ASSERT_TRUE(entry.Matches("A:B"));
+ ASSERT_TRUE(entry.Matches("A:BB"));
+}
+
+TEST_F(PatternEntryTest, MatchZeroOrMore) {
+ // Matches A:*
+ ObjectLibrary::PatternEntry entry("A", false);
+ entry.AddSeparator(":", false);
+ ASSERT_FALSE(entry.Matches("A"));
+ ASSERT_FALSE(entry.Matches("AA"));
+ ASSERT_FALSE(entry.Matches("AB"));
+ ASSERT_FALSE(entry.Matches("B"));
+ ASSERT_TRUE(entry.Matches("A:"));
+ ASSERT_FALSE(entry.Matches("B:"));
+ ASSERT_FALSE(entry.Matches("B:A"));
+ ASSERT_FALSE(entry.Matches("AA:"));
+ ASSERT_FALSE(entry.Matches("AA:B"));
+ ASSERT_FALSE(entry.Matches("AA:BB"));
+ ASSERT_TRUE(entry.Matches("A:B"));
+ ASSERT_TRUE(entry.Matches("A:BB"));
+
+ entry.SetOptional(true); // Now matches "A" or "A:*"
+ ASSERT_TRUE(entry.Matches("A"));
+ ASSERT_FALSE(entry.Matches("AA"));
+ ASSERT_FALSE(entry.Matches("AB"));
+ ASSERT_FALSE(entry.Matches("B"));
+ ASSERT_TRUE(entry.Matches("A:"));
+ ASSERT_FALSE(entry.Matches("B:"));
+ ASSERT_FALSE(entry.Matches("B:A"));
+ ASSERT_FALSE(entry.Matches("AA:"));
+ ASSERT_FALSE(entry.Matches("AA:B"));
+ ASSERT_FALSE(entry.Matches("AA:BB"));
+ ASSERT_TRUE(entry.Matches("A:B"));
+ ASSERT_TRUE(entry.Matches("A:BB"));
+}
+
+TEST_F(PatternEntryTest, TestSuffixEntry) {
+ ObjectLibrary::PatternEntry entry("AA", true);
+ entry.AddSuffix("BB");
+
+ ASSERT_TRUE(entry.Matches("AA"));
+ ASSERT_TRUE(entry.Matches("AABB"));
+
+ ASSERT_FALSE(entry.Matches("A"));
+ ASSERT_FALSE(entry.Matches("AB"));
+ ASSERT_FALSE(entry.Matches("B"));
+ ASSERT_FALSE(entry.Matches("BB"));
+ ASSERT_FALSE(entry.Matches("ABA"));
+ ASSERT_FALSE(entry.Matches("BBAA"));
+ ASSERT_FALSE(entry.Matches("AABBA"));
+ ASSERT_FALSE(entry.Matches("AABBB"));
+}
+
+TEST_F(PatternEntryTest, TestNumericEntry) {
+ ObjectLibrary::PatternEntry entry("A", false);
+ entry.AddNumber(":");
+ ASSERT_FALSE(entry.Matches("A"));
+ ASSERT_FALSE(entry.Matches("AA"));
+ ASSERT_FALSE(entry.Matches("A:"));
+ ASSERT_FALSE(entry.Matches("AA:"));
+ ASSERT_TRUE(entry.Matches("A:1"));
+ ASSERT_TRUE(entry.Matches("A:11"));
+ ASSERT_FALSE(entry.Matches("AA:1"));
+ ASSERT_FALSE(entry.Matches("AA:11"));
+ ASSERT_FALSE(entry.Matches("A:B"));
+ ASSERT_FALSE(entry.Matches("A:1B"));
+ ASSERT_FALSE(entry.Matches("A:B1"));
+
+ entry.AddSeparator(":", false);
+ ASSERT_FALSE(entry.Matches("A"));
+ ASSERT_FALSE(entry.Matches("AA"));
+ ASSERT_FALSE(entry.Matches("A:"));
+ ASSERT_FALSE(entry.Matches("AA:"));
+ ASSERT_TRUE(entry.Matches("A:1:"));
+ ASSERT_TRUE(entry.Matches("A:11:"));
+ ASSERT_FALSE(entry.Matches("A:1"));
+ ASSERT_FALSE(entry.Matches("A:B1:"));
+ ASSERT_FALSE(entry.Matches("A:1B:"));
+ ASSERT_FALSE(entry.Matches("A::"));
+}
+
+TEST_F(PatternEntryTest, TestDoubleEntry) {
+ ObjectLibrary::PatternEntry entry("A", false);
+ entry.AddNumber(":", false);
+ ASSERT_FALSE(entry.Matches("A"));
+ ASSERT_FALSE(entry.Matches("AA"));
+ ASSERT_FALSE(entry.Matches("A:"));
+ ASSERT_FALSE(entry.Matches("AA:"));
+ ASSERT_FALSE(entry.Matches("AA:1"));
+ ASSERT_FALSE(entry.Matches("AA:11"));
+ ASSERT_FALSE(entry.Matches("A:B"));
+ ASSERT_FALSE(entry.Matches("A:1B"));
+ ASSERT_FALSE(entry.Matches("A:B1"));
+ ASSERT_TRUE(entry.Matches("A:1"));
+ ASSERT_TRUE(entry.Matches("A:11"));
+ ASSERT_TRUE(entry.Matches("A:1.1"));
+ ASSERT_TRUE(entry.Matches("A:11.11"));
+ ASSERT_TRUE(entry.Matches("A:1."));
+ ASSERT_TRUE(entry.Matches("A:.1"));
+ ASSERT_TRUE(entry.Matches("A:0.1"));
+ ASSERT_TRUE(entry.Matches("A:1.0"));
+ ASSERT_TRUE(entry.Matches("A:1.0"));
+
+ ASSERT_FALSE(entry.Matches("A:1.0."));
+ ASSERT_FALSE(entry.Matches("A:1.0.2"));
+ ASSERT_FALSE(entry.Matches("A:.1.0"));
+ ASSERT_FALSE(entry.Matches("A:..10"));
+ ASSERT_FALSE(entry.Matches("A:10.."));
+ ASSERT_FALSE(entry.Matches("A:."));
+
+ entry.AddSeparator(":", false);
+ ASSERT_FALSE(entry.Matches("A:1"));
+ ASSERT_FALSE(entry.Matches("A:1.0"));
+
+ ASSERT_TRUE(entry.Matches("A:11:"));
+ ASSERT_TRUE(entry.Matches("A:1.1:"));
+ ASSERT_TRUE(entry.Matches("A:11.11:"));
+ ASSERT_TRUE(entry.Matches("A:1.:"));
+ ASSERT_TRUE(entry.Matches("A:.1:"));
+ ASSERT_TRUE(entry.Matches("A:0.1:"));
+ ASSERT_TRUE(entry.Matches("A:1.0:"));
+ ASSERT_TRUE(entry.Matches("A:1.0:"));
+
+ ASSERT_FALSE(entry.Matches("A:1.0.:"));
+ ASSERT_FALSE(entry.Matches("A:1.0.2:"));
+ ASSERT_FALSE(entry.Matches("A:.1.0:"));
+ ASSERT_FALSE(entry.Matches("A:..10:"));
+ ASSERT_FALSE(entry.Matches("A:10..:"));
+ ASSERT_FALSE(entry.Matches("A:.:"));
+ ASSERT_FALSE(entry.Matches("A::"));
+}
+
+TEST_F(PatternEntryTest, TestIndividualIdEntry) {
+ auto entry = ObjectLibrary::PatternEntry::AsIndividualId("AA");
+ ASSERT_TRUE(entry.Matches("AA"));
+ ASSERT_TRUE(entry.Matches("AA@123#456"));
+ ASSERT_TRUE(entry.Matches("AA@deadbeef#id"));
+
+ ASSERT_FALSE(entry.Matches("A"));
+ ASSERT_FALSE(entry.Matches("AAA"));
+ ASSERT_FALSE(entry.Matches("AA@123"));
+ ASSERT_FALSE(entry.Matches("AA@123#"));
+ ASSERT_FALSE(entry.Matches("AA@#123"));
+}
+
+TEST_F(PatternEntryTest, TestTwoNameEntry) {
+ ObjectLibrary::PatternEntry entry("A");
+ entry.AnotherName("B");
+ ASSERT_TRUE(entry.Matches("A"));
+ ASSERT_TRUE(entry.Matches("B"));
+ ASSERT_FALSE(entry.Matches("AA"));
+ ASSERT_FALSE(entry.Matches("BB"));
+ ASSERT_FALSE(entry.Matches("AA"));
+ ASSERT_FALSE(entry.Matches("BA"));
+ ASSERT_FALSE(entry.Matches("AB"));
+}
+
+TEST_F(PatternEntryTest, TestTwoPatternEntry) {
+ ObjectLibrary::PatternEntry entry("AA", false);
+ entry.AddSeparator(":");
+ entry.AddSeparator(":");
+ ASSERT_FALSE(entry.Matches("AA"));
+ ASSERT_FALSE(entry.Matches("AA:"));
+ ASSERT_FALSE(entry.Matches("AA::"));
+ ASSERT_FALSE(entry.Matches("AA::12"));
+ ASSERT_TRUE(entry.Matches("AA:1:2"));
+ ASSERT_TRUE(entry.Matches("AA:1:2:"));
+
+ ObjectLibrary::PatternEntry entry2("AA", false);
+ entry2.AddSeparator("::");
+ entry2.AddSeparator("##");
+ ASSERT_FALSE(entry2.Matches("AA"));
+ ASSERT_FALSE(entry2.Matches("AA:"));
+ ASSERT_FALSE(entry2.Matches("AA::"));
+ ASSERT_FALSE(entry2.Matches("AA::#"));
+ ASSERT_FALSE(entry2.Matches("AA::##"));
+ ASSERT_FALSE(entry2.Matches("AA##1::2"));
+ ASSERT_FALSE(entry2.Matches("AA::123##"));
+ ASSERT_TRUE(entry2.Matches("AA::1##2"));
+ ASSERT_TRUE(entry2.Matches("AA::12##34:"));
+ ASSERT_TRUE(entry2.Matches("AA::12::34##56"));
+ ASSERT_TRUE(entry2.Matches("AA::12##34::56"));
+}
+
+TEST_F(PatternEntryTest, TestTwoNumbersEntry) {
+ ObjectLibrary::PatternEntry entry("AA", false);
+ entry.AddNumber(":");
+ entry.AddNumber(":");
+ ASSERT_FALSE(entry.Matches("AA"));
+ ASSERT_FALSE(entry.Matches("AA:"));
+ ASSERT_FALSE(entry.Matches("AA::"));
+ ASSERT_FALSE(entry.Matches("AA::12"));
+ ASSERT_FALSE(entry.Matches("AA:1:2:"));
+ ASSERT_TRUE(entry.Matches("AA:1:2"));
+ ASSERT_TRUE(entry.Matches("AA:12:23456"));
+
+ ObjectLibrary::PatternEntry entry2("AA", false);
+ entry2.AddNumber(":");
+ entry2.AddNumber("#");
+ ASSERT_FALSE(entry2.Matches("AA"));
+ ASSERT_FALSE(entry2.Matches("AA:"));
+ ASSERT_FALSE(entry2.Matches("AA:#"));
+ ASSERT_FALSE(entry2.Matches("AA#:"));
+ ASSERT_FALSE(entry2.Matches("AA:123#"));
+ ASSERT_FALSE(entry2.Matches("AA:123#B"));
+ ASSERT_FALSE(entry2.Matches("AA:B#123"));
+ ASSERT_TRUE(entry2.Matches("AA:1#2"));
+ ASSERT_FALSE(entry2.Matches("AA:123#23:"));
+ ASSERT_FALSE(entry2.Matches("AA::12#234"));
+}
+
+TEST_F(PatternEntryTest, TestPatternAndSuffix) {
+ ObjectLibrary::PatternEntry entry("AA", false);
+ entry.AddSeparator("::");
+ entry.AddSuffix("##");
+ ASSERT_FALSE(entry.Matches("AA"));
+ ASSERT_FALSE(entry.Matches("AA::"));
+ ASSERT_FALSE(entry.Matches("AA::##"));
+ ASSERT_FALSE(entry.Matches("AB::1##"));
+ ASSERT_FALSE(entry.Matches("AB::1##2"));
+ ASSERT_FALSE(entry.Matches("AA##1::"));
+ ASSERT_TRUE(entry.Matches("AA::1##"));
+ ASSERT_FALSE(entry.Matches("AA::1###"));
+
+ ObjectLibrary::PatternEntry entry2("AA", false);
+ entry2.AddSuffix("::");
+ entry2.AddSeparator("##");
+ ASSERT_FALSE(entry2.Matches("AA"));
+ ASSERT_FALSE(entry2.Matches("AA::"));
+ ASSERT_FALSE(entry2.Matches("AA::##"));
+ ASSERT_FALSE(entry2.Matches("AB::1##"));
+ ASSERT_FALSE(entry2.Matches("AB::1##2"));
+ ASSERT_TRUE(entry2.Matches("AA::##12"));
+}
+
+TEST_F(PatternEntryTest, TestTwoNamesAndPattern) {
+ ObjectLibrary::PatternEntry entry("AA", true);
+ entry.AddSeparator("::");
+ entry.AnotherName("BBB");
+ ASSERT_TRUE(entry.Matches("AA"));
+ ASSERT_TRUE(entry.Matches("AA::1"));
+ ASSERT_TRUE(entry.Matches("BBB"));
+ ASSERT_TRUE(entry.Matches("BBB::2"));
+
+ ASSERT_FALSE(entry.Matches("AA::"));
+ ASSERT_FALSE(entry.Matches("AAA::"));
+ ASSERT_FALSE(entry.Matches("BBB::"));
+
+ entry.SetOptional(false);
+ ASSERT_FALSE(entry.Matches("AA"));
+ ASSERT_FALSE(entry.Matches("BBB"));
+
+ ASSERT_FALSE(entry.Matches("AA::"));
+ ASSERT_FALSE(entry.Matches("AAA::"));
+ ASSERT_FALSE(entry.Matches("BBB::"));
+
+ ASSERT_TRUE(entry.Matches("AA::1"));
+ ASSERT_TRUE(entry.Matches("BBB::2"));
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else // ROCKSDB_LITE
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as ObjRegistry is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
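The RegisterPlugin path exercised in the test above can be sketched from the application side as follows. The plugin name and factory id are hypothetical, and the RegistrarFunc signature mirrors RegisterTestUnguarded in the test; nothing beyond the APIs shown in this patch is assumed.

    #include <memory>
    #include <string>

    #include "rocksdb/env.h"
    #include "rocksdb/utilities/object_registry.h"

    using namespace ROCKSDB_NAMESPACE;

    // A RegistrarFunc a plugin would expose; it returns the number of
    // factories it registered.
    static int RegisterMyPlugin(ObjectLibrary& library,
                                const std::string& /*arg*/) {
      library.AddFactory<Env>(
          "my-env",
          [](const std::string& /*uri*/, std::unique_ptr<Env>* /*guard*/,
             std::string* /*errmsg*/) { return Env::Default(); });
      return 1;
    }

    int main() {
      auto registry = ObjectRegistry::NewInstance();
      // A non-empty name and a non-null function are required; otherwise -1
      // is returned and nothing is registered.
      if (registry->RegisterPlugin("my_plugin", RegisterMyPlugin) < 0) {
        return 1;
      }
      Env* env = nullptr;
      return registry->NewStaticObject<Env>("my-env", &env).ok() ? 0 : 1;
    }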
diff --git a/src/rocksdb/utilities/option_change_migration/option_change_migration.cc b/src/rocksdb/utilities/option_change_migration/option_change_migration.cc
new file mode 100644
index 000000000..e93d2152d
--- /dev/null
+++ b/src/rocksdb/utilities/option_change_migration/option_change_migration.cc
@@ -0,0 +1,186 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/utilities/option_change_migration.h"
+
+#ifndef ROCKSDB_LITE
+#include "rocksdb/db.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+// Return a version of Options `opts` that allows us to open/write into a DB
+// without triggering an automatic compaction or stalling. This is guaranteed
+// by disabling automatic compactions and using huge values for stalling
+// triggers.
+Options GetNoCompactionOptions(const Options& opts) {
+ Options ret_opts = opts;
+ ret_opts.disable_auto_compactions = true;
+ ret_opts.level0_slowdown_writes_trigger = 999999;
+ ret_opts.level0_stop_writes_trigger = 999999;
+ ret_opts.soft_pending_compaction_bytes_limit = 0;
+ ret_opts.hard_pending_compaction_bytes_limit = 0;
+ return ret_opts;
+}
+
+Status OpenDb(const Options& options, const std::string& dbname,
+ std::unique_ptr<DB>* db) {
+ db->reset();
+ DB* tmpdb;
+ Status s = DB::Open(options, dbname, &tmpdb);
+ if (s.ok()) {
+ db->reset(tmpdb);
+ }
+ return s;
+}
+
+// l0_file_size specifies the size of the files placed on L0. Files will be
+// range partitioned after a full compaction, so they are likely qualified to
+// go on L0. If left as 0, the data is compacted into a single file and put
+// on L0. Otherwise, the files are compacted to approximately l0_file_size
+// each.
+Status CompactToLevel(const Options& options, const std::string& dbname,
+ int dest_level, uint64_t l0_file_size, bool need_reopen) {
+ std::unique_ptr<DB> db;
+ Options no_compact_opts = GetNoCompactionOptions(options);
+ if (dest_level == 0) {
+ if (l0_file_size == 0) {
+ // Single file.
+ l0_file_size = 999999999999999;
+ }
+ // L0 has strict sequence ID requirements for the files placed on it, so
+ // it's safer to put only one compacted file there.
+ // This path is only used when converting to universal compaction with
+ // only one level. In that case, compacting to one file is also
+ // optimal.
+ no_compact_opts.target_file_size_base = l0_file_size;
+ no_compact_opts.max_compaction_bytes = l0_file_size;
+ }
+ Status s = OpenDb(no_compact_opts, dbname, &db);
+ if (!s.ok()) {
+ return s;
+ }
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = dest_level;
+ if (dest_level == 0) {
+ // cannot use kForceOptimized because the compaction is expected to
+ // generate one output file
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ }
+ s = db->CompactRange(cro, nullptr, nullptr);
+
+ if (s.ok() && need_reopen) {
+ // Need to restart DB to rewrite the manifest file.
+ // In order to open a DB with a specific num_levels, the manifest file must
+ // contain no record that mentions any level beyond num_levels. Issuing a
+ // full compaction will move all the data to a level not exceeding
+ // num_levels, but the manifest may still contain previous records mentioning
+ // a higher level. Reopening the DB forces the manifest to be rewritten so
+ // that those records are cleared.
+ db.reset();
+ s = OpenDb(no_compact_opts, dbname, &db);
+ }
+ return s;
+}
+
+Status MigrateToUniversal(std::string dbname, const Options& old_opts,
+ const Options& new_opts) {
+ if (old_opts.num_levels <= new_opts.num_levels ||
+ old_opts.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+ return Status::OK();
+ } else {
+ bool need_compact = false;
+ {
+ std::unique_ptr<DB> db;
+ Options opts = GetNoCompactionOptions(old_opts);
+ Status s = OpenDb(opts, dbname, &db);
+ if (!s.ok()) {
+ return s;
+ }
+ ColumnFamilyMetaData metadata;
+ db->GetColumnFamilyMetaData(&metadata);
+ if (!metadata.levels.empty() &&
+ metadata.levels.back().level >= new_opts.num_levels) {
+ need_compact = true;
+ }
+ }
+ if (need_compact) {
+ return CompactToLevel(old_opts, dbname, new_opts.num_levels - 1,
+ /*l0_file_size=*/0, true);
+ }
+ return Status::OK();
+ }
+}
+
+Status MigrateToLevelBase(std::string dbname, const Options& old_opts,
+ const Options& new_opts) {
+ if (!new_opts.level_compaction_dynamic_level_bytes) {
+ if (old_opts.num_levels == 1) {
+ return Status::OK();
+ }
+ // Compact everything to level 1 to guarantee it can be safely opened.
+ Options opts = old_opts;
+ opts.target_file_size_base = new_opts.target_file_size_base;
+ // Although sometimes we can open the DB with the new options without error,
+ // we still want to compact the files to keep the LSM tree from getting
+ // stuck in a bad shape. For example, if the user changed the level size
+ // multiplier from 4 to 8, the same data will fit in fewer levels. Unless
+ // we issue a full compaction, the LSM tree may remain stuck with more
+ // levels than needed and won't recover automatically.
+ return CompactToLevel(opts, dbname, 1, /*l0_file_size=*/0, true);
+ } else {
+ // Compact everything to the last level to guarantee it can be safely
+ // opened.
+ if (old_opts.num_levels == 1) {
+ return Status::OK();
+ } else if (new_opts.num_levels > old_opts.num_levels) {
+ // Dynamic level mode requires data to be put in the last level first.
+ return CompactToLevel(new_opts, dbname, new_opts.num_levels - 1,
+ /*l0_file_size=*/0, false);
+ } else {
+ Options opts = old_opts;
+ opts.target_file_size_base = new_opts.target_file_size_base;
+ return CompactToLevel(opts, dbname, new_opts.num_levels - 1,
+ /*l0_file_size=*/0, true);
+ }
+ }
+}
+} // namespace
+
+Status OptionChangeMigration(std::string dbname, const Options& old_opts,
+ const Options& new_opts) {
+ if (old_opts.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+ // An LSM tree generated by FIFO compaction can be opened with any
+ // compaction style.
+ return Status::OK();
+ } else if (new_opts.compaction_style ==
+ CompactionStyle::kCompactionStyleUniversal) {
+ return MigrateToUniversal(dbname, old_opts, new_opts);
+ } else if (new_opts.compaction_style ==
+ CompactionStyle::kCompactionStyleLevel) {
+ return MigrateToLevelBase(dbname, old_opts, new_opts);
+ } else if (new_opts.compaction_style ==
+ CompactionStyle::kCompactionStyleFIFO) {
+ uint64_t l0_file_size = 0;
+ if (new_opts.compaction_options_fifo.max_table_files_size > 0) {
+ // Create at least 8 files when max_table_files_size is hit, so that the DB
+ // doesn't just disappear. This in fact violates the FIFO condition, but
+ // otherwise the migrated DB is unlikely to be usable.
+ l0_file_size = new_opts.compaction_options_fifo.max_table_files_size / 8;
+ }
+ return CompactToLevel(old_opts, dbname, 0, l0_file_size, true);
+ } else {
+ return Status::NotSupported(
+ "Do not how to migrate to this compaction style");
+ }
+}
+} // namespace ROCKSDB_NAMESPACE
+#else
+namespace ROCKSDB_NAMESPACE {
+Status OptionChangeMigration(std::string /*dbname*/,
+ const Options& /*old_opts*/,
+ const Options& /*new_opts*/) {
+ return Status::NotSupported();
+}
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
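A minimal sketch of how OptionChangeMigration above is meant to be invoked when switching compaction styles. The database path and option values are illustrative; the DB must be closed while the migration runs, since the function opens it itself with compactions disabled.

    #include <string>

    #include "rocksdb/db.h"
    #include "rocksdb/options.h"
    #include "rocksdb/utilities/option_change_migration.h"

    using namespace ROCKSDB_NAMESPACE;

    int main() {
      const std::string dbname = "/tmp/migration_example";  // hypothetical path

      Options old_opts;
      old_opts.compaction_style = CompactionStyle::kCompactionStyleLevel;
      old_opts.num_levels = 7;

      Options new_opts = old_opts;
      new_opts.compaction_style = CompactionStyle::kCompactionStyleUniversal;
      new_opts.num_levels = 1;

      // Migrate first: this opens the DB with compactions disabled, compacts
      // the data into the shape the new options expect, and reopens to
      // rewrite the manifest where needed.
      Status s = OptionChangeMigration(dbname, old_opts, new_opts);
      if (!s.ok()) {
        return 1;
      }

      // Now the DB can be opened with the new options.
      DB* db = nullptr;
      s = DB::Open(new_opts, dbname, &db);
      delete db;
      return s.ok() ? 0 : 1;
    }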
diff --git a/src/rocksdb/utilities/option_change_migration/option_change_migration_test.cc b/src/rocksdb/utilities/option_change_migration/option_change_migration_test.cc
new file mode 100644
index 000000000..71af45db1
--- /dev/null
+++ b/src/rocksdb/utilities/option_change_migration/option_change_migration_test.cc
@@ -0,0 +1,550 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/utilities/option_change_migration.h"
+
+#include <set>
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBOptionChangeMigrationTests
+ : public DBTestBase,
+ public testing::WithParamInterface<
+ std::tuple<int, int, bool, int, int, bool, uint64_t>> {
+ public:
+ DBOptionChangeMigrationTests()
+ : DBTestBase("db_option_change_migration_test", /*env_do_fsync=*/true) {
+ level1_ = std::get<0>(GetParam());
+ compaction_style1_ = std::get<1>(GetParam());
+ is_dynamic1_ = std::get<2>(GetParam());
+
+ level2_ = std::get<3>(GetParam());
+ compaction_style2_ = std::get<4>(GetParam());
+ is_dynamic2_ = std::get<5>(GetParam());
+ fifo_max_table_files_size_ = std::get<6>(GetParam());
+ }
+
+ // Required if inheriting from testing::WithParamInterface<>
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ int level1_;
+ int compaction_style1_;
+ bool is_dynamic1_;
+
+ int level2_;
+ int compaction_style2_;
+ bool is_dynamic2_;
+
+ uint64_t fifo_max_table_files_size_;
+};
+
+#ifndef ROCKSDB_LITE
+TEST_P(DBOptionChangeMigrationTests, Migrate1) {
+ Options old_options = CurrentOptions();
+ old_options.compaction_style =
+ static_cast<CompactionStyle>(compaction_style1_);
+ if (old_options.compaction_style == CompactionStyle::kCompactionStyleLevel) {
+ old_options.level_compaction_dynamic_level_bytes = is_dynamic1_;
+ }
+ if (old_options.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+ old_options.max_open_files = -1;
+ }
+ old_options.level0_file_num_compaction_trigger = 3;
+ old_options.write_buffer_size = 64 * 1024;
+ old_options.target_file_size_base = 128 * 1024;
+ // Make the level targets of L1 and L2 200KB and 600KB, respectively
+ old_options.num_levels = level1_;
+ old_options.max_bytes_for_level_multiplier = 3;
+ old_options.max_bytes_for_level_base = 200 * 1024;
+
+ Reopen(old_options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // Generate at least 2MB of data
+ for (int num = 0; num < 20; num++) {
+ GenerateNewFile(&rnd, &key_idx);
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // Will make sure exactly those keys are in the DB after migration.
+ std::set<std::string> keys;
+ {
+ std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+ it->SeekToFirst();
+ for (; it->Valid(); it->Next()) {
+ keys.insert(it->key().ToString());
+ }
+ }
+ Close();
+
+ Options new_options = old_options;
+ new_options.compaction_style =
+ static_cast<CompactionStyle>(compaction_style2_);
+ if (new_options.compaction_style == CompactionStyle::kCompactionStyleLevel) {
+ new_options.level_compaction_dynamic_level_bytes = is_dynamic2_;
+ }
+ if (new_options.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+ new_options.max_open_files = -1;
+ }
+ if (fifo_max_table_files_size_ != 0) {
+ new_options.compaction_options_fifo.max_table_files_size =
+ fifo_max_table_files_size_;
+ }
+ new_options.target_file_size_base = 256 * 1024;
+ new_options.num_levels = level2_;
+ new_options.max_bytes_for_level_base = 150 * 1024;
+ new_options.max_bytes_for_level_multiplier = 4;
+ ASSERT_OK(OptionChangeMigration(dbname_, old_options, new_options));
+ Reopen(new_options);
+
+ // Wait for compaction to finish and make sure it can reopen
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ Reopen(new_options);
+
+ {
+ std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+ it->SeekToFirst();
+ for (std::string key : keys) {
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ(key, it->key().ToString());
+ it->Next();
+ }
+ ASSERT_TRUE(!it->Valid());
+ }
+}
+
+TEST_P(DBOptionChangeMigrationTests, Migrate2) {
+ Options old_options = CurrentOptions();
+ old_options.compaction_style =
+ static_cast<CompactionStyle>(compaction_style2_);
+ if (old_options.compaction_style == CompactionStyle::kCompactionStyleLevel) {
+ old_options.level_compaction_dynamic_level_bytes = is_dynamic2_;
+ }
+ if (old_options.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+ old_options.max_open_files = -1;
+ }
+ old_options.level0_file_num_compaction_trigger = 3;
+ old_options.write_buffer_size = 64 * 1024;
+ old_options.target_file_size_base = 128 * 1024;
+ // Make the level targets of L1 and L2 200KB and 600KB, respectively
+ old_options.num_levels = level2_;
+ old_options.max_bytes_for_level_multiplier = 3;
+ old_options.max_bytes_for_level_base = 200 * 1024;
+
+ Reopen(old_options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // Generate at least 2MB of data
+ for (int num = 0; num < 20; num++) {
+ GenerateNewFile(&rnd, &key_idx);
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // Will make sure exactly those keys are in the DB after migration.
+ std::set<std::string> keys;
+ {
+ std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+ it->SeekToFirst();
+ for (; it->Valid(); it->Next()) {
+ keys.insert(it->key().ToString());
+ }
+ }
+
+ Close();
+
+ Options new_options = old_options;
+ new_options.compaction_style =
+ static_cast<CompactionStyle>(compaction_style1_);
+ if (new_options.compaction_style == CompactionStyle::kCompactionStyleLevel) {
+ new_options.level_compaction_dynamic_level_bytes = is_dynamic1_;
+ }
+ if (new_options.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+ new_options.max_open_files = -1;
+ }
+ if (fifo_max_table_files_size_ != 0) {
+ new_options.compaction_options_fifo.max_table_files_size =
+ fifo_max_table_files_size_;
+ }
+ new_options.target_file_size_base = 256 * 1024;
+ new_options.num_levels = level1_;
+ new_options.max_bytes_for_level_base = 150 * 1024;
+ new_options.max_bytes_for_level_multiplier = 4;
+ ASSERT_OK(OptionChangeMigration(dbname_, old_options, new_options));
+ Reopen(new_options);
+ // Wait for compaction to finish and make sure it can reopen
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ Reopen(new_options);
+
+ {
+ std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+ it->SeekToFirst();
+ for (std::string key : keys) {
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ(key, it->key().ToString());
+ it->Next();
+ }
+ ASSERT_TRUE(!it->Valid());
+ }
+}
+
+TEST_P(DBOptionChangeMigrationTests, Migrate3) {
+ Options old_options = CurrentOptions();
+ old_options.compaction_style =
+ static_cast<CompactionStyle>(compaction_style1_);
+ if (old_options.compaction_style == CompactionStyle::kCompactionStyleLevel) {
+ old_options.level_compaction_dynamic_level_bytes = is_dynamic1_;
+ }
+ if (old_options.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+ old_options.max_open_files = -1;
+ }
+ old_options.level0_file_num_compaction_trigger = 3;
+ old_options.write_buffer_size = 64 * 1024;
+ old_options.target_file_size_base = 128 * 1024;
+ // Make the level targets of L1 and L2 200KB and 600KB, respectively
+ old_options.num_levels = level1_;
+ old_options.max_bytes_for_level_multiplier = 3;
+ old_options.max_bytes_for_level_base = 200 * 1024;
+
+ Reopen(old_options);
+ Random rnd(301);
+ for (int num = 0; num < 20; num++) {
+ for (int i = 0; i < 50; i++) {
+ ASSERT_OK(Put(Key(num * 100 + i), rnd.RandomString(900)));
+ }
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ if (num == 9) {
+ // Issue a full compaction to generate some zero-out files
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+ }
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // Will make sure exactly those keys are in the DB after migration.
+ std::set<std::string> keys;
+ {
+ std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+ it->SeekToFirst();
+ for (; it->Valid(); it->Next()) {
+ keys.insert(it->key().ToString());
+ }
+ }
+ Close();
+
+ Options new_options = old_options;
+ new_options.compaction_style =
+ static_cast<CompactionStyle>(compaction_style2_);
+ if (new_options.compaction_style == CompactionStyle::kCompactionStyleLevel) {
+ new_options.level_compaction_dynamic_level_bytes = is_dynamic2_;
+ }
+ if (new_options.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+ new_options.max_open_files = -1;
+ }
+ if (fifo_max_table_files_size_ != 0) {
+ new_options.compaction_options_fifo.max_table_files_size =
+ fifo_max_table_files_size_;
+ }
+ new_options.target_file_size_base = 256 * 1024;
+ new_options.num_levels = level2_;
+ new_options.max_bytes_for_level_base = 150 * 1024;
+ new_options.max_bytes_for_level_multiplier = 4;
+ ASSERT_OK(OptionChangeMigration(dbname_, old_options, new_options));
+ Reopen(new_options);
+
+ // Wait for compaction to finish and make sure it can reopen
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ Reopen(new_options);
+
+ {
+ std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+ it->SeekToFirst();
+ for (std::string key : keys) {
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ(key, it->key().ToString());
+ it->Next();
+ }
+ ASSERT_TRUE(!it->Valid());
+ }
+}
+
+TEST_P(DBOptionChangeMigrationTests, Migrate4) {
+ Options old_options = CurrentOptions();
+ old_options.compaction_style =
+ static_cast<CompactionStyle>(compaction_style2_);
+ if (old_options.compaction_style == CompactionStyle::kCompactionStyleLevel) {
+ old_options.level_compaction_dynamic_level_bytes = is_dynamic2_;
+ }
+ if (old_options.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+ old_options.max_open_files = -1;
+ }
+ old_options.level0_file_num_compaction_trigger = 3;
+ old_options.write_buffer_size = 64 * 1024;
+ old_options.target_file_size_base = 128 * 1024;
+  // Make the level targets of L1 and L2 be 200KB and 600KB, respectively
+ old_options.num_levels = level2_;
+ old_options.max_bytes_for_level_multiplier = 3;
+ old_options.max_bytes_for_level_base = 200 * 1024;
+
+ Reopen(old_options);
+ Random rnd(301);
+ for (int num = 0; num < 20; num++) {
+ for (int i = 0; i < 50; i++) {
+ ASSERT_OK(Put(Key(num * 100 + i), rnd.RandomString(900)));
+ }
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ if (num == 9) {
+      // Issue a full compaction to generate some files with zeroed-out
+      // sequence numbers
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+ }
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Collect the keys so we can verify that exactly these keys are in the DB
+  // after migration.
+ std::set<std::string> keys;
+ {
+ std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+ it->SeekToFirst();
+ for (; it->Valid(); it->Next()) {
+ keys.insert(it->key().ToString());
+ }
+ }
+
+ Close();
+
+ Options new_options = old_options;
+ new_options.compaction_style =
+ static_cast<CompactionStyle>(compaction_style1_);
+ if (new_options.compaction_style == CompactionStyle::kCompactionStyleLevel) {
+ new_options.level_compaction_dynamic_level_bytes = is_dynamic1_;
+ }
+ if (new_options.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+ new_options.max_open_files = -1;
+ }
+ if (fifo_max_table_files_size_ != 0) {
+ new_options.compaction_options_fifo.max_table_files_size =
+ fifo_max_table_files_size_;
+ }
+ new_options.target_file_size_base = 256 * 1024;
+ new_options.num_levels = level1_;
+ new_options.max_bytes_for_level_base = 150 * 1024;
+ new_options.max_bytes_for_level_multiplier = 4;
+ ASSERT_OK(OptionChangeMigration(dbname_, old_options, new_options));
+ Reopen(new_options);
+ // Wait for compaction to finish and make sure it can reopen
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ Reopen(new_options);
+
+ {
+ std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+ it->SeekToFirst();
+ for (std::string key : keys) {
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ(key, it->key().ToString());
+ it->Next();
+ }
+ ASSERT_TRUE(!it->Valid());
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(
+ DBOptionChangeMigrationTests, DBOptionChangeMigrationTests,
+ ::testing::Values(
+ std::make_tuple(3 /* old num_levels */, 0 /* old compaction style */,
+ false /* is dynamic leveling in old option */,
+                        4 /* new num_levels */, 0 /* new compaction style */,
+ false /* is dynamic leveling in new option */,
+ 0 /*fifo max_table_files_size*/),
+ std::make_tuple(3 /* old num_levels */, 0 /* old compaction style */,
+ true /* is dynamic leveling in old option */,
+                        4 /* new num_levels */, 0 /* new compaction style */,
+ true /* is dynamic leveling in new option */,
+ 0 /*fifo max_table_files_size*/),
+ std::make_tuple(3 /* old num_levels */, 0 /* old compaction style */,
+ true /* is dynamic leveling in old option */,
+                        4 /* new num_levels */, 0 /* new compaction style */,
+                        false /* is dynamic leveling in new option */,
+                        0 /*fifo max_table_files_size*/),
+ std::make_tuple(3 /* old num_levels */, 0 /* old compaction style */,
+ false /* is dynamic leveling in old option */,
+                        4 /* new num_levels */, 0 /* new compaction style */,
+ true /* is dynamic leveling in new option */,
+ 0 /*fifo max_table_files_size*/),
+ std::make_tuple(3 /* old num_levels */, 1 /* old compaction style */,
+ false /* is dynamic leveling in old option */,
+                        4 /* new num_levels */, 1 /* new compaction style */,
+ false /* is dynamic leveling in new option */,
+ 0 /*fifo max_table_files_size*/),
+ std::make_tuple(1 /* old num_levels */, 1 /* old compaction style */,
+ false /* is dynamic leveling in old option */,
+                        4 /* new num_levels */, 1 /* new compaction style */,
+ false /* is dynamic leveling in new option */,
+ 0 /*fifo max_table_files_size*/),
+ std::make_tuple(3 /* old num_levels */, 0 /* old compaction style */,
+ false /* is dynamic leveling in old option */,
+                        4 /* new num_levels */, 1 /* new compaction style */,
+ false /* is dynamic leveling in new option */,
+ 0 /*fifo max_table_files_size*/),
+ std::make_tuple(3 /* old num_levels */, 0 /* old compaction style */,
+ false /* is dynamic leveling in old option */,
+                        1 /* new num_levels */, 1 /* new compaction style */,
+ false /* is dynamic leveling in new option */,
+ 0 /*fifo max_table_files_size*/),
+ std::make_tuple(3 /* old num_levels */, 0 /* old compaction style */,
+ true /* is dynamic leveling in old option */,
+                        4 /* new num_levels */, 1 /* new compaction style */,
+ false /* is dynamic leveling in new option */,
+ 0 /*fifo max_table_files_size*/),
+ std::make_tuple(3 /* old num_levels */, 0 /* old compaction style */,
+ true /* is dynamic leveling in old option */,
+                        1 /* new num_levels */, 1 /* new compaction style */,
+ false /* is dynamic leveling in new option */,
+ 0 /*fifo max_table_files_size*/),
+ std::make_tuple(1 /* old num_levels */, 1 /* old compaction style */,
+ false /* is dynamic leveling in old option */,
+                        4 /* new num_levels */, 0 /* new compaction style */,
+ false /* is dynamic leveling in new option */,
+ 0 /*fifo max_table_files_size*/),
+ std::make_tuple(4 /* old num_levels */, 0 /* old compaction style */,
+ false /* is dynamic leveling in old option */,
+                        1 /* new num_levels */, 2 /* new compaction style */,
+ false /* is dynamic leveling in new option */,
+ 0 /*fifo max_table_files_size*/),
+ std::make_tuple(3 /* old num_levels */, 0 /* old compaction style */,
+ true /* is dynamic leveling in old option */,
+                        2 /* new num_levels */, 2 /* new compaction style */,
+ false /* is dynamic leveling in new option */,
+ 0 /*fifo max_table_files_size*/),
+ std::make_tuple(3 /* old num_levels */, 1 /* old compaction style */,
+ false /* is dynamic leveling in old option */,
+                        3 /* new num_levels */, 2 /* new compaction style */,
+ false /* is dynamic leveling in new option */,
+ 0 /*fifo max_table_files_size*/),
+ std::make_tuple(1 /* old num_levels */, 1 /* old compaction style */,
+ false /* is dynamic leveling in old option */,
+                        4 /* new num_levels */, 2 /* new compaction style */,
+                        false /* is dynamic leveling in new option */,
+                        0 /*fifo max_table_files_size*/),
+ std::make_tuple(4 /* old num_levels */, 0 /* old compaction style */,
+ false /* is dynamic leveling in old option */,
+                        1 /* new num_levels */, 2 /* new compaction style */,
+ false /* is dynamic leveling in new option */,
+ 5 * 1024 * 1024 /*fifo max_table_files_size*/),
+ std::make_tuple(3 /* old num_levels */, 0 /* old compaction style */,
+ true /* is dynamic leveling in old option */,
+                        2 /* new num_levels */, 2 /* new compaction style */,
+ false /* is dynamic leveling in new option */,
+ 5 * 1024 * 1024 /*fifo max_table_files_size*/),
+ std::make_tuple(3 /* old num_levels */, 1 /* old compaction style */,
+ false /* is dynamic leveling in old option */,
+                        3 /* new num_levels */, 2 /* new compaction style */,
+ false /* is dynamic leveling in new option */,
+ 5 * 1024 * 1024 /*fifo max_table_files_size*/),
+ std::make_tuple(1 /* old num_levels */, 1 /* old compaction style */,
+ false /* is dynamic leveling in old option */,
+                        4 /* new num_levels */, 2 /* new compaction style */,
+ false /* is dynamic leveling in new option */,
+ 5 * 1024 * 1024 /*fifo max_table_files_size*/)));
+
+class DBOptionChangeMigrationTest : public DBTestBase {
+ public:
+ DBOptionChangeMigrationTest()
+ : DBTestBase("db_option_change_migration_test2", /*env_do_fsync=*/true) {}
+};
+
+TEST_F(DBOptionChangeMigrationTest, CompactedSrcToUniversal) {
+ Options old_options = CurrentOptions();
+ old_options.compaction_style = CompactionStyle::kCompactionStyleLevel;
+ old_options.max_compaction_bytes = 200 * 1024;
+ old_options.level_compaction_dynamic_level_bytes = false;
+ old_options.level0_file_num_compaction_trigger = 3;
+ old_options.write_buffer_size = 64 * 1024;
+ old_options.target_file_size_base = 128 * 1024;
+  // Make the level targets of L1 and L2 be 200KB and 600KB, respectively
+ old_options.num_levels = 4;
+ old_options.max_bytes_for_level_multiplier = 3;
+ old_options.max_bytes_for_level_base = 200 * 1024;
+
+ Reopen(old_options);
+ Random rnd(301);
+ for (int num = 0; num < 20; num++) {
+ for (int i = 0; i < 50; i++) {
+ ASSERT_OK(Put(Key(num * 100 + i), rnd.RandomString(900)));
+ }
+ }
+ Flush();
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+
+  // Collect the keys so we can verify that exactly these keys are in the DB
+  // after migration.
+ std::set<std::string> keys;
+ {
+ std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+ it->SeekToFirst();
+ for (; it->Valid(); it->Next()) {
+ keys.insert(it->key().ToString());
+ }
+ }
+
+ Close();
+
+ Options new_options = old_options;
+ new_options.compaction_style = CompactionStyle::kCompactionStyleUniversal;
+ new_options.target_file_size_base = 256 * 1024;
+ new_options.num_levels = 1;
+ new_options.max_bytes_for_level_base = 150 * 1024;
+ new_options.max_bytes_for_level_multiplier = 4;
+ ASSERT_OK(OptionChangeMigration(dbname_, old_options, new_options));
+ Reopen(new_options);
+ // Wait for compaction to finish and make sure it can reopen
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ Reopen(new_options);
+
+ {
+ std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+ it->SeekToFirst();
+ for (std::string key : keys) {
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ(key, it->key().ToString());
+ it->Next();
+ }
+ ASSERT_TRUE(!it->Valid());
+ ASSERT_OK(it->status());
+ }
+}
+
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/utilities/options/options_util.cc b/src/rocksdb/utilities/options/options_util.cc
new file mode 100644
index 000000000..00c4b981a
--- /dev/null
+++ b/src/rocksdb/utilities/options/options_util.cc
@@ -0,0 +1,159 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/options_util.h"
+
+#include "file/filename.h"
+#include "options/options_parser.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/options.h"
+#include "table/block_based/block_based_table_factory.h"
+
+namespace ROCKSDB_NAMESPACE {
+Status LoadOptionsFromFile(const std::string& file_name, Env* env,
+ DBOptions* db_options,
+ std::vector<ColumnFamilyDescriptor>* cf_descs,
+ bool ignore_unknown_options,
+ std::shared_ptr<Cache>* cache) {
+ ConfigOptions config_options;
+ config_options.ignore_unknown_options = ignore_unknown_options;
+ config_options.input_strings_escaped = true;
+ config_options.env = env;
+
+ return LoadOptionsFromFile(config_options, file_name, db_options, cf_descs,
+ cache);
+}
+
+Status LoadOptionsFromFile(const ConfigOptions& config_options,
+ const std::string& file_name, DBOptions* db_options,
+ std::vector<ColumnFamilyDescriptor>* cf_descs,
+ std::shared_ptr<Cache>* cache) {
+ RocksDBOptionsParser parser;
+ const auto& fs = config_options.env->GetFileSystem();
+ Status s = parser.Parse(config_options, file_name, fs.get());
+ if (!s.ok()) {
+ return s;
+ }
+ *db_options = *parser.db_opt();
+ const std::vector<std::string>& cf_names = *parser.cf_names();
+ const std::vector<ColumnFamilyOptions>& cf_opts = *parser.cf_opts();
+ cf_descs->clear();
+ for (size_t i = 0; i < cf_opts.size(); ++i) {
+ cf_descs->push_back({cf_names[i], cf_opts[i]});
+ if (cache != nullptr) {
+ TableFactory* tf = cf_opts[i].table_factory.get();
+ if (tf != nullptr) {
+ auto* opts = tf->GetOptions<BlockBasedTableOptions>();
+ if (opts != nullptr) {
+ opts->block_cache = *cache;
+ }
+ }
+ }
+ }
+ return Status::OK();
+}
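+
+// A minimal usage sketch (illustrative only; the OPTIONS file name and the
+// cache size below are assumed placeholders, not values used by this code):
+//
+//   ConfigOptions config_options;
+//   config_options.env = Env::Default();
+//   DBOptions db_options;
+//   std::vector<ColumnFamilyDescriptor> cf_descs;
+//   std::shared_ptr<Cache> cache = NewLRUCache(64 << 20);
+//   Status s = LoadOptionsFromFile(config_options, "/db/OPTIONS-000123",
+//                                  &db_options, &cf_descs, &cache);
+//   // On success, every column family whose table factory exposes
+//   // BlockBasedTableOptions has its block_cache replaced by `cache`,
+//   // exactly as done in the loop above.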
+
+Status GetLatestOptionsFileName(const std::string& dbpath, Env* env,
+ std::string* options_file_name) {
+ Status s;
+ std::string latest_file_name;
+ uint64_t latest_time_stamp = 0;
+ std::vector<std::string> file_names;
+ s = env->GetChildren(dbpath, &file_names);
+ if (s.IsNotFound()) {
+ return Status::NotFound(Status::kPathNotFound,
+ "No options files found in the DB directory.",
+ dbpath);
+ } else if (!s.ok()) {
+ return s;
+ }
+ for (auto& file_name : file_names) {
+ uint64_t time_stamp;
+ FileType type;
+ if (ParseFileName(file_name, &time_stamp, &type) && type == kOptionsFile) {
+ if (time_stamp > latest_time_stamp) {
+ latest_time_stamp = time_stamp;
+ latest_file_name = file_name;
+ }
+ }
+ }
+ if (latest_file_name.size() == 0) {
+ return Status::NotFound(Status::kPathNotFound,
+ "No options files found in the DB directory.",
+ dbpath);
+ }
+ *options_file_name = latest_file_name;
+ return Status::OK();
+}
+
+Status LoadLatestOptions(const std::string& dbpath, Env* env,
+ DBOptions* db_options,
+ std::vector<ColumnFamilyDescriptor>* cf_descs,
+ bool ignore_unknown_options,
+ std::shared_ptr<Cache>* cache) {
+ ConfigOptions config_options;
+ config_options.ignore_unknown_options = ignore_unknown_options;
+ config_options.input_strings_escaped = true;
+ config_options.env = env;
+
+ return LoadLatestOptions(config_options, dbpath, db_options, cf_descs, cache);
+}
+
+Status LoadLatestOptions(const ConfigOptions& config_options,
+ const std::string& dbpath, DBOptions* db_options,
+ std::vector<ColumnFamilyDescriptor>* cf_descs,
+ std::shared_ptr<Cache>* cache) {
+ std::string options_file_name;
+ Status s =
+ GetLatestOptionsFileName(dbpath, config_options.env, &options_file_name);
+ if (!s.ok()) {
+ return s;
+ }
+ return LoadOptionsFromFile(config_options, dbpath + "/" + options_file_name,
+ db_options, cf_descs, cache);
+}
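+
+// Example (a sketch; `kDBPath` is an assumed placeholder for an existing DB
+// directory that has persisted at least one OPTIONS file):
+//
+//   ConfigOptions config_options;
+//   config_options.env = Env::Default();
+//   DBOptions db_options;
+//   std::vector<ColumnFamilyDescriptor> cf_descs;
+//   Status s = LoadLatestOptions(config_options, kDBPath, &db_options,
+//                                &cf_descs);
+//   if (s.ok()) {
+//     std::vector<ColumnFamilyHandle*> handles;
+//     DB* db = nullptr;
+//     s = DB::Open(db_options, kDBPath, cf_descs, &handles, &db);
+//   }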
+
+Status CheckOptionsCompatibility(
+ const std::string& dbpath, Env* env, const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& cf_descs,
+ bool ignore_unknown_options) {
+ ConfigOptions config_options(db_options);
+ config_options.sanity_level = ConfigOptions::kSanityLevelLooselyCompatible;
+ config_options.ignore_unknown_options = ignore_unknown_options;
+ config_options.input_strings_escaped = true;
+ config_options.env = env;
+ return CheckOptionsCompatibility(config_options, dbpath, db_options,
+ cf_descs);
+}
+
+Status CheckOptionsCompatibility(
+ const ConfigOptions& config_options, const std::string& dbpath,
+ const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& cf_descs) {
+ std::string options_file_name;
+ Status s =
+ GetLatestOptionsFileName(dbpath, config_options.env, &options_file_name);
+ if (!s.ok()) {
+ return s;
+ }
+
+ std::vector<std::string> cf_names;
+ std::vector<ColumnFamilyOptions> cf_opts;
+ for (const auto& cf_desc : cf_descs) {
+ cf_names.push_back(cf_desc.name);
+ cf_opts.push_back(cf_desc.options);
+ }
+
+ const auto& fs = config_options.env->GetFileSystem();
+
+ return RocksDBOptionsParser::VerifyRocksDBOptionsFromFile(
+ config_options, db_options, cf_names, cf_opts,
+ dbpath + "/" + options_file_name, fs.get());
+}
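+
+// Example (a sketch; `kDBPath`, `db_options` and `cf_descs` are assumed to be
+// the values the caller intends to pass to DB::Open()):
+//
+//   ConfigOptions config_options;
+//   config_options.env = Env::Default();
+//   config_options.sanity_level = ConfigOptions::kSanityLevelLooselyCompatible;
+//   Status s = CheckOptionsCompatibility(config_options, kDBPath, db_options,
+//                                        cf_descs);
+//   // OK means the in-memory options are loosely compatible with the options
+//   // persisted by the last DB::Open(), so reopening should be safe.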
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/options/options_util_test.cc b/src/rocksdb/utilities/options/options_util_test.cc
new file mode 100644
index 000000000..1c3b41ff2
--- /dev/null
+++ b/src/rocksdb/utilities/options/options_util_test.cc
@@ -0,0 +1,779 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/options_util.h"
+
+#include <cctype>
+#include <cinttypes>
+#include <unordered_map>
+
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "options/options_parser.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/table.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+#ifndef GFLAGS
+bool FLAGS_enable_print = false;
+#else
+#include "util/gflags_compat.h"
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+DEFINE_bool(enable_print, false, "Print options generated to console.");
+#endif // GFLAGS
+
+namespace ROCKSDB_NAMESPACE {
+class OptionsUtilTest : public testing::Test {
+ public:
+ OptionsUtilTest() : rnd_(0xFB) {
+ env_.reset(NewMemEnv(Env::Default()));
+ dbname_ = test::PerThreadDBPath("options_util_test");
+ }
+
+ protected:
+ std::unique_ptr<Env> env_;
+ std::string dbname_;
+ Random rnd_;
+};
+
+TEST_F(OptionsUtilTest, SaveAndLoad) {
+ const size_t kCFCount = 5;
+
+ DBOptions db_opt;
+ std::vector<std::string> cf_names;
+ std::vector<ColumnFamilyOptions> cf_opts;
+ test::RandomInitDBOptions(&db_opt, &rnd_);
+ for (size_t i = 0; i < kCFCount; ++i) {
+ cf_names.push_back(i == 0 ? kDefaultColumnFamilyName
+ : test::RandomName(&rnd_, 10));
+ cf_opts.emplace_back();
+ test::RandomInitCFOptions(&cf_opts.back(), db_opt, &rnd_);
+ }
+
+ const std::string kFileName = "OPTIONS-123456";
+ ASSERT_OK(PersistRocksDBOptions(db_opt, cf_names, cf_opts, kFileName,
+ env_->GetFileSystem().get()));
+
+ DBOptions loaded_db_opt;
+ std::vector<ColumnFamilyDescriptor> loaded_cf_descs;
+ ASSERT_OK(LoadOptionsFromFile(kFileName, env_.get(), &loaded_db_opt,
+ &loaded_cf_descs));
+ ConfigOptions exact;
+ exact.sanity_level = ConfigOptions::kSanityLevelExactMatch;
+ ASSERT_OK(
+ RocksDBOptionsParser::VerifyDBOptions(exact, db_opt, loaded_db_opt));
+ test::RandomInitDBOptions(&db_opt, &rnd_);
+ ASSERT_NOK(
+ RocksDBOptionsParser::VerifyDBOptions(exact, db_opt, loaded_db_opt));
+
+ for (size_t i = 0; i < kCFCount; ++i) {
+ ASSERT_EQ(cf_names[i], loaded_cf_descs[i].name);
+ ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(
+ exact, cf_opts[i], loaded_cf_descs[i].options));
+ ASSERT_OK(RocksDBOptionsParser::VerifyTableFactory(
+ exact, cf_opts[i].table_factory.get(),
+ loaded_cf_descs[i].options.table_factory.get()));
+ test::RandomInitCFOptions(&cf_opts[i], db_opt, &rnd_);
+ ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions(
+ exact, cf_opts[i], loaded_cf_descs[i].options));
+ }
+
+ ASSERT_OK(DestroyDB(dbname_, Options(db_opt, cf_opts[0])));
+ for (size_t i = 0; i < kCFCount; ++i) {
+ if (cf_opts[i].compaction_filter) {
+ delete cf_opts[i].compaction_filter;
+ }
+ }
+}
+
+TEST_F(OptionsUtilTest, SaveAndLoadWithCacheCheck) {
+ // creating db
+ DBOptions db_opt;
+ db_opt.create_if_missing = true;
+ // initialize BlockBasedTableOptions
+ std::shared_ptr<Cache> cache = NewLRUCache(1 * 1024);
+ BlockBasedTableOptions bbt_opts;
+ bbt_opts.block_size = 32 * 1024;
+ // saving cf options
+ std::vector<ColumnFamilyOptions> cf_opts;
+ ColumnFamilyOptions default_column_family_opt = ColumnFamilyOptions();
+ default_column_family_opt.table_factory.reset(
+ NewBlockBasedTableFactory(bbt_opts));
+ cf_opts.push_back(default_column_family_opt);
+
+ ColumnFamilyOptions cf_opt_sample = ColumnFamilyOptions();
+ cf_opt_sample.table_factory.reset(NewBlockBasedTableFactory(bbt_opts));
+ cf_opts.push_back(cf_opt_sample);
+
+ ColumnFamilyOptions cf_opt_plain_table_opt = ColumnFamilyOptions();
+ cf_opt_plain_table_opt.table_factory.reset(NewPlainTableFactory());
+ cf_opts.push_back(cf_opt_plain_table_opt);
+
+ std::vector<std::string> cf_names;
+ cf_names.push_back(kDefaultColumnFamilyName);
+ cf_names.push_back("cf_sample");
+ cf_names.push_back("cf_plain_table_sample");
+  // Save the options to a file
+ const std::string kFileName = "OPTIONS-LOAD_CACHE_123456";
+ ASSERT_OK(PersistRocksDBOptions(db_opt, cf_names, cf_opts, kFileName,
+ env_->GetFileSystem().get()));
+ DBOptions loaded_db_opt;
+ std::vector<ColumnFamilyDescriptor> loaded_cf_descs;
+
+ ConfigOptions config_options;
+ config_options.ignore_unknown_options = false;
+ config_options.input_strings_escaped = true;
+ config_options.env = env_.get();
+ ASSERT_OK(LoadOptionsFromFile(config_options, kFileName, &loaded_db_opt,
+ &loaded_cf_descs, &cache));
+ for (size_t i = 0; i < loaded_cf_descs.size(); i++) {
+ auto* loaded_bbt_opt =
+ loaded_cf_descs[i]
+ .options.table_factory->GetOptions<BlockBasedTableOptions>();
+ // Expect the same cache will be loaded
+ if (loaded_bbt_opt != nullptr) {
+ ASSERT_EQ(loaded_bbt_opt->block_cache.get(), cache.get());
+ }
+ }
+
+ // Test the old interface
+ ASSERT_OK(LoadOptionsFromFile(kFileName, env_.get(), &loaded_db_opt,
+ &loaded_cf_descs, false, &cache));
+ for (size_t i = 0; i < loaded_cf_descs.size(); i++) {
+ auto* loaded_bbt_opt =
+ loaded_cf_descs[i]
+ .options.table_factory->GetOptions<BlockBasedTableOptions>();
+ // Expect the same cache will be loaded
+ if (loaded_bbt_opt != nullptr) {
+ ASSERT_EQ(loaded_bbt_opt->block_cache.get(), cache.get());
+ }
+ }
+ ASSERT_OK(DestroyDB(dbname_, Options(loaded_db_opt, cf_opts[0])));
+}
+
+namespace {
+class DummyTableFactory : public TableFactory {
+ public:
+ DummyTableFactory() {}
+ ~DummyTableFactory() override {}
+
+ const char* Name() const override { return "DummyTableFactory"; }
+
+ using TableFactory::NewTableReader;
+ Status NewTableReader(
+ const ReadOptions& /*ro*/,
+ const TableReaderOptions& /*table_reader_options*/,
+ std::unique_ptr<RandomAccessFileReader>&& /*file*/,
+ uint64_t /*file_size*/, std::unique_ptr<TableReader>* /*table_reader*/,
+ bool /*prefetch_index_and_filter_in_cache*/) const override {
+ return Status::NotSupported();
+ }
+
+ TableBuilder* NewTableBuilder(
+ const TableBuilderOptions& /*table_builder_options*/,
+ WritableFileWriter* /*file*/) const override {
+ return nullptr;
+ }
+
+ Status ValidateOptions(
+ const DBOptions& /*db_opts*/,
+ const ColumnFamilyOptions& /*cf_opts*/) const override {
+ return Status::NotSupported();
+ }
+
+ std::string GetPrintableOptions() const override { return ""; }
+};
+
+class DummyMergeOperator : public MergeOperator {
+ public:
+ DummyMergeOperator() {}
+ ~DummyMergeOperator() override {}
+
+ bool FullMergeV2(const MergeOperationInput& /*merge_in*/,
+ MergeOperationOutput* /*merge_out*/) const override {
+ return false;
+ }
+
+ bool PartialMergeMulti(const Slice& /*key*/,
+ const std::deque<Slice>& /*operand_list*/,
+ std::string* /*new_value*/,
+ Logger* /*logger*/) const override {
+ return false;
+ }
+
+ const char* Name() const override { return "DummyMergeOperator"; }
+};
+
+class DummySliceTransform : public SliceTransform {
+ public:
+ DummySliceTransform() {}
+ ~DummySliceTransform() override {}
+
+ // Return the name of this transformation.
+ const char* Name() const override { return "DummySliceTransform"; }
+
+ // transform a src in domain to a dst in the range
+ Slice Transform(const Slice& src) const override { return src; }
+
+  // determine whether the transform applies to this src (i.e. whether src is
+  // in the domain)
+ bool InDomain(const Slice& /*src*/) const override { return false; }
+
+ // determine whether dst=Transform(src) for some src
+ bool InRange(const Slice& /*dst*/) const override { return false; }
+};
+
+} // namespace
+
+TEST_F(OptionsUtilTest, SanityCheck) {
+ DBOptions db_opt;
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ const size_t kCFCount = 5;
+ for (size_t i = 0; i < kCFCount; ++i) {
+ cf_descs.emplace_back();
+ cf_descs.back().name =
+ (i == 0) ? kDefaultColumnFamilyName : test::RandomName(&rnd_, 10);
+
+ cf_descs.back().options.table_factory.reset(NewBlockBasedTableFactory());
+ // Assign non-null values to prefix_extractors except the first cf.
+ cf_descs.back().options.prefix_extractor.reset(
+ i != 0 ? test::RandomSliceTransform(&rnd_) : nullptr);
+ cf_descs.back().options.merge_operator.reset(
+ test::RandomMergeOperator(&rnd_));
+ }
+
+ db_opt.create_missing_column_families = true;
+ db_opt.create_if_missing = true;
+
+ ASSERT_OK(DestroyDB(dbname_, Options(db_opt, cf_descs[0].options)));
+ DB* db;
+ std::vector<ColumnFamilyHandle*> handles;
+ // open and persist the options
+ ASSERT_OK(DB::Open(db_opt, dbname_, cf_descs, &handles, &db));
+
+ // close the db
+ for (auto* handle : handles) {
+ delete handle;
+ }
+ delete db;
+
+ ConfigOptions config_options;
+ config_options.ignore_unknown_options = false;
+ config_options.input_strings_escaped = true;
+ config_options.sanity_level = ConfigOptions::kSanityLevelLooselyCompatible;
+ // perform sanity check
+ ASSERT_OK(
+ CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs));
+
+ ASSERT_GE(kCFCount, 5);
+ // merge operator
+ {
+ std::shared_ptr<MergeOperator> merge_op =
+ cf_descs[0].options.merge_operator;
+
+ ASSERT_NE(merge_op.get(), nullptr);
+ cf_descs[0].options.merge_operator.reset();
+ ASSERT_NOK(
+ CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs));
+
+ cf_descs[0].options.merge_operator.reset(new DummyMergeOperator());
+ ASSERT_NOK(
+ CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs));
+
+ cf_descs[0].options.merge_operator = merge_op;
+ ASSERT_OK(
+ CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs));
+ }
+
+ // prefix extractor
+ {
+ std::shared_ptr<const SliceTransform> prefix_extractor =
+ cf_descs[1].options.prefix_extractor;
+
+ // It's okay to set prefix_extractor to nullptr.
+ ASSERT_NE(prefix_extractor, nullptr);
+ cf_descs[1].options.prefix_extractor.reset();
+ ASSERT_OK(
+ CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs));
+
+ cf_descs[1].options.prefix_extractor.reset(new DummySliceTransform());
+ ASSERT_OK(
+ CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs));
+
+ cf_descs[1].options.prefix_extractor = prefix_extractor;
+ ASSERT_OK(
+ CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs));
+ }
+
+ // prefix extractor nullptr case
+ {
+ std::shared_ptr<const SliceTransform> prefix_extractor =
+ cf_descs[0].options.prefix_extractor;
+
+ // It's okay to set prefix_extractor to nullptr.
+ ASSERT_EQ(prefix_extractor, nullptr);
+ cf_descs[0].options.prefix_extractor.reset();
+ ASSERT_OK(
+ CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs));
+
+ // It's okay to change prefix_extractor from nullptr to non-nullptr
+ cf_descs[0].options.prefix_extractor.reset(new DummySliceTransform());
+ ASSERT_OK(
+ CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs));
+
+ cf_descs[0].options.prefix_extractor = prefix_extractor;
+ ASSERT_OK(
+ CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs));
+ }
+
+ // comparator
+ {
+ test::SimpleSuffixReverseComparator comparator;
+
+ auto* prev_comparator = cf_descs[2].options.comparator;
+ cf_descs[2].options.comparator = &comparator;
+ ASSERT_NOK(
+ CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs));
+
+ cf_descs[2].options.comparator = prev_comparator;
+ ASSERT_OK(
+ CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs));
+ }
+
+ // table factory
+ {
+ std::shared_ptr<TableFactory> table_factory =
+ cf_descs[3].options.table_factory;
+
+ ASSERT_NE(table_factory, nullptr);
+ cf_descs[3].options.table_factory.reset(new DummyTableFactory());
+ ASSERT_NOK(
+ CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs));
+
+ cf_descs[3].options.table_factory = table_factory;
+ ASSERT_OK(
+ CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs));
+ }
+ ASSERT_OK(DestroyDB(dbname_, Options(db_opt, cf_descs[0].options)));
+}
+
+TEST_F(OptionsUtilTest, LatestOptionsNotFound) {
+ std::unique_ptr<Env> env(NewMemEnv(Env::Default()));
+ Status s;
+ Options options;
+ ConfigOptions config_opts;
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+
+ options.env = env.get();
+ options.create_if_missing = true;
+ config_opts.env = options.env;
+ config_opts.ignore_unknown_options = false;
+
+ std::vector<std::string> children;
+
+ std::string options_file_name;
+ ASSERT_OK(DestroyDB(dbname_, options));
+ // First, test where the db directory does not exist
+ ASSERT_NOK(options.env->GetChildren(dbname_, &children));
+
+ s = GetLatestOptionsFileName(dbname_, options.env, &options_file_name);
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_TRUE(s.IsPathNotFound());
+
+ s = LoadLatestOptions(dbname_, options.env, &options, &cf_descs);
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_TRUE(s.IsPathNotFound());
+
+ s = LoadLatestOptions(config_opts, dbname_, &options, &cf_descs);
+ ASSERT_TRUE(s.IsPathNotFound());
+
+ s = GetLatestOptionsFileName(dbname_, options.env, &options_file_name);
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_TRUE(s.IsPathNotFound());
+
+ // Second, test where the db directory exists but is empty
+ ASSERT_OK(options.env->CreateDir(dbname_));
+
+ s = GetLatestOptionsFileName(dbname_, options.env, &options_file_name);
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_TRUE(s.IsPathNotFound());
+
+ s = LoadLatestOptions(dbname_, options.env, &options, &cf_descs);
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_TRUE(s.IsPathNotFound());
+
+ // Finally, test where a file exists but is not an "Options" file
+ std::unique_ptr<WritableFile> file;
+ ASSERT_OK(
+ options.env->NewWritableFile(dbname_ + "/temp.txt", &file, EnvOptions()));
+ ASSERT_OK(file->Close());
+ s = GetLatestOptionsFileName(dbname_, options.env, &options_file_name);
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_TRUE(s.IsPathNotFound());
+
+ s = LoadLatestOptions(config_opts, dbname_, &options, &cf_descs);
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_TRUE(s.IsPathNotFound());
+ ASSERT_OK(options.env->DeleteFile(dbname_ + "/temp.txt"));
+ ASSERT_OK(options.env->DeleteDir(dbname_));
+}
+
+TEST_F(OptionsUtilTest, LoadLatestOptions) {
+ Options options;
+ options.OptimizeForSmallDb();
+ ColumnFamilyDescriptor cf_desc;
+ ConfigOptions config_opts;
+ DBOptions db_opts;
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ std::vector<ColumnFamilyHandle*> handles;
+ DB* db;
+ options.create_if_missing = true;
+
+ ASSERT_OK(DestroyDB(dbname_, options));
+
+ cf_descs.emplace_back();
+ cf_descs.back().name = kDefaultColumnFamilyName;
+ cf_descs.back().options.table_factory.reset(NewBlockBasedTableFactory());
+ cf_descs.emplace_back();
+ cf_descs.back().name = "Plain";
+ cf_descs.back().options.table_factory.reset(NewPlainTableFactory());
+ db_opts.create_missing_column_families = true;
+ db_opts.create_if_missing = true;
+
+ // open and persist the options
+ ASSERT_OK(DB::Open(db_opts, dbname_, cf_descs, &handles, &db));
+
+ std::string options_file_name;
+ std::string new_options_file;
+
+ ASSERT_OK(GetLatestOptionsFileName(dbname_, options.env, &options_file_name));
+ ASSERT_OK(LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs));
+ ASSERT_EQ(cf_descs.size(), 2U);
+ ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(config_opts,
+ db->GetDBOptions(), db_opts));
+ ASSERT_OK(handles[0]->GetDescriptor(&cf_desc));
+ ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_opts, cf_desc.options,
+ cf_descs[0].options));
+ ASSERT_OK(handles[1]->GetDescriptor(&cf_desc));
+ ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_opts, cf_desc.options,
+ cf_descs[1].options));
+
+ // Now change some of the DBOptions
+ ASSERT_OK(db->SetDBOptions(
+ {{"delayed_write_rate", "1234"}, {"bytes_per_sync", "32768"}}));
+ ASSERT_OK(GetLatestOptionsFileName(dbname_, options.env, &new_options_file));
+ ASSERT_NE(options_file_name, new_options_file);
+ ASSERT_OK(LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs));
+ ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(config_opts,
+ db->GetDBOptions(), db_opts));
+ options_file_name = new_options_file;
+
+ // Now change some of the ColumnFamilyOptions
+ ASSERT_OK(db->SetOptions(handles[1], {{"write_buffer_size", "32768"}}));
+ ASSERT_OK(GetLatestOptionsFileName(dbname_, options.env, &new_options_file));
+ ASSERT_NE(options_file_name, new_options_file);
+ ASSERT_OK(LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs));
+ ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(config_opts,
+ db->GetDBOptions(), db_opts));
+ ASSERT_OK(handles[0]->GetDescriptor(&cf_desc));
+ ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_opts, cf_desc.options,
+ cf_descs[0].options));
+ ASSERT_OK(handles[1]->GetDescriptor(&cf_desc));
+ ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_opts, cf_desc.options,
+ cf_descs[1].options));
+
+ // close the db
+ for (auto* handle : handles) {
+ delete handle;
+ }
+ delete db;
+ ASSERT_OK(DestroyDB(dbname_, options, cf_descs));
+}
+
+static void WriteOptionsFile(Env* env, const std::string& path,
+ const std::string& options_file, int major,
+ int minor, const std::string& db_opts,
+ const std::string& cf_opts,
+ const std::string& bbt_opts = "") {
+ std::string options_file_header =
+ "\n"
+ "[Version]\n"
+ " rocksdb_version=" +
+ std::to_string(major) + "." + std::to_string(minor) +
+ ".0\n"
+ " options_file_version=1\n";
+
+ std::unique_ptr<WritableFile> wf;
+ ASSERT_OK(env->NewWritableFile(path + "/" + options_file, &wf, EnvOptions()));
+ ASSERT_OK(
+ wf->Append(options_file_header + "[ DBOptions ]\n" + db_opts + "\n"));
+ ASSERT_OK(wf->Append(
+ "[CFOptions \"default\"] # column family must be specified\n" +
+ cf_opts + "\n"));
+ ASSERT_OK(wf->Append("[TableOptions/BlockBasedTable \"default\"]\n" +
+ bbt_opts + "\n"));
+ ASSERT_OK(wf->Close());
+
+ std::string latest_options_file;
+ ASSERT_OK(GetLatestOptionsFileName(path, env, &latest_options_file));
+ ASSERT_EQ(latest_options_file, options_file);
+}
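+
+// For reference, with major=7, minor=7, db_opts="max_open_files=32" and
+// cf_opts="write_buffer_size=1024" (illustrative values only), the helper
+// above produces a file shaped roughly like this:
+//
+//   [Version]
+//     rocksdb_version=7.7.0
+//     options_file_version=1
+//   [ DBOptions ]
+//   max_open_files=32
+//   [CFOptions "default"] # column family must be specified
+//   write_buffer_size=1024
+//   [TableOptions/BlockBasedTable "default"]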
+
+TEST_F(OptionsUtilTest, BadLatestOptions) {
+ Status s;
+ ConfigOptions config_opts;
+ DBOptions db_opts;
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ Options options;
+ options.env = env_.get();
+ config_opts.env = env_.get();
+ config_opts.ignore_unknown_options = false;
+ config_opts.delimiter = "\n";
+
+ ConfigOptions ignore_opts = config_opts;
+ ignore_opts.ignore_unknown_options = true;
+
+ std::string options_file_name;
+
+ // Test where the db directory exists but is empty
+ ASSERT_OK(options.env->CreateDir(dbname_));
+ ASSERT_NOK(
+ GetLatestOptionsFileName(dbname_, options.env, &options_file_name));
+ ASSERT_NOK(LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs));
+
+ // Write an options file for a previous major release with an unknown DB
+ // Option
+ WriteOptionsFile(options.env, dbname_, "OPTIONS-0001", ROCKSDB_MAJOR - 1,
+ ROCKSDB_MINOR, "unknown_db_opt=true", "");
+ s = LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs);
+ ASSERT_NOK(s);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ // Even though ignore_unknown_options=true, we still return an error...
+ s = LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs);
+ ASSERT_NOK(s);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ // Write an options file for a previous minor release with an unknown CF
+ // Option
+ WriteOptionsFile(options.env, dbname_, "OPTIONS-0002", ROCKSDB_MAJOR,
+ ROCKSDB_MINOR - 1, "", "unknown_cf_opt=true");
+ s = LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs);
+ ASSERT_NOK(s);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ // Even though ignore_unknown_options=true, we still return an error...
+ s = LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs);
+ ASSERT_NOK(s);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ // Write an options file for a previous minor release with an unknown BBT
+ // Option
+ WriteOptionsFile(options.env, dbname_, "OPTIONS-0003", ROCKSDB_MAJOR,
+ ROCKSDB_MINOR - 1, "", "", "unknown_bbt_opt=true");
+ s = LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs);
+ ASSERT_NOK(s);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ // Even though ignore_unknown_options=true, we still return an error...
+ s = LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs);
+ ASSERT_NOK(s);
+ ASSERT_TRUE(s.IsInvalidArgument());
+
+ // Write an options file for the current release with an unknown DB Option
+ WriteOptionsFile(options.env, dbname_, "OPTIONS-0004", ROCKSDB_MAJOR,
+ ROCKSDB_MINOR, "unknown_db_opt=true", "");
+ s = LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs);
+ ASSERT_NOK(s);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ // Even though ignore_unknown_options=true, we still return an error...
+ s = LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs);
+ ASSERT_NOK(s);
+ ASSERT_TRUE(s.IsInvalidArgument());
+
+ // Write an options file for the current release with an unknown CF Option
+ WriteOptionsFile(options.env, dbname_, "OPTIONS-0005", ROCKSDB_MAJOR,
+ ROCKSDB_MINOR, "", "unknown_cf_opt=true");
+ s = LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs);
+ ASSERT_NOK(s);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ // Even though ignore_unknown_options=true, we still return an error...
+ s = LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs);
+ ASSERT_NOK(s);
+ ASSERT_TRUE(s.IsInvalidArgument());
+
+ // Write an options file for the current release with an invalid DB Option
+ WriteOptionsFile(options.env, dbname_, "OPTIONS-0006", ROCKSDB_MAJOR,
+ ROCKSDB_MINOR, "create_if_missing=hello", "");
+ s = LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs);
+ ASSERT_NOK(s);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ // Even though ignore_unknown_options=true, we still return an error...
+ s = LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs);
+ ASSERT_NOK(s);
+ ASSERT_TRUE(s.IsInvalidArgument());
+
+ // Write an options file for the next release with an invalid DB Option
+ WriteOptionsFile(options.env, dbname_, "OPTIONS-0007", ROCKSDB_MAJOR,
+ ROCKSDB_MINOR + 1, "create_if_missing=hello", "");
+ ASSERT_NOK(LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs));
+ ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs));
+
+ // Write an options file for the next release with an unknown DB Option
+ WriteOptionsFile(options.env, dbname_, "OPTIONS-0008", ROCKSDB_MAJOR,
+ ROCKSDB_MINOR + 1, "unknown_db_opt=true", "");
+ ASSERT_NOK(LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs));
+ // Ignore the errors for future releases when ignore_unknown_options=true
+ ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs));
+
+ // Write an options file for the next major release with an unknown CF Option
+ WriteOptionsFile(options.env, dbname_, "OPTIONS-0009", ROCKSDB_MAJOR + 1,
+ ROCKSDB_MINOR, "", "unknown_cf_opt=true");
+ ASSERT_NOK(LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs));
+ // Ignore the errors for future releases when ignore_unknown_options=true
+ ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs));
+}
+
+TEST_F(OptionsUtilTest, RenameDatabaseDirectory) {
+ DB* db;
+ Options options;
+ DBOptions db_opts;
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ std::vector<ColumnFamilyHandle*> handles;
+
+ options.create_if_missing = true;
+
+ ASSERT_OK(DB::Open(options, dbname_, &db));
+ ASSERT_OK(db->Put(WriteOptions(), "foo", "value0"));
+ delete db;
+
+ auto new_dbname = dbname_ + "_2";
+
+ ASSERT_OK(options.env->RenameFile(dbname_, new_dbname));
+ ASSERT_OK(LoadLatestOptions(new_dbname, options.env, &db_opts, &cf_descs));
+ ASSERT_EQ(cf_descs.size(), 1U);
+
+ db_opts.create_if_missing = false;
+ ASSERT_OK(DB::Open(db_opts, new_dbname, cf_descs, &handles, &db));
+ std::string value;
+ ASSERT_OK(db->Get(ReadOptions(), "foo", &value));
+ ASSERT_EQ("value0", value);
+ // close the db
+ for (auto* handle : handles) {
+ delete handle;
+ }
+ delete db;
+ Options new_options(db_opts, cf_descs[0].options);
+ ASSERT_OK(DestroyDB(new_dbname, new_options, cf_descs));
+ ASSERT_OK(DestroyDB(dbname_, options));
+}
+
+TEST_F(OptionsUtilTest, WalDirSettings) {
+ DB* db;
+ Options options;
+ DBOptions db_opts;
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ std::vector<ColumnFamilyHandle*> handles;
+
+ options.create_if_missing = true;
+
+ // Open a DB with no wal dir set. The wal_dir should stay empty
+ ASSERT_OK(DB::Open(options, dbname_, &db));
+ delete db;
+ ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs));
+ ASSERT_EQ(db_opts.wal_dir, "");
+
+ // Open a DB with wal_dir == dbname. The wal_dir should be set to empty
+ options.wal_dir = dbname_;
+ ASSERT_OK(DB::Open(options, dbname_, &db));
+ delete db;
+ ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs));
+ ASSERT_EQ(db_opts.wal_dir, "");
+
+ // Open a DB with no wal_dir but a db_path==dbname_. The wal_dir should be
+ // empty
+ options.wal_dir = "";
+ options.db_paths.emplace_back(dbname_, std::numeric_limits<uint64_t>::max());
+ ASSERT_OK(DB::Open(options, dbname_, &db));
+ delete db;
+ ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs));
+ ASSERT_EQ(db_opts.wal_dir, "");
+
+  // Open a DB with wal_dir == dbname_ + "/" and db_path == dbname_. The
+  // wal_dir should be empty
+ options.wal_dir = dbname_ + "/";
+ options.db_paths.emplace_back(dbname_, std::numeric_limits<uint64_t>::max());
+ ASSERT_OK(DB::Open(options, dbname_, &db));
+ delete db;
+ ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs));
+ ASSERT_EQ(db_opts.wal_dir, "");
+ ASSERT_OK(DestroyDB(dbname_, options));
+
+ // Open a DB with no wal_dir but db_path != db_name. The wal_dir == dbname_
+ options.wal_dir = "";
+ options.db_paths.clear();
+ options.db_paths.emplace_back(dbname_ + "_0",
+ std::numeric_limits<uint64_t>::max());
+ ASSERT_OK(DB::Open(options, dbname_, &db));
+ delete db;
+ ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs));
+ ASSERT_EQ(db_opts.wal_dir, dbname_);
+ ASSERT_OK(DestroyDB(dbname_, options));
+
+ // Open a DB with wal_dir != db_name. The wal_dir remains unchanged
+ options.wal_dir = dbname_ + "/wal";
+ options.db_paths.clear();
+ ASSERT_OK(DB::Open(options, dbname_, &db));
+ delete db;
+ ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs));
+ ASSERT_EQ(db_opts.wal_dir, dbname_ + "/wal");
+ ASSERT_OK(DestroyDB(dbname_, options));
+}
+
+TEST_F(OptionsUtilTest, WalDirInOptions) {
+ DB* db;
+ Options options;
+ DBOptions db_opts;
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ std::vector<ColumnFamilyHandle*> handles;
+
+ // Store an options file with wal_dir=dbname_ and make sure it still loads
+ // when the input wal_dir is empty
+ options.create_if_missing = true;
+ options.wal_dir = "";
+ ASSERT_OK(DB::Open(options, dbname_, &db));
+ delete db;
+ options.wal_dir = dbname_;
+ std::string options_file;
+ ASSERT_OK(GetLatestOptionsFileName(dbname_, options.env, &options_file));
+ ASSERT_OK(PersistRocksDBOptions(options, {"default"}, {options},
+ dbname_ + "/" + options_file,
+ options.env->GetFileSystem().get()));
+ ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs));
+ ASSERT_EQ(db_opts.wal_dir, dbname_);
+ options.wal_dir = "";
+ ASSERT_OK(DB::Open(options, dbname_, &db));
+ delete db;
+ ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs));
+ ASSERT_EQ(db_opts.wal_dir, "");
+}
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+#ifdef GFLAGS
+ ParseCommandLineFlags(&argc, &argv, true);
+#endif // GFLAGS
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <cstdio>
+
+int main(int /*argc*/, char** /*argv*/) {
+ printf("Skipped in RocksDBLite as utilities are not supported.\n");
+ return 0;
+}
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/persistent_cache/block_cache_tier.cc b/src/rocksdb/utilities/persistent_cache/block_cache_tier.cc
new file mode 100644
index 000000000..8ad9bb1b1
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/block_cache_tier.cc
@@ -0,0 +1,422 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#ifndef ROCKSDB_LITE
+
+#include "utilities/persistent_cache/block_cache_tier.h"
+
+#include <utility>
+#include <vector>
+
+#include "logging/logging.h"
+#include "port/port.h"
+#include "test_util/sync_point.h"
+#include "util/stop_watch.h"
+#include "utilities/persistent_cache/block_cache_tier_file.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+//
+// BlockCacheTier
+//
+Status BlockCacheTier::Open() {
+ Status status;
+
+ WriteLock _(&lock_);
+
+ assert(!size_);
+
+ // Check the validity of the options
+ status = opt_.ValidateSettings();
+ assert(status.ok());
+ if (!status.ok()) {
+ Error(opt_.log, "Invalid block cache options");
+ return status;
+ }
+
+ // Create base directory or cleanup existing directory
+ status = opt_.env->CreateDirIfMissing(opt_.path);
+ if (!status.ok()) {
+ Error(opt_.log, "Error creating directory %s. %s", opt_.path.c_str(),
+ status.ToString().c_str());
+ return status;
+ }
+
+ // Create base/<cache dir> directory
+ status = opt_.env->CreateDir(GetCachePath());
+ if (!status.ok()) {
+ // directory already exists, clean it up
+ status = CleanupCacheFolder(GetCachePath());
+ assert(status.ok());
+ if (!status.ok()) {
+ Error(opt_.log, "Error creating directory %s. %s", opt_.path.c_str(),
+ status.ToString().c_str());
+ return status;
+ }
+ }
+
+ // create a new file
+ assert(!cache_file_);
+ status = NewCacheFile();
+ if (!status.ok()) {
+ Error(opt_.log, "Error creating new file %s. %s", opt_.path.c_str(),
+ status.ToString().c_str());
+ return status;
+ }
+
+ assert(cache_file_);
+
+ if (opt_.pipeline_writes) {
+ assert(!insert_th_.joinable());
+ insert_th_ = port::Thread(&BlockCacheTier::InsertMain, this);
+ }
+
+ return Status::OK();
+}
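+
+// After a successful Open(), the on-disk layout is (a sketch; the exact cache
+// file naming is implemented in block_cache_tier_file.cc):
+//
+//   <opt_.path>/            base directory, created if missing
+//   <opt_.path>/cache/      GetCachePath(); any stale *.rc files found here
+//                           are deleted by CleanupCacheFolder()
+//   <opt_.path>/cache/*.rc  cache files appended to via NewCacheFile()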
+
+bool IsCacheFile(const std::string& file) {
+  // Check whether the file has the ".rc" suffix.
+  // Unfortunately, regex support across compilers is uneven, so we use simple
+  // string parsing.
+ size_t pos = file.find(".");
+ if (pos == std::string::npos) {
+ return false;
+ }
+
+ std::string suffix = file.substr(pos);
+ return suffix == ".rc";
+}
+
+Status BlockCacheTier::CleanupCacheFolder(const std::string& folder) {
+ std::vector<std::string> files;
+ Status status = opt_.env->GetChildren(folder, &files);
+ if (!status.ok()) {
+ Error(opt_.log, "Error getting files for %s. %s", folder.c_str(),
+ status.ToString().c_str());
+ return status;
+ }
+
+  // Clean up cache files, which follow the pattern <digits>.rc
+ for (auto file : files) {
+ if (IsCacheFile(file)) {
+ // cache file
+ Info(opt_.log, "Removing file %s.", file.c_str());
+ status = opt_.env->DeleteFile(folder + "/" + file);
+ if (!status.ok()) {
+ Error(opt_.log, "Error deleting file %s. %s", file.c_str(),
+ status.ToString().c_str());
+ return status;
+ }
+ } else {
+ ROCKS_LOG_DEBUG(opt_.log, "Skipping file %s", file.c_str());
+ }
+ }
+ return Status::OK();
+}
+
+Status BlockCacheTier::Close() {
+ // stop the insert thread
+ if (opt_.pipeline_writes && insert_th_.joinable()) {
+ InsertOp op(/*quit=*/true);
+ insert_ops_.Push(std::move(op));
+ insert_th_.join();
+ }
+
+  // stop the writer before clearing the metadata
+ writer_.Stop();
+
+ // clear all metadata
+ WriteLock _(&lock_);
+ metadata_.Clear();
+ return Status::OK();
+}
+
+template <class T>
+void Add(std::map<std::string, double>* stats, const std::string& key,
+ const T& t) {
+ stats->insert({key, static_cast<double>(t)});
+}
+
+PersistentCache::StatsType BlockCacheTier::Stats() {
+ std::map<std::string, double> stats;
+ Add(&stats, "persistentcache.blockcachetier.bytes_piplined",
+ stats_.bytes_pipelined_.Average());
+ Add(&stats, "persistentcache.blockcachetier.bytes_written",
+ stats_.bytes_written_.Average());
+ Add(&stats, "persistentcache.blockcachetier.bytes_read",
+ stats_.bytes_read_.Average());
+ Add(&stats, "persistentcache.blockcachetier.insert_dropped",
+ stats_.insert_dropped_);
+ Add(&stats, "persistentcache.blockcachetier.cache_hits", stats_.cache_hits_);
+ Add(&stats, "persistentcache.blockcachetier.cache_misses",
+ stats_.cache_misses_);
+ Add(&stats, "persistentcache.blockcachetier.cache_errors",
+ stats_.cache_errors_);
+ Add(&stats, "persistentcache.blockcachetier.cache_hits_pct",
+ stats_.CacheHitPct());
+ Add(&stats, "persistentcache.blockcachetier.cache_misses_pct",
+ stats_.CacheMissPct());
+ Add(&stats, "persistentcache.blockcachetier.read_hit_latency",
+ stats_.read_hit_latency_.Average());
+ Add(&stats, "persistentcache.blockcachetier.read_miss_latency",
+ stats_.read_miss_latency_.Average());
+ Add(&stats, "persistentcache.blockcachetier.write_latency",
+ stats_.write_latency_.Average());
+
+ auto out = PersistentCacheTier::Stats();
+ out.push_back(stats);
+ return out;
+}
+
+Status BlockCacheTier::Insert(const Slice& key, const char* data,
+ const size_t size) {
+ // update stats
+ stats_.bytes_pipelined_.Add(size);
+
+ if (opt_.pipeline_writes) {
+    // offload the write to the background insert thread
+ insert_ops_.Push(
+ InsertOp(key.ToString(), std::move(std::string(data, size))));
+ return Status::OK();
+ }
+
+ assert(!opt_.pipeline_writes);
+ return InsertImpl(key, Slice(data, size));
+}
+
+void BlockCacheTier::InsertMain() {
+ while (true) {
+ InsertOp op(insert_ops_.Pop());
+
+ if (op.signal_) {
+ // that is a secret signal to exit
+ break;
+ }
+
+ size_t retry = 0;
+ Status s;
+ while ((s = InsertImpl(Slice(op.key_), Slice(op.data_))).IsTryAgain()) {
+ if (retry > kMaxRetry) {
+ break;
+ }
+
+      // This can happen when the buffers are full; wait until some buffers
+      // are freed. We do not wait inside InsertImpl itself because we want
+      // to support both the pipelined and the non-pipelined mode.
+ buffer_allocator_.WaitUntilUsable();
+ retry++;
+ }
+
+ if (!s.ok()) {
+ stats_.insert_dropped_++;
+ }
+ }
+}
+
+Status BlockCacheTier::InsertImpl(const Slice& key, const Slice& data) {
+ // pre-condition
+ assert(key.size());
+ assert(data.size());
+ assert(cache_file_);
+
+ StopWatchNano timer(opt_.clock, /*auto_start=*/true);
+
+ WriteLock _(&lock_);
+
+ LBA lba;
+ if (metadata_.Lookup(key, &lba)) {
+ // the key already exists, this is duplicate insert
+ return Status::OK();
+ }
+
+ while (!cache_file_->Append(key, data, &lba)) {
+ if (!cache_file_->Eof()) {
+ ROCKS_LOG_DEBUG(opt_.log, "Error inserting to cache file %d",
+ cache_file_->cacheid());
+ stats_.write_latency_.Add(timer.ElapsedNanos() / 1000);
+ return Status::TryAgain();
+ }
+
+ assert(cache_file_->Eof());
+ Status status = NewCacheFile();
+ if (!status.ok()) {
+ return status;
+ }
+ }
+
+ // Insert into lookup index
+ BlockInfo* info = metadata_.Insert(key, lba);
+ assert(info);
+ if (!info) {
+ return Status::IOError("Unexpected error inserting to index");
+ }
+
+ // insert to cache file reverse mapping
+ cache_file_->Add(info);
+
+ // update stats
+ stats_.bytes_written_.Add(data.size());
+ stats_.write_latency_.Add(timer.ElapsedNanos() / 1000);
+ return Status::OK();
+}
+
+Status BlockCacheTier::Lookup(const Slice& key, std::unique_ptr<char[]>* val,
+ size_t* size) {
+ StopWatchNano timer(opt_.clock, /*auto_start=*/true);
+
+ LBA lba;
+ bool status;
+ status = metadata_.Lookup(key, &lba);
+ if (!status) {
+ stats_.cache_misses_++;
+ stats_.read_miss_latency_.Add(timer.ElapsedNanos() / 1000);
+ return Status::NotFound("blockcache: key not found");
+ }
+
+ BlockCacheFile* const file = metadata_.Lookup(lba.cache_id_);
+ if (!file) {
+ // this can happen because the block index and cache file index are
+ // different, and the cache file might be removed between the two lookups
+ stats_.cache_misses_++;
+ stats_.read_miss_latency_.Add(timer.ElapsedNanos() / 1000);
+ return Status::NotFound("blockcache: cache file not found");
+ }
+
+ assert(file->refs_);
+
+ std::unique_ptr<char[]> scratch(new char[lba.size_]);
+ Slice blk_key;
+ Slice blk_val;
+
+ status = file->Read(lba, &blk_key, &blk_val, scratch.get());
+ --file->refs_;
+ if (!status) {
+ stats_.cache_misses_++;
+ stats_.cache_errors_++;
+ stats_.read_miss_latency_.Add(timer.ElapsedNanos() / 1000);
+ return Status::NotFound("blockcache: error reading data");
+ }
+
+ assert(blk_key == key);
+
+ val->reset(new char[blk_val.size()]);
+ memcpy(val->get(), blk_val.data(), blk_val.size());
+ *size = blk_val.size();
+
+ stats_.bytes_read_.Add(*size);
+ stats_.cache_hits_++;
+ stats_.read_hit_latency_.Add(timer.ElapsedNanos() / 1000);
+
+ return Status::OK();
+}
+
+bool BlockCacheTier::Erase(const Slice& key) {
+ WriteLock _(&lock_);
+ BlockInfo* info = metadata_.Remove(key);
+ assert(info);
+ delete info;
+ return true;
+}
+
+Status BlockCacheTier::NewCacheFile() {
+ lock_.AssertHeld();
+
+ TEST_SYNC_POINT_CALLBACK("BlockCacheTier::NewCacheFile:DeleteDir",
+ (void*)(GetCachePath().c_str()));
+
+ std::unique_ptr<WriteableCacheFile> f(new WriteableCacheFile(
+ opt_.env, &buffer_allocator_, &writer_, GetCachePath(), writer_cache_id_,
+ opt_.cache_file_size, opt_.log));
+
+ bool status = f->Create(opt_.enable_direct_writes, opt_.enable_direct_reads);
+ if (!status) {
+ return Status::IOError("Error creating file");
+ }
+
+ Info(opt_.log, "Created cache file %d", writer_cache_id_);
+
+ writer_cache_id_++;
+ cache_file_ = f.release();
+
+ // insert to cache files tree
+ status = metadata_.Insert(cache_file_);
+ assert(status);
+ if (!status) {
+ Error(opt_.log, "Error inserting to metadata");
+ return Status::IOError("Error inserting to metadata");
+ }
+
+ return Status::OK();
+}
+
+bool BlockCacheTier::Reserve(const size_t size) {
+ WriteLock _(&lock_);
+ assert(size_ <= opt_.cache_size);
+
+ if (size + size_ <= opt_.cache_size) {
+ // there is enough space to write
+ size_ += size;
+ return true;
+ }
+
+ assert(size + size_ >= opt_.cache_size);
+ // there is not enough space to fit the requested data
+ // we can clear some space by evicting cold data
+
+ const double retain_fac = (100 - kEvictPct) / static_cast<double>(100);
+ while (size + size_ > opt_.cache_size * retain_fac) {
+ std::unique_ptr<BlockCacheFile> f(metadata_.Evict());
+ if (!f) {
+ // nothing is evictable
+ return false;
+ }
+ assert(!f->refs_);
+ uint64_t file_size;
+ if (!f->Delete(&file_size).ok()) {
+ // unable to delete file
+ return false;
+ }
+
+ assert(file_size <= size_);
+ size_ -= file_size;
+ }
+
+ size_ += size;
+ assert(size_ <= opt_.cache_size * 0.9);
+ return true;
+}
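+
+// A worked example of the eviction math above (illustrative numbers): with
+// opt_.cache_size = 100MB and kEvictPct = 10 (so retain_fac = 0.9), a request
+// to reserve size = 4MB while size_ = 95MB evicts whole cache files until
+// size + size_ <= 90MB, i.e. until size_ <= 86MB; size_ then grows to at most
+// 90MB, which satisfies the final assertion.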
+
+Status NewPersistentCache(Env* const env, const std::string& path,
+ const uint64_t size,
+ const std::shared_ptr<Logger>& log,
+ const bool optimized_for_nvm,
+ std::shared_ptr<PersistentCache>* cache) {
+ if (!cache) {
+ return Status::IOError("invalid argument cache");
+ }
+
+ auto opt = PersistentCacheConfig(env, path, size, log);
+ if (optimized_for_nvm) {
+ // the default settings are optimized for SSD
+ // NVM devices are better accessed with 4K direct IO and written with
+ // parallelism
+ opt.enable_direct_writes = true;
+ opt.writer_qdepth = 4;
+ opt.writer_dispatch_size = 4 * 1024;
+ }
+
+ auto pcache = std::make_shared<BlockCacheTier>(opt);
+ Status s = pcache->Open();
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ *cache = pcache;
+ return s;
+}
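+
+// Typical usage (a sketch; the path and the sizes are placeholders chosen for
+// illustration, not defaults of this implementation):
+//
+//   std::shared_ptr<PersistentCache> pcache;
+//   Status s = NewPersistentCache(Env::Default(), "/mnt/ssd/rocksdb_pcache",
+//                                 /*size=*/4ull * 1024 * 1024 * 1024,
+//                                 /*log=*/nullptr,
+//                                 /*optimized_for_nvm=*/true, &pcache);
+//   if (s.ok()) {
+//     BlockBasedTableOptions bbto;
+//     bbto.persistent_cache = pcache;
+//     // Use bbto with NewBlockBasedTableFactory() in ColumnFamilyOptions.
+//   }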
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ifndef ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/persistent_cache/block_cache_tier.h b/src/rocksdb/utilities/persistent_cache/block_cache_tier.h
new file mode 100644
index 000000000..1aac287cc
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/block_cache_tier.h
@@ -0,0 +1,156 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#ifndef OS_WIN
+#include <unistd.h>
+#endif // ! OS_WIN
+
+#include <atomic>
+#include <list>
+#include <memory>
+#include <set>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <thread>
+
+#include "memory/arena.h"
+#include "memtable/skiplist.h"
+#include "monitoring/histogram.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/persistent_cache.h"
+#include "rocksdb/system_clock.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/mutexlock.h"
+#include "utilities/persistent_cache/block_cache_tier_file.h"
+#include "utilities/persistent_cache/block_cache_tier_metadata.h"
+#include "utilities/persistent_cache/persistent_cache_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+//
+// Block cache tier implementation
+//
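+// Instances are normally created through NewPersistentCache() (defined in
+// block_cache_tier.cc) rather than constructed directly. Inserts are written
+// synchronously, or, when opt_.pipeline_writes is set, queued on insert_ops_
+// and drained by a dedicated insert thread. (This note summarizes the
+// implementation below; it does not add API surface.)
+//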
+class BlockCacheTier : public PersistentCacheTier {
+ public:
+ explicit BlockCacheTier(const PersistentCacheConfig& opt)
+ : opt_(opt),
+ insert_ops_(static_cast<size_t>(opt_.max_write_pipeline_backlog_size)),
+ buffer_allocator_(opt.write_buffer_size, opt.write_buffer_count()),
+ writer_(this, opt_.writer_qdepth,
+ static_cast<size_t>(opt_.writer_dispatch_size)) {
+ Info(opt_.log, "Initializing allocator. size=%d B count=%" ROCKSDB_PRIszt,
+ opt_.write_buffer_size, opt_.write_buffer_count());
+ }
+
+ virtual ~BlockCacheTier() {
+ // Close is re-entrant so we can call close even if it is already closed
+ Close().PermitUncheckedError();
+ assert(!insert_th_.joinable());
+ }
+
+ Status Insert(const Slice& key, const char* data, const size_t size) override;
+ Status Lookup(const Slice& key, std::unique_ptr<char[]>* data,
+ size_t* size) override;
+ Status Open() override;
+ Status Close() override;
+ bool Erase(const Slice& key) override;
+ bool Reserve(const size_t size) override;
+
+ bool IsCompressed() override { return opt_.is_compressed; }
+
+ std::string GetPrintableOptions() const override { return opt_.ToString(); }
+
+ PersistentCache::StatsType Stats() override;
+
+ void TEST_Flush() override {
+ while (insert_ops_.Size()) {
+ /* sleep override */
+ SystemClock::Default()->SleepForMicroseconds(1000000);
+ }
+ }
+
+ private:
+ // Percentage of cache to be evicted when the cache is full
+ static const size_t kEvictPct = 10;
+ // Max attempts to insert key, value to cache in pipelined mode
+ static const size_t kMaxRetry = 3;
+
+ // Pipelined operation
+ struct InsertOp {
+ explicit InsertOp(const bool signal) : signal_(signal) {}
+ explicit InsertOp(std::string&& key, const std::string& data)
+ : key_(std::move(key)), data_(data) {}
+ ~InsertOp() {}
+
+ InsertOp() = delete;
+ InsertOp(InsertOp&& /*rhs*/) = default;
+ InsertOp& operator=(InsertOp&& rhs) = default;
+
+ // used for estimating size by bounded queue
+ size_t Size() { return data_.size() + key_.size(); }
+
+ std::string key_;
+ std::string data_;
+ bool signal_ = false; // signal to request processing thread to exit
+ };
+
+ // entry point for insert thread
+ void InsertMain();
+ // insert implementation
+ Status InsertImpl(const Slice& key, const Slice& data);
+ // Create a new cache file
+ Status NewCacheFile();
+ // Get cache directory path
+ std::string GetCachePath() const { return opt_.path + "/cache"; }
+ // Cleanup folder
+ Status CleanupCacheFolder(const std::string& folder);
+
+ // Statistics
+ struct Statistics {
+ HistogramImpl bytes_pipelined_;
+ HistogramImpl bytes_written_;
+ HistogramImpl bytes_read_;
+ HistogramImpl read_hit_latency_;
+ HistogramImpl read_miss_latency_;
+ HistogramImpl write_latency_;
+ std::atomic<uint64_t> cache_hits_{0};
+ std::atomic<uint64_t> cache_misses_{0};
+ std::atomic<uint64_t> cache_errors_{0};
+ std::atomic<uint64_t> insert_dropped_{0};
+
+ double CacheHitPct() const {
+ const auto lookups = cache_hits_ + cache_misses_;
+ return lookups ? 100 * cache_hits_ / static_cast<double>(lookups) : 0.0;
+ }
+
+ double CacheMissPct() const {
+ const auto lookups = cache_hits_ + cache_misses_;
+ return lookups ? 100 * cache_misses_ / static_cast<double>(lookups) : 0.0;
+ }
+ };
+
+ port::RWMutex lock_; // Synchronization
+ const PersistentCacheConfig opt_; // BlockCache options
+ BoundedQueue<InsertOp> insert_ops_; // Ops waiting for insert
+ ROCKSDB_NAMESPACE::port::Thread insert_th_; // Insert thread
+ uint32_t writer_cache_id_ = 0; // Current cache file identifier
+ WriteableCacheFile* cache_file_ = nullptr; // Current cache file reference
+ CacheWriteBufferAllocator buffer_allocator_; // Buffer provider
+ ThreadedWriter writer_; // Writer threads
+ BlockCacheTierMetadata metadata_; // Cache meta data manager
+ std::atomic<uint64_t> size_{0}; // Size of the cache
+ Statistics stats_; // Statistics
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc b/src/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc
new file mode 100644
index 000000000..f4f8517ab
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc
@@ -0,0 +1,610 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#ifndef ROCKSDB_LITE
+
+#include "utilities/persistent_cache/block_cache_tier_file.h"
+
+#ifndef OS_WIN
+#include <unistd.h>
+#endif
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "env/composite_env_wrapper.h"
+#include "logging/logging.h"
+#include "port/port.h"
+#include "rocksdb/system_clock.h"
+#include "util/crc32c.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+//
+// File creation factories
+//
+Status NewWritableCacheFile(Env* const env, const std::string& filepath,
+ std::unique_ptr<WritableFile>* file,
+ const bool use_direct_writes = false) {
+ EnvOptions opt;
+ opt.use_direct_writes = use_direct_writes;
+ Status s = env->NewWritableFile(filepath, file, opt);
+ return s;
+}
+
+Status NewRandomAccessCacheFile(const std::shared_ptr<FileSystem>& fs,
+ const std::string& filepath,
+ std::unique_ptr<FSRandomAccessFile>* file,
+ const bool use_direct_reads = true) {
+ assert(fs.get());
+
+ FileOptions opt;
+ opt.use_direct_reads = use_direct_reads;
+ return fs->NewRandomAccessFile(filepath, opt, file, nullptr);
+}
+
+//
+// BlockCacheFile
+//
+Status BlockCacheFile::Delete(uint64_t* size) {
+ assert(env_);
+
+ Status status = env_->GetFileSize(Path(), size);
+ if (!status.ok()) {
+ return status;
+ }
+ return env_->DeleteFile(Path());
+}
+
+//
+// CacheRecord
+//
+// Cache record represents the record on disk
+//
+// +--------+---------+----------+------------+---------------+-------------+
+// | magic | crc | key size | value size | key data | value data |
+// +--------+---------+----------+------------+---------------+-------------+
+// <-- 4 --><-- 4 --><-- 4 --><-- 4 --><-- key size --><-- v-size -->
+//
+struct CacheRecordHeader {
+ CacheRecordHeader() : magic_(0), crc_(0), key_size_(0), val_size_(0) {}
+ CacheRecordHeader(const uint32_t magic, const uint32_t key_size,
+ const uint32_t val_size)
+ : magic_(magic), crc_(0), key_size_(key_size), val_size_(val_size) {}
+
+ uint32_t magic_;
+ uint32_t crc_;
+ uint32_t key_size_;
+ uint32_t val_size_;
+};
+
+struct CacheRecord {
+ CacheRecord() {}
+ CacheRecord(const Slice& key, const Slice& val)
+ : hdr_(MAGIC, static_cast<uint32_t>(key.size()),
+ static_cast<uint32_t>(val.size())),
+ key_(key),
+ val_(val) {
+ hdr_.crc_ = ComputeCRC();
+ }
+
+ uint32_t ComputeCRC() const;
+ bool Serialize(std::vector<CacheWriteBuffer*>* bufs, size_t* woff);
+ bool Deserialize(const Slice& buf);
+
+ static uint32_t CalcSize(const Slice& key, const Slice& val) {
+ return static_cast<uint32_t>(sizeof(CacheRecordHeader) + key.size() +
+ val.size());
+ }
+
+ static const uint32_t MAGIC = 0xfefa;
+
+ bool Append(std::vector<CacheWriteBuffer*>* bufs, size_t* woff,
+ const char* data, const size_t size);
+
+ CacheRecordHeader hdr_;
+ Slice key_;
+ Slice val_;
+};
+
+static_assert(sizeof(CacheRecordHeader) == 16, "DataHeader is not aligned");
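Editorial size check against the layout comment above: the header is fixed at 16 bytes, so CacheRecord::CalcSize() for a 20-byte key and a 4096-byte value returns 16 + 20 + 4096 = 4132 bytes, which is exactly the rec_size that WriteableCacheFile::Append() later reserves in its buffers and records in the LBA.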
+
+uint32_t CacheRecord::ComputeCRC() const {
+ uint32_t crc = 0;
+ CacheRecordHeader tmp = hdr_;
+ tmp.crc_ = 0;
+ crc = crc32c::Extend(crc, reinterpret_cast<const char*>(&tmp), sizeof(tmp));
+ crc = crc32c::Extend(crc, reinterpret_cast<const char*>(key_.data()),
+ key_.size());
+ crc = crc32c::Extend(crc, reinterpret_cast<const char*>(val_.data()),
+ val_.size());
+ return crc;
+}
+
+bool CacheRecord::Serialize(std::vector<CacheWriteBuffer*>* bufs,
+ size_t* woff) {
+ assert(bufs->size());
+ return Append(bufs, woff, reinterpret_cast<const char*>(&hdr_),
+ sizeof(hdr_)) &&
+ Append(bufs, woff, reinterpret_cast<const char*>(key_.data()),
+ key_.size()) &&
+ Append(bufs, woff, reinterpret_cast<const char*>(val_.data()),
+ val_.size());
+}
+
+bool CacheRecord::Append(std::vector<CacheWriteBuffer*>* bufs, size_t* woff,
+ const char* data, const size_t data_size) {
+ assert(*woff < bufs->size());
+
+ const char* p = data;
+ size_t size = data_size;
+
+ while (size && *woff < bufs->size()) {
+ CacheWriteBuffer* buf = (*bufs)[*woff];
+ const size_t free = buf->Free();
+ if (size <= free) {
+ buf->Append(p, size);
+ size = 0;
+ } else {
+ buf->Append(p, free);
+ p += free;
+ size -= free;
+ assert(!buf->Free());
+ assert(buf->Used() == buf->Capacity());
+ }
+
+ if (!buf->Free()) {
+ *woff += 1;
+ }
+ }
+
+ assert(!size);
+
+ return !size;
+}
+
+bool CacheRecord::Deserialize(const Slice& data) {
+ assert(data.size() >= sizeof(CacheRecordHeader));
+ if (data.size() < sizeof(CacheRecordHeader)) {
+ return false;
+ }
+
+ memcpy(&hdr_, data.data(), sizeof(hdr_));
+
+ assert(hdr_.key_size_ + hdr_.val_size_ + sizeof(hdr_) == data.size());
+ if (hdr_.key_size_ + hdr_.val_size_ + sizeof(hdr_) != data.size()) {
+ return false;
+ }
+
+ key_ = Slice(data.data_ + sizeof(hdr_), hdr_.key_size_);
+ val_ = Slice(key_.data_ + hdr_.key_size_, hdr_.val_size_);
+
+ if (!(hdr_.magic_ == MAGIC && ComputeCRC() == hdr_.crc_)) {
+ fprintf(stderr, "** magic %d ** \n", hdr_.magic_);
+ fprintf(stderr, "** key_size %d ** \n", hdr_.key_size_);
+ fprintf(stderr, "** val_size %d ** \n", hdr_.val_size_);
+ fprintf(stderr, "** key %s ** \n", key_.ToString().c_str());
+ fprintf(stderr, "** val %s ** \n", val_.ToString().c_str());
+ for (size_t i = 0; i < hdr_.val_size_; ++i) {
+ fprintf(stderr, "%d.", (uint8_t)val_.data()[i]);
+ }
+ fprintf(stderr, "\n** cksum %d != %d **", hdr_.crc_, ComputeCRC());
+ }
+
+ assert(hdr_.magic_ == MAGIC && ComputeCRC() == hdr_.crc_);
+ return hdr_.magic_ == MAGIC && ComputeCRC() == hdr_.crc_;
+}
+
+//
+// RandomAccessFile
+//
+
+bool RandomAccessCacheFile::Open(const bool enable_direct_reads) {
+ WriteLock _(&rwlock_);
+ return OpenImpl(enable_direct_reads);
+}
+
+bool RandomAccessCacheFile::OpenImpl(const bool enable_direct_reads) {
+ rwlock_.AssertHeld();
+
+ ROCKS_LOG_DEBUG(log_, "Opening cache file %s", Path().c_str());
+ assert(env_);
+
+ std::unique_ptr<FSRandomAccessFile> file;
+ Status status = NewRandomAccessCacheFile(env_->GetFileSystem(), Path(), &file,
+ enable_direct_reads);
+ if (!status.ok()) {
+ Error(log_, "Error opening random access file %s. %s", Path().c_str(),
+ status.ToString().c_str());
+ return false;
+ }
+ freader_.reset(new RandomAccessFileReader(std::move(file), Path(),
+ env_->GetSystemClock().get()));
+
+ return true;
+}
+
+bool RandomAccessCacheFile::Read(const LBA& lba, Slice* key, Slice* val,
+ char* scratch) {
+ ReadLock _(&rwlock_);
+
+ assert(lba.cache_id_ == cache_id_);
+
+ if (!freader_) {
+ return false;
+ }
+
+ Slice result;
+ Status s = freader_->Read(IOOptions(), lba.off_, lba.size_, &result, scratch,
+ nullptr, Env::IO_TOTAL /* rate_limiter_priority */);
+ if (!s.ok()) {
+ Error(log_, "Error reading from file %s. %s", Path().c_str(),
+ s.ToString().c_str());
+ return false;
+ }
+
+ assert(result.data() == scratch);
+
+ return ParseRec(lba, key, val, scratch);
+}
+
+bool RandomAccessCacheFile::ParseRec(const LBA& lba, Slice* key, Slice* val,
+ char* scratch) {
+ Slice data(scratch, lba.size_);
+
+ CacheRecord rec;
+ if (!rec.Deserialize(data)) {
+ assert(!"Error deserializing data");
+ Error(log_, "Error de-serializing record from file %s off %d",
+ Path().c_str(), lba.off_);
+ return false;
+ }
+
+ *key = Slice(rec.key_);
+ *val = Slice(rec.val_);
+
+ return true;
+}
+
+//
+// WriteableCacheFile
+//
+
+WriteableCacheFile::~WriteableCacheFile() {
+ WriteLock _(&rwlock_);
+ if (!eof_) {
+ // This file was never flushed. We give priority to shutdown since this is
+ // a cache.
+ // TODO(krad): Figure out a way to flush the pending data
+ if (file_) {
+ assert(refs_ == 1);
+ --refs_;
+ }
+ }
+ assert(!refs_);
+ ClearBuffers();
+}
+
+bool WriteableCacheFile::Create(const bool /*enable_direct_writes*/,
+ const bool enable_direct_reads) {
+ WriteLock _(&rwlock_);
+
+ enable_direct_reads_ = enable_direct_reads;
+
+ ROCKS_LOG_DEBUG(log_, "Creating new cache %s (max size is %d B)",
+ Path().c_str(), max_size_);
+
+ assert(env_);
+
+ Status s = env_->FileExists(Path());
+ if (s.ok()) {
+ ROCKS_LOG_WARN(log_, "File %s already exists. %s", Path().c_str(),
+ s.ToString().c_str());
+ }
+
+ s = NewWritableCacheFile(env_, Path(), &file_);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(log_, "Unable to create file %s. %s", Path().c_str(),
+ s.ToString().c_str());
+ return false;
+ }
+
+ assert(!refs_);
+ ++refs_;
+
+ return true;
+}
+
+bool WriteableCacheFile::Append(const Slice& key, const Slice& val, LBA* lba) {
+ WriteLock _(&rwlock_);
+
+ if (eof_) {
+ // We can't append since the file is full
+ return false;
+ }
+
+ // estimate the space required to store the (key, val)
+ uint32_t rec_size = CacheRecord::CalcSize(key, val);
+
+ if (!ExpandBuffer(rec_size)) {
+ // unable to expand the buffer
+ ROCKS_LOG_DEBUG(log_, "Error expanding buffers. size=%d", rec_size);
+ return false;
+ }
+
+ lba->cache_id_ = cache_id_;
+ lba->off_ = disk_woff_;
+ lba->size_ = rec_size;
+
+ CacheRecord rec(key, val);
+ if (!rec.Serialize(&bufs_, &buf_woff_)) {
+ // unexpected error: unable to serialize the data
+ assert(!"Error serializing record");
+ return false;
+ }
+
+ disk_woff_ += rec_size;
+ eof_ = disk_woff_ >= max_size_;
+
+ // dispatch buffer for flush
+ DispatchBuffer();
+
+ return true;
+}
+
+bool WriteableCacheFile::ExpandBuffer(const size_t size) {
+ rwlock_.AssertHeld();
+ assert(!eof_);
+
+ // determine if there is enough space
+ size_t free = 0; // compute the free space left in buffer
+ for (size_t i = buf_woff_; i < bufs_.size(); ++i) {
+ free += bufs_[i]->Free();
+ if (size <= free) {
+ // we have enough space in the buffer
+ return true;
+ }
+ }
+
+ // expand the buffer until there is enough space to write `size` bytes
+ assert(free < size);
+ assert(alloc_);
+
+ while (free < size) {
+ CacheWriteBuffer* const buf = alloc_->Allocate();
+ if (!buf) {
+ ROCKS_LOG_DEBUG(log_, "Unable to allocate buffers");
+ return false;
+ }
+
+ size_ += static_cast<uint32_t>(buf->Free());
+ free += buf->Free();
+ bufs_.push_back(buf);
+ }
+
+ assert(free >= size);
+ return true;
+}
+
+void WriteableCacheFile::DispatchBuffer() {
+ rwlock_.AssertHeld();
+
+ assert(bufs_.size());
+ assert(buf_doff_ <= buf_woff_);
+ assert(buf_woff_ <= bufs_.size());
+
+ if (pending_ios_) {
+ return;
+ }
+
+ if (!eof_ && buf_doff_ == buf_woff_) {
+ // dispatch buffer is pointing to write buffer and we haven't hit eof
+ return;
+ }
+
+ assert(eof_ || buf_doff_ < buf_woff_);
+ assert(buf_doff_ < bufs_.size());
+ assert(file_);
+ assert(alloc_);
+
+ auto* buf = bufs_[buf_doff_];
+ const uint64_t file_off = buf_doff_ * alloc_->BufferSize();
+
+ assert(!buf->Free() ||
+ (eof_ && buf_doff_ == buf_woff_ && buf_woff_ < bufs_.size()));
+ // we have reached the end of the file, and there is space in the last
+ // buffer; pad it with zeros for direct IO
+ buf->FillTrailingZeros();
+
+ assert(buf->Used() % kFileAlignmentSize == 0);
+
+ writer_->Write(file_.get(), buf, file_off,
+ std::bind(&WriteableCacheFile::BufferWriteDone, this));
+ pending_ios_++;
+ buf_doff_++;
+}
+
+void WriteableCacheFile::BufferWriteDone() {
+ WriteLock _(&rwlock_);
+
+ assert(bufs_.size());
+
+ pending_ios_--;
+
+ if (buf_doff_ < bufs_.size()) {
+ DispatchBuffer();
+ }
+
+ if (eof_ && buf_doff_ >= bufs_.size() && !pending_ios_) {
+ // end-of-file reached, move to read mode
+ CloseAndOpenForReading();
+ }
+}
+
+void WriteableCacheFile::CloseAndOpenForReading() {
+ // Our env abstraction does not allow reading from a file opened for
+ // appending. We need to close the file and re-open it for reading.
+ Close();
+ RandomAccessCacheFile::OpenImpl(enable_direct_reads_);
+}
+
+bool WriteableCacheFile::ReadBuffer(const LBA& lba, Slice* key, Slice* block,
+ char* scratch) {
+ rwlock_.AssertHeld();
+
+ if (!ReadBuffer(lba, scratch)) {
+ Error(log_, "Error reading from buffer. cache=%d off=%d", cache_id_,
+ lba.off_);
+ return false;
+ }
+
+ return ParseRec(lba, key, block, scratch);
+}
+
+bool WriteableCacheFile::ReadBuffer(const LBA& lba, char* data) {
+ rwlock_.AssertHeld();
+
+ assert(lba.off_ < disk_woff_);
+ assert(alloc_);
+
+ // we read from the buffers like reading from a flat file. The list of
+ // buffers is treated as a contiguous stream of data
+
+ char* tmp = data;
+ size_t pending_nbytes = lba.size_;
+ // start buffer
+ size_t start_idx = lba.off_ / alloc_->BufferSize();
+ // offset into the start buffer
+ size_t start_off = lba.off_ % alloc_->BufferSize();
+
+ assert(start_idx <= buf_woff_);
+
+ for (size_t i = start_idx; pending_nbytes && i < bufs_.size(); ++i) {
+ assert(i <= buf_woff_);
+ auto* buf = bufs_[i];
+ assert(i == buf_woff_ || !buf->Free());
+ // bytes to copy out of this buffer
+ size_t nbytes = pending_nbytes > (buf->Used() - start_off)
+ ? (buf->Used() - start_off)
+ : pending_nbytes;
+ memcpy(tmp, buf->Data() + start_off, nbytes);
+
+ // bytes left over to be copied
+ pending_nbytes -= nbytes;
+ start_off = 0;
+ tmp += nbytes;
+ }
+
+ assert(!pending_nbytes);
+ if (pending_nbytes) {
+ return false;
+ }
+
+ assert(tmp == data + lba.size_);
+ return true;
+}
+
+void WriteableCacheFile::Close() {
+ rwlock_.AssertHeld();
+
+ assert(size_ >= max_size_);
+ assert(disk_woff_ >= max_size_);
+ assert(buf_doff_ == bufs_.size());
+ assert(bufs_.size() - buf_woff_ <= 1);
+ assert(!pending_ios_);
+
+ Info(log_, "Closing file %s. size=%d written=%d", Path().c_str(), size_,
+ disk_woff_);
+
+ ClearBuffers();
+ file_.reset();
+
+ assert(refs_);
+ --refs_;
+}
+
+void WriteableCacheFile::ClearBuffers() {
+ assert(alloc_);
+
+ for (size_t i = 0; i < bufs_.size(); ++i) {
+ alloc_->Deallocate(bufs_[i]);
+ }
+
+ bufs_.clear();
+}
+
+//
+// ThreadedFileWriter implementation
+//
+ThreadedWriter::ThreadedWriter(PersistentCacheTier* const cache,
+ const size_t qdepth, const size_t io_size)
+ : Writer(cache), io_size_(io_size) {
+ for (size_t i = 0; i < qdepth; ++i) {
+ port::Thread th(&ThreadedWriter::ThreadMain, this);
+ threads_.push_back(std::move(th));
+ }
+}
+
+void ThreadedWriter::Stop() {
+ // notify all threads to exit
+ for (size_t i = 0; i < threads_.size(); ++i) {
+ q_.Push(IO(/*signal=*/true));
+ }
+
+ // wait for all threads to exit
+ for (auto& th : threads_) {
+ th.join();
+ assert(!th.joinable());
+ }
+ threads_.clear();
+}
+
+void ThreadedWriter::Write(WritableFile* const file, CacheWriteBuffer* buf,
+ const uint64_t file_off,
+ const std::function<void()> callback) {
+ q_.Push(IO(file, buf, file_off, callback));
+}
+
+void ThreadedWriter::ThreadMain() {
+ while (true) {
+ // Fetch the IO to process
+ IO io(q_.Pop());
+ if (io.signal_) {
+ // that's the signal to exit
+ break;
+ }
+
+ // Reserve space for writing the buffer
+ while (!cache_->Reserve(io.buf_->Used())) {
+ // We can fail to reserve space if every file in the system
+ // is currently being accessed
+ /* sleep override */
+ SystemClock::Default()->SleepForMicroseconds(1000000);
+ }
+
+ DispatchIO(io);
+
+ io.callback_();
+ }
+}
+
+void ThreadedWriter::DispatchIO(const IO& io) {
+ size_t written = 0;
+ while (written < io.buf_->Used()) {
+ Slice data(io.buf_->Data() + written, io_size_);
+ Status s = io.file_->Append(data);
+ assert(s.ok());
+ if (!s.ok()) {
+ // That is a definite IO error to the device. There is not much we can
+ // do but ignore the failure. This can lead to corruption of data on
+ // disk, but the cache will skip it while reading
+ fprintf(stderr, "Error writing data to file. %s\n", s.ToString().c_str());
+ }
+ written += io_size_;
+ }
+}
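Editorial note: DispatchIO() above pushes the buffer to the device in io_size_ chunks, so a fully padded 1 MiB buffer (an illustrative size) with the 4 KiB writer_dispatch_size used by NewPersistentCache() in NVM mode results in 256 sequential Append() calls; FillTrailingZeros() in DispatchBuffer() pads the final buffer to its full capacity so these fixed-size chunks stay within the allocation.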
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/utilities/persistent_cache/block_cache_tier_file.h b/src/rocksdb/utilities/persistent_cache/block_cache_tier_file.h
new file mode 100644
index 000000000..1d265ab74
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/block_cache_tier_file.h
@@ -0,0 +1,293 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <list>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "file/random_access_file_reader.h"
+#include "port/port.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "util/crc32c.h"
+#include "util/mutexlock.h"
+#include "utilities/persistent_cache/block_cache_tier_file_buffer.h"
+#include "utilities/persistent_cache/lrulist.h"
+#include "utilities/persistent_cache/persistent_cache_tier.h"
+#include "utilities/persistent_cache/persistent_cache_util.h"
+
+// The io code path of persistent cache uses pipelined architecture
+//
+// client -> In Queue <-- BlockCacheTier --> Out Queue <-- Writer <--> Kernel
+//
+// This would enable the system to scale to GB/s of throughput, which is
+// expected with modern devices like NVM.
+//
+// The file level operations are encapsulated in the following abstractions
+//
+// BlockCacheFile
+// ^
+// |
+// |
+// RandomAccessCacheFile (For reading)
+// ^
+// |
+// |
+// WriteableCacheFile (For writing)
+//
+// Write IO code path:
+//
+namespace ROCKSDB_NAMESPACE {
+
+class WriteableCacheFile;
+struct BlockInfo;
+
+// Represents a logical record on device
+//
+// (L)ogical (B)lock (A)ddress = { cache-file-id, offset, size }
+struct LogicalBlockAddress {
+ LogicalBlockAddress() {}
+ explicit LogicalBlockAddress(const uint32_t cache_id, const uint32_t off,
+ const uint16_t size)
+ : cache_id_(cache_id), off_(off), size_(size) {}
+
+ uint32_t cache_id_ = 0;
+ uint32_t off_ = 0;
+ uint32_t size_ = 0;
+};
+
+using LBA = LogicalBlockAddress;
+
+// class Writer
+//
+// Writer is the abstraction used for writing data to a file. The component
+// can be multithreaded. It is the last step of the write pipeline.
+class Writer {
+ public:
+ explicit Writer(PersistentCacheTier* const cache) : cache_(cache) {}
+ virtual ~Writer() {}
+
+ // write buffer to file at the given offset
+ virtual void Write(WritableFile* const file, CacheWriteBuffer* buf,
+ const uint64_t file_off,
+ const std::function<void()> callback) = 0;
+ // stop the writer
+ virtual void Stop() = 0;
+
+ PersistentCacheTier* const cache_;
+};
+
+// class BlockCacheFile
+//
+// Generic interface to support building files specialized for reading/writing
+class BlockCacheFile : public LRUElement<BlockCacheFile> {
+ public:
+ explicit BlockCacheFile(const uint32_t cache_id)
+ : LRUElement<BlockCacheFile>(), cache_id_(cache_id) {}
+
+ explicit BlockCacheFile(Env* const env, const std::string& dir,
+ const uint32_t cache_id)
+ : LRUElement<BlockCacheFile>(),
+ env_(env),
+ dir_(dir),
+ cache_id_(cache_id) {}
+
+ virtual ~BlockCacheFile() {}
+
+ // append key/value to file and return LBA locator to user
+ virtual bool Append(const Slice& /*key*/, const Slice& /*val*/,
+ LBA* const /*lba*/) {
+ assert(!"not implemented");
+ return false;
+ }
+
+ // read from the record locator (LBA) and return key, value and status
+ virtual bool Read(const LBA& /*lba*/, Slice* /*key*/, Slice* /*block*/,
+ char* /*scratch*/) {
+ assert(!"not implemented");
+ return false;
+ }
+
+ // get file path
+ std::string Path() const {
+ return dir_ + "/" + std::to_string(cache_id_) + ".rc";
+ }
+ // get cache ID
+ uint32_t cacheid() const { return cache_id_; }
+ // Add block information to file data
+ // Block information is the list of index reference for this file
+ virtual void Add(BlockInfo* binfo) {
+ WriteLock _(&rwlock_);
+ block_infos_.push_back(binfo);
+ }
+ // get block information
+ std::list<BlockInfo*>& block_infos() { return block_infos_; }
+ // delete file and return the size of the file
+ virtual Status Delete(uint64_t* size);
+
+ protected:
+ port::RWMutex rwlock_; // synchronization mutex
+ Env* const env_ = nullptr; // Env for OS
+ const std::string dir_; // Directory name
+ const uint32_t cache_id_; // Cache id for the file
+ std::list<BlockInfo*> block_infos_; // List of index entries mapping to the
+ // file content
+};
+
+// class RandomAccessFile
+//
+// Thread safe implementation for reading random data from file
+class RandomAccessCacheFile : public BlockCacheFile {
+ public:
+ explicit RandomAccessCacheFile(Env* const env, const std::string& dir,
+ const uint32_t cache_id,
+ const std::shared_ptr<Logger>& log)
+ : BlockCacheFile(env, dir, cache_id), log_(log) {}
+
+ virtual ~RandomAccessCacheFile() {}
+
+ // open file for reading
+ bool Open(const bool enable_direct_reads);
+ // read data from the disk
+ bool Read(const LBA& lba, Slice* key, Slice* block, char* scratch) override;
+
+ private:
+ std::unique_ptr<RandomAccessFileReader> freader_;
+
+ protected:
+ bool OpenImpl(const bool enable_direct_reads);
+ bool ParseRec(const LBA& lba, Slice* key, Slice* val, char* scratch);
+
+ std::shared_ptr<Logger> log_; // log file
+};
+
+// class WriteableCacheFile
+//
+// All writes to the file are cached in buffers. The buffers are flushed to
+// disk as they get filled up. When the file reaches a certain size, a new
+// file will be created, provided there is free space.
+class WriteableCacheFile : public RandomAccessCacheFile {
+ public:
+ explicit WriteableCacheFile(Env* const env, CacheWriteBufferAllocator* alloc,
+ Writer* writer, const std::string& dir,
+ const uint32_t cache_id, const uint32_t max_size,
+ const std::shared_ptr<Logger>& log)
+ : RandomAccessCacheFile(env, dir, cache_id, log),
+ alloc_(alloc),
+ writer_(writer),
+ max_size_(max_size) {}
+
+ virtual ~WriteableCacheFile();
+
+ // create file on disk
+ bool Create(const bool enable_direct_writes, const bool enable_direct_reads);
+
+ // read data from logical file
+ bool Read(const LBA& lba, Slice* key, Slice* block, char* scratch) override {
+ ReadLock _(&rwlock_);
+ const bool closed = eof_ && bufs_.empty();
+ if (closed) {
+ // the file is closed, read from disk
+ return RandomAccessCacheFile::Read(lba, key, block, scratch);
+ }
+ // file is still being written, read from buffers
+ return ReadBuffer(lba, key, block, scratch);
+ }
+
+ // append data to end of file
+ bool Append(const Slice&, const Slice&, LBA* const) override;
+ // End-of-file
+ bool Eof() const { return eof_; }
+
+ private:
+ friend class ThreadedWriter;
+
+ static const size_t kFileAlignmentSize = 4 * 1024; // align file size
+
+ bool ReadBuffer(const LBA& lba, Slice* key, Slice* block, char* scratch);
+ bool ReadBuffer(const LBA& lba, char* data);
+ bool ExpandBuffer(const size_t size);
+ void DispatchBuffer();
+ void BufferWriteDone();
+ void CloseAndOpenForReading();
+ void ClearBuffers();
+ void Close();
+
+ // File layout in memory
+ //
+ // +------+------+------+------+------+------+
+ // | b0 | b1 | b2 | b3 | b4 | b5 |
+ // +------+------+------+------+------+------+
+ // ^ ^
+ // | |
+ // buf_doff_ buf_woff_
+ // (next buffer to (next buffer to fill)
+ // flush to disk)
+ //
+ // The buffers are flushed to disk serially for a given file
+
+ CacheWriteBufferAllocator* const alloc_ = nullptr; // Buffer provider
+ Writer* const writer_ = nullptr; // File writer thread
+ std::unique_ptr<WritableFile> file_; // RocksDB Env file abstraction
+ std::vector<CacheWriteBuffer*> bufs_; // Written buffers
+ uint32_t size_ = 0; // Size of the file
+ const uint32_t max_size_; // Max size of the file
+ bool eof_ = false; // End of file
+ uint32_t disk_woff_ = 0; // Offset to write on disk
+ size_t buf_woff_ = 0; // off into bufs_ to write
+ size_t buf_doff_ = 0; // off into bufs_ to dispatch
+ size_t pending_ios_ = 0; // Number of ios to disk in-progress
+ bool enable_direct_reads_ = false; // Should we enable direct reads
+ // when reading from disk
+};
+
+//
+// Abstraction to do writing to device. It is part of pipelined architecture.
+//
+class ThreadedWriter : public Writer {
+ public:
+ // Representation of IO to device
+ struct IO {
+ explicit IO(const bool signal) : signal_(signal) {}
+ explicit IO(WritableFile* const file, CacheWriteBuffer* const buf,
+ const uint64_t file_off, const std::function<void()> callback)
+ : file_(file), buf_(buf), file_off_(file_off), callback_(callback) {}
+
+ IO(const IO&) = default;
+ IO& operator=(const IO&) = default;
+ size_t Size() const { return sizeof(IO); }
+
+ WritableFile* file_ = nullptr; // File to write to
+ CacheWriteBuffer* buf_ = nullptr; // buffer to write
+ uint64_t file_off_ = 0; // file offset
+ bool signal_ = false; // signal to exit thread loop
+ std::function<void()> callback_; // Callback on completion
+ };
+
+ explicit ThreadedWriter(PersistentCacheTier* const cache, const size_t qdepth,
+ const size_t io_size);
+ virtual ~ThreadedWriter() { assert(threads_.empty()); }
+
+ void Stop() override;
+ void Write(WritableFile* const file, CacheWriteBuffer* buf,
+ const uint64_t file_off,
+ const std::function<void()> callback) override;
+
+ private:
+ void ThreadMain();
+ void DispatchIO(const IO& io);
+
+ const size_t io_size_ = 0;
+ BoundedQueue<IO> q_;
+ std::vector<port::Thread> threads_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/utilities/persistent_cache/block_cache_tier_file_buffer.h b/src/rocksdb/utilities/persistent_cache/block_cache_tier_file_buffer.h
new file mode 100644
index 000000000..d4f02455a
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/block_cache_tier_file_buffer.h
@@ -0,0 +1,127 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <list>
+#include <memory>
+#include <string>
+
+#include "memory/arena.h"
+#include "rocksdb/comparator.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+//
+// CacheWriteBuffer
+//
+// Buffer abstraction that can be manipulated via append
+// (not thread safe)
+class CacheWriteBuffer {
+ public:
+ explicit CacheWriteBuffer(const size_t size) : size_(size), pos_(0) {
+ buf_.reset(new char[size_]);
+ assert(!pos_);
+ assert(size_);
+ }
+
+ virtual ~CacheWriteBuffer() {}
+
+ void Append(const char* buf, const size_t size) {
+ assert(pos_ + size <= size_);
+ memcpy(buf_.get() + pos_, buf, size);
+ pos_ += size;
+ assert(pos_ <= size_);
+ }
+
+ void FillTrailingZeros() {
+ assert(pos_ <= size_);
+ memset(buf_.get() + pos_, '0', size_ - pos_);
+ pos_ = size_;
+ }
+
+ void Reset() { pos_ = 0; }
+ size_t Free() const { return size_ - pos_; }
+ size_t Capacity() const { return size_; }
+ size_t Used() const { return pos_; }
+ char* Data() const { return buf_.get(); }
+
+ private:
+ std::unique_ptr<char[]> buf_;
+ const size_t size_;
+ size_t pos_;
+};
+
+//
+// CacheWriteBufferAllocator
+//
+// Buffer pool abstraction (not thread safe)
+//
+class CacheWriteBufferAllocator {
+ public:
+ explicit CacheWriteBufferAllocator(const size_t buffer_size,
+ const size_t buffer_count)
+ : cond_empty_(&lock_), buffer_size_(buffer_size) {
+ MutexLock _(&lock_);
+ buffer_size_ = buffer_size;
+ for (uint32_t i = 0; i < buffer_count; i++) {
+ auto* buf = new CacheWriteBuffer(buffer_size_);
+ assert(buf);
+ if (buf) {
+ bufs_.push_back(buf);
+ cond_empty_.Signal();
+ }
+ }
+ }
+
+ virtual ~CacheWriteBufferAllocator() {
+ MutexLock _(&lock_);
+ assert(bufs_.size() * buffer_size_ == Capacity());
+ for (auto* buf : bufs_) {
+ delete buf;
+ }
+ bufs_.clear();
+ }
+
+ CacheWriteBuffer* Allocate() {
+ MutexLock _(&lock_);
+ if (bufs_.empty()) {
+ return nullptr;
+ }
+
+ assert(!bufs_.empty());
+ CacheWriteBuffer* const buf = bufs_.front();
+ bufs_.pop_front();
+ return buf;
+ }
+
+ void Deallocate(CacheWriteBuffer* const buf) {
+ assert(buf);
+ MutexLock _(&lock_);
+ buf->Reset();
+ bufs_.push_back(buf);
+ cond_empty_.Signal();
+ }
+
+ void WaitUntilUsable() {
+ // We are asked to wait till we have buffers available
+ MutexLock _(&lock_);
+ while (bufs_.empty()) {
+ cond_empty_.Wait();
+ }
+ }
+
+ size_t Capacity() const { return bufs_.size() * buffer_size_; }
+ size_t Free() const { return bufs_.size() * buffer_size_; }
+ size_t BufferSize() const { return buffer_size_; }
+
+ private:
+ port::Mutex lock_; // Sync lock
+ port::CondVar cond_empty_; // Condition var for empty buffers
+ size_t buffer_size_; // Size of each buffer
+ std::list<CacheWriteBuffer*> bufs_; // Buffer stash
+};
+
+} // namespace ROCKSDB_NAMESPACE
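A small illustrative sketch of the buffer pool above (not part of this patch); it assumes the RocksDB source tree is on the include path and uses only the methods declared in this header.

#include "utilities/persistent_cache/block_cache_tier_file_buffer.h"

void BufferPoolExample() {
  using namespace ROCKSDB_NAMESPACE;
  // Pool of four 4 KiB write buffers.
  CacheWriteBufferAllocator alloc(4 * 1024, 4);

  CacheWriteBuffer* buf = alloc.Allocate();  // nullptr when the pool is empty
  if (buf != nullptr) {
    const char payload[] = "hello";
    buf->Append(payload, sizeof(payload));  // copy into the buffer
    buf->FillTrailingZeros();               // pad to capacity for direct IO
    // At this point buf->Used() == buf->Capacity().
    alloc.Deallocate(buf);                  // Reset() and return to the pool
  }
}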
diff --git a/src/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.cc b/src/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.cc
new file mode 100644
index 000000000..d73b5d0b4
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.cc
@@ -0,0 +1,86 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#ifndef ROCKSDB_LITE
+
+#include "utilities/persistent_cache/block_cache_tier_metadata.h"
+
+#include <functional>
+
+namespace ROCKSDB_NAMESPACE {
+
+bool BlockCacheTierMetadata::Insert(BlockCacheFile* file) {
+ return cache_file_index_.Insert(file);
+}
+
+BlockCacheFile* BlockCacheTierMetadata::Lookup(const uint32_t cache_id) {
+ BlockCacheFile* ret = nullptr;
+ BlockCacheFile lookup_key(cache_id);
+ bool ok = cache_file_index_.Find(&lookup_key, &ret);
+ if (ok) {
+ assert(ret->refs_);
+ return ret;
+ }
+ return nullptr;
+}
+
+BlockCacheFile* BlockCacheTierMetadata::Evict() {
+ using std::placeholders::_1;
+ auto fn = std::bind(&BlockCacheTierMetadata::RemoveAllKeys, this, _1);
+ return cache_file_index_.Evict(fn);
+}
+
+void BlockCacheTierMetadata::Clear() {
+ cache_file_index_.Clear([](BlockCacheFile* arg) { delete arg; });
+ block_index_.Clear([](BlockInfo* arg) { delete arg; });
+}
+
+BlockInfo* BlockCacheTierMetadata::Insert(const Slice& key, const LBA& lba) {
+ std::unique_ptr<BlockInfo> binfo(new BlockInfo(key, lba));
+ if (!block_index_.Insert(binfo.get())) {
+ return nullptr;
+ }
+ return binfo.release();
+}
+
+bool BlockCacheTierMetadata::Lookup(const Slice& key, LBA* lba) {
+ BlockInfo lookup_key(key);
+ BlockInfo* block;
+ port::RWMutex* rlock = nullptr;
+ if (!block_index_.Find(&lookup_key, &block, &rlock)) {
+ return false;
+ }
+
+ ReadUnlock _(rlock);
+ assert(block->key_ == key.ToString());
+ if (lba) {
+ *lba = block->lba_;
+ }
+ return true;
+}
+
+BlockInfo* BlockCacheTierMetadata::Remove(const Slice& key) {
+ BlockInfo lookup_key(key);
+ BlockInfo* binfo = nullptr;
+ bool ok __attribute__((__unused__));
+ ok = block_index_.Erase(&lookup_key, &binfo);
+ assert(ok);
+ return binfo;
+}
+
+void BlockCacheTierMetadata::RemoveAllKeys(BlockCacheFile* f) {
+ for (BlockInfo* binfo : f->block_infos()) {
+ BlockInfo* tmp = nullptr;
+ bool status = block_index_.Erase(binfo, &tmp);
+ (void)status;
+ assert(status);
+ assert(tmp == binfo);
+ delete binfo;
+ }
+ f->block_infos().clear();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.h b/src/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.h
new file mode 100644
index 000000000..2fcd50105
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.h
@@ -0,0 +1,124 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <functional>
+#include <string>
+#include <unordered_map>
+
+#include "rocksdb/slice.h"
+#include "utilities/persistent_cache/block_cache_tier_file.h"
+#include "utilities/persistent_cache/hash_table.h"
+#include "utilities/persistent_cache/hash_table_evictable.h"
+#include "utilities/persistent_cache/lrulist.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+//
+// Block Cache Tier Metadata
+//
+// The BlockCacheTierMetadata holds all the metadata associated with the block
+// cache. It fundamentally contains 2 indexes and an LRU.
+//
+// Block Cache Index
+//
+// This is a forward index that maps a given key to an LBA (Logical Block
+// Address). The LBA is a disk pointer that points to a record in the cache.
+//
+// LBA = { cache-id, offset, size }
+//
+// Cache File Index
+//
+// This is a forward index that maps a given cache-id to a cache file object.
+// Typically you would look up a file using the LBA's cache-id and use the
+// object to read or write
+struct BlockInfo {
+ explicit BlockInfo(const Slice& key, const LBA& lba = LBA())
+ : key_(key.ToString()), lba_(lba) {}
+
+ std::string key_;
+ LBA lba_;
+};
+
+class BlockCacheTierMetadata {
+ public:
+ explicit BlockCacheTierMetadata(const uint32_t blocks_capacity = 1024 * 1024,
+ const uint32_t cachefile_capacity = 10 * 1024)
+ : cache_file_index_(cachefile_capacity), block_index_(blocks_capacity) {}
+
+ virtual ~BlockCacheTierMetadata() {}
+
+ // Insert a given cache file
+ bool Insert(BlockCacheFile* file);
+
+ // Lookup cache file based on cache_id
+ BlockCacheFile* Lookup(const uint32_t cache_id);
+
+ // Insert block information to block index
+ BlockInfo* Insert(const Slice& key, const LBA& lba);
+ // bool Insert(BlockInfo* binfo);
+
+ // Lookup block information from block index
+ bool Lookup(const Slice& key, LBA* lba);
+
+ // Remove a given key from the block index
+ BlockInfo* Remove(const Slice& key);
+
+ // Find and evict a cache file using LRU policy
+ BlockCacheFile* Evict();
+
+ // Clear the metadata contents
+ virtual void Clear();
+
+ protected:
+ // Remove all block information from a given file
+ virtual void RemoveAllKeys(BlockCacheFile* file);
+
+ private:
+ // Cache file index definition
+ //
+ // cache-id => BlockCacheFile
+ struct BlockCacheFileHash {
+ uint64_t operator()(const BlockCacheFile* rec) {
+ return std::hash<uint32_t>()(rec->cacheid());
+ }
+ };
+
+ struct BlockCacheFileEqual {
+ uint64_t operator()(const BlockCacheFile* lhs, const BlockCacheFile* rhs) {
+ return lhs->cacheid() == rhs->cacheid();
+ }
+ };
+
+ using CacheFileIndexType =
+ EvictableHashTable<BlockCacheFile, BlockCacheFileHash,
+ BlockCacheFileEqual>;
+
+ // Block Lookup Index
+ //
+ // key => LBA
+ struct Hash {
+ size_t operator()(BlockInfo* node) const {
+ return std::hash<std::string>()(node->key_);
+ }
+ };
+
+ struct Equal {
+ size_t operator()(BlockInfo* lhs, BlockInfo* rhs) const {
+ return lhs->key_ == rhs->key_;
+ }
+ };
+
+ using BlockIndexType = HashTable<BlockInfo*, Hash, Equal>;
+
+ CacheFileIndexType cache_file_index_;
+ BlockIndexType block_index_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif
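An illustrative sketch of how the two indexes above resolve a key to on-disk data, roughly mirroring what BlockCacheTier::Lookup() does (not part of this patch; reference counting and the interplay with eviction are omitted).

#include "utilities/persistent_cache/block_cache_tier_metadata.h"

bool ResolveKey(ROCKSDB_NAMESPACE::BlockCacheTierMetadata* metadata,
                const ROCKSDB_NAMESPACE::Slice& key,
                ROCKSDB_NAMESPACE::Slice* rec_key,
                ROCKSDB_NAMESPACE::Slice* rec_val, char* scratch) {
  using namespace ROCKSDB_NAMESPACE;
  LBA lba;
  if (!metadata->Lookup(key, &lba)) {
    return false;  // block index miss: key -> LBA
  }
  BlockCacheFile* file = metadata->Lookup(lba.cache_id_);
  if (file == nullptr) {
    return false;  // cache file index miss: cache-id -> file
  }
  // Read the record at the LBA; scratch must be at least lba.size_ bytes.
  return file->Read(lba, rec_key, rec_val, scratch);
}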
diff --git a/src/rocksdb/utilities/persistent_cache/hash_table.h b/src/rocksdb/utilities/persistent_cache/hash_table.h
new file mode 100644
index 000000000..b00b294ce
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/hash_table.h
@@ -0,0 +1,239 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <assert.h>
+
+#include <list>
+#include <vector>
+
+#ifdef OS_LINUX
+#include <sys/mman.h>
+#endif
+
+#include "rocksdb/env.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// HashTable<T, Hash, Equal>
+//
+// Traditional implementations of hash tables with synchronization built on
+// top don't perform very well in multi-core scenarios. This is an
+// implementation designed for multi-core scenarios with high lock contention.
+//
+// |<-------- alpha ------------->|
+// Buckets Collision list
+// ---- +----+ +---+---+--- ...... ---+---+---+
+// / | |--->| | | | | |
+// / +----+ +---+---+--- ...... ---+---+---+
+// / | |
+// Locks/ +----+
+// +--+/ . .
+// | | . .
+// +--+ . .
+// | | . .
+// +--+ . .
+// | | . .
+// +--+ . .
+// \ +----+
+// \ | |
+// \ +----+
+// \ | |
+// \---- +----+
+//
+// The lock contention is spread over an array of locks. This helps improve
+// concurrent access. The spine is designed for a certain capacity and load
+// factor. When the capacity planning is done correctly we can expect
+// O(load_factor = 1) insert, access and remove time.
+//
+// A micro benchmark on a debug build gives about a 0.5 million/sec rate of
+// insert, erase and lookup in parallel (total of about 1.5 million ops/sec).
+// If the blocks were 4K, the hash table could support a virtual throughput of
+// 6 GB/s.
+//
+// T Object type (contains both key and value)
+// Hash Function that returns a hash from type T
+// Equal Returns if two objects are equal
+// (We need explicit equal for pointer type)
+//
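Editorial note on the sizing above: with the defaults in the constructor below (capacity = 1024 * 1024, load_factor = 2.0, nlocks = 256), the table ends up with 524,288 buckets guarded by 256 reader-writer locks, i.e. about 2,048 buckets per lock stripe, and a correctly planned capacity keeps each bucket's collision list around load_factor entries long.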
+template <class T, class Hash, class Equal>
+class HashTable {
+ public:
+ explicit HashTable(const size_t capacity = 1024 * 1024,
+ const float load_factor = 2.0, const uint32_t nlocks = 256)
+ : nbuckets_(
+ static_cast<uint32_t>(load_factor ? capacity / load_factor : 0)),
+ nlocks_(nlocks) {
+ // pre-conditions
+ assert(capacity);
+ assert(load_factor);
+ assert(nbuckets_);
+ assert(nlocks_);
+
+ buckets_.reset(new Bucket[nbuckets_]);
+#ifdef OS_LINUX
+ mlock(buckets_.get(), nbuckets_ * sizeof(Bucket));
+#endif
+
+ // initialize locks
+ locks_.reset(new port::RWMutex[nlocks_]);
+#ifdef OS_LINUX
+ mlock(locks_.get(), nlocks_ * sizeof(port::RWMutex));
+#endif
+
+ // post-conditions
+ assert(buckets_);
+ assert(locks_);
+ }
+
+ virtual ~HashTable() { AssertEmptyBuckets(); }
+
+ //
+ // Insert given record to hash table
+ //
+ bool Insert(const T& t) {
+ const uint64_t h = Hash()(t);
+ const uint32_t bucket_idx = h % nbuckets_;
+ const uint32_t lock_idx = bucket_idx % nlocks_;
+
+ WriteLock _(&locks_[lock_idx]);
+ auto& bucket = buckets_[bucket_idx];
+ return Insert(&bucket, t);
+ }
+
+ // Lookup hash table
+ //
+ // Please note that the read lock should be held by the caller. This is
+ // because the caller owns the data, and should hold the read lock as long
+ // as they operate on the data.
+ bool Find(const T& t, T* ret, port::RWMutex** ret_lock) {
+ const uint64_t h = Hash()(t);
+ const uint32_t bucket_idx = h % nbuckets_;
+ const uint32_t lock_idx = bucket_idx % nlocks_;
+
+ port::RWMutex& lock = locks_[lock_idx];
+ lock.ReadLock();
+
+ auto& bucket = buckets_[bucket_idx];
+ if (Find(&bucket, t, ret)) {
+ *ret_lock = &lock;
+ return true;
+ }
+
+ lock.ReadUnlock();
+ return false;
+ }
+
+ //
+ // Erase a given key from the hash table
+ //
+ bool Erase(const T& t, T* ret) {
+ const uint64_t h = Hash()(t);
+ const uint32_t bucket_idx = h % nbuckets_;
+ const uint32_t lock_idx = bucket_idx % nlocks_;
+
+ WriteLock _(&locks_[lock_idx]);
+
+ auto& bucket = buckets_[bucket_idx];
+ return Erase(&bucket, t, ret);
+ }
+
+ // Fetch the mutex associated with a key
+ // This call is used to hold the lock for a given piece of data for an
+ // extended period of time.
+ port::RWMutex* GetMutex(const T& t) {
+ const uint64_t h = Hash()(t);
+ const uint32_t bucket_idx = h % nbuckets_;
+ const uint32_t lock_idx = bucket_idx % nlocks_;
+
+ return &locks_[lock_idx];
+ }
+
+ void Clear(void (*fn)(T)) {
+ for (uint32_t i = 0; i < nbuckets_; ++i) {
+ const uint32_t lock_idx = i % nlocks_;
+ WriteLock _(&locks_[lock_idx]);
+ for (auto& t : buckets_[i].list_) {
+ (*fn)(t);
+ }
+ buckets_[i].list_.clear();
+ }
+ }
+
+ protected:
+ // Models bucket of keys that hash to the same bucket number
+ struct Bucket {
+ std::list<T> list_;
+ };
+
+ // Substitute for std::find with custom comparator operator
+ typename std::list<T>::iterator Find(std::list<T>* list, const T& t) {
+ for (auto it = list->begin(); it != list->end(); ++it) {
+ if (Equal()(*it, t)) {
+ return it;
+ }
+ }
+ return list->end();
+ }
+
+ bool Insert(Bucket* bucket, const T& t) {
+ // Check if the key already exists
+ auto it = Find(&bucket->list_, t);
+ if (it != bucket->list_.end()) {
+ return false;
+ }
+
+ // insert to bucket
+ bucket->list_.push_back(t);
+ return true;
+ }
+
+ bool Find(Bucket* bucket, const T& t, T* ret) {
+ auto it = Find(&bucket->list_, t);
+ if (it != bucket->list_.end()) {
+ if (ret) {
+ *ret = *it;
+ }
+ return true;
+ }
+ return false;
+ }
+
+ bool Erase(Bucket* bucket, const T& t, T* ret) {
+ auto it = Find(&bucket->list_, t);
+ if (it != bucket->list_.end()) {
+ if (ret) {
+ *ret = *it;
+ }
+
+ bucket->list_.erase(it);
+ return true;
+ }
+ return false;
+ }
+
+ // assert that all buckets are empty
+ void AssertEmptyBuckets() {
+#ifndef NDEBUG
+ for (size_t i = 0; i < nbuckets_; ++i) {
+ WriteLock _(&locks_[i % nlocks_]);
+ assert(buckets_[i].list_.empty());
+ }
+#endif
+ }
+
+ const uint32_t nbuckets_; // No. of buckets in the spine
+ std::unique_ptr<Bucket[]> buckets_; // Spine of the hash buckets
+ const uint32_t nlocks_; // No. of locks
+ std::unique_ptr<port::RWMutex[]> locks_; // Granular locks
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/utilities/persistent_cache/hash_table_bench.cc b/src/rocksdb/utilities/persistent_cache/hash_table_bench.cc
new file mode 100644
index 000000000..74d7e2edf
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/hash_table_bench.cc
@@ -0,0 +1,310 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#if !defined(OS_WIN) && !defined(ROCKSDB_LITE)
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() { fprintf(stderr, "Please install gflags to run tools\n"); }
+#else
+
+#include <sys/time.h>
+#include <unistd.h>
+
+#include <atomic>
+#include <functional>
+#include <string>
+#include <unordered_map>
+
+#include "port/port_posix.h"
+#include "port/sys_time.h"
+#include "rocksdb/env.h"
+#include "util/gflags_compat.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "utilities/persistent_cache/hash_table.h"
+
+using std::string;
+
+DEFINE_int32(nsec, 10, "nsec");
+DEFINE_int32(nthread_write, 1, "insert %");
+DEFINE_int32(nthread_read, 1, "lookup %");
+DEFINE_int32(nthread_erase, 1, "erase %");
+
+namespace ROCKSDB_NAMESPACE {
+
+//
+// HashTableImpl interface
+//
+// Abstraction of a hash table implementation
+template <class Key, class Value>
+class HashTableImpl {
+ public:
+ virtual ~HashTableImpl() {}
+
+ virtual bool Insert(const Key& key, const Value& val) = 0;
+ virtual bool Erase(const Key& key) = 0;
+ virtual bool Lookup(const Key& key, Value* val) = 0;
+};
+
+// HashTableBenchmark
+//
+// Abstraction to test a given hash table implementation. The test mostly
+// focuses on insert, lookup and erase. The test can operate in test mode and
+// benchmark mode.
+class HashTableBenchmark {
+ public:
+ explicit HashTableBenchmark(HashTableImpl<size_t, std::string>* impl,
+ const size_t sec = 10,
+ const size_t nthread_write = 1,
+ const size_t nthread_read = 1,
+ const size_t nthread_erase = 1)
+ : impl_(impl),
+ sec_(sec),
+ ninserts_(0),
+ nreads_(0),
+ nerases_(0),
+ nerases_failed_(0),
+ quit_(false) {
+ Prepop();
+
+ StartThreads(nthread_write, WriteMain);
+ StartThreads(nthread_read, ReadMain);
+ StartThreads(nthread_erase, EraseMain);
+
+ uint64_t start = NowInMillSec();
+ while (!quit_) {
+ quit_ = NowInMillSec() - start > sec_ * 1000;
+ /* sleep override */ sleep(1);
+ }
+
+ Env* env = Env::Default();
+ env->WaitForJoin();
+
+ if (sec_) {
+ printf("Result \n");
+ printf("====== \n");
+ printf("insert/sec = %f \n", ninserts_ / static_cast<double>(sec_));
+ printf("read/sec = %f \n", nreads_ / static_cast<double>(sec_));
+ printf("erases/sec = %f \n", nerases_ / static_cast<double>(sec_));
+ const uint64_t ops = ninserts_ + nreads_ + nerases_;
+ printf("ops/sec = %f \n", ops / static_cast<double>(sec_));
+ printf("erase fail = %d (%f%%)\n", static_cast<int>(nerases_failed_),
+ static_cast<float>(nerases_failed_ / nerases_ * 100));
+ printf("====== \n");
+ }
+ }
+
+ void RunWrite() {
+ while (!quit_) {
+ size_t k = insert_key_++;
+ std::string tmp(1000, k % 255);
+ bool status = impl_->Insert(k, tmp);
+ assert(status);
+ ninserts_++;
+ }
+ }
+
+ void RunRead() {
+ Random64 rgen(time(nullptr));
+ while (!quit_) {
+ std::string s;
+ size_t k = rgen.Next() % max_prepop_key;
+ bool status = impl_->Lookup(k, &s);
+ assert(status);
+ assert(s == std::string(1000, k % 255));
+ nreads_++;
+ }
+ }
+
+ void RunErase() {
+ while (!quit_) {
+ size_t k = erase_key_++;
+ bool status = impl_->Erase(k);
+ nerases_failed_ += !status;
+ nerases_++;
+ }
+ }
+
+ private:
+ // Start threads for a given function
+ void StartThreads(const size_t n, void (*fn)(void*)) {
+ Env* env = Env::Default();
+ for (size_t i = 0; i < n; ++i) {
+ env->StartThread(fn, this);
+ }
+ }
+
+ // Prepopulate the hash table with 1M keys
+ void Prepop() {
+ for (size_t i = 0; i < max_prepop_key; ++i) {
+ bool status = impl_->Insert(i, std::string(1000, i % 255));
+ assert(status);
+ }
+
+ erase_key_ = insert_key_ = max_prepop_key;
+
+ for (size_t i = 0; i < 10 * max_prepop_key; ++i) {
+ bool status = impl_->Insert(insert_key_++, std::string(1000, 'x'));
+ assert(status);
+ }
+ }
+
+ static uint64_t NowInMillSec() {
+ port::TimeVal tv;
+ port::GetTimeOfDay(&tv, /*tz=*/nullptr);
+ return tv.tv_sec * 1000 + tv.tv_usec / 1000;
+ }
+
+ //
+ // Wrapper functions for thread entry
+ //
+ static void WriteMain(void* args) {
+ reinterpret_cast<HashTableBenchmark*>(args)->RunWrite();
+ }
+
+ static void ReadMain(void* args) {
+ reinterpret_cast<HashTableBenchmark*>(args)->RunRead();
+ }
+
+ static void EraseMain(void* args) {
+ reinterpret_cast<HashTableBenchmark*>(args)->RunErase();
+ }
+
+ HashTableImpl<size_t, std::string>* impl_; // Implementation to test
+ const size_t sec_; // Test time
+ const size_t max_prepop_key = 1ULL * 1024 * 1024; // Max prepop key
+ std::atomic<size_t> insert_key_; // Last inserted key
+ std::atomic<size_t> erase_key_; // Erase key
+ std::atomic<size_t> ninserts_; // Number of inserts
+ std::atomic<size_t> nreads_; // Number of reads
+ std::atomic<size_t> nerases_; // Number of erases
+ std::atomic<size_t> nerases_failed_; // Number of erases failed
+ bool quit_; // Should the threads quit ?
+};
+
+//
+// SimpleImpl
+// Thread-safe (lock-protected) unordered_map implementation
+class SimpleImpl : public HashTableImpl<size_t, string> {
+ public:
+ bool Insert(const size_t& key, const string& val) override {
+ WriteLock _(&rwlock_);
+ map_.insert(make_pair(key, val));
+ return true;
+ }
+
+ bool Erase(const size_t& key) override {
+ WriteLock _(&rwlock_);
+ auto it = map_.find(key);
+ if (it == map_.end()) {
+ return false;
+ }
+ map_.erase(it);
+ return true;
+ }
+
+ bool Lookup(const size_t& key, string* val) override {
+ ReadLock _(&rwlock_);
+ auto it = map_.find(key);
+ if (it != map_.end()) {
+ *val = it->second;
+ }
+ return it != map_.end();
+ }
+
+ private:
+ port::RWMutex rwlock_;
+ std::unordered_map<size_t, string> map_;
+};
+
+//
+// GranularLockImpl
+// Thread safe custom RocksDB implementation of hash table with granular
+// locking
+class GranularLockImpl : public HashTableImpl<size_t, string> {
+ public:
+ bool Insert(const size_t& key, const string& val) override {
+ Node n(key, val);
+ return impl_.Insert(n);
+ }
+
+ bool Erase(const size_t& key) override {
+ Node n(key, string());
+ return impl_.Erase(n, nullptr);
+ }
+
+ bool Lookup(const size_t& key, string* val) override {
+ Node n(key, string());
+ port::RWMutex* rlock;
+ bool status = impl_.Find(n, &n, &rlock);
+ if (status) {
+ ReadUnlock _(rlock);
+ *val = n.val_;
+ }
+ return status;
+ }
+
+ private:
+ struct Node {
+ explicit Node(const size_t key, const string& val) : key_(key), val_(val) {}
+
+ size_t key_ = 0;
+ string val_;
+ };
+
+ struct Hash {
+ uint64_t operator()(const Node& node) {
+ return std::hash<uint64_t>()(node.key_);
+ }
+ };
+
+ struct Equal {
+ bool operator()(const Node& lhs, const Node& rhs) {
+ return lhs.key_ == rhs.key_;
+ }
+ };
+
+ HashTable<Node, Hash, Equal> impl_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+//
+// main
+//
+int main(int argc, char** argv) {
+ GFLAGS_NAMESPACE::SetUsageMessage(std::string("\nUSAGE:\n") +
+ std::string(argv[0]) + " [OPTIONS]...");
+ GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, false);
+
+ //
+ // Micro benchmark unordered_map
+ //
+ printf("Micro benchmarking std::unordered_map \n");
+ {
+ ROCKSDB_NAMESPACE::SimpleImpl impl;
+ ROCKSDB_NAMESPACE::HashTableBenchmark _(
+ &impl, FLAGS_nsec, FLAGS_nthread_write, FLAGS_nthread_read,
+ FLAGS_nthread_erase);
+ }
+ //
+ // Micro benchmark scalable hash table
+ //
+ printf("Micro benchmarking scalable hash map \n");
+ {
+ ROCKSDB_NAMESPACE::GranularLockImpl impl;
+ ROCKSDB_NAMESPACE::HashTableBenchmark _(
+ &impl, FLAGS_nsec, FLAGS_nthread_write, FLAGS_nthread_read,
+ FLAGS_nthread_erase);
+ }
+
+ return 0;
+}
+#endif // #ifndef GFLAGS
+#else
+int main(int /*argc*/, char** /*argv*/) { return 0; }
+#endif
diff --git a/src/rocksdb/utilities/persistent_cache/hash_table_evictable.h b/src/rocksdb/utilities/persistent_cache/hash_table_evictable.h
new file mode 100644
index 000000000..e10939b2f
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/hash_table_evictable.h
@@ -0,0 +1,168 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <functional>
+
+#include "util/random.h"
+#include "utilities/persistent_cache/hash_table.h"
+#include "utilities/persistent_cache/lrulist.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Evictable Hash Table
+//
+// Hash table index where the least accessed (or one of the least accessed)
+// elements can be evicted.
+//
+// Please note EvictableHashTable can only be created for pointer type objects
+template <class T, class Hash, class Equal>
+class EvictableHashTable : private HashTable<T*, Hash, Equal> {
+ public:
+ using hash_table = HashTable<T*, Hash, Equal>;
+
+ explicit EvictableHashTable(const size_t capacity = 1024 * 1024,
+ const float load_factor = 2.0,
+ const uint32_t nlocks = 256)
+ : HashTable<T*, Hash, Equal>(capacity, load_factor, nlocks),
+ lru_lists_(new LRUList<T>[hash_table::nlocks_]) {
+ assert(lru_lists_);
+ }
+
+ virtual ~EvictableHashTable() { AssertEmptyLRU(); }
+
+ //
+ // Insert given record to hash table (and LRU list)
+ //
+ bool Insert(T* t) {
+ const uint64_t h = Hash()(t);
+ typename hash_table::Bucket& bucket = GetBucket(h);
+ LRUListType& lru = GetLRUList(h);
+ port::RWMutex& lock = GetMutex(h);
+
+ WriteLock _(&lock);
+ if (hash_table::Insert(&bucket, t)) {
+ lru.Push(t);
+ return true;
+ }
+ return false;
+ }
+
+ //
+ // Lookup hash table
+ //
+ // Please note that the read lock should be held by the caller. This is
+ // because the caller owns the data, and should hold the read lock as long
+ // as they operate on the data.
+ bool Find(T* t, T** ret) {
+ const uint64_t h = Hash()(t);
+ typename hash_table::Bucket& bucket = GetBucket(h);
+ LRUListType& lru = GetLRUList(h);
+ port::RWMutex& lock = GetMutex(h);
+
+ ReadLock _(&lock);
+ if (hash_table::Find(&bucket, t, ret)) {
+ ++(*ret)->refs_;
+ lru.Touch(*ret);
+ return true;
+ }
+ return false;
+ }
+
+ //
+ // Evict one of the least recently used objects
+ //
+ T* Evict(const std::function<void(T*)>& fn = nullptr) {
+ uint32_t random = Random::GetTLSInstance()->Next();
+ const size_t start_idx = random % hash_table::nlocks_;
+ T* t = nullptr;
+
+ // iterate from start_idx .. 0 .. start_idx
+ for (size_t i = 0; !t && i < hash_table::nlocks_; ++i) {
+ const size_t idx = (start_idx + i) % hash_table::nlocks_;
+
+ WriteLock _(&hash_table::locks_[idx]);
+ LRUListType& lru = lru_lists_[idx];
+ if (!lru.IsEmpty() && (t = lru.Pop()) != nullptr) {
+ assert(!t->refs_);
+ // We got an item to evict, erase from the bucket
+ const uint64_t h = Hash()(t);
+ typename hash_table::Bucket& bucket = GetBucket(h);
+ T* tmp = nullptr;
+ bool status = hash_table::Erase(&bucket, t, &tmp);
+ assert(t == tmp);
+ (void)status;
+ assert(status);
+ if (fn) {
+ fn(t);
+ }
+ break;
+ }
+ assert(!t);
+ }
+ return t;
+ }
+
+ void Clear(void (*fn)(T*)) {
+ for (uint32_t i = 0; i < hash_table::nbuckets_; ++i) {
+ const uint32_t lock_idx = i % hash_table::nlocks_;
+ WriteLock _(&hash_table::locks_[lock_idx]);
+ auto& lru_list = lru_lists_[lock_idx];
+ auto& bucket = hash_table::buckets_[i];
+ for (auto* t : bucket.list_) {
+ lru_list.Unlink(t);
+ (*fn)(t);
+ }
+ bucket.list_.clear();
+ }
+ // make sure that all LRU lists are emptied
+ AssertEmptyLRU();
+ }
+
+ void AssertEmptyLRU() {
+#ifndef NDEBUG
+ for (uint32_t i = 0; i < hash_table::nlocks_; ++i) {
+ WriteLock _(&hash_table::locks_[i]);
+ auto& lru_list = lru_lists_[i];
+ assert(lru_list.IsEmpty());
+ }
+#endif
+ }
+
+ //
+ // Fetch the mutex associated with a key
+ // This call is used to hold the lock for a given piece of data for an
+ // extended period of time.
+ port::RWMutex* GetMutex(T* t) { return hash_table::GetMutex(t); }
+
+ private:
+ using LRUListType = LRUList<T>;
+
+ typename hash_table::Bucket& GetBucket(const uint64_t h) {
+ const uint32_t bucket_idx = h % hash_table::nbuckets_;
+ return hash_table::buckets_[bucket_idx];
+ }
+
+ LRUListType& GetLRUList(const uint64_t h) {
+ const uint32_t bucket_idx = h % hash_table::nbuckets_;
+ const uint32_t lock_idx = bucket_idx % hash_table::nlocks_;
+ return lru_lists_[lock_idx];
+ }
+
+ port::RWMutex& GetMutex(const uint64_t h) {
+ const uint32_t bucket_idx = h % hash_table::nbuckets_;
+ const uint32_t lock_idx = bucket_idx % hash_table::nlocks_;
+ return hash_table::locks_[lock_idx];
+ }
+
+ std::unique_ptr<LRUListType[]> lru_lists_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif
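
A minimal usage sketch of the evictable table (mirroring the Node pattern used in hash_table_test.cc below); the element type and functors here are illustrative only, the code assumes the header's own includes plus <atomic> and <functional>, and it runs inside ROCKSDB_NAMESPACE. The table stores raw pointers whose lifetime the caller manages:

struct CacheNode : LRUElement<CacheNode> {
  explicit CacheNode(uint64_t key) : key_(key) {}
  uint64_t key_ = 0;
  std::atomic<uint32_t> refs_{0};  // bumped by Find(), checked by Evict()
};

struct CacheNodeHash {
  uint64_t operator()(const CacheNode* n) {
    return std::hash<uint64_t>()(n->key_);
  }
};

struct CacheNodeEqual {
  bool operator()(const CacheNode* lhs, const CacheNode* rhs) {
    return lhs->key_ == rhs->key_;
  }
};

void EvictableHashTableSketch() {
  EvictableHashTable<CacheNode, CacheNodeHash, CacheNodeEqual> table;
  table.Insert(new CacheNode(42));    // linked into a bucket and an LRU stripe
  CacheNode* victim = table.Evict();  // pops an unreferenced element, if any
  delete victim;
}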
diff --git a/src/rocksdb/utilities/persistent_cache/hash_table_test.cc b/src/rocksdb/utilities/persistent_cache/hash_table_test.cc
new file mode 100644
index 000000000..2f6387f5f
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/hash_table_test.cc
@@ -0,0 +1,163 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "utilities/persistent_cache/hash_table.h"
+
+#include <stdlib.h>
+
+#include <iostream>
+#include <set>
+#include <string>
+
+#include "db/db_test_util.h"
+#include "memory/arena.h"
+#include "test_util/testharness.h"
+#include "util/random.h"
+#include "utilities/persistent_cache/hash_table_evictable.h"
+
+#ifndef ROCKSDB_LITE
+
+namespace ROCKSDB_NAMESPACE {
+
+struct HashTableTest : public testing::Test {
+ ~HashTableTest() override { map_.Clear(&HashTableTest::ClearNode); }
+
+ struct Node {
+ Node() {}
+ explicit Node(const uint64_t key, const std::string& val = std::string())
+ : key_(key), val_(val) {}
+
+ uint64_t key_ = 0;
+ std::string val_;
+ };
+
+ struct Equal {
+ bool operator()(const Node& lhs, const Node& rhs) {
+ return lhs.key_ == rhs.key_;
+ }
+ };
+
+ struct Hash {
+ uint64_t operator()(const Node& node) {
+ return std::hash<uint64_t>()(node.key_);
+ }
+ };
+
+ static void ClearNode(Node /*node*/) {}
+
+ HashTable<Node, Hash, Equal> map_;
+};
+
+struct EvictableHashTableTest : public testing::Test {
+ ~EvictableHashTableTest() override {
+ map_.Clear(&EvictableHashTableTest::ClearNode);
+ }
+
+ struct Node : LRUElement<Node> {
+ Node() {}
+ explicit Node(const uint64_t key, const std::string& val = std::string())
+ : key_(key), val_(val) {}
+
+ uint64_t key_ = 0;
+ std::string val_;
+ std::atomic<uint32_t> refs_{0};
+ };
+
+ struct Equal {
+ bool operator()(const Node* lhs, const Node* rhs) {
+ return lhs->key_ == rhs->key_;
+ }
+ };
+
+ struct Hash {
+ uint64_t operator()(const Node* node) {
+ return std::hash<uint64_t>()(node->key_);
+ }
+ };
+
+ static void ClearNode(Node* /*node*/) {}
+
+ EvictableHashTable<Node, Hash, Equal> map_;
+};
+
+TEST_F(HashTableTest, TestInsert) {
+ const uint64_t max_keys = 1024 * 1024;
+
+ // insert
+ for (uint64_t k = 0; k < max_keys; ++k) {
+ map_.Insert(Node(k, std::string(1000, k % 255)));
+ }
+
+ // verify
+ for (uint64_t k = 0; k < max_keys; ++k) {
+ Node val;
+ port::RWMutex* rlock = nullptr;
+ assert(map_.Find(Node(k), &val, &rlock));
+ rlock->ReadUnlock();
+ assert(val.val_ == std::string(1000, k % 255));
+ }
+}
+
+TEST_F(HashTableTest, TestErase) {
+ const uint64_t max_keys = 1024 * 1024;
+ // insert
+ for (uint64_t k = 0; k < max_keys; ++k) {
+ map_.Insert(Node(k, std::string(1000, k % 255)));
+ }
+
+ auto rand = Random64(time(nullptr));
+ // erase a few keys randomly
+ std::set<uint64_t> erased;
+ for (int i = 0; i < 1024; ++i) {
+ uint64_t k = rand.Next() % max_keys;
+ if (erased.find(k) != erased.end()) {
+ continue;
+ }
+ assert(map_.Erase(Node(k), /*ret=*/nullptr));
+ erased.insert(k);
+ }
+
+ // verify
+ for (uint64_t k = 0; k < max_keys; ++k) {
+ Node val;
+ port::RWMutex* rlock = nullptr;
+ bool status = map_.Find(Node(k), &val, &rlock);
+ if (erased.find(k) == erased.end()) {
+ assert(status);
+ rlock->ReadUnlock();
+ assert(val.val_ == std::string(1000, k % 255));
+ } else {
+ assert(!status);
+ }
+ }
+}
+
+TEST_F(EvictableHashTableTest, TestEvict) {
+ const uint64_t max_keys = 1024 * 1024;
+
+ // insert
+ for (uint64_t k = 0; k < max_keys; ++k) {
+ map_.Insert(new Node(k, std::string(1000, k % 255)));
+ }
+
+ // verify
+ for (uint64_t k = 0; k < max_keys; ++k) {
+ Node* val = map_.Evict();
+ // unfortunately we can't predict the evicted value, since it can come from
+ // any one of the lock stripes
+ assert(val);
+ assert(val->val_ == std::string(1000, val->key_ % 255));
+ delete val;
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/utilities/persistent_cache/lrulist.h b/src/rocksdb/utilities/persistent_cache/lrulist.h
new file mode 100644
index 000000000..a608890fc
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/lrulist.h
@@ -0,0 +1,174 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <atomic>
+
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// LRU element definition
+//
+// Any object that needs to be part of the LRU algorithm should extend this
+// class
+template <class T>
+struct LRUElement {
+ explicit LRUElement() : next_(nullptr), prev_(nullptr), refs_(0) {}
+
+ virtual ~LRUElement() { assert(!refs_); }
+
+ T* next_;
+ T* prev_;
+ std::atomic<size_t> refs_;
+};
+
+// LRU implementation
+//
+// In-place LRU implementation. There is no copy or allocation involved when
+// inserting or removing an element. This makes the data structure slim.
+template <class T>
+class LRUList {
+ public:
+ virtual ~LRUList() {
+ MutexLock _(&lock_);
+ assert(!head_);
+ assert(!tail_);
+ }
+
+ // Push element into the LRU at the cold end
+ inline void Push(T* const t) {
+ assert(t);
+ assert(!t->next_);
+ assert(!t->prev_);
+
+ MutexLock _(&lock_);
+
+ assert((!head_ && !tail_) || (head_ && tail_));
+ assert(!head_ || !head_->prev_);
+ assert(!tail_ || !tail_->next_);
+
+ t->next_ = head_;
+ if (head_) {
+ head_->prev_ = t;
+ }
+
+ head_ = t;
+ if (!tail_) {
+ tail_ = t;
+ }
+ }
+
+ // Unlink the element from the LRU
+ inline void Unlink(T* const t) {
+ MutexLock _(&lock_);
+ UnlinkImpl(t);
+ }
+
+ // Evict an element from the LRU
+ inline T* Pop() {
+ MutexLock _(&lock_);
+
+ assert(tail_ && head_);
+ assert(!tail_->next_);
+ assert(!head_->prev_);
+
+ T* t = head_;
+ while (t && t->refs_) {
+ t = t->next_;
+ }
+
+ if (!t) {
+ // nothing can be evicted
+ return nullptr;
+ }
+
+ assert(!t->refs_);
+
+ // unlink the element
+ UnlinkImpl(t);
+ return t;
+ }
+
+ // Move the element to the hot end (back) of the list
+ inline void Touch(T* const t) {
+ MutexLock _(&lock_);
+ UnlinkImpl(t);
+ PushBackImpl(t);
+ }
+
+ // Check if the LRU is empty
+ inline bool IsEmpty() const {
+ MutexLock _(&lock_);
+ return !head_ && !tail_;
+ }
+
+ private:
+ // Unlink an element from the LRU
+ void UnlinkImpl(T* const t) {
+ assert(t);
+
+ lock_.AssertHeld();
+
+ assert(head_ && tail_);
+ assert(t->prev_ || head_ == t);
+ assert(t->next_ || tail_ == t);
+
+ if (t->prev_) {
+ t->prev_->next_ = t->next_;
+ }
+ if (t->next_) {
+ t->next_->prev_ = t->prev_;
+ }
+
+ if (tail_ == t) {
+ tail_ = tail_->prev_;
+ }
+ if (head_ == t) {
+ head_ = head_->next_;
+ }
+
+ t->next_ = t->prev_ = nullptr;
+ }
+
+ // Insert an element at the hot end
+ inline void PushBack(T* const t) {
+ MutexLock _(&lock_);
+ PushBackImpl(t);
+ }
+
+ inline void PushBackImpl(T* const t) {
+ assert(t);
+ assert(!t->next_);
+ assert(!t->prev_);
+
+ lock_.AssertHeld();
+
+ assert((!head_ && !tail_) || (head_ && tail_));
+ assert(!head_ || !head_->prev_);
+ assert(!tail_ || !tail_->next_);
+
+ t->prev_ = tail_;
+ if (tail_) {
+ tail_->next_ = t;
+ }
+
+ tail_ = t;
+ if (!head_) {
+ head_ = tail_;
+ }
+ }
+
+ mutable port::Mutex lock_; // synchronization primitive
+ T* head_ = nullptr; // front (cold)
+ T* tail_ = nullptr; // back (hot)
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif
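
A small sketch of the intrusive list in isolation; Entry is a hypothetical element type, and the list must be drained before it goes out of scope because its destructor asserts emptiness:

struct Entry : LRUElement<Entry> {};

void LRUListSketch() {
  LRUList<Entry> lru;
  Entry a, b;
  lru.Push(&a);               // Push() inserts at the cold end (head)
  lru.Push(&b);
  lru.Touch(&b);              // promote b to the hot end (tail)
  Entry* victim = lru.Pop();  // returns the coldest unreferenced element: &a
  assert(victim == &a);
  lru.Unlink(&b);             // drain the list before it is destroyed
}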
diff --git a/src/rocksdb/utilities/persistent_cache/persistent_cache_bench.cc b/src/rocksdb/utilities/persistent_cache/persistent_cache_bench.cc
new file mode 100644
index 000000000..9d6e15d6b
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/persistent_cache_bench.cc
@@ -0,0 +1,359 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#ifndef ROCKSDB_LITE
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() { fprintf(stderr, "Please install gflags to run tools\n"); }
+#else
+#include <atomic>
+#include <functional>
+#include <memory>
+#include <sstream>
+#include <unordered_map>
+
+#include "monitoring/histogram.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/system_clock.h"
+#include "table/block_based/block_builder.h"
+#include "util/gflags_compat.h"
+#include "util/mutexlock.h"
+#include "util/stop_watch.h"
+#include "utilities/persistent_cache/block_cache_tier.h"
+#include "utilities/persistent_cache/persistent_cache_tier.h"
+#include "utilities/persistent_cache/volatile_tier_impl.h"
+
+DEFINE_int32(nsec, 10, "nsec");
+DEFINE_int32(nthread_write, 1, "Insert threads");
+DEFINE_int32(nthread_read, 1, "Lookup threads");
+DEFINE_string(path, "/tmp/microbench/blkcache", "Path for cachefile");
+DEFINE_string(log_path, "/tmp/log", "Path for the log file");
+DEFINE_uint64(cache_size, std::numeric_limits<uint64_t>::max(), "Cache size");
+DEFINE_int32(iosize, 4 * 1024, "Read IO size");
+DEFINE_int32(writer_iosize, 4 * 1024, "File writer IO size");
+DEFINE_int32(writer_qdepth, 1, "File writer qdepth");
+DEFINE_bool(enable_pipelined_writes, false, "Enable async writes");
+DEFINE_string(cache_type, "block_cache",
+ "Cache type. (block_cache, volatile, tiered)");
+DEFINE_bool(benchmark, false, "Benchmark mode");
+DEFINE_int32(volatile_cache_pct, 10, "Percentage of cache in memory tier.");
+
+namespace ROCKSDB_NAMESPACE {
+
+std::unique_ptr<PersistentCacheTier> NewVolatileCache() {
+ assert(FLAGS_cache_size != std::numeric_limits<uint64_t>::max());
+ std::unique_ptr<PersistentCacheTier> pcache(
+ new VolatileCacheTier(FLAGS_cache_size));
+ return pcache;
+}
+
+std::unique_ptr<PersistentCacheTier> NewBlockCache() {
+ std::shared_ptr<Logger> log;
+ if (!Env::Default()->NewLogger(FLAGS_log_path, &log).ok()) {
+ fprintf(stderr, "Error creating log %s \n", FLAGS_log_path.c_str());
+ return nullptr;
+ }
+
+ PersistentCacheConfig opt(Env::Default(), FLAGS_path, FLAGS_cache_size, log);
+ opt.writer_dispatch_size = FLAGS_writer_iosize;
+ opt.writer_qdepth = FLAGS_writer_qdepth;
+ opt.pipeline_writes = FLAGS_enable_pipelined_writes;
+ opt.max_write_pipeline_backlog_size = std::numeric_limits<uint64_t>::max();
+ std::unique_ptr<PersistentCacheTier> cache(new BlockCacheTier(opt));
+ Status status = cache->Open();
+ return cache;
+}
+
+// create a new cache tier
+// construct a tiered RAM+Block cache
+std::unique_ptr<PersistentTieredCache> NewTieredCache(
+ const size_t mem_size, const PersistentCacheConfig& opt) {
+ std::unique_ptr<PersistentTieredCache> tcache(new PersistentTieredCache());
+ // create primary tier
+ assert(mem_size);
+ auto pcache =
+ std::shared_ptr<PersistentCacheTier>(new VolatileCacheTier(mem_size));
+ tcache->AddTier(pcache);
+ // create secondary tier
+ auto scache = std::shared_ptr<PersistentCacheTier>(new BlockCacheTier(opt));
+ tcache->AddTier(scache);
+
+ Status s = tcache->Open();
+ assert(s.ok());
+ return tcache;
+}
+
+std::unique_ptr<PersistentTieredCache> NewTieredCache() {
+ std::shared_ptr<Logger> log;
+ if (!Env::Default()->NewLogger(FLAGS_log_path, &log).ok()) {
+ fprintf(stderr, "Error creating log %s \n", FLAGS_log_path.c_str());
+ abort();
+ }
+
+ auto pct = FLAGS_volatile_cache_pct / static_cast<double>(100);
+ PersistentCacheConfig opt(Env::Default(), FLAGS_path,
+ (1 - pct) * FLAGS_cache_size, log);
+ opt.writer_dispatch_size = FLAGS_writer_iosize;
+ opt.writer_qdepth = FLAGS_writer_qdepth;
+ opt.pipeline_writes = FLAGS_enable_pipelined_writes;
+ opt.max_write_pipeline_backlog_size = std::numeric_limits<uint64_t>::max();
+ return NewTieredCache(FLAGS_cache_size * pct, opt);
+}
+
+//
+// Benchmark driver
+//
+class CacheTierBenchmark {
+ public:
+ explicit CacheTierBenchmark(std::shared_ptr<PersistentCacheTier>&& cache)
+ : cache_(cache) {
+ if (FLAGS_nthread_read) {
+ fprintf(stdout, "Pre-populating\n");
+ Prepop();
+ fprintf(stdout, "Pre-population completed\n");
+ }
+
+ stats_.Clear();
+
+ // Start IO threads
+ std::list<port::Thread> threads;
+ Spawn(FLAGS_nthread_write, &threads,
+ std::bind(&CacheTierBenchmark::Write, this));
+ Spawn(FLAGS_nthread_read, &threads,
+ std::bind(&CacheTierBenchmark::Read, this));
+
+ // Wait till FLAGS_nsec and then signal to quit
+ StopWatchNano t(SystemClock::Default().get(), /*auto_start=*/true);
+ size_t sec = t.ElapsedNanos() / 1000000000ULL;
+ while (!quit_) {
+ sec = t.ElapsedNanos() / 1000000000ULL;
+ quit_ = sec > size_t(FLAGS_nsec);
+ /* sleep override */ sleep(1);
+ }
+
+ // Wait for threads to exit
+ Join(&threads);
+ // Print stats
+ PrintStats(sec);
+ // Close the cache
+ cache_->TEST_Flush();
+ cache_->Close();
+ }
+
+ private:
+ void PrintStats(const size_t sec) {
+ std::ostringstream msg;
+ msg << "Test stats" << std::endl
+ << "* Elapsed: " << sec << " s" << std::endl
+ << "* Write Latency:" << std::endl
+ << stats_.write_latency_.ToString() << std::endl
+ << "* Read Latency:" << std::endl
+ << stats_.read_latency_.ToString() << std::endl
+ << "* Bytes written:" << std::endl
+ << stats_.bytes_written_.ToString() << std::endl
+ << "* Bytes read:" << std::endl
+ << stats_.bytes_read_.ToString() << std::endl
+ << "Cache stats:" << std::endl
+ << cache_->PrintStats() << std::endl;
+ fprintf(stderr, "%s\n", msg.str().c_str());
+ }
+
+ //
+ // Insert implementation and corresponding helper functions
+ //
+ void Prepop() {
+ for (uint64_t i = 0; i < 1024 * 1024; ++i) {
+ InsertKey(i);
+ insert_key_limit_++;
+ read_key_limit_++;
+ }
+
+ // Wait until data is flushed
+ cache_->TEST_Flush();
+ // warmup the cache
+ for (uint64_t i = 0; i < 1024 * 1024; ReadKey(i++)) {
+ }
+ }
+
+ void Write() {
+ while (!quit_) {
+ InsertKey(insert_key_limit_++);
+ }
+ }
+
+ void InsertKey(const uint64_t key) {
+ // construct key
+ uint64_t k[3];
+ Slice block_key = FillKey(k, key);
+
+ // construct value
+ auto block = NewBlock(key);
+
+ // insert
+ StopWatchNano timer(SystemClock::Default().get(), /*auto_start=*/true);
+ while (true) {
+ Status status = cache_->Insert(block_key, block.get(), FLAGS_iosize);
+ if (status.ok()) {
+ break;
+ }
+
+ // transient error is possible if we run without pipelining
+ assert(!FLAGS_enable_pipelined_writes);
+ }
+
+ // adjust stats
+ const size_t elapsed_micro = timer.ElapsedNanos() / 1000;
+ stats_.write_latency_.Add(elapsed_micro);
+ stats_.bytes_written_.Add(FLAGS_iosize);
+ }
+
+ //
+ // Read implementation
+ //
+ void Read() {
+ while (!quit_) {
+ ReadKey(random() % read_key_limit_);
+ }
+ }
+
+ void ReadKey(const uint64_t val) {
+ // construct key
+ uint64_t k[3];
+ Slice key = FillKey(k, val);
+
+ // Lookup in cache
+ StopWatchNano timer(SystemClock::Default().get(), /*auto_start=*/true);
+ std::unique_ptr<char[]> block;
+ size_t size;
+ Status status = cache_->Lookup(key, &block, &size);
+ if (!status.ok()) {
+ fprintf(stderr, "%s\n", status.ToString().c_str());
+ }
+ assert(status.ok());
+ assert(size == (size_t)FLAGS_iosize);
+
+ // adjust stats
+ const size_t elapsed_micro = timer.ElapsedNanos() / 1000;
+ stats_.read_latency_.Add(elapsed_micro);
+ stats_.bytes_read_.Add(FLAGS_iosize);
+
+ // verify content
+ if (!FLAGS_benchmark) {
+ auto expected_block = NewBlock(val);
+ assert(memcmp(block.get(), expected_block.get(), FLAGS_iosize) == 0);
+ }
+ }
+
+ // create data for a key by filling with a certain pattern
+ std::unique_ptr<char[]> NewBlock(const uint64_t val) {
+ std::unique_ptr<char[]> data(new char[FLAGS_iosize]);
+ memset(data.get(), val % 255, FLAGS_iosize);
+ return data;
+ }
+
+ // spawn threads
+ void Spawn(const size_t n, std::list<port::Thread>* threads,
+ const std::function<void()>& fn) {
+ for (size_t i = 0; i < n; ++i) {
+ threads->emplace_back(fn);
+ }
+ }
+
+ // join threads
+ void Join(std::list<port::Thread>* threads) {
+ for (auto& th : *threads) {
+ th.join();
+ }
+ }
+
+ // construct key
+ Slice FillKey(uint64_t (&k)[3], const uint64_t val) {
+ k[0] = k[1] = 0;
+ k[2] = val;
+ void* p = static_cast<void*>(&k);
+ return Slice(static_cast<char*>(p), sizeof(k));
+ }
+
+ // benchmark stats
+ struct Stats {
+ void Clear() {
+ bytes_written_.Clear();
+ bytes_read_.Clear();
+ read_latency_.Clear();
+ write_latency_.Clear();
+ }
+
+ HistogramImpl bytes_written_;
+ HistogramImpl bytes_read_;
+ HistogramImpl read_latency_;
+ HistogramImpl write_latency_;
+ };
+
+ std::shared_ptr<PersistentCacheTier> cache_; // cache implementation
+ std::atomic<uint64_t> insert_key_limit_{0}; // data inserted up to
+ std::atomic<uint64_t> read_key_limit_{0}; // data can be read safely up to
+ bool quit_ = false; // Quit thread ?
+ mutable Stats stats_; // Stats
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+//
+// main
+//
+int main(int argc, char** argv) {
+ GFLAGS_NAMESPACE::SetUsageMessage(std::string("\nUSAGE:\n") +
+ std::string(argv[0]) + " [OPTIONS]...");
+ GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, false);
+
+ std::ostringstream msg;
+ msg << "Config" << std::endl
+ << "======" << std::endl
+ << "* nsec=" << FLAGS_nsec << std::endl
+ << "* nthread_write=" << FLAGS_nthread_write << std::endl
+ << "* path=" << FLAGS_path << std::endl
+ << "* cache_size=" << FLAGS_cache_size << std::endl
+ << "* iosize=" << FLAGS_iosize << std::endl
+ << "* writer_iosize=" << FLAGS_writer_iosize << std::endl
+ << "* writer_qdepth=" << FLAGS_writer_qdepth << std::endl
+ << "* enable_pipelined_writes=" << FLAGS_enable_pipelined_writes
+ << std::endl
+ << "* cache_type=" << FLAGS_cache_type << std::endl
+ << "* benchmark=" << FLAGS_benchmark << std::endl
+ << "* volatile_cache_pct=" << FLAGS_volatile_cache_pct << std::endl;
+
+ fprintf(stderr, "%s\n", msg.str().c_str());
+
+ std::shared_ptr<ROCKSDB_NAMESPACE::PersistentCacheTier> cache;
+ if (FLAGS_cache_type == "block_cache") {
+ fprintf(stderr, "Using block cache implementation\n");
+ cache = ROCKSDB_NAMESPACE::NewBlockCache();
+ } else if (FLAGS_cache_type == "volatile") {
+ fprintf(stderr, "Using volatile cache implementation\n");
+ cache = ROCKSDB_NAMESPACE::NewVolatileCache();
+ } else if (FLAGS_cache_type == "tiered") {
+ fprintf(stderr, "Using tiered cache implementation\n");
+ cache = ROCKSDB_NAMESPACE::NewTieredCache();
+ } else {
+ fprintf(stderr, "Unknown option for cache\n");
+ }
+
+ assert(cache);
+ if (!cache) {
+ fprintf(stderr, "Error creating cache\n");
+ abort();
+ }
+
+ std::unique_ptr<ROCKSDB_NAMESPACE::CacheTierBenchmark> benchmark(
+ new ROCKSDB_NAMESPACE::CacheTierBenchmark(std::move(cache)));
+
+ return 0;
+}
+#endif // #ifndef GFLAGS
+#else
+int main(int, char**) { return 0; }
+#endif
diff --git a/src/rocksdb/utilities/persistent_cache/persistent_cache_test.cc b/src/rocksdb/utilities/persistent_cache/persistent_cache_test.cc
new file mode 100644
index 000000000..d1b18b68a
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/persistent_cache_test.cc
@@ -0,0 +1,462 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#if !defined ROCKSDB_LITE
+
+#include "utilities/persistent_cache/persistent_cache_test.h"
+
+#include <functional>
+#include <memory>
+#include <thread>
+
+#include "file/file_util.h"
+#include "utilities/persistent_cache/block_cache_tier.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static const double kStressFactor = .125;
+
+#ifdef OS_LINUX
+static void OnOpenForRead(void* arg) {
+ int* val = static_cast<int*>(arg);
+ *val &= ~O_DIRECT;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "NewRandomAccessFile:O_DIRECT",
+ std::bind(OnOpenForRead, std::placeholders::_1));
+}
+
+static void OnOpenForWrite(void* arg) {
+ int* val = static_cast<int*>(arg);
+ *val &= ~O_DIRECT;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "NewWritableFile:O_DIRECT",
+ std::bind(OnOpenForWrite, std::placeholders::_1));
+}
+#endif
+
+static void OnDeleteDir(void* arg) {
+ char* dir = static_cast<char*>(arg);
+ ASSERT_OK(DestroyDir(Env::Default(), std::string(dir)));
+}
+
+//
+// Simple logger that prints messages to stdout
+//
+class ConsoleLogger : public Logger {
+ public:
+ using Logger::Logv;
+ ConsoleLogger() : Logger(InfoLogLevel::ERROR_LEVEL) {}
+
+ void Logv(const char* format, va_list ap) override {
+ MutexLock _(&lock_);
+ vprintf(format, ap);
+ printf("\n");
+ }
+
+ port::Mutex lock_;
+};
+
+// construct a tiered RAM+Block cache
+std::unique_ptr<PersistentTieredCache> NewTieredCache(
+ const size_t mem_size, const PersistentCacheConfig& opt) {
+ std::unique_ptr<PersistentTieredCache> tcache(new PersistentTieredCache());
+ // create primary tier
+ assert(mem_size);
+ auto pcache = std::shared_ptr<PersistentCacheTier>(new VolatileCacheTier(
+ /*is_compressed*/ true, mem_size));
+ tcache->AddTier(pcache);
+ // create secondary tier
+ auto scache = std::shared_ptr<PersistentCacheTier>(new BlockCacheTier(opt));
+ tcache->AddTier(scache);
+
+ Status s = tcache->Open();
+ assert(s.ok());
+ return tcache;
+}
+
+// create block cache
+std::unique_ptr<PersistentCacheTier> NewBlockCache(
+ Env* env, const std::string& path,
+ const uint64_t max_size = std::numeric_limits<uint64_t>::max(),
+ const bool enable_direct_writes = false) {
+ const uint32_t max_file_size =
+ static_cast<uint32_t>(12 * 1024 * 1024 * kStressFactor);
+ auto log = std::make_shared<ConsoleLogger>();
+ PersistentCacheConfig opt(env, path, max_size, log);
+ opt.cache_file_size = max_file_size;
+ opt.max_write_pipeline_backlog_size = std::numeric_limits<uint64_t>::max();
+ opt.enable_direct_writes = enable_direct_writes;
+ std::unique_ptr<PersistentCacheTier> scache(new BlockCacheTier(opt));
+ Status s = scache->Open();
+ assert(s.ok());
+ return scache;
+}
+
+// create a new cache tier
+std::unique_ptr<PersistentTieredCache> NewTieredCache(
+ Env* env, const std::string& path, const uint64_t max_volatile_cache_size,
+ const uint64_t max_block_cache_size =
+ std::numeric_limits<uint64_t>::max()) {
+ const uint32_t max_file_size =
+ static_cast<uint32_t>(12 * 1024 * 1024 * kStressFactor);
+ auto log = std::make_shared<ConsoleLogger>();
+ auto opt = PersistentCacheConfig(env, path, max_block_cache_size, log);
+ opt.cache_file_size = max_file_size;
+ opt.max_write_pipeline_backlog_size = std::numeric_limits<uint64_t>::max();
+ // create tier out of the two caches
+ auto cache = NewTieredCache(max_volatile_cache_size, opt);
+ return cache;
+}
+
+PersistentCacheTierTest::PersistentCacheTierTest()
+ : path_(test::PerThreadDBPath("cache_test")) {
+#ifdef OS_LINUX
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "NewRandomAccessFile:O_DIRECT", OnOpenForRead);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "NewWritableFile:O_DIRECT", OnOpenForWrite);
+#endif
+}
+
+// Block cache tests
+TEST_F(PersistentCacheTierTest, DISABLED_BlockCacheInsertWithFileCreateError) {
+ cache_ = NewBlockCache(Env::Default(), path_,
+ /*size=*/std::numeric_limits<uint64_t>::max(),
+ /*direct_writes=*/false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BlockCacheTier::NewCacheFile:DeleteDir", OnDeleteDir);
+
+ RunNegativeInsertTest(/*nthreads=*/1,
+ /*max_keys*/
+ static_cast<size_t>(10 * 1024 * kStressFactor));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// Travis is unable to handle the normal version of the tests: it runs out of
+// fds and space, and hits timeouts. This is an easier version of the test
+// written specifically for Travis
+TEST_F(PersistentCacheTierTest, DISABLED_BasicTest) {
+ cache_ = std::make_shared<VolatileCacheTier>();
+ RunInsertTest(/*nthreads=*/1, /*max_keys=*/1024);
+
+ cache_ = NewBlockCache(Env::Default(), path_,
+ /*size=*/std::numeric_limits<uint64_t>::max(),
+ /*direct_writes=*/true);
+ RunInsertTest(/*nthreads=*/1, /*max_keys=*/1024);
+
+ cache_ = NewTieredCache(Env::Default(), path_,
+ /*memory_size=*/static_cast<size_t>(1 * 1024 * 1024));
+ RunInsertTest(/*nthreads=*/1, /*max_keys=*/1024);
+}
+
+// Volatile cache tests
+// DISABLED for now (somewhat expensive)
+TEST_F(PersistentCacheTierTest, DISABLED_VolatileCacheInsert) {
+ for (auto nthreads : {1, 5}) {
+ for (auto max_keys :
+ {10 * 1024 * kStressFactor, 1 * 1024 * 1024 * kStressFactor}) {
+ cache_ = std::make_shared<VolatileCacheTier>();
+ RunInsertTest(nthreads, static_cast<size_t>(max_keys));
+ }
+ }
+}
+
+// DISABLED for now (somewhat expensive)
+TEST_F(PersistentCacheTierTest, DISABLED_VolatileCacheInsertWithEviction) {
+ for (auto nthreads : {1, 5}) {
+ for (auto max_keys : {1 * 1024 * 1024 * kStressFactor}) {
+ cache_ = std::make_shared<VolatileCacheTier>(
+ /*compressed=*/true,
+ /*size=*/static_cast<size_t>(1 * 1024 * 1024 * kStressFactor));
+ RunInsertTestWithEviction(nthreads, static_cast<size_t>(max_keys));
+ }
+ }
+}
+
+// Block cache tests
+// DISABLED for now (expensive)
+TEST_F(PersistentCacheTierTest, DISABLED_BlockCacheInsert) {
+ for (auto direct_writes : {true, false}) {
+ for (auto nthreads : {1, 5}) {
+ for (auto max_keys :
+ {10 * 1024 * kStressFactor, 1 * 1024 * 1024 * kStressFactor}) {
+ cache_ = NewBlockCache(Env::Default(), path_,
+ /*size=*/std::numeric_limits<uint64_t>::max(),
+ direct_writes);
+ RunInsertTest(nthreads, static_cast<size_t>(max_keys));
+ }
+ }
+ }
+}
+
+// DISABLED for now (somewhat expensive)
+TEST_F(PersistentCacheTierTest, DISABLED_BlockCacheInsertWithEviction) {
+ for (auto nthreads : {1, 5}) {
+ for (auto max_keys : {1 * 1024 * 1024 * kStressFactor}) {
+ cache_ = NewBlockCache(
+ Env::Default(), path_,
+ /*max_size=*/static_cast<size_t>(200 * 1024 * 1024 * kStressFactor));
+ RunInsertTestWithEviction(nthreads, static_cast<size_t>(max_keys));
+ }
+ }
+}
+
+// Tiered cache tests
+// DISABLED for now (expensive)
+TEST_F(PersistentCacheTierTest, DISABLED_TieredCacheInsert) {
+ for (auto nthreads : {1, 5}) {
+ for (auto max_keys :
+ {10 * 1024 * kStressFactor, 1 * 1024 * 1024 * kStressFactor}) {
+ cache_ = NewTieredCache(
+ Env::Default(), path_,
+ /*memory_size=*/static_cast<size_t>(1 * 1024 * 1024 * kStressFactor));
+ RunInsertTest(nthreads, static_cast<size_t>(max_keys));
+ }
+ }
+}
+
+// the test causes a lot of file deletions, which Travis's limited testing
+// environment cannot handle
+// DISABLED for now (somewhat expensive)
+TEST_F(PersistentCacheTierTest, DISABLED_TieredCacheInsertWithEviction) {
+ for (auto nthreads : {1, 5}) {
+ for (auto max_keys : {1 * 1024 * 1024 * kStressFactor}) {
+ cache_ = NewTieredCache(
+ Env::Default(), path_,
+ /*memory_size=*/static_cast<size_t>(1 * 1024 * 1024 * kStressFactor),
+ /*block_cache_size*/
+ static_cast<size_t>(200 * 1024 * 1024 * kStressFactor));
+ RunInsertTestWithEviction(nthreads, static_cast<size_t>(max_keys));
+ }
+ }
+}
+
+std::shared_ptr<PersistentCacheTier> MakeVolatileCache(
+ Env* /*env*/, const std::string& /*dbname*/) {
+ return std::make_shared<VolatileCacheTier>();
+}
+
+std::shared_ptr<PersistentCacheTier> MakeBlockCache(Env* env,
+ const std::string& dbname) {
+ return NewBlockCache(env, dbname);
+}
+
+std::shared_ptr<PersistentCacheTier> MakeTieredCache(
+ Env* env, const std::string& dbname) {
+ const auto memory_size = 1 * 1024 * 1024 * kStressFactor;
+ return NewTieredCache(env, dbname, static_cast<size_t>(memory_size));
+}
+
+#ifdef OS_LINUX
+static void UniqueIdCallback(void* arg) {
+ int* result = reinterpret_cast<int*>(arg);
+ if (*result == -1) {
+ *result = 0;
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback);
+}
+#endif
+
+TEST_F(PersistentCacheTierTest, FactoryTest) {
+ for (auto nvm_opt : {true, false}) {
+ ASSERT_FALSE(cache_);
+ auto log = std::make_shared<ConsoleLogger>();
+ std::shared_ptr<PersistentCache> cache;
+ ASSERT_OK(NewPersistentCache(Env::Default(), path_,
+ /*size=*/1 * 1024 * 1024 * 1024, log, nvm_opt,
+ &cache));
+ ASSERT_TRUE(cache);
+ ASSERT_EQ(cache->Stats().size(), 1);
+ ASSERT_TRUE(cache->Stats()[0].size());
+ cache.reset();
+ }
+}
+
+PersistentCacheDBTest::PersistentCacheDBTest()
+ : DBTestBase("cache_test", /*env_do_fsync=*/true) {
+#ifdef OS_LINUX
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "NewRandomAccessFile:O_DIRECT", OnOpenForRead);
+#endif
+}
+
+// test template
+void PersistentCacheDBTest::RunTest(
+ const std::function<std::shared_ptr<PersistentCacheTier>(bool)>& new_pcache,
+ const size_t max_keys = 100 * 1024, const size_t max_usecase = 5) {
+ // number of insertion iterations
+ int num_iter = static_cast<int>(max_keys * kStressFactor);
+
+ for (size_t iter = 0; iter < max_usecase; iter++) {
+ Options options;
+ options.write_buffer_size =
+ static_cast<size_t>(64 * 1024 * kStressFactor); // small write buffer
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options = CurrentOptions(options);
+
+ // setup page cache
+ std::shared_ptr<PersistentCacheTier> pcache;
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+
+ const size_t size_max = std::numeric_limits<size_t>::max();
+
+ switch (iter) {
+ case 0:
+ // page cache, block cache, no compressed cache
+ pcache = new_pcache(/*is_compressed=*/true);
+ table_options.persistent_cache = pcache;
+ table_options.block_cache = NewLRUCache(size_max);
+ table_options.block_cache_compressed = nullptr;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ break;
+ case 1:
+ // page cache, block cache, compressed cache
+ pcache = new_pcache(/*is_compressed=*/true);
+ table_options.persistent_cache = pcache;
+ table_options.block_cache = NewLRUCache(size_max);
+ table_options.block_cache_compressed = NewLRUCache(size_max);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ break;
+ case 2:
+ // page cache, block cache, compressed cache + kNoCompression
+ // both block cache and compressed cache, but DB is not compressed
+ // also, make block cache sizes bigger, to trigger block cache hits
+ pcache = new_pcache(/*is_compressed=*/true);
+ table_options.persistent_cache = pcache;
+ table_options.block_cache = NewLRUCache(size_max);
+ table_options.block_cache_compressed = NewLRUCache(size_max);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.compression = kNoCompression;
+ break;
+ case 3:
+ // page cache, no block cache, no compressed cache
+ pcache = new_pcache(/*is_compressed=*/false);
+ table_options.persistent_cache = pcache;
+ table_options.block_cache = nullptr;
+ table_options.block_cache_compressed = nullptr;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ break;
+ case 4:
+ // page cache, no block cache, no compressed cache
+ // Page cache caches compressed blocks
+ pcache = new_pcache(/*is_compressed=*/true);
+ table_options.persistent_cache = pcache;
+ table_options.block_cache = nullptr;
+ table_options.block_cache_compressed = nullptr;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ break;
+ default:
+ FAIL();
+ }
+
+ std::vector<std::string> values;
+ // insert data
+ Insert(options, table_options, num_iter, &values);
+ // flush all data in cache to device
+ pcache->TEST_Flush();
+ // verify data
+ Verify(num_iter, values);
+
+ auto block_miss = TestGetTickerCount(options, BLOCK_CACHE_MISS);
+ auto compressed_block_hit =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT);
+ auto compressed_block_miss =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS);
+ auto page_hit = TestGetTickerCount(options, PERSISTENT_CACHE_HIT);
+ auto page_miss = TestGetTickerCount(options, PERSISTENT_CACHE_MISS);
+
+ // check that we triggered the appropriate code paths in the cache
+ switch (iter) {
+ case 0:
+ // page cache, block cache, no compressed cache
+ ASSERT_GT(page_miss, 0);
+ ASSERT_GT(page_hit, 0);
+ ASSERT_GT(block_miss, 0);
+ ASSERT_EQ(compressed_block_miss, 0);
+ ASSERT_EQ(compressed_block_hit, 0);
+ break;
+ case 1:
+ // page cache, block cache, compressed cache
+ ASSERT_GT(page_miss, 0);
+ ASSERT_GT(block_miss, 0);
+ ASSERT_GT(compressed_block_miss, 0);
+ break;
+ case 2:
+ // page cache, block cache, compressed cache + kNoCompression
+ ASSERT_GT(page_miss, 0);
+ ASSERT_GT(page_hit, 0);
+ ASSERT_GT(block_miss, 0);
+ ASSERT_GT(compressed_block_miss, 0);
+ // remember kNoCompression
+ ASSERT_EQ(compressed_block_hit, 0);
+ break;
+ case 3:
+ case 4:
+ // page cache, no block cache, no compressed cache
+ ASSERT_GT(page_miss, 0);
+ ASSERT_GT(page_hit, 0);
+ ASSERT_EQ(compressed_block_hit, 0);
+ ASSERT_EQ(compressed_block_miss, 0);
+ break;
+ default:
+ FAIL();
+ }
+
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(pcache->Close());
+ }
+}
+
+// Travis is unable to handle the normal version of the tests: it runs out of
+// fds and space, and hits timeouts. This is an easier version of the test
+// written specifically for Travis.
+// Now used generally because the main tests are too expensive as unit tests.
+TEST_F(PersistentCacheDBTest, BasicTest) {
+ RunTest(std::bind(&MakeBlockCache, env_, dbname_), /*max_keys=*/1024,
+ /*max_usecase=*/1);
+}
+
+// test table with block page cache
+// DISABLED for now (very expensive, especially memory)
+TEST_F(PersistentCacheDBTest, DISABLED_BlockCacheTest) {
+ RunTest(std::bind(&MakeBlockCache, env_, dbname_));
+}
+
+// test table with volatile page cache
+// DISABLED for now (very expensive, especially memory)
+TEST_F(PersistentCacheDBTest, DISABLED_VolatileCacheTest) {
+ RunTest(std::bind(&MakeVolatileCache, env_, dbname_));
+}
+
+// test table with tiered page cache
+// DISABLED for now (very expensive, especially memory)
+TEST_F(PersistentCacheDBTest, DISABLED_TieredCacheTest) {
+ RunTest(std::bind(&MakeTieredCache, env_, dbname_));
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+#else // !defined ROCKSDB_LITE
+int main() { return 0; }
+#endif // !defined ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/persistent_cache/persistent_cache_test.h b/src/rocksdb/utilities/persistent_cache/persistent_cache_test.h
new file mode 100644
index 000000000..f13155ed6
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/persistent_cache_test.h
@@ -0,0 +1,286 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <functional>
+#include <limits>
+#include <list>
+#include <memory>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "db/db_test_util.h"
+#include "memory/arena.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "table/block_based/block_builder.h"
+#include "test_util/testharness.h"
+#include "util/random.h"
+#include "utilities/persistent_cache/volatile_tier_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+//
+// Unit tests for testing PersistentCacheTier
+//
+class PersistentCacheTierTest : public testing::Test {
+ public:
+ PersistentCacheTierTest();
+ virtual ~PersistentCacheTierTest() {
+ if (cache_) {
+ Status s = cache_->Close();
+ assert(s.ok());
+ }
+ }
+
+ protected:
+ // Flush cache
+ void Flush() {
+ if (cache_) {
+ cache_->TEST_Flush();
+ }
+ }
+
+ // create threaded workload
+ template <class T>
+ std::list<port::Thread> SpawnThreads(const size_t n, const T& fn) {
+ std::list<port::Thread> threads;
+ for (size_t i = 0; i < n; i++) {
+ port::Thread th(fn);
+ threads.push_back(std::move(th));
+ }
+ return threads;
+ }
+
+ // Wait for threads to join
+ void Join(std::list<port::Thread>&& threads) {
+ for (auto& th : threads) {
+ th.join();
+ }
+ threads.clear();
+ }
+
+ // Run insert workload in threads
+ void Insert(const size_t nthreads, const size_t max_keys) {
+ key_ = 0;
+ max_keys_ = max_keys;
+ // spawn threads
+ auto fn = std::bind(&PersistentCacheTierTest::InsertImpl, this);
+ auto threads = SpawnThreads(nthreads, fn);
+ // join with threads
+ Join(std::move(threads));
+ // Flush cache
+ Flush();
+ }
+
+ // Run verification on the cache
+ void Verify(const size_t nthreads = 1, const bool eviction_enabled = false) {
+ stats_verify_hits_ = 0;
+ stats_verify_missed_ = 0;
+ key_ = 0;
+ // spawn threads
+ auto fn =
+ std::bind(&PersistentCacheTierTest::VerifyImpl, this, eviction_enabled);
+ auto threads = SpawnThreads(nthreads, fn);
+ // join with threads
+ Join(std::move(threads));
+ }
+
+ // pad numbers with leading zeros
+ std::string PaddedNumber(const size_t data, const size_t pad_size) {
+ assert(pad_size);
+ char* ret = new char[pad_size];
+ int pos = static_cast<int>(pad_size) - 1;
+ size_t count = 0;
+ size_t t = data;
+ // copy numbers
+ while (t) {
+ count++;
+ ret[pos--] = '0' + t % 10;
+ t = t / 10;
+ }
+ // copy 0s
+ while (pos >= 0) {
+ ret[pos--] = '0';
+ }
+ // post condition
+ assert(count <= pad_size);
+ assert(pos == -1);
+ std::string result(ret, pad_size);
+ delete[] ret;
+ return result;
+ }
+
+ // Insert workload implementation
+ void InsertImpl() {
+ const std::string prefix = "key_prefix_";
+
+ while (true) {
+ size_t i = key_++;
+ if (i >= max_keys_) {
+ break;
+ }
+
+ char data[4 * 1024];
+ memset(data, '0' + (i % 10), sizeof(data));
+ auto k = prefix + PaddedNumber(i, /*pad_size=*/8);
+ Slice key(k);
+ while (true) {
+ Status status = cache_->Insert(key, data, sizeof(data));
+ if (status.ok()) {
+ break;
+ }
+ ASSERT_TRUE(status.IsTryAgain());
+ Env::Default()->SleepForMicroseconds(1 * 1000 * 1000);
+ }
+ }
+ }
+
+ // Verification implementation
+ void VerifyImpl(const bool eviction_enabled = false) {
+ const std::string prefix = "key_prefix_";
+ while (true) {
+ size_t i = key_++;
+ if (i >= max_keys_) {
+ break;
+ }
+
+ char edata[4 * 1024];
+ memset(edata, '0' + (i % 10), sizeof(edata));
+ auto k = prefix + PaddedNumber(i, /*pad_size=*/8);
+ Slice key(k);
+ std::unique_ptr<char[]> block;
+ size_t block_size;
+
+ if (eviction_enabled) {
+ if (!cache_->Lookup(key, &block, &block_size).ok()) {
+ // assume that the key is evicted
+ stats_verify_missed_++;
+ continue;
+ }
+ }
+
+ ASSERT_OK(cache_->Lookup(key, &block, &block_size));
+ ASSERT_EQ(block_size, sizeof(edata));
+ ASSERT_EQ(memcmp(edata, block.get(), sizeof(edata)), 0);
+ stats_verify_hits_++;
+ }
+ }
+
+ // template for insert test
+ void RunInsertTest(const size_t nthreads, const size_t max_keys) {
+ Insert(nthreads, max_keys);
+ Verify(nthreads);
+ ASSERT_EQ(stats_verify_hits_, max_keys);
+ ASSERT_EQ(stats_verify_missed_, 0);
+
+ ASSERT_OK(cache_->Close());
+ cache_.reset();
+ }
+
+ // template for negative insert test
+ void RunNegativeInsertTest(const size_t nthreads, const size_t max_keys) {
+ Insert(nthreads, max_keys);
+ Verify(nthreads, /*eviction_enabled=*/true);
+ ASSERT_LT(stats_verify_hits_, max_keys);
+ ASSERT_GT(stats_verify_missed_, 0);
+
+ ASSERT_OK(cache_->Close());
+ cache_.reset();
+ }
+
+ // template for insert with eviction test
+ void RunInsertTestWithEviction(const size_t nthreads, const size_t max_keys) {
+ Insert(nthreads, max_keys);
+ Verify(nthreads, /*eviction_enabled=*/true);
+ ASSERT_EQ(stats_verify_hits_ + stats_verify_missed_, max_keys);
+ ASSERT_GT(stats_verify_hits_, 0);
+ ASSERT_GT(stats_verify_missed_, 0);
+
+ ASSERT_OK(cache_->Close());
+ cache_.reset();
+ }
+
+ const std::string path_;
+ std::shared_ptr<Logger> log_;
+ std::shared_ptr<PersistentCacheTier> cache_;
+ std::atomic<size_t> key_{0};
+ size_t max_keys_ = 0;
+ std::atomic<size_t> stats_verify_hits_{0};
+ std::atomic<size_t> stats_verify_missed_{0};
+};
+
+//
+// RocksDB tests
+//
+class PersistentCacheDBTest : public DBTestBase {
+ public:
+ PersistentCacheDBTest();
+
+ static uint64_t TestGetTickerCount(const Options& options,
+ Tickers ticker_type) {
+ return static_cast<uint32_t>(
+ options.statistics->getTickerCount(ticker_type));
+ }
+
+ // insert data to table
+ void Insert(const Options& options,
+ const BlockBasedTableOptions& /*table_options*/,
+ const int num_iter, std::vector<std::string>* values) {
+ CreateAndReopenWithCF({"pikachu"}, options);
+ // default column family doesn't have block cache
+ Options no_block_cache_opts;
+ no_block_cache_opts.statistics = options.statistics;
+ no_block_cache_opts = CurrentOptions(no_block_cache_opts);
+ BlockBasedTableOptions table_options_no_bc;
+ table_options_no_bc.no_block_cache = true;
+ no_block_cache_opts.table_factory.reset(
+ NewBlockBasedTableFactory(table_options_no_bc));
+ ReopenWithColumnFamilies(
+ {"default", "pikachu"},
+ std::vector<Options>({no_block_cache_opts, options}));
+
+ Random rnd(301);
+
+ // Write num_iter values (each ~1KB; every 4 consecutive keys share a value)
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ std::string str;
+ for (int i = 0; i < num_iter; i++) {
+ if (i % 4 == 0) { // high compression ratio
+ str = rnd.RandomString(1000);
+ }
+ values->push_back(str);
+ ASSERT_OK(Put(1, Key(i), (*values)[i]));
+ }
+
+ // flush all data from memtable so that reads are from block cache
+ ASSERT_OK(Flush(1));
+ }
+
+ // verify data
+ void Verify(const int num_iter, const std::vector<std::string>& values) {
+ for (int j = 0; j < 2; ++j) {
+ for (int i = 0; i < num_iter; i++) {
+ ASSERT_EQ(Get(1, Key(i)), values[i]);
+ }
+ }
+ }
+
+ // test template
+ void RunTest(const std::function<std::shared_ptr<PersistentCacheTier>(bool)>&
+ new_pcache,
+ const size_t max_keys, const size_t max_usecase);
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/utilities/persistent_cache/persistent_cache_tier.cc b/src/rocksdb/utilities/persistent_cache/persistent_cache_tier.cc
new file mode 100644
index 000000000..54cbce8f7
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/persistent_cache_tier.cc
@@ -0,0 +1,167 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#ifndef ROCKSDB_LITE
+
+#include "utilities/persistent_cache/persistent_cache_tier.h"
+
+#include <cinttypes>
+#include <sstream>
+#include <string>
+
+namespace ROCKSDB_NAMESPACE {
+
+std::string PersistentCacheConfig::ToString() const {
+ std::string ret;
+ ret.reserve(20000);
+ const int kBufferSize = 200;
+ char buffer[kBufferSize];
+
+ snprintf(buffer, kBufferSize, " path: %s\n", path.c_str());
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " enable_direct_reads: %d\n",
+ enable_direct_reads);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " enable_direct_writes: %d\n",
+ enable_direct_writes);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " cache_size: %" PRIu64 "\n", cache_size);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " cache_file_size: %" PRIu32 "\n",
+ cache_file_size);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " writer_qdepth: %" PRIu32 "\n",
+ writer_qdepth);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " pipeline_writes: %d\n", pipeline_writes);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize,
+ " max_write_pipeline_backlog_size: %" PRIu64 "\n",
+ max_write_pipeline_backlog_size);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " write_buffer_size: %" PRIu32 "\n",
+ write_buffer_size);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " writer_dispatch_size: %" PRIu64 "\n",
+ writer_dispatch_size);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " is_compressed: %d\n", is_compressed);
+ ret.append(buffer);
+
+ return ret;
+}
+
+//
+// PersistentCacheTier implementation
+//
+Status PersistentCacheTier::Open() {
+ if (next_tier_) {
+ return next_tier_->Open();
+ }
+ return Status::OK();
+}
+
+Status PersistentCacheTier::Close() {
+ if (next_tier_) {
+ return next_tier_->Close();
+ }
+ return Status::OK();
+}
+
+bool PersistentCacheTier::Reserve(const size_t /*size*/) {
+ // default implementation is a pass through
+ return true;
+}
+
+bool PersistentCacheTier::Erase(const Slice& /*key*/) {
+ // default implementation is a pass through since not all cache tiers might
+ // support erase
+ return true;
+}
+
+std::string PersistentCacheTier::PrintStats() {
+ std::ostringstream os;
+ for (auto tier_stats : Stats()) {
+ os << "---- next tier -----" << std::endl;
+ for (auto stat : tier_stats) {
+ os << stat.first << ": " << stat.second << std::endl;
+ }
+ }
+ return os.str();
+}
+
+PersistentCache::StatsType PersistentCacheTier::Stats() {
+ if (next_tier_) {
+ return next_tier_->Stats();
+ }
+ return PersistentCache::StatsType{};
+}
+
+uint64_t PersistentCacheTier::NewId() {
+ return last_id_.fetch_add(1, std::memory_order_relaxed);
+}
+
+//
+// PersistentTieredCache implementation
+//
+PersistentTieredCache::~PersistentTieredCache() { assert(tiers_.empty()); }
+
+Status PersistentTieredCache::Open() {
+ assert(!tiers_.empty());
+ return tiers_.front()->Open();
+}
+
+Status PersistentTieredCache::Close() {
+ assert(!tiers_.empty());
+ Status status = tiers_.front()->Close();
+ if (status.ok()) {
+ tiers_.clear();
+ }
+ return status;
+}
+
+bool PersistentTieredCache::Erase(const Slice& key) {
+ assert(!tiers_.empty());
+ return tiers_.front()->Erase(key);
+}
+
+PersistentCache::StatsType PersistentTieredCache::Stats() {
+ assert(!tiers_.empty());
+ return tiers_.front()->Stats();
+}
+
+std::string PersistentTieredCache::PrintStats() {
+ assert(!tiers_.empty());
+ return tiers_.front()->PrintStats();
+}
+
+Status PersistentTieredCache::Insert(const Slice& page_key, const char* data,
+ const size_t size) {
+ assert(!tiers_.empty());
+ return tiers_.front()->Insert(page_key, data, size);
+}
+
+Status PersistentTieredCache::Lookup(const Slice& page_key,
+ std::unique_ptr<char[]>* data,
+ size_t* size) {
+ assert(!tiers_.empty());
+ return tiers_.front()->Lookup(page_key, data, size);
+}
+
+void PersistentTieredCache::AddTier(const Tier& tier) {
+ if (!tiers_.empty()) {
+ tiers_.back()->set_next_tier(tier);
+ }
+ tiers_.push_back(tier);
+}
+
+bool PersistentTieredCache::IsCompressed() {
+ assert(tiers_.size());
+ return tiers_.front()->IsCompressed();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif
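
AddTier() links each newly added tier as the next tier of the previous one, so the tiered cache only ever talks to its front tier and the individual tier implementations reach lower tiers through next_tier_. A minimal sketch, assuming ram_tier and disk_tier are already-constructed std::shared_ptr<PersistentCacheTier> instances (for example a VolatileCacheTier and a BlockCacheTier, as in the benchmark and tests in this directory):

void TieredCacheSketch(const std::shared_ptr<PersistentCacheTier>& ram_tier,
                       const std::shared_ptr<PersistentCacheTier>& disk_tier) {
  PersistentTieredCache tcache;
  tcache.AddTier(ram_tier);   // becomes the front tier
  tcache.AddTier(disk_tier);  // linked in as ram_tier's next tier
  Status s = tcache.Open();   // opens the chain starting at the front tier
  assert(s.ok());
  // Insert()/Lookup() go through the front tier from here on.
  s = tcache.Close();  // clears the tier list; required before destruction,
  assert(s.ok());      // since the destructor asserts that the list is empty
}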
diff --git a/src/rocksdb/utilities/persistent_cache/persistent_cache_tier.h b/src/rocksdb/utilities/persistent_cache/persistent_cache_tier.h
new file mode 100644
index 000000000..65aadcd3f
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/persistent_cache_tier.h
@@ -0,0 +1,342 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <limits>
+#include <list>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "monitoring/histogram.h"
+#include "rocksdb/env.h"
+#include "rocksdb/persistent_cache.h"
+#include "rocksdb/status.h"
+#include "rocksdb/system_clock.h"
+
+// Persistent Cache
+//
+// Persistent cache is a tiered key-value cache that can use a persistent
+// medium. It is a generic design and can leverage any storage medium --
+// disk/SSD/NVM/RAM. The code has been kept generic, but significant
+// benchmark/design/development time has been spent to make sure the cache
+// performs appropriately for the respective storage medium.
+// The file defines
+// PersistentCacheTier : Implementation that handles an individual cache tier
+// PersistentTieredCache : Implementation that handles all tiers as a logical
+// unit
+//
+// PersistentTieredCache architecture:
+// +--------------------------+ PersistentTieredCache that handles multiple tiers
+// | +----------------+ |
+// | | RAM | PersistentCacheTier that handles RAM (VolatileCacheImpl)
+// | +----------------+ |
+// | | next |
+// | v |
+// | +----------------+ |
+// | | NVM | PersistentCacheTier implementation that handles NVM
+// | +----------------+ (BlockCacheImpl)
+// | | next |
+// | V |
+// | +----------------+ |
+// | | LE-SSD | PersistentCacheTier implementation that handles LE-SSD
+// | +----------------+ (BlockCacheImpl)
+// | | |
+// | V |
+// | null |
+// +--------------------------+
+// |
+// V
+// null
+namespace ROCKSDB_NAMESPACE {
+
+// Persistent Cache Config
+//
+// This struct captures all the options that are used to configure persistent
+// cache. Some of the terminologies used in naming the options are
+//
+// dispatch size :
+// This is the size in which IO is dispatched to the device
+//
+// write buffer size :
+// This is the size of an individual write buffer. Write buffers are grouped
+// to form a buffered file.
+//
+// cache size :
+// This is the logical maximum for the cache size
+//
+// qdepth :
+// This is the max number of IOs that can be issued to the device in parallel
+//
+// pipelining :
+// The writer code path follows a pipelined architecture, which means the
+// operations are handed off from one stage to another
+//
+// pipelining backlog size :
+// With the pipelined architecture, ops can back up in the pipeline queues.
+// This is the maximum backlog size, after which ops are dropped from the
+// queue
+struct PersistentCacheConfig {
+ explicit PersistentCacheConfig(
+ Env* const _env, const std::string& _path, const uint64_t _cache_size,
+ const std::shared_ptr<Logger>& _log,
+ const uint32_t _write_buffer_size = 1 * 1024 * 1024 /*1MB*/) {
+ env = _env;
+ clock = (env != nullptr) ? env->GetSystemClock().get()
+ : SystemClock::Default().get();
+ path = _path;
+ log = _log;
+ cache_size = _cache_size;
+ writer_dispatch_size = write_buffer_size = _write_buffer_size;
+ }
+
+ //
+ // Validate the settings. The intention is to catch erroneous settings ahead
+ // of time instead of violating invariants or causing deadlocks later.
+ //
+ Status ValidateSettings() const {
+ // (1) check pre-conditions for variables
+ if (!env || path.empty()) {
+ return Status::InvalidArgument("empty or null args");
+ }
+
+ // (2) assert size related invariants
+ // - cache size cannot be less than cache file size
+ // - individual write buffer size cannot be greater than cache file size
+ // - total write buffer size cannot be less than 2X cache file size
+ if (cache_size < cache_file_size || write_buffer_size >= cache_file_size ||
+ write_buffer_size * write_buffer_count() < 2 * cache_file_size) {
+ return Status::InvalidArgument("invalid cache size");
+ }
+
+ // (3) check writer settings
+ // - Queue depth cannot be 0
+ // - writer_dispatch_size cannot be greater than writer_buffer_size
+ // - dispatch size and buffer size need to be aligned
+ if (!writer_qdepth || writer_dispatch_size > write_buffer_size ||
+ write_buffer_size % writer_dispatch_size) {
+ return Status::InvalidArgument("invalid writer settings");
+ }
+
+ return Status::OK();
+ }
+
+ //
+ // Env abstraction to use for system level operations
+ //
+ Env* env;
+ SystemClock* clock;
+ //
+ // Path for the block cache where blocks are persisted
+ //
+ std::string path;
+
+ //
+ // Log handle for logging messages
+ //
+ std::shared_ptr<Logger> log;
+
+ //
+ // Enable direct IO for reading
+ //
+ bool enable_direct_reads = true;
+
+ //
+ // Enable direct IO for writing
+ //
+ bool enable_direct_writes = false;
+
+ //
+ // Logical cache size
+ //
+ uint64_t cache_size = std::numeric_limits<uint64_t>::max();
+
+ // cache-file-size
+ //
+ // The cache consists of multiple small files. This parameter defines the
+ // size of an individual cache file
+ //
+ // default: 100MB
+ uint32_t cache_file_size = 100ULL * 1024 * 1024;
+
+ // writer-qdepth
+ //
+ // The writers can issue IO to the devices in parallel. This parameter
+ // controls the max number of IOs that can be issued in parallel to the
+ // block device
+ //
+ // default: 1
+ uint32_t writer_qdepth = 1;
+
+ // pipeline-writes
+ //
+ // Writes optionally follow a pipelined architecture. This helps
+ // avoid regression in the eviction code path of the primary tier. This
+ // parameter defines if pipelining is enabled or disabled
+ //
+ // default: true
+ bool pipeline_writes = true;
+
+ // max-write-pipeline-backlog-size
+ //
+ // Max pipeline buffer size. This is the maximum backlog we can accumulate
+ // while waiting for writes. After the limit, new ops will be dropped.
+ //
+ // Default: 1GiB
+ uint64_t max_write_pipeline_backlog_size = 1ULL * 1024 * 1024 * 1024;
+
+ // write-buffer-size
+ //
+ // This is the size in which buffer slabs are allocated.
+ //
+ // Default: 1M
+ uint32_t write_buffer_size = 1ULL * 1024 * 1024;
+
+ // write-buffer-count
+ //
+ // This is the total number of buffer slabs. This is calculated as a factor
+ // of the file size in order to avoid deadlock.
+ size_t write_buffer_count() const {
+ assert(write_buffer_size);
+ return static_cast<size_t>((writer_qdepth + 1.2) * cache_file_size /
+ write_buffer_size);
+ }
+
+ // writer-dispatch-size
+ //
+ // The writer thread will dispatch the IO at the specified IO size
+ //
+ // default: 1M
+ uint64_t writer_dispatch_size = 1ULL * 1024 * 1024;
+
+ // is_compressed
+ //
+ // This option determines if the cache will run in compressed mode or
+ // uncompressed mode
+ bool is_compressed = true;
+
+ PersistentCacheConfig MakePersistentCacheConfig(
+ const std::string& path, const uint64_t size,
+ const std::shared_ptr<Logger>& log);
+
+ std::string ToString() const;
+};
+
+// Persistent Cache Tier
+//
+// This is a logical abstraction that defines a tier of the persistent cache.
+// Tiers can be stacked over one another. PersistentCache provides the basic
+// definition for accessing/storing data in the cache. PersistentCacheTier
+// extends the interface to enable management and stacking of tiers.
+class PersistentCacheTier : public PersistentCache {
+ public:
+ using Tier = std::shared_ptr<PersistentCacheTier>;
+
+ virtual ~PersistentCacheTier() {}
+
+ // Open the persistent cache tier
+ virtual Status Open();
+
+ // Close the persistent cache tier
+ virtual Status Close();
+
+ // Reserve space up to 'size' bytes
+ virtual bool Reserve(const size_t size);
+
+ // Erase a key from the cache
+ virtual bool Erase(const Slice& key);
+
+ // Print stats to string recursively
+ virtual std::string PrintStats();
+
+ virtual PersistentCache::StatsType Stats() override;
+
+ // Insert to page cache
+ virtual Status Insert(const Slice& page_key, const char* data,
+ const size_t size) override = 0;
+
+ // Lookup page cache by page identifier
+ virtual Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data,
+ size_t* size) override = 0;
+
+ // Does it store compressed data?
+ virtual bool IsCompressed() override = 0;
+
+ virtual std::string GetPrintableOptions() const override = 0;
+
+ virtual uint64_t NewId() override;
+
+ // Return a reference to next tier
+ virtual Tier& next_tier() { return next_tier_; }
+
+ // Set the value for next tier
+ virtual void set_next_tier(const Tier& tier) {
+ assert(!next_tier_);
+ next_tier_ = tier;
+ }
+
+ virtual void TEST_Flush() {
+ if (next_tier_) {
+ next_tier_->TEST_Flush();
+ }
+ }
+
+ private:
+ Tier next_tier_; // next tier
+ std::atomic<uint64_t> last_id_{1};
+};
+
+// PersistentTieredCache
+//
+// Abstraction that helps you construct tiers of persistent caches as a
+// unified cache. The tier(s) of cache will act as a single tier for ease of
+// management and support the PersistentCache methods for accessing data.
+class PersistentTieredCache : public PersistentCacheTier {
+ public:
+ virtual ~PersistentTieredCache();
+
+ Status Open() override;
+ Status Close() override;
+ bool Erase(const Slice& key) override;
+ std::string PrintStats() override;
+ PersistentCache::StatsType Stats() override;
+ Status Insert(const Slice& page_key, const char* data,
+ const size_t size) override;
+ Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data,
+ size_t* size) override;
+ bool IsCompressed() override;
+
+ std::string GetPrintableOptions() const override {
+ return "PersistentTieredCache";
+ }
+
+ void AddTier(const Tier& tier);
+
+ Tier& next_tier() override {
+ assert(!tiers_.empty());
+ return tiers_.back()->next_tier();
+ }
+
+ void set_next_tier(const Tier& tier) override {
+ assert(!tiers_.empty());
+ tiers_.back()->set_next_tier(tier);
+ }
+
+ void TEST_Flush() override {
+ assert(!tiers_.empty());
+ tiers_.front()->TEST_Flush();
+ PersistentCacheTier::TEST_Flush();
+ }
+
+ protected:
+ std::list<Tier> tiers_; // list of tiers top-down
+};
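+
+// A minimal stacking sketch (illustrative; VolatileCacheTier comes from
+// volatile_tier_impl.h, `block_tier` stands for any concrete secondary tier,
+// and `data`/`size` are the page payload):
+//
+//   auto tiered = std::make_shared<PersistentTieredCache>();
+//   tiered->AddTier(std::make_shared<VolatileCacheTier>());  // RAM tier first
+//   tiered->AddTier(block_tier);                             // slower tier next
+//   Status s = tiered->Open();
+//   if (s.ok()) {
+//     s = tiered->Insert("page-key", data, size);
+//   }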
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/utilities/persistent_cache/persistent_cache_util.h b/src/rocksdb/utilities/persistent_cache/persistent_cache_util.h
new file mode 100644
index 000000000..2a769652d
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/persistent_cache_util.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <limits>
+#include <list>
+
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+//
+// Simple synchronized queue implementation with the option of
+// bounding the queue
+//
+// On overflow, newly pushed elements are discarded
+//
+template <class T>
+class BoundedQueue {
+ public:
+ explicit BoundedQueue(
+ const size_t max_size = std::numeric_limits<size_t>::max())
+ : cond_empty_(&lock_), max_size_(max_size) {}
+
+ virtual ~BoundedQueue() {}
+
+ void Push(T&& t) {
+ MutexLock _(&lock_);
+ if (max_size_ != std::numeric_limits<size_t>::max() &&
+ size_ + t.Size() >= max_size_) {
+ // overflow
+ return;
+ }
+
+ size_ += t.Size();
+ q_.push_back(std::move(t));
+ cond_empty_.SignalAll();
+ }
+
+ T Pop() {
+ MutexLock _(&lock_);
+ while (q_.empty()) {
+ cond_empty_.Wait();
+ }
+
+ T t = std::move(q_.front());
+ size_ -= t.Size();
+ q_.pop_front();
+ return t;
+ }
+
+ size_t Size() const {
+ MutexLock _(&lock_);
+ return size_;
+ }
+
+ private:
+ mutable port::Mutex lock_;
+ port::CondVar cond_empty_;
+ std::list<T> q_;
+ size_t size_ = 0;
+ const size_t max_size_;
+};
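+
+// Illustrative usage sketch. The `IOJob` type below is hypothetical; the only
+// requirement on queued elements is a Size() method used for accounting:
+//
+//   struct IOJob {
+//     std::string payload;
+//     size_t Size() const { return payload.size(); }
+//   };
+//
+//   BoundedQueue<IOJob> q(/*max_size=*/64 * 1024 * 1024);
+//   q.Push(IOJob{"block data"});  // silently dropped once the bound is hit
+//   IOJob job = q.Pop();          // blocks until an element is available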
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc b/src/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc
new file mode 100644
index 000000000..45d2830aa
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc
@@ -0,0 +1,140 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#ifndef ROCKSDB_LITE
+
+#include "utilities/persistent_cache/volatile_tier_impl.h"
+
+#include <string>
+
+namespace ROCKSDB_NAMESPACE {
+
+void VolatileCacheTier::DeleteCacheData(VolatileCacheTier::CacheData* data) {
+ assert(data);
+ delete data;
+}
+
+VolatileCacheTier::~VolatileCacheTier() { index_.Clear(&DeleteCacheData); }
+
+PersistentCache::StatsType VolatileCacheTier::Stats() {
+ std::map<std::string, double> stat;
+ stat.insert({"persistent_cache.volatile_cache.hits",
+ static_cast<double>(stats_.cache_hits_)});
+ stat.insert({"persistent_cache.volatile_cache.misses",
+ static_cast<double>(stats_.cache_misses_)});
+ stat.insert({"persistent_cache.volatile_cache.inserts",
+ static_cast<double>(stats_.cache_inserts_)});
+ stat.insert({"persistent_cache.volatile_cache.evicts",
+ static_cast<double>(stats_.cache_evicts_)});
+ stat.insert({"persistent_cache.volatile_cache.hit_pct",
+ static_cast<double>(stats_.CacheHitPct())});
+ stat.insert({"persistent_cache.volatile_cache.miss_pct",
+ static_cast<double>(stats_.CacheMissPct())});
+
+ auto out = PersistentCacheTier::Stats();
+ out.push_back(stat);
+ return out;
+}
+
+Status VolatileCacheTier::Insert(const Slice& page_key, const char* data,
+ const size_t size) {
+ // precondition
+ assert(data);
+ assert(size);
+
+ // increment the size
+ size_ += size;
+
+ // check if we have overshot the limit, if so evict some space
+ while (size_ > max_size_) {
+ if (!Evict()) {
+ // unable to evict data, we give up so we don't spike read
+ // latency
+ assert(size_ >= size);
+ size_ -= size;
+ return Status::TryAgain("Unable to evict any data");
+ }
+ }
+
+ assert(size_ >= size);
+
+ // insert order: LRU, followed by index
+ std::string key(page_key.data(), page_key.size());
+ std::string value(data, size);
+ std::unique_ptr<CacheData> cache_data(
+ new CacheData(std::move(key), std::move(value)));
+ bool ok = index_.Insert(cache_data.get());
+ if (!ok) {
+ // decrement the size that we incremented ahead of time
+ assert(size_ >= size);
+ size_ -= size;
+ // failed to insert to cache, block already in cache
+ return Status::TryAgain("key already exists in volatile cache");
+ }
+
+ cache_data.release();
+ stats_.cache_inserts_++;
+ return Status::OK();
+}
+
+Status VolatileCacheTier::Lookup(const Slice& page_key,
+ std::unique_ptr<char[]>* result,
+ size_t* size) {
+ CacheData key(std::move(page_key.ToString()));
+ CacheData* kv;
+ bool ok = index_.Find(&key, &kv);
+ if (ok) {
+ // set return data
+ result->reset(new char[kv->value.size()]);
+ memcpy(result->get(), kv->value.c_str(), kv->value.size());
+ *size = kv->value.size();
+ // drop the reference on cache data
+ kv->refs_--;
+ // update stats
+ stats_.cache_hits_++;
+ return Status::OK();
+ }
+
+ stats_.cache_misses_++;
+
+ if (next_tier()) {
+ return next_tier()->Lookup(page_key, result, size);
+ }
+
+ return Status::NotFound("key not found in volatile cache");
+}
+
+bool VolatileCacheTier::Erase(const Slice& /*key*/) {
+ assert(!"not supported");
+ return true;
+}
+
+bool VolatileCacheTier::Evict() {
+ CacheData* edata = index_.Evict();
+ if (!edata) {
+ // not able to evict any object
+ return false;
+ }
+
+ stats_.cache_evicts_++;
+
+ // push the evicted object to the next level
+ if (next_tier()) {
+ // TODO: Should the insert error be ignored?
+ Status s = next_tier()->Insert(Slice(edata->key), edata->value.c_str(),
+ edata->value.size());
+ s.PermitUncheckedError();
+ }
+
+ // adjust size and destroy data
+ size_ -= edata->value.size();
+ delete edata;
+
+ return true;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/utilities/persistent_cache/volatile_tier_impl.h b/src/rocksdb/utilities/persistent_cache/volatile_tier_impl.h
new file mode 100644
index 000000000..09265e457
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/volatile_tier_impl.h
@@ -0,0 +1,141 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <atomic>
+#include <limits>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "rocksdb/cache.h"
+#include "utilities/persistent_cache/hash_table.h"
+#include "utilities/persistent_cache/hash_table_evictable.h"
+#include "utilities/persistent_cache/persistent_cache_tier.h"
+
+// VolatileCacheTier
+//
+// This file provides a persistent cache tier implementation for caching
+// key/values in RAM.
+//
+// key/values
+// |
+// V
+// +-------------------+
+// | VolatileCacheTier | Store in an evictable hash table
+// +-------------------+
+// |
+// V
+// on eviction
+// pushed to next tier
+//
+// The implementation is designed to be concurrent. The evictable hash table
+// implementation is not concurrent at this point though.
+//
+// The eviction algorithm is LRU
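+//
+// A minimal usage sketch (illustrative; `data`/`size` are the page payload):
+//
+//   auto tier = std::make_shared<VolatileCacheTier>(/*is_compressed=*/true,
+//                                                   /*max_size=*/64 * 1024 * 1024);
+//   Status s = tier->Insert("page-key", data, size);
+//   std::unique_ptr<char[]> value;
+//   size_t value_size = 0;
+//   if (s.ok()) {
+//     s = tier->Lookup("page-key", &value, &value_size);
+//   }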
+namespace ROCKSDB_NAMESPACE {
+
+class VolatileCacheTier : public PersistentCacheTier {
+ public:
+ explicit VolatileCacheTier(
+ const bool is_compressed = true,
+ const size_t max_size = std::numeric_limits<size_t>::max())
+ : is_compressed_(is_compressed), max_size_(max_size) {}
+
+ virtual ~VolatileCacheTier();
+
+ // insert to cache
+ Status Insert(const Slice& page_key, const char* data,
+ const size_t size) override;
+ // lookup key in cache
+ Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data,
+ size_t* size) override;
+
+ // is compressed cache ?
+ bool IsCompressed() override { return is_compressed_; }
+
+ // erase key from cache
+ bool Erase(const Slice& key) override;
+
+ std::string GetPrintableOptions() const override {
+ return "VolatileCacheTier";
+ }
+
+ // Expose stats as map
+ PersistentCache::StatsType Stats() override;
+
+ private:
+ //
+ // Cache data abstraction
+ //
+ struct CacheData : LRUElement<CacheData> {
+ explicit CacheData(CacheData&& rhs) noexcept
+ : key(std::move(rhs.key)), value(std::move(rhs.value)) {}
+
+ explicit CacheData(const std::string& _key, const std::string& _value = "")
+ : key(_key), value(_value) {}
+
+ virtual ~CacheData() {}
+
+ const std::string key;
+ const std::string value;
+ };
+
+ static void DeleteCacheData(CacheData* data);
+
+ //
+ // Index and LRU definition
+ //
+ struct CacheDataHash {
+ uint64_t operator()(const CacheData* obj) const {
+ assert(obj);
+ return std::hash<std::string>()(obj->key);
+ }
+ };
+
+ struct CacheDataEqual {
+ bool operator()(const CacheData* lhs, const CacheData* rhs) const {
+ assert(lhs);
+ assert(rhs);
+ return lhs->key == rhs->key;
+ }
+ };
+
+ struct Statistics {
+ std::atomic<uint64_t> cache_misses_{0};
+ std::atomic<uint64_t> cache_hits_{0};
+ std::atomic<uint64_t> cache_inserts_{0};
+ std::atomic<uint64_t> cache_evicts_{0};
+
+ double CacheHitPct() const {
+ auto lookups = cache_hits_ + cache_misses_;
+ return lookups ? 100 * cache_hits_ / static_cast<double>(lookups) : 0.0;
+ }
+
+ double CacheMissPct() const {
+ auto lookups = cache_hits_ + cache_misses_;
+ return lookups ? 100 * cache_misses_ / static_cast<double>(lookups) : 0.0;
+ }
+ };
+
+ using IndexType =
+ EvictableHashTable<CacheData, CacheDataHash, CacheDataEqual>;
+
+ // Evict LRU tail
+ bool Evict();
+
+ const bool is_compressed_ = true; // does it store compressed data
+ IndexType index_; // in-memory cache
+ std::atomic<uint64_t> max_size_{0}; // Maximum size of the cache
+ std::atomic<uint64_t> size_{0}; // Size of the cache
+ Statistics stats_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/utilities/simulator_cache/cache_simulator.cc b/src/rocksdb/utilities/simulator_cache/cache_simulator.cc
new file mode 100644
index 000000000..dc419e51a
--- /dev/null
+++ b/src/rocksdb/utilities/simulator_cache/cache_simulator.cc
@@ -0,0 +1,288 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "utilities/simulator_cache/cache_simulator.h"
+
+#include <algorithm>
+
+#include "db/dbformat.h"
+#include "rocksdb/trace_record.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+const std::string kGhostCachePrefix = "ghost_";
+} // namespace
+
+GhostCache::GhostCache(std::shared_ptr<Cache> sim_cache)
+ : sim_cache_(sim_cache) {}
+
+bool GhostCache::Admit(const Slice& lookup_key) {
+ auto handle = sim_cache_->Lookup(lookup_key);
+ if (handle != nullptr) {
+ sim_cache_->Release(handle);
+ return true;
+ }
+ // TODO: Should we check for errors here?
+ auto s = sim_cache_->Insert(lookup_key, /*value=*/nullptr, lookup_key.size(),
+ /*deleter=*/nullptr);
+ s.PermitUncheckedError();
+ return false;
+}
+
+CacheSimulator::CacheSimulator(std::unique_ptr<GhostCache>&& ghost_cache,
+ std::shared_ptr<Cache> sim_cache)
+ : ghost_cache_(std::move(ghost_cache)), sim_cache_(sim_cache) {}
+
+void CacheSimulator::Access(const BlockCacheTraceRecord& access) {
+ bool admit = true;
+ const bool is_user_access =
+ BlockCacheTraceHelper::IsUserAccess(access.caller);
+ bool is_cache_miss = true;
+ if (ghost_cache_ && !access.no_insert) {
+ admit = ghost_cache_->Admit(access.block_key);
+ }
+ auto handle = sim_cache_->Lookup(access.block_key);
+ if (handle != nullptr) {
+ sim_cache_->Release(handle);
+ is_cache_miss = false;
+ } else {
+ if (!access.no_insert && admit && access.block_size > 0) {
+ // Ignore errors on insert
+ auto s = sim_cache_->Insert(access.block_key, /*value=*/nullptr,
+ access.block_size,
+ /*deleter=*/nullptr);
+ s.PermitUncheckedError();
+ }
+ }
+ miss_ratio_stats_.UpdateMetrics(access.access_timestamp, is_user_access,
+ is_cache_miss);
+}
+
+void MissRatioStats::UpdateMetrics(uint64_t timestamp_in_ms,
+ bool is_user_access, bool is_cache_miss) {
+ uint64_t timestamp_in_seconds = timestamp_in_ms / kMicrosInSecond;
+ num_accesses_timeline_[timestamp_in_seconds] += 1;
+ num_accesses_ += 1;
+ if (num_misses_timeline_.find(timestamp_in_seconds) ==
+ num_misses_timeline_.end()) {
+ num_misses_timeline_[timestamp_in_seconds] = 0;
+ }
+ if (is_cache_miss) {
+ num_misses_ += 1;
+ num_misses_timeline_[timestamp_in_seconds] += 1;
+ }
+ if (is_user_access) {
+ user_accesses_ += 1;
+ if (is_cache_miss) {
+ user_misses_ += 1;
+ }
+ }
+}
+
+Cache::Priority PrioritizedCacheSimulator::ComputeBlockPriority(
+ const BlockCacheTraceRecord& access) const {
+ if (access.block_type == TraceType::kBlockTraceFilterBlock ||
+ access.block_type == TraceType::kBlockTraceIndexBlock ||
+ access.block_type == TraceType::kBlockTraceUncompressionDictBlock) {
+ return Cache::Priority::HIGH;
+ }
+ return Cache::Priority::LOW;
+}
+
+void PrioritizedCacheSimulator::AccessKVPair(
+ const Slice& key, uint64_t value_size, Cache::Priority priority,
+ const BlockCacheTraceRecord& access, bool no_insert, bool is_user_access,
+ bool* is_cache_miss, bool* admitted, bool update_metrics) {
+ assert(is_cache_miss);
+ assert(admitted);
+ *is_cache_miss = true;
+ *admitted = true;
+ if (ghost_cache_ && !no_insert) {
+ *admitted = ghost_cache_->Admit(key);
+ }
+ auto handle = sim_cache_->Lookup(key);
+ if (handle != nullptr) {
+ sim_cache_->Release(handle);
+ *is_cache_miss = false;
+ } else if (!no_insert && *admitted && value_size > 0) {
+ // TODO: Should we check for an error here?
+ auto s = sim_cache_->Insert(key, /*value=*/nullptr, value_size,
+ /*deleter=*/nullptr,
+ /*handle=*/nullptr, priority);
+ s.PermitUncheckedError();
+ }
+ if (update_metrics) {
+ miss_ratio_stats_.UpdateMetrics(access.access_timestamp, is_user_access,
+ *is_cache_miss);
+ }
+}
+
+void PrioritizedCacheSimulator::Access(const BlockCacheTraceRecord& access) {
+ bool is_cache_miss = true;
+ bool admitted = true;
+ AccessKVPair(access.block_key, access.block_size,
+ ComputeBlockPriority(access), access, access.no_insert,
+ BlockCacheTraceHelper::IsUserAccess(access.caller),
+ &is_cache_miss, &admitted, /*update_metrics=*/true);
+}
+
+void HybridRowBlockCacheSimulator::Access(const BlockCacheTraceRecord& access) {
+ // TODO (haoyu): We only support Get for now. We need to extend the tracing
+ // for MultiGet, i.e., non-data block accesses must log all keys in a
+ // MultiGet.
+ bool is_cache_miss = true;
+ bool admitted = false;
+ if (access.caller == TableReaderCaller::kUserGet &&
+ access.get_id != BlockCacheTraceHelper::kReservedGetId) {
+ // This is a Get request.
+ const std::string& row_key = BlockCacheTraceHelper::ComputeRowKey(access);
+ GetRequestStatus& status = getid_status_map_[access.get_id];
+ if (status.is_complete) {
+ // This Get request has already completed.
+ // Skip future accesses to its index/filter/data
+ // blocks. These block lookups are unnecessary if we observe a hit for the
+ // referenced key-value pair already. Thus, we treat these lookups as
+ // hits. This also ensures the total number of accesses is the same
+ // when comparing to other policies.
+ miss_ratio_stats_.UpdateMetrics(access.access_timestamp,
+ /*is_user_access=*/true,
+ /*is_cache_miss=*/false);
+ return;
+ }
+ if (status.row_key_status.find(row_key) == status.row_key_status.end()) {
+ // This is the first time that this key is accessed. Look up the key-value
+ // pair first. Do not update the miss/accesses metrics here since they will
+ // be updated later.
+ AccessKVPair(row_key, access.referenced_data_size, Cache::Priority::HIGH,
+ access,
+ /*no_insert=*/false,
+ /*is_user_access=*/true, &is_cache_miss, &admitted,
+ /*update_metrics=*/false);
+ InsertResult result = InsertResult::NO_INSERT;
+ if (admitted && access.referenced_data_size > 0) {
+ result = InsertResult::INSERTED;
+ } else if (admitted) {
+ result = InsertResult::ADMITTED;
+ }
+ status.row_key_status[row_key] = result;
+ }
+ if (!is_cache_miss) {
+ // A cache hit.
+ status.is_complete = true;
+ miss_ratio_stats_.UpdateMetrics(access.access_timestamp,
+ /*is_user_access=*/true,
+ /*is_cache_miss=*/false);
+ return;
+ }
+ // The row key-value pair observes a cache miss. We need to access its
+ // index/filter/data blocks.
+ InsertResult inserted = status.row_key_status[row_key];
+ AccessKVPair(
+ access.block_key, access.block_size, ComputeBlockPriority(access),
+ access,
+ /*no_insert=*/!insert_blocks_upon_row_kvpair_miss_ || access.no_insert,
+ /*is_user_access=*/true, &is_cache_miss, &admitted,
+ /*update_metrics=*/true);
+ if (access.referenced_data_size > 0 && inserted == InsertResult::ADMITTED) {
+ // TODO: Should we check for an error here?
+ auto s = sim_cache_->Insert(row_key, /*value=*/nullptr,
+ access.referenced_data_size,
+ /*deleter=*/nullptr,
+ /*handle=*/nullptr, Cache::Priority::HIGH);
+ s.PermitUncheckedError();
+ status.row_key_status[row_key] = InsertResult::INSERTED;
+ }
+ return;
+ }
+ AccessKVPair(access.block_key, access.block_size,
+ ComputeBlockPriority(access), access, access.no_insert,
+ BlockCacheTraceHelper::IsUserAccess(access.caller),
+ &is_cache_miss, &admitted, /*update_metrics=*/true);
+}
+
+BlockCacheTraceSimulator::BlockCacheTraceSimulator(
+ uint64_t warmup_seconds, uint32_t downsample_ratio,
+ const std::vector<CacheConfiguration>& cache_configurations)
+ : warmup_seconds_(warmup_seconds),
+ downsample_ratio_(downsample_ratio),
+ cache_configurations_(cache_configurations) {}
+
+Status BlockCacheTraceSimulator::InitializeCaches() {
+ for (auto const& config : cache_configurations_) {
+ for (auto cache_capacity : config.cache_capacities) {
+ // Scale down the cache capacity since the trace contains accesses on only
+ // 1/'downsample_ratio' of the blocks.
+ uint64_t simulate_cache_capacity = cache_capacity / downsample_ratio_;
+ std::shared_ptr<CacheSimulator> sim_cache;
+ std::unique_ptr<GhostCache> ghost_cache;
+ std::string cache_name = config.cache_name;
+ if (cache_name.find(kGhostCachePrefix) != std::string::npos) {
+ ghost_cache.reset(new GhostCache(
+ NewLRUCache(config.ghost_cache_capacity, /*num_shard_bits=*/1,
+ /*strict_capacity_limit=*/false,
+ /*high_pri_pool_ratio=*/0)));
+ cache_name = cache_name.substr(kGhostCachePrefix.size());
+ }
+ if (cache_name == "lru") {
+ sim_cache = std::make_shared<CacheSimulator>(
+ std::move(ghost_cache),
+ NewLRUCache(simulate_cache_capacity, config.num_shard_bits,
+ /*strict_capacity_limit=*/false,
+ /*high_pri_pool_ratio=*/0));
+ } else if (cache_name == "lru_priority") {
+ sim_cache = std::make_shared<PrioritizedCacheSimulator>(
+ std::move(ghost_cache),
+ NewLRUCache(simulate_cache_capacity, config.num_shard_bits,
+ /*strict_capacity_limit=*/false,
+ /*high_pri_pool_ratio=*/0.5));
+ } else if (cache_name == "lru_hybrid") {
+ sim_cache = std::make_shared<HybridRowBlockCacheSimulator>(
+ std::move(ghost_cache),
+ NewLRUCache(simulate_cache_capacity, config.num_shard_bits,
+ /*strict_capacity_limit=*/false,
+ /*high_pri_pool_ratio=*/0.5),
+ /*insert_blocks_upon_row_kvpair_miss=*/true);
+ } else if (cache_name == "lru_hybrid_no_insert_on_row_miss") {
+ sim_cache = std::make_shared<HybridRowBlockCacheSimulator>(
+ std::move(ghost_cache),
+ NewLRUCache(simulate_cache_capacity, config.num_shard_bits,
+ /*strict_capacity_limit=*/false,
+ /*high_pri_pool_ratio=*/0.5),
+ /*insert_blocks_upon_row_kvpair_miss=*/false);
+ } else {
+ // Not supported.
+ return Status::InvalidArgument("Unknown cache name " +
+ config.cache_name);
+ }
+ sim_caches_[config].push_back(sim_cache);
+ }
+ }
+ return Status::OK();
+}
+
+void BlockCacheTraceSimulator::Access(const BlockCacheTraceRecord& access) {
+ if (trace_start_time_ == 0) {
+ trace_start_time_ = access.access_timestamp;
+ }
+ // access.access_timestamp is in microseconds.
+ if (!warmup_complete_ &&
+ trace_start_time_ + warmup_seconds_ * kMicrosInSecond <=
+ access.access_timestamp) {
+ for (auto& config_caches : sim_caches_) {
+ for (auto& sim_cache : config_caches.second) {
+ sim_cache->reset_counter();
+ }
+ }
+ warmup_complete_ = true;
+ }
+ for (auto& config_caches : sim_caches_) {
+ for (auto& sim_cache : config_caches.second) {
+ sim_cache->Access(access);
+ }
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/simulator_cache/cache_simulator.h b/src/rocksdb/utilities/simulator_cache/cache_simulator.h
new file mode 100644
index 000000000..6d4979013
--- /dev/null
+++ b/src/rocksdb/utilities/simulator_cache/cache_simulator.h
@@ -0,0 +1,231 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <unordered_map>
+
+#include "cache/lru_cache.h"
+#include "trace_replay/block_cache_tracer.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A cache configuration provided by the user.
+struct CacheConfiguration {
+ std::string cache_name;  // e.g. "lru", "lru_priority", "lru_hybrid".
+ uint32_t num_shard_bits;
+ uint64_t ghost_cache_capacity; // ghost cache capacity in bytes.
+ std::vector<uint64_t>
+ cache_capacities; // simulate cache capacities in bytes.
+
+ bool operator==(const CacheConfiguration& o) const {
+ return cache_name == o.cache_name && num_shard_bits == o.num_shard_bits &&
+ ghost_cache_capacity == o.ghost_cache_capacity;
+ }
+ bool operator<(const CacheConfiguration& o) const {
+ return cache_name < o.cache_name ||
+ (cache_name == o.cache_name && num_shard_bits < o.num_shard_bits) ||
+ (cache_name == o.cache_name && num_shard_bits == o.num_shard_bits &&
+ ghost_cache_capacity < o.ghost_cache_capacity);
+ }
+};
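+
+// A configuration sketch (illustrative): simulate a ghost-admission hybrid LRU
+// cache at two capacities.
+//
+//   CacheConfiguration config;
+//   config.cache_name = "ghost_lru_hybrid";
+//   config.num_shard_bits = 6;
+//   config.ghost_cache_capacity = 1ULL << 30;
+//   config.cache_capacities = {1ULL << 30, 4ULL << 30};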
+
+class MissRatioStats {
+ public:
+ void reset_counter() {
+ num_misses_ = 0;
+ num_accesses_ = 0;
+ user_accesses_ = 0;
+ user_misses_ = 0;
+ }
+ double miss_ratio() const {
+ if (num_accesses_ == 0) {
+ return -1;
+ }
+ return static_cast<double>(num_misses_ * 100.0 / num_accesses_);
+ }
+ uint64_t total_accesses() const { return num_accesses_; }
+ uint64_t total_misses() const { return num_misses_; }
+
+ const std::map<uint64_t, uint64_t>& num_accesses_timeline() const {
+ return num_accesses_timeline_;
+ }
+
+ const std::map<uint64_t, uint64_t>& num_misses_timeline() const {
+ return num_misses_timeline_;
+ }
+
+ double user_miss_ratio() const {
+ if (user_accesses_ == 0) {
+ return -1;
+ }
+ return static_cast<double>(user_misses_ * 100.0 / user_accesses_);
+ }
+ uint64_t user_accesses() const { return user_accesses_; }
+ uint64_t user_misses() const { return user_misses_; }
+
+ void UpdateMetrics(uint64_t timestamp_in_ms, bool is_user_access,
+ bool is_cache_miss);
+
+ private:
+ uint64_t num_accesses_ = 0;
+ uint64_t num_misses_ = 0;
+ uint64_t user_accesses_ = 0;
+ uint64_t user_misses_ = 0;
+
+ std::map<uint64_t, uint64_t> num_accesses_timeline_;
+ std::map<uint64_t, uint64_t> num_misses_timeline_;
+};
+
+// A ghost cache admits an entry on its second access.
+class GhostCache {
+ public:
+ explicit GhostCache(std::shared_ptr<Cache> sim_cache);
+ ~GhostCache() = default;
+ // No copy and move.
+ GhostCache(const GhostCache&) = delete;
+ GhostCache& operator=(const GhostCache&) = delete;
+ GhostCache(GhostCache&&) = delete;
+ GhostCache& operator=(GhostCache&&) = delete;
+
+ // Returns true if the lookup_key is in the ghost cache.
+ // Returns false otherwise.
+ bool Admit(const Slice& lookup_key);
+
+ private:
+ std::shared_ptr<Cache> sim_cache_;
+};
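+
+// For example, starting from an empty ghost cache (illustrative):
+//
+//   GhostCache ghost(NewLRUCache(/*capacity=*/1 << 20));
+//   ghost.Admit("k1");  // false: the first access only records the key
+//   ghost.Admit("k1");  // true: the key is now resident in the ghost cache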
+
+// A cache simulator that runs against a block cache trace.
+class CacheSimulator {
+ public:
+ CacheSimulator(std::unique_ptr<GhostCache>&& ghost_cache,
+ std::shared_ptr<Cache> sim_cache);
+ virtual ~CacheSimulator() = default;
+ // No copy and move.
+ CacheSimulator(const CacheSimulator&) = delete;
+ CacheSimulator& operator=(const CacheSimulator&) = delete;
+ CacheSimulator(CacheSimulator&&) = delete;
+ CacheSimulator& operator=(CacheSimulator&&) = delete;
+
+ virtual void Access(const BlockCacheTraceRecord& access);
+
+ void reset_counter() { miss_ratio_stats_.reset_counter(); }
+
+ const MissRatioStats& miss_ratio_stats() const { return miss_ratio_stats_; }
+
+ protected:
+ MissRatioStats miss_ratio_stats_;
+ std::unique_ptr<GhostCache> ghost_cache_;
+ std::shared_ptr<Cache> sim_cache_;
+};
+
+// A prioritized cache simulator that runs against a block cache trace.
+// It inserts missing index/filter/uncompression-dictionary blocks with high
+// priority in the cache.
+class PrioritizedCacheSimulator : public CacheSimulator {
+ public:
+ PrioritizedCacheSimulator(std::unique_ptr<GhostCache>&& ghost_cache,
+ std::shared_ptr<Cache> sim_cache)
+ : CacheSimulator(std::move(ghost_cache), sim_cache) {}
+ void Access(const BlockCacheTraceRecord& access) override;
+
+ protected:
+ // Access the key-value pair and set *is_cache_miss to true upon a cache miss.
+ void AccessKVPair(const Slice& key, uint64_t value_size,
+ Cache::Priority priority,
+ const BlockCacheTraceRecord& access, bool no_insert,
+ bool is_user_access, bool* is_cache_miss, bool* admitted,
+ bool update_metrics);
+
+ Cache::Priority ComputeBlockPriority(
+ const BlockCacheTraceRecord& access) const;
+};
+
+// A hybrid row and block cache simulator. It looks up/inserts key-value pairs
+// referenced by Get/MultiGet requests, and not their accessed index/filter/data
+// blocks.
+//
+// Upon a Get/MultiGet request, it looks up the referenced key first.
+// If it observes a cache hit, future block accesses on this key-value pair are
+// skipped since the request is served already. Otherwise, it continues to look
+// up/insert its index/filter/data blocks. It also inserts the referenced
+// key-value pair in the cache for future lookups.
+class HybridRowBlockCacheSimulator : public PrioritizedCacheSimulator {
+ public:
+ HybridRowBlockCacheSimulator(std::unique_ptr<GhostCache>&& ghost_cache,
+ std::shared_ptr<Cache> sim_cache,
+ bool insert_blocks_upon_row_kvpair_miss)
+ : PrioritizedCacheSimulator(std::move(ghost_cache), sim_cache),
+ insert_blocks_upon_row_kvpair_miss_(
+ insert_blocks_upon_row_kvpair_miss) {}
+ void Access(const BlockCacheTraceRecord& access) override;
+
+ private:
+ enum InsertResult : char {
+ INSERTED,
+ ADMITTED,
+ NO_INSERT,
+ };
+
+ // We set is_complete to true when the referenced row-key of a get request
+ // hits the cache. If is_complete is true, we treat future accesses of this
+ // get request as hits.
+ //
+ // For each row key, it stores an enum. It is INSERTED when the
+ // kv-pair has been inserted into the cache, ADMITTED if it should be inserted
+ // but has not been, and NO_INSERT if it should not be inserted.
+ //
+ // A kv-pair is in ADMITTED state when we encounter this kv-pair but do not
+ // know its size. This may happen if the first access on the referenced key is
+ // an index/filter block.
+ struct GetRequestStatus {
+ bool is_complete = false;
+ std::map<std::string, InsertResult> row_key_status;
+ };
+
+ // Maps a get_id to the status of the row keys it references.
+ std::map<uint64_t, GetRequestStatus> getid_status_map_;
+ bool insert_blocks_upon_row_kvpair_miss_;
+};
+
+// A block cache simulator that reports miss ratio curves given a set of cache
+// configurations.
+class BlockCacheTraceSimulator {
+ public:
+ // warmup_seconds: The number of seconds to warm up the simulated caches. The
+ // hit/miss counters are reset after the warmup completes.
+ BlockCacheTraceSimulator(
+ uint64_t warmup_seconds, uint32_t downsample_ratio,
+ const std::vector<CacheConfiguration>& cache_configurations);
+ ~BlockCacheTraceSimulator() = default;
+ // No copy and move.
+ BlockCacheTraceSimulator(const BlockCacheTraceSimulator&) = delete;
+ BlockCacheTraceSimulator& operator=(const BlockCacheTraceSimulator&) = delete;
+ BlockCacheTraceSimulator(BlockCacheTraceSimulator&&) = delete;
+ BlockCacheTraceSimulator& operator=(BlockCacheTraceSimulator&&) = delete;
+
+ Status InitializeCaches();
+
+ void Access(const BlockCacheTraceRecord& access);
+
+ const std::map<CacheConfiguration,
+ std::vector<std::shared_ptr<CacheSimulator>>>&
+ sim_caches() const {
+ return sim_caches_;
+ }
+
+ private:
+ const uint64_t warmup_seconds_;
+ const uint32_t downsample_ratio_;
+ const std::vector<CacheConfiguration> cache_configurations_;
+
+ bool warmup_complete_ = false;
+ std::map<CacheConfiguration, std::vector<std::shared_ptr<CacheSimulator>>>
+ sim_caches_;
+ uint64_t trace_start_time_ = 0;
+};
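+
+// A minimal driving sketch (illustrative; `config` is a CacheConfiguration as
+// above and `record` a decoded BlockCacheTraceRecord from a trace file):
+//
+//   BlockCacheTraceSimulator simulator(/*warmup_seconds=*/3600,
+//                                      /*downsample_ratio=*/1, {config});
+//   Status s = simulator.InitializeCaches();
+//   if (s.ok()) {
+//     simulator.Access(record);  // called once per trace record
+//   }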
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/simulator_cache/cache_simulator_test.cc b/src/rocksdb/utilities/simulator_cache/cache_simulator_test.cc
new file mode 100644
index 000000000..2bc057c92
--- /dev/null
+++ b/src/rocksdb/utilities/simulator_cache/cache_simulator_test.cc
@@ -0,0 +1,497 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "utilities/simulator_cache/cache_simulator.h"
+
+#include <cstdlib>
+
+#include "rocksdb/env.h"
+#include "rocksdb/trace_record.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+const std::string kBlockKeyPrefix = "test-block-";
+const std::string kRefKeyPrefix = "test-get-";
+const std::string kRefKeySequenceNumber = std::string(8, 'c');
+const uint64_t kGetId = 1;
+const uint64_t kGetBlockId = 100;
+const uint64_t kCompactionBlockId = 1000;
+const uint64_t kCacheSize = 1024 * 1024 * 1024;
+const uint64_t kGhostCacheSize = 1024 * 1024;
+} // namespace
+
+class CacheSimulatorTest : public testing::Test {
+ public:
+ const size_t kNumBlocks = 5;
+ const size_t kValueSize = 1000;
+
+ CacheSimulatorTest() { env_ = ROCKSDB_NAMESPACE::Env::Default(); }
+
+ BlockCacheTraceRecord GenerateGetRecord(uint64_t getid) {
+ BlockCacheTraceRecord record;
+ record.block_type = TraceType::kBlockTraceDataBlock;
+ record.block_size = 4096;
+ record.block_key = kBlockKeyPrefix + std::to_string(kGetBlockId);
+ record.access_timestamp = env_->NowMicros();
+ record.cf_id = 0;
+ record.cf_name = "test";
+ record.caller = TableReaderCaller::kUserGet;
+ record.level = 6;
+ record.sst_fd_number = 0;
+ record.get_id = getid;
+ record.is_cache_hit = false;
+ record.no_insert = false;
+ record.referenced_key =
+ kRefKeyPrefix + std::to_string(kGetId) + kRefKeySequenceNumber;
+ record.referenced_key_exist_in_block = true;
+ record.referenced_data_size = 100;
+ record.num_keys_in_block = 300;
+ return record;
+ }
+
+ BlockCacheTraceRecord GenerateCompactionRecord() {
+ BlockCacheTraceRecord record;
+ record.block_type = TraceType::kBlockTraceDataBlock;
+ record.block_size = 4096;
+ record.block_key = kBlockKeyPrefix + std::to_string(kCompactionBlockId);
+ record.access_timestamp = env_->NowMicros();
+ record.cf_id = 0;
+ record.cf_name = "test";
+ record.caller = TableReaderCaller::kCompaction;
+ record.level = 6;
+ record.sst_fd_number = kCompactionBlockId;
+ record.is_cache_hit = false;
+ record.no_insert = true;
+ return record;
+ }
+
+ void AssertCache(std::shared_ptr<Cache> sim_cache,
+ const MissRatioStats& miss_ratio_stats,
+ uint64_t expected_usage, uint64_t expected_num_accesses,
+ uint64_t expected_num_misses,
+ std::vector<std::string> blocks,
+ std::vector<std::string> keys) {
+ EXPECT_EQ(expected_usage, sim_cache->GetUsage());
+ EXPECT_EQ(expected_num_accesses, miss_ratio_stats.total_accesses());
+ EXPECT_EQ(expected_num_misses, miss_ratio_stats.total_misses());
+ for (auto const& block : blocks) {
+ auto handle = sim_cache->Lookup(block);
+ EXPECT_NE(nullptr, handle);
+ sim_cache->Release(handle);
+ }
+ for (auto const& key : keys) {
+ std::string row_key = kRefKeyPrefix + key + kRefKeySequenceNumber;
+ auto handle =
+ sim_cache->Lookup("0_" + ExtractUserKey(row_key).ToString());
+ EXPECT_NE(nullptr, handle);
+ sim_cache->Release(handle);
+ }
+ }
+
+ Env* env_;
+};
+
+TEST_F(CacheSimulatorTest, GhostCache) {
+ const std::string key1 = "test1";
+ const std::string key2 = "test2";
+ std::unique_ptr<GhostCache> ghost_cache(new GhostCache(
+ NewLRUCache(/*capacity=*/kGhostCacheSize, /*num_shard_bits=*/1,
+ /*strict_capacity_limit=*/false,
+ /*high_pri_pool_ratio=*/0)));
+ EXPECT_FALSE(ghost_cache->Admit(key1));
+ EXPECT_TRUE(ghost_cache->Admit(key1));
+ EXPECT_TRUE(ghost_cache->Admit(key1));
+ EXPECT_FALSE(ghost_cache->Admit(key2));
+ EXPECT_TRUE(ghost_cache->Admit(key2));
+}
+
+TEST_F(CacheSimulatorTest, CacheSimulator) {
+ const BlockCacheTraceRecord& access = GenerateGetRecord(kGetId);
+ const BlockCacheTraceRecord& compaction_access = GenerateCompactionRecord();
+ std::shared_ptr<Cache> sim_cache =
+ NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1,
+ /*strict_capacity_limit=*/false,
+ /*high_pri_pool_ratio=*/0);
+ std::unique_ptr<CacheSimulator> cache_simulator(
+ new CacheSimulator(nullptr, sim_cache));
+ cache_simulator->Access(access);
+ cache_simulator->Access(access);
+ ASSERT_EQ(2, cache_simulator->miss_ratio_stats().total_accesses());
+ ASSERT_EQ(50, cache_simulator->miss_ratio_stats().miss_ratio());
+ ASSERT_EQ(2, cache_simulator->miss_ratio_stats().user_accesses());
+ ASSERT_EQ(50, cache_simulator->miss_ratio_stats().user_miss_ratio());
+
+ cache_simulator->Access(compaction_access);
+ cache_simulator->Access(compaction_access);
+ ASSERT_EQ(4, cache_simulator->miss_ratio_stats().total_accesses());
+ ASSERT_EQ(75, cache_simulator->miss_ratio_stats().miss_ratio());
+ ASSERT_EQ(2, cache_simulator->miss_ratio_stats().user_accesses());
+ ASSERT_EQ(50, cache_simulator->miss_ratio_stats().user_miss_ratio());
+
+ cache_simulator->reset_counter();
+ ASSERT_EQ(0, cache_simulator->miss_ratio_stats().total_accesses());
+ ASSERT_EQ(-1, cache_simulator->miss_ratio_stats().miss_ratio());
+ auto handle = sim_cache->Lookup(access.block_key);
+ ASSERT_NE(nullptr, handle);
+ sim_cache->Release(handle);
+ handle = sim_cache->Lookup(compaction_access.block_key);
+ ASSERT_EQ(nullptr, handle);
+}
+
+TEST_F(CacheSimulatorTest, GhostCacheSimulator) {
+ const BlockCacheTraceRecord& access = GenerateGetRecord(kGetId);
+ std::unique_ptr<GhostCache> ghost_cache(new GhostCache(
+ NewLRUCache(/*capacity=*/kGhostCacheSize, /*num_shard_bits=*/1,
+ /*strict_capacity_limit=*/false,
+ /*high_pri_pool_ratio=*/0)));
+ std::unique_ptr<CacheSimulator> cache_simulator(new CacheSimulator(
+ std::move(ghost_cache),
+ NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1,
+ /*strict_capacity_limit=*/false,
+ /*high_pri_pool_ratio=*/0)));
+ cache_simulator->Access(access);
+ cache_simulator->Access(access);
+ ASSERT_EQ(2, cache_simulator->miss_ratio_stats().total_accesses());
+ // Both of them will be misses since we have a ghost cache.
+ ASSERT_EQ(100, cache_simulator->miss_ratio_stats().miss_ratio());
+}
+
+TEST_F(CacheSimulatorTest, PrioritizedCacheSimulator) {
+ const BlockCacheTraceRecord& access = GenerateGetRecord(kGetId);
+ std::shared_ptr<Cache> sim_cache =
+ NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1,
+ /*strict_capacity_limit=*/false,
+ /*high_pri_pool_ratio=*/0);
+ std::unique_ptr<PrioritizedCacheSimulator> cache_simulator(
+ new PrioritizedCacheSimulator(nullptr, sim_cache));
+ cache_simulator->Access(access);
+ cache_simulator->Access(access);
+ ASSERT_EQ(2, cache_simulator->miss_ratio_stats().total_accesses());
+ ASSERT_EQ(50, cache_simulator->miss_ratio_stats().miss_ratio());
+
+ auto handle = sim_cache->Lookup(access.block_key);
+ ASSERT_NE(nullptr, handle);
+ sim_cache->Release(handle);
+}
+
+TEST_F(CacheSimulatorTest, GhostPrioritizedCacheSimulator) {
+ const BlockCacheTraceRecord& access = GenerateGetRecord(kGetId);
+ std::unique_ptr<GhostCache> ghost_cache(new GhostCache(
+ NewLRUCache(/*capacity=*/kGhostCacheSize, /*num_shard_bits=*/1,
+ /*strict_capacity_limit=*/false,
+ /*high_pri_pool_ratio=*/0)));
+ std::unique_ptr<PrioritizedCacheSimulator> cache_simulator(
+ new PrioritizedCacheSimulator(
+ std::move(ghost_cache),
+ NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1,
+ /*strict_capacity_limit=*/false,
+ /*high_pri_pool_ratio=*/0)));
+ cache_simulator->Access(access);
+ cache_simulator->Access(access);
+ ASSERT_EQ(2, cache_simulator->miss_ratio_stats().total_accesses());
+ // Both of them will be misses since we have a ghost cache.
+ ASSERT_EQ(100, cache_simulator->miss_ratio_stats().miss_ratio());
+}
+
+TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulator) {
+ uint64_t block_id = 100;
+ BlockCacheTraceRecord first_get = GenerateGetRecord(kGetId);
+ first_get.get_from_user_specified_snapshot = true;
+ BlockCacheTraceRecord second_get = GenerateGetRecord(kGetId + 1);
+ second_get.referenced_data_size = 0;
+ second_get.referenced_key_exist_in_block = false;
+ second_get.get_from_user_specified_snapshot = true;
+ BlockCacheTraceRecord third_get = GenerateGetRecord(kGetId + 2);
+ third_get.referenced_data_size = 0;
+ third_get.referenced_key_exist_in_block = false;
+ third_get.referenced_key = kRefKeyPrefix + "third_get";
+ // We didn't find the referenced key in the third get.
+ third_get.referenced_key_exist_in_block = false;
+ third_get.referenced_data_size = 0;
+ std::shared_ptr<Cache> sim_cache =
+ NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1,
+ /*strict_capacity_limit=*/false,
+ /*high_pri_pool_ratio=*/0);
+ std::unique_ptr<HybridRowBlockCacheSimulator> cache_simulator(
+ new HybridRowBlockCacheSimulator(
+ nullptr, sim_cache, /*insert_blocks_row_kvpair_misses=*/true));
+ // The first get request accesses 10 blocks. We should only report 10 accesses
+ // and 100% miss.
+ for (uint32_t i = 0; i < 10; i++) {
+ first_get.block_key = kBlockKeyPrefix + std::to_string(block_id);
+ cache_simulator->Access(first_get);
+ block_id++;
+ }
+
+ ASSERT_EQ(10, cache_simulator->miss_ratio_stats().total_accesses());
+ ASSERT_EQ(100, cache_simulator->miss_ratio_stats().miss_ratio());
+ ASSERT_EQ(10, cache_simulator->miss_ratio_stats().user_accesses());
+ ASSERT_EQ(100, cache_simulator->miss_ratio_stats().user_miss_ratio());
+ auto handle =
+ sim_cache->Lookup(std::to_string(first_get.sst_fd_number) + "_" +
+ ExtractUserKey(first_get.referenced_key).ToString());
+ ASSERT_NE(nullptr, handle);
+ sim_cache->Release(handle);
+ for (uint32_t i = 100; i < block_id; i++) {
+ handle = sim_cache->Lookup(kBlockKeyPrefix + std::to_string(i));
+ ASSERT_NE(nullptr, handle);
+ sim_cache->Release(handle);
+ }
+
+ // The second get request accesses the same key. We should report 15
+ // accesses and a 66% miss ratio (10 misses out of 15 accesses).
+ // We do not consider these 5 block lookups as misses since the row hits the
+ // cache.
+ for (uint32_t i = 0; i < 5; i++) {
+ second_get.block_key = kBlockKeyPrefix + std::to_string(block_id);
+ cache_simulator->Access(second_get);
+ block_id++;
+ }
+ ASSERT_EQ(15, cache_simulator->miss_ratio_stats().total_accesses());
+ ASSERT_EQ(66, static_cast<uint64_t>(
+ cache_simulator->miss_ratio_stats().miss_ratio()));
+ ASSERT_EQ(15, cache_simulator->miss_ratio_stats().user_accesses());
+ ASSERT_EQ(66, static_cast<uint64_t>(
+ cache_simulator->miss_ratio_stats().user_miss_ratio()));
+ handle =
+ sim_cache->Lookup(std::to_string(second_get.sst_fd_number) + "_" +
+ ExtractUserKey(second_get.referenced_key).ToString());
+ ASSERT_NE(nullptr, handle);
+ sim_cache->Release(handle);
+ for (uint32_t i = 100; i < block_id; i++) {
+ handle = sim_cache->Lookup(kBlockKeyPrefix + std::to_string(i));
+ if (i < 110) {
+ ASSERT_NE(nullptr, handle) << i;
+ sim_cache->Release(handle);
+ } else {
+ ASSERT_EQ(nullptr, handle) << i;
+ }
+ }
+
+ // The third get is on a different key and does not have a size.
+ // This key should not be inserted into the cache.
+ for (uint32_t i = 0; i < 5; i++) {
+ third_get.block_key = kBlockKeyPrefix + std::to_string(block_id);
+ cache_simulator->Access(third_get);
+ block_id++;
+ }
+ ASSERT_EQ(20, cache_simulator->miss_ratio_stats().total_accesses());
+ ASSERT_EQ(75, static_cast<uint64_t>(
+ cache_simulator->miss_ratio_stats().miss_ratio()));
+ ASSERT_EQ(20, cache_simulator->miss_ratio_stats().user_accesses());
+ ASSERT_EQ(75, static_cast<uint64_t>(
+ cache_simulator->miss_ratio_stats().user_miss_ratio()));
+ // Assert that the third key is not inserted into the cache.
+ handle = sim_cache->Lookup(std::to_string(third_get.sst_fd_number) + "_" +
+ third_get.referenced_key);
+ ASSERT_EQ(nullptr, handle);
+ for (uint32_t i = 100; i < block_id; i++) {
+ if (i < 110 || i >= 115) {
+ handle = sim_cache->Lookup(kBlockKeyPrefix + std::to_string(i));
+ ASSERT_NE(nullptr, handle) << i;
+ sim_cache->Release(handle);
+ } else {
+ handle = sim_cache->Lookup(kBlockKeyPrefix + std::to_string(i));
+ ASSERT_EQ(nullptr, handle) << i;
+ }
+ }
+}
+
+TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulatorGetTest) {
+ BlockCacheTraceRecord get = GenerateGetRecord(kGetId);
+ get.block_size = 1;
+ get.referenced_data_size = 0;
+ get.access_timestamp = 0;
+ get.block_key = "1";
+ get.get_id = 1;
+ get.get_from_user_specified_snapshot = false;
+ get.referenced_key =
+ kRefKeyPrefix + std::to_string(1) + kRefKeySequenceNumber;
+ get.no_insert = false;
+ get.sst_fd_number = 0;
+ get.get_from_user_specified_snapshot = false;
+
+ LRUCacheOptions co;
+ co.capacity = 16;
+ co.num_shard_bits = 1;
+ co.strict_capacity_limit = false;
+ co.high_pri_pool_ratio = 0;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ std::shared_ptr<Cache> sim_cache = NewLRUCache(co);
+ std::unique_ptr<HybridRowBlockCacheSimulator> cache_simulator(
+ new HybridRowBlockCacheSimulator(
+ nullptr, sim_cache, /*insert_blocks_row_kvpair_misses=*/true));
+ // Expect a miss; the row key-value pair is not inserted since it does not
+ // have a size.
+ cache_simulator->Access(get);
+ AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 1, 1, 1, {"1"},
+ {});
+ get.access_timestamp += 1;
+ get.referenced_data_size = 1;
+ get.block_key = "2";
+ cache_simulator->Access(get);
+ AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 3, 2, 2,
+ {"1", "2"}, {"1"});
+ get.access_timestamp += 1;
+ get.block_key = "3";
+ // K1 should not be inserted again.
+ cache_simulator->Access(get);
+ AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 4, 3, 3,
+ {"1", "2", "3"}, {"1"});
+
+ // A second get request referencing the same key.
+ get.access_timestamp += 1;
+ get.get_id = 2;
+ get.block_key = "4";
+ get.referenced_data_size = 0;
+ cache_simulator->Access(get);
+ AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 4, 4, 3,
+ {"1", "2", "3"}, {"1"});
+
+ // A third get request searches three files, three different keys.
+ // And the second key observes a hit.
+ get.access_timestamp += 1;
+ get.referenced_data_size = 1;
+ get.get_id = 3;
+ get.block_key = "3";
+ get.referenced_key = kRefKeyPrefix + "2" + kRefKeySequenceNumber;
+ // K2 should observe a miss. Block 3 observes a hit.
+ cache_simulator->Access(get);
+ AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 5, 5, 3,
+ {"1", "2", "3"}, {"1", "2"});
+
+ get.access_timestamp += 1;
+ get.referenced_data_size = 1;
+ get.get_id = 3;
+ get.block_key = "4";
+ get.referenced_data_size = 1;
+ get.referenced_key = kRefKeyPrefix + "1" + kRefKeySequenceNumber;
+ // K1 should observe a hit.
+ cache_simulator->Access(get);
+ AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 5, 6, 3,
+ {"1", "2", "3"}, {"1", "2"});
+
+ get.access_timestamp += 1;
+ get.referenced_data_size = 1;
+ get.get_id = 3;
+ get.block_key = "4";
+ get.referenced_data_size = 1;
+ get.referenced_key = kRefKeyPrefix + "3" + kRefKeySequenceNumber;
+ // K3 should observe a miss.
+ // However, as the get is already complete, we should not access K3 any more.
+ cache_simulator->Access(get);
+ AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 5, 7, 3,
+ {"1", "2", "3"}, {"1", "2"});
+
+ // A fourth get request searches one file and two blocks. One row key.
+ get.access_timestamp += 1;
+ get.get_id = 4;
+ get.block_key = "5";
+ get.referenced_key = kRefKeyPrefix + "4" + kRefKeySequenceNumber;
+ get.referenced_data_size = 1;
+ cache_simulator->Access(get);
+ AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 7, 8, 4,
+ {"1", "2", "3", "5"}, {"1", "2", "4"});
+ for (auto const& key : {"1", "2", "4"}) {
+ auto handle = sim_cache->Lookup("0_" + kRefKeyPrefix + key);
+ ASSERT_NE(nullptr, handle);
+ sim_cache->Release(handle);
+ }
+
+ // A bunch of insertions which evict cached row keys.
+ for (uint32_t i = 6; i < 100; i++) {
+ get.access_timestamp += 1;
+ get.get_id = 0;
+ get.block_key = std::to_string(i);
+ cache_simulator->Access(get);
+ }
+
+ get.get_id = 4;
+ // A different block.
+ get.block_key = "100";
+ // Same row key and should not be inserted again.
+ get.referenced_key = kRefKeyPrefix + "4" + kRefKeySequenceNumber;
+ get.referenced_data_size = 1;
+ cache_simulator->Access(get);
+ AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 16, 103, 99, {},
+ {});
+ for (auto const& key : {"1", "2", "4"}) {
+ auto handle = sim_cache->Lookup("0_" + kRefKeyPrefix + key);
+ ASSERT_EQ(nullptr, handle);
+ }
+}
+
+TEST_F(CacheSimulatorTest, HybridRowBlockNoInsertCacheSimulator) {
+ uint64_t block_id = 100;
+ BlockCacheTraceRecord first_get = GenerateGetRecord(kGetId);
+ std::shared_ptr<Cache> sim_cache =
+ NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1,
+ /*strict_capacity_limit=*/false,
+ /*high_pri_pool_ratio=*/0);
+ std::unique_ptr<HybridRowBlockCacheSimulator> cache_simulator(
+ new HybridRowBlockCacheSimulator(
+ nullptr, sim_cache, /*insert_blocks_row_kvpair_misses=*/false));
+ for (uint32_t i = 0; i < 9; i++) {
+ first_get.block_key = kBlockKeyPrefix + std::to_string(block_id);
+ cache_simulator->Access(first_get);
+ block_id++;
+ }
+ auto handle =
+ sim_cache->Lookup(std::to_string(first_get.sst_fd_number) + "_" +
+ ExtractUserKey(first_get.referenced_key).ToString());
+ ASSERT_NE(nullptr, handle);
+ sim_cache->Release(handle);
+ // All blocks are missing from the cache since insert_blocks_row_kvpair_misses
+ // is set to false.
+ for (uint32_t i = 100; i < block_id; i++) {
+ handle = sim_cache->Lookup(kBlockKeyPrefix + std::to_string(i));
+ ASSERT_EQ(nullptr, handle);
+ }
+}
+
+TEST_F(CacheSimulatorTest, GhostHybridRowBlockCacheSimulator) {
+ std::unique_ptr<GhostCache> ghost_cache(new GhostCache(
+ NewLRUCache(/*capacity=*/kGhostCacheSize, /*num_shard_bits=*/1,
+ /*strict_capacity_limit=*/false,
+ /*high_pri_pool_ratio=*/0)));
+ const BlockCacheTraceRecord& first_get = GenerateGetRecord(kGetId);
+ const BlockCacheTraceRecord& second_get = GenerateGetRecord(kGetId + 1);
+ const BlockCacheTraceRecord& third_get = GenerateGetRecord(kGetId + 2);
+ std::unique_ptr<HybridRowBlockCacheSimulator> cache_simulator(
+ new HybridRowBlockCacheSimulator(
+ std::move(ghost_cache),
+ NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1,
+ /*strict_capacity_limit=*/false,
+ /*high_pri_pool_ratio=*/0),
+ /*insert_blocks_row_kvpair_misses=*/false));
+ // Two get requests access the same key.
+ cache_simulator->Access(first_get);
+ cache_simulator->Access(second_get);
+ ASSERT_EQ(2, cache_simulator->miss_ratio_stats().total_accesses());
+ ASSERT_EQ(100, cache_simulator->miss_ratio_stats().miss_ratio());
+ ASSERT_EQ(2, cache_simulator->miss_ratio_stats().user_accesses());
+ ASSERT_EQ(100, cache_simulator->miss_ratio_stats().user_miss_ratio());
+ // We insert the key-value pair upon the second get request. A third get
+ // request should observe a hit.
+ for (uint32_t i = 0; i < 10; i++) {
+ cache_simulator->Access(third_get);
+ }
+ ASSERT_EQ(12, cache_simulator->miss_ratio_stats().total_accesses());
+ ASSERT_EQ(16, static_cast<uint64_t>(
+ cache_simulator->miss_ratio_stats().miss_ratio()));
+ ASSERT_EQ(12, cache_simulator->miss_ratio_stats().user_accesses());
+ ASSERT_EQ(16, static_cast<uint64_t>(
+ cache_simulator->miss_ratio_stats().user_miss_ratio()));
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/utilities/simulator_cache/sim_cache.cc b/src/rocksdb/utilities/simulator_cache/sim_cache.cc
new file mode 100644
index 000000000..a883b52e7
--- /dev/null
+++ b/src/rocksdb/utilities/simulator_cache/sim_cache.cc
@@ -0,0 +1,364 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/utilities/sim_cache.h"
+
+#include <atomic>
+#include <iomanip>
+
+#include "file/writable_file_writer.h"
+#include "monitoring/statistics.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+class CacheActivityLogger {
+ public:
+ CacheActivityLogger()
+ : activity_logging_enabled_(false), max_logging_size_(0) {}
+
+ ~CacheActivityLogger() {
+ MutexLock l(&mutex_);
+
+ StopLoggingInternal();
+ bg_status_.PermitUncheckedError();
+ }
+
+ Status StartLogging(const std::string& activity_log_file, Env* env,
+ uint64_t max_logging_size = 0) {
+ assert(activity_log_file != "");
+ assert(env != nullptr);
+
+ Status status;
+ FileOptions file_opts;
+
+ MutexLock l(&mutex_);
+
+ // Stop existing logging if any
+ StopLoggingInternal();
+
+ // Open log file
+ status = WritableFileWriter::Create(env->GetFileSystem(), activity_log_file,
+ file_opts, &file_writer_, nullptr);
+ if (!status.ok()) {
+ return status;
+ }
+
+ max_logging_size_ = max_logging_size;
+ activity_logging_enabled_.store(true);
+
+ return status;
+ }
+
+ void StopLogging() {
+ MutexLock l(&mutex_);
+
+ StopLoggingInternal();
+ }
+
+ void ReportLookup(const Slice& key) {
+ if (activity_logging_enabled_.load() == false) {
+ return;
+ }
+
+ std::ostringstream oss;
+ // line format: "LOOKUP - <KEY>"
+ oss << "LOOKUP - " << key.ToString(true) << std::endl;
+
+ MutexLock l(&mutex_);
+ Status s = file_writer_->Append(oss.str());
+ if (!s.ok() && bg_status_.ok()) {
+ bg_status_ = s;
+ }
+ if (MaxLoggingSizeReached() || !bg_status_.ok()) {
+ // Stop logging if we have reached the max file size or
+ // encountered an error
+ StopLoggingInternal();
+ }
+ }
+
+ void ReportAdd(const Slice& key, size_t size) {
+ if (activity_logging_enabled_.load() == false) {
+ return;
+ }
+
+ std::ostringstream oss;
+ // line format: "ADD - <KEY> - <KEY-SIZE>"
+ oss << "ADD - " << key.ToString(true) << " - " << size << std::endl;
+ MutexLock l(&mutex_);
+ Status s = file_writer_->Append(oss.str());
+ if (!s.ok() && bg_status_.ok()) {
+ bg_status_ = s;
+ }
+
+ if (MaxLoggingSizeReached() || !bg_status_.ok()) {
+ // Stop logging if we have reached the max file size or
+ // encountered an error
+ StopLoggingInternal();
+ }
+ }
+
+ Status& bg_status() {
+ MutexLock l(&mutex_);
+ return bg_status_;
+ }
+
+ private:
+ bool MaxLoggingSizeReached() {
+ mutex_.AssertHeld();
+
+ return (max_logging_size_ > 0 &&
+ file_writer_->GetFileSize() >= max_logging_size_);
+ }
+
+ void StopLoggingInternal() {
+ mutex_.AssertHeld();
+
+ if (!activity_logging_enabled_) {
+ return;
+ }
+
+ activity_logging_enabled_.store(false);
+ Status s = file_writer_->Close();
+ if (!s.ok() && bg_status_.ok()) {
+ bg_status_ = s;
+ }
+ }
+
+ // Mutex to sync writes to file_writer, and all following
+ // class data members
+ port::Mutex mutex_;
+ // Indicates if logging is currently enabled
+ // atomic to allow reads without mutex
+ std::atomic<bool> activity_logging_enabled_;
+ // When reached, we will stop logging and close the file
+ // Value of 0 means unlimited
+ uint64_t max_logging_size_;
+ std::unique_ptr<WritableFileWriter> file_writer_;
+ Status bg_status_;
+};
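+
+// The resulting activity log is line-oriented, with keys hex-encoded by
+// Slice::ToString(true), e.g.:
+//
+//   LOOKUP - 6B657931
+//   ADD - 6B657931 - 4096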
+
+// SimCacheImpl definition
+class SimCacheImpl : public SimCache {
+ public:
+ // cache is the real cache (e.g. a sharded LRU cache);
+ // sim_cache is used as the key-only cache for simulation
+ SimCacheImpl(std::shared_ptr<Cache> sim_cache, std::shared_ptr<Cache> cache)
+ : cache_(cache),
+ key_only_cache_(sim_cache),
+ miss_times_(0),
+ hit_times_(0),
+ stats_(nullptr) {}
+
+ ~SimCacheImpl() override {}
+ void SetCapacity(size_t capacity) override { cache_->SetCapacity(capacity); }
+
+ void SetStrictCapacityLimit(bool strict_capacity_limit) override {
+ cache_->SetStrictCapacityLimit(strict_capacity_limit);
+ }
+
+ using Cache::Insert;
+ Status Insert(const Slice& key, void* value, size_t charge,
+ void (*deleter)(const Slice& key, void* value), Handle** handle,
+ Priority priority) override {
+ // The handle and value passed in are for the real cache, so we pass nullptr
+ // to key_only_cache_ for both instead. Also, the deleter function pointer
+ // will be called by the user to perform some external operation which should
+ // be applied only once. Thus key_only_cache_ accepts an empty deleter.
+ // (A lambda without captures can be assigned to a function pointer.)
+ Handle* h = key_only_cache_->Lookup(key);
+ if (h == nullptr) {
+ // TODO: Check for error here?
+ auto s = key_only_cache_->Insert(
+ key, nullptr, charge, [](const Slice& /*k*/, void* /*v*/) {}, nullptr,
+ priority);
+ s.PermitUncheckedError();
+ } else {
+ key_only_cache_->Release(h);
+ }
+
+ cache_activity_logger_.ReportAdd(key, charge);
+ if (!cache_) {
+ return Status::OK();
+ }
+ return cache_->Insert(key, value, charge, deleter, handle, priority);
+ }
+
+ using Cache::Lookup;
+ Handle* Lookup(const Slice& key, Statistics* stats) override {
+ Handle* h = key_only_cache_->Lookup(key);
+ if (h != nullptr) {
+ key_only_cache_->Release(h);
+ inc_hit_counter();
+ RecordTick(stats, SIM_BLOCK_CACHE_HIT);
+ } else {
+ inc_miss_counter();
+ RecordTick(stats, SIM_BLOCK_CACHE_MISS);
+ }
+
+ cache_activity_logger_.ReportLookup(key);
+ if (!cache_) {
+ return nullptr;
+ }
+ return cache_->Lookup(key, stats);
+ }
+
+ bool Ref(Handle* handle) override { return cache_->Ref(handle); }
+
+ using Cache::Release;
+ bool Release(Handle* handle, bool erase_if_last_ref = false) override {
+ return cache_->Release(handle, erase_if_last_ref);
+ }
+
+ void Erase(const Slice& key) override {
+ cache_->Erase(key);
+ key_only_cache_->Erase(key);
+ }
+
+ void* Value(Handle* handle) override { return cache_->Value(handle); }
+
+ uint64_t NewId() override { return cache_->NewId(); }
+
+ size_t GetCapacity() const override { return cache_->GetCapacity(); }
+
+ bool HasStrictCapacityLimit() const override {
+ return cache_->HasStrictCapacityLimit();
+ }
+
+ size_t GetUsage() const override { return cache_->GetUsage(); }
+
+ size_t GetUsage(Handle* handle) const override {
+ return cache_->GetUsage(handle);
+ }
+
+ size_t GetCharge(Handle* handle) const override {
+ return cache_->GetCharge(handle);
+ }
+
+ DeleterFn GetDeleter(Handle* handle) const override {
+ return cache_->GetDeleter(handle);
+ }
+
+ size_t GetPinnedUsage() const override { return cache_->GetPinnedUsage(); }
+
+ void DisownData() override {
+ cache_->DisownData();
+ key_only_cache_->DisownData();
+ }
+
+ void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+ bool thread_safe) override {
+ // Only apply to cache_ since key_only_cache_ doesn't hold values.
+ cache_->ApplyToAllCacheEntries(callback, thread_safe);
+ }
+
+ void ApplyToAllEntries(
+ const std::function<void(const Slice& key, void* value, size_t charge,
+ DeleterFn deleter)>& callback,
+ const ApplyToAllEntriesOptions& opts) override {
+ cache_->ApplyToAllEntries(callback, opts);
+ }
+
+ void EraseUnRefEntries() override {
+ cache_->EraseUnRefEntries();
+ key_only_cache_->EraseUnRefEntries();
+ }
+
+ size_t GetSimCapacity() const override {
+ return key_only_cache_->GetCapacity();
+ }
+ size_t GetSimUsage() const override { return key_only_cache_->GetUsage(); }
+ void SetSimCapacity(size_t capacity) override {
+ key_only_cache_->SetCapacity(capacity);
+ }
+
+ uint64_t get_miss_counter() const override {
+ return miss_times_.load(std::memory_order_relaxed);
+ }
+
+ uint64_t get_hit_counter() const override {
+ return hit_times_.load(std::memory_order_relaxed);
+ }
+
+ void reset_counter() override {
+ miss_times_.store(0, std::memory_order_relaxed);
+ hit_times_.store(0, std::memory_order_relaxed);
+ SetTickerCount(stats_, SIM_BLOCK_CACHE_HIT, 0);
+ SetTickerCount(stats_, SIM_BLOCK_CACHE_MISS, 0);
+ }
+
+ std::string ToString() const override {
+ std::ostringstream oss;
+ oss << "SimCache MISSes: " << get_miss_counter() << std::endl;
+ oss << "SimCache HITs: " << get_hit_counter() << std::endl;
+ auto lookups = get_miss_counter() + get_hit_counter();
+ oss << "SimCache HITRATE: " << std::fixed << std::setprecision(2)
+ << (lookups == 0 ? 0 : get_hit_counter() * 100.0f / lookups)
+ << std::endl;
+ return oss.str();
+ }
+
+ std::string GetPrintableOptions() const override {
+ std::ostringstream oss;
+ oss << " cache_options:" << std::endl;
+ oss << cache_->GetPrintableOptions();
+ oss << " sim_cache_options:" << std::endl;
+ oss << key_only_cache_->GetPrintableOptions();
+ return oss.str();
+ }
+
+ Status StartActivityLogging(const std::string& activity_log_file, Env* env,
+ uint64_t max_logging_size = 0) override {
+ return cache_activity_logger_.StartLogging(activity_log_file, env,
+ max_logging_size);
+ }
+
+ void StopActivityLogging() override { cache_activity_logger_.StopLogging(); }
+
+ Status GetActivityLoggingStatus() override {
+ return cache_activity_logger_.bg_status();
+ }
+
+ private:
+ std::shared_ptr<Cache> cache_;
+ std::shared_ptr<Cache> key_only_cache_;
+ std::atomic<uint64_t> miss_times_;
+ std::atomic<uint64_t> hit_times_;
+ Statistics* stats_;
+ CacheActivityLogger cache_activity_logger_;
+
+ void inc_miss_counter() {
+ miss_times_.fetch_add(1, std::memory_order_relaxed);
+ }
+ void inc_hit_counter() { hit_times_.fetch_add(1, std::memory_order_relaxed); }
+};
+
+} // end anonymous namespace
+
+// For instrumentation purposes, use NewSimCache instead
+std::shared_ptr<SimCache> NewSimCache(std::shared_ptr<Cache> cache,
+ size_t sim_capacity, int num_shard_bits) {
+ LRUCacheOptions co;
+ co.capacity = sim_capacity;
+ co.num_shard_bits = num_shard_bits;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ return NewSimCache(NewLRUCache(co), cache, num_shard_bits);
+}
+
+std::shared_ptr<SimCache> NewSimCache(std::shared_ptr<Cache> sim_cache,
+ std::shared_ptr<Cache> cache,
+ int num_shard_bits) {
+ if (num_shard_bits >= 20) {
+ return nullptr; // the cache cannot be sharded into too many fine-grained pieces
+ }
+ return std::make_shared<SimCacheImpl>(sim_cache, cache);
+}
+
+} // namespace ROCKSDB_NAMESPACE
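A minimal usage sketch of the SimCache API defined above: wrap the real block cache with NewSimCache, attach it as the block cache, and read the simulated hit/miss counters after a workload. The 8 MB and 1 GB capacities and the helper name are illustrative assumptions, not part of the source.

#include "rocksdb/cache.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"
#include "rocksdb/utilities/sim_cache.h"

using namespace ROCKSDB_NAMESPACE;

// Simulate how a 1 GB block cache would behave while only spending ~8 MB of
// real memory on cached blocks (plus key-only overhead in the SimCache).
void ConfigureSimulatedBlockCache(Options* options,
                                  std::shared_ptr<SimCache>* out_sim) {
  std::shared_ptr<Cache> real_cache = NewLRUCache(8 << 20 /* 8 MB */);
  std::shared_ptr<SimCache> sim =
      NewSimCache(real_cache, 1 << 30 /* simulated 1 GB */, 0 /* shard bits */);
  BlockBasedTableOptions table_options;
  table_options.block_cache = sim;
  options->table_factory.reset(NewBlockBasedTableFactory(table_options));
  *out_sim = sim;
}

// After running a workload, the simulated counters can be read, e.g.:
//   printf("%s", sim->ToString().c_str());   // misses, hits, hit rate
//   uint64_t hits = sim->get_hit_counter();
//   uint64_t misses = sim->get_miss_counter();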
diff --git a/src/rocksdb/utilities/simulator_cache/sim_cache_test.cc b/src/rocksdb/utilities/simulator_cache/sim_cache_test.cc
new file mode 100644
index 000000000..2e37cd347
--- /dev/null
+++ b/src/rocksdb/utilities/simulator_cache/sim_cache_test.cc
@@ -0,0 +1,226 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/utilities/sim_cache.h"
+
+#include <cstdlib>
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SimCacheTest : public DBTestBase {
+ private:
+ size_t miss_count_ = 0;
+ size_t hit_count_ = 0;
+ size_t insert_count_ = 0;
+ size_t failure_count_ = 0;
+
+ public:
+ const size_t kNumBlocks = 5;
+ const size_t kValueSize = 1000;
+
+ SimCacheTest() : DBTestBase("sim_cache_test", /*env_do_fsync=*/true) {}
+
+ BlockBasedTableOptions GetTableOptions() {
+ BlockBasedTableOptions table_options;
+ // Set a small enough block size so that each key-value gets its own block.
+ table_options.block_size = 1;
+ return table_options;
+ }
+
+ Options GetOptions(const BlockBasedTableOptions& table_options) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ // options.compression = kNoCompression;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ return options;
+ }
+
+ void InitTable(const Options& /*options*/) {
+ std::string value(kValueSize, 'a');
+ for (size_t i = 0; i < kNumBlocks * 2; i++) {
+ ASSERT_OK(Put(std::to_string(i), value.c_str()));
+ }
+ }
+
+ void RecordCacheCounters(const Options& options) {
+ miss_count_ = TestGetTickerCount(options, BLOCK_CACHE_MISS);
+ hit_count_ = TestGetTickerCount(options, BLOCK_CACHE_HIT);
+ insert_count_ = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ failure_count_ = TestGetTickerCount(options, BLOCK_CACHE_ADD_FAILURES);
+ }
+
+ void CheckCacheCounters(const Options& options, size_t expected_misses,
+ size_t expected_hits, size_t expected_inserts,
+ size_t expected_failures) {
+ size_t new_miss_count = TestGetTickerCount(options, BLOCK_CACHE_MISS);
+ size_t new_hit_count = TestGetTickerCount(options, BLOCK_CACHE_HIT);
+ size_t new_insert_count = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ size_t new_failure_count =
+ TestGetTickerCount(options, BLOCK_CACHE_ADD_FAILURES);
+ ASSERT_EQ(miss_count_ + expected_misses, new_miss_count);
+ ASSERT_EQ(hit_count_ + expected_hits, new_hit_count);
+ ASSERT_EQ(insert_count_ + expected_inserts, new_insert_count);
+ ASSERT_EQ(failure_count_ + expected_failures, new_failure_count);
+ miss_count_ = new_miss_count;
+ hit_count_ = new_hit_count;
+ insert_count_ = new_insert_count;
+ failure_count_ = new_failure_count;
+ }
+};
+
+TEST_F(SimCacheTest, SimCache) {
+ ReadOptions read_options;
+ auto table_options = GetTableOptions();
+ auto options = GetOptions(table_options);
+ InitTable(options);
+ LRUCacheOptions co;
+ co.capacity = 0;
+ co.num_shard_bits = 0;
+ co.strict_capacity_limit = false;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ std::shared_ptr<SimCache> simCache = NewSimCache(NewLRUCache(co), 20000, 0);
+ table_options.block_cache = simCache;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ RecordCacheCounters(options);
+ // Account for misses caused by the cache entry stats collector.
+ uint64_t base_misses = simCache->get_miss_counter();
+
+ std::vector<std::unique_ptr<Iterator>> iterators(kNumBlocks);
+ Iterator* iter = nullptr;
+
+ // Load blocks into cache.
+ for (size_t i = 0; i < kNumBlocks; i++) {
+ iter = db_->NewIterator(read_options);
+ iter->Seek(std::to_string(i));
+ ASSERT_OK(iter->status());
+ CheckCacheCounters(options, 1, 0, 1, 0);
+ iterators[i].reset(iter);
+ }
+ ASSERT_EQ(kNumBlocks, simCache->get_hit_counter() +
+ simCache->get_miss_counter() - base_misses);
+ ASSERT_EQ(0, simCache->get_hit_counter());
+ size_t usage = simCache->GetUsage();
+ ASSERT_LT(0, usage);
+ ASSERT_EQ(usage, simCache->GetSimUsage());
+ simCache->SetCapacity(usage);
+ ASSERT_EQ(usage, simCache->GetPinnedUsage());
+
+ // Test with strict capacity limit.
+ simCache->SetStrictCapacityLimit(true);
+ iter = db_->NewIterator(read_options);
+ iter->Seek(std::to_string(kNumBlocks * 2 - 1));
+ ASSERT_TRUE(iter->status().IsMemoryLimit());
+ CheckCacheCounters(options, 1, 0, 0, 1);
+ delete iter;
+ iter = nullptr;
+
+ // Release iterators and access cache again.
+ for (size_t i = 0; i < kNumBlocks; i++) {
+ iterators[i].reset();
+ CheckCacheCounters(options, 0, 0, 0, 0);
+ }
+ // Add kNumBlocks again
+ for (size_t i = 0; i < kNumBlocks; i++) {
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_options));
+ it->Seek(std::to_string(i));
+ ASSERT_OK(it->status());
+ CheckCacheCounters(options, 0, 1, 0, 0);
+ }
+ ASSERT_EQ(5, simCache->get_hit_counter());
+ for (size_t i = kNumBlocks; i < kNumBlocks * 2; i++) {
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_options));
+ it->Seek(std::to_string(i));
+ ASSERT_OK(it->status());
+ CheckCacheCounters(options, 1, 0, 1, 0);
+ }
+ ASSERT_EQ(0, simCache->GetPinnedUsage());
+ ASSERT_EQ(3 * kNumBlocks + 1, simCache->get_hit_counter() +
+ simCache->get_miss_counter() - base_misses);
+ ASSERT_EQ(6, simCache->get_hit_counter());
+}
+
+TEST_F(SimCacheTest, SimCacheLogging) {
+ auto table_options = GetTableOptions();
+ auto options = GetOptions(table_options);
+ options.disable_auto_compactions = true;
+ LRUCacheOptions co;
+ co.capacity = 1024 * 1024;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ std::shared_ptr<SimCache> sim_cache = NewSimCache(NewLRUCache(co), 20000, 0);
+ table_options.block_cache = sim_cache;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+
+ int num_block_entries = 20;
+ for (int i = 0; i < num_block_entries; i++) {
+ ASSERT_OK(Put(Key(i), "val"));
+ ASSERT_OK(Flush());
+ }
+
+ std::string log_file = test::PerThreadDBPath(env_, "cache_log.txt");
+ ASSERT_OK(sim_cache->StartActivityLogging(log_file, env_));
+ for (int i = 0; i < num_block_entries; i++) {
+ ASSERT_EQ(Get(Key(i)), "val");
+ }
+ for (int i = 0; i < num_block_entries; i++) {
+ ASSERT_EQ(Get(Key(i)), "val");
+ }
+ sim_cache->StopActivityLogging();
+ ASSERT_OK(sim_cache->GetActivityLoggingStatus());
+
+ std::string file_contents = "";
+ ASSERT_OK(ReadFileToString(env_, log_file, &file_contents));
+ std::istringstream contents(file_contents);
+
+ int lookup_num = 0;
+ int add_num = 0;
+
+ std::string line;
+ // count number of lookups and additions
+ while (std::getline(contents, line)) {
+ // check if the line starts with LOOKUP or ADD
+ if (line.rfind("LOOKUP -", 0) == 0) {
+ ++lookup_num;
+ }
+ if (line.rfind("ADD -", 0) == 0) {
+ ++add_num;
+ }
+ }
+
+ // We asked for every block twice
+ ASSERT_EQ(lookup_num, num_block_entries * 2);
+
+ // We added every block only once, since the cache can hold all blocks
+ ASSERT_EQ(add_num, num_block_entries);
+
+ // Log things again but stop logging automatically after reaching 512 bytes
+ int max_size = 512;
+ ASSERT_OK(sim_cache->StartActivityLogging(log_file, env_, max_size));
+ for (int it = 0; it < 10; it++) {
+ for (int i = 0; i < num_block_entries; i++) {
+ ASSERT_EQ(Get(Key(i)), "val");
+ }
+ }
+ ASSERT_OK(sim_cache->GetActivityLoggingStatus());
+
+ uint64_t fsize = 0;
+ ASSERT_OK(env_->GetFileSize(log_file, &fsize));
+ // error margin of 100 bytes
+ ASSERT_LT(fsize, max_size + 100);
+ ASSERT_GT(fsize, max_size - 100);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc b/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc
new file mode 100644
index 000000000..16f33934d
--- /dev/null
+++ b/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc
@@ -0,0 +1,227 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "utilities/table_properties_collectors/compact_on_deletion_collector.h"
+
+#include <memory>
+#include <sstream>
+
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/options_type.h"
+#include "rocksdb/utilities/table_properties_collectors.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+#ifndef ROCKSDB_LITE
+
+CompactOnDeletionCollector::CompactOnDeletionCollector(
+ size_t sliding_window_size, size_t deletion_trigger, double deletion_ratio)
+ : bucket_size_((sliding_window_size + kNumBuckets - 1) / kNumBuckets),
+ current_bucket_(0),
+ num_keys_in_current_bucket_(0),
+ num_deletions_in_observation_window_(0),
+ deletion_trigger_(deletion_trigger),
+ deletion_ratio_(deletion_ratio),
+ deletion_ratio_enabled_(deletion_ratio > 0 && deletion_ratio <= 1),
+ need_compaction_(false),
+ finished_(false) {
+ memset(num_deletions_in_buckets_, 0, sizeof(size_t) * kNumBuckets);
+}
+
+// AddUserKey() will be called when a new key/value pair is inserted into the
+// table.
+// @params key the user key that is inserted into the table.
+// @params value the value that is inserted into the table.
+// @params file_size file size up to now
+Status CompactOnDeletionCollector::AddUserKey(const Slice& /*key*/,
+ const Slice& /*value*/,
+ EntryType type,
+ SequenceNumber /*seq*/,
+ uint64_t /*file_size*/) {
+ assert(!finished_);
+ if (!bucket_size_ && !deletion_ratio_enabled_) {
+ // This collector is effectively disabled
+ return Status::OK();
+ }
+
+ if (need_compaction_) {
+ // If the output file already needs to be compacted, skip the check.
+ return Status::OK();
+ }
+
+ if (deletion_ratio_enabled_) {
+ total_entries_++;
+ if (type == kEntryDelete) {
+ deletion_entries_++;
+ }
+ }
+
+ if (bucket_size_) {
+ if (num_keys_in_current_bucket_ == bucket_size_) {
+ // When the current bucket is full, advance the cursor of the
+ // ring buffer to the next bucket.
+ current_bucket_ = (current_bucket_ + 1) % kNumBuckets;
+
+ // Update the current count of observed deletion keys by excluding
+ // the number of deletion keys in the oldest bucket in the
+ // observation window.
+ assert(num_deletions_in_observation_window_ >=
+ num_deletions_in_buckets_[current_bucket_]);
+ num_deletions_in_observation_window_ -=
+ num_deletions_in_buckets_[current_bucket_];
+ num_deletions_in_buckets_[current_bucket_] = 0;
+ num_keys_in_current_bucket_ = 0;
+ }
+
+ num_keys_in_current_bucket_++;
+ if (type == kEntryDelete) {
+ num_deletions_in_observation_window_++;
+ num_deletions_in_buckets_[current_bucket_]++;
+ if (num_deletions_in_observation_window_ >= deletion_trigger_) {
+ need_compaction_ = true;
+ }
+ }
+ }
+
+ return Status::OK();
+}
+
+Status CompactOnDeletionCollector::Finish(
+ UserCollectedProperties* /*properties*/) {
+ if (!need_compaction_ && deletion_ratio_enabled_ && total_entries_ > 0) {
+ double ratio = static_cast<double>(deletion_entries_) / total_entries_;
+ need_compaction_ = ratio >= deletion_ratio_;
+ }
+ finished_ = true;
+ return Status::OK();
+}
+static std::unordered_map<std::string, OptionTypeInfo>
+ on_deletion_collector_type_info = {
+#ifndef ROCKSDB_LITE
+ {"window_size",
+ {0, OptionType::kUnknown, OptionVerificationType::kNormal,
+ OptionTypeFlags::kCompareNever | OptionTypeFlags::kMutable,
+ [](const ConfigOptions&, const std::string&, const std::string& value,
+ void* addr) {
+ auto* factory =
+ static_cast<CompactOnDeletionCollectorFactory*>(addr);
+ factory->SetWindowSize(ParseSizeT(value));
+ return Status::OK();
+ },
+ [](const ConfigOptions&, const std::string&, const void* addr,
+ std::string* value) {
+ const auto* factory =
+ static_cast<const CompactOnDeletionCollectorFactory*>(addr);
+ *value = std::to_string(factory->GetWindowSize());
+ return Status::OK();
+ },
+ nullptr}},
+ {"deletion_trigger",
+ {0, OptionType::kUnknown, OptionVerificationType::kNormal,
+ OptionTypeFlags::kCompareNever | OptionTypeFlags::kMutable,
+ [](const ConfigOptions&, const std::string&, const std::string& value,
+ void* addr) {
+ auto* factory =
+ static_cast<CompactOnDeletionCollectorFactory*>(addr);
+ factory->SetDeletionTrigger(ParseSizeT(value));
+ return Status::OK();
+ },
+ [](const ConfigOptions&, const std::string&, const void* addr,
+ std::string* value) {
+ const auto* factory =
+ static_cast<const CompactOnDeletionCollectorFactory*>(addr);
+ *value = std::to_string(factory->GetDeletionTrigger());
+ return Status::OK();
+ },
+ nullptr}},
+ {"deletion_ratio",
+ {0, OptionType::kUnknown, OptionVerificationType::kNormal,
+ OptionTypeFlags::kCompareNever | OptionTypeFlags::kMutable,
+ [](const ConfigOptions&, const std::string&, const std::string& value,
+ void* addr) {
+ auto* factory =
+ static_cast<CompactOnDeletionCollectorFactory*>(addr);
+ factory->SetDeletionRatio(ParseDouble(value));
+ return Status::OK();
+ },
+ [](const ConfigOptions&, const std::string&, const void* addr,
+ std::string* value) {
+ const auto* factory =
+ static_cast<const CompactOnDeletionCollectorFactory*>(addr);
+ *value = std::to_string(factory->GetDeletionRatio());
+ return Status::OK();
+ },
+ nullptr}},
+
+#endif // ROCKSDB_LITE
+};
+
+CompactOnDeletionCollectorFactory::CompactOnDeletionCollectorFactory(
+ size_t sliding_window_size, size_t deletion_trigger, double deletion_ratio)
+ : sliding_window_size_(sliding_window_size),
+ deletion_trigger_(deletion_trigger),
+ deletion_ratio_(deletion_ratio) {
+ RegisterOptions("", this, &on_deletion_collector_type_info);
+}
+
+TablePropertiesCollector*
+CompactOnDeletionCollectorFactory::CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context /*context*/) {
+ return new CompactOnDeletionCollector(sliding_window_size_.load(),
+ deletion_trigger_.load(),
+ deletion_ratio_.load());
+}
+
+std::string CompactOnDeletionCollectorFactory::ToString() const {
+ std::ostringstream cfg;
+ cfg << Name() << " (Sliding window size = " << sliding_window_size_.load()
+ << " Deletion trigger = " << deletion_trigger_.load()
+ << " Deletion ratio = " << deletion_ratio_.load() << ')';
+ return cfg.str();
+}
+
+std::shared_ptr<CompactOnDeletionCollectorFactory>
+NewCompactOnDeletionCollectorFactory(size_t sliding_window_size,
+ size_t deletion_trigger,
+ double deletion_ratio) {
+ return std::shared_ptr<CompactOnDeletionCollectorFactory>(
+ new CompactOnDeletionCollectorFactory(sliding_window_size,
+ deletion_trigger, deletion_ratio));
+}
+namespace {
+static int RegisterTablePropertiesCollectorFactories(
+ ObjectLibrary& library, const std::string& /*arg*/) {
+ library.AddFactory<TablePropertiesCollectorFactory>(
+ CompactOnDeletionCollectorFactory::kClassName(),
+ [](const std::string& /*uri*/,
+ std::unique_ptr<TablePropertiesCollectorFactory>* guard,
+ std::string* /* errmsg */) {
+ // By default, create a CompactOnDeletionCollectorFactory that is disabled.
+ // Users will need to provide configuration parameters or call the
+ // corresponding Setter to enable the factory.
+ guard->reset(new CompactOnDeletionCollectorFactory(0, 0, 0));
+ return guard->get();
+ });
+ return 1;
+}
+} // namespace
+#endif // !ROCKSDB_LITE
+
+Status TablePropertiesCollectorFactory::CreateFromString(
+ const ConfigOptions& options, const std::string& value,
+ std::shared_ptr<TablePropertiesCollectorFactory>* result) {
+#ifndef ROCKSDB_LITE
+ static std::once_flag once;
+ std::call_once(once, [&]() {
+ RegisterTablePropertiesCollectorFactories(*(ObjectLibrary::Default().get()),
+ "");
+ });
+#endif // ROCKSDB_LITE
+ return LoadSharedObject<TablePropertiesCollectorFactory>(options, value,
+ nullptr, result);
+}
+
+} // namespace ROCKSDB_NAMESPACE
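For context on how the window parameter maps onto buckets: with sliding_window_size = 4000 and kNumBuckets = 128, bucket_size_ = (4000 + 127) / 128 = 32 keys per bucket, so the observation window effectively spans 32 * 128 = 4096 keys. Below is a minimal sketch of wiring the factory into a column family; the 4000/400/0.3 values and the helper name are illustrative.

#include "rocksdb/options.h"
#include "rocksdb/utilities/table_properties_collectors.h"

using namespace ROCKSDB_NAMESPACE;

// Mark an SST file for compaction when any window of ~4000 consecutive
// entries contains at least 400 deletions, or when >= 30% of all entries in
// the file are deletions.
void EnableCompactOnDeletion(ColumnFamilyOptions* cf_opts) {
  cf_opts->table_properties_collector_factories.emplace_back(
      NewCompactOnDeletionCollectorFactory(4000 /* sliding_window_size */,
                                           400 /* deletion_trigger */,
                                           0.3 /* deletion_ratio */));
}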
diff --git a/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.h b/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.h
new file mode 100644
index 000000000..2f7dc4f1b
--- /dev/null
+++ b/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.h
@@ -0,0 +1,70 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+#include "rocksdb/utilities/table_properties_collectors.h"
+namespace ROCKSDB_NAMESPACE {
+
+class CompactOnDeletionCollector : public TablePropertiesCollector {
+ public:
+ CompactOnDeletionCollector(size_t sliding_window_size,
+ size_t deletion_trigger, double deletion_ratio);
+
+ // AddUserKey() will be called when a new key/value pair is inserted into the
+ // table.
+ // @params key the user key that is inserted into the table.
+ // @params value the value that is inserted into the table.
+ // @params file_size file size up to now
+ virtual Status AddUserKey(const Slice& key, const Slice& value,
+ EntryType type, SequenceNumber seq,
+ uint64_t file_size) override;
+
+ // Finish() will be called when a table has already been built and is ready
+ // for writing the properties block.
+ // @params properties User will add their collected statistics to
+ // `properties`.
+ virtual Status Finish(UserCollectedProperties* /*properties*/) override;
+
+ // Return the human-readable properties, where the key is property name and
+ // the value is the human-readable form of value.
+ virtual UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties();
+ }
+
+ // The name of the properties collector can be used for debugging purposes.
+ virtual const char* Name() const override {
+ return "CompactOnDeletionCollector";
+ }
+
+ // EXPERIMENTAL Return whether the output file should be further compacted
+ virtual bool NeedCompact() const override { return need_compaction_; }
+
+ static const int kNumBuckets = 128;
+
+ private:
+ void Reset();
+
+ // A ring buffer used to count the number of deletion entries for every
+ // "bucket_size_" keys.
+ size_t num_deletions_in_buckets_[kNumBuckets];
+ // the number of keys in a bucket
+ size_t bucket_size_;
+
+ size_t current_bucket_;
+ size_t num_keys_in_current_bucket_;
+ size_t num_deletions_in_observation_window_;
+ size_t deletion_trigger_;
+ const double deletion_ratio_;
+ const bool deletion_ratio_enabled_;
+ size_t total_entries_ = 0;
+ size_t deletion_entries_ = 0;
+ // true if the current SST file needs to be compacted.
+ bool need_compaction_;
+ bool finished_;
+};
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc b/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc
new file mode 100644
index 000000000..88aeb8d5c
--- /dev/null
+++ b/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc
@@ -0,0 +1,245 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <stdio.h>
+
+#ifndef ROCKSDB_LITE
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+#include "port/stack_trace.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/utilities/table_properties_collectors.h"
+#include "test_util/testharness.h"
+#include "util/random.h"
+#include "utilities/table_properties_collectors/compact_on_deletion_collector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+TEST(CompactOnDeletionCollector, DeletionRatio) {
+ TablePropertiesCollectorFactory::Context context;
+ context.column_family_id =
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily;
+ const size_t kTotalEntries = 100;
+
+ {
+ // Disable deletion ratio.
+ for (double deletion_ratio : {-1.5, -1.0, 0.0, 1.5, 2.0}) {
+ auto factory = NewCompactOnDeletionCollectorFactory(0, 0, deletion_ratio);
+ std::unique_ptr<TablePropertiesCollector> collector(
+ factory->CreateTablePropertiesCollector(context));
+ for (size_t i = 0; i < kTotalEntries; i++) {
+ // All entries are deletion entries.
+ ASSERT_OK(
+ collector->AddUserKey("hello", "rocksdb", kEntryDelete, 0, 0));
+ ASSERT_FALSE(collector->NeedCompact());
+ }
+ ASSERT_OK(collector->Finish(nullptr));
+ ASSERT_FALSE(collector->NeedCompact());
+ }
+ }
+
+ {
+ for (double deletion_ratio : {0.3, 0.5, 0.8, 1.0}) {
+ auto factory = NewCompactOnDeletionCollectorFactory(0, 0, deletion_ratio);
+ const size_t deletion_entries_trigger =
+ static_cast<size_t>(deletion_ratio * kTotalEntries);
+ for (int delta : {-1, 0, 1}) {
+ // Actual deletion entry ratio <, =, > deletion_ratio
+ size_t actual_deletion_entries = deletion_entries_trigger + delta;
+ std::unique_ptr<TablePropertiesCollector> collector(
+ factory->CreateTablePropertiesCollector(context));
+ for (size_t i = 0; i < kTotalEntries; i++) {
+ if (i < actual_deletion_entries) {
+ ASSERT_OK(
+ collector->AddUserKey("hello", "rocksdb", kEntryDelete, 0, 0));
+ } else {
+ ASSERT_OK(
+ collector->AddUserKey("hello", "rocksdb", kEntryPut, 0, 0));
+ }
+ ASSERT_FALSE(collector->NeedCompact());
+ }
+ ASSERT_OK(collector->Finish(nullptr));
+ if (delta >= 0) {
+ // >= deletion_ratio
+ ASSERT_TRUE(collector->NeedCompact());
+ } else {
+ ASSERT_FALSE(collector->NeedCompact());
+ }
+ }
+ }
+ }
+}
+
+TEST(CompactOnDeletionCollector, SlidingWindow) {
+ const int kWindowSizes[] = {1000, 10000, 10000, 127, 128, 129,
+ 255, 256, 257, 2, 10000};
+ const int kDeletionTriggers[] = {500, 9500, 4323, 47, 61, 128,
+ 250, 250, 250, 2, 2};
+ TablePropertiesCollectorFactory::Context context;
+ context.column_family_id =
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily;
+
+ std::vector<int> window_sizes;
+ std::vector<int> deletion_triggers;
+ // deterministic tests
+ for (int test = 0; test < 9; ++test) {
+ window_sizes.emplace_back(kWindowSizes[test]);
+ deletion_triggers.emplace_back(kDeletionTriggers[test]);
+ }
+
+ // randomized tests
+ Random rnd(301);
+ const int kMaxTestSize = 100000l;
+ for (int random_test = 0; random_test < 10; random_test++) {
+ int window_size = rnd.Uniform(kMaxTestSize) + 1;
+ int deletion_trigger = rnd.Uniform(window_size);
+ window_sizes.emplace_back(window_size);
+ deletion_triggers.emplace_back(deletion_trigger);
+ }
+
+ assert(window_sizes.size() == deletion_triggers.size());
+
+ for (size_t test = 0; test < window_sizes.size(); ++test) {
+ const int kBucketSize = 128;
+ const int kWindowSize = window_sizes[test];
+ const int kPaddedWindowSize =
+ kBucketSize * ((window_sizes[test] + kBucketSize - 1) / kBucketSize);
+ const int kNumDeletionTrigger = deletion_triggers[test];
+ const int kBias = (kNumDeletionTrigger + kBucketSize - 1) / kBucketSize;
+ // Simple test
+ {
+ auto factory = NewCompactOnDeletionCollectorFactory(kWindowSize,
+ kNumDeletionTrigger);
+ const int kSample = 10;
+ for (int delete_rate = 0; delete_rate <= kSample; ++delete_rate) {
+ std::unique_ptr<TablePropertiesCollector> collector(
+ factory->CreateTablePropertiesCollector(context));
+ int deletions = 0;
+ for (int i = 0; i < kPaddedWindowSize; ++i) {
+ if (i % kSample < delete_rate) {
+ ASSERT_OK(
+ collector->AddUserKey("hello", "rocksdb", kEntryDelete, 0, 0));
+ deletions++;
+ } else {
+ ASSERT_OK(
+ collector->AddUserKey("hello", "rocksdb", kEntryPut, 0, 0));
+ }
+ }
+ if (collector->NeedCompact() != (deletions >= kNumDeletionTrigger) &&
+ std::abs(deletions - kNumDeletionTrigger) > kBias) {
+ fprintf(stderr,
+ "[Error] collector->NeedCompact() != (%d >= %d)"
+ " with kWindowSize = %d and kNumDeletionTrigger = %d\n",
+ deletions, kNumDeletionTrigger, kWindowSize,
+ kNumDeletionTrigger);
+ ASSERT_TRUE(false);
+ }
+ ASSERT_OK(collector->Finish(nullptr));
+ }
+ }
+
+ // Only one section of a file satisfies the compaction trigger
+ {
+ auto factory = NewCompactOnDeletionCollectorFactory(kWindowSize,
+ kNumDeletionTrigger);
+ const int kSample = 10;
+ for (int delete_rate = 0; delete_rate <= kSample; ++delete_rate) {
+ std::unique_ptr<TablePropertiesCollector> collector(
+ factory->CreateTablePropertiesCollector(context));
+ int deletions = 0;
+ for (int section = 0; section < 5; ++section) {
+ int initial_entries = rnd.Uniform(kWindowSize) + kWindowSize;
+ for (int i = 0; i < initial_entries; ++i) {
+ ASSERT_OK(
+ collector->AddUserKey("hello", "rocksdb", kEntryPut, 0, 0));
+ }
+ }
+ for (int i = 0; i < kPaddedWindowSize; ++i) {
+ if (i % kSample < delete_rate) {
+ ASSERT_OK(
+ collector->AddUserKey("hello", "rocksdb", kEntryDelete, 0, 0));
+ deletions++;
+ } else {
+ ASSERT_OK(
+ collector->AddUserKey("hello", "rocksdb", kEntryPut, 0, 0));
+ }
+ }
+ for (int section = 0; section < 5; ++section) {
+ int ending_entries = rnd.Uniform(kWindowSize) + kWindowSize;
+ for (int i = 0; i < ending_entries; ++i) {
+ ASSERT_OK(
+ collector->AddUserKey("hello", "rocksdb", kEntryPut, 0, 0));
+ }
+ }
+ if (collector->NeedCompact() != (deletions >= kNumDeletionTrigger) &&
+ std::abs(deletions - kNumDeletionTrigger) > kBias) {
+ fprintf(stderr,
+ "[Error] collector->NeedCompact() %d != (%d >= %d)"
+ " with kWindowSize = %d, kNumDeletionTrigger = %d\n",
+ collector->NeedCompact(), deletions, kNumDeletionTrigger,
+ kWindowSize, kNumDeletionTrigger);
+ ASSERT_TRUE(false);
+ }
+ ASSERT_OK(collector->Finish(nullptr));
+ }
+ }
+
+ // TEST 3: Issues a lot of deletes, but their density is not
+ // high enough to trigger compaction.
+ {
+ std::unique_ptr<TablePropertiesCollector> collector;
+ auto factory = NewCompactOnDeletionCollectorFactory(kWindowSize,
+ kNumDeletionTrigger);
+ collector.reset(factory->CreateTablePropertiesCollector(context));
+ assert(collector->NeedCompact() == false);
+ // Insert "kNumDeletionTrigger * 0.95" deletions for every
+ // "kWindowSize" and verify compaction is not needed.
+ const int kDeletionsPerSection = kNumDeletionTrigger * 95 / 100;
+ if (kDeletionsPerSection >= 0) {
+ for (int section = 0; section < 200; ++section) {
+ for (int i = 0; i < kPaddedWindowSize; ++i) {
+ if (i < kDeletionsPerSection) {
+ ASSERT_OK(collector->AddUserKey("hello", "rocksdb", kEntryDelete,
+ 0, 0));
+ } else {
+ ASSERT_OK(
+ collector->AddUserKey("hello", "rocksdb", kEntryPut, 0, 0));
+ }
+ }
+ }
+ if (collector->NeedCompact() &&
+ std::abs(kDeletionsPerSection - kNumDeletionTrigger) > kBias) {
+ fprintf(stderr,
+ "[Error] collector->NeedCompact() != false"
+ " with kWindowSize = %d and kNumDeletionTrigger = %d\n",
+ kWindowSize, kNumDeletionTrigger);
+ ASSERT_TRUE(false);
+ }
+ ASSERT_OK(collector->Finish(nullptr));
+ }
+ }
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+#else
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as RocksDBLite does not include utilities.\n");
+ return 0;
+}
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/trace/file_trace_reader_writer.cc b/src/rocksdb/utilities/trace/file_trace_reader_writer.cc
new file mode 100644
index 000000000..5886d3539
--- /dev/null
+++ b/src/rocksdb/utilities/trace/file_trace_reader_writer.cc
@@ -0,0 +1,133 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "utilities/trace/file_trace_reader_writer.h"
+
+#include "env/composite_env_wrapper.h"
+#include "file/random_access_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "trace_replay/trace_replay.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const unsigned int FileTraceReader::kBufferSize = 1024; // 1KB
+
+FileTraceReader::FileTraceReader(
+ std::unique_ptr<RandomAccessFileReader>&& reader)
+ : file_reader_(std::move(reader)),
+ offset_(0),
+ buffer_(new char[kBufferSize]) {}
+
+FileTraceReader::~FileTraceReader() {
+ Close().PermitUncheckedError();
+ delete[] buffer_;
+}
+
+Status FileTraceReader::Close() {
+ file_reader_.reset();
+ return Status::OK();
+}
+
+Status FileTraceReader::Reset() {
+ if (file_reader_ == nullptr) {
+ return Status::IOError("TraceReader is closed.");
+ }
+ offset_ = 0;
+ return Status::OK();
+}
+
+Status FileTraceReader::Read(std::string* data) {
+ assert(file_reader_ != nullptr);
+ Status s = file_reader_->Read(IOOptions(), offset_, kTraceMetadataSize,
+ &result_, buffer_, nullptr,
+ Env::IO_TOTAL /* rate_limiter_priority */);
+ if (!s.ok()) {
+ return s;
+ }
+ if (result_.size() == 0) {
+ // No more data to read
+ // TODO: Come up with a better way to indicate the end of data. Maybe this
+ // could be avoided once a footer is introduced.
+ return Status::Incomplete();
+ }
+ if (result_.size() < kTraceMetadataSize) {
+ return Status::Corruption("Corrupted trace file.");
+ }
+ *data = result_.ToString();
+ offset_ += kTraceMetadataSize;
+
+ uint32_t payload_len =
+ DecodeFixed32(&buffer_[kTraceTimestampSize + kTraceTypeSize]);
+
+ // Read Payload
+ unsigned int bytes_to_read = payload_len;
+ unsigned int to_read =
+ bytes_to_read > kBufferSize ? kBufferSize : bytes_to_read;
+ while (to_read > 0) {
+ s = file_reader_->Read(IOOptions(), offset_, to_read, &result_, buffer_,
+ nullptr, Env::IO_TOTAL /* rate_limiter_priority */);
+ if (!s.ok()) {
+ return s;
+ }
+ if (result_.size() < to_read) {
+ return Status::Corruption("Corrupted trace file.");
+ }
+ data->append(result_.data(), result_.size());
+
+ offset_ += to_read;
+ bytes_to_read -= to_read;
+ to_read = bytes_to_read > kBufferSize ? kBufferSize : bytes_to_read;
+ }
+
+ return s;
+}
+
+FileTraceWriter::FileTraceWriter(
+ std::unique_ptr<WritableFileWriter>&& file_writer)
+ : file_writer_(std::move(file_writer)) {}
+
+FileTraceWriter::~FileTraceWriter() { Close().PermitUncheckedError(); }
+
+Status FileTraceWriter::Close() {
+ file_writer_.reset();
+ return Status::OK();
+}
+
+Status FileTraceWriter::Write(const Slice& data) {
+ return file_writer_->Append(data);
+}
+
+uint64_t FileTraceWriter::GetFileSize() { return file_writer_->GetFileSize(); }
+
+Status NewFileTraceReader(Env* env, const EnvOptions& env_options,
+ const std::string& trace_filename,
+ std::unique_ptr<TraceReader>* trace_reader) {
+ std::unique_ptr<RandomAccessFileReader> file_reader;
+ Status s = RandomAccessFileReader::Create(
+ env->GetFileSystem(), trace_filename, FileOptions(env_options),
+ &file_reader, nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ trace_reader->reset(new FileTraceReader(std::move(file_reader)));
+ return s;
+}
+
+Status NewFileTraceWriter(Env* env, const EnvOptions& env_options,
+ const std::string& trace_filename,
+ std::unique_ptr<TraceWriter>* trace_writer) {
+ std::unique_ptr<WritableFileWriter> file_writer;
+ Status s = WritableFileWriter::Create(env->GetFileSystem(), trace_filename,
+ FileOptions(env_options), &file_writer,
+ nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ trace_writer->reset(new FileTraceWriter(std::move(file_writer)));
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
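A minimal sketch of how these helpers are typically paired with the DB tracing API (DB::StartTrace / DB::EndTrace) to produce a file that FileTraceReader can later read; the function name and the trace path handling are illustrative.

#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/trace_reader_writer.h"

using namespace ROCKSDB_NAMESPACE;

// Record the DB's query trace into a file.
Status TraceWorkloadToFile(DB* db, const std::string& trace_path) {
  std::unique_ptr<TraceWriter> trace_writer;
  Status s = NewFileTraceWriter(db->GetEnv(), EnvOptions(), trace_path,
                                &trace_writer);
  if (!s.ok()) {
    return s;
  }
  s = db->StartTrace(TraceOptions(), std::move(trace_writer));
  if (!s.ok()) {
    return s;
  }
  // ... run the workload to be captured ...
  return db->EndTrace();
}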
diff --git a/src/rocksdb/utilities/trace/file_trace_reader_writer.h b/src/rocksdb/utilities/trace/file_trace_reader_writer.h
new file mode 100644
index 000000000..65d483108
--- /dev/null
+++ b/src/rocksdb/utilities/trace/file_trace_reader_writer.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/trace_reader_writer.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class RandomAccessFileReader;
+class WritableFileWriter;
+
+// FileTraceReader allows reading RocksDB traces from a file.
+class FileTraceReader : public TraceReader {
+ public:
+ explicit FileTraceReader(std::unique_ptr<RandomAccessFileReader>&& reader);
+ ~FileTraceReader();
+
+ virtual Status Read(std::string* data) override;
+ virtual Status Close() override;
+ virtual Status Reset() override;
+
+ private:
+ std::unique_ptr<RandomAccessFileReader> file_reader_;
+ Slice result_;
+ size_t offset_;
+ char* const buffer_;
+
+ static const unsigned int kBufferSize;
+};
+
+// FileTraceWriter allows writing RocksDB traces to a file.
+class FileTraceWriter : public TraceWriter {
+ public:
+ explicit FileTraceWriter(std::unique_ptr<WritableFileWriter>&& file_writer);
+ ~FileTraceWriter();
+
+ virtual Status Write(const Slice& data) override;
+ virtual Status Close() override;
+ virtual uint64_t GetFileSize() override;
+
+ private:
+ std::unique_ptr<WritableFileWriter> file_writer_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/trace/replayer_impl.cc b/src/rocksdb/utilities/trace/replayer_impl.cc
new file mode 100644
index 000000000..31023f1a2
--- /dev/null
+++ b/src/rocksdb/utilities/trace/replayer_impl.cc
@@ -0,0 +1,316 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/trace/replayer_impl.h"
+
+#include <cmath>
+#include <thread>
+
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/system_clock.h"
+#include "util/threadpool_imp.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+ReplayerImpl::ReplayerImpl(DB* db,
+ const std::vector<ColumnFamilyHandle*>& handles,
+ std::unique_ptr<TraceReader>&& reader)
+ : Replayer(),
+ trace_reader_(std::move(reader)),
+ prepared_(false),
+ trace_end_(false),
+ header_ts_(0),
+ exec_handler_(TraceRecord::NewExecutionHandler(db, handles)),
+ env_(db->GetEnv()),
+ trace_file_version_(-1) {}
+
+ReplayerImpl::~ReplayerImpl() {
+ exec_handler_.reset();
+ trace_reader_.reset();
+}
+
+Status ReplayerImpl::Prepare() {
+ Trace header;
+ int db_version;
+ Status s = ReadHeader(&header);
+ if (!s.ok()) {
+ return s;
+ }
+ s = TracerHelper::ParseTraceHeader(header, &trace_file_version_, &db_version);
+ if (!s.ok()) {
+ return s;
+ }
+ header_ts_ = header.ts;
+ prepared_ = true;
+ trace_end_ = false;
+ return Status::OK();
+}
+
+Status ReplayerImpl::Next(std::unique_ptr<TraceRecord>* record) {
+ if (!prepared_) {
+ return Status::Incomplete("Not prepared!");
+ }
+ if (trace_end_) {
+ return Status::Incomplete("Trace end.");
+ }
+
+ Trace trace;
+ Status s = ReadTrace(&trace); // ReadTrace is atomic
+ // Reached the trace end.
+ if (s.ok() && trace.type == kTraceEnd) {
+ trace_end_ = true;
+ return Status::Incomplete("Trace end.");
+ }
+ if (!s.ok() || record == nullptr) {
+ return s;
+ }
+
+ return TracerHelper::DecodeTraceRecord(&trace, trace_file_version_, record);
+}
+
+Status ReplayerImpl::Execute(const std::unique_ptr<TraceRecord>& record,
+ std::unique_ptr<TraceRecordResult>* result) {
+ return record->Accept(exec_handler_.get(), result);
+}
+
+Status ReplayerImpl::Replay(
+ const ReplayOptions& options,
+ const std::function<void(Status, std::unique_ptr<TraceRecordResult>&&)>&
+ result_callback) {
+ if (options.fast_forward <= 0.0) {
+ return Status::InvalidArgument("Wrong fast forward speed!");
+ }
+
+ if (!prepared_) {
+ return Status::Incomplete("Not prepared!");
+ }
+ if (trace_end_) {
+ return Status::Incomplete("Trace end.");
+ }
+
+ Status s = Status::OK();
+
+ if (options.num_threads <= 1) {
+ // num_threads == 0 or num_threads == 1 uses single thread.
+ std::chrono::system_clock::time_point replay_epoch =
+ std::chrono::system_clock::now();
+
+ while (s.ok()) {
+ Trace trace;
+ s = ReadTrace(&trace);
+ // If already at trace end, ReadTrace should return Status::Incomplete().
+ if (!s.ok()) {
+ break;
+ }
+
+ // No need to sleep before breaking the loop if at the trace end.
+ if (trace.type == kTraceEnd) {
+ trace_end_ = true;
+ s = Status::Incomplete("Trace end.");
+ break;
+ }
+
+ // In single-threaded replay, decode first then sleep.
+ std::unique_ptr<TraceRecord> record;
+ s = TracerHelper::DecodeTraceRecord(&trace, trace_file_version_, &record);
+ if (!s.ok() && !s.IsNotSupported()) {
+ break;
+ }
+
+ std::chrono::system_clock::time_point sleep_to =
+ replay_epoch +
+ std::chrono::microseconds(static_cast<uint64_t>(std::llround(
+ 1.0 * (trace.ts - header_ts_) / options.fast_forward)));
+ if (sleep_to > std::chrono::system_clock::now()) {
+ std::this_thread::sleep_until(sleep_to);
+ }
+
+ // Skip unsupported traces, stop for other errors.
+ if (s.IsNotSupported()) {
+ if (result_callback != nullptr) {
+ result_callback(s, nullptr);
+ }
+ s = Status::OK();
+ continue;
+ }
+
+ if (result_callback == nullptr) {
+ s = Execute(record, nullptr);
+ } else {
+ std::unique_ptr<TraceRecordResult> res;
+ s = Execute(record, &res);
+ result_callback(s, std::move(res));
+ }
+ }
+ } else {
+ // Multi-threaded replay.
+ ThreadPoolImpl thread_pool;
+ thread_pool.SetHostEnv(env_);
+ thread_pool.SetBackgroundThreads(static_cast<int>(options.num_threads));
+
+ std::mutex mtx;
+ // Background decoding and execution status.
+ Status bg_s = Status::OK();
+ uint64_t last_err_ts = static_cast<uint64_t>(-1);
+ // Callback function used in background work to update bg_s for the earliest
+ // TraceRecord that has an execution error. This is different from the
+ // timestamp of the first execution error (either start or end timestamp).
+ //
+ // Suppose TraceRecords R1, R2 have timestamps T1 < T2, and their execution
+ // timestamps are T1_start, T1_end, T2_start, T2_end.
+ // Single-threaded: T1_start < T1_end < T2_start < T2_end is guaranteed.
+ // Multi-threaded: T1_start < T2_start is not enforced; the order is unknown.
+ // In order to report the same `first` error in both single-threaded and
+ // multi-threaded replay, we can only rely on the TraceRecords' timestamps,
+ // rather than their execution timestamps. In single-threaded replay the
+ // first error is also the last error observed, while in multi-threaded
+ // replay the first error (by trace timestamp) may be neither the first nor
+ // the last error observed during execution.
+ auto error_cb = [&mtx, &bg_s, &last_err_ts](Status err, uint64_t err_ts) {
+ std::lock_guard<std::mutex> gd(mtx);
+ // Only record the first error.
+ if (!err.ok() && !err.IsNotSupported() && err_ts < last_err_ts) {
+ bg_s = err;
+ last_err_ts = err_ts;
+ }
+ };
+
+ std::chrono::system_clock::time_point replay_epoch =
+ std::chrono::system_clock::now();
+
+ while (bg_s.ok() && s.ok()) {
+ Trace trace;
+ s = ReadTrace(&trace);
+ // If already at trace end, ReadTrace should return Status::Incomplete().
+ if (!s.ok()) {
+ break;
+ }
+
+ TraceType trace_type = trace.type;
+
+ // No need to sleep before breaking the loop if at the trace end.
+ if (trace_type == kTraceEnd) {
+ trace_end_ = true;
+ s = Status::Incomplete("Trace end.");
+ break;
+ }
+
+ // In multi-threaded replay, sleep first then start decoding and
+ // execution in a thread.
+ std::chrono::system_clock::time_point sleep_to =
+ replay_epoch +
+ std::chrono::microseconds(static_cast<uint64_t>(std::llround(
+ 1.0 * (trace.ts - header_ts_) / options.fast_forward)));
+ if (sleep_to > std::chrono::system_clock::now()) {
+ std::this_thread::sleep_until(sleep_to);
+ }
+
+ if (trace_type == kTraceWrite || trace_type == kTraceGet ||
+ trace_type == kTraceIteratorSeek ||
+ trace_type == kTraceIteratorSeekForPrev ||
+ trace_type == kTraceMultiGet) {
+ std::unique_ptr<ReplayerWorkerArg> ra(new ReplayerWorkerArg);
+ ra->trace_entry = std::move(trace);
+ ra->handler = exec_handler_.get();
+ ra->trace_file_version = trace_file_version_;
+ ra->error_cb = error_cb;
+ ra->result_cb = result_callback;
+ thread_pool.Schedule(&ReplayerImpl::BackgroundWork, ra.release(),
+ nullptr, nullptr);
+ } else {
+ // Skip unsupported traces.
+ if (result_callback != nullptr) {
+ result_callback(Status::NotSupported("Unsupported trace type."),
+ nullptr);
+ }
+ }
+ }
+
+ thread_pool.WaitForJobsAndJoinAllThreads();
+ if (!bg_s.ok()) {
+ s = bg_s;
+ }
+ }
+
+ if (s.IsIncomplete()) {
+ // Reaching EOF returns an Incomplete status at the moment.
+ // This can happen when a process is killed without calling the EndTrace()
+ // API.
+ // TODO: Add better error handling.
+ trace_end_ = true;
+ return Status::OK();
+ }
+ return s;
+}
+
+uint64_t ReplayerImpl::GetHeaderTimestamp() const { return header_ts_; }
+
+Status ReplayerImpl::ReadHeader(Trace* header) {
+ assert(header != nullptr);
+ Status s = trace_reader_->Reset();
+ if (!s.ok()) {
+ return s;
+ }
+ std::string encoded_trace;
+ // Read the trace head
+ s = trace_reader_->Read(&encoded_trace);
+ if (!s.ok()) {
+ return s;
+ }
+
+ return TracerHelper::DecodeHeader(encoded_trace, header);
+}
+
+Status ReplayerImpl::ReadTrace(Trace* trace) {
+ assert(trace != nullptr);
+ std::string encoded_trace;
+ // We don't know whether the TraceReader implementation is thread-safe, so
+ // we protect the trace-reading part with a mutex. The decoding part does
+ // not need to be protected since it only touches local data.
+ {
+ std::lock_guard<std::mutex> guard(mutex_);
+ Status s = trace_reader_->Read(&encoded_trace);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ return TracerHelper::DecodeTrace(encoded_trace, trace);
+}
+
+void ReplayerImpl::BackgroundWork(void* arg) {
+ std::unique_ptr<ReplayerWorkerArg> ra(
+ reinterpret_cast<ReplayerWorkerArg*>(arg));
+ assert(ra != nullptr);
+
+ std::unique_ptr<TraceRecord> record;
+ Status s = TracerHelper::DecodeTraceRecord(&(ra->trace_entry),
+ ra->trace_file_version, &record);
+ if (!s.ok()) {
+ // Stop the replay
+ if (ra->error_cb != nullptr) {
+ ra->error_cb(s, ra->trace_entry.ts);
+ }
+ // Report the result
+ if (ra->result_cb != nullptr) {
+ ra->result_cb(s, nullptr);
+ }
+ return;
+ }
+
+ if (ra->result_cb == nullptr) {
+ s = record->Accept(ra->handler, nullptr);
+ } else {
+ std::unique_ptr<TraceRecordResult> res;
+ s = record->Accept(ra->handler, &res);
+ ra->result_cb(s, std::move(res));
+ }
+ record.reset();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
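A minimal replay sketch built on the implementation above. It assumes the trace was produced by DB::StartTrace and that DB::NewDefaultReplayer(handles, reader, &replayer) is available to construct a Replayer over this implementation; the 2x speed, 4 threads, and function name are illustrative.

#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/trace_reader_writer.h"
#include "rocksdb/utilities/replayer.h"

using namespace ROCKSDB_NAMESPACE;

// Replay a previously recorded trace against `db`.
Status ReplayTraceFromFile(DB* db,
                           const std::vector<ColumnFamilyHandle*>& handles,
                           const std::string& trace_path) {
  std::unique_ptr<TraceReader> trace_reader;
  Status s = NewFileTraceReader(db->GetEnv(), EnvOptions(), trace_path,
                                &trace_reader);
  if (!s.ok()) {
    return s;
  }
  std::unique_ptr<Replayer> replayer;
  // Assumption: DB::NewDefaultReplayer wraps ReplayerImpl for this DB.
  s = db->NewDefaultReplayer(handles, std::move(trace_reader), &replayer);
  if (!s.ok()) {
    return s;
  }
  s = replayer->Prepare();
  if (!s.ok()) {
    return s;
  }
  ReplayOptions replay_opts;
  replay_opts.num_threads = 4;     // multi-threaded replay
  replay_opts.fast_forward = 2.0;  // replay at 2x recorded speed
  return replayer->Replay(replay_opts, nullptr /* result_callback */);
}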
diff --git a/src/rocksdb/utilities/trace/replayer_impl.h b/src/rocksdb/utilities/trace/replayer_impl.h
new file mode 100644
index 000000000..367b0b51e
--- /dev/null
+++ b/src/rocksdb/utilities/trace/replayer_impl.h
@@ -0,0 +1,86 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <atomic>
+#include <functional>
+#include <memory>
+#include <mutex>
+#include <unordered_map>
+
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "rocksdb/trace_reader_writer.h"
+#include "rocksdb/trace_record.h"
+#include "rocksdb/trace_record_result.h"
+#include "rocksdb/utilities/replayer.h"
+#include "trace_replay/trace_replay.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ReplayerImpl : public Replayer {
+ public:
+ ReplayerImpl(DB* db, const std::vector<ColumnFamilyHandle*>& handles,
+ std::unique_ptr<TraceReader>&& reader);
+ ~ReplayerImpl() override;
+
+ using Replayer::Prepare;
+ Status Prepare() override;
+
+ using Replayer::Next;
+ Status Next(std::unique_ptr<TraceRecord>* record) override;
+
+ using Replayer::Execute;
+ Status Execute(const std::unique_ptr<TraceRecord>& record,
+ std::unique_ptr<TraceRecordResult>* result) override;
+
+ using Replayer::Replay;
+ Status Replay(
+ const ReplayOptions& options,
+ const std::function<void(Status, std::unique_ptr<TraceRecordResult>&&)>&
+ result_callback) override;
+
+ using Replayer::GetHeaderTimestamp;
+ uint64_t GetHeaderTimestamp() const override;
+
+ private:
+ Status ReadHeader(Trace* header);
+ Status ReadTrace(Trace* trace);
+
+ // Generic function to execute a Trace in a thread pool.
+ static void BackgroundWork(void* arg);
+
+ std::unique_ptr<TraceReader> trace_reader_;
+ std::mutex mutex_;
+ std::atomic<bool> prepared_;
+ std::atomic<bool> trace_end_;
+ uint64_t header_ts_;
+ std::unique_ptr<TraceRecord::Handler> exec_handler_;
+ Env* env_;
+ // The trace file version is parsed from the trace header. The Replayer uses
+ // a different decoding method for the trace content depending on this
+ // version.
+ int trace_file_version_;
+};
+
+// Arguments passed to BackgroundWork() for replaying in a thread pool.
+struct ReplayerWorkerArg {
+ Trace trace_entry;
+ int trace_file_version;
+ // Handler to execute TraceRecord.
+ TraceRecord::Handler* handler;
+ // Callback function to report the error status and the timestamp of the
+ // TraceRecord (not the start/end timestamp of executing the TraceRecord).
+ std::function<void(Status, uint64_t)> error_cb;
+ // Callback function to report the trace execution status and operation
+ // execution status/result(s).
+ std::function<void(Status, std::unique_ptr<TraceRecordResult>&&)> result_cb;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/lock_manager.cc b/src/rocksdb/utilities/transactions/lock/lock_manager.cc
new file mode 100644
index 000000000..df16b32ad
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/lock_manager.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/lock/lock_manager.h"
+
+#include "utilities/transactions/lock/point/point_lock_manager.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+std::shared_ptr<LockManager> NewLockManager(PessimisticTransactionDB* db,
+ const TransactionDBOptions& opt) {
+ assert(db);
+ if (opt.lock_mgr_handle) {
+ // A custom lock manager was provided in options
+ auto mgr = opt.lock_mgr_handle->getLockManager();
+ return std::shared_ptr<LockManager>(opt.lock_mgr_handle, mgr);
+ } else {
+ // Use a point lock manager by default
+ return std::shared_ptr<LockManager>(new PointLockManager(db, opt));
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
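NewLockManager() is internal, but the choice it encodes is driven by TransactionDBOptions::lock_mgr_handle. Below is a minimal sketch of opting into range locking instead of the default point lock manager; it assumes NewRangeLockManager() accepts a null mutex factory, and the function name is illustrative.

#include "rocksdb/utilities/transaction_db.h"

using namespace ROCKSDB_NAMESPACE;

// Open a TransactionDB that uses a range lock manager rather than the
// default PointLockManager chosen by NewLockManager().
Status OpenWithRangeLocking(const Options& options, const std::string& path,
                            TransactionDB** txn_db) {
  TransactionDBOptions txn_db_options;
  // Assumption: passing nullptr selects the default mutex factory.
  txn_db_options.lock_mgr_handle = NewRangeLockManager(nullptr);
  return TransactionDB::Open(options, txn_db_options, path, txn_db);
}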
diff --git a/src/rocksdb/utilities/transactions/lock/lock_manager.h b/src/rocksdb/utilities/transactions/lock/lock_manager.h
new file mode 100644
index 000000000..a5ce1948c
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/lock_manager.h
@@ -0,0 +1,82 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "utilities/transactions/lock/lock_tracker.h"
+#include "utilities/transactions/pessimistic_transaction.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class PessimisticTransactionDB;
+
+class LockManager {
+ public:
+ virtual ~LockManager() {}
+
+ // Whether supports locking a specific key.
+ virtual bool IsPointLockSupported() const = 0;
+
+ // Whether supports locking a range of keys.
+ virtual bool IsRangeLockSupported() const = 0;
+
+ // Locks acquired through this LockManager should be tracked by
+ // the LockTrackers created through the returned factory.
+ virtual const LockTrackerFactory& GetLockTrackerFactory() const = 0;
+
+ // Enable locking for the specified column family.
+ // Caller should guarantee that this column family is not already enabled.
+ virtual void AddColumnFamily(const ColumnFamilyHandle* cf) = 0;
+
+ // Disable locking for the specified column family.
+ // Caller should guarantee that this column family is no longer used.
+ virtual void RemoveColumnFamily(const ColumnFamilyHandle* cf) = 0;
+
+ // Attempt to lock a key or a key range. If OK status is returned, the caller
+ // is responsible for calling UnLock() on this key.
+ virtual Status TryLock(PessimisticTransaction* txn,
+ ColumnFamilyId column_family_id,
+ const std::string& key, Env* env, bool exclusive) = 0;
+ // The range [start, end] are inclusive at both sides.
+ virtual Status TryLock(PessimisticTransaction* txn,
+ ColumnFamilyId column_family_id, const Endpoint& start,
+ const Endpoint& end, Env* env, bool exclusive) = 0;
+
+ // Unlock a key or a range locked by TryLock(). txn must be the same
+ // Transaction that locked this key.
+ virtual void UnLock(PessimisticTransaction* txn, const LockTracker& tracker,
+ Env* env) = 0;
+ virtual void UnLock(PessimisticTransaction* txn,
+ ColumnFamilyId column_family_id, const std::string& key,
+ Env* env) = 0;
+ virtual void UnLock(PessimisticTransaction* txn,
+ ColumnFamilyId column_family_id, const Endpoint& start,
+ const Endpoint& end, Env* env) = 0;
+
+ using PointLockStatus = std::unordered_multimap<ColumnFamilyId, KeyLockInfo>;
+ virtual PointLockStatus GetPointLockStatus() = 0;
+
+ using RangeLockStatus =
+ std::unordered_multimap<ColumnFamilyId, RangeLockInfo>;
+ virtual RangeLockStatus GetRangeLockStatus() = 0;
+
+ virtual std::vector<DeadlockPath> GetDeadlockInfoBuffer() = 0;
+
+ virtual void Resize(uint32_t new_size) = 0;
+};
+
+// A LockManager should always be constructed through this factory method,
+// rather than through a concrete implementation's constructor.
+// The returned LockManager is owned by the returned shared_ptr.
+std::shared_ptr<LockManager> NewLockManager(PessimisticTransactionDB* db,
+ const TransactionDBOptions& opt);
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/lock_tracker.h b/src/rocksdb/utilities/transactions/lock/lock_tracker.h
new file mode 100644
index 000000000..5fa228a82
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/lock_tracker.h
@@ -0,0 +1,209 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <memory>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/transaction_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Request for locking a single key.
+struct PointLockRequest {
+ // The id of the key's column family.
+ ColumnFamilyId column_family_id = 0;
+ // The key to lock.
+ std::string key;
+ // The sequence number from which there is no concurrent update to the key.
+ SequenceNumber seq = 0;
+ // Whether the lock is acquired only for read.
+ bool read_only = false;
+ // Whether the lock is in exclusive mode.
+ bool exclusive = true;
+};
+
+// Request for locking a range of keys.
+struct RangeLockRequest {
+ // The id of the key's column family.
+ ColumnFamilyId column_family_id;
+
+ // The range to be locked
+ Endpoint start_endp;
+ Endpoint end_endp;
+};
+
+struct PointLockStatus {
+ // Whether the key is locked.
+ bool locked = false;
+ // Whether the key is locked in exclusive mode.
+ bool exclusive = true;
+ // The sequence number in the tracked PointLockRequest.
+ SequenceNumber seq = 0;
+};
+
+// Return status when calling LockTracker::Untrack.
+enum class UntrackStatus {
+ // The lock is not tracked at all, so no lock to untrack.
+ NOT_TRACKED,
+ // The lock is untracked but not removed from the tracker.
+ UNTRACKED,
+ // The lock is removed from the tracker.
+ REMOVED,
+};
+
+// Tracks the lock requests.
+// In PessimisticTransaction, it tracks the locks acquired through LockMgr;
+// In OptimisticTransaction, since there is no LockMgr, it tracks the lock
+// intention. Not thread-safe.
+class LockTracker {
+ public:
+ virtual ~LockTracker() {}
+
+ // Whether supports locking a specific key.
+ virtual bool IsPointLockSupported() const = 0;
+
+ // Whether supports locking a range of keys.
+ virtual bool IsRangeLockSupported() const = 0;
+
+ // Tracks the acquisition of a lock on a key.
+ //
+ // If this method is not supported, leave it as a no-op.
+ virtual void Track(const PointLockRequest& /*lock_request*/) = 0;
+
+ // Untracks the lock on a key.
+ // seq and exclusive in lock_request are not used.
+ //
+ // If this method is not supported, leave it as a no-op and
+ // returns NOT_TRACKED.
+ virtual UntrackStatus Untrack(const PointLockRequest& /*lock_request*/) = 0;
+
+ // Counterpart of Track(const PointLockRequest&) for RangeLockRequest.
+ virtual void Track(const RangeLockRequest& /*lock_request*/) = 0;
+
+ // Counterpart of Untrack(const PointLockRequest&) for RangeLockRequest.
+ virtual UntrackStatus Untrack(const RangeLockRequest& /*lock_request*/) = 0;
+
+ // Merges lock requests tracked in the specified tracker into the current
+ // tracker.
+ //
+ // E.g. for point lock, if a key in tracker is not yet tracked,
+ // track this new key; otherwise, merge the tracked information of the key
+ // such as lock's exclusiveness, read/write statistics.
+ //
+ // If this method is not supported, leave it as a no-op.
+ //
+ // REQUIRED: the specified tracker must be of the same concrete class type as
+ // the current tracker.
+ virtual void Merge(const LockTracker& /*tracker*/) = 0;
+
+ // This is a reverse operation of Merge.
+ //
+  // E.g. for point lock, if a key exists in both current and the specified
+ // tracker, then subtract the information (such as read/write statistics) of
+ // the key in the specified tracker from the current tracker.
+ //
+ // If this method is not supported, leave it as a no-op.
+ //
+ // REQUIRED:
+ // The specified tracker must be of the same concrete class type as
+ // the current tracker.
+ // The tracked locks in the specified tracker must be a subset of those
+ // tracked by the current tracker.
+ virtual void Subtract(const LockTracker& /*tracker*/) = 0;
+
+ // Clears all tracked locks.
+ virtual void Clear() = 0;
+
+  // Gets the new locks (excluding the locks that were already tracked before
+  // the save point) tracked since the specified save point; the result is
+  // stored in an internally constructed LockTracker and returned.
+ //
+ // save_point_tracker is the tracker used by a SavePoint to track locks
+ // tracked after creating the SavePoint.
+ //
+ // The implementation should document whether point lock, or range lock, or
+ // both are considered in this method.
+ // If this method is not supported, returns nullptr.
+ //
+ // REQUIRED:
+ // The save_point_tracker must be of the same concrete class type as the
+ // current tracker.
+ // The tracked locks in the specified tracker must be a subset of those
+ // tracked by the current tracker.
+ virtual LockTracker* GetTrackedLocksSinceSavePoint(
+ const LockTracker& /*save_point_tracker*/) const = 0;
+
+ // Gets lock related information of the key.
+ //
+  // If point lock is not supported, always returns a PointLockStatus with
+ // locked=false.
+ virtual PointLockStatus GetPointLockStatus(
+ ColumnFamilyId /*column_family_id*/,
+ const std::string& /*key*/) const = 0;
+
+ // Gets number of tracked point locks.
+ //
+ // If point lock is not supported, always returns 0.
+ virtual uint64_t GetNumPointLocks() const = 0;
+
+ class ColumnFamilyIterator {
+ public:
+ virtual ~ColumnFamilyIterator() {}
+
+ // Whether there are remaining column families.
+ virtual bool HasNext() const = 0;
+
+ // Gets next column family id.
+ //
+ // If HasNext is false, calling this method has undefined behavior.
+ virtual ColumnFamilyId Next() = 0;
+ };
+
+ // Gets an iterator for column families.
+ //
+ // Returned iterator must not be nullptr.
+ // If there is no column family to iterate,
+ // returns an empty non-null iterator.
+ // Caller owns the returned pointer.
+ virtual ColumnFamilyIterator* GetColumnFamilyIterator() const = 0;
+
+ class KeyIterator {
+ public:
+ virtual ~KeyIterator() {}
+
+ // Whether there are remaining keys.
+ virtual bool HasNext() const = 0;
+
+ // Gets the next key.
+ //
+ // If HasNext is false, calling this method has undefined behavior.
+ virtual const std::string& Next() = 0;
+ };
+
+ // Gets an iterator for keys with tracked point locks in the column family.
+ //
+ // The column family must exist.
+ // Returned iterator must not be nullptr.
+ // Caller owns the returned pointer.
+ virtual KeyIterator* GetKeyIterator(
+ ColumnFamilyId /*column_family_id*/) const = 0;
+};
+
+// LockTracker should always be constructed through this factory.
+// Each LockManager owns a LockTrackerFactory.
+class LockTrackerFactory {
+ public:
+ // Caller owns the returned pointer.
+ virtual LockTracker* Create() const = 0;
+ virtual ~LockTrackerFactory() {}
+};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
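
A minimal sketch of how the tracker interface above is driven, using the PointLockTracker added later in this patch as the concrete implementation; the key, sequence number, and column family id are illustrative.

#include <cassert>
#include <memory>
#include <string>

#include "utilities/transactions/lock/point/point_lock_tracker.h"

using namespace ROCKSDB_NAMESPACE;

void LockTrackerSketch() {
  PointLockTracker tracker;

  PointLockRequest req;
  req.column_family_id = 1;
  req.key = "k";
  req.seq = 42;  // no concurrent update to "k" is expected from seq 42 onward
  req.read_only = false;
  req.exclusive = true;
  tracker.Track(req);

  PointLockStatus st = tracker.GetPointLockStatus(1, "k");
  assert(st.locked && st.exclusive && st.seq == 42);
  assert(tracker.GetNumPointLocks() == 1);

  // Walk every tracked column family and every tracked key within it.
  std::unique_ptr<LockTracker::ColumnFamilyIterator> cf_it(
      tracker.GetColumnFamilyIterator());
  while (cf_it->HasNext()) {
    ColumnFamilyId cf = cf_it->Next();
    std::unique_ptr<LockTracker::KeyIterator> key_it(
        tracker.GetKeyIterator(cf));
    while (key_it->HasNext()) {
      const std::string& key = key_it->Next();
      (void)key;
    }
  }

  // A single write was tracked, so one Untrack removes the key entirely.
  assert(tracker.Untrack(req) == UntrackStatus::REMOVED);
}
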
diff --git a/src/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc
new file mode 100644
index 000000000..b362a164d
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc
@@ -0,0 +1,721 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/lock/point/point_lock_manager.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <mutex>
+
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/transaction_db_mutex.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/hash.h"
+#include "util/thread_local.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+#include "utilities/transactions/transaction_db_mutex_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct LockInfo {
+ bool exclusive;
+ autovector<TransactionID> txn_ids;
+
+ // Transaction locks are not valid after this time in us
+ uint64_t expiration_time;
+
+ LockInfo(TransactionID id, uint64_t time, bool ex)
+ : exclusive(ex), expiration_time(time) {
+ txn_ids.push_back(id);
+ }
+ LockInfo(const LockInfo& lock_info)
+ : exclusive(lock_info.exclusive),
+ txn_ids(lock_info.txn_ids),
+ expiration_time(lock_info.expiration_time) {}
+ void operator=(const LockInfo& lock_info) {
+ exclusive = lock_info.exclusive;
+ txn_ids = lock_info.txn_ids;
+ expiration_time = lock_info.expiration_time;
+ }
+ DECLARE_DEFAULT_MOVES(LockInfo);
+};
+
+struct LockMapStripe {
+ explicit LockMapStripe(std::shared_ptr<TransactionDBMutexFactory> factory) {
+ stripe_mutex = factory->AllocateMutex();
+ stripe_cv = factory->AllocateCondVar();
+ assert(stripe_mutex);
+ assert(stripe_cv);
+ }
+
+ // Mutex must be held before modifying keys map
+ std::shared_ptr<TransactionDBMutex> stripe_mutex;
+
+ // Condition Variable per stripe for waiting on a lock
+ std::shared_ptr<TransactionDBCondVar> stripe_cv;
+
+ // Locked keys mapped to the info about the transactions that locked them.
+ // TODO(agiardullo): Explore performance of other data structures.
+ UnorderedMap<std::string, LockInfo> keys;
+};
+
+// Map of #num_stripes LockMapStripes
+struct LockMap {
+ explicit LockMap(size_t num_stripes,
+ std::shared_ptr<TransactionDBMutexFactory> factory)
+ : num_stripes_(num_stripes) {
+ lock_map_stripes_.reserve(num_stripes);
+ for (size_t i = 0; i < num_stripes; i++) {
+ LockMapStripe* stripe = new LockMapStripe(factory);
+ lock_map_stripes_.push_back(stripe);
+ }
+ }
+
+ ~LockMap() {
+ for (auto stripe : lock_map_stripes_) {
+ delete stripe;
+ }
+ }
+
+  // Number of separate LockMapStripes to create, each with its own Mutex
+ const size_t num_stripes_;
+
+ // Count of keys that are currently locked in this column family.
+ // (Only maintained if PointLockManager::max_num_locks_ is positive.)
+ std::atomic<int64_t> lock_cnt{0};
+
+ std::vector<LockMapStripe*> lock_map_stripes_;
+
+ size_t GetStripe(const std::string& key) const;
+};
+
+namespace {
+void UnrefLockMapsCache(void* ptr) {
+ // Called when a thread exits or a ThreadLocalPtr gets destroyed.
+ auto lock_maps_cache =
+ static_cast<UnorderedMap<uint32_t, std::shared_ptr<LockMap>>*>(ptr);
+ delete lock_maps_cache;
+}
+} // anonymous namespace
+
+PointLockManager::PointLockManager(PessimisticTransactionDB* txn_db,
+ const TransactionDBOptions& opt)
+ : txn_db_impl_(txn_db),
+ default_num_stripes_(opt.num_stripes),
+ max_num_locks_(opt.max_num_locks),
+ lock_maps_cache_(new ThreadLocalPtr(&UnrefLockMapsCache)),
+ dlock_buffer_(opt.max_num_deadlocks),
+ mutex_factory_(opt.custom_mutex_factory
+ ? opt.custom_mutex_factory
+ : std::make_shared<TransactionDBMutexFactoryImpl>()) {}
+
+size_t LockMap::GetStripe(const std::string& key) const {
+ assert(num_stripes_ > 0);
+ return FastRange64(GetSliceNPHash64(key), num_stripes_);
+}
+
+void PointLockManager::AddColumnFamily(const ColumnFamilyHandle* cf) {
+ InstrumentedMutexLock l(&lock_map_mutex_);
+
+ if (lock_maps_.find(cf->GetID()) == lock_maps_.end()) {
+ lock_maps_.emplace(cf->GetID(), std::make_shared<LockMap>(
+ default_num_stripes_, mutex_factory_));
+ } else {
+ // column_family already exists in lock map
+ assert(false);
+ }
+}
+
+void PointLockManager::RemoveColumnFamily(const ColumnFamilyHandle* cf) {
+ // Remove lock_map for this column family. Since the lock map is stored
+ // as a shared ptr, concurrent transactions can still keep using it
+ // until they release their references to it.
+ {
+ InstrumentedMutexLock l(&lock_map_mutex_);
+
+ auto lock_maps_iter = lock_maps_.find(cf->GetID());
+ if (lock_maps_iter == lock_maps_.end()) {
+ return;
+ }
+
+ lock_maps_.erase(lock_maps_iter);
+ } // lock_map_mutex_
+
+ // Clear all thread-local caches
+ autovector<void*> local_caches;
+ lock_maps_cache_->Scrape(&local_caches, nullptr);
+ for (auto cache : local_caches) {
+ delete static_cast<LockMaps*>(cache);
+ }
+}
+
+// Look up the LockMap std::shared_ptr for a given column_family_id.
+// Note: The LockMap is only valid as long as the caller is still holding on
+// to the returned std::shared_ptr.
+std::shared_ptr<LockMap> PointLockManager::GetLockMap(
+ ColumnFamilyId column_family_id) {
+ // First check thread-local cache
+ if (lock_maps_cache_->Get() == nullptr) {
+ lock_maps_cache_->Reset(new LockMaps());
+ }
+
+ auto lock_maps_cache = static_cast<LockMaps*>(lock_maps_cache_->Get());
+
+ auto lock_map_iter = lock_maps_cache->find(column_family_id);
+ if (lock_map_iter != lock_maps_cache->end()) {
+ // Found lock map for this column family.
+ return lock_map_iter->second;
+ }
+
+ // Not found in local cache, grab mutex and check shared LockMaps
+ InstrumentedMutexLock l(&lock_map_mutex_);
+
+ lock_map_iter = lock_maps_.find(column_family_id);
+ if (lock_map_iter == lock_maps_.end()) {
+ return std::shared_ptr<LockMap>(nullptr);
+ } else {
+ // Found lock map. Store in thread-local cache and return.
+ std::shared_ptr<LockMap>& lock_map = lock_map_iter->second;
+ lock_maps_cache->insert({column_family_id, lock_map});
+
+ return lock_map;
+ }
+}
+
+// Returns true if this lock has expired and can be acquired by another
+// transaction.
+// If false, sets *expire_time to the expiration time of the lock according
+// to Env::NowMicros(), or 0 if there is no expiration.
+bool PointLockManager::IsLockExpired(TransactionID txn_id,
+ const LockInfo& lock_info, Env* env,
+ uint64_t* expire_time) {
+ if (lock_info.expiration_time == 0) {
+ *expire_time = 0;
+ return false;
+ }
+
+ auto now = env->NowMicros();
+ bool expired = lock_info.expiration_time <= now;
+ if (!expired) {
+    // return the absolute time (in microseconds) at which the lock expires
+ *expire_time = lock_info.expiration_time;
+ } else {
+ for (auto id : lock_info.txn_ids) {
+ if (txn_id == id) {
+ continue;
+ }
+
+ bool success = txn_db_impl_->TryStealingExpiredTransactionLocks(id);
+ if (!success) {
+ expired = false;
+ *expire_time = 0;
+ break;
+ }
+ }
+ }
+
+ return expired;
+}
+
+Status PointLockManager::TryLock(PessimisticTransaction* txn,
+ ColumnFamilyId column_family_id,
+ const std::string& key, Env* env,
+ bool exclusive) {
+ // Lookup lock map for this column family id
+ std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(column_family_id);
+ LockMap* lock_map = lock_map_ptr.get();
+ if (lock_map == nullptr) {
+ char msg[255];
+ snprintf(msg, sizeof(msg), "Column family id not found: %" PRIu32,
+ column_family_id);
+
+ return Status::InvalidArgument(msg);
+ }
+
+ // Need to lock the mutex for the stripe that this key hashes to
+ size_t stripe_num = lock_map->GetStripe(key);
+ assert(lock_map->lock_map_stripes_.size() > stripe_num);
+ LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num);
+
+ LockInfo lock_info(txn->GetID(), txn->GetExpirationTime(), exclusive);
+ int64_t timeout = txn->GetLockTimeout();
+
+ return AcquireWithTimeout(txn, lock_map, stripe, column_family_id, key, env,
+ timeout, lock_info);
+}
+
+// Helper function for TryLock().
+Status PointLockManager::AcquireWithTimeout(
+ PessimisticTransaction* txn, LockMap* lock_map, LockMapStripe* stripe,
+ ColumnFamilyId column_family_id, const std::string& key, Env* env,
+ int64_t timeout, const LockInfo& lock_info) {
+ Status result;
+ uint64_t end_time = 0;
+
+ if (timeout > 0) {
+ uint64_t start_time = env->NowMicros();
+ end_time = start_time + timeout;
+ }
+
+ if (timeout < 0) {
+ // If timeout is negative, we wait indefinitely to acquire the lock
+ result = stripe->stripe_mutex->Lock();
+ } else {
+ result = stripe->stripe_mutex->TryLockFor(timeout);
+ }
+
+ if (!result.ok()) {
+ // failed to acquire mutex
+ return result;
+ }
+
+ // Acquire lock if we are able to
+ uint64_t expire_time_hint = 0;
+ autovector<TransactionID> wait_ids;
+ result = AcquireLocked(lock_map, stripe, key, env, lock_info,
+ &expire_time_hint, &wait_ids);
+
+ if (!result.ok() && timeout != 0) {
+ PERF_TIMER_GUARD(key_lock_wait_time);
+ PERF_COUNTER_ADD(key_lock_wait_count, 1);
+ // If we weren't able to acquire the lock, we will keep retrying as long
+ // as the timeout allows.
+ bool timed_out = false;
+ do {
+ // Decide how long to wait
+ int64_t cv_end_time = -1;
+ if (expire_time_hint > 0 && end_time > 0) {
+ cv_end_time = std::min(expire_time_hint, end_time);
+ } else if (expire_time_hint > 0) {
+ cv_end_time = expire_time_hint;
+ } else if (end_time > 0) {
+ cv_end_time = end_time;
+ }
+
+ assert(result.IsBusy() || wait_ids.size() != 0);
+
+ // We are dependent on a transaction to finish, so perform deadlock
+ // detection.
+ if (wait_ids.size() != 0) {
+ if (txn->IsDeadlockDetect()) {
+ if (IncrementWaiters(txn, wait_ids, key, column_family_id,
+ lock_info.exclusive, env)) {
+ result = Status::Busy(Status::SubCode::kDeadlock);
+ stripe->stripe_mutex->UnLock();
+ return result;
+ }
+ }
+ txn->SetWaitingTxn(wait_ids, column_family_id, &key);
+ }
+
+ TEST_SYNC_POINT("PointLockManager::AcquireWithTimeout:WaitingTxn");
+ if (cv_end_time < 0) {
+ // Wait indefinitely
+ result = stripe->stripe_cv->Wait(stripe->stripe_mutex);
+ } else {
+ uint64_t now = env->NowMicros();
+ if (static_cast<uint64_t>(cv_end_time) > now) {
+ result = stripe->stripe_cv->WaitFor(stripe->stripe_mutex,
+ cv_end_time - now);
+ }
+ }
+
+ if (wait_ids.size() != 0) {
+ txn->ClearWaitingTxn();
+ if (txn->IsDeadlockDetect()) {
+ DecrementWaiters(txn, wait_ids);
+ }
+ }
+
+ if (result.IsTimedOut()) {
+ timed_out = true;
+ // Even though we timed out, we will still make one more attempt to
+ // acquire lock below (it is possible the lock expired and we
+ // were never signaled).
+ }
+
+ if (result.ok() || result.IsTimedOut()) {
+ result = AcquireLocked(lock_map, stripe, key, env, lock_info,
+ &expire_time_hint, &wait_ids);
+ }
+ } while (!result.ok() && !timed_out);
+ }
+
+ stripe->stripe_mutex->UnLock();
+
+ return result;
+}
+
+void PointLockManager::DecrementWaiters(
+ const PessimisticTransaction* txn,
+ const autovector<TransactionID>& wait_ids) {
+ std::lock_guard<std::mutex> lock(wait_txn_map_mutex_);
+ DecrementWaitersImpl(txn, wait_ids);
+}
+
+void PointLockManager::DecrementWaitersImpl(
+ const PessimisticTransaction* txn,
+ const autovector<TransactionID>& wait_ids) {
+ auto id = txn->GetID();
+ assert(wait_txn_map_.Contains(id));
+ wait_txn_map_.Delete(id);
+
+ for (auto wait_id : wait_ids) {
+ rev_wait_txn_map_.Get(wait_id)--;
+ if (rev_wait_txn_map_.Get(wait_id) == 0) {
+ rev_wait_txn_map_.Delete(wait_id);
+ }
+ }
+}
+
+bool PointLockManager::IncrementWaiters(
+ const PessimisticTransaction* txn,
+ const autovector<TransactionID>& wait_ids, const std::string& key,
+ const uint32_t& cf_id, const bool& exclusive, Env* const env) {
+ auto id = txn->GetID();
+ std::vector<int> queue_parents(
+ static_cast<size_t>(txn->GetDeadlockDetectDepth()));
+ std::vector<TransactionID> queue_values(
+ static_cast<size_t>(txn->GetDeadlockDetectDepth()));
+ std::lock_guard<std::mutex> lock(wait_txn_map_mutex_);
+ assert(!wait_txn_map_.Contains(id));
+
+ wait_txn_map_.Insert(id, {wait_ids, cf_id, exclusive, key});
+
+ for (auto wait_id : wait_ids) {
+ if (rev_wait_txn_map_.Contains(wait_id)) {
+ rev_wait_txn_map_.Get(wait_id)++;
+ } else {
+ rev_wait_txn_map_.Insert(wait_id, 1);
+ }
+ }
+
+ // No deadlock if nobody is waiting on self.
+ if (!rev_wait_txn_map_.Contains(id)) {
+ return false;
+ }
+
+ const auto* next_ids = &wait_ids;
+ int parent = -1;
+ int64_t deadlock_time = 0;
+ for (int tail = 0, head = 0; head < txn->GetDeadlockDetectDepth(); head++) {
+ int i = 0;
+ if (next_ids) {
+ for (; i < static_cast<int>(next_ids->size()) &&
+ tail + i < txn->GetDeadlockDetectDepth();
+ i++) {
+ queue_values[tail + i] = (*next_ids)[i];
+ queue_parents[tail + i] = parent;
+ }
+ tail += i;
+ }
+
+ // No more items in the list, meaning no deadlock.
+ if (tail == head) {
+ return false;
+ }
+
+ auto next = queue_values[head];
+ if (next == id) {
+ std::vector<DeadlockInfo> path;
+ while (head != -1) {
+ assert(wait_txn_map_.Contains(queue_values[head]));
+
+ auto extracted_info = wait_txn_map_.Get(queue_values[head]);
+ path.push_back({queue_values[head], extracted_info.m_cf_id,
+ extracted_info.m_exclusive,
+ extracted_info.m_waiting_key});
+ head = queue_parents[head];
+ }
+ if (!env->GetCurrentTime(&deadlock_time).ok()) {
+ /*
+ TODO(AR) this preserves the current behaviour whilst checking the
+ status of env->GetCurrentTime to ensure that ASSERT_STATUS_CHECKED
+ passes. Should we instead raise an error if !ok() ?
+ */
+ deadlock_time = 0;
+ }
+ std::reverse(path.begin(), path.end());
+ dlock_buffer_.AddNewPath(DeadlockPath(path, deadlock_time));
+ deadlock_time = 0;
+ DecrementWaitersImpl(txn, wait_ids);
+ return true;
+ } else if (!wait_txn_map_.Contains(next)) {
+ next_ids = nullptr;
+ continue;
+ } else {
+ parent = head;
+ next_ids = &(wait_txn_map_.Get(next).m_neighbors);
+ }
+ }
+
+ // Wait cycle too big, just assume deadlock.
+ if (!env->GetCurrentTime(&deadlock_time).ok()) {
+ /*
+ TODO(AR) this preserves the current behaviour whilst checking the status
+ of env->GetCurrentTime to ensure that ASSERT_STATUS_CHECKED passes.
+ Should we instead raise an error if !ok() ?
+ */
+ deadlock_time = 0;
+ }
+ dlock_buffer_.AddNewPath(DeadlockPath(deadlock_time, true));
+ DecrementWaitersImpl(txn, wait_ids);
+ return true;
+}
+
+// Try to lock this key after we have acquired the mutex.
+// Sets *expire_time to the expiration time in microseconds
+// or 0 if no expiration.
+// REQUIRED: Stripe mutex must be held.
+Status PointLockManager::AcquireLocked(LockMap* lock_map, LockMapStripe* stripe,
+ const std::string& key, Env* env,
+ const LockInfo& txn_lock_info,
+ uint64_t* expire_time,
+ autovector<TransactionID>* txn_ids) {
+ assert(txn_lock_info.txn_ids.size() == 1);
+
+ Status result;
+ // Check if this key is already locked
+ auto stripe_iter = stripe->keys.find(key);
+ if (stripe_iter != stripe->keys.end()) {
+ // Lock already held
+ LockInfo& lock_info = stripe_iter->second;
+ assert(lock_info.txn_ids.size() == 1 || !lock_info.exclusive);
+
+ if (lock_info.exclusive || txn_lock_info.exclusive) {
+ if (lock_info.txn_ids.size() == 1 &&
+ lock_info.txn_ids[0] == txn_lock_info.txn_ids[0]) {
+ // The list contains one txn and we're it, so just take it.
+ lock_info.exclusive = txn_lock_info.exclusive;
+ lock_info.expiration_time = txn_lock_info.expiration_time;
+ } else {
+ // Check if it's expired. Skips over txn_lock_info.txn_ids[0] in case
+ // it's there for a shared lock with multiple holders which was not
+ // caught in the first case.
+ if (IsLockExpired(txn_lock_info.txn_ids[0], lock_info, env,
+ expire_time)) {
+ // lock is expired, can steal it
+ lock_info.txn_ids = txn_lock_info.txn_ids;
+ lock_info.exclusive = txn_lock_info.exclusive;
+ lock_info.expiration_time = txn_lock_info.expiration_time;
+ // lock_cnt does not change
+ } else {
+ result = Status::TimedOut(Status::SubCode::kLockTimeout);
+ *txn_ids = lock_info.txn_ids;
+ }
+ }
+ } else {
+ // We are requesting shared access to a shared lock, so just grant it.
+ lock_info.txn_ids.push_back(txn_lock_info.txn_ids[0]);
+ // Using std::max means that expiration time never goes down even when
+ // a transaction is removed from the list. The correct solution would be
+ // to track expiry for every transaction, but this would also work for
+ // now.
+ lock_info.expiration_time =
+ std::max(lock_info.expiration_time, txn_lock_info.expiration_time);
+ }
+ } else { // Lock not held.
+ // Check lock limit
+ if (max_num_locks_ > 0 &&
+ lock_map->lock_cnt.load(std::memory_order_acquire) >= max_num_locks_) {
+ result = Status::Busy(Status::SubCode::kLockLimit);
+ } else {
+ // acquire lock
+ stripe->keys.emplace(key, txn_lock_info);
+
+ // Maintain lock count if there is a limit on the number of locks
+ if (max_num_locks_) {
+ lock_map->lock_cnt++;
+ }
+ }
+ }
+
+ return result;
+}
+
+void PointLockManager::UnLockKey(PessimisticTransaction* txn,
+ const std::string& key, LockMapStripe* stripe,
+ LockMap* lock_map, Env* env) {
+#ifdef NDEBUG
+ (void)env;
+#endif
+ TransactionID txn_id = txn->GetID();
+
+ auto stripe_iter = stripe->keys.find(key);
+ if (stripe_iter != stripe->keys.end()) {
+ auto& txns = stripe_iter->second.txn_ids;
+ auto txn_it = std::find(txns.begin(), txns.end(), txn_id);
+ // Found the key we locked. unlock it.
+ if (txn_it != txns.end()) {
+ if (txns.size() == 1) {
+ stripe->keys.erase(stripe_iter);
+ } else {
+ auto last_it = txns.end() - 1;
+ if (txn_it != last_it) {
+ *txn_it = *last_it;
+ }
+ txns.pop_back();
+ }
+
+ if (max_num_locks_ > 0) {
+ // Maintain lock count if there is a limit on the number of locks.
+ assert(lock_map->lock_cnt.load(std::memory_order_relaxed) > 0);
+ lock_map->lock_cnt--;
+ }
+ }
+ } else {
+ // This key is either not locked or locked by someone else. This should
+ // only happen if the unlocking transaction has expired.
+ assert(txn->GetExpirationTime() > 0 &&
+ txn->GetExpirationTime() < env->NowMicros());
+ }
+}
+
+void PointLockManager::UnLock(PessimisticTransaction* txn,
+ ColumnFamilyId column_family_id,
+ const std::string& key, Env* env) {
+ std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(column_family_id);
+ LockMap* lock_map = lock_map_ptr.get();
+ if (lock_map == nullptr) {
+ // Column Family must have been dropped.
+ return;
+ }
+
+ // Lock the mutex for the stripe that this key hashes to
+ size_t stripe_num = lock_map->GetStripe(key);
+ assert(lock_map->lock_map_stripes_.size() > stripe_num);
+ LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num);
+
+ stripe->stripe_mutex->Lock().PermitUncheckedError();
+ UnLockKey(txn, key, stripe, lock_map, env);
+ stripe->stripe_mutex->UnLock();
+
+ // Signal waiting threads to retry locking
+ stripe->stripe_cv->NotifyAll();
+}
+
+void PointLockManager::UnLock(PessimisticTransaction* txn,
+ const LockTracker& tracker, Env* env) {
+ std::unique_ptr<LockTracker::ColumnFamilyIterator> cf_it(
+ tracker.GetColumnFamilyIterator());
+ assert(cf_it != nullptr);
+ while (cf_it->HasNext()) {
+ ColumnFamilyId cf = cf_it->Next();
+ std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(cf);
+ LockMap* lock_map = lock_map_ptr.get();
+ if (!lock_map) {
+ // Column Family must have been dropped.
+ return;
+ }
+
+ // Bucket keys by lock_map_ stripe
+ UnorderedMap<size_t, std::vector<const std::string*>> keys_by_stripe(
+ lock_map->num_stripes_);
+ std::unique_ptr<LockTracker::KeyIterator> key_it(
+ tracker.GetKeyIterator(cf));
+ assert(key_it != nullptr);
+ while (key_it->HasNext()) {
+ const std::string& key = key_it->Next();
+ size_t stripe_num = lock_map->GetStripe(key);
+ keys_by_stripe[stripe_num].push_back(&key);
+ }
+
+ // For each stripe, grab the stripe mutex and unlock all keys in this stripe
+ for (auto& stripe_iter : keys_by_stripe) {
+ size_t stripe_num = stripe_iter.first;
+ auto& stripe_keys = stripe_iter.second;
+
+ assert(lock_map->lock_map_stripes_.size() > stripe_num);
+ LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num);
+
+ stripe->stripe_mutex->Lock().PermitUncheckedError();
+
+ for (const std::string* key : stripe_keys) {
+ UnLockKey(txn, *key, stripe, lock_map, env);
+ }
+
+ stripe->stripe_mutex->UnLock();
+
+ // Signal waiting threads to retry locking
+ stripe->stripe_cv->NotifyAll();
+ }
+ }
+}
+
+PointLockManager::PointLockStatus PointLockManager::GetPointLockStatus() {
+ PointLockStatus data;
+ // Lock order here is important. The correct order is lock_map_mutex_, then
+ // for every column family ID in ascending order lock every stripe in
+ // ascending order.
+ InstrumentedMutexLock l(&lock_map_mutex_);
+
+ std::vector<uint32_t> cf_ids;
+ for (const auto& map : lock_maps_) {
+ cf_ids.push_back(map.first);
+ }
+ std::sort(cf_ids.begin(), cf_ids.end());
+
+ for (auto i : cf_ids) {
+ const auto& stripes = lock_maps_[i]->lock_map_stripes_;
+ // Iterate and lock all stripes in ascending order.
+ for (const auto& j : stripes) {
+ j->stripe_mutex->Lock().PermitUncheckedError();
+ for (const auto& it : j->keys) {
+ struct KeyLockInfo info;
+ info.exclusive = it.second.exclusive;
+ info.key = it.first;
+ for (const auto& id : it.second.txn_ids) {
+ info.ids.push_back(id);
+ }
+ data.insert({i, info});
+ }
+ }
+ }
+
+ // Unlock everything. Unlocking order is not important.
+ for (auto i : cf_ids) {
+ const auto& stripes = lock_maps_[i]->lock_map_stripes_;
+ for (const auto& j : stripes) {
+ j->stripe_mutex->UnLock();
+ }
+ }
+
+ return data;
+}
+
+std::vector<DeadlockPath> PointLockManager::GetDeadlockInfoBuffer() {
+ return dlock_buffer_.PrepareBuffer();
+}
+
+void PointLockManager::Resize(uint32_t target_size) {
+ dlock_buffer_.Resize(target_size);
+}
+
+PointLockManager::RangeLockStatus PointLockManager::GetRangeLockStatus() {
+ return {};
+}
+
+Status PointLockManager::TryLock(PessimisticTransaction* /* txn */,
+ ColumnFamilyId /* cf_id */,
+ const Endpoint& /* start */,
+ const Endpoint& /* end */, Env* /* env */,
+ bool /* exclusive */) {
+ return Status::NotSupported(
+ "PointLockManager does not support range locking");
+}
+
+void PointLockManager::UnLock(PessimisticTransaction* /* txn */,
+ ColumnFamilyId /* cf_id */,
+ const Endpoint& /* start */,
+ const Endpoint& /* end */, Env* /* env */) {
+ // no-op
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
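
The LockMap/LockMapStripe layout above exists so that transactions contending on unrelated keys usually contend on different mutexes. The following standalone sketch shows only that striping idea, with std::hash standing in for GetSliceNPHash64/FastRange64 and a plain key-to-owner map instead of LockInfo; it assumes num_stripes > 0.

#include <cstdint>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>

struct Stripe {
  std::mutex mu;
  std::unordered_map<std::string, uint64_t> locked_keys;  // key -> owning txn
};

class StripedKeyLocks {
 public:
  explicit StripedKeyLocks(size_t num_stripes) : stripes_(num_stripes) {}

  // Same role as LockMap::GetStripe(): map a key to one of the stripes.
  size_t GetStripe(const std::string& key) const {
    return std::hash<std::string>{}(key) % stripes_.size();
  }

  // Returns false if some other transaction already holds the key.
  bool TryLock(const std::string& key, uint64_t txn_id) {
    Stripe& s = stripes_[GetStripe(key)];
    std::lock_guard<std::mutex> guard(s.mu);
    return s.locked_keys.emplace(key, txn_id).second;
  }

  void UnLock(const std::string& key) {
    Stripe& s = stripes_[GetStripe(key)];
    std::lock_guard<std::mutex> guard(s.mu);
    s.locked_keys.erase(key);
  }

 private:
  std::vector<Stripe> stripes_;  // one mutex-protected bucket per stripe
};
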
diff --git a/src/rocksdb/utilities/transactions/lock/point/point_lock_manager.h b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager.h
new file mode 100644
index 000000000..eeb34f3be
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager.h
@@ -0,0 +1,224 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "monitoring/instrumented_mutex.h"
+#include "rocksdb/utilities/transaction.h"
+#include "util/autovector.h"
+#include "util/hash_containers.h"
+#include "util/hash_map.h"
+#include "util/thread_local.h"
+#include "utilities/transactions/lock/lock_manager.h"
+#include "utilities/transactions/lock/point/point_lock_tracker.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyHandle;
+struct LockInfo;
+struct LockMap;
+struct LockMapStripe;
+
+template <class Path>
+class DeadlockInfoBufferTempl {
+ private:
+ std::vector<Path> paths_buffer_;
+ uint32_t buffer_idx_;
+ std::mutex paths_buffer_mutex_;
+
+ std::vector<Path> Normalize() {
+ auto working = paths_buffer_;
+
+ if (working.empty()) {
+ return working;
+ }
+
+ // Next write occurs at a nonexistent path's slot
+ if (paths_buffer_[buffer_idx_].empty()) {
+ working.resize(buffer_idx_);
+ } else {
+ std::rotate(working.begin(), working.begin() + buffer_idx_,
+ working.end());
+ }
+
+ return working;
+ }
+
+ public:
+ explicit DeadlockInfoBufferTempl(uint32_t n_latest_dlocks)
+ : paths_buffer_(n_latest_dlocks), buffer_idx_(0) {}
+
+ void AddNewPath(Path path) {
+ std::lock_guard<std::mutex> lock(paths_buffer_mutex_);
+
+ if (paths_buffer_.empty()) {
+ return;
+ }
+
+ paths_buffer_[buffer_idx_] = std::move(path);
+ buffer_idx_ = (buffer_idx_ + 1) % paths_buffer_.size();
+ }
+
+ void Resize(uint32_t target_size) {
+ std::lock_guard<std::mutex> lock(paths_buffer_mutex_);
+
+ paths_buffer_ = Normalize();
+
+    // Drop the deadlocks that will no longer be needed after the normalize
+ if (target_size < paths_buffer_.size()) {
+ paths_buffer_.erase(
+ paths_buffer_.begin(),
+ paths_buffer_.begin() + (paths_buffer_.size() - target_size));
+ buffer_idx_ = 0;
+ }
+ // Resize the buffer to the target size and restore the buffer's idx
+ else {
+ auto prev_size = paths_buffer_.size();
+ paths_buffer_.resize(target_size);
+ buffer_idx_ = (uint32_t)prev_size;
+ }
+ }
+
+ std::vector<Path> PrepareBuffer() {
+ std::lock_guard<std::mutex> lock(paths_buffer_mutex_);
+
+ // Reversing the normalized vector returns the latest deadlocks first
+ auto working = Normalize();
+ std::reverse(working.begin(), working.end());
+
+ return working;
+ }
+};
+
+using DeadlockInfoBuffer = DeadlockInfoBufferTempl<DeadlockPath>;
+
+struct TrackedTrxInfo {
+ autovector<TransactionID> m_neighbors;
+ uint32_t m_cf_id;
+ bool m_exclusive;
+ std::string m_waiting_key;
+};
+
+class PointLockManager : public LockManager {
+ public:
+ PointLockManager(PessimisticTransactionDB* db,
+ const TransactionDBOptions& opt);
+ // No copying allowed
+ PointLockManager(const PointLockManager&) = delete;
+ PointLockManager& operator=(const PointLockManager&) = delete;
+
+ ~PointLockManager() override {}
+
+ bool IsPointLockSupported() const override { return true; }
+
+ bool IsRangeLockSupported() const override { return false; }
+
+ const LockTrackerFactory& GetLockTrackerFactory() const override {
+ return PointLockTrackerFactory::Get();
+ }
+
+ // Creates a new LockMap for this column family. Caller should guarantee
+ // that this column family does not already exist.
+ void AddColumnFamily(const ColumnFamilyHandle* cf) override;
+ // Deletes the LockMap for this column family. Caller should guarantee that
+ // this column family is no longer in use.
+ void RemoveColumnFamily(const ColumnFamilyHandle* cf) override;
+
+ Status TryLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
+ const std::string& key, Env* env, bool exclusive) override;
+ Status TryLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
+ const Endpoint& start, const Endpoint& end, Env* env,
+ bool exclusive) override;
+
+ void UnLock(PessimisticTransaction* txn, const LockTracker& tracker,
+ Env* env) override;
+ void UnLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
+ const std::string& key, Env* env) override;
+ void UnLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
+ const Endpoint& start, const Endpoint& end, Env* env) override;
+
+ PointLockStatus GetPointLockStatus() override;
+
+ RangeLockStatus GetRangeLockStatus() override;
+
+ std::vector<DeadlockPath> GetDeadlockInfoBuffer() override;
+
+ void Resize(uint32_t new_size) override;
+
+ private:
+ PessimisticTransactionDB* txn_db_impl_;
+
+ // Default number of lock map stripes per column family
+ const size_t default_num_stripes_;
+
+ // Limit on number of keys locked per column family
+ const int64_t max_num_locks_;
+
+ // The following lock order must be satisfied in order to avoid deadlocking
+ // ourselves.
+ // - lock_map_mutex_
+ // - stripe mutexes in ascending cf id, ascending stripe order
+ // - wait_txn_map_mutex_
+ //
+ // Must be held when accessing/modifying lock_maps_.
+ InstrumentedMutex lock_map_mutex_;
+
+ // Map of ColumnFamilyId to locked key info
+ using LockMaps = UnorderedMap<uint32_t, std::shared_ptr<LockMap>>;
+ LockMaps lock_maps_;
+
+ // Thread-local cache of entries in lock_maps_. This is an optimization
+ // to avoid acquiring a mutex in order to look up a LockMap
+ std::unique_ptr<ThreadLocalPtr> lock_maps_cache_;
+
+ // Must be held when modifying wait_txn_map_ and rev_wait_txn_map_.
+ std::mutex wait_txn_map_mutex_;
+
+ // Maps from waitee -> number of waiters.
+ HashMap<TransactionID, int> rev_wait_txn_map_;
+ // Maps from waiter -> waitee.
+ HashMap<TransactionID, TrackedTrxInfo> wait_txn_map_;
+ DeadlockInfoBuffer dlock_buffer_;
+
+ // Used to allocate mutexes/condvars to use when locking keys
+ std::shared_ptr<TransactionDBMutexFactory> mutex_factory_;
+
+ bool IsLockExpired(TransactionID txn_id, const LockInfo& lock_info, Env* env,
+ uint64_t* wait_time);
+
+ std::shared_ptr<LockMap> GetLockMap(uint32_t column_family_id);
+
+ Status AcquireWithTimeout(PessimisticTransaction* txn, LockMap* lock_map,
+ LockMapStripe* stripe, uint32_t column_family_id,
+ const std::string& key, Env* env, int64_t timeout,
+ const LockInfo& lock_info);
+
+ Status AcquireLocked(LockMap* lock_map, LockMapStripe* stripe,
+ const std::string& key, Env* env,
+ const LockInfo& lock_info, uint64_t* wait_time,
+ autovector<TransactionID>* txn_ids);
+
+ void UnLockKey(PessimisticTransaction* txn, const std::string& key,
+ LockMapStripe* stripe, LockMap* lock_map, Env* env);
+
+ bool IncrementWaiters(const PessimisticTransaction* txn,
+ const autovector<TransactionID>& wait_ids,
+ const std::string& key, const uint32_t& cf_id,
+ const bool& exclusive, Env* const env);
+ void DecrementWaiters(const PessimisticTransaction* txn,
+ const autovector<TransactionID>& wait_ids);
+ void DecrementWaitersImpl(const PessimisticTransaction* txn,
+ const autovector<TransactionID>& wait_ids);
+};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
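
DeadlockInfoBufferTempl above is a fixed-capacity ring buffer: AddNewPath() overwrites the oldest slot, PrepareBuffer() returns the recorded paths newest-first, and Resize() keeps the most recent entries when shrinking. A small sketch of that behavior, with made-up deadlock contents:

#include <vector>

#include "rocksdb/utilities/transaction_db.h"
#include "utilities/transactions/lock/point/point_lock_manager.h"

using namespace ROCKSDB_NAMESPACE;

void DeadlockBufferSketch() {
  DeadlockInfoBuffer buffer(/*n_latest_dlocks=*/2);  // keep the 2 newest paths

  for (int64_t t = 1; t <= 3; t++) {
    std::vector<DeadlockInfo> cycle;
    cycle.push_back({/*txn_id=*/static_cast<TransactionID>(t), /*cf_id=*/1,
                     /*exclusive=*/true, /*waiting_key=*/"k"});
    buffer.AddNewPath(DeadlockPath(cycle, /*deadlock_time=*/t));
  }

  // Only the two most recent paths survive, newest first:
  // latest[0] was recorded at t=3, latest[1] at t=2.
  std::vector<DeadlockPath> latest = buffer.PrepareBuffer();
  (void)latest;

  // Shrinking also keeps the most recent path.
  buffer.Resize(1);
}
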
diff --git a/src/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc
new file mode 100644
index 000000000..525fdea71
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc
@@ -0,0 +1,181 @@
+// Copyright (c) 2020-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/lock/point/point_lock_manager_test.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This test is not applicable to the Range Lock Manager, because it operates
+// on Column Families, not on their ids.
+TEST_F(PointLockManagerTest, LockNonExistingColumnFamily) {
+ MockColumnFamilyHandle cf(1024);
+ locker_->RemoveColumnFamily(&cf);
+ auto txn = NewTxn();
+ auto s = locker_->TryLock(txn, 1024, "k", env_, true);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ ASSERT_STREQ(s.getState(), "Column family id not found: 1024");
+ delete txn;
+}
+
+TEST_F(PointLockManagerTest, LockStatus) {
+ MockColumnFamilyHandle cf1(1024), cf2(2048);
+ locker_->AddColumnFamily(&cf1);
+ locker_->AddColumnFamily(&cf2);
+
+ auto txn1 = NewTxn();
+ ASSERT_OK(locker_->TryLock(txn1, 1024, "k1", env_, true));
+ ASSERT_OK(locker_->TryLock(txn1, 2048, "k1", env_, true));
+
+ auto txn2 = NewTxn();
+ ASSERT_OK(locker_->TryLock(txn2, 1024, "k2", env_, false));
+ ASSERT_OK(locker_->TryLock(txn2, 2048, "k2", env_, false));
+
+ auto s = locker_->GetPointLockStatus();
+ ASSERT_EQ(s.size(), 4u);
+ for (uint32_t cf_id : {1024, 2048}) {
+ ASSERT_EQ(s.count(cf_id), 2u);
+ auto range = s.equal_range(cf_id);
+ for (auto it = range.first; it != range.second; it++) {
+ ASSERT_TRUE(it->second.key == "k1" || it->second.key == "k2");
+ if (it->second.key == "k1") {
+ ASSERT_EQ(it->second.exclusive, true);
+ ASSERT_EQ(it->second.ids.size(), 1u);
+ ASSERT_EQ(it->second.ids[0], txn1->GetID());
+ } else if (it->second.key == "k2") {
+ ASSERT_EQ(it->second.exclusive, false);
+ ASSERT_EQ(it->second.ids.size(), 1u);
+ ASSERT_EQ(it->second.ids[0], txn2->GetID());
+ }
+ }
+ }
+
+ // Cleanup
+ locker_->UnLock(txn1, 1024, "k1", env_);
+ locker_->UnLock(txn1, 2048, "k1", env_);
+ locker_->UnLock(txn2, 1024, "k2", env_);
+ locker_->UnLock(txn2, 2048, "k2", env_);
+
+ delete txn1;
+ delete txn2;
+}
+
+TEST_F(PointLockManagerTest, UnlockExclusive) {
+ MockColumnFamilyHandle cf(1);
+ locker_->AddColumnFamily(&cf);
+
+ auto txn1 = NewTxn();
+ ASSERT_OK(locker_->TryLock(txn1, 1, "k", env_, true));
+ locker_->UnLock(txn1, 1, "k", env_);
+
+ auto txn2 = NewTxn();
+ ASSERT_OK(locker_->TryLock(txn2, 1, "k", env_, true));
+
+ // Cleanup
+ locker_->UnLock(txn2, 1, "k", env_);
+
+ delete txn1;
+ delete txn2;
+}
+
+TEST_F(PointLockManagerTest, UnlockShared) {
+ MockColumnFamilyHandle cf(1);
+ locker_->AddColumnFamily(&cf);
+
+ auto txn1 = NewTxn();
+ ASSERT_OK(locker_->TryLock(txn1, 1, "k", env_, false));
+ locker_->UnLock(txn1, 1, "k", env_);
+
+ auto txn2 = NewTxn();
+ ASSERT_OK(locker_->TryLock(txn2, 1, "k", env_, true));
+
+ // Cleanup
+ locker_->UnLock(txn2, 1, "k", env_);
+
+ delete txn1;
+ delete txn2;
+}
+
+// This test doesn't work with Range Lock Manager, because Range Lock Manager
+// doesn't support deadlock_detect_depth.
+
+TEST_F(PointLockManagerTest, DeadlockDepthExceeded) {
+ // Tests that when detecting deadlock, if the detection depth is exceeded,
+ // it's also viewed as deadlock.
+ MockColumnFamilyHandle cf(1);
+ locker_->AddColumnFamily(&cf);
+ TransactionOptions txn_opt;
+ txn_opt.deadlock_detect = true;
+ txn_opt.deadlock_detect_depth = 1;
+ txn_opt.lock_timeout = 1000000;
+ auto txn1 = NewTxn(txn_opt);
+ auto txn2 = NewTxn(txn_opt);
+ auto txn3 = NewTxn(txn_opt);
+ auto txn4 = NewTxn(txn_opt);
+ // "a ->(k) b" means transaction a is waiting for transaction b to release
+ // the held lock on key k.
+ // txn4 ->(k3) -> txn3 ->(k2) txn2 ->(k1) txn1
+ // txn3's deadlock detection will exceed the detection depth 1,
+ // which will be viewed as a deadlock.
+ // NOTE:
+ // txn4 ->(k3) -> txn3 must be set up before
+ // txn3 ->(k2) -> txn2, because to trigger deadlock detection for txn3,
+ // it must have another txn waiting on it, which is txn4 in this case.
+ ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+
+ port::Thread t1 = BlockUntilWaitingTxn(wait_sync_point_name_, [&]() {
+ ASSERT_OK(locker_->TryLock(txn2, 1, "k2", env_, true));
+ // block because txn1 is holding a lock on k1.
+ locker_->TryLock(txn2, 1, "k1", env_, true);
+ });
+
+ ASSERT_OK(locker_->TryLock(txn3, 1, "k3", env_, true));
+
+ port::Thread t2 = BlockUntilWaitingTxn(wait_sync_point_name_, [&]() {
+    // block because txn3 is holding a lock on k3.
+ locker_->TryLock(txn4, 1, "k3", env_, true);
+ });
+
+ auto s = locker_->TryLock(txn3, 1, "k2", env_, true);
+ ASSERT_TRUE(s.IsBusy());
+ ASSERT_EQ(s.subcode(), Status::SubCode::kDeadlock);
+
+ std::vector<DeadlockPath> deadlock_paths = locker_->GetDeadlockInfoBuffer();
+ ASSERT_EQ(deadlock_paths.size(), 1u);
+ ASSERT_TRUE(deadlock_paths[0].limit_exceeded);
+
+ locker_->UnLock(txn1, 1, "k1", env_);
+ locker_->UnLock(txn3, 1, "k3", env_);
+ t1.join();
+ t2.join();
+
+ delete txn4;
+ delete txn3;
+ delete txn2;
+ delete txn1;
+}
+
+INSTANTIATE_TEST_CASE_P(PointLockManager, AnyLockManagerTest,
+ ::testing::Values(nullptr));
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED because Transactions are not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
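
The tests above pull deadlock details out of GetDeadlockInfoBuffer(); a small helper sketch for turning such a path into a log line follows, using only the DeadlockPath/DeadlockInfo fields exercised in this patch.

#include <sstream>
#include <string>

#include "rocksdb/utilities/transaction_db.h"

using namespace ROCKSDB_NAMESPACE;

std::string FormatDeadlockPath(const DeadlockPath& p) {
  std::ostringstream out;
  if (p.limit_exceeded) {
    // Detection depth was exceeded, so the cycle itself was not recorded.
    return "deadlock assumed (detection depth exceeded)";
  }
  for (const DeadlockInfo& info : p.path) {
    out << "txn " << info.m_txn_id << " waits for key '" << info.m_waiting_key
        << "' in cf " << info.m_cf_id
        << (info.m_exclusive ? " (exclusive); " : " (shared); ");
  }
  return out.str();
}
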
diff --git a/src/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h
new file mode 100644
index 000000000..ca9f46bf9
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h
@@ -0,0 +1,324 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "file/file_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "utilities/transactions/lock/point/point_lock_manager.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+#include "utilities/transactions/transaction_db_mutex_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MockColumnFamilyHandle : public ColumnFamilyHandle {
+ public:
+ explicit MockColumnFamilyHandle(ColumnFamilyId cf_id) : cf_id_(cf_id) {}
+
+ ~MockColumnFamilyHandle() override {}
+
+ const std::string& GetName() const override { return name_; }
+
+ ColumnFamilyId GetID() const override { return cf_id_; }
+
+ Status GetDescriptor(ColumnFamilyDescriptor*) override {
+ return Status::OK();
+ }
+
+ const Comparator* GetComparator() const override {
+ return BytewiseComparator();
+ }
+
+ private:
+ ColumnFamilyId cf_id_;
+ std::string name_ = "MockCF";
+};
+
+class PointLockManagerTest : public testing::Test {
+ public:
+ void SetUp() override {
+ env_ = Env::Default();
+ db_dir_ = test::PerThreadDBPath("point_lock_manager_test");
+ ASSERT_OK(env_->CreateDir(db_dir_));
+
+ Options opt;
+ opt.create_if_missing = true;
+ TransactionDBOptions txn_opt;
+ txn_opt.transaction_lock_timeout = 0;
+
+ ASSERT_OK(TransactionDB::Open(opt, txn_opt, db_dir_, &db_));
+
+    // CAUTION: This test creates a separate lock manager object (NOT the one
+    // that the TransactionDB is using!) and runs tests on it.
+ locker_.reset(new PointLockManager(
+ static_cast<PessimisticTransactionDB*>(db_), txn_opt));
+
+ wait_sync_point_name_ = "PointLockManager::AcquireWithTimeout:WaitingTxn";
+ }
+
+ void TearDown() override {
+ delete db_;
+ EXPECT_OK(DestroyDir(env_, db_dir_));
+ }
+
+ PessimisticTransaction* NewTxn(
+ TransactionOptions txn_opt = TransactionOptions()) {
+ Transaction* txn = db_->BeginTransaction(WriteOptions(), txn_opt);
+ return reinterpret_cast<PessimisticTransaction*>(txn);
+ }
+
+ protected:
+ Env* env_;
+ std::shared_ptr<LockManager> locker_;
+ const char* wait_sync_point_name_;
+ friend void PointLockManagerTestExternalSetup(PointLockManagerTest*);
+
+ private:
+ std::string db_dir_;
+ TransactionDB* db_;
+};
+
+using init_func_t = void (*)(PointLockManagerTest*);
+
+class AnyLockManagerTest : public PointLockManagerTest,
+ public testing::WithParamInterface<init_func_t> {
+ public:
+ void SetUp() override {
+ // If a custom setup function was provided, use it. Otherwise, use what we
+ // have inherited.
+ auto init_func = GetParam();
+ if (init_func)
+ (*init_func)(this);
+ else
+ PointLockManagerTest::SetUp();
+ }
+};
+
+TEST_P(AnyLockManagerTest, ReentrantExclusiveLock) {
+ // Tests that a txn can acquire exclusive lock on the same key repeatedly.
+ MockColumnFamilyHandle cf(1);
+ locker_->AddColumnFamily(&cf);
+ auto txn = NewTxn();
+ ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true));
+ ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true));
+
+ // Cleanup
+ locker_->UnLock(txn, 1, "k", env_);
+
+ delete txn;
+}
+
+TEST_P(AnyLockManagerTest, ReentrantSharedLock) {
+ // Tests that a txn can acquire shared lock on the same key repeatedly.
+ MockColumnFamilyHandle cf(1);
+ locker_->AddColumnFamily(&cf);
+ auto txn = NewTxn();
+ ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false));
+ ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false));
+
+ // Cleanup
+ locker_->UnLock(txn, 1, "k", env_);
+
+ delete txn;
+}
+
+TEST_P(AnyLockManagerTest, LockUpgrade) {
+ // Tests that a txn can upgrade from a shared lock to an exclusive lock.
+ MockColumnFamilyHandle cf(1);
+ locker_->AddColumnFamily(&cf);
+ auto txn = NewTxn();
+ ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false));
+ ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true));
+
+ // Cleanup
+ locker_->UnLock(txn, 1, "k", env_);
+ delete txn;
+}
+
+TEST_P(AnyLockManagerTest, LockDowngrade) {
+ // Tests that a txn can acquire a shared lock after acquiring an exclusive
+ // lock on the same key.
+ MockColumnFamilyHandle cf(1);
+ locker_->AddColumnFamily(&cf);
+ auto txn = NewTxn();
+ ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true));
+ ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false));
+
+ // Cleanup
+ locker_->UnLock(txn, 1, "k", env_);
+ delete txn;
+}
+
+TEST_P(AnyLockManagerTest, LockConflict) {
+ // Tests that lock conflicts lead to lock timeout.
+ MockColumnFamilyHandle cf(1);
+ locker_->AddColumnFamily(&cf);
+ auto txn1 = NewTxn();
+ auto txn2 = NewTxn();
+
+ {
+ // exclusive-exclusive conflict.
+ ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+ auto s = locker_->TryLock(txn2, 1, "k1", env_, true);
+ ASSERT_TRUE(s.IsTimedOut());
+ }
+
+ {
+ // exclusive-shared conflict.
+ ASSERT_OK(locker_->TryLock(txn1, 1, "k2", env_, true));
+ auto s = locker_->TryLock(txn2, 1, "k2", env_, false);
+ ASSERT_TRUE(s.IsTimedOut());
+ }
+
+ {
+ // shared-exclusive conflict.
+ ASSERT_OK(locker_->TryLock(txn1, 1, "k2", env_, false));
+ auto s = locker_->TryLock(txn2, 1, "k2", env_, true);
+ ASSERT_TRUE(s.IsTimedOut());
+ }
+
+ // Cleanup
+ locker_->UnLock(txn1, 1, "k1", env_);
+ locker_->UnLock(txn1, 1, "k2", env_);
+
+ delete txn1;
+ delete txn2;
+}
+
+port::Thread BlockUntilWaitingTxn(const char* sync_point_name,
+ std::function<void()> f) {
+ std::atomic<bool> reached(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ sync_point_name, [&](void* /*arg*/) { reached.store(true); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ port::Thread t(f);
+
+ while (!reached.load()) {
+ std::this_thread::sleep_for(std::chrono::milliseconds(100));
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ return t;
+}
+
+TEST_P(AnyLockManagerTest, SharedLocks) {
+ // Tests that shared locks can be concurrently held by multiple transactions.
+ MockColumnFamilyHandle cf(1);
+ locker_->AddColumnFamily(&cf);
+ auto txn1 = NewTxn();
+ auto txn2 = NewTxn();
+ ASSERT_OK(locker_->TryLock(txn1, 1, "k", env_, false));
+ ASSERT_OK(locker_->TryLock(txn2, 1, "k", env_, false));
+
+ // Cleanup
+ locker_->UnLock(txn1, 1, "k", env_);
+ locker_->UnLock(txn2, 1, "k", env_);
+
+ delete txn1;
+ delete txn2;
+}
+
+TEST_P(AnyLockManagerTest, Deadlock) {
+ // Tests that deadlock can be detected.
+ // Deadlock scenario:
+ // txn1 exclusively locks k1, and wants to lock k2;
+ // txn2 exclusively locks k2, and wants to lock k1.
+ MockColumnFamilyHandle cf(1);
+ locker_->AddColumnFamily(&cf);
+ TransactionOptions txn_opt;
+ txn_opt.deadlock_detect = true;
+ txn_opt.lock_timeout = 1000000;
+ auto txn1 = NewTxn(txn_opt);
+ auto txn2 = NewTxn(txn_opt);
+
+ ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+ ASSERT_OK(locker_->TryLock(txn2, 1, "k2", env_, true));
+
+ // txn1 tries to lock k2, will block forever.
+ port::Thread t = BlockUntilWaitingTxn(wait_sync_point_name_, [&]() {
+ // block because txn2 is holding a lock on k2.
+ locker_->TryLock(txn1, 1, "k2", env_, true);
+ });
+
+ auto s = locker_->TryLock(txn2, 1, "k1", env_, true);
+ ASSERT_TRUE(s.IsBusy());
+ ASSERT_EQ(s.subcode(), Status::SubCode::kDeadlock);
+
+ std::vector<DeadlockPath> deadlock_paths = locker_->GetDeadlockInfoBuffer();
+ ASSERT_EQ(deadlock_paths.size(), 1u);
+ ASSERT_FALSE(deadlock_paths[0].limit_exceeded);
+
+ std::vector<DeadlockInfo> deadlocks = deadlock_paths[0].path;
+ ASSERT_EQ(deadlocks.size(), 2u);
+
+ ASSERT_EQ(deadlocks[0].m_txn_id, txn1->GetID());
+ ASSERT_EQ(deadlocks[0].m_cf_id, 1u);
+ ASSERT_TRUE(deadlocks[0].m_exclusive);
+ ASSERT_EQ(deadlocks[0].m_waiting_key, "k2");
+
+ ASSERT_EQ(deadlocks[1].m_txn_id, txn2->GetID());
+ ASSERT_EQ(deadlocks[1].m_cf_id, 1u);
+ ASSERT_TRUE(deadlocks[1].m_exclusive);
+ ASSERT_EQ(deadlocks[1].m_waiting_key, "k1");
+
+ locker_->UnLock(txn2, 1, "k2", env_);
+ t.join();
+
+ // Cleanup
+ locker_->UnLock(txn1, 1, "k1", env_);
+ locker_->UnLock(txn1, 1, "k2", env_);
+ delete txn2;
+ delete txn1;
+}
+
+TEST_P(AnyLockManagerTest, GetWaitingTxns_MultipleTxns) {
+ MockColumnFamilyHandle cf(1);
+ locker_->AddColumnFamily(&cf);
+
+ auto txn1 = NewTxn();
+ ASSERT_OK(locker_->TryLock(txn1, 1, "k", env_, false));
+
+ auto txn2 = NewTxn();
+ ASSERT_OK(locker_->TryLock(txn2, 1, "k", env_, false));
+
+ auto txn3 = NewTxn();
+ txn3->SetLockTimeout(10000);
+ port::Thread t1 = BlockUntilWaitingTxn(wait_sync_point_name_, [&]() {
+ ASSERT_OK(locker_->TryLock(txn3, 1, "k", env_, true));
+ locker_->UnLock(txn3, 1, "k", env_);
+ });
+
+  // Ok, now txn3 is waiting for the lock on "k", which is owned by two
+  // transactions. Check that GetWaitingTxns reports this correctly.
+ uint32_t wait_cf_id;
+ std::string wait_key;
+ auto waiters = txn3->GetWaitingTxns(&wait_cf_id, &wait_key);
+
+ ASSERT_EQ(wait_cf_id, 1u);
+ ASSERT_EQ(wait_key, "k");
+ ASSERT_EQ(waiters.size(), 2);
+ bool waits_correct =
+ (waiters[0] == txn1->GetID() && waiters[1] == txn2->GetID()) ||
+ (waiters[1] == txn1->GetID() && waiters[0] == txn2->GetID());
+ ASSERT_EQ(waits_correct, true);
+
+ // Release locks so txn3 can proceed with execution
+ locker_->UnLock(txn1, 1, "k", env_);
+ locker_->UnLock(txn2, 1, "k", env_);
+
+ // Wait until txn3 finishes
+ t1.join();
+
+ delete txn1;
+ delete txn2;
+ delete txn3;
+}
+
+} // namespace ROCKSDB_NAMESPACE
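
The friend declaration and the init_func_t parameter above are the hooks that let another LockManager implementation reuse AnyLockManagerTest. A sketch of such an instantiation follows; it merely mirrors the default SetUp() with PointLockManager as a stand-in, whereas a real alternative suite would construct its own lock manager, and the "AlternativeLockManager" prefix and database path are illustrative.

#include "utilities/transactions/lock/point/point_lock_manager_test.h"

namespace ROCKSDB_NAMESPACE {

void PointLockManagerTestExternalSetup(PointLockManagerTest* self) {
  self->env_ = Env::Default();
  self->db_dir_ = test::PerThreadDBPath("alternative_lock_manager_test");
  ASSERT_OK(self->env_->CreateDir(self->db_dir_));

  Options opt;
  opt.create_if_missing = true;
  TransactionDBOptions txn_opt;
  txn_opt.transaction_lock_timeout = 0;
  ASSERT_OK(TransactionDB::Open(opt, txn_opt, self->db_dir_, &self->db_));

  // A real alternative suite would construct its own LockManager here.
  self->locker_.reset(new PointLockManager(
      static_cast<PessimisticTransactionDB*>(self->db_), txn_opt));
  self->wait_sync_point_name_ =
      "PointLockManager::AcquireWithTimeout:WaitingTxn";
}

INSTANTIATE_TEST_CASE_P(AlternativeLockManager, AnyLockManagerTest,
                        ::testing::Values(PointLockManagerTestExternalSetup));

}  // namespace ROCKSDB_NAMESPACE
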
diff --git a/src/rocksdb/utilities/transactions/lock/point/point_lock_tracker.cc b/src/rocksdb/utilities/transactions/lock/point/point_lock_tracker.cc
new file mode 100644
index 000000000..6204a8f02
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/point/point_lock_tracker.cc
@@ -0,0 +1,257 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/lock/point/point_lock_tracker.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+class TrackedKeysColumnFamilyIterator
+ : public LockTracker::ColumnFamilyIterator {
+ public:
+ explicit TrackedKeysColumnFamilyIterator(const TrackedKeys& keys)
+ : tracked_keys_(keys), it_(keys.begin()) {}
+
+ bool HasNext() const override { return it_ != tracked_keys_.end(); }
+
+ ColumnFamilyId Next() override { return (it_++)->first; }
+
+ private:
+ const TrackedKeys& tracked_keys_;
+ TrackedKeys::const_iterator it_;
+};
+
+class TrackedKeysIterator : public LockTracker::KeyIterator {
+ public:
+ TrackedKeysIterator(const TrackedKeys& keys, ColumnFamilyId id)
+ : key_infos_(keys.at(id)), it_(key_infos_.begin()) {}
+
+ bool HasNext() const override { return it_ != key_infos_.end(); }
+
+ const std::string& Next() override { return (it_++)->first; }
+
+ private:
+ const TrackedKeyInfos& key_infos_;
+ TrackedKeyInfos::const_iterator it_;
+};
+
+} // namespace
+
+void PointLockTracker::Track(const PointLockRequest& r) {
+ auto& keys = tracked_keys_[r.column_family_id];
+ auto result = keys.try_emplace(r.key, r.seq);
+ auto it = result.first;
+ if (!result.second && r.seq < it->second.seq) {
+ // Now tracking this key with an earlier sequence number
+ it->second.seq = r.seq;
+ }
+  // else we do not update the seq. The smaller the tracked seq, the stronger
+  // the guarantee, since it implies that from that seq onward there has not
+  // been a concurrent update to the key. So we update the seq only if doing
+  // so gives a stronger guarantee, i.e., if it is smaller than the existing
+  // tracked seq.
+
+ if (r.read_only) {
+ it->second.num_reads++;
+ } else {
+ it->second.num_writes++;
+ }
+
+ it->second.exclusive = it->second.exclusive || r.exclusive;
+}
+
+UntrackStatus PointLockTracker::Untrack(const PointLockRequest& r) {
+ auto cf_keys = tracked_keys_.find(r.column_family_id);
+ if (cf_keys == tracked_keys_.end()) {
+ return UntrackStatus::NOT_TRACKED;
+ }
+
+ auto& keys = cf_keys->second;
+ auto it = keys.find(r.key);
+ if (it == keys.end()) {
+ return UntrackStatus::NOT_TRACKED;
+ }
+
+ bool untracked = false;
+ auto& info = it->second;
+ if (r.read_only) {
+ if (info.num_reads > 0) {
+ info.num_reads--;
+ untracked = true;
+ }
+ } else {
+ if (info.num_writes > 0) {
+ info.num_writes--;
+ untracked = true;
+ }
+ }
+
+ bool removed = false;
+ if (info.num_reads == 0 && info.num_writes == 0) {
+ keys.erase(it);
+ if (keys.empty()) {
+ tracked_keys_.erase(cf_keys);
+ }
+ removed = true;
+ }
+
+ if (removed) {
+ return UntrackStatus::REMOVED;
+ }
+ if (untracked) {
+ return UntrackStatus::UNTRACKED;
+ }
+ return UntrackStatus::NOT_TRACKED;
+}
+
+void PointLockTracker::Merge(const LockTracker& tracker) {
+ const PointLockTracker& t = static_cast<const PointLockTracker&>(tracker);
+ for (const auto& cf_keys : t.tracked_keys_) {
+ ColumnFamilyId cf = cf_keys.first;
+ const auto& keys = cf_keys.second;
+
+ auto current_cf_keys = tracked_keys_.find(cf);
+ if (current_cf_keys == tracked_keys_.end()) {
+ tracked_keys_.emplace(cf_keys);
+ } else {
+ auto& current_keys = current_cf_keys->second;
+ for (const auto& key_info : keys) {
+ const std::string& key = key_info.first;
+ const TrackedKeyInfo& info = key_info.second;
+ // If key was not previously tracked, just copy the whole struct over.
+ // Otherwise, some merging needs to occur.
+ auto current_info = current_keys.find(key);
+ if (current_info == current_keys.end()) {
+ current_keys.emplace(key_info);
+ } else {
+ current_info->second.Merge(info);
+ }
+ }
+ }
+ }
+}
+
+void PointLockTracker::Subtract(const LockTracker& tracker) {
+ const PointLockTracker& t = static_cast<const PointLockTracker&>(tracker);
+ for (const auto& cf_keys : t.tracked_keys_) {
+ ColumnFamilyId cf = cf_keys.first;
+ const auto& keys = cf_keys.second;
+
+ auto& current_keys = tracked_keys_.at(cf);
+ for (const auto& key_info : keys) {
+ const std::string& key = key_info.first;
+ const TrackedKeyInfo& info = key_info.second;
+ uint32_t num_reads = info.num_reads;
+ uint32_t num_writes = info.num_writes;
+
+ auto current_key_info = current_keys.find(key);
+ assert(current_key_info != current_keys.end());
+
+ // Decrement the total reads/writes of this key by the number of
+ // reads/writes done since the last SavePoint.
+ if (num_reads > 0) {
+ assert(current_key_info->second.num_reads >= num_reads);
+ current_key_info->second.num_reads -= num_reads;
+ }
+ if (num_writes > 0) {
+ assert(current_key_info->second.num_writes >= num_writes);
+ current_key_info->second.num_writes -= num_writes;
+ }
+ if (current_key_info->second.num_reads == 0 &&
+ current_key_info->second.num_writes == 0) {
+ current_keys.erase(current_key_info);
+ }
+ }
+ }
+}
+
+LockTracker* PointLockTracker::GetTrackedLocksSinceSavePoint(
+ const LockTracker& save_point_tracker) const {
+ // Examine the number of reads/writes performed on all keys written
+ // since the last SavePoint and compare to the total number of reads/writes
+ // for each key.
+ LockTracker* t = new PointLockTracker();
+ const PointLockTracker& save_point_t =
+ static_cast<const PointLockTracker&>(save_point_tracker);
+ for (const auto& cf_keys : save_point_t.tracked_keys_) {
+ ColumnFamilyId cf = cf_keys.first;
+ const auto& keys = cf_keys.second;
+
+ auto& current_keys = tracked_keys_.at(cf);
+ for (const auto& key_info : keys) {
+ const std::string& key = key_info.first;
+ const TrackedKeyInfo& info = key_info.second;
+ uint32_t num_reads = info.num_reads;
+ uint32_t num_writes = info.num_writes;
+
+ auto current_key_info = current_keys.find(key);
+ assert(current_key_info != current_keys.end());
+ assert(current_key_info->second.num_reads >= num_reads);
+ assert(current_key_info->second.num_writes >= num_writes);
+
+ if (current_key_info->second.num_reads == num_reads &&
+ current_key_info->second.num_writes == num_writes) {
+ // All the reads/writes to this key were done in the last savepoint.
+ PointLockRequest r;
+ r.column_family_id = cf;
+ r.key = key;
+ r.seq = info.seq;
+ r.read_only = (num_writes == 0);
+ r.exclusive = info.exclusive;
+ t->Track(r);
+ }
+ }
+ }
+ return t;
+}
+
+PointLockStatus PointLockTracker::GetPointLockStatus(
+ ColumnFamilyId column_family_id, const std::string& key) const {
+ assert(IsPointLockSupported());
+ PointLockStatus status;
+ auto it = tracked_keys_.find(column_family_id);
+ if (it == tracked_keys_.end()) {
+ return status;
+ }
+
+ const auto& keys = it->second;
+ auto key_it = keys.find(key);
+ if (key_it == keys.end()) {
+ return status;
+ }
+
+ const TrackedKeyInfo& key_info = key_it->second;
+ status.locked = true;
+ status.exclusive = key_info.exclusive;
+ status.seq = key_info.seq;
+ return status;
+}
+
+uint64_t PointLockTracker::GetNumPointLocks() const {
+ uint64_t num_keys = 0;
+ for (const auto& cf_keys : tracked_keys_) {
+ num_keys += cf_keys.second.size();
+ }
+ return num_keys;
+}
+
+LockTracker::ColumnFamilyIterator* PointLockTracker::GetColumnFamilyIterator()
+ const {
+ return new TrackedKeysColumnFamilyIterator(tracked_keys_);
+}
+
+LockTracker::KeyIterator* PointLockTracker::GetKeyIterator(
+ ColumnFamilyId column_family_id) const {
+ assert(tracked_keys_.find(column_family_id) != tracked_keys_.end());
+ return new TrackedKeysIterator(tracked_keys_, column_family_id);
+}
+
+void PointLockTracker::Clear() { tracked_keys_.clear(); }
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
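
Editor's note: Merge(), Subtract(), and GetTrackedLocksSinceSavePoint() above all reduce to arithmetic on per-key (num_reads, num_writes) counters kept per column family. The following is a minimal standalone sketch of that bookkeeping, for illustration only; the KeyCount, CfKeyCounts, MergeInto, and SubtractFrom names are invented for the example and are not part of RocksDB.

// Standalone sketch (not RocksDB's API) of the per-key counter arithmetic
// used by the tracker above: counters are summed on merge and removed on
// subtract, and a key is dropped once both counters reach zero.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>

struct KeyCount {
  uint32_t num_reads = 0;
  uint32_t num_writes = 0;
};

using KeyCounts = std::unordered_map<std::string, KeyCount>;
using CfKeyCounts = std::unordered_map<uint32_t, KeyCounts>;

// Add the counters of `src` into `dst`, creating entries as needed.
void MergeInto(CfKeyCounts& dst, const CfKeyCounts& src) {
  for (const auto& cf_keys : src) {
    auto& dst_keys = dst[cf_keys.first];
    for (const auto& kv : cf_keys.second) {
      auto& c = dst_keys[kv.first];
      c.num_reads += kv.second.num_reads;
      c.num_writes += kv.second.num_writes;
    }
  }
}

// Remove the counters recorded since a savepoint; erase keys whose counters
// both reach zero, mirroring PointLockTracker::Subtract above.
void SubtractFrom(CfKeyCounts& dst, const CfKeyCounts& since_savepoint) {
  for (const auto& cf_keys : since_savepoint) {
    auto& dst_keys = dst.at(cf_keys.first);
    for (const auto& kv : cf_keys.second) {
      auto it = dst_keys.find(kv.first);
      assert(it != dst_keys.end());
      it->second.num_reads -= kv.second.num_reads;
      it->second.num_writes -= kv.second.num_writes;
      if (it->second.num_reads == 0 && it->second.num_writes == 0) {
        dst_keys.erase(it);
      }
    }
  }
}

int main() {
  CfKeyCounts total, since_savepoint;
  total[0]["a"] = KeyCount{2, 1};            // 2 reads, 1 write already tracked
  since_savepoint[0]["a"] = KeyCount{1, 1};  // 1 read, 1 write since savepoint
  MergeInto(total, since_savepoint);         // total["a"] is now {3, 2}
  SubtractFrom(total, since_savepoint);      // back to {2, 1}
  std::cout << total[0]["a"].num_reads << " reads, "
            << total[0]["a"].num_writes << " writes remain\n";
  return 0;
}
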
diff --git a/src/rocksdb/utilities/transactions/lock/point/point_lock_tracker.h b/src/rocksdb/utilities/transactions/lock/point/point_lock_tracker.h
new file mode 100644
index 000000000..daf6f9aa2
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/point/point_lock_tracker.h
@@ -0,0 +1,99 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "utilities/transactions/lock/lock_tracker.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct TrackedKeyInfo {
+ // Earliest sequence number that is relevant to this transaction for this key
+ SequenceNumber seq;
+
+ uint32_t num_writes;
+ uint32_t num_reads;
+
+ bool exclusive;
+
+ explicit TrackedKeyInfo(SequenceNumber seq_no)
+ : seq(seq_no), num_writes(0), num_reads(0), exclusive(false) {}
+
+ void Merge(const TrackedKeyInfo& info) {
+ assert(seq <= info.seq);
+ num_reads += info.num_reads;
+ num_writes += info.num_writes;
+ exclusive = exclusive || info.exclusive;
+ }
+};
+
+using TrackedKeyInfos = std::unordered_map<std::string, TrackedKeyInfo>;
+
+using TrackedKeys = std::unordered_map<ColumnFamilyId, TrackedKeyInfos>;
+
+// Tracks point locks on single keys.
+class PointLockTracker : public LockTracker {
+ public:
+ PointLockTracker() = default;
+
+ PointLockTracker(const PointLockTracker&) = delete;
+ PointLockTracker& operator=(const PointLockTracker&) = delete;
+
+ bool IsPointLockSupported() const override { return true; }
+
+ bool IsRangeLockSupported() const override { return false; }
+
+ void Track(const PointLockRequest& lock_request) override;
+
+ UntrackStatus Untrack(const PointLockRequest& lock_request) override;
+
+ void Track(const RangeLockRequest& /*lock_request*/) override {}
+
+ UntrackStatus Untrack(const RangeLockRequest& /*lock_request*/) override {
+ return UntrackStatus::NOT_TRACKED;
+ }
+
+ void Merge(const LockTracker& tracker) override;
+
+ void Subtract(const LockTracker& tracker) override;
+
+ void Clear() override;
+
+ virtual LockTracker* GetTrackedLocksSinceSavePoint(
+ const LockTracker& save_point_tracker) const override;
+
+ PointLockStatus GetPointLockStatus(ColumnFamilyId column_family_id,
+ const std::string& key) const override;
+
+ uint64_t GetNumPointLocks() const override;
+
+ ColumnFamilyIterator* GetColumnFamilyIterator() const override;
+
+ KeyIterator* GetKeyIterator(ColumnFamilyId column_family_id) const override;
+
+ private:
+ TrackedKeys tracked_keys_;
+};
+
+class PointLockTrackerFactory : public LockTrackerFactory {
+ public:
+ static const PointLockTrackerFactory& Get() {
+ static const PointLockTrackerFactory instance;
+ return instance;
+ }
+
+ LockTracker* Create() const override { return new PointLockTracker(); }
+
+ private:
+ PointLockTrackerFactory() {}
+};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
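
Editor's note: PointLockTrackerFactory above is a Meyers singleton whose Create() hands out fresh tracker objects. Below is a generic standalone sketch of the same pattern under invented names (Widget, WidgetFactory); it deliberately returns a std::unique_ptr where the code above returns a raw pointer, so it is an illustration of the pattern rather than the API.

// Sketch of a Meyers-singleton factory: one lazily constructed factory
// instance, each Create() call returning a fresh object owned by the caller.
#include <iostream>
#include <memory>

class Widget {
 public:
  void Hello() const { std::cout << "tracking...\n"; }
};

class WidgetFactory {
 public:
  // Function-local statics are initialized exactly once and are
  // thread-safe to initialize since C++11.
  static const WidgetFactory& Get() {
    static const WidgetFactory instance;
    return instance;
  }

  // Each call produces a new object; ownership passes to the caller.
  std::unique_ptr<Widget> Create() const { return std::make_unique<Widget>(); }

 private:
  WidgetFactory() = default;  // only Get() can construct the singleton
};

int main() {
  auto w = WidgetFactory::Get().Create();
  w->Hello();
  return 0;
}
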
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_lock_manager.h b/src/rocksdb/utilities/transactions/lock/range/range_lock_manager.h
new file mode 100644
index 000000000..01899542e
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_lock_manager.h
@@ -0,0 +1,36 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+//
+// Generic definitions for a Range-based Lock Manager
+//
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/lock/lock_manager.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+/*
+ A base class for all Range-based lock managers
+
+ See also class RangeLockManagerHandle in
+ include/rocksdb/utilities/transaction_db.h
+*/
+class RangeLockManagerBase : public LockManager {
+ public:
+  // Getting a point lock is reduced to getting a range lock on a single-point
+  // range.
+ using LockManager::TryLock;
+ Status TryLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
+ const std::string& key, Env* env, bool exclusive) override {
+ Endpoint endp(key.data(), key.size(), false);
+ return TryLock(txn, column_family_id, endp, endp, env, exclusive);
+ }
+};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
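
Editor's note: RangeLockManagerBase above reduces point locking to range locking over the degenerate range [key, key]. The standalone sketch below shows that reduction with invented Range, LockRange, and LockPoint names standing in for the real Endpoint-based API.

// Point locking expressed as range locking on a single-point range.
#include <iostream>
#include <string>

struct Range {
  std::string start;
  std::string end;
};

// Stand-in range-lock primitive: here it only reports what it was asked for.
bool LockRange(const Range& r) {
  std::cout << "locking [" << r.start << ", " << r.end << "]\n";
  return true;
}

// A point lock is just a range lock where both endpoints are the key,
// mirroring RangeLockManagerBase::TryLock above.
bool LockPoint(const std::string& key) { return LockRange(Range{key, key}); }

int main() {
  LockPoint("n");  // prints: locking [n, n]
  return 0;
}
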
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_locking_test.cc b/src/rocksdb/utilities/transactions/lock/range/range_locking_test.cc
new file mode 100644
index 000000000..bce66c1f3
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_locking_test.cc
@@ -0,0 +1,459 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+
+#include <algorithm>
+#include <functional>
+#include <string>
+#include <thread>
+
+#include "db/db_impl/db_impl.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "utilities/transactions/lock/point/point_lock_manager_test.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+#include "utilities/transactions/transaction_test.h"
+
+using std::string;
+
+namespace ROCKSDB_NAMESPACE {
+
+class RangeLockingTest : public ::testing::Test {
+ public:
+ TransactionDB* db;
+ std::string dbname;
+ Options options;
+
+ std::shared_ptr<RangeLockManagerHandle> range_lock_mgr;
+ TransactionDBOptions txn_db_options;
+
+ RangeLockingTest() : db(nullptr) {
+ options.create_if_missing = true;
+ dbname = test::PerThreadDBPath("range_locking_testdb");
+
+ EXPECT_OK(DestroyDB(dbname, options));
+
+ range_lock_mgr.reset(NewRangeLockManager(nullptr));
+ txn_db_options.lock_mgr_handle = range_lock_mgr;
+
+ auto s = TransactionDB::Open(options, txn_db_options, dbname, &db);
+ assert(s.ok());
+ }
+
+ ~RangeLockingTest() {
+ delete db;
+ db = nullptr;
+ // This is to skip the assert statement in FaultInjectionTestEnv. There
+    // seems to be a bug in btrfs that makes readdir return recently
+    // unlinked files. By using the default fs we simply ignore errors resulting
+ // from attempting to delete such files in DestroyDB.
+ EXPECT_OK(DestroyDB(dbname, options));
+ }
+
+ PessimisticTransaction* NewTxn(
+ TransactionOptions txn_opt = TransactionOptions()) {
+ Transaction* txn = db->BeginTransaction(WriteOptions(), txn_opt);
+ return reinterpret_cast<PessimisticTransaction*>(txn);
+ }
+};
+
+// TODO: set a smaller lock wait timeout so that the test runs faster.
+TEST_F(RangeLockingTest, BasicRangeLocking) {
+ WriteOptions write_options;
+ TransactionOptions txn_options;
+ std::string value;
+ ReadOptions read_options;
+ auto cf = db->DefaultColumnFamily();
+
+ Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+ Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+
+ // Get a range lock
+ ASSERT_OK(txn0->GetRangeLock(cf, Endpoint("a"), Endpoint("c")));
+
+ // Check that range Lock inhibits an overlapping range lock
+ {
+ auto s = txn1->GetRangeLock(cf, Endpoint("b"), Endpoint("z"));
+ ASSERT_TRUE(s.IsTimedOut());
+ }
+
+ // Check that range Lock inhibits an overlapping point lock
+ {
+ auto s = txn1->GetForUpdate(read_options, cf, Slice("b"), &value);
+ ASSERT_TRUE(s.IsTimedOut());
+ }
+
+ // Get a point lock, check that it inhibits range locks
+ ASSERT_OK(txn0->Put(cf, Slice("n"), Slice("value")));
+ {
+ auto s = txn1->GetRangeLock(cf, Endpoint("m"), Endpoint("p"));
+ ASSERT_TRUE(s.IsTimedOut());
+ }
+
+ ASSERT_OK(txn0->Commit());
+ txn1->Rollback();
+
+ delete txn0;
+ delete txn1;
+}
+
+TEST_F(RangeLockingTest, MyRocksLikeUpdate) {
+ WriteOptions write_options;
+ TransactionOptions txn_options;
+ Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+ auto cf = db->DefaultColumnFamily();
+ Status s;
+
+ // Get a range lock for the range we are about to update
+ ASSERT_OK(txn0->GetRangeLock(cf, Endpoint("a"), Endpoint("c")));
+
+ bool try_range_lock_called = false;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "RangeTreeLockManager::TryRangeLock:enter",
+ [&](void* /*arg*/) { try_range_lock_called = true; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // For performance reasons, the following must NOT call lock_mgr->TryLock():
+ // We verify that by checking the value of try_range_lock_called.
+ ASSERT_OK(txn0->Put(cf, Slice("b"), Slice("value"),
+ /*assume_tracked=*/true));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ ASSERT_FALSE(try_range_lock_called);
+
+ txn0->Rollback();
+
+ delete txn0;
+}
+
+TEST_F(RangeLockingTest, UpgradeLockAndGetConflict) {
+ WriteOptions write_options;
+ TransactionOptions txn_options;
+ auto cf = db->DefaultColumnFamily();
+ Status s;
+ std::string value;
+ txn_options.lock_timeout = 10;
+
+ Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+ Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+
+ // Get the shared lock in txn0
+ s = txn0->GetForUpdate(ReadOptions(), cf, Slice("a"), &value,
+ false /*exclusive*/);
+ ASSERT_TRUE(s.IsNotFound());
+
+ // Get the shared lock on the same key in txn1
+ s = txn1->GetForUpdate(ReadOptions(), cf, Slice("a"), &value,
+ false /*exclusive*/);
+ ASSERT_TRUE(s.IsNotFound());
+
+ // Now, try getting an exclusive lock that overlaps with the above
+ s = txn0->GetRangeLock(cf, Endpoint("a"), Endpoint("b"));
+ ASSERT_TRUE(s.IsTimedOut());
+
+ txn0->Rollback();
+ txn1->Rollback();
+
+ delete txn0;
+ delete txn1;
+}
+
+TEST_F(RangeLockingTest, SnapshotValidation) {
+ Status s;
+ Slice key_slice = Slice("k");
+ ColumnFamilyHandle* cfh = db->DefaultColumnFamily();
+
+ auto txn0 = NewTxn();
+ txn0->Put(key_slice, Slice("initial"));
+ txn0->Commit();
+
+ // txn1
+ auto txn1 = NewTxn();
+ txn1->SetSnapshot();
+ std::string val1;
+ ASSERT_OK(txn1->Get(ReadOptions(), cfh, key_slice, &val1));
+ ASSERT_EQ(val1, "initial");
+ val1 = val1 + std::string("-txn1");
+
+ ASSERT_OK(txn1->Put(cfh, key_slice, Slice(val1)));
+
+ // txn2
+ auto txn2 = NewTxn();
+ txn2->SetSnapshot();
+ std::string val2;
+  // This will see the original value, as nothing has been committed yet.
+  // This is also a Get, so it doesn't acquire any locks.
+ ASSERT_OK(txn2->Get(ReadOptions(), cfh, key_slice, &val2));
+ ASSERT_EQ(val2, "initial");
+
+ // txn1
+ ASSERT_OK(txn1->Commit());
+
+ // txn2
+ val2 = val2 + std::string("-txn2");
+ // Now, this call should do Snapshot Validation and fail:
+ s = txn2->Put(cfh, key_slice, Slice(val2));
+ ASSERT_TRUE(s.IsBusy());
+
+ ASSERT_OK(txn2->Commit());
+
+ delete txn0;
+ delete txn1;
+ delete txn2;
+}
+
+TEST_F(RangeLockingTest, MultipleTrxLockStatusData) {
+ WriteOptions write_options;
+ TransactionOptions txn_options;
+ auto cf = db->DefaultColumnFamily();
+
+ Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+ Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+
+ // Get a range lock
+ ASSERT_OK(txn0->GetRangeLock(cf, Endpoint("z"), Endpoint("z")));
+ ASSERT_OK(txn1->GetRangeLock(cf, Endpoint("b"), Endpoint("e")));
+
+ auto s = range_lock_mgr->GetRangeLockStatusData();
+ ASSERT_EQ(s.size(), 2);
+ for (auto it = s.begin(); it != s.end(); ++it) {
+ ASSERT_EQ(it->first, cf->GetID());
+ auto val = it->second;
+ ASSERT_FALSE(val.start.inf_suffix);
+ ASSERT_FALSE(val.end.inf_suffix);
+ ASSERT_TRUE(val.exclusive);
+ ASSERT_EQ(val.ids.size(), 1);
+ if (val.ids[0] == txn0->GetID()) {
+ ASSERT_EQ(val.start.slice, "z");
+ ASSERT_EQ(val.end.slice, "z");
+ } else if (val.ids[0] == txn1->GetID()) {
+ ASSERT_EQ(val.start.slice, "b");
+ ASSERT_EQ(val.end.slice, "e");
+ } else {
+ FAIL(); // Unknown transaction ID.
+ }
+ }
+
+ delete txn0;
+ delete txn1;
+}
+
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#define SKIP_LOCK_ESCALATION_TEST 1
+#endif
+#else
+#define SKIP_LOCK_ESCALATION_TEST 1
+#endif
+
+#ifndef SKIP_LOCK_ESCALATION_TEST
+TEST_F(RangeLockingTest, BasicLockEscalation) {
+ auto cf = db->DefaultColumnFamily();
+
+ auto counters = range_lock_mgr->GetStatus();
+
+ // Initially not using any lock memory
+ ASSERT_EQ(counters.current_lock_memory, 0);
+ ASSERT_EQ(counters.escalation_count, 0);
+
+ ASSERT_EQ(0, range_lock_mgr->SetMaxLockMemory(2000));
+
+ // Insert until we see lock escalations
+ auto txn = NewTxn();
+
+ // Get the locks until we hit an escalation
+ for (int i = 0; i < 2020; i++) {
+ std::ostringstream buf;
+ buf << std::setw(8) << std::setfill('0') << i;
+ std::string buf_str = buf.str();
+ ASSERT_OK(txn->GetRangeLock(cf, Endpoint(buf_str), Endpoint(buf_str)));
+ }
+ counters = range_lock_mgr->GetStatus();
+ ASSERT_GT(counters.escalation_count, 0);
+ ASSERT_LE(counters.current_lock_memory, 2000);
+
+ delete txn;
+}
+
+// An escalation barrier function. Allow escalation iff the first two bytes are
+// identical.
+static bool escalation_barrier(const Endpoint& a, const Endpoint& b) {
+ assert(a.slice.size() > 2);
+ assert(b.slice.size() > 2);
+ if (memcmp(a.slice.data(), b.slice.data(), 2)) {
+ return true; // This is a barrier
+ } else {
+ return false; // No barrier
+ }
+}
+
+TEST_F(RangeLockingTest, LockEscalationBarrier) {
+ auto cf = db->DefaultColumnFamily();
+
+ auto counters = range_lock_mgr->GetStatus();
+
+ // Initially not using any lock memory
+ ASSERT_EQ(counters.escalation_count, 0);
+
+ range_lock_mgr->SetMaxLockMemory(8000);
+ range_lock_mgr->SetEscalationBarrierFunc(escalation_barrier);
+
+ // Insert enough locks to cause lock escalations to happen
+ auto txn = NewTxn();
+ const int N = 2000;
+ for (int i = 0; i < N; i++) {
+ std::ostringstream buf;
+ buf << std::setw(4) << std::setfill('0') << i;
+ std::string buf_str = buf.str();
+ ASSERT_OK(txn->GetRangeLock(cf, Endpoint(buf_str), Endpoint(buf_str)));
+ }
+ counters = range_lock_mgr->GetStatus();
+ ASSERT_GT(counters.escalation_count, 0);
+
+ // Check that lock escalation was not performed across escalation barriers:
+ // Use another txn to acquire locks near the barriers.
+ auto txn2 = NewTxn();
+ range_lock_mgr->SetMaxLockMemory(500000);
+ for (int i = 100; i < N; i += 100) {
+ std::ostringstream buf;
+ buf << std::setw(4) << std::setfill('0') << i - 1 << "-a";
+ std::string buf_str = buf.str();
+ // Check that we CAN get a lock near the escalation barrier
+ ASSERT_OK(txn2->GetRangeLock(cf, Endpoint(buf_str), Endpoint(buf_str)));
+ }
+
+ txn->Rollback();
+ txn2->Rollback();
+ delete txn;
+ delete txn2;
+}
+
+#endif
+
+TEST_F(RangeLockingTest, LockWaitCount) {
+ TransactionOptions txn_options;
+ auto cf = db->DefaultColumnFamily();
+ txn_options.lock_timeout = 50;
+ Transaction* txn0 = db->BeginTransaction(WriteOptions(), txn_options);
+ Transaction* txn1 = db->BeginTransaction(WriteOptions(), txn_options);
+
+ // Get a range lock
+ ASSERT_OK(txn0->GetRangeLock(cf, Endpoint("a"), Endpoint("c")));
+
+ uint64_t lock_waits1 = range_lock_mgr->GetStatus().lock_wait_count;
+ // Attempt to get a conflicting lock
+ auto s = txn1->GetRangeLock(cf, Endpoint("b"), Endpoint("z"));
+ ASSERT_TRUE(s.IsTimedOut());
+
+ // Check that the counter was incremented
+ uint64_t lock_waits2 = range_lock_mgr->GetStatus().lock_wait_count;
+ ASSERT_EQ(lock_waits1 + 1, lock_waits2);
+
+ txn0->Rollback();
+ txn1->Rollback();
+
+ delete txn0;
+ delete txn1;
+}
+
+TEST_F(RangeLockingTest, LockWaiteeAccess) {
+ TransactionOptions txn_options;
+ auto cf = db->DefaultColumnFamily();
+ txn_options.lock_timeout = 60;
+ Transaction* txn0 = db->BeginTransaction(WriteOptions(), txn_options);
+ Transaction* txn1 = db->BeginTransaction(WriteOptions(), txn_options);
+
+ // Get a range lock
+ ASSERT_OK(txn0->GetRangeLock(cf, Endpoint("a"), Endpoint("c")));
+
+ std::atomic<bool> reached(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "RangeTreeLockManager::TryRangeLock:EnterWaitingTxn", [&](void* /*arg*/) {
+ reached.store(true);
+ std::this_thread::sleep_for(std::chrono::milliseconds(2000));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ port::Thread t([&]() {
+ // Attempt to get a conflicting lock
+ auto s = txn1->GetRangeLock(cf, Endpoint("b"), Endpoint("z"));
+ ASSERT_TRUE(s.ok());
+ txn1->Rollback();
+ });
+
+ while (!reached.load()) {
+ std::this_thread::sleep_for(std::chrono::milliseconds(100));
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ // Release locks and free the transaction
+ txn0->Rollback();
+ delete txn0;
+
+ t.join();
+
+ delete txn1;
+}
+
+void PointLockManagerTestExternalSetup(PointLockManagerTest* self) {
+ self->env_ = Env::Default();
+ self->db_dir_ = test::PerThreadDBPath("point_lock_manager_test");
+ ASSERT_OK(self->env_->CreateDir(self->db_dir_));
+
+ Options opt;
+ opt.create_if_missing = true;
+ TransactionDBOptions txn_opt;
+ txn_opt.transaction_lock_timeout = 0;
+
+ auto mutex_factory = std::make_shared<TransactionDBMutexFactoryImpl>();
+ self->locker_.reset(NewRangeLockManager(mutex_factory)->getLockManager());
+ std::shared_ptr<RangeLockManagerHandle> range_lock_mgr =
+ std::dynamic_pointer_cast<RangeLockManagerHandle>(self->locker_);
+ txn_opt.lock_mgr_handle = range_lock_mgr;
+
+ ASSERT_OK(TransactionDB::Open(opt, txn_opt, self->db_dir_, &self->db_));
+ self->wait_sync_point_name_ = "RangeTreeLockManager::TryRangeLock:WaitingTxn";
+}
+
+INSTANTIATE_TEST_CASE_P(RangeLockManager, AnyLockManagerTest,
+ ::testing::Values(PointLockManagerTestExternalSetup));
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else // OS_WIN
+
+#include <stdio.h>
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "skipped as Range Locking is not supported on Windows\n");
+ return 0;
+}
+
+#endif // OS_WIN
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "skipped as transactions are not supported in rocksdb_lite\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
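
Editor's note: the LockEscalationBarrier test above relies on escalation_barrier(), which lets lock escalation merge locked points only when they share the same two-byte key prefix. Below is a standalone sketch of that predicate under the invented name MayMerge; it is not the test's code, and the size assertion is relaxed to >= 2.

// Escalation-barrier predicate: keys with different 2-byte prefixes must not
// be merged into one escalated range.
#include <cassert>
#include <cstring>
#include <iostream>
#include <string>

bool MayMerge(const std::string& a, const std::string& b) {
  assert(a.size() >= 2 && b.size() >= 2);
  // A non-zero memcmp over the first two bytes marks a barrier, exactly the
  // condition under which escalation_barrier() above returns true.
  return std::memcmp(a.data(), b.data(), 2) == 0;
}

int main() {
  std::cout << MayMerge("0099", "0099-a") << "\n";  // 1: same "00" prefix
  std::cout << MayMerge("0099", "0100") << "\n";    // 0: prefixes differ
  return 0;
}
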
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.AGPLv3 b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.AGPLv3
new file mode 100644
index 000000000..dba13ed2d
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.AGPLv3
@@ -0,0 +1,661 @@
+ GNU AFFERO GENERAL PUBLIC LICENSE
+ Version 3, 19 November 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU Affero General Public License is a free, copyleft license for
+software and other kinds of works, specifically designed to ensure
+cooperation with the community in the case of network server software.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+our General Public Licenses are intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ Developers that use our General Public Licenses protect your rights
+with two steps: (1) assert copyright on the software, and (2) offer
+you this License which gives you legal permission to copy, distribute
+and/or modify the software.
+
+ A secondary benefit of defending all users' freedom is that
+improvements made in alternate versions of the program, if they
+receive widespread use, become available for other developers to
+incorporate. Many developers of free software are heartened and
+encouraged by the resulting cooperation. However, in the case of
+software used on network servers, this result may fail to come about.
+The GNU General Public License permits making a modified version and
+letting the public access it on a server without ever releasing its
+source code to the public.
+
+ The GNU Affero General Public License is designed specifically to
+ensure that, in such cases, the modified source code becomes available
+to the community. It requires the operator of a network server to
+provide the source code of the modified version running there to the
+users of that server. Therefore, public use of a modified version, on
+a publicly accessible server, gives the public access to the source
+code of the modified version.
+
+ An older license, called the Affero General Public License and
+published by Affero, was designed to accomplish similar goals. This is
+a different license, not a version of the Affero GPL, but Affero has
+released a new version of the Affero GPL which permits relicensing under
+this license.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU Affero General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Remote Network Interaction; Use with the GNU General Public License.
+
+ Notwithstanding any other provision of this License, if you modify the
+Program, your modified version must prominently offer all users
+interacting with it remotely through a computer network (if your version
+supports such interaction) an opportunity to receive the Corresponding
+Source of your version by providing access to the Corresponding Source
+from a network server at no charge, through some standard or customary
+means of facilitating copying of software. This Corresponding Source
+shall include the Corresponding Source for any work covered by version 3
+of the GNU General Public License that is incorporated pursuant to the
+following paragraph.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the work with which it is combined will remain governed by version
+3 of the GNU General Public License.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU Affero General Public License from time to time. Such new versions
+will be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU Affero General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU Affero General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU Affero General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If your software can interact with users remotely through a computer
+network, you should also make sure that it provides a way for users to
+get its source. For example, if your program is a web application, its
+interface could display a "Source" link that leads users to an archive
+of the code. There are many ways you could offer source, and different
+solutions will be better for different programs; see section 13 for the
+specific requirements.
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU AGPL, see
+<http://www.gnu.org/licenses/>.
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.APACHEv2 b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.APACHEv2
new file mode 100644
index 000000000..ecbfc770f
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.APACHEv2
@@ -0,0 +1,174 @@
+Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.GPLv2 b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.GPLv2
new file mode 100644
index 000000000..d511905c1
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.GPLv2
@@ -0,0 +1,339 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/README b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/README
new file mode 100644
index 000000000..2ea86bf46
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/README
@@ -0,0 +1,13 @@
+The files in this directory originally come from
+https://github.com/percona/PerconaFT/.
+
+This directory only includes the "locktree" part of PerconaFT, and its
+dependencies.
+
+The following modifications were made:
+- Make locktree usable outside of PerconaFT library
+- Add shared read-only lock support
+
+The files named *_subst.* are substitutes for PerconaFT's files; they
+contain replacements for PerconaFT's functionality.
+
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/db.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/db.h
new file mode 100644
index 000000000..5aa826c8e
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/db.h
@@ -0,0 +1,76 @@
+#ifndef _DB_H
+#define _DB_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+typedef struct __toku_dbt DBT;
+
+// port: this is currently not used
+struct simple_dbt {
+ uint32_t len;
+ void *data;
+};
+
+// engine status info
+// engine status is passed to handlerton as an array of
+// TOKU_ENGINE_STATUS_ROW_S[]
+typedef enum {
+ STATUS_FS_STATE = 0, // interpret as file system state (redzone) enum
+ STATUS_UINT64, // interpret as uint64_t
+ STATUS_CHARSTR, // interpret as char *
+ STATUS_UNIXTIME, // interpret as time_t
+ STATUS_TOKUTIME, // interpret as tokutime_t
+ STATUS_PARCOUNT, // interpret as PARTITIONED_COUNTER
+ STATUS_DOUBLE // interpret as double
+} toku_engine_status_display_type;
+
+typedef enum {
+ TOKU_ENGINE_STATUS = (1ULL << 0), // Include when asking for engine status
+ TOKU_GLOBAL_STATUS =
+ (1ULL << 1), // Include when asking for information_schema.global_status
+} toku_engine_status_include_type;
+
+typedef struct __toku_engine_status_row {
+ const char *keyname; // info schema key, should not change across revisions
+ // without good reason
+ const char
+ *columnname; // column for mysql, e.g. information_schema.global_status.
+ // TOKUDB_ will automatically be prefixed.
+  const char *legend;  // the text that will appear in the user interface
+ toku_engine_status_display_type type; // how to interpret the value
+ toku_engine_status_include_type
+      include;  // which kinds of callers should get to read this row?
+ union {
+ double dnum;
+ uint64_t num;
+ const char *str;
+ char datebuf[26];
+ struct partitioned_counter *parcount;
+ } value;
+} * TOKU_ENGINE_STATUS_ROW, TOKU_ENGINE_STATUS_ROW_S;
+
+#define DB_BUFFER_SMALL -30999
+#define DB_LOCK_DEADLOCK -30995
+#define DB_LOCK_NOTGRANTED -30994
+#define DB_NOTFOUND -30989
+#define DB_KEYEXIST -30996
+#define DB_DBT_MALLOC 8
+#define DB_DBT_REALLOC 64
+#define DB_DBT_USERMEM 256
+
+/* PerconaFT specific error codes */
+#define TOKUDB_OUT_OF_LOCKS -100000
+
+typedef void (*lock_wait_callback)(void *arg, uint64_t requesting_txnid,
+ uint64_t blocking_txnid);
+
+struct __toku_dbt {
+ void *data;
+ size_t size;
+ size_t ulen;
+ // One of DB_DBT_XXX flags
+ uint32_t flags;
+};
+
+#endif
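The db.h shim above provides just enough of the BerkeleyDB-style DBT surface for the
locktree sources to build outside PerconaFT. A minimal sketch of how a caller might
describe a key by filling those fields (hypothetical helper, not part of the imported
sources):

    #include <string.h>
    #include "db.h"

    // Hypothetical helper: point a DBT at caller-owned memory (no copy).
    static DBT make_user_dbt(void *key, size_t key_len) {
      DBT dbt;
      memset(&dbt, 0, sizeof(dbt));
      dbt.data = key;              // borrowed pointer, not copied
      dbt.size = key_len;          // number of valid bytes
      dbt.ulen = key_len;          // capacity of the caller's buffer
      dbt.flags = DB_DBT_USERMEM;  // caller owns the memory
      return dbt;
    }
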
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/comparator.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/comparator.h
new file mode 100644
index 000000000..718efc623
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/comparator.h
@@ -0,0 +1,138 @@
+/* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <string.h>
+
+#include "../db.h"
+#include "../portability/memory.h"
+#include "../util/dbt.h"
+
+typedef int (*ft_compare_func)(void *arg, const DBT *a, const DBT *b);
+
+int toku_keycompare(const void *key1, size_t key1len, const void *key2,
+ size_t key2len);
+
+int toku_builtin_compare_fun(const DBT *, const DBT *)
+ __attribute__((__visibility__("default")));
+
+namespace toku {
+
+// a comparator object encapsulates the data necessary for
+// comparing two keys in a fractal tree. it further understands
+// that points may be positive or negative infinity.
+
+class comparator {
+ void init(ft_compare_func cmp, void *cmp_arg, uint8_t memcmp_magic) {
+ _cmp = cmp;
+ _cmp_arg = cmp_arg;
+ _memcmp_magic = memcmp_magic;
+ }
+
+ public:
+ // This magic value is reserved to mean that the magic has not been set.
+ static const uint8_t MEMCMP_MAGIC_NONE = 0;
+
+ void create(ft_compare_func cmp, void *cmp_arg,
+ uint8_t memcmp_magic = MEMCMP_MAGIC_NONE) {
+ init(cmp, cmp_arg, memcmp_magic);
+ }
+
+  // inherit the attributes of another comparator (only the compare
+  // function, its argument and the memcmp magic are copied over).
+ void inherit(const comparator &cmp) {
+ invariant_notnull(cmp._cmp);
+ init(cmp._cmp, cmp._cmp_arg, cmp._memcmp_magic);
+ }
+
+  // like inherit, but doesn't require that this comparator
+ // was already created
+ void create_from(const comparator &cmp) { inherit(cmp); }
+
+ void destroy() {}
+
+ ft_compare_func get_compare_func() const { return _cmp; }
+
+ uint8_t get_memcmp_magic() const { return _memcmp_magic; }
+
+ bool valid() const { return _cmp != nullptr; }
+
+ inline bool dbt_has_memcmp_magic(const DBT *dbt) const {
+ return *reinterpret_cast<const char *>(dbt->data) == _memcmp_magic;
+ }
+
+ int operator()(const DBT *a, const DBT *b) const {
+ if (__builtin_expect(toku_dbt_is_infinite(a) || toku_dbt_is_infinite(b),
+ 0)) {
+ return toku_dbt_infinite_compare(a, b);
+ } else if (_memcmp_magic != MEMCMP_MAGIC_NONE
+ // If `a' has the memcmp magic..
+ && dbt_has_memcmp_magic(a)
+ // ..then we expect `b' to also have the memcmp magic
+ && __builtin_expect(dbt_has_memcmp_magic(b), 1)) {
+ assert(0); // psergey: this branch should not be taken.
+ return toku_builtin_compare_fun(a, b);
+ } else {
+ // yikes, const sadness here
+ return _cmp(_cmp_arg, a, b);
+ }
+ }
+
+ private:
+ ft_compare_func _cmp;
+ void *_cmp_arg;
+
+ uint8_t _memcmp_magic;
+};
+
+} /* namespace toku */
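The comparator class wraps a user-supplied ft_compare_func plus an optional
memcmp-magic byte. A minimal usage sketch, assuming a hypothetical memcmp-style
callback (my_compare_keys and the include path are illustrative, not part of the
imported sources):

    #include "ft/comparator.h"

    // Hypothetical ft_compare_func built on the declared toku_keycompare().
    static int my_compare_keys(void * /*arg*/, const DBT *a, const DBT *b) {
      return toku_keycompare(a->data, a->size, b->data, b->size);
    }

    int compare_once(const DBT *k1, const DBT *k2) {
      toku::comparator cmp;
      cmp.create(my_compare_keys, /*cmp_arg=*/nullptr);  // no memcmp magic
      int c = cmp(k1, k2);  // dispatches through operator(), handles +/-inf keys
      cmp.destroy();
      return c;
    }
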
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/ft-status.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/ft-status.h
new file mode 100644
index 000000000..1b4511172
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/ft-status.h
@@ -0,0 +1,102 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include "../db.h"
+#include "../portability/toku_race_tools.h"
+#include "../util/status.h"
+
+//
+// Lock Tree Manager statistics
+//
+class LTM_STATUS_S {
+ public:
+ enum {
+ LTM_SIZE_CURRENT = 0,
+ LTM_SIZE_LIMIT,
+ LTM_ESCALATION_COUNT,
+ LTM_ESCALATION_TIME,
+ LTM_ESCALATION_LATEST_RESULT,
+ LTM_NUM_LOCKTREES,
+ LTM_LOCK_REQUESTS_PENDING,
+ LTM_STO_NUM_ELIGIBLE,
+ LTM_STO_END_EARLY_COUNT,
+ LTM_STO_END_EARLY_TIME,
+ LTM_WAIT_COUNT,
+ LTM_WAIT_TIME,
+ LTM_LONG_WAIT_COUNT,
+ LTM_LONG_WAIT_TIME,
+ LTM_TIMEOUT_COUNT,
+ LTM_WAIT_ESCALATION_COUNT,
+ LTM_WAIT_ESCALATION_TIME,
+ LTM_LONG_WAIT_ESCALATION_COUNT,
+ LTM_LONG_WAIT_ESCALATION_TIME,
+ LTM_STATUS_NUM_ROWS // must be last
+ };
+
+ void init(void);
+ void destroy(void);
+
+ TOKU_ENGINE_STATUS_ROW_S status[LTM_STATUS_NUM_ROWS];
+
+ private:
+ bool m_initialized = false;
+};
+typedef LTM_STATUS_S* LTM_STATUS;
+extern LTM_STATUS_S ltm_status;
+
+#define LTM_STATUS_VAL(x) ltm_status.status[LTM_STATUS_S::x].value.num
+
+void toku_status_init(void); // just call ltm_status.init();
+void toku_status_destroy(void); // just call ltm_status.destroy();
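LTM_STATUS_VAL expands to the raw uint64_t slot of one row in the global ltm_status
table, so lock-tree counters can be bumped and read directly. A small sketch, assuming
ltm_status has been defined and initialized elsewhere (not code from the imported
sources):

    #include "ft/ft-status.h"

    void count_one_lock_wait(void) {
      // Assumes toku_status_init() has already populated ltm_status.
      LTM_STATUS_VAL(LTM_WAIT_COUNT) += 1;
      uint64_t waits_so_far = LTM_STATUS_VAL(LTM_WAIT_COUNT);
      (void)waits_so_far;
    }
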
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc
new file mode 100644
index 000000000..5110cd482
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc
@@ -0,0 +1,139 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "concurrent_tree.h"
+
+// PORT #include <toku_assert.h>
+namespace toku {
+
+void concurrent_tree::create(const comparator *cmp) {
+ // start with an empty root node. we do this instead of
+ // setting m_root to null so there's always a root to lock
+ m_root.create_root(cmp);
+}
+
+void concurrent_tree::destroy(void) { m_root.destroy_root(); }
+
+bool concurrent_tree::is_empty(void) { return m_root.is_empty(); }
+
+uint64_t concurrent_tree::get_insertion_memory_overhead(void) {
+ return sizeof(treenode);
+}
+
+void concurrent_tree::locked_keyrange::prepare(concurrent_tree *tree) {
+ // the first step in acquiring a locked keyrange is locking the root
+ treenode *const root = &tree->m_root;
+ m_tree = tree;
+ m_subtree = root;
+ m_range = keyrange::get_infinite_range();
+ root->mutex_lock();
+}
+
+void concurrent_tree::locked_keyrange::acquire(const keyrange &range) {
+ treenode *const root = &m_tree->m_root;
+
+ treenode *subtree;
+ if (root->is_empty() || root->range_overlaps(range)) {
+ subtree = root;
+ } else {
+ // we do not have a precomputed comparison hint, so pass null
+ const keyrange::comparison *cmp_hint = nullptr;
+ subtree = root->find_node_with_overlapping_child(range, cmp_hint);
+ }
+
+ // subtree is locked. it will be unlocked when this is release()'d
+ invariant_notnull(subtree);
+ m_range = range;
+ m_subtree = subtree;
+}
+
+bool concurrent_tree::locked_keyrange::add_shared_owner(const keyrange &range,
+ TXNID new_owner) {
+ return m_subtree->insert(range, new_owner, /*is_shared*/ true);
+}
+
+void concurrent_tree::locked_keyrange::release(void) {
+ m_subtree->mutex_unlock();
+}
+
+void concurrent_tree::locked_keyrange::insert(const keyrange &range,
+ TXNID txnid, bool is_shared) {
+ // empty means no children, and only the root should ever be empty
+ if (m_subtree->is_empty()) {
+ m_subtree->set_range_and_txnid(range, txnid, is_shared);
+ } else {
+ m_subtree->insert(range, txnid, is_shared);
+ }
+}
+
+void concurrent_tree::locked_keyrange::remove(const keyrange &range,
+ TXNID txnid) {
+ invariant(!m_subtree->is_empty());
+ treenode *new_subtree = m_subtree->remove(range, txnid);
+ // if removing range changed the root of the subtree,
+ // then the subtree must be the root of the entire tree.
+ if (new_subtree == nullptr) {
+ invariant(m_subtree->is_root());
+ invariant(m_subtree->is_empty());
+ }
+}
+
+void concurrent_tree::locked_keyrange::remove_all(void) {
+ m_subtree->recursive_remove();
+}
+
+} /* namespace toku */
+#endif // OS_WIN
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.h
new file mode 100644
index 000000000..e1bfb86c5
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.h
@@ -0,0 +1,174 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include "../ft/comparator.h"
+#include "keyrange.h"
+#include "treenode.h"
+
+namespace toku {
+
+// A concurrent_tree stores non-overlapping ranges.
+// Access to disjoint parts of the tree usually occurs concurrently.
+
+class concurrent_tree {
+ public:
+ // A locked_keyrange gives you exclusive access to read and write
+ // operations that occur on any keys in that range. You only have
+ // the right to operate on keys in that range or keys that were read
+ // from the keyrange using iterate()
+ //
+ // Access model:
+ // - user prepares a locked keyrange. all threads serialize behind prepare().
+  //  - user breaks the serialization point by acquiring a range, or releasing.
+ // - one thread operates on a certain locked_keyrange object at a time.
+ // - when the thread is finished, it releases
+
+ class locked_keyrange {
+ public:
+ // effect: prepare to acquire a locked keyrange over the given
+ // concurrent_tree, preventing other threads from preparing
+ // until this thread either does acquire() or release().
+ // note: operations performed on a prepared keyrange are equivalent
+ // to ones performed on an acquired keyrange over -inf, +inf.
+ // rationale: this provides the user with a serialization point for
+ // descending
+    // or modifying the tree. it also provides a convenient way of
+ // doing serializable operations on the tree.
+ // There are two valid sequences of calls:
+ // - prepare, acquire, [operations], release
+    //  - prepare, [operations], release
+ void prepare(concurrent_tree *tree);
+
+ // requires: the locked keyrange was prepare()'d
+ // effect: acquire a locked keyrange over the given concurrent_tree.
+ // the locked keyrange represents the range of keys overlapped
+ // by the given range
+ void acquire(const keyrange &range);
+
+ // effect: releases a locked keyrange and the mutex it holds
+ void release(void);
+
+ // effect: iterate over each range this locked_keyrange represents,
+ // calling function->fn() on each node's keyrange and txnid
+ // until there are no more or the function returns false
+ template <class F>
+ void iterate(F *function) const {
+ // if the subtree is non-empty, traverse it by calling the given
+ // function on each range, txnid pair found that overlaps.
+ if (!m_subtree->is_empty()) {
+ m_subtree->traverse_overlaps(m_range, function);
+ }
+ }
+
+ // Adds another owner to the lock on the specified keyrange.
+ // requires: the keyrange contains one treenode whose bounds are
+    // exactly equal to the specified range (no sub/supersets)
+ bool add_shared_owner(const keyrange &range, TXNID new_owner);
+
+ // inserts the given range into the tree, with an associated txnid.
+ // requires: range does not overlap with anything in this locked_keyrange
+ // rationale: caller is responsible for only inserting unique ranges
+ void insert(const keyrange &range, TXNID txnid, bool is_shared);
+
+ // effect: removes the given range from the tree.
+ // - txnid=TXNID_ANY means remove the range no matter what its
+ // owners are
+ // - Other value means remove the specified txnid from
+ // ownership (if the range has other owners, it will remain
+ // in the tree)
+ // requires: range exists exactly in this locked_keyrange
+ // rationale: caller is responsible for only removing existing ranges
+ void remove(const keyrange &range, TXNID txnid);
+
+ // effect: removes all of the keys represented by this locked keyrange
+ // rationale: we'd like a fast way to empty out a tree
+ void remove_all(void);
+
+ private:
+ // the concurrent tree this locked keyrange is for
+ concurrent_tree *m_tree;
+
+ // the range of keys this locked keyrange represents
+ keyrange m_range;
+
+ // the subtree under which all overlapping ranges exist
+ treenode *m_subtree;
+
+ friend class concurrent_tree_unit_test;
+ };
+
+ // effect: initialize the tree to an empty state
+ void create(const comparator *cmp);
+
+ // effect: destroy the tree.
+ // requires: tree is empty
+ void destroy(void);
+
+ // returns: true iff the tree is empty
+ bool is_empty(void);
+
+ // returns: the memory overhead of a single insertion into the tree
+ static uint64_t get_insertion_memory_overhead(void);
+
+ private:
+ // the root needs to always exist so there's a lock to grab
+ // even if the tree is empty. that's why we store a treenode
+ // here and not a pointer to one.
+ treenode m_root;
+
+ friend class concurrent_tree_unit_test;
+};
+
+} /* namespace toku */
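Callers of concurrent_tree follow the documented prepare, acquire, operate, release
sequence. The sketch below is illustrative only; the TXNID literal, the ready-made
comparator and the include path are assumptions, not code from the imported sources:

    #include "locktree/concurrent_tree.h"

    // Hypothetical walkthrough of the documented call sequence.
    void insert_then_remove(const toku::comparator *cmp,
                            const DBT *left, const DBT *right) {
      toku::concurrent_tree tree;
      tree.create(cmp);

      toku::keyrange range;
      range.create(left, right);  // borrows the DBT pointers, copies nothing

      toku::concurrent_tree::locked_keyrange lkr;
      lkr.prepare(&tree);   // serialization point: locks the root
      lkr.acquire(range);   // narrow to the subtree overlapping the range
      lkr.insert(range, /*txnid=*/42, /*is_shared=*/false);
      lkr.release();

      lkr.prepare(&tree);   // later: take the locked range back out
      lkr.acquire(range);
      lkr.remove(range, /*txnid=*/42);
      lkr.release();

      tree.destroy();       // requires: tree is empty again
    }
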
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc
new file mode 100644
index 000000000..e50ace5a9
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc
@@ -0,0 +1,222 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "keyrange.h"
+
+#include "../util/dbt.h"
+
+namespace toku {
+
+// create a keyrange by borrowing the left and right dbt
+// pointers. no memory is copied. no checks for infinity needed.
+void keyrange::create(const DBT *left, const DBT *right) {
+ init_empty();
+ m_left_key = left;
+ m_right_key = right;
+}
+
+// destroy the key copies. if they were never set, then destroy does nothing.
+void keyrange::destroy(void) {
+ toku_destroy_dbt(&m_left_key_copy);
+ toku_destroy_dbt(&m_right_key_copy);
+}
+
+// create a keyrange by copying the keys from the given range.
+void keyrange::create_copy(const keyrange &range) {
+ // start with an initialized, empty range
+ init_empty();
+
+ // optimize the case where the left and right keys are the same.
+ // we'd like to only have one copy of the data.
+ if (toku_dbt_equals(range.get_left_key(), range.get_right_key())) {
+ set_both_keys(range.get_left_key());
+ } else {
+ // replace our empty left and right keys with
+ // copies of the range's left and right keys
+ replace_left_key(range.get_left_key());
+ replace_right_key(range.get_right_key());
+ }
+}
+
+// extend this keyrange by choosing the leftmost and rightmost
+// endpoints between this range and the given range. replaced keys
+// in this range are freed and inherited keys are copied.
+void keyrange::extend(const comparator &cmp, const keyrange &range) {
+ const DBT *range_left = range.get_left_key();
+ const DBT *range_right = range.get_right_key();
+ if (cmp(range_left, get_left_key()) < 0) {
+ replace_left_key(range_left);
+ }
+ if (cmp(range_right, get_right_key()) > 0) {
+ replace_right_key(range_right);
+ }
+}
+
+// how much memory does this keyrange take?
+// - the size of the left and right keys
+// --- ignore the fact that we may have optimized the point case.
+// it complicates things for little gain.
+// - the size of the keyrange class itself
+uint64_t keyrange::get_memory_size(void) const {
+ const DBT *left_key = get_left_key();
+ const DBT *right_key = get_right_key();
+ return left_key->size + right_key->size + sizeof(keyrange);
+}
+
+// compare ranges.
+keyrange::comparison keyrange::compare(const comparator &cmp,
+ const keyrange &range) const {
+ if (cmp(get_right_key(), range.get_left_key()) < 0) {
+ return comparison::LESS_THAN;
+ } else if (cmp(get_left_key(), range.get_right_key()) > 0) {
+ return comparison::GREATER_THAN;
+ } else if (cmp(get_left_key(), range.get_left_key()) == 0 &&
+ cmp(get_right_key(), range.get_right_key()) == 0) {
+ return comparison::EQUALS;
+ } else {
+ return comparison::OVERLAPS;
+ }
+}
+
+bool keyrange::overlaps(const comparator &cmp, const keyrange &range) const {
+ // equality is a stronger form of overlapping.
+ // so two ranges "overlap" if they're either equal or just overlapping.
+ comparison c = compare(cmp, range);
+ return c == comparison::EQUALS || c == comparison::OVERLAPS;
+}
+
+keyrange keyrange::get_infinite_range(void) {
+ keyrange range;
+ range.create(toku_dbt_negative_infinity(), toku_dbt_positive_infinity());
+ return range;
+}
+
+void keyrange::init_empty(void) {
+ m_left_key = nullptr;
+ m_right_key = nullptr;
+ toku_init_dbt(&m_left_key_copy);
+ toku_init_dbt(&m_right_key_copy);
+ m_point_range = false;
+}
+
+const DBT *keyrange::get_left_key(void) const {
+ if (m_left_key) {
+ return m_left_key;
+ } else {
+ return &m_left_key_copy;
+ }
+}
+
+const DBT *keyrange::get_right_key(void) const {
+ if (m_right_key) {
+ return m_right_key;
+ } else {
+ return &m_right_key_copy;
+ }
+}
+
+// copy the given key once and set both the left and right pointers.
+// optimization for point ranges, so the left and right ranges
+// are not copied twice.
+void keyrange::set_both_keys(const DBT *key) {
+ if (toku_dbt_is_infinite(key)) {
+ m_left_key = key;
+ m_right_key = key;
+ } else {
+ toku_clone_dbt(&m_left_key_copy, *key);
+ toku_copyref_dbt(&m_right_key_copy, m_left_key_copy);
+ }
+ m_point_range = true;
+}
+
+// destroy the current left key. set and possibly copy the new one
+void keyrange::replace_left_key(const DBT *key) {
+ // a little magic:
+ //
+ // if this is a point range, then the left and right keys share
+ // one copy of the data, and it lives in the left key copy. so
+ // if we're replacing the left key, move the real data to the
+ // right key copy instead of destroying it. now, the memory is
+ // owned by the right key and the left key may be replaced.
+ if (m_point_range) {
+ m_right_key_copy = m_left_key_copy;
+ } else {
+ toku_destroy_dbt(&m_left_key_copy);
+ }
+
+ if (toku_dbt_is_infinite(key)) {
+ m_left_key = key;
+ } else {
+ toku_clone_dbt(&m_left_key_copy, *key);
+ m_left_key = nullptr;
+ }
+ m_point_range = false;
+}
+
+// destroy the current right key. set and possibly copy the new one
+void keyrange::replace_right_key(const DBT *key) {
+ toku_destroy_dbt(&m_right_key_copy);
+ if (toku_dbt_is_infinite(key)) {
+ m_right_key = key;
+ } else {
+ toku_clone_dbt(&m_right_key_copy, *key);
+ m_right_key = nullptr;
+ }
+ m_point_range = false;
+}
+
+} /* namespace toku */
+#endif // OS_WIN
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.h
new file mode 100644
index 000000000..f9aeea0c4
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.h
@@ -0,0 +1,141 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include "../ft/comparator.h"
+
+namespace toku {
+
+// A keyrange has a left and right key as endpoints.
+//
+// When a keyrange is created it owns no memory, but when it copies
+// or extends another keyrange, it copies memory as necessary. This
+// means it is cheap in the common case.
+
+class keyrange {
+ public:
+ // effect: constructor that borrows left and right key pointers.
+ // no memory is allocated or copied.
+ void create(const DBT *left_key, const DBT *right_key);
+
+ // effect: constructor that allocates and copies another keyrange's points.
+ void create_copy(const keyrange &range);
+
+ // effect: destroys the keyrange, freeing any allocated memory
+ void destroy(void);
+
+ // effect: extends the keyrange by choosing the leftmost and rightmost
+ // endpoints from this range and the given range.
+ // replaced keys in this range are freed, new keys are copied.
+ void extend(const comparator &cmp, const keyrange &range);
+
+ // returns: the amount of memory this keyrange takes. does not account
+ // for point optimizations or malloc overhead.
+ uint64_t get_memory_size(void) const;
+
+ // returns: pointer to the left key of this range
+ const DBT *get_left_key(void) const;
+
+ // returns: pointer to the right key of this range
+ const DBT *get_right_key(void) const;
+
+ // two ranges are either equal, lt, gt, or overlapping
+ enum comparison { EQUALS, LESS_THAN, GREATER_THAN, OVERLAPS };
+
+ // effect: compares this range to the given range
+ // returns: LESS_THAN if given range is strictly to the left
+ // GREATER_THAN if given range is strictly to the right
+ // EQUALS if given range has the same left and right endpoints
+ // OVERLAPS if at least one of the given range's endpoints falls
+ // between this range's endpoints
+ comparison compare(const comparator &cmp, const keyrange &range) const;
+
+ // returns: true if the range and the given range are equal or overlapping
+ bool overlaps(const comparator &cmp, const keyrange &range) const;
+
+ // returns: a keyrange representing -inf, +inf
+ static keyrange get_infinite_range(void);
+
+ private:
+ // some keys should be copied, some keys should not be.
+ //
+ // to support both, we use two DBTs for copies and two pointers
+ // for temporaries. the access rule is:
+ // - if a pointer is non-null, then it represents the key.
+ // - otherwise the pointer is null, and the key is in the copy.
+ DBT m_left_key_copy;
+ DBT m_right_key_copy;
+ const DBT *m_left_key;
+ const DBT *m_right_key;
+
+ // if this range is a point range, then m_left_key == m_right_key
+ // and the actual data is stored exactly once in m_left_key_copy.
+ bool m_point_range;
+
+ // effect: initializes a keyrange to be empty
+ void init_empty(void);
+
+ // effect: copies the given key once into the left key copy
+ // and sets the right key copy to share the left.
+ // rationale: optimization for point ranges to only do one malloc
+ void set_both_keys(const DBT *key);
+
+ // effect: destroys the current left key. sets and copies the new one.
+ void replace_left_key(const DBT *key);
+
+ // effect: destroys the current right key. sets and copies the new one.
+ void replace_right_key(const DBT *key);
+};
+
+} /* namespace toku */
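+
+// Illustrative usage sketch (kept in a comment; not part of the upstream
+// class). It exercises the comparison side of the API declared above and
+// assumes a toku::comparator `cmp` that orders these integer keys
+// numerically, plus toku_fill_dbt() from ../util/dbt.h -- both come from the
+// surrounding library, and the function name is made up for illustration.
+//
+// static void keyrange_compare_example(const toku::comparator &cmp) {
+//   DBT a, b, c;
+//   int ka = 1, kb = 2, kc = 3;
+//   toku_fill_dbt(&a, &ka, sizeof(ka));
+//   toku_fill_dbt(&b, &kb, sizeof(kb));
+//   toku_fill_dbt(&c, &kc, sizeof(kc));
+//
+//   toku::keyrange r1, r2;
+//   r1.create(&a, &b);  // borrows [1,2]; no memory is allocated
+//   r2.create(&c, &c);  // point range [3,3]
+//   toku::keyrange all = toku::keyrange::get_infinite_range();  // [-inf,+inf]
+//
+//   // [1,2] lies strictly to the left of [3,3]
+//   invariant(r1.compare(cmp, r2) == toku::keyrange::LESS_THAN);
+//   // the infinite range overlaps everything
+//   invariant(all.overlaps(cmp, r1) && all.overlaps(cmp, r2));
+//
+//   all.destroy();
+//   r2.destroy();
+//   r1.destroy();
+// }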
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc
new file mode 100644
index 000000000..3d217be70
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc
@@ -0,0 +1,527 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2:
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "lock_request.h"
+
+#include "../portability/toku_race_tools.h"
+#include "../portability/txn_subst.h"
+#include "../util/dbt.h"
+#include "locktree.h"
+
+namespace toku {
+
+// initialize a lock request's internals
+void lock_request::create(toku_external_mutex_factory_t mutex_factory) {
+ m_txnid = TXNID_NONE;
+ m_conflicting_txnid = TXNID_NONE;
+ m_start_time = 0;
+ m_left_key = nullptr;
+ m_right_key = nullptr;
+ toku_init_dbt(&m_left_key_copy);
+ toku_init_dbt(&m_right_key_copy);
+
+ m_type = type::UNKNOWN;
+ m_lt = nullptr;
+
+ m_complete_r = 0;
+ m_state = state::UNINITIALIZED;
+ m_info = nullptr;
+
+ // psergey-todo: this condition is for interruptible wait
+ // note: moved to here from lock_request::create:
+ toku_external_cond_init(mutex_factory, &m_wait_cond);
+
+ m_start_test_callback = nullptr;
+ m_start_before_pending_test_callback = nullptr;
+ m_retry_test_callback = nullptr;
+}
+
+// destroy a lock request.
+void lock_request::destroy(void) {
+ invariant(m_state != state::PENDING);
+ invariant(m_state != state::DESTROYED);
+ m_state = state::DESTROYED;
+ toku_destroy_dbt(&m_left_key_copy);
+ toku_destroy_dbt(&m_right_key_copy);
+ toku_external_cond_destroy(&m_wait_cond);
+}
+
+// set the lock request parameters. this API allows a lock request to be reused.
+void lock_request::set(locktree *lt, TXNID txnid, const DBT *left_key,
+ const DBT *right_key, lock_request::type lock_type,
+ bool big_txn, void *extra) {
+ invariant(m_state != state::PENDING);
+ m_lt = lt;
+
+ m_txnid = txnid;
+ m_left_key = left_key;
+ m_right_key = right_key;
+ toku_destroy_dbt(&m_left_key_copy);
+ toku_destroy_dbt(&m_right_key_copy);
+ m_type = lock_type;
+ m_state = state::INITIALIZED;
+ m_info = lt ? lt->get_lock_request_info() : nullptr;
+ m_big_txn = big_txn;
+ m_extra = extra;
+}
+
+// clone the currently referenced left and right keys into owned copies
+// and repoint m_left_key / m_right_key at them (infinite keys are static
+// constants, so they are left as-is)
+void lock_request::copy_keys() {
+ if (!toku_dbt_is_infinite(m_left_key)) {
+ toku_clone_dbt(&m_left_key_copy, *m_left_key);
+ m_left_key = &m_left_key_copy;
+ }
+ if (!toku_dbt_is_infinite(m_right_key)) {
+ toku_clone_dbt(&m_right_key_copy, *m_right_key);
+ m_right_key = &m_right_key_copy;
+ }
+}
+
+// what are the conflicts for this pending lock request?
+void lock_request::get_conflicts(txnid_set *conflicts) {
+ invariant(m_state == state::PENDING);
+ const bool is_write_request = m_type == type::WRITE;
+ m_lt->get_conflicts(is_write_request, m_txnid, m_left_key, m_right_key,
+ conflicts);
+}
+
+// build a wait-for-graph for this lock request and the given conflict set
+// for each transaction B that blocks A's lock request
+// if B is blocked then
+// add (A,B) to the WFG and if B is new, fill in the WFG from B
+void lock_request::build_wait_graph(wfg *wait_graph,
+ const txnid_set &conflicts) {
+ uint32_t num_conflicts = conflicts.size();
+ for (uint32_t i = 0; i < num_conflicts; i++) {
+ TXNID conflicting_txnid = conflicts.get(i);
+ lock_request *conflicting_request = find_lock_request(conflicting_txnid);
+ invariant(conflicting_txnid != m_txnid);
+ invariant(conflicting_request != this);
+ if (conflicting_request) {
+ bool already_exists = wait_graph->node_exists(conflicting_txnid);
+ wait_graph->add_edge(m_txnid, conflicting_txnid);
+ if (!already_exists) {
+ // recursively build the wait for graph rooted at the conflicting
+ // request, given its set of lock conflicts.
+ txnid_set other_conflicts;
+ other_conflicts.create();
+ conflicting_request->get_conflicts(&other_conflicts);
+ conflicting_request->build_wait_graph(wait_graph, other_conflicts);
+ other_conflicts.destroy();
+ }
+ }
+ }
+}
+
+// returns: true if the current set of lock requests contains
+// a deadlock, false otherwise.
+bool lock_request::deadlock_exists(const txnid_set &conflicts) {
+ wfg wait_graph;
+ wait_graph.create();
+
+ build_wait_graph(&wait_graph, conflicts);
+
+ std::function<void(TXNID)> reporter;
+ if (m_deadlock_cb) {
+ reporter = [this](TXNID a) {
+ lock_request *req = find_lock_request(a);
+ if (req) {
+ m_deadlock_cb(req->m_txnid, (req->m_type == lock_request::WRITE),
+ req->m_left_key, req->m_right_key);
+ }
+ };
+ }
+
+ bool deadlock = wait_graph.cycle_exists_from_txnid(m_txnid, reporter);
+ wait_graph.destroy();
+ return deadlock;
+}
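+
+// Conceptually, deadlock_exists() asks whether the requesting txnid can reach
+// itself in the wait-for graph built above. The commented sketch below shows
+// that reachability check on a plain adjacency map; it is an illustration of
+// the idea only, not the toku::wfg implementation, and all names are made up.
+//
+// #include <set>
+// #include <unordered_map>
+// #include <vector>
+//
+// using wfg_map = std::unordered_map<TXNID, std::vector<TXNID>>;  // A -> Bs
+//
+// static bool reaches(const wfg_map &g, TXNID from, TXNID target,
+//                     std::set<TXNID> *seen) {
+//   if (!seen->insert(from).second) return false;  // already explored
+//   auto it = g.find(from);
+//   if (it == g.end()) return false;               // 'from' is not blocked
+//   for (TXNID next : it->second) {
+//     if (next == target || reaches(g, next, target, seen)) return true;
+//   }
+//   return false;
+// }
+//
+// // a deadlock rooted at txnid is a cycle that passes through txnid
+// static bool cycle_exists_from(const wfg_map &g, TXNID txnid) {
+//   std::set<TXNID> seen;
+//   return reaches(g, txnid, txnid, &seen);
+// }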
+
+// try to acquire a lock described by this lock request.
+int lock_request::start(void) {
+ int r;
+
+ txnid_set conflicts;
+ conflicts.create();
+ if (m_type == type::WRITE) {
+ r = m_lt->acquire_write_lock(m_txnid, m_left_key, m_right_key, &conflicts,
+ m_big_txn);
+ } else {
+ invariant(m_type == type::READ);
+ r = m_lt->acquire_read_lock(m_txnid, m_left_key, m_right_key, &conflicts,
+ m_big_txn);
+ }
+
+ // if the lock is not granted, save it to the set of lock requests
+ // and check for a deadlock. if there is one, complete it as failed
+ if (r == DB_LOCK_NOTGRANTED) {
+ copy_keys();
+ m_state = state::PENDING;
+ m_start_time = toku_current_time_microsec() / 1000;
+ m_conflicting_txnid = conflicts.get(0);
+ if (m_start_before_pending_test_callback)
+ m_start_before_pending_test_callback();
+ toku_external_mutex_lock(&m_info->mutex);
+ insert_into_lock_requests();
+ if (deadlock_exists(conflicts)) {
+ remove_from_lock_requests();
+ r = DB_LOCK_DEADLOCK;
+ }
+ toku_external_mutex_unlock(&m_info->mutex);
+ if (m_start_test_callback) m_start_test_callback(); // test callback
+ }
+
+ if (r != DB_LOCK_NOTGRANTED) {
+ complete(r);
+ }
+
+ conflicts.destroy();
+ return r;
+}
+
+// sleep on the lock request until it becomes resolved or the wait time has
+// elapsed.
+int lock_request::wait(uint64_t wait_time_ms) {
+ return wait(wait_time_ms, 0, nullptr);
+}
+
+int lock_request::wait(uint64_t wait_time_ms, uint64_t killed_time_ms,
+ int (*killed_callback)(void),
+ void (*lock_wait_callback)(void *, lock_wait_infos *),
+ void *callback_arg) {
+ uint64_t t_now = toku_current_time_microsec();
+ uint64_t t_start = t_now;
+ uint64_t t_end = t_start + wait_time_ms * 1000;
+
+ toku_external_mutex_lock(&m_info->mutex);
+
+ // check again, this time locking out other retry calls
+ if (m_state == state::PENDING) {
+ lock_wait_infos conflicts_collector;
+ retry(&conflicts_collector);
+ if (m_state == state::PENDING) {
+ report_waits(&conflicts_collector, lock_wait_callback, callback_arg);
+ }
+ }
+
+ while (m_state == state::PENDING) {
+ // check if this thread is killed
+ if (killed_callback && killed_callback()) {
+ remove_from_lock_requests();
+ complete(DB_LOCK_NOTGRANTED);
+ continue;
+ }
+
+ // compute the time until we should wait
+ uint64_t t_wait;
+ if (killed_time_ms == 0) {
+ t_wait = t_end;
+ } else {
+ t_wait = t_now + killed_time_ms * 1000;
+ if (t_wait > t_end) t_wait = t_end;
+ }
+
+ int r = toku_external_cond_timedwait(&m_wait_cond, &m_info->mutex,
+ (int64_t)(t_wait - t_now));
+ invariant(r == 0 || r == ETIMEDOUT);
+
+ t_now = toku_current_time_microsec();
+ if (m_state == state::PENDING && (t_now >= t_end)) {
+ m_info->counters.timeout_count += 1;
+
+ // if we're still pending and we timed out, then remove our
+ // request from the set of lock requests and fail.
+ remove_from_lock_requests();
+
+ // complete sets m_state to COMPLETE, breaking us out of the loop
+ complete(DB_LOCK_NOTGRANTED);
+ }
+ }
+
+ uint64_t t_real_end = toku_current_time_microsec();
+ uint64_t duration = t_real_end - t_start;
+ m_info->counters.wait_count += 1;
+ m_info->counters.wait_time += duration;
+ if (duration >= 1000000) {
+ m_info->counters.long_wait_count += 1;
+ m_info->counters.long_wait_time += duration;
+ }
+ toku_external_mutex_unlock(&m_info->mutex);
+
+ invariant(m_state == state::COMPLETE);
+ return m_complete_r;
+}
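+
+// The loop above sleeps in bounded chunks: with a nonzero killed_time_ms it
+// wakes at least that often to poll killed_callback, but never sleeps past
+// the overall deadline. The commented helper below restates that wakeup
+// computation in isolation (times in microseconds, as in the code above);
+// it is illustrative only and the helper name is made up.
+//
+// static uint64_t next_wait_deadline_usec(uint64_t now_usec, uint64_t end_usec,
+//                                         uint64_t killed_time_ms) {
+//   if (killed_time_ms == 0) return end_usec;   // one timed wait to the end
+//   uint64_t next = now_usec + killed_time_ms * 1000;
+//   return next < end_usec ? next : end_usec;   // never sleep past t_end
+// }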
+
+// complete this lock request with the given return value
+void lock_request::complete(int complete_r) {
+ m_complete_r = complete_r;
+ m_state = state::COMPLETE;
+}
+
+const DBT *lock_request::get_left_key(void) const { return m_left_key; }
+
+const DBT *lock_request::get_right_key(void) const { return m_right_key; }
+
+TXNID lock_request::get_txnid(void) const { return m_txnid; }
+
+uint64_t lock_request::get_start_time(void) const { return m_start_time; }
+
+TXNID lock_request::get_conflicting_txnid(void) const {
+ return m_conflicting_txnid;
+}
+
+int lock_request::retry(lock_wait_infos *conflicts_collector) {
+ invariant(m_state == state::PENDING);
+ int r;
+ txnid_set conflicts;
+ conflicts.create();
+
+ if (m_type == type::WRITE) {
+ r = m_lt->acquire_write_lock(m_txnid, m_left_key, m_right_key, &conflicts,
+ m_big_txn);
+ } else {
+ r = m_lt->acquire_read_lock(m_txnid, m_left_key, m_right_key, &conflicts,
+ m_big_txn);
+ }
+
+ // if the acquisition succeeded then remove ourselves from the
+ // set of lock requests, complete, and signal the waiting thread.
+ if (r == 0) {
+ remove_from_lock_requests();
+ complete(r);
+ if (m_retry_test_callback) m_retry_test_callback(); // test callback
+ toku_external_cond_broadcast(&m_wait_cond);
+ } else {
+ m_conflicting_txnid = conflicts.get(0);
+ add_conflicts_to_waits(&conflicts, conflicts_collector);
+ }
+ conflicts.destroy();
+
+ return r;
+}
+
+void lock_request::retry_all_lock_requests(
+ locktree *lt, void (*lock_wait_callback)(void *, lock_wait_infos *),
+ void *callback_arg, void (*after_retry_all_test_callback)(void)) {
+ lt_lock_request_info *info = lt->get_lock_request_info();
+
+ // if there are no pending lock requests then there is nothing to do
+ // the unlocked data race on pending_is_empty is OK since lock requests
+ // are retried after being added to the pending set.
+ if (info->pending_is_empty) return;
+
+ // get my retry generation (post increment of retry_want)
+ unsigned long long my_retry_want = (info->retry_want += 1);
+
+ toku_mutex_lock(&info->retry_mutex);
+
+ // here is the group retry algorithm.
+ // get the latest retry_want count and use it as the generation number of
+ // this retry operation. if this retry generation is > the last retry
+ // generation, then do the lock retries. otherwise, no lock retries
+ // are needed.
+ if ((my_retry_want - 1) == info->retry_done) {
+ for (;;) {
+ if (!info->running_retry) {
+ info->running_retry = true;
+ info->retry_done = info->retry_want;
+ toku_mutex_unlock(&info->retry_mutex);
+ retry_all_lock_requests_info(info, lock_wait_callback, callback_arg);
+ if (after_retry_all_test_callback) after_retry_all_test_callback();
+ toku_mutex_lock(&info->retry_mutex);
+ info->running_retry = false;
+ toku_cond_broadcast(&info->retry_cv);
+ break;
+ } else {
+ toku_cond_wait(&info->retry_cv, &info->retry_mutex);
+ }
+ }
+ }
+ toku_mutex_unlock(&info->retry_mutex);
+}
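+
+// The retry_want / retry_done counters above coalesce concurrent retry calls:
+// every caller bumps retry_want, but a single runner performs one pass that
+// covers every generation requested so far. The commented sketch below
+// isolates that gating pattern with standard-library primitives only; it is
+// an illustration of the idea, not the locktree code path, and the type and
+// member names are made up.
+//
+// #include <atomic>
+// #include <condition_variable>
+// #include <mutex>
+//
+// struct coalesced_retry {
+//   std::mutex m;
+//   std::condition_variable cv;
+//   std::atomic<unsigned long long> want{0};
+//   unsigned long long done = 0;  // protected by m
+//   bool running = false;         // protected by m
+//
+//   template <class Work>
+//   void request(Work do_retry_pass) {
+//     unsigned long long my_want = ++want;  // my retry generation
+//     std::unique_lock<std::mutex> lk(m);
+//     if (my_want - 1 != done) return;      // another caller's pass covers us
+//     while (running) cv.wait(lk);          // one runner at a time
+//     running = true;
+//     done = want.load();                   // claim all generations so far
+//     lk.unlock();
+//     do_retry_pass();                      // retry outside the gate lock
+//     lk.lock();
+//     running = false;
+//     cv.notify_all();
+//   }
+// };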
+
+void lock_request::retry_all_lock_requests_info(
+ lt_lock_request_info *info,
+ void (*lock_wait_callback)(void *, lock_wait_infos *), void *callback_arg) {
+ toku_external_mutex_lock(&info->mutex);
+ // retry all of the pending lock requests.
+ lock_wait_infos conflicts_collector;
+ for (uint32_t i = 0; i < info->pending_lock_requests.size();) {
+ lock_request *request;
+ int r = info->pending_lock_requests.fetch(i, &request);
+ invariant_zero(r);
+
+ // retry the lock request. if it didn't succeed,
+ // move on to the next lock request. otherwise
+ // the request is gone from the list so we may
+ // read the i'th entry for the next one.
+ r = request->retry(&conflicts_collector);
+ if (r != 0) {
+ i++;
+ }
+ }
+
+ // call report_waits while holding the pending queue lock since
+ // the waiter object is still valid while it's in the queue
+ report_waits(&conflicts_collector, lock_wait_callback, callback_arg);
+
+ // future threads should only retry lock requests if some still exist
+ info->should_retry_lock_requests = info->pending_lock_requests.size() > 0;
+ toku_external_mutex_unlock(&info->mutex);
+}
+
+void lock_request::add_conflicts_to_waits(txnid_set *conflicts,
+ lock_wait_infos *wait_conflicts) {
+ wait_conflicts->push_back({m_lt, get_txnid(), m_extra, {}});
+ uint32_t num_conflicts = conflicts->size();
+ for (uint32_t i = 0; i < num_conflicts; i++) {
+ wait_conflicts->back().waitees.push_back(conflicts->get(i));
+ }
+}
+
+void lock_request::report_waits(lock_wait_infos *wait_conflicts,
+ void (*lock_wait_callback)(void *,
+ lock_wait_infos *),
+ void *callback_arg) {
+ if (lock_wait_callback) (*lock_wait_callback)(callback_arg, wait_conflicts);
+}
+
+void *lock_request::get_extra(void) const { return m_extra; }
+
+void lock_request::kill_waiter(void) {
+ remove_from_lock_requests();
+ complete(DB_LOCK_NOTGRANTED);
+ toku_external_cond_broadcast(&m_wait_cond);
+}
+
+void lock_request::kill_waiter(locktree *lt, void *extra) {
+ lt_lock_request_info *info = lt->get_lock_request_info();
+ toku_external_mutex_lock(&info->mutex);
+ for (uint32_t i = 0; i < info->pending_lock_requests.size(); i++) {
+ lock_request *request;
+ int r = info->pending_lock_requests.fetch(i, &request);
+ if (r == 0 && request->get_extra() == extra) {
+ request->kill_waiter();
+ break;
+ }
+ }
+ toku_external_mutex_unlock(&info->mutex);
+}
+
+// find another lock request by txnid. must hold the mutex.
+lock_request *lock_request::find_lock_request(const TXNID &txnid) {
+ lock_request *request;
+ int r = m_info->pending_lock_requests.find_zero<TXNID, find_by_txnid>(
+ txnid, &request, nullptr);
+ if (r != 0) {
+ request = nullptr;
+ }
+ return request;
+}
+
+// insert this lock request into the locktree's set. must hold the mutex.
+void lock_request::insert_into_lock_requests(void) {
+ uint32_t idx;
+ lock_request *request;
+ int r = m_info->pending_lock_requests.find_zero<TXNID, find_by_txnid>(
+ m_txnid, &request, &idx);
+ invariant(r == DB_NOTFOUND);
+ r = m_info->pending_lock_requests.insert_at(this, idx);
+ invariant_zero(r);
+ m_info->pending_is_empty = false;
+}
+
+// remove this lock request from the locktree's set. must hold the mutex.
+void lock_request::remove_from_lock_requests(void) {
+ uint32_t idx;
+ lock_request *request;
+ int r = m_info->pending_lock_requests.find_zero<TXNID, find_by_txnid>(
+ m_txnid, &request, &idx);
+ invariant_zero(r);
+ invariant(request == this);
+ r = m_info->pending_lock_requests.delete_at(idx);
+ invariant_zero(r);
+ if (m_info->pending_lock_requests.size() == 0)
+ m_info->pending_is_empty = true;
+}
+
+int lock_request::find_by_txnid(lock_request *const &request,
+ const TXNID &txnid) {
+ TXNID request_txnid = request->m_txnid;
+ if (request_txnid < txnid) {
+ return -1;
+ } else if (request_txnid == txnid) {
+ return 0;
+ } else {
+ return 1;
+ }
+}
+
+void lock_request::set_start_test_callback(void (*f)(void)) {
+ m_start_test_callback = f;
+}
+
+void lock_request::set_start_before_pending_test_callback(void (*f)(void)) {
+ m_start_before_pending_test_callback = f;
+}
+
+void lock_request::set_retry_test_callback(void (*f)(void)) {
+ m_retry_test_callback = f;
+}
+
+} /* namespace toku */
+#endif // OS_WIN
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h
new file mode 100644
index 000000000..d30e1e2ca
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h
@@ -0,0 +1,255 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include "../db.h"
+#include "../ft/comparator.h"
+#include "../portability/toku_pthread.h"
+#include "locktree.h"
+#include "txnid_set.h"
+#include "wfg.h"
+
+namespace toku {
+
+// Information about a lock wait
+struct lock_wait_info {
+ locktree *ltree; // the tree where wait happens
+ TXNID waiter; // the waiting transaction
+ void *m_extra; // lock_request's m_extra
+
+ // The transactions that are waited for.
+ std::vector<TXNID> waitees;
+};
+
+typedef std::vector<lock_wait_info> lock_wait_infos;
+
+// A lock request contains the db, the key range, the lock type, and
+// the transaction id that describes a potential row range lock.
+//
+// the typical use case is:
+// - initialize a lock request
+// - start to try to acquire the lock
+// - do something else
+// - wait for the lock request to be resolved on a timed condition
+// - destroy the lock request
+// a lock request is resolved when its state is no longer pending, that is,
+// when it becomes granted, times out, or deadlocks. when resolved, the
+// state of the lock request is changed and any waiting threads are awakened.
+// (a commented usage sketch at the end of this header walks through this
+// lifecycle.)
+
+class lock_request {
+ public:
+ enum type { UNKNOWN, READ, WRITE };
+
+ // effect: Initializes a lock request.
+ void create(toku_external_mutex_factory_t mutex_factory);
+
+ // effect: Destroys a lock request.
+ void destroy(void);
+
+ // effect: Resets the lock request parameters, allowing it to be reused.
+ // requires: Lock request was already created at some point
+ void set(locktree *lt, TXNID txnid, const DBT *left_key, const DBT *right_key,
+ type lock_type, bool big_txn, void *extra = nullptr);
+
+ // effect: Tries to acquire a lock described by this lock request.
+ // returns: The return code of locktree::acquire_[write,read]_lock()
+ // or DB_LOCK_DEADLOCK if this request would end up deadlocked.
+ int start(void);
+
+ // effect: Sleeps until either the request is granted or the wait time
+ //         expires.
+ // returns: The return code of locktree::acquire_[write,read]_lock()
+ //          or simply DB_LOCK_NOTGRANTED if the wait time expired.
+ int wait(uint64_t wait_time_ms);
+ int wait(uint64_t wait_time_ms, uint64_t killed_time_ms,
+ int (*killed_callback)(void),
+ void (*lock_wait_callback)(void *, lock_wait_infos *) = nullptr,
+ void *callback_arg = nullptr);
+
+ // return: left end-point of the lock range
+ const DBT *get_left_key(void) const;
+
+ // return: right end-point of the lock range
+ const DBT *get_right_key(void) const;
+
+ // return: the txnid waiting for a lock
+ TXNID get_txnid(void) const;
+
+ // return: when this lock request started, as milliseconds from epoch
+ uint64_t get_start_time(void) const;
+
+ // return: which txnid is blocking this request (there may be more, though)
+ TXNID get_conflicting_txnid(void) const;
+
+ // effect: Retries all of the lock requests for the given locktree.
+ //         Any lock request that is successfully restarted is completed
+ //         and its waiter is woken up. The rest remain pending.
+ static void retry_all_lock_requests(
+ locktree *lt,
+ void (*lock_wait_callback)(void *, lock_wait_infos *) = nullptr,
+ void *callback_arg = nullptr,
+ void (*after_retry_all_test_callback)(void) = nullptr);
+ static void retry_all_lock_requests_info(
+ lt_lock_request_info *info,
+ void (*lock_wait_callback)(void *, lock_wait_infos *),
+ void *callback_arg);
+
+ void set_start_test_callback(void (*f)(void));
+ void set_start_before_pending_test_callback(void (*f)(void));
+ void set_retry_test_callback(void (*f)(void));
+
+ void *get_extra(void) const;
+
+ void kill_waiter(void);
+ static void kill_waiter(locktree *lt, void *extra);
+
+ private:
+ enum state {
+ UNINITIALIZED,
+ INITIALIZED,
+ PENDING,
+ COMPLETE,
+ DESTROYED,
+ };
+
+ // The keys for a lock request are stored "unowned" in m_left_key
+ // and m_right_key. When the request is about to go to sleep, it
+ // copies these keys into m_left_key_copy / m_right_key_copy and
+ // repoints m_left_key / m_right_key at the owned copies.
+ TXNID m_txnid;
+ TXNID m_conflicting_txnid;
+ uint64_t m_start_time;
+ const DBT *m_left_key;
+ const DBT *m_right_key;
+ DBT m_left_key_copy;
+ DBT m_right_key_copy;
+
+ // The lock request type and associated locktree
+ type m_type;
+ locktree *m_lt;
+
+ // If the lock request is in the completed state, then its
+ // final return value is stored in m_complete_r
+ int m_complete_r;
+ state m_state;
+
+ toku_external_cond_t m_wait_cond;
+
+ bool m_big_txn;
+
+ // the lock request info state stored in the
+ // locktree that this lock request is for.
+ struct lt_lock_request_info *m_info;
+
+ void *m_extra;
+
+ // effect: tries again to acquire the lock described by this lock request
+ // returns: 0 if retrying the request succeeded and is now complete
+ int retry(lock_wait_infos *collector);
+
+ void complete(int complete_r);
+
+ // effect: Finds another lock request by txnid.
+ // requires: The lock request info mutex is held
+ lock_request *find_lock_request(const TXNID &txnid);
+
+ // effect: Insert this lock request into the locktree's set.
+ // requires: the locktree's mutex is held
+ void insert_into_lock_requests(void);
+
+ // effect: Removes this lock request from the locktree's set.
+ // requires: The lock request info mutex is held
+ void remove_from_lock_requests(void);
+
+ // effect: Asks this request's locktree which txnids are preventing
+ // us from getting the lock described by this request.
+ // returns: conflicts is populated with the txnid's that this request
+ // is blocked on
+ void get_conflicts(txnid_set *conflicts);
+
+ // effect: Builds a wait-for-graph for this lock request and the given
+ // conflict set
+ void build_wait_graph(wfg *wait_graph, const txnid_set &conflicts);
+
+ // returns: True if this lock request is in deadlock with the given conflicts
+ // set
+ bool deadlock_exists(const txnid_set &conflicts);
+
+ void copy_keys(void);
+
+ static int find_by_txnid(lock_request *const &request, const TXNID &txnid);
+
+ // Report list of conflicts to lock wait callback.
+ static void report_waits(lock_wait_infos *wait_conflicts,
+ void (*lock_wait_callback)(void *,
+ lock_wait_infos *),
+ void *callback_arg);
+ void add_conflicts_to_waits(txnid_set *conflicts,
+ lock_wait_infos *wait_conflicts);
+
+ void (*m_start_test_callback)(void);
+ void (*m_start_before_pending_test_callback)(void);
+ void (*m_retry_test_callback)(void);
+
+ public:
+ std::function<void(TXNID, bool, const DBT *, const DBT *)> m_deadlock_cb;
+
+ friend class lock_request_unit_test;
+};
+// PORT: lock_request is not a POD anymore due to use of toku_external_cond_t
+// This is ok as the PODness is not really required: lock_request objects are
+// not moved in memory or anything.
+// ENSURE_POD(lock_request);
+
+} /* namespace toku */
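+
+// Illustrative lifecycle sketch (kept in a comment; not part of the upstream
+// header), following the "typical use case" description near the top of this
+// file. It assumes the caller already has a locktree, a mutex factory, DBT
+// endpoints and a TXNID; the function name and the reduced error handling are
+// made up for illustration.
+//
+// static int lock_range_with_timeout(toku::locktree *lt,
+//                                    toku_external_mutex_factory_t factory,
+//                                    TXNID txnid, const DBT *left,
+//                                    const DBT *right, uint64_t timeout_ms) {
+//   toku::lock_request request;
+//   request.create(factory);                      // initialize internals
+//   request.set(lt, txnid, left, right,
+//               toku::lock_request::type::WRITE,  // or type::READ
+//               false /* big_txn */);
+//   int r = request.start();                      // try to acquire now
+//   if (r == DB_LOCK_NOTGRANTED) {
+//     // still pending: sleep until granted, timed out, or deadlocked
+//     r = request.wait(timeout_ms);
+//   }
+//   request.destroy();                            // safe once resolved
+//   return r;  // 0, DB_LOCK_NOTGRANTED, or DB_LOCK_DEADLOCK
+// }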
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc
new file mode 100644
index 000000000..3d6a590c7
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc
@@ -0,0 +1,1023 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2:
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "locktree.h"
+
+#include <memory.h>
+
+#include "../portability/toku_pthread.h"
+#include "../portability/toku_time.h"
+#include "../util/growable_array.h"
+#include "range_buffer.h"
+
+// including the concurrent_tree here expands the templates
+// and "defines" the implementation, so we do it here in
+// the locktree source file instead of the header.
+#include "concurrent_tree.h"
+
+namespace toku {
+// A locktree represents the set of row locks owned by all transactions
+// over an open dictionary. Read and write ranges are represented as
+// a left and right key which are compared with the given descriptor
+// and comparison fn.
+//
+// Each locktree has a reference count which it manages
+// but does nothing based on the value of the reference count - it is
+// up to the user of the locktree to destroy it when it sees fit.
+
+void locktree::create(locktree_manager *mgr, DICTIONARY_ID dict_id,
+ const comparator &cmp,
+ toku_external_mutex_factory_t mutex_factory) {
+ m_mgr = mgr;
+ m_dict_id = dict_id;
+
+ m_cmp.create_from(cmp);
+ m_reference_count = 1;
+ m_userdata = nullptr;
+
+ XCALLOC(m_rangetree);
+ m_rangetree->create(&m_cmp);
+
+ m_sto_txnid = TXNID_NONE;
+ m_sto_buffer.create();
+ m_sto_score = STO_SCORE_THRESHOLD;
+ m_sto_end_early_count = 0;
+ m_sto_end_early_time = 0;
+
+ m_escalation_barrier = [](const DBT *, const DBT *, void *) -> bool {
+ return false;
+ };
+
+ m_lock_request_info.init(mutex_factory);
+}
+
+void locktree::set_escalation_barrier_func(
+ lt_escalation_barrier_check_func func, void *extra) {
+ m_escalation_barrier = func;
+ m_escalation_barrier_arg = extra;
+}
+
+void lt_lock_request_info::init(toku_external_mutex_factory_t mutex_factory) {
+ pending_lock_requests.create();
+ pending_is_empty = true;
+ toku_external_mutex_init(mutex_factory, &mutex);
+ retry_want = retry_done = 0;
+ ZERO_STRUCT(counters);
+ ZERO_STRUCT(retry_mutex);
+ toku_mutex_init(locktree_request_info_retry_mutex_key, &retry_mutex, nullptr);
+ toku_cond_init(locktree_request_info_retry_cv_key, &retry_cv, nullptr);
+ running_retry = false;
+
+ TOKU_VALGRIND_HG_DISABLE_CHECKING(&pending_is_empty,
+ sizeof(pending_is_empty));
+ TOKU_DRD_IGNORE_VAR(pending_is_empty);
+}
+
+void locktree::destroy(void) {
+ invariant(m_reference_count == 0);
+ invariant(m_lock_request_info.pending_lock_requests.size() == 0);
+ m_cmp.destroy();
+ m_rangetree->destroy();
+ toku_free(m_rangetree);
+ m_sto_buffer.destroy();
+ m_lock_request_info.destroy();
+}
+
+void lt_lock_request_info::destroy(void) {
+ pending_lock_requests.destroy();
+ toku_external_mutex_destroy(&mutex);
+ toku_mutex_destroy(&retry_mutex);
+ toku_cond_destroy(&retry_cv);
+}
+
+void locktree::add_reference(void) {
+ (void)toku_sync_add_and_fetch(&m_reference_count, 1);
+}
+
+uint32_t locktree::release_reference(void) {
+ return toku_sync_sub_and_fetch(&m_reference_count, 1);
+}
+
+uint32_t locktree::get_reference_count(void) { return m_reference_count; }
+
+// a container for a range/txnid pair
+struct row_lock {
+ keyrange range;
+ TXNID txnid;
+ bool is_shared;
+ TxnidVector *owners;
+};
+
+// iterate over a locked keyrange and copy out all of the data,
+// storing each row lock into the given growable array. the
+// caller does not own the ranges inside the returned row locks,
+// so take care when using them as keys to remove locks from the tree.
+static void iterate_and_get_overlapping_row_locks(
+ const concurrent_tree::locked_keyrange *lkr,
+ GrowableArray<row_lock> *row_locks) {
+ struct copy_fn_obj {
+ GrowableArray<row_lock> *row_locks;
+ bool fn(const keyrange &range, TXNID txnid, bool is_shared,
+ TxnidVector *owners) {
+ row_lock lock = {.range = range,
+ .txnid = txnid,
+ .is_shared = is_shared,
+ .owners = owners};
+ row_locks->push(lock);
+ return true;
+ }
+ } copy_fn;
+ copy_fn.row_locks = row_locks;
+ lkr->iterate(&copy_fn);
+}
+
+// given a txnid and a set of overlapping row locks, determine
+// which txnids are conflicting, and store them in the conflicts
+// set, if given.
+static bool determine_conflicting_txnids(
+ const GrowableArray<row_lock> &row_locks, const TXNID &txnid,
+ txnid_set *conflicts) {
+ bool conflicts_exist = false;
+ const size_t num_overlaps = row_locks.get_size();
+ for (size_t i = 0; i < num_overlaps; i++) {
+ const row_lock lock = row_locks.fetch_unchecked(i);
+ const TXNID other_txnid = lock.txnid;
+ if (other_txnid != txnid) {
+ if (conflicts) {
+ if (other_txnid == TXNID_SHARED) {
+ // Add all shared lock owners, except this transaction.
+ for (TXNID shared_id : *lock.owners) {
+ if (shared_id != txnid) conflicts->add(shared_id);
+ }
+ } else {
+ conflicts->add(other_txnid);
+ }
+ }
+ conflicts_exist = true;
+ }
+ }
+ return conflicts_exist;
+}
+
+// how much memory does a row lock take up in a concurrent tree?
+static uint64_t row_lock_size_in_tree(const row_lock &lock) {
+ const uint64_t overhead = concurrent_tree::get_insertion_memory_overhead();
+ return lock.range.get_memory_size() + overhead;
+}
+
+// remove and destroy the given row lock from the locked keyrange,
+// then notify the memory tracker of the newly freed lock.
+static void remove_row_lock_from_tree(concurrent_tree::locked_keyrange *lkr,
+ const row_lock &lock, TXNID txnid,
+ locktree_manager *mgr) {
+ const uint64_t mem_released = row_lock_size_in_tree(lock);
+ lkr->remove(lock.range, txnid);
+ if (mgr != nullptr) {
+ mgr->note_mem_released(mem_released);
+ }
+}
+
+// insert a row lock into the locked keyrange, then notify
+// the memory tracker of this newly acquired lock.
+static void insert_row_lock_into_tree(concurrent_tree::locked_keyrange *lkr,
+ const row_lock &lock,
+ locktree_manager *mgr) {
+ uint64_t mem_used = row_lock_size_in_tree(lock);
+ lkr->insert(lock.range, lock.txnid, lock.is_shared);
+ if (mgr != nullptr) {
+ mgr->note_mem_used(mem_used);
+ }
+}
+
+void locktree::sto_begin(TXNID txnid) {
+ invariant(m_sto_txnid == TXNID_NONE);
+ invariant(m_sto_buffer.is_empty());
+ m_sto_txnid = txnid;
+}
+
+void locktree::sto_append(const DBT *left_key, const DBT *right_key,
+ bool is_write_request) {
+ uint64_t buffer_mem, delta;
+
+ // psergey: the below two lines do not make any sense
+ // (and it's the same in upstream TokuDB)
+ keyrange range;
+ range.create(left_key, right_key);
+
+ buffer_mem = m_sto_buffer.total_memory_size();
+ m_sto_buffer.append(left_key, right_key, is_write_request);
+ delta = m_sto_buffer.total_memory_size() - buffer_mem;
+ if (m_mgr != nullptr) {
+ m_mgr->note_mem_used(delta);
+ }
+}
+
+void locktree::sto_end(void) {
+ uint64_t mem_size = m_sto_buffer.total_memory_size();
+ if (m_mgr != nullptr) {
+ m_mgr->note_mem_released(mem_size);
+ }
+ m_sto_buffer.destroy();
+ m_sto_buffer.create();
+ m_sto_txnid = TXNID_NONE;
+}
+
+void locktree::sto_end_early_no_accounting(void *prepared_lkr) {
+ sto_migrate_buffer_ranges_to_tree(prepared_lkr);
+ sto_end();
+ toku_unsafe_set(m_sto_score, 0);
+}
+
+void locktree::sto_end_early(void *prepared_lkr) {
+ m_sto_end_early_count++;
+
+ tokutime_t t0 = toku_time_now();
+ sto_end_early_no_accounting(prepared_lkr);
+ tokutime_t t1 = toku_time_now();
+
+ m_sto_end_early_time += (t1 - t0);
+}
+
+void locktree::sto_migrate_buffer_ranges_to_tree(void *prepared_lkr) {
+ // There should be something to migrate, and nothing in the rangetree.
+ invariant(!m_sto_buffer.is_empty());
+ invariant(m_rangetree->is_empty());
+
+ concurrent_tree sto_rangetree;
+ concurrent_tree::locked_keyrange sto_lkr;
+ sto_rangetree.create(&m_cmp);
+
+ // insert all of the ranges from the single txnid buffer into a new rangetree
+ range_buffer::iterator iter(&m_sto_buffer);
+ range_buffer::iterator::record rec;
+ while (iter.current(&rec)) {
+ sto_lkr.prepare(&sto_rangetree);
+ int r = acquire_lock_consolidated(&sto_lkr, m_sto_txnid, rec.get_left_key(),
+ rec.get_right_key(),
+ rec.get_exclusive_flag(), nullptr);
+ invariant_zero(r);
+ sto_lkr.release();
+ iter.next();
+ }
+
+ // Iterate the newly created rangetree and insert each range into the
+ // locktree's rangetree, on behalf of the old single txnid.
+ struct migrate_fn_obj {
+ concurrent_tree::locked_keyrange *dst_lkr;
+ bool fn(const keyrange &range, TXNID txnid, bool is_shared,
+ TxnidVector *owners) {
+ // There can't be multiple owners in STO mode
+ invariant_zero(owners);
+ dst_lkr->insert(range, txnid, is_shared);
+ return true;
+ }
+ } migrate_fn;
+ migrate_fn.dst_lkr =
+ static_cast<concurrent_tree::locked_keyrange *>(prepared_lkr);
+ sto_lkr.prepare(&sto_rangetree);
+ sto_lkr.iterate(&migrate_fn);
+ sto_lkr.remove_all();
+ sto_lkr.release();
+ sto_rangetree.destroy();
+ invariant(!m_rangetree->is_empty());
+}
+
+bool locktree::sto_try_acquire(void *prepared_lkr, TXNID txnid,
+ const DBT *left_key, const DBT *right_key,
+ bool is_write_request) {
+ if (m_rangetree->is_empty() && m_sto_buffer.is_empty() &&
+ toku_unsafe_fetch(m_sto_score) >= STO_SCORE_THRESHOLD) {
+ // We can do the optimization because the rangetree is empty, and
+ // we know it's worth trying because the sto score is big enough.
+ sto_begin(txnid);
+ } else if (m_sto_txnid != TXNID_NONE) {
+ // We are currently doing the optimization. Check if we need to cancel
+ // it because a new txnid appeared, or if the current single txnid has
+ // taken too many locks already.
+ if (m_sto_txnid != txnid ||
+ m_sto_buffer.get_num_ranges() > STO_BUFFER_MAX_SIZE) {
+ sto_end_early(prepared_lkr);
+ }
+ }
+
+ // At this point the sto txnid is properly set. If it is valid, then
+ // this txnid can append its lock to the sto buffer successfully.
+ if (m_sto_txnid != TXNID_NONE) {
+ invariant(m_sto_txnid == txnid);
+ sto_append(left_key, right_key, is_write_request);
+ return true;
+ } else {
+ invariant(m_sto_buffer.is_empty());
+ return false;
+ }
+}
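+
+// The single txnid optimization above reduces to a small policy: start STO
+// only for an empty, recently single-threaded locktree, keep appending while
+// the same txnid stays under the buffer limit, and end it early otherwise.
+// The commented sketch below restates that decision in isolation (the
+// thresholds correspond to STO_SCORE_THRESHOLD and STO_BUFFER_MAX_SIZE); it
+// is illustrative only, and the enum and function names are made up.
+//
+// enum class sto_action { BEGIN, APPEND, END_EARLY, NOT_APPLICABLE };
+//
+// static sto_action sto_decide(bool rangetree_empty, bool buffer_empty,
+//                              int score, int score_threshold,
+//                              TXNID sto_txnid, TXNID requester,
+//                              uint32_t buffered_ranges,
+//                              uint32_t buffer_max) {
+//   if (sto_txnid == TXNID_NONE) {
+//     // not in STO mode: begin only for an empty, "quiet" locktree
+//     return (rangetree_empty && buffer_empty && score >= score_threshold)
+//                ? sto_action::BEGIN
+//                : sto_action::NOT_APPLICABLE;
+//   }
+//   // in STO mode: a second txnid or an oversized buffer ends it early
+//   if (sto_txnid != requester || buffered_ranges > buffer_max) {
+//     return sto_action::END_EARLY;
+//   }
+//   return sto_action::APPEND;
+// }
+//
+// BEGIN and APPEND mean the request is buffered and sto_try_acquire() returns
+// true; END_EARLY and NOT_APPLICABLE fall through to the regular rangetree
+// path and it returns false.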
+
+/*
+ Do the same as iterate_and_get_overlapping_row_locks does, but also check for
+ this:
+ The set of overlapping row locks consists of just one read-only shared
+ lock with the same endpoints as specified (in that case, we can just add
+ ourselves into that list)
+
+ @return true - One compatible shared lock
+ false - Otherwise
+*/
+static bool iterate_and_get_overlapping_row_locks2(
+ const concurrent_tree::locked_keyrange *lkr, const DBT *left_key,
+ const DBT *right_key, comparator *cmp, TXNID,
+ GrowableArray<row_lock> *row_locks) {
+ struct copy_fn_obj {
+ GrowableArray<row_lock> *row_locks;
+ bool first_call = true;
+ bool matching_lock_found = false;
+ const DBT *left_key, *right_key;
+ comparator *cmp;
+
+ bool fn(const keyrange &range, TXNID txnid, bool is_shared,
+ TxnidVector *owners) {
+ if (first_call) {
+ first_call = false;
+ if (is_shared && !(*cmp)(left_key, range.get_left_key()) &&
+ !(*cmp)(right_key, range.get_right_key())) {
+ matching_lock_found = true;
+ }
+ } else {
+ // if we see multiple matching locks, it doesn't matter whether
+ // the first one was matching.
+ matching_lock_found = false;
+ }
+ row_lock lock = {.range = range,
+ .txnid = txnid,
+ .is_shared = is_shared,
+ .owners = owners};
+ row_locks->push(lock);
+ return true;
+ }
+ } copy_fn;
+ copy_fn.row_locks = row_locks;
+ copy_fn.left_key = left_key;
+ copy_fn.right_key = right_key;
+ copy_fn.cmp = cmp;
+ lkr->iterate(&copy_fn);
+ return copy_fn.matching_lock_found;
+}
+
+// try to acquire a lock and consolidate it with existing locks if possible
+// param: lkr, a prepared locked keyrange
+// return: 0 on success, DB_LOCK_NOTGRANTED if conflicting locks exist.
+int locktree::acquire_lock_consolidated(void *prepared_lkr, TXNID txnid,
+ const DBT *left_key,
+ const DBT *right_key,
+ bool is_write_request,
+ txnid_set *conflicts) {
+ int r = 0;
+ concurrent_tree::locked_keyrange *lkr;
+
+ keyrange requested_range;
+ requested_range.create(left_key, right_key);
+ lkr = static_cast<concurrent_tree::locked_keyrange *>(prepared_lkr);
+ lkr->acquire(requested_range);
+
+ // copy out the set of overlapping row locks.
+ GrowableArray<row_lock> overlapping_row_locks;
+ overlapping_row_locks.init();
+ bool matching_shared_lock_found = false;
+
+ if (is_write_request)
+ iterate_and_get_overlapping_row_locks(lkr, &overlapping_row_locks);
+ else {
+ matching_shared_lock_found = iterate_and_get_overlapping_row_locks2(
+ lkr, left_key, right_key, &m_cmp, txnid, &overlapping_row_locks);
+ // psergey-todo: what to do now? So, we have figured we have just one
+ // shareable lock. Need to add us into it as an owner but the lock
+ // pointer cannot be kept?
+ // A: use find_node_with_overlapping_child(key_range, nullptr);
+ // then, add ourselves to the owner list.
+ // Don't forget to release the subtree after that.
+ }
+
+ if (matching_shared_lock_found) {
+ // there is just one non-conflicting matching shared lock.
+ // we are holding a lock on it (see the acquire() call above).
+ // we need to modify it to indicate there is another locker...
+ if (lkr->add_shared_owner(requested_range, txnid)) {
+ // for memory accounting, pretend the extra shared owner costs as
+ // much as a full row lock.
+ row_lock new_lock = {.range = requested_range,
+ .txnid = txnid,
+ .is_shared = false,
+ .owners = nullptr};
+ uint64_t mem_used = row_lock_size_in_tree(new_lock);
+ if (m_mgr) {
+ m_mgr->note_mem_used(mem_used);
+ }
+ }
+ requested_range.destroy();
+ overlapping_row_locks.deinit();
+ return 0;
+ }
+
+ size_t num_overlapping_row_locks = overlapping_row_locks.get_size();
+
+ // if any overlapping row locks conflict with this request, bail out.
+
+ bool conflicts_exist =
+ determine_conflicting_txnids(overlapping_row_locks, txnid, conflicts);
+ if (!conflicts_exist) {
+ // there are no conflicts, so all of the overlaps are for the requesting
+ // txnid. so, we must consolidate all existing overlapping ranges and the
+ // requested range into one dominating range. then we insert the dominating
+ // range.
+ bool all_shared = !is_write_request;
+ for (size_t i = 0; i < num_overlapping_row_locks; i++) {
+ row_lock overlapping_lock = overlapping_row_locks.fetch_unchecked(i);
+ invariant(overlapping_lock.txnid == txnid);
+ requested_range.extend(m_cmp, overlapping_lock.range);
+ remove_row_lock_from_tree(lkr, overlapping_lock, TXNID_ANY, m_mgr);
+ all_shared = all_shared && overlapping_lock.is_shared;
+ }
+
+ row_lock new_lock = {.range = requested_range,
+ .txnid = txnid,
+ .is_shared = all_shared,
+ .owners = nullptr};
+ insert_row_lock_into_tree(lkr, new_lock, m_mgr);
+ } else {
+ r = DB_LOCK_NOTGRANTED;
+ }
+
+ requested_range.destroy();
+ overlapping_row_locks.deinit();
+ return r;
+}
+
+// acquire a lock in the given key range, inclusive. if successful,
+// return 0. otherwise, populate the conflicts txnid_set with the set of
+// transactions that conflict with this request.
+int locktree::acquire_lock(bool is_write_request, TXNID txnid,
+ const DBT *left_key, const DBT *right_key,
+ txnid_set *conflicts) {
+ int r = 0;
+
+ // we are only supporting write locks for simplicity
+ // invariant(is_write_request);
+
+ // acquire and prepare a locked keyrange over the requested range.
+ // prepare is a serialization point, so we take the opportunity to
+ // try the single txnid optimization first.
+ concurrent_tree::locked_keyrange lkr;
+ lkr.prepare(m_rangetree);
+
+ bool acquired =
+ sto_try_acquire(&lkr, txnid, left_key, right_key, is_write_request);
+ if (!acquired) {
+ r = acquire_lock_consolidated(&lkr, txnid, left_key, right_key,
+ is_write_request, conflicts);
+ }
+
+ lkr.release();
+ return r;
+}
+
+int locktree::try_acquire_lock(bool is_write_request, TXNID txnid,
+ const DBT *left_key, const DBT *right_key,
+ txnid_set *conflicts, bool big_txn) {
+ // All ranges in the locktree must have left endpoints <= right endpoints.
+ // Range comparisons rely on this fact, so we make a paranoid invariant here.
+ paranoid_invariant(m_cmp(left_key, right_key) <= 0);
+ int r = m_mgr == nullptr ? 0 : m_mgr->check_current_lock_constraints(big_txn);
+ if (r == 0) {
+ r = acquire_lock(is_write_request, txnid, left_key, right_key, conflicts);
+ }
+ return r;
+}
+
+// the locktree silently upgrades read locks to write locks for simplicity
+int locktree::acquire_read_lock(TXNID txnid, const DBT *left_key,
+ const DBT *right_key, txnid_set *conflicts,
+ bool big_txn) {
+ return try_acquire_lock(false, txnid, left_key, right_key, conflicts,
+ big_txn);
+}
+
+int locktree::acquire_write_lock(TXNID txnid, const DBT *left_key,
+ const DBT *right_key, txnid_set *conflicts,
+ bool big_txn) {
+ return try_acquire_lock(true, txnid, left_key, right_key, conflicts, big_txn);
+}
+
+// typedef void (*dump_callback)(void *cdata, const DBT *left, const DBT *right,
+// TXNID txnid);
+void locktree::dump_locks(void *cdata, dump_callback cb) {
+ concurrent_tree::locked_keyrange lkr;
+ keyrange range;
+ range.create(toku_dbt_negative_infinity(), toku_dbt_positive_infinity());
+
+ lkr.prepare(m_rangetree);
+ lkr.acquire(range);
+
+ TXNID sto_txn;
+ if ((sto_txn = toku_unsafe_fetch(m_sto_txnid)) != TXNID_NONE) {
+ // dump each range in the single txnid buffer through the callback
+ range_buffer::iterator iter(&m_sto_buffer);
+ range_buffer::iterator::record rec;
+ while (iter.current(&rec)) {
+ (*cb)(cdata, rec.get_left_key(), rec.get_right_key(), sto_txn,
+ !rec.get_exclusive_flag(), nullptr);
+ iter.next();
+ }
+ } else {
+ GrowableArray<row_lock> all_locks;
+ all_locks.init();
+ iterate_and_get_overlapping_row_locks(&lkr, &all_locks);
+
+ const size_t n_locks = all_locks.get_size();
+ for (size_t i = 0; i < n_locks; i++) {
+ const row_lock lock = all_locks.fetch_unchecked(i);
+ (*cb)(cdata, lock.range.get_left_key(), lock.range.get_right_key(),
+ lock.txnid, lock.is_shared, lock.owners);
+ }
+ all_locks.deinit();
+ }
+ lkr.release();
+ range.destroy();
+}
+
+void locktree::get_conflicts(bool is_write_request, TXNID txnid,
+ const DBT *left_key, const DBT *right_key,
+ txnid_set *conflicts) {
+ // because we only support write locks, ignore this bit for now.
+ (void)is_write_request;
+
+ // prepare and acquire a locked keyrange over the range
+ keyrange range;
+ range.create(left_key, right_key);
+ concurrent_tree::locked_keyrange lkr;
+ lkr.prepare(m_rangetree);
+ lkr.acquire(range);
+
+ // copy out the set of overlapping row locks and determine the conflicts
+ GrowableArray<row_lock> overlapping_row_locks;
+ overlapping_row_locks.init();
+ iterate_and_get_overlapping_row_locks(&lkr, &overlapping_row_locks);
+
+ // we don't care if conflicts exist. we just want the conflicts set populated.
+ (void)determine_conflicting_txnids(overlapping_row_locks, txnid, conflicts);
+
+ lkr.release();
+ overlapping_row_locks.deinit();
+ range.destroy();
+}
+
+// Effect:
+// For each range in the lock tree that overlaps the given range and has
+// the given txnid, remove it.
+// Rationale:
+// In the common case, there is only the range [left_key, right_key] and
+// it is associated with txnid, so this is a single tree delete.
+//
+// However, consolidation and escalation change the objects in the tree
+// without telling the txn anything. In this case, the txn may own a
+// large range lock that represents its ownership of many smaller range
+// locks. For example, the txn may think it owns point locks on keys 1,
+// 2, and 3, but due to escalation, only the object [1,3] exists in the
+// tree.
+//
+// The first call for a small lock will remove the large range lock, and
+// the rest of the calls should do nothing. After the first release,
+// another thread can acquire one of the locks that the txn thinks it
+// still owns. That's ok, because the txn doesn't want it anymore (it
+// unlocks everything at once), but it may find a lock that it does not
+// own.
+//
+// In our example, the txn unlocks key 1, which actually removes the
+// whole lock [1,3]. Now, someone else can lock 2 before our txn gets
+// around to unlocking 2, so we should not remove that lock.
+void locktree::remove_overlapping_locks_for_txnid(TXNID txnid,
+ const DBT *left_key,
+ const DBT *right_key) {
+ keyrange release_range;
+ release_range.create(left_key, right_key);
+
+ // acquire and prepare a locked keyrange over the release range
+ concurrent_tree::locked_keyrange lkr;
+ lkr.prepare(m_rangetree);
+ lkr.acquire(release_range);
+
+ // copy out the set of overlapping row locks.
+ GrowableArray<row_lock> overlapping_row_locks;
+ overlapping_row_locks.init();
+ iterate_and_get_overlapping_row_locks(&lkr, &overlapping_row_locks);
+ size_t num_overlapping_row_locks = overlapping_row_locks.get_size();
+
+ for (size_t i = 0; i < num_overlapping_row_locks; i++) {
+ row_lock lock = overlapping_row_locks.fetch_unchecked(i);
+ // If this isn't our lock, that's ok, just don't remove it.
+ // See rationale above.
+ // psergey-todo: for shared locks, just remove ourselves from the
+ // owners.
+ if (lock.txnid == txnid || (lock.owners && lock.owners->contains(txnid))) {
+ remove_row_lock_from_tree(&lkr, lock, txnid, m_mgr);
+ }
+ }
+
+ lkr.release();
+ overlapping_row_locks.deinit();
+ release_range.destroy();
+}
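+
+// To make the rationale above concrete: consolidation collapses a txn's point
+// locks into one covering range, which is exactly what the first release then
+// removes. The commented sketch below shows that collapse with the keyrange
+// API; it assumes a toku::comparator `cmp` that orders these integer keys
+// numerically and toku_fill_dbt() from ../util/dbt.h, and is illustrative
+// only (the function name is made up).
+//
+// static void consolidate_points_example(const toku::comparator &cmp) {
+//   DBT k1, k2, k3;
+//   int one = 1, two = 2, three = 3;
+//   toku_fill_dbt(&k1, &one, sizeof(one));
+//   toku_fill_dbt(&k2, &two, sizeof(two));
+//   toku_fill_dbt(&k3, &three, sizeof(three));
+//
+//   toku::keyrange dominating, p2, p3;
+//   dominating.create(&k1, &k1);  // point lock on key 1
+//   p2.create(&k2, &k2);          // point lock on key 2
+//   p3.create(&k3, &k3);          // point lock on key 3
+//
+//   dominating.extend(cmp, p2);   // now covers [1,2]
+//   dominating.extend(cmp, p3);   // now covers [1,3]
+//
+//   // in the tree, only the [1,3] object exists now. releasing "key 1"
+//   // removes it entirely; later releases of keys 2 and 3 find nothing
+//   // to remove, which is the behavior described above.
+//
+//   p3.destroy();
+//   p2.destroy();
+//   dominating.destroy();
+// }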
+
+bool locktree::sto_txnid_is_valid_unsafe(void) const {
+ return toku_unsafe_fetch(m_sto_txnid) != TXNID_NONE;
+}
+
+int locktree::sto_get_score_unsafe(void) const {
+ return toku_unsafe_fetch(m_sto_score);
+}
+
+bool locktree::sto_try_release(TXNID txnid) {
+ bool released = false;
+ if (toku_unsafe_fetch(m_sto_txnid) != TXNID_NONE) {
+ // check the bit again with a prepared locked keyrange,
+ // which protects the optimization bits and rangetree data
+ concurrent_tree::locked_keyrange lkr;
+ lkr.prepare(m_rangetree);
+ if (m_sto_txnid != TXNID_NONE) {
+ // this txnid better be the single txnid on this locktree,
+ // or else we are in big trouble (meaning the logic is broken)
+ invariant(m_sto_txnid == txnid);
+ invariant(m_rangetree->is_empty());
+ sto_end();
+ released = true;
+ }
+ lkr.release();
+ }
+ return released;
+}
+
+// release all of the locks for a txnid whose endpoints are pairs
+// in the given range buffer.
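+//
+// Illustrative usage sketch (not taken from the upstream sources; the
+// variable names and the all_trx_locks_hint value are assumptions): a caller
+// that tracked its acquired ranges in a range_buffer would typically release
+// them all at once, e.g.
+//
+//   range_buffer buffer;
+//   buffer.create();
+//   buffer.append(left_key, right_key, /*is_write_request=*/true);
+//   ... acquire the locks, do work ...
+//   lt->release_locks(txnid, &buffer, /*all_trx_locks_hint=*/true);
+//   buffer.destroy();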
+void locktree::release_locks(TXNID txnid, const range_buffer *ranges,
+ bool all_trx_locks_hint) {
+ // try the single txn optimization. if it worked, then all of the
+ // locks are already released, otherwise we need to do it here.
+ bool released;
+ if (all_trx_locks_hint) {
+ // This will release all of the locks the transaction is holding
+ released = sto_try_release(txnid);
+ } else {
+ /*
+ psergey: we are asked to release *Some* of the locks the transaction
+ is holding.
+ We could try doing that without leaving the STO mode, but right now,
+ the easiest way is to exit the STO mode and let the non-STO code path
+ handle it.
+ */
+ if (toku_unsafe_fetch(m_sto_txnid) != TXNID_NONE) {
+ // check the bit again with a prepared locked keyrange,
+ // which protects the optimization bits and rangetree data
+ concurrent_tree::locked_keyrange lkr;
+ lkr.prepare(m_rangetree);
+ if (m_sto_txnid != TXNID_NONE) {
+ sto_end_early(&lkr);
+ }
+ lkr.release();
+ }
+ released = false;
+ }
+ if (!released) {
+ range_buffer::iterator iter(ranges);
+ range_buffer::iterator::record rec;
+ while (iter.current(&rec)) {
+ const DBT *left_key = rec.get_left_key();
+ const DBT *right_key = rec.get_right_key();
+ // All ranges in the locktree must have left endpoints <= right endpoints.
+ // Range comparisons rely on this fact, so we make a paranoid invariant
+ // here.
+ paranoid_invariant(m_cmp(left_key, right_key) <= 0);
+ remove_overlapping_locks_for_txnid(txnid, left_key, right_key);
+ iter.next();
+ }
+ // Increase the sto score slightly. Eventually it will hit
+ // the threshold and we'll try the optimization again. This
+ // is how a previously multithreaded system transitions into
+ // a single threaded system that benefits from the optimization.
+ if (toku_unsafe_fetch(m_sto_score) < STO_SCORE_THRESHOLD) {
+ toku_sync_fetch_and_add(&m_sto_score, 1);
+ }
+ }
+}
+
+// iterate over a locked keyrange and extract copies of the first N
+// row locks, storing each one into the given array of size N,
+// then removing each extracted lock from the locked keyrange.
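+//
+// (Descriptive note, added for clarity: the iteration below uses a local
+// functor, extract_fn_obj, whose fn() returns false once num_to_extract
+// locks have been copied, which stops the underlying tree iteration early.)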
+static int extract_first_n_row_locks(concurrent_tree::locked_keyrange *lkr,
+ locktree_manager *mgr, row_lock *row_locks,
+ int num_to_extract) {
+ struct extract_fn_obj {
+ int num_extracted;
+ int num_to_extract;
+ row_lock *row_locks;
+ bool fn(const keyrange &range, TXNID txnid, bool is_shared,
+ TxnidVector *owners) {
+ if (num_extracted < num_to_extract) {
+ row_lock lock;
+ lock.range.create_copy(range);
+ lock.txnid = txnid;
+ lock.is_shared = is_shared;
+ // deep-copy the set of owners:
+ if (owners)
+ lock.owners = new TxnidVector(*owners);
+ else
+ lock.owners = nullptr;
+ row_locks[num_extracted++] = lock;
+ return true;
+ } else {
+ return false;
+ }
+ }
+ } extract_fn;
+
+ extract_fn.row_locks = row_locks;
+ extract_fn.num_to_extract = num_to_extract;
+ extract_fn.num_extracted = 0;
+ lkr->iterate(&extract_fn);
+
+ // now that the ranges have been copied out, complete
+ // the extraction by removing the ranges from the tree.
+ // use remove_row_lock_from_tree() so we properly track the
+ // amount of memory and number of locks freed.
+ int num_extracted = extract_fn.num_extracted;
+ invariant(num_extracted <= num_to_extract);
+ for (int i = 0; i < num_extracted; i++) {
+ remove_row_lock_from_tree(lkr, row_locks[i], TXNID_ANY, mgr);
+ }
+
+ return num_extracted;
+}
+
+// Store each newly escalated lock in a range buffer for appropriate txnid.
+// We'll rebuild the locktree by iterating over these ranges, and then we
+// can pass back each txnid/buffer pair individually through a callback
+// to notify higher layers that locks have changed.
+struct txnid_range_buffer {
+ TXNID txnid;
+ range_buffer buffer;
+
+ static int find_by_txnid(struct txnid_range_buffer *const &other_buffer,
+ const TXNID &txnid) {
+ if (txnid < other_buffer->txnid) {
+ return -1;
+ } else if (other_buffer->txnid == txnid) {
+ return 0;
+ } else {
+ return 1;
+ }
+ }
+};
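+
+// Note (descriptive, added for clarity): find_by_txnid() above is the
+// comparison callback used with the omt of txnid_range_buffer pointers in
+// escalate() below; it keeps the omt sorted by txnid so that find_zero() can
+// locate an existing buffer and insert_at() can place a new one at the
+// correct position.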
+
+// escalate the locks in the locktree by merging adjacent
+// locks that have the same txnid into one larger lock.
+//
+// if there's only one txnid in the locktree then this
+// approach works well. if there are many txnids and each
+// has locks in a random/alternating order, then this does
+// not work so well.
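+//
+// For example (hypothetical key values): if txn A holds point locks on keys
+// 1, 2 and 3 and txn B holds point locks on keys 5 and 6, escalation replaces
+// A's locks with the single range [1, 3] and B's with [5, 6], assuming no
+// escalation barrier lies between the merged keys.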
+void locktree::escalate(lt_escalate_cb after_escalate_callback,
+ void *after_escalate_callback_extra) {
+ omt<struct txnid_range_buffer *, struct txnid_range_buffer *> range_buffers;
+ range_buffers.create();
+
+ // prepare and acquire a locked keyrange on the entire locktree
+ concurrent_tree::locked_keyrange lkr;
+ keyrange infinite_range = keyrange::get_infinite_range();
+ lkr.prepare(m_rangetree);
+ lkr.acquire(infinite_range);
+
+ // if we're in the single txnid optimization, simply call it off.
+ // if you have to run escalation, you probably don't care about
+ // the optimization anyway, and this makes things easier.
+ if (m_sto_txnid != TXNID_NONE) {
+ // We are already accounting for this escalation time and
+ // count, so don't do it for sto_end_early too.
+ sto_end_early_no_accounting(&lkr);
+ }
+
+ // extract and remove batches of row locks from the locktree
+ int num_extracted;
+ const int num_row_locks_per_batch = 128;
+ row_lock *XCALLOC_N(num_row_locks_per_batch, extracted_buf);
+
+ // we always remove the "first" n because we are removing n
+  // each time we do an extraction. so this loops until it's empty.
+ while ((num_extracted = extract_first_n_row_locks(
+ &lkr, m_mgr, extracted_buf, num_row_locks_per_batch)) > 0) {
+ int current_index = 0;
+ while (current_index < num_extracted) {
+ // every batch of extracted locks is in range-sorted order. search
+ // through them and merge adjacent locks with the same txnid into
+ // one dominating lock and save it to a set of escalated locks.
+ //
+ // first, find the index of the next row lock that
+ // - belongs to a different txnid, or
+ // - belongs to several txnids, or
+ // - is a shared lock (we could potentially merge those but
+ // currently we don't), or
+ // - is across a lock escalation barrier.
+ int next_txnid_index = current_index + 1;
+
+ while (next_txnid_index < num_extracted &&
+ (extracted_buf[current_index].txnid ==
+ extracted_buf[next_txnid_index].txnid) &&
+ !extracted_buf[next_txnid_index].is_shared &&
+ !extracted_buf[next_txnid_index].owners &&
+ !m_escalation_barrier(
+ extracted_buf[current_index].range.get_right_key(),
+ extracted_buf[next_txnid_index].range.get_left_key(),
+ m_escalation_barrier_arg)) {
+ next_txnid_index++;
+ }
+
+ // Create an escalated range for the current txnid that dominates
+      // each range between the current index and the next txnid's index.
+ // const TXNID current_txnid = extracted_buf[current_index].txnid;
+ const DBT *escalated_left_key =
+ extracted_buf[current_index].range.get_left_key();
+ const DBT *escalated_right_key =
+ extracted_buf[next_txnid_index - 1].range.get_right_key();
+
+ // Try to find a range buffer for the current txnid. Create one if it
+ // doesn't exist. Then, append the new escalated range to the buffer. (If
+      // a lock is shared by multiple txnids, append it to each txnid's list.)
+ TxnidVector *owners_ptr;
+ TxnidVector singleton_owner;
+ if (extracted_buf[current_index].owners)
+ owners_ptr = extracted_buf[current_index].owners;
+ else {
+ singleton_owner.insert(extracted_buf[current_index].txnid);
+ owners_ptr = &singleton_owner;
+ }
+
+ for (auto cur_txnid : *owners_ptr) {
+ uint32_t idx;
+ struct txnid_range_buffer *existing_range_buffer;
+ int r =
+ range_buffers.find_zero<TXNID, txnid_range_buffer::find_by_txnid>(
+ cur_txnid, &existing_range_buffer, &idx);
+ if (r == DB_NOTFOUND) {
+ struct txnid_range_buffer *XMALLOC(new_range_buffer);
+ new_range_buffer->txnid = cur_txnid;
+ new_range_buffer->buffer.create();
+ new_range_buffer->buffer.append(
+ escalated_left_key, escalated_right_key,
+ !extracted_buf[current_index].is_shared);
+ range_buffers.insert_at(new_range_buffer, idx);
+ } else {
+ invariant_zero(r);
+ invariant(existing_range_buffer->txnid == cur_txnid);
+ existing_range_buffer->buffer.append(
+ escalated_left_key, escalated_right_key,
+ !extracted_buf[current_index].is_shared);
+ }
+ }
+
+ current_index = next_txnid_index;
+ }
+
+ // destroy the ranges copied during the extraction
+ for (int i = 0; i < num_extracted; i++) {
+ delete extracted_buf[i].owners;
+ extracted_buf[i].range.destroy();
+ }
+ }
+ toku_free(extracted_buf);
+
+ // Rebuild the locktree from each range in each range buffer,
+ // then notify higher layers that the txnid's locks have changed.
+ //
+ // (shared locks: if a lock was initially shared between transactions TRX1,
+  // TRX2, etc, we will now try to acquire it acting on behalf of TRX1, of
+ // TRX2, etc. This will succeed and an identical shared lock will be
+ // constructed)
+
+ invariant(m_rangetree->is_empty());
+ const uint32_t num_range_buffers = range_buffers.size();
+ for (uint32_t i = 0; i < num_range_buffers; i++) {
+ struct txnid_range_buffer *current_range_buffer;
+ int r = range_buffers.fetch(i, &current_range_buffer);
+ invariant_zero(r);
+ if (r == EINVAL) // Shouldn't happen, avoid compiler warning
+ continue;
+
+ const TXNID current_txnid = current_range_buffer->txnid;
+ range_buffer::iterator iter(&current_range_buffer->buffer);
+ range_buffer::iterator::record rec;
+ while (iter.current(&rec)) {
+ keyrange range;
+ range.create(rec.get_left_key(), rec.get_right_key());
+ row_lock lock = {.range = range,
+ .txnid = current_txnid,
+ .is_shared = !rec.get_exclusive_flag(),
+ .owners = nullptr};
+ insert_row_lock_into_tree(&lkr, lock, m_mgr);
+ iter.next();
+ }
+
+ // Notify higher layers that locks have changed for the current txnid
+ if (after_escalate_callback) {
+ after_escalate_callback(current_txnid, this, current_range_buffer->buffer,
+ after_escalate_callback_extra);
+ }
+ current_range_buffer->buffer.destroy();
+ }
+
+ while (range_buffers.size() > 0) {
+ struct txnid_range_buffer *buffer;
+ int r = range_buffers.fetch(0, &buffer);
+ invariant_zero(r);
+ r = range_buffers.delete_at(0);
+ invariant_zero(r);
+ toku_free(buffer);
+ }
+ range_buffers.destroy();
+
+ lkr.release();
+}
+
+void *locktree::get_userdata(void) const { return m_userdata; }
+
+void locktree::set_userdata(void *userdata) { m_userdata = userdata; }
+
+struct lt_lock_request_info *locktree::get_lock_request_info(void) {
+ return &m_lock_request_info;
+}
+
+void locktree::set_comparator(const comparator &cmp) { m_cmp.inherit(cmp); }
+
+locktree_manager *locktree::get_manager(void) const { return m_mgr; }
+
+int locktree::compare(const locktree *lt) const {
+ if (m_dict_id.dictid < lt->m_dict_id.dictid) {
+ return -1;
+ } else if (m_dict_id.dictid == lt->m_dict_id.dictid) {
+ return 0;
+ } else {
+ return 1;
+ }
+}
+
+DICTIONARY_ID locktree::get_dict_id() const { return m_dict_id; }
+
+} /* namespace toku */
+#endif // OS_WIN
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h
new file mode 100644
index 000000000..f0f4b042d
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h
@@ -0,0 +1,580 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <atomic>
+
+#include "../db.h"
+#include "../ft/comparator.h"
+#include "../portability/toku_external_pthread.h"
+#include "../portability/toku_pthread.h"
+#include "../portability/toku_time.h"
+// PORT #include <ft/ft-ops.h> // just for DICTIONARY_ID..
+// PORT: ft-status for LTM_STATUS:
+#include "../ft/ft-status.h"
+
+struct DICTIONARY_ID {
+ uint64_t dictid;
+};
+
+#include "../util/omt.h"
+#include "range_buffer.h"
+#include "txnid_set.h"
+#include "wfg.h"
+
+namespace toku {
+
+class locktree;
+class locktree_manager;
+class lock_request;
+class concurrent_tree;
+
+typedef int (*lt_create_cb)(locktree *lt, void *extra);
+typedef void (*lt_destroy_cb)(locktree *lt);
+typedef void (*lt_escalate_cb)(TXNID txnid, const locktree *lt,
+ const range_buffer &buffer, void *extra);
+
+typedef bool (*lt_escalation_barrier_check_func)(const DBT *a, const DBT *b,
+ void *extra);
+
+struct lt_counters {
+ uint64_t wait_count, wait_time;
+ uint64_t long_wait_count, long_wait_time;
+ uint64_t timeout_count;
+
+ void add(const lt_counters &rhs) {
+ wait_count += rhs.wait_count;
+ wait_time += rhs.wait_time;
+ long_wait_count += rhs.long_wait_count;
+ long_wait_time += rhs.long_wait_time;
+ timeout_count += rhs.timeout_count;
+ }
+};
+
+// Lock request state for some locktree
+struct lt_lock_request_info {
+ omt<lock_request *> pending_lock_requests;
+ std::atomic_bool pending_is_empty;
+ toku_external_mutex_t mutex;
+ bool should_retry_lock_requests;
+ lt_counters counters;
+ std::atomic_ullong retry_want;
+ unsigned long long retry_done;
+ toku_mutex_t retry_mutex;
+ toku_cond_t retry_cv;
+ bool running_retry;
+
+ void init(toku_external_mutex_factory_t mutex_factory);
+ void destroy(void);
+};
+
+// The locktree manager manages a set of locktrees, one for each open
+// dictionary. Locktrees are retrieved from the manager. When they are no
+// longer needed, they are released by the user.
+class locktree_manager {
+ public:
+ // param: create_cb, called just after a locktree is first created.
+ // destroy_cb, called just before a locktree is destroyed.
+ // escalate_cb, called after a locktree is escalated (with extra
+ // param)
+ void create(lt_create_cb create_cb, lt_destroy_cb destroy_cb,
+ lt_escalate_cb escalate_cb, void *extra,
+ toku_external_mutex_factory_t mutex_factory_arg);
+
+ void destroy(void);
+
+ size_t get_max_lock_memory(void);
+
+ int set_max_lock_memory(size_t max_lock_memory);
+
+  // effect: Get a locktree from the manager. If a locktree exists with the
+  //     given dict_id, it is referenced and then returned. If one did not
+  //     exist, it is created. It will use the comparator for comparing keys.
+  //     The on_create callback (passed to locktree_manager::create()) will be
+  //     called with the given extra parameter.
+ locktree *get_lt(DICTIONARY_ID dict_id, const comparator &cmp,
+ void *on_create_extra);
+
+ void reference_lt(locktree *lt);
+
+  // effect: Releases one reference on a locktree. If the reference count
+  //     transitions to zero, the on_destroy callback is called before it
+  //     gets destroyed.
+ void release_lt(locktree *lt);
+
+ void get_status(LTM_STATUS status);
+
+ // effect: calls the iterate function on each pending lock request
+ // note: holds the manager's mutex
+ typedef int (*lock_request_iterate_callback)(DICTIONARY_ID dict_id,
+ TXNID txnid, const DBT *left_key,
+ const DBT *right_key,
+ TXNID blocking_txnid,
+ uint64_t start_time,
+ void *extra);
+ int iterate_pending_lock_requests(lock_request_iterate_callback cb,
+ void *extra);
+
+ // effect: Determines if too many locks or too much memory is being used,
+ // Runs escalation on the manager if so.
+  // param: big_txn, if the current transaction is 'big' (has spilled rollback
+  //     logs)
+  // returns: 0 if there are enough resources to create a new lock, or
+  //     TOKUDB_OUT_OF_LOCKS if there are not enough resources and lock
+  //     escalation failed to free up enough resources for a new lock.
+ int check_current_lock_constraints(bool big_txn);
+
+ bool over_big_threshold(void);
+
+ void note_mem_used(uint64_t mem_used);
+
+ void note_mem_released(uint64_t mem_freed);
+
+ bool out_of_locks(void) const;
+
+ // Escalate all locktrees
+ void escalate_all_locktrees(void);
+
+ // Escalate a set of locktrees
+ void escalate_locktrees(locktree **locktrees, int num_locktrees);
+
+ // effect: calls the private function run_escalation(), only ok to
+ // do for tests.
+ // rationale: to get better stress test coverage, we want a way to
+  // deterministically trigger lock escalation.
+ void run_escalation_for_test(void);
+ void run_escalation(void);
+
+ // Add time t to the escalator's wait time statistics
+ void add_escalator_wait_time(uint64_t t);
+
+ void kill_waiter(void *extra);
+
+ private:
+ static const uint64_t DEFAULT_MAX_LOCK_MEMORY = 64L * 1024 * 1024;
+
+ // tracks the current number of locks and lock memory
+ uint64_t m_max_lock_memory;
+ uint64_t m_current_lock_memory;
+
+ struct lt_counters m_lt_counters;
+
+ // the create and destroy callbacks for the locktrees
+ lt_create_cb m_lt_create_callback;
+ lt_destroy_cb m_lt_destroy_callback;
+ lt_escalate_cb m_lt_escalate_callback;
+ void *m_lt_escalate_callback_extra;
+
+ omt<locktree *> m_locktree_map;
+
+ toku_external_mutex_factory_t mutex_factory;
+
+ // the manager's mutex protects the locktree map
+ toku_mutex_t m_mutex;
+
+ void mutex_lock(void);
+
+ void mutex_unlock(void);
+
+ // Manage the set of open locktrees
+ locktree *locktree_map_find(const DICTIONARY_ID &dict_id);
+ void locktree_map_put(locktree *lt);
+ void locktree_map_remove(locktree *lt);
+
+ static int find_by_dict_id(locktree *const &lt, const DICTIONARY_ID &dict_id);
+
+ void escalator_init(void);
+ void escalator_destroy(void);
+
+ // statistics about lock escalation.
+ toku_mutex_t m_escalation_mutex;
+ uint64_t m_escalation_count;
+ tokutime_t m_escalation_time;
+ uint64_t m_escalation_latest_result;
+ uint64_t m_wait_escalation_count;
+ uint64_t m_wait_escalation_time;
+ uint64_t m_long_wait_escalation_count;
+ uint64_t m_long_wait_escalation_time;
+
+ // the escalator coordinates escalation on a set of locktrees for a bunch of
+ // threads
+ class locktree_escalator {
+ public:
+ void create(void);
+ void destroy(void);
+ void run(locktree_manager *mgr, void (*escalate_locktrees_fun)(void *extra),
+ void *extra);
+
+ private:
+ toku_mutex_t m_escalator_mutex;
+ toku_cond_t m_escalator_done;
+ bool m_escalator_running;
+ };
+
+ locktree_escalator m_escalator;
+
+ friend class manager_unit_test;
+};
+
+// A locktree represents the set of row locks owned by all transactions
+// over an open dictionary. Read and write ranges are represented as
+// a left and right key which are compared with the given comparator
+//
+// Locktrees are not created and destroyed by the user. Instead, they are
+// referenced and released using the locktree manager.
+//
+// A sample workflow looks like this:
+// - Create a manager.
+// - Get a locktree by dictionary id from the manager.
+// - Perform read/write lock acquisition on the locktree, add references to
+// the locktree using the manager, release locks, release references, etc.
+// - ...
+// - Release the final reference to the locktree. It will be destroyed.
+// - Destroy the manager.
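+//
+// A minimal sketch of that workflow (hypothetical; dict_id, cmp and
+// mutex_factory are assumed to have been set up elsewhere, and error
+// handling is omitted):
+//
+//   locktree_manager mgr;
+//   mgr.create(nullptr, nullptr, nullptr, nullptr, mutex_factory);
+//   locktree *lt = mgr.get_lt(dict_id, cmp, nullptr);
+//   // ... acquire_read_lock() / acquire_write_lock() / release_locks() ...
+//   mgr.release_lt(lt);
+//   mgr.destroy();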
+class locktree {
+ public:
+ // effect: Creates a locktree
+ void create(locktree_manager *mgr, DICTIONARY_ID dict_id,
+ const comparator &cmp,
+ toku_external_mutex_factory_t mutex_factory);
+
+ void destroy(void);
+
+ // For thread-safe, external reference counting
+ void add_reference(void);
+
+ // requires: the reference count is > 0
+ // returns: the reference count, after decrementing it by one
+ uint32_t release_reference(void);
+
+ // returns: the current reference count
+ uint32_t get_reference_count(void);
+
+  // effect: Attempts to grant a read lock for the range of keys between
+  //     [left_key, right_key].
+  // returns: If the lock cannot be granted, return DB_LOCK_NOTGRANTED, and
+  //     populate the given conflicts set with the txnids that hold
+  //     conflicting locks in the range. If the locktree cannot create more
+  //     locks, return TOKUDB_OUT_OF_LOCKS.
+ // note: Read locks cannot be shared between txnids, as one would expect.
+ // This is for simplicity since read locks are rare in MySQL.
+ int acquire_read_lock(TXNID txnid, const DBT *left_key, const DBT *right_key,
+ txnid_set *conflicts, bool big_txn);
+
+  // effect: Attempts to grant a write lock for the range of keys between
+  //     [left_key, right_key].
+  // returns: If the lock cannot be granted, return DB_LOCK_NOTGRANTED, and
+  //     populate the given conflicts set with the txnids that hold
+  //     conflicting locks in the range. If the locktree cannot create more
+  //     locks, return TOKUDB_OUT_OF_LOCKS.
+ int acquire_write_lock(TXNID txnid, const DBT *left_key, const DBT *right_key,
+ txnid_set *conflicts, bool big_txn);
+
+  // effect: populate the conflicts set with the txnids that would prevent
+ // the given txnid from getting a lock on [left_key, right_key]
+ void get_conflicts(bool is_write_request, TXNID txnid, const DBT *left_key,
+ const DBT *right_key, txnid_set *conflicts);
+
+ // effect: Release all of the lock ranges represented by the range buffer for
+ // a txnid.
+ void release_locks(TXNID txnid, const range_buffer *ranges,
+ bool all_trx_locks_hint = false);
+
+ // effect: Runs escalation on this locktree
+ void escalate(lt_escalate_cb after_escalate_callback, void *extra);
+
+ // returns: The userdata associated with this locktree, or null if it has not
+ // been set.
+ void *get_userdata(void) const;
+
+ void set_userdata(void *userdata);
+
+ locktree_manager *get_manager(void) const;
+
+ void set_comparator(const comparator &cmp);
+
+ // Set the user-provided Lock Escalation Barrier check function and its
+ // argument
+ //
+ // Lock Escalation Barrier limits the scope of Lock Escalation.
+ // For two keys A and B (such that A < B),
+ // escalation_barrier_check_func(A, B)==true means that there's a lock
+ // escalation barrier between A and B, and lock escalation is not allowed to
+ // bridge the gap between A and B.
+ //
+ // This method sets the user-provided barrier check function and its
+ // parameter.
+ void set_escalation_barrier_func(lt_escalation_barrier_check_func func,
+ void *extra);
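+  //
+  // Illustrative sketch (hypothetical, not from the upstream sources): a
+  // barrier that stops escalation from merging locks across a one-byte
+  // table-id prefix at the start of each key could look like
+  //
+  //   static bool table_boundary_barrier(const DBT *a, const DBT *b,
+  //                                      void *extra) {
+  //     (void)extra;
+  //     return static_cast<const char *>(a->data)[0] !=
+  //            static_cast<const char *>(b->data)[0];
+  //   }
+  //   lt->set_escalation_barrier_func(table_boundary_barrier, nullptr);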
+
+ int compare(const locktree *lt) const;
+
+ DICTIONARY_ID get_dict_id() const;
+
+ // Private info struct for storing pending lock request state.
+ // Only to be used by lock requests. We store it here as
+ // something less opaque than usual to strike a tradeoff between
+ // abstraction and code complexity. It is still fairly abstract
+  // since the lock_request object is opaque.
+ struct lt_lock_request_info *get_lock_request_info(void);
+
+ typedef void (*dump_callback)(void *cdata, const DBT *left, const DBT *right,
+ TXNID txnid, bool is_shared,
+ TxnidVector *owners);
+ void dump_locks(void *cdata, dump_callback cb);
+
+ private:
+ locktree_manager *m_mgr;
+ DICTIONARY_ID m_dict_id;
+ uint32_t m_reference_count;
+
+ // Since the memory referenced by this comparator is not owned by the
+ // locktree, the user must guarantee it will outlive the locktree.
+ //
+ // The ydb API accomplishes this by opening an ft_handle in the on_create
+ // callback, which will keep the underlying FT (and its descriptor) in memory
+ // for as long as the handle is open. The ft_handle is stored opaquely in the
+ // userdata pointer below. see locktree_manager::get_lt w/ on_create_extra
+ comparator m_cmp;
+
+ lt_escalation_barrier_check_func m_escalation_barrier;
+ void *m_escalation_barrier_arg;
+
+ concurrent_tree *m_rangetree;
+
+ void *m_userdata;
+ struct lt_lock_request_info m_lock_request_info;
+
+ // psergey-todo:
+ // Each transaction also keeps a list of ranges it has locked.
+ // So, when a transaction is running in STO mode, two identical
+ // lists are kept: the STO lock list and transaction's owned locks
+ // list. Why can't we do with just one list?
+
+ // The following fields and members prefixed with "sto_" are for
+ // the single txnid optimization, intended to speed up the case
+ // when only one transaction is using the locktree. If we know
+ // the locktree has only one transaction, then acquiring locks
+ // takes O(1) work and releasing all locks takes O(1) work.
+ //
+ // How do we know that the locktree only has a single txnid?
+ // What do we do if it does?
+ //
+ // When a txn with txnid T requests a lock:
+ // - If the tree is empty, the optimization is possible. Set the single
+ // txnid to T, and insert the lock range into the buffer.
+ // - If the tree is not empty, check if the single txnid is T. If so,
+ // append the lock range to the buffer. Otherwise, migrate all of
+ // the locks in the buffer into the rangetree on behalf of txnid T,
+  //     and invalidate the single txnid.
+ //
+ // When a txn with txnid T releases its locks:
+ // - If the single txnid is valid, it must be for T. Destroy the buffer.
+ // - If it's not valid, release locks the normal way in the rangetree.
+ //
+ // To carry out the optimization we need to record a single txnid
+ // and a range buffer for each locktree, each protected by the root
+ // lock of the locktree's rangetree. The root lock for a rangetree
+ // is grabbed by preparing a locked keyrange on the rangetree.
+ TXNID m_sto_txnid;
+ range_buffer m_sto_buffer;
+
+ // The single txnid optimization speeds up the case when only one
+ // transaction is using the locktree. But it has the potential to
+ // hurt the case when more than one txnid exists.
+ //
+ // There are two things we need to do to make the optimization only
+ // optimize the case we care about, and not hurt the general case.
+ //
+ // Bound the worst-case latency for lock migration when the
+ // optimization stops working:
+  //  - Idea: Stop the optimization and migrate immediately if we notice
+  //    the single txnid has taken many locks in the range buffer.
+ // - Implementation: Enforce a max size on the single txnid range buffer.
+ // - Analysis: Choosing the perfect max value, M, is difficult to do
+ // without some feedback from the field. Intuition tells us that M should
+ // not be so small that the optimization is worthless, and it should not
+ // be so big that it's unreasonable to have to wait behind a thread doing
+ // the work of converting M buffer locks into rangetree locks.
+ //
+ // Prevent concurrent-transaction workloads from trying the optimization
+ // in vain:
+ // - Idea: Don't even bother trying the optimization if we think the
+ // system is in a concurrent-transaction state.
+ // - Implementation: Do something even simpler than detecting whether the
+  //    system is in a concurrent-transaction state. Just keep a "score" value
+ // and some threshold. If at any time the locktree is eligible for the
+ // optimization, only do it if the score is at this threshold. When you
+ // actually do the optimization but someone has to migrate locks in the buffer
+ // (expensive), then reset the score back to zero. Each time a txn
+ // releases locks, the score is incremented by 1.
+ // - Analysis: If you let the threshold be "C", then at most 1 / C txns will
+ // do the optimization in a concurrent-transaction system. Similarly, it
+  //    takes at most C txns to start using the single txnid optimization, which
+ // is good when the system transitions from multithreaded to single threaded.
+ //
+ // STO_BUFFER_MAX_SIZE:
+ //
+ // We choose the max value to be 1 million since most transactions are smaller
+ // than 1 million and we can create a rangetree of 1 million elements in
+ // less than a second. So we can be pretty confident that this threshold
+ // enables the optimization almost always, and prevents super pathological
+ // latency issues for the first lock taken by a second thread.
+ //
+ // STO_SCORE_THRESHOLD:
+ //
+ // A simple first guess at a good value for the score threshold is 100.
+ // By our analysis, we'd end up doing the optimization in vain for
+ // around 1% of all transactions, which seems reasonable. Further,
+ // if the system goes single threaded, it ought to be pretty quick
+ // for 100 transactions to go by, so we won't have to wait long before
+  // we start doing the single txnid optimization again.
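+  //
+  // (Worked example of that analysis: with the threshold of 100 chosen below,
+  // at most about 1 in 100 lock-releasing transactions in a concurrent
+  // workload attempts the optimization in vain, and roughly 100 releases are
+  // needed before a newly single-threaded workload re-enables it.)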
+ static const int STO_BUFFER_MAX_SIZE = 50 * 1024;
+ static const int STO_SCORE_THRESHOLD = 100;
+ int m_sto_score;
+
+ // statistics about time spent ending the STO early
+ uint64_t m_sto_end_early_count;
+ tokutime_t m_sto_end_early_time;
+
+  // effect: begins the single txnid optimization, setting m_sto_txnid
+ // to the given txnid.
+ // requires: m_sto_txnid is invalid
+ void sto_begin(TXNID txnid);
+
+ // effect: append a range to the sto buffer
+ // requires: m_sto_txnid is valid
+ void sto_append(const DBT *left_key, const DBT *right_key,
+ bool is_write_request);
+
+  // effect: ends the single txnid optimization, releasing any memory
+ // stored in the sto buffer, notifying the tracker, and
+ // invalidating m_sto_txnid.
+ // requires: m_sto_txnid is valid
+ void sto_end(void);
+
+ // params: prepared_lkr is a void * to a prepared locked keyrange. see below.
+ // effect: ends the single txnid optimization early, migrating buffer locks
+ // into the rangetree, calling sto_end(), and then setting the
+ // sto_score back to zero.
+ // requires: m_sto_txnid is valid
+ void sto_end_early(void *prepared_lkr);
+ void sto_end_early_no_accounting(void *prepared_lkr);
+
+  // params: prepared_lkr is a void * to a prepared locked keyrange. we can't
+  //     use the real type because the compiler won't allow us to forward
+  //     declare concurrent_tree::locked_keyrange without including
+  //     concurrent_tree.h, which we cannot do here because it is a template
+  //     implementation.
+ // requires: the prepared locked keyrange is for the locktree's rangetree
+ // requires: m_sto_txnid is valid
+ // effect: migrates each lock in the single txnid buffer into the locktree's
+ // rangetree, notifying the memory tracker as necessary.
+ void sto_migrate_buffer_ranges_to_tree(void *prepared_lkr);
+
+ // effect: If m_sto_txnid is valid, then release the txnid's locks
+ // by ending the optimization.
+ // requires: If m_sto_txnid is valid, it is equal to the given txnid
+ // returns: True if locks were released for this txnid
+ bool sto_try_release(TXNID txnid);
+
+ // params: prepared_lkr is a void * to a prepared locked keyrange. see above.
+ // requires: the prepared locked keyrange is for the locktree's rangetree
+ // effect: If m_sto_txnid is valid and equal to the given txnid, then
+ // append a range onto the buffer. Otherwise, if m_sto_txnid is valid
+ // but not equal to this txnid, then migrate the buffer's locks
+ // into the rangetree and end the optimization, setting the score
+ // back to zero.
+ // returns: true if the lock was acquired for this txnid
+ bool sto_try_acquire(void *prepared_lkr, TXNID txnid, const DBT *left_key,
+ const DBT *right_key, bool is_write_request);
+
+ // Effect:
+ // Provides a hook for a helgrind suppression.
+ // Returns:
+ // true if m_sto_txnid is not TXNID_NONE
+ bool sto_txnid_is_valid_unsafe(void) const;
+
+ // Effect:
+ // Provides a hook for a helgrind suppression.
+ // Returns:
+ // m_sto_score
+ int sto_get_score_unsafe(void) const;
+
+ void remove_overlapping_locks_for_txnid(TXNID txnid, const DBT *left_key,
+ const DBT *right_key);
+
+ int acquire_lock_consolidated(void *prepared_lkr, TXNID txnid,
+ const DBT *left_key, const DBT *right_key,
+ bool is_write_request, txnid_set *conflicts);
+
+ int acquire_lock(bool is_write_request, TXNID txnid, const DBT *left_key,
+ const DBT *right_key, txnid_set *conflicts);
+
+ int try_acquire_lock(bool is_write_request, TXNID txnid, const DBT *left_key,
+ const DBT *right_key, txnid_set *conflicts,
+ bool big_txn);
+
+ friend class locktree_unit_test;
+ friend class manager_unit_test;
+ friend class lock_request_unit_test;
+
+ // engine status reaches into the locktree to read some stats
+ friend void locktree_manager::get_status(LTM_STATUS status);
+};
+
+} /* namespace toku */
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc
new file mode 100644
index 000000000..4186182be
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc
@@ -0,0 +1,527 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "../portability/toku_pthread.h"
+#include "../util/status.h"
+#include "lock_request.h"
+#include "locktree.h"
+
+namespace toku {
+
+void locktree_manager::create(lt_create_cb create_cb, lt_destroy_cb destroy_cb,
+ lt_escalate_cb escalate_cb, void *escalate_extra,
+ toku_external_mutex_factory_t mutex_factory_arg) {
+ mutex_factory = mutex_factory_arg;
+ m_max_lock_memory = DEFAULT_MAX_LOCK_MEMORY;
+ m_current_lock_memory = 0;
+
+ m_locktree_map.create();
+ m_lt_create_callback = create_cb;
+ m_lt_destroy_callback = destroy_cb;
+ m_lt_escalate_callback = escalate_cb;
+ m_lt_escalate_callback_extra = escalate_extra;
+ ZERO_STRUCT(m_mutex);
+ toku_mutex_init(manager_mutex_key, &m_mutex, nullptr);
+
+ ZERO_STRUCT(m_lt_counters);
+
+ escalator_init();
+}
+
+void locktree_manager::destroy(void) {
+ escalator_destroy();
+ invariant(m_current_lock_memory == 0);
+ invariant(m_locktree_map.size() == 0);
+ m_locktree_map.destroy();
+ toku_mutex_destroy(&m_mutex);
+}
+
+void locktree_manager::mutex_lock(void) { toku_mutex_lock(&m_mutex); }
+
+void locktree_manager::mutex_unlock(void) { toku_mutex_unlock(&m_mutex); }
+
+size_t locktree_manager::get_max_lock_memory(void) { return m_max_lock_memory; }
+
+int locktree_manager::set_max_lock_memory(size_t max_lock_memory) {
+ int r = 0;
+ mutex_lock();
+ if (max_lock_memory < m_current_lock_memory) {
+ r = EDOM;
+ } else {
+ m_max_lock_memory = max_lock_memory;
+ }
+ mutex_unlock();
+ return r;
+}
+
+int locktree_manager::find_by_dict_id(locktree *const &lt,
+ const DICTIONARY_ID &dict_id) {
+ if (lt->get_dict_id().dictid < dict_id.dictid) {
+ return -1;
+ } else if (lt->get_dict_id().dictid == dict_id.dictid) {
+ return 0;
+ } else {
+ return 1;
+ }
+}
+
+locktree *locktree_manager::locktree_map_find(const DICTIONARY_ID &dict_id) {
+ locktree *lt;
+ int r = m_locktree_map.find_zero<DICTIONARY_ID, find_by_dict_id>(dict_id, &lt,
+ nullptr);
+ return r == 0 ? lt : nullptr;
+}
+
+void locktree_manager::locktree_map_put(locktree *lt) {
+ int r = m_locktree_map.insert<DICTIONARY_ID, find_by_dict_id>(
+ lt, lt->get_dict_id(), nullptr);
+ invariant_zero(r);
+}
+
+void locktree_manager::locktree_map_remove(locktree *lt) {
+ uint32_t idx;
+ locktree *found_lt;
+ int r = m_locktree_map.find_zero<DICTIONARY_ID, find_by_dict_id>(
+ lt->get_dict_id(), &found_lt, &idx);
+ invariant_zero(r);
+ invariant(found_lt == lt);
+ r = m_locktree_map.delete_at(idx);
+ invariant_zero(r);
+}
+
+locktree *locktree_manager::get_lt(DICTIONARY_ID dict_id, const comparator &cmp,
+ void *on_create_extra) {
+ // hold the mutex around searching and maybe
+ // inserting into the locktree map
+ mutex_lock();
+
+ locktree *lt = locktree_map_find(dict_id);
+ if (lt == nullptr) {
+ XCALLOC(lt);
+ lt->create(this, dict_id, cmp, mutex_factory);
+
+ // new locktree created - call the on_create callback
+ // and put it in the locktree map
+ if (m_lt_create_callback) {
+ int r = m_lt_create_callback(lt, on_create_extra);
+ if (r != 0) {
+ lt->release_reference();
+ lt->destroy();
+ toku_free(lt);
+ lt = nullptr;
+ }
+ }
+ if (lt) {
+ locktree_map_put(lt);
+ }
+ } else {
+ reference_lt(lt);
+ }
+
+ mutex_unlock();
+
+ return lt;
+}
+
+void locktree_manager::reference_lt(locktree *lt) {
+ // increment using a sync fetch and add.
+ // the caller guarantees that the lt won't be
+ // destroyed while we increment the count here.
+ //
+ // the caller can do this by already having an lt
+ // reference or by holding the manager mutex.
+ //
+ // if the manager's mutex is held, it is ok for the
+ // reference count to transition from 0 to 1 (no race),
+ // since we're serialized with other opens and closes.
+ lt->add_reference();
+}
+
+void locktree_manager::release_lt(locktree *lt) {
+ bool do_destroy = false;
+ DICTIONARY_ID dict_id = lt->get_dict_id();
+
+ // Release a reference on the locktree. If the count transitions to zero,
+ // then we *may* need to do the cleanup.
+ //
+ // Grab the manager's mutex and look for a locktree with this locktree's
+ // dictionary id. Since dictionary id's never get reused, any locktree
+ // found must be the one we just released a reference on.
+ //
+ // At least two things could have happened since we got the mutex:
+ // - Another thread gets a locktree with the same dict_id, increments
+ // the reference count. In this case, we shouldn't destroy it.
+ // - Another thread gets a locktree with the same dict_id and then
+ // releases it quickly, transitioning the reference count from zero to
+ // one and back to zero. In this case, only one of us should destroy it.
+ // It doesn't matter which. We originally missed this case, see #5776.
+ //
+ // After 5776, the high level rule for release is described below.
+ //
+ // If a thread releases a locktree and notices the reference count transition
+ // to zero, then that thread must immediately:
+ // - assume the locktree object is invalid
+ // - grab the manager's mutex
+ // - search the locktree map for a locktree with the same dict_id and remove
+ // it, if it exists. the destroy may be deferred.
+ // - release the manager's mutex
+ //
+ // This way, if many threads transition the same locktree's reference count
+ // from 1 to zero and wait behind the manager's mutex, only one of them will
+ // do the actual destroy and the others will happily do nothing.
+ uint32_t refs = lt->release_reference();
+ if (refs == 0) {
+ mutex_lock();
+ // lt may not have already been destroyed, so look it up.
+ locktree *find_lt = locktree_map_find(dict_id);
+ if (find_lt != nullptr) {
+ // A locktree is still in the map with that dict_id, so it must be
+ // equal to lt. This is true because dictionary ids are never reused.
+ // If the reference count is zero, it's our responsibility to remove
+ // it and do the destroy. Otherwise, someone still wants it.
+ // If the locktree is still valid then check if it should be deleted.
+ if (find_lt == lt) {
+ if (lt->get_reference_count() == 0) {
+ locktree_map_remove(lt);
+ do_destroy = true;
+ }
+ m_lt_counters.add(lt->get_lock_request_info()->counters);
+ }
+ }
+ mutex_unlock();
+ }
+
+ // if necessary, do the destroy without holding the mutex
+ if (do_destroy) {
+ if (m_lt_destroy_callback) {
+ m_lt_destroy_callback(lt);
+ }
+ lt->destroy();
+ toku_free(lt);
+ }
+}
+
+void locktree_manager::run_escalation(void) {
+ struct escalation_fn {
+ static void run(void *extra) {
+ locktree_manager *mgr = (locktree_manager *)extra;
+ mgr->escalate_all_locktrees();
+ };
+ };
+ m_escalator.run(this, escalation_fn::run, this);
+}
+
+// test-only version of lock escalation
+void locktree_manager::run_escalation_for_test(void) { run_escalation(); }
+
+void locktree_manager::escalate_all_locktrees(void) {
+ uint64_t t0 = toku_current_time_microsec();
+
+ // get all locktrees
+ mutex_lock();
+ int num_locktrees = m_locktree_map.size();
+ locktree **locktrees = new locktree *[num_locktrees];
+ for (int i = 0; i < num_locktrees; i++) {
+ int r = m_locktree_map.fetch(i, &locktrees[i]);
+ invariant_zero(r);
+ reference_lt(locktrees[i]);
+ }
+ mutex_unlock();
+
+ // escalate them
+ escalate_locktrees(locktrees, num_locktrees);
+
+ delete[] locktrees;
+
+ uint64_t t1 = toku_current_time_microsec();
+ add_escalator_wait_time(t1 - t0);
+}
+
+void locktree_manager::note_mem_used(uint64_t mem_used) {
+ (void)toku_sync_fetch_and_add(&m_current_lock_memory, mem_used);
+}
+
+void locktree_manager::note_mem_released(uint64_t mem_released) {
+ uint64_t old_mem_used =
+ toku_sync_fetch_and_sub(&m_current_lock_memory, mem_released);
+ invariant(old_mem_used >= mem_released);
+}
+
+bool locktree_manager::out_of_locks(void) const {
+ return m_current_lock_memory >= m_max_lock_memory;
+}
+
+bool locktree_manager::over_big_threshold(void) {
+ return m_current_lock_memory >= m_max_lock_memory / 2;
+}
+
+int locktree_manager::iterate_pending_lock_requests(
+ lock_request_iterate_callback callback, void *extra) {
+ mutex_lock();
+ int r = 0;
+ uint32_t num_locktrees = m_locktree_map.size();
+ for (uint32_t i = 0; i < num_locktrees && r == 0; i++) {
+ locktree *lt;
+ r = m_locktree_map.fetch(i, &lt);
+ invariant_zero(r);
+ if (r == EINVAL) // Shouldn't happen, avoid compiler warning
+ continue;
+
+ struct lt_lock_request_info *info = lt->get_lock_request_info();
+ toku_external_mutex_lock(&info->mutex);
+
+ uint32_t num_requests = info->pending_lock_requests.size();
+ for (uint32_t k = 0; k < num_requests && r == 0; k++) {
+ lock_request *req;
+ r = info->pending_lock_requests.fetch(k, &req);
+ invariant_zero(r);
+ if (r == EINVAL) /* Shouldn't happen, avoid compiler warning */
+ continue;
+ r = callback(lt->get_dict_id(), req->get_txnid(), req->get_left_key(),
+ req->get_right_key(), req->get_conflicting_txnid(),
+ req->get_start_time(), extra);
+ }
+
+ toku_external_mutex_unlock(&info->mutex);
+ }
+ mutex_unlock();
+ return r;
+}
+
+int locktree_manager::check_current_lock_constraints(bool big_txn) {
+ int r = 0;
+ if (big_txn && over_big_threshold()) {
+ run_escalation();
+ if (over_big_threshold()) {
+ r = TOKUDB_OUT_OF_LOCKS;
+ }
+ }
+ if (r == 0 && out_of_locks()) {
+ run_escalation();
+ if (out_of_locks()) {
+ // return an error if we're still out of locks after escalation.
+ r = TOKUDB_OUT_OF_LOCKS;
+ }
+ }
+ return r;
+}
+
+void locktree_manager::escalator_init(void) {
+ ZERO_STRUCT(m_escalation_mutex);
+ toku_mutex_init(manager_escalation_mutex_key, &m_escalation_mutex, nullptr);
+ m_escalation_count = 0;
+ m_escalation_time = 0;
+ m_wait_escalation_count = 0;
+ m_wait_escalation_time = 0;
+ m_long_wait_escalation_count = 0;
+ m_long_wait_escalation_time = 0;
+ m_escalation_latest_result = 0;
+ m_escalator.create();
+}
+
+void locktree_manager::escalator_destroy(void) {
+ m_escalator.destroy();
+ toku_mutex_destroy(&m_escalation_mutex);
+}
+
+void locktree_manager::add_escalator_wait_time(uint64_t t) {
+ toku_mutex_lock(&m_escalation_mutex);
+ m_wait_escalation_count += 1;
+ m_wait_escalation_time += t;
+ if (t >= 1000000) {
+ m_long_wait_escalation_count += 1;
+ m_long_wait_escalation_time += t;
+ }
+ toku_mutex_unlock(&m_escalation_mutex);
+}
+
+void locktree_manager::escalate_locktrees(locktree **locktrees,
+ int num_locktrees) {
+ // there are too many row locks in the system and we need to tidy up.
+ //
+ // a simple implementation of escalation does not attempt
+  // to reduce the memory footprint of each txn's range buffer.
+ // doing so would require some layering hackery (or a callback)
+ // and more complicated locking. for now, just escalate each
+ // locktree individually, in-place.
+ tokutime_t t0 = toku_time_now();
+ for (int i = 0; i < num_locktrees; i++) {
+ locktrees[i]->escalate(m_lt_escalate_callback,
+ m_lt_escalate_callback_extra);
+ release_lt(locktrees[i]);
+ }
+ tokutime_t t1 = toku_time_now();
+
+ toku_mutex_lock(&m_escalation_mutex);
+ m_escalation_count++;
+ m_escalation_time += (t1 - t0);
+ m_escalation_latest_result = m_current_lock_memory;
+ toku_mutex_unlock(&m_escalation_mutex);
+}
+
+struct escalate_args {
+ locktree_manager *mgr;
+ locktree **locktrees;
+ int num_locktrees;
+};
+
+void locktree_manager::locktree_escalator::create(void) {
+ ZERO_STRUCT(m_escalator_mutex);
+ toku_mutex_init(manager_escalator_mutex_key, &m_escalator_mutex, nullptr);
+ toku_cond_init(manager_m_escalator_done_key, &m_escalator_done, nullptr);
+ m_escalator_running = false;
+}
+
+void locktree_manager::locktree_escalator::destroy(void) {
+ toku_cond_destroy(&m_escalator_done);
+ toku_mutex_destroy(&m_escalator_mutex);
+}
+
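+// Descriptive note (added for clarity): run() below lets exactly one caller
+// perform the escalation work; any caller that arrives while an escalation is
+// already in flight simply waits on m_escalator_done and piggybacks on that
+// result. Both paths are charged to the escalator wait-time statistics.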
+void locktree_manager::locktree_escalator::run(
+ locktree_manager *mgr, void (*escalate_locktrees_fun)(void *extra),
+ void *extra) {
+ uint64_t t0 = toku_current_time_microsec();
+ toku_mutex_lock(&m_escalator_mutex);
+ if (!m_escalator_running) {
+ // run escalation on this thread
+ m_escalator_running = true;
+ toku_mutex_unlock(&m_escalator_mutex);
+ escalate_locktrees_fun(extra);
+ toku_mutex_lock(&m_escalator_mutex);
+ m_escalator_running = false;
+ toku_cond_broadcast(&m_escalator_done);
+ } else {
+ toku_cond_wait(&m_escalator_done, &m_escalator_mutex);
+ }
+ toku_mutex_unlock(&m_escalator_mutex);
+ uint64_t t1 = toku_current_time_microsec();
+ mgr->add_escalator_wait_time(t1 - t0);
+}
+
+void locktree_manager::get_status(LTM_STATUS statp) {
+ ltm_status.init();
+ LTM_STATUS_VAL(LTM_SIZE_CURRENT) = m_current_lock_memory;
+ LTM_STATUS_VAL(LTM_SIZE_LIMIT) = m_max_lock_memory;
+ LTM_STATUS_VAL(LTM_ESCALATION_COUNT) = m_escalation_count;
+ LTM_STATUS_VAL(LTM_ESCALATION_TIME) = m_escalation_time;
+ LTM_STATUS_VAL(LTM_ESCALATION_LATEST_RESULT) = m_escalation_latest_result;
+ LTM_STATUS_VAL(LTM_WAIT_ESCALATION_COUNT) = m_wait_escalation_count;
+ LTM_STATUS_VAL(LTM_WAIT_ESCALATION_TIME) = m_wait_escalation_time;
+ LTM_STATUS_VAL(LTM_LONG_WAIT_ESCALATION_COUNT) = m_long_wait_escalation_count;
+ LTM_STATUS_VAL(LTM_LONG_WAIT_ESCALATION_TIME) = m_long_wait_escalation_time;
+
+ uint64_t lock_requests_pending = 0;
+ uint64_t sto_num_eligible = 0;
+ uint64_t sto_end_early_count = 0;
+ tokutime_t sto_end_early_time = 0;
+ uint32_t num_locktrees = 0;
+ struct lt_counters lt_counters;
+ ZERO_STRUCT(lt_counters); // PORT: instead of ={}.
+
+ if (toku_mutex_trylock(&m_mutex) == 0) {
+ lt_counters = m_lt_counters;
+ num_locktrees = m_locktree_map.size();
+ for (uint32_t i = 0; i < num_locktrees; i++) {
+ locktree *lt;
+ int r = m_locktree_map.fetch(i, &lt);
+ invariant_zero(r);
+ if (r == EINVAL) // Shouldn't happen, avoid compiler warning
+ continue;
+ if (toku_external_mutex_trylock(&lt->m_lock_request_info.mutex) == 0) {
+ lock_requests_pending +=
+ lt->m_lock_request_info.pending_lock_requests.size();
+ lt_counters.add(lt->get_lock_request_info()->counters);
+ toku_external_mutex_unlock(&lt->m_lock_request_info.mutex);
+ }
+ sto_num_eligible += lt->sto_txnid_is_valid_unsafe() ? 1 : 0;
+ sto_end_early_count += lt->m_sto_end_early_count;
+ sto_end_early_time += lt->m_sto_end_early_time;
+ }
+ mutex_unlock();
+ }
+
+ LTM_STATUS_VAL(LTM_NUM_LOCKTREES) = num_locktrees;
+ LTM_STATUS_VAL(LTM_LOCK_REQUESTS_PENDING) = lock_requests_pending;
+ LTM_STATUS_VAL(LTM_STO_NUM_ELIGIBLE) = sto_num_eligible;
+ LTM_STATUS_VAL(LTM_STO_END_EARLY_COUNT) = sto_end_early_count;
+ LTM_STATUS_VAL(LTM_STO_END_EARLY_TIME) = sto_end_early_time;
+ LTM_STATUS_VAL(LTM_WAIT_COUNT) = lt_counters.wait_count;
+ LTM_STATUS_VAL(LTM_WAIT_TIME) = lt_counters.wait_time;
+ LTM_STATUS_VAL(LTM_LONG_WAIT_COUNT) = lt_counters.long_wait_count;
+ LTM_STATUS_VAL(LTM_LONG_WAIT_TIME) = lt_counters.long_wait_time;
+ LTM_STATUS_VAL(LTM_TIMEOUT_COUNT) = lt_counters.timeout_count;
+ *statp = ltm_status;
+}
+
+void locktree_manager::kill_waiter(void *extra) {
+ mutex_lock();
+ int r = 0;
+ uint32_t num_locktrees = m_locktree_map.size();
+ for (uint32_t i = 0; i < num_locktrees; i++) {
+ locktree *lt;
+ r = m_locktree_map.fetch(i, &lt);
+ invariant_zero(r);
+ if (r) continue; // Get rid of "may be used uninitialized" warning
+ lock_request::kill_waiter(lt, extra);
+ }
+ mutex_unlock();
+}
+
+} /* namespace toku */
+#endif // OS_WIN
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc
new file mode 100644
index 000000000..1e1d23ef8
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc
@@ -0,0 +1,265 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "range_buffer.h"
+
+#include <string.h>
+
+#include "../portability/memory.h"
+#include "../util/dbt.h"
+
+namespace toku {
+
+bool range_buffer::record_header::left_is_infinite(void) const {
+ return left_neg_inf || left_pos_inf;
+}
+
+bool range_buffer::record_header::right_is_infinite(void) const {
+ return right_neg_inf || right_pos_inf;
+}
+
+void range_buffer::record_header::init(const DBT *left_key,
+ const DBT *right_key,
+ bool is_exclusive) {
+ is_exclusive_lock = is_exclusive;
+ left_neg_inf = left_key == toku_dbt_negative_infinity();
+ left_pos_inf = left_key == toku_dbt_positive_infinity();
+ left_key_size = toku_dbt_is_infinite(left_key) ? 0 : left_key->size;
+ if (right_key) {
+ right_neg_inf = right_key == toku_dbt_negative_infinity();
+ right_pos_inf = right_key == toku_dbt_positive_infinity();
+ right_key_size = toku_dbt_is_infinite(right_key) ? 0 : right_key->size;
+ } else {
+ right_neg_inf = left_neg_inf;
+ right_pos_inf = left_pos_inf;
+ right_key_size = 0;
+ }
+}
+
+const DBT *range_buffer::iterator::record::get_left_key(void) const {
+ if (_header.left_neg_inf) {
+ return toku_dbt_negative_infinity();
+ } else if (_header.left_pos_inf) {
+ return toku_dbt_positive_infinity();
+ } else {
+ return &_left_key;
+ }
+}
+
+const DBT *range_buffer::iterator::record::get_right_key(void) const {
+ if (_header.right_neg_inf) {
+ return toku_dbt_negative_infinity();
+ } else if (_header.right_pos_inf) {
+ return toku_dbt_positive_infinity();
+ } else {
+ return &_right_key;
+ }
+}
+
+size_t range_buffer::iterator::record::size(void) const {
+ return sizeof(record_header) + _header.left_key_size + _header.right_key_size;
+}
+
+void range_buffer::iterator::record::deserialize(const char *buf) {
+ size_t current = 0;
+
+ // deserialize the header
+ memcpy(&_header, buf, sizeof(record_header));
+ current += sizeof(record_header);
+
+ // deserialize the left key if necessary
+ if (!_header.left_is_infinite()) {
+ // point the left DBT's buffer into ours
+ toku_fill_dbt(&_left_key, buf + current, _header.left_key_size);
+ current += _header.left_key_size;
+ }
+
+ // deserialize the right key if necessary
+ if (!_header.right_is_infinite()) {
+ if (_header.right_key_size == 0) {
+ toku_copyref_dbt(&_right_key, _left_key);
+ } else {
+ toku_fill_dbt(&_right_key, buf + current, _header.right_key_size);
+ }
+ }
+}
+
+toku::range_buffer::iterator::iterator()
+ : _ma_chunk_iterator(nullptr),
+ _current_chunk_base(nullptr),
+ _current_chunk_offset(0),
+ _current_chunk_max(0),
+ _current_rec_size(0) {}
+
+toku::range_buffer::iterator::iterator(const range_buffer *buffer)
+ : _ma_chunk_iterator(&buffer->_arena),
+ _current_chunk_base(nullptr),
+ _current_chunk_offset(0),
+ _current_chunk_max(0),
+ _current_rec_size(0) {
+ reset_current_chunk();
+}
+
+void range_buffer::iterator::reset_current_chunk() {
+ _current_chunk_base = _ma_chunk_iterator.current(&_current_chunk_max);
+ _current_chunk_offset = 0;
+}
+
+bool range_buffer::iterator::current(record *rec) {
+ if (_current_chunk_offset < _current_chunk_max) {
+ const char *buf = reinterpret_cast<const char *>(_current_chunk_base);
+ rec->deserialize(buf + _current_chunk_offset);
+ _current_rec_size = rec->size();
+ return true;
+ } else {
+ return false;
+ }
+}
+
+// move the iterator to the next record in the buffer
+void range_buffer::iterator::next(void) {
+ invariant(_current_chunk_offset < _current_chunk_max);
+ invariant(_current_rec_size > 0);
+
+ // the next record is _current_rec_size bytes forward
+ _current_chunk_offset += _current_rec_size;
+  // now, we don't know how big the current record is, so set it to 0.
+ _current_rec_size = 0;
+
+ if (_current_chunk_offset >= _current_chunk_max) {
+ // current chunk is exhausted, try moving to the next one
+ if (_ma_chunk_iterator.more()) {
+ _ma_chunk_iterator.next();
+ reset_current_chunk();
+ }
+ }
+}
+
+void range_buffer::create(void) {
+ // allocate buffer space lazily instead of on creation. this way,
+ // no malloc/free is done if the transaction ends up taking no locks.
+ _arena.create(0);
+ _num_ranges = 0;
+}
+
+void range_buffer::append(const DBT *left_key, const DBT *right_key,
+ bool is_write_request) {
+ // if the keys are equal, then only one copy is stored.
+ if (toku_dbt_equals(left_key, right_key)) {
+ invariant(left_key->size <= MAX_KEY_SIZE);
+ append_point(left_key, is_write_request);
+ } else {
+ invariant(left_key->size <= MAX_KEY_SIZE);
+ invariant(right_key->size <= MAX_KEY_SIZE);
+ append_range(left_key, right_key, is_write_request);
+ }
+ _num_ranges++;
+}
+
+bool range_buffer::is_empty(void) const { return total_memory_size() == 0; }
+
+uint64_t range_buffer::total_memory_size(void) const {
+ return _arena.total_size_in_use();
+}
+
+int range_buffer::get_num_ranges(void) const { return _num_ranges; }
+
+void range_buffer::destroy(void) { _arena.destroy(); }
+
+void range_buffer::append_range(const DBT *left_key, const DBT *right_key,
+ bool is_exclusive) {
+ size_t record_length =
+ sizeof(record_header) + left_key->size + right_key->size;
+ char *buf = reinterpret_cast<char *>(_arena.malloc_from_arena(record_length));
+
+ record_header h;
+ h.init(left_key, right_key, is_exclusive);
+
+ // serialize the header
+ memcpy(buf, &h, sizeof(record_header));
+ buf += sizeof(record_header);
+
+ // serialize the left key if necessary
+ if (!h.left_is_infinite()) {
+ memcpy(buf, left_key->data, left_key->size);
+ buf += left_key->size;
+ }
+
+ // serialize the right key if necessary
+ if (!h.right_is_infinite()) {
+ memcpy(buf, right_key->data, right_key->size);
+ }
+}
+
+void range_buffer::append_point(const DBT *key, bool is_exclusive) {
+ size_t record_length = sizeof(record_header) + key->size;
+ char *buf = reinterpret_cast<char *>(_arena.malloc_from_arena(record_length));
+
+ record_header h;
+ h.init(key, nullptr, is_exclusive);
+
+ // serialize the header
+ memcpy(buf, &h, sizeof(record_header));
+ buf += sizeof(record_header);
+
+ // serialize the key if necessary
+ if (!h.left_is_infinite()) {
+ memcpy(buf, key->data, key->size);
+ }
+}
+
+} /* namespace toku */
+#endif // OS_WIN
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.h
new file mode 100644
index 000000000..76e28d747
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.h
@@ -0,0 +1,178 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <inttypes.h>
+#include <stdint.h>
+
+#include "../util/dbt.h"
+#include "../util/memarena.h"
+
+namespace toku {
+
+// a key range buffer represents a set of key ranges that can
+// be stored, iterated over, and then destroyed all at once.
+class range_buffer {
+ private:
+ // the key range buffer is a bunch of records in a row.
+ // each record has the following header, followed by the
+ // left key and right key data payload, if applicable.
+  // we limit keys to be 2^16 bytes, since we store lengths as 2 bytes.
+ static const size_t MAX_KEY_SIZE = 1 << 16;
+
+ struct record_header {
+ bool left_neg_inf;
+ bool left_pos_inf;
+ bool right_pos_inf;
+ bool right_neg_inf;
+ uint16_t left_key_size;
+ uint16_t right_key_size;
+ bool is_exclusive_lock;
+
+ bool left_is_infinite(void) const;
+
+ bool right_is_infinite(void) const;
+
+ void init(const DBT *left_key, const DBT *right_key, bool is_exclusive);
+ };
+  // PORT static_assert(sizeof(record_header) == 8, "record header format is off");
+
+ public:
+ // the iterator abstracts reading over a buffer of variable length
+ // records one by one until there are no more left.
+ class iterator {
+ public:
+ iterator();
+ iterator(const range_buffer *buffer);
+
+ // a record represents the user-view of a serialized key range.
+ // it handles positive and negative infinity and the optimized
+ // point range case, where left and right points share memory.
+ class record {
+ public:
+ // get a read-only pointer to the left key of this record's range
+ const DBT *get_left_key(void) const;
+
+ // get a read-only pointer to the right key of this record's range
+ const DBT *get_right_key(void) const;
+
+ // how big is this record? this tells us where the next record is
+ size_t size(void) const;
+
+ bool get_exclusive_flag() const { return _header.is_exclusive_lock; }
+
+ // populate a record header and point our DBT's
+ // buffers into ours if they are not infinite.
+ void deserialize(const char *buf);
+
+ private:
+ record_header _header;
+ DBT _left_key;
+ DBT _right_key;
+ };
+
+    // populate the given record object with the current record.
+ // the memory referred to by record is valid for only
+ // as long as the record exists.
+ bool current(record *rec);
+
+ // move the iterator to the next record in the buffer
+ void next(void);
+
+ private:
+ void reset_current_chunk();
+
+ // the key range buffer we are iterating over, the current
+ // offset in that buffer, and the size of the current record.
+ memarena::chunk_iterator _ma_chunk_iterator;
+ const void *_current_chunk_base;
+ size_t _current_chunk_offset;
+ size_t _current_chunk_max;
+ size_t _current_rec_size;
+ };
+
+ // allocate buffer space lazily instead of on creation. this way,
+ // no malloc/free is done if the transaction ends up taking no locks.
+ void create(void);
+
+ // append a left/right key range to the buffer.
+ // if the keys are equal, then only one copy is stored.
+ void append(const DBT *left_key, const DBT *right_key,
+ bool is_write_request = false);
+
+ // is this range buffer empty?
+ bool is_empty(void) const;
+
+ // how much memory is being used by this range buffer?
+ uint64_t total_memory_size(void) const;
+
+ // how many ranges are stored in this range buffer?
+ int get_num_ranges(void) const;
+
+ void destroy(void);
+
+ private:
+ memarena _arena;
+ int _num_ranges;
+
+ void append_range(const DBT *left_key, const DBT *right_key,
+ bool is_write_request);
+
+ // append a point to the buffer. this is the space/time saving
+ // optimization for key ranges where left == right.
+ void append_point(const DBT *key, bool is_write_request);
+};
+
+} /* namespace toku */
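
As an illustrative aside (not part of the patch itself), here is a minimal sketch of how the range_buffer API declared above might be driven. DBT and toku_fill_dbt are assumed from ../util/dbt.h as used in range_buffer.cc; the keys and the function name are hypothetical.

#include "range_buffer.h"

static void dump_locked_ranges_sketch(void) {
  toku::range_buffer buffer;
  buffer.create();  // lazy: no arena allocation until the first append()

  DBT a, m;
  toku_fill_dbt(&a, "a", 1);
  toku_fill_dbt(&m, "m", 1);
  buffer.append(&a, &m, /*is_write_request=*/true);  // range [a, m], exclusive
  buffer.append(&a, &a);  // equal keys: stored once via append_point()

  toku::range_buffer::iterator iter(&buffer);
  toku::range_buffer::iterator::record rec;
  while (iter.current(&rec)) {
    // the DBTs returned here point into the buffer's arena, so they are
    // only valid while `buffer` is alive
    const DBT *left = rec.get_left_key();
    const DBT *right = rec.get_right_key();
    (void)left;
    (void)right;
    iter.next();
  }

  buffer.destroy();
}
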
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc
new file mode 100644
index 000000000..8997f634b
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc
@@ -0,0 +1,520 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "treenode.h"
+
+#include "../portability/toku_race_tools.h"
+
+namespace toku {
+
+// TODO: source location info might have to be pulled up one caller
+// to be useful
+void treenode::mutex_lock(void) { toku_mutex_lock(&m_mutex); }
+
+void treenode::mutex_unlock(void) { toku_mutex_unlock(&m_mutex); }
+
+void treenode::init(const comparator *cmp) {
+ m_txnid = TXNID_NONE;
+ m_is_root = false;
+ m_is_empty = true;
+ m_cmp = cmp;
+
+ m_is_shared = false;
+ m_owners = nullptr;
+
+ // use an adaptive mutex at each node since we expect the time the
+ // lock is held to be relatively short compared to a context switch.
+ // indeed, this improves performance at high thread counts considerably.
+ memset(&m_mutex, 0, sizeof(toku_mutex_t));
+ toku_pthread_mutexattr_t attr;
+ toku_mutexattr_init(&attr);
+ toku_mutexattr_settype(&attr, TOKU_MUTEX_ADAPTIVE);
+ toku_mutex_init(treenode_mutex_key, &m_mutex, &attr);
+ toku_mutexattr_destroy(&attr);
+ m_left_child.set(nullptr);
+ m_right_child.set(nullptr);
+}
+
+void treenode::create_root(const comparator *cmp) {
+ init(cmp);
+ m_is_root = true;
+}
+
+void treenode::destroy_root(void) {
+ invariant(is_root());
+ invariant(is_empty());
+ toku_mutex_destroy(&m_mutex);
+ m_cmp = nullptr;
+}
+
+void treenode::set_range_and_txnid(const keyrange &range, TXNID txnid,
+ bool is_shared) {
+ // allocates a new copy of the range for this node
+ m_range.create_copy(range);
+ m_txnid = txnid;
+ m_is_shared = is_shared;
+ m_is_empty = false;
+}
+
+bool treenode::is_root(void) { return m_is_root; }
+
+bool treenode::is_empty(void) { return m_is_empty; }
+
+bool treenode::range_overlaps(const keyrange &range) {
+ return m_range.overlaps(*m_cmp, range);
+}
+
+treenode *treenode::alloc(const comparator *cmp, const keyrange &range,
+ TXNID txnid, bool is_shared) {
+ treenode *XCALLOC(node);
+ node->init(cmp);
+ node->set_range_and_txnid(range, txnid, is_shared);
+ return node;
+}
+
+void treenode::swap_in_place(treenode *node1, treenode *node2) {
+ keyrange tmp_range = node1->m_range;
+ TXNID tmp_txnid = node1->m_txnid;
+ node1->m_range = node2->m_range;
+ node1->m_txnid = node2->m_txnid;
+ node2->m_range = tmp_range;
+ node2->m_txnid = tmp_txnid;
+
+ bool tmp_is_shared = node1->m_is_shared;
+ node1->m_is_shared = node2->m_is_shared;
+ node2->m_is_shared = tmp_is_shared;
+
+ auto tmp_m_owners = node1->m_owners;
+ node1->m_owners = node2->m_owners;
+ node2->m_owners = tmp_m_owners;
+}
+
+bool treenode::add_shared_owner(TXNID txnid) {
+ assert(m_is_shared);
+ if (txnid == m_txnid)
+ return false; // acquiring a lock on the same range by the same trx
+
+ if (m_txnid != TXNID_SHARED) {
+ m_owners = new TxnidVector;
+ m_owners->insert(m_txnid);
+ m_txnid = TXNID_SHARED;
+ }
+ m_owners->insert(txnid);
+ return true;
+}
+
+void treenode::free(treenode *node) {
+ // destroy the range, freeing any copied keys
+ node->m_range.destroy();
+
+ if (node->m_owners) {
+ delete node->m_owners;
+ node->m_owners = nullptr; // need this?
+ }
+
+ // the root is simply marked as empty.
+ if (node->is_root()) {
+ // PORT toku_mutex_assert_locked(&node->m_mutex);
+ node->m_is_empty = true;
+ } else {
+ // PORT toku_mutex_assert_unlocked(&node->m_mutex);
+ toku_mutex_destroy(&node->m_mutex);
+ toku_free(node);
+ }
+}
+
+uint32_t treenode::get_depth_estimate(void) const {
+ const uint32_t left_est = m_left_child.depth_est;
+ const uint32_t right_est = m_right_child.depth_est;
+ return (left_est > right_est ? left_est : right_est) + 1;
+}
+
+treenode *treenode::find_node_with_overlapping_child(
+ const keyrange &range, const keyrange::comparison *cmp_hint) {
+ // determine which child to look at based on a comparison. if we were
+ // given a comparison hint, use that. otherwise, compare them now.
+ keyrange::comparison c =
+ cmp_hint ? *cmp_hint : range.compare(*m_cmp, m_range);
+
+ treenode *child;
+ if (c == keyrange::comparison::LESS_THAN) {
+ child = lock_and_rebalance_left();
+ } else {
+ // The caller (locked_keyrange::acquire) handles the case where
+ // the root of the locked_keyrange is the node that overlaps.
+ // range is guaranteed not to overlap this node.
+ invariant(c == keyrange::comparison::GREATER_THAN);
+ child = lock_and_rebalance_right();
+ }
+
+ // if the search would lead us to an empty subtree (child == nullptr),
+ // or the child overlaps, then we know this node is the parent we want.
+ // otherwise we need to recur into that child.
+ if (child == nullptr) {
+ return this;
+ } else {
+ c = range.compare(*m_cmp, child->m_range);
+ if (c == keyrange::comparison::EQUALS ||
+ c == keyrange::comparison::OVERLAPS) {
+ child->mutex_unlock();
+ return this;
+ } else {
+ // unlock this node before recurring into the locked child,
+      // passing in a comparison hint since we just compared range
+ // to the child's range.
+ mutex_unlock();
+ return child->find_node_with_overlapping_child(range, &c);
+ }
+ }
+}
+
+bool treenode::insert(const keyrange &range, TXNID txnid, bool is_shared) {
+  bool rc = true;
+ // choose a child to check. if that child is null, then insert the new node
+ // there. otherwise recur down that child's subtree
+ keyrange::comparison c = range.compare(*m_cmp, m_range);
+ if (c == keyrange::comparison::LESS_THAN) {
+ treenode *left_child = lock_and_rebalance_left();
+ if (left_child == nullptr) {
+ left_child = treenode::alloc(m_cmp, range, txnid, is_shared);
+ m_left_child.set(left_child);
+ } else {
+ left_child->insert(range, txnid, is_shared);
+ left_child->mutex_unlock();
+ }
+ } else if (c == keyrange::comparison::GREATER_THAN) {
+ // invariant(c == keyrange::comparison::GREATER_THAN);
+ treenode *right_child = lock_and_rebalance_right();
+ if (right_child == nullptr) {
+ right_child = treenode::alloc(m_cmp, range, txnid, is_shared);
+ m_right_child.set(right_child);
+ } else {
+ right_child->insert(range, txnid, is_shared);
+ right_child->mutex_unlock();
+ }
+ } else if (c == keyrange::comparison::EQUALS) {
+ invariant(is_shared);
+ invariant(m_is_shared);
+ rc = add_shared_owner(txnid);
+ } else {
+ invariant(0);
+ }
+ return rc;
+}
+
+treenode *treenode::find_child_at_extreme(int direction, treenode **parent) {
+ treenode *child =
+ direction > 0 ? m_right_child.get_locked() : m_left_child.get_locked();
+
+ if (child) {
+ *parent = this;
+ treenode *child_extreme = child->find_child_at_extreme(direction, parent);
+ child->mutex_unlock();
+ return child_extreme;
+ } else {
+ return this;
+ }
+}
+
+treenode *treenode::find_leftmost_child(treenode **parent) {
+ return find_child_at_extreme(-1, parent);
+}
+
+treenode *treenode::find_rightmost_child(treenode **parent) {
+ return find_child_at_extreme(1, parent);
+}
+
+treenode *treenode::remove_root_of_subtree() {
+ // if this node has no children, just free it and return null
+ if (m_left_child.ptr == nullptr && m_right_child.ptr == nullptr) {
+ // treenode::free requires that non-root nodes are unlocked
+ if (!is_root()) {
+ mutex_unlock();
+ }
+ treenode::free(this);
+ return nullptr;
+ }
+
+ // we have a child, so get either the in-order successor or
+ // predecessor of this node to be our replacement.
+ // replacement_parent is updated by the find functions as
+ // they recur down the tree, so initialize it to this.
+ treenode *child, *replacement;
+ treenode *replacement_parent = this;
+ if (m_left_child.ptr != nullptr) {
+ child = m_left_child.get_locked();
+ replacement = child->find_rightmost_child(&replacement_parent);
+ invariant(replacement == child || replacement_parent != this);
+
+ // detach the replacement from its parent
+ if (replacement_parent == this) {
+ m_left_child = replacement->m_left_child;
+ } else {
+ replacement_parent->m_right_child = replacement->m_left_child;
+ }
+ } else {
+ child = m_right_child.get_locked();
+ replacement = child->find_leftmost_child(&replacement_parent);
+ invariant(replacement == child || replacement_parent != this);
+
+ // detach the replacement from its parent
+ if (replacement_parent == this) {
+ m_right_child = replacement->m_right_child;
+ } else {
+ replacement_parent->m_left_child = replacement->m_right_child;
+ }
+ }
+ child->mutex_unlock();
+
+ // swap in place with the detached replacement, then destroy it
+ treenode::swap_in_place(replacement, this);
+ treenode::free(replacement);
+
+ return this;
+}
+
+void treenode::recursive_remove(void) {
+ treenode *left = m_left_child.ptr;
+ if (left) {
+ left->recursive_remove();
+ }
+ m_left_child.set(nullptr);
+
+ treenode *right = m_right_child.ptr;
+ if (right) {
+ right->recursive_remove();
+ }
+ m_right_child.set(nullptr);
+
+ // we do not take locks on the way down, so we know non-root nodes
+ // are unlocked here and the caller is required to pass a locked
+ // root, so this free is correct.
+ treenode::free(this);
+}
+
+void treenode::remove_shared_owner(TXNID txnid) {
+ assert(m_owners->size() > 1);
+ m_owners->erase(txnid);
+ assert(m_owners->size() > 0);
+ /* if there is just one owner left, move it to m_txnid */
+ if (m_owners->size() == 1) {
+ m_txnid = *m_owners->begin();
+ delete m_owners;
+ m_owners = nullptr;
+ }
+}
+
+treenode *treenode::remove(const keyrange &range, TXNID txnid) {
+ treenode *child;
+ // if the range is equal to this node's range, then just remove
+ // the root of this subtree. otherwise search down the tree
+ // in either the left or right children.
+ keyrange::comparison c = range.compare(*m_cmp, m_range);
+ switch (c) {
+ case keyrange::comparison::EQUALS: {
+ // if we are the only owners, remove. Otherwise, just remove
+ // us from the owners list.
+ if (txnid != TXNID_ANY && has_multiple_owners()) {
+ remove_shared_owner(txnid);
+ return this;
+ } else {
+ return remove_root_of_subtree();
+ }
+ }
+ case keyrange::comparison::LESS_THAN:
+ child = m_left_child.get_locked();
+ invariant_notnull(child);
+ child = child->remove(range, txnid);
+
+ // unlock the child if there still is one.
+      // regardless, set the left child pointer
+ if (child) {
+ child->mutex_unlock();
+ }
+ m_left_child.set(child);
+ break;
+ case keyrange::comparison::GREATER_THAN:
+ child = m_right_child.get_locked();
+ invariant_notnull(child);
+ child = child->remove(range, txnid);
+
+ // unlock the child if there still is one.
+ // regardless, set the right child pointer
+ if (child) {
+ child->mutex_unlock();
+ }
+ m_right_child.set(child);
+ break;
+ case keyrange::comparison::OVERLAPS:
+ // shouldn't be overlapping, since the tree is
+ // non-overlapping and this range must exist
+ abort();
+ }
+
+ return this;
+}
+
+bool treenode::left_imbalanced(int threshold) const {
+ uint32_t left_depth = m_left_child.depth_est;
+ uint32_t right_depth = m_right_child.depth_est;
+ return m_left_child.ptr != nullptr && left_depth > threshold + right_depth;
+}
+
+bool treenode::right_imbalanced(int threshold) const {
+ uint32_t left_depth = m_left_child.depth_est;
+ uint32_t right_depth = m_right_child.depth_est;
+ return m_right_child.ptr != nullptr && right_depth > threshold + left_depth;
+}
+
+// effect: rebalances the subtree rooted at this node
+// using AVL style O(1) rotations. unlocks this
+// node if it is not the new root of the subtree.
+// requires: node is locked by this thread, children are not
+// returns: locked root node of the rebalanced tree
+treenode *treenode::maybe_rebalance(void) {
+ // if we end up not rotating at all, the new root is this
+ treenode *new_root = this;
+ treenode *child = nullptr;
+
+ if (left_imbalanced(IMBALANCE_THRESHOLD)) {
+ child = m_left_child.get_locked();
+ if (child->right_imbalanced(0)) {
+ treenode *grandchild = child->m_right_child.get_locked();
+
+ child->m_right_child = grandchild->m_left_child;
+ grandchild->m_left_child.set(child);
+
+ m_left_child = grandchild->m_right_child;
+ grandchild->m_right_child.set(this);
+
+ new_root = grandchild;
+ } else {
+ m_left_child = child->m_right_child;
+ child->m_right_child.set(this);
+ new_root = child;
+ }
+ } else if (right_imbalanced(IMBALANCE_THRESHOLD)) {
+ child = m_right_child.get_locked();
+ if (child->left_imbalanced(0)) {
+ treenode *grandchild = child->m_left_child.get_locked();
+
+ child->m_left_child = grandchild->m_right_child;
+ grandchild->m_right_child.set(child);
+
+ m_right_child = grandchild->m_left_child;
+ grandchild->m_left_child.set(this);
+
+ new_root = grandchild;
+ } else {
+ m_right_child = child->m_left_child;
+ child->m_left_child.set(this);
+ new_root = child;
+ }
+ }
+
+ // up to three nodes may be locked.
+ // - this
+ // - child
+  //   - grandchild (but if it is locked, it's the new root)
+ //
+ // one of them is the new root. we unlock everything except the new root.
+ if (child && child != new_root) {
+ TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(&child->m_mutex);
+ child->mutex_unlock();
+ }
+ if (this != new_root) {
+ TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(&m_mutex);
+ mutex_unlock();
+ }
+ TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(&new_root->m_mutex);
+ return new_root;
+}
+
+treenode *treenode::lock_and_rebalance_left(void) {
+ treenode *child = m_left_child.get_locked();
+ if (child) {
+ treenode *new_root = child->maybe_rebalance();
+ m_left_child.set(new_root);
+ child = new_root;
+ }
+ return child;
+}
+
+treenode *treenode::lock_and_rebalance_right(void) {
+ treenode *child = m_right_child.get_locked();
+ if (child) {
+ treenode *new_root = child->maybe_rebalance();
+ m_right_child.set(new_root);
+ child = new_root;
+ }
+ return child;
+}
+
+void treenode::child_ptr::set(treenode *node) {
+ ptr = node;
+ depth_est = ptr ? ptr->get_depth_estimate() : 0;
+}
+
+treenode *treenode::child_ptr::get_locked(void) {
+ if (ptr) {
+ ptr->mutex_lock();
+ depth_est = ptr->get_depth_estimate();
+ }
+ return ptr;
+}
+
+} /* namespace toku */
+#endif // OS_WIN
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.h
new file mode 100644
index 000000000..ec25a8c58
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.h
@@ -0,0 +1,302 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <string.h>
+
+#include "../ft/comparator.h"
+#include "../portability/memory.h"
+#include "../portability/toku_pthread.h"
+// PORT: we need LTM_STATUS
+#include "../ft/ft-status.h"
+#include "../portability/txn_subst.h"
+#include "keyrange.h"
+
+namespace toku {
+
+// a node in a tree with its own mutex
+// - range is the "key" of this node
+// - txnid is the single txnid associated with this node
+// - left and right children may be null
+//
+// to build a tree on top of this abstraction, the user:
+// - provides memory for a root node, initializes it via create_root()
+// - performs tree operations on the root node. memory management
+// below the root node is handled by the abstraction, not the user.
+// this pattern:
+//   - guarantees a root node always exists.
+// - does not allow for rebalances on the root node
+
+class treenode {
+ public:
+ // every treenode function has some common requirements:
+ // - node is locked and children are never locked
+ // - node may be unlocked if no other thread has visibility
+
+ // effect: create the root node
+ void create_root(const comparator *cmp);
+
+ // effect: destroys the root node
+ void destroy_root(void);
+
+ // effect: sets the txnid and copies the given range for this node
+ void set_range_and_txnid(const keyrange &range, TXNID txnid, bool is_shared);
+
+ // returns: true iff this node is marked as empty
+ bool is_empty(void);
+
+ // returns: true if this is the root node, denoted by a null parent
+ bool is_root(void);
+
+ // returns: true if the given range overlaps with this node's range
+ bool range_overlaps(const keyrange &range);
+
+ // effect: locks the node
+ void mutex_lock(void);
+
+ // effect: unlocks the node
+ void mutex_unlock(void);
+
+ // return: node whose child overlaps, or a child that is empty
+ // and would contain range if it existed
+ // given: if cmp_hint is non-null, then it is a precomputed
+ // comparison of this node's range to the given range.
+ treenode *find_node_with_overlapping_child(
+ const keyrange &range, const keyrange::comparison *cmp_hint);
+
+ // effect: performs an in-order traversal of the ranges that overlap the
+ // given range, calling function->fn() on each node that does
+  // requires: function signature is:
+  //           bool fn(const keyrange &range, TXNID txnid, bool is_shared,
+  //                   TxnidVector *owners)
+  // requires: fn returns true to keep iterating, false to stop iterating
+ // requires: fn does not attempt to use any ranges read out by value
+ // after removing a node with an overlapping range from the tree.
+ template <class F>
+ void traverse_overlaps(const keyrange &range, F *function) {
+ keyrange::comparison c = range.compare(*m_cmp, m_range);
+ if (c == keyrange::comparison::EQUALS) {
+ // Doesn't matter if fn wants to keep going, there
+ // is nothing left, so return.
+ function->fn(m_range, m_txnid, m_is_shared, m_owners);
+ return;
+ }
+
+ treenode *left = m_left_child.get_locked();
+ if (left) {
+ if (c != keyrange::comparison::GREATER_THAN) {
+ // Target range is less than this node, or it overlaps this
+ // node. There may be something on the left.
+ left->traverse_overlaps(range, function);
+ }
+ left->mutex_unlock();
+ }
+
+ if (c == keyrange::comparison::OVERLAPS) {
+ bool keep_going = function->fn(m_range, m_txnid, m_is_shared, m_owners);
+ if (!keep_going) {
+ return;
+ }
+ }
+
+ treenode *right = m_right_child.get_locked();
+ if (right) {
+ if (c != keyrange::comparison::LESS_THAN) {
+ // Target range is greater than this node, or it overlaps this
+ // node. There may be something on the right.
+ right->traverse_overlaps(range, function);
+ }
+ right->mutex_unlock();
+ }
+ }
+
+ // effect: inserts the given range and txnid into a subtree, recursively
+ // requires: range does not overlap with any node below the subtree
+ bool insert(const keyrange &range, TXNID txnid, bool is_shared);
+
+ // effect: removes the given range from the subtree
+ // requires: range exists in the subtree
+ // returns: the root of the resulting subtree
+ treenode *remove(const keyrange &range, TXNID txnid);
+
+ // effect: removes this node and all of its children, recursively
+ // requires: every node at and below this node is unlocked
+ void recursive_remove(void);
+
+ private:
+ // the child_ptr is a light abstraction for the locking of
+  // a child and the maintenance of its depth estimate.
+
+ struct child_ptr {
+ // set the child pointer
+ void set(treenode *node);
+
+ // get and lock this child if it exists
+ treenode *get_locked(void);
+
+ treenode *ptr;
+ uint32_t depth_est;
+ };
+
+ // the balance factor at which a node is considered imbalanced
+ static const int32_t IMBALANCE_THRESHOLD = 2;
+
+ // node-level mutex
+ toku_mutex_t m_mutex;
+
+ // the range and txnid for this node. the range contains a copy
+ // of the keys originally inserted into the tree. nodes may
+ // swap ranges. but at the end of the day, when a node is
+ // destroyed, it frees the memory associated with whatever range
+ // it has at the time of destruction.
+ keyrange m_range;
+
+ void remove_shared_owner(TXNID txnid);
+
+ bool has_multiple_owners() { return (m_txnid == TXNID_SHARED); }
+
+ private:
+ // Owner transaction id.
+ // A value of TXNID_SHARED means this node has multiple owners
+ TXNID m_txnid;
+
+ // If true, this lock is a non-exclusive lock, and it can have either
+ // one or several owners.
+ bool m_is_shared;
+
+ // List of the owners, or nullptr if there's just one owner.
+ TxnidVector *m_owners;
+
+ // two child pointers
+ child_ptr m_left_child;
+ child_ptr m_right_child;
+
+ // comparator for ranges
+ // psergey-todo: Is there any sense to store the comparator in each tree
+ // node?
+ const comparator *m_cmp;
+
+ // marked for the root node. the root node is never free()'d
+ // when removed, but instead marked as empty.
+ bool m_is_root;
+
+ // marked for an empty node. only valid for the root.
+ bool m_is_empty;
+
+ // effect: initializes an empty node with the given comparator
+ void init(const comparator *cmp);
+
+ // requires: this is a shared node (m_is_shared==true)
+ // effect: another transaction is added as an owner.
+ // returns: true <=> added another owner
+ // false <=> this transaction is already an owner
+ bool add_shared_owner(TXNID txnid);
+
+ // requires: *parent is initialized to something meaningful.
+ // requires: subtree is non-empty
+ // returns: the leftmost child of the given subtree
+ // returns: a pointer to the parent of said child in *parent, only
+ // if this function recurred, otherwise it is untouched.
+ treenode *find_leftmost_child(treenode **parent);
+
+ // requires: *parent is initialized to something meaningful.
+ // requires: subtree is non-empty
+ // returns: the rightmost child of the given subtree
+ // returns: a pointer to the parent of said child in *parent, only
+ // if this function recurred, otherwise it is untouched.
+ treenode *find_rightmost_child(treenode **parent);
+
+ // effect: remove the root of this subtree, destroying the old root
+ // returns: the new root of the subtree
+ treenode *remove_root_of_subtree(void);
+
+ // requires: subtree is non-empty, direction is not 0
+ // returns: the child of the subtree at either the left or rightmost extreme
+ treenode *find_child_at_extreme(int direction, treenode **parent);
+
+ // effect: retrieves and possibly rebalances the left child
+ // returns: a locked left child, if it exists
+ treenode *lock_and_rebalance_left(void);
+
+ // effect: retrieves and possibly rebalances the right child
+ // returns: a locked right child, if it exists
+ treenode *lock_and_rebalance_right(void);
+
+ // returns: the estimated depth of this subtree
+ uint32_t get_depth_estimate(void) const;
+
+  // returns: true iff left subtree depth is sufficiently greater than the right
+ bool left_imbalanced(int threshold) const;
+
+ // returns: true iff right subtree depth is sufficiently greater than the left
+ bool right_imbalanced(int threshold) const;
+
+  // effect: performs an O(1) rebalance, which will "heal" an imbalance
+  //         by at most 1.
+  // effect: if the new root is not this node, then this node is unlocked.
+  // returns: locked node representing the new root of the rebalanced subtree
+ treenode *maybe_rebalance(void);
+
+ // returns: allocated treenode populated with a copy of the range and txnid
+ static treenode *alloc(const comparator *cmp, const keyrange &range,
+ TXNID txnid, bool is_shared);
+
+ // requires: node is a locked root node, or an unlocked non-root node
+ static void free(treenode *node);
+
+ // effect: swaps the range/txnid pairs for node1 and node2.
+ static void swap_in_place(treenode *node1, treenode *node2);
+
+ friend class concurrent_tree_unit_test;
+};
+
+} /* namespace toku */
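
As a hedged illustration (not part of the patch), this is the shape of a functor that treenode::traverse_overlaps() above accepts. The fn() parameter list mirrors the call sites inside the template (m_range, m_txnid, m_is_shared, m_owners); the counting logic and names are made up.

#include "treenode.h"

namespace toku {

struct count_overlaps_fn {
  size_t num_overlaps = 0;

  // return true to keep iterating, false to stop the traversal early
  bool fn(const keyrange &range, TXNID txnid, bool is_shared,
          TxnidVector *owners) {
    (void)range;
    (void)txnid;
    (void)is_shared;
    (void)owners;
    num_overlaps++;
    return true;
  }
};

// usage, given a locked root node and a query range:
//   count_overlaps_fn counter;
//   root->traverse_overlaps(query_range, &counter);

}  // namespace toku
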
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc
new file mode 100644
index 000000000..4caf1e26f
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc
@@ -0,0 +1,120 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "txnid_set.h"
+
+#include "../db.h"
+
+namespace toku {
+
+int find_by_txnid(const TXNID &txnid_a, const TXNID &txnid_b);
+int find_by_txnid(const TXNID &txnid_a, const TXNID &txnid_b) {
+ if (txnid_a < txnid_b) {
+ return -1;
+ } else if (txnid_a == txnid_b) {
+ return 0;
+ } else {
+ return 1;
+ }
+}
+
+void txnid_set::create(void) {
+ // lazily allocate the underlying omt, since it is common
+ // to create a txnid set and never put anything in it.
+ m_txnids.create_no_array();
+}
+
+void txnid_set::destroy(void) { m_txnids.destroy(); }
+
+// Return true if the given transaction id is a member of the set.
+// Otherwise, return false.
+bool txnid_set::contains(TXNID txnid) const {
+ TXNID find_txnid;
+ int r = m_txnids.find_zero<TXNID, find_by_txnid>(txnid, &find_txnid, nullptr);
+  return r == 0;
+}
+
+// Add a given txnid to the set
+void txnid_set::add(TXNID txnid) {
+ int r = m_txnids.insert<TXNID, find_by_txnid>(txnid, txnid, nullptr);
+ invariant(r == 0 || r == DB_KEYEXIST);
+}
+
+// Delete a given txnid from the set.
+void txnid_set::remove(TXNID txnid) {
+ uint32_t idx;
+ int r = m_txnids.find_zero<TXNID, find_by_txnid>(txnid, nullptr, &idx);
+ if (r == 0) {
+ r = m_txnids.delete_at(idx);
+ invariant_zero(r);
+ }
+}
+
+// Return the size of the set
+uint32_t txnid_set::size(void) const { return m_txnids.size(); }
+
+// Get the ith id in the set, assuming that the set is sorted.
+TXNID txnid_set::get(uint32_t i) const {
+ TXNID txnid;
+ int r = m_txnids.fetch(i, &txnid);
+ if (r == EINVAL) /* Shouldn't happen, avoid compiler warning */
+ return TXNID_NONE;
+ invariant_zero(r);
+ return txnid;
+}
+
+} /* namespace toku */
+#endif // OS_WIN
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.h
new file mode 100644
index 000000000..d79c24fb0
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.h
@@ -0,0 +1,92 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include "../portability/txn_subst.h"
+#include "../util/omt.h"
+
+namespace toku {
+
+class txnid_set {
+ public:
+ // effect: Creates an empty set. Does not malloc space for
+ // any entries yet. That is done lazily on add().
+ void create(void);
+
+ // effect: Destroy the set's internals.
+ void destroy(void);
+
+ // returns: True if the given txnid is a member of the set.
+ bool contains(TXNID id) const;
+
+ // effect: Adds a given txnid to the set if it did not exist
+ void add(TXNID txnid);
+
+ // effect: Deletes a txnid from the set if it exists.
+ void remove(TXNID txnid);
+
+ // returns: Size of the set
+ uint32_t size(void) const;
+
+ // returns: The "i'th" id in the set, as if it were sorted.
+ TXNID get(uint32_t i) const;
+
+ private:
+ toku::omt<TXNID> m_txnids;
+
+ friend class txnid_set_unit_test;
+};
+ENSURE_POD(txnid_set);
+
+} /* namespace toku */
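
Another illustrative sketch (not part of the patch) showing basic use of the txnid_set declared above; the transaction ids are arbitrary.

#include "txnid_set.h"

static void txnid_set_sketch(void) {
  toku::txnid_set set;
  set.create();  // lazy: no allocation until the first add()

  set.add(100);
  set.add(42);
  set.add(100);  // adding an existing id is tolerated (DB_KEYEXIST)

  bool has_42 = set.contains(42);  // true
  uint32_t n = set.size();         // 2
  TXNID smallest = set.get(0);     // 42, since get() indexes the sorted set
  (void)has_42;
  (void)n;
  (void)smallest;

  set.remove(100);
  set.destroy();
}
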
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc
new file mode 100644
index 000000000..24536c88e
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc
@@ -0,0 +1,213 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "../db.h"
+#include "../portability/memory.h"
+// PORT #include <toku_assert.h>
+#include <memory.h>
+#include <string.h>
+
+#include "txnid_set.h"
+#include "wfg.h"
+
+namespace toku {
+
+// Create a lock request graph
+void wfg::create(void) { m_nodes.create(); }
+
+// Destroy the internals of the lock request graph
+void wfg::destroy(void) {
+ uint32_t n_nodes = m_nodes.size();
+ for (uint32_t i = 0; i < n_nodes; i++) {
+ node *n;
+ int r = m_nodes.fetch(i, &n);
+ invariant_zero(r);
+ invariant_notnull(n);
+ if (r) continue; // Get rid of "may be used uninitialized" warning
+ node::free(n);
+ }
+ m_nodes.destroy();
+}
+
+// Add an edge (a_id, b_id) to the graph
+void wfg::add_edge(TXNID a_txnid, TXNID b_txnid) {
+ node *a_node = find_create_node(a_txnid);
+ node *b_node = find_create_node(b_txnid);
+ a_node->edges.add(b_node->txnid);
+}
+
+// Return true if a node with the given transaction id exists in the graph.
+// Return false otherwise.
+bool wfg::node_exists(TXNID txnid) {
+ node *n = find_node(txnid);
+ return n != NULL;
+}
+
+bool wfg::cycle_exists_from_node(node *target, node *head,
+ std::function<void(TXNID)> reporter) {
+ bool cycle_found = false;
+ head->visited = true;
+ uint32_t n_edges = head->edges.size();
+ for (uint32_t i = 0; i < n_edges && !cycle_found; i++) {
+ TXNID edge_id = head->edges.get(i);
+ if (target->txnid == edge_id) {
+ cycle_found = true;
+ if (reporter) reporter(edge_id);
+ } else {
+ node *new_head = find_node(edge_id);
+ if (new_head && !new_head->visited) {
+ cycle_found = cycle_exists_from_node(target, new_head, reporter);
+ if (cycle_found && reporter) reporter(edge_id);
+ }
+ }
+ }
+ head->visited = false;
+ return cycle_found;
+}
+
+// Return true if there exists a cycle from a given transaction id in the graph.
+// Return false otherwise.
+bool wfg::cycle_exists_from_txnid(TXNID txnid,
+ std::function<void(TXNID)> reporter) {
+ node *a_node = find_node(txnid);
+ bool cycles_found = false;
+ if (a_node) {
+ cycles_found = cycle_exists_from_node(a_node, a_node, reporter);
+ }
+ return cycles_found;
+}
+
+// Apply a given function f to all of the nodes in the graph. The apply
+// function returns when the function f is called for all of the nodes in the
+// graph, or the function f returns non-zero.
+void wfg::apply_nodes(int (*fn)(TXNID id, void *extra), void *extra) {
+ int r = 0;
+ uint32_t n_nodes = m_nodes.size();
+ for (uint32_t i = 0; i < n_nodes && r == 0; i++) {
+ node *n;
+ r = m_nodes.fetch(i, &n);
+ invariant_zero(r);
+ if (r) continue; // Get rid of "may be used uninitialized" warning
+ r = fn(n->txnid, extra);
+ }
+}
+
+// Apply a given function f to all of the edges whose origin is a given node id.
+// The apply function returns when the function f is called for all edges in the
+// graph rooted at node id, or the function f returns non-zero.
+void wfg::apply_edges(TXNID txnid,
+ int (*fn)(TXNID txnid, TXNID edge_txnid, void *extra),
+ void *extra) {
+ node *n = find_node(txnid);
+ if (n) {
+ int r = 0;
+ uint32_t n_edges = n->edges.size();
+ for (uint32_t i = 0; i < n_edges && r == 0; i++) {
+ r = fn(txnid, n->edges.get(i), extra);
+ }
+ }
+}
+
+// find node by id
+wfg::node *wfg::find_node(TXNID txnid) {
+ node *n = nullptr;
+ int r = m_nodes.find_zero<TXNID, find_by_txnid>(txnid, &n, nullptr);
+ invariant(r == 0 || r == DB_NOTFOUND);
+ return n;
+}
+
+// this is the omt comparison function
+// nodes are compared by their txnid.
+int wfg::find_by_txnid(node *const &node_a, const TXNID &txnid_b) {
+ TXNID txnid_a = node_a->txnid;
+ if (txnid_a < txnid_b) {
+ return -1;
+ } else if (txnid_a == txnid_b) {
+ return 0;
+ } else {
+ return 1;
+ }
+}
+
+// insert a new node
+wfg::node *wfg::find_create_node(TXNID txnid) {
+ node *n;
+ uint32_t idx;
+ int r = m_nodes.find_zero<TXNID, find_by_txnid>(txnid, &n, &idx);
+ if (r == DB_NOTFOUND) {
+ n = node::alloc(txnid);
+ r = m_nodes.insert_at(n, idx);
+ invariant_zero(r);
+ }
+ invariant_notnull(n);
+ return n;
+}
+
+wfg::node *wfg::node::alloc(TXNID txnid) {
+ node *XCALLOC(n);
+ n->txnid = txnid;
+ n->visited = false;
+ n->edges.create();
+ return n;
+}
+
+void wfg::node::free(wfg::node *n) {
+ n->edges.destroy();
+ toku_free(n);
+}
+
+} /* namespace toku */
+#endif // OS_WIN
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.h
new file mode 100644
index 000000000..804202170
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.h
@@ -0,0 +1,124 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <functional>
+
+#include "../util/omt.h"
+#include "txnid_set.h"
+
+namespace toku {
+
+// A wfg is a 'wait-for' graph. A directed edge in it represents one
+// txn waiting for another to finish before it can acquire a lock.
+
+class wfg {
+ public:
+ // Create a lock request graph
+ void create(void);
+
+ // Destroy the internals of the lock request graph
+ void destroy(void);
+
+ // Add an edge (a_id, b_id) to the graph
+ void add_edge(TXNID a_txnid, TXNID b_txnid);
+
+ // Return true if a node with the given transaction id exists in the graph.
+ // Return false otherwise.
+ bool node_exists(TXNID txnid);
+
+ // Return true if there exists a cycle from a given transaction id in the
+ // graph. Return false otherwise.
+ bool cycle_exists_from_txnid(TXNID txnid,
+ std::function<void(TXNID)> reporter);
+
+ // Apply a given function f to all of the nodes in the graph. The apply
+ // function returns when the function f is called for all of the nodes in the
+ // graph, or the function f returns non-zero.
+ void apply_nodes(int (*fn)(TXNID txnid, void *extra), void *extra);
+
+ // Apply a given function f to all of the edges whose origin is a given node
+ // id. The apply function returns when the function f is called for all edges
+ // in the graph rooted at node id, or the function f returns non-zero.
+ void apply_edges(TXNID txnid,
+ int (*fn)(TXNID txnid, TXNID edge_txnid, void *extra),
+ void *extra);
+
+ private:
+ struct node {
+ // txnid for this node and the associated set of edges
+ TXNID txnid;
+ txnid_set edges;
+ bool visited;
+
+ static node *alloc(TXNID txnid);
+
+ static void free(node *n);
+ };
+ ENSURE_POD(node);
+
+ toku::omt<node *> m_nodes;
+
+ node *find_node(TXNID txnid);
+
+ node *find_create_node(TXNID txnid);
+
+ bool cycle_exists_from_node(node *target, node *head,
+ std::function<void(TXNID)> reporter);
+
+ static int find_by_txnid(node *const &node_a, const TXNID &txnid_b);
+};
+ENSURE_POD(wfg);
+
+} /* namespace toku */
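
A final sketch (again, not part of the patch): building a small wait-for graph with the wfg class above and probing it for a deadlock cycle. The transaction ids and the reporter lambda are illustrative.

#include "wfg.h"

static bool wfg_deadlock_sketch(void) {
  toku::wfg graph;
  graph.create();

  // txn 1 waits for txn 2, txn 2 waits for txn 3, txn 3 waits for txn 1
  graph.add_edge(1, 2);
  graph.add_edge(2, 3);
  graph.add_edge(3, 1);

  bool deadlocked = graph.cycle_exists_from_txnid(
      1, [](TXNID waiter) { (void)waiter; /* collect cycle members here */ });

  graph.destroy();
  return deadlocked;  // true for this graph
}
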
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/memory.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/memory.h
new file mode 100644
index 000000000..0a621f8e0
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/memory.h
@@ -0,0 +1,215 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <stdlib.h>
+
+#include "toku_portability.h"
+
+/* Percona memory allocation functions and macros.
+ * These wrap malloc, calloc, realloc and free. */
+
+int toku_memory_startup(void) __attribute__((constructor));
+void toku_memory_shutdown(void) __attribute__((destructor));
+
+/* Generally: errno is set to 0 or a value to indicate problems. */
+
+// Everything should call toku_malloc() instead of malloc(), and toku_calloc()
+// instead of calloc(). That way the tests can, e.g., replace the malloc
+// function using toku_set_func_malloc().
+void *toku_calloc(size_t nmemb, size_t size)
+ __attribute__((__visibility__("default")));
+void *toku_xcalloc(size_t nmemb, size_t size)
+ __attribute__((__visibility__("default")));
+void *toku_malloc(size_t size) __attribute__((__visibility__("default")));
+void *toku_malloc_aligned(size_t alignment, size_t size)
+ __attribute__((__visibility__("default")));
+
+// xmalloc aborts instead of returning NULL if we run out of memory
+void *toku_xmalloc(size_t size) __attribute__((__visibility__("default")));
+void *toku_xrealloc(void *, size_t size)
+ __attribute__((__visibility__("default")));
+void *toku_xmalloc_aligned(size_t alignment, size_t size)
+ __attribute__((__visibility__("default")));
+// Effect: Perform an os_malloc_aligned(size) with the additional property that
+// the returned pointer is a multiple of ALIGNMENT.
+// Fail with a resource_assert if the allocation fails (don't return an error
+// code). If the alloc_aligned function has been set then call it instead.
+// Requires: alignment is a power of two.
+
+void toku_free(void *) __attribute__((__visibility__("default")));
+
+size_t toku_malloc_usable_size(void *p)
+ __attribute__((__visibility__("default")));
+
+/* MALLOC is a macro that helps avoid a common error:
+ * Suppose I write
+ * struct foo *x = malloc(sizeof(struct foo));
+ * That works fine. But if I change it to this, I've probably made a mistake:
+ * struct foo *x = malloc(sizeof(struct bar));
+ * It can get worse, since one might have something like
+ * struct foo *x = malloc(sizeof(struct foo *))
+ * which looks reasonable, but it allocates enough to hold a pointer instead of
+ * the amount needed for the struct. So instead, write struct foo *MALLOC(x);
+ * and you cannot go wrong (see the usage sketch at the end of this header).
+ */
+#define MALLOC(v) CAST_FROM_VOIDP(v, toku_malloc(sizeof(*v)))
+/* MALLOC_N is like calloc (except it does not zero the data): it makes an array. Write
+ * int *MALLOC_N(5,x);
+ * to make an array of 5 integers.
+ */
+#define MALLOC_N(n, v) CAST_FROM_VOIDP(v, toku_malloc((n) * sizeof(*v)))
+#define MALLOC_N_ALIGNED(align, n, v) \
+ CAST_FROM_VOIDP(v, toku_malloc_aligned((align), (n) * sizeof(*v)))
+
+// CALLOC_N is like calloc, but it figures out the member size automatically
+#define CALLOC_N(n, v) CAST_FROM_VOIDP(v, toku_calloc((n), sizeof(*v)))
+
+#define CALLOC(v) CALLOC_N(1, v)
+
+// XMALLOC macros are like MALLOC except they abort if the operation fails
+#define XMALLOC(v) CAST_FROM_VOIDP(v, toku_xmalloc(sizeof(*v)))
+#define XMALLOC_N(n, v) CAST_FROM_VOIDP(v, toku_xmalloc((n) * sizeof(*v)))
+#define XCALLOC_N(n, v) CAST_FROM_VOIDP(v, toku_xcalloc((n), (sizeof(*v))))
+#define XCALLOC(v) XCALLOC_N(1, v)
+#define XREALLOC(v, s) CAST_FROM_VOIDP(v, toku_xrealloc(v, s))
+#define XREALLOC_N(n, v) CAST_FROM_VOIDP(v, toku_xrealloc(v, (n) * sizeof(*v)))
+
+#define XMALLOC_N_ALIGNED(align, n, v) \
+ CAST_FROM_VOIDP(v, toku_xmalloc_aligned((align), (n) * sizeof(*v)))
+
+#define XMEMDUP(dst, src) CAST_FROM_VOIDP(dst, toku_xmemdup(src, sizeof(*src)))
+#define XMEMDUP_N(dst, src, len) CAST_FROM_VOIDP(dst, toku_xmemdup(src, len))
+
+// ZERO_ARRAY writes zeroes to a stack-allocated array
+#define ZERO_ARRAY(o) \
+ do { \
+ memset((o), 0, sizeof(o)); \
+ } while (0)
+// ZERO_STRUCT writes zeroes to a stack-allocated struct
+#define ZERO_STRUCT(o) \
+ do { \
+ memset(&(o), 0, sizeof(o)); \
+ } while (0)
+
+/* Copy memory. Analogous to strdup() */
+void *toku_memdup(const void *v, size_t len);
+/* Toku-version of strdup. Use this so that it calls toku_malloc() */
+char *toku_strdup(const char *s) __attribute__((__visibility__("default")));
+/* Toku-version of strndup. Use this so that it calls toku_malloc() */
+char *toku_strndup(const char *s, size_t n)
+ __attribute__((__visibility__("default")));
+/* Copy memory. Analogous to strdup(). Crashes instead of returning NULL. */
+void *toku_xmemdup(const void *v, size_t len)
+ __attribute__((__visibility__("default")));
+/* Toku-version of strdup. Use this so that it calls toku_xmalloc(). Crashes
+ * instead of returning NULL. */
+char *toku_xstrdup(const char *s) __attribute__((__visibility__("default")));
+
+void toku_malloc_cleanup(
+ void); /* Before exiting, call this function to free up any internal data
+ structures from toku_malloc. Otherwise valgrind will complain of
+ memory leaks. */
+
+/* Check to see if everything malloc'd was freed. Might be a no-op depending on
+ * how memory.c is configured. */
+void toku_memory_check_all_free(void);
+/* Check to see if memory is "sane". Might be a no-op. Probably better to
+ * simply use valgrind. */
+void toku_do_memory_check(void);
+
+typedef void *(*malloc_fun_t)(size_t);
+typedef void (*free_fun_t)(void *);
+typedef void *(*realloc_fun_t)(void *, size_t);
+typedef void *(*malloc_aligned_fun_t)(size_t /*alignment*/, size_t /*size*/);
+typedef void *(*realloc_aligned_fun_t)(size_t /*alignment*/, void * /*pointer*/,
+ size_t /*size*/);
+
+void toku_set_func_malloc(malloc_fun_t f);
+void toku_set_func_xmalloc_only(malloc_fun_t f);
+void toku_set_func_malloc_only(malloc_fun_t f);
+void toku_set_func_realloc(realloc_fun_t f);
+void toku_set_func_xrealloc_only(realloc_fun_t f);
+void toku_set_func_realloc_only(realloc_fun_t f);
+void toku_set_func_free(free_fun_t f);
+
+typedef struct memory_status {
+ uint64_t malloc_count; // number of malloc operations
+ uint64_t free_count; // number of free operations
+ uint64_t realloc_count; // number of realloc operations
+ uint64_t malloc_fail; // number of malloc operations that failed
+ uint64_t realloc_fail; // number of realloc operations that failed
+ uint64_t requested; // number of bytes requested
+ uint64_t used; // number of bytes used (requested + overhead), obtained from
+ // malloc_usable_size()
+ uint64_t freed; // number of bytes freed;
+ uint64_t max_requested_size; // largest attempted allocation size
+ uint64_t last_failed_size; // size of the last failed allocation attempt
+ volatile uint64_t
+ max_in_use; // maximum memory footprint (used - freed), approximate (not
+ // worth threadsafety overhead for exact)
+ const char *mallocator_version;
+ uint64_t mmap_threshold;
+} LOCAL_MEMORY_STATUS_S, *LOCAL_MEMORY_STATUS;
+
+void toku_memory_get_status(LOCAL_MEMORY_STATUS s);
+
+// Effect: Like toku_memory_footprint, except instead of passing p,
+// we pass toku_malloc_usable_size(p).
+size_t toku_memory_footprint_given_usable_size(size_t touched, size_t usable);
+
+// Effect: Return an estimate of how much space an object is using, possibly by
+// using toku_malloc_usable_size(p).
+// If p is NULL then returns 0.
+size_t toku_memory_footprint(void *p, size_t touched);
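A short sketch of the sizeof-deducing macros above in use, illustrating the pattern the MALLOC comment describes; the struct and names are invented, and this assumes the header is included.

// Sketch only: let the macro derive the size from the pointer's own type, so a
// hand-written (and possibly wrong) type name never enters the sizeof.
struct row {
  uint64_t key;
  uint64_t value;
};

void row_sketch(void) {
  struct row *one;
  MALLOC(one);          // toku_malloc(sizeof(*one)) with the right cast; may return NULL
  struct row *many;
  XMALLOC_N(16, many);  // 16 * sizeof(*many); aborts instead of returning NULL
  toku_free(one);
  toku_free(many);
}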
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_assert_subst.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_assert_subst.h
new file mode 100644
index 000000000..af47800fb
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_assert_subst.h
@@ -0,0 +1,39 @@
+//
+// A replacement for toku_assert.h
+//
+#pragma once
+
+#include <assert.h>
+#include <errno.h>
+
+#ifdef NDEBUG
+
+#define assert_zero(a) ((void)(a))
+#define invariant(a) ((void)(a))
+#define invariant_notnull(a) ((void)(a))
+#define invariant_zero(a) ((void)(a))
+
+#else
+
+#define assert_zero(a) assert((a) == 0)
+#define invariant(a) assert(a)
+#define invariant_notnull(a) assert(a)
+#define invariant_zero(a) assert_zero(a)
+
+#endif
+
+#define lazy_assert_zero(a) assert_zero(a)
+
+#define paranoid_invariant_zero(a) assert_zero(a)
+#define paranoid_invariant_notnull(a) assert(a)
+#define paranoid_invariant(a) assert(a)
+
+#define ENSURE_POD(type) \
+ static_assert( \
+ std::is_standard_layout<type>::value && std::is_trivial<type>::value, \
+      #type " isn't POD")
+
+inline int get_error_errno(void) {
+ invariant(errno);
+ return errno;
+}
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_atomic.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_atomic.h
new file mode 100644
index 000000000..aaa2298fa
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_atomic.h
@@ -0,0 +1,130 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+// PORT2: #include <portability/toku_config.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "toku_assert_subst.h"
+
+__attribute__((const, always_inline)) static inline intptr_t which_cache_line(
+ intptr_t addr) {
+ static const size_t assumed_cache_line_size = 64;
+ return addr / assumed_cache_line_size;
+}
+template <typename T>
+__attribute__((const, always_inline)) static inline bool crosses_boundary(
+ T *addr, size_t width) {
+ const intptr_t int_addr = reinterpret_cast<intptr_t>(addr);
+ const intptr_t last_byte = int_addr + width - 1;
+ return which_cache_line(int_addr) != which_cache_line(last_byte);
+}
+
+template <typename T, typename U>
+__attribute__((always_inline)) static inline T toku_sync_fetch_and_add(T *addr,
+ U diff) {
+ paranoid_invariant(!crosses_boundary(addr, sizeof *addr));
+ return __sync_fetch_and_add(addr, diff);
+}
+template <typename T, typename U>
+__attribute__((always_inline)) static inline T toku_sync_add_and_fetch(T *addr,
+ U diff) {
+ paranoid_invariant(!crosses_boundary(addr, sizeof *addr));
+ return __sync_add_and_fetch(addr, diff);
+}
+template <typename T, typename U>
+__attribute__((always_inline)) static inline T toku_sync_fetch_and_sub(T *addr,
+ U diff) {
+ paranoid_invariant(!crosses_boundary(addr, sizeof *addr));
+ return __sync_fetch_and_sub(addr, diff);
+}
+template <typename T, typename U>
+__attribute__((always_inline)) static inline T toku_sync_sub_and_fetch(T *addr,
+ U diff) {
+ paranoid_invariant(!crosses_boundary(addr, sizeof *addr));
+ return __sync_sub_and_fetch(addr, diff);
+}
+template <typename T, typename U, typename V>
+__attribute__((always_inline)) static inline T toku_sync_val_compare_and_swap(
+ T *addr, U oldval, V newval) {
+ paranoid_invariant(!crosses_boundary(addr, sizeof *addr));
+ return __sync_val_compare_and_swap(addr, oldval, newval);
+}
+template <typename T, typename U, typename V>
+__attribute__((always_inline)) static inline bool
+toku_sync_bool_compare_and_swap(T *addr, U oldval, V newval) {
+ paranoid_invariant(!crosses_boundary(addr, sizeof *addr));
+ return __sync_bool_compare_and_swap(addr, oldval, newval);
+}
+
+// in case you include this but not toku_portability.h
+#pragma GCC poison __sync_fetch_and_add
+#pragma GCC poison __sync_fetch_and_sub
+#pragma GCC poison __sync_fetch_and_or
+#pragma GCC poison __sync_fetch_and_and
+#pragma GCC poison __sync_fetch_and_xor
+#pragma GCC poison __sync_fetch_and_nand
+#pragma GCC poison __sync_add_and_fetch
+#pragma GCC poison __sync_sub_and_fetch
+#pragma GCC poison __sync_or_and_fetch
+#pragma GCC poison __sync_and_and_fetch
+#pragma GCC poison __sync_xor_and_fetch
+#pragma GCC poison __sync_nand_and_fetch
+#pragma GCC poison __sync_bool_compare_and_swap
+#pragma GCC poison __sync_val_compare_and_swap
+#pragma GCC poison __sync_synchronize
+#pragma GCC poison __sync_lock_test_and_set
+#pragma GCC poison __sync_release
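A small sketch of the checked atomic wrappers above on a shared counter; the counter and helper names are illustrative. The paranoid_invariant inside each wrapper is what rejects an operand that straddles the assumed 64-byte cache line.

// Sketch only: a shared statistics counter bumped and read atomically.
static uint64_t hits = 0;  // naturally aligned, so it cannot straddle a cache line

static inline void record_hit(void) {
  (void)toku_sync_fetch_and_add(&hits, 1);
}

static inline uint64_t snapshot_hits(void) {
  // add-and-fetch of zero yields an atomic read of the current value
  return toku_sync_add_and_fetch(&hits, 0);
}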
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_external_pthread.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_external_pthread.h
new file mode 100644
index 000000000..eb8291c1d
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_external_pthread.h
@@ -0,0 +1,83 @@
+/*
+ A wrapper around ROCKSDB_NAMESPACE::TransactionDBMutexFactory-provided
+ condition and mutex that provides toku_pthread_*-like interface. The functions
+ are named
+
+ toku_external_{mutex|cond}_XXX
+
+ Lock Tree uses this mutex and condition for interruptible (long) lock waits.
+
+ (It also still uses toku_pthread_XXX calls for mutexes/conditions for
+ shorter waits on internal objects)
+*/
+
+#pragma once
+
+#include <pthread.h>
+#include <stdint.h>
+#include <time.h>
+
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksdb/utilities/transaction_db_mutex.h"
+#include "toku_portability.h"
+
+using ROCKSDB_NAMESPACE::TransactionDBCondVar;
+using ROCKSDB_NAMESPACE::TransactionDBMutex;
+
+typedef std::shared_ptr<ROCKSDB_NAMESPACE::TransactionDBMutexFactory>
+ toku_external_mutex_factory_t;
+
+typedef std::shared_ptr<TransactionDBMutex> toku_external_mutex_t;
+typedef std::shared_ptr<TransactionDBCondVar> toku_external_cond_t;
+
+static inline void toku_external_cond_init(
+ toku_external_mutex_factory_t mutex_factory, toku_external_cond_t *cond) {
+ *cond = mutex_factory->AllocateCondVar();
+}
+
+inline void toku_external_cond_destroy(toku_external_cond_t *cond) {
+ cond->reset(); // this will destroy the managed object
+}
+
+inline void toku_external_cond_signal(toku_external_cond_t *cond) {
+ (*cond)->Notify();
+}
+
+inline void toku_external_cond_broadcast(toku_external_cond_t *cond) {
+ (*cond)->NotifyAll();
+}
+
+inline int toku_external_cond_timedwait(toku_external_cond_t *cond,
+ toku_external_mutex_t *mutex,
+ int64_t timeout_microsec) {
+ auto res = (*cond)->WaitFor(*mutex, timeout_microsec);
+ if (res.ok())
+ return 0;
+ else
+ return ETIMEDOUT;
+}
+
+inline void toku_external_mutex_init(toku_external_mutex_factory_t factory,
+ toku_external_mutex_t *mutex) {
+ // Use placement new: the memory has been allocated but constructor wasn't
+ // called
+ new (mutex) toku_external_mutex_t;
+ *mutex = factory->AllocateMutex();
+}
+
+inline void toku_external_mutex_lock(toku_external_mutex_t *mutex) {
+ (*mutex)->Lock();
+}
+
+inline int toku_external_mutex_trylock(toku_external_mutex_t *mutex) {
+ (*mutex)->Lock();
+ return 0;
+}
+
+inline void toku_external_mutex_unlock(toku_external_mutex_t *mutex) {
+ (*mutex)->UnLock();
+}
+
+inline void toku_external_mutex_destroy(toku_external_mutex_t *mutex) {
+ mutex->reset(); // this will destroy the managed object
+}
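A sketch of how the wrappers above compose into a bounded wait. The factory argument is assumed to be supplied by the caller (in the lock tree it comes from the configured TransactionDBMutexFactory); the function name and the 100ms timeout are illustrative. Note that toku_external_mutex_init placement-news the shared_ptr, which is intended for raw storage; a default-constructed local is harmless here because the empty shared_ptr owns nothing.

// Sketch only: wait up to 100ms on an externally provided mutex/condvar pair.
int bounded_wait_sketch(toku_external_mutex_factory_t factory) {
  toku_external_mutex_t mutex;
  toku_external_cond_t cond;
  toku_external_mutex_init(factory, &mutex);
  toku_external_cond_init(factory, &cond);

  toku_external_mutex_lock(&mutex);
  // Returns 0 if notified, ETIMEDOUT if 100000 microseconds elapse first.
  const int r = toku_external_cond_timedwait(&cond, &mutex, 100 * 1000);
  toku_external_mutex_unlock(&mutex);

  toku_external_cond_destroy(&cond);
  toku_external_mutex_destroy(&mutex);
  return r;
}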
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_instrumentation.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_instrumentation.h
new file mode 100644
index 000000000..c967e7177
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_instrumentation.h
@@ -0,0 +1,286 @@
+/*======
+This file is part of PerconaFT.
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#pragma once
+
+#include <stdio.h> // FILE
+
+// Performance instrumentation object identifier type
+typedef unsigned int pfs_key_t;
+
+enum class toku_instr_object_type { mutex, rwlock, cond, thread, file };
+
+struct PSI_file;
+
+struct TOKU_FILE {
+ /** The real file. */
+ FILE *file;
+ struct PSI_file *key;
+ TOKU_FILE() : file(nullptr), key(nullptr) {}
+};
+
+struct PSI_mutex;
+struct PSI_cond;
+struct PSI_rwlock;
+
+struct toku_mutex_t;
+struct toku_cond_t;
+struct toku_pthread_rwlock_t;
+
+class toku_instr_key;
+
+class toku_instr_probe_empty {
+ public:
+ explicit toku_instr_probe_empty(UU(const toku_instr_key &key)) {}
+
+ void start_with_source_location(UU(const char *src_file), UU(int src_line)) {}
+
+ void stop() {}
+};
+
+#define TOKU_PROBE_START(p) p->start_with_source_location(__FILE__, __LINE__)
+#define TOKU_PROBE_STOP(p) p->stop()
+
+extern toku_instr_key toku_uninstrumented;
+
+#ifndef MYSQL_TOKUDB_ENGINE
+
+#include <pthread.h>
+
+class toku_instr_key {
+ public:
+ toku_instr_key(UU(toku_instr_object_type type), UU(const char *group),
+ UU(const char *name)) {}
+
+ explicit toku_instr_key(UU(pfs_key_t key_id)) {}
+ // No-instrumentation constructor:
+ toku_instr_key() {}
+ ~toku_instr_key() {}
+};
+
+typedef toku_instr_probe_empty toku_instr_probe;
+
+enum class toku_instr_file_op {
+ file_stream_open,
+ file_create,
+ file_open,
+ file_delete,
+ file_rename,
+ file_read,
+ file_write,
+ file_sync,
+ file_stream_close,
+ file_close,
+ file_stat
+};
+
+struct PSI_file {};
+struct PSI_mutex {};
+
+struct toku_io_instrumentation {};
+
+inline int toku_pthread_create(UU(const toku_instr_key &key), pthread_t *thread,
+ const pthread_attr_t *attr,
+ void *(*start_routine)(void *), void *arg) {
+ return pthread_create(thread, attr, start_routine, arg);
+}
+
+inline void toku_instr_register_current_thread() {}
+
+inline void toku_instr_delete_current_thread() {}
+
+// Instrument file creation, opening, closing, and renaming
+inline void toku_instr_file_open_begin(UU(toku_io_instrumentation &io_instr),
+ UU(const toku_instr_key &key),
+ UU(toku_instr_file_op op),
+ UU(const char *name),
+ UU(const char *src_file),
+ UU(int src_line)) {}
+
+inline void toku_instr_file_stream_open_end(
+ UU(toku_io_instrumentation &io_instr), UU(TOKU_FILE &file)) {}
+
+inline void toku_instr_file_open_end(UU(toku_io_instrumentation &io_instr),
+ UU(int fd)) {}
+
+inline void toku_instr_file_name_close_begin(
+ UU(toku_io_instrumentation &io_instr), UU(const toku_instr_key &key),
+ UU(toku_instr_file_op op), UU(const char *name), UU(const char *src_file),
+ UU(int src_line)) {}
+
+inline void toku_instr_file_stream_close_begin(
+ UU(toku_io_instrumentation &io_instr), UU(toku_instr_file_op op),
+ UU(TOKU_FILE &file), UU(const char *src_file), UU(int src_line)) {}
+
+inline void toku_instr_file_fd_close_begin(
+ UU(toku_io_instrumentation &io_instr), UU(toku_instr_file_op op),
+ UU(int fd), UU(const char *src_file), UU(int src_line)) {}
+
+inline void toku_instr_file_close_end(UU(toku_io_instrumentation &io_instr),
+ UU(int result)) {}
+
+inline void toku_instr_file_io_begin(UU(toku_io_instrumentation &io_instr),
+ UU(toku_instr_file_op op), UU(int fd),
+ UU(unsigned int count),
+ UU(const char *src_file),
+ UU(int src_line)) {}
+
+inline void toku_instr_file_name_io_begin(
+ UU(toku_io_instrumentation &io_instr), UU(const toku_instr_key &key),
+ UU(toku_instr_file_op op), UU(const char *name), UU(unsigned int count),
+ UU(const char *src_file), UU(int src_line)) {}
+
+inline void toku_instr_file_stream_io_begin(
+ UU(toku_io_instrumentation &io_instr), UU(toku_instr_file_op op),
+ UU(TOKU_FILE &file), UU(unsigned int count), UU(const char *src_file),
+ UU(int src_line)) {}
+
+inline void toku_instr_file_io_end(UU(toku_io_instrumentation &io_instr),
+ UU(unsigned int count)) {}
+
+struct toku_mutex_t;
+
+struct toku_mutex_instrumentation {};
+
+inline PSI_mutex *toku_instr_mutex_init(UU(const toku_instr_key &key),
+ UU(toku_mutex_t &mutex)) {
+ return nullptr;
+}
+
+inline void toku_instr_mutex_destroy(UU(PSI_mutex *&mutex_instr)) {}
+
+inline void toku_instr_mutex_lock_start(
+ UU(toku_mutex_instrumentation &mutex_instr), UU(toku_mutex_t &mutex),
+ UU(const char *src_file), UU(int src_line)) {}
+
+inline void toku_instr_mutex_trylock_start(
+ UU(toku_mutex_instrumentation &mutex_instr), UU(toku_mutex_t &mutex),
+ UU(const char *src_file), UU(int src_line)) {}
+
+inline void toku_instr_mutex_lock_end(
+ UU(toku_mutex_instrumentation &mutex_instr),
+ UU(int pthread_mutex_lock_result)) {}
+
+inline void toku_instr_mutex_unlock(UU(PSI_mutex *mutex_instr)) {}
+
+struct toku_cond_instrumentation {};
+
+enum class toku_instr_cond_op {
+ cond_wait,
+ cond_timedwait,
+};
+
+inline PSI_cond *toku_instr_cond_init(UU(const toku_instr_key &key),
+ UU(toku_cond_t &cond)) {
+ return nullptr;
+}
+
+inline void toku_instr_cond_destroy(UU(PSI_cond *&cond_instr)) {}
+
+inline void toku_instr_cond_wait_start(
+ UU(toku_cond_instrumentation &cond_instr), UU(toku_instr_cond_op op),
+ UU(toku_cond_t &cond), UU(toku_mutex_t &mutex), UU(const char *src_file),
+ UU(int src_line)) {}
+
+inline void toku_instr_cond_wait_end(UU(toku_cond_instrumentation &cond_instr),
+ UU(int pthread_cond_wait_result)) {}
+
+inline void toku_instr_cond_signal(UU(toku_cond_t &cond)) {}
+
+inline void toku_instr_cond_broadcast(UU(toku_cond_t &cond)) {}
+
+#if 0
+// rw locks are not used
+// rwlock instrumentation
+struct toku_rwlock_instrumentation {};
+
+inline PSI_rwlock *toku_instr_rwlock_init(UU(const toku_instr_key &key),
+ UU(toku_pthread_rwlock_t &rwlock)) {
+ return nullptr;
+}
+
+inline void toku_instr_rwlock_destroy(UU(PSI_rwlock *&rwlock_instr)) {}
+
+inline void toku_instr_rwlock_rdlock_wait_start(
+ UU(toku_rwlock_instrumentation &rwlock_instr),
+ UU(toku_pthread_rwlock_t &rwlock),
+ UU(const char *src_file),
+ UU(int src_line)) {}
+
+inline void toku_instr_rwlock_wrlock_wait_start(
+ UU(toku_rwlock_instrumentation &rwlock_instr),
+ UU(toku_pthread_rwlock_t &rwlock),
+ UU(const char *src_file),
+ UU(int src_line)) {}
+
+inline void toku_instr_rwlock_rdlock_wait_end(
+ UU(toku_rwlock_instrumentation &rwlock_instr),
+ UU(int pthread_rwlock_wait_result)) {}
+
+inline void toku_instr_rwlock_wrlock_wait_end(
+ UU(toku_rwlock_instrumentation &rwlock_instr),
+ UU(int pthread_rwlock_wait_result)) {}
+
+inline void toku_instr_rwlock_unlock(UU(toku_pthread_rwlock_t &rwlock)) {}
+#endif
+
+#else // MYSQL_TOKUDB_ENGINE
+// PFS instrumentation can come not only from MySQL but also from MongoDB or other consumers
+#include <toku_instr_mysql.h>
+#endif // MYSQL_TOKUDB_ENGINE
+
+// Mutexes
+extern toku_instr_key manager_escalation_mutex_key;
+extern toku_instr_key manager_escalator_mutex_key;
+extern toku_instr_key manager_mutex_key;
+extern toku_instr_key treenode_mutex_key;
+extern toku_instr_key locktree_request_info_mutex_key;
+extern toku_instr_key locktree_request_info_retry_mutex_key;
+
+// condition vars
+extern toku_instr_key lock_request_m_wait_cond_key;
+extern toku_instr_key locktree_request_info_retry_cv_key;
+extern toku_instr_key manager_m_escalator_done_key; // unused
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_portability.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_portability.h
new file mode 100644
index 000000000..9a95b38bd
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_portability.h
@@ -0,0 +1,87 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#if defined(__clang__)
+#define constexpr_static_assert(a, b)
+#else
+#define constexpr_static_assert(a, b) static_assert(a, b)
+#endif
+
+// include here, before they get deprecated
+#include <inttypes.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "toku_atomic.h"
+
+#if defined(__cplusplus)
+#include <type_traits>
+#endif
+
+#if defined(__cplusplus)
+// decltype() here gives a reference-to-pointer instead of just a pointer,
+// so use __typeof__ instead
+#define CAST_FROM_VOIDP(name, value) name = static_cast<__typeof__(name)>(value)
+#else
+#define CAST_FROM_VOIDP(name, value) name = cast_to_typeof(name)(value)
+#endif
+
+#define UU(x) x __attribute__((__unused__))
+
+#include "toku_instrumentation.h"
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_pthread.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_pthread.h
new file mode 100644
index 000000000..571b950e1
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_pthread.h
@@ -0,0 +1,520 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <pthread.h>
+#include <stdint.h>
+#include <time.h>
+
+#include "toku_portability.h"
+// PORT2: #include "toku_assert.h"
+
+// TODO: some things moved to toku_instrumentation.h, not necessarily the best
+// place
+typedef pthread_attr_t toku_pthread_attr_t;
+typedef pthread_t toku_pthread_t;
+typedef pthread_mutex_t toku_pthread_mutex_t;
+typedef pthread_condattr_t toku_pthread_condattr_t;
+typedef pthread_cond_t toku_pthread_cond_t;
+typedef pthread_rwlockattr_t toku_pthread_rwlockattr_t;
+typedef pthread_key_t toku_pthread_key_t;
+typedef struct timespec toku_timespec_t;
+
+// TODO: break this include loop
+#include <pthread.h>
+typedef pthread_mutexattr_t toku_pthread_mutexattr_t;
+
+struct toku_mutex_t {
+ pthread_mutex_t pmutex;
+ struct PSI_mutex *psi_mutex; /* The performance schema instrumentation hook */
+#if defined(TOKU_PTHREAD_DEBUG)
+ pthread_t owner; // = pthread_self(); // for debugging
+ bool locked;
+ bool valid;
+ pfs_key_t instr_key_id;
+#endif // defined(TOKU_PTHREAD_DEBUG)
+};
+
+struct toku_cond_t {
+ pthread_cond_t pcond;
+ struct PSI_cond *psi_cond;
+#if defined(TOKU_PTHREAD_DEBUG)
+ pfs_key_t instr_key_id;
+#endif // defined(TOKU_PTHREAD_DEBUG)
+};
+
+#if defined(TOKU_PTHREAD_DEBUG)
+#define TOKU_COND_INITIALIZER \
+ { .pcond = PTHREAD_COND_INITIALIZER, .psi_cond = nullptr, .instr_key_id = 0 }
+#else
+#define TOKU_COND_INITIALIZER \
+ { .pcond = PTHREAD_COND_INITIALIZER, .psi_cond = nullptr }
+#endif // defined(TOKU_PTHREAD_DEBUG)
+
+struct toku_pthread_rwlock_t {
+ pthread_rwlock_t rwlock;
+ struct PSI_rwlock *psi_rwlock;
+#if defined(TOKU_PTHREAD_DEBUG)
+ pfs_key_t instr_key_id;
+#endif // defined(TOKU_PTHREAD_DEBUG)
+};
+
+typedef struct toku_mutex_aligned {
+ toku_mutex_t aligned_mutex __attribute__((__aligned__(64)));
+} toku_mutex_aligned_t;
+
+// Initializing with {} will fill in a struct with all zeros.
+// But you may also need a pragma to suppress the warnings, as follows
+//
+// #pragma GCC diagnostic push
+// #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
+// toku_mutex_t foo = ZERO_MUTEX_INITIALIZER;
+// #pragma GCC diagnostic pop
+//
+// In general it will be a lot of busy work to make this codebase compile
+// cleanly with -Wmissing-field-initializers
+
+#define ZERO_MUTEX_INITIALIZER \
+ {}
+
+#if defined(TOKU_PTHREAD_DEBUG)
+#define TOKU_MUTEX_INITIALIZER \
+ { \
+ .pmutex = PTHREAD_MUTEX_INITIALIZER, .psi_mutex = nullptr, .owner = 0, \
+ .locked = false, .valid = true, .instr_key_id = 0 \
+ }
+#else
+#define TOKU_MUTEX_INITIALIZER \
+ { .pmutex = PTHREAD_MUTEX_INITIALIZER, .psi_mutex = nullptr }
+#endif // defined(TOKU_PTHREAD_DEBUG)
+
+// Darwin doesn't provide adaptive mutexes
+#if defined(__APPLE__)
+#define TOKU_MUTEX_ADAPTIVE PTHREAD_MUTEX_DEFAULT
+#if defined(TOKU_PTHREAD_DEBUG)
+#define TOKU_ADAPTIVE_MUTEX_INITIALIZER \
+ { \
+ .pmutex = PTHREAD_MUTEX_INITIALIZER, .psi_mutex = nullptr, .owner = 0, \
+ .locked = false, .valid = true, .instr_key_id = 0 \
+ }
+#else
+#define TOKU_ADAPTIVE_MUTEX_INITIALIZER \
+ { .pmutex = PTHREAD_MUTEX_INITIALIZER, .psi_mutex = nullptr }
+#endif // defined(TOKU_PTHREAD_DEBUG)
+#else // __FreeBSD__, __linux__, at least
+#if defined(__GLIBC__)
+#define TOKU_MUTEX_ADAPTIVE PTHREAD_MUTEX_ADAPTIVE_NP
+#else
+// not all libc (e.g. musl) implement NP (Non-POSIX) attributes
+#define TOKU_MUTEX_ADAPTIVE PTHREAD_MUTEX_DEFAULT
+#endif
+#if defined(TOKU_PTHREAD_DEBUG)
+#define TOKU_ADAPTIVE_MUTEX_INITIALIZER \
+ { \
+ .pmutex = PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP, .psi_mutex = nullptr, \
+ .owner = 0, .locked = false, .valid = true, .instr_key_id = 0 \
+ }
+#else
+#define TOKU_ADAPTIVE_MUTEX_INITIALIZER \
+ { .pmutex = PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP, .psi_mutex = nullptr }
+#endif // defined(TOKU_PTHREAD_DEBUG)
+#endif // defined(__APPLE__)
+
+// Different OSes implement mutexes as different amounts of nested structs.
+// C++ will fill out all missing values with zeroes if you provide at least one
+// zero, but it needs the right amount of nesting.
+#if defined(__FreeBSD__)
+#define ZERO_COND_INITIALIZER \
+ { 0 }
+#elif defined(__APPLE__)
+#define ZERO_COND_INITIALIZER \
+ { \
+ { 0 } \
+ }
+#else // __linux__, at least
+#define ZERO_COND_INITIALIZER \
+ {}
+#endif
+
+static inline void toku_mutexattr_init(toku_pthread_mutexattr_t *attr) {
+ int r = pthread_mutexattr_init(attr);
+ assert_zero(r);
+}
+
+static inline void toku_mutexattr_settype(toku_pthread_mutexattr_t *attr,
+ int type) {
+ int r = pthread_mutexattr_settype(attr, type);
+ assert_zero(r);
+}
+
+static inline void toku_mutexattr_destroy(toku_pthread_mutexattr_t *attr) {
+ int r = pthread_mutexattr_destroy(attr);
+ assert_zero(r);
+}
+
+#if defined(TOKU_PTHREAD_DEBUG)
+static inline void toku_mutex_assert_locked(const toku_mutex_t *mutex) {
+ invariant(mutex->locked);
+ invariant(mutex->owner == pthread_self());
+}
+#else
+static inline void toku_mutex_assert_locked(const toku_mutex_t *mutex
+ __attribute__((unused))) {}
+#endif // defined(TOKU_PTHREAD_DEBUG)
+
+// asserting that a mutex is unlocked only makes sense
+// if the calling thread can guarantee that no other threads
+// are trying to lock this mutex at the time of the assertion
+//
+// a good example of this is a tree with mutexes on each node.
+// when a node is locked the caller knows that no other threads
+// can be trying to lock its children's mutexes. the children
+// are in one of two fixed states: locked or unlocked.
+#if defined(TOKU_PTHREAD_DEBUG)
+static inline void toku_mutex_assert_unlocked(toku_mutex_t *mutex) {
+ invariant(mutex->owner == 0);
+ invariant(!mutex->locked);
+}
+#else
+static inline void toku_mutex_assert_unlocked(toku_mutex_t *mutex
+ __attribute__((unused))) {}
+#endif // defined(TOKU_PTHREAD_DEBUG)
+
+#define toku_mutex_lock(M) \
+ toku_mutex_lock_with_source_location(M, __FILE__, __LINE__)
+
+static inline void toku_cond_init(toku_cond_t *cond,
+ const toku_pthread_condattr_t *attr) {
+ int r = pthread_cond_init(&cond->pcond, attr);
+ assert_zero(r);
+}
+
+#define toku_mutex_trylock(M) \
+ toku_mutex_trylock_with_source_location(M, __FILE__, __LINE__)
+
+inline void toku_mutex_unlock(toku_mutex_t *mutex) {
+#if defined(TOKU_PTHREAD_DEBUG)
+ invariant(mutex->owner == pthread_self());
+ invariant(mutex->valid);
+ invariant(mutex->locked);
+ mutex->locked = false;
+ mutex->owner = 0;
+#endif // defined(TOKU_PTHREAD_DEBUG)
+ toku_instr_mutex_unlock(mutex->psi_mutex);
+ int r = pthread_mutex_unlock(&mutex->pmutex);
+ assert_zero(r);
+}
+
+inline void toku_mutex_lock_with_source_location(toku_mutex_t *mutex,
+ const char *src_file,
+ int src_line) {
+ toku_mutex_instrumentation mutex_instr;
+ toku_instr_mutex_lock_start(mutex_instr, *mutex, src_file, src_line);
+
+ const int r = pthread_mutex_lock(&mutex->pmutex);
+ toku_instr_mutex_lock_end(mutex_instr, r);
+
+ assert_zero(r);
+#if defined(TOKU_PTHREAD_DEBUG)
+ invariant(mutex->valid);
+ invariant(!mutex->locked);
+ invariant(mutex->owner == 0);
+ mutex->locked = true;
+ mutex->owner = pthread_self();
+#endif // defined(TOKU_PTHREAD_DEBUG)
+}
+
+inline int toku_mutex_trylock_with_source_location(toku_mutex_t *mutex,
+ const char *src_file,
+ int src_line) {
+ toku_mutex_instrumentation mutex_instr;
+ toku_instr_mutex_trylock_start(mutex_instr, *mutex, src_file, src_line);
+
+ const int r = pthread_mutex_lock(&mutex->pmutex);
+ toku_instr_mutex_lock_end(mutex_instr, r);
+
+#if defined(TOKU_PTHREAD_DEBUG)
+ if (r == 0) {
+ invariant(mutex->valid);
+ invariant(!mutex->locked);
+ invariant(mutex->owner == 0);
+ mutex->locked = true;
+ mutex->owner = pthread_self();
+ }
+#endif // defined(TOKU_PTHREAD_DEBUG)
+ return r;
+}
+
+#define toku_cond_wait(C, M) \
+ toku_cond_wait_with_source_location(C, M, __FILE__, __LINE__)
+
+#define toku_cond_timedwait(C, M, W) \
+ toku_cond_timedwait_with_source_location(C, M, W, __FILE__, __LINE__)
+
+inline void toku_cond_init(const toku_instr_key &key, toku_cond_t *cond,
+ const pthread_condattr_t *attr) {
+ toku_instr_cond_init(key, *cond);
+ int r = pthread_cond_init(&cond->pcond, attr);
+ assert_zero(r);
+}
+
+inline void toku_cond_destroy(toku_cond_t *cond) {
+ toku_instr_cond_destroy(cond->psi_cond);
+ int r = pthread_cond_destroy(&cond->pcond);
+ assert_zero(r);
+}
+
+inline void toku_cond_wait_with_source_location(toku_cond_t *cond,
+ toku_mutex_t *mutex,
+ const char *src_file,
+ int src_line) {
+#if defined(TOKU_PTHREAD_DEBUG)
+ invariant(mutex->locked);
+ mutex->locked = false;
+ mutex->owner = 0;
+#endif // defined(TOKU_PTHREAD_DEBUG)
+
+ /* Instrumentation start */
+ toku_cond_instrumentation cond_instr;
+ toku_instr_cond_wait_start(cond_instr, toku_instr_cond_op::cond_wait, *cond,
+ *mutex, src_file, src_line);
+
+ /* Instrumented code */
+ const int r = pthread_cond_wait(&cond->pcond, &mutex->pmutex);
+
+ /* Instrumentation end */
+ toku_instr_cond_wait_end(cond_instr, r);
+
+ assert_zero(r);
+#if defined(TOKU_PTHREAD_DEBUG)
+ invariant(!mutex->locked);
+ mutex->locked = true;
+ mutex->owner = pthread_self();
+#endif // defined(TOKU_PTHREAD_DEBUG)
+}
+
+inline int toku_cond_timedwait_with_source_location(toku_cond_t *cond,
+ toku_mutex_t *mutex,
+ toku_timespec_t *wakeup_at,
+ const char *src_file,
+ int src_line) {
+#if defined(TOKU_PTHREAD_DEBUG)
+ invariant(mutex->locked);
+ mutex->locked = false;
+ mutex->owner = 0;
+#endif // defined(TOKU_PTHREAD_DEBUG)
+
+ /* Instrumentation start */
+ toku_cond_instrumentation cond_instr;
+ toku_instr_cond_wait_start(cond_instr, toku_instr_cond_op::cond_timedwait,
+ *cond, *mutex, src_file, src_line);
+
+ /* Instrumented code */
+ const int r = pthread_cond_timedwait(&cond->pcond, &mutex->pmutex, wakeup_at);
+
+ /* Instrumentation end */
+ toku_instr_cond_wait_end(cond_instr, r);
+
+#if defined(TOKU_PTHREAD_DEBUG)
+ invariant(!mutex->locked);
+ mutex->locked = true;
+ mutex->owner = pthread_self();
+#endif // defined(TOKU_PTHREAD_DEBUG)
+ return r;
+}
+
+inline void toku_cond_signal(toku_cond_t *cond) {
+ toku_instr_cond_signal(*cond);
+ const int r = pthread_cond_signal(&cond->pcond);
+ assert_zero(r);
+}
+
+inline void toku_cond_broadcast(toku_cond_t *cond) {
+ toku_instr_cond_broadcast(*cond);
+ const int r = pthread_cond_broadcast(&cond->pcond);
+ assert_zero(r);
+}
+
+inline void toku_mutex_init(const toku_instr_key &key, toku_mutex_t *mutex,
+ const toku_pthread_mutexattr_t *attr) {
+#if defined(TOKU_PTHREAD_DEBUG)
+ mutex->valid = true;
+#endif // defined(TOKU_PTHREAD_DEBUG)
+ toku_instr_mutex_init(key, *mutex);
+ const int r = pthread_mutex_init(&mutex->pmutex, attr);
+ assert_zero(r);
+#if defined(TOKU_PTHREAD_DEBUG)
+ mutex->locked = false;
+ invariant(mutex->valid);
+ mutex->valid = true;
+ mutex->owner = 0;
+#endif // defined(TOKU_PTHREAD_DEBUG)
+}
+
+inline void toku_mutex_destroy(toku_mutex_t *mutex) {
+#if defined(TOKU_PTHREAD_DEBUG)
+ invariant(mutex->valid);
+ mutex->valid = false;
+ invariant(!mutex->locked);
+#endif // defined(TOKU_PTHREAD_DEBUG)
+ toku_instr_mutex_destroy(mutex->psi_mutex);
+ int r = pthread_mutex_destroy(&mutex->pmutex);
+ assert_zero(r);
+}
+
+#define toku_pthread_rwlock_rdlock(RW) \
+ toku_pthread_rwlock_rdlock_with_source_location(RW, __FILE__, __LINE__)
+
+#define toku_pthread_rwlock_wrlock(RW) \
+ toku_pthread_rwlock_wrlock_with_source_location(RW, __FILE__, __LINE__)
+
+#if 0
+inline void toku_pthread_rwlock_init(
+ const toku_instr_key &key,
+ toku_pthread_rwlock_t *__restrict rwlock,
+ const toku_pthread_rwlockattr_t *__restrict attr) {
+ toku_instr_rwlock_init(key, *rwlock);
+ int r = pthread_rwlock_init(&rwlock->rwlock, attr);
+ assert_zero(r);
+}
+
+inline void toku_pthread_rwlock_destroy(toku_pthread_rwlock_t *rwlock) {
+ toku_instr_rwlock_destroy(rwlock->psi_rwlock);
+ int r = pthread_rwlock_destroy(&rwlock->rwlock);
+ assert_zero(r);
+}
+
+inline void toku_pthread_rwlock_rdlock_with_source_location(
+ toku_pthread_rwlock_t *rwlock,
+ const char *src_file,
+ uint src_line) {
+
+ /* Instrumentation start */
+ toku_rwlock_instrumentation rwlock_instr;
+ toku_instr_rwlock_rdlock_wait_start(
+ rwlock_instr, *rwlock, src_file, src_line);
+ /* Instrumented code */
+ const int r = pthread_rwlock_rdlock(&rwlock->rwlock);
+
+ /* Instrumentation end */
+ toku_instr_rwlock_rdlock_wait_end(rwlock_instr, r);
+
+ assert_zero(r);
+}
+
+inline void toku_pthread_rwlock_wrlock_with_source_location(
+ toku_pthread_rwlock_t *rwlock,
+ const char *src_file,
+ uint src_line) {
+
+ /* Instrumentation start */
+ toku_rwlock_instrumentation rwlock_instr;
+ toku_instr_rwlock_wrlock_wait_start(
+ rwlock_instr, *rwlock, src_file, src_line);
+ /* Instrumented code */
+ const int r = pthread_rwlock_wrlock(&rwlock->rwlock);
+
+ /* Instrumentation end */
+ toku_instr_rwlock_wrlock_wait_end(rwlock_instr, r);
+
+ assert_zero(r);
+}
+
+inline void toku_pthread_rwlock_rdunlock(toku_pthread_rwlock_t *rwlock) {
+ toku_instr_rwlock_unlock(*rwlock);
+ const int r = pthread_rwlock_unlock(&rwlock->rwlock);
+ assert_zero(r);
+}
+
+inline void toku_pthread_rwlock_wrunlock(toku_pthread_rwlock_t *rwlock) {
+ toku_instr_rwlock_unlock(*rwlock);
+ const int r = pthread_rwlock_unlock(&rwlock->rwlock);
+ assert_zero(r);
+}
+#endif
+
+static inline int toku_pthread_join(toku_pthread_t thread, void **value_ptr) {
+ return pthread_join(thread, value_ptr);
+}
+
+static inline int toku_pthread_detach(toku_pthread_t thread) {
+ return pthread_detach(thread);
+}
+
+static inline int toku_pthread_key_create(toku_pthread_key_t *key,
+ void (*destroyf)(void *)) {
+ return pthread_key_create(key, destroyf);
+}
+
+static inline int toku_pthread_key_delete(toku_pthread_key_t key) {
+ return pthread_key_delete(key);
+}
+
+static inline void *toku_pthread_getspecific(toku_pthread_key_t key) {
+ return pthread_getspecific(key);
+}
+
+static inline int toku_pthread_setspecific(toku_pthread_key_t key, void *data) {
+ return pthread_setspecific(key, data);
+}
+
+int toku_pthread_yield(void) __attribute__((__visibility__("default")));
+
+static inline toku_pthread_t toku_pthread_self(void) { return pthread_self(); }
+
+static inline void *toku_pthread_done(void *exit_value) {
+ toku_instr_delete_current_thread();
+ pthread_exit(exit_value);
+ return nullptr; // Avoid compiler warning
+}
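A short sketch of the debug-checked mutex and condition wrappers above; the globals and the ready flag are illustrative, and toku_uninstrumented (declared in toku_instrumentation.h) is used as the key so no PFS registration takes place.

// Sketch only: the usual init / lock / wait / signal pattern via the wrappers.
static toku_mutex_t g_lock;
static toku_cond_t g_cv;
static bool g_ready = false;

static void sync_init(void) {
  toku_mutex_init(toku_uninstrumented, &g_lock, nullptr);
  toku_cond_init(toku_uninstrumented, &g_cv, nullptr);
}

static void wait_until_ready(void) {
  toku_mutex_lock(&g_lock);  // under TOKU_PTHREAD_DEBUG this records owner/locked
  while (!g_ready) {
    toku_cond_wait(&g_cv, &g_lock);  // expands to the *_with_source_location form
  }
  toku_mutex_unlock(&g_lock);
}

static void mark_ready(void) {
  toku_mutex_lock(&g_lock);
  g_ready = true;
  toku_cond_broadcast(&g_cv);
  toku_mutex_unlock(&g_lock);
}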
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_race_tools.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_race_tools.h
new file mode 100644
index 000000000..3cb5b5790
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_race_tools.h
@@ -0,0 +1,179 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+// PORT2: #include <portability/toku_config.h>
+
+#ifdef HAVE_valgrind
+#undef USE_VALGRIND
+#define USE_VALGRIND 1
+#endif
+
+#if defined(__linux__) && USE_VALGRIND
+
+#include <valgrind/drd.h>
+#include <valgrind/helgrind.h>
+
+#define TOKU_ANNOTATE_NEW_MEMORY(p, size) ANNOTATE_NEW_MEMORY(p, size)
+#define TOKU_VALGRIND_HG_ENABLE_CHECKING(p, size) \
+ VALGRIND_HG_ENABLE_CHECKING(p, size)
+#define TOKU_VALGRIND_HG_DISABLE_CHECKING(p, size) \
+ VALGRIND_HG_DISABLE_CHECKING(p, size)
+#define TOKU_DRD_IGNORE_VAR(v) DRD_IGNORE_VAR(v)
+#define TOKU_DRD_STOP_IGNORING_VAR(v) DRD_STOP_IGNORING_VAR(v)
+#define TOKU_ANNOTATE_IGNORE_READS_BEGIN() ANNOTATE_IGNORE_READS_BEGIN()
+#define TOKU_ANNOTATE_IGNORE_READS_END() ANNOTATE_IGNORE_READS_END()
+#define TOKU_ANNOTATE_IGNORE_WRITES_BEGIN() ANNOTATE_IGNORE_WRITES_BEGIN()
+#define TOKU_ANNOTATE_IGNORE_WRITES_END() ANNOTATE_IGNORE_WRITES_END()
+
+/*
+ * How to make helgrind happy about tree rotations and new mutex orderings:
+ *
+ * // Tell helgrind that we unlocked it so that the next call doesn't get a
+ * "destroyed a locked mutex" error.
+ * // Tell helgrind that we destroyed the mutex.
+ * VALGRIND_HG_MUTEX_UNLOCK_PRE(&locka);
+ * VALGRIND_HG_MUTEX_DESTROY_PRE(&locka);
+ *
+ * // And recreate it. It would be better to simply be able to say that the
+ * order on these two can now be reversed, because this code forgets all the
+ * ordering information for this mutex.
+ * // Then tell helgrind that we have locked it again.
+ * VALGRIND_HG_MUTEX_INIT_POST(&locka, 0);
+ * VALGRIND_HG_MUTEX_LOCK_POST(&locka);
+ *
+ * When the ordering of two locks changes, we don't need to tell Helgrind about
+ * both locks. Just one is good enough.
+ */
+
+#define TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(mutex) \
+ VALGRIND_HG_MUTEX_UNLOCK_PRE(mutex); \
+ VALGRIND_HG_MUTEX_DESTROY_PRE(mutex); \
+ VALGRIND_HG_MUTEX_INIT_POST(mutex, 0); \
+ VALGRIND_HG_MUTEX_LOCK_POST(mutex);
+
+#else // !defined(__linux__) || !USE_VALGRIND
+
+#define NVALGRIND 1
+#define TOKU_ANNOTATE_NEW_MEMORY(p, size) ((void)0)
+#define TOKU_VALGRIND_HG_ENABLE_CHECKING(p, size) ((void)0)
+#define TOKU_VALGRIND_HG_DISABLE_CHECKING(p, size) ((void)0)
+#define TOKU_DRD_IGNORE_VAR(v)
+#define TOKU_DRD_STOP_IGNORING_VAR(v)
+#define TOKU_ANNOTATE_IGNORE_READS_BEGIN() ((void)0)
+#define TOKU_ANNOTATE_IGNORE_READS_END() ((void)0)
+#define TOKU_ANNOTATE_IGNORE_WRITES_BEGIN() ((void)0)
+#define TOKU_ANNOTATE_IGNORE_WRITES_END() ((void)0)
+#define TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(mutex)
+#undef RUNNING_ON_VALGRIND
+#define RUNNING_ON_VALGRIND (0U)
+#endif
+
+// Valgrind 3.10.1 (and previous versions).
+// Problems with VALGRIND_HG_DISABLE_CHECKING and VALGRIND_HG_ENABLE_CHECKING.
+// Helgrind's implementation of disable and enable checking causes false races
+// to be reported. In addition, the race report does not include ANY
+// information about the code that uses the helgrind disable and enable
+// functions. Therefore, it is very difficult to figure out the cause of the
+// race. DRD does implement the disable and enable functions.
+
+// Problems with ANNOTATE_IGNORE_READS.
+// Helgrind does not implement ignore reads.
+// Annotate ignore reads is the way to inform DRD to ignore racy reads.
+
+// FT code uses unsafe reads in several places. These unsafe reads have been
+// noted as valid since they use the toku_unsafe_fetch function. Unfortunately,
+// this causes helgrind to report erroneous data races which makes use of
+// helgrind problematic.
+
+// Unsafely fetch and return a `T' from src, telling drd to ignore
+// racy access to src for the next sizeof(*src) bytes
+template <typename T>
+T toku_unsafe_fetch(T *src) {
+ if (0)
+ TOKU_VALGRIND_HG_DISABLE_CHECKING(src,
+ sizeof *src); // disabled, see comment
+ TOKU_ANNOTATE_IGNORE_READS_BEGIN();
+ T r = *src;
+ TOKU_ANNOTATE_IGNORE_READS_END();
+ if (0)
+ TOKU_VALGRIND_HG_ENABLE_CHECKING(src,
+ sizeof *src); // disabled, see comment
+ return r;
+}
+
+template <typename T>
+T toku_unsafe_fetch(T &src) {
+ return toku_unsafe_fetch(&src);
+}
+
+// Unsafely set a `T' value into *dest from src, telling drd to ignore
+// racy access to dest for the next sizeof(*dest) bytes
+template <typename T>
+void toku_unsafe_set(T *dest, const T src) {
+ if (0)
+ TOKU_VALGRIND_HG_DISABLE_CHECKING(dest,
+ sizeof *dest); // disabled, see comment
+ TOKU_ANNOTATE_IGNORE_WRITES_BEGIN();
+ *dest = src;
+ TOKU_ANNOTATE_IGNORE_WRITES_END();
+ if (0)
+ TOKU_VALGRIND_HG_ENABLE_CHECKING(dest,
+ sizeof *dest); // disabled, see comment
+}
+
+template <typename T>
+void toku_unsafe_set(T &dest, const T src) {
+ toku_unsafe_set(&dest, src);
+}
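A small sketch of the unsafe accessors above, in the spirit of the FT usage they describe: a status word that is read racily on purpose, with the annotations telling DRD not to report the race. The variable and function names are invented.

// Sketch only: deliberately racy publish/peek of a status word.
static uint64_t g_status = 0;

static void publish_status(uint64_t v) {
  toku_unsafe_set(&g_status, v);  // write annotated as intentionally racy
}

static uint64_t peek_status(void) {
  return toku_unsafe_fetch(&g_status);  // read annotated as intentionally racy
}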
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h
new file mode 100644
index 000000000..46111e7f0
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h
@@ -0,0 +1,193 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+// PORT2: #include "toku_config.h"
+
+#include <stdint.h>
+#include <sys/time.h>
+#include <time.h>
+#if defined(__powerpc__)
+#include <sys/platform/ppc.h>
+#endif
+
+#if 0
+static inline float toku_tdiff (struct timeval *a, struct timeval *b) {
+ return (float)((a->tv_sec - b->tv_sec) + 1e-6 * (a->tv_usec - b->tv_usec));
+}
+// PORT2: temporary:
+#define HAVE_CLOCK_REALTIME
+#if !defined(HAVE_CLOCK_REALTIME)
+// OS X does not have clock_gettime, we fake clockid_t for the interface, and we'll implement it with clock_get_time.
+typedef int clockid_t;
+// just something bogus, it doesn't matter, we just want to make sure we're
+// only supporting this mode because we're not sure we can support other modes
+// without a real clock_gettime()
+#define CLOCK_REALTIME 0x01867234
+#endif
+int toku_clock_gettime(clockid_t clk_id, struct timespec *ts) __attribute__((__visibility__("default")));
+#endif
+
+// *************** Performance timers ************************
+// What do you really want from a performance timer:
+// (1) Can determine actual time of day from the performance time.
+// (2) Time goes forward, never backward.
+// (3) Same time on different processors (or even different machines).
+// (4) Time goes forward at a constant rate (doesn't get faster and slower)
+// (5) Portable.
+// (6) Getting the time is cheap.
+// Unfortunately it seems tough to get Properties 1-5. So we go for Property 6,
+// but we abstract it. We offer a type tokutime_t which can hold the time. This
+// type can be subtracted to get a time difference. We can get the present time
+// cheaply. We can convert this type to seconds (but that can be expensive). The
+// implementation is to use RDTSC (hence we lose property 5: not portable).
+// Recent machines have constant_tsc in which case we get property (4).
+// Recent OSs on recent machines (that have RDTSCP) fix the per-processor clock
+// skew, so we get property (3). We get property 2 with RDTSC (as long as
+// there's not any skew). We don't even try to get property 1, since we don't
+// need it. The decision here is that these times are really accurate only on
+// modern machines with modern OSs.
+typedef uint64_t tokutime_t; // Time type used by tokutek timers.
+
+#if 0
+// The value of tokutime_t is not specified here.
+// It might be microseconds since 1/1/1970 (if gettimeofday() is
+// used), or clock cycles since boot (if rdtsc is used). Or something
+// else.
+// Two tokutime_t values can be subtracted to get a time difference.
+// Use tokutime_to_seconds to convert that difference to seconds.
+// We want get_tokutime() to be fast, but don't care so much about tokutime_to_seconds();
+//
+// For accurate time calculations do the subtraction in the right order:
+// Right: tokutime_to_seconds(t1-t2);
+// Wrong: tokutime_to_seconds(t1)-tokutime_to_seconds(t2);
+// Doing it the wrong way is likely to result in loss of precision.
+// A double can hold numbers up to about 53 bits. RDTSC uses about 33 bits every second, so that leaves
+// 2^20 seconds from booting (about 2 weeks) before the RDTSC value cannot be represented accurately as a double.
+//
+double tokutime_to_seconds(tokutime_t) __attribute__((__visibility__("default"))); // Convert tokutime to seconds.
+
+#endif
+
+// Get the value of tokutime for right now. We want this to be fast, so we
+// expose the implementation as RDTSC.
+static inline tokutime_t toku_time_now(void) {
+#if defined(__x86_64__) || defined(__i386__)
+ uint32_t lo, hi;
+ __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
+ return (uint64_t)hi << 32 | lo;
+#elif defined(__aarch64__)
+ uint64_t result;
+ __asm __volatile__("mrs %[rt], cntvct_el0" : [rt] "=r"(result));
+ return result;
+#elif defined(__powerpc__)
+ return __ppc_get_timebase();
+#elif defined(__s390x__)
+ uint64_t result;
+ asm volatile("stckf %0" : "=Q"(result) : : "cc");
+ return result;
+#elif defined(__riscv) && __riscv_xlen == 32
+ uint32_t cycles_lo, cycles_hi0, cycles_hi1;
+ // Implemented in assembly because Clang insisted on branching.
+ asm volatile(
+ "rdcycleh %0\n"
+ "rdcycle %1\n"
+ "rdcycleh %2\n"
+ "sub %0, %0, %2\n"
+ "seqz %0, %0\n"
+ "sub %0, zero, %0\n"
+ "and %1, %1, %0\n"
+ : "=r"(cycles_hi0), "=r"(cycles_lo), "=r"(cycles_hi1));
+ return (static_cast<uint64_t>(cycles_hi1) << 32) | cycles_lo;
+#elif defined(__riscv) && __riscv_xlen == 64
+ uint64_t cycles;
+ asm volatile("rdcycle %0" : "=r"(cycles));
+ return cycles;
+#else
+#error No timer implementation for this platform
+#endif
+}
+
+static inline uint64_t toku_current_time_microsec(void) {
+ struct timeval t;
+ gettimeofday(&t, NULL);
+ return t.tv_sec * (1UL * 1000 * 1000) + t.tv_usec;
+}
+
+#if 0
+// sleep microseconds
+static inline void toku_sleep_microsec(uint64_t ms) {
+ struct timeval t;
+
+ t.tv_sec = ms / 1000000;
+ t.tv_usec = ms % 1000000;
+
+ select(0, NULL, NULL, NULL, &t);
+}
+#endif
+
+/*
+ PORT: Usage of this file:
+
+ uint64_t toku_current_time_microsec() // uses gettimeofday
+ is used to track how much time various operations took (for example, lock
+ escalation). (TODO: it is not clear why these operations are tracked with
+ microsecond precision while others use nanoseconds)
+
+ tokutime_t toku_time_now() // uses rdtsc
+ seems to be used for a very similar purpose. This has greater precision.
+
+ RocksDB environment provides Env::Default()->NowMicros() and NowNanos() which
+ should be adequate substitutes.
+*/
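+
+/*
+  Illustrative sketch (not from the original PerconaFT sources): how the two
+  timers above are typically combined -- tokutime_t for cheap high-resolution
+  deltas and toku_current_time_microsec() for wall-clock durations. The
+  functions being timed and the stats sink are hypothetical.
+*/
+#if 0
+static void example_time_an_operation(void) {
+  const tokutime_t t_start = toku_time_now();
+  const uint64_t us_start = toku_current_time_microsec();
+
+  do_some_hypothetical_work();  // hypothetical
+
+  const tokutime_t ticks_elapsed = toku_time_now() - t_start;  // cheap, in timer ticks
+  const uint64_t micros_elapsed = toku_current_time_microsec() - us_start;  // wall clock
+  record_stats(ticks_elapsed, micros_elapsed);  // hypothetical sink
+}
+#endif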
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/txn_subst.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/txn_subst.h
new file mode 100644
index 000000000..803914862
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/txn_subst.h
@@ -0,0 +1,27 @@
+//
+// A substitute for ft/txn/txn.h
+//
+#pragma once
+
+#include <set>
+
+#include "../util/omt.h"
+
+typedef uint64_t TXNID;
+#define TXNID_NONE ((TXNID)0)
+
+// A set of transactions
+// (TODO: consider using class toku::txnid_set. The reason for using STL
+// container was that its API is easier)
+class TxnidVector : public std::set<TXNID> {
+ public:
+ bool contains(TXNID txnid) { return find(txnid) != end(); }
+};
+
+// A value for lock structures with a meaning "the lock is owned by multiple
+// transactions (and one has to check the TxnidVector to get their ids)".
+#define TXNID_SHARED (TXNID(-1))
+
+// Auxiliary value meaning "any transaction id will do". No real transaction
+// may have this as its id.
+#define TXNID_ANY (TXNID(-2))
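+
+// Illustrative sketch (not from the original sources): how a lock owner field
+// and a TxnidVector might be combined, with TXNID_SHARED signalling that the
+// vector holds the real owners. The struct is hypothetical.
+#if 0
+struct example_lock_owner {
+  TXNID owner;          // a single transaction id, or TXNID_SHARED
+  TxnidVector *owners;  // consulted only when owner == TXNID_SHARED
+
+  bool is_owned_by(TXNID txnid) const {
+    if (owner == TXNID_SHARED) return owners && owners->contains(txnid);
+    return owner == txnid;
+  }
+};
+#endif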
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/standalone_port.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/standalone_port.cc
new file mode 100644
index 000000000..50dc879ce
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/standalone_port.cc
@@ -0,0 +1,132 @@
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+/*
+ This is a dumping ground to make the Lock Tree work without the rest of TokuDB.
+*/
+#include <string.h>
+
+#include "db.h"
+#include "ft/ft-status.h"
+#include "portability/memory.h"
+#include "util/dbt.h"
+
+// portability/os_malloc.cc
+
+void toku_free(void *p) { free(p); }
+
+void *toku_xmalloc(size_t size) { return malloc(size); }
+
+void *toku_xrealloc(void *v, size_t size) { return realloc(v, size); }
+
+void *toku_xmemdup(const void *v, size_t len) {
+ void *p = toku_xmalloc(len);
+ memcpy(p, v, len);
+ return p;
+}
+
+// TODO: what are the X-functions? Xcalloc, Xrealloc?
+void *toku_xcalloc(size_t nmemb, size_t size) { return calloc(nmemb, size); }
+
+// ft-ft-opts.cc:
+
+// locktree
+toku_instr_key lock_request_m_wait_cond_key;
+toku_instr_key manager_m_escalator_done_key;
+toku_instr_key locktree_request_info_mutex_key;
+toku_instr_key locktree_request_info_retry_mutex_key;
+toku_instr_key locktree_request_info_retry_cv_key;
+
+toku_instr_key treenode_mutex_key;
+toku_instr_key manager_mutex_key;
+toku_instr_key manager_escalation_mutex_key;
+toku_instr_key manager_escalator_mutex_key;
+
+// portability/memory.cc
+size_t toku_memory_footprint(void *, size_t touched) { return touched; }
+
+// ft/ft-status.c
+// PORT2: note: the @c parameter to TOKUFT_STATUS_INIT must not start with
+// "TOKU"
+LTM_STATUS_S ltm_status;
+void LTM_STATUS_S::init() {
+ if (m_initialized) return;
+#define LTM_STATUS_INIT(k, c, t, l) \
+ TOKUFT_STATUS_INIT((*this), k, c, t, "locktree: " l, \
+ TOKU_ENGINE_STATUS | TOKU_GLOBAL_STATUS)
+ LTM_STATUS_INIT(LTM_SIZE_CURRENT, LOCKTREE_MEMORY_SIZE, STATUS_UINT64,
+ "memory size");
+ LTM_STATUS_INIT(LTM_SIZE_LIMIT, LOCKTREE_MEMORY_SIZE_LIMIT, STATUS_UINT64,
+ "memory size limit");
+ LTM_STATUS_INIT(LTM_ESCALATION_COUNT, LOCKTREE_ESCALATION_NUM, STATUS_UINT64,
+ "number of times lock escalation ran");
+ LTM_STATUS_INIT(LTM_ESCALATION_TIME, LOCKTREE_ESCALATION_SECONDS,
+ STATUS_TOKUTIME, "time spent running escalation (seconds)");
+ LTM_STATUS_INIT(LTM_ESCALATION_LATEST_RESULT,
+ LOCKTREE_LATEST_POST_ESCALATION_MEMORY_SIZE, STATUS_UINT64,
+ "latest post-escalation memory size");
+ LTM_STATUS_INIT(LTM_NUM_LOCKTREES, LOCKTREE_OPEN_CURRENT, STATUS_UINT64,
+ "number of locktrees open now");
+ LTM_STATUS_INIT(LTM_LOCK_REQUESTS_PENDING, LOCKTREE_PENDING_LOCK_REQUESTS,
+ STATUS_UINT64, "number of pending lock requests");
+ LTM_STATUS_INIT(LTM_STO_NUM_ELIGIBLE, LOCKTREE_STO_ELIGIBLE_NUM,
+ STATUS_UINT64, "number of locktrees eligible for the STO");
+ LTM_STATUS_INIT(LTM_STO_END_EARLY_COUNT, LOCKTREE_STO_ENDED_NUM,
+ STATUS_UINT64,
+ "number of times a locktree ended the STO early");
+ LTM_STATUS_INIT(LTM_STO_END_EARLY_TIME, LOCKTREE_STO_ENDED_SECONDS,
+ STATUS_TOKUTIME, "time spent ending the STO early (seconds)");
+ LTM_STATUS_INIT(LTM_WAIT_COUNT, LOCKTREE_WAIT_COUNT, STATUS_UINT64,
+ "number of wait locks");
+ LTM_STATUS_INIT(LTM_WAIT_TIME, LOCKTREE_WAIT_TIME, STATUS_UINT64,
+ "time waiting for locks");
+ LTM_STATUS_INIT(LTM_LONG_WAIT_COUNT, LOCKTREE_LONG_WAIT_COUNT, STATUS_UINT64,
+ "number of long wait locks");
+ LTM_STATUS_INIT(LTM_LONG_WAIT_TIME, LOCKTREE_LONG_WAIT_TIME, STATUS_UINT64,
+ "long time waiting for locks");
+ LTM_STATUS_INIT(LTM_TIMEOUT_COUNT, LOCKTREE_TIMEOUT_COUNT, STATUS_UINT64,
+ "number of lock timeouts");
+ LTM_STATUS_INIT(LTM_WAIT_ESCALATION_COUNT, LOCKTREE_WAIT_ESCALATION_COUNT,
+ STATUS_UINT64, "number of waits on lock escalation");
+ LTM_STATUS_INIT(LTM_WAIT_ESCALATION_TIME, LOCKTREE_WAIT_ESCALATION_TIME,
+ STATUS_UINT64, "time waiting on lock escalation");
+ LTM_STATUS_INIT(LTM_LONG_WAIT_ESCALATION_COUNT,
+ LOCKTREE_LONG_WAIT_ESCALATION_COUNT, STATUS_UINT64,
+ "number of long waits on lock escalation");
+ LTM_STATUS_INIT(LTM_LONG_WAIT_ESCALATION_TIME,
+ LOCKTREE_LONG_WAIT_ESCALATION_TIME, STATUS_UINT64,
+ "long time waiting on lock escalation");
+
+ m_initialized = true;
+#undef LTM_STATUS_INIT
+}
+void LTM_STATUS_S::destroy() {
+ if (!m_initialized) return;
+ for (int i = 0; i < LTM_STATUS_NUM_ROWS; ++i) {
+ if (status[i].type == STATUS_PARCOUNT) {
+ // PORT: TODO?? destroy_partitioned_counter(status[i].value.parcount);
+ }
+ }
+}
+
+int toku_keycompare(const void *key1, size_t key1len, const void *key2,
+ size_t key2len) {
+ size_t comparelen = key1len < key2len ? key1len : key2len;
+ int c = memcmp(key1, key2, comparelen);
+ if (__builtin_expect(c != 0, 1)) {
+ return c;
+ } else {
+ if (key1len < key2len) {
+ return -1;
+ } else if (key1len > key2len) {
+ return 1;
+ } else {
+ return 0;
+ }
+ }
+}
+
+int toku_builtin_compare_fun(const DBT *a, const DBT *b) {
+ return toku_keycompare(a->data, a->size, b->data, b->size);
+}
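+
+// Illustrative sketch (not from the original sources): the comparison above is
+// plain memcmp ordering with length as the tie-breaker, so a key that is a
+// strict prefix of another sorts first. The keys below are hypothetical.
+#if 0
+static void example_keycompare(void) {
+  DBT a, b;
+  toku_fill_dbt(&a, "ab", 2);
+  toku_fill_dbt(&b, "abc", 3);
+  int c1 = toku_builtin_compare_fun(&a, &b);  // < 0: "ab" is a prefix of "abc"
+  int c2 = toku_builtin_compare_fun(&b, &a);  // > 0
+  int c3 = toku_builtin_compare_fun(&a, &a);  // == 0
+  (void)c1; (void)c2; (void)c3;
+}
+#endif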
+#endif // OS_WIN
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc
new file mode 100644
index 000000000..63cc3a267
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc
@@ -0,0 +1,153 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "dbt.h"
+
+#include <string.h>
+
+#include "../db.h"
+#include "../portability/memory.h"
+
+DBT *toku_init_dbt(DBT *dbt) {
+ memset(dbt, 0, sizeof(*dbt));
+ return dbt;
+}
+
+DBT toku_empty_dbt(void) {
+ static const DBT empty_dbt = {.data = 0, .size = 0, .ulen = 0, .flags = 0};
+ return empty_dbt;
+}
+
+DBT *toku_init_dbt_flags(DBT *dbt, uint32_t flags) {
+ toku_init_dbt(dbt);
+ dbt->flags = flags;
+ return dbt;
+}
+
+void toku_destroy_dbt(DBT *dbt) {
+ switch (dbt->flags) {
+ case DB_DBT_MALLOC:
+ case DB_DBT_REALLOC:
+ toku_free(dbt->data);
+ toku_init_dbt(dbt);
+ break;
+ }
+}
+
+DBT *toku_fill_dbt(DBT *dbt, const void *k, size_t len) {
+ toku_init_dbt(dbt);
+ dbt->size = len;
+ dbt->data = (char *)k;
+ return dbt;
+}
+
+DBT *toku_memdup_dbt(DBT *dbt, const void *k, size_t len) {
+ toku_init_dbt_flags(dbt, DB_DBT_MALLOC);
+ dbt->size = len;
+ dbt->data = toku_xmemdup(k, len);
+ return dbt;
+}
+
+DBT *toku_copyref_dbt(DBT *dst, const DBT src) {
+ dst->flags = 0;
+ dst->ulen = 0;
+ dst->size = src.size;
+ dst->data = src.data;
+ return dst;
+}
+
+DBT *toku_clone_dbt(DBT *dst, const DBT &src) {
+ return toku_memdup_dbt(dst, src.data, src.size);
+}
+
+void toku_sdbt_cleanup(struct simple_dbt *sdbt) {
+ if (sdbt->data) toku_free(sdbt->data);
+ memset(sdbt, 0, sizeof(*sdbt));
+}
+
+const DBT *toku_dbt_positive_infinity(void) {
+ static DBT positive_infinity_dbt = {
+ .data = 0, .size = 0, .ulen = 0, .flags = 0}; // port
+ return &positive_infinity_dbt;
+}
+
+const DBT *toku_dbt_negative_infinity(void) {
+ static DBT negative_infinity_dbt = {
+ .data = 0, .size = 0, .ulen = 0, .flags = 0}; // port
+ return &negative_infinity_dbt;
+}
+
+bool toku_dbt_is_infinite(const DBT *dbt) {
+ return dbt == toku_dbt_positive_infinity() ||
+ dbt == toku_dbt_negative_infinity();
+}
+
+bool toku_dbt_is_empty(const DBT *dbt) {
+ // can't have a null data field with a non-zero size
+ paranoid_invariant(dbt->data != nullptr || dbt->size == 0);
+ return dbt->data == nullptr;
+}
+
+int toku_dbt_infinite_compare(const DBT *a, const DBT *b) {
+ if (a == b) {
+ return 0;
+ } else if (a == toku_dbt_positive_infinity()) {
+ return 1;
+ } else if (b == toku_dbt_positive_infinity()) {
+ return -1;
+ } else if (a == toku_dbt_negative_infinity()) {
+ return -1;
+ } else {
+ invariant(b == toku_dbt_negative_infinity());
+ return 1;
+ }
+}
+
+bool toku_dbt_equals(const DBT *a, const DBT *b) {
+ if (!toku_dbt_is_infinite(a) && !toku_dbt_is_infinite(b)) {
+ return a->data == b->data && a->size == b->size;
+ } else {
+ // a or b is infinite, so they're equal if they are the same infinite
+ return a == b ? true : false;
+ }
+}
+#endif // OS_WIN
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.h
new file mode 100644
index 000000000..d86c440f8
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.h
@@ -0,0 +1,98 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include "../db.h"
+
+// TODO: John
+// Document this API a little better so that DBT
+// memory management can be more widely understood.
+
+DBT *toku_init_dbt(DBT *);
+
+// returns: an initialized but empty dbt (for which toku_dbt_is_empty() is true)
+DBT toku_empty_dbt(void);
+
+DBT *toku_init_dbt_flags(DBT *, uint32_t flags);
+
+void toku_destroy_dbt(DBT *);
+
+DBT *toku_fill_dbt(DBT *dbt, const void *k, size_t len);
+
+DBT *toku_memdup_dbt(DBT *dbt, const void *k, size_t len);
+
+DBT *toku_copyref_dbt(DBT *dst, const DBT src);
+
+DBT *toku_clone_dbt(DBT *dst, const DBT &src);
+
+void toku_sdbt_cleanup(struct simple_dbt *sdbt);
+
+// returns: special DBT pointer representing positive infinity
+const DBT *toku_dbt_positive_infinity(void);
+
+// returns: special DBT pointer representing negative infinity
+const DBT *toku_dbt_negative_infinity(void);
+
+// returns: true if the given dbt is either positive or negative infinity
+bool toku_dbt_is_infinite(const DBT *dbt);
+
+// returns: true if the given dbt has no data (ie: dbt->data == nullptr)
+bool toku_dbt_is_empty(const DBT *dbt);
+
+// effect: compares two potentially infinity-valued dbts
+// requires: at least one is infinite (assert otherwise)
+int toku_dbt_infinite_compare(const DBT *a, const DBT *b);
+
+// returns: true if the given dbts have the same data pointer and size
+bool toku_dbt_equals(const DBT *a, const DBT *b);
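+
+// Illustrative sketch (not from the original sources): typical DBT memory
+// management with this API. A filled DBT borrows the caller's buffer, while a
+// memdup'd DBT owns a malloc'd copy and must be destroyed.
+#if 0
+static void example_dbt_usage(void) {
+  const char key[] = "hello";
+
+  DBT borrowed;
+  toku_fill_dbt(&borrowed, key, sizeof(key));  // no copy, nothing to free
+
+  DBT owned;
+  toku_memdup_dbt(&owned, key, sizeof(key));   // malloc'd copy (DB_DBT_MALLOC)
+  toku_destroy_dbt(&owned);                    // frees the copy
+
+  // Infinite endpoints are compared by pointer identity, not by contents:
+  // negative infinity sorts before positive infinity.
+  int c = toku_dbt_infinite_compare(toku_dbt_negative_infinity(),
+                                    toku_dbt_positive_infinity());
+  (void)c;  // c < 0
+}
+#endif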
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/growable_array.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/growable_array.h
new file mode 100644
index 000000000..158750fdb
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/growable_array.h
@@ -0,0 +1,144 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <memory.h>
+
+//******************************************************************************
+//
+// Overview: A growable array is a little bit like std::vector except that
+// it doesn't have constructors (hence can be used in static constructs, since
+// the google style guide says no constructors), and it's a little simpler.
+// Operations:
+// init and deinit (we don't have constructors and destructors).
+// fetch_unchecked to get values out.
+// store_unchecked to put values in.
+// push to add an element at the end
+// get_size to find out the size
+// memory_size to find out how much memory the data structure is using.
+//
+//******************************************************************************
+
+namespace toku {
+
+template <typename T>
+class GrowableArray {
+ public:
+ void init(void)
+ // Effect: Initialize the array to contain no elements.
+ {
+ m_array = NULL;
+ m_size = 0;
+ m_size_limit = 0;
+ }
+
+ void deinit(void)
+ // Effect: Deinitialize the array (freeing any memory it uses, for example).
+ {
+ toku_free(m_array);
+ m_array = NULL;
+ m_size = 0;
+ m_size_limit = 0;
+ }
+
+ T fetch_unchecked(size_t i) const
+ // Effect: Fetch the ith element. If i is out of range, the system asserts.
+ {
+ return m_array[i];
+ }
+
+ void store_unchecked(size_t i, T v)
+ // Effect: Store v in the ith element. If i is out of range, the system
+ // asserts.
+ {
+ paranoid_invariant(i < m_size);
+ m_array[i] = v;
+ }
+
+ void push(T v)
+ // Effect: Add v to the end of the array (increasing the size). The amortized
+ // cost of this operation is constant. Implementation hint: Double the size
+ // of the array when it gets too big so that the amortized cost stays
+ // constant.
+ {
+ if (m_size >= m_size_limit) {
+ if (m_array == NULL) {
+ m_size_limit = 1;
+ } else {
+ m_size_limit *= 2;
+ }
+ XREALLOC_N(m_size_limit, m_array);
+ }
+ m_array[m_size++] = v;
+ }
+
+ size_t get_size(void) const
+ // Effect: Return the number of elements in the array.
+ {
+ return m_size;
+ }
+ size_t memory_size(void) const
+ // Effect: Return the size (in bytes) that the array occupies in memory. This
+ // is really only an estimate.
+ {
+ return sizeof(*this) + sizeof(T) * m_size_limit;
+ }
+
+ private:
+ T *m_array;
+ size_t m_size;
+ size_t m_size_limit; // How much space is allocated in array.
+};
+
+} // namespace toku
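+
+// Illustrative sketch (not from the original sources): the expected
+// init / push / fetch_unchecked / deinit life cycle, since the class has no
+// constructor or destructor.
+#if 0
+static void example_growable_array(void) {
+  toku::GrowableArray<int> a;
+  a.init();  // must be called before use
+  for (int i = 0; i < 10; i++) {
+    a.push(i);  // amortized O(1); capacity doubles as needed
+  }
+  int sum = 0;
+  for (size_t i = 0; i < a.get_size(); i++) {
+    sum += a.fetch_unchecked(i);
+  }
+  (void)sum;
+  a.deinit();  // frees the backing array
+}
+#endif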
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.cc
new file mode 100644
index 000000000..0e7a9880b
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.cc
@@ -0,0 +1,201 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "memarena.h"
+
+#include <string.h>
+
+#include <algorithm>
+
+#include "../portability/memory.h"
+
+void memarena::create(size_t initial_size) {
+ _current_chunk = arena_chunk();
+ _other_chunks = nullptr;
+ _size_of_other_chunks = 0;
+ _footprint_of_other_chunks = 0;
+ _n_other_chunks = 0;
+
+ _current_chunk.size = initial_size;
+ if (_current_chunk.size > 0) {
+ XMALLOC_N(_current_chunk.size, _current_chunk.buf);
+ }
+}
+
+void memarena::destroy(void) {
+ if (_current_chunk.buf) {
+ toku_free(_current_chunk.buf);
+ }
+ for (int i = 0; i < _n_other_chunks; i++) {
+ toku_free(_other_chunks[i].buf);
+ }
+ if (_other_chunks) {
+ toku_free(_other_chunks);
+ }
+ _current_chunk = arena_chunk();
+ _other_chunks = nullptr;
+ _n_other_chunks = 0;
+}
+
+static size_t round_to_page(size_t size) {
+ const size_t page_size = 4096;
+ const size_t r = page_size + ((size - 1) & ~(page_size - 1));
+ assert((r & (page_size - 1)) == 0); // make sure it's aligned
+ assert(r >= size); // make sure it's not too small
+ assert(r <
+ size + page_size); // make sure we didn't grow by more than a page.
+ return r;
+}
+
+static const size_t MEMARENA_MAX_CHUNK_SIZE = 64 * 1024 * 1024;
+
+void *memarena::malloc_from_arena(size_t size) {
+ if (_current_chunk.buf == nullptr ||
+ _current_chunk.size < _current_chunk.used + size) {
+ // The existing block isn't big enough.
+ // Add the block to the vector of blocks.
+ if (_current_chunk.buf) {
+ invariant(_current_chunk.size > 0);
+ int old_n = _n_other_chunks;
+ XREALLOC_N(old_n + 1, _other_chunks);
+ _other_chunks[old_n] = _current_chunk;
+ _n_other_chunks = old_n + 1;
+ _size_of_other_chunks += _current_chunk.size;
+ _footprint_of_other_chunks +=
+ toku_memory_footprint(_current_chunk.buf, _current_chunk.used);
+ }
+
+ // Make a new one. Grow the buffer size exponentially until we hit
+ // the max chunk size, but make it at least `size' bytes so the
+ // current allocation always fit.
+ size_t new_size =
+ std::min(MEMARENA_MAX_CHUNK_SIZE, 2 * _current_chunk.size);
+ if (new_size < size) {
+ new_size = size;
+ }
+ new_size = round_to_page(
+ new_size); // at least size, but round to the next page size
+ XMALLOC_N(new_size, _current_chunk.buf);
+ _current_chunk.used = 0;
+ _current_chunk.size = new_size;
+ }
+ invariant(_current_chunk.buf != nullptr);
+
+ // allocate in the existing block.
+ char *p = _current_chunk.buf + _current_chunk.used;
+ _current_chunk.used += size;
+ return p;
+}
+
+void memarena::move_memory(memarena *dest) {
+ // Move memory to dest
+ XREALLOC_N(dest->_n_other_chunks + _n_other_chunks + 1, dest->_other_chunks);
+ dest->_size_of_other_chunks += _size_of_other_chunks + _current_chunk.size;
+ dest->_footprint_of_other_chunks +=
+ _footprint_of_other_chunks +
+ toku_memory_footprint(_current_chunk.buf, _current_chunk.used);
+ for (int i = 0; i < _n_other_chunks; i++) {
+ dest->_other_chunks[dest->_n_other_chunks++] = _other_chunks[i];
+ }
+ dest->_other_chunks[dest->_n_other_chunks++] = _current_chunk;
+
+ // Clear out this memarena's memory
+ toku_free(_other_chunks);
+ _current_chunk = arena_chunk();
+ _other_chunks = nullptr;
+ _size_of_other_chunks = 0;
+ _footprint_of_other_chunks = 0;
+ _n_other_chunks = 0;
+}
+
+size_t memarena::total_memory_size(void) const {
+ return sizeof(*this) + total_size_in_use() +
+ _n_other_chunks * sizeof(*_other_chunks);
+}
+
+size_t memarena::total_size_in_use(void) const {
+ return _size_of_other_chunks + _current_chunk.used;
+}
+
+size_t memarena::total_footprint(void) const {
+ return sizeof(*this) + _footprint_of_other_chunks +
+ toku_memory_footprint(_current_chunk.buf, _current_chunk.used) +
+ _n_other_chunks * sizeof(*_other_chunks);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+const void *memarena::chunk_iterator::current(size_t *used) const {
+ if (_chunk_idx < 0) {
+ *used = _ma->_current_chunk.used;
+ return _ma->_current_chunk.buf;
+ } else if (_chunk_idx < _ma->_n_other_chunks) {
+ *used = _ma->_other_chunks[_chunk_idx].used;
+ return _ma->_other_chunks[_chunk_idx].buf;
+ }
+ *used = 0;
+ return nullptr;
+}
+
+void memarena::chunk_iterator::next() { _chunk_idx++; }
+
+bool memarena::chunk_iterator::more() const {
+ if (_chunk_idx < 0) {
+ return _ma->_current_chunk.buf != nullptr;
+ }
+ return _chunk_idx < _ma->_n_other_chunks;
+}
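+
+// Illustrative sketch (not from the original sources): walking every chunk of
+// a memarena with the iterator implemented above.
+#if 0
+static size_t example_count_used_bytes(const memarena *ma) {
+  size_t total = 0;
+  for (memarena::chunk_iterator it(ma); it.more(); it.next()) {
+    size_t used;
+    const void *base = it.current(&used);  // chunk base and bytes in use
+    (void)base;
+    total += used;
+  }
+  return total;
+}
+#endif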
+#endif // OS_WIN
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.h
new file mode 100644
index 000000000..ddcc1144f
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.h
@@ -0,0 +1,141 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <stdlib.h>
+
+/*
+ * A memarena is used to efficiently store a collection of objects that never
+ * move. The pattern is to allocate more and more stuff and free all of the items at
+ * once. The underlying memory will store 1 or more objects per chunk. Each
+ * chunk is contiguously laid out in memory but chunks are not necessarily
+ * contiguous with each other.
+ */
+class memarena {
+ public:
+ memarena()
+ : _current_chunk(arena_chunk()),
+ _other_chunks(nullptr),
+ _n_other_chunks(0),
+ _size_of_other_chunks(0),
+ _footprint_of_other_chunks(0) {}
+
+ // Effect: Create a memarena with the specified initial size
+ void create(size_t initial_size);
+
+ void destroy(void);
+
+ // Effect: Allocate some memory. The returned value remains valid until the
+ // memarena is cleared or closed.
+ // In case of ENOMEM, aborts.
+ void *malloc_from_arena(size_t size);
+
+ // Effect: Move all the memory from this memarena into DEST.
+ // When SOURCE is closed the memory won't be freed.
+ // When DEST is closed, the memory will be freed, unless DEST moves
+ // its memory to another memarena...
+ void move_memory(memarena *dest);
+
+ // Effect: Calculate the amount of memory used by a memory arena.
+ size_t total_memory_size(void) const;
+
+ // Effect: Calculate the used space of the memory arena (ie: excludes unused
+ // space)
+ size_t total_size_in_use(void) const;
+
+ // Effect: Calculate the amount of memory used, according to
+ // toku_memory_footprint(),
+ // which is a more expensive but more accurate count of memory used.
+ size_t total_footprint(void) const;
+
+ // iterator over the underlying chunks that store objects in the memarena.
+ // a chunk is represented by a pointer to const memory and a usable byte
+ // count.
+ class chunk_iterator {
+ public:
+ chunk_iterator(const memarena *ma) : _ma(ma), _chunk_idx(-1) {}
+
+ // returns: base pointer to the current chunk
+ // *used set to the number of usable bytes
+ // if more() is false, returns nullptr and *used = 0
+ const void *current(size_t *used) const;
+
+ // requires: more() is true
+ void next();
+
+ bool more() const;
+
+ private:
+ // -1 represents the 'initial' chunk in a memarena, ie: ma->_current_chunk
+ // >= 0 represents the i'th chunk in the ma->_other_chunks array
+ const memarena *_ma;
+ int _chunk_idx;
+ };
+
+ private:
+ struct arena_chunk {
+ arena_chunk() : buf(nullptr), used(0), size(0) {}
+ char *buf;
+ size_t used;
+ size_t size;
+ };
+
+ struct arena_chunk _current_chunk;
+ struct arena_chunk *_other_chunks;
+ int _n_other_chunks;
+ size_t _size_of_other_chunks; // the buf_size of all the other chunks.
+ size_t _footprint_of_other_chunks; // the footprint of all the other chunks.
+
+ friend class memarena_unit_test;
+};
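+
+// Illustrative sketch (not from the original sources): the intended
+// create / malloc_from_arena / destroy pattern -- many small allocations,
+// one bulk free.
+#if 0
+static void example_memarena(void) {
+  memarena ma;
+  ma.create(4096);  // initial chunk size in bytes
+  for (int i = 0; i < 1000; i++) {
+    // Returned pointers stay valid until destroy(); there is no per-allocation free.
+    char *p = static_cast<char *>(ma.malloc_from_arena(64));
+    p[0] = 'x';
+  }
+  ma.destroy();  // everything goes away at once
+}
+#endif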
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt.h
new file mode 100644
index 000000000..f208002d3
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt.h
@@ -0,0 +1,794 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <memory.h>
+#include <stdint.h>
+
+#include "../portability/toku_portability.h"
+#include "../portability/toku_race_tools.h"
+#include "growable_array.h"
+
+namespace toku {
+
+/**
+ * Order Maintenance Tree (OMT)
+ *
+ * Maintains a collection of totally ordered values, where each value has an
+ * integer weight. The OMT is a mutable datatype.
+ *
+ * The Abstraction:
+ *
+ * An OMT is a vector of values, $V$, where $|V|$ is the length of the vector.
+ * The vector is numbered from $0$ to $|V|-1$.
+ * Each value has a weight. The weight of the $i$th element is denoted
+ * $w(V_i)$.
+ *
+ * We can create a new OMT, which is the empty vector.
+ *
+ * We can insert a new element $x$ into slot $i$, changing $V$ into $V'$ where
+ * $|V'|=1+|V|$ and
+ *
+ * V'_j = V_j if $j<i$
+ * x if $j=i$
+ * V_{j-1} if $j>i$.
+ *
+ * We can specify $i$ using a kind of function instead of as an integer.
+ * Let $b$ be a function mapping from values to nonzero integers, such that
+ * the signum of $b$ is monotonically increasing.
+ * We can specify $i$ as the minimum integer such that $b(V_i)>0$.
+ *
+ * We look up a value using its index, or using a Heaviside function.
+ * For lookups, we allow $b$ to be zero for some values, and again the signum of
+ * $b$ must be monotonically increasing. When looking up values, we can look up:
+ * - $V_i$ where $i$ is the minimum integer such that $b(V_i)=0$. (With a
+ * special return code if no such value exists.) (Rationale: Ordinarily we want
+ * $i$ to be unique. But for various reasons we want to allow multiple zeros,
+ * and we want the smallest $i$ in that case.)
+ * - $V_i$ where $i$ is the minimum integer such that $b(V_i)>0$. (Or an
+ * indication that no such value exists.)
+ * - $V_i$ where $i$ is the maximum integer such that $b(V_i)<0$. (Or an
+ * indication that no such value exists.)
+ *
+ * When looking up a value using a Heaviside function, we get the value and its
+ * index.
+ *
+ * We can also split an OMT into two OMTs, splitting the weight of the values
+ * evenly. Find a value $j$ such that the values to the left of $j$ have about
+ * the same total weight as the values to the right of $j$. The resulting two
+ * OMTs contain the values to the left of $j$ and the values to the right of $j$
+ * respectively. All of the values from the original OMT go into one of the new
+ * OMTs. If the weights of the values don't split exactly evenly, then the
+ * implementation has the freedom to choose whether the new left OMT or the new
+ * right OMT is larger.
+ *
+ * Performance:
+ * Insertion and deletion should run with $O(\log |V|)$ time and $O(\log |V|)$
+ * calls to the Heaviside function. The memory required is O(|V|).
+ *
+ * Usage:
+ * The omt is templated by two parameters:
+ * - omtdata_t is what will be stored within the omt. These could be pointers
+ * or real data types (ints, structs).
+ * - omtdataout_t is what will be returned by find and related functions. By
+ * default, it is the same as omtdata_t, but you can set it to (omtdata_t *). To
+ * create an omt which will store "TXNID"s, for example, it is a good idea to
+ * typedef the template: typedef omt<TXNID> txnid_omt_t; If you are storing
+ * structs, you may want to be able to get a pointer to the data actually stored
+ * in the omt (see find_zero). To do this, use the second template parameter:
+ * typedef omt<struct foo, struct foo *> foo_omt_t;
+ */
+
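+// Illustrative sketch (not from the original sources): the overview above in
+// code form, using the txnid_omt_t typedef it suggests. TXNID, the comparator,
+// and the ids are assumptions for the example; the class itself is declared
+// further below.
+#if 0
+typedef toku::omt<TXNID> txnid_omt_t;  // TXNID assumed to be a uint64_t id type
+
+static int example_cmp_txnid(const TXNID &stored, const TXNID &target) {
+  // Heaviside-style comparator: its signum is monotonically increasing in
+  // `stored', as insert/find require.
+  if (stored < target) return -1;
+  if (stored > target) return +1;
+  return 0;
+}
+
+static void example_txnid_omt(void) {
+  txnid_omt_t txnids;
+  txnids.create();
+  uint32_t idx;
+  int r = txnids.insert<TXNID, example_cmp_txnid>(100, 100, &idx);  // 0 on success
+  r = txnids.insert<TXNID, example_cmp_txnid>(100, 100, &idx);      // DB_KEYEXIST now
+  (void)r;
+  txnids.destroy();
+}
+#endif
+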
+namespace omt_internal {
+
+template <bool subtree_supports_marks>
+class subtree_templated {
+ private:
+ uint32_t m_index;
+
+ public:
+ static const uint32_t NODE_NULL = UINT32_MAX;
+ inline void set_to_null(void) { m_index = NODE_NULL; }
+
+ inline bool is_null(void) const { return NODE_NULL == this->get_index(); }
+
+ inline uint32_t get_index(void) const { return m_index; }
+
+ inline void set_index(uint32_t index) {
+ paranoid_invariant(index != NODE_NULL);
+ m_index = index;
+ }
+} __attribute__((__packed__, aligned(4)));
+
+template <>
+class subtree_templated<true> {
+ private:
+ uint32_t m_bitfield;
+ static const uint32_t MASK_INDEX = ~(((uint32_t)1) << 31);
+ static const uint32_t MASK_BIT = ((uint32_t)1) << 31;
+
+ inline void set_index_internal(uint32_t new_index) {
+ m_bitfield = (m_bitfield & MASK_BIT) | new_index;
+ }
+
+ public:
+ static const uint32_t NODE_NULL = INT32_MAX;
+ inline void set_to_null(void) { this->set_index_internal(NODE_NULL); }
+
+ inline bool is_null(void) const { return NODE_NULL == this->get_index(); }
+
+ inline uint32_t get_index(void) const {
+ TOKU_DRD_IGNORE_VAR(m_bitfield);
+ const uint32_t bits = m_bitfield;
+ TOKU_DRD_STOP_IGNORING_VAR(m_bitfield);
+ return bits & MASK_INDEX;
+ }
+
+ inline void set_index(uint32_t index) {
+ paranoid_invariant(index < NODE_NULL);
+ this->set_index_internal(index);
+ }
+
+ inline bool get_bit(void) const {
+ TOKU_DRD_IGNORE_VAR(m_bitfield);
+ const uint32_t bits = m_bitfield;
+ TOKU_DRD_STOP_IGNORING_VAR(m_bitfield);
+ return (bits & MASK_BIT) != 0;
+ }
+
+ inline void enable_bit(void) {
+ // These bits may be set by a thread with a write lock on some
+ // leaf, and the index can be read by another thread with a (read
+ // or write) lock on another thread. Also, the has_marks_below
+ // bit can be set by two threads simultaneously. Neither of these
+ // are real races, so if we are using DRD we should tell it to
+ // ignore these bits just while we set this bit. If there were a
+ // race in setting the index, that would be a real race.
+ TOKU_DRD_IGNORE_VAR(m_bitfield);
+ m_bitfield |= MASK_BIT;
+ TOKU_DRD_STOP_IGNORING_VAR(m_bitfield);
+ }
+
+ inline void disable_bit(void) { m_bitfield &= MASK_INDEX; }
+} __attribute__((__packed__));
+
+template <typename omtdata_t, bool subtree_supports_marks>
+class omt_node_templated {
+ public:
+ omtdata_t value;
+ uint32_t weight;
+ subtree_templated<subtree_supports_marks> left;
+ subtree_templated<subtree_supports_marks> right;
+
+ // this needs to be in both implementations because we don't have
+ // a "static if" the caller can use
+ inline void clear_stolen_bits(void) {}
+}; // note: originally this class had __attribute__((__packed__, aligned(4)))
+
+template <typename omtdata_t>
+class omt_node_templated<omtdata_t, true> {
+ public:
+ omtdata_t value;
+ uint32_t weight;
+ subtree_templated<true> left;
+ subtree_templated<true> right;
+ inline bool get_marked(void) const { return left.get_bit(); }
+ inline void set_marked_bit(void) { return left.enable_bit(); }
+ inline void unset_marked_bit(void) { return left.disable_bit(); }
+
+ inline bool get_marks_below(void) const { return right.get_bit(); }
+ inline void set_marks_below_bit(void) {
+ // This function can be called by multiple threads.
+ // Checking first reduces cache invalidation.
+ if (!this->get_marks_below()) {
+ right.enable_bit();
+ }
+ }
+ inline void unset_marks_below_bit(void) { right.disable_bit(); }
+
+ inline void clear_stolen_bits(void) {
+ this->unset_marked_bit();
+ this->unset_marks_below_bit();
+ }
+}; // note: originally this class had __attribute__((__packed__, aligned(4)))
+
+} // namespace omt_internal
+
+template <typename omtdata_t, typename omtdataout_t = omtdata_t,
+ bool supports_marks = false>
+class omt {
+ public:
+ /**
+ * Effect: Create an empty OMT.
+ * Performance: constant time.
+ */
+ void create(void);
+
+ /**
+ * Effect: Create an empty OMT with no internal allocated space.
+ * Performance: constant time.
+ * Rationale: In some cases we need a valid omt but don't want to malloc.
+ */
+ void create_no_array(void);
+
+ /**
+ * Effect: Create a OMT containing values. The number of values is in
+ * numvalues. Stores the new OMT in *omtp. Requires: this has not been created
+ * yet Requires: values != NULL Requires: values is sorted Performance:
+ * time=O(numvalues) Rationale: Normally to insert N values takes O(N lg N)
+ * amortized time. If the N values are known in advance, are sorted, and the
+ * structure is empty, we can batch insert them much faster.
+ */
+ __attribute__((nonnull)) void create_from_sorted_array(
+ const omtdata_t *const values, const uint32_t numvalues);
+
+ /**
+ * Effect: Create an OMT containing values. The number of values is in
+ * numvalues. On success the OMT takes ownership of *values array, and sets
+ * values=NULL. Requires: this has not been created yet Requires: values !=
+ * NULL Requires: *values is sorted Requires: *values was allocated with
+ * toku_malloc Requires: Capacity of the *values array is <= new_capacity
+ * Requires: On success, *values may not be accessed again by the caller.
+ * Performance: time=O(1)
+ * Rationale: create_from_sorted_array takes O(numvalues) time.
+ * By taking ownership of the array, we save a malloc and
+ * memcpy, and possibly a free (if the caller is done with the array).
+ */
+ void create_steal_sorted_array(omtdata_t **const values,
+ const uint32_t numvalues,
+ const uint32_t new_capacity);
+
+ /**
+ * Effect: Create a new OMT, storing it in *newomt.
+ * The values to the right of index (starting at index) are moved to *newomt.
+ * Requires: newomt != NULL
+ * Returns
+ * 0 success,
+ * EINVAL if index > toku_omt_size(omt)
+ * On nonzero return, omt and *newomt are unmodified.
+ * Performance: time=O(n)
+ * Rationale: We don't need a split-evenly operation. We need to split items
+ * so that their total sizes are even, and other similar splitting criteria.
+ * It's easy to split evenly by calling size(), and dividing by two.
+ */
+ __attribute__((nonnull)) int split_at(omt *const newomt, const uint32_t idx);
+
+ /**
+ * Effect: Appends leftomt and rightomt to produce a new omt.
+ * Creates this as the new omt.
+ * leftomt and rightomt are destroyed.
+ * Performance: time=O(n) is acceptable, but one can imagine implementations
+ * that are O(\log n) worst-case.
+ */
+ __attribute__((nonnull)) void merge(omt *const leftomt, omt *const rightomt);
+
+ /**
+ * Effect: Creates a copy of an omt.
+ * Creates this as the clone.
+ * Each element is copied directly. If they are pointers, the underlying
+ * data is not duplicated. Performance: O(n) or the running time of
+ * fill_array_with_subtree_values()
+ */
+ void clone(const omt &src);
+
+ /**
+ * Effect: Set the tree to be empty.
+ * Note: Will not reallocate or resize any memory.
+ * Performance: time=O(1)
+ */
+ void clear(void);
+
+ /**
+ * Effect: Destroy an OMT, freeing all its memory.
+ * If the values being stored are pointers, their underlying data is not
+ * freed. See free_items() Those values may be freed before or after calling
+ * toku_omt_destroy. Rationale: Returns no values since free() cannot fail.
+ * Rationale: Does not free the underlying pointers to reduce complexity.
+ * Performance: time=O(1)
+ */
+ void destroy(void);
+
+ /**
+ * Effect: return |this|.
+ * Performance: time=O(1)
+ */
+ uint32_t size(void) const;
+
+ /**
+ * Effect: Insert value into the OMT.
+ * If there is some i such that $h(V_i, v)=0$ then returns DB_KEYEXIST.
+ * Otherwise, let i be the minimum value such that $h(V_i, v)>0$.
+ * If no such i exists, then let i be |V|
+ * Then this has the same effect as
+ * insert_at(tree, value, i);
+ * If idx!=NULL then i is stored in *idx
+ * Requires: The signum of h must be monotonically increasing.
+ * Returns:
+ * 0 success
+ * DB_KEYEXIST the key is present (h was equal to zero for some value)
+ * On nonzero return, omt is unchanged.
+ * Performance: time=O(\log N) amortized.
+ * Rationale: Some future implementation may be O(\log N) worst-case time, but
+ * O(\log N) amortized is good enough for now.
+ */
+ template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+ int insert(const omtdata_t &value, const omtcmp_t &v, uint32_t *const idx);
+
+ /**
+ * Effect: Increases indexes of all items at slot >= idx by 1.
+ * Insert value into the position at idx.
+ * Returns:
+ * 0 success
+ * EINVAL if idx > this->size()
+ * On error, omt is unchanged.
+ * Performance: time=O(\log N) amortized time.
+ * Rationale: Some future implementation may be O(\log N) worst-case time, but
+ * O(\log N) amortized is good enough for now.
+ */
+ int insert_at(const omtdata_t &value, const uint32_t idx);
+
+ /**
+ * Effect: Replaces the item at idx with value.
+ * Returns:
+ * 0 success
+ * EINVAL if idx>=this->size()
+ * On error, omt is unchanged.
+ * Performance: time=O(\log N)
+ * Rationale: The FT needs to be able to replace a value with another copy of
+ * the same value (allocated in a different location)
+ *
+ */
+ int set_at(const omtdata_t &value, const uint32_t idx);
+
+ /**
+ * Effect: Delete the item in slot idx.
+ * Decreases indexes of all items at slot > idx by 1.
+ * Returns
+ * 0 success
+ * EINVAL if idx>=this->size()
+ * On error, omt is unchanged.
+ * Rationale: To delete an item, first find its index using find or find_zero,
+ * then delete it. Performance: time=O(\log N) amortized.
+ */
+ int delete_at(const uint32_t idx);
+
+ /**
+ * Effect: Iterate over the values of the omt, from left to right, calling f
+ * on each value. The first argument passed to f is a ref-to-const of the
+ * value stored in the omt. The second argument passed to f is the index of
+ * the value. The third argument passed to f is iterate_extra. The indices run
+ * from 0 (inclusive) to this->size() (exclusive). Requires: f != NULL
+ * Returns:
+ * If f ever returns nonzero, then the iteration stops, and the value
+ * returned by f is returned by iterate. If f always returns zero, then
+ * iterate returns 0. Requires: Don't modify the omt while running. (E.g., f
+ * may not insert or delete values from the omt.) Performance: time=O(i+\log
+ * N) where i is the number of times f is called, and N is the number of
+ * elements in the omt. Rationale: Although the functional iterator requires
+ * defining another function (as opposed to C++ style iterator), it is much
+ * easier to read. Rationale: We may at some point use functors, but for now
+ * this is a smaller change from the old OMT.
+ */
+ template <typename iterate_extra_t,
+ int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+ int iterate(iterate_extra_t *const iterate_extra) const;
+
+ /**
+ * Effect: Iterate over the values of the omt, from left to right, calling f
+ * on each value. The first argument passed to f is a ref-to-const of the
+ * value stored in the omt. The second argument passed to f is the index of
+ * the value. The third argument passed to f is iterate_extra. The indices run
+ * from 0 (inclusive) to this->size() (exclusive). We will iterate only over
+ * [left,right)
+ *
+ * Requires: left <= right
+ * Requires: f != NULL
+ * Returns:
+ * EINVAL if right > this->size()
+ * If f ever returns nonzero, then the iteration stops, and the value
+ * returned by f is returned by iterate_on_range. If f always returns zero,
+ * then iterate_on_range returns 0. Requires: Don't modify the omt while
+ * running. (E.g., f may not insert or delete values from the omt.)
+ * Performance: time=O(i+\log N) where i is the number of times f is called,
+ * and N is the number of elements in the omt. Rationale: Although the
+ * functional iterator requires defining another function (as opposed to C++
+ * style iterator), it is much easier to read.
+ */
+ template <typename iterate_extra_t,
+ int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+ int iterate_on_range(const uint32_t left, const uint32_t right,
+ iterate_extra_t *const iterate_extra) const;
+
+ /**
+ * Effect: Iterate over the values of the omt, and mark the nodes that are
+ * visited. Other than the marks, this behaves the same as iterate_on_range.
+ * Requires: supports_marks == true
+ * Performance: time=O(i+\log N) where i is the number of times f is called,
+ * and N is the number of elements in the omt. Notes: This function MAY be
+ * called concurrently by multiple threads, but not concurrently with any
+ * other non-const function.
+ */
+ template <typename iterate_extra_t,
+ int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+ int iterate_and_mark_range(const uint32_t left, const uint32_t right,
+ iterate_extra_t *const iterate_extra);
+
+ /**
+ * Effect: Iterate over the values of the omt, from left to right, calling f
+ * on each value whose node has been marked. Other than the marks, this
+ * behaves the same as iterate. Requires: supports_marks == true Performance:
+ * time=O(i+\log N) where i is the number of times f is called, and N is the
+ * number of elements in the omt.
+ */
+ template <typename iterate_extra_t,
+ int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+ int iterate_over_marked(iterate_extra_t *const iterate_extra) const;
+
+ /**
+ * Effect: Delete all elements from the omt, whose nodes have been marked.
+ * Requires: supports_marks == true
+ * Performance: time=O(N + i\log N) where i is the number of marked elements;
+ * this could (and should) be faster.
+ */
+ void delete_all_marked(void);
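
A sketch of the mark-and-delete workflow (not part of the patch), assuming an omt declared with marks enabled, e.g. omt<int, int, true> named marked_omt, already holding at least five elements; the callback name is invented.

    static int mark_cb(const int &value, const uint32_t idx, void *const extra) {
      (void)value; (void)idx; (void)extra;
      return 0;  // keep going; every node this call visits gets marked
    }

    int r = marked_omt.iterate_and_mark_range<void, mark_cb>(2, 5, nullptr);
    // r == 0 unless the callback stopped early; indices [2,5) are now marked
    marked_omt.delete_all_marked();   // removes exactly those elements
    // marked_omt.has_marks() is false again
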
+
+ /**
+ * Effect: Verify that the internal state of the marks in the tree is
+ * self-consistent. Crashes the system if the marks are in a bad state.
+ * Requires: supports_marks == true
+ * Performance: time=O(N)
+ * Notes:
+ * Even though this is a const function, it requires exclusive access.
+ * Rationale:
+ * The current implementation of the marks relies on a sort of
+ * "cache" bit representing the state of bits below it in the tree.
+ * This allows glass-box testing that these bits are correct.
+ */
+ void verify_marks_consistent(void) const;
+
+ /**
+ * Effect: None
+ * Returns whether there are any marks in the tree.
+ */
+ bool has_marks(void) const;
+
+ /**
+ * Effect: Iterate over the values of the omt, from left to right, calling f
+ * on each value. The first argument passed to f is a pointer to the value
+ * stored in the omt. The second argument passed to f is the index of the
+ * value. The third argument passed to f is iterate_extra. The indices run
+ * from 0 (inclusive) to this->size() (exclusive). Requires: same as for
+ * iterate() Returns: same as for iterate() Performance: same as for iterate()
+ * Rationale: In general, most iterators should use iterate() since they
+ * should not modify the data stored in the omt. This function is for
+ * iterators which need to modify values (for example, free_items). Rationale:
+ * We assume if you are transforming the data in place, you want to do it to
+ * everything at once, so there is not yet an iterate_on_range_ptr (but there
+ * could be).
+ */
+ template <typename iterate_extra_t,
+ int (*f)(omtdata_t *, const uint32_t, iterate_extra_t *const)>
+ void iterate_ptr(iterate_extra_t *const iterate_extra);
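
A sketch of in-place modification through iterate_ptr() (not part of the patch), again assuming a populated omt<int> named numbers; the callback name is invented. Note that the implementation asserts the callback returns zero.

    static int double_cb(int *value, const uint32_t idx, void *const extra) {
      (void)idx; (void)extra;
      *value *= 2;   // edit the element where it lives inside the omt
      return 0;      // iterate_ptr requires a zero return
    }

    numbers.iterate_ptr<void, double_cb>(nullptr);
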
+
+ /**
+ * Effect: Set *value=V_idx
+ * Returns
+ * 0 success
+ * EINVAL if index>=toku_omt_size(omt)
+ * On nonzero return, *value is unchanged
+ * Performance: time=O(\log N)
+ */
+ int fetch(const uint32_t idx, omtdataout_t *const value) const;
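
For example (sketch, not part of the patch), with a populated omt<int> named numbers:

    int v;
    int r = numbers.fetch(3, &v);  // copies the element at index 3 into v
    // r == EINVAL (and v is untouched) if 3 >= numbers.size()
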
+
+ /**
+ * Effect: Find the smallest i such that h(V_i, extra)>=0
+ * If there is such an i and h(V_i,extra)==0 then set *idxp=i, set *value =
+ * V_i, and return 0. If there is such an i and h(V_i,extra)>0 then set
+ * *idxp=i and return DB_NOTFOUND. If there is no such i then set
+ * *idxp=this->size() and return DB_NOTFOUND. Note: value is of type
+ * omtdataout_t, which may be of type (omtdata_t) or (omtdata_t *) but is
+ * fixed by the instantiation. If it is the value type, then the value is
+ * copied out (even if the value type is a pointer to something else) If it is
+ * the pointer type, then *value is set to a pointer to the data within the
+ * omt. This is determined by the type of the omt as initially declared. If
+ * the omt is declared as omt<foo_t>, then foo_t's will be stored and foo_t's
+ * will be returned by find and related functions. If the omt is declared as
+ * omt<foo_t, foo_t *>, then foo_t's will be stored, and pointers to the
+ * stored items will be returned by find and related functions. Rationale:
+ * Structs too small for malloc should be stored directly in the omt.
+ * These structs may need to be edited as they exist inside the omt, so we
+ * need a way to get a pointer within the omt. Using separate functions for
+ * returning pointers and values increases code duplication and reduces
+ * type-checking. That also reduces the ability of the creator of a data
+ * structure to give advice to its future users. Slight overloading in this
+ * case seemed to provide a better API and better type checking.
+ */
+ template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+ int find_zero(const omtcmp_t &extra, omtdataout_t *const value,
+ uint32_t *const idxp) const;
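
A sketch of a three-way heaviside for find_zero() (not part of the patch), assuming omt<int>; the names numbers and cmp_h are invented, and DB_NOTFOUND comes from the bundled db.h.

    static int cmp_h(const int &v, const int &key) {
      if (v < key) return -1;
      if (v > key) return +1;
      return 0;  // zero exactly when v == key
    }

    int out;
    uint32_t idx;
    int r = numbers.find_zero<int, cmp_h>(42, &out, &idx);
    // r == 0:           out holds the matching element and idx is its position
    // r == DB_NOTFOUND: idx is where 42 would be inserted to keep the order
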
+
+ /**
+ * Effect:
+ * If direction >0 then find the smallest i such that h(V_i,extra)>0.
+ * If direction <0 then find the largest i such that h(V_i,extra)<0.
+ * (Direction may not be equal to zero.)
+ * If value!=NULL then store V_i in *value
+ * If idxp!=NULL then store i in *idxp.
+ * Requires: The signum of h is monotonically increasing.
+ * Returns
+ * 0 success
+ * DB_NOTFOUND no such value is found.
+ * On nonzero return, *value and *idxp are unchanged
+ * Performance: time=O(\log N)
+ * Rationale:
+ * Here's how to use the find function to find various things
+ * Cases for find:
+ * find first value: ( h(v)=+1, direction=+1 )
+ * find last value ( h(v)=-1, direction=-1 )
+ * find first X ( h(v)=(v< x) ? -1 : 1 direction=+1 )
+ * find last X ( h(v)=(v<=x) ? -1 : 1 direction=-1 )
+ * find X or successor to X ( same as find first X. )
+ *
+ * Rationale: To help understand heaviside functions and the behavior of find:
+ * There are 7 kinds of heaviside functions.
+ * The signum of h must be monotonically increasing.
+ * Given a function of the following form, A is the element
+ * returned for direction>0, B is the element returned
+ * for direction<0, C is the element returned for
+ * direction==0 (see find_zero) (with a return of 0), and D is the element
+ * returned for direction==0 (see find_zero) with a return of DB_NOTFOUND.
+ * If any of A, B, or C are not found, then asking for the
+ * associated direction will return DB_NOTFOUND.
+ * See find_zero for more information.
+ *
+ * Let the following represent the signum of the heaviside function.
+ *
+ * -...-
+ * A
+ * D
+ *
+ * +...+
+ * B
+ * D
+ *
+ * 0...0
+ * C
+ *
+ * -...-0...0
+ * AC
+ *
+ * 0...0+...+
+ * C B
+ *
+ * -...-+...+
+ * AB
+ * D
+ *
+ * -...-0...0+...+
+ * AC B
+ */
+ template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+ int find(const omtcmp_t &extra, int direction, omtdataout_t *const value,
+ uint32_t *const idxp) const;
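
And a sketch of the "find first X" shape from the table above (not part of the patch), assuming omt<int>; the names are invented.

    static int ge_h(const int &v, const int &x) {
      return (v < x) ? -1 : +1;  // never returns 0, so it is used with find()
    }

    int out;
    uint32_t idx;
    int r = numbers.find<int, ge_h>(42, +1, &out, &idx);
    // r == 0:           out/idx describe the first element >= 42
    // r == DB_NOTFOUND: every element is < 42; out and idx are unchanged
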
+
+ /**
+ * Effect: Return the size (in bytes) of the omt, as it resides in main
+ * memory. If the data stored are pointers, don't include the size of what
+ * they all point to.
+ */
+ size_t memory_size(void);
+
+ private:
+ typedef uint32_t node_idx;
+ typedef omt_internal::subtree_templated<supports_marks> subtree;
+ typedef omt_internal::omt_node_templated<omtdata_t, supports_marks> omt_node;
+ ENSURE_POD(subtree);
+
+ struct omt_array {
+ uint32_t start_idx;
+ uint32_t num_values;
+ omtdata_t *values;
+ };
+
+ struct omt_tree {
+ subtree root;
+ uint32_t free_idx;
+ omt_node *nodes;
+ };
+
+ bool is_array;
+ uint32_t capacity;
+ union {
+ struct omt_array a;
+ struct omt_tree t;
+ } d;
+
+ __attribute__((nonnull)) void unmark(const subtree &subtree,
+ const uint32_t index,
+ GrowableArray<node_idx> *const indexes);
+
+ void create_internal_no_array(const uint32_t new_capacity);
+
+ void create_internal(const uint32_t new_capacity);
+
+ uint32_t nweight(const subtree &subtree) const;
+
+ node_idx node_malloc(void);
+
+ void node_free(const node_idx idx);
+
+ void maybe_resize_array(const uint32_t n);
+
+ __attribute__((nonnull)) void fill_array_with_subtree_values(
+ omtdata_t *const array, const subtree &subtree) const;
+
+ void convert_to_array(void);
+
+ __attribute__((nonnull)) void rebuild_from_sorted_array(
+ subtree *const subtree, const omtdata_t *const values,
+ const uint32_t numvalues);
+
+ void convert_to_tree(void);
+
+ void maybe_resize_or_convert(const uint32_t n);
+
+ bool will_need_rebalance(const subtree &subtree, const int leftmod,
+ const int rightmod) const;
+
+ __attribute__((nonnull)) void insert_internal(
+ subtree *const subtreep, const omtdata_t &value, const uint32_t idx,
+ subtree **const rebalance_subtree);
+
+ void set_at_internal_array(const omtdata_t &value, const uint32_t idx);
+
+ void set_at_internal(const subtree &subtree, const omtdata_t &value,
+ const uint32_t idx);
+
+ void delete_internal(subtree *const subtreep, const uint32_t idx,
+ omt_node *const copyn,
+ subtree **const rebalance_subtree);
+
+ template <typename iterate_extra_t,
+ int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+ int iterate_internal_array(const uint32_t left, const uint32_t right,
+ iterate_extra_t *const iterate_extra) const;
+
+ template <typename iterate_extra_t,
+ int (*f)(omtdata_t *, const uint32_t, iterate_extra_t *const)>
+ void iterate_ptr_internal(const uint32_t left, const uint32_t right,
+ const subtree &subtree, const uint32_t idx,
+ iterate_extra_t *const iterate_extra);
+
+ template <typename iterate_extra_t,
+ int (*f)(omtdata_t *, const uint32_t, iterate_extra_t *const)>
+ void iterate_ptr_internal_array(const uint32_t left, const uint32_t right,
+ iterate_extra_t *const iterate_extra);
+
+ template <typename iterate_extra_t,
+ int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+ int iterate_internal(const uint32_t left, const uint32_t right,
+ const subtree &subtree, const uint32_t idx,
+ iterate_extra_t *const iterate_extra) const;
+
+ template <typename iterate_extra_t,
+ int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+ int iterate_and_mark_range_internal(const uint32_t left, const uint32_t right,
+ const subtree &subtree,
+ const uint32_t idx,
+ iterate_extra_t *const iterate_extra);
+
+ template <typename iterate_extra_t,
+ int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+ int iterate_over_marked_internal(const subtree &subtree, const uint32_t idx,
+ iterate_extra_t *const iterate_extra) const;
+
+ uint32_t verify_marks_consistent_internal(const subtree &subtree,
+ const bool allow_marks) const;
+
+ void fetch_internal_array(const uint32_t i, omtdataout_t *const value) const;
+
+ void fetch_internal(const subtree &subtree, const uint32_t i,
+ omtdataout_t *const value) const;
+
+ __attribute__((nonnull)) void fill_array_with_subtree_idxs(
+ node_idx *const array, const subtree &subtree) const;
+
+ __attribute__((nonnull)) void rebuild_subtree_from_idxs(
+ subtree *const subtree, const node_idx *const idxs,
+ const uint32_t numvalues);
+
+ __attribute__((nonnull)) void rebalance(subtree *const subtree);
+
+ __attribute__((nonnull)) static void copyout(omtdata_t *const out,
+ const omt_node *const n);
+
+ __attribute__((nonnull)) static void copyout(omtdata_t **const out,
+ omt_node *const n);
+
+ __attribute__((nonnull)) static void copyout(
+ omtdata_t *const out, const omtdata_t *const stored_value_ptr);
+
+ __attribute__((nonnull)) static void copyout(
+ omtdata_t **const out, omtdata_t *const stored_value_ptr);
+
+ template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+ int find_internal_zero_array(const omtcmp_t &extra, omtdataout_t *const value,
+ uint32_t *const idxp) const;
+
+ template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+ int find_internal_zero(const subtree &subtree, const omtcmp_t &extra,
+ omtdataout_t *const value, uint32_t *const idxp) const;
+
+ template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+ int find_internal_plus_array(const omtcmp_t &extra, omtdataout_t *const value,
+ uint32_t *const idxp) const;
+
+ template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+ int find_internal_plus(const subtree &subtree, const omtcmp_t &extra,
+ omtdataout_t *const value, uint32_t *const idxp) const;
+
+ template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+ int find_internal_minus_array(const omtcmp_t &extra,
+ omtdataout_t *const value,
+ uint32_t *const idxp) const;
+
+ template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+ int find_internal_minus(const subtree &subtree, const omtcmp_t &extra,
+ omtdataout_t *const value,
+ uint32_t *const idxp) const;
+};
+
+} // namespace toku
+
+// include the implementation here
+#include "omt_impl.h"
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt_impl.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt_impl.h
new file mode 100644
index 000000000..e77986716
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt_impl.h
@@ -0,0 +1,1295 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <string.h>
+
+#include "../db.h"
+#include "../portability/memory.h"
+
+namespace toku {
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::create(void) {
+ this->create_internal(2);
+ if (supports_marks) {
+ this->convert_to_tree();
+ }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::create_no_array(void) {
+ if (!supports_marks) {
+ this->create_internal_no_array(0);
+ } else {
+ this->is_array = false;
+ this->capacity = 0;
+ this->d.t.nodes = nullptr;
+ this->d.t.root.set_to_null();
+ this->d.t.free_idx = 0;
+ }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::create_from_sorted_array(
+ const omtdata_t *const values, const uint32_t numvalues) {
+ this->create_internal(numvalues);
+ memcpy(this->d.a.values, values, numvalues * (sizeof values[0]));
+ this->d.a.num_values = numvalues;
+ if (supports_marks) {
+ this->convert_to_tree();
+ }
+}
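
Usage sketch (not part of the patch): the values must already be in the order expected by whatever heaviside functions will later be used; numbers is a hypothetical omt<int>.

    const int sorted_vals[] = {1, 2, 3, 5, 8};
    omt<int> numbers;
    numbers.create_from_sorted_array(sorted_vals, 5);
    // numbers.size() == 5; numbers.fetch(3, &v) would set v to 5
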
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::create_steal_sorted_array(
+ omtdata_t **const values, const uint32_t numvalues,
+ const uint32_t new_capacity) {
+ paranoid_invariant_notnull(values);
+ this->create_internal_no_array(new_capacity);
+ this->d.a.num_values = numvalues;
+ this->d.a.values = *values;
+ *values = nullptr;
+ if (supports_marks) {
+ this->convert_to_tree();
+ }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+int omt<omtdata_t, omtdataout_t, supports_marks>::split_at(omt *const newomt,
+ const uint32_t idx) {
+ barf_if_marked(*this);
+ paranoid_invariant_notnull(newomt);
+ if (idx > this->size()) {
+ return EINVAL;
+ }
+ this->convert_to_array();
+ const uint32_t newsize = this->size() - idx;
+ newomt->create_from_sorted_array(&this->d.a.values[this->d.a.start_idx + idx],
+ newsize);
+ this->d.a.num_values = idx;
+ this->maybe_resize_array(idx);
+ if (supports_marks) {
+ this->convert_to_tree();
+ }
+ return 0;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::merge(omt *const leftomt,
+ omt *const rightomt) {
+ barf_if_marked(*this);
+ paranoid_invariant_notnull(leftomt);
+ paranoid_invariant_notnull(rightomt);
+ const uint32_t leftsize = leftomt->size();
+ const uint32_t rightsize = rightomt->size();
+ const uint32_t newsize = leftsize + rightsize;
+
+ if (leftomt->is_array) {
+ if (leftomt->capacity -
+ (leftomt->d.a.start_idx + leftomt->d.a.num_values) >=
+ rightsize) {
+ this->create_steal_sorted_array(
+ &leftomt->d.a.values, leftomt->d.a.num_values, leftomt->capacity);
+ this->d.a.start_idx = leftomt->d.a.start_idx;
+ } else {
+ this->create_internal(newsize);
+ memcpy(&this->d.a.values[0], &leftomt->d.a.values[leftomt->d.a.start_idx],
+ leftomt->d.a.num_values * (sizeof this->d.a.values[0]));
+ }
+ } else {
+ this->create_internal(newsize);
+ leftomt->fill_array_with_subtree_values(&this->d.a.values[0],
+ leftomt->d.t.root);
+ }
+ leftomt->destroy();
+ this->d.a.num_values = leftsize;
+
+ if (rightomt->is_array) {
+ memcpy(&this->d.a.values[this->d.a.start_idx + this->d.a.num_values],
+ &rightomt->d.a.values[rightomt->d.a.start_idx],
+ rightomt->d.a.num_values * (sizeof this->d.a.values[0]));
+ } else {
+ rightomt->fill_array_with_subtree_values(
+ &this->d.a.values[this->d.a.start_idx + this->d.a.num_values],
+ rightomt->d.t.root);
+ }
+ rightomt->destroy();
+ this->d.a.num_values += rightsize;
+ paranoid_invariant(this->size() == newsize);
+ if (supports_marks) {
+ this->convert_to_tree();
+ }
+}
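
Usage sketch for the two operations above (not part of the patch); numbers is a hypothetical, already-populated omt<int>, and the destination omts are created empty first.

    omt<int> tail;
    tail.create_no_array();
    int r = numbers.split_at(&tail, 3);  // numbers keeps [0,3), tail gets the rest
    // r == EINVAL if 3 > numbers.size()

    omt<int> rejoined;
    rejoined.create_no_array();
    rejoined.merge(&numbers, &tail);     // both inputs are destroy()ed by merge
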
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::clone(const omt &src) {
+ barf_if_marked(*this);
+ this->create_internal(src.size());
+ if (src.is_array) {
+ memcpy(&this->d.a.values[0], &src.d.a.values[src.d.a.start_idx],
+ src.d.a.num_values * (sizeof this->d.a.values[0]));
+ } else {
+ src.fill_array_with_subtree_values(&this->d.a.values[0], src.d.t.root);
+ }
+ this->d.a.num_values = src.size();
+ if (supports_marks) {
+ this->convert_to_tree();
+ }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::clear(void) {
+ if (this->is_array) {
+ this->d.a.start_idx = 0;
+ this->d.a.num_values = 0;
+ } else {
+ this->d.t.root.set_to_null();
+ this->d.t.free_idx = 0;
+ }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::destroy(void) {
+ this->clear();
+ this->capacity = 0;
+ if (this->is_array) {
+ if (this->d.a.values != nullptr) {
+ toku_free(this->d.a.values);
+ }
+ this->d.a.values = nullptr;
+ } else {
+ if (this->d.t.nodes != nullptr) {
+ toku_free(this->d.t.nodes);
+ }
+ this->d.t.nodes = nullptr;
+ }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+uint32_t omt<omtdata_t, omtdataout_t, supports_marks>::size(void) const {
+ if (this->is_array) {
+ return this->d.a.num_values;
+ } else {
+ return this->nweight(this->d.t.root);
+ }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::insert(const omtdata_t &value,
+ const omtcmp_t &v,
+ uint32_t *const idx) {
+ int r;
+ uint32_t insert_idx;
+
+ r = this->find_zero<omtcmp_t, h>(v, nullptr, &insert_idx);
+ if (r == 0) {
+ if (idx) *idx = insert_idx;
+ return DB_KEYEXIST;
+ }
+ if (r != DB_NOTFOUND) return r;
+
+ if ((r = this->insert_at(value, insert_idx))) return r;
+ if (idx) *idx = insert_idx;
+
+ return 0;
+}
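
Usage sketch (not part of the patch): keeping an omt<int> sorted with the same kind of three-way heaviside sketched for find_zero() in the header; sorted_numbers and cmp_h are invented names.

    uint32_t where;
    int r = sorted_numbers.insert<int, cmp_h>(7, 7, &where);
    // r == 0:           7 was inserted and now lives at index where
    // r == DB_KEYEXIST: 7 was already present at index where; nothing changed
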
+
+// The following 3 functions implement a static if for us.
+template <typename omtdata_t, typename omtdataout_t>
+static void barf_if_marked(const omt<omtdata_t, omtdataout_t, false> &UU(omt)) {
+}
+
+template <typename omtdata_t, typename omtdataout_t>
+static void barf_if_marked(const omt<omtdata_t, omtdataout_t, true> &omt) {
+ invariant(!omt.has_marks());
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+bool omt<omtdata_t, omtdataout_t, supports_marks>::has_marks(void) const {
+ static_assert(supports_marks, "Does not support marks");
+ if (this->d.t.root.is_null()) {
+ return false;
+ }
+ const omt_node &node = this->d.t.nodes[this->d.t.root.get_index()];
+ return node.get_marks_below() || node.get_marked();
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+int omt<omtdata_t, omtdataout_t, supports_marks>::insert_at(
+ const omtdata_t &value, const uint32_t idx) {
+ barf_if_marked(*this);
+ if (idx > this->size()) {
+ return EINVAL;
+ }
+
+ this->maybe_resize_or_convert(this->size() + 1);
+ if (this->is_array && idx != this->d.a.num_values &&
+ (idx != 0 || this->d.a.start_idx == 0)) {
+ this->convert_to_tree();
+ }
+ if (this->is_array) {
+ if (idx == this->d.a.num_values) {
+ this->d.a.values[this->d.a.start_idx + this->d.a.num_values] = value;
+ } else {
+ this->d.a.values[--this->d.a.start_idx] = value;
+ }
+ this->d.a.num_values++;
+ } else {
+ subtree *rebalance_subtree = nullptr;
+ this->insert_internal(&this->d.t.root, value, idx, &rebalance_subtree);
+ if (rebalance_subtree != nullptr) {
+ this->rebalance(rebalance_subtree);
+ }
+ }
+ return 0;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+int omt<omtdata_t, omtdataout_t, supports_marks>::set_at(const omtdata_t &value,
+ const uint32_t idx) {
+ barf_if_marked(*this);
+ if (idx >= this->size()) {
+ return EINVAL;
+ }
+
+ if (this->is_array) {
+ this->set_at_internal_array(value, idx);
+ } else {
+ this->set_at_internal(this->d.t.root, value, idx);
+ }
+ return 0;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+int omt<omtdata_t, omtdataout_t, supports_marks>::delete_at(
+ const uint32_t idx) {
+ barf_if_marked(*this);
+ if (idx >= this->size()) {
+ return EINVAL;
+ }
+
+ this->maybe_resize_or_convert(this->size() - 1);
+ if (this->is_array && idx != 0 && idx != this->d.a.num_values - 1) {
+ this->convert_to_tree();
+ }
+ if (this->is_array) {
+ // Testing for 0 does not rule out it being the last entry.
+ // Test explicitly for num_values-1
+ if (idx != this->d.a.num_values - 1) {
+ this->d.a.start_idx++;
+ }
+ this->d.a.num_values--;
+ } else {
+ subtree *rebalance_subtree = nullptr;
+ this->delete_internal(&this->d.t.root, idx, nullptr, &rebalance_subtree);
+ if (rebalance_subtree != nullptr) {
+ this->rebalance(rebalance_subtree);
+ }
+ }
+ return 0;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename iterate_extra_t,
+ int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::iterate(
+ iterate_extra_t *const iterate_extra) const {
+ return this->iterate_on_range<iterate_extra_t, f>(0, this->size(),
+ iterate_extra);
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename iterate_extra_t,
+ int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::iterate_on_range(
+ const uint32_t left, const uint32_t right,
+ iterate_extra_t *const iterate_extra) const {
+ if (right > this->size()) {
+ return EINVAL;
+ }
+ if (left == right) {
+ return 0;
+ }
+ if (this->is_array) {
+ return this->iterate_internal_array<iterate_extra_t, f>(left, right,
+ iterate_extra);
+ }
+ return this->iterate_internal<iterate_extra_t, f>(left, right, this->d.t.root,
+ 0, iterate_extra);
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename iterate_extra_t,
+ int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::iterate_and_mark_range(
+ const uint32_t left, const uint32_t right,
+ iterate_extra_t *const iterate_extra) {
+ static_assert(supports_marks, "does not support marks");
+ if (right > this->size()) {
+ return EINVAL;
+ }
+ if (left == right) {
+ return 0;
+ }
+ paranoid_invariant(!this->is_array);
+ return this->iterate_and_mark_range_internal<iterate_extra_t, f>(
+ left, right, this->d.t.root, 0, iterate_extra);
+}
+
+// TODO: We can optimize this if we steal 3 bits. 1 bit: this node is
+// marked. 1 bit: left subtree has marks. 1 bit: right subtree has marks.
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename iterate_extra_t,
+ int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::iterate_over_marked(
+ iterate_extra_t *const iterate_extra) const {
+ static_assert(supports_marks, "does not support marks");
+ paranoid_invariant(!this->is_array);
+ return this->iterate_over_marked_internal<iterate_extra_t, f>(
+ this->d.t.root, 0, iterate_extra);
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::unmark(
+ const subtree &st, const uint32_t index,
+ GrowableArray<node_idx> *const indexes) {
+ if (st.is_null()) {
+ return;
+ }
+ omt_node &n = this->d.t.nodes[st.get_index()];
+ const uint32_t index_root = index + this->nweight(n.left);
+
+ const bool below = n.get_marks_below();
+ if (below) {
+ this->unmark(n.left, index, indexes);
+ }
+ if (n.get_marked()) {
+ indexes->push(index_root);
+ }
+ n.clear_stolen_bits();
+ if (below) {
+ this->unmark(n.right, index_root + 1, indexes);
+ }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::delete_all_marked(void) {
+ static_assert(supports_marks, "does not support marks");
+ if (!this->has_marks()) {
+ return;
+ }
+ paranoid_invariant(!this->is_array);
+ GrowableArray<node_idx> marked_indexes;
+ marked_indexes.init();
+
+ // Remove all marks.
+ // We need to delete all the stolen bits before calling delete_at to
+ // prevent barfing.
+ this->unmark(this->d.t.root, 0, &marked_indexes);
+
+ for (uint32_t i = 0; i < marked_indexes.get_size(); i++) {
+ // Delete from left to right, shift by number already deleted.
+ // Alternative is delete from right to left.
+ int r = this->delete_at(marked_indexes.fetch_unchecked(i) - i);
+ lazy_assert_zero(r);
+ }
+ marked_indexes.deinit();
+ barf_if_marked(*this);
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+uint32_t
+omt<omtdata_t, omtdataout_t, supports_marks>::verify_marks_consistent_internal(
+ const subtree &st, const bool UU(allow_marks)) const {
+ if (st.is_null()) {
+ return 0;
+ }
+ const omt_node &node = this->d.t.nodes[st.get_index()];
+ uint32_t num_marks =
+ verify_marks_consistent_internal(node.left, node.get_marks_below());
+ num_marks +=
+ verify_marks_consistent_internal(node.right, node.get_marks_below());
+ if (node.get_marks_below()) {
+ paranoid_invariant(allow_marks);
+ paranoid_invariant(num_marks > 0);
+ } else {
+ // redundant with invariant below, but nice to have explicitly
+ paranoid_invariant(num_marks == 0);
+ }
+ if (node.get_marked()) {
+ paranoid_invariant(allow_marks);
+ ++num_marks;
+ }
+ return num_marks;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::verify_marks_consistent(
+ void) const {
+ static_assert(supports_marks, "does not support marks");
+ paranoid_invariant(!this->is_array);
+ this->verify_marks_consistent_internal(this->d.t.root, true);
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename iterate_extra_t,
+ int (*f)(omtdata_t *, const uint32_t, iterate_extra_t *const)>
+void omt<omtdata_t, omtdataout_t, supports_marks>::iterate_ptr(
+ iterate_extra_t *const iterate_extra) {
+ if (this->is_array) {
+ this->iterate_ptr_internal_array<iterate_extra_t, f>(0, this->size(),
+ iterate_extra);
+ } else {
+ this->iterate_ptr_internal<iterate_extra_t, f>(
+ 0, this->size(), this->d.t.root, 0, iterate_extra);
+ }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+int omt<omtdata_t, omtdataout_t, supports_marks>::fetch(
+ const uint32_t idx, omtdataout_t *const value) const {
+ if (idx >= this->size()) {
+ return EINVAL;
+ }
+ if (this->is_array) {
+ this->fetch_internal_array(idx, value);
+ } else {
+ this->fetch_internal(this->d.t.root, idx, value);
+ }
+ return 0;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::find_zero(
+ const omtcmp_t &extra, omtdataout_t *const value,
+ uint32_t *const idxp) const {
+ uint32_t tmp_index;
+ uint32_t *const child_idxp = (idxp != nullptr) ? idxp : &tmp_index;
+ int r;
+ if (this->is_array) {
+ r = this->find_internal_zero_array<omtcmp_t, h>(extra, value, child_idxp);
+ } else {
+ r = this->find_internal_zero<omtcmp_t, h>(this->d.t.root, extra, value,
+ child_idxp);
+ }
+ return r;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::find(
+ const omtcmp_t &extra, int direction, omtdataout_t *const value,
+ uint32_t *const idxp) const {
+ uint32_t tmp_index;
+ uint32_t *const child_idxp = (idxp != nullptr) ? idxp : &tmp_index;
+ paranoid_invariant(direction != 0);
+ if (direction < 0) {
+ if (this->is_array) {
+ return this->find_internal_minus_array<omtcmp_t, h>(extra, value,
+ child_idxp);
+ } else {
+ return this->find_internal_minus<omtcmp_t, h>(this->d.t.root, extra,
+ value, child_idxp);
+ }
+ } else {
+ if (this->is_array) {
+ return this->find_internal_plus_array<omtcmp_t, h>(extra, value,
+ child_idxp);
+ } else {
+ return this->find_internal_plus<omtcmp_t, h>(this->d.t.root, extra, value,
+ child_idxp);
+ }
+ }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+size_t omt<omtdata_t, omtdataout_t, supports_marks>::memory_size(void) {
+ if (this->is_array) {
+ return (sizeof *this) + this->capacity * (sizeof this->d.a.values[0]);
+ }
+ return (sizeof *this) + this->capacity * (sizeof this->d.t.nodes[0]);
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::create_internal_no_array(
+ const uint32_t new_capacity) {
+ this->is_array = true;
+ this->d.a.start_idx = 0;
+ this->d.a.num_values = 0;
+ this->d.a.values = nullptr;
+ this->capacity = new_capacity;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::create_internal(
+ const uint32_t new_capacity) {
+ this->create_internal_no_array(new_capacity);
+ XMALLOC_N(this->capacity, this->d.a.values);
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+uint32_t omt<omtdata_t, omtdataout_t, supports_marks>::nweight(
+ const subtree &st) const {
+ if (st.is_null()) {
+ return 0;
+ } else {
+ return this->d.t.nodes[st.get_index()].weight;
+ }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+typename omt<omtdata_t, omtdataout_t, supports_marks>::node_idx
+omt<omtdata_t, omtdataout_t, supports_marks>::node_malloc(void) {
+ paranoid_invariant(this->d.t.free_idx < this->capacity);
+ omt_node &n = this->d.t.nodes[this->d.t.free_idx];
+ n.clear_stolen_bits();
+ return this->d.t.free_idx++;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::node_free(
+ const node_idx UU(idx)) {
+ paranoid_invariant(idx < this->capacity);
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::maybe_resize_array(
+ const uint32_t n) {
+ const uint32_t new_size = n <= 2 ? 4 : 2 * n;
+ const uint32_t room = this->capacity - this->d.a.start_idx;
+
+ if (room < n || this->capacity / 2 >= new_size) {
+ omtdata_t *XMALLOC_N(new_size, tmp_values);
+ if (this->d.a.num_values) {
+ memcpy(tmp_values, &this->d.a.values[this->d.a.start_idx],
+ this->d.a.num_values * (sizeof tmp_values[0]));
+ }
+ this->d.a.start_idx = 0;
+ this->capacity = new_size;
+ toku_free(this->d.a.values);
+ this->d.a.values = tmp_values;
+ }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t,
+ supports_marks>::fill_array_with_subtree_values(omtdata_t *const array,
+ const subtree &st)
+ const {
+ if (st.is_null()) return;
+ const omt_node &tree = this->d.t.nodes[st.get_index()];
+ this->fill_array_with_subtree_values(&array[0], tree.left);
+ array[this->nweight(tree.left)] = tree.value;
+ this->fill_array_with_subtree_values(&array[this->nweight(tree.left) + 1],
+ tree.right);
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::convert_to_array(void) {
+ if (!this->is_array) {
+ const uint32_t num_values = this->size();
+ uint32_t new_size = 2 * num_values;
+ new_size = new_size < 4 ? 4 : new_size;
+
+ omtdata_t *XMALLOC_N(new_size, tmp_values);
+ this->fill_array_with_subtree_values(tmp_values, this->d.t.root);
+ toku_free(this->d.t.nodes);
+ this->is_array = true;
+ this->capacity = new_size;
+ this->d.a.num_values = num_values;
+ this->d.a.values = tmp_values;
+ this->d.a.start_idx = 0;
+ }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::rebuild_from_sorted_array(
+ subtree *const st, const omtdata_t *const values,
+ const uint32_t numvalues) {
+ if (numvalues == 0) {
+ st->set_to_null();
+ } else {
+ const uint32_t halfway = numvalues / 2;
+ const node_idx newidx = this->node_malloc();
+ omt_node *const newnode = &this->d.t.nodes[newidx];
+ newnode->weight = numvalues;
+ newnode->value = values[halfway];
+ st->set_index(newidx);
+ // update everything before the recursive calls so the second call
+ // can be a tail call.
+ this->rebuild_from_sorted_array(&newnode->left, &values[0], halfway);
+ this->rebuild_from_sorted_array(&newnode->right, &values[halfway + 1],
+ numvalues - (halfway + 1));
+ }
+}
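
For instance, rebuilding from five sorted values picks values[2] as the root (weight 5), recurses on values[0..1] for the left child and values[3..4] for the right child, and each of those again takes its own midpoint, so every node ends up with left and right weights differing by at most one.
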
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::convert_to_tree(void) {
+ if (this->is_array) {
+ const uint32_t num_nodes = this->size();
+ uint32_t new_size = num_nodes * 2;
+ new_size = new_size < 4 ? 4 : new_size;
+
+ omt_node *XMALLOC_N(new_size, new_nodes);
+ omtdata_t *const values = this->d.a.values;
+ omtdata_t *const tmp_values = &values[this->d.a.start_idx];
+ this->is_array = false;
+ this->d.t.nodes = new_nodes;
+ this->capacity = new_size;
+ this->d.t.free_idx = 0;
+ this->d.t.root.set_to_null();
+ this->rebuild_from_sorted_array(&this->d.t.root, tmp_values, num_nodes);
+ toku_free(values);
+ }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::maybe_resize_or_convert(
+ const uint32_t n) {
+ if (this->is_array) {
+ this->maybe_resize_array(n);
+ } else {
+ const uint32_t new_size = n <= 2 ? 4 : 2 * n;
+ const uint32_t num_nodes = this->nweight(this->d.t.root);
+ if ((this->capacity / 2 >= new_size) ||
+ (this->d.t.free_idx >= this->capacity && num_nodes < n) ||
+ (this->capacity < n)) {
+ this->convert_to_array();
+ // If we had a free list, the "supports_marks" version could just resize;
+ // as it is now, we have to convert to an array and back.
+ if (supports_marks) {
+ this->convert_to_tree();
+ }
+ }
+ }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+bool omt<omtdata_t, omtdataout_t, supports_marks>::will_need_rebalance(
+ const subtree &st, const int leftmod, const int rightmod) const {
+ if (st.is_null()) {
+ return false;
+ }
+ const omt_node &n = this->d.t.nodes[st.get_index()];
+ // one of the 1's is for the root.
+ // the other is to take ceil(n/2)
+ const uint32_t weight_left = this->nweight(n.left) + leftmod;
+ const uint32_t weight_right = this->nweight(n.right) + rightmod;
+ return ((1 + weight_left < (1 + 1 + weight_right) / 2) ||
+ (1 + weight_right < (1 + 1 + weight_left) / 2));
+}
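
As a worked example with leftmod = rightmod = 0: left weight 2 and right weight 7 give 1 + 2 = 3, which is less than (1 + 1 + 7) / 2 = 4, so a rebalance is signalled; left weight 3 and right weight 7 give 1 + 3 = 4, which is not less than 4, so that node still counts as balanced.
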
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::insert_internal(
+ subtree *const subtreep, const omtdata_t &value, const uint32_t idx,
+ subtree **const rebalance_subtree) {
+ if (subtreep->is_null()) {
+ paranoid_invariant_zero(idx);
+ const node_idx newidx = this->node_malloc();
+ omt_node *const newnode = &this->d.t.nodes[newidx];
+ newnode->weight = 1;
+ newnode->left.set_to_null();
+ newnode->right.set_to_null();
+ newnode->value = value;
+ subtreep->set_index(newidx);
+ } else {
+ omt_node &n = this->d.t.nodes[subtreep->get_index()];
+ n.weight++;
+ if (idx <= this->nweight(n.left)) {
+ if (*rebalance_subtree == nullptr &&
+ this->will_need_rebalance(*subtreep, 1, 0)) {
+ *rebalance_subtree = subtreep;
+ }
+ this->insert_internal(&n.left, value, idx, rebalance_subtree);
+ } else {
+ if (*rebalance_subtree == nullptr &&
+ this->will_need_rebalance(*subtreep, 0, 1)) {
+ *rebalance_subtree = subtreep;
+ }
+ const uint32_t sub_index = idx - this->nweight(n.left) - 1;
+ this->insert_internal(&n.right, value, sub_index, rebalance_subtree);
+ }
+ }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::set_at_internal_array(
+ const omtdata_t &value, const uint32_t idx) {
+ this->d.a.values[this->d.a.start_idx + idx] = value;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::set_at_internal(
+ const subtree &st, const omtdata_t &value, const uint32_t idx) {
+ paranoid_invariant(!st.is_null());
+ omt_node &n = this->d.t.nodes[st.get_index()];
+ const uint32_t leftweight = this->nweight(n.left);
+ if (idx < leftweight) {
+ this->set_at_internal(n.left, value, idx);
+ } else if (idx == leftweight) {
+ n.value = value;
+ } else {
+ this->set_at_internal(n.right, value, idx - leftweight - 1);
+ }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::delete_internal(
+ subtree *const subtreep, const uint32_t idx, omt_node *const copyn,
+ subtree **const rebalance_subtree) {
+ paranoid_invariant_notnull(subtreep);
+ paranoid_invariant_notnull(rebalance_subtree);
+ paranoid_invariant(!subtreep->is_null());
+ omt_node &n = this->d.t.nodes[subtreep->get_index()];
+ const uint32_t leftweight = this->nweight(n.left);
+ if (idx < leftweight) {
+ n.weight--;
+ if (*rebalance_subtree == nullptr &&
+ this->will_need_rebalance(*subtreep, -1, 0)) {
+ *rebalance_subtree = subtreep;
+ }
+ this->delete_internal(&n.left, idx, copyn, rebalance_subtree);
+ } else if (idx == leftweight) {
+ if (n.left.is_null()) {
+ const uint32_t oldidx = subtreep->get_index();
+ *subtreep = n.right;
+ if (copyn != nullptr) {
+ copyn->value = n.value;
+ }
+ this->node_free(oldidx);
+ } else if (n.right.is_null()) {
+ const uint32_t oldidx = subtreep->get_index();
+ *subtreep = n.left;
+ if (copyn != nullptr) {
+ copyn->value = n.value;
+ }
+ this->node_free(oldidx);
+ } else {
+ if (*rebalance_subtree == nullptr &&
+ this->will_need_rebalance(*subtreep, 0, -1)) {
+ *rebalance_subtree = subtreep;
+ }
+ // don't need to copy up value, it's only used by this
+ // next call, and when that gets to the bottom there
+ // won't be any more recursion
+ n.weight--;
+ this->delete_internal(&n.right, 0, &n, rebalance_subtree);
+ }
+ } else {
+ n.weight--;
+ if (*rebalance_subtree == nullptr &&
+ this->will_need_rebalance(*subtreep, 0, -1)) {
+ *rebalance_subtree = subtreep;
+ }
+ this->delete_internal(&n.right, idx - leftweight - 1, copyn,
+ rebalance_subtree);
+ }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename iterate_extra_t,
+ int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::iterate_internal_array(
+ const uint32_t left, const uint32_t right,
+ iterate_extra_t *const iterate_extra) const {
+ int r;
+ for (uint32_t i = left; i < right; ++i) {
+ r = f(this->d.a.values[this->d.a.start_idx + i], i, iterate_extra);
+ if (r != 0) {
+ return r;
+ }
+ }
+ return 0;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename iterate_extra_t,
+ int (*f)(omtdata_t *, const uint32_t, iterate_extra_t *const)>
+void omt<omtdata_t, omtdataout_t, supports_marks>::iterate_ptr_internal(
+ const uint32_t left, const uint32_t right, const subtree &st,
+ const uint32_t idx, iterate_extra_t *const iterate_extra) {
+ if (!st.is_null()) {
+ omt_node &n = this->d.t.nodes[st.get_index()];
+ const uint32_t idx_root = idx + this->nweight(n.left);
+ if (left < idx_root) {
+ this->iterate_ptr_internal<iterate_extra_t, f>(left, right, n.left, idx,
+ iterate_extra);
+ }
+ if (left <= idx_root && idx_root < right) {
+ int r = f(&n.value, idx_root, iterate_extra);
+ lazy_assert_zero(r);
+ }
+ if (idx_root + 1 < right) {
+ this->iterate_ptr_internal<iterate_extra_t, f>(
+ left, right, n.right, idx_root + 1, iterate_extra);
+ }
+ }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename iterate_extra_t,
+ int (*f)(omtdata_t *, const uint32_t, iterate_extra_t *const)>
+void omt<omtdata_t, omtdataout_t, supports_marks>::iterate_ptr_internal_array(
+ const uint32_t left, const uint32_t right,
+ iterate_extra_t *const iterate_extra) {
+ for (uint32_t i = left; i < right; ++i) {
+ int r = f(&this->d.a.values[this->d.a.start_idx + i], i, iterate_extra);
+ lazy_assert_zero(r);
+ }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename iterate_extra_t,
+ int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::iterate_internal(
+ const uint32_t left, const uint32_t right, const subtree &st,
+ const uint32_t idx, iterate_extra_t *const iterate_extra) const {
+ if (st.is_null()) {
+ return 0;
+ }
+ int r;
+ const omt_node &n = this->d.t.nodes[st.get_index()];
+ const uint32_t idx_root = idx + this->nweight(n.left);
+ if (left < idx_root) {
+ r = this->iterate_internal<iterate_extra_t, f>(left, right, n.left, idx,
+ iterate_extra);
+ if (r != 0) {
+ return r;
+ }
+ }
+ if (left <= idx_root && idx_root < right) {
+ r = f(n.value, idx_root, iterate_extra);
+ if (r != 0) {
+ return r;
+ }
+ }
+ if (idx_root + 1 < right) {
+ return this->iterate_internal<iterate_extra_t, f>(
+ left, right, n.right, idx_root + 1, iterate_extra);
+ }
+ return 0;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename iterate_extra_t,
+ int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::
+ iterate_and_mark_range_internal(const uint32_t left, const uint32_t right,
+ const subtree &st, const uint32_t idx,
+ iterate_extra_t *const iterate_extra) {
+ paranoid_invariant(!st.is_null());
+ int r;
+ omt_node &n = this->d.t.nodes[st.get_index()];
+ const uint32_t idx_root = idx + this->nweight(n.left);
+ if (left < idx_root && !n.left.is_null()) {
+ n.set_marks_below_bit();
+ r = this->iterate_and_mark_range_internal<iterate_extra_t, f>(
+ left, right, n.left, idx, iterate_extra);
+ if (r != 0) {
+ return r;
+ }
+ }
+ if (left <= idx_root && idx_root < right) {
+ n.set_marked_bit();
+ r = f(n.value, idx_root, iterate_extra);
+ if (r != 0) {
+ return r;
+ }
+ }
+ if (idx_root + 1 < right && !n.right.is_null()) {
+ n.set_marks_below_bit();
+ return this->iterate_and_mark_range_internal<iterate_extra_t, f>(
+ left, right, n.right, idx_root + 1, iterate_extra);
+ }
+ return 0;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename iterate_extra_t,
+ int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::iterate_over_marked_internal(
+ const subtree &st, const uint32_t idx,
+ iterate_extra_t *const iterate_extra) const {
+ if (st.is_null()) {
+ return 0;
+ }
+ int r;
+ const omt_node &n = this->d.t.nodes[st.get_index()];
+ const uint32_t idx_root = idx + this->nweight(n.left);
+ if (n.get_marks_below()) {
+ r = this->iterate_over_marked_internal<iterate_extra_t, f>(n.left, idx,
+ iterate_extra);
+ if (r != 0) {
+ return r;
+ }
+ }
+ if (n.get_marked()) {
+ r = f(n.value, idx_root, iterate_extra);
+ if (r != 0) {
+ return r;
+ }
+ }
+ if (n.get_marks_below()) {
+ return this->iterate_over_marked_internal<iterate_extra_t, f>(
+ n.right, idx_root + 1, iterate_extra);
+ }
+ return 0;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::fetch_internal_array(
+ const uint32_t i, omtdataout_t *const value) const {
+ if (value != nullptr) {
+ copyout(value, &this->d.a.values[this->d.a.start_idx + i]);
+ }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::fetch_internal(
+ const subtree &st, const uint32_t i, omtdataout_t *const value) const {
+ omt_node &n = this->d.t.nodes[st.get_index()];
+ const uint32_t leftweight = this->nweight(n.left);
+ if (i < leftweight) {
+ this->fetch_internal(n.left, i, value);
+ } else if (i == leftweight) {
+ if (value != nullptr) {
+ copyout(value, &n);
+ }
+ } else {
+ this->fetch_internal(n.right, i - leftweight - 1, value);
+ }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::fill_array_with_subtree_idxs(
+ node_idx *const array, const subtree &st) const {
+ if (!st.is_null()) {
+ const omt_node &tree = this->d.t.nodes[st.get_index()];
+ this->fill_array_with_subtree_idxs(&array[0], tree.left);
+ array[this->nweight(tree.left)] = st.get_index();
+ this->fill_array_with_subtree_idxs(&array[this->nweight(tree.left) + 1],
+ tree.right);
+ }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::rebuild_subtree_from_idxs(
+ subtree *const st, const node_idx *const idxs, const uint32_t numvalues) {
+ if (numvalues == 0) {
+ st->set_to_null();
+ } else {
+ uint32_t halfway = numvalues / 2;
+ st->set_index(idxs[halfway]);
+ // node_idx newidx = idxs[halfway];
+ omt_node &newnode = this->d.t.nodes[st->get_index()];
+ newnode.weight = numvalues;
+ // value is already in there.
+ this->rebuild_subtree_from_idxs(&newnode.left, &idxs[0], halfway);
+ this->rebuild_subtree_from_idxs(&newnode.right, &idxs[halfway + 1],
+ numvalues - (halfway + 1));
+ // n_idx = newidx;
+ }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::rebalance(
+ subtree *const st) {
+ node_idx idx = st->get_index();
+ if (idx == this->d.t.root.get_index()) {
+ // Try to convert to an array.
+ // If this fails, (malloc) nothing will have changed.
+ // In the failure case we continue on to the standard rebalance
+ // algorithm.
+ this->convert_to_array();
+ if (supports_marks) {
+ this->convert_to_tree();
+ }
+ } else {
+ const omt_node &n = this->d.t.nodes[idx];
+ node_idx *tmp_array;
+ size_t mem_needed = n.weight * (sizeof tmp_array[0]);
+ size_t mem_free =
+ (this->capacity - this->d.t.free_idx) * (sizeof this->d.t.nodes[0]);
+ bool malloced;
+ if (mem_needed <= mem_free) {
+ // There is sufficient free space at the end of the nodes array
+ // to hold enough node indexes to rebalance.
+ malloced = false;
+ tmp_array =
+ reinterpret_cast<node_idx *>(&this->d.t.nodes[this->d.t.free_idx]);
+ } else {
+ malloced = true;
+ XMALLOC_N(n.weight, tmp_array);
+ }
+ this->fill_array_with_subtree_idxs(tmp_array, *st);
+ this->rebuild_subtree_from_idxs(st, tmp_array, n.weight);
+ if (malloced) toku_free(tmp_array);
+ }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::copyout(
+ omtdata_t *const out, const omt_node *const n) {
+ *out = n->value;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::copyout(
+ omtdata_t **const out, omt_node *const n) {
+ *out = &n->value;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::copyout(
+ omtdata_t *const out, const omtdata_t *const stored_value_ptr) {
+ *out = *stored_value_ptr;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::copyout(
+ omtdata_t **const out, omtdata_t *const stored_value_ptr) {
+ *out = stored_value_ptr;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::find_internal_zero_array(
+ const omtcmp_t &extra, omtdataout_t *const value,
+ uint32_t *const idxp) const {
+ paranoid_invariant_notnull(idxp);
+ uint32_t min = this->d.a.start_idx;
+ uint32_t limit = this->d.a.start_idx + this->d.a.num_values;
+ uint32_t best_pos = subtree::NODE_NULL;
+ uint32_t best_zero = subtree::NODE_NULL;
+
+ while (min != limit) {
+ uint32_t mid = (min + limit) / 2;
+ int hv = h(this->d.a.values[mid], extra);
+ if (hv < 0) {
+ min = mid + 1;
+ } else if (hv > 0) {
+ best_pos = mid;
+ limit = mid;
+ } else {
+ best_zero = mid;
+ limit = mid;
+ }
+ }
+ if (best_zero != subtree::NODE_NULL) {
+ // Found a zero
+ if (value != nullptr) {
+ copyout(value, &this->d.a.values[best_zero]);
+ }
+ *idxp = best_zero - this->d.a.start_idx;
+ return 0;
+ }
+ if (best_pos != subtree::NODE_NULL)
+ *idxp = best_pos - this->d.a.start_idx;
+ else
+ *idxp = this->d.a.num_values;
+ return DB_NOTFOUND;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::find_internal_zero(
+ const subtree &st, const omtcmp_t &extra, omtdataout_t *const value,
+ uint32_t *const idxp) const {
+ paranoid_invariant_notnull(idxp);
+ if (st.is_null()) {
+ *idxp = 0;
+ return DB_NOTFOUND;
+ }
+ omt_node &n = this->d.t.nodes[st.get_index()];
+ int hv = h(n.value, extra);
+ if (hv < 0) {
+ int r = this->find_internal_zero<omtcmp_t, h>(n.right, extra, value, idxp);
+ *idxp += this->nweight(n.left) + 1;
+ return r;
+ } else if (hv > 0) {
+ return this->find_internal_zero<omtcmp_t, h>(n.left, extra, value, idxp);
+ } else {
+ int r = this->find_internal_zero<omtcmp_t, h>(n.left, extra, value, idxp);
+ if (r == DB_NOTFOUND) {
+ *idxp = this->nweight(n.left);
+ if (value != nullptr) {
+ copyout(value, &n);
+ }
+ r = 0;
+ }
+ return r;
+ }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::find_internal_plus_array(
+ const omtcmp_t &extra, omtdataout_t *const value,
+ uint32_t *const idxp) const {
+ paranoid_invariant_notnull(idxp);
+ uint32_t min = this->d.a.start_idx;
+ uint32_t limit = this->d.a.start_idx + this->d.a.num_values;
+ uint32_t best = subtree::NODE_NULL;
+
+ while (min != limit) {
+ const uint32_t mid = (min + limit) / 2;
+ const int hv = h(this->d.a.values[mid], extra);
+ if (hv > 0) {
+ best = mid;
+ limit = mid;
+ } else {
+ min = mid + 1;
+ }
+ }
+ if (best == subtree::NODE_NULL) {
+ return DB_NOTFOUND;
+ }
+ if (value != nullptr) {
+ copyout(value, &this->d.a.values[best]);
+ }
+ *idxp = best - this->d.a.start_idx;
+ return 0;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::find_internal_plus(
+ const subtree &st, const omtcmp_t &extra, omtdataout_t *const value,
+ uint32_t *const idxp) const {
+ paranoid_invariant_notnull(idxp);
+ if (st.is_null()) {
+ return DB_NOTFOUND;
+ }
+ omt_node *const n = &this->d.t.nodes[st.get_index()];
+ int hv = h(n->value, extra);
+ int r;
+ if (hv > 0) {
+ r = this->find_internal_plus<omtcmp_t, h>(n->left, extra, value, idxp);
+ if (r == DB_NOTFOUND) {
+ *idxp = this->nweight(n->left);
+ if (value != nullptr) {
+ copyout(value, n);
+ }
+ r = 0;
+ }
+ } else {
+ r = this->find_internal_plus<omtcmp_t, h>(n->right, extra, value, idxp);
+ if (r == 0) {
+ *idxp += this->nweight(n->left) + 1;
+ }
+ }
+ return r;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::find_internal_minus_array(
+ const omtcmp_t &extra, omtdataout_t *const value,
+ uint32_t *const idxp) const {
+ paranoid_invariant_notnull(idxp);
+ uint32_t min = this->d.a.start_idx;
+ uint32_t limit = this->d.a.start_idx + this->d.a.num_values;
+ uint32_t best = subtree::NODE_NULL;
+
+ while (min != limit) {
+ const uint32_t mid = (min + limit) / 2;
+ const int hv = h(this->d.a.values[mid], extra);
+ if (hv < 0) {
+ best = mid;
+ min = mid + 1;
+ } else {
+ limit = mid;
+ }
+ }
+ if (best == subtree::NODE_NULL) {
+ return DB_NOTFOUND;
+ }
+ if (value != nullptr) {
+ copyout(value, &this->d.a.values[best]);
+ }
+ *idxp = best - this->d.a.start_idx;
+ return 0;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::find_internal_minus(
+ const subtree &st, const omtcmp_t &extra, omtdataout_t *const value,
+ uint32_t *const idxp) const {
+ paranoid_invariant_notnull(idxp);
+ if (st.is_null()) {
+ return DB_NOTFOUND;
+ }
+ omt_node *const n = &this->d.t.nodes[st.get_index()];
+ int hv = h(n->value, extra);
+ if (hv < 0) {
+ int r =
+ this->find_internal_minus<omtcmp_t, h>(n->right, extra, value, idxp);
+ if (r == 0) {
+ *idxp += this->nweight(n->left) + 1;
+ } else if (r == DB_NOTFOUND) {
+ *idxp = this->nweight(n->left);
+ if (value != nullptr) {
+ copyout(value, n);
+ }
+ r = 0;
+ }
+ return r;
+ } else {
+ return this->find_internal_minus<omtcmp_t, h>(n->left, extra, value, idxp);
+ }
+}
+} // namespace toku
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/partitioned_counter.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/partitioned_counter.h
new file mode 100644
index 000000000..f20eeedf2
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/partitioned_counter.h
@@ -0,0 +1,165 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+// Overview: A partitioned_counter provides a counter that can be incremented
+// and the running sum can be read at any time.
+// We assume that increments are frequent, whereas reading is infrequent.
+// Implementation hint: Use thread-local storage so each thread increments its
+// own data. The increment does not require a lock or atomic operation.
+// Reading the data can be performed by iterating over the thread-local
+// versions, summing them up. The data structure also includes a sum for all
+// the threads that have died. Use a pthread_key to create the thread-local
+// versions. When a thread finishes, the system calls the pthread_key
+// destructor, which can add that thread's copy into the sum_of_dead counter.
+// Rationale: For statistics such as those found in engine status, we need a
+// counter that requires no cache misses to increment. We've seen significant
+// performance speedups by removing certain counters. Rather than removing
+// those statistics, we would like to just make the counter fast. We generally
+// increment the counters frequently, and want to fetch the values
+// infrequently. The counters are monotonic. The counters can be split into
+// many counters, which can be summed up at the end. We don't care if we get
+// slightly out-of-date counter sums when we read the counter. We don't care
+// if there is a race between reading a counter variable and incrementing it.
+// See tests/test_partitioned_counter.c for some performance measurements.
+// Operations:
+//   create_partitioned_counter     Create a counter initialized to zero.
+//   destroy_partitioned_counter    Destroy it.
+//   increment_partitioned_counter  Increment it. This is the frequent
+//                                  operation.
+//   read_partitioned_counter       Get the current value. This is infrequent.
+// See partitioned_counter.cc for the abstraction function and representation
+// invariant.
+//
+// The google style guide says to avoid using constructors, and it appears that
+// constructors may have broken all the tests, because they called
+// pthread_key_create before the key was actually created. So the google style
+// guide may have some wisdom there...
+//
+// This version does not use constructors, essentially reverting to the google
+// C++ style guide.
+//
+
+// The old C interface. This required a bunch of explicit
+// __attribute__((__destructor__)) functions to remember to destroy counters at
+// the end.
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+typedef struct partitioned_counter *PARTITIONED_COUNTER;
+PARTITIONED_COUNTER create_partitioned_counter(void);
+// Effect: Create a counter, initialized to zero.
+
+void destroy_partitioned_counter(PARTITIONED_COUNTER);
+// Effect: Destroy the counter. No operations on that counter are permitted
+// after this.
+
+void increment_partitioned_counter(PARTITIONED_COUNTER, uint64_t amount);
+// Effect: Increment the counter by amount.
+// Requires: No overflows. This is a 64-bit unsigned counter.
+
+uint64_t read_partitioned_counter(PARTITIONED_COUNTER)
+ __attribute__((__visibility__("default")));
+// Effect: Return the current value of the counter.
+
+void partitioned_counters_init(void);
+// Effect: Initialize any partitioned counters data structures that must be set
+// up before any partitioned counters run.
+
+void partitioned_counters_destroy(void);
+// Effect: Destroy any partitioned counters data structures.
+
+#if defined(__cplusplus)
+};
+#endif
+
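+// A minimal usage sketch of the C interface declared above (an editorial
+// illustration, not part of the upstream header; the function names
+// counters_setup/on_read/counters_report_and_teardown are made up for the
+// example, and the block is kept under "#if 0" like the old C++ interface
+// below so it is never compiled):
+#if 0
+#include <stdio.h>
+
+static PARTITIONED_COUNTER n_reads;
+
+static void counters_setup(void) {
+    partitioned_counters_init();
+    n_reads = create_partitioned_counter();
+}
+
+static void on_read(void) {
+    // The frequent path: no lock and no shared cache line is touched.
+    increment_partitioned_counter(n_reads, 1);
+}
+
+static void counters_report_and_teardown(void) {
+    // The infrequent path: sums the live per-thread values plus the
+    // accumulated total of threads that have already exited.
+    printf("reads: %llu\n",
+           (unsigned long long)read_partitioned_counter(n_reads));
+    destroy_partitioned_counter(n_reads);
+    partitioned_counters_destroy();
+}
+#endif
+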
+#if 0
+#include <pthread.h>
+
+#include "fttypes.h"
+
+// Used inside the PARTITIONED_COUNTER.
+struct linked_list_head {
+ struct linked_list_element *first;
+};
+
+
+class PARTITIONED_COUNTER {
+public:
+ PARTITIONED_COUNTER(void);
+ // Effect: Construct a counter, initialized to zero.
+
+ ~PARTITIONED_COUNTER(void);
+ // Effect: Destruct the counter.
+
+ void increment(uint64_t amount);
+ // Effect: Increment the counter by amount. This is a 64-bit unsigned counter, and if you overflow it, you will get overflowed results (that is mod 2^64).
+ // Requires: Don't use this from a static constructor or destructor.
+
+ uint64_t read(void);
+ // Effect: Read the sum.
+ // Requires: Don't use this from a static constructor or destructor.
+
+private:
+ uint64_t _sum_of_dead; // The sum of all thread-local counts from threads that have terminated.
+ pthread_key_t _key; // The pthread_key which gives us the hook to construct and destruct thread-local storage.
+ struct linked_list_head _ll_counter_head; // A linked list of all the thread-local information for this counter.
+
+ // This function is used to destroy the thread-local part of the state when a thread terminates.
+ // But it's not the destructor for the local part of the counter, it's a destructor on a "dummy" key just so that we get a notification when a thread ends.
+ friend void destroy_thread_local_part_of_partitioned_counters (void *);
+};
+#endif
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/status.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/status.h
new file mode 100644
index 000000000..3fd0095d0
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/status.h
@@ -0,0 +1,76 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include "partitioned_counter.h"
+// PORT2: #include <util/constexpr.h>
+
+#define TOKUFT_STATUS_INIT(array, k, c, t, l, inc) \
+ do { \
+ array.status[k].keyname = #k; \
+ array.status[k].columnname = #c; \
+ array.status[k].type = t; \
+ array.status[k].legend = l; \
+ constexpr_static_assert( \
+ strcmp(#c, "NULL") && strcmp(#c, "0"), \
+ "Use nullptr for no column name instead of NULL, 0, etc..."); \
+ constexpr_static_assert( \
+ (inc) == TOKU_ENGINE_STATUS || strcmp(#c, "nullptr"), \
+ "Missing column name."); \
+ array.status[k].include = \
+ static_cast<toku_engine_status_include_type>(inc); \
+ if (t == STATUS_PARCOUNT) { \
+ array.status[k].value.parcount = create_partitioned_counter(); \
+ } \
+ } while (0)
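+
+// Example invocation (a hypothetical call, shown only to illustrate the
+// parameter order; the real call sites live in the locktree code):
+//
+//   TOKUFT_STATUS_INIT(ltm_status, LTM_WAIT_COUNT, nullptr, STATUS_PARCOUNT,
+//                      "number of lock waits", TOKU_ENGINE_STATUS);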
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc
new file mode 100644
index 000000000..531165dea
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc
@@ -0,0 +1,503 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+
+#include "utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <mutex>
+
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/transaction_db_mutex.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/hash.h"
+#include "util/thread_local.h"
+#include "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+#include "utilities/transactions/transaction_db_mutex_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+RangeLockManagerHandle* NewRangeLockManager(
+ std::shared_ptr<TransactionDBMutexFactory> mutex_factory) {
+ std::shared_ptr<TransactionDBMutexFactory> use_factory;
+
+ if (mutex_factory) {
+ use_factory = mutex_factory;
+ } else {
+ use_factory.reset(new TransactionDBMutexFactoryImpl());
+ }
+ return new RangeTreeLockManager(use_factory);
+}
+
+static const char SUFFIX_INFIMUM = 0x0;
+static const char SUFFIX_SUPREMUM = 0x1;
+
+// Convert an Endpoint into the internal format used for storing it in the
+// locktree (a DBT structure is used for passing endpoints to the locktree and
+// getting them back).
+void serialize_endpoint(const Endpoint& endp, std::string* buf) {
+ buf->push_back(endp.inf_suffix ? SUFFIX_SUPREMUM : SUFFIX_INFIMUM);
+ buf->append(endp.slice.data(), endp.slice.size());
+}
+
+// Decode an endpoint from the format in which it is stored in the locktree
+// (a DBT) to the one used outside: either Endpoint or EndpointWithString.
+template <typename EndpointStruct>
+void deserialize_endpoint(const DBT* dbt, EndpointStruct* endp) {
+ assert(dbt->size >= 1);
+ const char* dbt_data = (const char*)dbt->data;
+ char suffix = dbt_data[0];
+ assert(suffix == SUFFIX_INFIMUM || suffix == SUFFIX_SUPREMUM);
+ endp->inf_suffix = (suffix == SUFFIX_SUPREMUM);
+ endp->slice = decltype(EndpointStruct::slice)(dbt_data + 1, dbt->size - 1);
+}
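+
+// For illustration (an editorial note, not an upstream comment): an Endpoint
+// whose slice is "foo" serializes to "\x00foo" when inf_suffix is false and
+// to "\x01foo" when it is true; deserialize_endpoint() reverses this mapping.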
+
+// Get a range lock on the [start_key; end_key] range.
+Status RangeTreeLockManager::TryLock(PessimisticTransaction* txn,
+ uint32_t column_family_id,
+ const Endpoint& start_endp,
+ const Endpoint& end_endp, Env*,
+ bool exclusive) {
+ toku::lock_request request;
+ request.create(mutex_factory_);
+ DBT start_key_dbt, end_key_dbt;
+
+ TEST_SYNC_POINT("RangeTreeLockManager::TryRangeLock:enter");
+ std::string start_key;
+ std::string end_key;
+ serialize_endpoint(start_endp, &start_key);
+ serialize_endpoint(end_endp, &end_key);
+
+ toku_fill_dbt(&start_key_dbt, start_key.data(), start_key.size());
+ toku_fill_dbt(&end_key_dbt, end_key.data(), end_key.size());
+
+ auto lt = GetLockTreeForCF(column_family_id);
+
+ // Put the key waited on into request's m_extra. See
+ // wait_callback_for_locktree for details.
+ std::string wait_key(start_endp.slice.data(), start_endp.slice.size());
+
+ request.set(lt.get(), (TXNID)txn, &start_key_dbt, &end_key_dbt,
+ exclusive ? toku::lock_request::WRITE : toku::lock_request::READ,
+ false /* not a big txn */, &wait_key);
+
+ // This is for "periodically wake up and check if the wait is killed" feature
+ // which we are not using.
+ uint64_t killed_time_msec = 0;
+ uint64_t wait_time_msec = txn->GetLockTimeout();
+
+ if (wait_time_msec == static_cast<uint64_t>(-1)) {
+    // The transaction has no wait timeout. lock_request::wait doesn't support
+    // this; it needs a number of milliseconds to wait. Pass it one year to
+    // be safe.
+ wait_time_msec = uint64_t(1000) * 60 * 60 * 24 * 365;
+ } else {
+ // convert microseconds to milliseconds
+ wait_time_msec = (wait_time_msec + 500) / 1000;
+ }
+
+ std::vector<RangeDeadlockInfo> di_path;
+ request.m_deadlock_cb = [&](TXNID txnid, bool is_exclusive,
+ const DBT* start_dbt, const DBT* end_dbt) {
+ EndpointWithString start;
+ EndpointWithString end;
+ deserialize_endpoint(start_dbt, &start);
+ deserialize_endpoint(end_dbt, &end);
+
+ di_path.push_back({txnid, column_family_id, is_exclusive, std::move(start),
+ std::move(end)});
+ };
+
+ request.start();
+
+ const int r = request.wait(wait_time_msec, killed_time_msec,
+ nullptr, // killed_callback
+ wait_callback_for_locktree, nullptr);
+
+ // Inform the txn that we are no longer waiting:
+ txn->ClearWaitingTxn();
+
+ request.destroy();
+ switch (r) {
+ case 0:
+      break;  // lock acquired
+ case DB_LOCK_NOTGRANTED:
+ return Status::TimedOut(Status::SubCode::kLockTimeout);
+ case TOKUDB_OUT_OF_LOCKS:
+ return Status::Busy(Status::SubCode::kLockLimit);
+ case DB_LOCK_DEADLOCK: {
+ std::reverse(di_path.begin(), di_path.end());
+ dlock_buffer_.AddNewPath(
+ RangeDeadlockPath(di_path, request.get_start_time()));
+ return Status::Busy(Status::SubCode::kDeadlock);
+ }
+ default:
+ assert(0);
+ return Status::Busy(Status::SubCode::kLockLimit);
+ }
+
+ return Status::OK();
+}
+
+// Wait callback that the locktree library will call to inform us about
+// the lock waits that are in progress.
+void wait_callback_for_locktree(void*, toku::lock_wait_infos* infos) {
+ TEST_SYNC_POINT("RangeTreeLockManager::TryRangeLock:EnterWaitingTxn");
+ for (auto wait_info : *infos) {
+ // As long as we hold the lock on the locktree's pending request queue
+ // this should be safe.
+ auto txn = (PessimisticTransaction*)wait_info.waiter;
+ auto cf_id = (ColumnFamilyId)wait_info.ltree->get_dict_id().dictid;
+
+ autovector<TransactionID> waitee_ids;
+ for (auto waitee : wait_info.waitees) {
+ waitee_ids.push_back(waitee);
+ }
+ txn->SetWaitingTxn(waitee_ids, cf_id, (std::string*)wait_info.m_extra);
+ }
+
+ // Here we can assume that the locktree code will now wait for some lock
+ TEST_SYNC_POINT("RangeTreeLockManager::TryRangeLock:WaitingTxn");
+}
+
+void RangeTreeLockManager::UnLock(PessimisticTransaction* txn,
+ ColumnFamilyId column_family_id,
+ const std::string& key, Env*) {
+ auto locktree = GetLockTreeForCF(column_family_id);
+ std::string endp_image;
+ serialize_endpoint({key.data(), key.size(), false}, &endp_image);
+
+ DBT key_dbt;
+ toku_fill_dbt(&key_dbt, endp_image.data(), endp_image.size());
+
+ toku::range_buffer range_buf;
+ range_buf.create();
+ range_buf.append(&key_dbt, &key_dbt);
+
+ locktree->release_locks((TXNID)txn, &range_buf);
+ range_buf.destroy();
+
+ toku::lock_request::retry_all_lock_requests(
+ locktree.get(), wait_callback_for_locktree, nullptr);
+}
+
+void RangeTreeLockManager::UnLock(PessimisticTransaction* txn,
+ const LockTracker& tracker, Env*) {
+ const RangeTreeLockTracker* range_tracker =
+ static_cast<const RangeTreeLockTracker*>(&tracker);
+
+ RangeTreeLockTracker* range_trx_tracker =
+ static_cast<RangeTreeLockTracker*>(&txn->GetTrackedLocks());
+ bool all_keys = (range_trx_tracker == range_tracker);
+
+ // tracked_locks_->range_list may hold nullptr if the transaction has never
+ // acquired any locks.
+ ((RangeTreeLockTracker*)range_tracker)->ReleaseLocks(this, txn, all_keys);
+}
+
+int RangeTreeLockManager::CompareDbtEndpoints(void* arg, const DBT* a_key,
+ const DBT* b_key) {
+ const char* a = (const char*)a_key->data;
+ const char* b = (const char*)b_key->data;
+
+ size_t a_len = a_key->size;
+ size_t b_len = b_key->size;
+
+ size_t min_len = std::min(a_len, b_len);
+
+  // Compare the values. The first byte encodes the endpoint type; its value
+ // is either SUFFIX_INFIMUM or SUFFIX_SUPREMUM.
+ Comparator* cmp = (Comparator*)arg;
+ int res = cmp->Compare(Slice(a + 1, min_len - 1), Slice(b + 1, min_len - 1));
+ if (!res) {
+ if (b_len > min_len) {
+ // a is shorter;
+ if (a[0] == SUFFIX_INFIMUM) {
+ return -1; //"a is smaller"
+ } else {
+ // a is considered padded with 0xFF:FF:FF:FF...
+ return 1; // "a" is bigger
+ }
+ } else if (a_len > min_len) {
+ // the opposite of the above: b is shorter.
+ if (b[0] == SUFFIX_INFIMUM) {
+ return 1; //"b is smaller"
+ } else {
+ // b is considered padded with 0xFF:FF:FF:FF...
+ return -1; // "b" is bigger
+ }
+ } else {
+ // the lengths are equal (and the key values, too)
+ if (a[0] < b[0]) {
+ return -1;
+ } else if (a[0] > b[0]) {
+ return 1;
+ } else {
+ return 0;
+ }
+ }
+ } else {
+ return res;
+ }
+}
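+
+// Illustration (editorial, assuming a bytewise comparator): the encoding and
+// padding rules above order endpoints as
+//   {"a", INFIMUM} < {"ab", INFIMUM} < {"ab", SUPREMUM} < {"a", SUPREMUM}
+// i.e. the supremum endpoint of "a" sorts after every key that has "a" as a
+// prefix.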
+
+namespace {
+void UnrefLockTreeMapsCache(void* ptr) {
+ // Called when a thread exits or a ThreadLocalPtr gets destroyed.
+ auto lock_tree_map_cache = static_cast<
+ std::unordered_map<ColumnFamilyId, std::shared_ptr<toku::locktree>>*>(
+ ptr);
+ delete lock_tree_map_cache;
+}
+} // anonymous namespace
+
+RangeTreeLockManager::RangeTreeLockManager(
+ std::shared_ptr<TransactionDBMutexFactory> mutex_factory)
+ : mutex_factory_(mutex_factory),
+ ltree_lookup_cache_(new ThreadLocalPtr(&UnrefLockTreeMapsCache)),
+ dlock_buffer_(10) {
+ ltm_.create(on_create, on_destroy, on_escalate, nullptr, mutex_factory_);
+}
+
+int RangeTreeLockManager::on_create(toku::locktree* lt, void* arg) {
+ // arg is a pointer to RangeTreeLockManager
+ lt->set_escalation_barrier_func(&OnEscalationBarrierCheck, arg);
+ return 0;
+}
+
+bool RangeTreeLockManager::OnEscalationBarrierCheck(const DBT* a, const DBT* b,
+ void* extra) {
+ Endpoint a_endp, b_endp;
+ deserialize_endpoint(a, &a_endp);
+ deserialize_endpoint(b, &b_endp);
+ auto self = static_cast<RangeTreeLockManager*>(extra);
+ return self->barrier_func_(a_endp, b_endp);
+}
+
+void RangeTreeLockManager::SetRangeDeadlockInfoBufferSize(
+ uint32_t target_size) {
+ dlock_buffer_.Resize(target_size);
+}
+
+void RangeTreeLockManager::Resize(uint32_t target_size) {
+ SetRangeDeadlockInfoBufferSize(target_size);
+}
+
+std::vector<RangeDeadlockPath>
+RangeTreeLockManager::GetRangeDeadlockInfoBuffer() {
+ return dlock_buffer_.PrepareBuffer();
+}
+
+std::vector<DeadlockPath> RangeTreeLockManager::GetDeadlockInfoBuffer() {
+ std::vector<DeadlockPath> res;
+ std::vector<RangeDeadlockPath> data = GetRangeDeadlockInfoBuffer();
+ // report left endpoints
+ for (auto it = data.begin(); it != data.end(); ++it) {
+ std::vector<DeadlockInfo> path;
+
+ for (auto it2 = it->path.begin(); it2 != it->path.end(); ++it2) {
+ path.push_back(
+ {it2->m_txn_id, it2->m_cf_id, it2->m_exclusive, it2->m_start.slice});
+ }
+ res.push_back(DeadlockPath(path, it->deadlock_time));
+ }
+ return res;
+}
+
+// @brief Lock Escalation Callback function
+//
+// @param txnid Transaction whose locks got escalated
+// @param lt Lock Tree where escalation is happening
+// @param buffer Escalation result: list of locks that this transaction now
+// owns in this lock tree.
+// @param void* Callback context
+void RangeTreeLockManager::on_escalate(TXNID txnid, const toku::locktree* lt,
+ const toku::range_buffer& buffer,
+ void*) {
+ auto txn = (PessimisticTransaction*)txnid;
+ ((RangeTreeLockTracker*)&txn->GetTrackedLocks())->ReplaceLocks(lt, buffer);
+}
+
+RangeTreeLockManager::~RangeTreeLockManager() {
+ autovector<void*> local_caches;
+ ltree_lookup_cache_->Scrape(&local_caches, nullptr);
+ for (auto cache : local_caches) {
+ delete static_cast<LockTreeMap*>(cache);
+ }
+ ltree_map_.clear(); // this will call release_lt() for all locktrees
+ ltm_.destroy();
+}
+
+RangeLockManagerHandle::Counters RangeTreeLockManager::GetStatus() {
+ LTM_STATUS_S ltm_status_test;
+ ltm_.get_status(&ltm_status_test);
+ Counters res;
+
+  // Searching for a status variable by its string name is how Toku's unit
+  // tests do it (why didn't they make the LTM_ESCALATION_COUNT constant
+  // visible?).
+ // lookup keyname in status
+ for (int i = 0; i < LTM_STATUS_S::LTM_STATUS_NUM_ROWS; i++) {
+ TOKU_ENGINE_STATUS_ROW status = &ltm_status_test.status[i];
+ if (strcmp(status->keyname, "LTM_ESCALATION_COUNT") == 0) {
+ res.escalation_count = status->value.num;
+ continue;
+ }
+ if (strcmp(status->keyname, "LTM_WAIT_COUNT") == 0) {
+ res.lock_wait_count = status->value.num;
+ continue;
+ }
+ if (strcmp(status->keyname, "LTM_SIZE_CURRENT") == 0) {
+ res.current_lock_memory = status->value.num;
+ }
+ }
+ return res;
+}
+
+std::shared_ptr<toku::locktree> RangeTreeLockManager::MakeLockTreePtr(
+ toku::locktree* lt) {
+ toku::locktree_manager* ltm = &ltm_;
+ return std::shared_ptr<toku::locktree>(
+ lt, [ltm](toku::locktree* p) { ltm->release_lt(p); });
+}
+
+void RangeTreeLockManager::AddColumnFamily(const ColumnFamilyHandle* cfh) {
+ uint32_t column_family_id = cfh->GetID();
+
+ InstrumentedMutexLock l(&ltree_map_mutex_);
+ if (ltree_map_.find(column_family_id) == ltree_map_.end()) {
+ DICTIONARY_ID dict_id = {.dictid = column_family_id};
+ toku::comparator cmp;
+ cmp.create(CompareDbtEndpoints, (void*)cfh->GetComparator());
+ toku::locktree* ltree =
+ ltm_.get_lt(dict_id, cmp,
+ /* on_create_extra*/ static_cast<void*>(this));
+    // This is ok because get_lt has copied the comparator:
+ cmp.destroy();
+
+ ltree_map_.insert({column_family_id, MakeLockTreePtr(ltree)});
+ }
+}
+
+void RangeTreeLockManager::RemoveColumnFamily(const ColumnFamilyHandle* cfh) {
+ uint32_t column_family_id = cfh->GetID();
+ // Remove lock_map for this column family. Since the lock map is stored
+ // as a shared ptr, concurrent transactions can still keep using it
+ // until they release their references to it.
+
+  // TODO: what if one drops a column family while transaction(s) still have
+  // locks in it?
+  // The locktree uses the column family's Comparator* as the criterion for
+  // tree ordering. If the comparator is gone, we won't even be able to remove
+  // the elements from the locktree.
+  // A possible solution might be to remove everything right now:
+  // - wait until everyone traversing the locktree is gone
+  // - remove everything from the locktree.
+  // - some transactions may have acquired locks in their LockTracker objects.
+  //   Arrange something so we don't blow up when they try to release them.
+  // - ...
+  // This use case (dropping a column family while somebody is using it)
+  // doesn't seem to be a priority, though.
+
+ {
+ InstrumentedMutexLock l(&ltree_map_mutex_);
+
+ auto lock_maps_iter = ltree_map_.find(column_family_id);
+ assert(lock_maps_iter != ltree_map_.end());
+ ltree_map_.erase(lock_maps_iter);
+ } // lock_map_mutex_
+
+ autovector<void*> local_caches;
+ ltree_lookup_cache_->Scrape(&local_caches, nullptr);
+ for (auto cache : local_caches) {
+ delete static_cast<LockTreeMap*>(cache);
+ }
+}
+
+std::shared_ptr<toku::locktree> RangeTreeLockManager::GetLockTreeForCF(
+ ColumnFamilyId column_family_id) {
+ // First check thread-local cache
+ if (ltree_lookup_cache_->Get() == nullptr) {
+ ltree_lookup_cache_->Reset(new LockTreeMap());
+ }
+
+ auto ltree_map_cache = static_cast<LockTreeMap*>(ltree_lookup_cache_->Get());
+
+ auto it = ltree_map_cache->find(column_family_id);
+ if (it != ltree_map_cache->end()) {
+ // Found lock map for this column family.
+ return it->second;
+ }
+
+ // Not found in local cache, grab mutex and check shared LockMaps
+ InstrumentedMutexLock l(&ltree_map_mutex_);
+
+ it = ltree_map_.find(column_family_id);
+ if (it == ltree_map_.end()) {
+ return nullptr;
+ } else {
+ // Found lock map. Store in thread-local cache and return.
+ ltree_map_cache->insert({column_family_id, it->second});
+ return it->second;
+ }
+}
+
+struct LOCK_PRINT_CONTEXT {
+ RangeLockManagerHandle::RangeLockStatus* data; // Save locks here
+ uint32_t cfh_id; // Column Family whose tree we are traversing
+};
+
+// Report left endpoints of the acquired locks
+LockManager::PointLockStatus RangeTreeLockManager::GetPointLockStatus() {
+ PointLockStatus res;
+ LockManager::RangeLockStatus data = GetRangeLockStatus();
+ // report left endpoints
+ for (auto it = data.begin(); it != data.end(); ++it) {
+ auto& val = it->second;
+ res.insert({it->first, {val.start.slice, val.ids, val.exclusive}});
+ }
+ return res;
+}
+
+static void push_into_lock_status_data(void* param, const DBT* left,
+ const DBT* right, TXNID txnid_arg,
+ bool is_shared, TxnidVector* owners) {
+ struct LOCK_PRINT_CONTEXT* ctx = (LOCK_PRINT_CONTEXT*)param;
+ struct RangeLockInfo info;
+
+ info.exclusive = !is_shared;
+
+ deserialize_endpoint(left, &info.start);
+ deserialize_endpoint(right, &info.end);
+
+ if (txnid_arg != TXNID_SHARED) {
+ info.ids.push_back(txnid_arg);
+ } else {
+ for (auto it : *owners) {
+ info.ids.push_back(it);
+ }
+ }
+ ctx->data->insert({ctx->cfh_id, info});
+}
+
+LockManager::RangeLockStatus RangeTreeLockManager::GetRangeLockStatus() {
+ LockManager::RangeLockStatus data;
+ {
+ InstrumentedMutexLock l(&ltree_map_mutex_);
+ for (auto it : ltree_map_) {
+ LOCK_PRINT_CONTEXT ctx = {&data, it.first};
+ it.second->dump_locks((void*)&ctx, push_into_lock_status_data);
+ }
+ }
+ return data;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // OS_WIN
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h
new file mode 100644
index 000000000..e4236d600
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h
@@ -0,0 +1,137 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+
+// For DeadlockInfoBuffer:
+#include "util/thread_local.h"
+#include "utilities/transactions/lock/point/point_lock_manager.h"
+#include "utilities/transactions/lock/range/range_lock_manager.h"
+
+// Lock Tree library:
+#include "utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h"
+#include "utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h"
+#include "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+typedef DeadlockInfoBufferTempl<RangeDeadlockPath> RangeDeadlockInfoBuffer;
+
+// A Range Lock Manager that uses PerconaFT's locktree library
+class RangeTreeLockManager : public RangeLockManagerBase,
+ public RangeLockManagerHandle {
+ public:
+ LockManager* getLockManager() override { return this; }
+
+ void AddColumnFamily(const ColumnFamilyHandle* cfh) override;
+ void RemoveColumnFamily(const ColumnFamilyHandle* cfh) override;
+
+ void Resize(uint32_t) override;
+ std::vector<DeadlockPath> GetDeadlockInfoBuffer() override;
+
+ std::vector<RangeDeadlockPath> GetRangeDeadlockInfoBuffer() override;
+ void SetRangeDeadlockInfoBufferSize(uint32_t target_size) override;
+
+ // Get a lock on a range
+ // @note only exclusive locks are currently supported (requesting a
+ // non-exclusive lock will get an exclusive one)
+ using LockManager::TryLock;
+ Status TryLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
+ const Endpoint& start_endp, const Endpoint& end_endp, Env* env,
+ bool exclusive) override;
+
+ void UnLock(PessimisticTransaction* txn, const LockTracker& tracker,
+ Env* env) override;
+ void UnLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
+ const std::string& key, Env* env) override;
+ void UnLock(PessimisticTransaction*, ColumnFamilyId, const Endpoint&,
+ const Endpoint&, Env*) override {
+ // TODO: range unlock does nothing...
+ }
+
+ explicit RangeTreeLockManager(
+ std::shared_ptr<TransactionDBMutexFactory> mutex_factory);
+
+ ~RangeTreeLockManager() override;
+
+ int SetMaxLockMemory(size_t max_lock_memory) override {
+ return ltm_.set_max_lock_memory(max_lock_memory);
+ }
+
+ size_t GetMaxLockMemory() override { return ltm_.get_max_lock_memory(); }
+
+ Counters GetStatus() override;
+
+ bool IsPointLockSupported() const override {
+    // One could have acquired a point lock (it is reduced to a range lock)
+ return true;
+ }
+
+ PointLockStatus GetPointLockStatus() override;
+
+ // This is from LockManager
+ LockManager::RangeLockStatus GetRangeLockStatus() override;
+
+ // This has the same meaning as GetRangeLockStatus but is from
+ // RangeLockManagerHandle
+ RangeLockManagerHandle::RangeLockStatus GetRangeLockStatusData() override {
+ return GetRangeLockStatus();
+ }
+
+ bool IsRangeLockSupported() const override { return true; }
+
+ const LockTrackerFactory& GetLockTrackerFactory() const override {
+ return RangeTreeLockTrackerFactory::Get();
+ }
+
+ // Get the locktree which stores locks for the Column Family with given cf_id
+ std::shared_ptr<toku::locktree> GetLockTreeForCF(ColumnFamilyId cf_id);
+
+ void SetEscalationBarrierFunc(EscalationBarrierFunc func) override {
+ barrier_func_ = func;
+ }
+
+ private:
+ toku::locktree_manager ltm_;
+
+ EscalationBarrierFunc barrier_func_ =
+ [](const Endpoint&, const Endpoint&) -> bool { return false; };
+
+ std::shared_ptr<TransactionDBMutexFactory> mutex_factory_;
+
+  // Map from cf_id to locktree*. Can only be accessed while holding
+  // ltree_map_mutex_. Entries must use a custom deleter that calls
+  // ltm_.release_lt().
+ using LockTreeMap =
+ std::unordered_map<ColumnFamilyId, std::shared_ptr<toku::locktree>>;
+ LockTreeMap ltree_map_;
+
+ InstrumentedMutex ltree_map_mutex_;
+
+ // Per-thread cache of ltree_map_.
+ // (uses the same approach as TransactionLockMgr::lock_maps_cache_)
+ std::unique_ptr<ThreadLocalPtr> ltree_lookup_cache_;
+
+ RangeDeadlockInfoBuffer dlock_buffer_;
+
+ std::shared_ptr<toku::locktree> MakeLockTreePtr(toku::locktree* lt);
+ static int CompareDbtEndpoints(void* arg, const DBT* a_key, const DBT* b_key);
+
+ // Callbacks
+ static int on_create(toku::locktree*, void*);
+ static void on_destroy(toku::locktree*) {}
+ static void on_escalate(TXNID txnid, const toku::locktree* lt,
+ const toku::range_buffer& buffer, void* extra);
+
+ static bool OnEscalationBarrierCheck(const DBT* a, const DBT* b, void* extra);
+};
+
+void serialize_endpoint(const Endpoint& endp, std::string* buf);
+void wait_callback_for_locktree(void* cdata, toku::lock_wait_infos* infos);
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // OS_WIN
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc
new file mode 100644
index 000000000..be1e1478b
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc
@@ -0,0 +1,156 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+
+#include "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h"
+
+#include "utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+RangeLockList *RangeTreeLockTracker::getOrCreateList() {
+ if (range_list_) return range_list_.get();
+
+ // Doesn't exist, create
+ range_list_.reset(new RangeLockList());
+ return range_list_.get();
+}
+
+void RangeTreeLockTracker::Track(const PointLockRequest &lock_req) {
+ DBT key_dbt;
+ std::string key;
+ serialize_endpoint(Endpoint(lock_req.key, false), &key);
+ toku_fill_dbt(&key_dbt, key.data(), key.size());
+ RangeLockList *rl = getOrCreateList();
+ rl->Append(lock_req.column_family_id, &key_dbt, &key_dbt);
+}
+
+void RangeTreeLockTracker::Track(const RangeLockRequest &lock_req) {
+ DBT start_dbt, end_dbt;
+ std::string start_key, end_key;
+
+ serialize_endpoint(lock_req.start_endp, &start_key);
+ serialize_endpoint(lock_req.end_endp, &end_key);
+
+ toku_fill_dbt(&start_dbt, start_key.data(), start_key.size());
+ toku_fill_dbt(&end_dbt, end_key.data(), end_key.size());
+
+ RangeLockList *rl = getOrCreateList();
+ rl->Append(lock_req.column_family_id, &start_dbt, &end_dbt);
+}
+
+PointLockStatus RangeTreeLockTracker::GetPointLockStatus(
+ ColumnFamilyId /*cf_id*/, const std::string & /*key*/) const {
+ // This function is not expected to be called as RangeTreeLockTracker::
+ // IsPointLockSupported() returns false. Return the status which indicates
+ // the point is not locked.
+ PointLockStatus p;
+ p.locked = false;
+ p.exclusive = true;
+ p.seq = 0;
+ return p;
+}
+
+void RangeTreeLockTracker::Clear() { range_list_.reset(); }
+
+void RangeLockList::Append(ColumnFamilyId cf_id, const DBT *left_key,
+ const DBT *right_key) {
+ MutexLock l(&mutex_);
+ // Only the transaction owner thread calls this function.
+ // The same thread does the lock release, so we can be certain nobody is
+ // releasing the locks concurrently.
+ assert(!releasing_locks_.load());
+ auto it = buffers_.find(cf_id);
+ if (it == buffers_.end()) {
+ // create a new one
+ it = buffers_.emplace(cf_id, std::make_shared<toku::range_buffer>()).first;
+ it->second->create();
+ }
+ it->second->append(left_key, right_key);
+}
+
+void RangeLockList::ReleaseLocks(RangeTreeLockManager *mgr,
+ PessimisticTransaction *txn,
+ bool all_trx_locks) {
+ {
+ MutexLock l(&mutex_);
+    // The lt->release_locks() call below will walk range_list->buffer_. We
+    // need to prevent the lock escalation callback from replacing
+    // range_list->buffer_ while we are doing that.
+    //
+    // An additional complication here is the internal mutex(es) in the
+    // locktree (let's call them latches):
+    // - Lock escalation first obtains latches on the lock tree.
+    // - Then, it calls RangeTreeLockManager::on_escalate to replace the
+    //   transaction's range_list->buffer_. Access to that buffer must be
+    //   synchronized, so it will want to acquire the range_list->mutex_.
+    //
+    // In this function we want to do the reverse:
+    // - Acquire range_list->mutex_ to prevent access to the range_list.
+    // - Then, the lt->release_locks() call will walk through the range_list
+    //   and acquire latches on parts of the lock tree to remove locks from
+    //   it.
+    //
+    // How do we avoid the deadlock? The idea is that here we set
+    // releasing_locks_=true and release the mutex.
+    // All other users of the range_list must:
+    // - Acquire the mutex, then check that releasing_locks_ is false.
+    //   (The code in this function doesn't do that, as there's only one
+    //   thread that releases the transaction's locks.)
+ releasing_locks_.store(true);
+ }
+
+ for (auto it : buffers_) {
+    // Don't try to call release_locks() if the buffer is empty! If we are
+    // not holding any locks, the lock tree might be in STO mode with
+ // another transaction, and our attempt to release an empty set of locks
+ // will cause an assertion failure.
+ if (it.second->get_num_ranges()) {
+ auto lt_ptr = mgr->GetLockTreeForCF(it.first);
+ toku::locktree *lt = lt_ptr.get();
+
+ lt->release_locks((TXNID)txn, it.second.get(), all_trx_locks);
+
+ it.second->destroy();
+ it.second->create();
+
+ toku::lock_request::retry_all_lock_requests(lt,
+ wait_callback_for_locktree);
+ }
+ }
+
+ Clear();
+ releasing_locks_.store(false);
+}
+
+void RangeLockList::ReplaceLocks(const toku::locktree *lt,
+ const toku::range_buffer &buffer) {
+ MutexLock l(&mutex_);
+ if (releasing_locks_.load()) {
+ // Do nothing. The transaction is releasing its locks, so it will not care
+ // about having a correct list of ranges. (In TokuDB,
+ // toku_db_txn_escalate_callback() makes use of this property, too)
+ return;
+ }
+
+ ColumnFamilyId cf_id = (ColumnFamilyId)lt->get_dict_id().dictid;
+
+ auto it = buffers_.find(cf_id);
+ it->second->destroy();
+ it->second->create();
+
+ toku::range_buffer::iterator iter(&buffer);
+ toku::range_buffer::iterator::record rec;
+ while (iter.current(&rec)) {
+ it->second->append(rec.get_left_key(), rec.get_right_key());
+ iter.next();
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // OS_WIN
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h
new file mode 100644
index 000000000..4ef48d252
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h
@@ -0,0 +1,146 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "util/mutexlock.h"
+#include "utilities/transactions/lock/lock_tracker.h"
+#include "utilities/transactions/pessimistic_transaction.h"
+
+// Range Locking:
+#include "lib/locktree/lock_request.h"
+#include "lib/locktree/locktree.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class RangeTreeLockManager;
+
+// Storage for locks that are currently held by a transaction.
+//
+// Locks are kept in toku::range_buffer because toku::locktree::release_locks()
+// accepts that as an argument.
+//
+// Note: the list of locks may differ slightly from the contents of the lock
+// tree, due to concurrency between lock acquisition, lock release, and lock
+// escalation. See MDEV-18227 and RangeTreeLockManager::UnLock for details.
+// This property is currently harmless.
+//
+// Append() and ReleaseLocks() are not thread-safe, as they are expected to be
+// called only by the owner transaction. ReplaceLocks() is safe to call from
+// other threads.
+class RangeLockList {
+ public:
+ ~RangeLockList() { Clear(); }
+
+ RangeLockList() : releasing_locks_(false) {}
+
+ void Append(ColumnFamilyId cf_id, const DBT* left_key, const DBT* right_key);
+ void ReleaseLocks(RangeTreeLockManager* mgr, PessimisticTransaction* txn,
+ bool all_trx_locks);
+ void ReplaceLocks(const toku::locktree* lt, const toku::range_buffer& buffer);
+
+ private:
+ void Clear() {
+ for (auto it : buffers_) {
+ it.second->destroy();
+ }
+ buffers_.clear();
+ }
+
+ std::unordered_map<ColumnFamilyId, std::shared_ptr<toku::range_buffer>>
+ buffers_;
+ port::Mutex mutex_;
+ std::atomic<bool> releasing_locks_;
+};
+
+// A LockTracker-based object that is used together with RangeTreeLockManager.
+class RangeTreeLockTracker : public LockTracker {
+ public:
+ RangeTreeLockTracker() : range_list_(nullptr) {}
+
+ RangeTreeLockTracker(const RangeTreeLockTracker&) = delete;
+ RangeTreeLockTracker& operator=(const RangeTreeLockTracker&) = delete;
+
+ void Track(const PointLockRequest&) override;
+ void Track(const RangeLockRequest&) override;
+
+ bool IsPointLockSupported() const override {
+ // This indicates that we don't implement GetPointLockStatus()
+ return false;
+ }
+ bool IsRangeLockSupported() const override { return true; }
+
+  // A not-supported dummy implementation.
+ UntrackStatus Untrack(const RangeLockRequest& /*lock_request*/) override {
+ return UntrackStatus::NOT_TRACKED;
+ }
+
+ UntrackStatus Untrack(const PointLockRequest& /*lock_request*/) override {
+ return UntrackStatus::NOT_TRACKED;
+ }
+
+ // "If this method is not supported, leave it as a no-op."
+ void Merge(const LockTracker&) override {}
+
+ // "If this method is not supported, leave it as a no-op."
+ void Subtract(const LockTracker&) override {}
+
+ void Clear() override;
+
+ // "If this method is not supported, returns nullptr."
+ virtual LockTracker* GetTrackedLocksSinceSavePoint(
+ const LockTracker&) const override {
+ return nullptr;
+ }
+
+ PointLockStatus GetPointLockStatus(ColumnFamilyId column_family_id,
+ const std::string& key) const override;
+
+ // The return value is only used for tests
+ uint64_t GetNumPointLocks() const override { return 0; }
+
+ ColumnFamilyIterator* GetColumnFamilyIterator() const override {
+ return nullptr;
+ }
+
+ KeyIterator* GetKeyIterator(
+ ColumnFamilyId /*column_family_id*/) const override {
+ return nullptr;
+ }
+
+ void ReleaseLocks(RangeTreeLockManager* mgr, PessimisticTransaction* txn,
+ bool all_trx_locks) {
+ if (range_list_) range_list_->ReleaseLocks(mgr, txn, all_trx_locks);
+ }
+
+ void ReplaceLocks(const toku::locktree* lt,
+ const toku::range_buffer& buffer) {
+ // range_list_ cannot be NULL here
+ range_list_->ReplaceLocks(lt, buffer);
+ }
+
+ private:
+ RangeLockList* getOrCreateList();
+ std::unique_ptr<RangeLockList> range_list_;
+};
+
+class RangeTreeLockTrackerFactory : public LockTrackerFactory {
+ public:
+ static const RangeTreeLockTrackerFactory& Get() {
+ static const RangeTreeLockTrackerFactory instance;
+ return instance;
+ }
+
+ LockTracker* Create() const override { return new RangeTreeLockTracker(); }
+
+ private:
+ RangeTreeLockTrackerFactory() {}
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/transactions/optimistic_transaction.cc b/src/rocksdb/utilities/transactions/optimistic_transaction.cc
new file mode 100644
index 000000000..0ee0f28b6
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/optimistic_transaction.cc
@@ -0,0 +1,196 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/optimistic_transaction.h"
+
+#include <string>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "util/cast_util.h"
+#include "util/string_util.h"
+#include "utilities/transactions/lock/point/point_lock_tracker.h"
+#include "utilities/transactions/optimistic_transaction.h"
+#include "utilities/transactions/optimistic_transaction_db_impl.h"
+#include "utilities/transactions/transaction_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct WriteOptions;
+
+OptimisticTransaction::OptimisticTransaction(
+ OptimisticTransactionDB* txn_db, const WriteOptions& write_options,
+ const OptimisticTransactionOptions& txn_options)
+ : TransactionBaseImpl(txn_db->GetBaseDB(), write_options,
+ PointLockTrackerFactory::Get()),
+ txn_db_(txn_db) {
+ Initialize(txn_options);
+}
+
+void OptimisticTransaction::Initialize(
+ const OptimisticTransactionOptions& txn_options) {
+ if (txn_options.set_snapshot) {
+ SetSnapshot();
+ }
+}
+
+void OptimisticTransaction::Reinitialize(
+ OptimisticTransactionDB* txn_db, const WriteOptions& write_options,
+ const OptimisticTransactionOptions& txn_options) {
+ TransactionBaseImpl::Reinitialize(txn_db->GetBaseDB(), write_options);
+ Initialize(txn_options);
+}
+
+OptimisticTransaction::~OptimisticTransaction() {}
+
+void OptimisticTransaction::Clear() { TransactionBaseImpl::Clear(); }
+
+Status OptimisticTransaction::Prepare() {
+ return Status::InvalidArgument(
+ "Two phase commit not supported for optimistic transactions.");
+}
+
+Status OptimisticTransaction::Commit() {
+ auto txn_db_impl = static_cast_with_check<OptimisticTransactionDBImpl,
+ OptimisticTransactionDB>(txn_db_);
+ assert(txn_db_impl);
+ switch (txn_db_impl->GetValidatePolicy()) {
+ case OccValidationPolicy::kValidateParallel:
+ return CommitWithParallelValidate();
+ case OccValidationPolicy::kValidateSerial:
+ return CommitWithSerialValidate();
+ default:
+ assert(0);
+ }
+  // unreachable, just to avoid a compiler complaint
+ return Status::OK();
+}
+
+Status OptimisticTransaction::CommitWithSerialValidate() {
+ // Set up callback which will call CheckTransactionForConflicts() to
+ // check whether this transaction is safe to be committed.
+ OptimisticTransactionCallback callback(this);
+
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db_->GetRootDB());
+
+ Status s = db_impl->WriteWithCallback(
+ write_options_, GetWriteBatch()->GetWriteBatch(), &callback);
+
+ if (s.ok()) {
+ Clear();
+ }
+
+ return s;
+}
+
+Status OptimisticTransaction::CommitWithParallelValidate() {
+ auto txn_db_impl = static_cast_with_check<OptimisticTransactionDBImpl,
+ OptimisticTransactionDB>(txn_db_);
+ assert(txn_db_impl);
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db_->GetRootDB());
+ assert(db_impl);
+ const size_t space = txn_db_impl->GetLockBucketsSize();
+ std::set<size_t> lk_idxes;
+ std::vector<std::unique_lock<std::mutex>> lks;
+ std::unique_ptr<LockTracker::ColumnFamilyIterator> cf_it(
+ tracked_locks_->GetColumnFamilyIterator());
+ assert(cf_it != nullptr);
+ while (cf_it->HasNext()) {
+ ColumnFamilyId cf = cf_it->Next();
+ std::unique_ptr<LockTracker::KeyIterator> key_it(
+ tracked_locks_->GetKeyIterator(cf));
+ assert(key_it != nullptr);
+ while (key_it->HasNext()) {
+ const std::string& key = key_it->Next();
+ lk_idxes.insert(FastRange64(GetSliceNPHash64(key), space));
+ }
+ }
+  // NOTE: within a single txn, all bucket locks are taken in ascending order.
+  // Since txns on different threads all obey this rule, they cannot deadlock
+  // on these bucket locks.
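+  // For example (editorial illustration): if this transaction's keys hash to
+  // buckets {7, 2, 7}, lk_idxes holds {2, 7}, so bucket 2 is locked before
+  // bucket 7 on every thread that touches both.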
+ for (auto v : lk_idxes) {
+ lks.emplace_back(txn_db_impl->LockBucket(v));
+ }
+
+ Status s = TransactionUtil::CheckKeysForConflicts(db_impl, *tracked_locks_,
+ true /* cache_only */);
+ if (!s.ok()) {
+ return s;
+ }
+
+ s = db_impl->Write(write_options_, GetWriteBatch()->GetWriteBatch());
+ if (s.ok()) {
+ Clear();
+ }
+
+ return s;
+}
+
+Status OptimisticTransaction::Rollback() {
+ Clear();
+ return Status::OK();
+}
+
+// Record this key so that we can check it for conflicts at commit time.
+//
+// 'exclusive' is unused for OptimisticTransaction.
+Status OptimisticTransaction::TryLock(ColumnFamilyHandle* column_family,
+ const Slice& key, bool read_only,
+ bool exclusive, const bool do_validate,
+ const bool assume_tracked) {
+ assert(!assume_tracked); // not supported
+ (void)assume_tracked;
+ if (!do_validate) {
+ return Status::OK();
+ }
+ uint32_t cfh_id = GetColumnFamilyID(column_family);
+
+ SetSnapshotIfNeeded();
+
+ SequenceNumber seq;
+ if (snapshot_) {
+ seq = snapshot_->GetSequenceNumber();
+ } else {
+ seq = db_->GetLatestSequenceNumber();
+ }
+
+ std::string key_str = key.ToString();
+
+ TrackKey(cfh_id, key_str, seq, read_only, exclusive);
+
+  // Always return OK. Conflict checking will happen at commit time.
+ return Status::OK();
+}
+
+// Returns OK if it is safe to commit this transaction. Returns Status::Busy
+// if there are read or write conflicts that would prevent us from committing OR
+// if we can not determine whether there would be any such conflicts.
+//
+// Should only be called on writer thread in order to avoid any race conditions
+// in detecting write conflicts.
+Status OptimisticTransaction::CheckTransactionForConflicts(DB* db) {
+ auto db_impl = static_cast_with_check<DBImpl>(db);
+
+ // Since we are on the write thread and do not want to block other writers,
+ // we will do a cache-only conflict check. This can result in TryAgain
+ // getting returned if there is not sufficient memtable history to check
+ // for conflicts.
+ return TransactionUtil::CheckKeysForConflicts(db_impl, *tracked_locks_,
+ true /* cache_only */);
+}
+
+Status OptimisticTransaction::SetName(const TransactionName& /* unused */) {
+ return Status::InvalidArgument("Optimistic transactions cannot be named.");
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/optimistic_transaction.h b/src/rocksdb/utilities/transactions/optimistic_transaction.h
new file mode 100644
index 000000000..de23233d5
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/optimistic_transaction.h
@@ -0,0 +1,101 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <stack>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "db/write_callback.h"
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "utilities/transactions/transaction_base.h"
+#include "utilities/transactions/transaction_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class OptimisticTransaction : public TransactionBaseImpl {
+ public:
+ OptimisticTransaction(OptimisticTransactionDB* db,
+ const WriteOptions& write_options,
+ const OptimisticTransactionOptions& txn_options);
+ // No copying allowed
+ OptimisticTransaction(const OptimisticTransaction&) = delete;
+ void operator=(const OptimisticTransaction&) = delete;
+
+ virtual ~OptimisticTransaction();
+
+ void Reinitialize(OptimisticTransactionDB* txn_db,
+ const WriteOptions& write_options,
+ const OptimisticTransactionOptions& txn_options);
+
+ Status Prepare() override;
+
+ Status Commit() override;
+
+ Status Rollback() override;
+
+ Status SetName(const TransactionName& name) override;
+
+ protected:
+ Status TryLock(ColumnFamilyHandle* column_family, const Slice& key,
+ bool read_only, bool exclusive, const bool do_validate = true,
+ const bool assume_tracked = false) override;
+
+ private:
+ ROCKSDB_FIELD_UNUSED OptimisticTransactionDB* const txn_db_;
+
+ friend class OptimisticTransactionCallback;
+
+ void Initialize(const OptimisticTransactionOptions& txn_options);
+
+ // Returns OK if it is safe to commit this transaction. Returns Status::Busy
+ // if there are read or write conflicts that would prevent us from committing
+ // OR if we can not determine whether there would be any such conflicts.
+ //
+ // Should only be called on writer thread.
+ Status CheckTransactionForConflicts(DB* db);
+
+ void Clear() override;
+
+ void UnlockGetForUpdate(ColumnFamilyHandle* /* unused */,
+ const Slice& /* unused */) override {
+ // Nothing to unlock.
+ }
+
+ Status CommitWithSerialValidate();
+
+ Status CommitWithParallelValidate();
+};
+
+// Used at commit time to trigger transaction validation
+class OptimisticTransactionCallback : public WriteCallback {
+ public:
+ explicit OptimisticTransactionCallback(OptimisticTransaction* txn)
+ : txn_(txn) {}
+
+ Status Callback(DB* db) override {
+ return txn_->CheckTransactionForConflicts(db);
+ }
+
+ bool AllowWriteBatching() override { return false; }
+
+ private:
+ OptimisticTransaction* txn_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc b/src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc
new file mode 100644
index 000000000..bffb3d5ed
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc
@@ -0,0 +1,111 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/optimistic_transaction_db_impl.h"
+
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "utilities/transactions/optimistic_transaction.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Transaction* OptimisticTransactionDBImpl::BeginTransaction(
+ const WriteOptions& write_options,
+ const OptimisticTransactionOptions& txn_options, Transaction* old_txn) {
+ if (old_txn != nullptr) {
+ ReinitializeTransaction(old_txn, write_options, txn_options);
+ return old_txn;
+ } else {
+ return new OptimisticTransaction(this, write_options, txn_options);
+ }
+}
+
+std::unique_lock<std::mutex> OptimisticTransactionDBImpl::LockBucket(
+ size_t idx) {
+ assert(idx < bucketed_locks_.size());
+ return std::unique_lock<std::mutex>(*bucketed_locks_[idx]);
+}
+
+Status OptimisticTransactionDB::Open(const Options& options,
+ const std::string& dbname,
+ OptimisticTransactionDB** dbptr) {
+ DBOptions db_options(options);
+ ColumnFamilyOptions cf_options(options);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+ std::vector<ColumnFamilyHandle*> handles;
+ Status s = Open(db_options, dbname, column_families, &handles, dbptr);
+ if (s.ok()) {
+ assert(handles.size() == 1);
+    // We can delete the handle since DBImpl always holds a reference to the
+    // default column family.
+ delete handles[0];
+ }
+
+ return s;
+}
+
+Status OptimisticTransactionDB::Open(
+ const DBOptions& db_options, const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles,
+ OptimisticTransactionDB** dbptr) {
+ return OptimisticTransactionDB::Open(db_options,
+ OptimisticTransactionDBOptions(), dbname,
+ column_families, handles, dbptr);
+}
+
+Status OptimisticTransactionDB::Open(
+ const DBOptions& db_options,
+ const OptimisticTransactionDBOptions& occ_options,
+ const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles,
+ OptimisticTransactionDB** dbptr) {
+ Status s;
+ DB* db;
+
+ std::vector<ColumnFamilyDescriptor> column_families_copy = column_families;
+
+ // Enable MemTable History if not already enabled
+ for (auto& column_family : column_families_copy) {
+ ColumnFamilyOptions* options = &column_family.options;
+
+ if (options->max_write_buffer_size_to_maintain == 0 &&
+ options->max_write_buffer_number_to_maintain == 0) {
+ // Setting to -1 will set the History size to
+ // max_write_buffer_number * write_buffer_size.
+ options->max_write_buffer_size_to_maintain = -1;
+ }
+ }
+
+ s = DB::Open(db_options, dbname, column_families_copy, handles, &db);
+
+ if (s.ok()) {
+ *dbptr = new OptimisticTransactionDBImpl(db, occ_options);
+ }
+
+ return s;
+}
+
+void OptimisticTransactionDBImpl::ReinitializeTransaction(
+ Transaction* txn, const WriteOptions& write_options,
+ const OptimisticTransactionOptions& txn_options) {
+ assert(dynamic_cast<OptimisticTransaction*>(txn) != nullptr);
+ auto txn_impl = reinterpret_cast<OptimisticTransaction*>(txn);
+
+ txn_impl->Reinitialize(this, write_options, txn_options);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h b/src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h
new file mode 100644
index 000000000..88e86ea4a
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h
@@ -0,0 +1,88 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <mutex>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class OptimisticTransactionDBImpl : public OptimisticTransactionDB {
+ public:
+ explicit OptimisticTransactionDBImpl(
+ DB* db, const OptimisticTransactionDBOptions& occ_options,
+ bool take_ownership = true)
+ : OptimisticTransactionDB(db),
+ db_owner_(take_ownership),
+ validate_policy_(occ_options.validate_policy) {
+ if (validate_policy_ == OccValidationPolicy::kValidateParallel) {
+ uint32_t bucket_size = std::max(16u, occ_options.occ_lock_buckets);
+ bucketed_locks_.reserve(bucket_size);
+ for (size_t i = 0; i < bucket_size; ++i) {
+ bucketed_locks_.emplace_back(
+ std::unique_ptr<std::mutex>(new std::mutex));
+ }
+ }
+ }
+
+ ~OptimisticTransactionDBImpl() {
+    // If we do not own the base DB, clear db_ so that the StackableDB
+    // destructor does not destroy it.
+ if (!db_owner_) {
+ db_ = nullptr;
+ }
+ }
+
+ Transaction* BeginTransaction(const WriteOptions& write_options,
+ const OptimisticTransactionOptions& txn_options,
+ Transaction* old_txn) override;
+
+ // Transactional `DeleteRange()` is not yet supported.
+ using StackableDB::DeleteRange;
+ virtual Status DeleteRange(const WriteOptions&, ColumnFamilyHandle*,
+ const Slice&, const Slice&) override {
+ return Status::NotSupported();
+ }
+
+ // Range deletions also must not be snuck into `WriteBatch`es as they are
+ // incompatible with `OptimisticTransactionDB`.
+ virtual Status Write(const WriteOptions& write_opts,
+ WriteBatch* batch) override {
+ if (batch->HasDeleteRange()) {
+ return Status::NotSupported();
+ }
+ return OptimisticTransactionDB::Write(write_opts, batch);
+ }
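+
+  // Observable behavior of the two overrides above, as a sketch (`occ_db` is
+  // a placeholder pointer to an open OptimisticTransactionDB; see also
+  // DeleteRangeSupportTest in optimistic_transaction_test.cc):
+  //
+  //   WriteBatch wb;
+  //   wb.DeleteRange("a", "b");
+  //   occ_db->Write(WriteOptions(), &wb);  // Status::NotSupported()
+  //   occ_db->DeleteRange(WriteOptions(), occ_db->DefaultColumnFamily(),
+  //                       "a", "b");       // Status::NotSupported()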
+
+ size_t GetLockBucketsSize() const { return bucketed_locks_.size(); }
+
+ OccValidationPolicy GetValidatePolicy() const { return validate_policy_; }
+
+ std::unique_lock<std::mutex> LockBucket(size_t idx);
+
+ private:
+  // NOTE: Used in the parallel validation phase. Each key is hashed into one
+  // of these buckets, and the bucket locks are then taken in hash-value
+  // (bucket index) order to avoid deadlock.
+ std::vector<std::unique_ptr<std::mutex>> bucketed_locks_;
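+  // Conceptually, a thread validating a key does something like the
+  // following (the exact hash used by the real validation code may differ;
+  // this is only a sketch):
+  //
+  //   size_t idx = GetSliceNPHash64(key) % GetLockBucketsSize();
+  //   std::unique_lock<std::mutex> guard = LockBucket(idx);
+  //   // ... check the key for conflicts while holding the bucket lock ...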
+
+ bool db_owner_;
+
+ const OccValidationPolicy validate_policy_;
+
+ void ReinitializeTransaction(Transaction* txn,
+ const WriteOptions& write_options,
+ const OptimisticTransactionOptions& txn_options =
+ OptimisticTransactionOptions());
+};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/optimistic_transaction_test.cc b/src/rocksdb/utilities/transactions/optimistic_transaction_test.cc
new file mode 100644
index 000000000..aa8192c32
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/optimistic_transaction_test.cc
@@ -0,0 +1,1491 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <functional>
+#include <string>
+#include <thread>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "rocksdb/utilities/transaction.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/transaction_test_util.h"
+#include "util/crc32c.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class OptimisticTransactionTest
+ : public testing::Test,
+ public testing::WithParamInterface<OccValidationPolicy> {
+ public:
+ OptimisticTransactionDB* txn_db;
+ std::string dbname;
+ Options options;
+
+ OptimisticTransactionTest() {
+ options.create_if_missing = true;
+ options.max_write_buffer_number = 2;
+ options.max_write_buffer_size_to_maintain = 2 * Arena::kInlineSize;
+ options.merge_operator.reset(new TestPutOperator());
+ dbname = test::PerThreadDBPath("optimistic_transaction_testdb");
+
+ EXPECT_OK(DestroyDB(dbname, options));
+ Open();
+ }
+ ~OptimisticTransactionTest() override {
+ delete txn_db;
+ EXPECT_OK(DestroyDB(dbname, options));
+ }
+
+ void Reopen() {
+ delete txn_db;
+ txn_db = nullptr;
+ Open();
+ }
+
+ private:
+ void Open() {
+ ColumnFamilyOptions cf_options(options);
+ OptimisticTransactionDBOptions occ_opts;
+ occ_opts.validate_policy = GetParam();
+ std::vector<ColumnFamilyDescriptor> column_families;
+ std::vector<ColumnFamilyHandle*> handles;
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+ Status s =
+ OptimisticTransactionDB::Open(DBOptions(options), occ_opts, dbname,
+ column_families, &handles, &txn_db);
+
+ ASSERT_OK(s);
+ ASSERT_NE(txn_db, nullptr);
+ ASSERT_EQ(handles.size(), 1);
+ delete handles[0];
+ }
+};
+
+TEST_P(OptimisticTransactionTest, SuccessTest) {
+ WriteOptions write_options;
+ ReadOptions read_options;
+ std::string value;
+
+ ASSERT_OK(txn_db->Put(write_options, Slice("foo"), Slice("bar")));
+ ASSERT_OK(txn_db->Put(write_options, Slice("foo2"), Slice("bar")));
+
+ Transaction* txn = txn_db->BeginTransaction(write_options);
+ ASSERT_NE(txn, nullptr);
+
+ ASSERT_OK(txn->GetForUpdate(read_options, "foo", &value));
+ ASSERT_EQ(value, "bar");
+
+ ASSERT_OK(txn->Put(Slice("foo"), Slice("bar2")));
+
+ ASSERT_OK(txn->GetForUpdate(read_options, "foo", &value));
+ ASSERT_EQ(value, "bar2");
+
+ ASSERT_OK(txn->Commit());
+
+ ASSERT_OK(txn_db->Get(read_options, "foo", &value));
+ ASSERT_EQ(value, "bar2");
+
+ delete txn;
+}
+
+TEST_P(OptimisticTransactionTest, WriteConflictTest) {
+ WriteOptions write_options;
+ ReadOptions read_options;
+ std::string value;
+
+ ASSERT_OK(txn_db->Put(write_options, "foo", "bar"));
+ ASSERT_OK(txn_db->Put(write_options, "foo2", "bar"));
+
+ Transaction* txn = txn_db->BeginTransaction(write_options);
+ ASSERT_NE(txn, nullptr);
+
+ ASSERT_OK(txn->Put("foo", "bar2"));
+
+ // This Put outside of a transaction will conflict with the previous write
+ ASSERT_OK(txn_db->Put(write_options, "foo", "barz"));
+
+ ASSERT_OK(txn_db->Get(read_options, "foo", &value));
+ ASSERT_EQ(value, "barz");
+ ASSERT_EQ(1, txn->GetNumKeys());
+
+ Status s = txn->Commit();
+ ASSERT_TRUE(s.IsBusy()); // Txn should not commit
+
+ // Verify that transaction did not write anything
+ ASSERT_OK(txn_db->Get(read_options, "foo", &value));
+ ASSERT_EQ(value, "barz");
+ ASSERT_OK(txn_db->Get(read_options, "foo2", &value));
+ ASSERT_EQ(value, "bar");
+
+ delete txn;
+}
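+
+// In compressed form, what the test above demonstrates (sketch): commit-time
+// validation roughly compares the latest sequence number of every key the
+// transaction read or wrote against the point at which the transaction first
+// saw it; a newer external write makes Commit() return a Busy status.
+//
+//   Transaction* txn = txn_db->BeginTransaction(WriteOptions());
+//   txn->Put("foo", "txn-value");                    // "foo" is now tracked
+//   txn_db->Put(WriteOptions(), "foo", "external");  // bumps "foo"'s seqno
+//   Status s = txn->Commit();                        // s.IsBusy() == true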
+
+TEST_P(OptimisticTransactionTest, WriteConflictTest2) {
+ WriteOptions write_options;
+ ReadOptions read_options;
+ OptimisticTransactionOptions txn_options;
+ std::string value;
+
+ ASSERT_OK(txn_db->Put(write_options, "foo", "bar"));
+ ASSERT_OK(txn_db->Put(write_options, "foo2", "bar"));
+
+ txn_options.set_snapshot = true;
+ Transaction* txn = txn_db->BeginTransaction(write_options, txn_options);
+ ASSERT_NE(txn, nullptr);
+
+ // This Put outside of a transaction will conflict with a later write
+ ASSERT_OK(txn_db->Put(write_options, "foo", "barz"));
+
+ ASSERT_OK(txn->Put(
+ "foo", "bar2")); // Conflicts with write done after snapshot taken
+
+ ASSERT_OK(txn_db->Get(read_options, "foo", &value));
+ ASSERT_EQ(value, "barz");
+
+ Status s = txn->Commit();
+ ASSERT_TRUE(s.IsBusy()); // Txn should not commit
+
+ // Verify that transaction did not write anything
+ ASSERT_OK(txn_db->Get(read_options, "foo", &value));
+ ASSERT_EQ(value, "barz");
+ ASSERT_OK(txn_db->Get(read_options, "foo2", &value));
+ ASSERT_EQ(value, "bar");
+
+ delete txn;
+}
+
+TEST_P(OptimisticTransactionTest, WriteConflictTest3) {
+ ASSERT_OK(txn_db->Put(WriteOptions(), "foo", "bar"));
+
+ Transaction* txn = txn_db->BeginTransaction(WriteOptions());
+ ASSERT_NE(txn, nullptr);
+
+ std::string value;
+ ASSERT_OK(txn->GetForUpdate(ReadOptions(), "foo", &value));
+ ASSERT_EQ(value, "bar");
+ ASSERT_OK(txn->Merge("foo", "bar3"));
+
+ // Merge outside of a transaction should conflict with the previous merge
+ ASSERT_OK(txn_db->Merge(WriteOptions(), "foo", "bar2"));
+ ASSERT_OK(txn_db->Get(ReadOptions(), "foo", &value));
+ ASSERT_EQ(value, "bar2");
+
+ ASSERT_EQ(1, txn->GetNumKeys());
+
+ Status s = txn->Commit();
+ EXPECT_TRUE(s.IsBusy()); // Txn should not commit
+
+ // Verify that transaction did not write anything
+ ASSERT_OK(txn_db->Get(ReadOptions(), "foo", &value));
+ ASSERT_EQ(value, "bar2");
+
+ delete txn;
+}
+
+TEST_P(OptimisticTransactionTest, WriteConflict4) {
+ ASSERT_OK(txn_db->Put(WriteOptions(), "foo", "bar"));
+
+ Transaction* txn = txn_db->BeginTransaction(WriteOptions());
+ ASSERT_NE(txn, nullptr);
+
+ std::string value;
+ ASSERT_OK(txn->GetForUpdate(ReadOptions(), "foo", &value));
+ ASSERT_EQ(value, "bar");
+ ASSERT_OK(txn->Merge("foo", "bar3"));
+
+ // Range delete outside of a transaction should conflict with the previous
+ // merge inside txn
+ auto* dbimpl = static_cast_with_check<DBImpl>(txn_db->GetRootDB());
+ ColumnFamilyHandle* default_cf = dbimpl->DefaultColumnFamily();
+ ASSERT_OK(dbimpl->DeleteRange(WriteOptions(), default_cf, "foo", "foo1"));
+ Status s = txn_db->Get(ReadOptions(), "foo", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ ASSERT_EQ(1, txn->GetNumKeys());
+
+ s = txn->Commit();
+ EXPECT_TRUE(s.IsBusy()); // Txn should not commit
+
+ // Verify that transaction did not write anything
+ s = txn_db->Get(ReadOptions(), "foo", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ delete txn;
+}
+
+TEST_P(OptimisticTransactionTest, ReadConflictTest) {
+ WriteOptions write_options;
+ ReadOptions read_options, snapshot_read_options;
+ OptimisticTransactionOptions txn_options;
+ std::string value;
+
+ ASSERT_OK(txn_db->Put(write_options, "foo", "bar"));
+ ASSERT_OK(txn_db->Put(write_options, "foo2", "bar"));
+
+ txn_options.set_snapshot = true;
+ Transaction* txn = txn_db->BeginTransaction(write_options, txn_options);
+ ASSERT_NE(txn, nullptr);
+
+ txn->SetSnapshot();
+ snapshot_read_options.snapshot = txn->GetSnapshot();
+
+ ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
+ ASSERT_EQ(value, "bar");
+
+ // This Put outside of a transaction will conflict with the previous read
+ ASSERT_OK(txn_db->Put(write_options, "foo", "barz"));
+
+ ASSERT_OK(txn_db->Get(read_options, "foo", &value));
+ ASSERT_EQ(value, "barz");
+
+ Status s = txn->Commit();
+ ASSERT_TRUE(s.IsBusy()); // Txn should not commit
+
+ // Verify that transaction did not write anything
+ ASSERT_OK(txn->GetForUpdate(read_options, "foo", &value));
+ ASSERT_EQ(value, "barz");
+ ASSERT_OK(txn->GetForUpdate(read_options, "foo2", &value));
+ ASSERT_EQ(value, "bar");
+
+ delete txn;
+}
+
+TEST_P(OptimisticTransactionTest, TxnOnlyTest) {
+ // Test to make sure transactions work when there are no other writes in an
+ // empty db.
+
+ WriteOptions write_options;
+ ReadOptions read_options;
+ std::string value;
+
+ Transaction* txn = txn_db->BeginTransaction(write_options);
+ ASSERT_NE(txn, nullptr);
+
+ ASSERT_OK(txn->Put("x", "y"));
+
+ ASSERT_OK(txn->Commit());
+
+ delete txn;
+}
+
+TEST_P(OptimisticTransactionTest, FlushTest) {
+ WriteOptions write_options;
+ ReadOptions read_options, snapshot_read_options;
+ std::string value;
+
+ ASSERT_OK(txn_db->Put(write_options, Slice("foo"), Slice("bar")));
+ ASSERT_OK(txn_db->Put(write_options, Slice("foo2"), Slice("bar")));
+
+ Transaction* txn = txn_db->BeginTransaction(write_options);
+ ASSERT_NE(txn, nullptr);
+
+ snapshot_read_options.snapshot = txn->GetSnapshot();
+
+ ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
+ ASSERT_EQ(value, "bar");
+
+ ASSERT_OK(txn->Put(Slice("foo"), Slice("bar2")));
+
+ ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
+ ASSERT_EQ(value, "bar2");
+
+ // Put a random key so we have a memtable to flush
+ ASSERT_OK(txn_db->Put(write_options, "dummy", "dummy"));
+
+ // force a memtable flush
+ FlushOptions flush_ops;
+ ASSERT_OK(txn_db->Flush(flush_ops));
+
+ // txn should commit since the flushed table is still in MemtableList History
+ ASSERT_OK(txn->Commit());
+
+ ASSERT_OK(txn_db->Get(read_options, "foo", &value));
+ ASSERT_EQ(value, "bar2");
+
+ delete txn;
+}
+
+TEST_P(OptimisticTransactionTest, FlushTest2) {
+ WriteOptions write_options;
+ ReadOptions read_options, snapshot_read_options;
+ std::string value;
+
+ ASSERT_OK(txn_db->Put(write_options, Slice("foo"), Slice("bar")));
+ ASSERT_OK(txn_db->Put(write_options, Slice("foo2"), Slice("bar")));
+
+ Transaction* txn = txn_db->BeginTransaction(write_options);
+ ASSERT_NE(txn, nullptr);
+
+ snapshot_read_options.snapshot = txn->GetSnapshot();
+
+ ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
+ ASSERT_EQ(value, "bar");
+
+ ASSERT_OK(txn->Put(Slice("foo"), Slice("bar2")));
+
+ ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
+ ASSERT_EQ(value, "bar2");
+
+ // Put a random key so we have a MemTable to flush
+ ASSERT_OK(txn_db->Put(write_options, "dummy", "dummy"));
+
+ // force a memtable flush
+ FlushOptions flush_ops;
+ ASSERT_OK(txn_db->Flush(flush_ops));
+
+ // Put a random key so we have a MemTable to flush
+ ASSERT_OK(txn_db->Put(write_options, "dummy", "dummy2"));
+
+ // force a memtable flush
+ ASSERT_OK(txn_db->Flush(flush_ops));
+
+ ASSERT_OK(txn_db->Put(write_options, "dummy", "dummy3"));
+
+ // force a memtable flush
+ // Since our test db has max_write_buffer_number=2, this flush will cause
+ // the first memtable to get purged from the MemtableList history.
+ ASSERT_OK(txn_db->Flush(flush_ops));
+
+ Status s = txn->Commit();
+ // txn should not commit since MemTableList History is not large enough
+ ASSERT_TRUE(s.IsTryAgain());
+
+ ASSERT_OK(txn_db->Get(read_options, "foo", &value));
+ ASSERT_EQ(value, "bar");
+
+ delete txn;
+}
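+
+// A note on the TryAgain above: unlike a Busy result, TryAgain does not mean
+// a conflict was found, only that the memtable history needed to validate the
+// transaction was no longer available. A caller can typically just retry
+// (sketch):
+//
+//   Status s = txn->Commit();
+//   if (s.IsTryAgain()) {
+//     // Re-run the whole transaction from the start.
+//   }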
+
+// Trigger the condition where some old memtables are skipped when doing
+// TransactionUtil::CheckKey(), and make sure the result is still correct.
+TEST_P(OptimisticTransactionTest, CheckKeySkipOldMemtable) {
+ const int kAttemptHistoryMemtable = 0;
+ const int kAttemptImmMemTable = 1;
+ for (int attempt = kAttemptHistoryMemtable; attempt <= kAttemptImmMemTable;
+ attempt++) {
+ Reopen();
+
+ WriteOptions write_options;
+ ReadOptions read_options;
+ ReadOptions snapshot_read_options;
+ ReadOptions snapshot_read_options2;
+ std::string value;
+
+ ASSERT_OK(txn_db->Put(write_options, Slice("foo"), Slice("bar")));
+ ASSERT_OK(txn_db->Put(write_options, Slice("foo2"), Slice("bar")));
+
+ Transaction* txn = txn_db->BeginTransaction(write_options);
+ ASSERT_TRUE(txn != nullptr);
+
+ Transaction* txn2 = txn_db->BeginTransaction(write_options);
+ ASSERT_TRUE(txn2 != nullptr);
+
+ snapshot_read_options.snapshot = txn->GetSnapshot();
+ ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
+ ASSERT_EQ(value, "bar");
+ ASSERT_OK(txn->Put(Slice("foo"), Slice("bar2")));
+
+ snapshot_read_options2.snapshot = txn2->GetSnapshot();
+ ASSERT_OK(txn2->GetForUpdate(snapshot_read_options2, "foo2", &value));
+ ASSERT_EQ(value, "bar");
+ ASSERT_OK(txn2->Put(Slice("foo2"), Slice("bar2")));
+
+ // txn updates "foo" and txn2 updates "foo2", and now a write is
+ // issued for "foo", which conflicts with txn but not txn2
+ ASSERT_OK(txn_db->Put(write_options, "foo", "bar"));
+
+ if (attempt == kAttemptImmMemTable) {
+ // For the second attempt, hold flush from beginning. The memtable
+ // will be switched to immutable after calling TEST_SwitchMemtable()
+ // while CheckKey() is called.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"OptimisticTransactionTest.CheckKeySkipOldMemtable",
+ "FlushJob::Start"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ }
+
+ // force a memtable flush. The memtable should still be kept
+ FlushOptions flush_ops;
+ if (attempt == kAttemptHistoryMemtable) {
+ ASSERT_OK(txn_db->Flush(flush_ops));
+ } else {
+ ASSERT_EQ(attempt, kAttemptImmMemTable);
+ DBImpl* db_impl = static_cast<DBImpl*>(txn_db->GetRootDB());
+ ASSERT_OK(db_impl->TEST_SwitchMemtable());
+ }
+ uint64_t num_imm_mems;
+ ASSERT_TRUE(txn_db->GetIntProperty(DB::Properties::kNumImmutableMemTable,
+ &num_imm_mems));
+ if (attempt == kAttemptHistoryMemtable) {
+ ASSERT_EQ(0, num_imm_mems);
+ } else {
+ ASSERT_EQ(attempt, kAttemptImmMemTable);
+ ASSERT_EQ(1, num_imm_mems);
+ }
+
+ // Put something in active memtable
+ ASSERT_OK(txn_db->Put(write_options, Slice("foo3"), Slice("bar")));
+
+    // Create txn3 after flushing; when this transaction is committed, only
+    // the active memtable needs to be checked.
+ Transaction* txn3 = txn_db->BeginTransaction(write_options);
+ ASSERT_TRUE(txn3 != nullptr);
+
+ // Commit both of txn and txn2. txn will conflict but txn2 will
+ // pass. In both ways, both memtables are queried.
+ SetPerfLevel(PerfLevel::kEnableCount);
+
+ get_perf_context()->Reset();
+ Status s = txn->Commit();
+ // We should have checked two memtables
+ ASSERT_EQ(2, get_perf_context()->get_from_memtable_count);
+ // txn should fail because of conflict, even if the memtable
+ // has flushed, because it is still preserved in history.
+ ASSERT_TRUE(s.IsBusy());
+
+ get_perf_context()->Reset();
+ s = txn2->Commit();
+ // We should have checked two memtables
+ ASSERT_EQ(2, get_perf_context()->get_from_memtable_count);
+ ASSERT_TRUE(s.ok());
+
+ ASSERT_OK(txn3->Put(Slice("foo2"), Slice("bar2")));
+ get_perf_context()->Reset();
+ s = txn3->Commit();
+ // txn3 is created after the active memtable is created, so that is the only
+ // memtable to check.
+ ASSERT_EQ(1, get_perf_context()->get_from_memtable_count);
+ ASSERT_TRUE(s.ok());
+
+ TEST_SYNC_POINT("OptimisticTransactionTest.CheckKeySkipOldMemtable");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ SetPerfLevel(PerfLevel::kDisable);
+
+ delete txn;
+ delete txn2;
+ delete txn3;
+ }
+}
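+
+// The perf-context counters used above are a general way to observe how many
+// memtables a read had to consult (sketch):
+//
+//   SetPerfLevel(PerfLevel::kEnableCount);
+//   get_perf_context()->Reset();
+//   ASSERT_OK(txn_db->Get(ReadOptions(), "foo", &value));
+//   uint64_t n = get_perf_context()->get_from_memtable_count;
+//   SetPerfLevel(PerfLevel::kDisable);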
+
+TEST_P(OptimisticTransactionTest, NoSnapshotTest) {
+ WriteOptions write_options;
+ ReadOptions read_options;
+ std::string value;
+
+ ASSERT_OK(txn_db->Put(write_options, "AAA", "bar"));
+
+ Transaction* txn = txn_db->BeginTransaction(write_options);
+ ASSERT_NE(txn, nullptr);
+
+ // Modify key after transaction start
+ ASSERT_OK(txn_db->Put(write_options, "AAA", "bar1"));
+
+ // Read and write without a snapshot
+ ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value));
+ ASSERT_EQ(value, "bar1");
+ ASSERT_OK(txn->Put("AAA", "bar2"));
+
+ // Should commit since read/write was done after data changed
+ ASSERT_OK(txn->Commit());
+
+ ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value));
+ ASSERT_EQ(value, "bar2");
+
+ delete txn;
+}
+
+TEST_P(OptimisticTransactionTest, MultipleSnapshotTest) {
+ WriteOptions write_options;
+ ReadOptions read_options, snapshot_read_options;
+ std::string value;
+
+ ASSERT_OK(txn_db->Put(write_options, "AAA", "bar"));
+ ASSERT_OK(txn_db->Put(write_options, "BBB", "bar"));
+ ASSERT_OK(txn_db->Put(write_options, "CCC", "bar"));
+
+ Transaction* txn = txn_db->BeginTransaction(write_options);
+ ASSERT_NE(txn, nullptr);
+
+ ASSERT_OK(txn_db->Put(write_options, "AAA", "bar1"));
+
+ // Read and write without a snapshot
+ ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value));
+ ASSERT_EQ(value, "bar1");
+ ASSERT_OK(txn->Put("AAA", "bar2"));
+
+ // Modify BBB before snapshot is taken
+ ASSERT_OK(txn_db->Put(write_options, "BBB", "bar1"));
+
+ txn->SetSnapshot();
+ snapshot_read_options.snapshot = txn->GetSnapshot();
+
+ // Read and write with snapshot
+ ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "BBB", &value));
+ ASSERT_EQ(value, "bar1");
+ ASSERT_OK(txn->Put("BBB", "bar2"));
+
+ ASSERT_OK(txn_db->Put(write_options, "CCC", "bar1"));
+
+ // Set a new snapshot
+ txn->SetSnapshot();
+ snapshot_read_options.snapshot = txn->GetSnapshot();
+
+ // Read and write with snapshot
+ ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "CCC", &value));
+ ASSERT_EQ(value, "bar1");
+ ASSERT_OK(txn->Put("CCC", "bar2"));
+
+ ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value));
+ ASSERT_EQ(value, "bar2");
+ ASSERT_OK(txn->GetForUpdate(read_options, "BBB", &value));
+ ASSERT_EQ(value, "bar2");
+ ASSERT_OK(txn->GetForUpdate(read_options, "CCC", &value));
+ ASSERT_EQ(value, "bar2");
+
+ ASSERT_OK(txn_db->Get(read_options, "AAA", &value));
+ ASSERT_EQ(value, "bar1");
+ ASSERT_OK(txn_db->Get(read_options, "BBB", &value));
+ ASSERT_EQ(value, "bar1");
+ ASSERT_OK(txn_db->Get(read_options, "CCC", &value));
+ ASSERT_EQ(value, "bar1");
+
+ ASSERT_OK(txn->Commit());
+
+ ASSERT_OK(txn_db->Get(read_options, "AAA", &value));
+ ASSERT_EQ(value, "bar2");
+ ASSERT_OK(txn_db->Get(read_options, "BBB", &value));
+ ASSERT_EQ(value, "bar2");
+ ASSERT_OK(txn_db->Get(read_options, "CCC", &value));
+ ASSERT_EQ(value, "bar2");
+
+ // verify that we track multiple writes to the same key at different snapshots
+ delete txn;
+ txn = txn_db->BeginTransaction(write_options);
+
+ // Potentially conflicting writes
+ ASSERT_OK(txn_db->Put(write_options, "ZZZ", "zzz"));
+ ASSERT_OK(txn_db->Put(write_options, "XXX", "xxx"));
+
+ txn->SetSnapshot();
+
+ OptimisticTransactionOptions txn_options;
+ txn_options.set_snapshot = true;
+ Transaction* txn2 = txn_db->BeginTransaction(write_options, txn_options);
+ txn2->SetSnapshot();
+
+ // This should not conflict in txn since the snapshot is later than the
+ // previous write (spoiler alert: it will later conflict with txn2).
+ ASSERT_OK(txn->Put("ZZZ", "zzzz"));
+ ASSERT_OK(txn->Commit());
+
+ delete txn;
+
+ // This will conflict since the snapshot is earlier than another write to ZZZ
+ ASSERT_OK(txn2->Put("ZZZ", "xxxxx"));
+
+ Status s = txn2->Commit();
+ ASSERT_TRUE(s.IsBusy());
+
+ delete txn2;
+}
+
+TEST_P(OptimisticTransactionTest, ColumnFamiliesTest) {
+ WriteOptions write_options;
+ ReadOptions read_options, snapshot_read_options;
+ OptimisticTransactionOptions txn_options;
+ std::string value;
+
+ ColumnFamilyHandle *cfa, *cfb;
+ ColumnFamilyOptions cf_options;
+
+ // Create 2 new column families
+ ASSERT_OK(txn_db->CreateColumnFamily(cf_options, "CFA", &cfa));
+ ASSERT_OK(txn_db->CreateColumnFamily(cf_options, "CFB", &cfb));
+
+ delete cfa;
+ delete cfb;
+ delete txn_db;
+ txn_db = nullptr;
+
+ // open DB with three column families
+ std::vector<ColumnFamilyDescriptor> column_families;
+  // The default column family must always be opened.
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions()));
+ // open the new column families
+ column_families.push_back(
+ ColumnFamilyDescriptor("CFA", ColumnFamilyOptions()));
+ column_families.push_back(
+ ColumnFamilyDescriptor("CFB", ColumnFamilyOptions()));
+ std::vector<ColumnFamilyHandle*> handles;
+ ASSERT_OK(OptimisticTransactionDB::Open(options, dbname, column_families,
+ &handles, &txn_db));
+ assert(txn_db != nullptr);
+ ASSERT_NE(txn_db, nullptr);
+
+ Transaction* txn = txn_db->BeginTransaction(write_options);
+ ASSERT_NE(txn, nullptr);
+
+ txn->SetSnapshot();
+ snapshot_read_options.snapshot = txn->GetSnapshot();
+
+ txn_options.set_snapshot = true;
+ Transaction* txn2 = txn_db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn2);
+
+ // Write some data to the db
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("foo", "foo"));
+ ASSERT_OK(batch.Put(handles[1], "AAA", "bar"));
+ ASSERT_OK(batch.Put(handles[1], "AAAZZZ", "bar"));
+ ASSERT_OK(txn_db->Write(write_options, &batch));
+ ASSERT_OK(txn_db->Delete(write_options, handles[1], "AAAZZZ"));
+
+  // These keys do not conflict with existing writes since they're in
+  // different column families.
+ ASSERT_OK(txn->Delete("AAA"));
+ Status s =
+ txn->GetForUpdate(snapshot_read_options, handles[1], "foo", &value);
+ ASSERT_TRUE(s.IsNotFound());
+ Slice key_slice("AAAZZZ");
+ Slice value_slices[2] = {Slice("bar"), Slice("bar")};
+ ASSERT_OK(txn->Put(handles[2], SliceParts(&key_slice, 1),
+ SliceParts(value_slices, 2)));
+
+ ASSERT_EQ(3, txn->GetNumKeys());
+
+ // Txn should commit
+ ASSERT_OK(txn->Commit());
+ s = txn_db->Get(read_options, "AAA", &value);
+ ASSERT_TRUE(s.IsNotFound());
+ s = txn_db->Get(read_options, handles[2], "AAAZZZ", &value);
+ ASSERT_EQ(value, "barbar");
+
+ Slice key_slices[3] = {Slice("AAA"), Slice("ZZ"), Slice("Z")};
+ Slice value_slice("barbarbar");
+ // This write will cause a conflict with the earlier batch write
+ ASSERT_OK(txn2->Put(handles[1], SliceParts(key_slices, 3),
+ SliceParts(&value_slice, 1)));
+
+ ASSERT_OK(txn2->Delete(handles[2], "XXX"));
+ ASSERT_OK(txn2->Delete(handles[1], "XXX"));
+ s = txn2->GetForUpdate(snapshot_read_options, handles[1], "AAA", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ // Verify txn did not commit
+ s = txn2->Commit();
+ ASSERT_TRUE(s.IsBusy());
+ s = txn_db->Get(read_options, handles[1], "AAAZZZ", &value);
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ(value, "barbar");
+
+ delete txn;
+ delete txn2;
+
+ txn = txn_db->BeginTransaction(write_options, txn_options);
+ snapshot_read_options.snapshot = txn->GetSnapshot();
+
+ txn2 = txn_db->BeginTransaction(write_options, txn_options);
+ ASSERT_NE(txn, nullptr);
+
+ std::vector<ColumnFamilyHandle*> multiget_cfh = {handles[1], handles[2],
+ handles[0], handles[2]};
+ std::vector<Slice> multiget_keys = {"AAA", "AAAZZZ", "foo", "foo"};
+ std::vector<std::string> values(4);
+
+ std::vector<Status> results = txn->MultiGetForUpdate(
+ snapshot_read_options, multiget_cfh, multiget_keys, &values);
+ ASSERT_OK(results[0]);
+ ASSERT_OK(results[1]);
+ ASSERT_OK(results[2]);
+ ASSERT_TRUE(results[3].IsNotFound());
+ ASSERT_EQ(values[0], "bar");
+ ASSERT_EQ(values[1], "barbar");
+ ASSERT_EQ(values[2], "foo");
+
+ ASSERT_OK(txn->Delete(handles[2], "ZZZ"));
+ ASSERT_OK(txn->Put(handles[2], "ZZZ", "YYY"));
+ ASSERT_OK(txn->Put(handles[2], "ZZZ", "YYYY"));
+ ASSERT_OK(txn->Delete(handles[2], "ZZZ"));
+ ASSERT_OK(txn->Put(handles[2], "AAAZZZ", "barbarbar"));
+
+ ASSERT_EQ(5, txn->GetNumKeys());
+
+ // Txn should commit
+ ASSERT_OK(txn->Commit());
+ s = txn_db->Get(read_options, handles[2], "ZZZ", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ // Put a key which will conflict with the next txn using the previous snapshot
+ ASSERT_OK(txn_db->Put(write_options, handles[2], "foo", "000"));
+
+ results = txn2->MultiGetForUpdate(snapshot_read_options, multiget_cfh,
+ multiget_keys, &values);
+ ASSERT_OK(results[0]);
+ ASSERT_OK(results[1]);
+ ASSERT_OK(results[2]);
+ ASSERT_TRUE(results[3].IsNotFound());
+ ASSERT_EQ(values[0], "bar");
+ ASSERT_EQ(values[1], "barbar");
+ ASSERT_EQ(values[2], "foo");
+
+ // Verify Txn Did not Commit
+ s = txn2->Commit();
+ ASSERT_TRUE(s.IsBusy());
+
+ s = txn_db->DropColumnFamily(handles[1]);
+ ASSERT_OK(s);
+ s = txn_db->DropColumnFamily(handles[2]);
+ ASSERT_OK(s);
+
+ delete txn;
+ delete txn2;
+
+ for (auto handle : handles) {
+ delete handle;
+ }
+}
+
+TEST_P(OptimisticTransactionTest, EmptyTest) {
+ WriteOptions write_options;
+ ReadOptions read_options;
+ std::string value;
+
+ ASSERT_OK(txn_db->Put(write_options, "aaa", "aaa"));
+
+ Transaction* txn = txn_db->BeginTransaction(write_options);
+ ASSERT_OK(txn->Commit());
+ delete txn;
+
+ txn = txn_db->BeginTransaction(write_options);
+ ASSERT_OK(txn->Rollback());
+ delete txn;
+
+ txn = txn_db->BeginTransaction(write_options);
+ ASSERT_OK(txn->GetForUpdate(read_options, "aaa", &value));
+ ASSERT_EQ(value, "aaa");
+
+ ASSERT_OK(txn->Commit());
+ delete txn;
+
+ txn = txn_db->BeginTransaction(write_options);
+ txn->SetSnapshot();
+ ASSERT_OK(txn->GetForUpdate(read_options, "aaa", &value));
+ ASSERT_EQ(value, "aaa");
+
+ ASSERT_OK(txn_db->Put(write_options, "aaa", "xxx"));
+ Status s = txn->Commit();
+ ASSERT_TRUE(s.IsBusy());
+ delete txn;
+}
+
+TEST_P(OptimisticTransactionTest, PredicateManyPreceders) {
+ WriteOptions write_options;
+ ReadOptions read_options1, read_options2;
+ OptimisticTransactionOptions txn_options;
+ std::string value;
+
+ txn_options.set_snapshot = true;
+ Transaction* txn1 = txn_db->BeginTransaction(write_options, txn_options);
+ read_options1.snapshot = txn1->GetSnapshot();
+
+ Transaction* txn2 = txn_db->BeginTransaction(write_options);
+ txn2->SetSnapshot();
+ read_options2.snapshot = txn2->GetSnapshot();
+
+ std::vector<Slice> multiget_keys = {"1", "2", "3"};
+ std::vector<std::string> multiget_values;
+
+ std::vector<Status> results =
+ txn1->MultiGetForUpdate(read_options1, multiget_keys, &multiget_values);
+ ASSERT_TRUE(results[0].IsNotFound());
+ ASSERT_TRUE(results[1].IsNotFound());
+ ASSERT_TRUE(results[2].IsNotFound());
+
+ ASSERT_OK(txn2->Put("2", "x"));
+
+ ASSERT_OK(txn2->Commit());
+
+ multiget_values.clear();
+ results =
+ txn1->MultiGetForUpdate(read_options1, multiget_keys, &multiget_values);
+ ASSERT_TRUE(results[0].IsNotFound());
+ ASSERT_TRUE(results[1].IsNotFound());
+ ASSERT_TRUE(results[2].IsNotFound());
+
+ // should not commit since txn2 wrote a key txn has read
+ Status s = txn1->Commit();
+ ASSERT_TRUE(s.IsBusy());
+
+ delete txn1;
+ delete txn2;
+
+ txn1 = txn_db->BeginTransaction(write_options, txn_options);
+ read_options1.snapshot = txn1->GetSnapshot();
+
+ txn2 = txn_db->BeginTransaction(write_options, txn_options);
+ read_options2.snapshot = txn2->GetSnapshot();
+
+ ASSERT_OK(txn1->Put("4", "x"));
+
+ ASSERT_OK(txn2->Delete("4"));
+
+ // txn1 can commit since txn2's delete hasn't happened yet (it's just batched)
+ ASSERT_OK(txn1->Commit());
+
+ s = txn2->GetForUpdate(read_options2, "4", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ // txn2 cannot commit since txn1 changed "4"
+ s = txn2->Commit();
+ ASSERT_TRUE(s.IsBusy());
+
+ delete txn1;
+ delete txn2;
+}
+
+TEST_P(OptimisticTransactionTest, LostUpdate) {
+ WriteOptions write_options;
+ ReadOptions read_options, read_options1, read_options2;
+ OptimisticTransactionOptions txn_options;
+ std::string value;
+
+ // Test 2 transactions writing to the same key in multiple orders and
+ // with/without snapshots
+
+ Transaction* txn1 = txn_db->BeginTransaction(write_options);
+ Transaction* txn2 = txn_db->BeginTransaction(write_options);
+
+ ASSERT_OK(txn1->Put("1", "1"));
+ ASSERT_OK(txn2->Put("1", "2"));
+
+ ASSERT_OK(txn1->Commit());
+
+ Status s = txn2->Commit();
+ ASSERT_TRUE(s.IsBusy());
+
+ delete txn1;
+ delete txn2;
+
+ txn_options.set_snapshot = true;
+ txn1 = txn_db->BeginTransaction(write_options, txn_options);
+ read_options1.snapshot = txn1->GetSnapshot();
+
+ txn2 = txn_db->BeginTransaction(write_options, txn_options);
+ read_options2.snapshot = txn2->GetSnapshot();
+
+ ASSERT_OK(txn1->Put("1", "3"));
+ ASSERT_OK(txn2->Put("1", "4"));
+
+ ASSERT_OK(txn1->Commit());
+
+ s = txn2->Commit();
+ ASSERT_TRUE(s.IsBusy());
+
+ delete txn1;
+ delete txn2;
+
+ txn1 = txn_db->BeginTransaction(write_options, txn_options);
+ read_options1.snapshot = txn1->GetSnapshot();
+
+ txn2 = txn_db->BeginTransaction(write_options, txn_options);
+ read_options2.snapshot = txn2->GetSnapshot();
+
+ ASSERT_OK(txn1->Put("1", "5"));
+ ASSERT_OK(txn1->Commit());
+
+ ASSERT_OK(txn2->Put("1", "6"));
+ s = txn2->Commit();
+ ASSERT_TRUE(s.IsBusy());
+
+ delete txn1;
+ delete txn2;
+
+ txn1 = txn_db->BeginTransaction(write_options, txn_options);
+ read_options1.snapshot = txn1->GetSnapshot();
+
+ txn2 = txn_db->BeginTransaction(write_options, txn_options);
+ read_options2.snapshot = txn2->GetSnapshot();
+
+ ASSERT_OK(txn1->Put("1", "5"));
+ ASSERT_OK(txn1->Commit());
+
+ txn2->SetSnapshot();
+ ASSERT_OK(txn2->Put("1", "6"));
+ ASSERT_OK(txn2->Commit());
+
+ delete txn1;
+ delete txn2;
+
+ txn1 = txn_db->BeginTransaction(write_options);
+ txn2 = txn_db->BeginTransaction(write_options);
+
+ ASSERT_OK(txn1->Put("1", "7"));
+ ASSERT_OK(txn1->Commit());
+
+ ASSERT_OK(txn2->Put("1", "8"));
+ ASSERT_OK(txn2->Commit());
+
+ delete txn1;
+ delete txn2;
+
+ ASSERT_OK(txn_db->Get(read_options, "1", &value));
+ ASSERT_EQ(value, "8");
+}
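+
+// The last pattern above is worth calling out: taking a fresh snapshot after
+// a conflicting commit lets the second transaction proceed (sketch):
+//
+//   txn2->SetSnapshot();        // observe the state after txn1's commit
+//   txn2->Put("1", "6");        // tracked against the newer snapshot
+//   Status s = txn2->Commit();  // succeeds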
+
+TEST_P(OptimisticTransactionTest, UntrackedWrites) {
+ WriteOptions write_options;
+ ReadOptions read_options;
+ std::string value;
+ Status s;
+
+ // Verify transaction rollback works for untracked keys.
+ Transaction* txn = txn_db->BeginTransaction(write_options);
+ ASSERT_OK(txn->PutUntracked("untracked", "0"));
+ ASSERT_OK(txn->Rollback());
+ s = txn_db->Get(read_options, "untracked", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ delete txn;
+ txn = txn_db->BeginTransaction(write_options);
+
+ ASSERT_OK(txn->Put("tracked", "1"));
+ ASSERT_OK(txn->PutUntracked("untracked", "1"));
+ ASSERT_OK(txn->MergeUntracked("untracked", "2"));
+ ASSERT_OK(txn->DeleteUntracked("untracked"));
+
+ // Write to the untracked key outside of the transaction and verify
+ // it doesn't prevent the transaction from committing.
+ ASSERT_OK(txn_db->Put(write_options, "untracked", "x"));
+
+ ASSERT_OK(txn->Commit());
+
+ s = txn_db->Get(read_options, "untracked", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ delete txn;
+ txn = txn_db->BeginTransaction(write_options);
+
+ ASSERT_OK(txn->Put("tracked", "10"));
+ ASSERT_OK(txn->PutUntracked("untracked", "A"));
+
+ // Write to tracked key outside of the transaction and verify that the
+ // untracked keys are not written when the commit fails.
+ ASSERT_OK(txn_db->Delete(write_options, "tracked"));
+
+ s = txn->Commit();
+ ASSERT_TRUE(s.IsBusy());
+
+ s = txn_db->Get(read_options, "untracked", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ delete txn;
+}
+
+TEST_P(OptimisticTransactionTest, IteratorTest) {
+ WriteOptions write_options;
+ ReadOptions read_options, snapshot_read_options;
+ OptimisticTransactionOptions txn_options;
+ std::string value;
+
+ // Write some keys to the db
+ ASSERT_OK(txn_db->Put(write_options, "A", "a"));
+ ASSERT_OK(txn_db->Put(write_options, "G", "g"));
+ ASSERT_OK(txn_db->Put(write_options, "F", "f"));
+ ASSERT_OK(txn_db->Put(write_options, "C", "c"));
+ ASSERT_OK(txn_db->Put(write_options, "D", "d"));
+
+ Transaction* txn = txn_db->BeginTransaction(write_options);
+ ASSERT_NE(txn, nullptr);
+
+ // Write some keys in a txn
+ ASSERT_OK(txn->Put("B", "b"));
+ ASSERT_OK(txn->Put("H", "h"));
+ ASSERT_OK(txn->Delete("D"));
+ ASSERT_OK(txn->Put("E", "e"));
+
+ txn->SetSnapshot();
+ const Snapshot* snapshot = txn->GetSnapshot();
+
+ // Write some keys to the db after the snapshot
+ ASSERT_OK(txn_db->Put(write_options, "BB", "xx"));
+ ASSERT_OK(txn_db->Put(write_options, "C", "xx"));
+
+ read_options.snapshot = snapshot;
+ Iterator* iter = txn->GetIterator(read_options);
+ ASSERT_OK(iter->status());
+ iter->SeekToFirst();
+
+ // Read all keys via iter and lock them all
+ std::string results[] = {"a", "b", "c", "e", "f", "g", "h"};
+ for (int i = 0; i < 7; i++) {
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(results[i], iter->value().ToString());
+
+ ASSERT_OK(txn->GetForUpdate(read_options, iter->key(), nullptr));
+
+ iter->Next();
+ }
+ ASSERT_FALSE(iter->Valid());
+
+ iter->Seek("G");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("g", iter->value().ToString());
+
+ iter->Prev();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("f", iter->value().ToString());
+
+ iter->Seek("D");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("e", iter->value().ToString());
+
+ iter->Seek("C");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("c", iter->value().ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("e", iter->value().ToString());
+
+ iter->Seek("");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("a", iter->value().ToString());
+
+ iter->Seek("X");
+ ASSERT_OK(iter->status());
+ ASSERT_FALSE(iter->Valid());
+
+ iter->SeekToLast();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("h", iter->value().ToString());
+
+ // key "C" was modified in the db after txn's snapshot. txn will not commit.
+ Status s = txn->Commit();
+ ASSERT_TRUE(s.IsBusy());
+
+ delete iter;
+ delete txn;
+}
+
+TEST_P(OptimisticTransactionTest, DeleteRangeSupportTest) {
+ // `OptimisticTransactionDB` does not allow range deletion in any API.
+ ASSERT_TRUE(
+ txn_db
+ ->DeleteRange(WriteOptions(), txn_db->DefaultColumnFamily(), "a", "b")
+ .IsNotSupported());
+ WriteBatch wb;
+ ASSERT_OK(wb.DeleteRange("a", "b"));
+ ASSERT_NOK(txn_db->Write(WriteOptions(), &wb));
+}
+
+TEST_P(OptimisticTransactionTest, SavepointTest) {
+ WriteOptions write_options;
+ ReadOptions read_options, snapshot_read_options;
+ OptimisticTransactionOptions txn_options;
+ std::string value;
+
+ Transaction* txn = txn_db->BeginTransaction(write_options);
+ ASSERT_NE(txn, nullptr);
+
+ Status s = txn->RollbackToSavePoint();
+ ASSERT_TRUE(s.IsNotFound());
+
+ txn->SetSavePoint(); // 1
+
+ ASSERT_OK(txn->RollbackToSavePoint()); // Rollback to beginning of txn
+ s = txn->RollbackToSavePoint();
+ ASSERT_TRUE(s.IsNotFound());
+
+ ASSERT_OK(txn->Put("B", "b"));
+
+ ASSERT_OK(txn->Commit());
+
+ ASSERT_OK(txn_db->Get(read_options, "B", &value));
+ ASSERT_EQ("b", value);
+
+ delete txn;
+ txn = txn_db->BeginTransaction(write_options);
+ ASSERT_NE(txn, nullptr);
+
+ ASSERT_OK(txn->Put("A", "a"));
+ ASSERT_OK(txn->Put("B", "bb"));
+ ASSERT_OK(txn->Put("C", "c"));
+
+ txn->SetSavePoint(); // 2
+
+ ASSERT_OK(txn->Delete("B"));
+ ASSERT_OK(txn->Put("C", "cc"));
+ ASSERT_OK(txn->Put("D", "d"));
+
+ ASSERT_OK(txn->RollbackToSavePoint()); // Rollback to 2
+
+ ASSERT_OK(txn->Get(read_options, "A", &value));
+ ASSERT_EQ("a", value);
+ ASSERT_OK(txn->Get(read_options, "B", &value));
+ ASSERT_EQ("bb", value);
+ ASSERT_OK(txn->Get(read_options, "C", &value));
+ ASSERT_EQ("c", value);
+ s = txn->Get(read_options, "D", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ ASSERT_OK(txn->Put("A", "a"));
+ ASSERT_OK(txn->Put("E", "e"));
+
+ // Rollback to beginning of txn
+ s = txn->RollbackToSavePoint();
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_OK(txn->Rollback());
+
+ s = txn->Get(read_options, "A", &value);
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_OK(txn->Get(read_options, "B", &value));
+ ASSERT_EQ("b", value);
+ s = txn->Get(read_options, "D", &value);
+ ASSERT_TRUE(s.IsNotFound());
+ s = txn->Get(read_options, "D", &value);
+ ASSERT_TRUE(s.IsNotFound());
+ s = txn->Get(read_options, "E", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ ASSERT_OK(txn->Put("A", "aa"));
+ ASSERT_OK(txn->Put("F", "f"));
+
+ txn->SetSavePoint(); // 3
+ txn->SetSavePoint(); // 4
+
+ ASSERT_OK(txn->Put("G", "g"));
+ ASSERT_OK(txn->Delete("F"));
+ ASSERT_OK(txn->Delete("B"));
+
+ ASSERT_OK(txn->Get(read_options, "A", &value));
+ ASSERT_EQ("aa", value);
+
+ s = txn->Get(read_options, "F", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = txn->Get(read_options, "B", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ ASSERT_OK(txn->RollbackToSavePoint()); // Rollback to 3
+
+ ASSERT_OK(txn->Get(read_options, "F", &value));
+ ASSERT_EQ("f", value);
+
+ s = txn->Get(read_options, "G", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ ASSERT_OK(txn->Commit());
+
+ ASSERT_OK(txn_db->Get(read_options, "F", &value));
+ ASSERT_EQ("f", value);
+
+ s = txn_db->Get(read_options, "G", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ ASSERT_OK(txn_db->Get(read_options, "A", &value));
+ ASSERT_EQ("aa", value);
+
+ ASSERT_OK(txn_db->Get(read_options, "B", &value));
+ ASSERT_EQ("b", value);
+
+ s = txn_db->Get(read_options, "C", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = txn_db->Get(read_options, "D", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = txn_db->Get(read_options, "E", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ delete txn;
+}
+
+TEST_P(OptimisticTransactionTest, UndoGetForUpdateTest) {
+ WriteOptions write_options;
+ ReadOptions read_options, snapshot_read_options;
+ OptimisticTransactionOptions txn_options;
+ std::string value;
+
+ ASSERT_OK(txn_db->Put(write_options, "A", ""));
+
+ Transaction* txn1 = txn_db->BeginTransaction(write_options);
+ ASSERT_TRUE(txn1);
+
+ ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
+
+ txn1->UndoGetForUpdate("A");
+
+ Transaction* txn2 = txn_db->BeginTransaction(write_options);
+  ASSERT_OK(txn2->Put("A", "x"));
+ ASSERT_OK(txn2->Commit());
+ delete txn2;
+
+ // Verify that txn1 can commit since A isn't conflict checked
+ ASSERT_OK(txn1->Commit());
+ delete txn1;
+
+ txn1 = txn_db->BeginTransaction(write_options);
+ ASSERT_OK(txn1->Put("A", "a"));
+
+ ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
+
+ txn1->UndoGetForUpdate("A");
+
+ txn2 = txn_db->BeginTransaction(write_options);
+ ASSERT_OK(txn2->Put("A", "x"));
+ ASSERT_OK(txn2->Commit());
+ delete txn2;
+
+ // Verify that txn1 cannot commit since A will still be conflict checked
+ Status s = txn1->Commit();
+ ASSERT_TRUE(s.IsBusy());
+ delete txn1;
+
+ txn1 = txn_db->BeginTransaction(write_options);
+
+ ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
+ ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
+
+ txn1->UndoGetForUpdate("A");
+
+ txn2 = txn_db->BeginTransaction(write_options);
+ ASSERT_OK(txn2->Put("A", "x"));
+ ASSERT_OK(txn2->Commit());
+ delete txn2;
+
+ // Verify that txn1 cannot commit since A will still be conflict checked
+ s = txn1->Commit();
+ ASSERT_TRUE(s.IsBusy());
+ delete txn1;
+
+ txn1 = txn_db->BeginTransaction(write_options);
+
+ ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
+ ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
+
+ txn1->UndoGetForUpdate("A");
+ txn1->UndoGetForUpdate("A");
+
+ txn2 = txn_db->BeginTransaction(write_options);
+ ASSERT_OK(txn2->Put("A", "x"));
+ ASSERT_OK(txn2->Commit());
+ delete txn2;
+
+ // Verify that txn1 can commit since A isn't conflict checked
+ ASSERT_OK(txn1->Commit());
+ delete txn1;
+
+ txn1 = txn_db->BeginTransaction(write_options);
+
+ ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
+
+ txn1->SetSavePoint();
+ txn1->UndoGetForUpdate("A");
+
+ txn2 = txn_db->BeginTransaction(write_options);
+ ASSERT_OK(txn2->Put("A", "x"));
+ ASSERT_OK(txn2->Commit());
+ delete txn2;
+
+ // Verify that txn1 cannot commit since A will still be conflict checked
+ s = txn1->Commit();
+ ASSERT_TRUE(s.IsBusy());
+ delete txn1;
+
+ txn1 = txn_db->BeginTransaction(write_options);
+
+ ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
+
+ txn1->SetSavePoint();
+ ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
+ txn1->UndoGetForUpdate("A");
+
+ txn2 = txn_db->BeginTransaction(write_options);
+ ASSERT_OK(txn2->Put("A", "x"));
+ ASSERT_OK(txn2->Commit());
+ delete txn2;
+
+ // Verify that txn1 cannot commit since A will still be conflict checked
+ s = txn1->Commit();
+ ASSERT_TRUE(s.IsBusy());
+ delete txn1;
+
+ txn1 = txn_db->BeginTransaction(write_options);
+
+ ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
+
+ txn1->SetSavePoint();
+ ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
+ txn1->UndoGetForUpdate("A");
+
+ ASSERT_OK(txn1->RollbackToSavePoint());
+ txn1->UndoGetForUpdate("A");
+
+ txn2 = txn_db->BeginTransaction(write_options);
+ ASSERT_OK(txn2->Put("A", "x"));
+ ASSERT_OK(txn2->Commit());
+ delete txn2;
+
+ // Verify that txn1 can commit since A isn't conflict checked
+ ASSERT_OK(txn1->Commit());
+ delete txn1;
+}
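+
+// As the cases above show, UndoGetForUpdate() removes a key from conflict
+// checking only once it has balanced out every GetForUpdate() issued at the
+// current savepoint level, and it never affects keys written by the
+// transaction (sketch):
+//
+//   ASSERT_OK(txn->GetForUpdate(ReadOptions(), "A", &value));  // "A" tracked
+//   txn->UndoGetForUpdate("A");  // "A" is no longer conflict checked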
+
+namespace {
+Status OptimisticTransactionStressTestInserter(OptimisticTransactionDB* db,
+ const size_t num_transactions,
+ const size_t num_sets,
+ const size_t num_keys_per_set) {
+ size_t seed = std::hash<std::thread::id>()(std::this_thread::get_id());
+ Random64 _rand(seed);
+ WriteOptions write_options;
+ ReadOptions read_options;
+ OptimisticTransactionOptions txn_options;
+ txn_options.set_snapshot = true;
+
+ RandomTransactionInserter inserter(&_rand, write_options, read_options,
+ num_keys_per_set,
+ static_cast<uint16_t>(num_sets));
+
+ for (size_t t = 0; t < num_transactions; t++) {
+ bool success = inserter.OptimisticTransactionDBInsert(db, txn_options);
+ if (!success) {
+ // unexpected failure
+ return inserter.GetLastStatus();
+ }
+ }
+
+ inserter.GetLastStatus().PermitUncheckedError();
+
+ // Make sure at least some of the transactions succeeded. It's ok if
+ // some failed due to write-conflicts.
+ if (inserter.GetFailureCount() > num_transactions / 2) {
+ return Status::TryAgain("Too many transactions failed! " +
+ std::to_string(inserter.GetFailureCount()) + " / " +
+ std::to_string(num_transactions));
+ }
+
+ return Status::OK();
+}
+} // namespace
+
+TEST_P(OptimisticTransactionTest, OptimisticTransactionStressTest) {
+ const size_t num_threads = 4;
+ const size_t num_transactions_per_thread = 10000;
+ const size_t num_sets = 3;
+ const size_t num_keys_per_set = 100;
+ // Setting the key-space to be 100 keys should cause enough write-conflicts
+ // to make this test interesting.
+
+ std::vector<port::Thread> threads;
+
+ std::function<void()> call_inserter = [&] {
+ ASSERT_OK(OptimisticTransactionStressTestInserter(
+ txn_db, num_transactions_per_thread, num_sets, num_keys_per_set));
+ };
+
+ // Create N threads that use RandomTransactionInserter to write
+ // many transactions.
+ for (uint32_t i = 0; i < num_threads; i++) {
+ threads.emplace_back(call_inserter);
+ }
+
+ // Wait for all threads to run
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ // Verify that data is consistent
+ Status s = RandomTransactionInserter::Verify(txn_db, num_sets);
+ ASSERT_OK(s);
+}
+
+TEST_P(OptimisticTransactionTest, SequenceNumberAfterRecoverTest) {
+ WriteOptions write_options;
+ OptimisticTransactionOptions transaction_options;
+
+ Transaction* transaction(
+ txn_db->BeginTransaction(write_options, transaction_options));
+ Status s = transaction->Put("foo", "val");
+ ASSERT_OK(s);
+ s = transaction->Put("foo2", "val");
+ ASSERT_OK(s);
+ s = transaction->Put("foo3", "val");
+ ASSERT_OK(s);
+ s = transaction->Commit();
+ ASSERT_OK(s);
+ delete transaction;
+
+ Reopen();
+ transaction = txn_db->BeginTransaction(write_options, transaction_options);
+ s = transaction->Put("bar", "val");
+ ASSERT_OK(s);
+ s = transaction->Put("bar2", "val");
+ ASSERT_OK(s);
+ s = transaction->Commit();
+ ASSERT_OK(s);
+
+ delete transaction;
+}
+
+TEST_P(OptimisticTransactionTest, TimestampedSnapshotMissingCommitTs) {
+ std::unique_ptr<Transaction> txn(txn_db->BeginTransaction(WriteOptions()));
+ ASSERT_OK(txn->Put("a", "v"));
+ Status s = txn->CommitAndTryCreateSnapshot();
+ ASSERT_TRUE(s.IsInvalidArgument());
+}
+
+TEST_P(OptimisticTransactionTest, TimestampedSnapshotSetCommitTs) {
+ std::unique_ptr<Transaction> txn(txn_db->BeginTransaction(WriteOptions()));
+ ASSERT_OK(txn->Put("a", "v"));
+ std::shared_ptr<const Snapshot> snapshot;
+ Status s = txn->CommitAndTryCreateSnapshot(nullptr, /*ts=*/100, &snapshot);
+ ASSERT_TRUE(s.IsNotSupported());
+}
+
+INSTANTIATE_TEST_CASE_P(
+ InstanceOccGroup, OptimisticTransactionTest,
+ testing::Values(OccValidationPolicy::kValidateSerial,
+ OccValidationPolicy::kValidateParallel));
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(
+ stderr,
+ "SKIPPED as optimistic_transaction is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/pessimistic_transaction.cc b/src/rocksdb/utilities/transactions/pessimistic_transaction.cc
new file mode 100644
index 000000000..cb8fd3bb6
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/pessimistic_transaction.cc
@@ -0,0 +1,1175 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/pessimistic_transaction.h"
+
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "logging/logging.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/string_util.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+#include "utilities/transactions/transaction_util.h"
+#include "utilities/write_batch_with_index/write_batch_with_index_internal.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct WriteOptions;
+
+std::atomic<TransactionID> PessimisticTransaction::txn_id_counter_(1);
+
+TransactionID PessimisticTransaction::GenTxnID() {
+ return txn_id_counter_.fetch_add(1);
+}
+
+PessimisticTransaction::PessimisticTransaction(
+ TransactionDB* txn_db, const WriteOptions& write_options,
+ const TransactionOptions& txn_options, const bool init)
+ : TransactionBaseImpl(
+ txn_db->GetRootDB(), write_options,
+ static_cast_with_check<PessimisticTransactionDB>(txn_db)
+ ->GetLockTrackerFactory()),
+ txn_db_impl_(nullptr),
+ expiration_time_(0),
+ txn_id_(0),
+ waiting_cf_id_(0),
+ waiting_key_(nullptr),
+ lock_timeout_(0),
+ deadlock_detect_(false),
+ deadlock_detect_depth_(0),
+ skip_concurrency_control_(false) {
+ txn_db_impl_ = static_cast_with_check<PessimisticTransactionDB>(txn_db);
+ db_impl_ = static_cast_with_check<DBImpl>(db_);
+ if (init) {
+ Initialize(txn_options);
+ }
+}
+
+void PessimisticTransaction::Initialize(const TransactionOptions& txn_options) {
+ // Range lock manager uses address of transaction object as TXNID
+ const TransactionDBOptions& db_options = txn_db_impl_->GetTxnDBOptions();
+ if (db_options.lock_mgr_handle &&
+ db_options.lock_mgr_handle->getLockManager()->IsRangeLockSupported()) {
+ txn_id_ = reinterpret_cast<TransactionID>(this);
+ } else {
+ txn_id_ = GenTxnID();
+ }
+
+ txn_state_ = STARTED;
+
+ deadlock_detect_ = txn_options.deadlock_detect;
+ deadlock_detect_depth_ = txn_options.deadlock_detect_depth;
+ write_batch_.SetMaxBytes(txn_options.max_write_batch_size);
+ skip_concurrency_control_ = txn_options.skip_concurrency_control;
+
+ lock_timeout_ = txn_options.lock_timeout * 1000;
+ if (lock_timeout_ < 0) {
+ // Lock timeout not set, use default
+ lock_timeout_ =
+ txn_db_impl_->GetTxnDBOptions().transaction_lock_timeout * 1000;
+ }
+
+ if (txn_options.expiration >= 0) {
+ expiration_time_ = start_time_ + txn_options.expiration * 1000;
+ } else {
+ expiration_time_ = 0;
+ }
+
+ if (txn_options.set_snapshot) {
+ SetSnapshot();
+ }
+
+ if (expiration_time_ > 0) {
+ txn_db_impl_->InsertExpirableTransaction(txn_id_, this);
+ }
+ use_only_the_last_commit_time_batch_for_recovery_ =
+ txn_options.use_only_the_last_commit_time_batch_for_recovery;
+ skip_prepare_ = txn_options.skip_prepare;
+
+ read_timestamp_ = kMaxTxnTimestamp;
+ commit_timestamp_ = kMaxTxnTimestamp;
+}
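+
+// The two timeouts handled above are specified in milliseconds in
+// TransactionOptions and stored internally in microseconds, hence the
+// `* 1000`. A sketch (the values are examples only):
+//
+//   TransactionOptions topts;
+//   topts.lock_timeout = 100;  // wait up to 100 ms for a key lock
+//   topts.expiration = 500;    // the transaction expires 500 ms after start
+//   topts.set_snapshot = true;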
+
+PessimisticTransaction::~PessimisticTransaction() {
+ txn_db_impl_->UnLock(this, *tracked_locks_);
+ if (expiration_time_ > 0) {
+ txn_db_impl_->RemoveExpirableTransaction(txn_id_);
+ }
+ if (!name_.empty() && txn_state_ != COMMITTED) {
+ txn_db_impl_->UnregisterTransaction(this);
+ }
+}
+
+void PessimisticTransaction::Clear() {
+ txn_db_impl_->UnLock(this, *tracked_locks_);
+ TransactionBaseImpl::Clear();
+}
+
+void PessimisticTransaction::Reinitialize(
+ TransactionDB* txn_db, const WriteOptions& write_options,
+ const TransactionOptions& txn_options) {
+ if (!name_.empty() && txn_state_ != COMMITTED) {
+ txn_db_impl_->UnregisterTransaction(this);
+ }
+ TransactionBaseImpl::Reinitialize(txn_db->GetRootDB(), write_options);
+ Initialize(txn_options);
+}
+
+bool PessimisticTransaction::IsExpired() const {
+ if (expiration_time_ > 0) {
+ if (dbimpl_->GetSystemClock()->NowMicros() >= expiration_time_) {
+ // Transaction is expired.
+ return true;
+ }
+ }
+
+ return false;
+}
+
+WriteCommittedTxn::WriteCommittedTxn(TransactionDB* txn_db,
+ const WriteOptions& write_options,
+ const TransactionOptions& txn_options)
+ : PessimisticTransaction(txn_db, write_options, txn_options) {}
+
+Status WriteCommittedTxn::GetForUpdate(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key, std::string* value,
+ bool exclusive, const bool do_validate) {
+ return GetForUpdateImpl(read_options, column_family, key, value, exclusive,
+ do_validate);
+}
+
+Status WriteCommittedTxn::GetForUpdate(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key,
+ PinnableSlice* pinnable_val,
+ bool exclusive, const bool do_validate) {
+ return GetForUpdateImpl(read_options, column_family, key, pinnable_val,
+ exclusive, do_validate);
+}
+
+template <typename TValue>
+inline Status WriteCommittedTxn::GetForUpdateImpl(
+ const ReadOptions& read_options, ColumnFamilyHandle* column_family,
+ const Slice& key, TValue* value, bool exclusive, const bool do_validate) {
+ column_family =
+ column_family ? column_family : db_impl_->DefaultColumnFamily();
+ assert(column_family);
+ if (!read_options.timestamp) {
+ const Comparator* const ucmp = column_family->GetComparator();
+ assert(ucmp);
+ size_t ts_sz = ucmp->timestamp_size();
+ if (0 == ts_sz) {
+ return TransactionBaseImpl::GetForUpdate(read_options, column_family, key,
+ value, exclusive, do_validate);
+ }
+ } else {
+ Status s = db_impl_->FailIfTsMismatchCf(
+ column_family, *(read_options.timestamp), /*ts_for_read=*/true);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ if (!do_validate) {
+ return Status::InvalidArgument(
+ "If do_validate is false then GetForUpdate with read_timestamp is not "
+ "defined.");
+ } else if (kMaxTxnTimestamp == read_timestamp_) {
+ return Status::InvalidArgument("read_timestamp must be set for validation");
+ }
+
+ if (!read_options.timestamp) {
+ ReadOptions read_opts_copy = read_options;
+ char ts_buf[sizeof(kMaxTxnTimestamp)];
+ EncodeFixed64(ts_buf, read_timestamp_);
+ Slice ts(ts_buf, sizeof(ts_buf));
+ read_opts_copy.timestamp = &ts;
+ return TransactionBaseImpl::GetForUpdate(read_opts_copy, column_family, key,
+ value, exclusive, do_validate);
+ }
+ assert(read_options.timestamp);
+ const char* const ts_buf = read_options.timestamp->data();
+ assert(read_options.timestamp->size() == sizeof(kMaxTxnTimestamp));
+ TxnTimestamp ts = DecodeFixed64(ts_buf);
+ if (ts != read_timestamp_) {
+ return Status::InvalidArgument("Must read from the same read_timestamp");
+ }
+ return TransactionBaseImpl::GetForUpdate(read_options, column_family, key,
+ value, exclusive, do_validate);
+}
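+
+// For column families with user-defined timestamps, the read timestamp used
+// above travels through ReadOptions::timestamp as a fixed-width 64-bit
+// encoding (sketch; `read_ts` is a placeholder for the transaction's
+// read_timestamp_):
+//
+//   char ts_buf[sizeof(TxnTimestamp)];
+//   EncodeFixed64(ts_buf, read_ts);
+//   Slice ts(ts_buf, sizeof(ts_buf));
+//   ReadOptions ro;
+//   ro.timestamp = &ts;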
+
+Status WriteCommittedTxn::Put(ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value,
+ const bool assume_tracked) {
+ const bool do_validate = !assume_tracked;
+ return Operate(column_family, key, do_validate, assume_tracked,
+ [column_family, &key, &value, this]() {
+ Status s =
+ GetBatchForWrite()->Put(column_family, key, value);
+ if (s.ok()) {
+ ++num_puts_;
+ }
+ return s;
+ });
+}
+
+Status WriteCommittedTxn::Put(ColumnFamilyHandle* column_family,
+ const SliceParts& key, const SliceParts& value,
+ const bool assume_tracked) {
+ const bool do_validate = !assume_tracked;
+ return Operate(column_family, key, do_validate, assume_tracked,
+ [column_family, &key, &value, this]() {
+ Status s =
+ GetBatchForWrite()->Put(column_family, key, value);
+ if (s.ok()) {
+ ++num_puts_;
+ }
+ return s;
+ });
+}
+
+Status WriteCommittedTxn::PutUntracked(ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value) {
+ return Operate(
+ column_family, key, /*do_validate=*/false,
+ /*assume_tracked=*/false, [column_family, &key, &value, this]() {
+ Status s = GetBatchForWrite()->Put(column_family, key, value);
+ if (s.ok()) {
+ ++num_puts_;
+ }
+ return s;
+ });
+}
+
+Status WriteCommittedTxn::PutUntracked(ColumnFamilyHandle* column_family,
+ const SliceParts& key,
+ const SliceParts& value) {
+ return Operate(
+ column_family, key, /*do_validate=*/false,
+ /*assume_tracked=*/false, [column_family, &key, &value, this]() {
+ Status s = GetBatchForWrite()->Put(column_family, key, value);
+ if (s.ok()) {
+ ++num_puts_;
+ }
+ return s;
+ });
+}
+
+Status WriteCommittedTxn::Delete(ColumnFamilyHandle* column_family,
+ const Slice& key, const bool assume_tracked) {
+ const bool do_validate = !assume_tracked;
+ return Operate(column_family, key, do_validate, assume_tracked,
+ [column_family, &key, this]() {
+ Status s = GetBatchForWrite()->Delete(column_family, key);
+ if (s.ok()) {
+ ++num_deletes_;
+ }
+ return s;
+ });
+}
+
+Status WriteCommittedTxn::Delete(ColumnFamilyHandle* column_family,
+ const SliceParts& key,
+ const bool assume_tracked) {
+ const bool do_validate = !assume_tracked;
+ return Operate(column_family, key, do_validate, assume_tracked,
+ [column_family, &key, this]() {
+ Status s = GetBatchForWrite()->Delete(column_family, key);
+ if (s.ok()) {
+ ++num_deletes_;
+ }
+ return s;
+ });
+}
+
+Status WriteCommittedTxn::DeleteUntracked(ColumnFamilyHandle* column_family,
+ const Slice& key) {
+ return Operate(column_family, key, /*do_validate=*/false,
+ /*assume_tracked=*/false, [column_family, &key, this]() {
+ Status s = GetBatchForWrite()->Delete(column_family, key);
+ if (s.ok()) {
+ ++num_deletes_;
+ }
+ return s;
+ });
+}
+
+Status WriteCommittedTxn::DeleteUntracked(ColumnFamilyHandle* column_family,
+ const SliceParts& key) {
+ return Operate(column_family, key, /*do_validate=*/false,
+ /*assume_tracked=*/false, [column_family, &key, this]() {
+ Status s = GetBatchForWrite()->Delete(column_family, key);
+ if (s.ok()) {
+ ++num_deletes_;
+ }
+ return s;
+ });
+}
+
+Status WriteCommittedTxn::SingleDelete(ColumnFamilyHandle* column_family,
+ const Slice& key,
+ const bool assume_tracked) {
+ const bool do_validate = !assume_tracked;
+ return Operate(column_family, key, do_validate, assume_tracked,
+ [column_family, &key, this]() {
+ Status s =
+ GetBatchForWrite()->SingleDelete(column_family, key);
+ if (s.ok()) {
+ ++num_deletes_;
+ }
+ return s;
+ });
+}
+
+Status WriteCommittedTxn::SingleDelete(ColumnFamilyHandle* column_family,
+ const SliceParts& key,
+ const bool assume_tracked) {
+ const bool do_validate = !assume_tracked;
+ return Operate(column_family, key, do_validate, assume_tracked,
+ [column_family, &key, this]() {
+ Status s =
+ GetBatchForWrite()->SingleDelete(column_family, key);
+ if (s.ok()) {
+ ++num_deletes_;
+ }
+ return s;
+ });
+}
+
+Status WriteCommittedTxn::SingleDeleteUntracked(
+ ColumnFamilyHandle* column_family, const Slice& key) {
+ return Operate(column_family, key, /*do_validate=*/false,
+ /*assume_tracked=*/false, [column_family, &key, this]() {
+ Status s =
+ GetBatchForWrite()->SingleDelete(column_family, key);
+ if (s.ok()) {
+ ++num_deletes_;
+ }
+ return s;
+ });
+}
+
+Status WriteCommittedTxn::Merge(ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value,
+ const bool assume_tracked) {
+ const bool do_validate = !assume_tracked;
+ return Operate(column_family, key, do_validate, assume_tracked,
+ [column_family, &key, &value, this]() {
+ Status s =
+ GetBatchForWrite()->Merge(column_family, key, value);
+ if (s.ok()) {
+ ++num_merges_;
+ }
+ return s;
+ });
+}
+
+template <typename TKey, typename TOperation>
+Status WriteCommittedTxn::Operate(ColumnFamilyHandle* column_family,
+ const TKey& key, const bool do_validate,
+ const bool assume_tracked,
+ TOperation&& operation) {
+ Status s;
+ if constexpr (std::is_same_v<Slice, TKey>) {
+ s = TryLock(column_family, key, /*read_only=*/false, /*exclusive=*/true,
+ do_validate, assume_tracked);
+ } else if constexpr (std::is_same_v<SliceParts, TKey>) {
+ std::string key_buf;
+ Slice contiguous_key(key, &key_buf);
+ s = TryLock(column_family, contiguous_key, /*read_only=*/false,
+ /*exclusive=*/true, do_validate, assume_tracked);
+ }
+ if (!s.ok()) {
+ return s;
+ }
+ column_family =
+ column_family ? column_family : db_impl_->DefaultColumnFamily();
+ assert(column_family);
+ const Comparator* const ucmp = column_family->GetComparator();
+ assert(ucmp);
+ size_t ts_sz = ucmp->timestamp_size();
+ if (ts_sz > 0) {
+ assert(ts_sz == sizeof(TxnTimestamp));
+ if (!IndexingEnabled()) {
+ cfs_with_ts_tracked_when_indexing_disabled_.insert(
+ column_family->GetID());
+ }
+ }
+ return operation();
+}
+
+Status WriteCommittedTxn::SetReadTimestampForValidation(TxnTimestamp ts) {
+ if (read_timestamp_ < kMaxTxnTimestamp && ts < read_timestamp_) {
+ return Status::InvalidArgument(
+ "Cannot decrease read timestamp for validation");
+ }
+ read_timestamp_ = ts;
+ return Status::OK();
+}
+
+Status WriteCommittedTxn::SetCommitTimestamp(TxnTimestamp ts) {
+ if (read_timestamp_ < kMaxTxnTimestamp && ts <= read_timestamp_) {
+ return Status::InvalidArgument(
+ "Cannot commit at timestamp smaller than or equal to read timestamp");
+ }
+ commit_timestamp_ = ts;
+ return Status::OK();
+}
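+
+// Illustrative ordering of the timestamp APIs defined above (sketch only;
+// `txn`, `read_opts`, `cfh`, the keys and the timestamp values are
+// placeholders, and error handling is omitted):
+//
+//   txn->SetReadTimestampForValidation(100);   // before any GetForUpdate()
+//   txn->GetForUpdate(read_opts, cfh, "key", &value);
+//   txn->Put(cfh, "key", "new_value");
+//   txn->SetCommitTimestamp(101);              // must be > read timestamp
+//   txn->Commit();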
+
+Status PessimisticTransaction::CommitBatch(WriteBatch* batch) {
+ if (batch && WriteBatchInternal::HasKeyWithTimestamp(*batch)) {
+ // CommitBatch() needs to lock the keys in the batch.
+ // However, the application also needs to specify the timestamps for the
+ // keys in the batch before calling this API.
+ // This means the timestamp order may violate the locking order, and thus
+ // violate the sequence number order for the same user key.
+ // Therefore, we disallow this operation for now.
+ return Status::NotSupported(
+ "Batch to commit includes timestamp assigned before locking");
+ }
+
+ std::unique_ptr<LockTracker> keys_to_unlock(lock_tracker_factory_.Create());
+ Status s = LockBatch(batch, keys_to_unlock.get());
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ bool can_commit = false;
+
+ if (IsExpired()) {
+ s = Status::Expired();
+ } else if (expiration_time_ > 0) {
+ TransactionState expected = STARTED;
+ can_commit = std::atomic_compare_exchange_strong(&txn_state_, &expected,
+ AWAITING_COMMIT);
+ } else if (txn_state_ == STARTED) {
+ // lock stealing is not a concern
+ can_commit = true;
+ }
+
+ if (can_commit) {
+ txn_state_.store(AWAITING_COMMIT);
+ s = CommitBatchInternal(batch);
+ if (s.ok()) {
+ txn_state_.store(COMMITTED);
+ }
+ } else if (txn_state_ == LOCKS_STOLEN) {
+ s = Status::Expired();
+ } else {
+ s = Status::InvalidArgument("Transaction is not in state for commit.");
+ }
+
+ txn_db_impl_->UnLock(this, *keys_to_unlock);
+
+ return s;
+}
+
+Status PessimisticTransaction::Prepare() {
+ if (name_.empty()) {
+ return Status::InvalidArgument(
+ "Cannot prepare a transaction that has not been named.");
+ }
+
+ if (IsExpired()) {
+ return Status::Expired();
+ }
+
+ Status s;
+ bool can_prepare = false;
+
+ if (expiration_time_ > 0) {
+ // must concern ourselves with expiration and/or lock stealing
+ // need to compare/exchange because locks could be stolen out from under us
+ TransactionState expected = STARTED;
+ can_prepare = std::atomic_compare_exchange_strong(&txn_state_, &expected,
+ AWAITING_PREPARE);
+ } else if (txn_state_ == STARTED) {
+ // expiration and lock stealing is not possible
+ txn_state_.store(AWAITING_PREPARE);
+ can_prepare = true;
+ }
+
+ if (can_prepare) {
+ // transaction can't expire after preparation
+ expiration_time_ = 0;
+ assert(log_number_ == 0 ||
+ txn_db_impl_->GetTxnDBOptions().write_policy == WRITE_UNPREPARED);
+
+ s = PrepareInternal();
+ if (s.ok()) {
+ txn_state_.store(PREPARED);
+ }
+ } else if (txn_state_ == LOCKS_STOLEN) {
+ s = Status::Expired();
+ } else if (txn_state_ == PREPARED) {
+ s = Status::InvalidArgument("Transaction has already been prepared.");
+ } else if (txn_state_ == COMMITTED) {
+ s = Status::InvalidArgument("Transaction has already been committed.");
+ } else if (txn_state_ == ROLLEDBACK) {
+ s = Status::InvalidArgument("Transaction has already been rolled back.");
+ } else {
+ s = Status::InvalidArgument("Transaction is not in state for prepare.");
+ }
+
+ return s;
+}
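+
+// Typical two-phase commit sequence from the caller's side (sketch only; the
+// names below are placeholders and error handling is omitted):
+//
+//   Transaction* txn = txn_db->BeginTransaction(write_options);
+//   txn->SetName("xid-1");       // a name is required before Prepare()
+//   txn->Put(cfh, "key", "value");
+//   txn->Prepare();              // persists the prepared section in the WAL
+//   txn->Commit();               // or txn->Rollback()
+//   delete txn;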
+
+Status WriteCommittedTxn::PrepareInternal() {
+ WriteOptions write_options = write_options_;
+ write_options.disableWAL = false;
+ auto s = WriteBatchInternal::MarkEndPrepare(GetWriteBatch()->GetWriteBatch(),
+ name_);
+ assert(s.ok());
+ class MarkLogCallback : public PreReleaseCallback {
+ public:
+ MarkLogCallback(DBImpl* db, bool two_write_queues)
+ : db_(db), two_write_queues_(two_write_queues) {
+ (void)two_write_queues_; // to silence unused private field warning
+ }
+ virtual Status Callback(SequenceNumber, bool is_mem_disabled,
+ uint64_t log_number, size_t /*index*/,
+ size_t /*total*/) override {
+#ifdef NDEBUG
+ (void)is_mem_disabled;
+#endif
+ assert(log_number != 0);
+ assert(!two_write_queues_ || is_mem_disabled); // implies the 2nd queue
+ db_->logs_with_prep_tracker()->MarkLogAsContainingPrepSection(log_number);
+ return Status::OK();
+ }
+
+ private:
+ DBImpl* db_;
+ bool two_write_queues_;
+ } mark_log_callback(db_impl_,
+ db_impl_->immutable_db_options().two_write_queues);
+
+ WriteCallback* const kNoWriteCallback = nullptr;
+ const uint64_t kRefNoLog = 0;
+ const bool kDisableMemtable = true;
+ SequenceNumber* const kIgnoreSeqUsed = nullptr;
+ const size_t kNoBatchCount = 0;
+ s = db_impl_->WriteImpl(write_options, GetWriteBatch()->GetWriteBatch(),
+ kNoWriteCallback, &log_number_, kRefNoLog,
+ kDisableMemtable, kIgnoreSeqUsed, kNoBatchCount,
+ &mark_log_callback);
+ return s;
+}
+
+Status PessimisticTransaction::Commit() {
+ bool commit_without_prepare = false;
+ bool commit_prepared = false;
+
+ if (IsExpired()) {
+ return Status::Expired();
+ }
+
+ if (expiration_time_ > 0) {
+ // we must atomically compare and exchange the state here because at
+ // this point in the transaction it is possible for another thread
+ // to change our state out from under us in the event that we expire and
+ // have our locks stolen. In this case the only valid state is STARTED
+ // because a state of PREPARED would have a cleared expiration_time_.
+ TransactionState expected = STARTED;
+ commit_without_prepare = std::atomic_compare_exchange_strong(
+ &txn_state_, &expected, AWAITING_COMMIT);
+ TEST_SYNC_POINT("TransactionTest::ExpirableTransactionDataRace:1");
+ } else if (txn_state_ == PREPARED) {
+ // expiration and lock stealing is not a concern
+ commit_prepared = true;
+ } else if (txn_state_ == STARTED) {
+ // expiration and lock stealing is not a concern
+ if (skip_prepare_) {
+ commit_without_prepare = true;
+ } else {
+ return Status::TxnNotPrepared();
+ }
+ }
+
+ Status s;
+ if (commit_without_prepare) {
+ assert(!commit_prepared);
+ if (WriteBatchInternal::Count(GetCommitTimeWriteBatch()) > 0) {
+ s = Status::InvalidArgument(
+ "Commit-time batch contains values that will not be committed.");
+ } else {
+ txn_state_.store(AWAITING_COMMIT);
+ if (log_number_ > 0) {
+ dbimpl_->logs_with_prep_tracker()->MarkLogAsHavingPrepSectionFlushed(
+ log_number_);
+ }
+ s = CommitWithoutPrepareInternal();
+ if (!name_.empty()) {
+ txn_db_impl_->UnregisterTransaction(this);
+ }
+ Clear();
+ if (s.ok()) {
+ txn_state_.store(COMMITTED);
+ }
+ }
+ } else if (commit_prepared) {
+ txn_state_.store(AWAITING_COMMIT);
+
+ s = CommitInternal();
+
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_impl_->immutable_db_options().info_log,
+ "Commit write failed");
+ return s;
+ }
+
+ // FindObsoleteFiles must now look to the memtables
+ // to determine what prep logs must be kept around,
+ // not the prep section heap.
+ assert(log_number_ > 0);
+ dbimpl_->logs_with_prep_tracker()->MarkLogAsHavingPrepSectionFlushed(
+ log_number_);
+ txn_db_impl_->UnregisterTransaction(this);
+
+ Clear();
+ txn_state_.store(COMMITTED);
+ } else if (txn_state_ == LOCKS_STOLEN) {
+ s = Status::Expired();
+ } else if (txn_state_ == COMMITTED) {
+ s = Status::InvalidArgument("Transaction has already been committed.");
+ } else if (txn_state_ == ROLLEDBACK) {
+ s = Status::InvalidArgument("Transaction has already been rolled back.");
+ } else {
+ s = Status::InvalidArgument("Transaction is not in state for commit.");
+ }
+
+ return s;
+}
+
+Status WriteCommittedTxn::CommitWithoutPrepareInternal() {
+ WriteBatchWithIndex* wbwi = GetWriteBatch();
+ assert(wbwi);
+ WriteBatch* wb = wbwi->GetWriteBatch();
+ assert(wb);
+
+ const bool needs_ts = WriteBatchInternal::HasKeyWithTimestamp(*wb);
+ if (needs_ts && commit_timestamp_ == kMaxTxnTimestamp) {
+ return Status::InvalidArgument("Must assign a commit timestamp");
+ }
+
+ if (needs_ts) {
+ assert(commit_timestamp_ != kMaxTxnTimestamp);
+ char commit_ts_buf[sizeof(kMaxTxnTimestamp)];
+ EncodeFixed64(commit_ts_buf, commit_timestamp_);
+ Slice commit_ts(commit_ts_buf, sizeof(commit_ts_buf));
+
+ Status s =
+ wb->UpdateTimestamps(commit_ts, [wbwi, this](uint32_t cf) -> size_t {
+ auto cf_iter = cfs_with_ts_tracked_when_indexing_disabled_.find(cf);
+ if (cf_iter != cfs_with_ts_tracked_when_indexing_disabled_.end()) {
+ return sizeof(kMaxTxnTimestamp);
+ }
+ const Comparator* ucmp =
+ WriteBatchWithIndexInternal::GetUserComparator(*wbwi, cf);
+ return ucmp ? ucmp->timestamp_size()
+ : std::numeric_limits<uint64_t>::max();
+ });
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ uint64_t seq_used = kMaxSequenceNumber;
+ SnapshotCreationCallback snapshot_creation_cb(db_impl_, commit_timestamp_,
+ snapshot_notifier_, snapshot_);
+ PostMemTableCallback* post_mem_cb = nullptr;
+ if (snapshot_needed_) {
+ if (commit_timestamp_ == kMaxTxnTimestamp) {
+ return Status::InvalidArgument("Must set transaction commit timestamp");
+ } else {
+ post_mem_cb = &snapshot_creation_cb;
+ }
+ }
+ auto s = db_impl_->WriteImpl(write_options_, wb,
+ /*callback*/ nullptr, /*log_used*/ nullptr,
+ /*log_ref*/ 0, /*disable_memtable*/ false,
+ &seq_used, /*batch_cnt=*/0,
+ /*pre_release_callback=*/nullptr, post_mem_cb);
+ assert(!s.ok() || seq_used != kMaxSequenceNumber);
+ if (s.ok()) {
+ SetId(seq_used);
+ }
+ return s;
+}
+
+Status WriteCommittedTxn::CommitBatchInternal(WriteBatch* batch, size_t) {
+ uint64_t seq_used = kMaxSequenceNumber;
+ auto s = db_impl_->WriteImpl(write_options_, batch, /*callback*/ nullptr,
+ /*log_used*/ nullptr, /*log_ref*/ 0,
+ /*disable_memtable*/ false, &seq_used);
+ assert(!s.ok() || seq_used != kMaxSequenceNumber);
+ if (s.ok()) {
+ SetId(seq_used);
+ }
+ return s;
+}
+
+Status WriteCommittedTxn::CommitInternal() {
+ WriteBatchWithIndex* wbwi = GetWriteBatch();
+ assert(wbwi);
+ WriteBatch* wb = wbwi->GetWriteBatch();
+ assert(wb);
+
+ const bool needs_ts = WriteBatchInternal::HasKeyWithTimestamp(*wb);
+ if (needs_ts && commit_timestamp_ == kMaxTxnTimestamp) {
+ return Status::InvalidArgument("Must assign a commit timestamp");
+ }
+ // We take the commit-time batch and append the Commit marker.
+ // The Memtable will ignore the Commit marker in non-recovery mode
+ WriteBatch* working_batch = GetCommitTimeWriteBatch();
+
+ Status s;
+ if (!needs_ts) {
+ s = WriteBatchInternal::MarkCommit(working_batch, name_);
+ } else {
+ assert(commit_timestamp_ != kMaxTxnTimestamp);
+ char commit_ts_buf[sizeof(kMaxTxnTimestamp)];
+ EncodeFixed64(commit_ts_buf, commit_timestamp_);
+ Slice commit_ts(commit_ts_buf, sizeof(commit_ts_buf));
+ s = WriteBatchInternal::MarkCommitWithTimestamp(working_batch, name_,
+ commit_ts);
+ if (s.ok()) {
+ s = wb->UpdateTimestamps(commit_ts, [wbwi, this](uint32_t cf) -> size_t {
+ if (cfs_with_ts_tracked_when_indexing_disabled_.find(cf) !=
+ cfs_with_ts_tracked_when_indexing_disabled_.end()) {
+ return sizeof(kMaxTxnTimestamp);
+ }
+ const Comparator* ucmp =
+ WriteBatchWithIndexInternal::GetUserComparator(*wbwi, cf);
+ return ucmp ? ucmp->timestamp_size()
+ : std::numeric_limits<uint64_t>::max();
+ });
+ }
+ }
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ // any operations appended to this working_batch will be ignored from WAL
+ working_batch->MarkWalTerminationPoint();
+
+ // insert the prepared batch into the memtable only, skipping the WAL.
+ // The memtable will ignore the BeginPrepare/EndPrepare markers
+ // in non-recovery mode and simply insert the values
+ s = WriteBatchInternal::Append(working_batch, wb);
+ assert(s.ok());
+
+ uint64_t seq_used = kMaxSequenceNumber;
+ SnapshotCreationCallback snapshot_creation_cb(db_impl_, commit_timestamp_,
+ snapshot_notifier_, snapshot_);
+ PostMemTableCallback* post_mem_cb = nullptr;
+ if (snapshot_needed_) {
+ if (commit_timestamp_ == kMaxTxnTimestamp) {
+ s = Status::InvalidArgument("Must set transaction commit timestamp");
+ return s;
+ } else {
+ post_mem_cb = &snapshot_creation_cb;
+ }
+ }
+ s = db_impl_->WriteImpl(write_options_, working_batch, /*callback*/ nullptr,
+ /*log_used*/ nullptr, /*log_ref*/ log_number_,
+ /*disable_memtable*/ false, &seq_used,
+ /*batch_cnt=*/0, /*pre_release_callback=*/nullptr,
+ post_mem_cb);
+ assert(!s.ok() || seq_used != kMaxSequenceNumber);
+ if (s.ok()) {
+ SetId(seq_used);
+ }
+ return s;
+}
+
+Status PessimisticTransaction::Rollback() {
+ Status s;
+ if (txn_state_ == PREPARED) {
+ txn_state_.store(AWAITING_ROLLBACK);
+
+ s = RollbackInternal();
+
+ if (s.ok()) {
+ // we do not need to keep our prepared section around
+ assert(log_number_ > 0);
+ dbimpl_->logs_with_prep_tracker()->MarkLogAsHavingPrepSectionFlushed(
+ log_number_);
+ Clear();
+ txn_state_.store(ROLLEDBACK);
+ }
+ } else if (txn_state_ == STARTED) {
+ if (log_number_ > 0) {
+ assert(txn_db_impl_->GetTxnDBOptions().write_policy == WRITE_UNPREPARED);
+ assert(GetId() > 0);
+ s = RollbackInternal();
+
+ if (s.ok()) {
+ dbimpl_->logs_with_prep_tracker()->MarkLogAsHavingPrepSectionFlushed(
+ log_number_);
+ }
+ }
+ // prepare couldn't have taken place
+ Clear();
+ } else if (txn_state_ == COMMITTED) {
+ s = Status::InvalidArgument("This transaction has already been committed.");
+ } else {
+ s = Status::InvalidArgument(
+ "Two phase transaction is not in state for rollback.");
+ }
+
+ return s;
+}
+
+Status WriteCommittedTxn::RollbackInternal() {
+ WriteBatch rollback_marker;
+ auto s = WriteBatchInternal::MarkRollback(&rollback_marker, name_);
+ assert(s.ok());
+ s = db_impl_->WriteImpl(write_options_, &rollback_marker);
+ return s;
+}
+
+Status PessimisticTransaction::RollbackToSavePoint() {
+ if (txn_state_ != STARTED) {
+ return Status::InvalidArgument("Transaction is beyond state for rollback.");
+ }
+
+ if (save_points_ != nullptr && !save_points_->empty()) {
+ // Unlock any keys locked since last transaction
+ auto& save_point_tracker = *save_points_->top().new_locks_;
+ std::unique_ptr<LockTracker> t(
+ tracked_locks_->GetTrackedLocksSinceSavePoint(save_point_tracker));
+ if (t) {
+ txn_db_impl_->UnLock(this, *t);
+ }
+ }
+
+ return TransactionBaseImpl::RollbackToSavePoint();
+}
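+
+// Savepoint usage sketch (illustrative only; `txn`, `cfh` and the keys are
+// placeholders):
+//
+//   txn->Put(cfh, "a", "1");
+//   txn->SetSavePoint();
+//   txn->Put(cfh, "b", "2");
+//   txn->RollbackToSavePoint();  // undoes the write to "b" and releases locks
+//                                // acquired since the savepoint
+//   txn->Commit();               // commits only the write to "a"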
+
+// Lock all keys in this batch.
+// On success, caller should unlock keys_to_unlock
+Status PessimisticTransaction::LockBatch(WriteBatch* batch,
+ LockTracker* keys_to_unlock) {
+ if (!batch) {
+ return Status::InvalidArgument("batch is nullptr");
+ }
+
+ class Handler : public WriteBatch::Handler {
+ public:
+ // Sorted map of column_family_id to sorted set of keys.
+ // Since LockBatch() always locks keys in sorted order, it cannot deadlock
+ // with itself. We're not using a comparator here since it doesn't matter
+ // what the sorting is as long as it's consistent.
+ std::map<uint32_t, std::set<std::string>> keys_;
+
+ Handler() {}
+
+ void RecordKey(uint32_t column_family_id, const Slice& key) {
+ std::string key_str = key.ToString();
+
+ auto& cfh_keys = keys_[column_family_id];
+ auto iter = cfh_keys.find(key_str);
+ if (iter == cfh_keys.end()) {
+ // key not yet seen, store it.
+ cfh_keys.insert({std::move(key_str)});
+ }
+ }
+
+ Status PutCF(uint32_t column_family_id, const Slice& key,
+ const Slice& /* unused */) override {
+ RecordKey(column_family_id, key);
+ return Status::OK();
+ }
+ Status MergeCF(uint32_t column_family_id, const Slice& key,
+ const Slice& /* unused */) override {
+ RecordKey(column_family_id, key);
+ return Status::OK();
+ }
+ Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
+ RecordKey(column_family_id, key);
+ return Status::OK();
+ }
+ };
+
+ // Iterating with this handler will add all keys in this batch into keys_
+ Handler handler;
+ Status s = batch->Iterate(&handler);
+ if (!s.ok()) {
+ return s;
+ }
+
+ // Attempt to lock all keys
+ for (const auto& cf_iter : handler.keys_) {
+ uint32_t cfh_id = cf_iter.first;
+ auto& cfh_keys = cf_iter.second;
+
+ for (const auto& key_iter : cfh_keys) {
+ const std::string& key = key_iter;
+
+ s = txn_db_impl_->TryLock(this, cfh_id, key, true /* exclusive */);
+ if (!s.ok()) {
+ break;
+ }
+ PointLockRequest r;
+ r.column_family_id = cfh_id;
+ r.key = key;
+ r.seq = kMaxSequenceNumber;
+ r.read_only = false;
+ r.exclusive = true;
+ keys_to_unlock->Track(r);
+ }
+
+ if (!s.ok()) {
+ break;
+ }
+ }
+
+ if (!s.ok()) {
+ txn_db_impl_->UnLock(this, *keys_to_unlock);
+ }
+
+ return s;
+}
+
+// Attempt to lock this key.
+// Returns OK if the key has been successfully locked; non-OK otherwise.
+// If do_validate is true and this transaction has a snapshot set,
+// this key will only be locked if there have been no writes to this key since
+// the snapshot time.
+Status PessimisticTransaction::TryLock(ColumnFamilyHandle* column_family,
+ const Slice& key, bool read_only,
+ bool exclusive, const bool do_validate,
+ const bool assume_tracked) {
+ assert(!assume_tracked || !do_validate);
+ Status s;
+ if (UNLIKELY(skip_concurrency_control_)) {
+ return s;
+ }
+ uint32_t cfh_id = GetColumnFamilyID(column_family);
+ std::string key_str = key.ToString();
+
+ PointLockStatus status;
+ bool lock_upgrade;
+ bool previously_locked;
+ if (tracked_locks_->IsPointLockSupported()) {
+ status = tracked_locks_->GetPointLockStatus(cfh_id, key_str);
+ previously_locked = status.locked;
+ lock_upgrade = previously_locked && exclusive && !status.exclusive;
+ } else {
+ // If the record is tracked, we can assume it was locked, too.
+ previously_locked = assume_tracked;
+ status.locked = false;
+ lock_upgrade = false;
+ }
+
+ // Lock this key if this transaction hasn't already locked it or we require
+ // an upgrade.
+ if (!previously_locked || lock_upgrade) {
+ s = txn_db_impl_->TryLock(this, cfh_id, key_str, exclusive);
+ }
+
+ const ColumnFamilyHandle* const cfh =
+ column_family ? column_family : db_impl_->DefaultColumnFamily();
+ assert(cfh);
+ const Comparator* const ucmp = cfh->GetComparator();
+ assert(ucmp);
+ size_t ts_sz = ucmp->timestamp_size();
+
+ SetSnapshotIfNeeded();
+
+ // Even though we do not care about doing conflict checking for this write,
+ // we still need to take a lock to make sure we do not cause a conflict with
+ // some other write. However, we do not need to check if there have been
+ // any writes since this transaction's snapshot.
+ // TODO(agiardullo): could optimize by supporting shared txn locks in the
+ // future.
+ SequenceNumber tracked_at_seq =
+ status.locked ? status.seq : kMaxSequenceNumber;
+ if (!do_validate || (snapshot_ == nullptr &&
+ (0 == ts_sz || kMaxTxnTimestamp == read_timestamp_))) {
+ if (assume_tracked && !previously_locked &&
+ tracked_locks_->IsPointLockSupported()) {
+ s = Status::InvalidArgument(
+ "assume_tracked is set but it is not tracked yet");
+ }
+ // Need to remember the earliest sequence number that we know this
+ // key has not been modified after. This is useful if this same
+ // transaction later tries to lock this key again.
+ if (tracked_at_seq == kMaxSequenceNumber) {
+ // Since we haven't checked a snapshot, we only know this key has not
+ // been modified since we locked it.
+ // Note: when last_seq_same_as_publish_seq_==false this is less than the
+ // latest allocated seq but it is ok since i) this is just a heuristic
+ // used only as a hint to avoid actual check for conflicts, ii) this would
+ // cause a false positive only if the snapshot is taken right after the
+ // lock, which would be an unusual sequence.
+ tracked_at_seq = db_->GetLatestSequenceNumber();
+ }
+ } else if (s.ok()) {
+ // If a snapshot is set, we need to make sure the key hasn't been modified
+ // since the snapshot. This must be done after we locked the key.
+ // If we have already validated an earlier snapshot, it must have been
+ // reflected in tracked_at_seq and ValidateSnapshot will return OK.
+ s = ValidateSnapshot(column_family, key, &tracked_at_seq);
+
+ if (!s.ok()) {
+ // Failed to validate key
+ // Unlock key we just locked
+ if (lock_upgrade) {
+ s = txn_db_impl_->TryLock(this, cfh_id, key_str, false /* exclusive */);
+ assert(s.ok());
+ } else if (!previously_locked) {
+ txn_db_impl_->UnLock(this, cfh_id, key.ToString());
+ }
+ }
+ }
+
+ if (s.ok()) {
+ // We must track all the locked keys so that we can unlock them later. If
+ // the key is already locked, this func will update some stats on the
+ // tracked key. It could also update the tracked_at_seq if it is lower
+ // than the existing tracked key seq. These stats are necessary for
+ // RollbackToSavePoint to determine whether a key can be safely removed
+ // from tracked_keys_. Removal can only be done if a key was only locked
+ // during the current savepoint.
+ //
+ // Recall that if assume_tracked is true, we assume that TrackKey has been
+ // called previously since the last savepoint, with the same exclusive
+ // setting, and at a lower sequence number, so skipping here should be
+ // safe.
+ if (!assume_tracked) {
+ TrackKey(cfh_id, key_str, tracked_at_seq, read_only, exclusive);
+ } else {
+#ifndef NDEBUG
+ if (tracked_locks_->IsPointLockSupported()) {
+ PointLockStatus lock_status =
+ tracked_locks_->GetPointLockStatus(cfh_id, key_str);
+ assert(lock_status.locked);
+ assert(lock_status.seq <= tracked_at_seq);
+ assert(lock_status.exclusive == exclusive);
+ }
+#endif
+ }
+ }
+
+ return s;
+}
+
+Status PessimisticTransaction::GetRangeLock(ColumnFamilyHandle* column_family,
+ const Endpoint& start_endp,
+ const Endpoint& end_endp) {
+ ColumnFamilyHandle* cfh =
+ column_family ? column_family : db_impl_->DefaultColumnFamily();
+ uint32_t cfh_id = GetColumnFamilyID(cfh);
+
+ Status s = txn_db_impl_->TryRangeLock(this, cfh_id, start_endp, end_endp);
+
+ if (s.ok()) {
+ RangeLockRequest req{cfh_id, start_endp, end_endp};
+ tracked_locks_->Track(req);
+ }
+ return s;
+}
+
+// Return OK() if this key has not been modified more recently than the
+// transaction snapshot_.
+// tracked_at_seq is the global seq at which we either locked the key or have
+// already done ValidateSnapshot.
+Status PessimisticTransaction::ValidateSnapshot(
+ ColumnFamilyHandle* column_family, const Slice& key,
+ SequenceNumber* tracked_at_seq) {
+ assert(snapshot_ || read_timestamp_ < kMaxTxnTimestamp);
+
+ SequenceNumber snap_seq = 0;
+ if (snapshot_) {
+ snap_seq = snapshot_->GetSequenceNumber();
+ if (*tracked_at_seq <= snap_seq) {
+ // If the key has been previously validated (or locked) at a sequence number
+ // earlier than the current snapshot's sequence number, we already know it
+ // has not been modified after snap_seq either.
+ return Status::OK();
+ }
+ } else {
+ snap_seq = db_impl_->GetLatestSequenceNumber();
+ }
+
+ // Otherwise we have either
+ // 1: tracked_at_seq == kMaxSequenceNumber, i.e., first time tracking the key
+ // 2: snap_seq < tracked_at_seq: the last time we locked the key was with
+ // do_validate=false, which means we skipped ValidateSnapshot. In both
+ // cases we should do ValidateSnapshot now.
+
+ *tracked_at_seq = snap_seq;
+
+ ColumnFamilyHandle* cfh =
+ column_family ? column_family : db_impl_->DefaultColumnFamily();
+
+ assert(cfh);
+ const Comparator* const ucmp = cfh->GetComparator();
+ assert(ucmp);
+ size_t ts_sz = ucmp->timestamp_size();
+ std::string ts_buf;
+ if (ts_sz > 0 && read_timestamp_ < kMaxTxnTimestamp) {
+ assert(ts_sz == sizeof(read_timestamp_));
+ PutFixed64(&ts_buf, read_timestamp_);
+ }
+
+ return TransactionUtil::CheckKeyForConflicts(
+ db_impl_, cfh, key.ToString(), snap_seq, ts_sz == 0 ? nullptr : &ts_buf,
+ false /* cache_only */);
+}
+
+bool PessimisticTransaction::TryStealingLocks() {
+ assert(IsExpired());
+ TransactionState expected = STARTED;
+ return std::atomic_compare_exchange_strong(&txn_state_, &expected,
+ LOCKS_STOLEN);
+}
+
+void PessimisticTransaction::UnlockGetForUpdate(
+ ColumnFamilyHandle* column_family, const Slice& key) {
+ txn_db_impl_->UnLock(this, GetColumnFamilyID(column_family), key.ToString());
+}
+
+Status PessimisticTransaction::SetName(const TransactionName& name) {
+ Status s;
+ if (txn_state_ == STARTED) {
+ if (name_.length()) {
+ s = Status::InvalidArgument("Transaction has already been named.");
+ } else if (txn_db_impl_->GetTransactionByName(name) != nullptr) {
+ s = Status::InvalidArgument("Transaction name must be unique.");
+ } else if (name.length() < 1 || name.length() > 512) {
+ s = Status::InvalidArgument(
+ "Transaction name length must be between 1 and 512 chars.");
+ } else {
+ name_ = name;
+ txn_db_impl_->RegisterTransaction(this);
+ }
+ } else {
+ s = Status::InvalidArgument("Transaction is beyond state for naming.");
+ }
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/pessimistic_transaction.h b/src/rocksdb/utilities/transactions/pessimistic_transaction.h
new file mode 100644
index 000000000..d43d1d3ac
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/pessimistic_transaction.h
@@ -0,0 +1,313 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <atomic>
+#include <mutex>
+#include <stack>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "db/write_callback.h"
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "util/autovector.h"
+#include "utilities/transactions/transaction_base.h"
+#include "utilities/transactions/transaction_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class PessimisticTransactionDB;
+
+// A transaction under pessimistic concurrency control. This class implements
+// the locking API and interfaces with the lock manager as well as the
+// pessimistic transactional db.
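+//
+// Instances are normally obtained via TransactionDB::BeginTransaction(). A
+// minimal write path looks roughly like this (illustrative only; `txn_db`,
+// the key and the value are placeholders):
+//
+//   Transaction* txn = txn_db->BeginTransaction(write_options);
+//   Status s = txn->Put("key", "value");  // acquires a lock on "key"
+//   if (s.ok()) {
+//     s = txn->Commit();
+//   } else {
+//     txn->Rollback();
+//   }
+//   delete txn;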
+class PessimisticTransaction : public TransactionBaseImpl {
+ public:
+ PessimisticTransaction(TransactionDB* db, const WriteOptions& write_options,
+ const TransactionOptions& txn_options,
+ const bool init = true);
+ // No copying allowed
+ PessimisticTransaction(const PessimisticTransaction&) = delete;
+ void operator=(const PessimisticTransaction&) = delete;
+
+ ~PessimisticTransaction() override;
+
+ void Reinitialize(TransactionDB* txn_db, const WriteOptions& write_options,
+ const TransactionOptions& txn_options);
+
+ Status Prepare() override;
+
+ Status Commit() override;
+
+ // It is basically Commit without going through the Prepare phase. The write
+ // batch is provided directly instead of expecting the txn to gradually
+ // accumulate the transaction's writes in an internal write batch.
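+ //
+ // Example (illustrative only; `txn` is a PessimisticTransaction* and the
+ // keys/values are placeholders):
+ //
+ //   WriteBatch batch;
+ //   batch.Put("k1", "v1");
+ //   batch.Put("k2", "v2");
+ //   Status s = txn->CommitBatch(&batch);  // locks k1/k2, writes, unlocks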
+ Status CommitBatch(WriteBatch* batch);
+
+ Status Rollback() override;
+
+ Status RollbackToSavePoint() override;
+
+ Status SetName(const TransactionName& name) override;
+
+ // Generate a new unique transaction identifier
+ static TransactionID GenTxnID();
+
+ TransactionID GetID() const override { return txn_id_; }
+
+ std::vector<TransactionID> GetWaitingTxns(uint32_t* column_family_id,
+ std::string* key) const override {
+ std::lock_guard<std::mutex> lock(wait_mutex_);
+ std::vector<TransactionID> ids(waiting_txn_ids_.size());
+ if (key) *key = waiting_key_ ? *waiting_key_ : "";
+ if (column_family_id) *column_family_id = waiting_cf_id_;
+ std::copy(waiting_txn_ids_.begin(), waiting_txn_ids_.end(), ids.begin());
+ return ids;
+ }
+
+ void SetWaitingTxn(autovector<TransactionID> ids, uint32_t column_family_id,
+ const std::string* key) {
+ std::lock_guard<std::mutex> lock(wait_mutex_);
+ waiting_txn_ids_ = ids;
+ waiting_cf_id_ = column_family_id;
+ waiting_key_ = key;
+ }
+
+ void ClearWaitingTxn() {
+ std::lock_guard<std::mutex> lock(wait_mutex_);
+ waiting_txn_ids_.clear();
+ waiting_cf_id_ = 0;
+ waiting_key_ = nullptr;
+ }
+
+ // Returns the time (in microseconds according to Env->NowMicros()) at
+ // which this transaction will expire. Returns 0 if this transaction does
+ // not expire.
+ uint64_t GetExpirationTime() const { return expiration_time_; }
+
+ // Returns true if this transaction has an expiration_time and has expired.
+ bool IsExpired() const;
+
+ // Returns the number of microseconds a transaction can wait on acquiring a
+ // lock or -1 if there is no timeout.
+ int64_t GetLockTimeout() const { return lock_timeout_; }
+ void SetLockTimeout(int64_t timeout) override {
+ lock_timeout_ = timeout * 1000;
+ }
+
+ // Returns true if locks were stolen successfully, false otherwise.
+ bool TryStealingLocks();
+
+ bool IsDeadlockDetect() const override { return deadlock_detect_; }
+
+ int64_t GetDeadlockDetectDepth() const { return deadlock_detect_depth_; }
+
+ virtual Status GetRangeLock(ColumnFamilyHandle* column_family,
+ const Endpoint& start_key,
+ const Endpoint& end_key) override;
+
+ protected:
+ // Refer to
+ // TransactionOptions::use_only_the_last_commit_time_batch_for_recovery
+ bool use_only_the_last_commit_time_batch_for_recovery_ = false;
+ // Refer to
+ // TransactionOptions::skip_prepare
+ bool skip_prepare_ = false;
+
+ virtual Status PrepareInternal() = 0;
+
+ virtual Status CommitWithoutPrepareInternal() = 0;
+
+ // batch_cnt if non-zero is the number of sub-batches. A sub-batch is a batch
+ // with no duplicate keys. If zero, then the number of sub-batches is unknown.
+ virtual Status CommitBatchInternal(WriteBatch* batch,
+ size_t batch_cnt = 0) = 0;
+
+ virtual Status CommitInternal() = 0;
+
+ virtual Status RollbackInternal() = 0;
+
+ virtual void Initialize(const TransactionOptions& txn_options);
+
+ Status LockBatch(WriteBatch* batch, LockTracker* keys_to_unlock);
+
+ Status TryLock(ColumnFamilyHandle* column_family, const Slice& key,
+ bool read_only, bool exclusive, const bool do_validate = true,
+ const bool assume_tracked = false) override;
+
+ void Clear() override;
+
+ PessimisticTransactionDB* txn_db_impl_;
+ DBImpl* db_impl_;
+
+ // If non-zero, this transaction should not be committed after this time (in
+ // microseconds according to Env->NowMicros())
+ uint64_t expiration_time_;
+
+ // Timestamp used by the transaction to perform all GetForUpdate.
+ // Use this timestamp for conflict checking.
+ // read_timestamp_ == kMaxTxnTimestamp means this transaction has not
+ // performed any GetForUpdate. It is possible that the transaction has
+ // performed blind writes or Get, though.
+ TxnTimestamp read_timestamp_{kMaxTxnTimestamp};
+ TxnTimestamp commit_timestamp_{kMaxTxnTimestamp};
+
+ private:
+ friend class TransactionTest_ValidateSnapshotTest_Test;
+ // Used to create unique ids for transactions.
+ static std::atomic<TransactionID> txn_id_counter_;
+
+ // Unique ID for this transaction
+ TransactionID txn_id_;
+
+ // IDs for the transactions that are blocking the current transaction.
+ //
+ // empty if current transaction is not waiting.
+ autovector<TransactionID> waiting_txn_ids_;
+
+ // The following two fields represent the (cf, key) that a transaction is
+ // waiting on.
+ //
+ // If waiting_key_ is not null, then the pointer should always point to
+ // a valid string object. The reason is that it is only non-null when the
+ // transaction is blocked in the PointLockManager::AcquireWithTimeout
+ // function. At that point, the key string object is one of the function
+ // parameters.
+ uint32_t waiting_cf_id_;
+ const std::string* waiting_key_;
+
+ // Mutex protecting waiting_txn_ids_, waiting_cf_id_ and waiting_key_.
+ mutable std::mutex wait_mutex_;
+
+ // Timeout in microseconds when locking a key or -1 if there is no timeout.
+ int64_t lock_timeout_;
+
+ // Whether to perform deadlock detection or not.
+ bool deadlock_detect_;
+
+ // Depth limit for the deadlock detection traversal; see
+ // TransactionOptions::deadlock_detect_depth.
+ int64_t deadlock_detect_depth_;
+
+ // Refer to TransactionOptions::skip_concurrency_control
+ bool skip_concurrency_control_;
+
+ virtual Status ValidateSnapshot(ColumnFamilyHandle* column_family,
+ const Slice& key,
+ SequenceNumber* tracked_at_seq);
+
+ void UnlockGetForUpdate(ColumnFamilyHandle* column_family,
+ const Slice& key) override;
+};
+
+class WriteCommittedTxn : public PessimisticTransaction {
+ public:
+ WriteCommittedTxn(TransactionDB* db, const WriteOptions& write_options,
+ const TransactionOptions& txn_options);
+ // No copying allowed
+ WriteCommittedTxn(const WriteCommittedTxn&) = delete;
+ void operator=(const WriteCommittedTxn&) = delete;
+
+ ~WriteCommittedTxn() override {}
+
+ using TransactionBaseImpl::GetForUpdate;
+ Status GetForUpdate(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ std::string* value, bool exclusive,
+ const bool do_validate) override;
+ Status GetForUpdate(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* pinnable_val, bool exclusive,
+ const bool do_validate) override;
+
+ using TransactionBaseImpl::Put;
+ // `key` does NOT include timestamp even when it's enabled.
+ Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value, const bool assume_tracked = false) override;
+ Status Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+ const SliceParts& value,
+ const bool assume_tracked = false) override;
+
+ using TransactionBaseImpl::PutUntracked;
+ Status PutUntracked(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) override;
+ Status PutUntracked(ColumnFamilyHandle* column_family, const SliceParts& key,
+ const SliceParts& value) override;
+
+ using TransactionBaseImpl::Delete;
+ // `key` does NOT include timestamp even when it's enabled.
+ Status Delete(ColumnFamilyHandle* column_family, const Slice& key,
+ const bool assume_tracked = false) override;
+ Status Delete(ColumnFamilyHandle* column_family, const SliceParts& key,
+ const bool assume_tracked = false) override;
+
+ using TransactionBaseImpl::DeleteUntracked;
+ Status DeleteUntracked(ColumnFamilyHandle* column_family,
+ const Slice& key) override;
+ Status DeleteUntracked(ColumnFamilyHandle* column_family,
+ const SliceParts& key) override;
+
+ using TransactionBaseImpl::SingleDelete;
+ // `key` does NOT include timestamp even when it's enabled.
+ Status SingleDelete(ColumnFamilyHandle* column_family, const Slice& key,
+ const bool assume_tracked = false) override;
+ Status SingleDelete(ColumnFamilyHandle* column_family, const SliceParts& key,
+ const bool assume_tracked = false) override;
+
+ using TransactionBaseImpl::SingleDeleteUntracked;
+ Status SingleDeleteUntracked(ColumnFamilyHandle* column_family,
+ const Slice& key) override;
+
+ using TransactionBaseImpl::Merge;
+ Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value, const bool assume_tracked = false) override;
+
+ Status SetReadTimestampForValidation(TxnTimestamp ts) override;
+ Status SetCommitTimestamp(TxnTimestamp ts) override;
+ TxnTimestamp GetCommitTimestamp() const override { return commit_timestamp_; }
+
+ private:
+ template <typename TValue>
+ Status GetForUpdateImpl(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ TValue* value, bool exclusive,
+ const bool do_validate);
+
+ template <typename TKey, typename TOperation>
+ Status Operate(ColumnFamilyHandle* column_family, const TKey& key,
+ const bool do_validate, const bool assume_tracked,
+ TOperation&& operation);
+
+ Status PrepareInternal() override;
+
+ Status CommitWithoutPrepareInternal() override;
+
+ Status CommitBatchInternal(WriteBatch* batch, size_t batch_cnt) override;
+
+ Status CommitInternal() override;
+
+ Status RollbackInternal() override;
+
+ // Column families that enable timestamps and whose data are written when
+ // indexing_enabled_ is false. If a key is written when indexing_enabled_ is
+ // true, then the corresponding column family is not added to this set even
+ // if it enables timestamps.
+ std::unordered_set<uint32_t> cfs_with_ts_tracked_when_indexing_disabled_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/pessimistic_transaction_db.cc b/src/rocksdb/utilities/transactions/pessimistic_transaction_db.cc
new file mode 100644
index 000000000..950ef8042
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/pessimistic_transaction_db.cc
@@ -0,0 +1,782 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/pessimistic_transaction_db.h"
+
+#include <cinttypes>
+#include <sstream>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "logging/logging.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/mutexlock.h"
+#include "utilities/transactions/pessimistic_transaction.h"
+#include "utilities/transactions/transaction_db_mutex_impl.h"
+#include "utilities/transactions/write_prepared_txn_db.h"
+#include "utilities/transactions/write_unprepared_txn_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+PessimisticTransactionDB::PessimisticTransactionDB(
+ DB* db, const TransactionDBOptions& txn_db_options)
+ : TransactionDB(db),
+ db_impl_(static_cast_with_check<DBImpl>(db)),
+ txn_db_options_(txn_db_options),
+ lock_manager_(NewLockManager(this, txn_db_options)) {
+ assert(db_impl_ != nullptr);
+ info_log_ = db_impl_->GetDBOptions().info_log;
+}
+
+// Supports initializing PessimisticTransactionDB from a StackableDB
+//
+// PessimisticTransactionDB
+// ^ ^
+// | |
+// | +
+// | StackableDB
+// | ^
+// | |
+// + +
+// DBImpl
+// ^
+// |(inherit)
+// +
+// DB
+//
+PessimisticTransactionDB::PessimisticTransactionDB(
+ StackableDB* db, const TransactionDBOptions& txn_db_options)
+ : TransactionDB(db),
+ db_impl_(static_cast_with_check<DBImpl>(db->GetRootDB())),
+ txn_db_options_(txn_db_options),
+ lock_manager_(NewLockManager(this, txn_db_options)) {
+ assert(db_impl_ != nullptr);
+}
+
+PessimisticTransactionDB::~PessimisticTransactionDB() {
+ while (!transactions_.empty()) {
+ delete transactions_.begin()->second;
+ // TODO(myabandeh): this seems to be an unsafe approach as it is not quite
+ // clear whether delete would also remove the entry from transactions_.
+ }
+}
+
+Status PessimisticTransactionDB::VerifyCFOptions(
+ const ColumnFamilyOptions& cf_options) {
+ const Comparator* const ucmp = cf_options.comparator;
+ assert(ucmp);
+ size_t ts_sz = ucmp->timestamp_size();
+ if (0 == ts_sz) {
+ return Status::OK();
+ }
+ if (ts_sz != sizeof(TxnTimestamp)) {
+ std::ostringstream oss;
+ oss << "Timestamp of transaction must have " << sizeof(TxnTimestamp)
+ << " bytes. CF comparator " << std::string(ucmp->Name())
+ << " timestamp size is " << ts_sz << " bytes";
+ return Status::InvalidArgument(oss.str());
+ }
+ if (txn_db_options_.write_policy != WRITE_COMMITTED) {
+ return Status::NotSupported("Only WriteCommittedTxn supports timestamp");
+ }
+ return Status::OK();
+}
+
+Status PessimisticTransactionDB::Initialize(
+ const std::vector<size_t>& compaction_enabled_cf_indices,
+ const std::vector<ColumnFamilyHandle*>& handles) {
+ for (auto cf_ptr : handles) {
+ AddColumnFamily(cf_ptr);
+ }
+ // Verify cf options
+ for (auto handle : handles) {
+ ColumnFamilyDescriptor cfd;
+ Status s = handle->GetDescriptor(&cfd);
+ if (!s.ok()) {
+ return s;
+ }
+ s = VerifyCFOptions(cfd.options);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ // Re-enable compaction for the column families that initially had
+ // compaction enabled.
+ std::vector<ColumnFamilyHandle*> compaction_enabled_cf_handles;
+ compaction_enabled_cf_handles.reserve(compaction_enabled_cf_indices.size());
+ for (auto index : compaction_enabled_cf_indices) {
+ compaction_enabled_cf_handles.push_back(handles[index]);
+ }
+
+ Status s = EnableAutoCompaction(compaction_enabled_cf_handles);
+
+ // create 'real' transactions from recovered shell transactions
+ auto dbimpl = static_cast_with_check<DBImpl>(GetRootDB());
+ assert(dbimpl != nullptr);
+ auto rtrxs = dbimpl->recovered_transactions();
+
+ for (auto it = rtrxs.begin(); it != rtrxs.end(); ++it) {
+ auto recovered_trx = it->second;
+ assert(recovered_trx);
+ assert(recovered_trx->batches_.size() == 1);
+ const auto& seq = recovered_trx->batches_.begin()->first;
+ const auto& batch_info = recovered_trx->batches_.begin()->second;
+ assert(batch_info.log_number_);
+ assert(recovered_trx->name_.length());
+
+ WriteOptions w_options;
+ w_options.sync = true;
+ TransactionOptions t_options;
+ // This helps avoid deadlocks on keys that, although they exist in the WAL,
+ // did not go through concurrency control. This includes the merge that
+ // MyRocks uses for auto-inc columns. It is safe to do so, since (i) if
+ // there is a conflict between the keys of two transactions that must be
+ // avoided, it is already avoided by the application, MyRocks, before the
+ // restart, and (ii) the application, MyRocks, guarantees to rollback/commit
+ // the recovered transactions before new transactions start.
+ t_options.skip_concurrency_control = true;
+
+ Transaction* real_trx = BeginTransaction(w_options, t_options, nullptr);
+ assert(real_trx);
+ real_trx->SetLogNumber(batch_info.log_number_);
+ assert(seq != kMaxSequenceNumber);
+ if (GetTxnDBOptions().write_policy != WRITE_COMMITTED) {
+ real_trx->SetId(seq);
+ }
+
+ s = real_trx->SetName(recovered_trx->name_);
+ if (!s.ok()) {
+ break;
+ }
+
+ s = real_trx->RebuildFromWriteBatch(batch_info.batch_);
+ // WriteCommitted sets this to 0 to disable this check, which is specific to
+ // WritePrepared txns
+ assert(batch_info.batch_cnt_ == 0 ||
+ real_trx->GetWriteBatch()->SubBatchCnt() == batch_info.batch_cnt_);
+ real_trx->SetState(Transaction::PREPARED);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ if (s.ok()) {
+ dbimpl->DeleteAllRecoveredTransactions();
+ }
+ return s;
+}
+
+Transaction* WriteCommittedTxnDB::BeginTransaction(
+ const WriteOptions& write_options, const TransactionOptions& txn_options,
+ Transaction* old_txn) {
+ if (old_txn != nullptr) {
+ ReinitializeTransaction(old_txn, write_options, txn_options);
+ return old_txn;
+ } else {
+ return new WriteCommittedTxn(this, write_options, txn_options);
+ }
+}
+
+TransactionDBOptions PessimisticTransactionDB::ValidateTxnDBOptions(
+ const TransactionDBOptions& txn_db_options) {
+ TransactionDBOptions validated = txn_db_options;
+
+ if (txn_db_options.num_stripes == 0) {
+ validated.num_stripes = 1;
+ }
+
+ return validated;
+}
+
+Status TransactionDB::Open(const Options& options,
+ const TransactionDBOptions& txn_db_options,
+ const std::string& dbname, TransactionDB** dbptr) {
+ DBOptions db_options(options);
+ ColumnFamilyOptions cf_options(options);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+ std::vector<ColumnFamilyHandle*> handles;
+ Status s = TransactionDB::Open(db_options, txn_db_options, dbname,
+ column_families, &handles, dbptr);
+ if (s.ok()) {
+ assert(handles.size() == 1);
+ // We can delete the handle since DBImpl always holds a reference to the
+ // default column family
+ delete handles[0];
+ }
+
+ return s;
+}
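+
+// Example use of the convenience overload above (illustrative; the path and
+// option values are placeholders):
+//
+//   Options options;
+//   options.create_if_missing = true;
+//   TransactionDBOptions txn_db_options;
+//   TransactionDB* txn_db = nullptr;
+//   Status s =
+//       TransactionDB::Open(options, txn_db_options, "/tmp/txn_db", &txn_db);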
+
+Status TransactionDB::Open(
+ const DBOptions& db_options, const TransactionDBOptions& txn_db_options,
+ const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, TransactionDB** dbptr) {
+ Status s;
+ DB* db = nullptr;
+ if (txn_db_options.write_policy == WRITE_COMMITTED &&
+ db_options.unordered_write) {
+ return Status::NotSupported(
+ "WRITE_COMMITTED is incompatible with unordered_writes");
+ }
+ if (txn_db_options.write_policy == WRITE_UNPREPARED &&
+ db_options.unordered_write) {
+ // TODO(lth): support it
+ return Status::NotSupported(
+ "WRITE_UNPREPARED is currently incompatible with unordered_writes");
+ }
+ if (txn_db_options.write_policy == WRITE_PREPARED &&
+ db_options.unordered_write && !db_options.two_write_queues) {
+ return Status::NotSupported(
+ "WRITE_PREPARED is incompatible with unordered_writes if "
+ "two_write_queues is not enabled.");
+ }
+
+ std::vector<ColumnFamilyDescriptor> column_families_copy = column_families;
+ std::vector<size_t> compaction_enabled_cf_indices;
+ DBOptions db_options_2pc = db_options;
+ PrepareWrap(&db_options_2pc, &column_families_copy,
+ &compaction_enabled_cf_indices);
+ const bool use_seq_per_batch =
+ txn_db_options.write_policy == WRITE_PREPARED ||
+ txn_db_options.write_policy == WRITE_UNPREPARED;
+ const bool use_batch_per_txn =
+ txn_db_options.write_policy == WRITE_COMMITTED ||
+ txn_db_options.write_policy == WRITE_PREPARED;
+ s = DBImpl::Open(db_options_2pc, dbname, column_families_copy, handles, &db,
+ use_seq_per_batch, use_batch_per_txn);
+ if (s.ok()) {
+ ROCKS_LOG_WARN(db->GetDBOptions().info_log,
+ "Transaction write_policy is %" PRId32,
+ static_cast<int>(txn_db_options.write_policy));
+ // If WrapDB() returns non-OK, db will be deleted in WrapDB() via
+ // ~StackableDB().
+ s = WrapDB(db, txn_db_options, compaction_enabled_cf_indices, *handles,
+ dbptr);
+ }
+ return s;
+}
+
+void TransactionDB::PrepareWrap(
+ DBOptions* db_options, std::vector<ColumnFamilyDescriptor>* column_families,
+ std::vector<size_t>* compaction_enabled_cf_indices) {
+ compaction_enabled_cf_indices->clear();
+
+ // Enable MemTable History if not already enabled
+ for (size_t i = 0; i < column_families->size(); i++) {
+ ColumnFamilyOptions* cf_options = &(*column_families)[i].options;
+
+ if (cf_options->max_write_buffer_size_to_maintain == 0 &&
+ cf_options->max_write_buffer_number_to_maintain == 0) {
+ // Setting to -1 will set the History size to
+ // max_write_buffer_number * write_buffer_size.
+ cf_options->max_write_buffer_size_to_maintain = -1;
+ }
+ if (!cf_options->disable_auto_compactions) {
+ // Disable compactions momentarily to prevent race with DB::Open
+ cf_options->disable_auto_compactions = true;
+ compaction_enabled_cf_indices->push_back(i);
+ }
+ }
+ db_options->allow_2pc = true;
+}
+
+namespace {
+template <typename DBType>
+Status WrapAnotherDBInternal(
+ DBType* db, const TransactionDBOptions& txn_db_options,
+ const std::vector<size_t>& compaction_enabled_cf_indices,
+ const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr) {
+ assert(db != nullptr);
+ assert(dbptr != nullptr);
+ *dbptr = nullptr;
+ std::unique_ptr<PessimisticTransactionDB> txn_db;
+ // txn_db owns object pointed to by the raw db pointer.
+ switch (txn_db_options.write_policy) {
+ case WRITE_UNPREPARED:
+ txn_db.reset(new WriteUnpreparedTxnDB(
+ db, PessimisticTransactionDB::ValidateTxnDBOptions(txn_db_options)));
+ break;
+ case WRITE_PREPARED:
+ txn_db.reset(new WritePreparedTxnDB(
+ db, PessimisticTransactionDB::ValidateTxnDBOptions(txn_db_options)));
+ break;
+ case WRITE_COMMITTED:
+ default:
+ txn_db.reset(new WriteCommittedTxnDB(
+ db, PessimisticTransactionDB::ValidateTxnDBOptions(txn_db_options)));
+ }
+ txn_db->UpdateCFComparatorMap(handles);
+ Status s = txn_db->Initialize(compaction_enabled_cf_indices, handles);
+ // In case of a failure at this point, db is deleted via the txn_db destructor
+ // and set to nullptr.
+ if (s.ok()) {
+ *dbptr = txn_db.release();
+ } else {
+ for (auto* h : handles) {
+ delete h;
+ }
+ // txn_db still owns db, and ~StackableDB() will be called when txn_db goes
+ // out of scope, deleting the input db pointer.
+ ROCKS_LOG_FATAL(db->GetDBOptions().info_log,
+ "Failed to initialize txn_db: %s", s.ToString().c_str());
+ }
+ return s;
+}
+} // namespace
+
+Status TransactionDB::WrapDB(
+ // make sure this db is already opened with memtable history enabled,
+ // auto compaction disabled and two-phase commit enabled
+ DB* db, const TransactionDBOptions& txn_db_options,
+ const std::vector<size_t>& compaction_enabled_cf_indices,
+ const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr) {
+ return WrapAnotherDBInternal(db, txn_db_options,
+ compaction_enabled_cf_indices, handles, dbptr);
+}
+
+Status TransactionDB::WrapStackableDB(
+ // make sure this stackable_db is already opened with memtable history
+ // enabled, auto compaction disabled and two-phase commit enabled
+ StackableDB* db, const TransactionDBOptions& txn_db_options,
+ const std::vector<size_t>& compaction_enabled_cf_indices,
+ const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr) {
+ return WrapAnotherDBInternal(db, txn_db_options,
+ compaction_enabled_cf_indices, handles, dbptr);
+}
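+
+// Sketch of wrapping a DB that the caller opened itself (illustrative only;
+// `db_options`, `column_families`, `dbname` and `txn_db_options` are assumed
+// to be set up by the caller):
+//
+//   std::vector<size_t> compaction_enabled_cf_indices;
+//   TransactionDB::PrepareWrap(&db_options, &column_families,
+//                              &compaction_enabled_cf_indices);
+//   std::vector<ColumnFamilyHandle*> handles;
+//   DB* base_db = nullptr;
+//   Status s =
+//       DB::Open(db_options, dbname, column_families, &handles, &base_db);
+//   TransactionDB* txn_db = nullptr;
+//   if (s.ok()) {
+//     s = TransactionDB::WrapDB(base_db, txn_db_options,
+//                               compaction_enabled_cf_indices, handles,
+//                               &txn_db);
+//   }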
+
+// Let LockManager know that this column family exists so it can
+// allocate a LockMap for it.
+void PessimisticTransactionDB::AddColumnFamily(
+ const ColumnFamilyHandle* handle) {
+ lock_manager_->AddColumnFamily(handle);
+}
+
+Status PessimisticTransactionDB::CreateColumnFamily(
+ const ColumnFamilyOptions& options, const std::string& column_family_name,
+ ColumnFamilyHandle** handle) {
+ InstrumentedMutexLock l(&column_family_mutex_);
+ Status s = VerifyCFOptions(options);
+ if (!s.ok()) {
+ return s;
+ }
+
+ s = db_->CreateColumnFamily(options, column_family_name, handle);
+ if (s.ok()) {
+ lock_manager_->AddColumnFamily(*handle);
+ UpdateCFComparatorMap(*handle);
+ }
+
+ return s;
+}
+
+Status PessimisticTransactionDB::CreateColumnFamilies(
+ const ColumnFamilyOptions& options,
+ const std::vector<std::string>& column_family_names,
+ std::vector<ColumnFamilyHandle*>* handles) {
+ InstrumentedMutexLock l(&column_family_mutex_);
+
+ Status s = VerifyCFOptions(options);
+ if (!s.ok()) {
+ return s;
+ }
+
+ s = db_->CreateColumnFamilies(options, column_family_names, handles);
+ if (s.ok()) {
+ for (auto* handle : *handles) {
+ lock_manager_->AddColumnFamily(handle);
+ UpdateCFComparatorMap(handle);
+ }
+ }
+
+ return s;
+}
+
+Status PessimisticTransactionDB::CreateColumnFamilies(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles) {
+ InstrumentedMutexLock l(&column_family_mutex_);
+
+ for (auto& cf_desc : column_families) {
+ Status s = VerifyCFOptions(cf_desc.options);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ Status s = db_->CreateColumnFamilies(column_families, handles);
+ if (s.ok()) {
+ for (auto* handle : *handles) {
+ lock_manager_->AddColumnFamily(handle);
+ UpdateCFComparatorMap(handle);
+ }
+ }
+
+ return s;
+}
+
+// Let LockManager know that it can deallocate the LockMap for this
+// column family.
+Status PessimisticTransactionDB::DropColumnFamily(
+ ColumnFamilyHandle* column_family) {
+ InstrumentedMutexLock l(&column_family_mutex_);
+
+ Status s = db_->DropColumnFamily(column_family);
+ if (s.ok()) {
+ lock_manager_->RemoveColumnFamily(column_family);
+ }
+
+ return s;
+}
+
+Status PessimisticTransactionDB::DropColumnFamilies(
+ const std::vector<ColumnFamilyHandle*>& column_families) {
+ InstrumentedMutexLock l(&column_family_mutex_);
+
+ Status s = db_->DropColumnFamilies(column_families);
+ if (s.ok()) {
+ for (auto* handle : column_families) {
+ lock_manager_->RemoveColumnFamily(handle);
+ }
+ }
+
+ return s;
+}
+
+Status PessimisticTransactionDB::TryLock(PessimisticTransaction* txn,
+ uint32_t cfh_id,
+ const std::string& key,
+ bool exclusive) {
+ return lock_manager_->TryLock(txn, cfh_id, key, GetEnv(), exclusive);
+}
+
+Status PessimisticTransactionDB::TryRangeLock(PessimisticTransaction* txn,
+ uint32_t cfh_id,
+ const Endpoint& start_endp,
+ const Endpoint& end_endp) {
+ return lock_manager_->TryLock(txn, cfh_id, start_endp, end_endp, GetEnv(),
+ /*exclusive=*/true);
+}
+
+void PessimisticTransactionDB::UnLock(PessimisticTransaction* txn,
+ const LockTracker& keys) {
+ lock_manager_->UnLock(txn, keys, GetEnv());
+}
+
+void PessimisticTransactionDB::UnLock(PessimisticTransaction* txn,
+ uint32_t cfh_id, const std::string& key) {
+ lock_manager_->UnLock(txn, cfh_id, key, GetEnv());
+}
+
+// Used when wrapping DB write operations in a transaction
+Transaction* PessimisticTransactionDB::BeginInternalTransaction(
+ const WriteOptions& options) {
+ TransactionOptions txn_options;
+ Transaction* txn = BeginTransaction(options, txn_options, nullptr);
+
+ // Use default timeout for non-transactional writes
+ txn->SetLockTimeout(txn_db_options_.default_lock_timeout);
+ return txn;
+}
+
+// All user Put, Merge, Delete, and Write requests must be intercepted to make
+// sure that they lock all keys that they are writing to avoid causing conflicts
+// with any concurrent transactions. The easiest way to do this is to wrap all
+// write operations in a transaction.
+//
+// Put(), Merge(), and Delete() only lock a single key per call. Write() will
+// sort its keys before locking them. This guarantees that TransactionDB write
+// methods cannot deadlock with each other (but still could deadlock with a
+// Transaction).
+Status PessimisticTransactionDB::Put(const WriteOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& val) {
+ Status s = FailIfCfEnablesTs(this, column_family);
+ if (!s.ok()) {
+ return s;
+ }
+
+ Transaction* txn = BeginInternalTransaction(options);
+ txn->DisableIndexing();
+
+ // Since the client didn't create a transaction, they don't care about
+ // conflict checking for this write. So we just need to do PutUntracked().
+ s = txn->PutUntracked(column_family, key, val);
+
+ if (s.ok()) {
+ s = txn->Commit();
+ }
+
+ delete txn;
+
+ return s;
+}
+
+Status PessimisticTransactionDB::Delete(const WriteOptions& wopts,
+ ColumnFamilyHandle* column_family,
+ const Slice& key) {
+ Status s = FailIfCfEnablesTs(this, column_family);
+ if (!s.ok()) {
+ return s;
+ }
+
+ Transaction* txn = BeginInternalTransaction(wopts);
+ txn->DisableIndexing();
+
+ // Since the client didn't create a transaction, they don't care about
+ // conflict checking for this write. So we just need to do
+ // DeleteUntracked().
+ s = txn->DeleteUntracked(column_family, key);
+
+ if (s.ok()) {
+ s = txn->Commit();
+ }
+
+ delete txn;
+
+ return s;
+}
+
+Status PessimisticTransactionDB::SingleDelete(const WriteOptions& wopts,
+ ColumnFamilyHandle* column_family,
+ const Slice& key) {
+ Status s = FailIfCfEnablesTs(this, column_family);
+ if (!s.ok()) {
+ return s;
+ }
+
+ Transaction* txn = BeginInternalTransaction(wopts);
+ txn->DisableIndexing();
+
+ // Since the client didn't create a transaction, they don't care about
+ // conflict checking for this write. So we just need to do
+ // SingleDeleteUntracked().
+ s = txn->SingleDeleteUntracked(column_family, key);
+
+ if (s.ok()) {
+ s = txn->Commit();
+ }
+
+ delete txn;
+
+ return s;
+}
+
+Status PessimisticTransactionDB::Merge(const WriteOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value) {
+ Status s = FailIfCfEnablesTs(this, column_family);
+ if (!s.ok()) {
+ return s;
+ }
+
+ Transaction* txn = BeginInternalTransaction(options);
+ txn->DisableIndexing();
+
+ // Since the client didn't create a transaction, they don't care about
+ // conflict checking for this write. So we just need to do
+ // MergeUntracked().
+ s = txn->MergeUntracked(column_family, key, value);
+
+ if (s.ok()) {
+ s = txn->Commit();
+ }
+
+ delete txn;
+
+ return s;
+}
+
+Status PessimisticTransactionDB::Write(const WriteOptions& opts,
+ WriteBatch* updates) {
+ return WriteWithConcurrencyControl(opts, updates);
+}
+
+Status WriteCommittedTxnDB::Write(const WriteOptions& opts,
+ WriteBatch* updates) {
+ Status s = FailIfBatchHasTs(updates);
+ if (!s.ok()) {
+ return s;
+ }
+ if (txn_db_options_.skip_concurrency_control) {
+ return db_impl_->Write(opts, updates);
+ } else {
+ return WriteWithConcurrencyControl(opts, updates);
+ }
+}
+
+Status WriteCommittedTxnDB::Write(
+ const WriteOptions& opts,
+ const TransactionDBWriteOptimizations& optimizations, WriteBatch* updates) {
+ Status s = FailIfBatchHasTs(updates);
+ if (!s.ok()) {
+ return s;
+ }
+ if (optimizations.skip_concurrency_control) {
+ return db_impl_->Write(opts, updates);
+ } else {
+ return WriteWithConcurrencyControl(opts, updates);
+ }
+}
+
+void PessimisticTransactionDB::InsertExpirableTransaction(
+ TransactionID tx_id, PessimisticTransaction* tx) {
+ assert(tx->GetExpirationTime() > 0);
+ std::lock_guard<std::mutex> lock(map_mutex_);
+ expirable_transactions_map_.insert({tx_id, tx});
+}
+
+void PessimisticTransactionDB::RemoveExpirableTransaction(TransactionID tx_id) {
+ std::lock_guard<std::mutex> lock(map_mutex_);
+ expirable_transactions_map_.erase(tx_id);
+}
+
+bool PessimisticTransactionDB::TryStealingExpiredTransactionLocks(
+ TransactionID tx_id) {
+ std::lock_guard<std::mutex> lock(map_mutex_);
+
+ auto tx_it = expirable_transactions_map_.find(tx_id);
+ if (tx_it == expirable_transactions_map_.end()) {
+ return true;
+ }
+ PessimisticTransaction& tx = *(tx_it->second);
+ return tx.TryStealingLocks();
+}
+
+void PessimisticTransactionDB::ReinitializeTransaction(
+ Transaction* txn, const WriteOptions& write_options,
+ const TransactionOptions& txn_options) {
+ auto txn_impl = static_cast_with_check<PessimisticTransaction>(txn);
+
+ txn_impl->Reinitialize(this, write_options, txn_options);
+}
+
+Transaction* PessimisticTransactionDB::GetTransactionByName(
+ const TransactionName& name) {
+ std::lock_guard<std::mutex> lock(name_map_mutex_);
+ auto it = transactions_.find(name);
+ if (it == transactions_.end()) {
+ return nullptr;
+ } else {
+ return it->second;
+ }
+}
+
+void PessimisticTransactionDB::GetAllPreparedTransactions(
+ std::vector<Transaction*>* transv) {
+ assert(transv);
+ transv->clear();
+ std::lock_guard<std::mutex> lock(name_map_mutex_);
+ for (auto it = transactions_.begin(); it != transactions_.end(); ++it) {
+ if (it->second->GetState() == Transaction::PREPARED) {
+ transv->push_back(it->second);
+ }
+ }
+}
+
+LockManager::PointLockStatus PessimisticTransactionDB::GetLockStatusData() {
+ return lock_manager_->GetPointLockStatus();
+}
+
+std::vector<DeadlockPath> PessimisticTransactionDB::GetDeadlockInfoBuffer() {
+ return lock_manager_->GetDeadlockInfoBuffer();
+}
+
+void PessimisticTransactionDB::SetDeadlockInfoBufferSize(uint32_t target_size) {
+ lock_manager_->Resize(target_size);
+}
+
+void PessimisticTransactionDB::RegisterTransaction(Transaction* txn) {
+ assert(txn);
+ assert(txn->GetName().length() > 0);
+ assert(GetTransactionByName(txn->GetName()) == nullptr);
+ assert(txn->GetState() == Transaction::STARTED);
+ std::lock_guard<std::mutex> lock(name_map_mutex_);
+ transactions_[txn->GetName()] = txn;
+}
+
+void PessimisticTransactionDB::UnregisterTransaction(Transaction* txn) {
+ assert(txn);
+ std::lock_guard<std::mutex> lock(name_map_mutex_);
+ auto it = transactions_.find(txn->GetName());
+ assert(it != transactions_.end());
+ transactions_.erase(it);
+}
+
+std::pair<Status, std::shared_ptr<const Snapshot>>
+PessimisticTransactionDB::CreateTimestampedSnapshot(TxnTimestamp ts) {
+ if (kMaxTxnTimestamp == ts) {
+ return std::make_pair(Status::InvalidArgument("invalid ts"), nullptr);
+ }
+ assert(db_impl_);
+ return db_impl_->CreateTimestampedSnapshot(kMaxSequenceNumber, ts);
+}
+
+std::shared_ptr<const Snapshot>
+PessimisticTransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const {
+ assert(db_impl_);
+ return db_impl_->GetTimestampedSnapshot(ts);
+}
+
+void PessimisticTransactionDB::ReleaseTimestampedSnapshotsOlderThan(
+ TxnTimestamp ts) {
+ assert(db_impl_);
+ db_impl_->ReleaseTimestampedSnapshotsOlderThan(ts);
+}
+
+Status PessimisticTransactionDB::GetTimestampedSnapshots(
+ TxnTimestamp ts_lb, TxnTimestamp ts_ub,
+ std::vector<std::shared_ptr<const Snapshot>>& timestamped_snapshots) const {
+ assert(db_impl_);
+ return db_impl_->GetTimestampedSnapshots(ts_lb, ts_ub, timestamped_snapshots);
+}
+
+Status SnapshotCreationCallback::operator()(SequenceNumber seq,
+ bool disable_memtable) {
+ assert(db_impl_);
+ assert(commit_ts_ != kMaxTxnTimestamp);
+
+ const bool two_write_queues =
+ db_impl_->immutable_db_options().two_write_queues;
+ assert(!two_write_queues || !disable_memtable);
+#ifdef NDEBUG
+ (void)two_write_queues;
+ (void)disable_memtable;
+#endif
+
+ const bool seq_per_batch = db_impl_->seq_per_batch();
+ if (!seq_per_batch) {
+ assert(db_impl_->GetLastPublishedSequence() <= seq);
+ } else {
+ assert(db_impl_->GetLastPublishedSequence() < seq);
+ }
+
+ // Create a snapshot which can also be used for write conflict checking.
+ auto ret = db_impl_->CreateTimestampedSnapshot(seq, commit_ts_);
+ snapshot_creation_status_ = ret.first;
+ snapshot_ = ret.second;
+ if (snapshot_creation_status_.ok()) {
+ assert(snapshot_);
+ } else {
+ assert(!snapshot_);
+ }
+ if (snapshot_ && snapshot_notifier_) {
+ snapshot_notifier_->SnapshotCreated(snapshot_.get());
+ }
+ return Status::OK();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/pessimistic_transaction_db.h b/src/rocksdb/utilities/transactions/pessimistic_transaction_db.h
new file mode 100644
index 000000000..25cd11054
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/pessimistic_transaction_db.h
@@ -0,0 +1,318 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <mutex>
+#include <queue>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "db/db_iter.h"
+#include "db/read_callback.h"
+#include "db/snapshot_checker.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "util/cast_util.h"
+#include "utilities/transactions/lock/lock_manager.h"
+#include "utilities/transactions/lock/range/range_lock_manager.h"
+#include "utilities/transactions/pessimistic_transaction.h"
+#include "utilities/transactions/write_prepared_txn.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class PessimisticTransactionDB : public TransactionDB {
+ public:
+ explicit PessimisticTransactionDB(DB* db,
+ const TransactionDBOptions& txn_db_options);
+
+ explicit PessimisticTransactionDB(StackableDB* db,
+ const TransactionDBOptions& txn_db_options);
+
+ virtual ~PessimisticTransactionDB();
+
+ virtual const Snapshot* GetSnapshot() override { return db_->GetSnapshot(); }
+
+ virtual Status Initialize(
+ const std::vector<size_t>& compaction_enabled_cf_indices,
+ const std::vector<ColumnFamilyHandle*>& handles);
+
+ Transaction* BeginTransaction(const WriteOptions& write_options,
+ const TransactionOptions& txn_options,
+ Transaction* old_txn) override = 0;
+
+ using StackableDB::Put;
+ virtual Status Put(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& val) override;
+
+ using StackableDB::Delete;
+ virtual Status Delete(const WriteOptions& wopts,
+ ColumnFamilyHandle* column_family,
+ const Slice& key) override;
+
+ using StackableDB::SingleDelete;
+ virtual Status SingleDelete(const WriteOptions& wopts,
+ ColumnFamilyHandle* column_family,
+ const Slice& key) override;
+
+ using StackableDB::Merge;
+ virtual Status Merge(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) override;
+
+ using TransactionDB::Write;
+ virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override;
+ inline Status WriteWithConcurrencyControl(const WriteOptions& opts,
+ WriteBatch* updates) {
+ Status s;
+ if (opts.protection_bytes_per_key > 0) {
+ s = WriteBatchInternal::UpdateProtectionInfo(
+ updates, opts.protection_bytes_per_key);
+ }
+ if (s.ok()) {
+ // Need to lock all keys in this batch to prevent write conflicts with
+ // concurrent transactions.
+ Transaction* txn = BeginInternalTransaction(opts);
+ txn->DisableIndexing();
+
+ auto txn_impl = static_cast_with_check<PessimisticTransaction>(txn);
+
+      // Since CommitBatch sorts the keys before locking, concurrent Write()
+ // operations will not cause a deadlock.
+ // In order to avoid a deadlock with a concurrent Transaction,
+ // Transactions should use a lock timeout.
+ s = txn_impl->CommitBatch(updates);
+
+ delete txn;
+ }
+
+ return s;
+ }
+
+ using StackableDB::CreateColumnFamily;
+ virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
+ const std::string& column_family_name,
+ ColumnFamilyHandle** handle) override;
+
+ Status CreateColumnFamilies(
+ const ColumnFamilyOptions& options,
+ const std::vector<std::string>& column_family_names,
+ std::vector<ColumnFamilyHandle*>* handles) override;
+
+ Status CreateColumnFamilies(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles) override;
+
+ using StackableDB::DropColumnFamily;
+ virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override;
+
+ Status DropColumnFamilies(
+ const std::vector<ColumnFamilyHandle*>& column_families) override;
+
+ Status TryLock(PessimisticTransaction* txn, uint32_t cfh_id,
+ const std::string& key, bool exclusive);
+ Status TryRangeLock(PessimisticTransaction* txn, uint32_t cfh_id,
+ const Endpoint& start_endp, const Endpoint& end_endp);
+
+ void UnLock(PessimisticTransaction* txn, const LockTracker& keys);
+ void UnLock(PessimisticTransaction* txn, uint32_t cfh_id,
+ const std::string& key);
+
+ void AddColumnFamily(const ColumnFamilyHandle* handle);
+
+ static TransactionDBOptions ValidateTxnDBOptions(
+ const TransactionDBOptions& txn_db_options);
+
+ const TransactionDBOptions& GetTxnDBOptions() const {
+ return txn_db_options_;
+ }
+
+ void InsertExpirableTransaction(TransactionID tx_id,
+ PessimisticTransaction* tx);
+ void RemoveExpirableTransaction(TransactionID tx_id);
+
+  // If the transaction is no longer available, its locks can be stolen.
+  // If the transaction is available, try stealing locks directly from it.
+ // It is the caller's responsibility to ensure that the referred transaction
+ // is expirable (GetExpirationTime() > 0) and that it is expired.
+ bool TryStealingExpiredTransactionLocks(TransactionID tx_id);
+
+ Transaction* GetTransactionByName(const TransactionName& name) override;
+
+ void RegisterTransaction(Transaction* txn);
+ void UnregisterTransaction(Transaction* txn);
+
+  // Not thread-safe. The current use case is during recovery (single thread).
+ void GetAllPreparedTransactions(std::vector<Transaction*>* trans) override;
+
+ LockManager::PointLockStatus GetLockStatusData() override;
+
+ std::vector<DeadlockPath> GetDeadlockInfoBuffer() override;
+ void SetDeadlockInfoBufferSize(uint32_t target_size) override;
+
+  // The default implementation does nothing. The actual implementation is moved
+  // to the child classes that actually need this information. This was due to
+  // an odd performance drop we observed when we added a std::atomic member to
+  // the base class, even when the subclasses did not read it in the fast path.
+ virtual void UpdateCFComparatorMap(const std::vector<ColumnFamilyHandle*>&) {}
+ virtual void UpdateCFComparatorMap(ColumnFamilyHandle*) {}
+
+ // Use the returned factory to create LockTrackers in transactions.
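+  // (TransactionBaseImpl does exactly this when initializing its lock
+  // tracker: tracked_locks_(lock_tracker_factory_.Create()); see
+  // transaction_base.cc.)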
+ const LockTrackerFactory& GetLockTrackerFactory() const {
+ return lock_manager_->GetLockTrackerFactory();
+ }
+
+ std::pair<Status, std::shared_ptr<const Snapshot>> CreateTimestampedSnapshot(
+ TxnTimestamp ts) override;
+
+ std::shared_ptr<const Snapshot> GetTimestampedSnapshot(
+ TxnTimestamp ts) const override;
+
+ void ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts) override;
+
+ Status GetTimestampedSnapshots(TxnTimestamp ts_lb, TxnTimestamp ts_ub,
+ std::vector<std::shared_ptr<const Snapshot>>&
+ timestamped_snapshots) const override;
+
+ protected:
+ DBImpl* db_impl_;
+ std::shared_ptr<Logger> info_log_;
+ const TransactionDBOptions txn_db_options_;
+
+ static Status FailIfBatchHasTs(const WriteBatch* wb);
+
+ static Status FailIfCfEnablesTs(const DB* db,
+ const ColumnFamilyHandle* column_family);
+
+ void ReinitializeTransaction(
+ Transaction* txn, const WriteOptions& write_options,
+ const TransactionOptions& txn_options = TransactionOptions());
+
+ virtual Status VerifyCFOptions(const ColumnFamilyOptions& cf_options);
+
+ private:
+ friend class WritePreparedTxnDB;
+ friend class WritePreparedTxnDBMock;
+ friend class WriteUnpreparedTxn;
+ friend class TransactionTest_DoubleCrashInRecovery_Test;
+ friend class TransactionTest_DoubleEmptyWrite_Test;
+ friend class TransactionTest_DuplicateKeys_Test;
+ friend class TransactionTest_PersistentTwoPhaseTransactionTest_Test;
+ friend class TransactionTest_TwoPhaseDoubleRecoveryTest_Test;
+ friend class TransactionTest_TwoPhaseOutOfOrderDelete_Test;
+ friend class TransactionStressTest_TwoPhaseLongPrepareTest_Test;
+ friend class WriteUnpreparedTransactionTest_RecoveryTest_Test;
+ friend class WriteUnpreparedTransactionTest_MarkLogWithPrepSection_Test;
+
+ Transaction* BeginInternalTransaction(const WriteOptions& options);
+
+ std::shared_ptr<LockManager> lock_manager_;
+
+ // Must be held when adding/dropping column families.
+ InstrumentedMutex column_family_mutex_;
+
+ // Used to ensure that no locks are stolen from an expirable transaction
+ // that has started a commit. Only transactions with an expiration time
+ // should be in this map.
+ std::mutex map_mutex_;
+ std::unordered_map<TransactionID, PessimisticTransaction*>
+ expirable_transactions_map_;
+
+  // Map from transaction name to two-phase transaction instance.
+ std::mutex name_map_mutex_;
+ std::unordered_map<TransactionName, Transaction*> transactions_;
+
+ // Signal that we are testing a crash scenario. Some asserts could be relaxed
+ // in such cases.
+ virtual void TEST_Crash() {}
+};
+
+// A PessimisticTransactionDB that writes the data to the DB after the commit.
+// In this way the DB only contains the committed data.
+class WriteCommittedTxnDB : public PessimisticTransactionDB {
+ public:
+ explicit WriteCommittedTxnDB(DB* db,
+ const TransactionDBOptions& txn_db_options)
+ : PessimisticTransactionDB(db, txn_db_options) {}
+
+ explicit WriteCommittedTxnDB(StackableDB* db,
+ const TransactionDBOptions& txn_db_options)
+ : PessimisticTransactionDB(db, txn_db_options) {}
+
+ virtual ~WriteCommittedTxnDB() {}
+
+ Transaction* BeginTransaction(const WriteOptions& write_options,
+ const TransactionOptions& txn_options,
+ Transaction* old_txn) override;
+
+  // Optimized version of ::Write that makes use of the skip_concurrency_control
+  // hint.
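+  //
+  // Hypothetical caller sketch (illustrative only; assumes the caller has
+  // already guaranteed that no concurrent transaction touches these keys):
+  //   TransactionDBWriteOptimizations optimizations;
+  //   optimizations.skip_concurrency_control = true;
+  //   Status s = txn_db->Write(WriteOptions(), optimizations, &batch);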
+ using TransactionDB::Write;
+ virtual Status Write(const WriteOptions& opts,
+ const TransactionDBWriteOptimizations& optimizations,
+ WriteBatch* updates) override;
+ virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override;
+};
+
+inline Status PessimisticTransactionDB::FailIfBatchHasTs(
+ const WriteBatch* batch) {
+ if (batch != nullptr && WriteBatchInternal::HasKeyWithTimestamp(*batch)) {
+ return Status::NotSupported(
+ "Writes with timestamp must go through transaction API instead of "
+ "TransactionDB.");
+ }
+ return Status::OK();
+}
+
+inline Status PessimisticTransactionDB::FailIfCfEnablesTs(
+ const DB* db, const ColumnFamilyHandle* column_family) {
+ assert(db);
+ column_family = column_family ? column_family : db->DefaultColumnFamily();
+ assert(column_family);
+ const Comparator* const ucmp = column_family->GetComparator();
+ assert(ucmp);
+ if (ucmp->timestamp_size() > 0) {
+ return Status::NotSupported(
+ "Write operation with user timestamp must go through the transaction "
+ "API instead of TransactionDB.");
+ }
+ return Status::OK();
+}
+
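+// Callback invoked after a committing transaction's write reaches the
+// memtable: operator() creates a timestamped snapshot at commit_ts_ and, if
+// snapshot_notifier_ is set, calls SnapshotCreated() on it (see the
+// definition in pessimistic_transaction_db.cc).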
+class SnapshotCreationCallback : public PostMemTableCallback {
+ public:
+ explicit SnapshotCreationCallback(
+ DBImpl* dbi, TxnTimestamp commit_ts,
+ const std::shared_ptr<TransactionNotifier>& notifier,
+ std::shared_ptr<const Snapshot>& snapshot)
+ : db_impl_(dbi),
+ commit_ts_(commit_ts),
+ snapshot_notifier_(notifier),
+ snapshot_(snapshot) {
+ assert(db_impl_);
+ }
+
+ ~SnapshotCreationCallback() override {
+ snapshot_creation_status_.PermitUncheckedError();
+ }
+
+ Status operator()(SequenceNumber seq, bool disable_memtable) override;
+
+ private:
+ DBImpl* const db_impl_;
+ const TxnTimestamp commit_ts_;
+ std::shared_ptr<TransactionNotifier> snapshot_notifier_;
+ std::shared_ptr<const Snapshot>& snapshot_;
+
+ Status snapshot_creation_status_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/snapshot_checker.cc b/src/rocksdb/utilities/transactions/snapshot_checker.cc
new file mode 100644
index 000000000..76d16681a
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/snapshot_checker.cc
@@ -0,0 +1,53 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/snapshot_checker.h"
+
+#ifdef ROCKSDB_LITE
+#include <assert.h>
+#endif // ROCKSDB_LITE
+
+#include "port/lang.h"
+#include "utilities/transactions/write_prepared_txn_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifdef ROCKSDB_LITE
+WritePreparedSnapshotChecker::WritePreparedSnapshotChecker(
+ WritePreparedTxnDB* /*txn_db*/) {}
+
+SnapshotCheckerResult WritePreparedSnapshotChecker::CheckInSnapshot(
+ SequenceNumber /*sequence*/, SequenceNumber /*snapshot_sequence*/) const {
+ // Should never be called in LITE mode.
+ assert(false);
+ return SnapshotCheckerResult::kInSnapshot;
+}
+
+#else
+
+WritePreparedSnapshotChecker::WritePreparedSnapshotChecker(
+ WritePreparedTxnDB* txn_db)
+    : txn_db_(txn_db) {}
+
+SnapshotCheckerResult WritePreparedSnapshotChecker::CheckInSnapshot(
+ SequenceNumber sequence, SequenceNumber snapshot_sequence) const {
+ bool snapshot_released = false;
+ // TODO(myabandeh): set min_uncommitted
+ bool in_snapshot = txn_db_->IsInSnapshot(
+ sequence, snapshot_sequence, kMinUnCommittedSeq, &snapshot_released);
+ if (snapshot_released) {
+ return SnapshotCheckerResult::kSnapshotReleased;
+ }
+ return in_snapshot ? SnapshotCheckerResult::kInSnapshot
+ : SnapshotCheckerResult::kNotInSnapshot;
+}
+
+#endif // ROCKSDB_LITE
+
+DisableGCSnapshotChecker* DisableGCSnapshotChecker::Instance() {
+ STATIC_AVOID_DESTRUCTION(DisableGCSnapshotChecker, instance);
+ return &instance;
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/transactions/timestamped_snapshot_test.cc b/src/rocksdb/utilities/transactions/timestamped_snapshot_test.cc
new file mode 100644
index 000000000..e9b474415
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/timestamped_snapshot_test.cc
@@ -0,0 +1,466 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifdef ROCKSDB_LITE
+#include <cstdio>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as Transactions are not supported in LITE mode\n");
+ return 0;
+}
+#else // ROCKSDB_LITE
+#include <cassert>
+
+#include "util/cast_util.h"
+#include "utilities/transactions/transaction_test.h"
+
+namespace ROCKSDB_NAMESPACE {
+INSTANTIATE_TEST_CASE_P(
+ Unsupported, TimestampedSnapshotWithTsSanityCheck,
+ ::testing::Values(
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite),
+ std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite)));
+
+INSTANTIATE_TEST_CASE_P(WriteCommitted, TransactionTest,
+ ::testing::Combine(::testing::Bool(), ::testing::Bool(),
+ ::testing::Values(WRITE_COMMITTED),
+ ::testing::Values(kOrderedWrite)));
+
+namespace {
+// Not thread-safe. Caller needs to provide external synchronization.
+class TsCheckingTxnNotifier : public TransactionNotifier {
+ public:
+ explicit TsCheckingTxnNotifier() = default;
+
+ ~TsCheckingTxnNotifier() override {}
+
+ void SnapshotCreated(const Snapshot* new_snapshot) override {
+ assert(new_snapshot);
+ if (prev_snapshot_seq_ != kMaxSequenceNumber) {
+ assert(prev_snapshot_seq_ <= new_snapshot->GetSequenceNumber());
+ }
+ prev_snapshot_seq_ = new_snapshot->GetSequenceNumber();
+ if (prev_snapshot_ts_ != kMaxTxnTimestamp) {
+ assert(prev_snapshot_ts_ <= new_snapshot->GetTimestamp());
+ }
+ prev_snapshot_ts_ = new_snapshot->GetTimestamp();
+ }
+
+ TxnTimestamp prev_snapshot_ts() const { return prev_snapshot_ts_; }
+
+ private:
+ SequenceNumber prev_snapshot_seq_ = kMaxSequenceNumber;
+ TxnTimestamp prev_snapshot_ts_ = kMaxTxnTimestamp;
+};
+} // anonymous namespace
+
+TEST_P(TimestampedSnapshotWithTsSanityCheck, WithoutCommitTs) {
+ std::unique_ptr<Transaction> txn(
+ db->BeginTransaction(WriteOptions(), TransactionOptions()));
+ assert(txn);
+ ASSERT_OK(txn->SetName("txn0"));
+ ASSERT_OK(txn->Put("a", "v"));
+ ASSERT_OK(txn->Prepare());
+ Status s = txn->CommitAndTryCreateSnapshot();
+ ASSERT_TRUE(s.IsInvalidArgument());
+ ASSERT_OK(txn->Rollback());
+
+ txn.reset(db->BeginTransaction(WriteOptions(), TransactionOptions()));
+ assert(txn);
+ ASSERT_OK(txn->SetName("txn0"));
+ ASSERT_OK(txn->Put("a", "v"));
+ s = txn->CommitAndTryCreateSnapshot();
+ ASSERT_TRUE(s.IsInvalidArgument());
+}
+
+TEST_P(TimestampedSnapshotWithTsSanityCheck, SetCommitTs) {
+ std::unique_ptr<Transaction> txn(
+ db->BeginTransaction(WriteOptions(), TransactionOptions()));
+ assert(txn);
+ ASSERT_OK(txn->SetName("txn0"));
+ ASSERT_OK(txn->Put("a", "v"));
+ ASSERT_OK(txn->Prepare());
+ std::shared_ptr<const Snapshot> snapshot;
+ Status s = txn->CommitAndTryCreateSnapshot(nullptr, 10, &snapshot);
+ ASSERT_TRUE(s.IsNotSupported());
+ ASSERT_OK(txn->Rollback());
+
+ txn.reset(db->BeginTransaction(WriteOptions(), TransactionOptions()));
+ assert(txn);
+ ASSERT_OK(txn->SetName("txn0"));
+ ASSERT_OK(txn->Put("a", "v"));
+ s = txn->CommitAndTryCreateSnapshot(nullptr, 10, &snapshot);
+ ASSERT_TRUE(s.IsNotSupported());
+}
+
+TEST_P(TransactionTest, WithoutCommitTs) {
+ std::unique_ptr<Transaction> txn(
+ db->BeginTransaction(WriteOptions(), TransactionOptions()));
+ assert(txn);
+ ASSERT_OK(txn->SetName("txn0"));
+ ASSERT_OK(txn->Put("a", "v"));
+ ASSERT_OK(txn->Prepare());
+ Status s = txn->CommitAndTryCreateSnapshot();
+ ASSERT_TRUE(s.IsInvalidArgument());
+ ASSERT_OK(txn->Rollback());
+
+ txn.reset(db->BeginTransaction(WriteOptions(), TransactionOptions()));
+ assert(txn);
+ ASSERT_OK(txn->SetName("txn0"));
+ ASSERT_OK(txn->Put("a", "v"));
+ s = txn->CommitAndTryCreateSnapshot();
+ ASSERT_TRUE(s.IsInvalidArgument());
+}
+
+TEST_P(TransactionTest, ReuseExistingTxn) {
+ Transaction* txn = db->BeginTransaction(WriteOptions(), TransactionOptions());
+ assert(txn);
+ ASSERT_OK(txn->SetName("txn0"));
+ ASSERT_OK(txn->Put("a", "v1"));
+ ASSERT_OK(txn->Prepare());
+
+ auto notifier = std::make_shared<TsCheckingTxnNotifier>();
+ std::shared_ptr<const Snapshot> snapshot1;
+ Status s =
+ txn->CommitAndTryCreateSnapshot(notifier, /*commit_ts=*/100, &snapshot1);
+ ASSERT_OK(s);
+ ASSERT_EQ(100, snapshot1->GetTimestamp());
+
+ Transaction* txn1 =
+ db->BeginTransaction(WriteOptions(), TransactionOptions(), txn);
+ assert(txn1 == txn);
+ ASSERT_OK(txn1->SetName("txn1"));
+ ASSERT_OK(txn->Put("a", "v2"));
+ ASSERT_OK(txn->Prepare());
+ std::shared_ptr<const Snapshot> snapshot2;
+ s = txn->CommitAndTryCreateSnapshot(notifier, /*commit_ts=*/110, &snapshot2);
+ ASSERT_OK(s);
+ ASSERT_EQ(110, snapshot2->GetTimestamp());
+ delete txn;
+
+ {
+ std::string value;
+ ReadOptions read_opts;
+ read_opts.snapshot = snapshot1.get();
+ ASSERT_OK(db->Get(read_opts, "a", &value));
+ ASSERT_EQ("v1", value);
+
+ read_opts.snapshot = snapshot2.get();
+ ASSERT_OK(db->Get(read_opts, "a", &value));
+ ASSERT_EQ("v2", value);
+ }
+}
+
+TEST_P(TransactionTest, CreateSnapshotWhenCommit) {
+ std::unique_ptr<Transaction> txn(
+ db->BeginTransaction(WriteOptions(), TransactionOptions()));
+ assert(txn);
+
+ constexpr int batch_size = 10;
+ for (int i = 0; i < batch_size; ++i) {
+ ASSERT_OK(db->Put(WriteOptions(), "k" + std::to_string(i), "v0"));
+ }
+ const SequenceNumber seq0 = db->GetLatestSequenceNumber();
+ ASSERT_EQ(static_cast<SequenceNumber>(batch_size), seq0);
+
+ txn->SetSnapshot();
+ {
+ const Snapshot* const snapshot = txn->GetSnapshot();
+ assert(snapshot);
+ ASSERT_EQ(seq0, snapshot->GetSequenceNumber());
+ }
+
+ for (int i = 0; i < batch_size; ++i) {
+ ASSERT_OK(txn->Put("k" + std::to_string(i), "v1"));
+ }
+ ASSERT_OK(txn->SetName("txn0"));
+ ASSERT_OK(txn->Prepare());
+
+ std::shared_ptr<const Snapshot> snapshot;
+ constexpr TxnTimestamp timestamp = 1;
+ auto notifier = std::make_shared<TsCheckingTxnNotifier>();
+ Status s = txn->CommitAndTryCreateSnapshot(notifier, timestamp, &snapshot);
+ ASSERT_OK(s);
+ ASSERT_LT(notifier->prev_snapshot_ts(), kMaxTxnTimestamp);
+ assert(snapshot);
+ ASSERT_EQ(timestamp, snapshot->GetTimestamp());
+ ASSERT_EQ(seq0 + batch_size, snapshot->GetSequenceNumber());
+ const Snapshot* const raw_snapshot_ptr = txn->GetSnapshot();
+ ASSERT_EQ(raw_snapshot_ptr, snapshot.get());
+ ASSERT_EQ(snapshot, txn->GetTimestampedSnapshot());
+
+ {
+ std::shared_ptr<const Snapshot> snapshot1 =
+ db->GetLatestTimestampedSnapshot();
+ ASSERT_EQ(snapshot, snapshot1);
+ }
+ {
+ std::shared_ptr<const Snapshot> snapshot1 =
+ db->GetTimestampedSnapshot(timestamp);
+ ASSERT_EQ(snapshot, snapshot1);
+ }
+ {
+ std::vector<std::shared_ptr<const Snapshot> > snapshots;
+ s = db->GetAllTimestampedSnapshots(snapshots);
+ ASSERT_OK(s);
+ ASSERT_EQ(std::vector<std::shared_ptr<const Snapshot> >{snapshot},
+ snapshots);
+ }
+}
+
+TEST_P(TransactionTest, CreateSnapshot) {
+ // First create a non-timestamped snapshot
+ ManagedSnapshot snapshot_guard(db);
+ for (int i = 0; i < 10; ++i) {
+ ASSERT_OK(db->Put(WriteOptions(), "k" + std::to_string(i),
+ "v0_" + std::to_string(i)));
+ }
+ {
+ auto ret = db->CreateTimestampedSnapshot(kMaxTxnTimestamp);
+ ASSERT_TRUE(ret.first.IsInvalidArgument());
+ auto snapshot = ret.second;
+ ASSERT_EQ(nullptr, snapshot.get());
+ }
+ constexpr TxnTimestamp timestamp = 100;
+ Status s;
+ std::shared_ptr<const Snapshot> ts_snap0;
+ std::tie(s, ts_snap0) = db->CreateTimestampedSnapshot(timestamp);
+ ASSERT_OK(s);
+ assert(ts_snap0);
+ ASSERT_EQ(timestamp, ts_snap0->GetTimestamp());
+ for (int i = 0; i < 10; ++i) {
+ ASSERT_OK(db->Delete(WriteOptions(), "k" + std::to_string(i)));
+ }
+ {
+ ReadOptions read_opts;
+ read_opts.snapshot = ts_snap0.get();
+ for (int i = 0; i < 10; ++i) {
+ std::string value;
+ s = db->Get(read_opts, "k" + std::to_string(i), &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("v0_" + std::to_string(i), value);
+ }
+ }
+ {
+ std::shared_ptr<const Snapshot> snapshot =
+ db->GetLatestTimestampedSnapshot();
+ ASSERT_EQ(ts_snap0, snapshot);
+ }
+ {
+ std::shared_ptr<const Snapshot> snapshot =
+ db->GetTimestampedSnapshot(timestamp);
+ ASSERT_OK(s);
+ ASSERT_EQ(ts_snap0, snapshot);
+ }
+ {
+ std::vector<std::shared_ptr<const Snapshot> > snapshots;
+ s = db->GetAllTimestampedSnapshots(snapshots);
+ ASSERT_OK(s);
+ ASSERT_EQ(std::vector<std::shared_ptr<const Snapshot> >{ts_snap0},
+ snapshots);
+ }
+}
+
+TEST_P(TransactionTest, SequenceAndTsOrder) {
+ Status s;
+ std::shared_ptr<const Snapshot> snapshot;
+ std::tie(s, snapshot) = db->CreateTimestampedSnapshot(100);
+ ASSERT_OK(s);
+ assert(snapshot);
+ {
+ // Cannot request smaller timestamp for the new timestamped snapshot.
+ std::shared_ptr<const Snapshot> tmp_snapshot;
+ std::tie(s, tmp_snapshot) = db->CreateTimestampedSnapshot(50);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ ASSERT_EQ(nullptr, tmp_snapshot.get());
+ }
+
+ // If requesting a new timestamped snapshot with the same timestamp and
+  // sequence number, we avoid creating a new snapshot object and reuse the
+  // existing one.
+ std::shared_ptr<const Snapshot> snapshot1;
+ std::tie(s, snapshot1) = db->CreateTimestampedSnapshot(100);
+ ASSERT_OK(s);
+ ASSERT_EQ(snapshot.get(), snapshot1.get());
+
+ // If there is no write, but we request a larger timestamp, we still create
+ // a new snapshot object.
+ std::shared_ptr<const Snapshot> snapshot2;
+ std::tie(s, snapshot2) = db->CreateTimestampedSnapshot(200);
+ ASSERT_OK(s);
+ assert(snapshot2);
+ ASSERT_NE(snapshot.get(), snapshot2.get());
+ ASSERT_EQ(snapshot2->GetSequenceNumber(), snapshot->GetSequenceNumber());
+ ASSERT_EQ(200, snapshot2->GetTimestamp());
+
+ // Increase sequence number.
+ ASSERT_OK(db->Put(WriteOptions(), "foo", "v0"));
+ {
+ // We are requesting the same timestamp for a larger sequence number, thus
+ // we cannot create timestamped snapshot.
+ std::shared_ptr<const Snapshot> tmp_snapshot;
+ std::tie(s, tmp_snapshot) = db->CreateTimestampedSnapshot(200);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ ASSERT_EQ(nullptr, tmp_snapshot.get());
+ }
+ {
+ std::unique_ptr<Transaction> txn1(
+ db->BeginTransaction(WriteOptions(), TransactionOptions()));
+ ASSERT_OK(txn1->Put("bar", "v0"));
+ std::shared_ptr<const Snapshot> ss;
+ ASSERT_OK(txn1->CommitAndTryCreateSnapshot(nullptr, 200, &ss));
+ // Cannot create snapshot because requested timestamp is the same as the
+ // latest timestamped snapshot while sequence number is strictly higher.
+ ASSERT_EQ(nullptr, ss);
+ }
+ {
+ std::unique_ptr<Transaction> txn2(
+ db->BeginTransaction(WriteOptions(), TransactionOptions()));
+ ASSERT_OK(txn2->Put("bar", "v0"));
+ std::shared_ptr<const Snapshot> ss;
+ // Application should never do this. This is just to demonstrate error
+ // handling.
+ ASSERT_OK(txn2->CommitAndTryCreateSnapshot(nullptr, 100, &ss));
+ // Cannot create snapshot because requested timestamp is smaller than
+ // latest timestamped snapshot.
+ ASSERT_EQ(nullptr, ss);
+ }
+}
+
+TEST_P(TransactionTest, CloseDbWithSnapshots) {
+ std::unique_ptr<Transaction> txn(
+ db->BeginTransaction(WriteOptions(), TransactionOptions()));
+ ASSERT_OK(txn->SetName("txn0"));
+ ASSERT_OK(txn->Put("foo", "v"));
+ ASSERT_OK(txn->Prepare());
+ std::shared_ptr<const Snapshot> snapshot;
+ constexpr TxnTimestamp timestamp = 121;
+ auto notifier = std::make_shared<TsCheckingTxnNotifier>();
+ ASSERT_OK(txn->CommitAndTryCreateSnapshot(notifier, timestamp, &snapshot));
+ assert(snapshot);
+ ASSERT_LT(notifier->prev_snapshot_ts(), kMaxTxnTimestamp);
+ ASSERT_EQ(timestamp, snapshot->GetTimestamp());
+ ASSERT_TRUE(db->Close().IsAborted());
+}
+
+TEST_P(TransactionTest, MultipleTimestampedSnapshots) {
+ auto* dbimpl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ assert(dbimpl);
+ const bool seq_per_batch = dbimpl->seq_per_batch();
+ // TODO: remove the following assert(!seq_per_batch) once timestamped snapshot
+ // is supported in write-prepared/write-unprepared transactions.
+ assert(!seq_per_batch);
+ constexpr size_t txn_size = 10;
+ constexpr TxnTimestamp ts_delta = 10;
+ constexpr size_t num_txns = 100;
+ std::vector<std::shared_ptr<const Snapshot> > snapshots(num_txns);
+ constexpr TxnTimestamp start_ts = 10000;
+ auto notifier = std::make_shared<TsCheckingTxnNotifier>();
+ for (size_t i = 0; i < num_txns; ++i) {
+ std::unique_ptr<Transaction> txn(
+ db->BeginTransaction(WriteOptions(), TransactionOptions()));
+ ASSERT_OK(txn->SetName("txn" + std::to_string(i)));
+ for (size_t j = 0; j < txn_size; ++j) {
+ ASSERT_OK(txn->Put("k" + std::to_string(j),
+ "v" + std::to_string(j) + "_" + std::to_string(i)));
+ }
+ if (0 == (i % 2)) {
+ ASSERT_OK(txn->Prepare());
+ }
+ ASSERT_OK(txn->CommitAndTryCreateSnapshot(notifier, start_ts + i * ts_delta,
+ &snapshots[i]));
+ assert(snapshots[i]);
+ ASSERT_LT(notifier->prev_snapshot_ts(), kMaxTxnTimestamp);
+ ASSERT_EQ(start_ts + i * ts_delta, snapshots[i]->GetTimestamp());
+ }
+
+ {
+ auto snapshot = db->GetTimestampedSnapshot(start_ts + 1);
+ ASSERT_EQ(nullptr, snapshot);
+ }
+
+ constexpr TxnTimestamp max_ts = start_ts + num_txns * ts_delta;
+ for (size_t i = 0; i < num_txns; ++i) {
+ auto snapshot = db->GetTimestampedSnapshot(start_ts + i * ts_delta);
+ ASSERT_EQ(snapshots[i], snapshot);
+
+ std::vector<std::shared_ptr<const Snapshot> > tmp_snapshots;
+ Status s = db->GetTimestampedSnapshots(max_ts, start_ts + i * ts_delta,
+ tmp_snapshots);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ ASSERT_TRUE(tmp_snapshots.empty());
+
+ for (size_t j = i; j < num_txns; ++j) {
+ std::vector<std::shared_ptr<const Snapshot> > expected_snapshots(
+ snapshots.begin() + i, snapshots.begin() + j);
+ tmp_snapshots.clear();
+ s = db->GetTimestampedSnapshots(start_ts + i * ts_delta,
+ start_ts + j * ts_delta, tmp_snapshots);
+ if (i < j) {
+ ASSERT_OK(s);
+ } else {
+ ASSERT_TRUE(s.IsInvalidArgument());
+ }
+ ASSERT_EQ(expected_snapshots, tmp_snapshots);
+ }
+ }
+
+ {
+ std::vector<std::shared_ptr<const Snapshot> > tmp_snapshots;
+ const Status s = db->GetAllTimestampedSnapshots(tmp_snapshots);
+ ASSERT_OK(s);
+ ASSERT_EQ(snapshots, tmp_snapshots);
+
+ const std::shared_ptr<const Snapshot> latest_snapshot =
+ db->GetLatestTimestampedSnapshot();
+ ASSERT_EQ(snapshots.back(), latest_snapshot);
+ }
+
+ for (size_t i = 0; i <= num_txns; ++i) {
+ std::vector<std::shared_ptr<const Snapshot> > snapshots1(
+ snapshots.begin() + i, snapshots.end());
+ if (i > 0) {
+ auto snapshot1 =
+ db->GetTimestampedSnapshot(start_ts + (i - 1) * ts_delta);
+ assert(snapshot1);
+ ASSERT_EQ(start_ts + (i - 1) * ts_delta, snapshot1->GetTimestamp());
+ }
+
+ db->ReleaseTimestampedSnapshotsOlderThan(start_ts + i * ts_delta);
+
+ if (i > 0) {
+ auto snapshot1 =
+ db->GetTimestampedSnapshot(start_ts + (i - 1) * ts_delta);
+ ASSERT_EQ(nullptr, snapshot1);
+ }
+
+ std::vector<std::shared_ptr<const Snapshot> > tmp_snapshots;
+ const Status s = db->GetAllTimestampedSnapshots(tmp_snapshots);
+ ASSERT_OK(s);
+ ASSERT_EQ(snapshots1, tmp_snapshots);
+ }
+
+  // Even after being released by the db, applications still hold references to
+  // the shared snapshots.
+ for (size_t i = 0; i < num_txns; ++i) {
+ assert(snapshots[i]);
+ ASSERT_EQ(start_ts + i * ts_delta, snapshots[i]->GetTimestamp());
+ }
+
+ snapshots.clear();
+ ASSERT_OK(db->Close());
+ delete db;
+ db = nullptr;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/transaction_base.cc b/src/rocksdb/utilities/transactions/transaction_base.cc
new file mode 100644
index 000000000..83fd94ac8
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/transaction_base.cc
@@ -0,0 +1,731 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/transaction_base.h"
+
+#include <cinttypes>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "logging/logging.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+#include "util/cast_util.h"
+#include "util/string_util.h"
+#include "utilities/transactions/lock/lock_tracker.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status Transaction::CommitAndTryCreateSnapshot(
+ std::shared_ptr<TransactionNotifier> notifier, TxnTimestamp ts,
+ std::shared_ptr<const Snapshot>* snapshot) {
+ if (snapshot) {
+ snapshot->reset();
+ }
+ TxnTimestamp commit_ts = GetCommitTimestamp();
+ if (commit_ts == kMaxTxnTimestamp) {
+ if (ts == kMaxTxnTimestamp) {
+ return Status::InvalidArgument("Commit timestamp unset");
+ } else {
+ const Status s = SetCommitTimestamp(ts);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ } else if (ts != kMaxTxnTimestamp) {
+ if (ts != commit_ts) {
+ // For now we treat this as error.
+ return Status::InvalidArgument("Different commit ts specified");
+ }
+ }
+ SetSnapshotOnNextOperation(notifier);
+ Status s = Commit();
+ if (!s.ok()) {
+ return s;
+ }
+ assert(s.ok());
+ // If we reach here, we must return ok status for this function.
+ std::shared_ptr<const Snapshot> new_snapshot = GetTimestampedSnapshot();
+
+ if (snapshot) {
+ *snapshot = new_snapshot;
+ }
+ return Status::OK();
+}
+
+TransactionBaseImpl::TransactionBaseImpl(
+ DB* db, const WriteOptions& write_options,
+ const LockTrackerFactory& lock_tracker_factory)
+ : db_(db),
+ dbimpl_(static_cast_with_check<DBImpl>(db)),
+ write_options_(write_options),
+ cmp_(GetColumnFamilyUserComparator(db->DefaultColumnFamily())),
+ lock_tracker_factory_(lock_tracker_factory),
+ start_time_(dbimpl_->GetSystemClock()->NowMicros()),
+ write_batch_(cmp_, 0, true, 0, write_options.protection_bytes_per_key),
+ tracked_locks_(lock_tracker_factory_.Create()),
+ commit_time_batch_(0 /* reserved_bytes */, 0 /* max_bytes */,
+ write_options.protection_bytes_per_key,
+ 0 /* default_cf_ts_sz */),
+ indexing_enabled_(true) {
+ assert(dynamic_cast<DBImpl*>(db_) != nullptr);
+ log_number_ = 0;
+ if (dbimpl_->allow_2pc()) {
+ InitWriteBatch();
+ }
+}
+
+TransactionBaseImpl::~TransactionBaseImpl() {
+ // Release snapshot if snapshot is set
+ SetSnapshotInternal(nullptr);
+}
+
+void TransactionBaseImpl::Clear() {
+ save_points_.reset(nullptr);
+ write_batch_.Clear();
+ commit_time_batch_.Clear();
+ tracked_locks_->Clear();
+ num_puts_ = 0;
+ num_deletes_ = 0;
+ num_merges_ = 0;
+
+ if (dbimpl_->allow_2pc()) {
+ InitWriteBatch();
+ }
+}
+
+void TransactionBaseImpl::Reinitialize(DB* db,
+ const WriteOptions& write_options) {
+ Clear();
+ ClearSnapshot();
+ id_ = 0;
+ db_ = db;
+ name_.clear();
+ log_number_ = 0;
+ write_options_ = write_options;
+ start_time_ = dbimpl_->GetSystemClock()->NowMicros();
+ indexing_enabled_ = true;
+ cmp_ = GetColumnFamilyUserComparator(db_->DefaultColumnFamily());
+ WriteBatchInternal::UpdateProtectionInfo(
+ write_batch_.GetWriteBatch(), write_options_.protection_bytes_per_key)
+ .PermitUncheckedError();
+ WriteBatchInternal::UpdateProtectionInfo(
+ &commit_time_batch_, write_options_.protection_bytes_per_key)
+ .PermitUncheckedError();
+}
+
+void TransactionBaseImpl::SetSnapshot() {
+ const Snapshot* snapshot = dbimpl_->GetSnapshotForWriteConflictBoundary();
+ SetSnapshotInternal(snapshot);
+}
+
+void TransactionBaseImpl::SetSnapshotInternal(const Snapshot* snapshot) {
+ // Set a custom deleter for the snapshot_ SharedPtr as the snapshot needs to
+ // be released, not deleted when it is no longer referenced.
+ snapshot_.reset(snapshot, std::bind(&TransactionBaseImpl::ReleaseSnapshot,
+ this, std::placeholders::_1, db_));
+ snapshot_needed_ = false;
+ snapshot_notifier_ = nullptr;
+}
+
+void TransactionBaseImpl::SetSnapshotOnNextOperation(
+ std::shared_ptr<TransactionNotifier> notifier) {
+ snapshot_needed_ = true;
+ snapshot_notifier_ = notifier;
+}
+
+void TransactionBaseImpl::SetSnapshotIfNeeded() {
+ if (snapshot_needed_) {
+ std::shared_ptr<TransactionNotifier> notifier = snapshot_notifier_;
+ SetSnapshot();
+ if (notifier != nullptr) {
+ notifier->SnapshotCreated(GetSnapshot());
+ }
+ }
+}
+
+Status TransactionBaseImpl::TryLock(ColumnFamilyHandle* column_family,
+ const SliceParts& key, bool read_only,
+ bool exclusive, const bool do_validate,
+ const bool assume_tracked) {
+ size_t key_size = 0;
+ for (int i = 0; i < key.num_parts; ++i) {
+ key_size += key.parts[i].size();
+ }
+
+ std::string str;
+ str.reserve(key_size);
+
+ for (int i = 0; i < key.num_parts; ++i) {
+ str.append(key.parts[i].data(), key.parts[i].size());
+ }
+
+ return TryLock(column_family, str, read_only, exclusive, do_validate,
+ assume_tracked);
+}
+
+void TransactionBaseImpl::SetSavePoint() {
+ if (save_points_ == nullptr) {
+ save_points_.reset(
+ new std::stack<TransactionBaseImpl::SavePoint,
+ autovector<TransactionBaseImpl::SavePoint>>());
+ }
+ save_points_->emplace(snapshot_, snapshot_needed_, snapshot_notifier_,
+ num_puts_, num_deletes_, num_merges_,
+ lock_tracker_factory_);
+ write_batch_.SetSavePoint();
+}
+
+Status TransactionBaseImpl::RollbackToSavePoint() {
+ if (save_points_ != nullptr && save_points_->size() > 0) {
+ // Restore saved SavePoint
+ TransactionBaseImpl::SavePoint& save_point = save_points_->top();
+ snapshot_ = save_point.snapshot_;
+ snapshot_needed_ = save_point.snapshot_needed_;
+ snapshot_notifier_ = save_point.snapshot_notifier_;
+ num_puts_ = save_point.num_puts_;
+ num_deletes_ = save_point.num_deletes_;
+ num_merges_ = save_point.num_merges_;
+
+ // Rollback batch
+ Status s = write_batch_.RollbackToSavePoint();
+ assert(s.ok());
+
+ // Rollback any keys that were tracked since the last savepoint
+ tracked_locks_->Subtract(*save_point.new_locks_);
+
+ save_points_->pop();
+
+ return s;
+ } else {
+ assert(write_batch_.RollbackToSavePoint().IsNotFound());
+ return Status::NotFound();
+ }
+}
+
+Status TransactionBaseImpl::PopSavePoint() {
+ if (save_points_ == nullptr || save_points_->empty()) {
+ // No SavePoint yet.
+ assert(write_batch_.PopSavePoint().IsNotFound());
+ return Status::NotFound();
+ }
+
+ assert(!save_points_->empty());
+ // If there is another savepoint A below the current savepoint B, then A needs
+ // to inherit tracked_keys in B so that if we rollback to savepoint A, we
+ // remember to unlock keys in B. If there is no other savepoint below, then we
+ // can safely discard savepoint info.
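+  //
+  // For example: with savepoints A (older) and B (newer), if key "k" was
+  // locked after B was set, "k" is recorded in B's new_locks_. PopSavePoint()
+  // merges B's new_locks_ into A, so a later RollbackToSavePoint() back to A
+  // still knows that "k" must be unlocked.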
+ if (save_points_->size() == 1) {
+ save_points_->pop();
+ } else {
+ TransactionBaseImpl::SavePoint top(lock_tracker_factory_);
+ std::swap(top, save_points_->top());
+ save_points_->pop();
+
+ save_points_->top().new_locks_->Merge(*top.new_locks_);
+ }
+
+ return write_batch_.PopSavePoint();
+}
+
+Status TransactionBaseImpl::Get(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key, std::string* value) {
+ assert(value != nullptr);
+ PinnableSlice pinnable_val(value);
+ assert(!pinnable_val.IsPinned());
+ auto s = Get(read_options, column_family, key, &pinnable_val);
+ if (s.ok() && pinnable_val.IsPinned()) {
+ value->assign(pinnable_val.data(), pinnable_val.size());
+ } // else value is already assigned
+ return s;
+}
+
+Status TransactionBaseImpl::Get(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* pinnable_val) {
+ return write_batch_.GetFromBatchAndDB(db_, read_options, column_family, key,
+ pinnable_val);
+}
+
+Status TransactionBaseImpl::GetForUpdate(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key, std::string* value,
+ bool exclusive,
+ const bool do_validate) {
+ if (!do_validate && read_options.snapshot != nullptr) {
+ return Status::InvalidArgument(
+ "If do_validate is false then GetForUpdate with snapshot is not "
+ "defined.");
+ }
+ Status s =
+ TryLock(column_family, key, true /* read_only */, exclusive, do_validate);
+
+ if (s.ok() && value != nullptr) {
+ assert(value != nullptr);
+ PinnableSlice pinnable_val(value);
+ assert(!pinnable_val.IsPinned());
+ s = Get(read_options, column_family, key, &pinnable_val);
+ if (s.ok() && pinnable_val.IsPinned()) {
+ value->assign(pinnable_val.data(), pinnable_val.size());
+ } // else value is already assigned
+ }
+ return s;
+}
+
+Status TransactionBaseImpl::GetForUpdate(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key,
+ PinnableSlice* pinnable_val,
+ bool exclusive,
+ const bool do_validate) {
+ if (!do_validate && read_options.snapshot != nullptr) {
+ return Status::InvalidArgument(
+ "If do_validate is false then GetForUpdate with snapshot is not "
+ "defined.");
+ }
+ Status s =
+ TryLock(column_family, key, true /* read_only */, exclusive, do_validate);
+
+ if (s.ok() && pinnable_val != nullptr) {
+ s = Get(read_options, column_family, key, pinnable_val);
+ }
+ return s;
+}
+
+std::vector<Status> TransactionBaseImpl::MultiGet(
+ const ReadOptions& read_options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys, std::vector<std::string>* values) {
+ size_t num_keys = keys.size();
+ values->resize(num_keys);
+
+ std::vector<Status> stat_list(num_keys);
+ for (size_t i = 0; i < num_keys; ++i) {
+ stat_list[i] = Get(read_options, column_family[i], keys[i], &(*values)[i]);
+ }
+
+ return stat_list;
+}
+
+void TransactionBaseImpl::MultiGet(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family,
+ const size_t num_keys, const Slice* keys,
+ PinnableSlice* values, Status* statuses,
+ const bool sorted_input) {
+ write_batch_.MultiGetFromBatchAndDB(db_, read_options, column_family,
+ num_keys, keys, values, statuses,
+ sorted_input);
+}
+
+std::vector<Status> TransactionBaseImpl::MultiGetForUpdate(
+ const ReadOptions& read_options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys, std::vector<std::string>* values) {
+ // Regardless of whether the MultiGet succeeded, track these keys.
+ size_t num_keys = keys.size();
+ values->resize(num_keys);
+
+ // Lock all keys
+ for (size_t i = 0; i < num_keys; ++i) {
+ Status s = TryLock(column_family[i], keys[i], true /* read_only */,
+ true /* exclusive */);
+ if (!s.ok()) {
+ // Fail entire multiget if we cannot lock all keys
+ return std::vector<Status>(num_keys, s);
+ }
+ }
+
+ // TODO(agiardullo): optimize multiget?
+ std::vector<Status> stat_list(num_keys);
+ for (size_t i = 0; i < num_keys; ++i) {
+ stat_list[i] = Get(read_options, column_family[i], keys[i], &(*values)[i]);
+ }
+
+ return stat_list;
+}
+
+Iterator* TransactionBaseImpl::GetIterator(const ReadOptions& read_options) {
+ Iterator* db_iter = db_->NewIterator(read_options);
+ assert(db_iter);
+
+ return write_batch_.NewIteratorWithBase(db_->DefaultColumnFamily(), db_iter,
+ &read_options);
+}
+
+Iterator* TransactionBaseImpl::GetIterator(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family) {
+ Iterator* db_iter = db_->NewIterator(read_options, column_family);
+ assert(db_iter);
+
+ return write_batch_.NewIteratorWithBase(column_family, db_iter,
+ &read_options);
+}
+
+Status TransactionBaseImpl::Put(ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value,
+ const bool assume_tracked) {
+ const bool do_validate = !assume_tracked;
+ Status s = TryLock(column_family, key, false /* read_only */,
+ true /* exclusive */, do_validate, assume_tracked);
+
+ if (s.ok()) {
+ s = GetBatchForWrite()->Put(column_family, key, value);
+ if (s.ok()) {
+ num_puts_++;
+ }
+ }
+
+ return s;
+}
+
+Status TransactionBaseImpl::Put(ColumnFamilyHandle* column_family,
+ const SliceParts& key, const SliceParts& value,
+ const bool assume_tracked) {
+ const bool do_validate = !assume_tracked;
+ Status s = TryLock(column_family, key, false /* read_only */,
+ true /* exclusive */, do_validate, assume_tracked);
+
+ if (s.ok()) {
+ s = GetBatchForWrite()->Put(column_family, key, value);
+ if (s.ok()) {
+ num_puts_++;
+ }
+ }
+
+ return s;
+}
+
+Status TransactionBaseImpl::Merge(ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value,
+ const bool assume_tracked) {
+ const bool do_validate = !assume_tracked;
+ Status s = TryLock(column_family, key, false /* read_only */,
+ true /* exclusive */, do_validate, assume_tracked);
+
+ if (s.ok()) {
+ s = GetBatchForWrite()->Merge(column_family, key, value);
+ if (s.ok()) {
+ num_merges_++;
+ }
+ }
+
+ return s;
+}
+
+Status TransactionBaseImpl::Delete(ColumnFamilyHandle* column_family,
+ const Slice& key,
+ const bool assume_tracked) {
+ const bool do_validate = !assume_tracked;
+ Status s = TryLock(column_family, key, false /* read_only */,
+ true /* exclusive */, do_validate, assume_tracked);
+
+ if (s.ok()) {
+ s = GetBatchForWrite()->Delete(column_family, key);
+ if (s.ok()) {
+ num_deletes_++;
+ }
+ }
+
+ return s;
+}
+
+Status TransactionBaseImpl::Delete(ColumnFamilyHandle* column_family,
+ const SliceParts& key,
+ const bool assume_tracked) {
+ const bool do_validate = !assume_tracked;
+ Status s = TryLock(column_family, key, false /* read_only */,
+ true /* exclusive */, do_validate, assume_tracked);
+
+ if (s.ok()) {
+ s = GetBatchForWrite()->Delete(column_family, key);
+ if (s.ok()) {
+ num_deletes_++;
+ }
+ }
+
+ return s;
+}
+
+Status TransactionBaseImpl::SingleDelete(ColumnFamilyHandle* column_family,
+ const Slice& key,
+ const bool assume_tracked) {
+ const bool do_validate = !assume_tracked;
+ Status s = TryLock(column_family, key, false /* read_only */,
+ true /* exclusive */, do_validate, assume_tracked);
+
+ if (s.ok()) {
+ s = GetBatchForWrite()->SingleDelete(column_family, key);
+ if (s.ok()) {
+ num_deletes_++;
+ }
+ }
+
+ return s;
+}
+
+Status TransactionBaseImpl::SingleDelete(ColumnFamilyHandle* column_family,
+ const SliceParts& key,
+ const bool assume_tracked) {
+ const bool do_validate = !assume_tracked;
+ Status s = TryLock(column_family, key, false /* read_only */,
+ true /* exclusive */, do_validate, assume_tracked);
+
+ if (s.ok()) {
+ s = GetBatchForWrite()->SingleDelete(column_family, key);
+ if (s.ok()) {
+ num_deletes_++;
+ }
+ }
+
+ return s;
+}
+
+Status TransactionBaseImpl::PutUntracked(ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value) {
+ Status s = TryLock(column_family, key, false /* read_only */,
+ true /* exclusive */, false /* do_validate */);
+
+ if (s.ok()) {
+ s = GetBatchForWrite()->Put(column_family, key, value);
+ if (s.ok()) {
+ num_puts_++;
+ }
+ }
+
+ return s;
+}
+
+Status TransactionBaseImpl::PutUntracked(ColumnFamilyHandle* column_family,
+ const SliceParts& key,
+ const SliceParts& value) {
+ Status s = TryLock(column_family, key, false /* read_only */,
+ true /* exclusive */, false /* do_validate */);
+
+ if (s.ok()) {
+ s = GetBatchForWrite()->Put(column_family, key, value);
+ if (s.ok()) {
+ num_puts_++;
+ }
+ }
+
+ return s;
+}
+
+Status TransactionBaseImpl::MergeUntracked(ColumnFamilyHandle* column_family,
+ const Slice& key,
+ const Slice& value) {
+ Status s = TryLock(column_family, key, false /* read_only */,
+ true /* exclusive */, false /* do_validate */);
+
+ if (s.ok()) {
+ s = GetBatchForWrite()->Merge(column_family, key, value);
+ if (s.ok()) {
+ num_merges_++;
+ }
+ }
+
+ return s;
+}
+
+Status TransactionBaseImpl::DeleteUntracked(ColumnFamilyHandle* column_family,
+ const Slice& key) {
+ Status s = TryLock(column_family, key, false /* read_only */,
+ true /* exclusive */, false /* do_validate */);
+
+ if (s.ok()) {
+ s = GetBatchForWrite()->Delete(column_family, key);
+ if (s.ok()) {
+ num_deletes_++;
+ }
+ }
+
+ return s;
+}
+
+Status TransactionBaseImpl::DeleteUntracked(ColumnFamilyHandle* column_family,
+ const SliceParts& key) {
+ Status s = TryLock(column_family, key, false /* read_only */,
+ true /* exclusive */, false /* do_validate */);
+
+ if (s.ok()) {
+ s = GetBatchForWrite()->Delete(column_family, key);
+ if (s.ok()) {
+ num_deletes_++;
+ }
+ }
+
+ return s;
+}
+
+Status TransactionBaseImpl::SingleDeleteUntracked(
+ ColumnFamilyHandle* column_family, const Slice& key) {
+ Status s = TryLock(column_family, key, false /* read_only */,
+ true /* exclusive */, false /* do_validate */);
+
+ if (s.ok()) {
+ s = GetBatchForWrite()->SingleDelete(column_family, key);
+ if (s.ok()) {
+ num_deletes_++;
+ }
+ }
+
+ return s;
+}
+
+void TransactionBaseImpl::PutLogData(const Slice& blob) {
+ auto s = write_batch_.PutLogData(blob);
+ (void)s;
+ assert(s.ok());
+}
+
+WriteBatchWithIndex* TransactionBaseImpl::GetWriteBatch() {
+ return &write_batch_;
+}
+
+uint64_t TransactionBaseImpl::GetElapsedTime() const {
+ return (dbimpl_->GetSystemClock()->NowMicros() - start_time_) / 1000;
+}
+
+uint64_t TransactionBaseImpl::GetNumPuts() const { return num_puts_; }
+
+uint64_t TransactionBaseImpl::GetNumDeletes() const { return num_deletes_; }
+
+uint64_t TransactionBaseImpl::GetNumMerges() const { return num_merges_; }
+
+uint64_t TransactionBaseImpl::GetNumKeys() const {
+ return tracked_locks_->GetNumPointLocks();
+}
+
+void TransactionBaseImpl::TrackKey(uint32_t cfh_id, const std::string& key,
+ SequenceNumber seq, bool read_only,
+ bool exclusive) {
+ PointLockRequest r;
+ r.column_family_id = cfh_id;
+ r.key = key;
+ r.seq = seq;
+ r.read_only = read_only;
+ r.exclusive = exclusive;
+
+ // Update map of all tracked keys for this transaction
+ tracked_locks_->Track(r);
+
+ if (save_points_ != nullptr && !save_points_->empty()) {
+ // Update map of tracked keys in this SavePoint
+ save_points_->top().new_locks_->Track(r);
+ }
+}
+
+// Gets the write batch that should be used for Put/Merge/Deletes.
+//
+// Returns either a WriteBatch or WriteBatchWithIndex depending on whether
+// DisableIndexing() has been called.
+WriteBatchBase* TransactionBaseImpl::GetBatchForWrite() {
+ if (indexing_enabled_) {
+ // Use WriteBatchWithIndex
+ return &write_batch_;
+ } else {
+ // Don't use WriteBatchWithIndex. Return base WriteBatch.
+ return write_batch_.GetWriteBatch();
+ }
+}
+
+void TransactionBaseImpl::ReleaseSnapshot(const Snapshot* snapshot, DB* db) {
+ if (snapshot != nullptr) {
+ ROCKS_LOG_DETAILS(dbimpl_->immutable_db_options().info_log,
+ "ReleaseSnapshot %" PRIu64 " Set",
+ snapshot->GetSequenceNumber());
+ db->ReleaseSnapshot(snapshot);
+ }
+}
+
+void TransactionBaseImpl::UndoGetForUpdate(ColumnFamilyHandle* column_family,
+ const Slice& key) {
+ PointLockRequest r;
+ r.column_family_id = GetColumnFamilyID(column_family);
+ r.key = key.ToString();
+ r.read_only = true;
+
+ bool can_untrack = false;
+ if (save_points_ != nullptr && !save_points_->empty()) {
+    // If there is no GetForUpdate of the key in this save point,
+    // then we cannot untrack it from the global lock tracker.
+ UntrackStatus s = save_points_->top().new_locks_->Untrack(r);
+ can_untrack = (s != UntrackStatus::NOT_TRACKED);
+ } else {
+    // No save point, so we can untrack it from the global lock tracker.
+ can_untrack = true;
+ }
+
+ if (can_untrack) {
+    // If it was erased from the global tracker, then we can unlock the key.
+ UntrackStatus s = tracked_locks_->Untrack(r);
+ bool can_unlock = (s == UntrackStatus::REMOVED);
+ if (can_unlock) {
+ UnlockGetForUpdate(column_family, key);
+ }
+ }
+}
+
+Status TransactionBaseImpl::RebuildFromWriteBatch(WriteBatch* src_batch) {
+ struct IndexedWriteBatchBuilder : public WriteBatch::Handler {
+ Transaction* txn_;
+ DBImpl* db_;
+ IndexedWriteBatchBuilder(Transaction* txn, DBImpl* db)
+ : txn_(txn), db_(db) {
+ assert(dynamic_cast<TransactionBaseImpl*>(txn_) != nullptr);
+ }
+
+ Status PutCF(uint32_t cf, const Slice& key, const Slice& val) override {
+ return txn_->Put(db_->GetColumnFamilyHandle(cf), key, val);
+ }
+
+ Status DeleteCF(uint32_t cf, const Slice& key) override {
+ return txn_->Delete(db_->GetColumnFamilyHandle(cf), key);
+ }
+
+ Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+ return txn_->SingleDelete(db_->GetColumnFamilyHandle(cf), key);
+ }
+
+ Status MergeCF(uint32_t cf, const Slice& key, const Slice& val) override {
+ return txn_->Merge(db_->GetColumnFamilyHandle(cf), key, val);
+ }
+
+ // this is used for reconstructing prepared transactions upon
+ // recovery. there should not be any meta markers in the batches
+ // we are processing.
+ Status MarkBeginPrepare(bool) override { return Status::InvalidArgument(); }
+
+ Status MarkEndPrepare(const Slice&) override {
+ return Status::InvalidArgument();
+ }
+
+ Status MarkCommit(const Slice&) override {
+ return Status::InvalidArgument();
+ }
+
+ Status MarkCommitWithTimestamp(const Slice&, const Slice&) override {
+ return Status::InvalidArgument();
+ }
+
+ Status MarkRollback(const Slice&) override {
+ return Status::InvalidArgument();
+ }
+ };
+
+ IndexedWriteBatchBuilder copycat(this, dbimpl_);
+ return src_batch->Iterate(&copycat);
+}
+
+WriteBatch* TransactionBaseImpl::GetCommitTimeWriteBatch() {
+ return &commit_time_batch_;
+}
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/transaction_base.h b/src/rocksdb/utilities/transactions/transaction_base.h
new file mode 100644
index 000000000..1bcb20ca9
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/transaction_base.h
@@ -0,0 +1,384 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <stack>
+#include <string>
+#include <vector>
+
+#include "db/write_batch_internal.h"
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "util/autovector.h"
+#include "utilities/transactions/lock/lock_tracker.h"
+#include "utilities/transactions/transaction_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TransactionBaseImpl : public Transaction {
+ public:
+ TransactionBaseImpl(DB* db, const WriteOptions& write_options,
+ const LockTrackerFactory& lock_tracker_factory);
+
+ ~TransactionBaseImpl() override;
+
+ // Remove pending operations queued in this transaction.
+ virtual void Clear();
+
+ void Reinitialize(DB* db, const WriteOptions& write_options);
+
+ // Called before executing Put, Merge, Delete, and GetForUpdate. If TryLock
+ // returns non-OK, the Put/Merge/Delete/GetForUpdate will fail.
+ // do_validate will be false if called from PutUntracked, DeleteUntracked,
+ // MergeUntracked, or GetForUpdate(do_validate=false)
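+ // assume_tracked is a promise from the caller (e.g. Put/Merge/Delete with
+ // assume_tracked=true) that the key was already locked by an earlier
+ // GetForUpdate in this transaction, which lets implementations skip
+ // re-locking and validation.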
+ virtual Status TryLock(ColumnFamilyHandle* column_family, const Slice& key,
+ bool read_only, bool exclusive,
+ const bool do_validate = true,
+ const bool assume_tracked = false) = 0;
+
+ void SetSavePoint() override;
+
+ Status RollbackToSavePoint() override;
+
+ Status PopSavePoint() override;
+
+ using Transaction::Get;
+ Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, std::string* value) override;
+
+ Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* value) override;
+
+ Status Get(const ReadOptions& options, const Slice& key,
+ std::string* value) override {
+ return Get(options, db_->DefaultColumnFamily(), key, value);
+ }
+
+ using Transaction::GetForUpdate;
+ Status GetForUpdate(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ std::string* value, bool exclusive,
+ const bool do_validate) override;
+
+ Status GetForUpdate(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* pinnable_val, bool exclusive,
+ const bool do_validate) override;
+
+ Status GetForUpdate(const ReadOptions& options, const Slice& key,
+ std::string* value, bool exclusive,
+ const bool do_validate) override {
+ return GetForUpdate(options, db_->DefaultColumnFamily(), key, value,
+ exclusive, do_validate);
+ }
+
+ using Transaction::MultiGet;
+ std::vector<Status> MultiGet(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys,
+ std::vector<std::string>* values) override;
+
+ std::vector<Status> MultiGet(const ReadOptions& options,
+ const std::vector<Slice>& keys,
+ std::vector<std::string>* values) override {
+ return MultiGet(options,
+ std::vector<ColumnFamilyHandle*>(
+ keys.size(), db_->DefaultColumnFamily()),
+ keys, values);
+ }
+
+ void MultiGet(const ReadOptions& options, ColumnFamilyHandle* column_family,
+ const size_t num_keys, const Slice* keys, PinnableSlice* values,
+ Status* statuses, const bool sorted_input = false) override;
+
+ using Transaction::MultiGetForUpdate;
+ std::vector<Status> MultiGetForUpdate(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys,
+ std::vector<std::string>* values) override;
+
+ std::vector<Status> MultiGetForUpdate(
+ const ReadOptions& options, const std::vector<Slice>& keys,
+ std::vector<std::string>* values) override {
+ return MultiGetForUpdate(options,
+ std::vector<ColumnFamilyHandle*>(
+ keys.size(), db_->DefaultColumnFamily()),
+ keys, values);
+ }
+
+ Iterator* GetIterator(const ReadOptions& read_options) override;
+ Iterator* GetIterator(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family) override;
+
+ Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value, const bool assume_tracked = false) override;
+ Status Put(const Slice& key, const Slice& value) override {
+ return Put(nullptr, key, value);
+ }
+
+ Status Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+ const SliceParts& value,
+ const bool assume_tracked = false) override;
+ Status Put(const SliceParts& key, const SliceParts& value) override {
+ return Put(nullptr, key, value);
+ }
+
+ Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value, const bool assume_tracked = false) override;
+ Status Merge(const Slice& key, const Slice& value) override {
+ return Merge(nullptr, key, value);
+ }
+
+ Status Delete(ColumnFamilyHandle* column_family, const Slice& key,
+ const bool assume_tracked = false) override;
+ Status Delete(const Slice& key) override { return Delete(nullptr, key); }
+ Status Delete(ColumnFamilyHandle* column_family, const SliceParts& key,
+ const bool assume_tracked = false) override;
+ Status Delete(const SliceParts& key) override { return Delete(nullptr, key); }
+
+ Status SingleDelete(ColumnFamilyHandle* column_family, const Slice& key,
+ const bool assume_tracked = false) override;
+ Status SingleDelete(const Slice& key) override {
+ return SingleDelete(nullptr, key);
+ }
+ Status SingleDelete(ColumnFamilyHandle* column_family, const SliceParts& key,
+ const bool assume_tracked = false) override;
+ Status SingleDelete(const SliceParts& key) override {
+ return SingleDelete(nullptr, key);
+ }
+
+ Status PutUntracked(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) override;
+ Status PutUntracked(const Slice& key, const Slice& value) override {
+ return PutUntracked(nullptr, key, value);
+ }
+
+ Status PutUntracked(ColumnFamilyHandle* column_family, const SliceParts& key,
+ const SliceParts& value) override;
+ Status PutUntracked(const SliceParts& key, const SliceParts& value) override {
+ return PutUntracked(nullptr, key, value);
+ }
+
+ Status MergeUntracked(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) override;
+ Status MergeUntracked(const Slice& key, const Slice& value) override {
+ return MergeUntracked(nullptr, key, value);
+ }
+
+ Status DeleteUntracked(ColumnFamilyHandle* column_family,
+ const Slice& key) override;
+ Status DeleteUntracked(const Slice& key) override {
+ return DeleteUntracked(nullptr, key);
+ }
+ Status DeleteUntracked(ColumnFamilyHandle* column_family,
+ const SliceParts& key) override;
+ Status DeleteUntracked(const SliceParts& key) override {
+ return DeleteUntracked(nullptr, key);
+ }
+
+ Status SingleDeleteUntracked(ColumnFamilyHandle* column_family,
+ const Slice& key) override;
+ Status SingleDeleteUntracked(const Slice& key) override {
+ return SingleDeleteUntracked(nullptr, key);
+ }
+
+ void PutLogData(const Slice& blob) override;
+
+ WriteBatchWithIndex* GetWriteBatch() override;
+
+ virtual void SetLockTimeout(int64_t /*timeout*/) override {
+ // Do nothing
+ }
+
+ const Snapshot* GetSnapshot() const override {
+ // will return nullptr when there is no snapshot
+ return snapshot_.get();
+ }
+
+ std::shared_ptr<const Snapshot> GetTimestampedSnapshot() const override {
+ return snapshot_;
+ }
+
+ virtual void SetSnapshot() override;
+ void SetSnapshotOnNextOperation(
+ std::shared_ptr<TransactionNotifier> notifier = nullptr) override;
+
+ void ClearSnapshot() override {
+ snapshot_.reset();
+ snapshot_needed_ = false;
+ snapshot_notifier_ = nullptr;
+ }
+
+ void DisableIndexing() override { indexing_enabled_ = false; }
+
+ void EnableIndexing() override { indexing_enabled_ = true; }
+
+ bool IndexingEnabled() const { return indexing_enabled_; }
+
+ uint64_t GetElapsedTime() const override;
+
+ uint64_t GetNumPuts() const override;
+
+ uint64_t GetNumDeletes() const override;
+
+ uint64_t GetNumMerges() const override;
+
+ uint64_t GetNumKeys() const override;
+
+ void UndoGetForUpdate(ColumnFamilyHandle* column_family,
+ const Slice& key) override;
+ void UndoGetForUpdate(const Slice& key) override {
+ return UndoGetForUpdate(nullptr, key);
+ }
+
+ WriteOptions* GetWriteOptions() override { return &write_options_; }
+
+ void SetWriteOptions(const WriteOptions& write_options) override {
+ write_options_ = write_options;
+ }
+
+ // Used for memory management for snapshot_
+ void ReleaseSnapshot(const Snapshot* snapshot, DB* db);
+
+ // iterates over the given batch and makes the appropriate inserts.
+ // used for rebuilding prepared transactions after recovery.
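+ // Internally this replays src_batch through the transaction's own
+ // Put/Delete/SingleDelete/Merge calls, so the rebuilt writes are indexed
+ // and tracked like regular transaction writes.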
+ virtual Status RebuildFromWriteBatch(WriteBatch* src_batch) override;
+
+ WriteBatch* GetCommitTimeWriteBatch() override;
+
+ LockTracker& GetTrackedLocks() { return *tracked_locks_; }
+
+ protected:
+ // Add a key to the list of tracked keys.
+ //
+ // seqno is the earliest seqno at which this key was involved in this
+ // transaction.
+ // readonly should be set to true if no data was written for this key.
+ void TrackKey(uint32_t cfh_id, const std::string& key, SequenceNumber seqno,
+ bool readonly, bool exclusive);
+
+ // Called when UndoGetForUpdate determines that this key can be unlocked.
+ virtual void UnlockGetForUpdate(ColumnFamilyHandle* column_family,
+ const Slice& key) = 0;
+
+ // Sets a snapshot if SetSnapshotOnNextOperation() has been called.
+ void SetSnapshotIfNeeded();
+
+ // Initialize write_batch_ for 2PC by inserting Noop.
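+ // The Noop acts as a placeholder at the front of the batch that the 2PC
+ // machinery later replaces with the begin-prepare marker when the
+ // transaction is prepared (see WriteBatchInternal::MarkEndPrepare).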
+ inline void InitWriteBatch(bool clear = false) {
+ if (clear) {
+ write_batch_.Clear();
+ }
+ assert(write_batch_.GetDataSize() == WriteBatchInternal::kHeader);
+ auto s = WriteBatchInternal::InsertNoop(write_batch_.GetWriteBatch());
+ assert(s.ok());
+ }
+
+ WriteBatchBase* GetBatchForWrite();
+
+ DB* db_;
+ DBImpl* dbimpl_;
+
+ WriteOptions write_options_;
+
+ const Comparator* cmp_;
+
+ const LockTrackerFactory& lock_tracker_factory_;
+
+ // Stores the time the txn was constructed, in microseconds.
+ uint64_t start_time_;
+
+ // Stores the current snapshot that was set by SetSnapshot or null if
+ // no snapshot is currently set.
+ std::shared_ptr<const Snapshot> snapshot_;
+
+ // Count of various operations pending in this transaction
+ uint64_t num_puts_ = 0;
+ uint64_t num_deletes_ = 0;
+ uint64_t num_merges_ = 0;
+
+ struct SavePoint {
+ std::shared_ptr<const Snapshot> snapshot_;
+ bool snapshot_needed_ = false;
+ std::shared_ptr<TransactionNotifier> snapshot_notifier_;
+ uint64_t num_puts_ = 0;
+ uint64_t num_deletes_ = 0;
+ uint64_t num_merges_ = 0;
+
+ // Record all locks tracked since the last savepoint
+ std::shared_ptr<LockTracker> new_locks_;
+
+ SavePoint(std::shared_ptr<const Snapshot> snapshot, bool snapshot_needed,
+ std::shared_ptr<TransactionNotifier> snapshot_notifier,
+ uint64_t num_puts, uint64_t num_deletes, uint64_t num_merges,
+ const LockTrackerFactory& lock_tracker_factory)
+ : snapshot_(snapshot),
+ snapshot_needed_(snapshot_needed),
+ snapshot_notifier_(snapshot_notifier),
+ num_puts_(num_puts),
+ num_deletes_(num_deletes),
+ num_merges_(num_merges),
+ new_locks_(lock_tracker_factory.Create()) {}
+
+ explicit SavePoint(const LockTrackerFactory& lock_tracker_factory)
+ : new_locks_(lock_tracker_factory.Create()) {}
+ };
+
+ // Records writes pending in this transaction
+ WriteBatchWithIndex write_batch_;
+
+ // For Pessimistic Transactions this is the set of acquired locks.
+ // Optimistic Transactions will keep note of the requested locks (not
+ // actually locked), and defer conflict checking until commit time, based on
+ // the tracked lock requests.
+ std::unique_ptr<LockTracker> tracked_locks_;
+
+ // Stack of the state saved at each save point: the snapshot, operation
+ // counts, and the locks tracked since the previous save point. The saved
+ // snapshot may be nullptr if there was no snapshot at the time
+ // SetSavePoint() was called.
+ std::unique_ptr<std::stack<TransactionBaseImpl::SavePoint,
+ autovector<TransactionBaseImpl::SavePoint>>>
+ save_points_;
+
+ private:
+ friend class WriteCommittedTxn;
+ friend class WritePreparedTxn;
+
+ // Extra data to be persisted with the commit. Note this is only used when
+ // prepare phase is not skipped.
+ WriteBatch commit_time_batch_;
+
+ // If true, future Put/Merge/Deletes will be indexed in the
+ // WriteBatchWithIndex.
+ // If false, future Put/Merge/Deletes will be inserted directly into the
+ // underlying WriteBatch and not indexed in the WriteBatchWithIndex.
+ bool indexing_enabled_;
+
+ // SetSnapshotOnNextOperation() has been called and the snapshot has not yet
+ // been reset.
+ bool snapshot_needed_ = false;
+
+ // SetSnapshotOnNextOperation() has been called and the caller would like
+ // a notification through the TransactionNotifier interface
+ std::shared_ptr<TransactionNotifier> snapshot_notifier_ = nullptr;
+
+ Status TryLock(ColumnFamilyHandle* column_family, const SliceParts& key,
+ bool read_only, bool exclusive, const bool do_validate = true,
+ const bool assume_tracked = false);
+
+ void SetSnapshotInternal(const Snapshot* snapshot);
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/transaction_db_mutex_impl.cc b/src/rocksdb/utilities/transactions/transaction_db_mutex_impl.cc
new file mode 100644
index 000000000..345c4be90
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/transaction_db_mutex_impl.cc
@@ -0,0 +1,135 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/transaction_db_mutex_impl.h"
+
+#include <chrono>
+#include <condition_variable>
+#include <functional>
+#include <mutex>
+
+#include "rocksdb/utilities/transaction_db_mutex.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TransactionDBMutexImpl : public TransactionDBMutex {
+ public:
+ TransactionDBMutexImpl() {}
+ ~TransactionDBMutexImpl() override {}
+
+ Status Lock() override;
+
+ Status TryLockFor(int64_t timeout_time) override;
+
+ void UnLock() override { mutex_.unlock(); }
+
+ friend class TransactionDBCondVarImpl;
+
+ private:
+ std::mutex mutex_;
+};
+
+class TransactionDBCondVarImpl : public TransactionDBCondVar {
+ public:
+ TransactionDBCondVarImpl() {}
+ ~TransactionDBCondVarImpl() override {}
+
+ Status Wait(std::shared_ptr<TransactionDBMutex> mutex) override;
+
+ Status WaitFor(std::shared_ptr<TransactionDBMutex> mutex,
+ int64_t timeout_time) override;
+
+ void Notify() override { cv_.notify_one(); }
+
+ void NotifyAll() override { cv_.notify_all(); }
+
+ private:
+ std::condition_variable cv_;
+};
+
+std::shared_ptr<TransactionDBMutex>
+TransactionDBMutexFactoryImpl::AllocateMutex() {
+ return std::shared_ptr<TransactionDBMutex>(new TransactionDBMutexImpl());
+}
+
+std::shared_ptr<TransactionDBCondVar>
+TransactionDBMutexFactoryImpl::AllocateCondVar() {
+ return std::shared_ptr<TransactionDBCondVar>(new TransactionDBCondVarImpl());
+}
+
+Status TransactionDBMutexImpl::Lock() {
+ mutex_.lock();
+ return Status::OK();
+}
+
+Status TransactionDBMutexImpl::TryLockFor(int64_t timeout_time) {
+ bool locked = true;
+
+ if (timeout_time == 0) {
+ locked = mutex_.try_lock();
+ } else {
+ // Previously, this code used a std::timed_mutex. However, this was changed
+ // due to known bugs in gcc versions < 4.9.
+ // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54562
+ //
+ // Since this mutex isn't held for long and only a single mutex is ever
+ // held at a time, it is reasonable to ignore the lock timeout_time here
+ // and only check it when waiting on the condition_variable.
+ mutex_.lock();
+ }
+
+ if (!locked) {
+ // timeout acquiring mutex
+ return Status::TimedOut(Status::SubCode::kMutexTimeout);
+ }
+
+ return Status::OK();
+}
+
+Status TransactionDBCondVarImpl::Wait(
+ std::shared_ptr<TransactionDBMutex> mutex) {
+ auto mutex_impl = reinterpret_cast<TransactionDBMutexImpl*>(mutex.get());
+
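+ // The caller is required to already hold `mutex` before calling Wait(), so
+ // adopt_lock transfers ownership to the unique_lock without locking again;
+ // lock.release() below hands ownership back before destruction.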
+ std::unique_lock<std::mutex> lock(mutex_impl->mutex_, std::adopt_lock);
+ cv_.wait(lock);
+
+ // Make sure unique_lock doesn't unlock mutex when it destructs
+ lock.release();
+
+ return Status::OK();
+}
+
+Status TransactionDBCondVarImpl::WaitFor(
+ std::shared_ptr<TransactionDBMutex> mutex, int64_t timeout_time) {
+ Status s;
+
+ auto mutex_impl = reinterpret_cast<TransactionDBMutexImpl*>(mutex.get());
+ std::unique_lock<std::mutex> lock(mutex_impl->mutex_, std::adopt_lock);
+
+ if (timeout_time < 0) {
+ // If timeout is negative, do not use a timeout
+ cv_.wait(lock);
+ } else {
+ auto duration = std::chrono::microseconds(timeout_time);
+ auto cv_status = cv_.wait_for(lock, duration);
+
+ // Check if the wait stopped due to timing out.
+ if (cv_status == std::cv_status::timeout) {
+ s = Status::TimedOut(Status::SubCode::kMutexTimeout);
+ }
+ }
+
+ // Make sure unique_lock doesn't unlock mutex when it destructs
+ lock.release();
+
+ // CV was signaled, or we spuriously woke up (but didn't time out)
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/transaction_db_mutex_impl.h b/src/rocksdb/utilities/transactions/transaction_db_mutex_impl.h
new file mode 100644
index 000000000..fbee92832
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/transaction_db_mutex_impl.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/transaction_db_mutex.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TransactionDBMutex;
+class TransactionDBCondVar;
+
+// Default implementation of TransactionDBMutexFactory. May be overridden
+// by TransactionDBOptions.custom_mutex_factory.
+class TransactionDBMutexFactoryImpl : public TransactionDBMutexFactory {
+ public:
+ std::shared_ptr<TransactionDBMutex> AllocateMutex() override;
+ std::shared_ptr<TransactionDBCondVar> AllocateCondVar() override;
+};
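+
+// Usage sketch (illustrative): this default is used when
+// TransactionDBOptions::custom_mutex_factory is left unset; a custom factory
+// can be plugged in the same way, e.g.
+//   TransactionDBOptions txn_db_options;
+//   txn_db_options.custom_mutex_factory =
+//       std::make_shared<TransactionDBMutexFactoryImpl>();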
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/transaction_test.cc b/src/rocksdb/utilities/transactions/transaction_test.cc
new file mode 100644
index 000000000..caf1566b9
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/transaction_test.cc
@@ -0,0 +1,6550 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/transaction_test.h"
+
+#include <algorithm>
+#include <functional>
+#include <string>
+#include <thread>
+
+#include "db/db_impl/db_impl.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "table/mock_table.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "test_util/transaction_test_util.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/string_append/stringappend.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+INSTANTIATE_TEST_CASE_P(
+ DBAsBaseDB, TransactionTest,
+ ::testing::Values(
+ std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite),
+ std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite),
+ std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite),
+ std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite)));
+INSTANTIATE_TEST_CASE_P(
+ DBAsBaseDB, TransactionStressTest,
+ ::testing::Values(
+ std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite),
+ std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite),
+ std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite),
+ std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite)));
+INSTANTIATE_TEST_CASE_P(
+ StackableDBAsBaseDB, TransactionTest,
+ ::testing::Values(
+ std::make_tuple(true, true, WRITE_COMMITTED, kOrderedWrite),
+ std::make_tuple(true, true, WRITE_PREPARED, kOrderedWrite),
+ std::make_tuple(true, true, WRITE_UNPREPARED, kOrderedWrite)));
+
+// MySQLStyleTransactionTest takes far too long for valgrind to run. Only do it
+// in full mode (`ROCKSDB_FULL_VALGRIND_RUN` compiler flag is set).
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+INSTANTIATE_TEST_CASE_P(
+ MySQLStyleTransactionTest, MySQLStyleTransactionTest,
+ ::testing::Values(
+ std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite, false),
+ std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite, false),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, false),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, true),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, false),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, true),
+ std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite, false),
+ std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite, true),
+ std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite, false),
+ std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite, true),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, false),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, true)));
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_P(TransactionTest, DoubleEmptyWrite) {
+ WriteOptions write_options;
+ write_options.sync = true;
+ write_options.disableWAL = false;
+
+ WriteBatch batch;
+
+ ASSERT_OK(db->Write(write_options, &batch));
+ ASSERT_OK(db->Write(write_options, &batch));
+
+ // Also test committing empty transactions in 2PC
+ TransactionOptions txn_options;
+ Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_OK(txn0->SetName("xid"));
+ ASSERT_OK(txn0->Prepare());
+ ASSERT_OK(txn0->Commit());
+ delete txn0;
+
+ // Also test that it works during recovery
+ txn0 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_OK(txn0->SetName("xid2"));
+ ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0a")));
+ ASSERT_OK(txn0->Prepare());
+ delete txn0;
+ reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
+ ASSERT_OK(ReOpenNoDelete());
+ assert(db != nullptr);
+ txn0 = db->GetTransactionByName("xid2");
+ ASSERT_OK(txn0->Commit());
+ delete txn0;
+}
+
+TEST_P(TransactionTest, SuccessTest) {
+ ASSERT_OK(db->ResetStats());
+
+ WriteOptions write_options;
+ ReadOptions read_options;
+ std::string value;
+
+ ASSERT_OK(db->Put(write_options, Slice("foo"), Slice("bar")));
+ ASSERT_OK(db->Put(write_options, Slice("foo2"), Slice("bar")));
+
+ Transaction* txn = db->BeginTransaction(write_options, TransactionOptions());
+ ASSERT_TRUE(txn);
+
+ ASSERT_EQ(0, txn->GetNumPuts());
+ ASSERT_LE(0, txn->GetID());
+
+ ASSERT_OK(txn->GetForUpdate(read_options, "foo", &value));
+ ASSERT_EQ(value, "bar");
+
+ ASSERT_OK(txn->Put(Slice("foo"), Slice("bar2")));
+
+ ASSERT_EQ(1, txn->GetNumPuts());
+
+ ASSERT_OK(txn->GetForUpdate(read_options, "foo", &value));
+ ASSERT_EQ(value, "bar2");
+
+ ASSERT_OK(txn->Commit());
+
+ ASSERT_OK(db->Get(read_options, "foo", &value));
+ ASSERT_EQ(value, "bar2");
+
+ delete txn;
+}
+
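+// Commits a prepared transaction (carrying a commit-time write batch) from a
+// callback that runs during the flush of a switched memtable, then verifies
+// that both the regular write and the commit-time batch survive recovery.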
+TEST_P(TransactionTest, SwitchMemtableDuringPrepareAndCommit_WC) {
+ const TxnDBWritePolicy write_policy = std::get<2>(GetParam());
+
+ if (write_policy != TxnDBWritePolicy::WRITE_COMMITTED) {
+ ROCKSDB_GTEST_BYPASS("Test applies to write-committed only");
+ return;
+ }
+
+ ASSERT_OK(db->Put(WriteOptions(), "key0", "value"));
+
+ TransactionOptions txn_opts;
+ txn_opts.use_only_the_last_commit_time_batch_for_recovery = true;
+ Transaction* txn = db->BeginTransaction(WriteOptions(), txn_opts);
+ assert(txn);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "FlushJob::WriteLevel0Table", [&](void* arg) {
+ // db mutex not held.
+ auto* mems = reinterpret_cast<autovector<MemTable*>*>(arg);
+ assert(mems);
+ ASSERT_EQ(1, mems->size());
+ auto* ctwb = txn->GetCommitTimeWriteBatch();
+ ASSERT_OK(ctwb->Put("gtid", "123"));
+ ASSERT_OK(txn->Commit());
+ delete txn;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(txn->Put("key1", "value"));
+ ASSERT_OK(txn->SetName("txn1"));
+
+ ASSERT_OK(txn->Prepare());
+
+ auto dbimpl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ ASSERT_OK(dbimpl->TEST_SwitchMemtable(nullptr));
+ ASSERT_OK(dbimpl->TEST_FlushMemTable(
+ /*wait=*/false, /*allow_write_stall=*/true, /*cfh=*/nullptr));
+
+ ASSERT_OK(dbimpl->TEST_WaitForFlushMemTable());
+
+ {
+ std::string value;
+ ASSERT_OK(db->Get(ReadOptions(), "key1", &value));
+ ASSERT_EQ("value", value);
+ }
+
+ delete db;
+ db = nullptr;
+ Status s;
+ if (use_stackable_db_ == false) {
+ s = TransactionDB::Open(options, txn_db_options, dbname, &db);
+ } else {
+ s = OpenWithStackableDB();
+ }
+ ASSERT_OK(s);
+ assert(db);
+
+ {
+ std::string value;
+ ASSERT_OK(db->Get(ReadOptions(), "gtid", &value));
+ ASSERT_EQ("123", value);
+
+ ASSERT_OK(db->Get(ReadOptions(), "key1", &value));
+ ASSERT_EQ("value", value);
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// This test clarifies the contract of do_validate and assume_tracked
+// in GetForUpdate and Put/Merge/Delete.
+TEST_P(TransactionTest, AssumeExclusiveTracked) {
+ WriteOptions write_options;
+ ReadOptions read_options;
+ std::string value;
+ Status s;
+ TransactionOptions txn_options;
+ txn_options.lock_timeout = 1;
+ const bool EXCLUSIVE = true;
+ const bool DO_VALIDATE = true;
+ const bool ASSUME_LOCKED = true;
+
+ Transaction* txn = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn);
+ txn->SetSnapshot();
+
+ // commit a value after the snapshot is taken
+ ASSERT_OK(db->Put(write_options, Slice("foo"), Slice("bar")));
+
+ // By default the write should fail due to the commit after our snapshot
+ s = txn->GetForUpdate(read_options, "foo", &value, EXCLUSIVE);
+ ASSERT_TRUE(s.IsBusy());
+ // But the user could direct the db to skip validating the snapshot. The read
+ // value then should be the most recently committed
+ ASSERT_OK(
+ txn->GetForUpdate(read_options, "foo", &value, EXCLUSIVE, !DO_VALIDATE));
+ ASSERT_EQ(value, "bar");
+
+ // Although ValidateSnapshot is skipped, the key must still have been locked
+ s = db->Put(write_options, Slice("foo"), Slice("bar"));
+ ASSERT_TRUE(s.IsTimedOut());
+
+ // By default the write operations should fail due to the commit after the
+ // snapshot
+ s = txn->Put(Slice("foo"), Slice("bar1"));
+ ASSERT_TRUE(s.IsBusy());
+ s = txn->Put(db->DefaultColumnFamily(), Slice("foo"), Slice("bar1"),
+ !ASSUME_LOCKED);
+ ASSERT_TRUE(s.IsBusy());
+ // But the user could tell the db to assume an exclusive lock is already
+ // held on the key due to the previous GetForUpdate call.
+ ASSERT_OK(txn->Put(db->DefaultColumnFamily(), Slice("foo"), Slice("bar1"),
+ ASSUME_LOCKED));
+ ASSERT_OK(txn->Merge(db->DefaultColumnFamily(), Slice("foo"), Slice("bar2"),
+ ASSUME_LOCKED));
+ ASSERT_OK(
+ txn->Delete(db->DefaultColumnFamily(), Slice("foo"), ASSUME_LOCKED));
+ ASSERT_OK(txn->SingleDelete(db->DefaultColumnFamily(), Slice("foo"),
+ ASSUME_LOCKED));
+
+ ASSERT_OK(txn->Rollback());
+ delete txn;
+}
+
+// This test clarifies the contract of ValidateSnapshot
+TEST_P(TransactionTest, ValidateSnapshotTest) {
+ for (bool with_flush : {true}) {
+ for (bool with_2pc : {true}) {
+ ASSERT_OK(ReOpen());
+ WriteOptions write_options;
+ ReadOptions read_options;
+ std::string value;
+
+ assert(db != nullptr);
+ Transaction* txn1 =
+ db->BeginTransaction(write_options, TransactionOptions());
+ ASSERT_TRUE(txn1);
+ ASSERT_OK(txn1->Put(Slice("foo"), Slice("bar1")));
+ if (with_2pc) {
+ ASSERT_OK(txn1->SetName("xid1"));
+ ASSERT_OK(txn1->Prepare());
+ }
+
+ if (with_flush) {
+ auto db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ ASSERT_OK(db_impl->TEST_FlushMemTable(true));
+ // Make sure the flushed memtable is not kept in memory
+ int max_memtable_in_history =
+ std::max(
+ options.max_write_buffer_number,
+ static_cast<int>(options.max_write_buffer_size_to_maintain) /
+ static_cast<int>(options.write_buffer_size)) +
+ 1;
+ for (int i = 0; i < max_memtable_in_history; i++) {
+ ASSERT_OK(db->Put(write_options, Slice("key"), Slice("value")));
+ ASSERT_OK(db_impl->TEST_FlushMemTable(true));
+ }
+ }
+
+ Transaction* txn2 =
+ db->BeginTransaction(write_options, TransactionOptions());
+ ASSERT_TRUE(txn2);
+ txn2->SetSnapshot();
+
+ ASSERT_OK(txn1->Commit());
+ delete txn1;
+
+ auto pes_txn2 = dynamic_cast<PessimisticTransaction*>(txn2);
+ // Test the simple case where the key is not tracked yet
+ auto trakced_seq = kMaxSequenceNumber;
+ auto s = pes_txn2->ValidateSnapshot(db->DefaultColumnFamily(), "foo",
+ &trakced_seq);
+ ASSERT_TRUE(s.IsBusy());
+ delete txn2;
+ }
+ }
+}
+
+TEST_P(TransactionTest, WaitingTxn) {
+ WriteOptions write_options;
+ ReadOptions read_options;
+ TransactionOptions txn_options;
+ std::string value;
+ Status s;
+
+ txn_options.lock_timeout = 1;
+ s = db->Put(write_options, Slice("foo"), Slice("bar"));
+ ASSERT_OK(s);
+
+ /* create second cf */
+ ColumnFamilyHandle* cfa;
+ ColumnFamilyOptions cf_options;
+ s = db->CreateColumnFamily(cf_options, "CFA", &cfa);
+ ASSERT_OK(s);
+ s = db->Put(write_options, cfa, Slice("foo"), Slice("bar"));
+ ASSERT_OK(s);
+
+ Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+ Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+ TransactionID id1 = txn1->GetID();
+ ASSERT_TRUE(txn1);
+ ASSERT_TRUE(txn2);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "PointLockManager::AcquireWithTimeout:WaitingTxn", [&](void* /*arg*/) {
+ std::string key;
+ uint32_t cf_id;
+ std::vector<TransactionID> wait = txn2->GetWaitingTxns(&cf_id, &key);
+ ASSERT_EQ(key, "foo");
+ ASSERT_EQ(wait.size(), 1);
+ ASSERT_EQ(wait[0], id1);
+ ASSERT_EQ(cf_id, 0U);
+ });
+
+ get_perf_context()->Reset();
+ // lock key in default cf
+ s = txn1->GetForUpdate(read_options, "foo", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "bar");
+ ASSERT_EQ(get_perf_context()->key_lock_wait_count, 0);
+
+ // lock key in cfa
+ s = txn1->GetForUpdate(read_options, cfa, "foo", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "bar");
+ ASSERT_EQ(get_perf_context()->key_lock_wait_count, 0);
+
+ auto lock_data = db->GetLockStatusData();
+ // Locked keys exist in both column families.
+ ASSERT_EQ(lock_data.size(), 2);
+
+ auto cf_iterator = lock_data.begin();
+
+ // The iterator points into an unordered_multimap,
+ // thus the test cannot assume any particular order.
+
+ // Column family is 1 or 0 (cfa).
+ if (cf_iterator->first != 1 && cf_iterator->first != 0) {
+ FAIL();
+ }
+ // The locked key is "foo" and is locked by txn1
+ ASSERT_EQ(cf_iterator->second.key, "foo");
+ ASSERT_EQ(cf_iterator->second.ids.size(), 1);
+ ASSERT_EQ(cf_iterator->second.ids[0], txn1->GetID());
+
+ cf_iterator++;
+
+ // Column family is 0 (default) or 1.
+ if (cf_iterator->first != 1 && cf_iterator->first != 0) {
+ FAIL();
+ }
+ // The locked key is "foo" and is locked by txn1
+ ASSERT_EQ(cf_iterator->second.key, "foo");
+ ASSERT_EQ(cf_iterator->second.ids.size(), 1);
+ ASSERT_EQ(cf_iterator->second.ids[0], txn1->GetID());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ s = txn2->GetForUpdate(read_options, "foo", &value);
+ ASSERT_TRUE(s.IsTimedOut());
+ ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key");
+ ASSERT_EQ(get_perf_context()->key_lock_wait_count, 1);
+ ASSERT_GE(get_perf_context()->key_lock_wait_time, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ delete cfa;
+ delete txn1;
+ delete txn2;
+}
+
+TEST_P(TransactionTest, SharedLocks) {
+ WriteOptions write_options;
+ ReadOptions read_options;
+ TransactionOptions txn_options;
+ Status s;
+
+ txn_options.lock_timeout = 1;
+ s = db->Put(write_options, Slice("foo"), Slice("bar"));
+ ASSERT_OK(s);
+
+ Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+ Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+ Transaction* txn3 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn1);
+ ASSERT_TRUE(txn2);
+ ASSERT_TRUE(txn3);
+
+ // Test shared access between txns
+ s = txn1->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
+ ASSERT_OK(s);
+
+ s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
+ ASSERT_OK(s);
+
+ s = txn3->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
+ ASSERT_OK(s);
+
+ auto lock_data = db->GetLockStatusData();
+ ASSERT_EQ(lock_data.size(), 1);
+
+ auto cf_iterator = lock_data.begin();
+ ASSERT_EQ(cf_iterator->second.key, "foo");
+
+ // We compare whether the set of txns locking this key is the same. To do
+ // this, we need to sort both vectors so that the comparison is done
+ // correctly.
+ std::vector<TransactionID> expected_txns = {txn1->GetID(), txn2->GetID(),
+ txn3->GetID()};
+ std::vector<TransactionID> lock_txns = cf_iterator->second.ids;
+ ASSERT_EQ(expected_txns, lock_txns);
+ ASSERT_FALSE(cf_iterator->second.exclusive);
+
+ ASSERT_OK(txn1->Rollback());
+ ASSERT_OK(txn2->Rollback());
+ ASSERT_OK(txn3->Rollback());
+
+ // Test txn1 and txn2 sharing a lock and txn3 trying to obtain it.
+ s = txn1->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
+ ASSERT_OK(s);
+
+ s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
+ ASSERT_OK(s);
+
+ s = txn3->GetForUpdate(read_options, "foo", nullptr);
+ ASSERT_TRUE(s.IsTimedOut());
+ ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key");
+
+ txn1->UndoGetForUpdate("foo");
+ s = txn3->GetForUpdate(read_options, "foo", nullptr);
+ ASSERT_TRUE(s.IsTimedOut());
+ ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key");
+
+ txn2->UndoGetForUpdate("foo");
+ s = txn3->GetForUpdate(read_options, "foo", nullptr);
+ ASSERT_OK(s);
+
+ ASSERT_OK(txn1->Rollback());
+ ASSERT_OK(txn2->Rollback());
+ ASSERT_OK(txn3->Rollback());
+
+ // Test txn1 and txn2 sharing a lock and txn2 trying to upgrade lock.
+ s = txn1->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
+ ASSERT_OK(s);
+
+ s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
+ ASSERT_OK(s);
+
+ s = txn2->GetForUpdate(read_options, "foo", nullptr);
+ ASSERT_TRUE(s.IsTimedOut());
+ ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key");
+
+ txn1->UndoGetForUpdate("foo");
+ s = txn2->GetForUpdate(read_options, "foo", nullptr);
+ ASSERT_OK(s);
+
+ ASSERT_OK(txn1->Rollback());
+ ASSERT_OK(txn2->Rollback());
+
+ // Test txn1 trying to downgrade its lock.
+ s = txn1->GetForUpdate(read_options, "foo", nullptr, true /* exclusive */);
+ ASSERT_OK(s);
+
+ s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
+ ASSERT_TRUE(s.IsTimedOut());
+ ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key");
+
+ // Should still fail after "downgrading".
+ s = txn1->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
+ ASSERT_OK(s);
+
+ s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
+ ASSERT_TRUE(s.IsTimedOut());
+ ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key");
+
+ ASSERT_OK(txn1->Rollback());
+ ASSERT_OK(txn2->Rollback());
+
+ // Test txn1 holding an exclusive lock and txn2 trying to obtain shared
+ // access.
+ s = txn1->GetForUpdate(read_options, "foo", nullptr);
+ ASSERT_OK(s);
+
+ s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
+ ASSERT_TRUE(s.IsTimedOut());
+ ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key");
+
+ txn1->UndoGetForUpdate("foo");
+ s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
+ ASSERT_OK(s);
+
+ delete txn1;
+ delete txn2;
+ delete txn3;
+}
+
+TEST_P(TransactionTest, DeadlockCycleShared) {
+ WriteOptions write_options;
+ ReadOptions read_options;
+ TransactionOptions txn_options;
+
+ txn_options.lock_timeout = 1000000;
+ txn_options.deadlock_detect = true;
+
+ // Set up a wait for chain like this:
+ //
+ // Tn -> T(n*2)
+ // Tn -> T(n*2 + 1)
+ //
+ // So we have:
+ // T1 -> T2 -> T4 ...
+ // | |> T5 ...
+ // |> T3 -> T6 ...
+ // |> T7 ...
+ // up to T31, then T[16 - 31] -> T1.
+ // Note that Tn holds lock on floor(n / 2).
+
+ std::vector<Transaction*> txns(31);
+
+ for (uint32_t i = 0; i < 31; i++) {
+ txns[i] = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txns[i]);
+ auto s = txns[i]->GetForUpdate(read_options, std::to_string((i + 1) / 2),
+ nullptr, false /* exclusive */);
+ ASSERT_OK(s);
+ }
+
+ std::atomic<uint32_t> checkpoints(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "PointLockManager::AcquireWithTimeout:WaitingTxn",
+ [&](void* /*arg*/) { checkpoints.fetch_add(1); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // We want the leaf transactions to block and hold everyone back.
+ std::vector<port::Thread> threads;
+ for (uint32_t i = 0; i < 15; i++) {
+ std::function<void()> blocking_thread = [&, i] {
+ auto s = txns[i]->GetForUpdate(read_options, std::to_string(i + 1),
+ nullptr, true /* exclusive */);
+ ASSERT_OK(s);
+ ASSERT_OK(txns[i]->Rollback());
+ delete txns[i];
+ };
+ threads.emplace_back(blocking_thread);
+ }
+
+ // Wait until all threads are waiting on each other.
+ while (checkpoints.load() != 15) {
+ /* sleep override */
+ std::this_thread::sleep_for(std::chrono::milliseconds(100));
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ // Complete the cycle T[16 - 31] -> T1
+ for (uint32_t i = 15; i < 31; i++) {
+ auto s =
+ txns[i]->GetForUpdate(read_options, "0", nullptr, true /* exclusive */);
+ ASSERT_TRUE(s.IsDeadlock());
+
+ // Calculate next buffer len, plateau at 5 when 5 records are inserted.
+ const uint32_t curr_dlock_buffer_len_ =
+ (i - 14 > kInitialMaxDeadlocks) ? kInitialMaxDeadlocks : (i - 14);
+
+ auto dlock_buffer = db->GetDeadlockInfoBuffer();
+ ASSERT_EQ(dlock_buffer.size(), curr_dlock_buffer_len_);
+ auto dlock_entry = dlock_buffer[0].path;
+ ASSERT_EQ(dlock_entry.size(), kInitialMaxDeadlocks);
+ int64_t pre_deadlock_time = dlock_buffer[0].deadlock_time;
+ int64_t cur_deadlock_time = 0;
+ for (auto const& dl_path_rec : dlock_buffer) {
+ cur_deadlock_time = dl_path_rec.deadlock_time;
+ ASSERT_NE(cur_deadlock_time, 0);
+ ASSERT_TRUE(cur_deadlock_time <= pre_deadlock_time);
+ pre_deadlock_time = cur_deadlock_time;
+ }
+
+ int64_t curr_waiting_key = 0;
+
+ // Offset of each txn id from the root of the shared dlock tree's txn id.
+ int64_t offset_root = dlock_entry[0].m_txn_id - 1;
+ // Offset of the final entry in the dlock path from the root's txn id.
+ TransactionID leaf_id =
+ dlock_entry[dlock_entry.size() - 1].m_txn_id - offset_root;
+
+ for (auto it = dlock_entry.rbegin(); it != dlock_entry.rend(); ++it) {
+ auto dl_node = *it;
+ ASSERT_EQ(dl_node.m_txn_id, offset_root + leaf_id);
+ ASSERT_EQ(dl_node.m_cf_id, 0U);
+ ASSERT_EQ(dl_node.m_waiting_key, std::to_string(curr_waiting_key));
+ ASSERT_EQ(dl_node.m_exclusive, true);
+
+ if (curr_waiting_key == 0) {
+ curr_waiting_key = leaf_id;
+ }
+ curr_waiting_key /= 2;
+ leaf_id /= 2;
+ }
+ }
+
+ // Rollback the leaf transaction.
+ for (uint32_t i = 15; i < 31; i++) {
+ ASSERT_OK(txns[i]->Rollback());
+ delete txns[i];
+ }
+
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ // Downsize the buffer and verify the 3 latest deadlocks are preserved.
+ auto dlock_buffer_before_resize = db->GetDeadlockInfoBuffer();
+ db->SetDeadlockInfoBufferSize(3);
+ auto dlock_buffer_after_resize = db->GetDeadlockInfoBuffer();
+ ASSERT_EQ(dlock_buffer_after_resize.size(), 3);
+
+ for (uint32_t i = 0; i < dlock_buffer_after_resize.size(); i++) {
+ for (uint32_t j = 0; j < dlock_buffer_after_resize[i].path.size(); j++) {
+ ASSERT_EQ(dlock_buffer_after_resize[i].path[j].m_txn_id,
+ dlock_buffer_before_resize[i].path[j].m_txn_id);
+ }
+ }
+
+ // Upsize the buffer and verify the 3 latest deadlocks are preserved.
+ dlock_buffer_before_resize = db->GetDeadlockInfoBuffer();
+ db->SetDeadlockInfoBufferSize(5);
+ dlock_buffer_after_resize = db->GetDeadlockInfoBuffer();
+ ASSERT_EQ(dlock_buffer_after_resize.size(), 3);
+
+ for (uint32_t i = 0; i < dlock_buffer_before_resize.size(); i++) {
+ for (uint32_t j = 0; j < dlock_buffer_before_resize[i].path.size(); j++) {
+ ASSERT_EQ(dlock_buffer_after_resize[i].path[j].m_txn_id,
+ dlock_buffer_before_resize[i].path[j].m_txn_id);
+ }
+ }
+
+ // Downsize to 0 and verify the size is consistent.
+ dlock_buffer_before_resize = db->GetDeadlockInfoBuffer();
+ db->SetDeadlockInfoBufferSize(0);
+ dlock_buffer_after_resize = db->GetDeadlockInfoBuffer();
+ ASSERT_EQ(dlock_buffer_after_resize.size(), 0);
+
+ // Upsize from 0 and verify previously dropped deadlocks are not restored.
+ dlock_buffer_before_resize = db->GetDeadlockInfoBuffer();
+ db->SetDeadlockInfoBufferSize(3);
+ dlock_buffer_after_resize = db->GetDeadlockInfoBuffer();
+ ASSERT_EQ(dlock_buffer_after_resize.size(), 0);
+
+ // Contrived case of shared lock of cycle size 2 to verify that a shared
+ // lock causing a deadlock is correctly reported as "shared" in the buffer.
+ std::vector<Transaction*> txns_shared(2);
+
+ // Create a cycle of size 2.
+ for (uint32_t i = 0; i < 2; i++) {
+ txns_shared[i] = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txns_shared[i]);
+ auto s =
+ txns_shared[i]->GetForUpdate(read_options, std::to_string(i), nullptr);
+ ASSERT_OK(s);
+ }
+
+ std::atomic<uint32_t> checkpoints_shared(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "PointLockManager::AcquireWithTimeout:WaitingTxn",
+ [&](void* /*arg*/) { checkpoints_shared.fetch_add(1); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<port::Thread> threads_shared;
+ for (uint32_t i = 0; i < 1; i++) {
+ std::function<void()> blocking_thread = [&, i] {
+ auto s = txns_shared[i]->GetForUpdate(read_options, std::to_string(i + 1),
+ nullptr);
+ ASSERT_OK(s);
+ ASSERT_OK(txns_shared[i]->Rollback());
+ delete txns_shared[i];
+ };
+ threads_shared.emplace_back(blocking_thread);
+ }
+
+ // Wait until all threads are waiting on each other.
+ while (checkpoints_shared.load() != 1) {
+ /* sleep override */
+ std::this_thread::sleep_for(std::chrono::milliseconds(100));
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ // Complete the cycle T2 -> T1 with a shared lock.
+ auto s = txns_shared[1]->GetForUpdate(read_options, "0", nullptr, false);
+ ASSERT_TRUE(s.IsDeadlock());
+
+ auto dlock_buffer = db->GetDeadlockInfoBuffer();
+
+ // Verify the size of the buffer and the single path.
+ ASSERT_EQ(dlock_buffer.size(), 1);
+ ASSERT_EQ(dlock_buffer[0].path.size(), 2);
+
+ // Verify the exclusivity field of the transactions in the deadlock path.
+ ASSERT_TRUE(dlock_buffer[0].path[0].m_exclusive);
+ ASSERT_FALSE(dlock_buffer[0].path[1].m_exclusive);
+ ASSERT_OK(txns_shared[1]->Rollback());
+ delete txns_shared[1];
+
+ for (auto& t : threads_shared) {
+ t.join();
+ }
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+TEST_P(TransactionStressTest, DeadlockCycle) {
+ WriteOptions write_options;
+ ReadOptions read_options;
+ TransactionOptions txn_options;
+
+ // offset by 2 from the max depth to test edge case
+ const uint32_t kMaxCycleLength = 52;
+
+ txn_options.lock_timeout = 1000000;
+ txn_options.deadlock_detect = true;
+
+ for (uint32_t len = 2; len < kMaxCycleLength; len++) {
+ // Set up a long wait for chain like this:
+ //
+ // T1 -> T2 -> T3 -> ... -> Tlen
+
+ std::vector<Transaction*> txns(len);
+
+ for (uint32_t i = 0; i < len; i++) {
+ txns[i] = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txns[i]);
+ auto s = txns[i]->GetForUpdate(read_options, std::to_string(i), nullptr);
+ ASSERT_OK(s);
+ }
+
+ std::atomic<uint32_t> checkpoints(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "PointLockManager::AcquireWithTimeout:WaitingTxn",
+ [&](void* /*arg*/) { checkpoints.fetch_add(1); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // We want the last transaction in the chain to block and hold everyone
+ // back.
+ std::vector<port::Thread> threads;
+ for (uint32_t i = 0; i + 1 < len; i++) {
+ std::function<void()> blocking_thread = [&, i] {
+ auto s =
+ txns[i]->GetForUpdate(read_options, std::to_string(i + 1), nullptr);
+ ASSERT_OK(s);
+ ASSERT_OK(txns[i]->Rollback());
+ delete txns[i];
+ };
+ threads.emplace_back(blocking_thread);
+ }
+
+ // Wait until all threads are waiting on each other.
+ while (checkpoints.load() != len - 1) {
+ /* sleep override */
+ std::this_thread::sleep_for(std::chrono::milliseconds(100));
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ // Complete the cycle Tlen -> T1
+ auto s = txns[len - 1]->GetForUpdate(read_options, "0", nullptr);
+ ASSERT_TRUE(s.IsDeadlock());
+
+ const uint32_t dlock_buffer_size_ = (len - 1 > 5) ? 5 : (len - 1);
+ uint32_t curr_waiting_key = 0;
+ TransactionID curr_txn_id = txns[0]->GetID();
+
+ auto dlock_buffer = db->GetDeadlockInfoBuffer();
+ ASSERT_EQ(dlock_buffer.size(), dlock_buffer_size_);
+ uint32_t check_len = len;
+ bool check_limit_flag = false;
+
+ // Special case for a deadlock path that exceeds the maximum depth.
+ if (len > 50) {
+ check_len = 0;
+ check_limit_flag = true;
+ }
+ auto dlock_entry = dlock_buffer[0].path;
+ ASSERT_EQ(dlock_entry.size(), check_len);
+ ASSERT_EQ(dlock_buffer[0].limit_exceeded, check_limit_flag);
+
+ int64_t pre_deadlock_time = dlock_buffer[0].deadlock_time;
+ int64_t cur_deadlock_time = 0;
+ for (auto const& dl_path_rec : dlock_buffer) {
+ cur_deadlock_time = dl_path_rec.deadlock_time;
+ ASSERT_NE(cur_deadlock_time, 0);
+ ASSERT_TRUE(cur_deadlock_time <= pre_deadlock_time);
+ pre_deadlock_time = cur_deadlock_time;
+ }
+
+ // Iterates backwards over path verifying decreasing txn_ids.
+ for (auto it = dlock_entry.rbegin(); it != dlock_entry.rend(); ++it) {
+ auto dl_node = *it;
+ ASSERT_EQ(dl_node.m_txn_id, len + curr_txn_id - 1);
+ ASSERT_EQ(dl_node.m_cf_id, 0u);
+ ASSERT_EQ(dl_node.m_waiting_key, std::to_string(curr_waiting_key));
+ ASSERT_EQ(dl_node.m_exclusive, true);
+
+ curr_txn_id--;
+ if (curr_waiting_key == 0) {
+ curr_waiting_key = len;
+ }
+ curr_waiting_key--;
+ }
+
+ // Rollback the last transaction.
+ ASSERT_OK(txns[len - 1]->Rollback());
+ delete txns[len - 1];
+
+ for (auto& t : threads) {
+ t.join();
+ }
+ }
+}
+
+TEST_P(TransactionStressTest, DeadlockStress) {
+ const uint32_t NUM_TXN_THREADS = 10;
+ const uint32_t NUM_KEYS = 100;
+ const uint32_t NUM_ITERS = 1000;
+
+ WriteOptions write_options;
+ ReadOptions read_options;
+ TransactionOptions txn_options;
+
+ txn_options.lock_timeout = 1000000;
+ txn_options.deadlock_detect = true;
+ std::vector<std::string> keys;
+
+ for (uint32_t i = 0; i < NUM_KEYS; i++) {
+ ASSERT_OK(db->Put(write_options, Slice(std::to_string(i)), Slice("")));
+ keys.push_back(std::to_string(i));
+ }
+
+ size_t tid = std::hash<std::thread::id>()(std::this_thread::get_id());
+ Random rnd(static_cast<uint32_t>(tid));
+ std::function<void(uint32_t)> stress_thread = [&](uint32_t seed) {
+ std::default_random_engine g(seed);
+
+ Transaction* txn;
+ for (uint32_t i = 0; i < NUM_ITERS; i++) {
+ txn = db->BeginTransaction(write_options, txn_options);
+ auto random_keys = keys;
+ std::shuffle(random_keys.begin(), random_keys.end(), g);
+
+ // Lock keys in random order.
+ for (const auto& k : random_keys) {
+ // Lock mostly for shared access, but exclusive 1/4 of the time.
+ auto s =
+ txn->GetForUpdate(read_options, k, nullptr, txn->GetID() % 4 == 0);
+ if (!s.ok()) {
+ ASSERT_TRUE(s.IsDeadlock());
+ ASSERT_OK(txn->Rollback());
+ break;
+ }
+ }
+
+ delete txn;
+ }
+ };
+
+ std::vector<port::Thread> threads;
+ for (uint32_t i = 0; i < NUM_TXN_THREADS; i++) {
+ threads.emplace_back(stress_thread, rnd.Next());
+ }
+
+ for (auto& t : threads) {
+ t.join();
+ }
+}
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_P(TransactionTest, CommitTimeBatchFailTest) {
+ WriteOptions write_options;
+ TransactionOptions txn_options;
+
+ std::string value;
+ Status s;
+
+ Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn1);
+
+ ASSERT_OK(txn1->GetCommitTimeWriteBatch()->Put("cat", "dog"));
+
+ s = txn1->Put("foo", "bar");
+ ASSERT_OK(s);
+
+ // fails due to non-empty commit-time batch
+ s = txn1->Commit();
+ ASSERT_EQ(s, Status::InvalidArgument());
+
+ delete txn1;
+}
+
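+// Verifies that the bookkeeping of logs containing prepared sections is
+// released once transactions commit, even when prepares span many memtable
+// flushes, so the tracking structures do not leak.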
+TEST_P(TransactionTest, LogMarkLeakTest) {
+ TransactionOptions txn_options;
+ WriteOptions write_options;
+ options.write_buffer_size = 1024;
+ ASSERT_OK(ReOpenNoDelete());
+ assert(db != nullptr);
+ Random rnd(47);
+ std::vector<Transaction*> txns;
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ // At the beginning there should be no log containing prepare data
+ ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
+ for (size_t i = 0; i < 100; i++) {
+ Transaction* txn = db->BeginTransaction(write_options, txn_options);
+ ASSERT_OK(txn->SetName("xid" + std::to_string(i)));
+ ASSERT_OK(txn->Put(Slice("foo" + std::to_string(i)), Slice("bar")));
+ ASSERT_OK(txn->Prepare());
+ ASSERT_GT(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
+ if (rnd.OneIn(5)) {
+ txns.push_back(txn);
+ } else {
+ ASSERT_OK(txn->Commit());
+ delete txn;
+ }
+ ASSERT_OK(db_impl->TEST_FlushMemTable(true));
+ }
+ for (auto txn : txns) {
+ ASSERT_OK(txn->Commit());
+ delete txn;
+ }
+ // At the end there should be no log left containing prepare data
+ ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
+ // Make sure that the underlying data structures are properly truncated
+ // and do not leak
+ ASSERT_EQ(db_impl->TEST_PreparedSectionCompletedSize(), 0);
+ ASSERT_EQ(db_impl->TEST_LogsWithPrepSize(), 0);
+}
+
+TEST_P(TransactionTest, SimpleTwoPhaseTransactionTest) {
+ for (bool cwb4recovery : {true, false}) {
+ ASSERT_OK(ReOpen());
+ WriteOptions write_options;
+ ReadOptions read_options;
+
+ TransactionOptions txn_options;
+ txn_options.use_only_the_last_commit_time_batch_for_recovery = cwb4recovery;
+
+ std::string value;
+ Status s;
+
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+
+ Transaction* txn = db->BeginTransaction(write_options, txn_options);
+ s = txn->SetName("xid");
+ ASSERT_OK(s);
+
+ ASSERT_EQ(db->GetTransactionByName("xid"), txn);
+
+ // transaction put
+ s = txn->Put(Slice("foo"), Slice("bar"));
+ ASSERT_OK(s);
+ ASSERT_EQ(1, txn->GetNumPuts());
+
+ // regular db put
+ s = db->Put(write_options, Slice("foo2"), Slice("bar2"));
+ ASSERT_OK(s);
+ ASSERT_EQ(1, txn->GetNumPuts());
+
+ // regular db read
+ ASSERT_OK(db->Get(read_options, "foo2", &value));
+ ASSERT_EQ(value, "bar2");
+
+ // commit time put
+ if (cwb4recovery) {
+ ASSERT_OK(
+ txn->GetCommitTimeWriteBatch()->Put(Slice("gtid"), Slice("dogs")));
+ ASSERT_OK(
+ txn->GetCommitTimeWriteBatch()->Put(Slice("gtid2"), Slice("cats")));
+ }
+
+ // nothing has been prepped yet
+ ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
+
+ s = txn->Prepare();
+ ASSERT_OK(s);
+
+ // data not in mem yet
+ s = db->Get(read_options, Slice("foo"), &value);
+ ASSERT_TRUE(s.IsNotFound());
+ s = db->Get(read_options, Slice("gtid"), &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ // find trans in list of prepared transactions
+ std::vector<Transaction*> prepared_trans;
+ db->GetAllPreparedTransactions(&prepared_trans);
+ ASSERT_EQ(prepared_trans.size(), 1);
+ ASSERT_EQ(prepared_trans.front()->GetName(), "xid");
+
+ auto log_containing_prep =
+ db_impl->TEST_FindMinLogContainingOutstandingPrep();
+ ASSERT_GT(log_containing_prep, 0);
+
+ // make commit
+ s = txn->Commit();
+ ASSERT_OK(s);
+
+ // value is now available
+ s = db->Get(read_options, "foo", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "bar");
+
+ // we already committed
+ s = txn->Commit();
+ ASSERT_EQ(s, Status::InvalidArgument());
+
+ // no longer in the list of prepared transactions
+ db->GetAllPreparedTransactions(&prepared_trans);
+ ASSERT_EQ(prepared_trans.size(), 0);
+ ASSERT_EQ(db->GetTransactionByName("xid"), nullptr);
+
+ // heap should not care about prepared section anymore
+ ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
+
+ switch (txn_db_options.write_policy) {
+ case WRITE_COMMITTED:
+ // but now our memtable should be referencing the prep section
+ ASSERT_GE(log_containing_prep, db_impl->MinLogNumberToKeep());
+ ASSERT_EQ(log_containing_prep,
+ db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+ break;
+ case WRITE_PREPARED:
+ case WRITE_UNPREPARED:
+ // In these modes memtables do not ref the prep sections
+ ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+ break;
+ default:
+ assert(false);
+ }
+
+ ASSERT_OK(db_impl->TEST_FlushMemTable(true));
+ // After flush the recoverable state must be visible
+ if (cwb4recovery) {
+ s = db->Get(read_options, "gtid", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "dogs");
+
+ s = db->Get(read_options, "gtid2", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "cats");
+ }
+
+ // after memtable flush we can now release the log
+ ASSERT_GT(db_impl->MinLogNumberToKeep(), log_containing_prep);
+ ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+
+ delete txn;
+
+ if (cwb4recovery) {
+ // kill and reopen to trigger recovery
+ s = ReOpenNoDelete();
+ ASSERT_OK(s);
+ assert(db != nullptr);
+ s = db->Get(read_options, "gtid", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "dogs");
+
+ s = db->Get(read_options, "gtid2", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "cats");
+ }
+ }
+}
+
+TEST_P(TransactionTest, TwoPhaseNameTest) {
+ Status s;
+
+ WriteOptions write_options;
+ TransactionOptions txn_options;
+ Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+ Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+ Transaction* txn3 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn3);
+ delete txn3;
+
+  // can't prepare a txn without a name
+ s = txn1->Prepare();
+ ASSERT_EQ(s, Status::InvalidArgument());
+
+ // name too short
+ s = txn1->SetName("");
+ ASSERT_EQ(s, Status::InvalidArgument());
+
+ // name too long
+ s = txn1->SetName(std::string(513, 'x'));
+ ASSERT_EQ(s, Status::InvalidArgument());
+
+  // valid name
+ s = txn1->SetName("name1");
+ ASSERT_OK(s);
+
+  // can't have a duplicate name
+ s = txn2->SetName("name1");
+ ASSERT_EQ(s, Status::InvalidArgument());
+
+ // shouldn't be able to prepare
+ s = txn2->Prepare();
+ ASSERT_EQ(s, Status::InvalidArgument());
+
+ // valid name set
+ s = txn2->SetName("name2");
+ ASSERT_OK(s);
+
+  // can't reset the name
+ s = txn2->SetName("name3");
+ ASSERT_EQ(s, Status::InvalidArgument());
+
+ ASSERT_EQ(txn1->GetName(), "name1");
+ ASSERT_EQ(txn2->GetName(), "name2");
+
+ s = txn1->Prepare();
+ ASSERT_OK(s);
+
+ // can't rename after prepare
+ s = txn1->SetName("name4");
+ ASSERT_EQ(s, Status::InvalidArgument());
+
+ ASSERT_OK(txn1->Rollback());
+ ASSERT_OK(txn2->Rollback());
+ delete txn1;
+ delete txn2;
+}
+
+TEST_P(TransactionTest, TwoPhaseEmptyWriteTest) {
+ for (bool cwb4recovery : {true, false}) {
+ for (bool test_with_empty_wal : {true, false}) {
+ if (!cwb4recovery && test_with_empty_wal) {
+ continue;
+ }
+ ASSERT_OK(ReOpen());
+ Status s;
+ std::string value;
+
+ WriteOptions write_options;
+ ReadOptions read_options;
+ TransactionOptions txn_options;
+ txn_options.use_only_the_last_commit_time_batch_for_recovery =
+ cwb4recovery;
+ Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn1);
+ Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn2);
+
+ s = txn1->SetName("joe");
+ ASSERT_OK(s);
+
+ s = txn2->SetName("bob");
+ ASSERT_OK(s);
+
+ s = txn1->Prepare();
+ ASSERT_OK(s);
+
+ s = txn1->Commit();
+ ASSERT_OK(s);
+
+ delete txn1;
+
+ if (cwb4recovery) {
+ ASSERT_OK(
+ txn2->GetCommitTimeWriteBatch()->Put(Slice("foo"), Slice("bar")));
+ }
+
+ s = txn2->Prepare();
+ ASSERT_OK(s);
+
+ s = txn2->Commit();
+ ASSERT_OK(s);
+
+ delete txn2;
+ if (cwb4recovery) {
+ if (test_with_empty_wal) {
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ ASSERT_OK(db_impl->TEST_FlushMemTable(true));
+ // After flush the state must be visible
+ s = db->Get(read_options, "foo", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "bar");
+ }
+ ASSERT_OK(db->FlushWAL(true));
+ // kill and reopen to trigger recovery
+ s = ReOpenNoDelete();
+ ASSERT_OK(s);
+ assert(db != nullptr);
+ s = db->Get(read_options, "foo", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "bar");
+ }
+ }
+ }
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+TEST_P(TransactionStressTest, TwoPhaseExpirationTest) {
+ Status s;
+
+ WriteOptions write_options;
+ TransactionOptions txn_options;
+ txn_options.expiration = 500; // 500ms
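+  // txn1 is prepared before the expiration window passes and still commits
+  // after the sleep, while txn2, which never prepared in time, gets
+  // Status::Expired() from Prepare().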
+ Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+ Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn1);
+  ASSERT_TRUE(txn2);
+
+ s = txn1->SetName("joe");
+ ASSERT_OK(s);
+ s = txn2->SetName("bob");
+ ASSERT_OK(s);
+
+ s = txn1->Prepare();
+ ASSERT_OK(s);
+
+ /* sleep override */
+ std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+
+ s = txn1->Commit();
+ ASSERT_OK(s);
+
+ s = txn2->Prepare();
+ ASSERT_EQ(s, Status::Expired());
+
+ delete txn1;
+ delete txn2;
+}
+
+TEST_P(TransactionTest, TwoPhaseRollbackTest) {
+ WriteOptions write_options;
+ ReadOptions read_options;
+
+ TransactionOptions txn_options;
+
+ std::string value;
+ Status s;
+
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ Transaction* txn = db->BeginTransaction(write_options, txn_options);
+ s = txn->SetName("xid");
+ ASSERT_OK(s);
+
+ // transaction put
+ s = txn->Put(Slice("tfoo"), Slice("tbar"));
+ ASSERT_OK(s);
+
+  // value is readable from txn
+ s = txn->Get(read_options, Slice("tfoo"), &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "tbar");
+
+ // issue rollback
+ s = txn->Rollback();
+ ASSERT_OK(s);
+
+  // value is no longer readable
+ s = txn->Get(read_options, Slice("tfoo"), &value);
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ(txn->GetNumPuts(), 0);
+
+ // put new txn values
+ s = txn->Put(Slice("tfoo2"), Slice("tbar2"));
+ ASSERT_OK(s);
+
+ // new value is readable from txn
+ s = txn->Get(read_options, Slice("tfoo2"), &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "tbar2");
+
+ s = txn->Prepare();
+ ASSERT_OK(s);
+
+ // flush to next wal
+ s = db->Put(write_options, Slice("foo"), Slice("bar"));
+ ASSERT_OK(s);
+ ASSERT_OK(db_impl->TEST_FlushMemTable(true));
+
+ // issue rollback (marker written to WAL)
+ s = txn->Rollback();
+ ASSERT_OK(s);
+
+  // value is no longer readable
+ s = txn->Get(read_options, Slice("tfoo2"), &value);
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ(txn->GetNumPuts(), 0);
+
+ // make commit
+ s = txn->Commit();
+ ASSERT_EQ(s, Status::InvalidArgument());
+
+ // try rollback again
+ s = txn->Rollback();
+ ASSERT_EQ(s, Status::InvalidArgument());
+
+ delete txn;
+}
+
+TEST_P(TransactionTest, PersistentTwoPhaseTransactionTest) {
+ WriteOptions write_options;
+ write_options.sync = true;
+ write_options.disableWAL = false;
+ ReadOptions read_options;
+
+ TransactionOptions txn_options;
+
+ std::string value;
+ Status s;
+
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+
+ Transaction* txn = db->BeginTransaction(write_options, txn_options);
+ s = txn->SetName("xid");
+ ASSERT_OK(s);
+
+ ASSERT_EQ(db->GetTransactionByName("xid"), txn);
+
+ // transaction put
+ s = txn->Put(Slice("foo"), Slice("bar"));
+ ASSERT_OK(s);
+ ASSERT_EQ(1, txn->GetNumPuts());
+
+ // txn read
+ s = txn->Get(read_options, "foo", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "bar");
+
+ // regular db put
+ s = db->Put(write_options, Slice("foo2"), Slice("bar2"));
+ ASSERT_OK(s);
+ ASSERT_EQ(1, txn->GetNumPuts());
+
+ ASSERT_OK(db_impl->TEST_FlushMemTable(true));
+
+ // regular db read
+ db->Get(read_options, "foo2", &value);
+ ASSERT_EQ(value, "bar2");
+
+ // nothing has been prepped yet
+ ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
+
+ // prepare
+ s = txn->Prepare();
+ ASSERT_OK(s);
+
+ // still not available to db
+ s = db->Get(read_options, Slice("foo"), &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ ASSERT_OK(db->FlushWAL(false));
+ delete txn;
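+  // The prepare record has already been written to the WAL, so after the
+  // simulated crash and reopen below the still-pending transaction must be
+  // reported by GetAllPreparedTransactions() and remain committable.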
+ // kill and reopen
+ reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
+ s = ReOpenNoDelete();
+ ASSERT_OK(s);
+ assert(db != nullptr);
+ db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+
+ // find trans in list of prepared transactions
+ std::vector<Transaction*> prepared_trans;
+ db->GetAllPreparedTransactions(&prepared_trans);
+ ASSERT_EQ(prepared_trans.size(), 1);
+
+ txn = prepared_trans.front();
+ ASSERT_TRUE(txn);
+ ASSERT_EQ(txn->GetName(), "xid");
+ ASSERT_EQ(db->GetTransactionByName("xid"), txn);
+
+ // log has been marked
+ auto log_containing_prep =
+ db_impl->TEST_FindMinLogContainingOutstandingPrep();
+ ASSERT_GT(log_containing_prep, 0);
+
+ // value is readable from txn
+ s = txn->Get(read_options, "foo", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "bar");
+
+ // make commit
+ s = txn->Commit();
+ ASSERT_OK(s);
+
+ // value is now available
+ db->Get(read_options, "foo", &value);
+ ASSERT_EQ(value, "bar");
+
+ // we already committed
+ s = txn->Commit();
+ ASSERT_EQ(s, Status::InvalidArgument());
+
+  // txn no longer appears in the list of prepared transactions
+ prepared_trans.clear();
+ db->GetAllPreparedTransactions(&prepared_trans);
+ ASSERT_EQ(prepared_trans.size(), 0);
+
+ // transaction should no longer be visible
+ ASSERT_EQ(db->GetTransactionByName("xid"), nullptr);
+
+ // heap should not care about prepared section anymore
+ ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
+
+ switch (txn_db_options.write_policy) {
+ case WRITE_COMMITTED:
+ // but now our memtable should be referencing the prep section
+ ASSERT_EQ(log_containing_prep,
+ db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+ ASSERT_GE(log_containing_prep, db_impl->MinLogNumberToKeep());
+
+ break;
+ case WRITE_PREPARED:
+ case WRITE_UNPREPARED:
+      // In these modes memtables do not ref the prep sections
+ ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+ break;
+ default:
+ assert(false);
+ }
+
+ // Add a dummy record to memtable before a flush. Otherwise, the
+ // memtable will be empty and flush will be skipped.
+ s = db->Put(write_options, Slice("foo3"), Slice("bar3"));
+ ASSERT_OK(s);
+
+ ASSERT_OK(db_impl->TEST_FlushMemTable(true));
+
+ // after memtable flush we can now release the log
+ ASSERT_GT(db_impl->MinLogNumberToKeep(), log_containing_prep);
+ ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+
+ delete txn;
+
+ // deleting transaction should unregister transaction
+ ASSERT_EQ(db->GetTransactionByName("xid"), nullptr);
+}
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+// TODO this test needs to be updated with serial commits
+TEST_P(TransactionTest, DISABLED_TwoPhaseMultiThreadTest) {
+ // mix transaction writes and regular writes
+ const uint32_t NUM_TXN_THREADS = 50;
+ std::atomic<uint32_t> txn_thread_num(0);
+
+ std::function<void()> txn_write_thread = [&]() {
+ uint32_t id = txn_thread_num.fetch_add(1);
+
+ WriteOptions write_options;
+ write_options.sync = true;
+ write_options.disableWAL = false;
+ TransactionOptions txn_options;
+ txn_options.lock_timeout = 1000000;
+ if (id % 2 == 0) {
+ txn_options.expiration = 1000000;
+ }
+ TransactionName name("xid_" + std::string(1, 'A' + static_cast<char>(id)));
+ Transaction* txn = db->BeginTransaction(write_options, txn_options);
+ ASSERT_OK(txn->SetName(name));
+ for (int i = 0; i < 10; i++) {
+ std::string key(name + "_" + std::string(1, static_cast<char>('A' + i)));
+ ASSERT_OK(txn->Put(key, "val"));
+ }
+ ASSERT_OK(txn->Prepare());
+ ASSERT_OK(txn->Commit());
+ delete txn;
+ };
+
+  // ensure that all threads are in the same write group
+ std::atomic<uint32_t> t_wait_on_prepare(0);
+ std::atomic<uint32_t> t_wait_on_commit(0);
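+  // The sync point below holds each writer at JoinBatchGroup until all
+  // NUM_TXN_THREADS writers of the same kind (WAL or memtable) have arrived,
+  // forcing them into a single write group.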
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:Wait", [&](void* arg) {
+ auto* writer = reinterpret_cast<WriteThread::Writer*>(arg);
+
+ if (writer->ShouldWriteToWAL()) {
+ t_wait_on_prepare.fetch_add(1);
+ // wait for friends
+ while (t_wait_on_prepare.load() < NUM_TXN_THREADS) {
+ env->SleepForMicroseconds(10);
+ }
+ } else if (writer->ShouldWriteToMemtable()) {
+ t_wait_on_commit.fetch_add(1);
+ // wait for friends
+ while (t_wait_on_commit.load() < NUM_TXN_THREADS) {
+ env->SleepForMicroseconds(10);
+ }
+ } else {
+ FAIL();
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // do all the writes
+ std::vector<port::Thread> threads;
+ for (uint32_t i = 0; i < NUM_TXN_THREADS; i++) {
+ threads.emplace_back(txn_write_thread);
+ }
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ReadOptions read_options;
+ std::string value;
+ Status s;
+ for (uint32_t t = 0; t < NUM_TXN_THREADS; t++) {
+ TransactionName name("xid_" + std::string(1, 'A' + static_cast<char>(t)));
+ for (int i = 0; i < 10; i++) {
+ std::string key(name + "_" + std::string(1, static_cast<char>('A' + i)));
+ s = db->Get(read_options, key, &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "val");
+ }
+ }
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+TEST_P(TransactionStressTest, TwoPhaseLongPrepareTest) {
+ WriteOptions write_options;
+ write_options.sync = true;
+ write_options.disableWAL = false;
+ ReadOptions read_options;
+ TransactionOptions txn_options;
+
+ std::string value;
+ Status s;
+
+ Transaction* txn = db->BeginTransaction(write_options, txn_options);
+ s = txn->SetName("bob");
+ ASSERT_OK(s);
+
+ // transaction put
+ s = txn->Put(Slice("foo"), Slice("bar"));
+ ASSERT_OK(s);
+
+ // prepare
+ s = txn->Prepare();
+ ASSERT_OK(s);
+
+ delete txn;
+
+ for (int i = 0; i < 1000; i++) {
+ std::string key(i, 'k');
+ std::string val(1000, 'v');
+ assert(db != nullptr);
+ s = db->Put(write_options, key, val);
+ ASSERT_OK(s);
+
+ if (i % 29 == 0) {
+ // crash
+ env->SetFilesystemActive(false);
+ reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
+ ReOpenNoDelete();
+ } else if (i % 37 == 0) {
+ // close
+ ReOpenNoDelete();
+ }
+ }
+
+ // commit old txn
+ txn = db->GetTransactionByName("bob");
+ ASSERT_TRUE(txn);
+ s = txn->Commit();
+ ASSERT_OK(s);
+
+  // verify txn data
+ s = db->Get(read_options, "foo", &value);
+ ASSERT_EQ(s, Status::OK());
+ ASSERT_EQ(value, "bar");
+
+ // verify non txn data
+ for (int i = 0; i < 1000; i++) {
+ std::string key(i, 'k');
+ std::string val(1000, 'v');
+ s = db->Get(read_options, key, &value);
+ ASSERT_EQ(s, Status::OK());
+ ASSERT_EQ(value, val);
+ }
+
+ delete txn;
+}
+
+TEST_P(TransactionTest, TwoPhaseSequenceTest) {
+ WriteOptions write_options;
+ write_options.sync = true;
+ write_options.disableWAL = false;
+ ReadOptions read_options;
+
+ TransactionOptions txn_options;
+
+ std::string value;
+ Status s;
+
+ Transaction* txn = db->BeginTransaction(write_options, txn_options);
+ s = txn->SetName("xid");
+ ASSERT_OK(s);
+
+ // transaction put
+ s = txn->Put(Slice("foo"), Slice("bar"));
+ ASSERT_OK(s);
+ s = txn->Put(Slice("foo2"), Slice("bar2"));
+ ASSERT_OK(s);
+ s = txn->Put(Slice("foo3"), Slice("bar3"));
+ ASSERT_OK(s);
+ s = txn->Put(Slice("foo4"), Slice("bar4"));
+ ASSERT_OK(s);
+
+ // prepare
+ s = txn->Prepare();
+ ASSERT_OK(s);
+
+ // make commit
+ s = txn->Commit();
+ ASSERT_OK(s);
+
+ delete txn;
+
+ // kill and reopen
+ env->SetFilesystemActive(false);
+ ReOpenNoDelete();
+ assert(db != nullptr);
+
+ // value is now available
+ s = db->Get(read_options, "foo4", &value);
+ ASSERT_EQ(s, Status::OK());
+ ASSERT_EQ(value, "bar4");
+}
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_P(TransactionTest, TwoPhaseDoubleRecoveryTest) {
+ WriteOptions write_options;
+ write_options.sync = true;
+ write_options.disableWAL = false;
+ ReadOptions read_options;
+
+ TransactionOptions txn_options;
+
+ std::string value;
+ Status s;
+
+ Transaction* txn = db->BeginTransaction(write_options, txn_options);
+ s = txn->SetName("a");
+ ASSERT_OK(s);
+
+ // transaction put
+ s = txn->Put(Slice("foo"), Slice("bar"));
+ ASSERT_OK(s);
+
+ // prepare
+ s = txn->Prepare();
+ ASSERT_OK(s);
+
+ delete txn;
+
+ // kill and reopen
+ env->SetFilesystemActive(false);
+ reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
+ ReOpenNoDelete();
+
+ // commit old txn
+ assert(db != nullptr); // Make clang analyze happy.
+ txn = db->GetTransactionByName("a");
+ assert(txn != nullptr);
+ s = txn->Commit();
+ ASSERT_OK(s);
+
+ s = db->Get(read_options, "foo", &value);
+ ASSERT_EQ(s, Status::OK());
+ ASSERT_EQ(value, "bar");
+
+ delete txn;
+
+ txn = db->BeginTransaction(write_options, txn_options);
+ s = txn->SetName("b");
+ ASSERT_OK(s);
+
+ s = txn->Put(Slice("foo2"), Slice("bar2"));
+ ASSERT_OK(s);
+
+ s = txn->Prepare();
+ ASSERT_OK(s);
+
+ s = txn->Commit();
+ ASSERT_OK(s);
+
+ delete txn;
+
+ // kill and reopen
+ env->SetFilesystemActive(false);
+ ASSERT_OK(ReOpenNoDelete());
+ assert(db != nullptr);
+
+ // value is now available
+ s = db->Get(read_options, "foo", &value);
+ ASSERT_EQ(s, Status::OK());
+ ASSERT_EQ(value, "bar");
+
+ s = db->Get(read_options, "foo2", &value);
+ ASSERT_EQ(s, Status::OK());
+ ASSERT_EQ(value, "bar2");
+}
+
+TEST_P(TransactionTest, TwoPhaseLogRollingTest) {
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+
+ Status s;
+ std::string v;
+ ColumnFamilyHandle *cfa, *cfb;
+
+ // Create 2 new column families
+ ColumnFamilyOptions cf_options;
+ s = db->CreateColumnFamily(cf_options, "CFA", &cfa);
+ ASSERT_OK(s);
+ s = db->CreateColumnFamily(cf_options, "CFB", &cfb);
+ ASSERT_OK(s);
+
+ WriteOptions wopts;
+ wopts.disableWAL = false;
+ wopts.sync = true;
+
+ TransactionOptions topts1;
+ Transaction* txn1 = db->BeginTransaction(wopts, topts1);
+ s = txn1->SetName("xid1");
+ ASSERT_OK(s);
+
+ TransactionOptions topts2;
+ Transaction* txn2 = db->BeginTransaction(wopts, topts2);
+ s = txn2->SetName("xid2");
+ ASSERT_OK(s);
+
+  // txn1 put in one column family (CFA)
+ s = txn1->Put(cfa, "ka1", "va1");
+ ASSERT_OK(s);
+
+ // transaction put in two column families
+ s = txn2->Put(cfa, "ka2", "va2");
+ ASSERT_OK(s);
+ s = txn2->Put(cfb, "kb2", "vb2");
+ ASSERT_OK(s);
+
+ // write prep section to wal
+ s = txn1->Prepare();
+ ASSERT_OK(s);
+
+ // our log should be in the heap
+ ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(),
+ txn1->GetLogNumber());
+ ASSERT_EQ(db_impl->TEST_LogfileNumber(), txn1->GetLastLogNumber());
+
+  // flush default cf to create a new log
+ s = db->Put(wopts, "foo", "bar");
+ ASSERT_OK(s);
+ s = db_impl->TEST_FlushMemTable(true);
+ ASSERT_OK(s);
+
+ // make sure we are on a new log
+ ASSERT_GT(db_impl->TEST_LogfileNumber(), txn1->GetLastLogNumber());
+
+ // put txn2 prep section in this log
+ s = txn2->Prepare();
+ ASSERT_OK(s);
+ ASSERT_EQ(db_impl->TEST_LogfileNumber(), txn2->GetLastLogNumber());
+
+ // heap should still see first log
+ ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(),
+ txn1->GetLogNumber());
+
+ // commit txn1
+ s = txn1->Commit();
+ ASSERT_OK(s);
+
+  // heap should now show txn2's log
+ ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(),
+ txn2->GetLogNumber());
+
+ switch (txn_db_options.write_policy) {
+ case WRITE_COMMITTED:
+      // we should see txn1's log referenced by the memtables
+ ASSERT_EQ(txn1->GetLogNumber(),
+ db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+ break;
+ case WRITE_PREPARED:
+ case WRITE_UNPREPARED:
+      // In these modes memtables do not ref the prep sections
+ ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+ break;
+ default:
+ assert(false);
+ }
+
+  // flush default cf to create a new log
+ s = db->Put(wopts, "foo", "bar2");
+ ASSERT_OK(s);
+ s = db_impl->TEST_FlushMemTable(true);
+ ASSERT_OK(s);
+
+ // make sure we are on a new log
+ ASSERT_GT(db_impl->TEST_LogfileNumber(), txn2->GetLastLogNumber());
+
+ // commit txn2
+ s = txn2->Commit();
+ ASSERT_OK(s);
+
+ // heap should not show any logs
+ ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
+
+ switch (txn_db_options.write_policy) {
+ case WRITE_COMMITTED:
+ // should show the first txn log
+ ASSERT_EQ(txn1->GetLogNumber(),
+ db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+ break;
+ case WRITE_PREPARED:
+ case WRITE_UNPREPARED:
+      // In these modes memtables do not ref the prep sections
+ ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+ break;
+ default:
+ assert(false);
+ }
+
+ // flush only cfa memtable
+ s = db_impl->TEST_FlushMemTable(true, false, cfa);
+ ASSERT_OK(s);
+
+ switch (txn_db_options.write_policy) {
+ case WRITE_COMMITTED:
+      // should now show txn2's log
+ ASSERT_EQ(txn2->GetLogNumber(),
+ db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+ break;
+ case WRITE_PREPARED:
+ case WRITE_UNPREPARED:
+      // In these modes memtables do not ref the prep sections
+ ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+ break;
+ default:
+ assert(false);
+ }
+
+ // flush only cfb memtable
+ s = db_impl->TEST_FlushMemTable(true, false, cfb);
+ ASSERT_OK(s);
+
+  // should show no dependency on logs
+ ASSERT_EQ(db_impl->TEST_FindMinPrepLogReferencedByMemTable(), 0);
+ ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
+
+ delete txn1;
+ delete txn2;
+ delete cfa;
+ delete cfb;
+}
+
+TEST_P(TransactionTest, TwoPhaseLogRollingTest2) {
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+
+ Status s;
+ ColumnFamilyHandle *cfa, *cfb;
+
+ ColumnFamilyOptions cf_options;
+ s = db->CreateColumnFamily(cf_options, "CFA", &cfa);
+ ASSERT_OK(s);
+ s = db->CreateColumnFamily(cf_options, "CFB", &cfb);
+ ASSERT_OK(s);
+
+ WriteOptions wopts;
+ wopts.disableWAL = false;
+ wopts.sync = true;
+
+ auto cfh_a = static_cast_with_check<ColumnFamilyHandleImpl>(cfa);
+ auto cfh_b = static_cast_with_check<ColumnFamilyHandleImpl>(cfb);
+
+ TransactionOptions topts1;
+ Transaction* txn1 = db->BeginTransaction(wopts, topts1);
+ s = txn1->SetName("xid1");
+ ASSERT_OK(s);
+ s = txn1->Put(cfa, "boys", "girls1");
+ ASSERT_OK(s);
+
+ Transaction* txn2 = db->BeginTransaction(wopts, topts1);
+ s = txn2->SetName("xid2");
+ ASSERT_OK(s);
+ s = txn2->Put(cfb, "up", "down1");
+ ASSERT_OK(s);
+
+  // prepare txn1 in LOG A
+ s = txn1->Prepare();
+ ASSERT_OK(s);
+
+  // prepare txn2 in LOG A
+ s = txn2->Prepare();
+ ASSERT_OK(s);
+
+  // regular put so that the memtable can actually be flushed for log rolling
+ s = db->Put(wopts, "cats", "dogs1");
+ ASSERT_OK(s);
+
+ auto prepare_log_no = txn1->GetLastLogNumber();
+
+ // roll to LOG B
+ s = db_impl->TEST_FlushMemTable(true);
+ ASSERT_OK(s);
+
+ // now we pause background work so that
+ // imm()s are not flushed before we can check their status
+ s = db_impl->PauseBackgroundWork();
+ ASSERT_OK(s);
+
+ ASSERT_GT(db_impl->TEST_LogfileNumber(), prepare_log_no);
+ switch (txn_db_options.write_policy) {
+ case WRITE_COMMITTED:
+ // This cf is empty and should ref the latest log
+ ASSERT_GT(cfh_a->cfd()->GetLogNumber(), prepare_log_no);
+ ASSERT_EQ(cfh_a->cfd()->GetLogNumber(), db_impl->TEST_LogfileNumber());
+ break;
+ case WRITE_PREPARED:
+ case WRITE_UNPREPARED:
+ // This cf is not flushed yet and should ref the log that has its data
+ ASSERT_EQ(cfh_a->cfd()->GetLogNumber(), prepare_log_no);
+ break;
+ default:
+ assert(false);
+ }
+ ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(),
+ txn1->GetLogNumber());
+ ASSERT_EQ(db_impl->TEST_FindMinPrepLogReferencedByMemTable(), 0);
+
+ // commit in LOG B
+ s = txn1->Commit();
+ ASSERT_OK(s);
+
+ switch (txn_db_options.write_policy) {
+ case WRITE_COMMITTED:
+ ASSERT_EQ(db_impl->TEST_FindMinPrepLogReferencedByMemTable(),
+ prepare_log_no);
+ break;
+ case WRITE_PREPARED:
+ case WRITE_UNPREPARED:
+      // In these modes memtables do not ref the prep sections
+ ASSERT_EQ(db_impl->TEST_FindMinPrepLogReferencedByMemTable(), 0);
+ break;
+ default:
+ assert(false);
+ }
+
+ ASSERT_TRUE(!db_impl->TEST_UnableToReleaseOldestLog());
+
+ // request a flush for all column families such that the earliest
+ // alive log file can be killed
+ ASSERT_OK(db_impl->TEST_SwitchWAL());
+  // log cannot be flushed because txn2 has not been committed
+ ASSERT_TRUE(!db_impl->TEST_IsLogGettingFlushed());
+ ASSERT_TRUE(db_impl->TEST_UnableToReleaseOldestLog());
+
+ // assert that cfa has a flush requested
+ ASSERT_TRUE(cfh_a->cfd()->imm()->HasFlushRequested());
+
+ switch (txn_db_options.write_policy) {
+ case WRITE_COMMITTED:
+      // cfb should not be flushed because it has no data from LOG A
+ ASSERT_TRUE(!cfh_b->cfd()->imm()->HasFlushRequested());
+ break;
+ case WRITE_PREPARED:
+ case WRITE_UNPREPARED:
+      // cfb should be flushed because it has prepared data from LOG A
+ ASSERT_TRUE(cfh_b->cfd()->imm()->HasFlushRequested());
+ break;
+ default:
+ assert(false);
+ }
+
+ // cfb now has data from LOG A
+ s = txn2->Commit();
+ ASSERT_OK(s);
+
+ ASSERT_OK(db_impl->TEST_SwitchWAL());
+ ASSERT_TRUE(!db_impl->TEST_UnableToReleaseOldestLog());
+
+ // we should see that cfb now has a flush requested
+ ASSERT_TRUE(cfh_b->cfd()->imm()->HasFlushRequested());
+
+ // all data in LOG A resides in a memtable that has been
+ // requested for a flush
+ ASSERT_TRUE(db_impl->TEST_IsLogGettingFlushed());
+
+ delete txn1;
+ delete txn2;
+ delete cfa;
+ delete cfb;
+}
+/*
+ * 1) use prepare to keep first log around to determine starting sequence
+ * during recovery.
+ * 2) insert many values, skipping wal, to increase seqid.
+ * 3) insert final value into wal
+ * 4) recover and see that final value was properly recovered - not
+ * hidden behind improperly summed sequence ids
+ */
+TEST_P(TransactionTest, TwoPhaseOutOfOrderDelete) {
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ WriteOptions wal_on, wal_off;
+ wal_on.sync = true;
+ wal_on.disableWAL = false;
+ wal_off.disableWAL = true;
+ ReadOptions read_options;
+ TransactionOptions txn_options;
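+  // The wal_off writes below bypass the WAL but still consume sequence
+  // numbers, while the prepared txn1 keeps the first log alive; the test
+  // then checks that recovery still surfaces the final wal_on write
+  // ("cats" -> "dogs4") rather than hiding it behind a miscounted sequence.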
+
+ std::string value;
+ Status s;
+
+ Transaction* txn1 = db->BeginTransaction(wal_on, txn_options);
+
+ s = txn1->SetName("1");
+ ASSERT_OK(s);
+
+ s = db->Put(wal_on, "first", "first");
+ ASSERT_OK(s);
+
+ s = txn1->Put(Slice("dummy"), Slice("dummy"));
+ ASSERT_OK(s);
+ s = txn1->Prepare();
+ ASSERT_OK(s);
+
+ s = db->Put(wal_off, "cats", "dogs1");
+ ASSERT_OK(s);
+ s = db->Put(wal_off, "cats", "dogs2");
+ ASSERT_OK(s);
+ s = db->Put(wal_off, "cats", "dogs3");
+ ASSERT_OK(s);
+
+ s = db_impl->TEST_FlushMemTable(true);
+ ASSERT_OK(s);
+
+ s = db->Put(wal_on, "cats", "dogs4");
+ ASSERT_OK(s);
+
+ ASSERT_OK(db->FlushWAL(false));
+
+ // kill and reopen
+ env->SetFilesystemActive(false);
+ reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
+ ASSERT_OK(ReOpenNoDelete());
+ assert(db != nullptr);
+
+ s = db->Get(read_options, "first", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "first");
+
+ s = db->Get(read_options, "cats", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "dogs4");
+}
+
+TEST_P(TransactionTest, FirstWriteTest) {
+ WriteOptions write_options;
+
+ // Test conflict checking against the very first write to a db.
+ // The transaction's snapshot will have seq 1 and the following write
+ // will have sequence 1.
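+  // Because the snapshot below is taken after that write, txn->Put("A") sees
+  // no conflict here; FirstWriteTest2 takes the snapshot first and expects
+  // Status::Busy instead.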
+ Status s = db->Put(write_options, "A", "a");
+
+ Transaction* txn = db->BeginTransaction(write_options);
+ txn->SetSnapshot();
+
+ ASSERT_OK(s);
+
+ s = txn->Put("A", "b");
+ ASSERT_OK(s);
+
+ delete txn;
+}
+
+TEST_P(TransactionTest, FirstWriteTest2) {
+ WriteOptions write_options;
+
+ Transaction* txn = db->BeginTransaction(write_options);
+ txn->SetSnapshot();
+
+ // Test conflict checking against the very first write to a db.
+  // The transaction's snapshot will have seq 0 while the following write
+ // will have sequence 1.
+ Status s = db->Put(write_options, "A", "a");
+ ASSERT_OK(s);
+
+ s = txn->Put("A", "b");
+ ASSERT_TRUE(s.IsBusy());
+
+ delete txn;
+}
+
+TEST_P(TransactionTest, WriteOptionsTest) {
+ WriteOptions write_options;
+ write_options.sync = true;
+ write_options.disableWAL = true;
+
+ Transaction* txn = db->BeginTransaction(write_options);
+ ASSERT_TRUE(txn);
+
+ ASSERT_TRUE(txn->GetWriteOptions()->sync);
+
+ write_options.sync = false;
+ txn->SetWriteOptions(write_options);
+ ASSERT_FALSE(txn->GetWriteOptions()->sync);
+ ASSERT_TRUE(txn->GetWriteOptions()->disableWAL);
+
+ delete txn;
+}
+
+TEST_P(TransactionTest, WriteConflictTest) {
+ WriteOptions write_options;
+ ReadOptions read_options;
+ std::string value;
+ Status s;
+
+ ASSERT_OK(db->Put(write_options, "foo", "A"));
+ ASSERT_OK(db->Put(write_options, "foo2", "B"));
+
+ Transaction* txn = db->BeginTransaction(write_options);
+ ASSERT_TRUE(txn);
+
+ s = txn->Put("foo", "A2");
+ ASSERT_OK(s);
+
+ s = txn->Put("foo2", "B2");
+ ASSERT_OK(s);
+
+ // This Put outside of a transaction will conflict with the previous write
+ s = db->Put(write_options, "foo", "xxx");
+ ASSERT_TRUE(s.IsTimedOut());
+
+ s = db->Get(read_options, "foo", &value);
+ ASSERT_EQ(value, "A");
+
+ s = txn->Commit();
+ ASSERT_OK(s);
+
+ db->Get(read_options, "foo", &value);
+ ASSERT_EQ(value, "A2");
+ db->Get(read_options, "foo2", &value);
+ ASSERT_EQ(value, "B2");
+
+ delete txn;
+}
+
+TEST_P(TransactionTest, WriteConflictTest2) {
+ WriteOptions write_options;
+ ReadOptions read_options;
+ TransactionOptions txn_options;
+ std::string value;
+ Status s;
+
+ ASSERT_OK(db->Put(write_options, "foo", "bar"));
+
+ txn_options.set_snapshot = true;
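+  // set_snapshot takes the conflict-checking snapshot at BeginTransaction,
+  // so the external Put to "foo" below counts as a later write and the txn's
+  // own Put to "foo" fails with Busy, while its writes to other keys succeed.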
+ Transaction* txn = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn);
+
+ // This Put outside of a transaction will conflict with a later write
+ s = db->Put(write_options, "foo", "barz");
+ ASSERT_OK(s);
+
+ s = txn->Put("foo2", "X");
+ ASSERT_OK(s);
+
+ s = txn->Put("foo",
+ "bar2"); // Conflicts with write done after snapshot taken
+ ASSERT_TRUE(s.IsBusy());
+
+ s = txn->Put("foo3", "Y");
+ ASSERT_OK(s);
+
+ s = db->Get(read_options, "foo", &value);
+ ASSERT_EQ(value, "barz");
+
+ ASSERT_EQ(2, txn->GetNumKeys());
+
+ s = txn->Commit();
+ ASSERT_OK(s); // Txn should commit, but only write foo2 and foo3
+
+ // Verify that transaction wrote foo2 and foo3 but not foo
+ db->Get(read_options, "foo", &value);
+ ASSERT_EQ(value, "barz");
+
+ db->Get(read_options, "foo2", &value);
+ ASSERT_EQ(value, "X");
+
+ db->Get(read_options, "foo3", &value);
+ ASSERT_EQ(value, "Y");
+
+ delete txn;
+}
+
+TEST_P(TransactionTest, ReadConflictTest) {
+ WriteOptions write_options;
+ ReadOptions read_options, snapshot_read_options;
+ TransactionOptions txn_options;
+ std::string value;
+ Status s;
+
+ ASSERT_OK(db->Put(write_options, "foo", "bar"));
+ ASSERT_OK(db->Put(write_options, "foo2", "bar"));
+
+ txn_options.set_snapshot = true;
+ Transaction* txn = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn);
+
+ txn->SetSnapshot();
+ snapshot_read_options.snapshot = txn->GetSnapshot();
+
+ ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
+ ASSERT_EQ(value, "bar");
+
+ // This Put outside of a transaction will conflict with the previous read
+ s = db->Put(write_options, "foo", "barz");
+ ASSERT_TRUE(s.IsTimedOut());
+
+ s = db->Get(read_options, "foo", &value);
+ ASSERT_EQ(value, "bar");
+
+ s = txn->Get(read_options, "foo", &value);
+ ASSERT_EQ(value, "bar");
+
+ s = txn->Commit();
+ ASSERT_OK(s);
+
+ delete txn;
+}
+
+TEST_P(TransactionTest, TxnOnlyTest) {
+ // Test to make sure transactions work when there are no other writes in an
+ // empty db.
+
+ WriteOptions write_options;
+ ReadOptions read_options;
+ std::string value;
+ Status s;
+
+ Transaction* txn = db->BeginTransaction(write_options);
+ ASSERT_TRUE(txn);
+
+ s = txn->Put("x", "y");
+ ASSERT_OK(s);
+
+ s = txn->Commit();
+ ASSERT_OK(s);
+
+ delete txn;
+}
+
+TEST_P(TransactionTest, FlushTest) {
+ WriteOptions write_options;
+ ReadOptions read_options, snapshot_read_options;
+ std::string value;
+ Status s;
+
+ ASSERT_OK(db->Put(write_options, Slice("foo"), Slice("bar")));
+ ASSERT_OK(db->Put(write_options, Slice("foo2"), Slice("bar")));
+
+ Transaction* txn = db->BeginTransaction(write_options);
+ ASSERT_TRUE(txn);
+
+ snapshot_read_options.snapshot = txn->GetSnapshot();
+
+ ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
+ ASSERT_EQ(value, "bar");
+
+ s = txn->Put(Slice("foo"), Slice("bar2"));
+ ASSERT_OK(s);
+
+ ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
+ ASSERT_EQ(value, "bar2");
+
+ // Put a random key so we have a memtable to flush
+ s = db->Put(write_options, "dummy", "dummy");
+ ASSERT_OK(s);
+
+ // force a memtable flush
+ FlushOptions flush_ops;
+ db->Flush(flush_ops);
+
+ s = txn->Commit();
+ // txn should commit since the flushed table is still in MemtableList History
+ ASSERT_OK(s);
+
+ db->Get(read_options, "foo", &value);
+ ASSERT_EQ(value, "bar2");
+
+ delete txn;
+}
+
+TEST_P(TransactionTest, FlushTest2) {
+ const size_t num_tests = 3;
+
+ for (size_t n = 0; n < num_tests; n++) {
+ // Test different table factories
+ switch (n) {
+ case 0:
+ break;
+ case 1:
+ options.table_factory.reset(new mock::MockTableFactory());
+ break;
+ case 2: {
+ PlainTableOptions pt_opts;
+ pt_opts.hash_table_ratio = 0;
+ options.table_factory.reset(NewPlainTableFactory(pt_opts));
+ break;
+ }
+ }
+
+ Status s = ReOpen();
+ ASSERT_OK(s);
+ assert(db != nullptr);
+
+ WriteOptions write_options;
+ ReadOptions read_options, snapshot_read_options;
+ TransactionOptions txn_options;
+ std::string value;
+
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+
+ ASSERT_OK(db->Put(write_options, Slice("foo"), Slice("bar")));
+ ASSERT_OK(db->Put(write_options, Slice("foo2"), Slice("bar2")));
+ ASSERT_OK(db->Put(write_options, Slice("foo3"), Slice("bar3")));
+
+ txn_options.set_snapshot = true;
+ Transaction* txn = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn);
+
+ snapshot_read_options.snapshot = txn->GetSnapshot();
+
+ ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
+ ASSERT_EQ(value, "bar");
+
+ s = txn->Put(Slice("foo"), Slice("bar2"));
+ ASSERT_OK(s);
+
+ ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
+ ASSERT_EQ(value, "bar2");
+ // verify foo is locked by txn
+ s = db->Delete(write_options, "foo");
+ ASSERT_TRUE(s.IsTimedOut());
+
+ s = db->Put(write_options, "Z", "z");
+ ASSERT_OK(s);
+ s = db->Put(write_options, "dummy", "dummy");
+ ASSERT_OK(s);
+
+ s = db->Put(write_options, "S", "s");
+ ASSERT_OK(s);
+ s = db->SingleDelete(write_options, "S");
+ ASSERT_OK(s);
+
+ s = txn->Delete("S");
+ // Should fail after encountering a write to S in memtable
+ ASSERT_TRUE(s.IsBusy());
+
+ // force a memtable flush
+ s = db_impl->TEST_FlushMemTable(true);
+ ASSERT_OK(s);
+
+ // Put a random key so we have a MemTable to flush
+ s = db->Put(write_options, "dummy", "dummy2");
+ ASSERT_OK(s);
+
+ // force a memtable flush
+ ASSERT_OK(db_impl->TEST_FlushMemTable(true));
+
+ s = db->Put(write_options, "dummy", "dummy3");
+ ASSERT_OK(s);
+
+ // force a memtable flush
+ // Since our test db has max_write_buffer_number=2, this flush will cause
+ // the first memtable to get purged from the MemtableList history.
+ ASSERT_OK(db_impl->TEST_FlushMemTable(true));
+
+ s = txn->Put("X", "Y");
+ // Should succeed after verifying there is no write to X in SST file
+ ASSERT_OK(s);
+
+ s = txn->Put("Z", "zz");
+ // Should fail after encountering a write to Z in SST file
+ ASSERT_TRUE(s.IsBusy());
+
+ s = txn->GetForUpdate(read_options, "foo2", &value);
+ // should succeed since key was written before txn started
+ ASSERT_OK(s);
+ // verify foo2 is locked by txn
+ s = db->Delete(write_options, "foo2");
+ ASSERT_TRUE(s.IsTimedOut());
+
+ s = txn->Delete("S");
+ // Should fail after encountering a write to S in SST file
+ ASSERT_TRUE(s.IsBusy());
+
+ // Write a bunch of keys to db to force a compaction
+ Random rnd(47);
+ for (int i = 0; i < 1000; i++) {
+ s = db->Put(write_options, std::to_string(i),
+ test::CompressibleString(&rnd, 0.8, 100, &value));
+ ASSERT_OK(s);
+ }
+
+ s = txn->Put("X", "yy");
+ // Should succeed after verifying there is no write to X in SST file
+ ASSERT_OK(s);
+
+ s = txn->Put("Z", "zzz");
+ // Should fail after encountering a write to Z in SST file
+ ASSERT_TRUE(s.IsBusy());
+
+ s = txn->Delete("S");
+ // Should fail after encountering a write to S in SST file
+ ASSERT_TRUE(s.IsBusy());
+
+ s = txn->GetForUpdate(read_options, "foo3", &value);
+ // should succeed since key was written before txn started
+ ASSERT_OK(s);
+ // verify foo3 is locked by txn
+ s = db->Delete(write_options, "foo3");
+ ASSERT_TRUE(s.IsTimedOut());
+
+ ASSERT_OK(db_impl->TEST_WaitForCompact());
+
+ s = txn->Commit();
+ ASSERT_OK(s);
+
+ // Transaction should only write the keys that succeeded.
+ s = db->Get(read_options, "foo", &value);
+ ASSERT_EQ(value, "bar2");
+
+ s = db->Get(read_options, "X", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("yy", value);
+
+ s = db->Get(read_options, "Z", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("z", value);
+
+ delete txn;
+ }
+}
+
+TEST_P(TransactionTest, NoSnapshotTest) {
+ WriteOptions write_options;
+ ReadOptions read_options;
+ std::string value;
+ Status s;
+
+ ASSERT_OK(db->Put(write_options, "AAA", "bar"));
+
+ Transaction* txn = db->BeginTransaction(write_options);
+ ASSERT_TRUE(txn);
+
+ // Modify key after transaction start
+ ASSERT_OK(db->Put(write_options, "AAA", "bar1"));
+
+ // Read and write without a snap
+ ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value));
+ ASSERT_EQ(value, "bar1");
+ s = txn->Put("AAA", "bar2");
+ ASSERT_OK(s);
+
+ // Should commit since read/write was done after data changed
+ s = txn->Commit();
+ ASSERT_OK(s);
+
+ ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value));
+ ASSERT_EQ(value, "bar2");
+
+ delete txn;
+}
+
+TEST_P(TransactionTest, MultipleSnapshotTest) {
+ WriteOptions write_options;
+ ReadOptions read_options, snapshot_read_options;
+ std::string value;
+ Status s;
+
+ ASSERT_OK(db->Put(write_options, "AAA", "bar"));
+ ASSERT_OK(db->Put(write_options, "BBB", "bar"));
+ ASSERT_OK(db->Put(write_options, "CCC", "bar"));
+
+ Transaction* txn = db->BeginTransaction(write_options);
+ ASSERT_TRUE(txn);
+
+ ASSERT_OK(db->Put(write_options, "AAA", "bar1"));
+
+ // Read and write without a snapshot
+ ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value));
+ ASSERT_EQ(value, "bar1");
+ s = txn->Put("AAA", "bar2");
+ ASSERT_OK(s);
+
+ // Modify BBB before snapshot is taken
+ ASSERT_OK(db->Put(write_options, "BBB", "bar1"));
+
+ txn->SetSnapshot();
+ snapshot_read_options.snapshot = txn->GetSnapshot();
+
+ // Read and write with snapshot
+ ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "BBB", &value));
+ ASSERT_EQ(value, "bar1");
+ s = txn->Put("BBB", "bar2");
+ ASSERT_OK(s);
+
+ ASSERT_OK(db->Put(write_options, "CCC", "bar1"));
+
+ // Set a new snapshot
+ txn->SetSnapshot();
+ snapshot_read_options.snapshot = txn->GetSnapshot();
+
+ // Read and write with snapshot
+ ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "CCC", &value));
+ ASSERT_EQ(value, "bar1");
+ s = txn->Put("CCC", "bar2");
+ ASSERT_OK(s);
+
+ s = txn->GetForUpdate(read_options, "AAA", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "bar2");
+ s = txn->GetForUpdate(read_options, "BBB", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "bar2");
+ s = txn->GetForUpdate(read_options, "CCC", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "bar2");
+
+ s = db->Get(read_options, "AAA", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "bar1");
+ s = db->Get(read_options, "BBB", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "bar1");
+ s = db->Get(read_options, "CCC", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "bar1");
+
+ s = txn->Commit();
+ ASSERT_OK(s);
+
+ s = db->Get(read_options, "AAA", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "bar2");
+ s = db->Get(read_options, "BBB", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "bar2");
+ s = db->Get(read_options, "CCC", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "bar2");
+
+ // verify that we track multiple writes to the same key at different snapshots
+ delete txn;
+ txn = db->BeginTransaction(write_options);
+
+ // Potentially conflicting writes
+ ASSERT_OK(db->Put(write_options, "ZZZ", "zzz"));
+ ASSERT_OK(db->Put(write_options, "XXX", "xxx"));
+
+ txn->SetSnapshot();
+
+ TransactionOptions txn_options;
+ txn_options.set_snapshot = true;
+ Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+ txn2->SetSnapshot();
+
+ // This should not conflict in txn since the snapshot is later than the
+ // previous write (spoiler alert: it will later conflict with txn2).
+ s = txn->Put("ZZZ", "zzzz");
+ ASSERT_OK(s);
+
+ s = txn->Commit();
+ ASSERT_OK(s);
+
+ delete txn;
+
+ // This will conflict since the snapshot is earlier than another write to ZZZ
+ s = txn2->Put("ZZZ", "xxxxx");
+ ASSERT_TRUE(s.IsBusy());
+
+ s = txn2->Commit();
+ ASSERT_OK(s);
+
+ s = db->Get(read_options, "ZZZ", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "zzzz");
+
+ delete txn2;
+}
+
+TEST_P(TransactionTest, ColumnFamiliesTest) {
+ WriteOptions write_options;
+ ReadOptions read_options, snapshot_read_options;
+ TransactionOptions txn_options;
+ std::string value;
+ Status s;
+
+ ColumnFamilyHandle *cfa, *cfb;
+ ColumnFamilyOptions cf_options;
+
+ // Create 2 new column families
+ s = db->CreateColumnFamily(cf_options, "CFA", &cfa);
+ ASSERT_OK(s);
+ s = db->CreateColumnFamily(cf_options, "CFB", &cfb);
+ ASSERT_OK(s);
+
+ delete cfa;
+ delete cfb;
+ delete db;
+ db = nullptr;
+
+ // open DB with three column families
+ std::vector<ColumnFamilyDescriptor> column_families;
+ // have to open default column family
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions()));
+ // open the new column families
+ column_families.push_back(
+ ColumnFamilyDescriptor("CFA", ColumnFamilyOptions()));
+ column_families.push_back(
+ ColumnFamilyDescriptor("CFB", ColumnFamilyOptions()));
+
+ std::vector<ColumnFamilyHandle*> handles;
+
+ ASSERT_OK(ReOpenNoDelete(column_families, &handles));
+ assert(db != nullptr);
+
+ Transaction* txn = db->BeginTransaction(write_options);
+ ASSERT_TRUE(txn);
+
+ txn->SetSnapshot();
+ snapshot_read_options.snapshot = txn->GetSnapshot();
+
+ txn_options.set_snapshot = true;
+ Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn2);
+
+ // Write some data to the db
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("foo", "foo"));
+ ASSERT_OK(batch.Put(handles[1], "AAA", "bar"));
+ ASSERT_OK(batch.Put(handles[1], "AAAZZZ", "bar"));
+ s = db->Write(write_options, &batch);
+ ASSERT_OK(s);
+ ASSERT_OK(db->Delete(write_options, handles[1], "AAAZZZ"));
+
+ // These keys do not conflict with existing writes since they're in
+ // different column families
+ s = txn->Delete("AAA");
+ ASSERT_OK(s);
+ s = txn->GetForUpdate(snapshot_read_options, handles[1], "foo", &value);
+ ASSERT_TRUE(s.IsNotFound());
+ Slice key_slice("AAAZZZ");
+ Slice value_slices[2] = {Slice("bar"), Slice("bar")};
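+  // SliceParts concatenates its pieces, so this stores "barbar" under
+  // "AAAZZZ" in handles[2] (verified after the commit below).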
+ s = txn->Put(handles[2], SliceParts(&key_slice, 1),
+ SliceParts(value_slices, 2));
+ ASSERT_OK(s);
+ ASSERT_EQ(3, txn->GetNumKeys());
+
+ s = txn->Commit();
+ ASSERT_OK(s);
+ s = db->Get(read_options, "AAA", &value);
+ ASSERT_TRUE(s.IsNotFound());
+ s = db->Get(read_options, handles[2], "AAAZZZ", &value);
+ ASSERT_EQ(value, "barbar");
+
+ Slice key_slices[3] = {Slice("AAA"), Slice("ZZ"), Slice("Z")};
+ Slice value_slice("barbarbar");
+
+ s = txn2->Delete(handles[2], "XXX");
+ ASSERT_OK(s);
+ s = txn2->Delete(handles[1], "XXX");
+ ASSERT_OK(s);
+
+ // This write will cause a conflict with the earlier batch write
+ s = txn2->Put(handles[1], SliceParts(key_slices, 3),
+ SliceParts(&value_slice, 1));
+ ASSERT_TRUE(s.IsBusy());
+
+ s = txn2->Commit();
+ ASSERT_OK(s);
+  // In the above, the latest change to AAAZZZ in handles[1] is a delete.
+ s = db->Get(read_options, handles[1], "AAAZZZ", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ delete txn;
+ delete txn2;
+
+ txn = db->BeginTransaction(write_options, txn_options);
+ snapshot_read_options.snapshot = txn->GetSnapshot();
+
+ txn2 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn);
+  ASSERT_TRUE(txn2);
+
+ std::vector<ColumnFamilyHandle*> multiget_cfh = {handles[1], handles[2],
+ handles[0], handles[2]};
+ std::vector<Slice> multiget_keys = {"AAA", "AAAZZZ", "foo", "foo"};
+ std::vector<std::string> values(4);
+ std::vector<Status> results = txn->MultiGetForUpdate(
+ snapshot_read_options, multiget_cfh, multiget_keys, &values);
+ ASSERT_OK(results[0]);
+ ASSERT_OK(results[1]);
+ ASSERT_OK(results[2]);
+ ASSERT_TRUE(results[3].IsNotFound());
+ ASSERT_EQ(values[0], "bar");
+ ASSERT_EQ(values[1], "barbar");
+ ASSERT_EQ(values[2], "foo");
+
+ s = txn->SingleDelete(handles[2], "ZZZ");
+ ASSERT_OK(s);
+ s = txn->Put(handles[2], "ZZZ", "YYY");
+ ASSERT_OK(s);
+ s = txn->Put(handles[2], "ZZZ", "YYYY");
+ ASSERT_OK(s);
+ s = txn->Delete(handles[2], "ZZZ");
+ ASSERT_OK(s);
+ s = txn->Put(handles[2], "AAAZZZ", "barbarbar");
+ ASSERT_OK(s);
+
+ ASSERT_EQ(5, txn->GetNumKeys());
+
+ // Txn should commit
+ s = txn->Commit();
+ ASSERT_OK(s);
+ s = db->Get(read_options, handles[2], "ZZZ", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ // Put a key which will conflict with the next txn using the previous snapshot
+ ASSERT_OK(db->Put(write_options, handles[2], "foo", "000"));
+
+ results = txn2->MultiGetForUpdate(snapshot_read_options, multiget_cfh,
+ multiget_keys, &values);
+ // All results should fail since there was a conflict
+ ASSERT_TRUE(results[0].IsBusy());
+ ASSERT_TRUE(results[1].IsBusy());
+ ASSERT_TRUE(results[2].IsBusy());
+ ASSERT_TRUE(results[3].IsBusy());
+
+ s = db->Get(read_options, handles[2], "foo", &value);
+ ASSERT_EQ(value, "000");
+
+ s = txn2->Commit();
+ ASSERT_OK(s);
+
+ s = db->DropColumnFamily(handles[1]);
+ ASSERT_OK(s);
+ s = db->DropColumnFamily(handles[2]);
+ ASSERT_OK(s);
+
+ delete txn;
+ delete txn2;
+
+ for (auto handle : handles) {
+ delete handle;
+ }
+}
+
+TEST_P(TransactionTest, MultiGetBatchedTest) {
+ WriteOptions write_options;
+ ReadOptions read_options, snapshot_read_options;
+ TransactionOptions txn_options;
+ std::string value;
+ Status s;
+
+ ColumnFamilyHandle* cf;
+ ColumnFamilyOptions cf_options;
+
+  // Create a new column family
+ s = db->CreateColumnFamily(cf_options, "CF", &cf);
+ ASSERT_OK(s);
+
+ delete cf;
+ delete db;
+ db = nullptr;
+
+  // open DB with two column families
+ std::vector<ColumnFamilyDescriptor> column_families;
+ // have to open default column family
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions()));
+ // open the new column families
+ cf_options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ column_families.push_back(ColumnFamilyDescriptor("CF", cf_options));
+
+ std::vector<ColumnFamilyHandle*> handles;
+
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ ASSERT_OK(ReOpenNoDelete(column_families, &handles));
+ assert(db != nullptr);
+
+ // Write some data to the db
+ WriteBatch batch;
+ ASSERT_OK(batch.Put(handles[1], "aaa", "val1"));
+ ASSERT_OK(batch.Put(handles[1], "bbb", "val2"));
+ ASSERT_OK(batch.Put(handles[1], "ccc", "val3"));
+ ASSERT_OK(batch.Put(handles[1], "ddd", "foo"));
+ ASSERT_OK(batch.Put(handles[1], "eee", "val5"));
+ ASSERT_OK(batch.Put(handles[1], "fff", "val6"));
+ ASSERT_OK(batch.Merge(handles[1], "ggg", "foo"));
+ s = db->Write(write_options, &batch);
+ ASSERT_OK(s);
+
+ Transaction* txn = db->BeginTransaction(write_options);
+ ASSERT_TRUE(txn);
+
+ txn->SetSnapshot();
+ snapshot_read_options.snapshot = txn->GetSnapshot();
+
+ txn_options.set_snapshot = true;
+ // Write some data to the db
+ s = txn->Delete(handles[1], "bbb");
+ ASSERT_OK(s);
+ s = txn->Put(handles[1], "ccc", "val3_new");
+ ASSERT_OK(s);
+ s = txn->Merge(handles[1], "ddd", "bar");
+ ASSERT_OK(s);
+
+ std::vector<Slice> keys = {"aaa", "bbb", "ccc", "ddd", "eee", "fff", "ggg"};
+ std::vector<PinnableSlice> values(keys.size());
+ std::vector<Status> statuses(keys.size());
+
+ txn->MultiGet(snapshot_read_options, handles[1], keys.size(), keys.data(),
+ values.data(), statuses.data());
+ ASSERT_TRUE(statuses[0].ok());
+ ASSERT_EQ(values[0], "val1");
+ ASSERT_TRUE(statuses[1].IsNotFound());
+ ASSERT_TRUE(statuses[2].ok());
+ ASSERT_EQ(values[2], "val3_new");
+ ASSERT_TRUE(statuses[3].ok());
+ ASSERT_EQ(values[3], "foo,bar");
+ ASSERT_TRUE(statuses[4].ok());
+ ASSERT_EQ(values[4], "val5");
+ ASSERT_TRUE(statuses[5].ok());
+ ASSERT_EQ(values[5], "val6");
+ ASSERT_TRUE(statuses[6].ok());
+ ASSERT_EQ(values[6], "foo");
+ delete txn;
+ for (auto handle : handles) {
+ delete handle;
+ }
+}
+
+// This test calls WriteBatchWithIndex::MultiGetFromBatchAndDB with a large
+// number of keys, i.e. greater than MultiGetContext::MAX_BATCH_SIZE, which
+// is 32. This forces autovector allocations in the MultiGet code paths
+// to use std::vector in addition to stack allocations. The MultiGet keys
+// include Merges, which are handled specially in MultiGetFromBatchAndDB by
+// allocating an autovector of MergeContexts.
+TEST_P(TransactionTest, MultiGetLargeBatchedTest) {
+ WriteOptions write_options;
+ ReadOptions read_options, snapshot_read_options;
+ std::string value;
+ Status s;
+
+ ColumnFamilyHandle* cf;
+ ColumnFamilyOptions cf_options;
+
+ std::vector<std::string> key_str;
+ for (int i = 0; i < 100; ++i) {
+ key_str.emplace_back(std::to_string(i));
+ }
+  // Create a new column family
+ s = db->CreateColumnFamily(cf_options, "CF", &cf);
+ ASSERT_OK(s);
+
+ delete cf;
+ delete db;
+ db = nullptr;
+
+  // open DB with two column families
+ std::vector<ColumnFamilyDescriptor> column_families;
+ // have to open default column family
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions()));
+ // open the new column families
+ cf_options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ column_families.push_back(ColumnFamilyDescriptor("CF", cf_options));
+
+ std::vector<ColumnFamilyHandle*> handles;
+
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ ASSERT_OK(ReOpenNoDelete(column_families, &handles));
+ assert(db != nullptr);
+
+ // Write some data to the db
+ WriteBatch batch;
+ for (int i = 0; i < 3 * MultiGetContext::MAX_BATCH_SIZE; ++i) {
+ std::string val = "val" + std::to_string(i);
+ ASSERT_OK(batch.Put(handles[1], key_str[i], val));
+ }
+ s = db->Write(write_options, &batch);
+ ASSERT_OK(s);
+
+ WriteBatchWithIndex wb;
+ // Write some data to the db
+ s = wb.Delete(handles[1], std::to_string(1));
+ ASSERT_OK(s);
+ s = wb.Put(handles[1], std::to_string(2), "new_val" + std::to_string(2));
+ ASSERT_OK(s);
+ // Write a lot of merges so when we call MultiGetFromBatchAndDB later on,
+ // it is forced to use std::vector in ROCKSDB_NAMESPACE::autovector to
+ // allocate MergeContexts. The number of merges needs to be >
+ // MultiGetContext::MAX_BATCH_SIZE
+ for (int i = 8; i < MultiGetContext::MAX_BATCH_SIZE + 24; ++i) {
+ s = wb.Merge(handles[1], std::to_string(i), "merge");
+ ASSERT_OK(s);
+ }
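+  // With MAX_BATCH_SIZE == 32, the loop above merges keys 8..55; the
+  // verification loop below expects the ",merge" suffix exactly for that
+  // range (8 <= i && i < 56).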
+
+ // MultiGet a lot of keys in order to force std::vector reallocations
+ std::vector<Slice> keys;
+ for (int i = 0; i < MultiGetContext::MAX_BATCH_SIZE + 32; ++i) {
+ keys.emplace_back(key_str[i]);
+ }
+ std::vector<PinnableSlice> values(keys.size());
+ std::vector<Status> statuses(keys.size());
+
+ wb.MultiGetFromBatchAndDB(db, snapshot_read_options, handles[1], keys.size(),
+ keys.data(), values.data(), statuses.data(), false);
+ for (size_t i = 0; i < keys.size(); ++i) {
+ if (i == 1) {
+ ASSERT_TRUE(statuses[1].IsNotFound());
+ } else if (i == 2) {
+ ASSERT_TRUE(statuses[2].ok());
+ ASSERT_EQ(values[2], "new_val" + std::to_string(2));
+ } else if (i >= 8 && i < 56) {
+ ASSERT_TRUE(statuses[i].ok());
+ ASSERT_EQ(values[i], "val" + std::to_string(i) + ",merge");
+ } else {
+ ASSERT_TRUE(statuses[i].ok());
+ if (values[i] != "val" + std::to_string(i)) {
+ ASSERT_EQ(values[i], "val" + std::to_string(i));
+ }
+ }
+ }
+
+ for (auto handle : handles) {
+ delete handle;
+ }
+}
+
+TEST_P(TransactionTest, MultiGetSnapshot) {
+ WriteOptions write_options;
+ TransactionOptions transaction_options;
+ Transaction* txn1 = db->BeginTransaction(write_options, transaction_options);
+
+ Slice key = "foo";
+
+ Status s = txn1->Put(key, "bar");
+ ASSERT_OK(s);
+
+ s = txn1->SetName("test");
+ ASSERT_OK(s);
+
+ s = txn1->Prepare();
+ ASSERT_OK(s);
+
+ // Get snapshot between prepare and commit
+ // Un-committed data should be invisible to other transactions
+ const Snapshot* s1 = db->GetSnapshot();
+
+ s = txn1->Commit();
+ ASSERT_OK(s);
+ delete txn1;
+
+ Transaction* txn2 = db->BeginTransaction(write_options, transaction_options);
+ ReadOptions read_options;
+ read_options.snapshot = s1;
+
+ std::vector<Slice> keys;
+ std::vector<PinnableSlice> values(1);
+ std::vector<Status> statuses(1);
+ keys.push_back(key);
+ auto cfd = db->DefaultColumnFamily();
+ txn2->MultiGet(read_options, cfd, 1, keys.data(), values.data(),
+ statuses.data());
+ ASSERT_TRUE(statuses[0].IsNotFound());
+ delete txn2;
+
+ db->ReleaseSnapshot(s1);
+}
+
+TEST_P(TransactionTest, ColumnFamiliesTest2) {
+ WriteOptions write_options;
+ ReadOptions read_options, snapshot_read_options;
+ std::string value;
+ Status s;
+
+ ColumnFamilyHandle *one, *two;
+ ColumnFamilyOptions cf_options;
+
+ // Create 2 new column families
+ s = db->CreateColumnFamily(cf_options, "ONE", &one);
+ ASSERT_OK(s);
+ s = db->CreateColumnFamily(cf_options, "TWO", &two);
+ ASSERT_OK(s);
+
+ Transaction* txn1 = db->BeginTransaction(write_options);
+ ASSERT_TRUE(txn1);
+ Transaction* txn2 = db->BeginTransaction(write_options);
+ ASSERT_TRUE(txn2);
+
+ s = txn1->Put(one, "X", "1");
+ ASSERT_OK(s);
+ s = txn1->Put(two, "X", "2");
+ ASSERT_OK(s);
+ s = txn1->Put("X", "0");
+ ASSERT_OK(s);
+
+ s = txn2->Put(one, "X", "11");
+ ASSERT_TRUE(s.IsTimedOut());
+
+ s = txn1->Commit();
+ ASSERT_OK(s);
+
+ // Drop first column family
+ s = db->DropColumnFamily(one);
+ ASSERT_OK(s);
+
+  // txn2 has no tracked writes (its earlier Put timed out), so Commit
+  // succeeds even though the column family was dropped.
+ s = txn2->Commit();
+ ASSERT_OK(s);
+
+ delete txn1;
+ txn1 = db->BeginTransaction(write_options);
+ ASSERT_TRUE(txn1);
+
+ // Should fail since column family was dropped
+ s = txn1->Put(one, "X", "111");
+ ASSERT_TRUE(s.IsInvalidArgument());
+
+ s = txn1->Put(two, "X", "222");
+ ASSERT_OK(s);
+
+ s = txn1->Put("X", "000");
+ ASSERT_OK(s);
+
+ s = txn1->Commit();
+ ASSERT_OK(s);
+
+ s = db->Get(read_options, two, "X", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("222", value);
+
+ s = db->Get(read_options, "X", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("000", value);
+
+ s = db->DropColumnFamily(two);
+ ASSERT_OK(s);
+
+ delete txn1;
+ delete txn2;
+
+ delete one;
+ delete two;
+}
+
+TEST_P(TransactionTest, EmptyTest) {
+ WriteOptions write_options;
+ ReadOptions read_options;
+ std::string value;
+ Status s;
+
+ s = db->Put(write_options, "aaa", "aaa");
+ ASSERT_OK(s);
+
+ Transaction* txn = db->BeginTransaction(write_options);
+ s = txn->Commit();
+ ASSERT_OK(s);
+ delete txn;
+
+ txn = db->BeginTransaction(write_options);
+ ASSERT_OK(txn->Rollback());
+ delete txn;
+
+ txn = db->BeginTransaction(write_options);
+ s = txn->GetForUpdate(read_options, "aaa", &value);
+ ASSERT_EQ(value, "aaa");
+
+ s = txn->Commit();
+ ASSERT_OK(s);
+ delete txn;
+
+ txn = db->BeginTransaction(write_options);
+ txn->SetSnapshot();
+
+ s = txn->GetForUpdate(read_options, "aaa", &value);
+ ASSERT_EQ(value, "aaa");
+
+ // Conflicts with previous GetForUpdate
+ s = db->Put(write_options, "aaa", "xxx");
+ ASSERT_TRUE(s.IsTimedOut());
+
+  // the conflicting Put above timed out, so the commit still succeeds
+ s = txn->Commit();
+ ASSERT_OK(s);
+ delete txn;
+}
+
+TEST_P(TransactionTest, PredicateManyPreceders) {
+ WriteOptions write_options;
+ ReadOptions read_options1, read_options2;
+ TransactionOptions txn_options;
+ std::string value;
+ Status s;
+
+ txn_options.set_snapshot = true;
+ Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+ read_options1.snapshot = txn1->GetSnapshot();
+
+ Transaction* txn2 = db->BeginTransaction(write_options);
+ txn2->SetSnapshot();
+ read_options2.snapshot = txn2->GetSnapshot();
+
+ std::vector<Slice> multiget_keys = {"1", "2", "3"};
+ std::vector<std::string> multiget_values;
+
+ std::vector<Status> results =
+ txn1->MultiGetForUpdate(read_options1, multiget_keys, &multiget_values);
+ ASSERT_EQ(results.size(), 3);
+ ASSERT_TRUE(results[0].IsNotFound());
+ ASSERT_TRUE(results[1].IsNotFound());
+ ASSERT_TRUE(results[2].IsNotFound());
+
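+  // MultiGetForUpdate locks the keys it reads, so txn2's write to "2" below
+  // times out while txn1 holds the lock.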
+  s = txn2->Put("2", "x");  // Conflicts with txn1's MultiGetForUpdate
+ ASSERT_TRUE(s.IsTimedOut());
+
+ ASSERT_OK(txn2->Rollback());
+
+ multiget_values.clear();
+ results =
+ txn1->MultiGetForUpdate(read_options1, multiget_keys, &multiget_values);
+ ASSERT_EQ(results.size(), 3);
+ ASSERT_TRUE(results[0].IsNotFound());
+ ASSERT_TRUE(results[1].IsNotFound());
+ ASSERT_TRUE(results[2].IsNotFound());
+
+ s = txn1->Commit();
+ ASSERT_OK(s);
+
+ delete txn1;
+ delete txn2;
+
+ txn1 = db->BeginTransaction(write_options, txn_options);
+ read_options1.snapshot = txn1->GetSnapshot();
+
+ txn2 = db->BeginTransaction(write_options, txn_options);
+ read_options2.snapshot = txn2->GetSnapshot();
+
+ s = txn1->Put("4", "x");
+ ASSERT_OK(s);
+
+ s = txn2->Delete("4"); // conflict
+ ASSERT_TRUE(s.IsTimedOut());
+
+ s = txn1->Commit();
+ ASSERT_OK(s);
+
+ s = txn2->GetForUpdate(read_options2, "4", &value);
+ ASSERT_TRUE(s.IsBusy());
+
+ ASSERT_OK(txn2->Rollback());
+
+ delete txn1;
+ delete txn2;
+}
+
+TEST_P(TransactionTest, LostUpdate) {
+ WriteOptions write_options;
+ ReadOptions read_options, read_options1, read_options2;
+ TransactionOptions txn_options;
+ std::string value;
+ Status s;
+
+ // Test 2 transactions writing to the same key in multiple orders and
+ // with/without snapshots
+
+ Transaction* txn1 = db->BeginTransaction(write_options);
+ Transaction* txn2 = db->BeginTransaction(write_options);
+
+ s = txn1->Put("1", "1");
+ ASSERT_OK(s);
+
+ s = txn2->Put("1", "2"); // conflict
+ ASSERT_TRUE(s.IsTimedOut());
+
+ s = txn2->Commit();
+ ASSERT_OK(s);
+
+ s = txn1->Commit();
+ ASSERT_OK(s);
+
+ s = db->Get(read_options, "1", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("1", value);
+
+ delete txn1;
+ delete txn2;
+
+ txn_options.set_snapshot = true;
+ txn1 = db->BeginTransaction(write_options, txn_options);
+ read_options1.snapshot = txn1->GetSnapshot();
+
+ txn2 = db->BeginTransaction(write_options, txn_options);
+ read_options2.snapshot = txn2->GetSnapshot();
+
+ s = txn1->Put("1", "3");
+ ASSERT_OK(s);
+ s = txn2->Put("1", "4"); // conflict
+ ASSERT_TRUE(s.IsTimedOut());
+
+ s = txn1->Commit();
+ ASSERT_OK(s);
+
+ s = txn2->Commit();
+ ASSERT_OK(s);
+
+ s = db->Get(read_options, "1", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("3", value);
+
+ delete txn1;
+ delete txn2;
+
+ txn1 = db->BeginTransaction(write_options, txn_options);
+ read_options1.snapshot = txn1->GetSnapshot();
+
+ txn2 = db->BeginTransaction(write_options, txn_options);
+ read_options2.snapshot = txn2->GetSnapshot();
+
+ s = txn1->Put("1", "5");
+ ASSERT_OK(s);
+
+ s = txn1->Commit();
+ ASSERT_OK(s);
+
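+  // txn1 has already committed and released its lock on "1", but that write
+  // happened after txn2's snapshot, so snapshot validation fails with Busy.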
+ s = txn2->Put("1", "6");
+ ASSERT_TRUE(s.IsBusy());
+ s = txn2->Commit();
+ ASSERT_OK(s);
+
+ s = db->Get(read_options, "1", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("5", value);
+
+ delete txn1;
+ delete txn2;
+
+ txn1 = db->BeginTransaction(write_options, txn_options);
+ read_options1.snapshot = txn1->GetSnapshot();
+
+ txn2 = db->BeginTransaction(write_options, txn_options);
+ read_options2.snapshot = txn2->GetSnapshot();
+
+ s = txn1->Put("1", "7");
+ ASSERT_OK(s);
+ s = txn1->Commit();
+ ASSERT_OK(s);
+
+ txn2->SetSnapshot();
+ s = txn2->Put("1", "8");
+ ASSERT_OK(s);
+ s = txn2->Commit();
+ ASSERT_OK(s);
+
+ s = db->Get(read_options, "1", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("8", value);
+
+ delete txn1;
+ delete txn2;
+
+ txn1 = db->BeginTransaction(write_options);
+ txn2 = db->BeginTransaction(write_options);
+
+ s = txn1->Put("1", "9");
+ ASSERT_OK(s);
+ s = txn1->Commit();
+ ASSERT_OK(s);
+
+ s = txn2->Put("1", "10");
+ ASSERT_OK(s);
+ s = txn2->Commit();
+ ASSERT_OK(s);
+
+ delete txn1;
+ delete txn2;
+
+ s = db->Get(read_options, "1", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "10");
+}
+
+TEST_P(TransactionTest, UntrackedWrites) {
+ if (txn_db_options.write_policy == WRITE_UNPREPARED) {
+ // TODO(lth): For WriteUnprepared, validate that untracked writes are
+ // not supported.
+ return;
+ }
+
+ WriteOptions write_options;
+ ReadOptions read_options;
+ std::string value;
+ Status s;
+
+ // Verify transaction rollback works for untracked keys.
+ Transaction* txn = db->BeginTransaction(write_options);
+ txn->SetSnapshot();
+
+ s = txn->PutUntracked("untracked", "0");
+ ASSERT_OK(s);
+ ASSERT_OK(txn->Rollback());
+ s = db->Get(read_options, "untracked", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ delete txn;
+ txn = db->BeginTransaction(write_options);
+ txn->SetSnapshot();
+
+ s = db->Put(write_options, "untracked", "x");
+ ASSERT_OK(s);
+
+ // Untracked writes should succeed even though key was written after snapshot
+ s = txn->PutUntracked("untracked", "1");
+ ASSERT_OK(s);
+ s = txn->MergeUntracked("untracked", "2");
+ ASSERT_OK(s);
+ s = txn->DeleteUntracked("untracked");
+ ASSERT_OK(s);
+
+  // Conflict: a tracked Put validates against the txn's snapshot, and this
+  // key was written after that snapshot was taken
+ s = txn->Put("untracked", "3");
+ ASSERT_TRUE(s.IsBusy());
+
+ s = txn->Commit();
+ ASSERT_OK(s);
+
+ s = db->Get(read_options, "untracked", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ delete txn;
+}
+
+TEST_P(TransactionTest, ExpiredTransaction) {
+ WriteOptions write_options;
+ ReadOptions read_options;
+ TransactionOptions txn_options;
+ std::string value;
+ Status s;
+
+  // Set txn expiration timeout to 0 milliseconds (expires instantly)
+ txn_options.expiration = 0;
+ Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+
+ s = txn1->Put("X", "1");
+ ASSERT_OK(s);
+
+ s = txn1->Put("Y", "1");
+ ASSERT_OK(s);
+
+ Transaction* txn2 = db->BeginTransaction(write_options);
+
+ // txn2 should be able to write to X since txn1 has expired
+ s = txn2->Put("X", "2");
+ ASSERT_OK(s);
+
+ s = txn2->Commit();
+ ASSERT_OK(s);
+ s = db->Get(read_options, "X", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("2", value);
+
+ s = txn1->Put("Z", "1");
+ ASSERT_OK(s);
+
+ // txn1 should fail to commit since it is expired
+ s = txn1->Commit();
+ ASSERT_TRUE(s.IsExpired());
+
+ s = db->Get(read_options, "Y", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = db->Get(read_options, "Z", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ delete txn1;
+ delete txn2;
+}
+
+TEST_P(TransactionTest, ReinitializeTest) {
+ WriteOptions write_options;
+ ReadOptions read_options;
+ TransactionOptions txn_options;
+ std::string value;
+ Status s;
+
+  // Set txn expiration timeout to 0 milliseconds (expires instantly)
+ txn_options.expiration = 0;
+ Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+
+  // Reinitialize the transaction so that it no longer expires
+ txn_options.expiration = -1;
+ txn1 = db->BeginTransaction(write_options, txn_options, txn1);
+
+ s = txn1->Put("Z", "z");
+ ASSERT_OK(s);
+
+ // Should commit since not expired
+ s = txn1->Commit();
+ ASSERT_OK(s);
+
+ txn1 = db->BeginTransaction(write_options, txn_options, txn1);
+
+ s = txn1->Put("Z", "zz");
+ ASSERT_OK(s);
+
+  // Reinitialize txn1 and verify that Z gets unlocked
+ txn1 = db->BeginTransaction(write_options, txn_options, txn1);
+
+ Transaction* txn2 = db->BeginTransaction(write_options, txn_options, nullptr);
+ s = txn2->Put("Z", "zzz");
+ ASSERT_OK(s);
+ s = txn2->Commit();
+ ASSERT_OK(s);
+ delete txn2;
+
+ s = db->Get(read_options, "Z", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "zzz");
+
+ // Verify snapshots get reinitialized correctly
+ txn1->SetSnapshot();
+ s = txn1->Put("Z", "zzzz");
+ ASSERT_OK(s);
+
+ s = txn1->Commit();
+ ASSERT_OK(s);
+
+ s = db->Get(read_options, "Z", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "zzzz");
+
+ txn1 = db->BeginTransaction(write_options, txn_options, txn1);
+ const Snapshot* snapshot = txn1->GetSnapshot();
+ ASSERT_FALSE(snapshot);
+
+ txn_options.set_snapshot = true;
+ txn1 = db->BeginTransaction(write_options, txn_options, txn1);
+ snapshot = txn1->GetSnapshot();
+ ASSERT_TRUE(snapshot);
+
+ s = txn1->Put("Z", "a");
+ ASSERT_OK(s);
+
+ ASSERT_OK(txn1->Rollback());
+
+ s = txn1->Put("Y", "y");
+ ASSERT_OK(s);
+
+ txn_options.set_snapshot = false;
+ txn1 = db->BeginTransaction(write_options, txn_options, txn1);
+ snapshot = txn1->GetSnapshot();
+ ASSERT_FALSE(snapshot);
+
+ s = txn1->Put("X", "x");
+ ASSERT_OK(s);
+
+ s = txn1->Commit();
+ ASSERT_OK(s);
+
+ s = db->Get(read_options, "Z", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "zzzz");
+
+ s = db->Get(read_options, "Y", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ txn1 = db->BeginTransaction(write_options, txn_options, txn1);
+
+ s = txn1->SetName("name");
+ ASSERT_OK(s);
+
+ s = txn1->Prepare();
+ ASSERT_OK(s);
+ s = txn1->Commit();
+ ASSERT_OK(s);
+
+ txn1 = db->BeginTransaction(write_options, txn_options, txn1);
+
+ s = txn1->SetName("name");
+ ASSERT_OK(s);
+
+ delete txn1;
+}
+
+TEST_P(TransactionTest, Rollback) {
+ WriteOptions write_options;
+ ReadOptions read_options;
+ TransactionOptions txn_options;
+ std::string value;
+ Status s;
+
+ Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+
+  ASSERT_TRUE(txn1);
+
+ s = txn1->Put("X", "1");
+ ASSERT_OK(s);
+
+ Transaction* txn2 = db->BeginTransaction(write_options);
+
+ // txn2 should not be able to write to X since txn1 has it locked
+ s = txn2->Put("X", "2");
+ ASSERT_TRUE(s.IsTimedOut());
+
+ ASSERT_OK(txn1->Rollback());
+ delete txn1;
+
+ // txn2 should now be able to write to X
+ s = txn2->Put("X", "3");
+ ASSERT_OK(s);
+
+ s = txn2->Commit();
+ ASSERT_OK(s);
+
+ s = db->Get(read_options, "X", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("3", value);
+
+ delete txn2;
+}
+
+TEST_P(TransactionTest, LockLimitTest) {
+ WriteOptions write_options;
+ ReadOptions read_options, snapshot_read_options;
+ TransactionOptions txn_options;
+ std::string value;
+ Status s;
+
+ delete db;
+ db = nullptr;
+
+ // Open DB with a lock limit of 3
+ txn_db_options.max_num_locks = 3;
+ ASSERT_OK(ReOpen());
+ assert(db != nullptr);
+ ASSERT_OK(s);
+
+ // Create a txn and verify we can only lock up to 3 keys
+ Transaction* txn = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn);
+
+ s = txn->Put("X", "x");
+ ASSERT_OK(s);
+
+ s = txn->Put("Y", "y");
+ ASSERT_OK(s);
+
+ s = txn->Put("Z", "z");
+ ASSERT_OK(s);
+
+ // lock limit reached
+ s = txn->Put("W", "w");
+ ASSERT_TRUE(s.IsBusy());
+
+ // re-locking same key shouldn't put us over the limit
+ s = txn->Put("X", "xx");
+ ASSERT_OK(s);
+
+ s = txn->GetForUpdate(read_options, "W", &value);
+ ASSERT_TRUE(s.IsBusy());
+ s = txn->GetForUpdate(read_options, "V", &value);
+ ASSERT_TRUE(s.IsBusy());
+
+ // re-locking same key shouldn't put us over the limit
+ s = txn->GetForUpdate(read_options, "Y", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("y", value);
+
+ s = txn->Get(read_options, "W", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn2);
+
+ // "X" currently locked
+ s = txn2->Put("X", "x");
+ ASSERT_TRUE(s.IsTimedOut());
+
+  // lock limit reached: max_num_locks applies across all transactions, and
+  // txn already holds all three locks
+ s = txn2->Put("M", "m");
+ ASSERT_TRUE(s.IsBusy());
+
+ s = txn->Commit();
+ ASSERT_OK(s);
+
+ s = db->Get(read_options, "X", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("xx", value);
+
+ s = db->Get(read_options, "W", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ // Committing txn should release its locks and allow txn2 to proceed
+ s = txn2->Put("X", "x2");
+ ASSERT_OK(s);
+
+ s = txn2->Delete("X");
+ ASSERT_OK(s);
+
+ s = txn2->Put("M", "m");
+ ASSERT_OK(s);
+
+ s = txn2->Put("Z", "z2");
+ ASSERT_OK(s);
+
+ // lock limit reached
+ s = txn2->Delete("Y");
+ ASSERT_TRUE(s.IsBusy());
+
+ s = txn2->Commit();
+ ASSERT_OK(s);
+
+ s = db->Get(read_options, "Z", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("z2", value);
+
+ s = db->Get(read_options, "Y", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("y", value);
+
+ s = db->Get(read_options, "X", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ delete txn;
+ delete txn2;
+}
+
+TEST_P(TransactionTest, IteratorTest) {
+  // This test does writes without snapshot validation, and then tries to
+  // create an iterator later, which is unsupported in write unprepared.
+ if (txn_db_options.write_policy == WRITE_UNPREPARED) {
+ return;
+ }
+
+ WriteOptions write_options;
+ ReadOptions read_options, snapshot_read_options;
+ std::string value;
+ Status s;
+
+ // Write some keys to the db
+ s = db->Put(write_options, "A", "a");
+ ASSERT_OK(s);
+
+ s = db->Put(write_options, "G", "g");
+ ASSERT_OK(s);
+
+ s = db->Put(write_options, "F", "f");
+ ASSERT_OK(s);
+
+ s = db->Put(write_options, "C", "c");
+ ASSERT_OK(s);
+
+ s = db->Put(write_options, "D", "d");
+ ASSERT_OK(s);
+
+ Transaction* txn = db->BeginTransaction(write_options);
+ ASSERT_TRUE(txn);
+
+ // Write some keys in a txn
+ s = txn->Put("B", "b");
+ ASSERT_OK(s);
+
+ s = txn->Put("H", "h");
+ ASSERT_OK(s);
+
+ s = txn->Delete("D");
+ ASSERT_OK(s);
+
+ s = txn->Put("E", "e");
+ ASSERT_OK(s);
+
+ txn->SetSnapshot();
+ const Snapshot* snapshot = txn->GetSnapshot();
+
+ // Write some keys to the db after the snapshot
+ s = db->Put(write_options, "BB", "xx");
+ ASSERT_OK(s);
+
+ s = db->Put(write_options, "C", "xx");
+ ASSERT_OK(s);
+
+ read_options.snapshot = snapshot;
+ Iterator* iter = txn->GetIterator(read_options);
+ ASSERT_OK(iter->status());
+ iter->SeekToFirst();
+
+ // Read all keys via iter and lock them all
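+  // The iterator merges the txn's own writes (B, E, H added, D deleted) with
+  // the DB contents as of the snapshot, so "BB" and the post-snapshot update
+  // to "C" are not visible.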
+ std::string results[] = {"a", "b", "c", "e", "f", "g", "h"};
+ for (int i = 0; i < 7; i++) {
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(results[i], iter->value().ToString());
+
+ s = txn->GetForUpdate(read_options, iter->key(), nullptr);
+ if (i == 2) {
+ // "C" was modified after txn's snapshot
+ ASSERT_TRUE(s.IsBusy());
+ } else {
+ ASSERT_OK(s);
+ }
+
+ iter->Next();
+ }
+ ASSERT_FALSE(iter->Valid());
+
+ iter->Seek("G");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("g", iter->value().ToString());
+
+ iter->Prev();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("f", iter->value().ToString());
+
+ iter->Seek("D");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("e", iter->value().ToString());
+
+ iter->Seek("C");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("c", iter->value().ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("e", iter->value().ToString());
+
+ iter->Seek("");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("a", iter->value().ToString());
+
+ iter->Seek("X");
+ ASSERT_OK(iter->status());
+ ASSERT_FALSE(iter->Valid());
+
+ iter->SeekToLast();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("h", iter->value().ToString());
+
+ s = txn->Commit();
+ ASSERT_OK(s);
+
+ delete iter;
+ delete txn;
+}
+
+TEST_P(TransactionTest, DisableIndexingTest) {
+  // Skip this test for write unprepared. It does not solely rely on WBWI for
+  // read-your-own-writes, so depending on whether batches are flushed or not,
+  // only some writes will be visible.
+ //
+ // Also, write unprepared does not support creating iterators if there has
+ // been txn->Put() without snapshot validation.
+ if (txn_db_options.write_policy == WRITE_UNPREPARED) {
+ return;
+ }
+
+ WriteOptions write_options;
+ ReadOptions read_options;
+ std::string value;
+ Status s;
+
+ Transaction* txn = db->BeginTransaction(write_options);
+ ASSERT_TRUE(txn);
+
+ s = txn->Put("A", "a");
+ ASSERT_OK(s);
+
+ s = txn->Get(read_options, "A", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("a", value);
+
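+  // With indexing disabled, subsequent writes still go into the write batch
+  // but are not added to its index, so the txn's own Get and iterator cannot
+  // see them (as the assertions below show).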
+ txn->DisableIndexing();
+
+ s = txn->Put("B", "b");
+ ASSERT_OK(s);
+
+ s = txn->Get(read_options, "B", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ Iterator* iter = txn->GetIterator(read_options);
+ ASSERT_OK(iter->status());
+
+ iter->Seek("B");
+ ASSERT_OK(iter->status());
+ ASSERT_FALSE(iter->Valid());
+
+ s = txn->Delete("A");
+
+ s = txn->Get(read_options, "A", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("a", value);
+
+ txn->EnableIndexing();
+
+ s = txn->Put("B", "bb");
+ ASSERT_OK(s);
+
+ iter->Seek("B");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bb", iter->value().ToString());
+
+ s = txn->Get(read_options, "B", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("bb", value);
+
+ s = txn->Put("A", "aa");
+ ASSERT_OK(s);
+
+ s = txn->Get(read_options, "A", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("aa", value);
+
+ delete iter;
+ delete txn;
+}
+
+TEST_P(TransactionTest, SavepointTest) {
+ WriteOptions write_options;
+ ReadOptions read_options, snapshot_read_options;
+ std::string value;
+ Status s;
+
+ Transaction* txn = db->BeginTransaction(write_options);
+ ASSERT_TRUE(txn);
+
+ ASSERT_EQ(0, txn->GetNumPuts());
+
+ s = txn->RollbackToSavePoint();
+ ASSERT_TRUE(s.IsNotFound());
+
+ txn->SetSavePoint(); // 1
+
+ ASSERT_OK(txn->RollbackToSavePoint()); // Rollback to beginning of txn
+ s = txn->RollbackToSavePoint();
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = txn->Put("B", "b");
+ ASSERT_OK(s);
+
+ ASSERT_EQ(1, txn->GetNumPuts());
+ ASSERT_EQ(0, txn->GetNumDeletes());
+
+ s = txn->Commit();
+ ASSERT_OK(s);
+
+ s = db->Get(read_options, "B", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("b", value);
+
+ delete txn;
+ txn = db->BeginTransaction(write_options);
+ ASSERT_TRUE(txn);
+
+ s = txn->Put("A", "a");
+ ASSERT_OK(s);
+
+ s = txn->Put("B", "bb");
+ ASSERT_OK(s);
+
+ s = txn->Put("C", "c");
+ ASSERT_OK(s);
+
+ txn->SetSavePoint(); // 2
+
+ s = txn->Delete("B");
+ ASSERT_OK(s);
+
+ s = txn->Put("C", "cc");
+ ASSERT_OK(s);
+
+ s = txn->Put("D", "d");
+ ASSERT_OK(s);
+
+ ASSERT_EQ(5, txn->GetNumPuts());
+ ASSERT_EQ(1, txn->GetNumDeletes());
+
+ ASSERT_OK(txn->RollbackToSavePoint()); // Rollback to 2
+
+ ASSERT_EQ(3, txn->GetNumPuts());
+ ASSERT_EQ(0, txn->GetNumDeletes());
+
+ s = txn->Get(read_options, "A", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("a", value);
+
+ s = txn->Get(read_options, "B", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("bb", value);
+
+ s = txn->Get(read_options, "C", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("c", value);
+
+ s = txn->Get(read_options, "D", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = txn->Put("A", "a");
+ ASSERT_OK(s);
+
+ s = txn->Put("E", "e");
+ ASSERT_OK(s);
+
+ ASSERT_EQ(5, txn->GetNumPuts());
+ ASSERT_EQ(0, txn->GetNumDeletes());
+
+ // Rollback to beginning of txn
+ s = txn->RollbackToSavePoint();
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_OK(txn->Rollback());
+
+ ASSERT_EQ(0, txn->GetNumPuts());
+ ASSERT_EQ(0, txn->GetNumDeletes());
+
+ s = txn->Get(read_options, "A", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = txn->Get(read_options, "B", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("b", value);
+
+ s = txn->Get(read_options, "D", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = txn->Get(read_options, "D", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = txn->Get(read_options, "E", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = txn->Put("A", "aa");
+ ASSERT_OK(s);
+
+ s = txn->Put("F", "f");
+ ASSERT_OK(s);
+
+ ASSERT_EQ(2, txn->GetNumPuts());
+ ASSERT_EQ(0, txn->GetNumDeletes());
+
+ txn->SetSavePoint(); // 3
+ txn->SetSavePoint(); // 4
+
+ s = txn->Put("G", "g");
+ ASSERT_OK(s);
+
+ s = txn->SingleDelete("F");
+ ASSERT_OK(s);
+
+ s = txn->Delete("B");
+ ASSERT_OK(s);
+
+ s = txn->Get(read_options, "A", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("aa", value);
+
+ s = txn->Get(read_options, "F", &value);
+ // According to db.h, doing a SingleDelete on a key that has been
+  // overwritten will have undefined behavior. So it is unclear what the
+ // result of fetching "F" should be. The current implementation will
+ // return NotFound in this case.
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = txn->Get(read_options, "B", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ ASSERT_EQ(3, txn->GetNumPuts());
+ ASSERT_EQ(2, txn->GetNumDeletes());
+
+ ASSERT_OK(txn->RollbackToSavePoint()); // Rollback to 3
+
+ ASSERT_EQ(2, txn->GetNumPuts());
+ ASSERT_EQ(0, txn->GetNumDeletes());
+
+ s = txn->Get(read_options, "F", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("f", value);
+
+ s = txn->Get(read_options, "G", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = txn->Commit();
+ ASSERT_OK(s);
+
+ s = db->Get(read_options, "F", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("f", value);
+
+ s = db->Get(read_options, "G", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = db->Get(read_options, "A", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("aa", value);
+
+ s = db->Get(read_options, "B", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("b", value);
+
+ s = db->Get(read_options, "C", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = db->Get(read_options, "D", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = db->Get(read_options, "E", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ delete txn;
+}
+
+TEST_P(TransactionTest, SavepointTest2) {
+ WriteOptions write_options;
+ ReadOptions read_options;
+ TransactionOptions txn_options;
+ Status s;
+
+ txn_options.lock_timeout = 1; // 1 ms
+ Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn1);
+
+ s = txn1->Put("A", "");
+ ASSERT_OK(s);
+
+ txn1->SetSavePoint(); // 1
+
+ s = txn1->Put("A", "a");
+ ASSERT_OK(s);
+
+ s = txn1->Put("C", "c");
+ ASSERT_OK(s);
+
+ txn1->SetSavePoint(); // 2
+
+ s = txn1->Put("A", "a");
+ ASSERT_OK(s);
+ s = txn1->Put("B", "b");
+ ASSERT_OK(s);
+
+ ASSERT_OK(txn1->RollbackToSavePoint()); // Rollback to 2
+
+ // Verify that "A" and "C" is still locked while "B" is not
+ Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn2);
+
+ s = txn2->Put("A", "a2");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("C", "c2");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("B", "b2");
+ ASSERT_OK(s);
+
+ s = txn1->Put("A", "aa");
+ ASSERT_OK(s);
+ s = txn1->Put("B", "bb");
+ ASSERT_TRUE(s.IsTimedOut());
+
+ s = txn2->Commit();
+ ASSERT_OK(s);
+ delete txn2;
+
+ s = txn1->Put("A", "aaa");
+ ASSERT_OK(s);
+ s = txn1->Put("B", "bbb");
+ ASSERT_OK(s);
+ s = txn1->Put("C", "ccc");
+ ASSERT_OK(s);
+
+ txn1->SetSavePoint(); // 3
+ ASSERT_OK(txn1->RollbackToSavePoint()); // Rollback to 3
+
+ // Verify that "A", "B", "C" are still locked
+ txn2 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn2);
+
+ s = txn2->Put("A", "a2");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("B", "b2");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("C", "c2");
+ ASSERT_TRUE(s.IsTimedOut());
+
+ ASSERT_OK(txn1->RollbackToSavePoint()); // Rollback to 1
+
+ // Verify that only "A" is locked
+ s = txn2->Put("A", "a3");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("B", "b3");
+ ASSERT_OK(s);
+ s = txn2->Put("C", "c3po");
+ ASSERT_OK(s);
+
+ s = txn1->Commit();
+ ASSERT_OK(s);
+ delete txn1;
+
+ // Verify "A" "C" "B" are no longer locked
+ s = txn2->Put("A", "a4");
+ ASSERT_OK(s);
+ s = txn2->Put("B", "b4");
+ ASSERT_OK(s);
+ s = txn2->Put("C", "c4");
+ ASSERT_OK(s);
+
+ s = txn2->Commit();
+ ASSERT_OK(s);
+ delete txn2;
+}
+
+TEST_P(TransactionTest, SavepointTest3) {
+ WriteOptions write_options;
+ ReadOptions read_options;
+ TransactionOptions txn_options;
+ Status s;
+
+ txn_options.lock_timeout = 1; // 1 ms
+ Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn1);
+
+ s = txn1->PopSavePoint(); // No SavePoint present
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = txn1->Put("A", "");
+ ASSERT_OK(s);
+
+ s = txn1->PopSavePoint(); // Still no SavePoint present
+ ASSERT_TRUE(s.IsNotFound());
+
+ txn1->SetSavePoint(); // 1
+
+ s = txn1->Put("A", "a");
+ ASSERT_OK(s);
+
+ s = txn1->PopSavePoint(); // Remove 1
+ ASSERT_TRUE(txn1->RollbackToSavePoint().IsNotFound());
+
+ // Verify that "A" is still locked
+ Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn2);
+
+ s = txn2->Put("A", "a2");
+ ASSERT_TRUE(s.IsTimedOut());
+ delete txn2;
+
+ txn1->SetSavePoint(); // 2
+
+ s = txn1->Put("B", "b");
+ ASSERT_OK(s);
+
+ txn1->SetSavePoint(); // 3
+
+ s = txn1->Put("B", "b2");
+ ASSERT_OK(s);
+
+ ASSERT_OK(txn1->RollbackToSavePoint()); // Roll back to 2
+
+ s = txn1->PopSavePoint();
+ ASSERT_OK(s);
+
+ s = txn1->PopSavePoint();
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = txn1->Commit();
+ ASSERT_OK(s);
+ delete txn1;
+
+ std::string value;
+
+  // txn1 should have modified "A" to "a"
+ s = db->Get(read_options, "A", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("a", value);
+
+  // txn1 should have set "B" to just "b"
+ s = db->Get(read_options, "B", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("b", value);
+
+ s = db->Get(read_options, "C", &value);
+ ASSERT_TRUE(s.IsNotFound());
+}
+
+TEST_P(TransactionTest, SavepointTest4) {
+ WriteOptions write_options;
+ ReadOptions read_options;
+ TransactionOptions txn_options;
+ Status s;
+
+ txn_options.lock_timeout = 1; // 1 ms
+ Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn1);
+
+ txn1->SetSavePoint(); // 1
+ s = txn1->Put("A", "a");
+ ASSERT_OK(s);
+
+ txn1->SetSavePoint(); // 2
+ s = txn1->Put("B", "b");
+ ASSERT_OK(s);
+
+ s = txn1->PopSavePoint(); // Remove 2
+ ASSERT_OK(s);
+
+  // Verify that A and B still exist.
+ std::string value;
+ ASSERT_OK(txn1->Get(read_options, "A", &value));
+ ASSERT_EQ("a", value);
+
+ ASSERT_OK(txn1->Get(read_options, "B", &value));
+ ASSERT_EQ("b", value);
+
+ ASSERT_OK(txn1->RollbackToSavePoint()); // Rollback to 1
+
+ // Verify that everything was rolled back.
+ s = txn1->Get(read_options, "A", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = txn1->Get(read_options, "B", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ // Nothing should be locked
+ Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn2);
+
+ s = txn2->Put("A", "");
+ ASSERT_OK(s);
+
+ s = txn2->Put("B", "");
+ ASSERT_OK(s);
+
+ delete txn2;
+ delete txn1;
+}
+
+TEST_P(TransactionTest, UndoGetForUpdateTest) {
+ WriteOptions write_options;
+ ReadOptions read_options;
+ TransactionOptions txn_options;
+ std::string value;
+ Status s;
+
+ txn_options.lock_timeout = 1; // 1 ms
+ Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn1);
+
+ txn1->UndoGetForUpdate("A");
+
+ s = txn1->Commit();
+ ASSERT_OK(s);
+ delete txn1;
+
+ txn1 = db->BeginTransaction(write_options, txn_options);
+
+ txn1->UndoGetForUpdate("A");
+ s = txn1->GetForUpdate(read_options, "A", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ // Verify that A is locked
+ Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+ s = txn2->Put("A", "a");
+ ASSERT_TRUE(s.IsTimedOut());
+
+ txn1->UndoGetForUpdate("A");
+
+ // Verify that A is now unlocked
+ s = txn2->Put("A", "a2");
+ ASSERT_OK(s);
+ ASSERT_OK(txn2->Commit());
+ delete txn2;
+ s = db->Get(read_options, "A", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("a2", value);
+
+ s = txn1->Delete("A");
+ ASSERT_OK(s);
+ s = txn1->GetForUpdate(read_options, "A", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = txn1->Put("B", "b3");
+ ASSERT_OK(s);
+ s = txn1->GetForUpdate(read_options, "B", &value);
+ ASSERT_OK(s);
+
+ txn1->UndoGetForUpdate("A");
+ txn1->UndoGetForUpdate("B");
+
+  // Verify that A and B are still locked: UndoGetForUpdate does not release
+  // keys that the transaction has also written
+ txn2 = db->BeginTransaction(write_options, txn_options);
+ s = txn2->Put("A", "a4");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("B", "b4");
+ ASSERT_TRUE(s.IsTimedOut());
+
+ ASSERT_OK(txn1->Rollback());
+ delete txn1;
+
+ // Verify that A and B are no longer locked
+ s = txn2->Put("A", "a5");
+ ASSERT_OK(s);
+ s = txn2->Put("B", "b5");
+ ASSERT_OK(s);
+ s = txn2->Commit();
+ delete txn2;
+ ASSERT_OK(s);
+
+ txn1 = db->BeginTransaction(write_options, txn_options);
+
+ s = txn1->GetForUpdate(read_options, "A", &value);
+ ASSERT_OK(s);
+ s = txn1->GetForUpdate(read_options, "A", &value);
+ ASSERT_OK(s);
+ s = txn1->GetForUpdate(read_options, "C", &value);
+ ASSERT_TRUE(s.IsNotFound());
+ s = txn1->GetForUpdate(read_options, "A", &value);
+ ASSERT_OK(s);
+ s = txn1->GetForUpdate(read_options, "C", &value);
+ ASSERT_TRUE(s.IsNotFound());
+ s = txn1->GetForUpdate(read_options, "B", &value);
+ ASSERT_OK(s);
+ s = txn1->Put("B", "b5");
+ s = txn1->GetForUpdate(read_options, "B", &value);
+ ASSERT_OK(s);
+
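+  // GetForUpdate calls are counted per key, so each UndoGetForUpdate below
+  // releases one reference; a key is unlocked only once all of its
+  // GetForUpdate calls have been undone (and it was not also written).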
+ txn1->UndoGetForUpdate("A");
+ txn1->UndoGetForUpdate("B");
+ txn1->UndoGetForUpdate("C");
+ txn1->UndoGetForUpdate("X");
+
+ // Verify A,B,C are locked
+ txn2 = db->BeginTransaction(write_options, txn_options);
+ s = txn2->Put("A", "a6");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Delete("B");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("C", "c6");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("X", "x6");
+ ASSERT_OK(s);
+
+ txn1->UndoGetForUpdate("A");
+ txn1->UndoGetForUpdate("B");
+ txn1->UndoGetForUpdate("C");
+ txn1->UndoGetForUpdate("X");
+
+ // Verify A,B are locked and C is not
+ s = txn2->Put("A", "a6");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Delete("B");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("C", "c6");
+ ASSERT_OK(s);
+ s = txn2->Put("X", "x6");
+ ASSERT_OK(s);
+
+ txn1->UndoGetForUpdate("A");
+ txn1->UndoGetForUpdate("B");
+ txn1->UndoGetForUpdate("C");
+ txn1->UndoGetForUpdate("X");
+
+ // Verify B is locked and A and C are not
+ s = txn2->Put("A", "a7");
+ ASSERT_OK(s);
+ s = txn2->Delete("B");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("C", "c7");
+ ASSERT_OK(s);
+ s = txn2->Put("X", "x7");
+ ASSERT_OK(s);
+
+ s = txn2->Commit();
+ ASSERT_OK(s);
+ delete txn2;
+
+ s = txn1->Commit();
+ ASSERT_OK(s);
+ delete txn1;
+}
+
+TEST_P(TransactionTest, UndoGetForUpdateTest2) {
+ WriteOptions write_options;
+ ReadOptions read_options;
+ TransactionOptions txn_options;
+ std::string value;
+ Status s;
+
+ s = db->Put(write_options, "A", "");
+ ASSERT_OK(s);
+
+ txn_options.lock_timeout = 1; // 1 ms
+ Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn1);
+
+ s = txn1->GetForUpdate(read_options, "A", &value);
+ ASSERT_OK(s);
+ s = txn1->GetForUpdate(read_options, "B", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = txn1->Put("F", "f");
+ ASSERT_OK(s);
+
+ txn1->SetSavePoint(); // 1
+
+ txn1->UndoGetForUpdate("A");
+
+ s = txn1->GetForUpdate(read_options, "C", &value);
+ ASSERT_TRUE(s.IsNotFound());
+ s = txn1->GetForUpdate(read_options, "D", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = txn1->Put("E", "e");
+ ASSERT_OK(s);
+ s = txn1->GetForUpdate(read_options, "E", &value);
+ ASSERT_OK(s);
+
+ s = txn1->GetForUpdate(read_options, "F", &value);
+ ASSERT_OK(s);
+
+ // Verify A,B,C,D,E,F are still locked
+ Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+ s = txn2->Put("A", "a1");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("B", "b1");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("C", "c1");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("D", "d1");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("E", "e1");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("F", "f1");
+ ASSERT_TRUE(s.IsTimedOut());
+
+ txn1->UndoGetForUpdate("C");
+ txn1->UndoGetForUpdate("E");
+
+ // Verify A,B,D,E,F are still locked and C is not.
+ s = txn2->Put("A", "a2");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("B", "b2");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("D", "d2");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("E", "e2");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("F", "f2");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("C", "c2");
+ ASSERT_OK(s);
+
+ txn1->SetSavePoint(); // 2
+
+ s = txn1->Put("H", "h");
+ ASSERT_OK(s);
+
+ txn1->UndoGetForUpdate("A");
+ txn1->UndoGetForUpdate("B");
+ txn1->UndoGetForUpdate("C");
+ txn1->UndoGetForUpdate("D");
+ txn1->UndoGetForUpdate("E");
+ txn1->UndoGetForUpdate("F");
+ txn1->UndoGetForUpdate("G");
+ txn1->UndoGetForUpdate("H");
+
+ // Verify A,B,D,E,F,H are still locked and C,G are not.
+ s = txn2->Put("A", "a3");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("B", "b3");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("D", "d3");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("E", "e3");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("F", "f3");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("H", "h3");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("C", "c3");
+ ASSERT_OK(s);
+ s = txn2->Put("G", "g3");
+ ASSERT_OK(s);
+
+ ASSERT_OK(txn1->RollbackToSavePoint()); // rollback to 2
+
+ // Verify A,B,D,E,F are still locked and C,G,H are not.
+ s = txn2->Put("A", "a3");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("B", "b3");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("D", "d3");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("E", "e3");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("F", "f3");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("C", "c3");
+ ASSERT_OK(s);
+ s = txn2->Put("G", "g3");
+ ASSERT_OK(s);
+ s = txn2->Put("H", "h3");
+ ASSERT_OK(s);
+
+ txn1->UndoGetForUpdate("A");
+ txn1->UndoGetForUpdate("B");
+ txn1->UndoGetForUpdate("C");
+ txn1->UndoGetForUpdate("D");
+ txn1->UndoGetForUpdate("E");
+ txn1->UndoGetForUpdate("F");
+ txn1->UndoGetForUpdate("G");
+ txn1->UndoGetForUpdate("H");
+
+ // Verify A,B,E,F are still locked and C,D,G,H are not.
+ s = txn2->Put("A", "a3");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("B", "b3");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("E", "e3");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("F", "f3");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("C", "c3");
+ ASSERT_OK(s);
+ s = txn2->Put("D", "d3");
+ ASSERT_OK(s);
+ s = txn2->Put("G", "g3");
+ ASSERT_OK(s);
+ s = txn2->Put("H", "h3");
+ ASSERT_OK(s);
+
+ ASSERT_OK(txn1->RollbackToSavePoint()); // rollback to 1
+
+ // Verify A,B,F are still locked and C,D,E,G,H are not.
+ s = txn2->Put("A", "a3");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("B", "b3");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("F", "f3");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("C", "c3");
+ ASSERT_OK(s);
+ s = txn2->Put("D", "d3");
+ ASSERT_OK(s);
+ s = txn2->Put("E", "e3");
+ ASSERT_OK(s);
+ s = txn2->Put("G", "g3");
+ ASSERT_OK(s);
+ s = txn2->Put("H", "h3");
+ ASSERT_OK(s);
+
+ txn1->UndoGetForUpdate("A");
+ txn1->UndoGetForUpdate("B");
+ txn1->UndoGetForUpdate("C");
+ txn1->UndoGetForUpdate("D");
+ txn1->UndoGetForUpdate("E");
+ txn1->UndoGetForUpdate("F");
+ txn1->UndoGetForUpdate("G");
+ txn1->UndoGetForUpdate("H");
+
+ // Verify F is still locked and A,B,C,D,E,G,H are not.
+ s = txn2->Put("F", "f3");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Put("A", "a3");
+ ASSERT_OK(s);
+ s = txn2->Put("B", "b3");
+ ASSERT_OK(s);
+ s = txn2->Put("C", "c3");
+ ASSERT_OK(s);
+ s = txn2->Put("D", "d3");
+ ASSERT_OK(s);
+ s = txn2->Put("E", "e3");
+ ASSERT_OK(s);
+ s = txn2->Put("G", "g3");
+ ASSERT_OK(s);
+ s = txn2->Put("H", "h3");
+ ASSERT_OK(s);
+
+ s = txn1->Commit();
+ ASSERT_OK(s);
+ s = txn2->Commit();
+ ASSERT_OK(s);
+
+ delete txn1;
+ delete txn2;
+}
+
+TEST_P(TransactionTest, TimeoutTest) {
+ WriteOptions write_options;
+ ReadOptions read_options;
+ std::string value;
+ Status s;
+
+ delete db;
+ db = nullptr;
+
+  // transaction writes have an infinite lock timeout by default (we will
+  // override this when we start a txn below); db writes also have an
+  // infinite timeout
+ txn_db_options.transaction_lock_timeout = -1;
+ txn_db_options.default_lock_timeout = -1;
+
+ s = TransactionDB::Open(options, txn_db_options, dbname, &db);
+ assert(db != nullptr);
+ ASSERT_OK(s);
+
+ s = db->Put(write_options, "aaa", "aaa");
+ ASSERT_OK(s);
+
+ TransactionOptions txn_options0;
+ txn_options0.expiration = 100; // 100ms
+ txn_options0.lock_timeout = 50; // txn timeout no longer infinite
+ Transaction* txn1 = db->BeginTransaction(write_options, txn_options0);
+
+ s = txn1->GetForUpdate(read_options, "aaa", nullptr);
+ ASSERT_OK(s);
+
+ // Conflicts with previous GetForUpdate.
+ // Since db writes do not have a timeout, this should eventually succeed when
+ // the transaction expires.
+ s = db->Put(write_options, "aaa", "xxx");
+ ASSERT_OK(s);
+
+ ASSERT_GE(txn1->GetElapsedTime(),
+ static_cast<uint64_t>(txn_options0.expiration));
+
+ s = txn1->Commit();
+ ASSERT_TRUE(s.IsExpired()); // expired!
+
+ s = db->Get(read_options, "aaa", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("xxx", value);
+
+ delete txn1;
+ delete db;
+
+  // transaction writes have a 50ms timeout,
+ // db writes have infinite timeout
+ txn_db_options.transaction_lock_timeout = 50;
+ txn_db_options.default_lock_timeout = -1;
+
+ s = TransactionDB::Open(options, txn_db_options, dbname, &db);
+ ASSERT_OK(s);
+
+ s = db->Put(write_options, "aaa", "aaa");
+ ASSERT_OK(s);
+
+ TransactionOptions txn_options;
+ txn_options.expiration = 100; // 100ms
+ txn1 = db->BeginTransaction(write_options, txn_options);
+
+ s = txn1->GetForUpdate(read_options, "aaa", nullptr);
+ ASSERT_OK(s);
+
+ // Conflicts with previous GetForUpdate.
+ // Since db writes do not have a timeout, this should eventually succeed when
+ // the transaction expires.
+ s = db->Put(write_options, "aaa", "xxx");
+ ASSERT_OK(s);
+
+ s = txn1->Commit();
+ ASSERT_NOK(s); // expired!
+
+ s = db->Get(read_options, "aaa", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("xxx", value);
+
+ delete txn1;
+ txn_options.expiration = 6000000; // 100 minutes
+ txn_options.lock_timeout = 1; // 1ms
+ txn1 = db->BeginTransaction(write_options, txn_options);
+ txn1->SetLockTimeout(100);
+
+ TransactionOptions txn_options2;
+ txn_options2.expiration = 10; // 10ms
+ Transaction* txn2 = db->BeginTransaction(write_options, txn_options2);
+ ASSERT_OK(s);
+
+ s = txn2->Put("a", "2");
+ ASSERT_OK(s);
+
+ // txn1 has a lock timeout longer than txn2's expiration, so it will win
+ s = txn1->Delete("a");
+ ASSERT_OK(s);
+
+ s = txn1->Commit();
+ ASSERT_OK(s);
+
+  // txn2 should have expired since txn1 waited until txn2's expiration passed.
+ s = txn2->Commit();
+ ASSERT_TRUE(s.IsExpired());
+
+ delete txn1;
+ delete txn2;
+ txn_options.expiration = 6000000; // 100 minutes
+ txn1 = db->BeginTransaction(write_options, txn_options);
+ txn_options2.expiration = 100000000;
+ txn2 = db->BeginTransaction(write_options, txn_options2);
+
+ s = txn1->Delete("asdf");
+ ASSERT_OK(s);
+
+ // txn2 has a smaller lock timeout than txn1's expiration, so it will time out
+ s = txn2->Delete("asdf");
+ ASSERT_TRUE(s.IsTimedOut());
+ ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key");
+
+ s = txn1->Commit();
+ ASSERT_OK(s);
+
+ s = txn2->Put("asdf", "asdf");
+ ASSERT_OK(s);
+
+ s = txn2->Commit();
+ ASSERT_OK(s);
+
+ s = db->Get(read_options, "asdf", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("asdf", value);
+
+ delete txn1;
+ delete txn2;
+}
+
+TEST_P(TransactionTest, SingleDeleteTest) {
+ WriteOptions write_options;
+ ReadOptions read_options;
+ std::string value;
+ Status s;
+
+ Transaction* txn = db->BeginTransaction(write_options);
+ ASSERT_TRUE(txn);
+
+ s = txn->SingleDelete("A");
+ ASSERT_OK(s);
+
+ s = txn->Get(read_options, "A", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = txn->Commit();
+ ASSERT_OK(s);
+ delete txn;
+
+ txn = db->BeginTransaction(write_options);
+
+ s = txn->SingleDelete("A");
+ ASSERT_OK(s);
+
+ s = txn->Put("A", "a");
+ ASSERT_OK(s);
+
+ s = txn->Get(read_options, "A", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("a", value);
+
+ s = txn->Commit();
+ ASSERT_OK(s);
+ delete txn;
+
+ s = db->Get(read_options, "A", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("a", value);
+
+ txn = db->BeginTransaction(write_options);
+
+ s = txn->SingleDelete("A");
+ ASSERT_OK(s);
+
+ s = txn->Get(read_options, "A", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = txn->Commit();
+ ASSERT_OK(s);
+ delete txn;
+
+ s = db->Get(read_options, "A", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ txn = db->BeginTransaction(write_options);
+ Transaction* txn2 = db->BeginTransaction(write_options);
+ txn2->SetSnapshot();
+
+ s = txn->Put("A", "a");
+ ASSERT_OK(s);
+
+ s = txn->Put("A", "a2");
+ ASSERT_OK(s);
+
+ s = txn->SingleDelete("A");
+ ASSERT_OK(s);
+
+ s = txn->SingleDelete("B");
+ ASSERT_OK(s);
+
+ // According to db.h, doing a SingleDelete on a key that has been
+  // overwritten will have undefined behavior. So it is unclear what the
+ // result of fetching "A" should be. The current implementation will
+ // return NotFound in this case.
+ s = txn->Get(read_options, "A", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = txn2->Put("B", "b");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Commit();
+ ASSERT_OK(s);
+ delete txn2;
+
+ s = txn->Commit();
+ ASSERT_OK(s);
+ delete txn;
+
+ // According to db.h, doing a SingleDelete on a key that has been
+  // overwritten will have undefined behavior. So it is unclear what the
+ // result of fetching "A" should be. The current implementation will
+ // return NotFound in this case.
+ s = db->Get(read_options, "A", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = db->Get(read_options, "B", &value);
+ ASSERT_TRUE(s.IsNotFound());
+}
+
+TEST_P(TransactionTest, MergeTest) {
+ WriteOptions write_options;
+ ReadOptions read_options;
+ std::string value;
+ Status s;
+
+ Transaction* txn = db->BeginTransaction(write_options, TransactionOptions());
+ ASSERT_TRUE(txn);
+
+ s = db->Put(write_options, "A", "a0");
+ ASSERT_OK(s);
+
+ s = txn->Merge("A", "1");
+ ASSERT_OK(s);
+
+ s = txn->Merge("A", "2");
+ ASSERT_OK(s);
+
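+  // The txn's Get applies its pending merge operands ("1", "2") on top of the
+  // base value "a0" read from the DB, producing the merge operator's combined
+  // result.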
+ s = txn->Get(read_options, "A", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("a0,1,2", value);
+
+ s = txn->Put("A", "a");
+ ASSERT_OK(s);
+
+ s = txn->Get(read_options, "A", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("a", value);
+
+ s = txn->Merge("A", "3");
+ ASSERT_OK(s);
+
+ s = txn->Get(read_options, "A", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("a,3", value);
+
+ TransactionOptions txn_options;
+ txn_options.lock_timeout = 1; // 1 ms
+ Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn2);
+
+ // verify that txn has "A" locked
+ s = txn2->Merge("A", "4");
+ ASSERT_TRUE(s.IsTimedOut());
+
+ s = txn2->Commit();
+ ASSERT_OK(s);
+ delete txn2;
+
+ s = txn->Commit();
+ ASSERT_OK(s);
+ delete txn;
+
+ s = db->Get(read_options, "A", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("a,3", value);
+}
+
+TEST_P(TransactionTest, DeleteRangeSupportTest) {
+ // The `DeleteRange()` API is banned everywhere.
+ ASSERT_TRUE(
+ db->DeleteRange(WriteOptions(), db->DefaultColumnFamily(), "a", "b")
+ .IsNotSupported());
+
+ // But range deletions can be added via the `Write()` API by specifying the
+ // proper flags to promise there are no conflicts according to the DB type
+ // (see `TransactionDB::DeleteRange()` API doc for details).
+ for (bool skip_concurrency_control : {false, true}) {
+ for (bool skip_duplicate_key_check : {false, true}) {
+ ASSERT_OK(db->Put(WriteOptions(), "a", "val"));
+ WriteBatch wb;
+ ASSERT_OK(wb.DeleteRange("a", "b"));
+ TransactionDBWriteOptimizations flags;
+ flags.skip_concurrency_control = skip_concurrency_control;
+ flags.skip_duplicate_key_check = skip_duplicate_key_check;
+ Status s = db->Write(WriteOptions(), flags, &wb);
+ std::string value;
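+      // For WRITE_COMMITTED only concurrency control needs to be skipped; the
+      // write-prepared and write-unprepared policies additionally require the
+      // duplicate key check to be skipped, as asserted below.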
+ switch (txn_db_options.write_policy) {
+ case WRITE_COMMITTED:
+ if (skip_concurrency_control) {
+ ASSERT_OK(s);
+ ASSERT_TRUE(db->Get(ReadOptions(), "a", &value).IsNotFound());
+ } else {
+ ASSERT_NOK(s);
+ ASSERT_OK(db->Get(ReadOptions(), "a", &value));
+ }
+ break;
+ case WRITE_PREPARED:
+ // Intentional fall-through
+ case WRITE_UNPREPARED:
+ if (skip_concurrency_control && skip_duplicate_key_check) {
+ ASSERT_OK(s);
+ ASSERT_TRUE(db->Get(ReadOptions(), "a", &value).IsNotFound());
+ } else {
+ ASSERT_NOK(s);
+ ASSERT_OK(db->Get(ReadOptions(), "a", &value));
+ }
+ break;
+ }
+      // Without any promises from the user, range deletions via other
+      // `Write()` APIs are still banned.
+ ASSERT_OK(db->Put(WriteOptions(), "a", "val"));
+ ASSERT_NOK(db->Write(WriteOptions(), &wb));
+ ASSERT_OK(db->Get(ReadOptions(), "a", &value));
+ }
+ }
+}
+
+TEST_P(TransactionTest, DeferSnapshotTest) {
+ WriteOptions write_options;
+ ReadOptions read_options;
+ std::string value;
+ Status s;
+
+ s = db->Put(write_options, "A", "a0");
+ ASSERT_OK(s);
+
+ Transaction* txn1 = db->BeginTransaction(write_options);
+ Transaction* txn2 = db->BeginTransaction(write_options);
+
+ txn1->SetSnapshotOnNextOperation();
+ auto snapshot = txn1->GetSnapshot();
+ ASSERT_FALSE(snapshot);
+
+ s = txn2->Put("A", "a2");
+ ASSERT_OK(s);
+ s = txn2->Commit();
+ ASSERT_OK(s);
+ delete txn2;
+
+ s = txn1->GetForUpdate(read_options, "A", &value);
+ // Should not conflict with txn2 since snapshot wasn't set until
+ // GetForUpdate was called.
+ ASSERT_OK(s);
+ ASSERT_EQ("a2", value);
+
+ s = txn1->Put("A", "a1");
+ ASSERT_OK(s);
+
+ s = db->Put(write_options, "B", "b0");
+ ASSERT_OK(s);
+
+ // Cannot lock B since it was written after the snapshot was set
+ s = txn1->Put("B", "b1");
+ ASSERT_TRUE(s.IsBusy());
+
+ s = txn1->Commit();
+ ASSERT_OK(s);
+ delete txn1;
+
+ s = db->Get(read_options, "A", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("a1", value);
+
+ s = db->Get(read_options, "B", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("b0", value);
+}
+
+TEST_P(TransactionTest, DeferSnapshotTest2) {
+ WriteOptions write_options;
+ ReadOptions read_options, snapshot_read_options;
+ std::string value;
+ Status s;
+
+ Transaction* txn1 = db->BeginTransaction(write_options);
+
+ txn1->SetSnapshot();
+
+ s = txn1->Put("A", "a1");
+ ASSERT_OK(s);
+
+ s = db->Put(write_options, "C", "c0");
+ ASSERT_OK(s);
+ s = db->Put(write_options, "D", "d0");
+ ASSERT_OK(s);
+
+ snapshot_read_options.snapshot = txn1->GetSnapshot();
+
+ txn1->SetSnapshotOnNextOperation();
+
+ s = txn1->Get(snapshot_read_options, "C", &value);
+ // Snapshot was set before C was written
+ ASSERT_TRUE(s.IsNotFound());
+ s = txn1->Get(snapshot_read_options, "D", &value);
+ // Snapshot was set before D was written
+ ASSERT_TRUE(s.IsNotFound());
+
+ // Snapshot should not have changed yet.
+ snapshot_read_options.snapshot = txn1->GetSnapshot();
+
+ s = txn1->Get(snapshot_read_options, "C", &value);
+ // Snapshot was set before C was written
+ ASSERT_TRUE(s.IsNotFound());
+ s = txn1->Get(snapshot_read_options, "D", &value);
+ // Snapshot was set before D was written
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = txn1->GetForUpdate(read_options, "C", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("c0", value);
+
+ s = db->Put(write_options, "D", "d00");
+ ASSERT_OK(s);
+
+ // Snapshot is now set
+ snapshot_read_options.snapshot = txn1->GetSnapshot();
+ s = txn1->Get(snapshot_read_options, "D", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("d0", value);
+
+ s = txn1->Commit();
+ ASSERT_OK(s);
+ delete txn1;
+}
+
+TEST_P(TransactionTest, DeferSnapshotSavePointTest) {
+ WriteOptions write_options;
+ ReadOptions read_options, snapshot_read_options;
+ std::string value;
+ Status s;
+
+ Transaction* txn1 = db->BeginTransaction(write_options);
+
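+  // Savepoints also capture the transaction's snapshot state, so each
+  // RollbackToSavePoint below restores whichever snapshot (or pending
+  // SetSnapshotOnNextOperation) was in effect at the time.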
+ txn1->SetSavePoint(); // 1
+
+ s = db->Put(write_options, "T", "1");
+ ASSERT_OK(s);
+
+ txn1->SetSnapshotOnNextOperation();
+
+ s = db->Put(write_options, "T", "2");
+ ASSERT_OK(s);
+
+ txn1->SetSavePoint(); // 2
+
+ s = db->Put(write_options, "T", "3");
+ ASSERT_OK(s);
+
+ s = txn1->Put("A", "a");
+ ASSERT_OK(s);
+
+ txn1->SetSavePoint(); // 3
+
+ s = db->Put(write_options, "T", "4");
+ ASSERT_OK(s);
+
+ txn1->SetSnapshot();
+ txn1->SetSnapshotOnNextOperation();
+
+ txn1->SetSavePoint(); // 4
+
+ s = db->Put(write_options, "T", "5");
+ ASSERT_OK(s);
+
+ snapshot_read_options.snapshot = txn1->GetSnapshot();
+ s = txn1->Get(snapshot_read_options, "T", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("4", value);
+
+ s = txn1->Put("A", "a1");
+ ASSERT_OK(s);
+
+ snapshot_read_options.snapshot = txn1->GetSnapshot();
+ s = txn1->Get(snapshot_read_options, "T", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("5", value);
+
+ s = txn1->RollbackToSavePoint(); // Rollback to 4
+ ASSERT_OK(s);
+
+ snapshot_read_options.snapshot = txn1->GetSnapshot();
+ s = txn1->Get(snapshot_read_options, "T", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("4", value);
+
+ s = txn1->RollbackToSavePoint(); // Rollback to 3
+ ASSERT_OK(s);
+
+ snapshot_read_options.snapshot = txn1->GetSnapshot();
+ s = txn1->Get(snapshot_read_options, "T", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("3", value);
+
+ s = txn1->Get(read_options, "T", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("5", value);
+
+ s = txn1->RollbackToSavePoint(); // Rollback to 2
+ ASSERT_OK(s);
+
+ snapshot_read_options.snapshot = txn1->GetSnapshot();
+ ASSERT_FALSE(snapshot_read_options.snapshot);
+ s = txn1->Get(snapshot_read_options, "T", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("5", value);
+
+ s = txn1->Delete("A");
+ ASSERT_OK(s);
+
+ snapshot_read_options.snapshot = txn1->GetSnapshot();
+ ASSERT_TRUE(snapshot_read_options.snapshot);
+ s = txn1->Get(snapshot_read_options, "T", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("5", value);
+
+ s = txn1->RollbackToSavePoint(); // Rollback to 1
+ ASSERT_OK(s);
+
+ s = txn1->Delete("A");
+ ASSERT_OK(s);
+
+ snapshot_read_options.snapshot = txn1->GetSnapshot();
+ ASSERT_FALSE(snapshot_read_options.snapshot);
+ s = txn1->Get(snapshot_read_options, "T", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("5", value);
+
+ s = txn1->Commit();
+ ASSERT_OK(s);
+
+ delete txn1;
+}
+
+TEST_P(TransactionTest, SetSnapshotOnNextOperationWithNotification) {
+ WriteOptions write_options;
+ ReadOptions read_options;
+ std::string value;
+
+ class Notifier : public TransactionNotifier {
+ private:
+ const Snapshot** snapshot_ptr_;
+
+ public:
+ explicit Notifier(const Snapshot** snapshot_ptr)
+ : snapshot_ptr_(snapshot_ptr) {}
+
+ void SnapshotCreated(const Snapshot* newSnapshot) override {
+ *snapshot_ptr_ = newSnapshot;
+ }
+ };
+
+ std::shared_ptr<Notifier> notifier =
+ std::make_shared<Notifier>(&read_options.snapshot);
+ Status s;
+
+ s = db->Put(write_options, "B", "0");
+ ASSERT_OK(s);
+
+ Transaction* txn1 = db->BeginTransaction(write_options);
+
+ txn1->SetSnapshotOnNextOperation(notifier);
+ ASSERT_FALSE(read_options.snapshot);
+
+ s = db->Put(write_options, "B", "1");
+ ASSERT_OK(s);
+
+ // A Get does not generate the snapshot
+ s = txn1->Get(read_options, "B", &value);
+ ASSERT_OK(s);
+ ASSERT_FALSE(read_options.snapshot);
+ ASSERT_EQ(value, "1");
+
+ // Any other operation does
+ s = txn1->Put("A", "0");
+ ASSERT_OK(s);
+
+ // Now change "B".
+ s = db->Put(write_options, "B", "2");
+ ASSERT_OK(s);
+
+ // The original value should still be read
+ s = txn1->Get(read_options, "B", &value);
+ ASSERT_OK(s);
+ ASSERT_TRUE(read_options.snapshot);
+ ASSERT_EQ(value, "1");
+
+ s = txn1->Commit();
+ ASSERT_OK(s);
+
+ delete txn1;
+}
+
+TEST_P(TransactionTest, ClearSnapshotTest) {
+ WriteOptions write_options;
+ ReadOptions read_options, snapshot_read_options;
+ std::string value;
+ Status s;
+
+ s = db->Put(write_options, "foo", "0");
+ ASSERT_OK(s);
+
+ Transaction* txn = db->BeginTransaction(write_options);
+ ASSERT_TRUE(txn);
+
+ s = db->Put(write_options, "foo", "1");
+ ASSERT_OK(s);
+
+ snapshot_read_options.snapshot = txn->GetSnapshot();
+ ASSERT_FALSE(snapshot_read_options.snapshot);
+
+ // No snapshot created yet
+ s = txn->Get(snapshot_read_options, "foo", &value);
+ ASSERT_EQ(value, "1");
+
+ txn->SetSnapshot();
+ snapshot_read_options.snapshot = txn->GetSnapshot();
+ ASSERT_TRUE(snapshot_read_options.snapshot);
+
+ s = db->Put(write_options, "foo", "2");
+ ASSERT_OK(s);
+
+ // Snapshot was created before change to '2'
+ s = txn->Get(snapshot_read_options, "foo", &value);
+ ASSERT_EQ(value, "1");
+
+ txn->ClearSnapshot();
+ snapshot_read_options.snapshot = txn->GetSnapshot();
+ ASSERT_FALSE(snapshot_read_options.snapshot);
+
+ // Snapshot has now been cleared
+ s = txn->Get(snapshot_read_options, "foo", &value);
+ ASSERT_EQ(value, "2");
+
+ s = txn->Commit();
+ ASSERT_OK(s);
+
+ delete txn;
+}
+
+TEST_P(TransactionTest, ToggleAutoCompactionTest) {
+ Status s;
+
+ ColumnFamilyHandle *cfa, *cfb;
+ ColumnFamilyOptions cf_options;
+
+ // Create 2 new column families
+ s = db->CreateColumnFamily(cf_options, "CFA", &cfa);
+ ASSERT_OK(s);
+ s = db->CreateColumnFamily(cf_options, "CFB", &cfb);
+ ASSERT_OK(s);
+
+ delete cfa;
+ delete cfb;
+ delete db;
+
+ // open DB with three column families
+ std::vector<ColumnFamilyDescriptor> column_families;
+ // have to open default column family
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions()));
+ // open the new column families
+ column_families.push_back(
+ ColumnFamilyDescriptor("CFA", ColumnFamilyOptions()));
+ column_families.push_back(
+ ColumnFamilyDescriptor("CFB", ColumnFamilyOptions()));
+
+ ColumnFamilyOptions* cf_opt_default = &column_families[0].options;
+ ColumnFamilyOptions* cf_opt_cfa = &column_families[1].options;
+ ColumnFamilyOptions* cf_opt_cfb = &column_families[2].options;
+ cf_opt_default->disable_auto_compactions = false;
+ cf_opt_cfa->disable_auto_compactions = true;
+ cf_opt_cfb->disable_auto_compactions = false;
+
+ std::vector<ColumnFamilyHandle*> handles;
+
+ s = TransactionDB::Open(options, txn_db_options, dbname, column_families,
+ &handles, &db);
+ ASSERT_OK(s);
+
+ auto cfh_default = static_cast_with_check<ColumnFamilyHandleImpl>(handles[0]);
+ auto opt_default = *cfh_default->cfd()->GetLatestMutableCFOptions();
+
+ auto cfh_a = static_cast_with_check<ColumnFamilyHandleImpl>(handles[1]);
+ auto opt_a = *cfh_a->cfd()->GetLatestMutableCFOptions();
+
+ auto cfh_b = static_cast_with_check<ColumnFamilyHandleImpl>(handles[2]);
+ auto opt_b = *cfh_b->cfd()->GetLatestMutableCFOptions();
+
+ ASSERT_EQ(opt_default.disable_auto_compactions, false);
+ ASSERT_EQ(opt_a.disable_auto_compactions, true);
+ ASSERT_EQ(opt_b.disable_auto_compactions, false);
+
+ for (auto handle : handles) {
+ delete handle;
+ }
+}
+
+TEST_P(TransactionStressTest, ExpiredTransactionDataRace1) {
+  // In this test, txn1 should succeed in committing, as the callback is
+  // invoked only after txn1 has started committing.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"TransactionTest::ExpirableTransactionDataRace:1"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "TransactionTest::ExpirableTransactionDataRace:1", [&](void* /*arg*/) {
+ WriteOptions write_options;
+ TransactionOptions txn_options;
+
+ // Force txn1 to expire
+ /* sleep override */
+ std::this_thread::sleep_for(std::chrono::milliseconds(1500));
+
+ Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+ Status s;
+ s = txn2->Put("X", "2");
+ ASSERT_TRUE(s.IsTimedOut());
+ s = txn2->Commit();
+ ASSERT_OK(s);
+ delete txn2;
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions write_options;
+ TransactionOptions txn_options;
+
+ txn_options.expiration = 1000; // 1 second
+ Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+
+ Status s;
+ s = txn1->Put("X", "1");
+ ASSERT_OK(s);
+ s = txn1->Commit();
+ ASSERT_OK(s);
+
+ ReadOptions read_options;
+ std::string value;
+ s = db->Get(read_options, "X", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("1", value);
+
+ delete txn1;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+namespace {
+// cmt_delay_ms is the delay between prepare and commit
+// first_id is the id of the first transaction
+Status TransactionStressTestInserter(
+ TransactionDB* db, const size_t num_transactions, const size_t num_sets,
+ const size_t num_keys_per_set, Random64* rand,
+ const uint64_t cmt_delay_ms = 0, const uint64_t first_id = 0) {
+ WriteOptions write_options;
+ ReadOptions read_options;
+ TransactionOptions txn_options;
+ txn_options.use_only_the_last_commit_time_batch_for_recovery = true;
+
+  // Inside the inserter we might also retake the snapshot. We do both since
+  // two separate functions are exercised for each.
+ txn_options.set_snapshot = rand->OneIn(2);
+
+ RandomTransactionInserter inserter(
+ rand, write_options, read_options, num_keys_per_set,
+ static_cast<uint16_t>(num_sets), cmt_delay_ms, first_id);
+
+ for (size_t t = 0; t < num_transactions; t++) {
+ bool success = inserter.TransactionDBInsert(db, txn_options);
+ if (!success) {
+ // unexpected failure
+ return inserter.GetLastStatus();
+ }
+ }
+ inserter.GetLastStatus().PermitUncheckedError();
+
+ // Make sure at least some of the transactions succeeded. It's ok if
+ // some failed due to write-conflicts.
+ if (num_transactions != 1 &&
+ inserter.GetFailureCount() > num_transactions / 2) {
+ return Status::TryAgain("Too many transactions failed! " +
+ std::to_string(inserter.GetFailureCount()) + " / " +
+ std::to_string(num_transactions));
+ }
+
+ return Status::OK();
+}
+} // namespace
+
+// Worker threads add a number to a key from each set of keys. The checker
+// threads verify that the sum of all keys in each set are equal.
+TEST_P(MySQLStyleTransactionTest, TransactionStressTest) {
+ // Small write buffer to trigger more compactions
+ options.write_buffer_size = 1024;
+ txn_db_options.rollback_deletion_type_callback =
+ [](TransactionDB*, ColumnFamilyHandle*, const Slice& key) {
+ return RandomTransactionInserter::RollbackDeletionTypeCallback(key);
+ };
+ ASSERT_OK(ReOpenNoDelete());
+ constexpr size_t num_workers = 4; // worker threads count
+ constexpr size_t num_checkers = 2; // checker threads count
+ constexpr size_t num_slow_checkers = 2; // checker threads emulating backups
+ constexpr size_t num_slow_workers = 1; // slow worker threads count
+ constexpr size_t num_transactions_per_thread = 1000;
+ constexpr uint16_t num_sets = 3;
+ constexpr size_t num_keys_per_set = 100;
+ // Setting the key-space to be 100 keys should cause enough write-conflicts
+ // to make this test interesting.
+
+ std::vector<port::Thread> threads;
+ std::atomic<uint32_t> finished = {0};
+ constexpr bool TAKE_SNAPSHOT = true;
+ uint64_t time_seed = env->NowMicros();
+ printf("time_seed is %" PRIu64 "\n", time_seed); // would help to reproduce
+
+ std::function<void()> call_inserter = [&] {
+ size_t thd_seed = std::hash<std::thread::id>()(std::this_thread::get_id());
+ Random64 rand(time_seed * thd_seed);
+ ASSERT_OK(TransactionStressTestInserter(db, num_transactions_per_thread,
+ num_sets, num_keys_per_set, &rand));
+ finished++;
+ };
+ std::function<void()> call_checker = [&] {
+ size_t thd_seed = std::hash<std::thread::id>()(std::this_thread::get_id());
+ Random64 rand(time_seed * thd_seed);
+ // Verify that data is consistent
+ while (finished < num_workers) {
+ ASSERT_OK(RandomTransactionInserter::Verify(
+ db, num_sets, num_keys_per_set, TAKE_SNAPSHOT, &rand));
+ }
+ };
+ std::function<void()> call_slow_checker = [&] {
+ size_t thd_seed = std::hash<std::thread::id>()(std::this_thread::get_id());
+ Random64 rand(time_seed * thd_seed);
+ // Verify that data is consistent
+ while (finished < num_workers) {
+ uint64_t delay_ms = rand.Uniform(100) + 1;
+ Status s = RandomTransactionInserter::Verify(
+ db, num_sets, num_keys_per_set, TAKE_SNAPSHOT, &rand, delay_ms);
+ ASSERT_OK(s);
+ }
+ };
+ std::function<void()> call_slow_inserter = [&] {
+ size_t thd_seed = std::hash<std::thread::id>()(std::this_thread::get_id());
+ Random64 rand(time_seed * thd_seed);
+ uint64_t id = 0;
+ // Verify that data is consistent
+ while (finished < num_workers) {
+ uint64_t delay_ms = rand.Uniform(500) + 1;
+ ASSERT_OK(TransactionStressTestInserter(db, 1, num_sets, num_keys_per_set,
+ &rand, delay_ms, id++));
+ }
+ };
+
+ for (uint32_t i = 0; i < num_workers; i++) {
+ threads.emplace_back(call_inserter);
+ }
+ for (uint32_t i = 0; i < num_checkers; i++) {
+ threads.emplace_back(call_checker);
+ }
+ if (with_slow_threads_) {
+ for (uint32_t i = 0; i < num_slow_checkers; i++) {
+ threads.emplace_back(call_slow_checker);
+ }
+ for (uint32_t i = 0; i < num_slow_workers; i++) {
+ threads.emplace_back(call_slow_inserter);
+ }
+ }
+
+ // Wait for all threads to finish
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ // Verify that data is consistent
+ Status s = RandomTransactionInserter::Verify(db, num_sets, num_keys_per_set,
+ !TAKE_SNAPSHOT);
+ ASSERT_OK(s);
+}
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_P(TransactionTest, MemoryLimitTest) {
+ TransactionOptions txn_options;
+ // Header (12 bytes) + NOOP (1 byte) + 2 * 8 bytes for data.
+ txn_options.max_write_batch_size = 29;
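+  // With max_write_batch_size = 29, 12 (header) + 1 (NOOP) + 2 * 8 (two small
+  // puts) fills the batch exactly, so the first two puts below succeed and the
+  // third one should fail with Status::MemoryLimit.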
+ // Set threshold to unlimited so that the write batch does not get flushed,
+ // and can hit the memory limit.
+ txn_options.write_batch_flush_threshold = 0;
+ std::string value;
+ Status s;
+
+ Transaction* txn = db->BeginTransaction(WriteOptions(), txn_options);
+ ASSERT_TRUE(txn);
+
+ ASSERT_EQ(0, txn->GetNumPuts());
+ ASSERT_LE(0, txn->GetID());
+
+ s = txn->Put(Slice("a"), Slice("...."));
+ ASSERT_OK(s);
+ ASSERT_EQ(1, txn->GetNumPuts());
+
+ s = txn->Put(Slice("b"), Slice("...."));
+ ASSERT_OK(s);
+ ASSERT_EQ(2, txn->GetNumPuts());
+
+ s = txn->Put(Slice("b"), Slice("...."));
+ ASSERT_TRUE(s.IsMemoryLimit());
+ ASSERT_EQ(2, txn->GetNumPuts());
+
+ ASSERT_OK(txn->Rollback());
+ delete txn;
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+// This test clarifies the existing expectation from the sequence number
+// algorithm. It could detect mistakes in updating the code but it is not
+// necessarily the one acceptable way. If the algorithm is legitimately changed,
+// this unit test should be updated as well.
+TEST_P(TransactionStressTest, SeqAdvanceTest) {
+  // TODO(myabandeh): must be tested with false before new releases
+ const bool short_test = true;
+ WriteOptions wopts;
+ FlushOptions fopt;
+
+ options.disable_auto_compactions = true;
+ ASSERT_OK(ReOpen());
+
+  // Do the test with NUM_BRANCHES branches in it. Each run of the test takes
+  // some of the branches. This is the same as counting in binary, where the
+  // i-th bit represents whether branch i is taken in the run represented by
+  // the number n.
+ const size_t NUM_BRANCHES = short_test ? 6 : 10;
+ // Helper function that shows if the branch is to be taken in the run
+ // represented by the number n.
+ auto branch_do = [&](size_t n, size_t* branch) {
+ assert(*branch < NUM_BRANCHES);
+ const size_t filter = static_cast<size_t>(1) << *branch;
+ return n & filter;
+ };
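+  // For example, n = 5 (binary 000101) encodes a run in which branches 0 and 2
+  // are meant to be taken and the others skipped.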
+ const size_t max_n = static_cast<size_t>(1) << NUM_BRANCHES;
+ for (size_t n = 0; n < max_n; n++) {
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ size_t branch = 0;
+ auto seq = db_impl->GetLatestSequenceNumber();
+ exp_seq = seq;
+ TestTxn0(0);
+ seq = db_impl->TEST_GetLastVisibleSequence();
+ ASSERT_EQ(exp_seq, seq);
+
+ if (branch_do(n, &branch)) {
+ ASSERT_OK(db_impl->Flush(fopt));
+ seq = db_impl->TEST_GetLastVisibleSequence();
+ ASSERT_EQ(exp_seq, seq);
+ }
+ if (!short_test && branch_do(n, &branch)) {
+ ASSERT_OK(db_impl->FlushWAL(true));
+ ASSERT_OK(ReOpenNoDelete());
+ db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ seq = db_impl->GetLatestSequenceNumber();
+ ASSERT_EQ(exp_seq, seq);
+ }
+
+ // Doing it twice might detect some bugs
+ TestTxn0(1);
+ seq = db_impl->TEST_GetLastVisibleSequence();
+ ASSERT_EQ(exp_seq, seq);
+
+ TestTxn1(0);
+ seq = db_impl->TEST_GetLastVisibleSequence();
+ ASSERT_EQ(exp_seq, seq);
+
+ if (branch_do(n, &branch)) {
+ ASSERT_OK(db_impl->Flush(fopt));
+ seq = db_impl->TEST_GetLastVisibleSequence();
+ ASSERT_EQ(exp_seq, seq);
+ }
+ if (!short_test && branch_do(n, &branch)) {
+ ASSERT_OK(db_impl->FlushWAL(true));
+ ASSERT_OK(ReOpenNoDelete());
+ db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ seq = db_impl->GetLatestSequenceNumber();
+ ASSERT_EQ(exp_seq, seq);
+ }
+
+ TestTxn3(0);
+ seq = db_impl->TEST_GetLastVisibleSequence();
+ ASSERT_EQ(exp_seq, seq);
+
+ if (branch_do(n, &branch)) {
+ ASSERT_OK(db_impl->Flush(fopt));
+ seq = db_impl->TEST_GetLastVisibleSequence();
+ ASSERT_EQ(exp_seq, seq);
+ }
+ if (!short_test && branch_do(n, &branch)) {
+ ASSERT_OK(db_impl->FlushWAL(true));
+ ASSERT_OK(ReOpenNoDelete());
+ db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ seq = db_impl->GetLatestSequenceNumber();
+ ASSERT_EQ(exp_seq, seq);
+ }
+
+ TestTxn4(0);
+ seq = db_impl->TEST_GetLastVisibleSequence();
+
+ ASSERT_EQ(exp_seq, seq);
+
+ if (branch_do(n, &branch)) {
+ ASSERT_OK(db_impl->Flush(fopt));
+ seq = db_impl->TEST_GetLastVisibleSequence();
+ ASSERT_EQ(exp_seq, seq);
+ }
+ if (!short_test && branch_do(n, &branch)) {
+ ASSERT_OK(db_impl->FlushWAL(true));
+ ASSERT_OK(ReOpenNoDelete());
+ db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ seq = db_impl->GetLatestSequenceNumber();
+ ASSERT_EQ(exp_seq, seq);
+ }
+
+ TestTxn2(0);
+ seq = db_impl->TEST_GetLastVisibleSequence();
+ ASSERT_EQ(exp_seq, seq);
+
+ if (branch_do(n, &branch)) {
+ ASSERT_OK(db_impl->Flush(fopt));
+ seq = db_impl->TEST_GetLastVisibleSequence();
+ ASSERT_EQ(exp_seq, seq);
+ }
+ if (!short_test && branch_do(n, &branch)) {
+ ASSERT_OK(db_impl->FlushWAL(true));
+ ASSERT_OK(ReOpenNoDelete());
+ db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ seq = db_impl->GetLatestSequenceNumber();
+ ASSERT_EQ(exp_seq, seq);
+ }
+ ASSERT_OK(ReOpen());
+ }
+}
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+// Verify that the optimizations do not compromise correctness
+TEST_P(TransactionTest, Optimizations) {
+ size_t comb_cnt = size_t(1) << 2; // 2 is number of optimization vars
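+  // Iterate over all 2^2 = 4 combinations of skip_concurrency_control and
+  // skip_duplicate_key_check and verify that a plain write followed by a read
+  // still behaves correctly.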
+ for (size_t new_comb = 0; new_comb < comb_cnt; new_comb++) {
+ TransactionDBWriteOptimizations optimizations;
+ optimizations.skip_concurrency_control = IsInCombination(0, new_comb);
+ optimizations.skip_duplicate_key_check = IsInCombination(1, new_comb);
+
+ ASSERT_OK(ReOpen());
+ WriteOptions write_options;
+ WriteBatch batch;
+ ASSERT_OK(batch.Put(Slice("k"), Slice("v1")));
+ ASSERT_OK(db->Write(write_options, &batch));
+
+ ReadOptions ropt;
+ PinnableSlice pinnable_val;
+ ASSERT_OK(db->Get(ropt, db->DefaultColumnFamily(), "k", &pinnable_val));
+ ASSERT_TRUE(pinnable_val == ("v1"));
+ }
+}
+
+// A comparator that uses only the first three bytes
+class ThreeBytewiseComparator : public Comparator {
+ public:
+ ThreeBytewiseComparator() {}
+ const char* Name() const override { return "test.ThreeBytewiseComparator"; }
+ int Compare(const Slice& a, const Slice& b) const override {
+ Slice na = Slice(a.data(), a.size() < 3 ? a.size() : 3);
+ Slice nb = Slice(b.data(), b.size() < 3 ? b.size() : 3);
+ return na.compare(nb);
+ }
+ bool Equal(const Slice& a, const Slice& b) const override {
+ Slice na = Slice(a.data(), a.size() < 3 ? a.size() : 3);
+ Slice nb = Slice(b.data(), b.size() < 3 ? b.size() : 3);
+ return na == nb;
+ }
+  // The methods below don't seem relevant to this test. Implement them if
+  // proven otherwise.
+ void FindShortestSeparator(std::string* start,
+ const Slice& limit) const override {
+ const Comparator* bytewise_comp = BytewiseComparator();
+ bytewise_comp->FindShortestSeparator(start, limit);
+ }
+ void FindShortSuccessor(std::string* key) const override {
+ const Comparator* bytewise_comp = BytewiseComparator();
+ bytewise_comp->FindShortSuccessor(key);
+ }
+};
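+// Under this comparator keys such as "key", "key2" and "key2b" all compare
+// equal, since only their first three bytes ("key") are considered; the
+// DuplicateKeys test below relies on this to turn distinct byte strings into
+// duplicates.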
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
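+// A reader that does not take an explicit snapshot must never observe the
+// intermediate state of a transaction: the committer below writes
+// "overridedvalue" and then "value" within each transaction, while the
+// concurrent reader should only ever see "value".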
+TEST_P(TransactionTest, GetWithoutSnapshot) {
+ WriteOptions write_options;
+ std::atomic<bool> finish = {false};
+ ASSERT_OK(db->Put(write_options, "key", "value"));
+ ROCKSDB_NAMESPACE::port::Thread commit_thread([&]() {
+ for (int i = 0; i < 100; i++) {
+ TransactionOptions txn_options;
+ Transaction* txn = db->BeginTransaction(write_options, txn_options);
+ ASSERT_OK(txn->SetName("xid"));
+ ASSERT_OK(txn->Put("key", "overridedvalue"));
+ ASSERT_OK(txn->Put("key", "value"));
+ ASSERT_OK(txn->Prepare());
+ ASSERT_OK(txn->Commit());
+ delete txn;
+ }
+ finish = true;
+ });
+ ROCKSDB_NAMESPACE::port::Thread read_thread([&]() {
+ while (!finish) {
+ ReadOptions ropt;
+ PinnableSlice pinnable_val;
+ ASSERT_OK(db->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val));
+ ASSERT_TRUE(pinnable_val == ("value"));
+ }
+ });
+ commit_thread.join();
+ read_thread.join();
+}
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+// Test that the transactional db can handle duplicate keys in the write batch
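+// Roughly speaking, a batch containing duplicate keys is cut into sub-patches
+// (sub-batches) so that each sub-patch writes a key at most once; the cases
+// below exercise that splitting for Put/Delete/SingleDelete/Merge, across
+// column families, with prepare/rollback/commit-time batches, and through
+// crash recovery.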
+TEST_P(TransactionTest, DuplicateKeys) {
+ ColumnFamilyOptions cf_options;
+ std::string cf_name = "two";
+ ColumnFamilyHandle* cf_handle = nullptr;
+ {
+ ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle));
+ WriteOptions write_options;
+ WriteBatch batch;
+ ASSERT_OK(batch.Put(Slice("key"), Slice("value")));
+ ASSERT_OK(batch.Put(Slice("key2"), Slice("value2")));
+ // duplicate the keys
+ ASSERT_OK(batch.Put(Slice("key"), Slice("value3")));
+    // duplicate the 2nd key. It should not be counted as a duplicate since a
+    // sub-patch is cut after the last duplicate.
+ ASSERT_OK(batch.Put(Slice("key2"), Slice("value4")));
+ // duplicate the keys but in a different cf. It should not be counted as
+ // duplicate keys
+ ASSERT_OK(batch.Put(cf_handle, Slice("key"), Slice("value5")));
+
+ ASSERT_OK(db->Write(write_options, &batch));
+
+ ReadOptions ropt;
+ PinnableSlice pinnable_val;
+ auto s = db->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val);
+ ASSERT_OK(s);
+ ASSERT_TRUE(pinnable_val == ("value3"));
+ s = db->Get(ropt, db->DefaultColumnFamily(), "key2", &pinnable_val);
+ ASSERT_OK(s);
+ ASSERT_TRUE(pinnable_val == ("value4"));
+ s = db->Get(ropt, cf_handle, "key", &pinnable_val);
+ ASSERT_OK(s);
+ ASSERT_TRUE(pinnable_val == ("value5"));
+
+ delete cf_handle;
+ }
+
+ // Test with non-bytewise comparator
+ {
+ ASSERT_OK(ReOpen());
+ std::unique_ptr<const Comparator> comp_gc(new ThreeBytewiseComparator());
+ cf_options.comparator = comp_gc.get();
+ ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle));
+ WriteOptions write_options;
+ WriteBatch batch;
+ ASSERT_OK(batch.Put(cf_handle, Slice("key"), Slice("value")));
+    // The first three bytes are the same, so it must be counted as a duplicate
+ ASSERT_OK(batch.Put(cf_handle, Slice("key2"), Slice("value2")));
+ // check for 2nd duplicate key in cf with non-default comparator
+ ASSERT_OK(batch.Put(cf_handle, Slice("key2b"), Slice("value2b")));
+ ASSERT_OK(db->Write(write_options, &batch));
+
+ // The value must be the most recent value for all the keys equal to "key",
+ // including "key2"
+ ReadOptions ropt;
+ PinnableSlice pinnable_val;
+ ASSERT_OK(db->Get(ropt, cf_handle, "key", &pinnable_val));
+ ASSERT_TRUE(pinnable_val == ("value2b"));
+
+ // Test duplicate keys with rollback
+ TransactionOptions txn_options;
+ Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_OK(txn0->SetName("xid"));
+ ASSERT_OK(txn0->Put(cf_handle, Slice("key3"), Slice("value3")));
+ ASSERT_OK(txn0->Merge(cf_handle, Slice("key4"), Slice("value4")));
+ ASSERT_OK(txn0->Rollback());
+ ASSERT_OK(db->Get(ropt, cf_handle, "key5", &pinnable_val));
+ ASSERT_TRUE(pinnable_val == ("value2b"));
+ delete txn0;
+
+ delete cf_handle;
+ cf_options.comparator = BytewiseComparator();
+ }
+
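+  // Exercise combinations of prepare, rollback, and a commit-time write batch.
+  // A commit-time batch only makes sense together with prepare and without
+  // rollback, hence the `continue`s below.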
+ for (bool do_prepare : {true, false}) {
+ for (bool do_rollback : {true, false}) {
+ for (bool with_commit_batch : {true, false}) {
+ if (with_commit_batch && !do_prepare) {
+ continue;
+ }
+ if (with_commit_batch && do_rollback) {
+ continue;
+ }
+ ASSERT_OK(ReOpen());
+ ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle));
+ TransactionOptions txn_options;
+ txn_options.use_only_the_last_commit_time_batch_for_recovery = true;
+ WriteOptions write_options;
+ Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+ auto s = txn0->SetName("xid");
+ ASSERT_OK(s);
+ s = txn0->Put(Slice("foo0"), Slice("bar0a"));
+ ASSERT_OK(s);
+ s = txn0->Put(Slice("foo0"), Slice("bar0b"));
+ ASSERT_OK(s);
+ s = txn0->Put(Slice("foo1"), Slice("bar1"));
+ ASSERT_OK(s);
+ s = txn0->Merge(Slice("foo2"), Slice("bar2a"));
+ ASSERT_OK(s);
+        // Repeat a key after the start of a sub-patch. This should not cause a
+        // duplicate in the most recent sub-patch and hence does not create a
+        // new sub-patch.
+ s = txn0->Put(Slice("foo0"), Slice("bar0c"));
+ ASSERT_OK(s);
+ s = txn0->Merge(Slice("foo2"), Slice("bar2b"));
+ ASSERT_OK(s);
+ // duplicate the keys but in a different cf. It should not be counted as
+ // duplicate.
+ s = txn0->Put(cf_handle, Slice("foo0"), Slice("bar0-cf1"));
+ ASSERT_OK(s);
+ s = txn0->Put(Slice("foo3"), Slice("bar3"));
+ ASSERT_OK(s);
+ s = txn0->Merge(Slice("foo3"), Slice("bar3"));
+ ASSERT_OK(s);
+ s = txn0->Put(Slice("foo4"), Slice("bar4"));
+ ASSERT_OK(s);
+ s = txn0->Delete(Slice("foo4"));
+ ASSERT_OK(s);
+ s = txn0->SingleDelete(Slice("foo4"));
+ ASSERT_OK(s);
+ if (do_prepare) {
+ s = txn0->Prepare();
+ ASSERT_OK(s);
+ }
+ if (do_rollback) {
+ // Test rolling back the batch with duplicates
+ s = txn0->Rollback();
+ ASSERT_OK(s);
+ } else {
+ if (with_commit_batch) {
+ assert(do_prepare);
+ auto cb = txn0->GetCommitTimeWriteBatch();
+ // duplicate a key in the original batch
+ // TODO(myabandeh): the behavior of GetCommitTimeWriteBatch
+ // conflicting with the prepared batch is currently undefined and
+ // gives different results in different implementations.
+
+ // s = cb->Put(Slice("foo0"), Slice("bar0d"));
+ // ASSERT_OK(s);
+ // add a new duplicate key
+ s = cb->Put(Slice("foo6"), Slice("bar6a"));
+ ASSERT_OK(s);
+ s = cb->Put(Slice("foo6"), Slice("bar6b"));
+ ASSERT_OK(s);
+ // add a duplicate key that is removed in the same batch
+ s = cb->Put(Slice("foo7"), Slice("bar7a"));
+ ASSERT_OK(s);
+ s = cb->Delete(Slice("foo7"));
+ ASSERT_OK(s);
+ }
+ s = txn0->Commit();
+ ASSERT_OK(s);
+ }
+ delete txn0;
+ ReadOptions ropt;
+ PinnableSlice pinnable_val;
+
+ if (do_rollback) {
+ s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
+ ASSERT_TRUE(s.IsNotFound());
+ s = db->Get(ropt, cf_handle, "foo0", &pinnable_val);
+ ASSERT_TRUE(s.IsNotFound());
+ s = db->Get(ropt, db->DefaultColumnFamily(), "foo1", &pinnable_val);
+ ASSERT_TRUE(s.IsNotFound());
+ s = db->Get(ropt, db->DefaultColumnFamily(), "foo2", &pinnable_val);
+ ASSERT_TRUE(s.IsNotFound());
+ s = db->Get(ropt, db->DefaultColumnFamily(), "foo3", &pinnable_val);
+ ASSERT_TRUE(s.IsNotFound());
+ s = db->Get(ropt, db->DefaultColumnFamily(), "foo4", &pinnable_val);
+ ASSERT_TRUE(s.IsNotFound());
+ } else {
+ s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
+ ASSERT_OK(s);
+ ASSERT_TRUE(pinnable_val == ("bar0c"));
+ s = db->Get(ropt, cf_handle, "foo0", &pinnable_val);
+ ASSERT_OK(s);
+ ASSERT_TRUE(pinnable_val == ("bar0-cf1"));
+ s = db->Get(ropt, db->DefaultColumnFamily(), "foo1", &pinnable_val);
+ ASSERT_OK(s);
+ ASSERT_TRUE(pinnable_val == ("bar1"));
+ s = db->Get(ropt, db->DefaultColumnFamily(), "foo2", &pinnable_val);
+ ASSERT_OK(s);
+ ASSERT_TRUE(pinnable_val == ("bar2a,bar2b"));
+ s = db->Get(ropt, db->DefaultColumnFamily(), "foo3", &pinnable_val);
+ ASSERT_OK(s);
+ ASSERT_TRUE(pinnable_val == ("bar3,bar3"));
+ s = db->Get(ropt, db->DefaultColumnFamily(), "foo4", &pinnable_val);
+ ASSERT_TRUE(s.IsNotFound());
+ if (with_commit_batch) {
+ s = db->Get(ropt, db->DefaultColumnFamily(), "foo6", &pinnable_val);
+ if (txn_db_options.write_policy ==
+ TxnDBWritePolicy::WRITE_COMMITTED) {
+ ASSERT_OK(s);
+ ASSERT_TRUE(pinnable_val == ("bar6b"));
+ } else {
+ ASSERT_TRUE(s.IsNotFound());
+ }
+ s = db->Get(ropt, db->DefaultColumnFamily(), "foo7", &pinnable_val);
+ ASSERT_TRUE(s.IsNotFound());
+ }
+ }
+ delete cf_handle;
+ } // with_commit_batch
+ } // do_rollback
+ } // do_prepare
+
+ if (!options.unordered_write) {
+ // Also test with max_successive_merges > 0. max_successive_merges will not
+ // affect our algorithm for duplicate key insertion but we add the test to
+ // verify that.
+ cf_options.max_successive_merges = 2;
+ cf_options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ ASSERT_OK(ReOpen());
+    ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle));
+ WriteOptions write_options;
+ // Ensure one value for the key
+ ASSERT_OK(db->Put(write_options, cf_handle, Slice("key"), Slice("value")));
+ WriteBatch batch;
+ // Merge more than max_successive_merges times
+ ASSERT_OK(batch.Merge(cf_handle, Slice("key"), Slice("1")));
+ ASSERT_OK(batch.Merge(cf_handle, Slice("key"), Slice("2")));
+ ASSERT_OK(batch.Merge(cf_handle, Slice("key"), Slice("3")));
+ ASSERT_OK(batch.Merge(cf_handle, Slice("key"), Slice("4")));
+ ASSERT_OK(db->Write(write_options, &batch));
+ ReadOptions read_options;
+ std::string value;
+ ASSERT_OK(db->Get(read_options, cf_handle, "key", &value));
+ ASSERT_EQ(value, "value,1,2,3,4");
+ delete cf_handle;
+ }
+
+ {
+ // Test that the duplicate detection is not compromised after rolling back
+ // to a save point
+ TransactionOptions txn_options;
+ WriteOptions write_options;
+ Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0a")));
+ ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0b")));
+ txn0->SetSavePoint();
+ ASSERT_OK(txn0->RollbackToSavePoint());
+ ASSERT_OK(txn0->Commit());
+ delete txn0;
+ }
+
+  // Test successful recovery after a crash
+ {
+ ASSERT_OK(ReOpen());
+ TransactionOptions txn_options;
+ WriteOptions write_options;
+ ReadOptions ropt;
+ Transaction* txn0;
+ PinnableSlice pinnable_val;
+ Status s;
+
+ std::unique_ptr<const Comparator> comp_gc(new ThreeBytewiseComparator());
+ cf_options.comparator = comp_gc.get();
+ cf_options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle));
+ delete cf_handle;
+ std::vector<ColumnFamilyDescriptor> cfds{
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName,
+ ColumnFamilyOptions(options)),
+ ColumnFamilyDescriptor(cf_name, cf_options),
+ };
+ std::vector<ColumnFamilyHandle*> handles;
+ ASSERT_OK(ReOpenNoDelete(cfds, &handles));
+
+ assert(db != nullptr);
+ ASSERT_OK(db->Put(write_options, "foo0", "init"));
+ ASSERT_OK(db->Put(write_options, "foo1", "init"));
+ ASSERT_OK(db->Put(write_options, handles[1], "foo0", "init"));
+ ASSERT_OK(db->Put(write_options, handles[1], "foo1", "init"));
+
+ // one entry
+ txn0 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_OK(txn0->SetName("xid"));
+ ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0a")));
+ ASSERT_OK(txn0->Prepare());
+ delete txn0;
+ // This will check the asserts inside recovery code
+ ASSERT_OK(db->FlushWAL(true));
+ reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
+ ASSERT_OK(ReOpenNoDelete(cfds, &handles));
+ txn0 = db->GetTransactionByName("xid");
+ ASSERT_TRUE(txn0 != nullptr);
+ ASSERT_OK(txn0->Commit());
+ delete txn0;
+ s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
+ ASSERT_OK(s);
+ ASSERT_TRUE(pinnable_val == ("bar0a"));
+
+ // two entries, no duplicate
+ txn0 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_OK(txn0->SetName("xid"));
+ ASSERT_OK(txn0->Put(handles[1], Slice("foo0"), Slice("bar0b")));
+ ASSERT_OK(txn0->Put(handles[1], Slice("fol1"), Slice("bar1b")));
+ ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0b")));
+ ASSERT_OK(txn0->Put(Slice("foo1"), Slice("bar1b")));
+ ASSERT_OK(txn0->Prepare());
+ delete txn0;
+ // This will check the asserts inside recovery code
+ ASSERT_OK(db->FlushWAL(true));
+ // Flush only cf 1
+ ASSERT_OK(static_cast_with_check<DBImpl>(db->GetRootDB())
+ ->TEST_FlushMemTable(true, false, handles[1]));
+ reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
+ ASSERT_OK(ReOpenNoDelete(cfds, &handles));
+ txn0 = db->GetTransactionByName("xid");
+ ASSERT_TRUE(txn0 != nullptr);
+ ASSERT_OK(txn0->Commit());
+ delete txn0;
+ pinnable_val.Reset();
+ s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
+ ASSERT_OK(s);
+ ASSERT_TRUE(pinnable_val == ("bar0b"));
+ pinnable_val.Reset();
+ s = db->Get(ropt, db->DefaultColumnFamily(), "foo1", &pinnable_val);
+ ASSERT_OK(s);
+ ASSERT_TRUE(pinnable_val == ("bar1b"));
+ pinnable_val.Reset();
+ s = db->Get(ropt, handles[1], "foo0", &pinnable_val);
+ ASSERT_OK(s);
+ ASSERT_TRUE(pinnable_val == ("bar0b"));
+ pinnable_val.Reset();
+ s = db->Get(ropt, handles[1], "fol1", &pinnable_val);
+ ASSERT_OK(s);
+ ASSERT_TRUE(pinnable_val == ("bar1b"));
+
+ // one duplicate with ::Put
+ txn0 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_OK(txn0->SetName("xid"));
+ ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey0"), Slice("bar0c")));
+ ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey1"), Slice("bar1d")));
+ ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0c")));
+ ASSERT_OK(txn0->Put(Slice("foo1"), Slice("bar1c")));
+ ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0d")));
+ ASSERT_OK(txn0->Prepare());
+ delete txn0;
+ // This will check the asserts inside recovery code
+ ASSERT_OK(db->FlushWAL(true));
+ // Flush only cf 1
+ ASSERT_OK(static_cast_with_check<DBImpl>(db->GetRootDB())
+ ->TEST_FlushMemTable(true, false, handles[1]));
+ reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
+ ASSERT_OK(ReOpenNoDelete(cfds, &handles));
+ txn0 = db->GetTransactionByName("xid");
+ ASSERT_TRUE(txn0 != nullptr);
+ ASSERT_OK(txn0->Commit());
+ delete txn0;
+ pinnable_val.Reset();
+ s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
+ ASSERT_OK(s);
+ ASSERT_TRUE(pinnable_val == ("bar0d"));
+ pinnable_val.Reset();
+ s = db->Get(ropt, db->DefaultColumnFamily(), "foo1", &pinnable_val);
+ ASSERT_OK(s);
+ ASSERT_TRUE(pinnable_val == ("bar1c"));
+ pinnable_val.Reset();
+ s = db->Get(ropt, handles[1], "key-nonkey2", &pinnable_val);
+ ASSERT_OK(s);
+ ASSERT_TRUE(pinnable_val == ("bar1d"));
+
+ // Duplicate with ::Put, ::Delete
+ txn0 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_OK(txn0->SetName("xid"));
+ ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey0"), Slice("bar0e")));
+ ASSERT_OK(txn0->Delete(handles[1], Slice("key-nonkey1")));
+ ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0e")));
+ ASSERT_OK(txn0->Delete(Slice("foo0")));
+ ASSERT_OK(txn0->Prepare());
+ delete txn0;
+ // This will check the asserts inside recovery code
+ ASSERT_OK(db->FlushWAL(true));
+ // Flush only cf 1
+ ASSERT_OK(static_cast_with_check<DBImpl>(db->GetRootDB())
+ ->TEST_FlushMemTable(true, false, handles[1]));
+ reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
+ ASSERT_OK(ReOpenNoDelete(cfds, &handles));
+ txn0 = db->GetTransactionByName("xid");
+ ASSERT_TRUE(txn0 != nullptr);
+ ASSERT_OK(txn0->Commit());
+ delete txn0;
+ pinnable_val.Reset();
+ s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
+ ASSERT_TRUE(s.IsNotFound());
+ pinnable_val.Reset();
+ s = db->Get(ropt, handles[1], "key-nonkey2", &pinnable_val);
+ ASSERT_TRUE(s.IsNotFound());
+
+ // Duplicate with ::Put, ::SingleDelete
+ txn0 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_OK(txn0->SetName("xid"));
+ ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey0"), Slice("bar0g")));
+ ASSERT_OK(txn0->SingleDelete(handles[1], Slice("key-nonkey1")));
+ ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0e")));
+ ASSERT_OK(txn0->SingleDelete(Slice("foo0")));
+ ASSERT_OK(txn0->Prepare());
+ delete txn0;
+ // This will check the asserts inside recovery code
+ ASSERT_OK(db->FlushWAL(true));
+ // Flush only cf 1
+ ASSERT_OK(static_cast_with_check<DBImpl>(db->GetRootDB())
+ ->TEST_FlushMemTable(true, false, handles[1]));
+ reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
+ ASSERT_OK(ReOpenNoDelete(cfds, &handles));
+ txn0 = db->GetTransactionByName("xid");
+ ASSERT_TRUE(txn0 != nullptr);
+ ASSERT_OK(txn0->Commit());
+ delete txn0;
+ pinnable_val.Reset();
+ s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
+ ASSERT_TRUE(s.IsNotFound());
+ pinnable_val.Reset();
+ s = db->Get(ropt, handles[1], "key-nonkey2", &pinnable_val);
+ ASSERT_TRUE(s.IsNotFound());
+
+ // Duplicate with ::Put, ::Merge
+ txn0 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_OK(txn0->SetName("xid"));
+ ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey0"), Slice("bar1i")));
+ ASSERT_OK(txn0->Merge(handles[1], Slice("key-nonkey1"), Slice("bar1j")));
+ ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0f")));
+ ASSERT_OK(txn0->Merge(Slice("foo0"), Slice("bar0g")));
+ ASSERT_OK(txn0->Prepare());
+ delete txn0;
+ // This will check the asserts inside recovery code
+ ASSERT_OK(db->FlushWAL(true));
+ // Flush only cf 1
+ ASSERT_OK(static_cast_with_check<DBImpl>(db->GetRootDB())
+ ->TEST_FlushMemTable(true, false, handles[1]));
+ reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
+ ASSERT_OK(ReOpenNoDelete(cfds, &handles));
+ txn0 = db->GetTransactionByName("xid");
+ ASSERT_TRUE(txn0 != nullptr);
+ ASSERT_OK(txn0->Commit());
+ delete txn0;
+ pinnable_val.Reset();
+ s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
+ ASSERT_OK(s);
+ ASSERT_TRUE(pinnable_val == ("bar0f,bar0g"));
+ pinnable_val.Reset();
+ s = db->Get(ropt, handles[1], "key-nonkey2", &pinnable_val);
+ ASSERT_OK(s);
+ ASSERT_TRUE(pinnable_val == ("bar1i,bar1j"));
+
+ for (auto h : handles) {
+ delete h;
+ }
+ delete db;
+ db = nullptr;
+ }
+}
+
+// Test that the reseek optimization in iterators will not result in an infinite
+// loop if there are too many uncommitted entries before the snapshot.
+TEST_P(TransactionTest, ReseekOptimization) {
+ WriteOptions write_options;
+ write_options.sync = true;
+ write_options.disableWAL = false;
+ ColumnFamilyDescriptor cfd;
+ ASSERT_OK(db->DefaultColumnFamily()->GetDescriptor(&cfd));
+ auto max_skip = cfd.options.max_sequential_skip_in_iterations;
+
+ ASSERT_OK(db->Put(write_options, Slice("foo0"), Slice("initv")));
+
+ TransactionOptions txn_options;
+ Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_OK(txn0->SetName("xid"));
+  // Duplicate keys will result in separate sequence numbers in WritePrepared
+  // and WriteUnPrepared
+ for (size_t i = 0; i < 2 * max_skip; i++) {
+ ASSERT_OK(txn0->Put(Slice("foo1"), Slice("bar")));
+ }
+ ASSERT_OK(txn0->Prepare());
+ ASSERT_OK(db->Put(write_options, Slice("foo2"), Slice("initv")));
+
+ ReadOptions read_options;
+ // To avoid loops
+ read_options.max_skippable_internal_keys = 10 * max_skip;
+ Iterator* iter = db->NewIterator(read_options);
+ ASSERT_OK(iter->status());
+ size_t cnt = 0;
+ iter->SeekToFirst();
+ while (iter->Valid()) {
+ iter->Next();
+ ASSERT_OK(iter->status());
+ cnt++;
+ }
+ ASSERT_EQ(cnt, 2);
+ cnt = 0;
+ iter->SeekToLast();
+ while (iter->Valid()) {
+ iter->Prev();
+ ASSERT_OK(iter->status());
+ cnt++;
+ }
+ ASSERT_EQ(cnt, 2);
+ delete iter;
+ ASSERT_OK(txn0->Rollback());
+ delete txn0;
+}
+
+// After recovery in kPointInTimeRecovery mode, the corrupted log file remains
+// there. The newer log files should still be read successfully during recovery
+// of the 2nd crash.
+TEST_P(TransactionTest, DoubleCrashInRecovery) {
+ for (const bool manual_wal_flush : {false, true}) {
+ for (const bool write_after_recovery : {false, true}) {
+ options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+ options.manual_wal_flush = manual_wal_flush;
+ ASSERT_OK(ReOpen());
+ std::string cf_name = "two";
+ ColumnFamilyOptions cf_options;
+ ColumnFamilyHandle* cf_handle = nullptr;
+ ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle));
+
+ // Add a prepare entry to prevent the older logs from being deleted.
+ WriteOptions write_options;
+ TransactionOptions txn_options;
+ Transaction* txn = db->BeginTransaction(write_options, txn_options);
+ ASSERT_OK(txn->SetName("xid"));
+ ASSERT_OK(txn->Put(Slice("foo-prepare"), Slice("bar-prepare")));
+ ASSERT_OK(txn->Prepare());
+
+ FlushOptions flush_ops;
+ ASSERT_OK(db->Flush(flush_ops));
+ // Now we have a log that cannot be deleted
+
+ ASSERT_OK(db->Put(write_options, cf_handle, "foo1", "bar1"));
+ // Flush only the 2nd cf
+ ASSERT_OK(db->Flush(flush_ops, cf_handle));
+
+ // The value is large enough to be touched by the corruption we ingest
+ // below.
+ std::string large_value(400, ' ');
+ // key/value not touched by corruption
+ ASSERT_OK(db->Put(write_options, "foo2", "bar2"));
+ // key/value touched by corruption
+ ASSERT_OK(db->Put(write_options, "foo3", large_value));
+ // key/value not touched by corruption
+ ASSERT_OK(db->Put(write_options, "foo4", "bar4"));
+
+ ASSERT_OK(db->FlushWAL(true));
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ uint64_t wal_file_id = db_impl->TEST_LogfileNumber();
+ std::string fname = LogFileName(dbname, wal_file_id);
+ reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
+ delete txn;
+ delete cf_handle;
+ delete db;
+ db = nullptr;
+
+ // Corrupt the last log file in the middle, so that it is not corrupted
+ // in the tail.
+ std::string file_content;
+ ASSERT_OK(ReadFileToString(env, fname, &file_content));
+ file_content[400] = 'h';
+ file_content[401] = 'a';
+ ASSERT_OK(env->DeleteFile(fname));
+ ASSERT_OK(WriteStringToFile(env, file_content, fname, true));
+
+ // Recover from corruption
+ std::vector<ColumnFamilyHandle*> handles;
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.push_back(ColumnFamilyDescriptor(kDefaultColumnFamilyName,
+ ColumnFamilyOptions()));
+ column_families.push_back(
+ ColumnFamilyDescriptor("two", ColumnFamilyOptions()));
+ ASSERT_OK(ReOpenNoDelete(column_families, &handles));
+ assert(db != nullptr);
+
+ if (write_after_recovery) {
+ // Write data to the log right after the corrupted log
+ ASSERT_OK(db->Put(write_options, "foo5", large_value));
+ }
+
+ // Persist data written to WAL during recovery or by the last Put
+ ASSERT_OK(db->FlushWAL(true));
+ // 2nd crash to recover while having a valid log after the corrupted one.
+ ASSERT_OK(ReOpenNoDelete(column_families, &handles));
+ assert(db != nullptr);
+ txn = db->GetTransactionByName("xid");
+ ASSERT_TRUE(txn != nullptr);
+ ASSERT_OK(txn->Commit());
+ delete txn;
+ for (auto handle : handles) {
+ delete handle;
+ }
+ }
+ }
+}
+
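+// Committing a transaction that was never prepared should fail with
+// TxnNotPrepared unless skip_prepare is set.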
+TEST_P(TransactionTest, CommitWithoutPrepare) {
+ {
+ // skip_prepare = false.
+ WriteOptions write_options;
+ TransactionOptions txn_options;
+ txn_options.skip_prepare = false;
+ Transaction* txn = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn->Commit().IsTxnNotPrepared());
+ delete txn;
+ }
+
+ {
+ // skip_prepare = true.
+ WriteOptions write_options;
+ TransactionOptions txn_options;
+ txn_options.skip_prepare = true;
+ Transaction* txn = db->BeginTransaction(write_options, txn_options);
+ ASSERT_OK(txn->Commit());
+ delete txn;
+ }
+}
+
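+// Column families with a user-defined 64-bit timestamp comparator are expected
+// to work only under the WRITE_COMMITTED policy; the other write policies
+// should reject them, both at CreateColumnFamily and at reopen, when going
+// through the transaction DB layer.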
+TEST_P(TransactionTest, OpenAndEnableU64Timestamp) {
+ ASSERT_OK(ReOpenNoDelete());
+
+ assert(db);
+
+ const std::string test_cf_name = "test_cf";
+ ColumnFamilyOptions cf_opts;
+ cf_opts.comparator = test::BytewiseComparatorWithU64TsWrapper();
+ {
+ ColumnFamilyHandle* cfh = nullptr;
+ const Status s = db->CreateColumnFamily(cf_opts, test_cf_name, &cfh);
+ if (txn_db_options.write_policy == WRITE_COMMITTED) {
+ ASSERT_OK(s);
+ delete cfh;
+ } else {
+ ASSERT_TRUE(s.IsNotSupported());
+ assert(!cfh);
+ }
+ }
+
+ // Bypass transaction db layer.
+ if (txn_db_options.write_policy != WRITE_COMMITTED) {
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ assert(db_impl);
+ ColumnFamilyHandle* cfh = nullptr;
+ ASSERT_OK(db_impl->CreateColumnFamily(cf_opts, test_cf_name, &cfh));
+ delete cfh;
+ }
+
+ {
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+ cf_descs.emplace_back(test_cf_name, cf_opts);
+ std::vector<ColumnFamilyHandle*> handles;
+ const Status s = ReOpenNoDelete(cf_descs, &handles);
+ if (txn_db_options.write_policy == WRITE_COMMITTED) {
+ ASSERT_OK(s);
+ for (auto* h : handles) {
+ delete h;
+ }
+ } else {
+ ASSERT_TRUE(s.IsNotSupported());
+ }
+ }
+}
+
+TEST_P(TransactionTest, OpenAndEnableU32Timestamp) {
+ class DummyComparatorWithU32Ts : public Comparator {
+ public:
+ DummyComparatorWithU32Ts() : Comparator(sizeof(uint32_t)) {}
+ const char* Name() const override { return "DummyComparatorWithU32Ts"; }
+ void FindShortSuccessor(std::string*) const override {}
+ void FindShortestSeparator(std::string*, const Slice&) const override {}
+ int Compare(const Slice&, const Slice&) const override { return 0; }
+ };
+
+ std::unique_ptr<Comparator> dummy_ucmp(new DummyComparatorWithU32Ts());
+
+ ASSERT_OK(ReOpenNoDelete());
+
+ assert(db);
+
+ const std::string test_cf_name = "test_cf";
+
+ ColumnFamilyOptions cf_opts;
+ cf_opts.comparator = dummy_ucmp.get();
+ {
+ ColumnFamilyHandle* cfh = nullptr;
+ ASSERT_TRUE(db->CreateColumnFamily(cf_opts, test_cf_name, &cfh)
+ .IsInvalidArgument());
+ }
+
+ // Bypass transaction db layer.
+ {
+ ColumnFamilyHandle* cfh = nullptr;
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ assert(db_impl);
+ ASSERT_OK(db_impl->CreateColumnFamily(cf_opts, test_cf_name, &cfh));
+ delete cfh;
+ }
+
+ {
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+ cf_descs.emplace_back(test_cf_name, cf_opts);
+ std::vector<ColumnFamilyHandle*> handles;
+ ASSERT_TRUE(ReOpenNoDelete(cf_descs, &handles).IsInvalidArgument());
+ }
+}
+
+TEST_P(TransactionTest, WriteWithBulkCreatedColumnFamilies) {
+ ColumnFamilyOptions cf_options;
+ WriteOptions write_options;
+
+ std::vector<std::string> cf_names;
+ std::vector<ColumnFamilyHandle*> cf_handles;
+
+ cf_names.push_back("test_cf");
+
+ ASSERT_OK(db->CreateColumnFamilies(cf_options, cf_names, &cf_handles));
+ ASSERT_OK(db->Put(write_options, cf_handles[0], "foo", "bar"));
+ ASSERT_OK(db->DropColumnFamilies(cf_handles));
+
+ for (auto* h : cf_handles) {
+ delete h;
+ }
+ cf_handles.clear();
+
+ std::vector<ColumnFamilyDescriptor> cf_descriptors;
+
+ cf_descriptors.emplace_back("test_cf", ColumnFamilyOptions());
+
+ ASSERT_OK(db->CreateColumnFamilies(cf_options, cf_names, &cf_handles));
+ ASSERT_OK(db->Put(write_options, cf_handles[0], "foo", "bar"));
+ ASSERT_OK(db->DropColumnFamilies(cf_handles));
+ for (auto* h : cf_handles) {
+ delete h;
+ }
+ cf_handles.clear();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as Transactions are not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/transaction_test.h b/src/rocksdb/utilities/transactions/transaction_test.h
new file mode 100644
index 000000000..0b86453a4
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/transaction_test.h
@@ -0,0 +1,578 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <algorithm>
+#include <cinttypes>
+#include <functional>
+#include <string>
+#include <thread>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "table/mock_table.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "test_util/transaction_test_util.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/string_append/stringappend.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+#include "utilities/transactions/write_unprepared_txn_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Return true if the i-th bit is set in the combination represented by comb
+bool IsInCombination(size_t i, size_t comb) { return comb & (size_t(1) << i); }
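+// For example, IsInCombination(1, 0b0110) is true while IsInCombination(0,
+// 0b0110) is false.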
+
+enum WriteOrdering : bool { kOrderedWrite, kUnorderedWrite };
+
+class TransactionTestBase : public ::testing::Test {
+ public:
+ TransactionDB* db;
+ SpecialEnv special_env;
+ FaultInjectionTestEnv* env;
+ std::string dbname;
+ Options options;
+
+ TransactionDBOptions txn_db_options;
+ bool use_stackable_db_;
+
+ TransactionTestBase(bool use_stackable_db, bool two_write_queue,
+ TxnDBWritePolicy write_policy,
+ WriteOrdering write_ordering)
+ : db(nullptr),
+ special_env(Env::Default()),
+ env(nullptr),
+ use_stackable_db_(use_stackable_db) {
+ options.create_if_missing = true;
+ options.max_write_buffer_number = 2;
+ options.write_buffer_size = 4 * 1024;
+ options.unordered_write = write_ordering == kUnorderedWrite;
+ options.level0_file_num_compaction_trigger = 2;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+ special_env.skip_fsync_ = true;
+ env = new FaultInjectionTestEnv(&special_env);
+ options.env = env;
+ options.two_write_queues = two_write_queue;
+ dbname = test::PerThreadDBPath("transaction_testdb");
+
+ EXPECT_OK(DestroyDB(dbname, options));
+ txn_db_options.transaction_lock_timeout = 0;
+ txn_db_options.default_lock_timeout = 0;
+ txn_db_options.write_policy = write_policy;
+ txn_db_options.rollback_merge_operands = true;
+ // This will stress write unprepared, by forcing write batch flush on every
+ // write.
+ txn_db_options.default_write_batch_flush_threshold = 1;
+ // Write unprepared requires all transactions to be named. This setting
+ // autogenerates the name so that existing tests can pass.
+ txn_db_options.autogenerate_name = true;
+ Status s;
+ if (use_stackable_db == false) {
+ s = TransactionDB::Open(options, txn_db_options, dbname, &db);
+ } else {
+ s = OpenWithStackableDB();
+ }
+ EXPECT_OK(s);
+ }
+
+ ~TransactionTestBase() {
+ delete db;
+ db = nullptr;
+    // This is to skip the assert statement in FaultInjectionTestEnv. There
+    // seems to be a bug in btrfs that makes readdir return recently
+    // unlinked files. By using the default fs we simply ignore errors resulting
+    // from attempting to delete such files in DestroyDB.
+ if (getenv("KEEP_DB") == nullptr) {
+ options.env = Env::Default();
+ EXPECT_OK(DestroyDB(dbname, options));
+ } else {
+ fprintf(stdout, "db is still in %s\n", dbname.c_str());
+ }
+ delete env;
+ }
+
+ Status ReOpenNoDelete() {
+ delete db;
+ db = nullptr;
+ env->AssertNoOpenFile();
+ env->DropUnsyncedFileData();
+ env->ResetState();
+ Status s;
+ if (use_stackable_db_ == false) {
+ s = TransactionDB::Open(options, txn_db_options, dbname, &db);
+ } else {
+ s = OpenWithStackableDB();
+ }
+ assert(!s.ok() || db != nullptr);
+ return s;
+ }
+
+ Status ReOpenNoDelete(std::vector<ColumnFamilyDescriptor>& cfs,
+ std::vector<ColumnFamilyHandle*>* handles) {
+ for (auto h : *handles) {
+ delete h;
+ }
+ handles->clear();
+ delete db;
+ db = nullptr;
+ env->AssertNoOpenFile();
+ env->DropUnsyncedFileData();
+ env->ResetState();
+ Status s;
+ if (use_stackable_db_ == false) {
+ s = TransactionDB::Open(options, txn_db_options, dbname, cfs, handles,
+ &db);
+ } else {
+ s = OpenWithStackableDB(cfs, handles);
+ }
+ assert(!s.ok() || db != nullptr);
+ return s;
+ }
+
+ Status ReOpen() {
+ delete db;
+ db = nullptr;
+ DestroyDB(dbname, options);
+ Status s;
+ if (use_stackable_db_ == false) {
+ s = TransactionDB::Open(options, txn_db_options, dbname, &db);
+ } else {
+ s = OpenWithStackableDB();
+ }
+ assert(db != nullptr);
+ return s;
+ }
+
+ Status OpenWithStackableDB(std::vector<ColumnFamilyDescriptor>& cfs,
+ std::vector<ColumnFamilyHandle*>* handles) {
+ std::vector<size_t> compaction_enabled_cf_indices;
+ TransactionDB::PrepareWrap(&options, &cfs, &compaction_enabled_cf_indices);
+ DB* root_db = nullptr;
+ Options options_copy(options);
+ const bool use_seq_per_batch =
+ txn_db_options.write_policy == WRITE_PREPARED ||
+ txn_db_options.write_policy == WRITE_UNPREPARED;
+ const bool use_batch_per_txn =
+ txn_db_options.write_policy == WRITE_COMMITTED ||
+ txn_db_options.write_policy == WRITE_PREPARED;
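+    // WRITE_PREPARED and WRITE_UNPREPARED consume one sequence number per
+    // batch rather than per key, and WRITE_UNPREPARED may split a transaction
+    // into multiple batches, hence the two flags above.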
+ Status s = DBImpl::Open(options_copy, dbname, cfs, handles, &root_db,
+ use_seq_per_batch, use_batch_per_txn);
+ auto stackable_db = std::make_unique<StackableDB>(root_db);
+ if (s.ok()) {
+ assert(root_db != nullptr);
+ // If WrapStackableDB() returns non-ok, then stackable_db is already
+ // deleted within WrapStackableDB().
+ s = TransactionDB::WrapStackableDB(stackable_db.release(), txn_db_options,
+ compaction_enabled_cf_indices,
+ *handles, &db);
+ }
+ return s;
+ }
+
+ Status OpenWithStackableDB() {
+ std::vector<size_t> compaction_enabled_cf_indices;
+ std::vector<ColumnFamilyDescriptor> column_families{ColumnFamilyDescriptor(
+ kDefaultColumnFamilyName, ColumnFamilyOptions(options))};
+
+ TransactionDB::PrepareWrap(&options, &column_families,
+ &compaction_enabled_cf_indices);
+ std::vector<ColumnFamilyHandle*> handles;
+ DB* root_db = nullptr;
+ Options options_copy(options);
+ const bool use_seq_per_batch =
+ txn_db_options.write_policy == WRITE_PREPARED ||
+ txn_db_options.write_policy == WRITE_UNPREPARED;
+ const bool use_batch_per_txn =
+ txn_db_options.write_policy == WRITE_COMMITTED ||
+ txn_db_options.write_policy == WRITE_PREPARED;
+ Status s = DBImpl::Open(options_copy, dbname, column_families, &handles,
+ &root_db, use_seq_per_batch, use_batch_per_txn);
+ if (!s.ok()) {
+ delete root_db;
+ return s;
+ }
+ StackableDB* stackable_db = new StackableDB(root_db);
+ assert(root_db != nullptr);
+ assert(handles.size() == 1);
+ s = TransactionDB::WrapStackableDB(stackable_db, txn_db_options,
+ compaction_enabled_cf_indices, handles,
+ &db);
+ delete handles[0];
+ if (!s.ok()) {
+ delete stackable_db;
+ }
+ return s;
+ }
+
+ std::atomic<size_t> linked = {0};
+ std::atomic<size_t> exp_seq = {0};
+ std::atomic<size_t> commit_writes = {0};
+ std::atomic<size_t> expected_commits = {0};
+ // Without Prepare, the commit does not write to WAL
+ std::atomic<size_t> with_empty_commits = {0};
+ void TestTxn0(size_t index) {
+ // Test DB's internal txn. It involves no prepare phase nor a commit marker.
+ auto s = db->Put(WriteOptions(), "key" + std::to_string(index), "value");
+ ASSERT_OK(s);
+ if (txn_db_options.write_policy == TxnDBWritePolicy::WRITE_COMMITTED) {
+ // Consume one seq per key
+ exp_seq++;
+ } else {
+ // Consume one seq per batch
+ exp_seq++;
+ if (options.two_write_queues) {
+ // Consume one seq for commit
+ exp_seq++;
+ }
+ }
+ with_empty_commits++;
+ }
+
+ void TestTxn1(size_t index) {
+ // Testing directly writing a write batch. Functionality-wise it is
+ // equivalent to commit without prepare.
+ WriteBatch wb;
+ auto istr = std::to_string(index);
+ ASSERT_OK(wb.Put("k1" + istr, "v1"));
+ ASSERT_OK(wb.Put("k2" + istr, "v2"));
+ ASSERT_OK(wb.Put("k3" + istr, "v3"));
+ auto s = db->Write(WriteOptions(), &wb);
+ if (txn_db_options.write_policy == TxnDBWritePolicy::WRITE_COMMITTED) {
+ // Consume one seq per key
+ exp_seq += 3;
+ } else {
+ // Consume one seq per batch
+ exp_seq++;
+ if (options.two_write_queues) {
+ // Consume one seq for commit
+ exp_seq++;
+ }
+ }
+ ASSERT_OK(s);
+ with_empty_commits++;
+ }
+
+ void TestTxn2(size_t index) {
+ // Commit without prepare. It should write to DB without a commit marker.
+ Transaction* txn =
+ db->BeginTransaction(WriteOptions(), TransactionOptions());
+ auto istr = std::to_string(index);
+ ASSERT_OK(txn->SetName("xid" + istr));
+ ASSERT_OK(txn->Put(Slice("foo" + istr), Slice("bar")));
+ ASSERT_OK(txn->Put(Slice("foo2" + istr), Slice("bar2")));
+ ASSERT_OK(txn->Put(Slice("foo3" + istr), Slice("bar3")));
+ ASSERT_OK(txn->Put(Slice("foo4" + istr), Slice("bar4")));
+ ASSERT_OK(txn->Commit());
+ if (txn_db_options.write_policy == TxnDBWritePolicy::WRITE_COMMITTED) {
+ // Consume one seq per key
+ exp_seq += 4;
+ } else if (txn_db_options.write_policy ==
+ TxnDBWritePolicy::WRITE_PREPARED) {
+ // Consume one seq per batch
+ exp_seq++;
+ if (options.two_write_queues) {
+ // Consume one seq for commit
+ exp_seq++;
+ }
+ } else {
+ // Flushed after each key, consume one seq per flushed batch
+ exp_seq += 4;
+ // WriteUnprepared implements CommitWithoutPrepareInternal by simply
+ // calling Prepare then Commit. Consume one seq for the prepare.
+ exp_seq++;
+ }
+ delete txn;
+ with_empty_commits++;
+ }
+
+ void TestTxn3(size_t index) {
+ // A full 2pc txn that also involves a commit marker.
+ Transaction* txn =
+ db->BeginTransaction(WriteOptions(), TransactionOptions());
+ auto istr = std::to_string(index);
+ ASSERT_OK(txn->SetName("xid" + istr));
+ ASSERT_OK(txn->Put(Slice("foo" + istr), Slice("bar")));
+ ASSERT_OK(txn->Put(Slice("foo2" + istr), Slice("bar2")));
+ ASSERT_OK(txn->Put(Slice("foo3" + istr), Slice("bar3")));
+ ASSERT_OK(txn->Put(Slice("foo4" + istr), Slice("bar4")));
+ ASSERT_OK(txn->Put(Slice("foo5" + istr), Slice("bar5")));
+ expected_commits++;
+ ASSERT_OK(txn->Prepare());
+ commit_writes++;
+ ASSERT_OK(txn->Commit());
+ if (txn_db_options.write_policy == TxnDBWritePolicy::WRITE_COMMITTED) {
+ // Consume one seq per key
+ exp_seq += 5;
+ } else if (txn_db_options.write_policy ==
+ TxnDBWritePolicy::WRITE_PREPARED) {
+ // Consume one seq per batch
+ exp_seq++;
+ // Consume one seq per commit marker
+ exp_seq++;
+ } else {
+ // Flushed after each key, consume one seq per flushed batch
+ exp_seq += 5;
+ // Consume one seq per commit marker
+ exp_seq++;
+ }
+ delete txn;
+ }
+
+ void TestTxn4(size_t index) {
+ // A full 2pc txn that also involves a commit marker.
+ Transaction* txn =
+ db->BeginTransaction(WriteOptions(), TransactionOptions());
+ auto istr = std::to_string(index);
+ ASSERT_OK(txn->SetName("xid" + istr));
+ ASSERT_OK(txn->Put(Slice("foo" + istr), Slice("bar")));
+ ASSERT_OK(txn->Put(Slice("foo2" + istr), Slice("bar2")));
+ ASSERT_OK(txn->Put(Slice("foo3" + istr), Slice("bar3")));
+ ASSERT_OK(txn->Put(Slice("foo4" + istr), Slice("bar4")));
+ ASSERT_OK(txn->Put(Slice("foo5" + istr), Slice("bar5")));
+ expected_commits++;
+ ASSERT_OK(txn->Prepare());
+ commit_writes++;
+ ASSERT_OK(txn->Rollback());
+ if (txn_db_options.write_policy == TxnDBWritePolicy::WRITE_COMMITTED) {
+ // No seq is consumed for deleting the txn buffer
+ exp_seq += 0;
+ } else if (txn_db_options.write_policy ==
+ TxnDBWritePolicy::WRITE_PREPARED) {
+ // Consume one seq per batch
+ exp_seq++;
+ // Consume one seq per rollback batch
+ exp_seq++;
+ if (options.two_write_queues) {
+ // Consume one seq for rollback commit
+ exp_seq++;
+ }
+ } else {
+ // Flushed after each key, consume one seq per flushed batch
+ exp_seq += 5;
+ // Consume one seq per rollback batch
+ exp_seq++;
+ if (options.two_write_queues) {
+ // Consume one seq for rollback commit
+ exp_seq++;
+ }
+ }
+ delete txn;
+ }
+
+ // Test that we can change write policy after a clean shutdown (which would
+ // empty the WAL)
+ void CrossCompatibilityTest(TxnDBWritePolicy from_policy,
+ TxnDBWritePolicy to_policy, bool empty_wal) {
+ TransactionOptions txn_options;
+ ReadOptions read_options;
+ WriteOptions write_options;
+ uint32_t index = 0;
+ Random rnd(1103);
+ options.write_buffer_size = 1024; // To create more sst files
+ std::unordered_map<std::string, std::string> committed_kvs;
+ Transaction* txn;
+
+ txn_db_options.write_policy = from_policy;
+ if (txn_db_options.write_policy == WRITE_COMMITTED) {
+ options.unordered_write = false;
+ }
+ ASSERT_OK(ReOpen());
+
+ for (int i = 0; i < 1024; i++) {
+ auto istr = std::to_string(index);
+ auto k = Slice("foo-" + istr).ToString();
+ auto v = Slice("bar-" + istr).ToString();
+      // For testing duplicate keys
+ auto v2 = Slice("bar2-" + istr).ToString();
+ auto type = rnd.Uniform(4);
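+      // type 0: plain Put, type 1: WriteBatch write, type 2: transaction
+      // commit without prepare, type 3: full 2PC with Prepare then Commit.
+      // Each case writes the key twice to also cover duplicate keys.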
+ switch (type) {
+ case 0:
+ committed_kvs[k] = v;
+ ASSERT_OK(db->Put(write_options, k, v));
+ committed_kvs[k] = v2;
+ ASSERT_OK(db->Put(write_options, k, v2));
+ break;
+ case 1: {
+ WriteBatch wb;
+ committed_kvs[k] = v;
+ ASSERT_OK(wb.Put(k, v));
+ committed_kvs[k] = v2;
+ ASSERT_OK(wb.Put(k, v2));
+ ASSERT_OK(db->Write(write_options, &wb));
+
+ } break;
+ case 2:
+ case 3:
+ txn = db->BeginTransaction(write_options, txn_options);
+ ASSERT_OK(txn->SetName("xid" + istr));
+ committed_kvs[k] = v;
+ ASSERT_OK(txn->Put(k, v));
+ committed_kvs[k] = v2;
+ ASSERT_OK(txn->Put(k, v2));
+
+ if (type == 3) {
+ ASSERT_OK(txn->Prepare());
+ }
+ ASSERT_OK(txn->Commit());
+ delete txn;
+ break;
+ default:
+ FAIL();
+ }
+
+ index++;
+ } // for i
+
+ txn_db_options.write_policy = to_policy;
+ if (txn_db_options.write_policy == WRITE_COMMITTED) {
+ options.unordered_write = false;
+ }
+ auto db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ // Before upgrade/downgrade the WAL must be emptied
+ if (empty_wal) {
+ ASSERT_OK(db_impl->TEST_FlushMemTable());
+ } else {
+ ASSERT_OK(db_impl->FlushWAL(true));
+ }
+ auto s = ReOpenNoDelete();
+ if (empty_wal) {
+ ASSERT_OK(s);
+ } else {
+ // Test that we can detect the WAL that is produced by an incompatible
+ // WritePolicy and fail fast before mis-interpreting the WAL.
+ ASSERT_TRUE(s.IsNotSupported());
+ return;
+ }
+ db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ // Check that WAL is empty
+ VectorLogPtr log_files;
+ ASSERT_OK(db_impl->GetSortedWalFiles(log_files));
+ ASSERT_EQ(0, log_files.size());
+
+ for (auto& kv : committed_kvs) {
+ std::string value;
+ s = db->Get(read_options, kv.first, &value);
+ if (s.IsNotFound()) {
+ printf("key = %s\n", kv.first.c_str());
+ }
+ ASSERT_OK(s);
+ if (kv.second != value) {
+ printf("key = %s\n", kv.first.c_str());
+ }
+ ASSERT_EQ(kv.second, value);
+ }
+ }
+};
+
+class TransactionTest
+ : public TransactionTestBase,
+ virtual public ::testing::WithParamInterface<
+ std::tuple<bool, bool, TxnDBWritePolicy, WriteOrdering>> {
+ public:
+ TransactionTest()
+ : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()),
+ std::get<2>(GetParam()), std::get<3>(GetParam())){};
+};
+
+class TransactionStressTest : public TransactionTest {};
+
+class MySQLStyleTransactionTest
+ : public TransactionTestBase,
+ virtual public ::testing::WithParamInterface<
+ std::tuple<bool, bool, TxnDBWritePolicy, WriteOrdering, bool>> {
+ public:
+ MySQLStyleTransactionTest()
+ : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()),
+ std::get<2>(GetParam()), std::get<3>(GetParam())),
+ with_slow_threads_(std::get<4>(GetParam())) {
+ if (with_slow_threads_ &&
+ (txn_db_options.write_policy == WRITE_PREPARED ||
+ txn_db_options.write_policy == WRITE_UNPREPARED)) {
+      // The corner case with slow threads involves the caches overflowing,
+      // which would not happen even with artificial delays. To help
+      // such cases show up we lower the size of the cache-related data
+      // structures.
+ txn_db_options.wp_snapshot_cache_bits = 1;
+ txn_db_options.wp_commit_cache_bits = 10;
+ options.write_buffer_size = 1024;
+ EXPECT_OK(ReOpen());
+ }
+ };
+
+ protected:
+  // Also emulate slow threads by adding artificial delays
+ const bool with_slow_threads_;
+};
+
+class WriteCommittedTxnWithTsTest
+ : public TransactionTestBase,
+ public ::testing::WithParamInterface<std::tuple<bool, bool, bool>> {
+ public:
+ WriteCommittedTxnWithTsTest()
+ : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()),
+ WRITE_COMMITTED, kOrderedWrite) {}
+ ~WriteCommittedTxnWithTsTest() override {
+ for (auto* h : handles_) {
+ delete h;
+ }
+ }
+
+ Status GetFromDb(ReadOptions read_opts, ColumnFamilyHandle* column_family,
+ const Slice& key, TxnTimestamp ts, std::string* value) {
+ std::string ts_buf;
+ PutFixed64(&ts_buf, ts);
+ Slice ts_slc = ts_buf;
+ read_opts.timestamp = &ts_slc;
+ assert(db);
+ return db->Get(read_opts, column_family, key, value);
+ }
+
+ Transaction* NewTxn(WriteOptions write_opts, TransactionOptions txn_opts) {
+ assert(db);
+ auto* txn = db->BeginTransaction(write_opts, txn_opts);
+ assert(txn);
+ const bool enable_indexing = std::get<2>(GetParam());
+ if (enable_indexing) {
+ txn->EnableIndexing();
+ } else {
+ txn->DisableIndexing();
+ }
+ return txn;
+ }
+
+ protected:
+ std::vector<ColumnFamilyHandle*> handles_{};
+};
+
+class TimestampedSnapshotWithTsSanityCheck
+ : public TransactionTestBase,
+ public ::testing::WithParamInterface<
+ std::tuple<bool, bool, TxnDBWritePolicy, WriteOrdering>> {
+ public:
+ explicit TimestampedSnapshotWithTsSanityCheck()
+ : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()),
+ std::get<2>(GetParam()), std::get<3>(GetParam())) {}
+ ~TimestampedSnapshotWithTsSanityCheck() override {
+ for (auto* h : handles_) {
+ delete h;
+ }
+ }
+
+ protected:
+ std::vector<ColumnFamilyHandle*> handles_{};
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/transactions/transaction_util.cc b/src/rocksdb/utilities/transactions/transaction_util.cc
new file mode 100644
index 000000000..360edc8ec
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/transaction_util.cc
@@ -0,0 +1,206 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/transaction_util.h"
+
+#include <cinttypes>
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "util/cast_util.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status TransactionUtil::CheckKeyForConflicts(
+ DBImpl* db_impl, ColumnFamilyHandle* column_family, const std::string& key,
+ SequenceNumber snap_seq, const std::string* const read_ts, bool cache_only,
+ ReadCallback* snap_checker, SequenceNumber min_uncommitted) {
+ Status result;
+
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ auto cfd = cfh->cfd();
+ SuperVersion* sv = db_impl->GetAndRefSuperVersion(cfd);
+
+ if (sv == nullptr) {
+ result = Status::InvalidArgument("Could not access column family " +
+ cfh->GetName());
+ }
+
+ if (result.ok()) {
+ SequenceNumber earliest_seq =
+ db_impl->GetEarliestMemTableSequenceNumber(sv, true);
+
+ result = CheckKey(db_impl, sv, earliest_seq, snap_seq, key, read_ts,
+ cache_only, snap_checker, min_uncommitted);
+
+ db_impl->ReturnAndCleanupSuperVersion(cfd, sv);
+ }
+
+ return result;
+}
+
+Status TransactionUtil::CheckKey(DBImpl* db_impl, SuperVersion* sv,
+ SequenceNumber earliest_seq,
+ SequenceNumber snap_seq,
+ const std::string& key,
+ const std::string* const read_ts,
+ bool cache_only, ReadCallback* snap_checker,
+ SequenceNumber min_uncommitted) {
+  // When `min_uncommitted` is provided, keys are not always committed
+  // in sequence number order, and `snap_checker` is needed to check whether
+  // a specific sequence number in the database is visible to the transaction.
+  // So `snap_checker` must be provided.
+ assert(min_uncommitted == kMaxSequenceNumber || snap_checker != nullptr);
+
+ Status result;
+ bool need_to_read_sst = false;
+
+ // Since it would be too slow to check the SST files, we will only use
+ // the memtables to check whether there have been any recent writes
+ // to this key after it was accessed in this transaction. But if the
+ // Memtables do not contain a long enough history, we must fail the
+ // transaction.
+ if (earliest_seq == kMaxSequenceNumber) {
+ // The age of this memtable is unknown. Cannot rely on it to check
+ // for recent writes. This error shouldn't happen often in practice as
+ // the Memtable should have a valid earliest sequence number except in some
+ // corner cases (such as error cases during recovery).
+ need_to_read_sst = true;
+
+ if (cache_only) {
+ result = Status::TryAgain(
+ "Transaction could not check for conflicts as the MemTable does not "
+ "contain a long enough history to check write at SequenceNumber: ",
+ std::to_string(snap_seq));
+ }
+ } else if (snap_seq < earliest_seq || min_uncommitted <= earliest_seq) {
+    // Use <= for min_uncommitted since earliest_seq is actually the largest
+    // seq before this memtable was created.
+ need_to_read_sst = true;
+
+ if (cache_only) {
+ // The age of this memtable is too new to use to check for recent
+ // writes.
+ char msg[300];
+ snprintf(msg, sizeof(msg),
+ "Transaction could not check for conflicts for operation at "
+ "SequenceNumber %" PRIu64
+ " as the MemTable only contains changes newer than "
+ "SequenceNumber %" PRIu64
+ ". Increasing the value of the "
+ "max_write_buffer_size_to_maintain option could reduce the "
+ "frequency "
+ "of this error.",
+ snap_seq, earliest_seq);
+ result = Status::TryAgain(msg);
+ }
+ }
+
+ if (result.ok()) {
+ SequenceNumber seq = kMaxSequenceNumber;
+ std::string timestamp;
+ bool found_record_for_key = false;
+
+ // When min_uncommitted == kMaxSequenceNumber, writes are committed in
+ // sequence number order, so only keys larger than `snap_seq` can cause
+ // conflict.
+ // When min_uncommitted != kMaxSequenceNumber, keys lower than
+    // min_uncommitted will not trigger conflicts, while keys larger than
+    // min_uncommitted might create conflicts, so we need to read them out
+    // from the DB and call snap_checker to determine whether they are
+    // visible. So only keys lower than min_uncommitted can be skipped.
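+    // Illustrative (hypothetical) numbers: with snap_seq = 100 and
+    // min_uncommitted = 90, a record at seq 80 is skipped, while records at
+    // seq 95 or 120 are read out and resolved via snap_checker->IsVisible().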
+ SequenceNumber lower_bound_seq =
+ (min_uncommitted == kMaxSequenceNumber) ? snap_seq : min_uncommitted;
+ Status s = db_impl->GetLatestSequenceForKey(
+ sv, key, !need_to_read_sst, lower_bound_seq, &seq,
+ !read_ts ? nullptr : &timestamp, &found_record_for_key,
+ /*is_blob_index=*/nullptr);
+
+ if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+ result = s;
+ } else if (found_record_for_key) {
+ bool write_conflict = snap_checker == nullptr
+ ? snap_seq < seq
+ : !snap_checker->IsVisible(seq);
+ // Perform conflict checking based on timestamp if applicable.
+ if (!write_conflict && read_ts != nullptr) {
+ ColumnFamilyData* cfd = sv->cfd;
+ assert(cfd);
+ const Comparator* const ucmp = cfd->user_comparator();
+ assert(ucmp);
+ assert(read_ts->size() == ucmp->timestamp_size());
+ assert(read_ts->size() == timestamp.size());
+        // Write conflict if *read_ts < timestamp.
+ write_conflict = ucmp->CompareTimestamp(*read_ts, timestamp) < 0;
+ }
+ if (write_conflict) {
+ result = Status::Busy();
+ }
+ }
+ }
+
+ return result;
+}
+
+Status TransactionUtil::CheckKeysForConflicts(DBImpl* db_impl,
+ const LockTracker& tracker,
+ bool cache_only) {
+ Status result;
+
+ std::unique_ptr<LockTracker::ColumnFamilyIterator> cf_it(
+ tracker.GetColumnFamilyIterator());
+ assert(cf_it != nullptr);
+ while (cf_it->HasNext()) {
+ ColumnFamilyId cf = cf_it->Next();
+
+ SuperVersion* sv = db_impl->GetAndRefSuperVersion(cf);
+ if (sv == nullptr) {
+ result = Status::InvalidArgument("Could not access column family " +
+ std::to_string(cf));
+ break;
+ }
+
+ SequenceNumber earliest_seq =
+ db_impl->GetEarliestMemTableSequenceNumber(sv, true);
+
+ // For each of the keys in this transaction, check to see if someone has
+ // written to this key since the start of the transaction.
+ std::unique_ptr<LockTracker::KeyIterator> key_it(
+ tracker.GetKeyIterator(cf));
+ assert(key_it != nullptr);
+ while (key_it->HasNext()) {
+ const std::string& key = key_it->Next();
+ PointLockStatus status = tracker.GetPointLockStatus(cf, key);
+ const SequenceNumber key_seq = status.seq;
+
+ // TODO: support timestamp-based conflict checking.
+ // CheckKeysForConflicts() is currently used only by optimistic
+ // transactions.
+ result = CheckKey(db_impl, sv, earliest_seq, key_seq, key,
+ /*read_ts=*/nullptr, cache_only);
+ if (!result.ok()) {
+ break;
+ }
+ }
+
+ db_impl->ReturnAndCleanupSuperVersion(cf, sv);
+
+ if (!result.ok()) {
+ break;
+ }
+ }
+
+ return result;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/transaction_util.h b/src/rocksdb/utilities/transactions/transaction_util.h
new file mode 100644
index 000000000..a349ba87a
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/transaction_util.h
@@ -0,0 +1,85 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <unordered_map>
+
+#include "db/dbformat.h"
+#include "db/read_callback.h"
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "utilities/transactions/lock/lock_tracker.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+struct SuperVersion;
+class WriteBatchWithIndex;
+
+class TransactionUtil {
+ public:
+ // Verifies there have been no commits to this key in the db since this
+  // sequence number. If user-defined timestamp is enabled, also checks that
+  // there have been no commits to this key in the db since the given ts.
+ //
+ // If cache_only is true, then this function will not attempt to read any
+ // SST files. This will make it more likely this function will
+ // return an error if it is unable to determine if there are any conflicts.
+ //
+ // See comment of CheckKey() for explanation of `snap_seq`, `ts`,
+ // `snap_checker` and `min_uncommitted`.
+ //
+ // Returns OK on success, BUSY if there is a conflicting write, or other error
+ // status for any unexpected errors.
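+  //
+  // A minimal, illustrative (hypothetical) call site:
+  //   Status s = TransactionUtil::CheckKeyForConflicts(
+  //       db_impl, cfh, key, snapshot_seq, /*ts=*/nullptr,
+  //       /*cache_only=*/true);
+  //   if (s.IsBusy()) { /* conflicting write detected; abort or retry */ }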
+ static Status CheckKeyForConflicts(
+ DBImpl* db_impl, ColumnFamilyHandle* column_family,
+ const std::string& key, SequenceNumber snap_seq,
+ const std::string* const ts, bool cache_only,
+ ReadCallback* snap_checker = nullptr,
+ SequenceNumber min_uncommitted = kMaxSequenceNumber);
+
+  // For each (key, SequenceNumber) pair tracked by the LockTracker, this
+  // function
+ // will verify there have been no writes to the key in the db since that
+ // sequence number.
+ //
+ // Returns OK on success, BUSY if there is a conflicting write, or other error
+ // status for any unexpected errors.
+ //
+ // REQUIRED:
+ // This function should only be called on the write thread or if the
+ // mutex is held.
+ // tracker must support point lock.
+ static Status CheckKeysForConflicts(DBImpl* db_impl,
+ const LockTracker& tracker,
+ bool cache_only);
+
+ private:
+  // If `snap_checker` == nullptr, writes are always committed in sequence
+  // number order. Writes with sequence number <= `snap_seq` do not conflict,
+  // and writes to `key` with sequence number > `snap_seq` trigger a conflict.
+ // If `snap_checker` != nullptr, writes may not commit in sequence number
+ // order. In this case `min_uncommitted` is a lower bound.
+ // seq < `min_uncommitted`: no conflict
+  // seq > `snap_seq`: conflict
+ // `min_uncommitted` <= seq <= `snap_seq`: call `snap_checker` to determine.
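+  // Illustrative (hypothetical) values: with `snap_seq` = 100 and
+  // `min_uncommitted` = 90, seq 80 never conflicts, seq 120 conflicts, and
+  // seq 95 is resolved by calling `snap_checker`.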
+ //
+ // If user-defined timestamp is enabled, a write conflict is detected if an
+ // operation for `key` with timestamp greater than `ts` exists.
+ static Status CheckKey(DBImpl* db_impl, SuperVersion* sv,
+ SequenceNumber earliest_seq, SequenceNumber snap_seq,
+ const std::string& key, const std::string* const ts,
+ bool cache_only, ReadCallback* snap_checker = nullptr,
+ SequenceNumber min_uncommitted = kMaxSequenceNumber);
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc b/src/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc
new file mode 100644
index 000000000..94b8201f7
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc
@@ -0,0 +1,588 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "utilities/merge_operators.h"
+#ifndef ROCKSDB_LITE
+
+#include "test_util/testutil.h"
+#include "utilities/transactions/transaction_test.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+INSTANTIATE_TEST_CASE_P(
+ DBAsBaseDB, WriteCommittedTxnWithTsTest,
+ ::testing::Values(std::make_tuple(false, /*two_write_queue=*/false,
+ /*enable_indexing=*/false),
+ std::make_tuple(false, /*two_write_queue=*/true,
+ /*enable_indexing=*/false),
+ std::make_tuple(false, /*two_write_queue=*/false,
+ /*enable_indexing=*/true),
+ std::make_tuple(false, /*two_write_queue=*/true,
+ /*enable_indexing=*/true)));
+
+INSTANTIATE_TEST_CASE_P(
+ DBAsStackableDB, WriteCommittedTxnWithTsTest,
+ ::testing::Values(std::make_tuple(true, /*two_write_queue=*/false,
+ /*enable_indexing=*/false),
+ std::make_tuple(true, /*two_write_queue=*/true,
+ /*enable_indexing=*/false),
+ std::make_tuple(true, /*two_write_queue=*/false,
+ /*enable_indexing=*/true),
+ std::make_tuple(true, /*two_write_queue=*/true,
+ /*enable_indexing=*/true)));
+
+TEST_P(WriteCommittedTxnWithTsTest, SanityChecks) {
+ ASSERT_OK(ReOpenNoDelete());
+
+ ColumnFamilyOptions cf_opts;
+ cf_opts.comparator = test::BytewiseComparatorWithU64TsWrapper();
+ const std::string test_cf_name = "test_cf";
+ ColumnFamilyHandle* cfh = nullptr;
+ assert(db);
+ ASSERT_OK(db->CreateColumnFamily(cf_opts, test_cf_name, &cfh));
+ delete cfh;
+ cfh = nullptr;
+
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+ cf_descs.emplace_back(test_cf_name, cf_opts);
+ ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_));
+
+ std::unique_ptr<Transaction> txn(
+ NewTxn(WriteOptions(), TransactionOptions()));
+ assert(txn);
+ ASSERT_OK(txn->Put(handles_[1], "foo", "value"));
+ ASSERT_TRUE(txn->Commit().IsInvalidArgument());
+
+ auto* pessimistic_txn =
+ static_cast_with_check<PessimisticTransaction>(txn.get());
+ ASSERT_TRUE(
+ pessimistic_txn->CommitBatch(/*batch=*/nullptr).IsInvalidArgument());
+
+ {
+ WriteBatchWithIndex* wbwi = txn->GetWriteBatch();
+ assert(wbwi);
+ WriteBatch* wb = wbwi->GetWriteBatch();
+ assert(wb);
+ // Write a key to the batch for nonexisting cf.
+ ASSERT_OK(WriteBatchInternal::Put(wb, /*column_family_id=*/10, /*key=*/"",
+ /*value=*/""));
+ }
+
+ ASSERT_OK(txn->SetCommitTimestamp(20));
+
+ ASSERT_TRUE(txn->Commit().IsInvalidArgument());
+ txn.reset();
+
+ std::unique_ptr<Transaction> txn1(
+ NewTxn(WriteOptions(), TransactionOptions()));
+ assert(txn1);
+ ASSERT_OK(txn1->SetName("txn1"));
+ ASSERT_OK(txn1->Put(handles_[1], "foo", "value"));
+ {
+ WriteBatchWithIndex* wbwi = txn1->GetWriteBatch();
+ assert(wbwi);
+ WriteBatch* wb = wbwi->GetWriteBatch();
+ assert(wb);
+ // Write a key to the batch for non-existing cf.
+ ASSERT_OK(WriteBatchInternal::Put(wb, /*column_family_id=*/10, /*key=*/"",
+ /*value=*/""));
+ }
+ ASSERT_OK(txn1->Prepare());
+ ASSERT_OK(txn1->SetCommitTimestamp(21));
+ ASSERT_TRUE(txn1->Commit().IsInvalidArgument());
+ txn1.reset();
+}
+
+TEST_P(WriteCommittedTxnWithTsTest, ReOpenWithTimestamp) {
+ options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ ASSERT_OK(ReOpenNoDelete());
+
+ ColumnFamilyOptions cf_opts;
+ cf_opts.comparator = test::BytewiseComparatorWithU64TsWrapper();
+ const std::string test_cf_name = "test_cf";
+ ColumnFamilyHandle* cfh = nullptr;
+ assert(db);
+ ASSERT_OK(db->CreateColumnFamily(cf_opts, test_cf_name, &cfh));
+ delete cfh;
+ cfh = nullptr;
+
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+ cf_descs.emplace_back(test_cf_name, cf_opts);
+ ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_));
+
+ std::unique_ptr<Transaction> txn0(
+ NewTxn(WriteOptions(), TransactionOptions()));
+ assert(txn0);
+ ASSERT_OK(txn0->Put(handles_[1], "foo", "value"));
+ ASSERT_OK(txn0->SetName("txn0"));
+ ASSERT_OK(txn0->Prepare());
+ ASSERT_TRUE(txn0->Commit().IsInvalidArgument());
+ txn0.reset();
+
+ std::unique_ptr<Transaction> txn1(
+ NewTxn(WriteOptions(), TransactionOptions()));
+ assert(txn1);
+ ASSERT_OK(txn1->Put(handles_[1], "foo", "value1"));
+ {
+ std::string buf;
+ PutFixed64(&buf, 23);
+ ASSERT_OK(txn1->Put("id", buf));
+ ASSERT_OK(txn1->Merge("id", buf));
+ }
+ ASSERT_OK(txn1->SetName("txn1"));
+ ASSERT_OK(txn1->Prepare());
+ ASSERT_OK(txn1->SetCommitTimestamp(/*ts=*/23));
+ ASSERT_OK(txn1->Commit());
+ txn1.reset();
+
+ {
+ std::string value;
+ const Status s =
+ GetFromDb(ReadOptions(), handles_[1], "foo", /*ts=*/23, &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("value1", value);
+ }
+
+ {
+ std::string value;
+ const Status s = db->Get(ReadOptions(), handles_[0], "id", &value);
+ ASSERT_OK(s);
+ uint64_t ival = 0;
+ Slice value_slc = value;
+ bool result = GetFixed64(&value_slc, &ival);
+ assert(result);
+ ASSERT_EQ(46, ival);
+ }
+}
+
+TEST_P(WriteCommittedTxnWithTsTest, RecoverFromWal) {
+ ASSERT_OK(ReOpenNoDelete());
+
+ ColumnFamilyOptions cf_opts;
+ cf_opts.comparator = test::BytewiseComparatorWithU64TsWrapper();
+ const std::string test_cf_name = "test_cf";
+ ColumnFamilyHandle* cfh = nullptr;
+ assert(db);
+ ASSERT_OK(db->CreateColumnFamily(cf_opts, test_cf_name, &cfh));
+ delete cfh;
+ cfh = nullptr;
+
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+ cf_descs.emplace_back(test_cf_name, cf_opts);
+ options.avoid_flush_during_shutdown = true;
+ ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_));
+
+ std::unique_ptr<Transaction> txn0(
+ NewTxn(WriteOptions(), TransactionOptions()));
+ assert(txn0);
+ ASSERT_OK(txn0->Put(handles_[1], "foo", "foo_value"));
+ ASSERT_OK(txn0->SetName("txn0"));
+ ASSERT_OK(txn0->Prepare());
+
+ WriteOptions write_opts;
+ write_opts.sync = true;
+ std::unique_ptr<Transaction> txn1(NewTxn(write_opts, TransactionOptions()));
+ assert(txn1);
+ ASSERT_OK(txn1->Put("bar", "bar_value_1"));
+ ASSERT_OK(txn1->Put(handles_[1], "bar", "bar_value_1"));
+ ASSERT_OK(txn1->SetName("txn1"));
+ ASSERT_OK(txn1->Prepare());
+ ASSERT_OK(txn1->SetCommitTimestamp(/*ts=*/23));
+ ASSERT_OK(txn1->Commit());
+ txn1.reset();
+
+ std::unique_ptr<Transaction> txn2(NewTxn(write_opts, TransactionOptions()));
+ assert(txn2);
+ ASSERT_OK(txn2->Put("key1", "value_3"));
+ ASSERT_OK(txn2->Put(handles_[1], "key1", "value_3"));
+ ASSERT_OK(txn2->SetCommitTimestamp(/*ts=*/24));
+ ASSERT_OK(txn2->Commit());
+ txn2.reset();
+
+ txn0.reset();
+
+ ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_));
+
+ {
+ std::string value;
+ Status s = GetFromDb(ReadOptions(), handles_[1], "foo", /*ts=*/23, &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = db->Get(ReadOptions(), handles_[0], "bar", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("bar_value_1", value);
+
+ value.clear();
+ s = GetFromDb(ReadOptions(), handles_[1], "bar", /*ts=*/23, &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("bar_value_1", value);
+
+ s = GetFromDb(ReadOptions(), handles_[1], "key1", /*ts=*/23, &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = db->Get(ReadOptions(), handles_[0], "key1", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("value_3", value);
+
+ s = GetFromDb(ReadOptions(), handles_[1], "key1", /*ts=*/24, &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("value_3", value);
+ }
+}
+
+TEST_P(WriteCommittedTxnWithTsTest, TransactionDbLevelApi) {
+ ASSERT_OK(ReOpenNoDelete());
+
+ ColumnFamilyOptions cf_options;
+ cf_options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ cf_options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+ const std::string test_cf_name = "test_cf";
+ ColumnFamilyHandle* cfh = nullptr;
+ assert(db);
+ ASSERT_OK(db->CreateColumnFamily(cf_options, test_cf_name, &cfh));
+ delete cfh;
+ cfh = nullptr;
+
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+ cf_descs.emplace_back(test_cf_name, cf_options);
+
+ ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_));
+
+ std::string key_str = "tes_key";
+ std::string ts_str;
+ std::string value_str = "test_value";
+ PutFixed64(&ts_str, 100);
+ Slice value = value_str;
+
+ assert(db);
+ ASSERT_TRUE(
+ db->Put(WriteOptions(), handles_[1], "foo", "bar").IsNotSupported());
+ ASSERT_TRUE(db->Delete(WriteOptions(), handles_[1], "foo").IsNotSupported());
+ ASSERT_TRUE(
+ db->SingleDelete(WriteOptions(), handles_[1], "foo").IsNotSupported());
+ ASSERT_TRUE(
+ db->Merge(WriteOptions(), handles_[1], "foo", "+1").IsNotSupported());
+ WriteBatch wb1(/*reserved_bytes=*/0, /*max_bytes=*/0,
+ /*protection_bytes_per_key=*/0, /*default_cf_ts_sz=*/0);
+ ASSERT_OK(wb1.Put(handles_[1], key_str, ts_str, value));
+ ASSERT_TRUE(db->Write(WriteOptions(), &wb1).IsNotSupported());
+ ASSERT_TRUE(db->Write(WriteOptions(), TransactionDBWriteOptimizations(), &wb1)
+ .IsNotSupported());
+ auto* pessimistic_txn_db =
+ static_cast_with_check<PessimisticTransactionDB>(db);
+ assert(pessimistic_txn_db);
+ ASSERT_TRUE(
+ pessimistic_txn_db->WriteWithConcurrencyControl(WriteOptions(), &wb1)
+ .IsNotSupported());
+
+ ASSERT_OK(db->Put(WriteOptions(), "foo", "value"));
+ ASSERT_OK(db->Put(WriteOptions(), "bar", "value"));
+ ASSERT_OK(db->Delete(WriteOptions(), "bar"));
+ ASSERT_OK(db->SingleDelete(WriteOptions(), "foo"));
+ ASSERT_OK(db->Put(WriteOptions(), "key", "value"));
+ ASSERT_OK(db->Merge(WriteOptions(), "key", "_more"));
+ WriteBatch wb2(/*reserved_bytes=*/0, /*max_bytes=*/0,
+ /*protection_bytes_per_key=*/0, /*default_cf_ts_sz=*/0);
+ ASSERT_OK(wb2.Put(key_str, value));
+ ASSERT_OK(db->Write(WriteOptions(), &wb2));
+ ASSERT_OK(db->Write(WriteOptions(), TransactionDBWriteOptimizations(), &wb2));
+ ASSERT_OK(
+ pessimistic_txn_db->WriteWithConcurrencyControl(WriteOptions(), &wb2));
+
+ std::unique_ptr<Transaction> txn(
+ NewTxn(WriteOptions(), TransactionOptions()));
+ assert(txn);
+
+ WriteBatch wb3(/*reserved_bytes=*/0, /*max_bytes=*/0,
+ /*protection_bytes_per_key=*/0, /*default_cf_ts_sz=*/0);
+
+ ASSERT_OK(wb3.Put(handles_[1], "key", "value"));
+ auto* pessimistic_txn =
+ static_cast_with_check<PessimisticTransaction>(txn.get());
+ assert(pessimistic_txn);
+ ASSERT_TRUE(pessimistic_txn->CommitBatch(&wb3).IsNotSupported());
+
+ txn.reset();
+}
+
+TEST_P(WriteCommittedTxnWithTsTest, Merge) {
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ ASSERT_OK(ReOpenNoDelete());
+
+ ColumnFamilyOptions cf_options;
+ cf_options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+ cf_options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ const std::string test_cf_name = "test_cf";
+ ColumnFamilyHandle* cfh = nullptr;
+ assert(db);
+ ASSERT_OK(db->CreateColumnFamily(cf_options, test_cf_name, &cfh));
+ delete cfh;
+ cfh = nullptr;
+
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+ cf_descs.emplace_back(test_cf_name, Options(DBOptions(), cf_options));
+ options.avoid_flush_during_shutdown = true;
+
+ ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_));
+
+ std::unique_ptr<Transaction> txn(
+ NewTxn(WriteOptions(), TransactionOptions()));
+ assert(txn);
+ ASSERT_OK(txn->Put(handles_[1], "foo", "bar"));
+ ASSERT_OK(txn->Merge(handles_[1], "foo", "1"));
+ ASSERT_OK(txn->SetCommitTimestamp(24));
+ ASSERT_OK(txn->Commit());
+ txn.reset();
+ {
+ std::string value;
+ const Status s =
+ GetFromDb(ReadOptions(), handles_[1], "foo", /*ts=*/24, &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("bar,1", value);
+ }
+}
+
+TEST_P(WriteCommittedTxnWithTsTest, GetForUpdate) {
+ ASSERT_OK(ReOpenNoDelete());
+
+ ColumnFamilyOptions cf_options;
+ cf_options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+ const std::string test_cf_name = "test_cf";
+ ColumnFamilyHandle* cfh = nullptr;
+ assert(db);
+ ASSERT_OK(db->CreateColumnFamily(cf_options, test_cf_name, &cfh));
+ delete cfh;
+ cfh = nullptr;
+
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+ cf_descs.emplace_back(test_cf_name, Options(DBOptions(), cf_options));
+ options.avoid_flush_during_shutdown = true;
+
+ ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_));
+
+ std::unique_ptr<Transaction> txn0(
+ NewTxn(WriteOptions(), TransactionOptions()));
+
+ std::unique_ptr<Transaction> txn1(
+ NewTxn(WriteOptions(), TransactionOptions()));
+ ASSERT_OK(txn1->Put(handles_[1], "key", "value1"));
+ ASSERT_OK(txn1->SetCommitTimestamp(24));
+ ASSERT_OK(txn1->Commit());
+ txn1.reset();
+
+ std::string value;
+ ASSERT_OK(txn0->SetReadTimestampForValidation(23));
+ ASSERT_TRUE(
+ txn0->GetForUpdate(ReadOptions(), handles_[1], "key", &value).IsBusy());
+ ASSERT_OK(txn0->Rollback());
+ txn0.reset();
+
+ std::unique_ptr<Transaction> txn2(
+ NewTxn(WriteOptions(), TransactionOptions()));
+ ASSERT_OK(txn2->SetReadTimestampForValidation(25));
+ ASSERT_OK(txn2->GetForUpdate(ReadOptions(), handles_[1], "key", &value));
+ ASSERT_OK(txn2->SetCommitTimestamp(26));
+ ASSERT_OK(txn2->Commit());
+ txn2.reset();
+}
+
+TEST_P(WriteCommittedTxnWithTsTest, BlindWrite) {
+ ASSERT_OK(ReOpenNoDelete());
+
+ ColumnFamilyOptions cf_options;
+ cf_options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+ const std::string test_cf_name = "test_cf";
+ ColumnFamilyHandle* cfh = nullptr;
+ assert(db);
+ ASSERT_OK(db->CreateColumnFamily(cf_options, test_cf_name, &cfh));
+ delete cfh;
+ cfh = nullptr;
+
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+ cf_descs.emplace_back(test_cf_name, Options(DBOptions(), cf_options));
+ options.avoid_flush_during_shutdown = true;
+ ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_));
+
+ std::unique_ptr<Transaction> txn0(
+ NewTxn(WriteOptions(), TransactionOptions()));
+ assert(txn0);
+ std::unique_ptr<Transaction> txn1(
+ NewTxn(WriteOptions(), TransactionOptions()));
+ assert(txn1);
+
+ {
+ std::string value;
+ ASSERT_OK(txn0->SetReadTimestampForValidation(100));
+ // Lock "key".
+ ASSERT_TRUE(txn0->GetForUpdate(ReadOptions(), handles_[1], "key", &value)
+ .IsNotFound());
+ }
+
+ ASSERT_OK(txn0->Put(handles_[1], "key", "value0"));
+ ASSERT_OK(txn0->SetCommitTimestamp(101));
+ ASSERT_OK(txn0->Commit());
+
+ ASSERT_OK(txn1->Put(handles_[1], "key", "value1"));
+  // In reality, the caller needs to ensure that the commit_ts of txn1 is
+  // greater than the commit_ts of txn0, which holds under lock-based
+  // concurrency control.
+ ASSERT_OK(txn1->SetCommitTimestamp(102));
+ ASSERT_OK(txn1->Commit());
+
+ txn0.reset();
+ txn1.reset();
+}
+
+TEST_P(WriteCommittedTxnWithTsTest, RefineReadTimestamp) {
+ ASSERT_OK(ReOpenNoDelete());
+
+ ColumnFamilyOptions cf_options;
+ cf_options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+ const std::string test_cf_name = "test_cf";
+ ColumnFamilyHandle* cfh = nullptr;
+ assert(db);
+ ASSERT_OK(db->CreateColumnFamily(cf_options, test_cf_name, &cfh));
+ delete cfh;
+ cfh = nullptr;
+
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+ cf_descs.emplace_back(test_cf_name, Options(DBOptions(), cf_options));
+ options.avoid_flush_during_shutdown = true;
+
+ ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_));
+
+ std::unique_ptr<Transaction> txn0(
+ NewTxn(WriteOptions(), TransactionOptions()));
+ assert(txn0);
+
+ std::unique_ptr<Transaction> txn1(
+ NewTxn(WriteOptions(), TransactionOptions()));
+ assert(txn1);
+
+ {
+ ASSERT_OK(txn0->SetReadTimestampForValidation(100));
+ // Lock "key0", "key1", ..., "key4".
+ for (int i = 0; i < 5; ++i) {
+ std::string value;
+ ASSERT_TRUE(txn0->GetForUpdate(ReadOptions(), handles_[1],
+ "key" + std::to_string(i), &value)
+ .IsNotFound());
+ }
+ }
+ ASSERT_OK(txn1->Put(handles_[1], "key5", "value5_0"));
+ ASSERT_OK(txn1->SetName("txn1"));
+ ASSERT_OK(txn1->Prepare());
+ ASSERT_OK(txn1->SetCommitTimestamp(101));
+ ASSERT_OK(txn1->Commit());
+ txn1.reset();
+
+ {
+ std::string value;
+ ASSERT_TRUE(txn0->GetForUpdate(ReadOptions(), handles_[1], "key5", &value)
+ .IsBusy());
+ ASSERT_OK(txn0->SetReadTimestampForValidation(102));
+ ASSERT_OK(txn0->GetForUpdate(ReadOptions(), handles_[1], "key5", &value));
+ ASSERT_EQ("value5_0", value);
+ }
+
+ for (int i = 0; i < 6; ++i) {
+ ASSERT_OK(txn0->Put(handles_[1], "key" + std::to_string(i),
+ "value" + std::to_string(i)));
+ }
+ ASSERT_OK(txn0->SetName("txn0"));
+ ASSERT_OK(txn0->Prepare());
+ ASSERT_OK(txn0->SetCommitTimestamp(103));
+ ASSERT_OK(txn0->Commit());
+ txn0.reset();
+}
+
+TEST_P(WriteCommittedTxnWithTsTest, CheckKeysForConflicts) {
+ options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+ ASSERT_OK(ReOpen());
+
+ std::unique_ptr<Transaction> txn1(
+ db->BeginTransaction(WriteOptions(), TransactionOptions()));
+ assert(txn1);
+
+ std::unique_ptr<Transaction> txn2(
+ db->BeginTransaction(WriteOptions(), TransactionOptions()));
+ assert(txn2);
+ ASSERT_OK(txn2->Put("foo", "v0"));
+ ASSERT_OK(txn2->SetCommitTimestamp(10));
+ ASSERT_OK(txn2->Commit());
+ txn2.reset();
+
+ // txn1 takes a snapshot after txn2 commits. The writes of txn2 have
+ // a smaller seqno than txn1's snapshot, thus should not affect conflict
+ // checking.
+ txn1->SetSnapshot();
+
+ std::unique_ptr<Transaction> txn3(
+ db->BeginTransaction(WriteOptions(), TransactionOptions()));
+ assert(txn3);
+ ASSERT_OK(txn3->SetReadTimestampForValidation(20));
+ std::string dontcare;
+ ASSERT_OK(txn3->GetForUpdate(ReadOptions(), "foo", &dontcare));
+ ASSERT_OK(txn3->SingleDelete("foo"));
+ ASSERT_OK(txn3->SetName("txn3"));
+ ASSERT_OK(txn3->Prepare());
+ ASSERT_OK(txn3->SetCommitTimestamp(30));
+ // txn3 reads at ts=20 > txn2's commit timestamp, and commits at ts=30.
+ // txn3 can commit successfully, leaving a tombstone with ts=30.
+ ASSERT_OK(txn3->Commit());
+ txn3.reset();
+
+ bool called = false;
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::GetLatestSequenceForKey:mem", [&](void* arg) {
+ auto* const ts_ptr = reinterpret_cast<std::string*>(arg);
+ assert(ts_ptr);
+ Slice ts_slc = *ts_ptr;
+ uint64_t last_ts = 0;
+ ASSERT_TRUE(GetFixed64(&ts_slc, &last_ts));
+ ASSERT_EQ(30, last_ts);
+ called = true;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // txn1's read timestamp is 25 < 30 (commit timestamp of txn3). Therefore,
+ // the tombstone written by txn3 causes the conflict checking to fail.
+ ASSERT_OK(txn1->SetReadTimestampForValidation(25));
+ ASSERT_TRUE(txn1->GetForUpdate(ReadOptions(), "foo", &dontcare).IsBusy());
+ ASSERT_TRUE(called);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <cstdio>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as Transactions not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_prepared_transaction_test.cc b/src/rocksdb/utilities/transactions/write_prepared_transaction_test.cc
new file mode 100644
index 000000000..86a9511a4
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/write_prepared_transaction_test.cc
@@ -0,0 +1,4078 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <atomic>
+#include <cinttypes>
+#include <functional>
+#include <string>
+#include <thread>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/debug.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "table/mock_table.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "test_util/transaction_test_util.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/string_append/stringappend.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+#include "utilities/transactions/transaction_test.h"
+#include "utilities/transactions/write_prepared_txn_db.h"
+
+using std::string;
+
+namespace ROCKSDB_NAMESPACE {
+
+using CommitEntry = WritePreparedTxnDB::CommitEntry;
+using CommitEntry64b = WritePreparedTxnDB::CommitEntry64b;
+using CommitEntry64bFormat = WritePreparedTxnDB::CommitEntry64bFormat;
+
+TEST(PreparedHeap, BasicsTest) {
+ WritePreparedTxnDB::PreparedHeap heap;
+ {
+ MutexLock ml(heap.push_pop_mutex());
+ heap.push(14l);
+ // Test with one element
+ ASSERT_EQ(14l, heap.top());
+ heap.push(24l);
+ heap.push(34l);
+ // Test that old min is still on top
+ ASSERT_EQ(14l, heap.top());
+ heap.push(44l);
+ heap.push(54l);
+ heap.push(64l);
+ heap.push(74l);
+ heap.push(84l);
+ }
+ // Test that old min is still on top
+ ASSERT_EQ(14l, heap.top());
+ heap.erase(24l);
+ // Test that old min is still on top
+ ASSERT_EQ(14l, heap.top());
+ heap.erase(14l);
+  // Test that the new min comes to the top after multiple erases
+ ASSERT_EQ(34l, heap.top());
+ heap.erase(34l);
+  // Test that the new min comes to the top after a single erase
+ ASSERT_EQ(44l, heap.top());
+ heap.erase(54l);
+ ASSERT_EQ(44l, heap.top());
+ heap.pop(); // pop 44l
+ // Test that the erased items are ignored after pop
+ ASSERT_EQ(64l, heap.top());
+ heap.erase(44l);
+ // Test that erasing an already popped item would work
+ ASSERT_EQ(64l, heap.top());
+ heap.erase(84l);
+ ASSERT_EQ(64l, heap.top());
+ {
+ MutexLock ml(heap.push_pop_mutex());
+ heap.push(85l);
+ heap.push(86l);
+ heap.push(87l);
+ heap.push(88l);
+ heap.push(89l);
+ }
+ heap.erase(87l);
+ heap.erase(85l);
+ heap.erase(89l);
+ heap.erase(86l);
+ heap.erase(88l);
+ // Test top remains the same after a random order of many erases
+ ASSERT_EQ(64l, heap.top());
+ heap.pop();
+ // Test that pop works with a series of random pending erases
+ ASSERT_EQ(74l, heap.top());
+ ASSERT_FALSE(heap.empty());
+ heap.pop();
+ // Test that empty works
+ ASSERT_TRUE(heap.empty());
+}
+
+// This is a scenario reconstructed from a buggy trace. Test that the bug does
+// not resurface again.
+TEST(PreparedHeap, EmptyAtTheEnd) {
+ WritePreparedTxnDB::PreparedHeap heap;
+ {
+ MutexLock ml(heap.push_pop_mutex());
+ heap.push(40l);
+ }
+ ASSERT_EQ(40l, heap.top());
+ // Although not a recommended scenario, we must be resilient against erase
+ // without a prior push.
+ heap.erase(50l);
+ ASSERT_EQ(40l, heap.top());
+ {
+ MutexLock ml(heap.push_pop_mutex());
+ heap.push(60l);
+ }
+ ASSERT_EQ(40l, heap.top());
+
+ heap.erase(60l);
+ ASSERT_EQ(40l, heap.top());
+ heap.erase(40l);
+ ASSERT_TRUE(heap.empty());
+
+ {
+ MutexLock ml(heap.push_pop_mutex());
+ heap.push(40l);
+ }
+ ASSERT_EQ(40l, heap.top());
+ heap.erase(50l);
+ ASSERT_EQ(40l, heap.top());
+ {
+ MutexLock ml(heap.push_pop_mutex());
+ heap.push(60l);
+ }
+ ASSERT_EQ(40l, heap.top());
+
+ heap.erase(40l);
+ // Test that the erase has not emptied the heap (we had a bug doing that)
+ ASSERT_FALSE(heap.empty());
+ ASSERT_EQ(60l, heap.top());
+ heap.erase(60l);
+ ASSERT_TRUE(heap.empty());
+}
+
+// Generate random order of PreparedHeap access and test that the heap will be
+// successfully emptied at the end.
+TEST(PreparedHeap, Concurrent) {
+ const size_t t_cnt = 10;
+ ROCKSDB_NAMESPACE::port::Thread t[t_cnt + 1];
+ WritePreparedTxnDB::PreparedHeap heap;
+ port::RWMutex prepared_mutex;
+ std::atomic<size_t> last;
+
+ for (size_t n = 0; n < 100; n++) {
+ last = 0;
+ t[0] = ROCKSDB_NAMESPACE::port::Thread([&]() {
+ Random rnd(1103);
+ for (size_t seq = 1; seq <= t_cnt; seq++) {
+ // This is not recommended usage but we should be resilient against it.
+ bool skip_push = rnd.OneIn(5);
+ if (!skip_push) {
+ MutexLock ml(heap.push_pop_mutex());
+ std::this_thread::yield();
+ heap.push(seq);
+ last.store(seq);
+ }
+ }
+ });
+ for (size_t i = 1; i <= t_cnt; i++) {
+ t[i] =
+ ROCKSDB_NAMESPACE::port::Thread([&heap, &prepared_mutex, &last, i]() {
+ auto seq = i;
+ do {
+ std::this_thread::yield();
+ } while (last.load() < seq);
+ WriteLock wl(&prepared_mutex);
+ heap.erase(seq);
+ });
+ }
+ for (size_t i = 0; i <= t_cnt; i++) {
+ t[i].join();
+ }
+ ASSERT_TRUE(heap.empty());
+ }
+}
+
+// Test that WriteBatchWithIndex correctly counts the number of sub-batches
+TEST(WriteBatchWithIndex, SubBatchCnt) {
+ ColumnFamilyOptions cf_options;
+ std::string cf_name = "two";
+ DB* db;
+ Options options;
+ options.create_if_missing = true;
+ const std::string dbname = test::PerThreadDBPath("transaction_testdb");
+ EXPECT_OK(DestroyDB(dbname, options));
+ ASSERT_OK(DB::Open(options, dbname, &db));
+ ColumnFamilyHandle* cf_handle = nullptr;
+ ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle));
+ WriteOptions write_options;
+ size_t batch_cnt = 1;
+ size_t save_points = 0;
+ std::vector<size_t> batch_cnt_at;
+ WriteBatchWithIndex batch(db->DefaultColumnFamily()->GetComparator(), 0, true,
+ 0);
+ ASSERT_EQ(batch_cnt, batch.SubBatchCnt());
+ batch_cnt_at.push_back(batch_cnt);
+ batch.SetSavePoint();
+ save_points++;
+ ASSERT_OK(batch.Put(Slice("key"), Slice("value")));
+ ASSERT_EQ(batch_cnt, batch.SubBatchCnt());
+ batch_cnt_at.push_back(batch_cnt);
+ batch.SetSavePoint();
+ save_points++;
+ ASSERT_OK(batch.Put(Slice("key2"), Slice("value2")));
+ ASSERT_EQ(batch_cnt, batch.SubBatchCnt());
+ // duplicate the keys
+ batch_cnt_at.push_back(batch_cnt);
+ batch.SetSavePoint();
+ save_points++;
+ ASSERT_OK(batch.Put(Slice("key"), Slice("value3")));
+ batch_cnt++;
+ ASSERT_EQ(batch_cnt, batch.SubBatchCnt());
+  // duplicate the 2nd key. It should not be counted as a duplicate since a
+  // sub-batch is cut after the last duplicate.
+ batch_cnt_at.push_back(batch_cnt);
+ batch.SetSavePoint();
+ save_points++;
+ ASSERT_OK(batch.Put(Slice("key2"), Slice("value4")));
+ ASSERT_EQ(batch_cnt, batch.SubBatchCnt());
+ // duplicate the keys but in a different cf. It should not be counted as
+ // duplicate keys
+ batch_cnt_at.push_back(batch_cnt);
+ batch.SetSavePoint();
+ save_points++;
+ ASSERT_OK(batch.Put(cf_handle, Slice("key"), Slice("value5")));
+ ASSERT_EQ(batch_cnt, batch.SubBatchCnt());
+
+ // Test that the number of sub-batches matches what we count with
+ // SubBatchCounter
+ std::map<uint32_t, const Comparator*> comparators;
+ comparators[0] = db->DefaultColumnFamily()->GetComparator();
+ comparators[cf_handle->GetID()] = cf_handle->GetComparator();
+ SubBatchCounter counter(comparators);
+ ASSERT_OK(batch.GetWriteBatch()->Iterate(&counter));
+ ASSERT_EQ(batch_cnt, counter.BatchCount());
+
+  // Test that RollbackToSavePoint properly resets the number of
+ // sub-batches
+ for (size_t i = save_points; i > 0; i--) {
+ ASSERT_OK(batch.RollbackToSavePoint());
+ ASSERT_EQ(batch_cnt_at[i - 1], batch.SubBatchCnt());
+ }
+
+ // Test the count is right with random batches
+ {
+    const size_t TOTAL_KEYS = 20;  // close to the 10 keys per batch so duplicates appear
+ Random rnd(1131);
+ std::string keys[TOTAL_KEYS];
+ for (size_t k = 0; k < TOTAL_KEYS; k++) {
+ int len = static_cast<int>(rnd.Uniform(50));
+ keys[k] = test::RandomKey(&rnd, len);
+ }
+ for (size_t i = 0; i < 1000; i++) { // 1000 random batches
+ WriteBatchWithIndex rndbatch(db->DefaultColumnFamily()->GetComparator(),
+ 0, true, 0);
+ for (size_t k = 0; k < 10; k++) { // 10 key per batch
+ size_t ki = static_cast<size_t>(rnd.Uniform(TOTAL_KEYS));
+ Slice key = Slice(keys[ki]);
+ std::string tmp = rnd.RandomString(16);
+ Slice value = Slice(tmp);
+ ASSERT_OK(rndbatch.Put(key, value));
+ }
+ SubBatchCounter batch_counter(comparators);
+ ASSERT_OK(rndbatch.GetWriteBatch()->Iterate(&batch_counter));
+ ASSERT_EQ(rndbatch.SubBatchCnt(), batch_counter.BatchCount());
+ }
+ }
+
+ delete cf_handle;
+ delete db;
+}
+
+TEST(CommitEntry64b, BasicTest) {
+ const size_t INDEX_BITS = static_cast<size_t>(21);
+ const size_t INDEX_SIZE = static_cast<size_t>(1ull << INDEX_BITS);
+ const CommitEntry64bFormat FORMAT(static_cast<size_t>(INDEX_BITS));
+
+ // zero-initialized CommitEntry64b should indicate an empty entry
+ CommitEntry64b empty_entry64b;
+ uint64_t empty_index = 11ul;
+ CommitEntry empty_entry;
+ bool ok = empty_entry64b.Parse(empty_index, &empty_entry, FORMAT);
+ ASSERT_FALSE(ok);
+
+ // the zero entry is reserved for un-initialized entries
+ const size_t MAX_COMMIT = (1 << FORMAT.COMMIT_BITS) - 1 - 1;
+ // Samples over the numbers that are covered by that many index bits
+ std::array<uint64_t, 4> is = {{0, 1, INDEX_SIZE / 2 + 1, INDEX_SIZE - 1}};
+ // Samples over the numbers that are covered by that many commit bits
+ std::array<uint64_t, 4> ds = {{0, 1, MAX_COMMIT / 2 + 1, MAX_COMMIT}};
+  // Iterate over prepare numbers that i) cover all bits of a sequence
+  // number, and ii) include some bits that fall into the range of index or
+  // commit bits.
+ for (uint64_t base = 1; base < kMaxSequenceNumber; base *= 2) {
+ for (uint64_t i : is) {
+ for (uint64_t d : ds) {
+ uint64_t p = base + i + d;
+ for (uint64_t c : {p, p + d / 2, p + d}) {
+ uint64_t index = p % INDEX_SIZE;
+ CommitEntry before(p, c), after;
+ CommitEntry64b entry64b(before, FORMAT);
+ ok = entry64b.Parse(index, &after, FORMAT);
+ ASSERT_TRUE(ok);
+ if (!(before == after)) {
+ printf("base %" PRIu64 " i %" PRIu64 " d %" PRIu64 " p %" PRIu64
+ " c %" PRIu64 " index %" PRIu64 "\n",
+ base, i, d, p, c, index);
+ }
+ ASSERT_EQ(before, after);
+ }
+ }
+ }
+ }
+}
+
+class WritePreparedTxnDBMock : public WritePreparedTxnDB {
+ public:
+ WritePreparedTxnDBMock(DBImpl* db_impl, TransactionDBOptions& opt)
+ : WritePreparedTxnDB(db_impl, opt) {}
+ void SetDBSnapshots(const std::vector<SequenceNumber>& snapshots) {
+ snapshots_ = snapshots;
+ }
+ void TakeSnapshot(SequenceNumber seq) { snapshots_.push_back(seq); }
+
+ protected:
+ const std::vector<SequenceNumber> GetSnapshotListFromDB(
+ SequenceNumber /* unused */) override {
+ return snapshots_;
+ }
+
+ private:
+ std::vector<SequenceNumber> snapshots_;
+};
+
+class WritePreparedTransactionTestBase : public TransactionTestBase {
+ public:
+ WritePreparedTransactionTestBase(bool use_stackable_db, bool two_write_queue,
+ TxnDBWritePolicy write_policy,
+ WriteOrdering write_ordering)
+ : TransactionTestBase(use_stackable_db, two_write_queue, write_policy,
+ write_ordering){};
+
+ protected:
+ void UpdateTransactionDBOptions(size_t snapshot_cache_bits,
+ size_t commit_cache_bits) {
+ txn_db_options.wp_snapshot_cache_bits = snapshot_cache_bits;
+ txn_db_options.wp_commit_cache_bits = commit_cache_bits;
+ }
+ void UpdateTransactionDBOptions(size_t snapshot_cache_bits) {
+ txn_db_options.wp_snapshot_cache_bits = snapshot_cache_bits;
+ }
+ // If expect_update is set, check if it actually updated old_commit_map_. If
+ // it did not and yet suggested not to check the next snapshot, do the
+ // opposite to check if it was not a bad suggestion.
+ void MaybeUpdateOldCommitMapTestWithNext(uint64_t prepare, uint64_t commit,
+ uint64_t snapshot,
+ uint64_t next_snapshot,
+ bool expect_update) {
+ WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+ // reset old_commit_map_empty_ so that its value indicate whether
+ // old_commit_map_ was updated
+ wp_db->old_commit_map_empty_ = true;
+ bool check_next = wp_db->MaybeUpdateOldCommitMap(prepare, commit, snapshot,
+ snapshot < next_snapshot);
+ if (expect_update == wp_db->old_commit_map_empty_) {
+ printf("prepare: %" PRIu64 " commit: %" PRIu64 " snapshot: %" PRIu64
+ " next: %" PRIu64 "\n",
+ prepare, commit, snapshot, next_snapshot);
+ }
+ EXPECT_EQ(!expect_update, wp_db->old_commit_map_empty_);
+ if (!check_next && wp_db->old_commit_map_empty_) {
+ // do the opposite to make sure it was not a bad suggestion
+ const bool dont_care_bool = true;
+ wp_db->MaybeUpdateOldCommitMap(prepare, commit, next_snapshot,
+ dont_care_bool);
+ if (!wp_db->old_commit_map_empty_) {
+ printf("prepare: %" PRIu64 " commit: %" PRIu64 " snapshot: %" PRIu64
+ " next: %" PRIu64 "\n",
+ prepare, commit, snapshot, next_snapshot);
+ }
+ EXPECT_TRUE(wp_db->old_commit_map_empty_);
+ }
+ }
+
+ // Test that a CheckAgainstSnapshots thread reading old_snapshots will not
+ // miss a snapshot because of a concurrent update by UpdateSnapshots that is
+ // writing new_snapshots. Both threads are broken at two points. The sync
+ // points to enforce them are specified by a1, a2, b1, and b2. CommitEntry
+ // entry is expected to be vital for one of the snapshots that is common
+ // between the old and new list of snapshots.
+ void SnapshotConcurrentAccessTestInternal(
+ WritePreparedTxnDB* wp_db,
+ const std::vector<SequenceNumber>& old_snapshots,
+ const std::vector<SequenceNumber>& new_snapshots, CommitEntry& entry,
+ SequenceNumber& version, size_t a1, size_t a2, size_t b1, size_t b2) {
+ // First reset the snapshot list
+ const std::vector<SequenceNumber> empty_snapshots;
+ wp_db->old_commit_map_empty_ = true;
+ wp_db->UpdateSnapshots(empty_snapshots, ++version);
+ // Then initialize it with the old_snapshots
+ wp_db->UpdateSnapshots(old_snapshots, ++version);
+
+ // Starting from the first thread, cut each thread at two points
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"WritePreparedTxnDB::CheckAgainstSnapshots:p:" + std::to_string(a1),
+ "WritePreparedTxnDB::UpdateSnapshots:s:start"},
+ {"WritePreparedTxnDB::UpdateSnapshots:p:" + std::to_string(b1),
+ "WritePreparedTxnDB::CheckAgainstSnapshots:s:" + std::to_string(a1)},
+ {"WritePreparedTxnDB::CheckAgainstSnapshots:p:" + std::to_string(a2),
+ "WritePreparedTxnDB::UpdateSnapshots:s:" + std::to_string(b1)},
+ {"WritePreparedTxnDB::UpdateSnapshots:p:" + std::to_string(b2),
+ "WritePreparedTxnDB::CheckAgainstSnapshots:s:" + std::to_string(a2)},
+ {"WritePreparedTxnDB::CheckAgainstSnapshots:p:end",
+ "WritePreparedTxnDB::UpdateSnapshots:s:" + std::to_string(b2)},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ {
+ ASSERT_TRUE(wp_db->old_commit_map_empty_);
+ ROCKSDB_NAMESPACE::port::Thread t1(
+ [&]() { wp_db->UpdateSnapshots(new_snapshots, version); });
+ wp_db->CheckAgainstSnapshots(entry);
+ t1.join();
+ ASSERT_FALSE(wp_db->old_commit_map_empty_);
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ wp_db->old_commit_map_empty_ = true;
+ wp_db->UpdateSnapshots(empty_snapshots, ++version);
+ wp_db->UpdateSnapshots(old_snapshots, ++version);
+ // Starting from the second thread, cut each thread at two points
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"WritePreparedTxnDB::UpdateSnapshots:p:" + std::to_string(a1),
+ "WritePreparedTxnDB::CheckAgainstSnapshots:s:start"},
+ {"WritePreparedTxnDB::CheckAgainstSnapshots:p:" + std::to_string(b1),
+ "WritePreparedTxnDB::UpdateSnapshots:s:" + std::to_string(a1)},
+ {"WritePreparedTxnDB::UpdateSnapshots:p:" + std::to_string(a2),
+ "WritePreparedTxnDB::CheckAgainstSnapshots:s:" + std::to_string(b1)},
+ {"WritePreparedTxnDB::CheckAgainstSnapshots:p:" + std::to_string(b2),
+ "WritePreparedTxnDB::UpdateSnapshots:s:" + std::to_string(a2)},
+ {"WritePreparedTxnDB::UpdateSnapshots:p:end",
+ "WritePreparedTxnDB::CheckAgainstSnapshots:s:" + std::to_string(b2)},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ {
+ ASSERT_TRUE(wp_db->old_commit_map_empty_);
+ ROCKSDB_NAMESPACE::port::Thread t1(
+ [&]() { wp_db->UpdateSnapshots(new_snapshots, version); });
+ wp_db->CheckAgainstSnapshots(entry);
+ t1.join();
+ ASSERT_FALSE(wp_db->old_commit_map_empty_);
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+
+ // Verify value of keys.
+ void VerifyKeys(const std::unordered_map<std::string, std::string>& data,
+ const Snapshot* snapshot = nullptr) {
+ std::string value;
+ ReadOptions read_options;
+ read_options.snapshot = snapshot;
+ for (auto& kv : data) {
+ auto s = db->Get(read_options, kv.first, &value);
+ ASSERT_TRUE(s.ok() || s.IsNotFound());
+ if (s.ok()) {
+ if (kv.second != value) {
+ printf("key = %s\n", kv.first.c_str());
+ }
+ ASSERT_EQ(kv.second, value);
+ } else {
+ ASSERT_EQ(kv.second, "NOT_FOUND");
+ }
+
+ // Try with MultiGet API too
+ std::vector<std::string> values;
+ auto s_vec = db->MultiGet(read_options, {db->DefaultColumnFamily()},
+ {kv.first}, &values);
+ ASSERT_EQ(1, values.size());
+ ASSERT_EQ(1, s_vec.size());
+ s = s_vec[0];
+ ASSERT_TRUE(s.ok() || s.IsNotFound());
+ if (s.ok()) {
+ ASSERT_TRUE(kv.second == values[0]);
+ } else {
+ ASSERT_EQ(kv.second, "NOT_FOUND");
+ }
+ }
+ }
+
+ // Verify all versions of keys.
+ void VerifyInternalKeys(const std::vector<KeyVersion>& expected_versions) {
+ std::vector<KeyVersion> versions;
+ const size_t kMaxKeys = 100000;
+ ASSERT_OK(GetAllKeyVersions(db, expected_versions.front().user_key,
+ expected_versions.back().user_key, kMaxKeys,
+ &versions));
+ ASSERT_EQ(expected_versions.size(), versions.size());
+ for (size_t i = 0; i < versions.size(); i++) {
+ ASSERT_EQ(expected_versions[i].user_key, versions[i].user_key);
+ ASSERT_EQ(expected_versions[i].sequence, versions[i].sequence);
+ ASSERT_EQ(expected_versions[i].type, versions[i].type);
+ if (versions[i].type != kTypeDeletion &&
+ versions[i].type != kTypeSingleDeletion) {
+ ASSERT_EQ(expected_versions[i].value, versions[i].value);
+ }
+ // Range delete not supported.
+ ASSERT_NE(expected_versions[i].type, kTypeRangeDeletion);
+ }
+ }
+};
+
+class WritePreparedTransactionTest
+ : public WritePreparedTransactionTestBase,
+ virtual public ::testing::WithParamInterface<
+ std::tuple<bool, bool, TxnDBWritePolicy, WriteOrdering>> {
+ public:
+ WritePreparedTransactionTest()
+ : WritePreparedTransactionTestBase(
+ std::get<0>(GetParam()), std::get<1>(GetParam()),
+ std::get<2>(GetParam()), std::get<3>(GetParam())){};
+};
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+class SnapshotConcurrentAccessTest
+ : public WritePreparedTransactionTestBase,
+ virtual public ::testing::WithParamInterface<std::tuple<
+ bool, bool, TxnDBWritePolicy, WriteOrdering, size_t, size_t>> {
+ public:
+ SnapshotConcurrentAccessTest()
+ : WritePreparedTransactionTestBase(
+ std::get<0>(GetParam()), std::get<1>(GetParam()),
+ std::get<2>(GetParam()), std::get<3>(GetParam())),
+ split_id_(std::get<4>(GetParam())),
+ split_cnt_(std::get<5>(GetParam())){};
+
+ protected:
+ // A test is split into split_cnt_ tests, each identified with split_id_ where
+ // 0 <= split_id_ < split_cnt_
+ size_t split_id_;
+ size_t split_cnt_;
+};
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+class SeqAdvanceConcurrentTest
+ : public WritePreparedTransactionTestBase,
+ virtual public ::testing::WithParamInterface<std::tuple<
+ bool, bool, TxnDBWritePolicy, WriteOrdering, size_t, size_t>> {
+ public:
+ SeqAdvanceConcurrentTest()
+ : WritePreparedTransactionTestBase(
+ std::get<0>(GetParam()), std::get<1>(GetParam()),
+ std::get<2>(GetParam()), std::get<3>(GetParam())),
+ split_id_(std::get<4>(GetParam())),
+ split_cnt_(std::get<5>(GetParam())) {
+ special_env.skip_fsync_ = true;
+ };
+
+ protected:
+ // A test is split into split_cnt_ tests, each identified with split_id_ where
+ // 0 <= split_id_ < split_cnt_
+ size_t split_id_;
+ size_t split_cnt_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+ WritePreparedTransaction, WritePreparedTransactionTest,
+ ::testing::Values(
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite)));
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+INSTANTIATE_TEST_CASE_P(
+ TwoWriteQueues, SnapshotConcurrentAccessTest,
+ ::testing::Values(
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 0, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 1, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 2, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 3, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 4, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 5, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 6, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 7, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 8, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 9, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 10, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 11, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 12, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 13, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 14, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 15, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 16, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 17, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 18, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 19, 20),
+
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 0, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 1, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 2, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 3, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 4, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 5, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 6, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 7, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 8, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 9, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 10, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 11, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 12, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 13, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 14, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 15, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 16, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 17, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 18, 20),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 19, 20)));
+
+INSTANTIATE_TEST_CASE_P(
+ OneWriteQueue, SnapshotConcurrentAccessTest,
+ ::testing::Values(
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 0, 20),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 1, 20),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 2, 20),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 3, 20),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 4, 20),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 5, 20),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 6, 20),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 7, 20),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 8, 20),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 9, 20),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 10, 20),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 11, 20),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 12, 20),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 13, 20),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 14, 20),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 15, 20),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 16, 20),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 17, 20),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 18, 20),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 19, 20)));
+
+INSTANTIATE_TEST_CASE_P(
+ TwoWriteQueues, SeqAdvanceConcurrentTest,
+ ::testing::Values(
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 0, 10),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 1, 10),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 2, 10),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 3, 10),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 4, 10),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 5, 10),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 6, 10),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 7, 10),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 8, 10),
+ std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 9, 10),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 0, 10),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 1, 10),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 2, 10),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 3, 10),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 4, 10),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 5, 10),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 6, 10),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 7, 10),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 8, 10),
+ std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 9, 10)));
+
+INSTANTIATE_TEST_CASE_P(
+ OneWriteQueue, SeqAdvanceConcurrentTest,
+ ::testing::Values(
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 0, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 1, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 2, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 3, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 4, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 5, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 6, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 7, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 8, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 9, 10)));
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_P(WritePreparedTransactionTest, CommitMap) {
+ WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+ ASSERT_NE(wp_db, nullptr);
+ ASSERT_NE(wp_db->db_impl_, nullptr);
+ size_t size = wp_db->COMMIT_CACHE_SIZE;
+ CommitEntry c = {5, 12}, e;
+ bool evicted = wp_db->AddCommitEntry(c.prep_seq % size, c, &e);
+ ASSERT_FALSE(evicted);
+
+ // Should be able to read the same value
+ CommitEntry64b dont_care;
+ bool found = wp_db->GetCommitEntry(c.prep_seq % size, &dont_care, &e);
+ ASSERT_TRUE(found);
+ ASSERT_EQ(c, e);
+ // Should be able to distinguish between overlapping entries
+ found = wp_db->GetCommitEntry((c.prep_seq + size) % size, &dont_care, &e);
+ ASSERT_TRUE(found);
+ ASSERT_NE(c.prep_seq + size, e.prep_seq);
+ // Should be able to detect non-existent entry
+ found = wp_db->GetCommitEntry((c.prep_seq + 1) % size, &dont_care, &e);
+ ASSERT_FALSE(found);
+
+ // Reject an invalid exchange
+ CommitEntry e2 = {c.prep_seq + size, c.commit_seq + size};
+ CommitEntry64b e2_64b(e2, wp_db->FORMAT);
+ bool exchanged = wp_db->ExchangeCommitEntry(e2.prep_seq % size, e2_64b, e);
+ ASSERT_FALSE(exchanged);
+ // check whether it did actually reject that
+ found = wp_db->GetCommitEntry(e2.prep_seq % size, &dont_care, &e);
+ ASSERT_TRUE(found);
+ ASSERT_EQ(c, e);
+
+ // Accept a valid exchange
+ CommitEntry64b c_64b(c, wp_db->FORMAT);
+ CommitEntry e3 = {c.prep_seq + size, c.commit_seq + size + 1};
+ exchanged = wp_db->ExchangeCommitEntry(c.prep_seq % size, c_64b, e3);
+ ASSERT_TRUE(exchanged);
+  // check whether it did actually accept that
+ found = wp_db->GetCommitEntry(c.prep_seq % size, &dont_care, &e);
+ ASSERT_TRUE(found);
+ ASSERT_EQ(e3, e);
+
+ // Rewrite an entry
+ CommitEntry e4 = {e3.prep_seq + size, e3.commit_seq + size + 1};
+ evicted = wp_db->AddCommitEntry(e4.prep_seq % size, e4, &e);
+ ASSERT_TRUE(evicted);
+ ASSERT_EQ(e3, e);
+ found = wp_db->GetCommitEntry(e4.prep_seq % size, &dont_care, &e);
+ ASSERT_TRUE(found);
+ ASSERT_EQ(e4, e);
+}
+
+TEST_P(WritePreparedTransactionTest, MaybeUpdateOldCommitMap) {
+  // If prepare <= snapshot < commit we should keep the entry around, since its
+  // absence could otherwise be interpreted as the entry being committed in the
+  // snapshot, which is not true. We keep such entries around by adding them to
+  // the old_commit_map_.
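+  // For example, p=10, c=25, s=20 (a case below) satisfies
+  // prepare(10) <= snapshot(20) < commit(25), so an entry must be added to
+  // old_commit_map_; with p=10, c=15, s=20 the commit is already below the
+  // snapshot and no entry is needed.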
+ uint64_t p /*prepare*/, c /*commit*/, s /*snapshot*/, ns /*next_snapshot*/;
+ p = 10l, c = 15l, s = 20l, ns = 21l;
+ MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false);
+ // If we do not expect the old commit map to be updated, try also with a next
+ // snapshot that is expected to update the old commit map. This would test
+ // that MaybeUpdateOldCommitMap would not prevent us from checking the next
+ // snapshot that must be checked.
+ p = 10l, c = 15l, s = 20l, ns = 11l;
+ MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false);
+
+ p = 10l, c = 20l, s = 20l, ns = 19l;
+ MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false);
+ p = 10l, c = 20l, s = 20l, ns = 21l;
+ MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false);
+
+ p = 20l, c = 20l, s = 20l, ns = 21l;
+ MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false);
+ p = 20l, c = 20l, s = 20l, ns = 19l;
+ MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false);
+
+ p = 10l, c = 25l, s = 20l, ns = 21l;
+ MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, true);
+
+ p = 20l, c = 25l, s = 20l, ns = 21l;
+ MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, true);
+
+ p = 21l, c = 25l, s = 20l, ns = 22l;
+ MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false);
+ p = 21l, c = 25l, s = 20l, ns = 19l;
+ MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false);
+}
+
+// Trigger the condition where some old memtables are skipped when doing
+// TransactionUtil::CheckKey(), and make sure the result is still correct.
+TEST_P(WritePreparedTransactionTest, CheckKeySkipOldMemtable) {
+ const int kAttemptHistoryMemtable = 0;
+ const int kAttemptImmMemTable = 1;
+ for (int attempt = kAttemptHistoryMemtable; attempt <= kAttemptImmMemTable;
+ attempt++) {
+ options.max_write_buffer_number_to_maintain = 3;
+ ASSERT_OK(ReOpen());
+
+ WriteOptions write_options;
+ ReadOptions read_options;
+ TransactionOptions txn_options;
+ txn_options.set_snapshot = true;
+ string value;
+
+ ASSERT_OK(db->Put(write_options, Slice("foo"), Slice("bar")));
+ ASSERT_OK(db->Put(write_options, Slice("foo2"), Slice("bar")));
+
+ Transaction* txn = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn != nullptr);
+ ASSERT_OK(txn->SetName("txn"));
+
+ Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn2 != nullptr);
+ ASSERT_OK(txn2->SetName("txn2"));
+
+ // This transaction is created to cause potential conflict.
+ Transaction* txn_x = db->BeginTransaction(write_options);
+ ASSERT_OK(txn_x->SetName("txn_x"));
+ ASSERT_OK(txn_x->Put(Slice("foo"), Slice("bar3")));
+ ASSERT_OK(txn_x->Prepare());
+
+ // Create snapshots after the prepare, but there should still
+ // be a conflict when trying to read "foo".
+
+ if (attempt == kAttemptImmMemTable) {
+      // For the second attempt, hold the flush from the beginning. The
+      // memtable will be switched to immutable by TEST_SwitchMemtable() and
+      // remain unflushed while CheckKey() is called.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"WritePreparedTransactionTest.CheckKeySkipOldMemtable",
+ "FlushJob::Start"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ }
+
+    // Force a memtable switch: a full flush for the history attempt, a plain
+    // switch for the immutable attempt. The old memtable should still be kept.
+ FlushOptions flush_ops;
+ if (attempt == kAttemptHistoryMemtable) {
+ ASSERT_OK(db->Flush(flush_ops));
+ } else {
+ ASSERT_EQ(attempt, kAttemptImmMemTable);
+ DBImpl* db_impl = static_cast<DBImpl*>(db->GetRootDB());
+ ASSERT_OK(db_impl->TEST_SwitchMemtable());
+ }
+ uint64_t num_imm_mems;
+ ASSERT_TRUE(db->GetIntProperty(DB::Properties::kNumImmutableMemTable,
+ &num_imm_mems));
+ if (attempt == kAttemptHistoryMemtable) {
+ ASSERT_EQ(0, num_imm_mems);
+ } else {
+ ASSERT_EQ(attempt, kAttemptImmMemTable);
+ ASSERT_EQ(1, num_imm_mems);
+ }
+
+ // Put something in active memtable
+ ASSERT_OK(db->Put(write_options, Slice("foo3"), Slice("bar")));
+
+    // Create txn3 after flushing, but this transaction also needs to
+    // check all memtables because they contain uncommitted data.
+ Transaction* txn3 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn3 != nullptr);
+ ASSERT_OK(txn3->SetName("txn3"));
+
+ // Commit the pending write
+ ASSERT_OK(txn_x->Commit());
+
+    // Check txn, txn2 and txn3 via GetForUpdate. txn and txn3 will conflict
+    // but txn2 will pass. In all cases, both memtables are queried.
+ SetPerfLevel(PerfLevel::kEnableCount);
+ get_perf_context()->Reset();
+ ASSERT_TRUE(txn3->GetForUpdate(read_options, "foo", &value).IsBusy());
+ // We should have checked two memtables, active and either immutable
+ // or history memtable, depending on the test case.
+ ASSERT_EQ(2, get_perf_context()->get_from_memtable_count);
+
+ get_perf_context()->Reset();
+ ASSERT_TRUE(txn->GetForUpdate(read_options, "foo", &value).IsBusy());
+ // We should have checked two memtables, active and either immutable
+ // or history memtable, depending on the test case.
+ ASSERT_EQ(2, get_perf_context()->get_from_memtable_count);
+
+ get_perf_context()->Reset();
+ ASSERT_OK(txn2->GetForUpdate(read_options, "foo2", &value));
+ ASSERT_EQ(value, "bar");
+    // We should have checked two memtables, and since there is no
+    // conflict, another Get() is issued to fetch the data from the DB.
+    // If the data is in the immutable memtable, two extra memtable reads
+    // are issued; if it is in history, only one extra read is made, which
+    // goes to the active memtable.
+ if (attempt == kAttemptHistoryMemtable) {
+ ASSERT_EQ(3, get_perf_context()->get_from_memtable_count);
+ } else {
+ ASSERT_EQ(attempt, kAttemptImmMemTable);
+ ASSERT_EQ(4, get_perf_context()->get_from_memtable_count);
+ }
+
+ Transaction* txn4 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn4 != nullptr);
+ ASSERT_OK(txn4->SetName("txn4"));
+ get_perf_context()->Reset();
+ ASSERT_OK(txn4->GetForUpdate(read_options, "foo", &value));
+ if (attempt == kAttemptHistoryMemtable) {
+ // Active memtable will be checked in snapshot validation and when
+ // getting the value.
+ ASSERT_EQ(2, get_perf_context()->get_from_memtable_count);
+ } else {
+      // Only the active memtable will be checked in snapshot validation but
+      // both the active and immutable memtables will be queried when
+      // getting the value.
+ ASSERT_EQ(attempt, kAttemptImmMemTable);
+ ASSERT_EQ(3, get_perf_context()->get_from_memtable_count);
+ }
+
+ ASSERT_OK(txn2->Commit());
+ ASSERT_OK(txn4->Commit());
+
+ TEST_SYNC_POINT("WritePreparedTransactionTest.CheckKeySkipOldMemtable");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ SetPerfLevel(PerfLevel::kDisable);
+
+ delete txn;
+ delete txn2;
+ delete txn3;
+ delete txn4;
+ delete txn_x;
+ }
+}
+
+// Reproduce the bug with two snapshots with the same sequence number and test
+// that releasing the first snapshot does not affect the reads through the
+// other snapshot.
+TEST_P(WritePreparedTransactionTest, DoubleSnapshot) {
+ TransactionOptions txn_options;
+ Status s;
+
+ // Insert initial value
+ ASSERT_OK(db->Put(WriteOptions(), "key", "value1"));
+
+ WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+ Transaction* txn =
+ wp_db->BeginTransaction(WriteOptions(), txn_options, nullptr);
+ ASSERT_OK(txn->SetName("txn"));
+ ASSERT_OK(txn->Put("key", "value2"));
+ ASSERT_OK(txn->Prepare());
+ // Three snapshots with the same seq number
+ const Snapshot* snapshot0 = wp_db->GetSnapshot();
+ const Snapshot* snapshot1 = wp_db->GetSnapshot();
+ const Snapshot* snapshot2 = wp_db->GetSnapshot();
+ ASSERT_OK(txn->Commit());
+ SequenceNumber cache_size = wp_db->COMMIT_CACHE_SIZE;
+ SequenceNumber overlap_seq = txn->GetId() + cache_size;
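+  // overlap_seq maps to the same commit-cache slot as the transaction's
+  // prepare seq, so the AddCommitted below evicts that entry and advances
+  // max_evicted_seq_ past the snapshots taken above.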
+ delete txn;
+
+ // 4th snapshot with a larger seq
+ const Snapshot* snapshot3 = wp_db->GetSnapshot();
+ // Cause an eviction to advance max evicted seq number
+ // This also fetches the 4 snapshots from db since their seq is lower than the
+ // new max
+ wp_db->AddCommitted(overlap_seq, overlap_seq);
+
+ ReadOptions ropt;
+ // It should see the value before commit
+ ropt.snapshot = snapshot2;
+ PinnableSlice pinnable_val;
+ s = wp_db->Get(ropt, wp_db->DefaultColumnFamily(), "key", &pinnable_val);
+ ASSERT_OK(s);
+ ASSERT_TRUE(pinnable_val == "value1");
+ pinnable_val.Reset();
+
+ wp_db->ReleaseSnapshot(snapshot1);
+
+ // It should still see the value before commit
+ s = wp_db->Get(ropt, wp_db->DefaultColumnFamily(), "key", &pinnable_val);
+ ASSERT_OK(s);
+ ASSERT_TRUE(pinnable_val == "value1");
+ pinnable_val.Reset();
+
+ // Cause an eviction to advance max evicted seq number and trigger updating
+ // the snapshot list
+ overlap_seq += cache_size;
+ wp_db->AddCommitted(overlap_seq, overlap_seq);
+
+ // It should still see the value before commit
+ s = wp_db->Get(ropt, wp_db->DefaultColumnFamily(), "key", &pinnable_val);
+ ASSERT_OK(s);
+ ASSERT_TRUE(pinnable_val == "value1");
+ pinnable_val.Reset();
+
+ wp_db->ReleaseSnapshot(snapshot0);
+ wp_db->ReleaseSnapshot(snapshot2);
+ wp_db->ReleaseSnapshot(snapshot3);
+}
+
+size_t UniqueCnt(const std::vector<SequenceNumber>& vec) {
+ std::set<SequenceNumber> aset;
+ for (auto i : vec) {
+ aset.insert(i);
+ }
+ return aset.size();
+}
+// Test that the entries in old_commit_map_ get garbage collected properly
+TEST_P(WritePreparedTransactionTest, OldCommitMapGC) {
+ const size_t snapshot_cache_bits = 0;
+ const size_t commit_cache_bits = 0;
+ DBImpl* mock_db = new DBImpl(options, dbname);
+ UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+ std::unique_ptr<WritePreparedTxnDBMock> wp_db(
+ new WritePreparedTxnDBMock(mock_db, txn_db_options));
+
+ SequenceNumber seq = 0;
+  // Take the first snapshot that overlaps with two txns
+ auto prep_seq = ++seq;
+ wp_db->AddPrepared(prep_seq);
+ auto prep_seq2 = ++seq;
+ wp_db->AddPrepared(prep_seq2);
+ auto snap_seq1 = seq;
+ wp_db->TakeSnapshot(snap_seq1);
+ auto commit_seq = ++seq;
+ wp_db->AddCommitted(prep_seq, commit_seq);
+ wp_db->RemovePrepared(prep_seq);
+ auto commit_seq2 = ++seq;
+ wp_db->AddCommitted(prep_seq2, commit_seq2);
+ wp_db->RemovePrepared(prep_seq2);
+ // Take the 2nd and 3rd snapshot that overlap with the same txn
+ prep_seq = ++seq;
+ wp_db->AddPrepared(prep_seq);
+ auto snap_seq2 = seq;
+ wp_db->TakeSnapshot(snap_seq2);
+ seq++;
+ auto snap_seq3 = seq;
+ wp_db->TakeSnapshot(snap_seq3);
+ seq++;
+ commit_seq = ++seq;
+ wp_db->AddCommitted(prep_seq, commit_seq);
+ wp_db->RemovePrepared(prep_seq);
+ // Make sure max_evicted_seq_ will be larger than 2nd snapshot by evicting the
+ // only item in the commit_cache_ via another commit.
+ prep_seq = ++seq;
+ wp_db->AddPrepared(prep_seq);
+ commit_seq = ++seq;
+ wp_db->AddCommitted(prep_seq, commit_seq);
+ wp_db->RemovePrepared(prep_seq);
+
+ // Verify that the evicted commit entries for all snapshots are in the
+ // old_commit_map_
+ {
+ ASSERT_FALSE(wp_db->old_commit_map_empty_.load());
+ ReadLock rl(&wp_db->old_commit_map_mutex_);
+ ASSERT_EQ(3, wp_db->old_commit_map_.size());
+ ASSERT_EQ(2, UniqueCnt(wp_db->old_commit_map_[snap_seq1]));
+ ASSERT_EQ(1, UniqueCnt(wp_db->old_commit_map_[snap_seq2]));
+ ASSERT_EQ(1, UniqueCnt(wp_db->old_commit_map_[snap_seq3]));
+ }
+
+ // Verify that the 2nd snapshot is cleaned up after the release
+ wp_db->ReleaseSnapshotInternal(snap_seq2);
+ {
+ ASSERT_FALSE(wp_db->old_commit_map_empty_.load());
+ ReadLock rl(&wp_db->old_commit_map_mutex_);
+ ASSERT_EQ(2, wp_db->old_commit_map_.size());
+ ASSERT_EQ(2, UniqueCnt(wp_db->old_commit_map_[snap_seq1]));
+ ASSERT_EQ(1, UniqueCnt(wp_db->old_commit_map_[snap_seq3]));
+ }
+
+ // Verify that the 1st snapshot is cleaned up after the release
+ wp_db->ReleaseSnapshotInternal(snap_seq1);
+ {
+ ASSERT_FALSE(wp_db->old_commit_map_empty_.load());
+ ReadLock rl(&wp_db->old_commit_map_mutex_);
+ ASSERT_EQ(1, wp_db->old_commit_map_.size());
+ ASSERT_EQ(1, UniqueCnt(wp_db->old_commit_map_[snap_seq3]));
+ }
+
+ // Verify that the 3rd snapshot is cleaned up after the release
+ wp_db->ReleaseSnapshotInternal(snap_seq3);
+ {
+ ASSERT_TRUE(wp_db->old_commit_map_empty_.load());
+ ReadLock rl(&wp_db->old_commit_map_mutex_);
+ ASSERT_EQ(0, wp_db->old_commit_map_.size());
+ }
+}
+
+TEST_P(WritePreparedTransactionTest, CheckAgainstSnapshots) {
+ std::vector<SequenceNumber> snapshots = {100l, 200l, 300l, 400l, 500l,
+ 600l, 700l, 800l, 900l};
+ const size_t snapshot_cache_bits = 2;
+ const uint64_t cache_size = 1ul << snapshot_cache_bits;
+  // Safety check to express the intended size in the test. Can be adjusted if
+  // the snapshot list changes.
+ ASSERT_EQ((1ul << snapshot_cache_bits) * 2 + 1, snapshots.size());
+ DBImpl* mock_db = new DBImpl(options, dbname);
+ UpdateTransactionDBOptions(snapshot_cache_bits);
+ std::unique_ptr<WritePreparedTxnDBMock> wp_db(
+ new WritePreparedTxnDBMock(mock_db, txn_db_options));
+ SequenceNumber version = 1000l;
+ ASSERT_EQ(0, wp_db->snapshots_total_);
+ wp_db->UpdateSnapshots(snapshots, version);
+ ASSERT_EQ(snapshots.size(), wp_db->snapshots_total_);
+ // seq numbers are chosen so that we have two of them between each two
+ // snapshots. If the diff of two consecutive seq is more than 5, there is a
+ // snapshot between them.
+ std::vector<SequenceNumber> seqs = {50l, 55l, 150l, 155l, 250l, 255l, 350l,
+ 355l, 450l, 455l, 550l, 555l, 650l, 655l,
+ 750l, 755l, 850l, 855l, 950l, 955l};
+ ASSERT_GT(seqs.size(), 1);
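+  // For example, the consecutive pair (55, 150) spans snapshot 100
+  // (diff 95 > 5), while (50, 55) spans no snapshot (diff is only 5).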
+ for (size_t i = 0; i + 1 < seqs.size(); i++) {
+ wp_db->old_commit_map_empty_ = true; // reset
+ CommitEntry commit_entry = {seqs[i], seqs[i + 1]};
+ wp_db->CheckAgainstSnapshots(commit_entry);
+    // Expect an update if there is a snapshot between the prepare and commit
+ bool expect_update = commit_entry.commit_seq - commit_entry.prep_seq > 5 &&
+ commit_entry.commit_seq >= snapshots.front() &&
+ commit_entry.prep_seq <= snapshots.back();
+ ASSERT_EQ(expect_update, !wp_db->old_commit_map_empty_);
+ }
+
+  // Test that search will include multiple snapshots from the snapshot cache
+ {
+ // exclude first and last item in the cache
+ CommitEntry commit_entry = {snapshots.front() + 1,
+ snapshots[cache_size - 1] - 1};
+ wp_db->old_commit_map_empty_ = true; // reset
+ wp_db->old_commit_map_.clear();
+ wp_db->CheckAgainstSnapshots(commit_entry);
+ ASSERT_EQ(wp_db->old_commit_map_.size(), cache_size - 2);
+ }
+
+  // Test that search will include multiple snapshots from the old snapshots
+ {
+ // include two in the middle
+ CommitEntry commit_entry = {snapshots[cache_size] + 1,
+ snapshots[cache_size + 2] + 1};
+ wp_db->old_commit_map_empty_ = true; // reset
+ wp_db->old_commit_map_.clear();
+ wp_db->CheckAgainstSnapshots(commit_entry);
+ ASSERT_EQ(wp_db->old_commit_map_.size(), 2);
+ }
+
+ // Test that search will include both snapshot cache and old snapshots
+ // Case 1: includes all in snapshot cache
+ {
+ CommitEntry commit_entry = {snapshots.front() - 1, snapshots.back() + 1};
+ wp_db->old_commit_map_empty_ = true; // reset
+ wp_db->old_commit_map_.clear();
+ wp_db->CheckAgainstSnapshots(commit_entry);
+ ASSERT_EQ(wp_db->old_commit_map_.size(), snapshots.size());
+ }
+
+  // Case 2: includes all snapshot cache entries except the smallest
+ {
+ CommitEntry commit_entry = {snapshots.front() + 1, snapshots.back() + 1};
+ wp_db->old_commit_map_empty_ = true; // reset
+ wp_db->old_commit_map_.clear();
+ wp_db->CheckAgainstSnapshots(commit_entry);
+ ASSERT_EQ(wp_db->old_commit_map_.size(), snapshots.size() - 1);
+ }
+
+  // Case 3: includes only the largest entry of the snapshot cache
+ {
+ CommitEntry commit_entry = {snapshots[cache_size - 1] - 1,
+ snapshots.back() + 1};
+ wp_db->old_commit_map_empty_ = true; // reset
+ wp_db->old_commit_map_.clear();
+ wp_db->CheckAgainstSnapshots(commit_entry);
+ ASSERT_EQ(wp_db->old_commit_map_.size(), snapshots.size() - cache_size + 1);
+ }
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+// Test that CheckAgainstSnapshots will not miss a live snapshot if it is run in
+// parallel with UpdateSnapshots.
+TEST_P(SnapshotConcurrentAccessTest, SnapshotConcurrentAccess) {
+ // We have a sync point in the method under test after checking each snapshot.
+ // If you increase the max number of snapshots in this test, more sync points
+ // in the methods must also be added.
+ const std::vector<SequenceNumber> snapshots = {10l, 20l, 30l, 40l, 50l,
+ 60l, 70l, 80l, 90l, 100l};
+ const size_t snapshot_cache_bits = 2;
+  // Safety check to express the intended size in the test. Can be adjusted if
+  // the snapshot list changes.
+ ASSERT_EQ((1ul << snapshot_cache_bits) * 2 + 2, snapshots.size());
+ SequenceNumber version = 1000l;
+ // Choose the cache size so that the new snapshot list could replace all the
+ // existing items in the cache and also have some overflow.
+ DBImpl* mock_db = new DBImpl(options, dbname);
+ UpdateTransactionDBOptions(snapshot_cache_bits);
+ std::unique_ptr<WritePreparedTxnDBMock> wp_db(
+ new WritePreparedTxnDBMock(mock_db, txn_db_options));
+ const size_t extra = 2;
+ size_t loop_id = 0;
+ // Add up to extra items that do not fit into the cache
+ for (size_t old_size = 1; old_size <= wp_db->SNAPSHOT_CACHE_SIZE + extra;
+ old_size++) {
+ const std::vector<SequenceNumber> old_snapshots(
+ snapshots.begin(), snapshots.begin() + old_size);
+
+    // Each member of the old snapshot list might or might not appear in the
+    // new list. We create a common_snapshots list for each combination.
+ size_t new_comb_cnt = size_t(1) << old_size;
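+    // new_comb is interpreted as a bitmask over old_snapshots: bit i set means
+    // old_snapshots[i] also appears in the new snapshot list (see
+    // IsInCombination below).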
+ for (size_t new_comb = 0; new_comb < new_comb_cnt; new_comb++, loop_id++) {
+ if (loop_id % split_cnt_ != split_id_) continue;
+ printf("."); // To signal progress
+ fflush(stdout);
+ std::vector<SequenceNumber> common_snapshots;
+ for (size_t i = 0; i < old_snapshots.size(); i++) {
+ if (IsInCombination(i, new_comb)) {
+ common_snapshots.push_back(old_snapshots[i]);
+ }
+ }
+ // And add some new snapshots to the common list
+ for (size_t added_snapshots = 0;
+ added_snapshots <= snapshots.size() - old_snapshots.size();
+ added_snapshots++) {
+ std::vector<SequenceNumber> new_snapshots = common_snapshots;
+ for (size_t i = 0; i < added_snapshots; i++) {
+ new_snapshots.push_back(snapshots[old_snapshots.size() + i]);
+ }
+ for (auto it = common_snapshots.begin(); it != common_snapshots.end();
+ ++it) {
+ auto snapshot = *it;
+          // Create a commit entry that is around the snapshot and thus should
+          // not be discarded
+ CommitEntry entry = {static_cast<uint64_t>(snapshot - 1),
+ snapshot + 1};
+ // The critical part is when iterating the snapshot cache. Afterwards,
+ // we are operating under the lock
+ size_t a_range =
+ std::min(old_snapshots.size(), wp_db->SNAPSHOT_CACHE_SIZE) + 1;
+ size_t b_range =
+ std::min(new_snapshots.size(), wp_db->SNAPSHOT_CACHE_SIZE) + 1;
+ // Break each thread at two points
+ for (size_t a1 = 1; a1 <= a_range; a1++) {
+ for (size_t a2 = a1 + 1; a2 <= a_range; a2++) {
+ for (size_t b1 = 1; b1 <= b_range; b1++) {
+ for (size_t b2 = b1 + 1; b2 <= b_range; b2++) {
+ SnapshotConcurrentAccessTestInternal(
+ wp_db.get(), old_snapshots, new_snapshots, entry, version,
+ a1, a2, b1, b2);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ printf("\n");
+}
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+// This test clarifies the contract of AdvanceMaxEvictedSeq method
+TEST_P(WritePreparedTransactionTest, AdvanceMaxEvictedSeqBasic) {
+ DBImpl* mock_db = new DBImpl(options, dbname);
+ std::unique_ptr<WritePreparedTxnDBMock> wp_db(
+ new WritePreparedTxnDBMock(mock_db, txn_db_options));
+
+ // 1. Set the initial values for max, prepared, and snapshots
+ SequenceNumber zero_max = 0l;
+ // Set the initial list of prepared txns
+ const std::vector<SequenceNumber> initial_prepared = {10, 30, 50, 100,
+ 150, 200, 250};
+ for (auto p : initial_prepared) {
+ wp_db->AddPrepared(p);
+ }
+  // This updates the max value and also sets the old prepared txns
+ SequenceNumber init_max = 100;
+ wp_db->AdvanceMaxEvictedSeq(zero_max, init_max);
+ const std::vector<SequenceNumber> initial_snapshots = {20, 40};
+ wp_db->SetDBSnapshots(initial_snapshots);
+ // This will update the internal cache of snapshots from the DB
+ wp_db->UpdateSnapshots(initial_snapshots, init_max);
+
+ // 2. Invoke AdvanceMaxEvictedSeq
+ const std::vector<SequenceNumber> latest_snapshots = {20, 110, 220, 300};
+ wp_db->SetDBSnapshots(latest_snapshots);
+ SequenceNumber new_max = 200;
+ wp_db->AdvanceMaxEvictedSeq(init_max, new_max);
+
+ // 3. Verify that the state matches with AdvanceMaxEvictedSeq contract
+ // a. max should be updated to new_max
+ ASSERT_EQ(wp_db->max_evicted_seq_, new_max);
+ // b. delayed prepared should contain every txn <= max and prepared should
+ // only contain txns > max
+ auto it = initial_prepared.begin();
+ for (; it != initial_prepared.end() && *it <= new_max; ++it) {
+ ASSERT_EQ(1, wp_db->delayed_prepared_.erase(*it));
+ }
+ ASSERT_TRUE(wp_db->delayed_prepared_.empty());
+ for (; it != initial_prepared.end() && !wp_db->prepared_txns_.empty();
+ ++it, wp_db->prepared_txns_.pop()) {
+ ASSERT_EQ(*it, wp_db->prepared_txns_.top());
+ }
+ ASSERT_TRUE(it == initial_prepared.end());
+ ASSERT_TRUE(wp_db->prepared_txns_.empty());
+ // c. snapshots should contain everything below new_max
+ auto sit = latest_snapshots.begin();
+ for (size_t i = 0; sit != latest_snapshots.end() && *sit <= new_max &&
+ i < wp_db->snapshots_total_;
+ sit++, i++) {
+ ASSERT_TRUE(i < wp_db->snapshots_total_);
+    // This test is small scale and the list of snapshots is assumed to fit
+    // within the cache size limit. This is just a safety check to verify
+    // that assumption.
+ ASSERT_TRUE(i < wp_db->SNAPSHOT_CACHE_SIZE);
+ ASSERT_EQ(*sit, wp_db->snapshot_cache_[i]);
+ }
+}
+
+// A new snapshot should always be larger than max_evicted_seq_.
+// Otherwise the snapshot does not go through AdvanceMaxEvictedSeq.
+TEST_P(WritePreparedTransactionTest, NewSnapshotLargerThanMax) {
+ WriteOptions woptions;
+ TransactionOptions txn_options;
+ WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+ Transaction* txn0 = db->BeginTransaction(woptions, txn_options);
+ ASSERT_OK(txn0->Put(Slice("key"), Slice("value")));
+ ASSERT_OK(txn0->Commit());
+ const SequenceNumber seq = txn0->GetId(); // is also prepare seq
+ delete txn0;
+ std::vector<Transaction*> txns;
+ // Inc seq without committing anything
+ for (int i = 0; i < 10; i++) {
+ Transaction* txn = db->BeginTransaction(woptions, txn_options);
+ ASSERT_OK(txn->SetName("xid" + std::to_string(i)));
+ ASSERT_OK(txn->Put(Slice("key" + std::to_string(i)), Slice("value")));
+ ASSERT_OK(txn->Prepare());
+ txns.push_back(txn);
+ }
+
+ // The new commit is seq + 10
+ ASSERT_OK(db->Put(woptions, "key", "value"));
+ auto snap = wp_db->GetSnapshot();
+ const SequenceNumber last_seq = snap->GetSequenceNumber();
+ wp_db->ReleaseSnapshot(snap);
+ ASSERT_LT(seq, last_seq);
+ // Otherwise our test is not effective
+ ASSERT_LT(last_seq - seq, wp_db->INC_STEP_FOR_MAX_EVICTED);
+
+ // Evict seq out of commit cache
+ const SequenceNumber overwrite_seq = seq + wp_db->COMMIT_CACHE_SIZE;
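+  // seq and seq + COMMIT_CACHE_SIZE collide in the commit cache, so the
+  // AddCommitted below overwrites that slot and forces max to advance.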
+ // Check that the next write could make max go beyond last
+ auto last_max = wp_db->max_evicted_seq_.load();
+ wp_db->AddCommitted(overwrite_seq, overwrite_seq);
+ // Check that eviction has advanced the max
+ ASSERT_LT(last_max, wp_db->max_evicted_seq_.load());
+ // Check that the new max has not advanced the last seq
+ ASSERT_LT(wp_db->max_evicted_seq_.load(), last_seq);
+ for (auto txn : txns) {
+    ASSERT_OK(txn->Rollback());
+ delete txn;
+ }
+}
+
+// A new snapshot should always be larger than max_evicted_seq_.
+// In very rare cases max could be below the last published seq. Test that
+// taking a snapshot will wait for max to catch up.
+TEST_P(WritePreparedTransactionTest, MaxCatchupWithNewSnapshot) {
+ const size_t snapshot_cache_bits = 7; // same as default
+ const size_t commit_cache_bits = 0; // only 1 entry => frequent eviction
+ UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+ ASSERT_OK(ReOpen());
+ WriteOptions woptions;
+ WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+
+ const int writes = 50;
+ const int batch_cnt = 4;
+ ROCKSDB_NAMESPACE::port::Thread t1([&]() {
+ for (int i = 0; i < writes; i++) {
+ WriteBatch batch;
+      // The duplicate keys cause 4 commit entries, each evicting an entry that
+      // is not published yet, thus causing max evicted seq to go higher than
+      // the last published seq.
+ for (int b = 0; b < batch_cnt; b++) {
+ ASSERT_OK(batch.Put("foo", "foo"));
+ }
+ ASSERT_OK(db->Write(woptions, &batch));
+ }
+ });
+
+ ROCKSDB_NAMESPACE::port::Thread t2([&]() {
+ while (wp_db->max_evicted_seq_ == 0) { // wait for insert thread
+ std::this_thread::yield();
+ }
+ for (int i = 0; i < 10; i++) {
+ SequenceNumber max_lower_bound = wp_db->max_evicted_seq_;
+ auto snap = db->GetSnapshot();
+ if (snap->GetSequenceNumber() != 0) {
+        // The value of max_evicted_seq_ when the snapshot was taken is
+        // unknown. We thus compare with the lower bound instead as an
+        // approximation.
+ ASSERT_LT(max_lower_bound, snap->GetSequenceNumber());
+ } // seq 0 is ok to be less than max since nothing is visible to it
+ db->ReleaseSnapshot(snap);
+ }
+ });
+
+ t1.join();
+ t2.join();
+
+ // Make sure that the test has worked and seq number has advanced as we
+ // thought
+ auto snap = db->GetSnapshot();
+ ASSERT_GT(snap->GetSequenceNumber(), batch_cnt * writes - 1);
+ db->ReleaseSnapshot(snap);
+}
+
+// Test that reads without explicit snapshots do not hit an undefined state
+TEST_P(WritePreparedTransactionTest, MaxCatchupWithUnbackedSnapshot) {
+ const size_t snapshot_cache_bits = 7; // same as default
+ const size_t commit_cache_bits = 0; // only 1 entry => frequent eviction
+ UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+ ASSERT_OK(ReOpen());
+ WriteOptions woptions;
+ WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+
+ const int writes = 50;
+ ROCKSDB_NAMESPACE::port::Thread t1([&]() {
+ for (int i = 0; i < writes; i++) {
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("key", "foo"));
+ ASSERT_OK(db->Write(woptions, &batch));
+ }
+ });
+
+ ROCKSDB_NAMESPACE::port::Thread t2([&]() {
+ while (wp_db->max_evicted_seq_ == 0) { // wait for insert thread
+ std::this_thread::yield();
+ }
+ ReadOptions ropt;
+ PinnableSlice pinnable_val;
+ TransactionOptions txn_options;
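+    // A read without an explicit snapshot is pinned at the last published
+    // seq, which with this tiny commit cache can fall behind max_evicted_seq_;
+    // in that case WritePrepared returns TryAgain rather than an undefined
+    // result, hence the ok-or-TryAgain checks below.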
+ for (int i = 0; i < 10; i++) {
+ auto s = db->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val);
+ ASSERT_TRUE(s.ok() || s.IsTryAgain());
+ pinnable_val.Reset();
+ Transaction* txn = db->BeginTransaction(woptions, txn_options);
+ s = txn->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val);
+ ASSERT_TRUE(s.ok() || s.IsTryAgain());
+ pinnable_val.Reset();
+ std::vector<std::string> values;
+ auto s_vec =
+ txn->MultiGet(ropt, {db->DefaultColumnFamily()}, {"key"}, &values);
+ ASSERT_EQ(1, values.size());
+ ASSERT_EQ(1, s_vec.size());
+ s = s_vec[0];
+ ASSERT_TRUE(s.ok() || s.IsTryAgain());
+ Slice key("key");
+ txn->MultiGet(ropt, db->DefaultColumnFamily(), 1, &key, &pinnable_val, &s,
+ true);
+ ASSERT_TRUE(s.ok() || s.IsTryAgain());
+ delete txn;
+ }
+ });
+
+ t1.join();
+ t2.join();
+
+ // Make sure that the test has worked and seq number has advanced as we
+ // thought
+ auto snap = db->GetSnapshot();
+ ASSERT_GT(snap->GetSequenceNumber(), writes - 1);
+ db->ReleaseSnapshot(snap);
+}
+
+// Check that old_commit_map_ cleanup works correctly if the snapshot equals
+// max_evicted_seq_.
+TEST_P(WritePreparedTransactionTest, CleanupSnapshotEqualToMax) {
+ const size_t snapshot_cache_bits = 7; // same as default
+ const size_t commit_cache_bits = 0; // only 1 entry => frequent eviction
+ UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+ ASSERT_OK(ReOpen());
+ WriteOptions woptions;
+ WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+ // Insert something to increase seq
+ ASSERT_OK(db->Put(woptions, "key", "value"));
+ auto snap = db->GetSnapshot();
+ auto snap_seq = snap->GetSequenceNumber();
+ // Another insert should trigger eviction + load snapshot from db
+ ASSERT_OK(db->Put(woptions, "key", "value"));
+  // This is the scenario that we check against
+ ASSERT_EQ(snap_seq, wp_db->max_evicted_seq_);
+ // old_commit_map_ now has some data that needs gc
+ ASSERT_EQ(1, wp_db->snapshots_total_);
+ ASSERT_EQ(1, wp_db->old_commit_map_.size());
+
+ db->ReleaseSnapshot(snap);
+
+ // Another insert should trigger eviction + load snapshot from db
+ ASSERT_OK(db->Put(woptions, "key", "value"));
+
+ // the snapshot and related metadata must be properly garbage collected
+ ASSERT_EQ(0, wp_db->snapshots_total_);
+ ASSERT_TRUE(wp_db->snapshots_all_.empty());
+ ASSERT_EQ(0, wp_db->old_commit_map_.size());
+}
+
+TEST_P(WritePreparedTransactionTest, AdvanceSeqByOne) {
+ auto snap = db->GetSnapshot();
+ auto seq1 = snap->GetSequenceNumber();
+ db->ReleaseSnapshot(snap);
+
+ WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+ wp_db->AdvanceSeqByOne();
+
+ snap = db->GetSnapshot();
+ auto seq2 = snap->GetSequenceNumber();
+ db->ReleaseSnapshot(snap);
+
+ ASSERT_LT(seq1, seq2);
+}
+
+// Test that the txn Initialize calls the overridden functions
+TEST_P(WritePreparedTransactionTest, TxnInitialize) {
+ TransactionOptions txn_options;
+ WriteOptions write_options;
+ ASSERT_OK(db->Put(write_options, "key", "value"));
+ Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_OK(txn0->SetName("xid"));
+ ASSERT_OK(txn0->Put(Slice("key"), Slice("value1")));
+ ASSERT_OK(txn0->Prepare());
+
+ // SetSnapshot is overridden to update min_uncommitted_
+ txn_options.set_snapshot = true;
+ Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+ auto snap = txn1->GetSnapshot();
+ auto snap_impl = reinterpret_cast<const SnapshotImpl*>(snap);
+  // If ::Initialize calls the overridden SetSnapshot, min_uncommitted_ must be
+  // updated
+ ASSERT_GT(snap_impl->min_uncommitted_, kMinUnCommittedSeq);
+
+ ASSERT_OK(txn0->Rollback());
+ ASSERT_OK(txn1->Rollback());
+ delete txn0;
+ delete txn1;
+}
+
+// This tests that transactions with duplicate keys perform correctly after max
+// advances past their prepared sequence numbers. This will not be the case if,
+// for example, the txn does not add the prepared seq for the second sub-batch
+// to the PreparedHeap structure.
+TEST_P(WritePreparedTransactionTest, AdvanceMaxEvictedSeqWithDuplicates) {
+ const size_t snapshot_cache_bits = 7; // same as default
+ const size_t commit_cache_bits = 1; // disable commit cache
+ UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+ ASSERT_OK(ReOpen());
+
+ ReadOptions ropt;
+ PinnableSlice pinnable_val;
+ WriteOptions write_options;
+ TransactionOptions txn_options;
+ Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_OK(txn0->SetName("xid"));
+ ASSERT_OK(txn0->Put(Slice("key"), Slice("value1")));
+ ASSERT_OK(txn0->Put(Slice("key"), Slice("value2")));
+ ASSERT_OK(txn0->Prepare());
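+  // The two Puts on "key" are duplicates and hence form two sub-batches, so
+  // the prepared txn occupies two consecutive sequence numbers; both must be
+  // tracked when max advances past them.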
+
+ ASSERT_OK(db->Put(write_options, "key2", "value"));
+ // Will cause max advance due to disabled commit cache
+ ASSERT_OK(db->Put(write_options, "key3", "value"));
+
+ auto s = db->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val);
+ ASSERT_TRUE(s.IsNotFound());
+ delete txn0;
+
+ WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+ ASSERT_OK(wp_db->db_impl_->FlushWAL(true));
+ wp_db->TEST_Crash();
+ ASSERT_OK(ReOpenNoDelete());
+ ASSERT_NE(db, nullptr);
+ s = db->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val);
+ ASSERT_TRUE(s.IsNotFound());
+
+ txn0 = db->GetTransactionByName("xid");
+ ASSERT_OK(txn0->Rollback());
+ delete txn0;
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+// Stress SmallestUnCommittedSeq, which reads from both prepared_txns_ and
+// delayed_prepared_, when it is run concurrently with advancing
+// max_evicted_seq_, which moves prepared txns from prepared_txns_ to
+// delayed_prepared_.
+TEST_P(WritePreparedTransactionTest, SmallestUnCommittedSeq) {
+ const size_t snapshot_cache_bits = 7; // same as default
+ const size_t commit_cache_bits = 1; // disable commit cache
+ UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+ ASSERT_OK(ReOpen());
+ WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+ ReadOptions ropt;
+ PinnableSlice pinnable_val;
+ WriteOptions write_options;
+ TransactionOptions txn_options;
+ std::vector<Transaction*> txns, committed_txns;
+
+ const int cnt = 100;
+ for (int i = 0; i < cnt; i++) {
+ Transaction* txn = db->BeginTransaction(write_options, txn_options);
+ ASSERT_OK(txn->SetName("xid" + std::to_string(i)));
+ auto key = "key1" + std::to_string(i);
+ auto value = "value1" + std::to_string(i);
+ ASSERT_OK(txn->Put(Slice(key), Slice(value)));
+ ASSERT_OK(txn->Prepare());
+ txns.push_back(txn);
+ }
+
+ port::Mutex mutex;
+ Random rnd(1103);
+ ROCKSDB_NAMESPACE::port::Thread commit_thread([&]() {
+ for (int i = 0; i < cnt; i++) {
+ uint32_t index = rnd.Uniform(cnt - i);
+ Transaction* txn;
+ {
+ MutexLock l(&mutex);
+ txn = txns[index];
+ txns.erase(txns.begin() + index);
+ }
+ // Since commit cache is practically disabled, commit results in immediate
+ // advance in max_evicted_seq_ and subsequently moving some prepared txns
+ // to delayed_prepared_.
+ ASSERT_OK(txn->Commit());
+ committed_txns.push_back(txn);
+ }
+ });
+ ROCKSDB_NAMESPACE::port::Thread read_thread([&]() {
+ while (1) {
+ MutexLock l(&mutex);
+ if (txns.empty()) {
+ break;
+ }
+ auto min_uncommitted = wp_db->SmallestUnCommittedSeq();
+ ASSERT_LE(min_uncommitted, (*txns.begin())->GetId());
+ }
+ });
+
+ commit_thread.join();
+ read_thread.join();
+ for (auto txn : committed_txns) {
+ delete txn;
+ }
+}
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_P(SeqAdvanceConcurrentTest, SeqAdvanceConcurrent) {
+ // Given the sequential run of txns, with this timeout we should never see a
+ // deadlock nor a timeout unless we have a key conflict, which should be
+ // almost infeasible.
+ txn_db_options.transaction_lock_timeout = 1000;
+ txn_db_options.default_lock_timeout = 1000;
+ ASSERT_OK(ReOpen());
+ FlushOptions fopt;
+
+ // Number of different txn types we use in this test
+ const size_t type_cnt = 5;
+ // The size of the first write group
+  // TODO(myabandeh): This should be increased for pre-release tests
+ const size_t first_group_size = 2;
+ // Total number of txns we run in each test
+  // TODO(myabandeh): This should be increased for pre-release tests
+ const size_t txn_cnt = first_group_size + 1;
+
+ size_t base[txn_cnt + 1] = {
+ 1,
+ };
+ for (size_t bi = 1; bi <= txn_cnt; bi++) {
+ base[bi] = base[bi - 1] * type_cnt;
+ }
+ const size_t max_n = static_cast<size_t>(std::pow(type_cnt, txn_cnt));
+ printf("Number of cases being tested is %" ROCKSDB_PRIszt "\n", max_n);
+ for (size_t n = 0; n < max_n; n++) {
+ if (n > 0) {
+ ASSERT_OK(ReOpen());
+ }
+
+ if (n % split_cnt_ != split_id_) continue;
+ if (n % 1000 == 0) {
+ printf("Tested %" ROCKSDB_PRIszt " cases so far\n", n);
+ }
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ auto seq = db_impl->TEST_GetLastVisibleSequence();
+ with_empty_commits = 0;
+ exp_seq = seq;
+ // This is increased before writing the batch for commit
+ commit_writes = 0;
+ // This is increased before txn starts linking if it expects to do a commit
+ // eventually
+ expected_commits = 0;
+ std::vector<port::Thread> threads;
+
+ linked.store(0, std::memory_order_release);
+ std::atomic<bool> batch_formed(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::EnterAsBatchGroupLeader:End",
+ [&](void* /*arg*/) { batch_formed = true; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:Wait", [&](void* /*arg*/) {
+ size_t orig_linked = linked.fetch_add(1, std::memory_order_acq_rel);
+ if (orig_linked == 0) {
+ // Wait until the others are linked too.
+ while (linked.load(std::memory_order_acquire) < first_group_size) {
+ }
+ } else if (orig_linked == first_group_size) {
+            // Form the 2nd batch from the rest of the writes plus any
+            // follow-up commits from the first batch
+ while (linked.load(std::memory_order_acquire) <
+ txn_cnt + commit_writes) {
+ }
+ }
+ // Then we will have one or more batches consisting of follow-up
+ // commits from the 2nd batch. There is a bit of non-determinism here
+ // but it should be tolerable.
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ for (size_t bi = 0; bi < txn_cnt; bi++) {
+      // Get the bi-th digit of n in the base-type_cnt number system
+ size_t d = (n % base[bi + 1]) / base[bi];
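+      // e.g., with type_cnt = 5 and txn_cnt = 3, n = 7 decodes to the digits
+      // 2, 1, 0 for bi = 0, 1, 2, i.e. TestTxn2, TestTxn1, TestTxn0.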
+ switch (d) {
+ case 0:
+ threads.emplace_back(&TransactionTestBase::TestTxn0, this, bi);
+ break;
+ case 1:
+ threads.emplace_back(&TransactionTestBase::TestTxn1, this, bi);
+ break;
+ case 2:
+ threads.emplace_back(&TransactionTestBase::TestTxn2, this, bi);
+ break;
+ case 3:
+ threads.emplace_back(&TransactionTestBase::TestTxn3, this, bi);
+ break;
+ case 4:
+ threads.emplace_back(&TransactionTestBase::TestTxn3, this, bi);
+ break;
+ default:
+ FAIL();
+ }
+ // wait to be linked
+ while (linked.load(std::memory_order_acquire) <= bi) {
+ }
+ // after a queue of size first_group_size
+ if (bi + 1 == first_group_size) {
+ while (!batch_formed) {
+ }
+ // to make it more deterministic, wait until the commits are linked
+ while (linked.load(std::memory_order_acquire) <=
+ bi + expected_commits) {
+ }
+ }
+ }
+ for (auto& t : threads) {
+ t.join();
+ }
+ if (options.two_write_queues) {
+      // In this case none of the above scheduling tricks to deterministically
+      // form merged batches work because the writes go to separate queues.
+      // This would result in different write groups in each run of the test.
+      // We still keep the test since, although non-deterministic and hard to
+      // debug, it is still useful to have.
+      // TODO(myabandeh): Add a deterministic unit test for two_write_queues
+ }
+
+ // Check if memtable inserts advanced seq number as expected
+ seq = db_impl->TEST_GetLastVisibleSequence();
+ ASSERT_EQ(exp_seq, seq);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ // Check if recovery preserves the last sequence number
+ ASSERT_OK(db_impl->FlushWAL(true));
+ ASSERT_OK(ReOpenNoDelete());
+ ASSERT_NE(db, nullptr);
+ db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ seq = db_impl->TEST_GetLastVisibleSequence();
+ ASSERT_LE(exp_seq, seq + with_empty_commits);
+
+ // Check if flush preserves the last sequence number
+ ASSERT_OK(db_impl->Flush(fopt));
+ seq = db_impl->GetLatestSequenceNumber();
+ ASSERT_LE(exp_seq, seq + with_empty_commits);
+
+ // Check if recovery after flush preserves the last sequence number
+ ASSERT_OK(db_impl->FlushWAL(true));
+ ASSERT_OK(ReOpenNoDelete());
+ ASSERT_NE(db, nullptr);
+ db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ seq = db_impl->GetLatestSequenceNumber();
+ ASSERT_LE(exp_seq, seq + with_empty_commits);
+ }
+}
+
+// Run a couple of different txns, some of them left uncommitted. Restart the
+// db at a couple of points to check whether the list of uncommitted txns is
+// recovered properly.
+TEST_P(WritePreparedTransactionTest, BasicRecovery) {
+ options.disable_auto_compactions = true;
+ ASSERT_OK(ReOpen());
+ WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+
+ TestTxn0(0);
+
+ TransactionOptions txn_options;
+ WriteOptions write_options;
+ size_t index = 1000;
+ Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+ auto istr0 = std::to_string(index);
+ auto s = txn0->SetName("xid" + istr0);
+ ASSERT_OK(s);
+ s = txn0->Put(Slice("foo0" + istr0), Slice("bar0" + istr0));
+ ASSERT_OK(s);
+ s = txn0->Prepare();
+ ASSERT_OK(s);
+ auto prep_seq_0 = txn0->GetId();
+
+ TestTxn1(0);
+
+ index++;
+ Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+ auto istr1 = std::to_string(index);
+ s = txn1->SetName("xid" + istr1);
+ ASSERT_OK(s);
+ s = txn1->Put(Slice("foo1" + istr1), Slice("bar"));
+ ASSERT_OK(s);
+ s = txn1->Prepare();
+ ASSERT_OK(s);
+ auto prep_seq_1 = txn1->GetId();
+
+ TestTxn2(0);
+
+ ReadOptions ropt;
+ PinnableSlice pinnable_val;
+ // Check the value is not committed before restart
+ s = db->Get(ropt, db->DefaultColumnFamily(), "foo0" + istr0, &pinnable_val);
+ ASSERT_TRUE(s.IsNotFound());
+ pinnable_val.Reset();
+
+ delete txn0;
+ delete txn1;
+ ASSERT_OK(wp_db->db_impl_->FlushWAL(true));
+ wp_db->TEST_Crash();
+ ASSERT_OK(ReOpenNoDelete());
+ ASSERT_NE(db, nullptr);
+ wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+ // After recovery, all the uncommitted txns (0 and 1) should be inserted into
+ // delayed_prepared_
+ ASSERT_TRUE(wp_db->prepared_txns_.empty());
+ ASSERT_FALSE(wp_db->delayed_prepared_empty_);
+ ASSERT_LE(prep_seq_0, wp_db->max_evicted_seq_);
+ ASSERT_LE(prep_seq_1, wp_db->max_evicted_seq_);
+ {
+ ReadLock rl(&wp_db->prepared_mutex_);
+ ASSERT_EQ(2, wp_db->delayed_prepared_.size());
+ ASSERT_TRUE(wp_db->delayed_prepared_.find(prep_seq_0) !=
+ wp_db->delayed_prepared_.end());
+ ASSERT_TRUE(wp_db->delayed_prepared_.find(prep_seq_1) !=
+ wp_db->delayed_prepared_.end());
+ }
+
+ // Check the value is still not committed after restart
+ s = db->Get(ropt, db->DefaultColumnFamily(), "foo0" + istr0, &pinnable_val);
+ ASSERT_TRUE(s.IsNotFound());
+ pinnable_val.Reset();
+
+ TestTxn3(0);
+
+  // Test that recovered txns will be properly marked committed for the next
+  // recovery
+ txn1 = db->GetTransactionByName("xid" + istr1);
+ ASSERT_NE(txn1, nullptr);
+ ASSERT_OK(txn1->Commit());
+ delete txn1;
+
+ index++;
+ Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+ auto istr2 = std::to_string(index);
+ s = txn2->SetName("xid" + istr2);
+ ASSERT_OK(s);
+ s = txn2->Put(Slice("foo2" + istr2), Slice("bar"));
+ ASSERT_OK(s);
+ s = txn2->Prepare();
+ ASSERT_OK(s);
+ auto prep_seq_2 = txn2->GetId();
+
+ delete txn2;
+ ASSERT_OK(wp_db->db_impl_->FlushWAL(true));
+ wp_db->TEST_Crash();
+ ASSERT_OK(ReOpenNoDelete());
+ ASSERT_NE(db, nullptr);
+ wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+ ASSERT_TRUE(wp_db->prepared_txns_.empty());
+ ASSERT_FALSE(wp_db->delayed_prepared_empty_);
+
+ // 0 and 2 are prepared and 1 is committed
+ {
+ ReadLock rl(&wp_db->prepared_mutex_);
+ ASSERT_EQ(2, wp_db->delayed_prepared_.size());
+ const auto& end = wp_db->delayed_prepared_.end();
+ ASSERT_NE(wp_db->delayed_prepared_.find(prep_seq_0), end);
+ ASSERT_EQ(wp_db->delayed_prepared_.find(prep_seq_1), end);
+ ASSERT_NE(wp_db->delayed_prepared_.find(prep_seq_2), end);
+ }
+ ASSERT_LE(prep_seq_0, wp_db->max_evicted_seq_);
+ ASSERT_LE(prep_seq_2, wp_db->max_evicted_seq_);
+
+ // Commit all the remaining txns
+ txn0 = db->GetTransactionByName("xid" + istr0);
+ ASSERT_NE(txn0, nullptr);
+ ASSERT_OK(txn0->Commit());
+ txn2 = db->GetTransactionByName("xid" + istr2);
+ ASSERT_NE(txn2, nullptr);
+ ASSERT_OK(txn2->Commit());
+
+ // Check the value is committed after commit
+ s = db->Get(ropt, db->DefaultColumnFamily(), "foo0" + istr0, &pinnable_val);
+ ASSERT_TRUE(s.ok());
+ ASSERT_TRUE(pinnable_val == ("bar0" + istr0));
+ pinnable_val.Reset();
+
+ delete txn0;
+ delete txn2;
+ ASSERT_OK(wp_db->db_impl_->FlushWAL(true));
+ ASSERT_OK(ReOpenNoDelete());
+ ASSERT_NE(db, nullptr);
+ wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+ ASSERT_TRUE(wp_db->prepared_txns_.empty());
+ ASSERT_TRUE(wp_db->delayed_prepared_empty_);
+
+ // Check the value is still committed after recovery
+ s = db->Get(ropt, db->DefaultColumnFamily(), "foo0" + istr0, &pinnable_val);
+ ASSERT_TRUE(s.ok());
+ ASSERT_TRUE(pinnable_val == ("bar0" + istr0));
+ pinnable_val.Reset();
+}
+
+// After recovery the commit map is empty while the max is set. The code would
+// go through a different path which requires a separate test. Test that the
+// committed data before the restart is visible to all snapshots.
+TEST_P(WritePreparedTransactionTest, IsInSnapshotEmptyMap) {
+ for (bool end_with_prepare : {false, true}) {
+ ASSERT_OK(ReOpen());
+ WriteOptions woptions;
+ ASSERT_OK(db->Put(woptions, "key", "value"));
+ ASSERT_OK(db->Put(woptions, "key", "value"));
+ ASSERT_OK(db->Put(woptions, "key", "value"));
+ SequenceNumber prepare_seq = kMaxSequenceNumber;
+ if (end_with_prepare) {
+ TransactionOptions txn_options;
+ Transaction* txn = db->BeginTransaction(woptions, txn_options);
+ ASSERT_OK(txn->SetName("xid0"));
+ ASSERT_OK(txn->Prepare());
+ prepare_seq = txn->GetId();
+ delete txn;
+ }
+ dynamic_cast<WritePreparedTxnDB*>(db)->TEST_Crash();
+ auto db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ ASSERT_OK(db_impl->FlushWAL(true));
+ ASSERT_OK(ReOpenNoDelete());
+ WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+ ASSERT_NE(wp_db, nullptr);
+ ASSERT_GT(wp_db->max_evicted_seq_, 0); // max after recovery
+ // Take a snapshot right after recovery
+ const Snapshot* snap = db->GetSnapshot();
+ auto snap_seq = snap->GetSequenceNumber();
+ ASSERT_GT(snap_seq, 0);
+
+ for (SequenceNumber seq = 0;
+ seq <= wp_db->max_evicted_seq_ && seq != prepare_seq; seq++) {
+ ASSERT_TRUE(wp_db->IsInSnapshot(seq, snap_seq));
+ }
+ if (end_with_prepare) {
+ ASSERT_FALSE(wp_db->IsInSnapshot(prepare_seq, snap_seq));
+ }
+ // trivial check
+ ASSERT_FALSE(wp_db->IsInSnapshot(snap_seq + 1, snap_seq));
+
+ db->ReleaseSnapshot(snap);
+
+ ASSERT_OK(db->Put(woptions, "key", "value"));
+ // Take a snapshot after some writes
+ snap = db->GetSnapshot();
+ snap_seq = snap->GetSequenceNumber();
+ for (SequenceNumber seq = 0;
+ seq <= wp_db->max_evicted_seq_ && seq != prepare_seq; seq++) {
+ ASSERT_TRUE(wp_db->IsInSnapshot(seq, snap_seq));
+ }
+ if (end_with_prepare) {
+ ASSERT_FALSE(wp_db->IsInSnapshot(prepare_seq, snap_seq));
+ }
+ // trivial check
+ ASSERT_FALSE(wp_db->IsInSnapshot(snap_seq + 1, snap_seq));
+
+ db->ReleaseSnapshot(snap);
+ }
+}
+
+// Shows the contract of IsInSnapshot when called on invalid/released snapshots
+TEST_P(WritePreparedTransactionTest, IsInSnapshotReleased) {
+ WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+ WriteOptions woptions;
+ ASSERT_OK(db->Put(woptions, "key", "value"));
+ // snap seq = 1
+ const Snapshot* snap1 = db->GetSnapshot();
+ ASSERT_OK(db->Put(woptions, "key", "value"));
+ ASSERT_OK(db->Put(woptions, "key", "value"));
+ // snap seq = 3
+ const Snapshot* snap2 = db->GetSnapshot();
+ const SequenceNumber seq = 1;
+ // Evict seq out of commit cache
+ size_t overwrite_seq = wp_db->COMMIT_CACHE_SIZE + seq;
+ wp_db->AddCommitted(overwrite_seq, overwrite_seq);
+ SequenceNumber snap_seq;
+ uint64_t min_uncommitted = kMinUnCommittedSeq;
+ bool released;
+
+ released = false;
+ snap_seq = snap1->GetSequenceNumber();
+ ASSERT_LE(seq, snap_seq);
+ // Valid snapshot lower than max
+ ASSERT_LE(snap_seq, wp_db->max_evicted_seq_);
+ ASSERT_TRUE(wp_db->IsInSnapshot(seq, snap_seq, min_uncommitted, &released));
+ ASSERT_FALSE(released);
+
+ released = false;
+ snap_seq = snap1->GetSequenceNumber();
+  // Invalid snapshot lower than max
+ ASSERT_LE(snap_seq + 1, wp_db->max_evicted_seq_);
+ ASSERT_TRUE(
+ wp_db->IsInSnapshot(seq, snap_seq + 1, min_uncommitted, &released));
+ ASSERT_TRUE(released);
+
+ db->ReleaseSnapshot(snap1);
+
+ released = false;
+ // Released snapshot lower than max
+ ASSERT_TRUE(wp_db->IsInSnapshot(seq, snap_seq, min_uncommitted, &released));
+  // The release does not take effect until the next max advance
+ ASSERT_FALSE(released);
+
+ released = false;
+  // Invalid snapshot lower than max
+ ASSERT_TRUE(
+ wp_db->IsInSnapshot(seq, snap_seq + 1, min_uncommitted, &released));
+ ASSERT_TRUE(released);
+
+  // This makes the snapshot release reflect in the txn db structures
+ wp_db->AdvanceMaxEvictedSeq(wp_db->max_evicted_seq_,
+ wp_db->max_evicted_seq_ + 1);
+
+ released = false;
+ // Released snapshot lower than max
+ ASSERT_TRUE(wp_db->IsInSnapshot(seq, snap_seq, min_uncommitted, &released));
+ ASSERT_TRUE(released);
+
+ released = false;
+  // Invalid snapshot lower than max
+ ASSERT_TRUE(
+ wp_db->IsInSnapshot(seq, snap_seq + 1, min_uncommitted, &released));
+ ASSERT_TRUE(released);
+
+ snap_seq = snap2->GetSequenceNumber();
+
+ released = false;
+ // Unreleased snapshot lower than max
+ ASSERT_TRUE(wp_db->IsInSnapshot(seq, snap_seq, min_uncommitted, &released));
+ ASSERT_FALSE(released);
+
+ db->ReleaseSnapshot(snap2);
+}
+
+// Test WritePreparedTxnDB's IsInSnapshot against different ordering of
+// snapshot, max_committed_seq_, prepared, and commit entries.
+TEST_P(WritePreparedTransactionTest, IsInSnapshot) {
+ WriteOptions wo;
+ // Use small commit cache to trigger lots of eviction and fast advance of
+ // max_evicted_seq_
+ const size_t commit_cache_bits = 3;
+ // Same for snapshot cache size
+ const size_t snapshot_cache_bits = 2;
+
+  // Take some preliminary snapshots first. This is to stress the data
+  // structure that holds the old snapshots, as it is designed to be efficient
+  // when only a few snapshots are below the max_evicted_seq_.
+ for (int max_snapshots = 1; max_snapshots < 20; max_snapshots++) {
+    // Leave some gap between the preliminary snapshots and the final snapshot
+    // that we check. This should also test different overlapping scenarios
+    // between the last snapshot and the commits.
+ for (int max_gap = 1; max_gap < 10; max_gap++) {
+      // Since we do not actually write to the db, we mock the seq as it would
+      // be increased by the db. The only exception is that we need the db seq
+      // to advance for our snapshots, for which we apply a dummy put each time
+      // we increase our mock of seq.
+ uint64_t seq = 0;
+ // At each step we prepare a txn and then we commit it in the next txn.
+ // This emulates the consecutive transactions that write to the same key
+ uint64_t cur_txn = 0;
+ // Number of snapshots taken so far
+ int num_snapshots = 0;
+ // Number of gaps applied so far
+ int gap_cnt = 0;
+ // The final snapshot that we will inspect
+ uint64_t snapshot = 0;
+ bool found_committed = false;
+      // To stress the data structure that maintains prepared txns, at each
+      // cycle we add a new prepared txn. These are not meant to be committed
+      // for snapshot inspection.
+ std::set<uint64_t> prepared;
+ // We keep the list of txns committed before we take the last snapshot.
+ // These should be the only seq numbers that will be found in the snapshot
+ std::set<uint64_t> committed_before;
+ // The set of commit seq numbers to be excluded from IsInSnapshot queries
+ std::set<uint64_t> commit_seqs;
+ DBImpl* mock_db = new DBImpl(options, dbname);
+ UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+ std::unique_ptr<WritePreparedTxnDBMock> wp_db(
+ new WritePreparedTxnDBMock(mock_db, txn_db_options));
+ // We continue until max advances a bit beyond the snapshot.
+ while (!snapshot || wp_db->max_evicted_seq_ < snapshot + 100) {
+ // do prepare for a transaction
+ seq++;
+ wp_db->AddPrepared(seq);
+ prepared.insert(seq);
+
+ // If cur_txn is not started, do prepare for it.
+ if (!cur_txn) {
+ seq++;
+ cur_txn = seq;
+ wp_db->AddPrepared(cur_txn);
+ } else { // else commit it
+ seq++;
+ wp_db->AddCommitted(cur_txn, seq);
+ wp_db->RemovePrepared(cur_txn);
+ commit_seqs.insert(seq);
+ if (!snapshot) {
+ committed_before.insert(cur_txn);
+ }
+ cur_txn = 0;
+ }
+
+ if (num_snapshots < max_snapshots - 1) {
+ // Take preliminary snapshots
+ wp_db->TakeSnapshot(seq);
+ num_snapshots++;
+ } else if (gap_cnt < max_gap) {
+ // Wait for some gap before taking the final snapshot
+ gap_cnt++;
+ } else if (!snapshot) {
+ // Take the final snapshot if it is not already taken
+ snapshot = seq;
+ wp_db->TakeSnapshot(snapshot);
+ num_snapshots++;
+ }
+
+ // If the snapshot is taken, verify seq numbers visible to it. We redo
+ // it at each cycle to test that the system is still sound when
+ // max_evicted_seq_ advances.
+ if (snapshot) {
+ for (uint64_t s = 1;
+ s <= seq && commit_seqs.find(s) == commit_seqs.end(); s++) {
+ bool was_committed =
+ (committed_before.find(s) != committed_before.end());
+ bool is_in_snapshot = wp_db->IsInSnapshot(s, snapshot);
+ if (was_committed != is_in_snapshot) {
+ printf("max_snapshots %d max_gap %d seq %" PRIu64 " max %" PRIu64
+ " snapshot %" PRIu64
+ " gap_cnt %d num_snapshots %d s %" PRIu64 "\n",
+ max_snapshots, max_gap, seq,
+ wp_db->max_evicted_seq_.load(), snapshot, gap_cnt,
+ num_snapshots, s);
+ }
+ ASSERT_EQ(was_committed, is_in_snapshot);
+ found_committed = found_committed || is_in_snapshot;
+ }
+ }
+ }
+ // Safety check to make sure the test actually ran
+ ASSERT_TRUE(found_committed);
+      // As an extra check, verify that the prepared set is properly emptied
+      // after the txns are committed.
+ if (cur_txn) {
+ wp_db->AddCommitted(cur_txn, seq);
+ wp_db->RemovePrepared(cur_txn);
+ }
+ for (auto p : prepared) {
+ wp_db->AddCommitted(p, seq);
+ wp_db->RemovePrepared(p);
+ }
+ ASSERT_TRUE(wp_db->delayed_prepared_.empty());
+ ASSERT_TRUE(wp_db->prepared_txns_.empty());
+ }
+ }
+}
+
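+// Helper that checks that the current value of `key`, read through both Get
+// and MultiGet, still matches the status/value captured before the
+// transaction under test ran.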
+void ASSERT_SAME(ReadOptions roptions, TransactionDB* db, Status exp_s,
+ PinnableSlice& exp_v, Slice key) {
+ Status s;
+ PinnableSlice v;
+ s = db->Get(roptions, db->DefaultColumnFamily(), key, &v);
+ ASSERT_EQ(exp_s, s);
+ ASSERT_TRUE(s.ok() || s.IsNotFound());
+ if (s.ok()) {
+ ASSERT_TRUE(exp_v == v);
+ }
+
+ // Try with MultiGet API too
+ std::vector<std::string> values;
+ auto s_vec =
+ db->MultiGet(roptions, {db->DefaultColumnFamily()}, {key}, &values);
+ ASSERT_EQ(1, values.size());
+ ASSERT_EQ(1, s_vec.size());
+ s = s_vec[0];
+ ASSERT_EQ(exp_s, s);
+ ASSERT_TRUE(s.ok() || s.IsNotFound());
+ if (s.ok()) {
+ ASSERT_TRUE(exp_v == values[0]);
+ }
+}
+
+void ASSERT_SAME(TransactionDB* db, Status exp_s, PinnableSlice& exp_v,
+ Slice key) {
+ ASSERT_SAME(ReadOptions(), db, exp_s, exp_v, key);
+}
+
+TEST_P(WritePreparedTransactionTest, Rollback) {
+ ReadOptions roptions;
+ WriteOptions woptions;
+ TransactionOptions txn_options;
+ const size_t num_keys = 4;
+ const size_t num_values = 5;
+ for (size_t ikey = 1; ikey <= num_keys; ikey++) {
+ for (size_t ivalue = 0; ivalue < num_values; ivalue++) {
+ for (bool crash : {false, true}) {
+ ASSERT_OK(ReOpen());
+ WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+ std::string key_str = "key" + std::to_string(ikey);
+ switch (ivalue) {
+ case 0:
+ break;
+ case 1:
+ ASSERT_OK(db->Put(woptions, key_str, "initvalue1"));
+ break;
+ case 2:
+ ASSERT_OK(db->Merge(woptions, key_str, "initvalue2"));
+ break;
+ case 3:
+ ASSERT_OK(db->Delete(woptions, key_str));
+ break;
+ case 4:
+ ASSERT_OK(db->SingleDelete(woptions, key_str));
+ break;
+ default:
+ FAIL();
+ }
+
+ PinnableSlice v1;
+ auto s1 =
+ db->Get(roptions, db->DefaultColumnFamily(), Slice("key1"), &v1);
+ PinnableSlice v2;
+ auto s2 =
+ db->Get(roptions, db->DefaultColumnFamily(), Slice("key2"), &v2);
+ PinnableSlice v3;
+ auto s3 =
+ db->Get(roptions, db->DefaultColumnFamily(), Slice("key3"), &v3);
+ PinnableSlice v4;
+ auto s4 =
+ db->Get(roptions, db->DefaultColumnFamily(), Slice("key4"), &v4);
+ Transaction* txn = db->BeginTransaction(woptions, txn_options);
+ auto s = txn->SetName("xid0");
+ ASSERT_OK(s);
+ s = txn->Put(Slice("key1"), Slice("value1"));
+ ASSERT_OK(s);
+ s = txn->Merge(Slice("key2"), Slice("value2"));
+ ASSERT_OK(s);
+ s = txn->Delete(Slice("key3"));
+ ASSERT_OK(s);
+ s = txn->SingleDelete(Slice("key4"));
+ ASSERT_OK(s);
+ s = txn->Prepare();
+ ASSERT_OK(s);
+
+ {
+ ReadLock rl(&wp_db->prepared_mutex_);
+ ASSERT_FALSE(wp_db->prepared_txns_.empty());
+ ASSERT_EQ(txn->GetId(), wp_db->prepared_txns_.top());
+ }
+
+ ASSERT_SAME(db, s1, v1, "key1");
+ ASSERT_SAME(db, s2, v2, "key2");
+ ASSERT_SAME(db, s3, v3, "key3");
+ ASSERT_SAME(db, s4, v4, "key4");
+
+ if (crash) {
+ delete txn;
+ auto db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ ASSERT_OK(db_impl->FlushWAL(true));
+ dynamic_cast<WritePreparedTxnDB*>(db)->TEST_Crash();
+ ASSERT_OK(ReOpenNoDelete());
+ ASSERT_NE(db, nullptr);
+ wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+ txn = db->GetTransactionByName("xid0");
+ ASSERT_FALSE(wp_db->delayed_prepared_empty_);
+ ReadLock rl(&wp_db->prepared_mutex_);
+ ASSERT_TRUE(wp_db->prepared_txns_.empty());
+ ASSERT_FALSE(wp_db->delayed_prepared_.empty());
+ ASSERT_TRUE(wp_db->delayed_prepared_.find(txn->GetId()) !=
+ wp_db->delayed_prepared_.end());
+ }
+
+ ASSERT_SAME(db, s1, v1, "key1");
+ ASSERT_SAME(db, s2, v2, "key2");
+ ASSERT_SAME(db, s3, v3, "key3");
+ ASSERT_SAME(db, s4, v4, "key4");
+
+ s = txn->Rollback();
+ ASSERT_OK(s);
+
+ {
+ ASSERT_TRUE(wp_db->delayed_prepared_empty_);
+ ReadLock rl(&wp_db->prepared_mutex_);
+ ASSERT_TRUE(wp_db->prepared_txns_.empty());
+ ASSERT_TRUE(wp_db->delayed_prepared_.empty());
+ }
+
+ ASSERT_SAME(db, s1, v1, "key1");
+ ASSERT_SAME(db, s2, v2, "key2");
+ ASSERT_SAME(db, s3, v3, "key3");
+ ASSERT_SAME(db, s4, v4, "key4");
+ delete txn;
+ }
+ }
+ }
+}
+
+TEST_P(WritePreparedTransactionTest, DisableGCDuringRecovery) {
+ // Use large buffer to avoid memtable flush after 1024 insertions
+ options.write_buffer_size = 1024 * 1024;
+ ASSERT_OK(ReOpen());
+ std::vector<KeyVersion> versions;
+ uint64_t seq = 0;
+ for (uint64_t i = 1; i <= 1024; i++) {
+ std::string v = "bar" + std::to_string(i);
+ ASSERT_OK(db->Put(WriteOptions(), "foo", v));
+ VerifyKeys({{"foo", v}});
+ seq++; // one for the key/value
+ KeyVersion kv = {"foo", v, seq, kTypeValue};
+ if (options.two_write_queues) {
+ seq++; // one for the commit
+ }
+ versions.emplace_back(kv);
+ }
+ std::reverse(std::begin(versions), std::end(versions));
+ VerifyInternalKeys(versions);
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ ASSERT_OK(db_impl->FlushWAL(true));
+ // Use small buffer to ensure memtable flush during recovery
+ options.write_buffer_size = 1024;
+ ASSERT_OK(ReOpenNoDelete());
+ VerifyInternalKeys(versions);
+}
+
+TEST_P(WritePreparedTransactionTest, SequenceNumberZero) {
+ ASSERT_OK(db->Put(WriteOptions(), "foo", "bar"));
+ VerifyKeys({{"foo", "bar"}});
+ const Snapshot* snapshot = db->GetSnapshot();
+ ASSERT_OK(db->Flush(FlushOptions()));
+ // Dummy keys to prevent compaction from trivially moving files and thus
+ // bypassing the actual compaction logic.
+ ASSERT_OK(db->Put(WriteOptions(), "a", "dummy"));
+ ASSERT_OK(db->Put(WriteOptions(), "z", "dummy"));
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ // Compaction will output keys with sequence number 0 if they are visible to
+ // the earliest snapshot. Make sure IsInSnapshot() reports that sequence
+ // number 0 is visible to any snapshot.
+ VerifyKeys({{"foo", "bar"}});
+ VerifyKeys({{"foo", "bar"}}, snapshot);
+ VerifyInternalKeys({{"foo", "bar", 0, kTypeValue}});
+ db->ReleaseSnapshot(snapshot);
+}
+
+// Compaction should not remove a key if it is not committed, and should
+// proceed with older versions of the key as if the new version doesn't exist.
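+// Before the transaction commits, the reads below should still return the
+// pre-transaction values, and the uncommitted versions must be kept with
+// their original sequence numbers; only committed versions visible to the
+// earliest snapshot may have their sequence zeroed.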
+TEST_P(WritePreparedTransactionTest, CompactionShouldKeepUncommittedKeys) {
+ options.disable_auto_compactions = true;
+ ASSERT_OK(ReOpen());
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ // Snapshots to avoid keys getting evicted.
+ std::vector<const Snapshot*> snapshots;
+ // Keep track of expected sequence number.
+ SequenceNumber expected_seq = 0;
+
+ auto add_key = [&](std::function<Status()> func) {
+ ASSERT_OK(func());
+ expected_seq++;
+ if (options.two_write_queues) {
+ expected_seq++; // 1 for commit
+ }
+ ASSERT_EQ(expected_seq, db_impl->TEST_GetLastVisibleSequence());
+ snapshots.push_back(db->GetSnapshot());
+ };
+
+ // Each key here represents a standalone test case.
+ add_key([&]() { return db->Put(WriteOptions(), "key1", "value1_1"); });
+ add_key([&]() { return db->Put(WriteOptions(), "key2", "value2_1"); });
+ add_key([&]() { return db->Put(WriteOptions(), "key3", "value3_1"); });
+ add_key([&]() { return db->Put(WriteOptions(), "key4", "value4_1"); });
+ add_key([&]() { return db->Merge(WriteOptions(), "key5", "value5_1"); });
+ add_key([&]() { return db->Merge(WriteOptions(), "key5", "value5_2"); });
+ add_key([&]() { return db->Put(WriteOptions(), "key6", "value6_1"); });
+ add_key([&]() { return db->Put(WriteOptions(), "key7", "value7_1"); });
+ ASSERT_OK(db->Flush(FlushOptions()));
+ add_key([&]() { return db->Delete(WriteOptions(), "key6"); });
+ add_key([&]() { return db->SingleDelete(WriteOptions(), "key7"); });
+
+ auto* transaction = db->BeginTransaction(WriteOptions());
+ ASSERT_OK(transaction->SetName("txn"));
+ ASSERT_OK(transaction->Put("key1", "value1_2"));
+ ASSERT_OK(transaction->Delete("key2"));
+ ASSERT_OK(transaction->SingleDelete("key3"));
+ ASSERT_OK(transaction->Merge("key4", "value4_2"));
+ ASSERT_OK(transaction->Merge("key5", "value5_3"));
+ ASSERT_OK(transaction->Put("key6", "value6_2"));
+ ASSERT_OK(transaction->Put("key7", "value7_2"));
+ // Prepare but not commit.
+ ASSERT_OK(transaction->Prepare());
+ ASSERT_EQ(++expected_seq, db->GetLatestSequenceNumber());
+ ASSERT_OK(db->Flush(FlushOptions()));
+ for (auto* s : snapshots) {
+ db->ReleaseSnapshot(s);
+ }
+ // Dummy keys to prevent compaction from trivially moving files and thus
+ // bypassing the actual compaction logic.
+ ASSERT_OK(db->Put(WriteOptions(), "a", "dummy"));
+ ASSERT_OK(db->Put(WriteOptions(), "z", "dummy"));
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ VerifyKeys({
+ {"key1", "value1_1"},
+ {"key2", "value2_1"},
+ {"key3", "value3_1"},
+ {"key4", "value4_1"},
+ {"key5", "value5_1,value5_2"},
+ {"key6", "NOT_FOUND"},
+ {"key7", "NOT_FOUND"},
+ });
+ VerifyInternalKeys({
+ {"key1", "value1_2", expected_seq, kTypeValue},
+ {"key1", "value1_1", 0, kTypeValue},
+ {"key2", "", expected_seq, kTypeDeletion},
+ {"key2", "value2_1", 0, kTypeValue},
+ {"key3", "", expected_seq, kTypeSingleDeletion},
+ {"key3", "value3_1", 0, kTypeValue},
+ {"key4", "value4_2", expected_seq, kTypeMerge},
+ {"key4", "value4_1", 0, kTypeValue},
+ {"key5", "value5_3", expected_seq, kTypeMerge},
+ {"key5", "value5_1,value5_2", 0, kTypeValue},
+ {"key6", "value6_2", expected_seq, kTypeValue},
+ {"key7", "value7_2", expected_seq, kTypeValue},
+ });
+ ASSERT_OK(transaction->Commit());
+ VerifyKeys({
+ {"key1", "value1_2"},
+ {"key2", "NOT_FOUND"},
+ {"key3", "NOT_FOUND"},
+ {"key4", "value4_1,value4_2"},
+ {"key5", "value5_1,value5_2,value5_3"},
+ {"key6", "value6_2"},
+ {"key7", "value7_2"},
+ });
+ delete transaction;
+}
+
+// Compaction should keep keys visible to a snapshot based on commit sequence,
+// not just prepare sequence.
+TEST_P(WritePreparedTransactionTest, CompactionShouldKeepSnapshotVisibleKeys) {
+ options.disable_auto_compactions = true;
+ ASSERT_OK(ReOpen());
+ // Keep track of expected sequence number.
+ SequenceNumber expected_seq = 0;
+ auto* txn1 = db->BeginTransaction(WriteOptions());
+ ASSERT_OK(txn1->SetName("txn1"));
+ ASSERT_OK(txn1->Put("key1", "value1_1"));
+ ASSERT_OK(txn1->Prepare());
+ ASSERT_EQ(++expected_seq, db->GetLatestSequenceNumber());
+ ASSERT_OK(txn1->Commit());
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ ASSERT_EQ(++expected_seq, db_impl->TEST_GetLastVisibleSequence());
+ delete txn1;
+ // Take a snapshot to avoid keys getting evicted before compaction.
+ const Snapshot* snapshot1 = db->GetSnapshot();
+ auto* txn2 = db->BeginTransaction(WriteOptions());
+ ASSERT_OK(txn2->SetName("txn2"));
+ ASSERT_OK(txn2->Put("key2", "value2_1"));
+ ASSERT_OK(txn2->Prepare());
+ ASSERT_EQ(++expected_seq, db->GetLatestSequenceNumber());
+ // txn1 commits before snapshot2, so it is visible to snapshot2.
+ // txn2 commits after snapshot2, so it is not visible.
+ const Snapshot* snapshot2 = db->GetSnapshot();
+ ASSERT_OK(txn2->Commit());
+ ASSERT_EQ(++expected_seq, db_impl->TEST_GetLastVisibleSequence());
+ delete txn2;
+ // Take a snapshot to avoid keys getting evicted before compaction.
+ const Snapshot* snapshot3 = db->GetSnapshot();
+ ASSERT_OK(db->Put(WriteOptions(), "key1", "value1_2"));
+ expected_seq++; // 1 for write
+ SequenceNumber seq1 = expected_seq;
+ if (options.two_write_queues) {
+ expected_seq++; // 1 for commit
+ }
+ ASSERT_EQ(expected_seq, db_impl->TEST_GetLastVisibleSequence());
+ ASSERT_OK(db->Put(WriteOptions(), "key2", "value2_2"));
+ expected_seq++; // 1 for write
+ SequenceNumber seq2 = expected_seq;
+ if (options.two_write_queues) {
+ expected_seq++; // 1 for commit
+ }
+ ASSERT_EQ(expected_seq, db_impl->TEST_GetLastVisibleSequence());
+ ASSERT_OK(db->Flush(FlushOptions()));
+ db->ReleaseSnapshot(snapshot1);
+ db->ReleaseSnapshot(snapshot3);
+ // Dummy keys to prevent compaction from trivially moving files and thus
+ // bypassing the actual compaction logic.
+ ASSERT_OK(db->Put(WriteOptions(), "a", "dummy"));
+ ASSERT_OK(db->Put(WriteOptions(), "z", "dummy"));
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ VerifyKeys({{"key1", "value1_2"}, {"key2", "value2_2"}});
+ VerifyKeys({{"key1", "value1_1"}, {"key2", "NOT_FOUND"}}, snapshot2);
+ VerifyInternalKeys({
+ {"key1", "value1_2", seq1, kTypeValue},
+ // "value1_1" is visible to snapshot2. Also keys at bottom level visible
+ // to earliest snapshot will output with seq = 0.
+ {"key1", "value1_1", 0, kTypeValue},
+ {"key2", "value2_2", seq2, kTypeValue},
+ });
+ db->ReleaseSnapshot(snapshot2);
+}
+
+TEST_P(WritePreparedTransactionTest, SmallestUncommittedOptimization) {
+ const size_t snapshot_cache_bits = 7; // same as default
+ const size_t commit_cache_bits = 0; // disable commit cache
+ for (bool has_recent_prepare : {true, false}) {
+ UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+ ASSERT_OK(ReOpen());
+
+ ASSERT_OK(db->Put(WriteOptions(), "key1", "value1"));
+ auto* transaction =
+ db->BeginTransaction(WriteOptions(), TransactionOptions(), nullptr);
+ ASSERT_OK(transaction->SetName("txn"));
+ ASSERT_OK(transaction->Delete("key1"));
+ ASSERT_OK(transaction->Prepare());
+ // snapshot1 should get min_uncommitted from prepared_txns_ heap.
+ auto snapshot1 = db->GetSnapshot();
+ ASSERT_EQ(transaction->GetId(),
+ ((SnapshotImpl*)snapshot1)->min_uncommitted_);
+ // Add a commit to advance max_evicted_seq and move the prepared transaction
+ // into delayed_prepared_ set.
+ ASSERT_OK(db->Put(WriteOptions(), "key2", "value2"));
+ Transaction* txn2 = nullptr;
+ if (has_recent_prepare) {
+ txn2 =
+ db->BeginTransaction(WriteOptions(), TransactionOptions(), nullptr);
+ ASSERT_OK(txn2->SetName("txn2"));
+ ASSERT_OK(txn2->Put("key3", "value3"));
+ ASSERT_OK(txn2->Prepare());
+ }
+ // snapshot2 should get min_uncommitted from delayed_prepared_ set.
+ auto snapshot2 = db->GetSnapshot();
+ ASSERT_EQ(transaction->GetId(),
+ ((SnapshotImpl*)snapshot2)->min_uncommitted_);
+ ASSERT_OK(transaction->Commit());
+ delete transaction;
+ if (has_recent_prepare) {
+ ASSERT_OK(txn2->Commit());
+ delete txn2;
+ }
+ VerifyKeys({{"key1", "NOT_FOUND"}});
+ VerifyKeys({{"key1", "value1"}}, snapshot1);
+ VerifyKeys({{"key1", "value1"}}, snapshot2);
+ db->ReleaseSnapshot(snapshot1);
+ db->ReleaseSnapshot(snapshot2);
+ }
+}
+
+// Insert two values, v1 and v2, for a key. Between prepare and commit of v2
+// take two snapshots, s1 and s2. Release s1 during compaction.
+// Test to make sure compaction doesn't get confused and think s1 can see both
+// values, and thus compact out the older value by mistake.
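+// The interleaving, driven by the "CompactionIterator:AfterInit" sync point
+// below: Put v1; prepare v2 and take s1 and s2 before committing it; then,
+// once the flush's CompactionIterator has been initialized, release s1 and
+// write a few extra keys to advance max_evicted_seq. s2 must still see v1
+// afterwards.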
+TEST_P(WritePreparedTransactionTest, ReleaseSnapshotDuringCompaction) {
+ const size_t snapshot_cache_bits = 7; // same as default
+ const size_t commit_cache_bits = 0; // minimum commit cache
+ UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+ options.disable_auto_compactions = true;
+ ASSERT_OK(ReOpen());
+
+ ASSERT_OK(db->Put(WriteOptions(), "key1", "value1_1"));
+ auto* transaction =
+ db->BeginTransaction(WriteOptions(), TransactionOptions(), nullptr);
+ ASSERT_OK(transaction->SetName("txn"));
+ ASSERT_OK(transaction->Put("key1", "value1_2"));
+ ASSERT_OK(transaction->Prepare());
+ auto snapshot1 = db->GetSnapshot();
+ // Increment sequence number.
+ ASSERT_OK(db->Put(WriteOptions(), "key2", "value2"));
+ auto snapshot2 = db->GetSnapshot();
+ ASSERT_OK(transaction->Commit());
+ delete transaction;
+ VerifyKeys({{"key1", "value1_2"}});
+ VerifyKeys({{"key1", "value1_1"}}, snapshot1);
+ VerifyKeys({{"key1", "value1_1"}}, snapshot2);
+ // Add a flush to avoid compaction falling back to trivial move.
+
+ // The callback might be called twice; record the calling state to
+ // prevent it from running twice.
+ bool callback_finished = false;
+ auto callback = [&](void*) {
+ if (callback_finished) {
+ return;
+ }
+ // Release snapshot1 after CompactionIterator init.
+ // CompactionIterator needs to figure out that the earliest snapshot
+ // that can see key1:value1_2 is kMaxSequenceNumber, not
+ // snapshot1 or snapshot2.
+ db->ReleaseSnapshot(snapshot1);
+ // Add some keys to advance max_evicted_seq.
+ ASSERT_OK(db->Put(WriteOptions(), "key3", "value3"));
+ ASSERT_OK(db->Put(WriteOptions(), "key4", "value4"));
+ callback_finished = true;
+ };
+ SyncPoint::GetInstance()->SetCallBack("CompactionIterator:AfterInit",
+ callback);
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(db->Flush(FlushOptions()));
+ VerifyKeys({{"key1", "value1_2"}});
+ VerifyKeys({{"key1", "value1_1"}}, snapshot2);
+ db->ReleaseSnapshot(snapshot2);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// Insert two values, v1 and v2, for a key. Take two snapshots, s1 and s2,
+// after committing v2. Release s1 during compaction, right after compaction
+// processes v2 and before it processes v1. Test to make sure compaction
+// doesn't get confused, believe v1 and v2 are visible to different snapshots
+// (v1 by s2, v2 by s1), and refuse to compact out v1.
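+// The interleaving is driven by the "CompactionIterator:ProcessKV" sync point
+// below: the callback counts the "key1" entries it sees, and on the second
+// one (v1) it releases s1 and writes dummy keys to advance max_evicted_seq
+// and update old_commit_map. The flush must still drop v1 and keep only v2.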
+TEST_P(WritePreparedTransactionTest, ReleaseSnapshotDuringCompaction2) {
+ const size_t snapshot_cache_bits = 7; // same as default
+ const size_t commit_cache_bits = 0; // minimum commit cache
+ UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+ options.disable_auto_compactions = true;
+ ASSERT_OK(ReOpen());
+
+ ASSERT_OK(db->Put(WriteOptions(), "key1", "value1"));
+ ASSERT_OK(db->Put(WriteOptions(), "key1", "value2"));
+ SequenceNumber v2_seq = db->GetLatestSequenceNumber();
+ auto* s1 = db->GetSnapshot();
+ // Advance sequence number.
+ ASSERT_OK(db->Put(WriteOptions(), "key2", "dummy"));
+ auto* s2 = db->GetSnapshot();
+
+ int count_value = 0;
+ auto callback = [&](void* arg) {
+ auto* ikey = reinterpret_cast<ParsedInternalKey*>(arg);
+ if (ikey->user_key == "key1") {
+ count_value++;
+ if (count_value == 2) {
+ // Processing v1.
+ db->ReleaseSnapshot(s1);
+ // Add some keys to advance max_evicted_seq and update
+ // old_commit_map.
+ ASSERT_OK(db->Put(WriteOptions(), "key3", "dummy"));
+ ASSERT_OK(db->Put(WriteOptions(), "key4", "dummy"));
+ }
+ }
+ };
+ SyncPoint::GetInstance()->SetCallBack("CompactionIterator:ProcessKV",
+ callback);
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(db->Flush(FlushOptions()));
+ // value1 should be compacted out.
+ VerifyInternalKeys({{"key1", "value2", v2_seq, kTypeValue}});
+
+ // cleanup
+ db->ReleaseSnapshot(s2);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// Insert two values, v1 and v2, for a key. Insert another dummy key
+// so as to evict the commit cache entry for v2, while v1 is still in the
+// commit cache. Take two snapshots, s1 and s2. Release s1 during compaction.
+// Since the commit cache entry for v2 is evicted and old_commit_map doesn't
+// have s1 (it is released), the flush below should still compact v1 out.
+// TODO(myabandeh): how can we be sure that v2's commit info is evicted
+// (and not v1's)? Instead of putting a dummy, we can directly call
+// AddCommitted(v2_seq + cache_size, ...) to evict v2's entry from commit cache.
+TEST_P(WritePreparedTransactionTest, ReleaseSnapshotDuringCompaction3) {
+ const size_t snapshot_cache_bits = 7; // same as default
+ const size_t commit_cache_bits = 1; // commit cache size = 2
+ UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+ options.disable_auto_compactions = true;
+ ASSERT_OK(ReOpen());
+
+ // Add a dummy key to evict v2's commit cache entry, but keep v1's.
+ // It also advances max_evicted_seq and can trigger old_commit_map cleanup.
+ auto add_dummy = [&]() {
+ auto* txn_dummy =
+ db->BeginTransaction(WriteOptions(), TransactionOptions(), nullptr);
+ ASSERT_OK(txn_dummy->SetName("txn_dummy"));
+ ASSERT_OK(txn_dummy->Put("dummy", "dummy"));
+ ASSERT_OK(txn_dummy->Prepare());
+ ASSERT_OK(txn_dummy->Commit());
+ delete txn_dummy;
+ };
+
+ ASSERT_OK(db->Put(WriteOptions(), "key1", "value1"));
+ auto* txn =
+ db->BeginTransaction(WriteOptions(), TransactionOptions(), nullptr);
+ ASSERT_OK(txn->SetName("txn"));
+ ASSERT_OK(txn->Put("key1", "value2"));
+ ASSERT_OK(txn->Prepare());
+ // TODO(myabandeh): replace it with GetId()?
+ auto v2_seq = db->GetLatestSequenceNumber();
+ ASSERT_OK(txn->Commit());
+ delete txn;
+ auto* s1 = db->GetSnapshot();
+ // Dummy key to advance sequence number.
+ add_dummy();
+ auto* s2 = db->GetSnapshot();
+
+ // The callback might be called twice; record the calling state to
+ // prevent it from running twice.
+ bool callback_finished = false;
+ auto callback = [&](void*) {
+ if (callback_finished) {
+ return;
+ }
+ db->ReleaseSnapshot(s1);
+ // Add some dummy entries to trigger s1 being cleanup from old_commit_map.
+ add_dummy();
+ add_dummy();
+ callback_finished = true;
+ };
+ SyncPoint::GetInstance()->SetCallBack("CompactionIterator:AfterInit",
+ callback);
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(db->Flush(FlushOptions()));
+ // value1 should be compacted out.
+ VerifyInternalKeys({{"key1", "value2", v2_seq, kTypeValue}});
+
+ db->ReleaseSnapshot(s2);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(WritePreparedTransactionTest, ReleaseEarliestSnapshotDuringCompaction) {
+ const size_t snapshot_cache_bits = 7; // same as default
+ const size_t commit_cache_bits = 0; // minimum commit cache
+ UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+ options.disable_auto_compactions = true;
+ ASSERT_OK(ReOpen());
+
+ ASSERT_OK(db->Put(WriteOptions(), "key1", "value1"));
+ SequenceNumber put_seq = db->GetLatestSequenceNumber();
+ auto* transaction =
+ db->BeginTransaction(WriteOptions(), TransactionOptions(), nullptr);
+ ASSERT_OK(transaction->SetName("txn"));
+ ASSERT_OK(transaction->Delete("key1"));
+ ASSERT_OK(transaction->Prepare());
+ SequenceNumber del_seq = db->GetLatestSequenceNumber();
+ auto snapshot1 = db->GetSnapshot();
+ // Increment sequence number.
+ ASSERT_OK(db->Put(WriteOptions(), "key2", "value2"));
+ auto snapshot2 = db->GetSnapshot();
+ ASSERT_OK(transaction->Commit());
+ delete transaction;
+ VerifyKeys({{"key1", "NOT_FOUND"}});
+ VerifyKeys({{"key1", "value1"}}, snapshot1);
+ VerifyKeys({{"key1", "value1"}}, snapshot2);
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ auto callback = [&](void* compaction) {
+ // Release snapshot1 after CompactionIterator init.
+ // CompactionIterator needs to double check and find out that snapshot2 is now
+ // the earliest existing snapshot.
+ if (compaction != nullptr) {
+ db->ReleaseSnapshot(snapshot1);
+ // Add some keys to advance max_evicted_seq.
+ ASSERT_OK(db->Put(WriteOptions(), "key3", "value3"));
+ ASSERT_OK(db->Put(WriteOptions(), "key4", "value4"));
+ }
+ };
+ SyncPoint::GetInstance()->SetCallBack("CompactionIterator:AfterInit",
+ callback);
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // Dummy keys to prevent compaction from trivially moving files and thus
+ // bypassing the actual compaction logic.
+ ASSERT_OK(db->Put(WriteOptions(), "a", "dummy"));
+ ASSERT_OK(db->Put(WriteOptions(), "z", "dummy"));
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ // Only verify for key1. Both the put and delete for the key should be kept.
+ // Since the delete tombstone is not visible to snapshot2, we need to keep
+ // at least one version of the key, for write-conflict check.
+ VerifyInternalKeys({{"key1", "", del_seq, kTypeDeletion},
+ {"key1", "value1", put_seq, kTypeValue}});
+ db->ReleaseSnapshot(snapshot2);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(WritePreparedTransactionTest,
+ ReleaseEarliestSnapshotDuringCompaction_WithSD) {
+ constexpr size_t kSnapshotCacheBits = 7; // same as default
+ constexpr size_t kCommitCacheBits = 0; // minimum commit cache
+ UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits);
+ options.disable_auto_compactions = true;
+ ASSERT_OK(ReOpen());
+
+ ASSERT_OK(db->Put(WriteOptions(), "key", "value"));
+ ASSERT_OK(db->Put(WriteOptions(), "foo", "value"));
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ auto* txn = db->BeginTransaction(WriteOptions(), TransactionOptions(),
+ /*old_txn=*/nullptr);
+ ASSERT_OK(txn->SingleDelete("key"));
+ ASSERT_OK(txn->Put("wow", "value"));
+ ASSERT_OK(txn->SetName("txn"));
+ ASSERT_OK(txn->Prepare());
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ const bool two_write_queues = std::get<1>(GetParam());
+ if (two_write_queues) {
+ // In the case of two queues, commit another txn just to bump
+ // last_published_seq so that a subsequent GetSnapshot() call can return
+ // a snapshot with a higher sequence.
+ auto* dummy_txn = db->BeginTransaction(WriteOptions(), TransactionOptions(),
+ /*old_txn=*/nullptr);
+ ASSERT_OK(dummy_txn->Put("haha", "value"));
+ ASSERT_OK(dummy_txn->Commit());
+ delete dummy_txn;
+ }
+ auto* snapshot = db->GetSnapshot();
+
+ ASSERT_OK(txn->Commit());
+ delete txn;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::NextFromInput:SingleDelete:1", [&](void* arg) {
+ if (!arg) {
+ return;
+ }
+ db->ReleaseSnapshot(snapshot);
+
+ // Advance max_evicted_seq
+ ASSERT_OK(db->Put(WriteOptions(), "bar", "value"));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(WritePreparedTransactionTest,
+ ReleaseEarliestSnapshotDuringCompaction_WithSD2) {
+ constexpr size_t kSnapshotCacheBits = 7; // same as default
+ constexpr size_t kCommitCacheBits = 0; // minimum commit cache
+ UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits);
+ options.disable_auto_compactions = true;
+ ASSERT_OK(ReOpen());
+
+ ASSERT_OK(db->Put(WriteOptions(), "foo", "value"));
+ ASSERT_OK(db->Put(WriteOptions(), "key", "value"));
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ auto* txn = db->BeginTransaction(WriteOptions(), TransactionOptions(),
+ /*old_txn=*/nullptr);
+ ASSERT_OK(txn->Put("bar", "value"));
+ ASSERT_OK(txn->SingleDelete("key"));
+ ASSERT_OK(txn->SetName("txn"));
+ ASSERT_OK(txn->Prepare());
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ ASSERT_OK(txn->Commit());
+ delete txn;
+
+ ASSERT_OK(db->Put(WriteOptions(), "haha", "value"));
+
+ // Create a dummy transaction to take a snapshot for ww-conflict detection.
+ TransactionOptions txn_opts;
+ txn_opts.set_snapshot = true;
+ auto* dummy_txn =
+ db->BeginTransaction(WriteOptions(), txn_opts, /*old_txn=*/nullptr);
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::NextFromInput:SingleDelete:2", [&](void* /*arg*/) {
+ ASSERT_OK(dummy_txn->Rollback());
+ delete dummy_txn;
+
+ ASSERT_OK(db->Put(WriteOptions(), "dontcare", "value"));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(db->Put(WriteOptions(), "haha2", "value"));
+ auto* snapshot = db->GetSnapshot();
+
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ db->ReleaseSnapshot(snapshot);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(WritePreparedTransactionTest,
+ ReleaseEarliestSnapshotDuringCompaction_WithDelete) {
+ constexpr size_t kSnapshotCacheBits = 7; // same as default
+ constexpr size_t kCommitCacheBits = 0; // minimum commit cache
+ UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits);
+ options.disable_auto_compactions = true;
+ ASSERT_OK(ReOpen());
+
+ ASSERT_OK(db->Put(WriteOptions(), "a", "value"));
+ ASSERT_OK(db->Put(WriteOptions(), "b", "value"));
+ ASSERT_OK(db->Put(WriteOptions(), "c", "value"));
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ auto* txn = db->BeginTransaction(WriteOptions(), TransactionOptions(),
+ /*old_txn=*/nullptr);
+ ASSERT_OK(txn->Delete("b"));
+ ASSERT_OK(txn->SetName("txn"));
+ ASSERT_OK(txn->Prepare());
+
+ const bool two_write_queues = std::get<1>(GetParam());
+ if (two_write_queues) {
+ // In the case of two queues, commit another txn just to bump
+ // last_published_seq so that a subsequent GetSnapshot() call can return
+ // a snapshot with a higher sequence.
+ auto* dummy_txn = db->BeginTransaction(WriteOptions(), TransactionOptions(),
+ /*old_txn=*/nullptr);
+ ASSERT_OK(dummy_txn->Put("haha", "value"));
+ ASSERT_OK(dummy_txn->Commit());
+ delete dummy_txn;
+ }
+ auto* snapshot1 = db->GetSnapshot();
+ ASSERT_OK(txn->Commit());
+ delete txn;
+ auto* snapshot2 = db->GetSnapshot();
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::NextFromInput:BottommostDelete:1", [&](void* arg) {
+ if (!arg) {
+ return;
+ }
+ db->ReleaseSnapshot(snapshot1);
+
+ // Advance max_evicted_seq
+ ASSERT_OK(db->Put(WriteOptions(), "dummy1", "value"));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+ db->ReleaseSnapshot(snapshot2);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(WritePreparedTransactionTest,
+ ReleaseSnapshotBetweenSDAndPutDuringCompaction) {
+ constexpr size_t kSnapshotCacheBits = 7; // same as default
+ constexpr size_t kCommitCacheBits = 0; // minimum commit cache
+ UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits);
+ options.disable_auto_compactions = true;
+ ASSERT_OK(ReOpen());
+
+ // Create a dummy transaction to take a snapshot for ww-conflict detection.
+ TransactionOptions txn_opts;
+ txn_opts.set_snapshot = true;
+ auto* dummy_txn =
+ db->BeginTransaction(WriteOptions(), txn_opts, /*old_txn=*/nullptr);
+ // Increment seq
+ ASSERT_OK(db->Put(WriteOptions(), "bar", "value"));
+
+ ASSERT_OK(db->Put(WriteOptions(), "foo", "value"));
+ ASSERT_OK(db->SingleDelete(WriteOptions(), "foo"));
+ auto* snapshot1 = db->GetSnapshot();
+ // Increment seq
+ ASSERT_OK(db->Put(WriteOptions(), "dontcare", "value"));
+ auto* snapshot2 = db->GetSnapshot();
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::NextFromInput:KeepSDForWW", [&](void* /*arg*/) {
+ db->ReleaseSnapshot(snapshot1);
+
+ ASSERT_OK(db->Put(WriteOptions(), "dontcare2", "value2"));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(db->Flush(FlushOptions()));
+ db->ReleaseSnapshot(snapshot2);
+ ASSERT_OK(dummy_txn->Commit());
+ delete dummy_txn;
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(WritePreparedTransactionTest,
+ ReleaseEarliestWriteConflictSnapshot_SingleDelete) {
+ constexpr size_t kSnapshotCacheBits = 7; // same as default
+ constexpr size_t kCommitCacheBits = 0; // minimum commit cache
+ UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits);
+ options.disable_auto_compactions = true;
+ ASSERT_OK(ReOpen());
+
+ ASSERT_OK(db->Put(WriteOptions(), "a", "value"));
+ ASSERT_OK(db->Put(WriteOptions(), "b", "value"));
+ ASSERT_OK(db->Put(WriteOptions(), "c", "value"));
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ {
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ ASSERT_OK(db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr));
+ }
+
+ std::unique_ptr<Transaction> txn;
+ txn.reset(db->BeginTransaction(WriteOptions(), TransactionOptions(),
+ /*old_txn=*/nullptr));
+ ASSERT_OK(txn->SetName("txn1"));
+ ASSERT_OK(txn->SingleDelete("b"));
+ ASSERT_OK(txn->Prepare());
+ ASSERT_OK(txn->Commit());
+
+ auto* snapshot1 = db->GetSnapshot();
+
+ // Bump seq of the db by performing writes so that
+ // earliest_snapshot_ < earliest_write_conflict_snapshot_ in
+ // CompactionIterator.
+ ASSERT_OK(db->Put(WriteOptions(), "z", "dontcare"));
+
+ // Create another snapshot for write conflict checking
+ std::unique_ptr<Transaction> txn2;
+ {
+ TransactionOptions txn_opts;
+ txn_opts.set_snapshot = true;
+ txn2.reset(
+ db->BeginTransaction(WriteOptions(), txn_opts, /*old_txn=*/nullptr));
+ }
+
+ // Bump seq so that the subsequent bg flush won't create a snapshot with the
+ // same seq as the previous snapshot for conflict checking.
+ ASSERT_OK(db->Put(WriteOptions(), "y", "dont"));
+
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::NextFromInput:SingleDelete:1", [&](void* /*arg*/) {
+ // Rolling back txn2 should release its snapshot (for ww checking).
+ ASSERT_OK(txn2->Rollback());
+ txn2.reset();
+ // Advance max_evicted_seq
+ ASSERT_OK(db->Put(WriteOptions(), "x", "value"));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ db->ReleaseSnapshot(snapshot1);
+}
+
+TEST_P(WritePreparedTransactionTest, ReleaseEarliestSnapshotAfterSeqZeroing) {
+ constexpr size_t kSnapshotCacheBits = 7; // same as default
+ constexpr size_t kCommitCacheBits = 0; // minimum commit cache
+ UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits);
+ options.disable_auto_compactions = true;
+ ASSERT_OK(ReOpen());
+
+ ASSERT_OK(db->Put(WriteOptions(), "a", "value"));
+ ASSERT_OK(db->Put(WriteOptions(), "b", "value"));
+ ASSERT_OK(db->Put(WriteOptions(), "c", "value"));
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ {
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ ASSERT_OK(db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr));
+ }
+
+ ASSERT_OK(db->SingleDelete(WriteOptions(), "b"));
+
+ // Take a snapshot so that the SD won't be dropped during flush.
+ auto* tmp_snapshot = db->GetSnapshot();
+
+ ASSERT_OK(db->Put(WriteOptions(), "b", "value2"));
+ auto* snapshot = db->GetSnapshot();
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ db->ReleaseSnapshot(tmp_snapshot);
+
+ // Bump the sequence so that the snapshot of the bg compaction job below will
+ // be different from the above snapshot's sequence.
+ ASSERT_OK(db->Put(WriteOptions(), "z", "foo"));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput:ZeroingSeq", [&](void* arg) {
+ const auto* const ikey =
+ reinterpret_cast<const ParsedInternalKey*>(arg);
+ assert(ikey);
+ if (ikey->user_key == "b") {
+ assert(ikey->type == kTypeValue);
+ db->ReleaseSnapshot(snapshot);
+
+ // Bump max_evicted_seq.
+ ASSERT_OK(db->Put(WriteOptions(), "z", "dontcare"));
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(WritePreparedTransactionTest, ReleaseEarliestSnapshotAfterSeqZeroing2) {
+ constexpr size_t kSnapshotCacheBits = 7; // same as default
+ constexpr size_t kCommitCacheBits = 0; // minimum commit cache
+ UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits);
+ options.disable_auto_compactions = true;
+ ASSERT_OK(ReOpen());
+
+ // Generate an L0 with only SD for one key "b".
+ ASSERT_OK(db->Put(WriteOptions(), "a", "value"));
+ ASSERT_OK(db->Put(WriteOptions(), "b", "value"));
+ // Take a snapshot so that subsequent flush outputs the SD for "b".
+ auto* tmp_snapshot = db->GetSnapshot();
+ ASSERT_OK(db->SingleDelete(WriteOptions(), "b"));
+ ASSERT_OK(db->Put(WriteOptions(), "c", "value"));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::NextFromInput:SingleDelete:3", [&](void* arg) {
+ if (!arg) {
+ db->ReleaseSnapshot(tmp_snapshot);
+ // Bump max_evicted_seq
+ ASSERT_OK(db->Put(WriteOptions(), "x", "dontcare"));
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(db->Flush(FlushOptions()));
+ // Finish generating L0 with only SD for "b".
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ // Move the L0 to L2.
+ {
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ ASSERT_OK(db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr));
+ }
+
+ ASSERT_OK(db->Put(WriteOptions(), "b", "value1"));
+
+ auto* snapshot = db->GetSnapshot();
+
+ // Bump seq so that a subsequent flush/compaction job's snapshot is larger
+ // than the above snapshot's seq.
+ ASSERT_OK(db->Put(WriteOptions(), "x", "dontcare"));
+
+ // Generate a second L0.
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput:ZeroingSeq", [&](void* arg) {
+ const auto* const ikey =
+ reinterpret_cast<const ParsedInternalKey*>(arg);
+ assert(ikey);
+ if (ikey->user_key == "b") {
+ assert(ikey->type == kTypeValue);
+ db->ReleaseSnapshot(snapshot);
+
+ // Bump max_evicted_seq.
+ ASSERT_OK(db->Put(WriteOptions(), "z", "dontcare"));
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// Although the user contract indicates that an SD can only be issued for a key
+// that exists and has not been overwritten, it is still possible for a Delete
+// to be present when a write-prepared transaction is rolled back.
+TEST_P(WritePreparedTransactionTest, SingleDeleteAfterRollback) {
+ constexpr size_t kSnapshotCacheBits = 7; // same as default
+ constexpr size_t kCommitCacheBits = 0; // minimum commit cache
+ txn_db_options.rollback_deletion_type_callback =
+ [](TransactionDB*, ColumnFamilyHandle*, const Slice&) { return true; };
+ UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits);
+ options.disable_auto_compactions = true;
+ ASSERT_OK(ReOpen());
+
+ // Get a write conflict snapshot by creating a transaction with
+ // set_snapshot=true.
+ TransactionOptions txn_opts;
+ txn_opts.set_snapshot = true;
+ std::unique_ptr<Transaction> dummy_txn(
+ db->BeginTransaction(WriteOptions(), txn_opts));
+
+ std::unique_ptr<Transaction> txn0(
+ db->BeginTransaction(WriteOptions(), TransactionOptions()));
+ ASSERT_OK(txn0->Put("foo", "value"));
+ ASSERT_OK(txn0->SetName("xid0"));
+ ASSERT_OK(txn0->Prepare());
+
+ // Create an SST with only {"foo": "value"}.
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ // Insert a Delete to cancel out the prior Put by txn0.
+ ASSERT_OK(txn0->Rollback());
+ txn0.reset();
+
+ // Create a second SST.
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ ASSERT_OK(db->Put(WriteOptions(), "foo", "value1"));
+
+ auto* snapshot = db->GetSnapshot();
+
+ ASSERT_OK(db->SingleDelete(WriteOptions(), "foo"));
+
+ int count = 0;
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::NextFromInput:SingleDelete:1", [&](void* arg) {
+ const auto* const c = reinterpret_cast<const Compaction*>(arg);
+ assert(!c);
+ // Trigger once only for SingleDelete during flush.
+ if (0 == count) {
+ ++count;
+ db->ReleaseSnapshot(snapshot);
+ // Bump max_evicted_seq
+ ASSERT_OK(db->Put(WriteOptions(), "x", "dontcare"));
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // Create a third SST containing a SD without its matching PUT.
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ DBImpl* dbimpl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ assert(dbimpl);
+ ASSERT_OK(dbimpl->TEST_CompactRange(
+ /*level=*/0, /*begin=*/nullptr, /*end=*/nullptr,
+ /*column_family=*/nullptr, /*disallow_trivial_mode=*/true));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ // Release the conflict-checking snapshot.
+ ASSERT_OK(dummy_txn->Rollback());
+}
+
+// A more complex test to verify that compaction/flush keeps keys visible
+// to snapshots.
+TEST_P(WritePreparedTransactionTest,
+ CompactionKeepSnapshotVisibleKeysRandomized) {
+ constexpr size_t kNumTransactions = 10;
+ constexpr size_t kNumIterations = 1000;
+
+ std::vector<Transaction*> transactions(kNumTransactions, nullptr);
+ std::vector<size_t> versions(kNumTransactions, 0);
+ std::unordered_map<std::string, std::string> current_data;
+ std::vector<const Snapshot*> snapshots;
+ std::vector<std::unordered_map<std::string, std::string>> snapshot_data;
+
+ Random rnd(1103);
+ options.disable_auto_compactions = true;
+ ASSERT_OK(ReOpen());
+
+ for (size_t i = 0; i < kNumTransactions; i++) {
+ std::string key = "key" + std::to_string(i);
+ std::string value = "value0";
+ ASSERT_OK(db->Put(WriteOptions(), key, value));
+ current_data[key] = value;
+ }
+ VerifyKeys(current_data);
+
+ for (size_t iter = 0; iter < kNumIterations; iter++) {
+ auto r = rnd.Next() % (kNumTransactions + 1);
+ if (r < kNumTransactions) {
+ std::string key = "key" + std::to_string(r);
+ if (transactions[r] == nullptr) {
+ std::string value = "value" + std::to_string(versions[r] + 1);
+ auto* txn = db->BeginTransaction(WriteOptions());
+ ASSERT_OK(txn->SetName("txn" + std::to_string(r)));
+ ASSERT_OK(txn->Put(key, value));
+ ASSERT_OK(txn->Prepare());
+ transactions[r] = txn;
+ } else {
+ std::string value = "value" + std::to_string(++versions[r]);
+ ASSERT_OK(transactions[r]->Commit());
+ delete transactions[r];
+ transactions[r] = nullptr;
+ current_data[key] = value;
+ }
+ } else {
+ auto* snapshot = db->GetSnapshot();
+ VerifyKeys(current_data, snapshot);
+ snapshots.push_back(snapshot);
+ snapshot_data.push_back(current_data);
+ }
+ VerifyKeys(current_data);
+ }
+ // Take a last snapshot to test compaction with an uncommitted prepared
+ // transaction.
+ snapshots.push_back(db->GetSnapshot());
+ snapshot_data.push_back(current_data);
+
+ ASSERT_EQ(snapshots.size(), snapshot_data.size());
+ for (size_t i = 0; i < snapshots.size(); i++) {
+ VerifyKeys(snapshot_data[i], snapshots[i]);
+ }
+ ASSERT_OK(db->Flush(FlushOptions()));
+ for (size_t i = 0; i < snapshots.size(); i++) {
+ VerifyKeys(snapshot_data[i], snapshots[i]);
+ }
+ // Dummy keys to prevent compaction from trivially moving files and thus
+ // bypassing the actual compaction logic.
+ ASSERT_OK(db->Put(WriteOptions(), "a", "dummy"));
+ ASSERT_OK(db->Put(WriteOptions(), "z", "dummy"));
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ for (size_t i = 0; i < snapshots.size(); i++) {
+ VerifyKeys(snapshot_data[i], snapshots[i]);
+ }
+ // cleanup
+ for (size_t i = 0; i < kNumTransactions; i++) {
+ if (transactions[i] == nullptr) {
+ continue;
+ }
+ ASSERT_OK(transactions[i]->Commit());
+ delete transactions[i];
+ }
+ for (size_t i = 0; i < snapshots.size(); i++) {
+ db->ReleaseSnapshot(snapshots[i]);
+ }
+}
+
+// Compaction should not apply the optimization of outputting a key with
+// sequence number 0 if the key is not visible to the earliest snapshot,
+// based on its commit sequence number.
+TEST_P(WritePreparedTransactionTest,
+ CompactionShouldKeepSequenceForUncommittedKeys) {
+ options.disable_auto_compactions = true;
+ ASSERT_OK(ReOpen());
+ // Keep track of expected sequence number.
+ SequenceNumber expected_seq = 0;
+ auto* transaction = db->BeginTransaction(WriteOptions());
+ ASSERT_OK(transaction->SetName("txn"));
+ ASSERT_OK(transaction->Put("key1", "value1"));
+ ASSERT_OK(transaction->Prepare());
+ ASSERT_EQ(++expected_seq, db->GetLatestSequenceNumber());
+ SequenceNumber seq1 = expected_seq;
+ ASSERT_OK(db->Put(WriteOptions(), "key2", "value2"));
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ expected_seq++; // one for data
+ if (options.two_write_queues) {
+ expected_seq++; // one for commit
+ }
+ ASSERT_EQ(expected_seq, db_impl->TEST_GetLastVisibleSequence());
+ ASSERT_OK(db->Flush(FlushOptions()));
+ // Dummy keys to prevent compaction from trivially moving files and thus
+ // bypassing the actual compaction logic.
+ ASSERT_OK(db->Put(WriteOptions(), "a", "dummy"));
+ ASSERT_OK(db->Put(WriteOptions(), "z", "dummy"));
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ VerifyKeys({
+ {"key1", "NOT_FOUND"},
+ {"key2", "value2"},
+ });
+ VerifyInternalKeys({
+ // "key1" has not been committed. It keeps its sequence number.
+ {"key1", "value1", seq1, kTypeValue},
+ // "key2" is committed and output with seq = 0.
+ {"key2", "value2", 0, kTypeValue},
+ });
+ ASSERT_OK(transaction->Commit());
+ VerifyKeys({
+ {"key1", "value1"},
+ {"key2", "value2"},
+ });
+ delete transaction;
+}
+
+TEST_P(WritePreparedTransactionTest, CommitAndSnapshotDuringCompaction) {
+ options.disable_auto_compactions = true;
+ ASSERT_OK(ReOpen());
+
+ const Snapshot* snapshot = nullptr;
+ ASSERT_OK(db->Put(WriteOptions(), "key1", "value1"));
+ auto* txn = db->BeginTransaction(WriteOptions());
+ ASSERT_OK(txn->SetName("txn"));
+ ASSERT_OK(txn->Put("key1", "value2"));
+ ASSERT_OK(txn->Prepare());
+
+ auto callback = [&](void*) {
+ // The snapshot is taken after the compaction starts. It should be taken into
+ // consideration when deciding whether to compact out value1.
+ snapshot = db->GetSnapshot();
+ ASSERT_OK(txn->Commit());
+ delete txn;
+ };
+ SyncPoint::GetInstance()->SetCallBack("CompactionIterator:AfterInit",
+ callback);
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(db->Flush(FlushOptions()));
+ ASSERT_NE(nullptr, snapshot);
+ VerifyKeys({{"key1", "value2"}});
+ VerifyKeys({{"key1", "value1"}}, snapshot);
+ db->ReleaseSnapshot(snapshot);
+}
+
+TEST_P(WritePreparedTransactionTest, Iterate) {
+ auto verify_state = [](Iterator* iter, const std::string& key,
+ const std::string& value) {
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(key, iter->key().ToString());
+ ASSERT_EQ(value, iter->value().ToString());
+ };
+
+ auto verify_iter = [&](const std::string& expected_val) {
+ // Get iterator from a concurrent transaction and make sure it has the
+ // same view as an iterator from the DB.
+ auto* txn = db->BeginTransaction(WriteOptions());
+
+ for (int i = 0; i < 2; i++) {
+ Iterator* iter = (i == 0) ? db->NewIterator(ReadOptions())
+ : txn->GetIterator(ReadOptions());
+ // Seek
+ iter->Seek("foo");
+ verify_state(iter, "foo", expected_val);
+ // Next
+ iter->Seek("a");
+ verify_state(iter, "a", "va");
+ iter->Next();
+ verify_state(iter, "foo", expected_val);
+ // SeekForPrev
+ iter->SeekForPrev("y");
+ verify_state(iter, "foo", expected_val);
+ // Prev
+ iter->SeekForPrev("z");
+ verify_state(iter, "z", "vz");
+ iter->Prev();
+ verify_state(iter, "foo", expected_val);
+ delete iter;
+ }
+ delete txn;
+ };
+
+ ASSERT_OK(db->Put(WriteOptions(), "foo", "v1"));
+ auto* transaction = db->BeginTransaction(WriteOptions());
+ ASSERT_OK(transaction->SetName("txn"));
+ ASSERT_OK(transaction->Put("foo", "v2"));
+ ASSERT_OK(transaction->Prepare());
+ VerifyKeys({{"foo", "v1"}});
+ // dummy keys
+ ASSERT_OK(db->Put(WriteOptions(), "a", "va"));
+ ASSERT_OK(db->Put(WriteOptions(), "z", "vz"));
+ verify_iter("v1");
+ ASSERT_OK(transaction->Commit());
+ VerifyKeys({{"foo", "v2"}});
+ verify_iter("v2");
+ delete transaction;
+}
+
+TEST_P(WritePreparedTransactionTest, IteratorRefreshNotSupported) {
+ Iterator* iter = db->NewIterator(ReadOptions());
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Refresh().IsNotSupported());
+ delete iter;
+}
+
+// Committing a delayed prepared has two non-atomic steps: update the commit
+// cache, remove the seq from delayed_prepared_. The read in IsInSnapshot also
+// involves two non-atomic steps of checking these two data structures. This
+// test breaks each in the middle to ensure correctness in spite of non-atomic
+// execution.
+// Note: This test is limited to the case where the snapshot is larger than
+// max_evicted_seq_.
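+// The three interleavings exercised below, via sync-point dependencies:
+// 1) split_read && split_before_mutex: the reader pauses in IsInSnapshot
+//    right before acquiring prepared_mutex_, the commit runs, the reader
+//    resumes.
+// 2) split_read && !split_before_mutex: the reader pauses right after
+//    reading from the commit cache (GetCommitEntry), the commit runs, the
+//    reader resumes.
+// 3) !split_read: the committer pauses in RemovePrepared right before
+//    erasing the seq from delayed_prepared_, the read runs, the committer
+//    resumes.
+// In every case the read through the earlier snapshot must keep returning
+// the value written before the prepare.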
+TEST_P(WritePreparedTransactionTest, NonAtomicCommitOfDelayedPrepared) {
+ const size_t snapshot_cache_bits = 7; // same as default
+ const size_t commit_cache_bits = 3; // 8 entries
+ for (auto split_read : {true, false}) {
+ std::vector<bool> split_options = {false};
+ if (split_read) {
+ // Also test for break before mutex
+ split_options.push_back(true);
+ }
+ for (auto split_before_mutex : split_options) {
+ UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+ ASSERT_OK(ReOpen());
+ WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ // Fill up the commit cache
+ std::string init_value("value1");
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(db->Put(WriteOptions(), Slice("key1"), Slice(init_value)));
+ }
+ // Prepare a transaction but do not commit it
+ Transaction* txn =
+ db->BeginTransaction(WriteOptions(), TransactionOptions());
+ ASSERT_OK(txn->SetName("xid"));
+ ASSERT_OK(txn->Put(Slice("key1"), Slice("value2")));
+ ASSERT_OK(txn->Prepare());
+ // Commit a bunch of entries to advance max evicted seq and make the
+ // prepared a delayed prepared
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3")));
+ }
+ // The snapshot should not see the delayed prepared entry
+ auto snap = db->GetSnapshot();
+
+ if (split_read) {
+ if (split_before_mutex) {
+ // split before acquiring prepare_mutex_
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"WritePreparedTxnDB::IsInSnapshot:prepared_mutex_:pause",
+ "AtomicCommitOfDelayedPrepared:Commit:before"},
+ {"AtomicCommitOfDelayedPrepared:Commit:after",
+ "WritePreparedTxnDB::IsInSnapshot:prepared_mutex_:resume"}});
+ } else {
+ // split right after reading from the commit cache
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"WritePreparedTxnDB::IsInSnapshot:GetCommitEntry:pause",
+ "AtomicCommitOfDelayedPrepared:Commit:before"},
+ {"AtomicCommitOfDelayedPrepared:Commit:after",
+ "WritePreparedTxnDB::IsInSnapshot:GetCommitEntry:resume"}});
+ }
+ } else { // split commit
+ // split right before removing from delayed_prepared_
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"WritePreparedTxnDB::RemovePrepared:pause",
+ "AtomicCommitOfDelayedPrepared:Read:before"},
+ {"AtomicCommitOfDelayedPrepared:Read:after",
+ "WritePreparedTxnDB::RemovePrepared:resume"}});
+ }
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread commit_thread([&]() {
+ TEST_SYNC_POINT("AtomicCommitOfDelayedPrepared:Commit:before");
+ ASSERT_OK(txn->Commit());
+ if (split_before_mutex) {
+ // Do a bunch of inserts to evict the commit entry from the cache. This
+ // prevents the 2nd look into the commit cache under prepare_mutex_
+ // from seeing the commit entry.
+ auto seq = db_impl->TEST_GetLastVisibleSequence();
+ size_t tries = 0;
+ while (wp_db->max_evicted_seq_ < seq && tries < 50) {
+ ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3")));
+ tries++;
+ };
+ ASSERT_LT(tries, 50);
+ }
+ TEST_SYNC_POINT("AtomicCommitOfDelayedPrepared:Commit:after");
+ delete txn;
+ });
+
+ ROCKSDB_NAMESPACE::port::Thread read_thread([&]() {
+ TEST_SYNC_POINT("AtomicCommitOfDelayedPrepared:Read:before");
+ ReadOptions roptions;
+ roptions.snapshot = snap;
+ PinnableSlice value;
+ auto s = db->Get(roptions, db->DefaultColumnFamily(), "key1", &value);
+ ASSERT_OK(s);
+ // It should not see the commit of the delayed prepared
+ ASSERT_TRUE(value == init_value);
+ TEST_SYNC_POINT("AtomicCommitOfDelayedPrepared:Read:after");
+ db->ReleaseSnapshot(snap);
+ });
+
+ read_thread.join();
+ commit_thread.join();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ } // for split_before_mutex
+ } // for split_read
+}
+
+// When max evicted seq advances past a prepared seq, it involves two updates:
+// i) adding the prepared seq to delayed_prepared_, ii) updating
+// max_evicted_seq_. ::IsInSnapshot also reads these two values in a
+// non-atomic way. This test ensures correctness if the update occurs after
+// ::IsInSnapshot reads delayed_prepared_empty_ and before it reads
+// max_evicted_seq_.
+// Note: this test focuses on a read snapshot larger than max_evicted_seq_.
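+// Interleaving sketch: the reader pauses in IsInSnapshot right after reading
+// delayed_prepared_empty_; the commit thread then writes enough entries to
+// push max_evicted_seq_ past the prepared seq, moving it into
+// delayed_prepared_; the reader resumes and must still not see the
+// uncommitted value.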
+TEST_P(WritePreparedTransactionTest, NonAtomicUpdateOfDelayedPrepared) {
+ const size_t snapshot_cache_bits = 7; // same as default
+ const size_t commit_cache_bits = 3; // 8 entries
+ UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+ ASSERT_OK(ReOpen());
+ WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+ // Fill up the commit cache
+ std::string init_value("value1");
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(db->Put(WriteOptions(), Slice("key1"), Slice(init_value)));
+ }
+ // Prepare a transaction but do not commit it
+ Transaction* txn = db->BeginTransaction(WriteOptions(), TransactionOptions());
+ ASSERT_OK(txn->SetName("xid"));
+ ASSERT_OK(txn->Put(Slice("key1"), Slice("value2")));
+ ASSERT_OK(txn->Prepare());
+ // Create a gap between prepare seq and snapshot seq
+ ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3")));
+ ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3")));
+ // The snapshot should not see the delayed prepared entry
+ auto snap = db->GetSnapshot();
+ ASSERT_LT(txn->GetId(), snap->GetSequenceNumber());
+
+ // split right after reading delayed_prepared_empty_
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"WritePreparedTxnDB::IsInSnapshot:delayed_prepared_empty_:pause",
+ "AtomicUpdateOfDelayedPrepared:before"},
+ {"AtomicUpdateOfDelayedPrepared:after",
+ "WritePreparedTxnDB::IsInSnapshot:delayed_prepared_empty_:resume"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread commit_thread([&]() {
+ TEST_SYNC_POINT("AtomicUpdateOfDelayedPrepared:before");
+ // Commit a bunch of entries to advance max evicted seq and make the
+ // prepared a delayed prepared
+ size_t tries = 0;
+ while (wp_db->max_evicted_seq_ < txn->GetId() && tries < 50) {
+ ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3")));
+ tries++;
+ };
+ ASSERT_LT(tries, 50);
+ // This is the case on which the test focuses
+ ASSERT_LT(wp_db->max_evicted_seq_, snap->GetSequenceNumber());
+ TEST_SYNC_POINT("AtomicUpdateOfDelayedPrepared:after");
+ });
+
+ ROCKSDB_NAMESPACE::port::Thread read_thread([&]() {
+ ReadOptions roptions;
+ roptions.snapshot = snap;
+ PinnableSlice value;
+ auto s = db->Get(roptions, db->DefaultColumnFamily(), "key1", &value);
+ ASSERT_OK(s);
+ // It should not see the uncommitted value of the delayed prepared
+ ASSERT_TRUE(value == init_value);
+ db->ReleaseSnapshot(snap);
+ });
+
+ read_thread.join();
+ commit_thread.join();
+ ASSERT_OK(txn->Commit());
+ delete txn;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// Eviction from the commit cache and the update of max evicted seq are two
+// non-atomic steps. Similarly, the read of max_evicted_seq_ in ::IsInSnapshot
+// and the read from the commit cache are two non-atomic steps. This tests the
+// case where the update occurs after reading max_evicted_seq_ and before
+// reading the commit cache.
+// Note: the test focuses on a snapshot larger than max_evicted_seq_.
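+// Interleaving sketch: the reader pauses in IsInSnapshot right after reading
+// max_evicted_seq_; the commit thread then writes enough entries to evict the
+// committed entry from the commit cache and advance max_evicted_seq_ past
+// txn->GetId(); the reader resumes and must still report the committed value
+// as visible to the snapshot.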
+TEST_P(WritePreparedTransactionTest, NonAtomicUpdateOfMaxEvictedSeq) {
+ const size_t snapshot_cache_bits = 7; // same as default
+ const size_t commit_cache_bits = 3; // 8 entries
+ UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+ ASSERT_OK(ReOpen());
+ WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+ // Fill up the commit cache
+ std::string init_value("value1");
+ std::string last_value("value_final");
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(db->Put(WriteOptions(), Slice("key1"), Slice(init_value)));
+ }
+ // Do an uncommitted write to prevent min_uncommitted optimization
+ Transaction* txn1 =
+ db->BeginTransaction(WriteOptions(), TransactionOptions());
+ ASSERT_OK(txn1->SetName("xid1"));
+ ASSERT_OK(txn1->Put(Slice("key0"), last_value));
+ ASSERT_OK(txn1->Prepare());
+ // Do a write with prepare to get the prepare seq
+ Transaction* txn = db->BeginTransaction(WriteOptions(), TransactionOptions());
+ ASSERT_OK(txn->SetName("xid"));
+ ASSERT_OK(txn->Put(Slice("key1"), last_value));
+ ASSERT_OK(txn->Prepare());
+ ASSERT_OK(txn->Commit());
+ // Create a gap between commit entry and snapshot seq
+ ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3")));
+ ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3")));
+ // The snapshot should see the last commit
+ auto snap = db->GetSnapshot();
+ ASSERT_LE(txn->GetId(), snap->GetSequenceNumber());
+
+ // split right after reading max_evicted_seq_
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"WritePreparedTxnDB::IsInSnapshot:max_evicted_seq_:pause",
+ "NonAtomicUpdateOfMaxEvictedSeq:before"},
+ {"NonAtomicUpdateOfMaxEvictedSeq:after",
+ "WritePreparedTxnDB::IsInSnapshot:max_evicted_seq_:resume"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread commit_thread([&]() {
+ TEST_SYNC_POINT("NonAtomicUpdateOfMaxEvictedSeq:before");
+ // Commit a bunch of entries to advance max evicted seq beyond txn->GetId()
+ size_t tries = 0;
+ while (wp_db->max_evicted_seq_ < txn->GetId() && tries < 50) {
+ ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3")));
+ tries++;
+ };
+ ASSERT_LT(tries, 50);
+ // This is the case on which the test focuses
+ ASSERT_LT(wp_db->max_evicted_seq_, snap->GetSequenceNumber());
+ TEST_SYNC_POINT("NonAtomicUpdateOfMaxEvictedSeq:after");
+ });
+
+ ROCKSDB_NAMESPACE::port::Thread read_thread([&]() {
+ ReadOptions roptions;
+ roptions.snapshot = snap;
+ PinnableSlice value;
+ auto s = db->Get(roptions, db->DefaultColumnFamily(), "key1", &value);
+ ASSERT_OK(s);
+ // It should see the committed value of the evicted entry
+ ASSERT_TRUE(value == last_value);
+ db->ReleaseSnapshot(snap);
+ });
+
+ read_thread.join();
+ commit_thread.join();
+ delete txn;
+ ASSERT_OK(txn1->Commit());
+ delete txn1;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// Test adding a prepared seq when max_evicted_seq_ has already gone beyond it.
+// The test focuses on a race condition between the AddPrepared and
+// AdvanceMaxEvictedSeq functions.
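+// Interleaving sketch, per the sync-point dependencies below: the writer's
+// AddPrepared pauses at its beginning; the read thread commits txn1 and txn2,
+// and with a one-entry commit cache the second commit triggers
+// AdvanceMaxEvictedSeq, which pauses right before updating max_evicted_seq_
+// until AddPrepared finishes. The prepared seq is thus registered while max
+// is advancing past it, and a snapshot taken afterwards must still not see
+// the uncommitted value.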
+TEST_P(WritePreparedTransactionTest, AddPreparedBeforeMax) {
+ if (!options.two_write_queues) {
+ // This test is only for two write queues
+ return;
+ }
+ const size_t snapshot_cache_bits = 7; // same as default
+ // 1 entry to advance max after the 2nd commit
+ const size_t commit_cache_bits = 0;
+ UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+ ASSERT_OK(ReOpen());
+ WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+ std::string some_value("value_some");
+ std::string uncommitted_value("value_uncommitted");
+ // Prepare two uncommitted transactions
+ Transaction* txn1 =
+ db->BeginTransaction(WriteOptions(), TransactionOptions());
+ ASSERT_OK(txn1->SetName("xid1"));
+ ASSERT_OK(txn1->Put(Slice("key1"), some_value));
+ ASSERT_OK(txn1->Prepare());
+ Transaction* txn2 =
+ db->BeginTransaction(WriteOptions(), TransactionOptions());
+ ASSERT_OK(txn2->SetName("xid2"));
+ ASSERT_OK(txn2->Put(Slice("key2"), some_value));
+ ASSERT_OK(txn2->Prepare());
+ // Start the txn here so the other thread could get its id
+ Transaction* txn = db->BeginTransaction(WriteOptions(), TransactionOptions());
+ ASSERT_OK(txn->SetName("xid"));
+ ASSERT_OK(txn->Put(Slice("key0"), uncommitted_value));
+ port::Mutex txn_mutex_;
+
+ // t1) Insert prepared entry, t2) commit other entries to advance max
+ // evicted seq and finish checking the existing prepared entries, t1)
+ // AddPrepared, t2) update max_evicted_seq_
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"AddPreparedCallback::AddPrepared::begin:pause",
+ "AddPreparedBeforeMax::read_thread:start"},
+ {"AdvanceMaxEvictedSeq::update_max:pause",
+ "AddPreparedCallback::AddPrepared::begin:resume"},
+ {"AddPreparedCallback::AddPrepared::end",
+ "AdvanceMaxEvictedSeq::update_max:resume"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread write_thread([&]() {
+ txn_mutex_.Lock();
+ ASSERT_OK(txn->Prepare());
+ txn_mutex_.Unlock();
+ });
+
+ ROCKSDB_NAMESPACE::port::Thread read_thread([&]() {
+ TEST_SYNC_POINT("AddPreparedBeforeMax::read_thread:start");
+ // Publish seq number with a commit
+ ASSERT_OK(txn1->Commit());
+ // Since the commit cache size is one, the 2nd commit evicts the 1st one and
+ // invokes AdvanceMaxEvictedSeq
+ ASSERT_OK(txn2->Commit());
+
+ ReadOptions roptions;
+ PinnableSlice value;
+ // The snapshot should not see the uncommitted value from write_thread
+ auto snap = db->GetSnapshot();
+ ASSERT_LT(wp_db->max_evicted_seq_, snap->GetSequenceNumber());
+ // This is the scenario that we test for
+ txn_mutex_.Lock();
+ ASSERT_GT(wp_db->max_evicted_seq_, txn->GetId());
+ txn_mutex_.Unlock();
+ roptions.snapshot = snap;
+ auto s = db->Get(roptions, db->DefaultColumnFamily(), "key0", &value);
+ ASSERT_TRUE(s.IsNotFound());
+ db->ReleaseSnapshot(snap);
+ });
+
+ read_thread.join();
+ write_thread.join();
+ delete txn1;
+ delete txn2;
+ ASSERT_OK(txn->Commit());
+ delete txn;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// When an old prepared entry gets committed, there is a gap between the time
+ // that it is published and when it is cleaned up from delayed_prepared_. This test
+// stresses such cases.
+TEST_P(WritePreparedTransactionTest, CommitOfDelayedPrepared) {
+ const size_t snapshot_cache_bits = 7; // same as default
+ for (const size_t commit_cache_bits : {0, 2, 3}) {
+ for (const size_t sub_batch_cnt : {1, 2, 3}) {
+ UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+ ASSERT_OK(ReOpen());
+ std::atomic<const Snapshot*> snap = {nullptr};
+ std::atomic<SequenceNumber> exp_prepare = {0};
+ ROCKSDB_NAMESPACE::port::Thread callback_thread;
+ // Value is synchronized via snap
+ PinnableSlice value;
+ // Take a snapshot after publish and before RemovePrepared:Start
+ auto snap_callback = [&]() {
+ ASSERT_EQ(nullptr, snap.load());
+ snap.store(db->GetSnapshot());
+ ReadOptions roptions;
+ roptions.snapshot = snap.load();
+ auto s = db->Get(roptions, db->DefaultColumnFamily(), "key2", &value);
+ ASSERT_OK(s);
+ };
+ auto callback = [&](void* param) {
+ SequenceNumber prep_seq = *((SequenceNumber*)param);
+ if (prep_seq == exp_prepare.load()) { // only for write_thread
+ // We need to spawn a thread to avoid deadlock since getting a
+ // snapshot might end up calling AdvanceSeqByOne, which needs to join
+ // the write queue.
+ callback_thread = ROCKSDB_NAMESPACE::port::Thread(snap_callback);
+ TEST_SYNC_POINT("callback:end");
+ }
+ };
+ // Wait for the first snapshot to be taken in GetSnapshotInternal. Although
+ // it might be updated before GetSnapshotInternal finishes, this should
+ // cover most of the cases.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"WritePreparedTxnDB::GetSnapshotInternal:first", "callback:end"},
+ });
+ SyncPoint::GetInstance()->SetCallBack("RemovePrepared:Start", callback);
+ SyncPoint::GetInstance()->EnableProcessing();
+ // Thread to cause frequent evictions
+ ROCKSDB_NAMESPACE::port::Thread eviction_thread([&]() {
+ // Too many txns might cause commit_seq - prepare_seq in another thread
+ // to go beyond DELTA_UPPERBOUND
+ for (int i = 0; i < 25 * (1 << commit_cache_bits); i++) {
+ ASSERT_OK(db->Put(WriteOptions(), Slice("key1"), Slice("value1")));
+ }
+ });
+ ROCKSDB_NAMESPACE::port::Thread write_thread([&]() {
+ for (int i = 0; i < 25 * (1 << commit_cache_bits); i++) {
+ Transaction* txn =
+ db->BeginTransaction(WriteOptions(), TransactionOptions());
+ ASSERT_OK(txn->SetName("xid"));
+ std::string val_str = "value" + std::to_string(i);
+ for (size_t b = 0; b < sub_batch_cnt; b++) {
+ ASSERT_OK(txn->Put(Slice("key2"), val_str));
+ }
+ ASSERT_OK(txn->Prepare());
+ // Let an eviction kick in
+ std::this_thread::yield();
+
+ exp_prepare.store(txn->GetId());
+ ASSERT_OK(txn->Commit());
+ delete txn;
+ // Wait for the snapshot-taking thread that is triggered by the
+ // RemovePrepared:Start callback
+ callback_thread.join();
+
+ // Read with the snapshot taken before delayed_prepared_ cleanup
+ ReadOptions roptions;
+ roptions.snapshot = snap.load();
+ ASSERT_NE(nullptr, roptions.snapshot);
+ PinnableSlice value2;
+ auto s =
+ db->Get(roptions, db->DefaultColumnFamily(), "key2", &value2);
+ ASSERT_OK(s);
+ // It should see its own write
+ ASSERT_TRUE(val_str == value2);
+ // The value read by snapshot should not change
+ ASSERT_STREQ(value2.ToString().c_str(), value.ToString().c_str());
+
+ db->ReleaseSnapshot(roptions.snapshot);
+ snap.store(nullptr);
+ }
+ });
+ write_thread.join();
+ eviction_thread.join();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ }
+ }
+}
+
+// Test that updating the commit map will not affect the existing snapshots
+TEST_P(WritePreparedTransactionTest, AtomicCommit) {
+ for (bool skip_prepare : {true, false}) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"WritePreparedTxnDB::AddCommitted:start",
+ "AtomicCommit::GetSnapshot:start"},
+ {"AtomicCommit::Get:end",
+ "WritePreparedTxnDB::AddCommitted:start:pause"},
+ {"WritePreparedTxnDB::AddCommitted:end", "AtomicCommit::Get2:start"},
+ {"AtomicCommit::Get2:end",
+ "WritePreparedTxnDB::AddCommitted:end:pause:"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ROCKSDB_NAMESPACE::port::Thread write_thread([&]() {
+ if (skip_prepare) {
+ ASSERT_OK(db->Put(WriteOptions(), Slice("key"), Slice("value")));
+ } else {
+ Transaction* txn =
+ db->BeginTransaction(WriteOptions(), TransactionOptions());
+ ASSERT_OK(txn->SetName("xid"));
+ ASSERT_OK(txn->Put(Slice("key"), Slice("value")));
+ ASSERT_OK(txn->Prepare());
+ ASSERT_OK(txn->Commit());
+ delete txn;
+ }
+ });
+ ROCKSDB_NAMESPACE::port::Thread read_thread([&]() {
+ ReadOptions roptions;
+ TEST_SYNC_POINT("AtomicCommit::GetSnapshot:start");
+ roptions.snapshot = db->GetSnapshot();
+ PinnableSlice val;
+ auto s = db->Get(roptions, db->DefaultColumnFamily(), "key", &val);
+ TEST_SYNC_POINT("AtomicCommit::Get:end");
+ TEST_SYNC_POINT("AtomicCommit::Get2:start");
+ ASSERT_SAME(roptions, db, s, val, "key");
+ TEST_SYNC_POINT("AtomicCommit::Get2:end");
+ db->ReleaseSnapshot(roptions.snapshot);
+ });
+ read_thread.join();
+ write_thread.join();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_P(WritePreparedTransactionTest, BasicRollbackDeletionTypeCb) {
+ options.level0_file_num_compaction_trigger = 2;
+ // Always use SingleDelete to roll back Put.
+ txn_db_options.rollback_deletion_type_callback =
+ [](TransactionDB*, ColumnFamilyHandle*, const Slice&) { return true; };
+
+ const auto write_to_db = [&]() {
+ assert(db);
+ std::unique_ptr<Transaction> txn0(
+ db->BeginTransaction(WriteOptions(), TransactionOptions()));
+ ASSERT_OK(txn0->SetName("txn0"));
+ ASSERT_OK(txn0->Put("a", "v0"));
+ ASSERT_OK(txn0->Prepare());
+
+ // Generate sst1: [PUT('a')]
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ {
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = options.num_levels - 1;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr));
+ }
+
+ ASSERT_OK(txn0->Rollback());
+ txn0.reset();
+
+ ASSERT_OK(db->Put(WriteOptions(), "a", "v1"));
+
+ ASSERT_OK(db->SingleDelete(WriteOptions(), "a"));
+ // Generate another SST with an SD to cover the oldest PUT('a')
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ auto* dbimpl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ assert(dbimpl);
+ ASSERT_OK(dbimpl->TEST_WaitForCompact());
+
+ {
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr));
+ }
+
+ {
+ std::string value;
+ const Status s = db->Get(ReadOptions(), "a", &value);
+ ASSERT_TRUE(s.IsNotFound());
+ }
+ };
+
+ // Destroy and reopen
+ ASSERT_OK(ReOpen());
+ write_to_db();
+}
+
+// Test that we can change write policy from WriteCommitted to WritePrepared
+// after a clean shutdown (which would empty the WAL)
+TEST_P(WritePreparedTransactionTest, WP_WC_DBBackwardCompatibility) {
+ bool empty_wal = true;
+ CrossCompatibilityTest(WRITE_COMMITTED, WRITE_PREPARED, empty_wal);
+}
+
+// Test that we fail fast if WAL is not emptied between changing the write
+// policy from WriteCommitted to WritePrepared
+TEST_P(WritePreparedTransactionTest, WP_WC_WALBackwardIncompatibility) {
+ bool empty_wal = true;
+ CrossCompatibilityTest(WRITE_COMMITTED, WRITE_PREPARED, !empty_wal);
+}
+
+ // Test that we can change write policy from WritePrepared back to WriteCommitted
+// after a clean shutdown (which would empty the WAL)
+TEST_P(WritePreparedTransactionTest, WC_WP_ForwardCompatibility) {
+ bool empty_wal = true;
+ CrossCompatibilityTest(WRITE_PREPARED, WRITE_COMMITTED, empty_wal);
+}
+
+// Test that we fail fast if WAL is not emptied between changing the write
+ // policy from WritePrepared to WriteCommitted
+TEST_P(WritePreparedTransactionTest, WC_WP_WALForwardIncompatibility) {
+ bool empty_wal = true;
+ CrossCompatibilityTest(WRITE_PREPARED, WRITE_COMMITTED, !empty_wal);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ if (getenv("CIRCLECI")) {
+ // Looking for backtrace on "Resource temporarily unavailable" exceptions
+ ::testing::FLAGS_gtest_catch_exceptions = false;
+ }
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as Transactions are not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_prepared_txn.cc b/src/rocksdb/utilities/transactions/write_prepared_txn.cc
new file mode 100644
index 000000000..16b5cc1cb
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/write_prepared_txn.cc
@@ -0,0 +1,512 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/write_prepared_txn.h"
+
+#include <cinttypes>
+#include <map>
+#include <set>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "util/cast_util.h"
+#include "utilities/transactions/pessimistic_transaction.h"
+#include "utilities/transactions/write_prepared_txn_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct WriteOptions;
+
+WritePreparedTxn::WritePreparedTxn(WritePreparedTxnDB* txn_db,
+ const WriteOptions& write_options,
+ const TransactionOptions& txn_options)
+ : PessimisticTransaction(txn_db, write_options, txn_options, false),
+ wpt_db_(txn_db) {
+ // Call Initialize outside the PessimisticTransaction constructor, otherwise
+ // it would skip the overridden functions in WritePreparedTxn since they are
+ // not bound yet while the PessimisticTransaction constructor runs
+ Initialize(txn_options);
+}
+
+void WritePreparedTxn::Initialize(const TransactionOptions& txn_options) {
+ PessimisticTransaction::Initialize(txn_options);
+ prepare_batch_cnt_ = 0;
+}
+
+void WritePreparedTxn::MultiGet(const ReadOptions& options,
+ ColumnFamilyHandle* column_family,
+ const size_t num_keys, const Slice* keys,
+ PinnableSlice* values, Status* statuses,
+ const bool sorted_input) {
+ SequenceNumber min_uncommitted, snap_seq;
+ const SnapshotBackup backed_by_snapshot =
+ wpt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq);
+ WritePreparedTxnReadCallback callback(wpt_db_, snap_seq, min_uncommitted,
+ backed_by_snapshot);
+ write_batch_.MultiGetFromBatchAndDB(db_, options, column_family, num_keys,
+ keys, values, statuses, sorted_input,
+ &callback);
+ if (UNLIKELY(!callback.valid() ||
+ !wpt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) {
+ wpt_db_->WPRecordTick(TXN_GET_TRY_AGAIN);
+ for (size_t i = 0; i < num_keys; i++) {
+ statuses[i] = Status::TryAgain();
+ }
+ }
+}
+
+Status WritePreparedTxn::Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* pinnable_val) {
+ SequenceNumber min_uncommitted, snap_seq;
+ const SnapshotBackup backed_by_snapshot =
+ wpt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq);
+ WritePreparedTxnReadCallback callback(wpt_db_, snap_seq, min_uncommitted,
+ backed_by_snapshot);
+ Status res = write_batch_.GetFromBatchAndDB(db_, options, column_family, key,
+ pinnable_val, &callback);
+ const bool callback_valid =
+ callback.valid(); // NOTE: validity of callback must always be checked
+ // before it is destructed
+ if (res.ok()) {
+ if (!LIKELY(callback_valid &&
+ wpt_db_->ValidateSnapshot(callback.max_visible_seq(),
+ backed_by_snapshot))) {
+ wpt_db_->WPRecordTick(TXN_GET_TRY_AGAIN);
+ res = Status::TryAgain();
+ }
+ }
+
+ return res;
+}
+
+Iterator* WritePreparedTxn::GetIterator(const ReadOptions& options) {
+ // Make sure to get the iterator from WritePreparedTxnDB, not the root db.
+ Iterator* db_iter = wpt_db_->NewIterator(options);
+ assert(db_iter);
+
+ return write_batch_.NewIteratorWithBase(db_iter);
+}
+
+Iterator* WritePreparedTxn::GetIterator(const ReadOptions& options,
+ ColumnFamilyHandle* column_family) {
+ // Make sure to get the iterator from WritePreparedTxnDB, not the root db.
+ Iterator* db_iter = wpt_db_->NewIterator(options, column_family);
+ assert(db_iter);
+
+ return write_batch_.NewIteratorWithBase(column_family, db_iter);
+}
+
+Status WritePreparedTxn::PrepareInternal() {
+ WriteOptions write_options = write_options_;
+ write_options.disableWAL = false;
+ const bool WRITE_AFTER_COMMIT = true;
+ const bool kFirstPrepareBatch = true;
+ auto s = WriteBatchInternal::MarkEndPrepare(GetWriteBatch()->GetWriteBatch(),
+ name_, !WRITE_AFTER_COMMIT);
+ assert(s.ok());
+ // For each duplicate key we account for a new sub-batch
+ prepare_batch_cnt_ = GetWriteBatch()->SubBatchCnt();
+ // Having AddPrepared in the PreReleaseCallback allows in-order addition of
+ // prepared entries to PreparedHeap and hence enables an optimization. Refer
+ // to SmallestUnCommittedSeq for more details.
+ AddPreparedCallback add_prepared_callback(
+ wpt_db_, db_impl_, prepare_batch_cnt_,
+ db_impl_->immutable_db_options().two_write_queues, kFirstPrepareBatch);
+ const bool DISABLE_MEMTABLE = true;
+ uint64_t seq_used = kMaxSequenceNumber;
+ s = db_impl_->WriteImpl(write_options, GetWriteBatch()->GetWriteBatch(),
+ /*callback*/ nullptr, &log_number_, /*log ref*/ 0,
+ !DISABLE_MEMTABLE, &seq_used, prepare_batch_cnt_,
+ &add_prepared_callback);
+ assert(!s.ok() || seq_used != kMaxSequenceNumber);
+ auto prepare_seq = seq_used;
+ SetId(prepare_seq);
+ return s;
+}
+
+Status WritePreparedTxn::CommitWithoutPrepareInternal() {
+ // For each duplicate key we account for a new sub-batch
+ const size_t batch_cnt = GetWriteBatch()->SubBatchCnt();
+ return CommitBatchInternal(GetWriteBatch()->GetWriteBatch(), batch_cnt);
+}
+
+Status WritePreparedTxn::CommitBatchInternal(WriteBatch* batch,
+ size_t batch_cnt) {
+ return wpt_db_->WriteInternal(write_options_, batch, batch_cnt, this);
+}
+
+Status WritePreparedTxn::CommitInternal() {
+ ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log,
+ "CommitInternal prepare_seq: %" PRIu64, GetID());
+ // We take the commit-time batch and append the Commit marker.
+ // The Memtable will ignore the Commit marker in non-recovery mode
+ WriteBatch* working_batch = GetCommitTimeWriteBatch();
+ const bool empty = working_batch->Count() == 0;
+ auto s = WriteBatchInternal::MarkCommit(working_batch, name_);
+ assert(s.ok());
+
+ const bool for_recovery = use_only_the_last_commit_time_batch_for_recovery_;
+ if (!empty) {
+ // When not writing to memtable, we can still cache the latest write batch.
+ // The cached batch will be written to memtable in WriteRecoverableState
+ // during FlushMemTable
+ if (for_recovery) {
+ WriteBatchInternal::SetAsLatestPersistentState(working_batch);
+ } else {
+ return Status::InvalidArgument(
+ "Commit-time-batch can only be used if "
+ "use_only_the_last_commit_time_batch_for_recovery is true");
+ }
+ }
+
+ auto prepare_seq = GetId();
+ const bool includes_data = !empty && !for_recovery;
+ assert(prepare_batch_cnt_);
+ size_t commit_batch_cnt = 0;
+ if (UNLIKELY(includes_data)) {
+ ROCKS_LOG_WARN(db_impl_->immutable_db_options().info_log,
+ "Duplicate key overhead");
+ SubBatchCounter counter(*wpt_db_->GetCFComparatorMap());
+ s = working_batch->Iterate(&counter);
+ assert(s.ok());
+ commit_batch_cnt = counter.BatchCount();
+ }
+ const bool disable_memtable = !includes_data;
+ const bool do_one_write =
+ !db_impl_->immutable_db_options().two_write_queues || disable_memtable;
+ WritePreparedCommitEntryPreReleaseCallback update_commit_map(
+ wpt_db_, db_impl_, prepare_seq, prepare_batch_cnt_, commit_batch_cnt);
+ // This is to call AddPrepared on CommitTimeWriteBatch
+ const bool kFirstPrepareBatch = true;
+ AddPreparedCallback add_prepared_callback(
+ wpt_db_, db_impl_, commit_batch_cnt,
+ db_impl_->immutable_db_options().two_write_queues, !kFirstPrepareBatch);
+ PreReleaseCallback* pre_release_callback;
+ if (do_one_write) {
+ pre_release_callback = &update_commit_map;
+ } else {
+ pre_release_callback = &add_prepared_callback;
+ }
+ uint64_t seq_used = kMaxSequenceNumber;
+ // Since the prepared batch is directly written to memtable, there is already
+ // a connection between the memtable and its WAL, so there is no need to
+ // redundantly reference the log that contains the prepared data.
+ const uint64_t zero_log_number = 0ull;
+ size_t batch_cnt = UNLIKELY(commit_batch_cnt) ? commit_batch_cnt : 1;
+ // If `two_write_queues && includes_data`, then `do_one_write` is false. The
+ // following `WriteImpl` will insert the data of the commit-time-batch into
+ // the database before updating the commit cache. Therefore, the data of the
+ // commit-time-batch is considered uncommitted. Furthermore, since data of
+ // the commit-time-batch are not locked, it is possible for two uncommitted
+ // versions of the same key to co-exist for a (short) period of time until
+ // the commit cache is updated by the second write. If the two uncommitted
+ // keys are compacted to the bottommost level in the meantime, it is possible
+ // that compaction iterator will zero out the sequence numbers of both, thus
+ // violating the invariant that an SST does not have two identical internal
+ // keys. To prevent this situation, we should allow the usage of
+ // commit-time-batch only if the user sets
+ // TransactionOptions::use_only_the_last_commit_time_batch_for_recovery to
+ // true. See the comments about GetCommitTimeWriteBatch() in
+ // include/rocksdb/utilities/transaction.h.
+ s = db_impl_->WriteImpl(write_options_, working_batch, nullptr, nullptr,
+ zero_log_number, disable_memtable, &seq_used,
+ batch_cnt, pre_release_callback);
+ assert(!s.ok() || seq_used != kMaxSequenceNumber);
+ const SequenceNumber commit_batch_seq = seq_used;
+ if (LIKELY(do_one_write || !s.ok())) {
+ if (UNLIKELY(!db_impl_->immutable_db_options().two_write_queues &&
+ s.ok())) {
+ // Note: RemovePrepared should be called after the WriteImpl that published
+ // the seq. Otherwise the SmallestUnCommittedSeq optimization breaks.
+ wpt_db_->RemovePrepared(prepare_seq, prepare_batch_cnt_);
+ } // else RemovePrepared is called from within PreReleaseCallback
+ if (UNLIKELY(!do_one_write)) {
+ assert(!s.ok());
+ // Cleanup the prepared entry we added with add_prepared_callback
+ wpt_db_->RemovePrepared(commit_batch_seq, commit_batch_cnt);
+ }
+ return s;
+ } // else do the 2nd write to publish seq
+ // Note: the 2nd write comes with a performance penalty. So if we have too
+ // many commits accompanied by a CommitTimeWriteBatch and yet we cannot
+ // enable use_only_the_last_commit_time_batch_for_recovery_ optimization,
+ // two_write_queues should be disabled to avoid many additional writes here.
+ const size_t kZeroData = 0;
+ // Update commit map only from the 2nd queue
+ WritePreparedCommitEntryPreReleaseCallback update_commit_map_with_aux_batch(
+ wpt_db_, db_impl_, prepare_seq, prepare_batch_cnt_, kZeroData,
+ commit_batch_seq, commit_batch_cnt);
+ WriteBatch empty_batch;
+ s = empty_batch.PutLogData(Slice());
+ assert(s.ok());
+ // In the absence of Prepare markers, use Noop as a batch separator
+ s = WriteBatchInternal::InsertNoop(&empty_batch);
+ assert(s.ok());
+ const bool DISABLE_MEMTABLE = true;
+ const size_t ONE_BATCH = 1;
+ const uint64_t NO_REF_LOG = 0;
+ s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr,
+ NO_REF_LOG, DISABLE_MEMTABLE, &seq_used, ONE_BATCH,
+ &update_commit_map_with_aux_batch);
+ assert(!s.ok() || seq_used != kMaxSequenceNumber);
+ return s;
+}
+
+Status WritePreparedTxn::RollbackInternal() {
+ ROCKS_LOG_WARN(db_impl_->immutable_db_options().info_log,
+ "RollbackInternal prepare_seq: %" PRIu64, GetId());
+
+ assert(db_impl_);
+ assert(wpt_db_);
+
+ WriteBatch rollback_batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+ write_options_.protection_bytes_per_key,
+ 0 /* default_cf_ts_sz */);
+ assert(GetId() != kMaxSequenceNumber);
+ assert(GetId() > 0);
+ auto cf_map_shared_ptr = wpt_db_->GetCFHandleMap();
+ auto cf_comp_map_shared_ptr = wpt_db_->GetCFComparatorMap();
+ auto read_at_seq = kMaxSequenceNumber;
+ ReadOptions roptions;
+ // to prevent the callback's seq from being overridden inside DBImpl::Get
+ roptions.snapshot = wpt_db_->GetMaxSnapshot();
+ struct RollbackWriteBatchBuilder : public WriteBatch::Handler {
+ DBImpl* const db_;
+ WritePreparedTxnDB* const wpt_db_;
+ WritePreparedTxnReadCallback callback_;
+ WriteBatch* rollback_batch_;
+ std::map<uint32_t, const Comparator*>& comparators_;
+ std::map<uint32_t, ColumnFamilyHandle*>& handles_;
+ using CFKeys = std::set<Slice, SetComparator>;
+ std::map<uint32_t, CFKeys> keys_;
+ bool rollback_merge_operands_;
+ ReadOptions roptions_;
+
+ RollbackWriteBatchBuilder(
+ DBImpl* db, WritePreparedTxnDB* wpt_db, SequenceNumber snap_seq,
+ WriteBatch* dst_batch,
+ std::map<uint32_t, const Comparator*>& comparators,
+ std::map<uint32_t, ColumnFamilyHandle*>& handles,
+ bool rollback_merge_operands, const ReadOptions& _roptions)
+ : db_(db),
+ wpt_db_(wpt_db),
+ callback_(wpt_db, snap_seq), // disable min_uncommitted optimization
+ rollback_batch_(dst_batch),
+ comparators_(comparators),
+ handles_(handles),
+ rollback_merge_operands_(rollback_merge_operands),
+ roptions_(_roptions) {}
+
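+ // For each key touched by the prepared batch, read the latest committed
+ // value (uncommitted data is skipped via callback_) and append either a Put
+ // of that value or a Delete/SingleDelete to rollback_batch_, so that the
+ // rollback batch cancels out the prepared writes.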
+ Status Rollback(uint32_t cf, const Slice& key) {
+ Status s;
+ CFKeys& cf_keys = keys_[cf];
+ if (cf_keys.size() == 0) { // just inserted
+ auto cmp = comparators_[cf];
+ keys_[cf] = CFKeys(SetComparator(cmp));
+ }
+ auto it = cf_keys.insert(key);
+ // second is false if an element already existed.
+ if (it.second == false) {
+ return s;
+ }
+
+ PinnableSlice pinnable_val;
+ bool not_used;
+ auto cf_handle = handles_[cf];
+ DBImpl::GetImplOptions get_impl_options;
+ get_impl_options.column_family = cf_handle;
+ get_impl_options.value = &pinnable_val;
+ get_impl_options.value_found = &not_used;
+ get_impl_options.callback = &callback_;
+ s = db_->GetImpl(roptions_, key, get_impl_options);
+ assert(s.ok() || s.IsNotFound());
+ if (s.ok()) {
+ s = rollback_batch_->Put(cf_handle, key, pinnable_val);
+ assert(s.ok());
+ } else if (s.IsNotFound()) {
+ // There has been no readable value before txn. By adding a delete we
+ // make sure that there will be none afterwards either.
+ if (wpt_db_->ShouldRollbackWithSingleDelete(cf_handle, key)) {
+ s = rollback_batch_->SingleDelete(cf_handle, key);
+ } else {
+ s = rollback_batch_->Delete(cf_handle, key);
+ }
+ assert(s.ok());
+ } else {
+ // Unexpected status. Return it to the user.
+ }
+ return s;
+ }
+
+ Status PutCF(uint32_t cf, const Slice& key, const Slice& /*val*/) override {
+ return Rollback(cf, key);
+ }
+
+ Status DeleteCF(uint32_t cf, const Slice& key) override {
+ return Rollback(cf, key);
+ }
+
+ Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+ return Rollback(cf, key);
+ }
+
+ Status MergeCF(uint32_t cf, const Slice& key,
+ const Slice& /*val*/) override {
+ if (rollback_merge_operands_) {
+ return Rollback(cf, key);
+ } else {
+ return Status::OK();
+ }
+ }
+
+ Status MarkNoop(bool) override { return Status::OK(); }
+ Status MarkBeginPrepare(bool) override { return Status::OK(); }
+ Status MarkEndPrepare(const Slice&) override { return Status::OK(); }
+ Status MarkCommit(const Slice&) override { return Status::OK(); }
+ Status MarkRollback(const Slice&) override {
+ return Status::InvalidArgument();
+ }
+
+ protected:
+ Handler::OptionState WriteAfterCommit() const override {
+ return Handler::OptionState::kDisabled;
+ }
+ } rollback_handler(db_impl_, wpt_db_, read_at_seq, &rollback_batch,
+ *cf_comp_map_shared_ptr.get(), *cf_map_shared_ptr.get(),
+ wpt_db_->txn_db_options_.rollback_merge_operands,
+ roptions);
+ auto s = GetWriteBatch()->GetWriteBatch()->Iterate(&rollback_handler);
+ if (!s.ok()) {
+ return s;
+ }
+ // The Rollback marker will be used as a batch separator
+ s = WriteBatchInternal::MarkRollback(&rollback_batch, name_);
+ assert(s.ok());
+ bool do_one_write = !db_impl_->immutable_db_options().two_write_queues;
+ const bool DISABLE_MEMTABLE = true;
+ const uint64_t NO_REF_LOG = 0;
+ uint64_t seq_used = kMaxSequenceNumber;
+ const size_t ONE_BATCH = 1;
+ const bool kFirstPrepareBatch = true;
+ // We commit the rolled back prepared batches. Although this is
+ // counter-intuitive, i) it is safe to do so, since the prepared batches are
+ // already canceled out by the rollback batch, ii) adding the commit entry to
+ // CommitCache will allow us to benefit from the existing mechanism in
+ // CommitCache that keeps around an entry that was evicted due to max advance
+ // yet overlaps with a live snapshot, so that the live snapshot properly skips the
+ // entry even if its prepare seq is lower than max_evicted_seq_.
+ AddPreparedCallback add_prepared_callback(
+ wpt_db_, db_impl_, ONE_BATCH,
+ db_impl_->immutable_db_options().two_write_queues, !kFirstPrepareBatch);
+ WritePreparedCommitEntryPreReleaseCallback update_commit_map(
+ wpt_db_, db_impl_, GetId(), prepare_batch_cnt_, ONE_BATCH);
+ PreReleaseCallback* pre_release_callback;
+ if (do_one_write) {
+ pre_release_callback = &update_commit_map;
+ } else {
+ pre_release_callback = &add_prepared_callback;
+ }
+ // Note: the rollback batch does not need AddPrepared since it is written to
+ // DB in one shot. min_uncommitted still works since it requires capturing
+ // data that is written to DB but not yet committed, while
+ // the rollback batch commits with PreReleaseCallback.
+ s = db_impl_->WriteImpl(write_options_, &rollback_batch, nullptr, nullptr,
+ NO_REF_LOG, !DISABLE_MEMTABLE, &seq_used, ONE_BATCH,
+ pre_release_callback);
+ assert(!s.ok() || seq_used != kMaxSequenceNumber);
+ if (!s.ok()) {
+ return s;
+ }
+ if (do_one_write) {
+ assert(!db_impl_->immutable_db_options().two_write_queues);
+ wpt_db_->RemovePrepared(GetId(), prepare_batch_cnt_);
+ return s;
+ } // else do the 2nd write for commit
+ uint64_t rollback_seq = seq_used;
+ ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log,
+ "RollbackInternal 2nd write rollback_seq: %" PRIu64,
+ rollback_seq);
+ // Commit the batch by writing an empty batch to the queue that will release
+ // the commit sequence number to readers.
+ WritePreparedRollbackPreReleaseCallback update_commit_map_with_prepare(
+ wpt_db_, db_impl_, GetId(), rollback_seq, prepare_batch_cnt_);
+ WriteBatch empty_batch;
+ s = empty_batch.PutLogData(Slice());
+ assert(s.ok());
+ // In the absence of Prepare markers, use Noop as a batch separator
+ s = WriteBatchInternal::InsertNoop(&empty_batch);
+ assert(s.ok());
+ s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr,
+ NO_REF_LOG, DISABLE_MEMTABLE, &seq_used, ONE_BATCH,
+ &update_commit_map_with_prepare);
+ assert(!s.ok() || seq_used != kMaxSequenceNumber);
+ ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log,
+ "RollbackInternal (status=%s) commit: %" PRIu64,
+ s.ToString().c_str(), GetId());
+ // TODO(lth): For WriteUnPrepared that rollback is called frequently,
+ // RemovePrepared could be moved to the callback to reduce lock contention.
+ if (s.ok()) {
+ wpt_db_->RemovePrepared(GetId(), prepare_batch_cnt_);
+ }
+ // Note: RemovePrepared for prepared batch is called from within
+ // PreReleaseCallback
+ wpt_db_->RemovePrepared(rollback_seq, ONE_BATCH);
+
+ return s;
+}
+
+Status WritePreparedTxn::ValidateSnapshot(ColumnFamilyHandle* column_family,
+ const Slice& key,
+ SequenceNumber* tracked_at_seq) {
+ assert(snapshot_);
+
+ SequenceNumber min_uncommitted =
+ static_cast_with_check<const SnapshotImpl>(snapshot_.get())
+ ->min_uncommitted_;
+ SequenceNumber snap_seq = snapshot_->GetSequenceNumber();
+ // tracked_at_seq is either max or the last snapshot with which this key was
+ // tracked, so there is no need to apply IsInSnapshot to this comparison
+ // here as tracked_at_seq is not a prepare seq.
+ if (*tracked_at_seq <= snap_seq) {
+ // If the key has been previously validated at a sequence number earlier
+ // than the current snapshot's sequence number, we already know it has not
+ // been modified.
+ return Status::OK();
+ }
+
+ *tracked_at_seq = snap_seq;
+
+ ColumnFamilyHandle* cfh =
+ column_family ? column_family : db_impl_->DefaultColumnFamily();
+
+ WritePreparedTxnReadCallback snap_checker(wpt_db_, snap_seq, min_uncommitted,
+ kBackedByDBSnapshot);
+ // TODO(yanqin): support user-defined timestamp
+ return TransactionUtil::CheckKeyForConflicts(
+ db_impl_, cfh, key.ToString(), snap_seq, /*ts=*/nullptr,
+ false /* cache_only */, &snap_checker, min_uncommitted);
+}
+
+void WritePreparedTxn::SetSnapshot() {
+ const bool kForWWConflictCheck = true;
+ SnapshotImpl* snapshot = wpt_db_->GetSnapshotInternal(kForWWConflictCheck);
+ SetSnapshotInternal(snapshot);
+}
+
+Status WritePreparedTxn::RebuildFromWriteBatch(WriteBatch* src_batch) {
+ auto ret = PessimisticTransaction::RebuildFromWriteBatch(src_batch);
+ prepare_batch_cnt_ = GetWriteBatch()->SubBatchCnt();
+ return ret;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_prepared_txn.h b/src/rocksdb/utilities/transactions/write_prepared_txn.h
new file mode 100644
index 000000000..30d9bdb99
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/write_prepared_txn.h
@@ -0,0 +1,119 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <atomic>
+#include <mutex>
+#include <stack>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "db/write_callback.h"
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "util/autovector.h"
+#include "utilities/transactions/pessimistic_transaction.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+#include "utilities/transactions/transaction_base.h"
+#include "utilities/transactions/transaction_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WritePreparedTxnDB;
+
+// This impl may also write uncommitted data to the DB and later tell
+// committed data apart from uncommitted data. Uncommitted data could be
+// written after the Prepare phase in 2PC (WritePreparedTxn) or before that
+// (WriteUnpreparedTxnImpl).
+class WritePreparedTxn : public PessimisticTransaction {
+ public:
+ WritePreparedTxn(WritePreparedTxnDB* db, const WriteOptions& write_options,
+ const TransactionOptions& txn_options);
+ // No copying allowed
+ WritePreparedTxn(const WritePreparedTxn&) = delete;
+ void operator=(const WritePreparedTxn&) = delete;
+
+ virtual ~WritePreparedTxn() {}
+
+ // To make WAL commit markers visible, the snapshot will be based on the last
+ // seq in the WAL that is also published, LastPublishedSequence, as opposed to
+ // the last seq in the memtable.
+ using Transaction::Get;
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) override;
+
+ using Transaction::MultiGet;
+ virtual void MultiGet(const ReadOptions& options,
+ ColumnFamilyHandle* column_family,
+ const size_t num_keys, const Slice* keys,
+ PinnableSlice* values, Status* statuses,
+ const bool sorted_input = false) override;
+
+ // Note: The behavior is undefined in presence of interleaved writes to the
+ // same transaction.
+ // To make WAL commit markers visible, the snapshot will be
+ // based on the last seq in the WAL that is also published,
+ // LastPublishedSequence, as opposed to the last seq in the memtable.
+ using Transaction::GetIterator;
+ virtual Iterator* GetIterator(const ReadOptions& options) override;
+ virtual Iterator* GetIterator(const ReadOptions& options,
+ ColumnFamilyHandle* column_family) override;
+
+ virtual void SetSnapshot() override;
+
+ protected:
+ void Initialize(const TransactionOptions& txn_options) override;
+ // Override the protected SetId to make it visible to the friend class
+ // WritePreparedTxnDB
+ inline void SetId(uint64_t id) override { Transaction::SetId(id); }
+
+ private:
+ friend class WritePreparedTransactionTest_BasicRecoveryTest_Test;
+ friend class WritePreparedTxnDB;
+ friend class WriteUnpreparedTxnDB;
+ friend class WriteUnpreparedTxn;
+
+ Status PrepareInternal() override;
+
+ Status CommitWithoutPrepareInternal() override;
+
+ Status CommitBatchInternal(WriteBatch* batch, size_t batch_cnt) override;
+
+ // Since the data is already written to memtables at the Prepare phase, the
+ // commit entails writing only a commit marker in the WAL. The sequence number
+ // of the commit marker is then the commit timestamp of the transaction. To
+ // make WAL commit markers visible, the snapshot will be based on the last seq
+ // in the WAL that is also published, LastPublishedSequence, as opposed to the
+ // last seq in the memtable.
+ Status CommitInternal() override;
+
+ Status RollbackInternal() override;
+
+ virtual Status ValidateSnapshot(ColumnFamilyHandle* column_family,
+ const Slice& key,
+ SequenceNumber* tracked_at_seq) override;
+
+ virtual Status RebuildFromWriteBatch(WriteBatch* src_batch) override;
+
+ WritePreparedTxnDB* wpt_db_;
+ // Number of sub-batches in prepare
+ size_t prepare_batch_cnt_ = 0;
+};
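+
+// A minimal write-prepared 2PC usage sketch, added for illustration only
+// (status checks omitted). It assumes a TransactionDB `txn_db` that was
+// opened with TransactionDBOptions::write_policy == WRITE_PREPARED:
+//
+//   Transaction* txn = txn_db->BeginTransaction(WriteOptions());
+//   txn->SetName("xid");
+//   txn->Put("key", "value");
+//   txn->Prepare();  // data and a prepare marker are written, still invisible
+//   txn->Commit();   // publishes only a commit marker; readers now see "key"
+//   delete txn;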
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_prepared_txn_db.cc b/src/rocksdb/utilities/transactions/write_prepared_txn_db.cc
new file mode 100644
index 000000000..595c3df8f
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/write_prepared_txn_db.cc
@@ -0,0 +1,1030 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/write_prepared_txn_db.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/db_impl/db_impl.h"
+#include "logging/logging.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/mutexlock.h"
+#include "util/string_util.h"
+#include "utilities/transactions/pessimistic_transaction.h"
+#include "utilities/transactions/transaction_db_mutex_impl.h"
+
+// This function is for testing only. If it returns true, then all entries in
+// the commit cache will be evicted. Unit and/or stress tests (db_stress)
+// can implement this function and customize how frequently commit cache
+// eviction occurs.
+// TODO: remove this function once we can configure commit cache to be very
+// small so that eviction occurs very frequently. This requires the commit
+// cache entry to be able to encode prepare and commit sequence numbers so that
+// the commit sequence number does not have to be within a certain range of
+// prepare sequence number.
+extern "C" bool rocksdb_write_prepared_TEST_ShouldClearCommitCache(void)
+ __attribute__((__weak__));
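+// For illustration only (not part of this file): a test binary could supply a
+// strong definition of this hook, e.g. to make every commit cache eviction
+// advance max_evicted_seq_ all the way to the last published sequence (see
+// its use in AddCommitted):
+//   extern "C" bool rocksdb_write_prepared_TEST_ShouldClearCommitCache(void) {
+//     return true;
+//   }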
+
+namespace ROCKSDB_NAMESPACE {
+
+Status WritePreparedTxnDB::Initialize(
+ const std::vector<size_t>& compaction_enabled_cf_indices,
+ const std::vector<ColumnFamilyHandle*>& handles) {
+ auto dbimpl = static_cast_with_check<DBImpl>(GetRootDB());
+ assert(dbimpl != nullptr);
+ auto rtxns = dbimpl->recovered_transactions();
+ std::map<SequenceNumber, SequenceNumber> ordered_seq_cnt;
+ for (auto rtxn : rtxns) {
+ // There should be only one batch for the WritePrepared policy.
+ assert(rtxn.second->batches_.size() == 1);
+ const auto& seq = rtxn.second->batches_.begin()->first;
+ const auto& batch_info = rtxn.second->batches_.begin()->second;
+ auto cnt = batch_info.batch_cnt_ ? batch_info.batch_cnt_ : 1;
+ ordered_seq_cnt[seq] = cnt;
+ }
+ // AddPrepared must be called in order
+ for (auto seq_cnt : ordered_seq_cnt) {
+ auto seq = seq_cnt.first;
+ auto cnt = seq_cnt.second;
+ for (size_t i = 0; i < cnt; i++) {
+ AddPrepared(seq + i);
+ }
+ }
+ SequenceNumber prev_max = max_evicted_seq_;
+ SequenceNumber last_seq = db_impl_->GetLatestSequenceNumber();
+ AdvanceMaxEvictedSeq(prev_max, last_seq);
+ // Create a gap between max and the next snapshot. This simplifies the logic
+ // in IsInSnapshot by not having to consider the special case of max ==
+ // snapshot after recovery. This is tested in IsInSnapshotEmptyMapTest.
+ if (last_seq) {
+ db_impl_->versions_->SetLastAllocatedSequence(last_seq + 1);
+ db_impl_->versions_->SetLastSequence(last_seq + 1);
+ db_impl_->versions_->SetLastPublishedSequence(last_seq + 1);
+ }
+
+ db_impl_->SetSnapshotChecker(new WritePreparedSnapshotChecker(this));
+ // A callback to commit a single sub-batch
+ class CommitSubBatchPreReleaseCallback : public PreReleaseCallback {
+ public:
+ explicit CommitSubBatchPreReleaseCallback(WritePreparedTxnDB* db)
+ : db_(db) {}
+ Status Callback(SequenceNumber commit_seq,
+ bool is_mem_disabled __attribute__((__unused__)), uint64_t,
+ size_t /*index*/, size_t /*total*/) override {
+ assert(!is_mem_disabled);
+ db_->AddCommitted(commit_seq, commit_seq);
+ return Status::OK();
+ }
+
+ private:
+ WritePreparedTxnDB* db_;
+ };
+ db_impl_->SetRecoverableStatePreReleaseCallback(
+ new CommitSubBatchPreReleaseCallback(this));
+
+ auto s = PessimisticTransactionDB::Initialize(compaction_enabled_cf_indices,
+ handles);
+ return s;
+}
+
+Status WritePreparedTxnDB::VerifyCFOptions(
+ const ColumnFamilyOptions& cf_options) {
+ Status s = PessimisticTransactionDB::VerifyCFOptions(cf_options);
+ if (!s.ok()) {
+ return s;
+ }
+ if (!cf_options.memtable_factory->CanHandleDuplicatedKey()) {
+ return Status::InvalidArgument(
+ "memtable_factory->CanHandleDuplicatedKey() cannot be false with "
+ "WritePrpeared transactions");
+ }
+ return Status::OK();
+}
+
+Transaction* WritePreparedTxnDB::BeginTransaction(
+ const WriteOptions& write_options, const TransactionOptions& txn_options,
+ Transaction* old_txn) {
+ if (old_txn != nullptr) {
+ ReinitializeTransaction(old_txn, write_options, txn_options);
+ return old_txn;
+ } else {
+ return new WritePreparedTxn(this, write_options, txn_options);
+ }
+}
+
+Status WritePreparedTxnDB::Write(const WriteOptions& opts,
+ WriteBatch* updates) {
+ if (txn_db_options_.skip_concurrency_control) {
+ // Skip locking the rows
+ const size_t UNKNOWN_BATCH_CNT = 0;
+ WritePreparedTxn* NO_TXN = nullptr;
+ return WriteInternal(opts, updates, UNKNOWN_BATCH_CNT, NO_TXN);
+ } else {
+ return PessimisticTransactionDB::WriteWithConcurrencyControl(opts, updates);
+ }
+}
+
+Status WritePreparedTxnDB::Write(
+ const WriteOptions& opts,
+ const TransactionDBWriteOptimizations& optimizations, WriteBatch* updates) {
+ if (optimizations.skip_concurrency_control) {
+ // Skip locking the rows
+ const size_t UNKNOWN_BATCH_CNT = 0;
+ const size_t ONE_BATCH_CNT = 1;
+ const size_t batch_cnt = optimizations.skip_duplicate_key_check
+ ? ONE_BATCH_CNT
+ : UNKNOWN_BATCH_CNT;
+ WritePreparedTxn* NO_TXN = nullptr;
+ return WriteInternal(opts, updates, batch_cnt, NO_TXN);
+ } else {
+ // TODO(myabandeh): Make use of skip_duplicate_key_check hint
+ // Fall back to unoptimized version
+ return PessimisticTransactionDB::WriteWithConcurrencyControl(opts, updates);
+ }
+}
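+
+// A minimal usage sketch of the optimized Write path above, for illustration
+// only; it assumes the caller has already guaranteed that no conflicting
+// concurrent writes are in flight and that `batch` is an ordinary WriteBatch:
+//   TransactionDBWriteOptimizations optimizations;
+//   optimizations.skip_concurrency_control = true;
+//   optimizations.skip_duplicate_key_check = true;  // batch has no dup keys
+//   Status s = txn_db->Write(WriteOptions(), optimizations, &batch);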
+
+Status WritePreparedTxnDB::WriteInternal(const WriteOptions& write_options_orig,
+ WriteBatch* batch, size_t batch_cnt,
+ WritePreparedTxn* txn) {
+ ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log,
+ "CommitBatchInternal");
+ if (batch->Count() == 0) {
+ // Otherwise our one-seq-per-batch logic will break since the sequence
+ // number would not be advanced for this batch.
+ return Status::OK();
+ }
+
+ if (write_options_orig.protection_bytes_per_key > 0) {
+ auto s = WriteBatchInternal::UpdateProtectionInfo(
+ batch, write_options_orig.protection_bytes_per_key);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ if (batch_cnt == 0) { // not provided, then compute it
+ // TODO(myabandeh): add an option to allow user skipping this cost
+ SubBatchCounter counter(*GetCFComparatorMap());
+ auto s = batch->Iterate(&counter);
+ if (!s.ok()) {
+ return s;
+ }
+ batch_cnt = counter.BatchCount();
+ WPRecordTick(TXN_DUPLICATE_KEY_OVERHEAD);
+ ROCKS_LOG_DETAILS(info_log_, "Duplicate key overhead: %" PRIu64 " batches",
+ static_cast<uint64_t>(batch_cnt));
+ }
+ assert(batch_cnt);
+
+ bool do_one_write = !db_impl_->immutable_db_options().two_write_queues;
+ WriteOptions write_options(write_options_orig);
+ // In the absence of Prepare markers, use Noop as a batch separator
+ auto s = WriteBatchInternal::InsertNoop(batch);
+ assert(s.ok());
+ const bool DISABLE_MEMTABLE = true;
+ const uint64_t no_log_ref = 0;
+ uint64_t seq_used = kMaxSequenceNumber;
+ const size_t ZERO_PREPARES = 0;
+ const bool kSeperatePrepareCommitBatches = true;
+ // Since this is not 2pc, there is no need for AddPrepared but having it in
+ // the PreReleaseCallback enables an optimization. Refer to
+ // SmallestUnCommittedSeq for more details.
+ AddPreparedCallback add_prepared_callback(
+ this, db_impl_, batch_cnt,
+ db_impl_->immutable_db_options().two_write_queues,
+ !kSeperatePrepareCommitBatches);
+ WritePreparedCommitEntryPreReleaseCallback update_commit_map(
+ this, db_impl_, kMaxSequenceNumber, ZERO_PREPARES, batch_cnt);
+ PreReleaseCallback* pre_release_callback;
+ if (do_one_write) {
+ pre_release_callback = &update_commit_map;
+ } else {
+ pre_release_callback = &add_prepared_callback;
+ }
+ s = db_impl_->WriteImpl(write_options, batch, nullptr, nullptr, no_log_ref,
+ !DISABLE_MEMTABLE, &seq_used, batch_cnt,
+ pre_release_callback);
+ assert(!s.ok() || seq_used != kMaxSequenceNumber);
+ uint64_t prepare_seq = seq_used;
+ if (txn != nullptr) {
+ txn->SetId(prepare_seq);
+ }
+ if (!s.ok()) {
+ return s;
+ }
+ if (do_one_write) {
+ return s;
+ } // else do the 2nd write for commit
+ ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log,
+ "CommitBatchInternal 2nd write prepare_seq: %" PRIu64,
+ prepare_seq);
+ // Commit the batch by writing an empty batch to the 2nd queue that will
+ // release the commit sequence number to readers.
+ const size_t ZERO_COMMITS = 0;
+ WritePreparedCommitEntryPreReleaseCallback update_commit_map_with_prepare(
+ this, db_impl_, prepare_seq, batch_cnt, ZERO_COMMITS);
+ WriteBatch empty_batch;
+ write_options.disableWAL = true;
+ write_options.sync = false;
+ const size_t ONE_BATCH = 1; // Just to inc the seq
+ s = db_impl_->WriteImpl(write_options, &empty_batch, nullptr, nullptr,
+ no_log_ref, DISABLE_MEMTABLE, &seq_used, ONE_BATCH,
+ &update_commit_map_with_prepare);
+ assert(!s.ok() || seq_used != kMaxSequenceNumber);
+ // Note: RemovePrepared is called from within PreReleaseCallback
+ return s;
+}
+
+Status WritePreparedTxnDB::Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* value) {
+ SequenceNumber min_uncommitted, snap_seq;
+ const SnapshotBackup backed_by_snapshot =
+ AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq);
+ WritePreparedTxnReadCallback callback(this, snap_seq, min_uncommitted,
+ backed_by_snapshot);
+ bool* dont_care = nullptr;
+ DBImpl::GetImplOptions get_impl_options;
+ get_impl_options.column_family = column_family;
+ get_impl_options.value = value;
+ get_impl_options.value_found = dont_care;
+ get_impl_options.callback = &callback;
+ auto res = db_impl_->GetImpl(options, key, get_impl_options);
+ if (LIKELY(callback.valid() && ValidateSnapshot(callback.max_visible_seq(),
+ backed_by_snapshot))) {
+ return res;
+ } else {
+ res.PermitUncheckedError();
+ WPRecordTick(TXN_GET_TRY_AGAIN);
+ return Status::TryAgain();
+ }
+}
+
+void WritePreparedTxnDB::UpdateCFComparatorMap(
+ const std::vector<ColumnFamilyHandle*>& handles) {
+ auto cf_map = new std::map<uint32_t, const Comparator*>();
+ auto handle_map = new std::map<uint32_t, ColumnFamilyHandle*>();
+ for (auto h : handles) {
+ auto id = h->GetID();
+ const Comparator* comparator = h->GetComparator();
+ (*cf_map)[id] = comparator;
+ if (id != 0) {
+ (*handle_map)[id] = h;
+ } else {
+ // The pointer to the default cf handle in the handles will be deleted.
+ // Use the pointer maintained by the db instead.
+ (*handle_map)[id] = DefaultColumnFamily();
+ }
+ }
+ cf_map_.reset(cf_map);
+ handle_map_.reset(handle_map);
+}
+
+void WritePreparedTxnDB::UpdateCFComparatorMap(ColumnFamilyHandle* h) {
+ auto old_cf_map_ptr = cf_map_.get();
+ assert(old_cf_map_ptr);
+ auto cf_map = new std::map<uint32_t, const Comparator*>(*old_cf_map_ptr);
+ auto old_handle_map_ptr = handle_map_.get();
+ assert(old_handle_map_ptr);
+ auto handle_map =
+ new std::map<uint32_t, ColumnFamilyHandle*>(*old_handle_map_ptr);
+ auto id = h->GetID();
+ const Comparator* comparator = h->GetComparator();
+ (*cf_map)[id] = comparator;
+ (*handle_map)[id] = h;
+ cf_map_.reset(cf_map);
+ handle_map_.reset(handle_map);
+}
+
+std::vector<Status> WritePreparedTxnDB::MultiGet(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys, std::vector<std::string>* values) {
+ assert(values);
+ size_t num_keys = keys.size();
+ values->resize(num_keys);
+
+ std::vector<Status> stat_list(num_keys);
+ for (size_t i = 0; i < num_keys; ++i) {
+ stat_list[i] = this->Get(options, column_family[i], keys[i], &(*values)[i]);
+ }
+ return stat_list;
+}
+
+// Struct to hold ownership of snapshot and read callback for iterator cleanup.
+struct WritePreparedTxnDB::IteratorState {
+ IteratorState(WritePreparedTxnDB* txn_db, SequenceNumber sequence,
+ std::shared_ptr<ManagedSnapshot> s,
+ SequenceNumber min_uncommitted)
+ : callback(txn_db, sequence, min_uncommitted, kBackedByDBSnapshot),
+ snapshot(s) {}
+
+ WritePreparedTxnReadCallback callback;
+ std::shared_ptr<ManagedSnapshot> snapshot;
+};
+
+namespace {
+static void CleanupWritePreparedTxnDBIterator(void* arg1, void* /*arg2*/) {
+ delete reinterpret_cast<WritePreparedTxnDB::IteratorState*>(arg1);
+}
+} // anonymous namespace
+
+Iterator* WritePreparedTxnDB::NewIterator(const ReadOptions& options,
+ ColumnFamilyHandle* column_family) {
+ constexpr bool expose_blob_index = false;
+ constexpr bool allow_refresh = false;
+ std::shared_ptr<ManagedSnapshot> own_snapshot = nullptr;
+ SequenceNumber snapshot_seq = kMaxSequenceNumber;
+ SequenceNumber min_uncommitted = 0;
+ if (options.snapshot != nullptr) {
+ snapshot_seq = options.snapshot->GetSequenceNumber();
+ min_uncommitted =
+ static_cast_with_check<const SnapshotImpl>(options.snapshot)
+ ->min_uncommitted_;
+ } else {
+ auto* snapshot = GetSnapshot();
+ // We take a snapshot to make sure that the related data in the commit map
+ // are not deleted.
+ snapshot_seq = snapshot->GetSequenceNumber();
+ min_uncommitted =
+ static_cast_with_check<const SnapshotImpl>(snapshot)->min_uncommitted_;
+ own_snapshot = std::make_shared<ManagedSnapshot>(db_impl_, snapshot);
+ }
+ assert(snapshot_seq != kMaxSequenceNumber);
+ auto* cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+ auto* state =
+ new IteratorState(this, snapshot_seq, own_snapshot, min_uncommitted);
+ auto* db_iter =
+ db_impl_->NewIteratorImpl(options, cfd, snapshot_seq, &state->callback,
+ expose_blob_index, allow_refresh);
+ db_iter->RegisterCleanup(CleanupWritePreparedTxnDBIterator, state, nullptr);
+ return db_iter;
+}
+
+Status WritePreparedTxnDB::NewIterators(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) {
+ constexpr bool expose_blob_index = false;
+ constexpr bool allow_refresh = false;
+ std::shared_ptr<ManagedSnapshot> own_snapshot = nullptr;
+ SequenceNumber snapshot_seq = kMaxSequenceNumber;
+ SequenceNumber min_uncommitted = 0;
+ if (options.snapshot != nullptr) {
+ snapshot_seq = options.snapshot->GetSequenceNumber();
+ min_uncommitted =
+ static_cast_with_check<const SnapshotImpl>(options.snapshot)
+ ->min_uncommitted_;
+ } else {
+ auto* snapshot = GetSnapshot();
+ // We take a snapshot to make sure that the related data in the commit map
+ // are not deleted.
+ snapshot_seq = snapshot->GetSequenceNumber();
+ own_snapshot = std::make_shared<ManagedSnapshot>(db_impl_, snapshot);
+ min_uncommitted =
+ static_cast_with_check<const SnapshotImpl>(snapshot)->min_uncommitted_;
+ }
+ iterators->clear();
+ iterators->reserve(column_families.size());
+ for (auto* column_family : column_families) {
+ auto* cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+ auto* state =
+ new IteratorState(this, snapshot_seq, own_snapshot, min_uncommitted);
+ auto* db_iter =
+ db_impl_->NewIteratorImpl(options, cfd, snapshot_seq, &state->callback,
+ expose_blob_index, allow_refresh);
+ db_iter->RegisterCleanup(CleanupWritePreparedTxnDBIterator, state, nullptr);
+ iterators->push_back(db_iter);
+ }
+ return Status::OK();
+}
+
+void WritePreparedTxnDB::Init(const TransactionDBOptions& txn_db_opts) {
+ // Advance max_evicted_seq_ no more than 100 times before the cache wraps
+ // around.
+ INC_STEP_FOR_MAX_EVICTED =
+ std::max(COMMIT_CACHE_SIZE / 100, static_cast<size_t>(1));
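+ // With the default 2^23-entry commit cache (an assumption; see
+ // DEF_COMMIT_CACHE_BITS in the header), this step is roughly 84k sequence
+ // numbers, i.e. max_evicted_seq_ is bumped at most about 100 times per wrap
+ // of the cache rather than on every eviction.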
+ snapshot_cache_ = std::unique_ptr<std::atomic<SequenceNumber>[]>(
+ new std::atomic<SequenceNumber>[SNAPSHOT_CACHE_SIZE] {});
+ commit_cache_ = std::unique_ptr<std::atomic<CommitEntry64b>[]>(
+ new std::atomic<CommitEntry64b>[COMMIT_CACHE_SIZE] {});
+ dummy_max_snapshot_.number_ = kMaxSequenceNumber;
+ rollback_deletion_type_callback_ =
+ txn_db_opts.rollback_deletion_type_callback;
+}
+
+void WritePreparedTxnDB::CheckPreparedAgainstMax(SequenceNumber new_max,
+ bool locked) {
+ // When max_evicted_seq_ advances, move older entries from prepared_txns_
+ // to delayed_prepared_. This guarantees that if a seq is lower than max,
+ // then it is not in prepared_txns_, which saves an expensive, synchronized
+ // lookup from a shared set. delayed_prepared_ is expected to be empty in
+ // normal cases.
+ ROCKS_LOG_DETAILS(
+ info_log_,
+ "CheckPreparedAgainstMax prepared_txns_.empty() %d top: %" PRIu64,
+ prepared_txns_.empty(),
+ prepared_txns_.empty() ? 0 : prepared_txns_.top());
+ const SequenceNumber prepared_top = prepared_txns_.top();
+ const bool empty = prepared_top == kMaxSequenceNumber;
+ // Preliminary check to avoid the synchronization cost
+ if (!empty && prepared_top <= new_max) {
+ if (locked) {
+ // Needed to avoid double locking in pop().
+ prepared_txns_.push_pop_mutex()->Unlock();
+ }
+ WriteLock wl(&prepared_mutex_);
+ // Need to fetch fresh values of ::top after mutex is acquired
+ while (!prepared_txns_.empty() && prepared_txns_.top() <= new_max) {
+ auto to_be_popped = prepared_txns_.top();
+ delayed_prepared_.insert(to_be_popped);
+ ROCKS_LOG_WARN(info_log_,
+ "prepared_mutex_ overhead %" PRIu64 " (prep=%" PRIu64
+ " new_max=%" PRIu64 ")",
+ static_cast<uint64_t>(delayed_prepared_.size()),
+ to_be_popped, new_max);
+ delayed_prepared_empty_.store(false, std::memory_order_release);
+ // Update prepared_txns_ after updating delayed_prepared_empty_ otherwise
+ // there will be a point in time that the entry is neither in
+ // prepared_txns_ nor in delayed_prepared_, which will not be checked if
+ // delayed_prepared_empty_ is false.
+ prepared_txns_.pop();
+ }
+ if (locked) {
+ prepared_txns_.push_pop_mutex()->Lock();
+ }
+ }
+}
+
+void WritePreparedTxnDB::AddPrepared(uint64_t seq, bool locked) {
+ ROCKS_LOG_DETAILS(info_log_, "Txn %" PRIu64 " Preparing with max %" PRIu64,
+ seq, max_evicted_seq_.load());
+ TEST_SYNC_POINT("AddPrepared::begin:pause");
+ TEST_SYNC_POINT("AddPrepared::begin:resume");
+ if (!locked) {
+ prepared_txns_.push_pop_mutex()->Lock();
+ }
+ prepared_txns_.push_pop_mutex()->AssertHeld();
+ prepared_txns_.push(seq);
+ auto new_max = future_max_evicted_seq_.load();
+ if (UNLIKELY(seq <= new_max)) {
+ // This should not happen in the normal case
+ ROCKS_LOG_ERROR(
+ info_log_,
+ "Added prepare_seq is not larger than max_evicted_seq_: %" PRIu64
+ " <= %" PRIu64,
+ seq, new_max);
+ CheckPreparedAgainstMax(new_max, true /*locked*/);
+ }
+ if (!locked) {
+ prepared_txns_.push_pop_mutex()->Unlock();
+ }
+ TEST_SYNC_POINT("AddPrepared::end");
+}
+
+void WritePreparedTxnDB::AddCommitted(uint64_t prepare_seq, uint64_t commit_seq,
+ uint8_t loop_cnt) {
+ ROCKS_LOG_DETAILS(info_log_, "Txn %" PRIu64 " Committing with %" PRIu64,
+ prepare_seq, commit_seq);
+ TEST_SYNC_POINT("WritePreparedTxnDB::AddCommitted:start");
+ TEST_SYNC_POINT("WritePreparedTxnDB::AddCommitted:start:pause");
+ auto indexed_seq = prepare_seq % COMMIT_CACHE_SIZE;
+ CommitEntry64b evicted_64b;
+ CommitEntry evicted;
+ bool to_be_evicted = GetCommitEntry(indexed_seq, &evicted_64b, &evicted);
+ if (LIKELY(to_be_evicted)) {
+ assert(evicted.prep_seq != prepare_seq);
+ auto prev_max = max_evicted_seq_.load(std::memory_order_acquire);
+ ROCKS_LOG_DETAILS(info_log_,
+ "Evicting %" PRIu64 ",%" PRIu64 " with max %" PRIu64,
+ evicted.prep_seq, evicted.commit_seq, prev_max);
+ if (prev_max < evicted.commit_seq) {
+ auto last = db_impl_->GetLastPublishedSequence(); // could be 0
+ SequenceNumber max_evicted_seq;
+ if (LIKELY(evicted.commit_seq < last)) {
+ assert(last > 0);
+ // Inc max in larger steps to avoid frequent updates
+ max_evicted_seq =
+ std::min(evicted.commit_seq + INC_STEP_FOR_MAX_EVICTED, last - 1);
+ } else {
+ // legit when a commit entry in a write batch overwrites the previous one
+ max_evicted_seq = evicted.commit_seq;
+ }
+#ifdef OS_LINUX
+ if (rocksdb_write_prepared_TEST_ShouldClearCommitCache &&
+ rocksdb_write_prepared_TEST_ShouldClearCommitCache()) {
+ max_evicted_seq = last;
+ }
+#endif // OS_LINUX
+ ROCKS_LOG_DETAILS(info_log_,
+ "%lu Evicting %" PRIu64 ",%" PRIu64 " with max %" PRIu64
+ " => %lu",
+ prepare_seq, evicted.prep_seq, evicted.commit_seq,
+ prev_max, max_evicted_seq);
+ AdvanceMaxEvictedSeq(prev_max, max_evicted_seq);
+ }
+ if (UNLIKELY(!delayed_prepared_empty_.load(std::memory_order_acquire))) {
+ WriteLock wl(&prepared_mutex_);
+ auto dp_iter = delayed_prepared_.find(evicted.prep_seq);
+ if (dp_iter != delayed_prepared_.end()) {
+ // This is a rare case in which the txn is committed but prepared_txns_ is not
+ // cleaned up yet. Refer to delayed_prepared_commits_ definition for
+ // why it should be kept updated.
+ delayed_prepared_commits_[evicted.prep_seq] = evicted.commit_seq;
+ ROCKS_LOG_DEBUG(info_log_,
+ "delayed_prepared_commits_[%" PRIu64 "]=%" PRIu64,
+ evicted.prep_seq, evicted.commit_seq);
+ }
+ }
+ // After each eviction from commit cache, check if the commit entry should
+ // be kept around because it overlaps with a live snapshot.
+ CheckAgainstSnapshots(evicted);
+ }
+ bool succ =
+ ExchangeCommitEntry(indexed_seq, evicted_64b, {prepare_seq, commit_seq});
+ if (UNLIKELY(!succ)) {
+ ROCKS_LOG_ERROR(info_log_,
+ "ExchangeCommitEntry failed on [%" PRIu64 "] %" PRIu64
+ ",%" PRIu64 " retrying...",
+ indexed_seq, prepare_seq, commit_seq);
+ // A very rare event, in which the commit entry is updated before we do.
+ // Here we apply a very simple solution of retrying.
+ if (loop_cnt > 100) {
+ throw std::runtime_error("Infinite loop in AddCommitted!");
+ }
+ AddCommitted(prepare_seq, commit_seq, ++loop_cnt);
+ return;
+ }
+ TEST_SYNC_POINT("WritePreparedTxnDB::AddCommitted:end");
+ TEST_SYNC_POINT("WritePreparedTxnDB::AddCommitted:end:pause");
+}
+
+void WritePreparedTxnDB::RemovePrepared(const uint64_t prepare_seq,
+ const size_t batch_cnt) {
+ TEST_SYNC_POINT_CALLBACK(
+ "RemovePrepared:Start",
+ const_cast<void*>(reinterpret_cast<const void*>(&prepare_seq)));
+ TEST_SYNC_POINT("WritePreparedTxnDB::RemovePrepared:pause");
+ TEST_SYNC_POINT("WritePreparedTxnDB::RemovePrepared:resume");
+ ROCKS_LOG_DETAILS(info_log_,
+ "RemovePrepared %" PRIu64 " cnt: %" ROCKSDB_PRIszt,
+ prepare_seq, batch_cnt);
+ WriteLock wl(&prepared_mutex_);
+ for (size_t i = 0; i < batch_cnt; i++) {
+ prepared_txns_.erase(prepare_seq + i);
+ bool was_empty = delayed_prepared_.empty();
+ if (!was_empty) {
+ delayed_prepared_.erase(prepare_seq + i);
+ auto it = delayed_prepared_commits_.find(prepare_seq + i);
+ if (it != delayed_prepared_commits_.end()) {
+ ROCKS_LOG_DETAILS(info_log_, "delayed_prepared_commits_.erase %" PRIu64,
+ prepare_seq + i);
+ delayed_prepared_commits_.erase(it);
+ }
+ bool is_empty = delayed_prepared_.empty();
+ if (was_empty != is_empty) {
+ delayed_prepared_empty_.store(is_empty, std::memory_order_release);
+ }
+ }
+ }
+}
+
+bool WritePreparedTxnDB::GetCommitEntry(const uint64_t indexed_seq,
+ CommitEntry64b* entry_64b,
+ CommitEntry* entry) const {
+ *entry_64b = commit_cache_[static_cast<size_t>(indexed_seq)].load(
+ std::memory_order_acquire);
+ bool valid = entry_64b->Parse(indexed_seq, entry, FORMAT);
+ return valid;
+}
+
+bool WritePreparedTxnDB::AddCommitEntry(const uint64_t indexed_seq,
+ const CommitEntry& new_entry,
+ CommitEntry* evicted_entry) {
+ CommitEntry64b new_entry_64b(new_entry, FORMAT);
+ CommitEntry64b evicted_entry_64b =
+ commit_cache_[static_cast<size_t>(indexed_seq)].exchange(
+ new_entry_64b, std::memory_order_acq_rel);
+ bool valid = evicted_entry_64b.Parse(indexed_seq, evicted_entry, FORMAT);
+ return valid;
+}
+
+bool WritePreparedTxnDB::ExchangeCommitEntry(const uint64_t indexed_seq,
+ CommitEntry64b& expected_entry_64b,
+ const CommitEntry& new_entry) {
+ auto& atomic_entry = commit_cache_[static_cast<size_t>(indexed_seq)];
+ CommitEntry64b new_entry_64b(new_entry, FORMAT);
+ bool succ = atomic_entry.compare_exchange_strong(
+ expected_entry_64b, new_entry_64b, std::memory_order_acq_rel,
+ std::memory_order_acquire);
+ return succ;
+}
+
+void WritePreparedTxnDB::AdvanceMaxEvictedSeq(const SequenceNumber& prev_max,
+ const SequenceNumber& new_max) {
+ ROCKS_LOG_DETAILS(info_log_,
+ "AdvanceMaxEvictedSeq overhead %" PRIu64 " => %" PRIu64,
+ prev_max, new_max);
+ // Declare the intention before getting the snapshot list from the DB. This
+ // helps a concurrent GetSnapshot to wait to catch up with
+ // future_max_evicted_seq_ if it has not already. Otherwise a snapshot taken
+ // concurrently could be missed when we ask the DB for snapshots smaller
+ // than the future max.
+ auto updated_future_max = prev_max;
+ while (updated_future_max < new_max &&
+ !future_max_evicted_seq_.compare_exchange_weak(
+ updated_future_max, new_max, std::memory_order_acq_rel,
+ std::memory_order_relaxed)) {
+ };
+
+ CheckPreparedAgainstMax(new_max, false /*locked*/);
+
+ // With each change to max_evicted_seq_ fetch the live snapshots behind it.
+ // We use max as the version of the snapshot list to identify how fresh the
+ // list is. This works because the snapshots are between 0 and max, so the
+ // larger the max, the more complete they are.
+ SequenceNumber new_snapshots_version = new_max;
+ std::vector<SequenceNumber> snapshots;
+ bool update_snapshots = false;
+ if (new_snapshots_version > snapshots_version_) {
+ // This is to avoid updating snapshots_ if it was already updated
+ // with a more recent version by a concurrent thread
+ update_snapshots = true;
+ // We only care about snapshots lower than max
+ snapshots = GetSnapshotListFromDB(new_max);
+ }
+ if (update_snapshots) {
+ UpdateSnapshots(snapshots, new_snapshots_version);
+ if (!snapshots.empty()) {
+ WriteLock wl(&old_commit_map_mutex_);
+ for (auto snap : snapshots) {
+ // Inserting an empty entry for each live snapshot allows IsInSnapshot
+ // to tell apart a released (invalid) snapshot from a valid snapshot
+ // whose reads only see committed values.
+ old_commit_map_[snap];
+ }
+ old_commit_map_empty_.store(false, std::memory_order_release);
+ }
+ }
+ auto updated_prev_max = prev_max;
+ TEST_SYNC_POINT("AdvanceMaxEvictedSeq::update_max:pause");
+ TEST_SYNC_POINT("AdvanceMaxEvictedSeq::update_max:resume");
+ while (updated_prev_max < new_max &&
+ !max_evicted_seq_.compare_exchange_weak(updated_prev_max, new_max,
+ std::memory_order_acq_rel,
+ std::memory_order_relaxed)) {
+ };
+}
+
+const Snapshot* WritePreparedTxnDB::GetSnapshot() {
+ const bool kForWWConflictCheck = true;
+ return GetSnapshotInternal(!kForWWConflictCheck);
+}
+
+SnapshotImpl* WritePreparedTxnDB::GetSnapshotInternal(
+ bool for_ww_conflict_check) {
+ // Note: for this optimization, setting the last sequence number and obtaining
+ // the smallest uncommitted seq should be done atomically. However, to avoid
+ // the mutex overhead, we call SmallestUnCommittedSeq BEFORE taking the
+ // snapshot. Since we always update the list of unprepared seqs (via
+ // AddPrepared) AFTER the last sequence is updated, this guarantees that the
+ // smallest uncommitted seq that we pair with the snapshot is smaller than or
+ // equal to the value that would otherwise be obtained atomically. That is ok
+ // since this optimization works as long as min_uncommitted is less than or
+ // equal to the smallest uncommitted seq when the snapshot was taken.
+ auto min_uncommitted = WritePreparedTxnDB::SmallestUnCommittedSeq();
+ SnapshotImpl* snap_impl = db_impl_->GetSnapshotImpl(for_ww_conflict_check);
+ TEST_SYNC_POINT("WritePreparedTxnDB::GetSnapshotInternal:first");
+ assert(snap_impl);
+ SequenceNumber snap_seq = snap_impl->GetSequenceNumber();
+ // Note: Check against future_max_evicted_seq_ (in contrast with
+ // max_evicted_seq_) in case there is a concurrent AdvanceMaxEvictedSeq.
+ if (UNLIKELY(snap_seq != 0 && snap_seq <= future_max_evicted_seq_)) {
+ // There is a very rare case in which the commit entry evicts another commit
+ // entry that is not published yet thus advancing max evicted seq beyond the
+ // last published seq. This case is not likely in a real-world setup so we
+ // handle it with a few retries.
+ size_t retry = 0;
+ SequenceNumber max;
+ while ((max = future_max_evicted_seq_.load()) != 0 &&
+ snap_impl->GetSequenceNumber() <= max && retry < 100) {
+ ROCKS_LOG_WARN(info_log_,
+ "GetSnapshot snap: %" PRIu64 " max: %" PRIu64
+ " retry %" ROCKSDB_PRIszt,
+ snap_impl->GetSequenceNumber(), max, retry);
+ ReleaseSnapshot(snap_impl);
+ // Wait for last visible seq to catch up with max, and also go beyond it
+ // by one.
+ AdvanceSeqByOne();
+ snap_impl = db_impl_->GetSnapshotImpl(for_ww_conflict_check);
+ assert(snap_impl);
+ retry++;
+ }
+ assert(snap_impl->GetSequenceNumber() > max);
+ if (snap_impl->GetSequenceNumber() <= max) {
+ throw std::runtime_error(
+ "Snapshot seq " + std::to_string(snap_impl->GetSequenceNumber()) +
+ " after " + std::to_string(retry) +
+ " retries is still less than futre_max_evicted_seq_" +
+ std::to_string(max));
+ }
+ }
+ EnhanceSnapshot(snap_impl, min_uncommitted);
+ ROCKS_LOG_DETAILS(
+ db_impl_->immutable_db_options().info_log,
+ "GetSnapshot %" PRIu64 " ww:%" PRIi32 " min_uncommitted: %" PRIu64,
+ snap_impl->GetSequenceNumber(), for_ww_conflict_check, min_uncommitted);
+ TEST_SYNC_POINT("WritePreparedTxnDB::GetSnapshotInternal:end");
+ return snap_impl;
+}
+
+void WritePreparedTxnDB::AdvanceSeqByOne() {
+ // Inserting an empty value will i) let the max evicted entry be
+ // published, i.e., max == last_published, and ii) increase the last
+ // published to be one beyond max, i.e., max < last_published.
+ WriteOptions woptions;
+ TransactionOptions txn_options;
+ Transaction* txn0 = BeginTransaction(woptions, txn_options, nullptr);
+ std::hash<std::thread::id> hasher;
+ char name[64];
+ snprintf(name, 64, "txn%" ROCKSDB_PRIszt, hasher(std::this_thread::get_id()));
+ assert(strlen(name) < 64 - 1);
+ Status s = txn0->SetName(name);
+ assert(s.ok());
+ if (s.ok()) {
+ // Without prepare it would simply skip the commit
+ s = txn0->Prepare();
+ }
+ assert(s.ok());
+ if (s.ok()) {
+ s = txn0->Commit();
+ }
+ assert(s.ok());
+ delete txn0;
+}
+
+const std::vector<SequenceNumber> WritePreparedTxnDB::GetSnapshotListFromDB(
+ SequenceNumber max) {
+ ROCKS_LOG_DETAILS(info_log_, "GetSnapshotListFromDB with max %" PRIu64, max);
+ InstrumentedMutexLock dblock(db_impl_->mutex());
+ db_impl_->mutex()->AssertHeld();
+ return db_impl_->snapshots().GetAll(nullptr, max);
+}
+
+void WritePreparedTxnDB::ReleaseSnapshotInternal(
+ const SequenceNumber snap_seq) {
+ // TODO(myabandeh): relaxed should be enough since the synchronization is
+ // already done by snapshots_mutex_ under which this function is called.
+ if (snap_seq <= max_evicted_seq_.load(std::memory_order_acquire)) {
+ // Then this is a rare case in which the transaction did not finish before
+ // max advanced. It is expected for a few read-only backup snapshots. For such
+ // snapshots we might have kept around a couple of entries in the
+ // old_commit_map_. Check and do garbage collection if that is the case.
+ bool need_gc = false;
+ {
+ WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD);
+ ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead for %" PRIu64,
+ snap_seq);
+ ReadLock rl(&old_commit_map_mutex_);
+ auto prep_set_entry = old_commit_map_.find(snap_seq);
+ need_gc = prep_set_entry != old_commit_map_.end();
+ }
+ if (need_gc) {
+ WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD);
+ ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead for %" PRIu64,
+ snap_seq);
+ WriteLock wl(&old_commit_map_mutex_);
+ old_commit_map_.erase(snap_seq);
+ old_commit_map_empty_.store(old_commit_map_.empty(),
+ std::memory_order_release);
+ }
+ }
+}
+
+void WritePreparedTxnDB::CleanupReleasedSnapshots(
+ const std::vector<SequenceNumber>& new_snapshots,
+ const std::vector<SequenceNumber>& old_snapshots) {
+ auto newi = new_snapshots.begin();
+ auto oldi = old_snapshots.begin();
+ for (; newi != new_snapshots.end() && oldi != old_snapshots.end();) {
+ assert(*newi >= *oldi); // cannot have new snapshots with lower seq
+ if (*newi == *oldi) { // still not released
+ auto value = *newi;
+ while (newi != new_snapshots.end() && *newi == value) {
+ newi++;
+ }
+ while (oldi != old_snapshots.end() && *oldi == value) {
+ oldi++;
+ }
+ } else {
+ assert(*newi > *oldi); // *oldi is released
+ ReleaseSnapshotInternal(*oldi);
+ oldi++;
+ }
+ }
+ // Everything remaining in old_snapshots is released and must be cleaned up
+ for (; oldi != old_snapshots.end(); oldi++) {
+ ReleaseSnapshotInternal(*oldi);
+ }
+}
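+// Editorial example (hypothetical numbers, not part of the upstream source):
+// with new_snapshots = {10, 20, 30} and old_snapshots = {10, 15, 20}, the
+// merge-style walk above skips 10 and 20, which are still present in both
+// lists, and calls ReleaseSnapshotInternal(15), since 15 no longer appears
+// in the new list and therefore must have been released by its user.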
+
+void WritePreparedTxnDB::UpdateSnapshots(
+ const std::vector<SequenceNumber>& snapshots,
+ const SequenceNumber& version) {
+ ROCKS_LOG_DETAILS(info_log_, "UpdateSnapshots with version %" PRIu64,
+ version);
+ TEST_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:p:start");
+ TEST_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:s:start");
+#ifndef NDEBUG
+ size_t sync_i = 0;
+#endif
+ ROCKS_LOG_DETAILS(info_log_, "snapshots_mutex_ overhead");
+ WriteLock wl(&snapshots_mutex_);
+ snapshots_version_ = version;
+ // We update the list concurrently with the readers.
+ // Both new and old lists are sorted and the new list is a subset of the
+ // previous list plus some new items. Thus if a snapshot repeats in
+ // both new and old lists, it will appear at the same or a higher index in
+ // the new list. So if we simply insert the new snapshots in order, any
+ // overwritten item that is still valid in the new list is either written
+ // to the same place in the array or to a higher place before it gets
+ // overwritten by another item. This guarantees that a reader that reads
+ // the list bottom-up will eventually see a snapshot that repeats in the
+ // update, either before it gets overwritten by the writer or afterwards.
+ size_t i = 0;
+ auto it = snapshots.begin();
+ for (; it != snapshots.end() && i < SNAPSHOT_CACHE_SIZE; ++it, ++i) {
+ snapshot_cache_[i].store(*it, std::memory_order_release);
+ TEST_IDX_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:p:", ++sync_i);
+ TEST_IDX_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:s:", sync_i);
+ }
+#ifndef NDEBUG
+ // Release the remaining sync points since they are useless given that the
+ // reader would also use a lock to access snapshots
+ for (++sync_i; sync_i <= 10; ++sync_i) {
+ TEST_IDX_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:p:", sync_i);
+ TEST_IDX_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:s:", sync_i);
+ }
+#endif
+ snapshots_.clear();
+ for (; it != snapshots.end(); ++it) {
+ // Insert them to a vector that is less efficient to access
+ // concurrently
+ snapshots_.push_back(*it);
+ }
+ // Update the size at the end. Otherwise a parallel reader might read
+ // items that are not set yet.
+ snapshots_total_.store(snapshots.size(), std::memory_order_release);
+
+ // Note: this must be done after the snapshots data structures are updated
+ // with the new list of snapshots.
+ CleanupReleasedSnapshots(snapshots, snapshots_all_);
+ snapshots_all_ = snapshots;
+
+ TEST_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:p:end");
+ TEST_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:s:end");
+}
+
+void WritePreparedTxnDB::CheckAgainstSnapshots(const CommitEntry& evicted) {
+ TEST_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:p:start");
+ TEST_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:s:start");
+#ifndef NDEBUG
+ size_t sync_i = 0;
+#endif
+ // First check the snapshot cache that is efficient for concurrent access
+ auto cnt = snapshots_total_.load(std::memory_order_acquire);
+ // The list might get updated concurrently as we are reading from it. The
+ // reader should be able to read all the snapshots that are still valid
+ // after the update. Since the surviving snapshots are written to a higher
+ // place before they get overwritten, a reader that reads bottom-up will
+ // eventually see them.
+ const bool next_is_larger = true;
+ // We will set this to true if the borderline snapshot suggests it.
+ bool search_larger_list = false;
+ size_t ip1 = std::min(cnt, SNAPSHOT_CACHE_SIZE);
+ for (; 0 < ip1; ip1--) {
+ SequenceNumber snapshot_seq =
+ snapshot_cache_[ip1 - 1].load(std::memory_order_acquire);
+ TEST_IDX_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:p:",
+ ++sync_i);
+ TEST_IDX_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:s:", sync_i);
+ if (ip1 == SNAPSHOT_CACHE_SIZE) { // borderline snapshot
+ // If snapshot_seq < commit_seq, a larger snapshot_seq could still be
+ // <= commit_seq, so later also continue the search into the larger
+ // snapshots
+ search_larger_list = snapshot_seq < evicted.commit_seq;
+ }
+ if (!MaybeUpdateOldCommitMap(evicted.prep_seq, evicted.commit_seq,
+ snapshot_seq, !next_is_larger)) {
+ break;
+ }
+ }
+#ifndef NDEBUG
+ // Release the remaining sync points before acquiring the lock
+ for (++sync_i; sync_i <= 10; ++sync_i) {
+ TEST_IDX_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:p:", sync_i);
+ TEST_IDX_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:s:", sync_i);
+ }
+#endif
+ TEST_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:p:end");
+ TEST_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:s:end");
+ if (UNLIKELY(SNAPSHOT_CACHE_SIZE < cnt && search_larger_list)) {
+ // Then access the less efficient list of snapshots_
+ WPRecordTick(TXN_SNAPSHOT_MUTEX_OVERHEAD);
+ ROCKS_LOG_WARN(info_log_,
+ "snapshots_mutex_ overhead for <%" PRIu64 ",%" PRIu64
+ "> with %" ROCKSDB_PRIszt " snapshots",
+ evicted.prep_seq, evicted.commit_seq, cnt);
+ ReadLock rl(&snapshots_mutex_);
+ // Items could have moved from snapshots_ to snapshot_cache_ before
+ // acquiring the lock. To make sure that we do not miss a valid snapshot,
+ // read snapshot_cache_ again while holding the lock.
+ for (size_t i = 0; i < SNAPSHOT_CACHE_SIZE; i++) {
+ SequenceNumber snapshot_seq =
+ snapshot_cache_[i].load(std::memory_order_acquire);
+ if (!MaybeUpdateOldCommitMap(evicted.prep_seq, evicted.commit_seq,
+ snapshot_seq, next_is_larger)) {
+ break;
+ }
+ }
+ for (auto snapshot_seq_2 : snapshots_) {
+ if (!MaybeUpdateOldCommitMap(evicted.prep_seq, evicted.commit_seq,
+ snapshot_seq_2, next_is_larger)) {
+ break;
+ }
+ }
+ }
+}
+
+bool WritePreparedTxnDB::MaybeUpdateOldCommitMap(
+ const uint64_t& prep_seq, const uint64_t& commit_seq,
+ const uint64_t& snapshot_seq, const bool next_is_larger = true) {
+ // If we do not store an entry in old_commit_map_ we assume it is committed in
+ // all snapshots. If commit_seq <= snapshot_seq, it is considered already in
+ // the snapshot so we need not keep the entry around for this snapshot.
+ if (commit_seq <= snapshot_seq) {
+ // continue the search if the next snapshot could be smaller than commit_seq
+ return !next_is_larger;
+ }
+ // then snapshot_seq < commit_seq
+ if (prep_seq <= snapshot_seq) { // overlapping range
+ WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD);
+ ROCKS_LOG_WARN(info_log_,
+ "old_commit_map_mutex_ overhead for %" PRIu64
+ " commit entry: <%" PRIu64 ",%" PRIu64 ">",
+ snapshot_seq, prep_seq, commit_seq);
+ WriteLock wl(&old_commit_map_mutex_);
+ old_commit_map_empty_.store(false, std::memory_order_release);
+ auto& vec = old_commit_map_[snapshot_seq];
+ vec.insert(std::upper_bound(vec.begin(), vec.end(), prep_seq), prep_seq);
+ // We need to store it once for each overlapping snapshot. Return true to
+ // continue the search in case there are more overlapping snapshots.
+ return true;
+ }
+ // continue the search if the next snapshot could be larger than prep_seq
+ return next_is_larger;
+}
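+// Editorial example (hypothetical numbers, not part of the upstream source):
+// for an evicted entry with prep_seq = 5 and commit_seq = 9, a snapshot at
+// seq 7 satisfies prep_seq <= snapshot_seq < commit_seq, so 5 is appended to
+// old_commit_map_[7] and that snapshot must not treat seq 5 as committed. A
+// snapshot at seq 9 or higher already includes the commit, and a snapshot at
+// seq 4 or lower never saw the prepare, so neither needs an entry.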
+
+WritePreparedTxnDB::~WritePreparedTxnDB() {
+ // At this point there could be running compaction/flush holding a
+ // SnapshotChecker, which holds a pointer back to WritePreparedTxnDB.
+ // Make sure those jobs finished before destructing WritePreparedTxnDB.
+ if (!db_impl_->shutting_down_) {
+ db_impl_->CancelAllBackgroundWork(true /*wait*/);
+ }
+}
+
+void SubBatchCounter::InitWithComp(const uint32_t cf) {
+ auto cmp = comparators_[cf];
+ keys_[cf] = CFKeys(SetComparator(cmp));
+}
+
+void SubBatchCounter::AddKey(const uint32_t cf, const Slice& key) {
+ CFKeys& cf_keys = keys_[cf];
+ if (cf_keys.size() == 0) { // just inserted
+ InitWithComp(cf);
+ }
+ auto it = cf_keys.insert(key);
+ if (it.second == false) { // second is false if an element already existed.
+ batches_++;
+ keys_.clear();
+ InitWithComp(cf);
+ keys_[cf].insert(key);
+ }
+}
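+// Editorial note (not part of the upstream source): a duplicate key within
+// the same write batch forces a new sub-batch. For example, inserting keys
+// "a", "b", "a" into the same column family increments batches_ at the
+// second "a", since WritePrepared requires duplicate keys to land in
+// separate sub-batches so that each copy gets its own sequence number.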
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_prepared_txn_db.h b/src/rocksdb/utilities/transactions/write_prepared_txn_db.h
new file mode 100644
index 000000000..25a382473
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/write_prepared_txn_db.h
@@ -0,0 +1,1125 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <cinttypes>
+#include <mutex>
+#include <queue>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "db/db_iter.h"
+#include "db/pre_release_callback.h"
+#include "db/read_callback.h"
+#include "db/snapshot_checker.h"
+#include "logging/logging.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "util/cast_util.h"
+#include "util/set_comparator.h"
+#include "util/string_util.h"
+#include "utilities/transactions/pessimistic_transaction.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+#include "utilities/transactions/write_prepared_txn.h"
+
+namespace ROCKSDB_NAMESPACE {
+enum SnapshotBackup : bool { kUnbackedByDBSnapshot, kBackedByDBSnapshot };
+
+// A PessimisticTransactionDB that writes data to DB after prepare phase of 2PC.
+// In this way some data in the DB might not be committed. The DB provides
+// mechanisms to tell such data apart from committed data.
+class WritePreparedTxnDB : public PessimisticTransactionDB {
+ public:
+ explicit WritePreparedTxnDB(DB* db,
+ const TransactionDBOptions& txn_db_options)
+ : PessimisticTransactionDB(db, txn_db_options),
+ SNAPSHOT_CACHE_BITS(txn_db_options.wp_snapshot_cache_bits),
+ SNAPSHOT_CACHE_SIZE(static_cast<size_t>(1ull << SNAPSHOT_CACHE_BITS)),
+ COMMIT_CACHE_BITS(txn_db_options.wp_commit_cache_bits),
+ COMMIT_CACHE_SIZE(static_cast<size_t>(1ull << COMMIT_CACHE_BITS)),
+ FORMAT(COMMIT_CACHE_BITS) {
+ Init(txn_db_options);
+ }
+
+ explicit WritePreparedTxnDB(StackableDB* db,
+ const TransactionDBOptions& txn_db_options)
+ : PessimisticTransactionDB(db, txn_db_options),
+ SNAPSHOT_CACHE_BITS(txn_db_options.wp_snapshot_cache_bits),
+ SNAPSHOT_CACHE_SIZE(static_cast<size_t>(1ull << SNAPSHOT_CACHE_BITS)),
+ COMMIT_CACHE_BITS(txn_db_options.wp_commit_cache_bits),
+ COMMIT_CACHE_SIZE(static_cast<size_t>(1ull << COMMIT_CACHE_BITS)),
+ FORMAT(COMMIT_CACHE_BITS) {
+ Init(txn_db_options);
+ }
+
+ virtual ~WritePreparedTxnDB();
+
+ virtual Status Initialize(
+ const std::vector<size_t>& compaction_enabled_cf_indices,
+ const std::vector<ColumnFamilyHandle*>& handles) override;
+
+ Transaction* BeginTransaction(const WriteOptions& write_options,
+ const TransactionOptions& txn_options,
+ Transaction* old_txn) override;
+
+ using TransactionDB::Write;
+ Status Write(const WriteOptions& opts, WriteBatch* updates) override;
+
+ // Optimized version of ::Write that receives more optimization requests,
+ // such as skip_concurrency_control.
+ using PessimisticTransactionDB::Write;
+ Status Write(const WriteOptions& opts, const TransactionDBWriteOptimizations&,
+ WriteBatch* updates) override;
+
+ // Write the batch to the underlying DB and mark it as committed. Can be
+ // used either directly from the TxnDB or through a transaction.
+ Status WriteInternal(const WriteOptions& write_options, WriteBatch* batch,
+ size_t batch_cnt, WritePreparedTxn* txn);
+
+ using DB::Get;
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) override;
+
+ using DB::MultiGet;
+ virtual std::vector<Status> MultiGet(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys,
+ std::vector<std::string>* values) override;
+
+ using DB::NewIterator;
+ virtual Iterator* NewIterator(const ReadOptions& options,
+ ColumnFamilyHandle* column_family) override;
+
+ using DB::NewIterators;
+ virtual Status NewIterators(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) override;
+
+ // Check whether the transaction that wrote the value with sequence number seq
+ // is visible to the snapshot with sequence number snapshot_seq.
+ // Returns true if commit_seq <= snapshot_seq
+ // If the snapshot_seq is already released and snapshot_seq <= max, sets
+ // *snap_released to true and returns true as well.
+ inline bool IsInSnapshot(uint64_t prep_seq, uint64_t snapshot_seq,
+ uint64_t min_uncommitted = kMinUnCommittedSeq,
+ bool* snap_released = nullptr) const {
+ ROCKS_LOG_DETAILS(info_log_,
+ "IsInSnapshot %" PRIu64 " in %" PRIu64
+ " min_uncommitted %" PRIu64,
+ prep_seq, snapshot_seq, min_uncommitted);
+ assert(min_uncommitted >= kMinUnCommittedSeq);
+ // Caller is responsible to initialize snap_released.
+ assert(snap_released == nullptr || *snap_released == false);
+ // Here we try to infer the return value without looking into the prepare
+ // list. This helps avoid synchronization over a shared map.
+ // TODO(myabandeh): optimize this. This sequence of checks must be correct
+ // but is not necessarily efficient.
+ if (prep_seq == 0) {
+ // Compaction will output keys to bottom-level with sequence number 0 if
+ // it is visible to the earliest snapshot.
+ ROCKS_LOG_DETAILS(
+ info_log_, "IsInSnapshot %" PRIu64 " in %" PRIu64 " returns %" PRId32,
+ prep_seq, snapshot_seq, 1);
+ return true;
+ }
+ if (snapshot_seq < prep_seq) {
+ // snapshot_seq < prep_seq <= commit_seq => snapshot_seq < commit_seq
+ ROCKS_LOG_DETAILS(
+ info_log_, "IsInSnapshot %" PRIu64 " in %" PRIu64 " returns %" PRId32,
+ prep_seq, snapshot_seq, 0);
+ return false;
+ }
+ if (prep_seq < min_uncommitted) {
+ ROCKS_LOG_DETAILS(info_log_,
+ "IsInSnapshot %" PRIu64 " in %" PRIu64
+ " returns %" PRId32
+ " because of min_uncommitted %" PRIu64,
+ prep_seq, snapshot_seq, 1, min_uncommitted);
+ return true;
+ }
+ // Commit of a delayed prepared has two non-atomic steps: add to the commit
+ // cache, remove from delayed prepared. Our reads from these two are also
+ // non-atomic. Thus, by looking into the commit cache first, we might find
+ // the prep_seq in neither the commit cache nor delayed_prepared_. To fix
+ // that, i) we check if there was any delayed prepared BEFORE looking into
+ // the commit cache, and ii) if there was, we complete the search steps to
+ // be: commit cache, delayed prepared, commit cache again. In this way, if
+ // the first query to the commit cache missed the commit, the 2nd will catch
+ // it.
+ bool was_empty;
+ SequenceNumber max_evicted_seq_lb, max_evicted_seq_ub;
+ CommitEntry64b dont_care;
+ auto indexed_seq = prep_seq % COMMIT_CACHE_SIZE;
+ size_t repeats = 0;
+ do {
+ repeats++;
+ assert(repeats < 100);
+ if (UNLIKELY(repeats >= 100)) {
+ throw std::runtime_error(
+ "The read was intrupted 100 times by update to max_evicted_seq_. "
+ "This is unexpected in all setups");
+ }
+ max_evicted_seq_lb = max_evicted_seq_.load(std::memory_order_acquire);
+ TEST_SYNC_POINT(
+ "WritePreparedTxnDB::IsInSnapshot:max_evicted_seq_:pause");
+ TEST_SYNC_POINT(
+ "WritePreparedTxnDB::IsInSnapshot:max_evicted_seq_:resume");
+ was_empty = delayed_prepared_empty_.load(std::memory_order_acquire);
+ TEST_SYNC_POINT(
+ "WritePreparedTxnDB::IsInSnapshot:delayed_prepared_empty_:pause");
+ TEST_SYNC_POINT(
+ "WritePreparedTxnDB::IsInSnapshot:delayed_prepared_empty_:resume");
+ CommitEntry cached;
+ bool exist = GetCommitEntry(indexed_seq, &dont_care, &cached);
+ TEST_SYNC_POINT("WritePreparedTxnDB::IsInSnapshot:GetCommitEntry:pause");
+ TEST_SYNC_POINT("WritePreparedTxnDB::IsInSnapshot:GetCommitEntry:resume");
+ if (exist && prep_seq == cached.prep_seq) {
+ // It is committed and also not evicted from commit cache
+ ROCKS_LOG_DETAILS(
+ info_log_,
+ "IsInSnapshot %" PRIu64 " in %" PRIu64 " returns %" PRId32,
+ prep_seq, snapshot_seq, cached.commit_seq <= snapshot_seq);
+ return cached.commit_seq <= snapshot_seq;
+ }
+ // else it could be committed but not inserted in the map which could
+ // happen after recovery, or it could be committed and evicted by another
+ // commit, or never committed.
+
+ // At this point we don't know if it was committed or it is still prepared
+ max_evicted_seq_ub = max_evicted_seq_.load(std::memory_order_acquire);
+ if (UNLIKELY(max_evicted_seq_lb != max_evicted_seq_ub)) {
+ continue;
+ }
+ // Note: max_evicted_seq_ when we did GetCommitEntry <= max_evicted_seq_ub
+ if (max_evicted_seq_ub < prep_seq) {
+ // Not evicted from cache and also not present, so must be still
+ // prepared
+ ROCKS_LOG_DETAILS(info_log_,
+ "IsInSnapshot %" PRIu64 " in %" PRIu64
+ " returns %" PRId32,
+ prep_seq, snapshot_seq, 0);
+ return false;
+ }
+ TEST_SYNC_POINT("WritePreparedTxnDB::IsInSnapshot:prepared_mutex_:pause");
+ TEST_SYNC_POINT(
+ "WritePreparedTxnDB::IsInSnapshot:prepared_mutex_:resume");
+ if (!was_empty) {
+ // We should not normally reach here
+ WPRecordTick(TXN_PREPARE_MUTEX_OVERHEAD);
+ ReadLock rl(&prepared_mutex_);
+ ROCKS_LOG_WARN(
+ info_log_, "prepared_mutex_ overhead %" PRIu64 " for %" PRIu64,
+ static_cast<uint64_t>(delayed_prepared_.size()), prep_seq);
+ if (delayed_prepared_.find(prep_seq) != delayed_prepared_.end()) {
+ // This is the order: 1) delayed_prepared_commits_ update, 2) publish,
+ // 3) delayed_prepared_ clean up. So check if this is the case of a late
+ // cleanup.
+ auto it = delayed_prepared_commits_.find(prep_seq);
+ if (it == delayed_prepared_commits_.end()) {
+ // Then it is not committed yet
+ ROCKS_LOG_DETAILS(info_log_,
+ "IsInSnapshot %" PRIu64 " in %" PRIu64
+ " returns %" PRId32,
+ prep_seq, snapshot_seq, 0);
+ return false;
+ } else {
+ ROCKS_LOG_DETAILS(info_log_,
+ "IsInSnapshot %" PRIu64 " in %" PRIu64
+ " commit: %" PRIu64 " returns %" PRId32,
+ prep_seq, snapshot_seq, it->second,
+ snapshot_seq <= it->second);
+ return it->second <= snapshot_seq;
+ }
+ } else {
+ // 2nd query to commit cache. Refer to was_empty comment above.
+ exist = GetCommitEntry(indexed_seq, &dont_care, &cached);
+ if (exist && prep_seq == cached.prep_seq) {
+ ROCKS_LOG_DETAILS(
+ info_log_,
+ "IsInSnapshot %" PRIu64 " in %" PRIu64 " returns %" PRId32,
+ prep_seq, snapshot_seq, cached.commit_seq <= snapshot_seq);
+ return cached.commit_seq <= snapshot_seq;
+ }
+ max_evicted_seq_ub = max_evicted_seq_.load(std::memory_order_acquire);
+ }
+ }
+ } while (UNLIKELY(max_evicted_seq_lb != max_evicted_seq_ub));
+ // When advancing max_evicted_seq_, we move older entries from prepared_txns_
+ // to delayed_prepared_. We also move evicted entries from the commit cache
+ // to old_commit_map_ if they overlap with any snapshot. Since prep_seq <=
+ // max_evicted_seq_, we have three cases: i) in delayed_prepared_, ii) in
+ // old_commit_map_, iii) committed with no conflict with any snapshot. Case
+ // (i), delayed_prepared_, is checked above.
+ if (max_evicted_seq_ub < snapshot_seq) { // then (ii) cannot be the case
+ // only (iii) is the case: committed
+ // commit_seq <= max_evicted_seq_ < snapshot_seq => commit_seq <
+ // snapshot_seq
+ ROCKS_LOG_DETAILS(
+ info_log_, "IsInSnapshot %" PRIu64 " in %" PRIu64 " returns %" PRId32,
+ prep_seq, snapshot_seq, 1);
+ return true;
+ }
+ // else (ii) might be the case: check the commit data saved for this
+ // snapshot. If there was no overlapping commit entry, then it is committed
+ // with a commit_seq lower than any live snapshot, including snapshot_seq.
+ if (old_commit_map_empty_.load(std::memory_order_acquire)) {
+ ROCKS_LOG_DETAILS(info_log_,
+ "IsInSnapshot %" PRIu64 " in %" PRIu64
+ " returns %" PRId32 " released=1",
+ prep_seq, snapshot_seq, 0);
+ assert(snap_released);
+ // This snapshot is not valid anymore. We cannot tell if prep_seq is
+ // committed before or after the snapshot. Return true but also set
+ // snap_released to true.
+ *snap_released = true;
+ return true;
+ }
+ {
+ // We should not normally reach here unless snapshot_seq is old. This is a
+ // rare case and it is ok to pay the cost of a mutex ReadLock for such old,
+ // reading transactions.
+ WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD);
+ ReadLock rl(&old_commit_map_mutex_);
+ auto prep_set_entry = old_commit_map_.find(snapshot_seq);
+ bool found = prep_set_entry != old_commit_map_.end();
+ if (found) {
+ auto& vec = prep_set_entry->second;
+ found = std::binary_search(vec.begin(), vec.end(), prep_seq);
+ } else {
+ // coming from compaction
+ ROCKS_LOG_DETAILS(info_log_,
+ "IsInSnapshot %" PRIu64 " in %" PRIu64
+ " returns %" PRId32 " released=1",
+ prep_seq, snapshot_seq, 0);
+ // This snapshot is not valid anymore. We cannot tell if prep_seq is
+ // committed before or after the snapshot. Return true but also set
+ // snap_released to true.
+ assert(snap_released);
+ *snap_released = true;
+ return true;
+ }
+
+ if (!found) {
+ ROCKS_LOG_DETAILS(info_log_,
+ "IsInSnapshot %" PRIu64 " in %" PRIu64
+ " returns %" PRId32,
+ prep_seq, snapshot_seq, 1);
+ return true;
+ }
+ }
+ // (ii) is the case: it is committed, but after the snapshot_seq
+ ROCKS_LOG_DETAILS(
+ info_log_, "IsInSnapshot %" PRIu64 " in %" PRIu64 " returns %" PRId32,
+ prep_seq, snapshot_seq, 0);
+ return false;
+ }
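+ // Editorial example (hypothetical numbers, not part of the upstream
+ // source): for a txn with prep_seq = 10 and commit_seq = 12,
+ // IsInSnapshot(10, 11) should return false because 11 lies between prepare
+ // and commit, whereas IsInSnapshot(10, 12) returns true once the commit
+ // entry <10,12> is found in the commit cache, or inferred after eviction
+ // via max_evicted_seq_ and old_commit_map_.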
+
+ // Add the transaction with prepare sequence seq to the prepared list.
+ // Note: must be called serially with increasing seq on each call.
+ // locked is true if prepared_mutex_ is already locked.
+ void AddPrepared(uint64_t seq, bool locked = false);
+ // Check if any of the prepared txns are less than new max_evicted_seq_. Must
+ // be called with prepared_mutex_ write locked.
+ void CheckPreparedAgainstMax(SequenceNumber new_max, bool locked);
+ // Remove the transaction with prepare sequence seq from the prepared list
+ void RemovePrepared(const uint64_t seq, const size_t batch_cnt = 1);
+ // Add the transaction with prepare sequence prepare_seq and commit sequence
+ // commit_seq to the commit map. loop_cnt is to detect infinite loops.
+ // Note: must be called serially.
+ void AddCommitted(uint64_t prepare_seq, uint64_t commit_seq,
+ uint8_t loop_cnt = 0);
+
+ struct CommitEntry {
+ uint64_t prep_seq;
+ uint64_t commit_seq;
+ CommitEntry() : prep_seq(0), commit_seq(0) {}
+ CommitEntry(uint64_t ps, uint64_t cs) : prep_seq(ps), commit_seq(cs) {}
+ bool operator==(const CommitEntry& rhs) const {
+ return prep_seq == rhs.prep_seq && commit_seq == rhs.commit_seq;
+ }
+ };
+
+ struct CommitEntry64bFormat {
+ explicit CommitEntry64bFormat(size_t index_bits)
+ : INDEX_BITS(index_bits),
+ PREP_BITS(static_cast<size_t>(64 - PAD_BITS - INDEX_BITS)),
+ COMMIT_BITS(static_cast<size_t>(64 - PREP_BITS)),
+ COMMIT_FILTER(static_cast<uint64_t>((1ull << COMMIT_BITS) - 1)),
+ DELTA_UPPERBOUND(static_cast<uint64_t>((1ull << COMMIT_BITS))) {}
+ // Number of higher bits of a sequence number that is not used. They are
+ // used to encode the value type, ...
+ const size_t PAD_BITS = static_cast<size_t>(8);
+ // Number of lower bits from prepare seq that can be skipped as they are
+ // implied by the index of the entry in the array
+ const size_t INDEX_BITS;
+ // Number of bits we use to encode the prepare seq
+ const size_t PREP_BITS;
+ // Number of bits we use to encode the commit seq.
+ const size_t COMMIT_BITS;
+ // Filter to encode/decode commit seq
+ const uint64_t COMMIT_FILTER;
+ // The value of commit_seq - prepare_seq + 1 must be less than this bound
+ const uint64_t DELTA_UPPERBOUND;
+ };
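+ // Editorial example (assuming the commonly documented default of
+ // wp_commit_cache_bits = 23, i.e. INDEX_BITS = 23; not part of the upstream
+ // source): PAD_BITS = 8, PREP_BITS = 64 - 8 - 23 = 33,
+ // COMMIT_BITS = 64 - 33 = 31, so DELTA_UPPERBOUND = 2^31. A commit may thus
+ // trail its prepare by at most roughly 2 billion sequence numbers before
+ // the CommitEntry64b constructor below would throw on overflow.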
+
+ // Layout of the encoding:
+ //   Prepare Seq (64 bits) = PAD ... PAD PREP PREP ... PREP INDEX INDEX ... INDEX
+ //   Delta Seq   (64 bits) = 0 0 ... 0 0 DELTA DELTA ... DELTA DELTA
+ //   Encoded Value         = PREP PREP ... PREP PREP DELTA DELTA ... DELTA DELTA
+ // PAD: the first bits of a seq that are reserved for tagging and hence ignored
+ // PREP/INDEX: the used bits in a prepare seq number
+ // INDEX: the bits that do not have to be encoded (provided externally by the
+ // position of the entry in the commit cache array)
+ // DELTA: commit seq - prepare seq + 1
+ // The number of DELTA bits should be equal to the number of INDEX bits + PADs
+ struct CommitEntry64b {
+ constexpr CommitEntry64b() noexcept : rep_(0) {}
+
+ CommitEntry64b(const CommitEntry& entry, const CommitEntry64bFormat& format)
+ : CommitEntry64b(entry.prep_seq, entry.commit_seq, format) {}
+
+ CommitEntry64b(const uint64_t ps, const uint64_t cs,
+ const CommitEntry64bFormat& format) {
+ assert(ps < static_cast<uint64_t>(
+ (1ull << (format.PREP_BITS + format.INDEX_BITS))));
+ assert(ps <= cs);
+ uint64_t delta = cs - ps + 1; // make initialized delta always >= 1
+ // zero is reserved for uninitialized entries
+ assert(0 < delta);
+ assert(delta < format.DELTA_UPPERBOUND);
+ if (delta >= format.DELTA_UPPERBOUND) {
+ throw std::runtime_error(
+ "commit_seq >> prepare_seq. The allowed distance is " +
+ std::to_string(format.DELTA_UPPERBOUND) + " commit_seq is " +
+ std::to_string(cs) + " prepare_seq is " + std::to_string(ps));
+ }
+ rep_ = (ps << format.PAD_BITS) & ~format.COMMIT_FILTER;
+ rep_ = rep_ | delta;
+ }
+
+ // Return false if the entry is empty
+ bool Parse(const uint64_t indexed_seq, CommitEntry* entry,
+ const CommitEntry64bFormat& format) {
+ uint64_t delta = rep_ & format.COMMIT_FILTER;
+ // zero is reserved for uninitialized entries
+ assert(delta < static_cast<uint64_t>((1ull << format.COMMIT_BITS)));
+ if (delta == 0) {
+ return false; // initialized entry would have non-zero delta
+ }
+
+ assert(indexed_seq < static_cast<uint64_t>((1ull << format.INDEX_BITS)));
+ uint64_t prep_up = rep_ & ~format.COMMIT_FILTER;
+ prep_up >>= format.PAD_BITS;
+ const uint64_t& prep_low = indexed_seq;
+ entry->prep_seq = prep_up | prep_low;
+
+ entry->commit_seq = entry->prep_seq + delta - 1;
+ return true;
+ }
+
+ private:
+ uint64_t rep_;
+ };
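+ // Editorial round-trip sketch (hypothetical values, not part of the
+ // upstream source): with the format from the example above, encoding
+ // prep_seq = 100, commit_seq = 103 in the slot
+ // indexed_seq = 100 % COMMIT_CACHE_SIZE stores delta = 103 - 100 + 1 = 4 in
+ // the low bits and the upper bits of 100 in the high bits. Parse() with the
+ // same indexed_seq then reconstructs prep_seq = 100 and
+ // commit_seq = 100 + 4 - 1 = 103; a stored delta of zero instead signals an
+ // empty slot.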
+
+ // Struct to hold ownership of snapshot and read callback for cleanup.
+ struct IteratorState;
+
+ std::shared_ptr<std::map<uint32_t, const Comparator*>> GetCFComparatorMap() {
+ return cf_map_;
+ }
+ std::shared_ptr<std::map<uint32_t, ColumnFamilyHandle*>> GetCFHandleMap() {
+ return handle_map_;
+ }
+ void UpdateCFComparatorMap(
+ const std::vector<ColumnFamilyHandle*>& handles) override;
+ void UpdateCFComparatorMap(ColumnFamilyHandle* handle) override;
+
+ virtual const Snapshot* GetSnapshot() override;
+ SnapshotImpl* GetSnapshotInternal(bool for_ww_conflict_check);
+
+ protected:
+ virtual Status VerifyCFOptions(
+ const ColumnFamilyOptions& cf_options) override;
+ // Assign the min and max sequence numbers for reading from the db. A seq >
+ // max is not valid, and a seq < min is valid, and a min <= seq < max requires
+ // further checking. Normally max is defined by the snapshot and min is by
+ // minimum uncommitted seq.
+ inline SnapshotBackup AssignMinMaxSeqs(const Snapshot* snapshot,
+ SequenceNumber* min,
+ SequenceNumber* max);
+ // Validate whether a snapshot sequence number is still valid based on the
+ // latest db status. backed_by_snapshot specifies if the number is backed by
+ // an actual snapshot object. order specifies the memory order with which we
+ // load the atomic variables: relaxed is enough for the default since we care
+ // about the last value seen by the same thread.
+ inline bool ValidateSnapshot(
+ const SequenceNumber snap_seq, const SnapshotBackup backed_by_snapshot,
+ std::memory_order order = std::memory_order_relaxed);
+ // Get a dummy snapshot that refers to kMaxSequenceNumber
+ Snapshot* GetMaxSnapshot() { return &dummy_max_snapshot_; }
+
+ bool ShouldRollbackWithSingleDelete(ColumnFamilyHandle* column_family,
+ const Slice& key) {
+ return rollback_deletion_type_callback_
+ ? rollback_deletion_type_callback_(this, column_family, key)
+ : false;
+ }
+
+ std::function<bool(TransactionDB*, ColumnFamilyHandle*, const Slice&)>
+ rollback_deletion_type_callback_;
+
+ private:
+ friend class AddPreparedCallback;
+ friend class PreparedHeap_BasicsTest_Test;
+ friend class PreparedHeap_Concurrent_Test;
+ friend class PreparedHeap_EmptyAtTheEnd_Test;
+ friend class SnapshotConcurrentAccessTest_SnapshotConcurrentAccess_Test;
+ friend class WritePreparedCommitEntryPreReleaseCallback;
+ friend class WritePreparedTransactionTestBase;
+ friend class WritePreparedTxn;
+ friend class WritePreparedTxnDBMock;
+ friend class WritePreparedTransactionTest_AddPreparedBeforeMax_Test;
+ friend class WritePreparedTransactionTest_AdvanceMaxEvictedSeqBasic_Test;
+ friend class
+ WritePreparedTransactionTest_AdvanceMaxEvictedSeqWithDuplicates_Test;
+ friend class WritePreparedTransactionTest_AdvanceSeqByOne_Test;
+ friend class WritePreparedTransactionTest_BasicRecovery_Test;
+ friend class WritePreparedTransactionTest_CheckAgainstSnapshots_Test;
+ friend class WritePreparedTransactionTest_CleanupSnapshotEqualToMax_Test;
+ friend class WritePreparedTransactionTest_ConflictDetectionAfterRecovery_Test;
+ friend class WritePreparedTransactionTest_CommitMap_Test;
+ friend class WritePreparedTransactionTest_DoubleSnapshot_Test;
+ friend class WritePreparedTransactionTest_IsInSnapshotEmptyMap_Test;
+ friend class WritePreparedTransactionTest_IsInSnapshotReleased_Test;
+ friend class WritePreparedTransactionTest_IsInSnapshot_Test;
+ friend class WritePreparedTransactionTest_NewSnapshotLargerThanMax_Test;
+ friend class WritePreparedTransactionTest_MaxCatchupWithNewSnapshot_Test;
+ friend class WritePreparedTransactionTest_MaxCatchupWithUnbackedSnapshot_Test;
+ friend class
+ WritePreparedTransactionTest_NonAtomicCommitOfDelayedPrepared_Test;
+ friend class
+ WritePreparedTransactionTest_NonAtomicUpdateOfDelayedPrepared_Test;
+ friend class WritePreparedTransactionTest_NonAtomicUpdateOfMaxEvictedSeq_Test;
+ friend class WritePreparedTransactionTest_OldCommitMapGC_Test;
+ friend class WritePreparedTransactionTest_Rollback_Test;
+ friend class WritePreparedTransactionTest_SmallestUnCommittedSeq_Test;
+ friend class WriteUnpreparedTxn;
+ friend class WriteUnpreparedTxnDB;
+ friend class WriteUnpreparedTransactionTest_RecoveryTest_Test;
+ friend class MultiOpsTxnsStressTest;
+
+ void Init(const TransactionDBOptions& txn_db_opts);
+
+ void WPRecordTick(uint32_t ticker_type) const {
+ RecordTick(db_impl_->immutable_db_options_.statistics.get(), ticker_type);
+ }
+
+ // A heap with the amortized O(1) complexity for erase. It uses one extra heap
+ // to keep track of erased entries that are not yet on top of the main heap.
+ class PreparedHeap {
+ // The mutex is required for push and pop from PreparedHeap. ::erase will
+ // use external synchronization via prepared_mutex_.
+ port::Mutex push_pop_mutex_;
+ std::deque<uint64_t> heap_;
+ std::priority_queue<uint64_t, std::vector<uint64_t>, std::greater<uint64_t>>
+ erased_heap_;
+ std::atomic<uint64_t> heap_top_ = {kMaxSequenceNumber};
+ // True when testing crash recovery
+ bool TEST_CRASH_ = false;
+ friend class WritePreparedTxnDB;
+
+ public:
+ ~PreparedHeap() {
+ if (!TEST_CRASH_) {
+ assert(heap_.empty());
+ assert(erased_heap_.empty());
+ }
+ }
+ port::Mutex* push_pop_mutex() { return &push_pop_mutex_; }
+
+ inline bool empty() { return top() == kMaxSequenceNumber; }
+ // Returns kMaxSequenceNumber if empty() and the smallest otherwise.
+ inline uint64_t top() { return heap_top_.load(std::memory_order_acquire); }
+ inline void push(uint64_t v) {
+ push_pop_mutex_.AssertHeld();
+ if (heap_.empty()) {
+ heap_top_.store(v, std::memory_order_release);
+ } else {
+ assert(heap_top_.load() < v);
+ }
+ heap_.push_back(v);
+ }
+ void pop(bool locked = false) {
+ if (!locked) {
+ push_pop_mutex()->Lock();
+ }
+ push_pop_mutex_.AssertHeld();
+ heap_.pop_front();
+ while (!heap_.empty() && !erased_heap_.empty() &&
+ // heap_.top() > erased_heap_.top() could happen if we have erased
+ // a non-existent entry. Ideally the user should not do that but we
+ // should be resilient against it.
+ heap_.front() >= erased_heap_.top()) {
+ if (heap_.front() == erased_heap_.top()) {
+ heap_.pop_front();
+ }
+ uint64_t erased __attribute__((__unused__));
+ erased = erased_heap_.top();
+ erased_heap_.pop();
+ // No duplicate prepare sequence numbers
+ assert(erased_heap_.empty() || erased_heap_.top() != erased);
+ }
+ while (heap_.empty() && !erased_heap_.empty()) {
+ erased_heap_.pop();
+ }
+ heap_top_.store(!heap_.empty() ? heap_.front() : kMaxSequenceNumber,
+ std::memory_order_release);
+ if (!locked) {
+ push_pop_mutex()->Unlock();
+ }
+ }
+ // Concurrent calls need external synchronization. It is safe to call it
+ // concurrently with push and pop though.
+ void erase(uint64_t seq) {
+ if (!empty()) {
+ auto top_seq = top();
+ if (seq < top_seq) {
+ // Already popped, ignore it.
+ } else if (top_seq == seq) {
+ pop();
+#ifndef NDEBUG
+ MutexLock ml(push_pop_mutex());
+ assert(heap_.empty() || heap_.front() != seq);
+#endif
+ } else { // top() > seq
+ // Down the heap, remember to pop it later
+ erased_heap_.push(seq);
+ }
+ }
+ }
+ };
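+ // Editorial walkthrough (hypothetical sequence, not part of the upstream
+ // source): push(3), push(5), push(7) leaves top() == 3. erase(5) cannot
+ // remove 5 from the middle of heap_, so 5 is parked in erased_heap_.
+ // erase(3) hits the top and pops immediately; while popping, 5 is found on
+ // erased_heap_ and discarded as well, so top() becomes 7 without ever
+ // exposing the already-erased 5.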
+
+ void TEST_Crash() override { prepared_txns_.TEST_CRASH_ = true; }
+
+ // Get the commit entry with index indexed_seq from the commit table. It
+ // returns true if such entry exists.
+ bool GetCommitEntry(const uint64_t indexed_seq, CommitEntry64b* entry_64b,
+ CommitEntry* entry) const;
+
+ // Rewrite the entry with the index indexed_seq in the commit table with the
+ // commit entry <prep_seq, commit_seq>. If the rewrite results into eviction,
+ // sets the evicted_entry and returns true.
+ bool AddCommitEntry(const uint64_t indexed_seq, const CommitEntry& new_entry,
+ CommitEntry* evicted_entry);
+
+ // Rewrite the entry with the index indexed_seq in the commit table with the
+ // commit entry new_entry only if the existing entry matches the
+ // expected_entry. Returns false otherwise.
+ bool ExchangeCommitEntry(const uint64_t indexed_seq,
+ CommitEntry64b& expected_entry,
+ const CommitEntry& new_entry);
+
+ // Increase max_evicted_seq_ from the previous value prev_max to the new
+ // value. This also involves taking care of prepared txns that are not
+ // committed before new_max, as well as updating the list of live snapshots at
+ // the time of updating the max. Thread-safety: this function can be called
+ // concurrently. The concurrent invocations of this function is equivalent to
+ // a serial invocation in which the last invocation is the one with the
+ // largest new_max value.
+ void AdvanceMaxEvictedSeq(const SequenceNumber& prev_max,
+ const SequenceNumber& new_max);
+
+ inline SequenceNumber SmallestUnCommittedSeq() {
+ // Note: We have two lists to look into, but for performance reasons they
+ // are not read atomically. Since CheckPreparedAgainstMax copies the entry
+ // to delayed_prepared_ before removing it from prepared_txns_, to ensure
+ // that a prepared entry is not missed, we look into them in the opposite
+ // order: first read prepared_txns_ and then delayed_prepared_.
+
+ // This must be called before calling ::top. This is because the concurrent
+ // thread would call ::RemovePrepared before updating
+ // GetLatestSequenceNumber(). Reading them in opposite order here guarantees
+ // that the ::top that we read is no higher than the ::top we would get if
+ // we had otherwise updated/read them atomically.
+ auto next_prepare = db_impl_->GetLatestSequenceNumber() + 1;
+ auto min_prepare = prepared_txns_.top();
+ // Since we update the prepare_heap always from the main write queue via
+ // PreReleaseCallback, the prepared_txns_.top() indicates the smallest
+ // prepared data in 2pc transactions. For non-2pc transactions that are
+ // written in two steps, we also update prepared_txns_ at the first step
+ // (via the same mechanism) so that their uncommitted data is reflected in
+ // SmallestUnCommittedSeq.
+ if (!delayed_prepared_empty_.load()) {
+ ReadLock rl(&prepared_mutex_);
+ if (!delayed_prepared_.empty()) {
+ return *delayed_prepared_.begin();
+ }
+ }
+ bool empty = min_prepare == kMaxSequenceNumber;
+ if (empty) {
+ // Since GetLatestSequenceNumber is updated
+ // after prepared_txns_ is, the value of GetLatestSequenceNumber would
+ // reflect any uncommitted data that is not added to prepared_txns_ yet.
+ // Otherwise, if there is no concurrent txn, this value simply reflects
+ // the latest value in the memtable.
+ return next_prepare;
+ } else {
+ return std::min(min_prepare, next_prepare);
+ }
+ }
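+ // Editorial example (hypothetical numbers, not part of the upstream
+ // source): if prepared_txns_.top() == 42, delayed_prepared_ is empty, and
+ // the latest sequence number is 50, SmallestUnCommittedSeq returns
+ // min(42, 51) = 42; once txn 42 commits and is removed, it returns 51, one
+ // past the last sequence in the memtable.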
+
+ // Enhance the snapshot object by recording in it the smallest uncommitted seq
+ inline void EnhanceSnapshot(SnapshotImpl* snapshot,
+ SequenceNumber min_uncommitted) {
+ assert(snapshot);
+ assert(min_uncommitted <= snapshot->number_ + 1);
+ snapshot->min_uncommitted_ = min_uncommitted;
+ }
+
+ virtual const std::vector<SequenceNumber> GetSnapshotListFromDB(
+ SequenceNumber max);
+
+ // Will be called by the public ReleaseSnapshot method. Does the maintenance
+ // internal to WritePreparedTxnDB
+ void ReleaseSnapshotInternal(const SequenceNumber snap_seq);
+
+ // Update the list of snapshots corresponding to the soon-to-be-updated
+ // max_evicted_seq_. Thread-safety: this function can be called concurrently.
+ // The concurrent invocations of this function is equivalent to a serial
+ // invocation in which the last invocation is the one with the largest
+ // version value.
+ void UpdateSnapshots(const std::vector<SequenceNumber>& snapshots,
+ const SequenceNumber& version);
+ // Check the new list of new snapshots against the old one to see if any of
+ // the snapshots are released and to do the cleanup for the released snapshot.
+ void CleanupReleasedSnapshots(
+ const std::vector<SequenceNumber>& new_snapshots,
+ const std::vector<SequenceNumber>& old_snapshots);
+
+ // Check an evicted entry against live snapshots to see if it should be kept
+ // around or it can be safely discarded (and hence assume committed for all
+ // snapshots). Thread-safety: this function can be called concurrently. If it
+ // is called concurrently with multiple UpdateSnapshots, the result is the
+ // same as checking the intersection of the snapshot list before updates with
+ // the snapshot list of all the concurrent updates.
+ void CheckAgainstSnapshots(const CommitEntry& evicted);
+
+ // Add a new entry to old_commit_map_ if prep_seq <= snapshot_seq <
+ // commit_seq. Return false if checking the next snapshot(s) is not needed.
+ // This is the case if none of the next snapshots could satisfy the condition.
+ // next_is_larger: the next snapshot will be a larger value
+ bool MaybeUpdateOldCommitMap(const uint64_t& prep_seq,
+ const uint64_t& commit_seq,
+ const uint64_t& snapshot_seq,
+ const bool next_is_larger);
+
+ // A trick to increase the last visible sequence number by one and also wait
+ // for the in-flight commits to be visible.
+ void AdvanceSeqByOne();
+
+ // The list of live snapshots at the last time that max_evicted_seq_ advanced.
+ // The list is stored in two data structures: snapshot_cache_, which is
+ // efficient for concurrent reads, and snapshots_, which holds the data that
+ // does not fit into snapshot_cache_. This is the total number of snapshots
+ // in the two lists.
+ std::atomic<size_t> snapshots_total_ = {};
+ // The list is sorted in ascending order. Thread-safety for writes is provided
+ // with snapshots_mutex_ and concurrent reads are safe due to std::atomic for
+ // each entry. On the x86_64 architecture such reads are compiled to simple
+ // read instructions.
+ const size_t SNAPSHOT_CACHE_BITS;
+ const size_t SNAPSHOT_CACHE_SIZE;
+ std::unique_ptr<std::atomic<SequenceNumber>[]> snapshot_cache_;
+ // 2nd list for storing snapshots. The list is sorted in ascending order.
+ // Thread-safety is provided with snapshots_mutex_.
+ std::vector<SequenceNumber> snapshots_;
+ // The list of all snapshots: snapshots_ + snapshot_cache_. This list,
+ // although redundant, simplifies the implementation of
+ // CleanupReleasedSnapshots.
+ // Thread-safety is provided with snapshots_mutex_.
+ std::vector<SequenceNumber> snapshots_all_;
+ // The version of the latest list of snapshots. This can be used to avoid
+ // rewriting a list that is concurrently updated with a more recent version.
+ SequenceNumber snapshots_version_ = 0;
+
+ // A heap of prepared transactions. Thread-safety is provided with
+ // prepared_mutex_.
+ PreparedHeap prepared_txns_;
+ const size_t COMMIT_CACHE_BITS;
+ const size_t COMMIT_CACHE_SIZE;
+ const CommitEntry64bFormat FORMAT;
+ // commit_cache_ must be initialized to zero to tell apart an empty index from
+ // a filled one. Thread-safety is provided with commit_cache_mutex_.
+ std::unique_ptr<std::atomic<CommitEntry64b>[]> commit_cache_;
+ // The largest evicted *commit* sequence number from the commit_cache_. If a
+ // seq is smaller than max_evicted_seq_ it might or might not be present in
+ // commit_cache_. So commit_cache_ must first be checked before consulting
+ // with max_evicted_seq_.
+ std::atomic<uint64_t> max_evicted_seq_ = {};
+ // Order: 1) update future_max_evicted_seq_ = new_max, 2)
+ // GetSnapshotListFromDB(new_max), 3) max_evicted_seq_ = new_max. Since
+ // GetSnapshotInternal guarantees that the snapshot seq is larger than
+ // future_max_evicted_seq_, this guarantees that if a snapshot is not larger
+ // than max, it has already been looked at via a GetSnapshotListFromDB(new_max).
+ std::atomic<uint64_t> future_max_evicted_seq_ = {};
+ // Advance max_evicted_seq_ by this value each time it needs an update. The
+ // larger the value, the less frequent advances we would have. We do not want
+ // it to be too large either as it would cause stalls by doing too much
+ // maintenance work under the lock.
+ size_t INC_STEP_FOR_MAX_EVICTED = 1;
+ // A map from old snapshots (expected to be used by a few read-only txns) to
+ // the prepared sequence numbers of the evicted entries from commit_cache_
+ // that overlap with such a snapshot. These are the prepared sequence numbers
+ // that the snapshot, to which they are mapped, cannot assume to be committed
+ // just because they are no longer in the commit_cache_. The vector must be
+ // kept sorted after each update.
+ // Thread-safety is provided with old_commit_map_mutex_.
+ std::map<SequenceNumber, std::vector<SequenceNumber>> old_commit_map_;
+ // A set of long-running prepared transactions that are not finished by the
+ // time max_evicted_seq_ advances their sequence number. This is expected to
+ // be empty normally. Thread-safety is provided with prepared_mutex_.
+ std::set<uint64_t> delayed_prepared_;
+ // Commit of a delayed prepared: 1) update commit cache, 2) update
+ // delayed_prepared_commits_, 3) publish seq, 4) clean up delayed_prepared_.
+ // delayed_prepared_commits_ helps us tell apart the still-uncommitted txns
+ // from the ones that are committed but not cleaned up yet.
+ std::unordered_map<SequenceNumber, SequenceNumber> delayed_prepared_commits_;
+ // Update when delayed_prepared_.empty() changes. Expected to be true
+ // normally.
+ std::atomic<bool> delayed_prepared_empty_ = {true};
+ // Update when old_commit_map_.empty() changes. Expected to be true normally.
+ std::atomic<bool> old_commit_map_empty_ = {true};
+ mutable port::RWMutex prepared_mutex_;
+ mutable port::RWMutex old_commit_map_mutex_;
+ mutable port::RWMutex commit_cache_mutex_;
+ mutable port::RWMutex snapshots_mutex_;
+ // A cache of the cf comparators
+ // Thread safety: since it is a const it is safe to read it concurrently
+ std::shared_ptr<std::map<uint32_t, const Comparator*>> cf_map_;
+ // A cache of the cf handles
+ // Thread safety: since the handle is a read-only const object, it is
+ // safe to read it concurrently
+ std::shared_ptr<std::map<uint32_t, ColumnFamilyHandle*>> handle_map_;
+ // A dummy snapshot object that refers to kMaxSequenceNumber
+ SnapshotImpl dummy_max_snapshot_;
+};
+
+class WritePreparedTxnReadCallback : public ReadCallback {
+ public:
+ WritePreparedTxnReadCallback(WritePreparedTxnDB* db, SequenceNumber snapshot)
+ : ReadCallback(snapshot),
+ db_(db),
+ backed_by_snapshot_(kBackedByDBSnapshot) {}
+ WritePreparedTxnReadCallback(WritePreparedTxnDB* db, SequenceNumber snapshot,
+ SequenceNumber min_uncommitted,
+ SnapshotBackup backed_by_snapshot)
+ : ReadCallback(snapshot, min_uncommitted),
+ db_(db),
+ backed_by_snapshot_(backed_by_snapshot) {
+ (void)backed_by_snapshot_; // to silence unused private field warning
+ }
+
+ virtual ~WritePreparedTxnReadCallback() {
+ // If it is not backed by snapshot, the caller must check validity
+ assert(valid_checked_ || backed_by_snapshot_ == kBackedByDBSnapshot);
+ }
+
+ // Will be called to see if the seq number is visible; if not, the caller
+ // moves on to the next seq number.
+ inline virtual bool IsVisibleFullCheck(SequenceNumber seq) override {
+ auto snapshot = max_visible_seq_;
+ bool snap_released = false;
+ auto ret =
+ db_->IsInSnapshot(seq, snapshot, min_uncommitted_, &snap_released);
+ assert(!snap_released || backed_by_snapshot_ == kUnbackedByDBSnapshot);
+ snap_released_ |= snap_released;
+ return ret;
+ }
+
+ inline bool valid() {
+ valid_checked_ = true;
+ return snap_released_ == false;
+ }
+
+ // TODO(myabandeh): override Refresh when Iterator::Refresh is supported
+ private:
+ WritePreparedTxnDB* db_;
+ // Whether max_visible_seq_ is backed by a snapshot
+ const SnapshotBackup backed_by_snapshot_;
+ bool snap_released_ = false;
+ // Safety check to ensure that the caller has checked invalid statuses
+ bool valid_checked_ = false;
+};
+
+class AddPreparedCallback : public PreReleaseCallback {
+ public:
+ AddPreparedCallback(WritePreparedTxnDB* db, DBImpl* db_impl,
+ size_t sub_batch_cnt, bool two_write_queues,
+ bool first_prepare_batch)
+ : db_(db),
+ db_impl_(db_impl),
+ sub_batch_cnt_(sub_batch_cnt),
+ two_write_queues_(two_write_queues),
+ first_prepare_batch_(first_prepare_batch) {
+ (void)two_write_queues_; // to silence unused private field warning
+ }
+ virtual Status Callback(SequenceNumber prepare_seq,
+ bool is_mem_disabled __attribute__((__unused__)),
+ uint64_t log_number, size_t index,
+ size_t total) override {
+ assert(index < total);
+ // To reduce the cost of lock acquisition competing with the concurrent
+ // prepare requests, lock on the first callback and unlock on the last.
+ const bool do_lock = !two_write_queues_ || index == 0;
+ const bool do_unlock = !two_write_queues_ || index + 1 == total;
+ // Always Prepare from the main queue
+ assert(!two_write_queues_ || !is_mem_disabled); // implies the 1st queue
+ TEST_SYNC_POINT("AddPreparedCallback::AddPrepared::begin:pause");
+ TEST_SYNC_POINT("AddPreparedCallback::AddPrepared::begin:resume");
+ if (do_lock) {
+ db_->prepared_txns_.push_pop_mutex()->Lock();
+ }
+ const bool kLocked = true;
+ for (size_t i = 0; i < sub_batch_cnt_; i++) {
+ db_->AddPrepared(prepare_seq + i, kLocked);
+ }
+ if (do_unlock) {
+ db_->prepared_txns_.push_pop_mutex()->Unlock();
+ }
+ TEST_SYNC_POINT("AddPreparedCallback::AddPrepared::end");
+ if (first_prepare_batch_) {
+ assert(log_number != 0);
+ db_impl_->logs_with_prep_tracker()->MarkLogAsContainingPrepSection(
+ log_number);
+ }
+ return Status::OK();
+ }
+
+ private:
+ WritePreparedTxnDB* db_;
+ DBImpl* db_impl_;
+ size_t sub_batch_cnt_;
+ bool two_write_queues_;
+ // It is 2PC and this is the first prepare batch. Always the case in 2PC
+ // unless it is WriteUnPrepared.
+ bool first_prepare_batch_;
+};
+
+class WritePreparedCommitEntryPreReleaseCallback : public PreReleaseCallback {
+ public:
+  // includes_data indicates that the commit also writes a non-empty
+  // CommitTimeWriteBatch to the memtable, which needs to be committed
+  // separately.
+ WritePreparedCommitEntryPreReleaseCallback(
+ WritePreparedTxnDB* db, DBImpl* db_impl, SequenceNumber prep_seq,
+ size_t prep_batch_cnt, size_t data_batch_cnt = 0,
+ SequenceNumber aux_seq = kMaxSequenceNumber, size_t aux_batch_cnt = 0)
+ : db_(db),
+ db_impl_(db_impl),
+ prep_seq_(prep_seq),
+ prep_batch_cnt_(prep_batch_cnt),
+ data_batch_cnt_(data_batch_cnt),
+ includes_data_(data_batch_cnt_ > 0),
+ aux_seq_(aux_seq),
+ aux_batch_cnt_(aux_batch_cnt),
+ includes_aux_batch_(aux_batch_cnt > 0) {
+ assert((prep_batch_cnt_ > 0) != (prep_seq == kMaxSequenceNumber)); // xor
+ assert(prep_batch_cnt_ > 0 || data_batch_cnt_ > 0);
+ assert((aux_batch_cnt_ > 0) != (aux_seq == kMaxSequenceNumber)); // xor
+ }
+
+ virtual Status Callback(SequenceNumber commit_seq,
+ bool is_mem_disabled __attribute__((__unused__)),
+ uint64_t, size_t /*index*/,
+ size_t /*total*/) override {
+ // Always commit from the 2nd queue
+ assert(!db_impl_->immutable_db_options().two_write_queues ||
+ is_mem_disabled);
+ assert(includes_data_ || prep_seq_ != kMaxSequenceNumber);
+    // The data batch is what accompanies the commit marker and affects the
+    // last seq in the commit batch.
+ const uint64_t last_commit_seq = LIKELY(data_batch_cnt_ <= 1)
+ ? commit_seq
+ : commit_seq + data_batch_cnt_ - 1;
+ if (prep_seq_ != kMaxSequenceNumber) {
+ for (size_t i = 0; i < prep_batch_cnt_; i++) {
+ db_->AddCommitted(prep_seq_ + i, last_commit_seq);
+ }
+ } // else there was no prepare phase
+ if (includes_aux_batch_) {
+ for (size_t i = 0; i < aux_batch_cnt_; i++) {
+ db_->AddCommitted(aux_seq_ + i, last_commit_seq);
+ }
+ }
+ if (includes_data_) {
+ assert(data_batch_cnt_);
+      // Commit the data that accompanies the commit request
+ for (size_t i = 0; i < data_batch_cnt_; i++) {
+        // For the commit seq of each batch use the commit seq of the last
+        // batch. This makes debugging easier by giving all the batches the
+        // same sequence number.
+ db_->AddCommitted(commit_seq + i, last_commit_seq);
+ }
+ }
+ if (db_impl_->immutable_db_options().two_write_queues) {
+ assert(is_mem_disabled); // implies the 2nd queue
+      // Publish the sequence number. We can do that here assuming the callback
+      // is invoked only from one write queue, which would guarantee that the
+      // published sequence numbers are in order, i.e., once a seq is
+      // published all the seqs prior to it are also publishable.
+ db_impl_->SetLastPublishedSequence(last_commit_seq);
+ // Note RemovePrepared should be called after publishing the seq.
+ // Otherwise SmallestUnCommittedSeq optimization breaks.
+ if (prep_seq_ != kMaxSequenceNumber) {
+ db_->RemovePrepared(prep_seq_, prep_batch_cnt_);
+ } // else there was no prepare phase
+ if (includes_aux_batch_) {
+ db_->RemovePrepared(aux_seq_, aux_batch_cnt_);
+ }
+ }
+    // else the SequenceNumber that is updated as part of the write already
+    // does the publishing
+ return Status::OK();
+ }
+
+ private:
+ WritePreparedTxnDB* db_;
+ DBImpl* db_impl_;
+ // kMaxSequenceNumber if there was no prepare phase
+ SequenceNumber prep_seq_;
+ size_t prep_batch_cnt_;
+ size_t data_batch_cnt_;
+  // Data here is the batch that is written with the commit marker, either
+  // because it is a commit without prepare or because the commit has a
+  // CommitTimeWriteBatch.
+ bool includes_data_;
+  // The auxiliary batch (if there is any) is a batch that is written earlier
+  // but gets the same commit seq as the prepare batch or data batch. This is
+  // used with two write queues, where the CommitTimeWriteBatch becomes the
+  // aux batch and we do a separate write to actually commit everything.
+ SequenceNumber aux_seq_;
+ size_t aux_batch_cnt_;
+ bool includes_aux_batch_;
+};
+
+// For two_write_queues, commits both the aborted batch and the cleanup batch
+// and then publishes the seq
+class WritePreparedRollbackPreReleaseCallback : public PreReleaseCallback {
+ public:
+ WritePreparedRollbackPreReleaseCallback(WritePreparedTxnDB* db,
+ DBImpl* db_impl,
+ SequenceNumber prep_seq,
+ SequenceNumber rollback_seq,
+ size_t prep_batch_cnt)
+ : db_(db),
+ db_impl_(db_impl),
+ prep_seq_(prep_seq),
+ rollback_seq_(rollback_seq),
+ prep_batch_cnt_(prep_batch_cnt) {
+ assert(prep_seq != kMaxSequenceNumber);
+ assert(rollback_seq != kMaxSequenceNumber);
+ assert(prep_batch_cnt_ > 0);
+ }
+
+ Status Callback(SequenceNumber commit_seq, bool is_mem_disabled, uint64_t,
+ size_t /*index*/, size_t /*total*/) override {
+ // Always commit from the 2nd queue
+ assert(is_mem_disabled); // implies the 2nd queue
+ assert(db_impl_->immutable_db_options().two_write_queues);
+#ifdef NDEBUG
+ (void)is_mem_disabled;
+#endif
+ const uint64_t last_commit_seq = commit_seq;
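+    // Mark both the rollback batch and the original prepared batches as
+    // committed with the same commit seq; the rollback writes supersede the
+    // aborted prepared data.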
+ db_->AddCommitted(rollback_seq_, last_commit_seq);
+ for (size_t i = 0; i < prep_batch_cnt_; i++) {
+ db_->AddCommitted(prep_seq_ + i, last_commit_seq);
+ }
+ db_impl_->SetLastPublishedSequence(last_commit_seq);
+ return Status::OK();
+ }
+
+ private:
+ WritePreparedTxnDB* db_;
+ DBImpl* db_impl_;
+ SequenceNumber prep_seq_;
+ SequenceNumber rollback_seq_;
+ size_t prep_batch_cnt_;
+};
+
+// Count the number of sub-batches inside a batch. A sub-batch does not have
+// duplicate keys.
+struct SubBatchCounter : public WriteBatch::Handler {
+ explicit SubBatchCounter(std::map<uint32_t, const Comparator*>& comparators)
+ : comparators_(comparators), batches_(1) {}
+ std::map<uint32_t, const Comparator*>& comparators_;
+ using CFKeys = std::set<Slice, SetComparator>;
+ std::map<uint32_t, CFKeys> keys_;
+ size_t batches_;
+ size_t BatchCount() { return batches_; }
+ void AddKey(const uint32_t cf, const Slice& key);
+ void InitWithComp(const uint32_t cf);
+ Status MarkNoop(bool) override { return Status::OK(); }
+ Status MarkEndPrepare(const Slice&) override { return Status::OK(); }
+ Status MarkCommit(const Slice&) override { return Status::OK(); }
+ Status PutCF(uint32_t cf, const Slice& key, const Slice&) override {
+ AddKey(cf, key);
+ return Status::OK();
+ }
+ Status DeleteCF(uint32_t cf, const Slice& key) override {
+ AddKey(cf, key);
+ return Status::OK();
+ }
+ Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+ AddKey(cf, key);
+ return Status::OK();
+ }
+ Status MergeCF(uint32_t cf, const Slice& key, const Slice&) override {
+ AddKey(cf, key);
+ return Status::OK();
+ }
+ Status MarkBeginPrepare(bool) override { return Status::OK(); }
+ Status MarkRollback(const Slice&) override { return Status::OK(); }
+ Handler::OptionState WriteAfterCommit() const override {
+ return Handler::OptionState::kDisabled;
+ }
+};
+
+SnapshotBackup WritePreparedTxnDB::AssignMinMaxSeqs(const Snapshot* snapshot,
+ SequenceNumber* min,
+ SequenceNumber* max) {
+ if (snapshot != nullptr) {
+ *min =
+ static_cast_with_check<const SnapshotImpl>(snapshot)->min_uncommitted_;
+ *max = static_cast_with_check<const SnapshotImpl>(snapshot)->number_;
+ // A duplicate of the check in EnhanceSnapshot().
+ assert(*min <= *max + 1);
+ return kBackedByDBSnapshot;
+ } else {
+ *min = SmallestUnCommittedSeq();
+ *max = 0; // to be assigned later after sv is referenced.
+ return kUnbackedByDBSnapshot;
+ }
+}
+
+bool WritePreparedTxnDB::ValidateSnapshot(
+ const SequenceNumber snap_seq, const SnapshotBackup backed_by_snapshot,
+ std::memory_order order) {
+ if (backed_by_snapshot == kBackedByDBSnapshot) {
+ return true;
+ } else {
+ SequenceNumber max = max_evicted_seq_.load(order);
+    // Validate that max has not advanced past the snapshot seq, which is not
+    // backed by a real snapshot. This is a very rare case that should not
+    // happen in real workloads.
+ if (UNLIKELY(snap_seq <= max && snap_seq != 0)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc b/src/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc
new file mode 100644
index 000000000..6c8c62e0e
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc
@@ -0,0 +1,790 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/transaction_test.h"
+#include "utilities/transactions/write_unprepared_txn.h"
+#include "utilities/transactions/write_unprepared_txn_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteUnpreparedTransactionTestBase : public TransactionTestBase {
+ public:
+ WriteUnpreparedTransactionTestBase(bool use_stackable_db,
+ bool two_write_queue,
+ TxnDBWritePolicy write_policy)
+ : TransactionTestBase(use_stackable_db, two_write_queue, write_policy,
+ kOrderedWrite) {}
+};
+
+class WriteUnpreparedTransactionTest
+ : public WriteUnpreparedTransactionTestBase,
+ virtual public ::testing::WithParamInterface<
+ std::tuple<bool, bool, TxnDBWritePolicy>> {
+ public:
+ WriteUnpreparedTransactionTest()
+ : WriteUnpreparedTransactionTestBase(std::get<0>(GetParam()),
+ std::get<1>(GetParam()),
+ std::get<2>(GetParam())) {}
+};
+
+INSTANTIATE_TEST_CASE_P(
+ WriteUnpreparedTransactionTest, WriteUnpreparedTransactionTest,
+ ::testing::Values(std::make_tuple(false, false, WRITE_UNPREPARED),
+ std::make_tuple(false, true, WRITE_UNPREPARED)));
+
+enum StressAction { NO_SNAPSHOT, RO_SNAPSHOT, REFRESH_SNAPSHOT };
+class WriteUnpreparedStressTest : public WriteUnpreparedTransactionTestBase,
+ virtual public ::testing::WithParamInterface<
+ std::tuple<bool, StressAction>> {
+ public:
+ WriteUnpreparedStressTest()
+ : WriteUnpreparedTransactionTestBase(false, std::get<0>(GetParam()),
+ WRITE_UNPREPARED),
+ action_(std::get<1>(GetParam())) {}
+ StressAction action_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+ WriteUnpreparedStressTest, WriteUnpreparedStressTest,
+ ::testing::Values(std::make_tuple(false, NO_SNAPSHOT),
+ std::make_tuple(false, RO_SNAPSHOT),
+ std::make_tuple(false, REFRESH_SNAPSHOT),
+ std::make_tuple(true, NO_SNAPSHOT),
+ std::make_tuple(true, RO_SNAPSHOT),
+ std::make_tuple(true, REFRESH_SNAPSHOT)));
+
+TEST_P(WriteUnpreparedTransactionTest, ReadYourOwnWrite) {
+  // The following test checks whether reading your own writes within
+  // a transaction works for write unprepared, when there are uncommitted
+  // values written into the DB.
+ auto verify_state = [](Iterator* iter, const std::string& key,
+ const std::string& value) {
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(key, iter->key().ToString());
+ ASSERT_EQ(value, iter->value().ToString());
+ };
+
+ // Test always reseeking vs never reseeking.
+ for (uint64_t max_skip : {0, std::numeric_limits<int>::max()}) {
+ options.max_sequential_skip_in_iterations = max_skip;
+ options.disable_auto_compactions = true;
+ ASSERT_OK(ReOpen());
+
+ TransactionOptions txn_options;
+ WriteOptions woptions;
+ ReadOptions roptions;
+
+ ASSERT_OK(db->Put(woptions, "a", ""));
+ ASSERT_OK(db->Put(woptions, "b", ""));
+
+ Transaction* txn = db->BeginTransaction(woptions, txn_options);
+ WriteUnpreparedTxn* wup_txn = dynamic_cast<WriteUnpreparedTxn*>(txn);
+ txn->SetSnapshot();
+
+ for (int i = 0; i < 5; i++) {
+ std::string stored_value = "v" + std::to_string(i);
+ ASSERT_OK(txn->Put("a", stored_value));
+ ASSERT_OK(txn->Put("b", stored_value));
+ ASSERT_OK(wup_txn->FlushWriteBatchToDB(false));
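+      // The unprepared batch now lives in the DB rather than in the local
+      // write batch, so the reads below exercise the read-your-own-write path
+      // that checks unprep_seqs_ for visibility.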
+
+ // Test Get()
+ std::string value;
+ ASSERT_OK(txn->Get(roptions, "a", &value));
+ ASSERT_EQ(value, stored_value);
+ ASSERT_OK(txn->Get(roptions, "b", &value));
+ ASSERT_EQ(value, stored_value);
+
+ // Test Next()
+ auto iter = txn->GetIterator(roptions);
+ iter->Seek("a");
+ verify_state(iter, "a", stored_value);
+
+ iter->Next();
+ verify_state(iter, "b", stored_value);
+
+ iter->SeekToFirst();
+ verify_state(iter, "a", stored_value);
+
+ iter->Next();
+ verify_state(iter, "b", stored_value);
+
+ delete iter;
+
+ // Test Prev()
+ iter = txn->GetIterator(roptions);
+ iter->SeekForPrev("b");
+ verify_state(iter, "b", stored_value);
+
+ iter->Prev();
+ verify_state(iter, "a", stored_value);
+
+ iter->SeekToLast();
+ verify_state(iter, "b", stored_value);
+
+ iter->Prev();
+ verify_state(iter, "a", stored_value);
+
+ delete iter;
+ }
+
+ delete txn;
+ }
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+TEST_P(WriteUnpreparedStressTest, ReadYourOwnWriteStress) {
+ // This is a stress test where different threads are writing random keys, and
+  // then, before committing or aborting the transaction, each thread validates
+  // that it can read the keys it wrote, and that keys it did not write respect
+ // the snapshot. To avoid row lock contention (and simply stressing the
+ // locking system), each thread is mostly only writing to its own set of keys.
+ const uint32_t kNumIter = 1000;
+ const uint32_t kNumThreads = 10;
+ const uint32_t kNumKeys = 5;
+
+ // Test with
+ // 1. no snapshots set
+ // 2. snapshot set on ReadOptions
+ // 3. snapshot set, and refreshing after every write.
+ StressAction a = action_;
+ WriteOptions write_options;
+ txn_db_options.transaction_lock_timeout = -1;
+ options.disable_auto_compactions = true;
+ ASSERT_OK(ReOpen());
+
+ std::vector<std::string> keys;
+ for (uint32_t k = 0; k < kNumKeys * kNumThreads; k++) {
+ keys.push_back("k" + std::to_string(k));
+ }
+ RandomShuffle(keys.begin(), keys.end());
+
+ // This counter will act as a "sequence number" to help us validate
+ // visibility logic with snapshots. If we had direct access to the seqno of
+  // snapshots and key/values, we would compare those directly instead.
+ std::atomic<int64_t> counter(0);
+
+ std::function<void(uint32_t)> stress_thread = [&](int id) {
+ size_t tid = std::hash<std::thread::id>()(std::this_thread::get_id());
+ Random64 rnd(static_cast<uint32_t>(tid));
+
+ Transaction* txn;
+ TransactionOptions txn_options;
+ // batch_size of 1 causes writes to DB for every marker.
+ txn_options.write_batch_flush_threshold = 1;
+ ReadOptions read_options;
+
+ for (uint32_t i = 0; i < kNumIter; i++) {
+ std::set<std::string> owned_keys(keys.begin() + id * kNumKeys,
+ keys.begin() + (id + 1) * kNumKeys);
+ // Add unowned keys to make the workload more interesting, but this
+ // increases row lock contention, so just do it sometimes.
+ if (rnd.OneIn(2)) {
+ owned_keys.insert(keys[rnd.Uniform(kNumKeys * kNumThreads)]);
+ }
+
+ txn = db->BeginTransaction(write_options, txn_options);
+ ASSERT_OK(txn->SetName(std::to_string(id)));
+ txn->SetSnapshot();
+ if (a >= RO_SNAPSHOT) {
+ read_options.snapshot = txn->GetSnapshot();
+ ASSERT_TRUE(read_options.snapshot != nullptr);
+ }
+
+ uint64_t buf[2];
+ buf[0] = id;
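+      // Each value is 16 bytes: the writer's id followed by a monotonically
+      // increasing counter value that acts as a logical timestamp.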
+
+ // When scanning through the database, make sure that all unprepared
+ // keys have value >= snapshot and all other keys have value < snapshot.
+ int64_t snapshot_num = counter.fetch_add(1);
+
+ Status s;
+ for (const auto& key : owned_keys) {
+ buf[1] = counter.fetch_add(1);
+ s = txn->Put(key, Slice((const char*)buf, sizeof(buf)));
+ if (!s.ok()) {
+ break;
+ }
+ if (a == REFRESH_SNAPSHOT) {
+ txn->SetSnapshot();
+ read_options.snapshot = txn->GetSnapshot();
+ snapshot_num = counter.fetch_add(1);
+ }
+ }
+
+ // Failure is possible due to snapshot validation. In this case,
+ // rollback and move onto next iteration.
+ if (!s.ok()) {
+ ASSERT_TRUE(s.IsBusy());
+ ASSERT_OK(txn->Rollback());
+ delete txn;
+ continue;
+ }
+
+ auto verify_key = [&owned_keys, &a, &id, &snapshot_num](
+ const std::string& key, const std::string& value) {
+ if (owned_keys.count(key) > 0) {
+ ASSERT_EQ(value.size(), 16);
+
+        // Since this key is part of owned_keys, it must have been written
+        // (unprepared) by this transaction, identified by 'id'.
+ ASSERT_EQ(((int64_t*)value.c_str())[0], id);
+ if (a == REFRESH_SNAPSHOT) {
+ // If refresh snapshot is true, then the snapshot is refreshed
+ // after every Put(), meaning that the current snapshot in
+ // snapshot_num must be greater than the "seqno" of any keys
+ // written by the current transaction.
+ ASSERT_LT(((int64_t*)value.c_str())[1], snapshot_num);
+ } else {
+ // If refresh snapshot is not on, then the snapshot was taken at
+ // the beginning of the transaction, meaning all writes must come
+ // after snapshot_num
+ ASSERT_GT(((int64_t*)value.c_str())[1], snapshot_num);
+ }
+ } else if (a >= RO_SNAPSHOT) {
+ // If this is not an unprepared key, just assert that the key
+ // "seqno" is smaller than the snapshot seqno.
+ ASSERT_EQ(value.size(), 16);
+ ASSERT_LT(((int64_t*)value.c_str())[1], snapshot_num);
+ }
+ };
+
+ // Validate Get()/Next()/Prev(). Do only one of them to save time, and
+ // reduce lock contention.
+ switch (rnd.Uniform(3)) {
+ case 0: // Validate Get()
+ {
+ for (const auto& key : keys) {
+ std::string value;
+ s = txn->Get(read_options, Slice(key), &value);
+ if (!s.ok()) {
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ(owned_keys.count(key), 0);
+ } else {
+ verify_key(key, value);
+ }
+ }
+ break;
+ }
+ case 1: // Validate Next()
+ {
+ Iterator* iter = txn->GetIterator(read_options);
+ ASSERT_OK(iter->status());
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ verify_key(iter->key().ToString(), iter->value().ToString());
+ }
+ ASSERT_OK(iter->status());
+ delete iter;
+ break;
+ }
+ case 2: // Validate Prev()
+ {
+ Iterator* iter = txn->GetIterator(read_options);
+ ASSERT_OK(iter->status());
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ verify_key(iter->key().ToString(), iter->value().ToString());
+ }
+ ASSERT_OK(iter->status());
+ delete iter;
+ break;
+ }
+ default:
+ FAIL();
+ }
+
+ if (rnd.OneIn(2)) {
+ ASSERT_OK(txn->Commit());
+ } else {
+ ASSERT_OK(txn->Rollback());
+ }
+ delete txn;
+ }
+ };
+
+ std::vector<port::Thread> threads;
+ for (uint32_t i = 0; i < kNumThreads; i++) {
+ threads.emplace_back(stress_thread, i);
+ }
+
+ for (auto& t : threads) {
+ t.join();
+ }
+}
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+// This tests how write unprepared behaves during recovery when the DB crashes
+// after a transaction has either been unprepared or prepared, and tests
+// whether the changes are correctly applied for prepared transactions when we
+// decide to roll back or commit.
+TEST_P(WriteUnpreparedTransactionTest, RecoveryTest) {
+ WriteOptions write_options;
+ write_options.disableWAL = false;
+ TransactionOptions txn_options;
+ std::vector<Transaction*> prepared_trans;
+ WriteUnpreparedTxnDB* wup_db;
+ options.disable_auto_compactions = true;
+
+ enum Action { UNPREPARED, ROLLBACK, COMMIT };
+
+ // batch_size of 1 causes writes to DB for every marker.
+ for (size_t batch_size : {1, 1000000}) {
+ txn_options.write_batch_flush_threshold = batch_size;
+ for (bool empty : {true, false}) {
+ for (Action a : {UNPREPARED, ROLLBACK, COMMIT}) {
+ for (int num_batches = 1; num_batches < 10; num_batches++) {
+ // Reset database.
+ prepared_trans.clear();
+ ASSERT_OK(ReOpen());
+ wup_db = dynamic_cast<WriteUnpreparedTxnDB*>(db);
+ if (!empty) {
+ for (int i = 0; i < num_batches; i++) {
+ ASSERT_OK(db->Put(WriteOptions(), "k" + std::to_string(i),
+ "before value" + std::to_string(i)));
+ }
+ }
+
+ // Write num_batches unprepared batches.
+ Transaction* txn = db->BeginTransaction(write_options, txn_options);
+ WriteUnpreparedTxn* wup_txn = dynamic_cast<WriteUnpreparedTxn*>(txn);
+ ASSERT_OK(txn->SetName("xid"));
+ for (int i = 0; i < num_batches; i++) {
+ ASSERT_OK(
+ txn->Put("k" + std::to_string(i), "value" + std::to_string(i)));
+ if (txn_options.write_batch_flush_threshold == 1) {
+ // WriteUnprepared will check write_batch_flush_threshold and
+ // possibly flush before appending to the write batch. No flush
+ // will happen at the first write because the batch is still
+ // empty, so after k puts, there should be k-1 flushed batches.
+ ASSERT_EQ(wup_txn->GetUnpreparedSequenceNumbers().size(), i);
+ } else {
+ ASSERT_EQ(wup_txn->GetUnpreparedSequenceNumbers().size(), 0);
+ }
+ }
+ if (a == UNPREPARED) {
+ // This is done to prevent the destructor from rolling back the
+ // transaction for us, since we want to pretend we crashed and
+ // test that recovery does the rollback.
+ wup_txn->unprep_seqs_.clear();
+ } else {
+ ASSERT_OK(txn->Prepare());
+ }
+ delete txn;
+
+ // Crash and run recovery code paths.
+ ASSERT_OK(wup_db->db_impl_->FlushWAL(true));
+ wup_db->TEST_Crash();
+ ASSERT_OK(ReOpenNoDelete());
+ assert(db != nullptr);
+
+ db->GetAllPreparedTransactions(&prepared_trans);
+ ASSERT_EQ(prepared_trans.size(), a == UNPREPARED ? 0 : 1);
+ if (a == ROLLBACK) {
+ ASSERT_OK(prepared_trans[0]->Rollback());
+ delete prepared_trans[0];
+ } else if (a == COMMIT) {
+ ASSERT_OK(prepared_trans[0]->Commit());
+ delete prepared_trans[0];
+ }
+
+ Iterator* iter = db->NewIterator(ReadOptions());
+ ASSERT_OK(iter->status());
+ iter->SeekToFirst();
+          // Check the DB contents: committed values for COMMIT, otherwise
+          // the original "before" values.
+ if (!empty || a == COMMIT) {
+ for (int i = 0; i < num_batches; i++) {
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "k" + std::to_string(i));
+ if (a == COMMIT) {
+ ASSERT_EQ(iter->value().ToString(),
+ "value" + std::to_string(i));
+ } else {
+ ASSERT_EQ(iter->value().ToString(),
+ "before value" + std::to_string(i));
+ }
+ iter->Next();
+ }
+ }
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ delete iter;
+ }
+ }
+ }
+ }
+}
+
+// Basic test to see that an unprepared batch gets written to the DB when the
+// batch size is exceeded. It also does some basic checks to see if
+// commit/rollback works as expected for write unprepared.
+TEST_P(WriteUnpreparedTransactionTest, UnpreparedBatch) {
+ WriteOptions write_options;
+ TransactionOptions txn_options;
+ const int kNumKeys = 10;
+
+ // batch_size of 1 causes writes to DB for every marker.
+ for (size_t batch_size : {1, 1000000}) {
+ txn_options.write_batch_flush_threshold = batch_size;
+ for (bool prepare : {false, true}) {
+ for (bool commit : {false, true}) {
+ ASSERT_OK(ReOpen());
+ Transaction* txn = db->BeginTransaction(write_options, txn_options);
+ WriteUnpreparedTxn* wup_txn = dynamic_cast<WriteUnpreparedTxn*>(txn);
+ ASSERT_OK(txn->SetName("xid"));
+
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(txn->Put("k" + std::to_string(i), "v" + std::to_string(i)));
+ if (txn_options.write_batch_flush_threshold == 1) {
+ // WriteUnprepared will check write_batch_flush_threshold and
+ // possibly flush before appending to the write batch. No flush will
+ // happen at the first write because the batch is still empty, so
+ // after k puts, there should be k-1 flushed batches.
+ ASSERT_EQ(wup_txn->GetUnpreparedSequenceNumbers().size(), i);
+ } else {
+ ASSERT_EQ(wup_txn->GetUnpreparedSequenceNumbers().size(), 0);
+ }
+ }
+
+ if (prepare) {
+ ASSERT_OK(txn->Prepare());
+ }
+
+ Iterator* iter = db->NewIterator(ReadOptions());
+ ASSERT_OK(iter->status());
+ iter->SeekToFirst();
+ assert(!iter->Valid());
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ delete iter;
+
+ if (commit) {
+ ASSERT_OK(txn->Commit());
+ } else {
+ ASSERT_OK(txn->Rollback());
+ }
+ delete txn;
+
+ iter = db->NewIterator(ReadOptions());
+ ASSERT_OK(iter->status());
+ iter->SeekToFirst();
+
+ for (int i = 0; i < (commit ? kNumKeys : 0); i++) {
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "k" + std::to_string(i));
+ ASSERT_EQ(iter->value().ToString(), "v" + std::to_string(i));
+ iter->Next();
+ }
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ delete iter;
+ }
+ }
+ }
+}
+
+// Test whether logs containing unprepared/prepared batches are kept even
+// after the memtable finishes flushing, and whether they are removed when
+// the transaction commits/aborts.
+//
+// TODO(lth): Merge with TransactionTest/TwoPhaseLogRollingTest tests.
+TEST_P(WriteUnpreparedTransactionTest, MarkLogWithPrepSection) {
+ WriteOptions write_options;
+ TransactionOptions txn_options;
+ // batch_size of 1 causes writes to DB for every marker.
+ txn_options.write_batch_flush_threshold = 1;
+ const int kNumKeys = 10;
+
+ WriteOptions wopts;
+ wopts.sync = true;
+
+ for (bool prepare : {false, true}) {
+ for (bool commit : {false, true}) {
+ ASSERT_OK(ReOpen());
+ auto wup_db = dynamic_cast<WriteUnpreparedTxnDB*>(db);
+ auto db_impl = wup_db->db_impl_;
+
+ Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_OK(txn1->SetName("xid1"));
+
+ Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_OK(txn2->SetName("xid2"));
+
+ // Spread this transaction across multiple log files.
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(txn1->Put("k1" + std::to_string(i), "v" + std::to_string(i)));
+ if (i >= kNumKeys / 2) {
+ ASSERT_OK(
+ txn2->Put("k2" + std::to_string(i), "v" + std::to_string(i)));
+ }
+
+ if (i > 0) {
+ ASSERT_OK(db_impl->TEST_SwitchWAL());
+ }
+ }
+
+ ASSERT_GT(txn1->GetLogNumber(), 0);
+ ASSERT_GT(txn2->GetLogNumber(), 0);
+
+ ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(),
+ txn1->GetLogNumber());
+ ASSERT_GT(db_impl->TEST_LogfileNumber(), txn1->GetLogNumber());
+
+ if (prepare) {
+ ASSERT_OK(txn1->Prepare());
+ ASSERT_OK(txn2->Prepare());
+ }
+
+ ASSERT_GE(db_impl->TEST_LogfileNumber(), txn1->GetLogNumber());
+ ASSERT_GE(db_impl->TEST_LogfileNumber(), txn2->GetLogNumber());
+
+ ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(),
+ txn1->GetLogNumber());
+ if (commit) {
+ ASSERT_OK(txn1->Commit());
+ } else {
+ ASSERT_OK(txn1->Rollback());
+ }
+
+ ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(),
+ txn2->GetLogNumber());
+
+ if (commit) {
+ ASSERT_OK(txn2->Commit());
+ } else {
+ ASSERT_OK(txn2->Rollback());
+ }
+
+ ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
+
+ delete txn1;
+ delete txn2;
+ }
+ }
+}
+
+TEST_P(WriteUnpreparedTransactionTest, NoSnapshotWrite) {
+ WriteOptions woptions;
+ TransactionOptions txn_options;
+ txn_options.write_batch_flush_threshold = 1;
+
+ Transaction* txn = db->BeginTransaction(woptions, txn_options);
+
+ // Do some writes with no snapshot
+ ASSERT_OK(txn->Put("a", "a"));
+ ASSERT_OK(txn->Put("b", "b"));
+ ASSERT_OK(txn->Put("c", "c"));
+
+ // Test that it is still possible to create iterators after writes with no
+  // snapshot, if the iterator snapshot is fresh enough.
+ ReadOptions roptions;
+ auto iter = txn->GetIterator(roptions);
+ ASSERT_OK(iter->status());
+ int keys = 0;
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev(), keys++) {
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key().ToString(), iter->value().ToString());
+ }
+ ASSERT_EQ(keys, 3);
+ ASSERT_OK(iter->status());
+
+ delete iter;
+ delete txn;
+}
+
+// Test whether write to a transaction while iterating is supported.
+TEST_P(WriteUnpreparedTransactionTest, IterateAndWrite) {
+ WriteOptions woptions;
+ TransactionOptions txn_options;
+ txn_options.write_batch_flush_threshold = 1;
+
+ enum Action { DO_DELETE, DO_UPDATE };
+
+ for (Action a : {DO_DELETE, DO_UPDATE}) {
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(db->Put(woptions, std::to_string(i), std::to_string(i)));
+ }
+
+ Transaction* txn = db->BeginTransaction(woptions, txn_options);
+ // write_batch_ now contains 1 key.
+ ASSERT_OK(txn->Put("9", "a"));
+
+ ReadOptions roptions;
+ auto iter = txn->GetIterator(roptions);
+ ASSERT_OK(iter->status());
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ if (iter->key() == "9") {
+ ASSERT_EQ(iter->value().ToString(), "a");
+ } else {
+ ASSERT_EQ(iter->key().ToString(), iter->value().ToString());
+ }
+
+ if (a == DO_DELETE) {
+ ASSERT_OK(txn->Delete(iter->key()));
+ } else {
+ ASSERT_OK(txn->Put(iter->key(), "b"));
+ }
+ }
+ ASSERT_OK(iter->status());
+
+ delete iter;
+ ASSERT_OK(txn->Commit());
+
+ iter = db->NewIterator(roptions);
+ ASSERT_OK(iter->status());
+ if (a == DO_DELETE) {
+ // Check that db is empty.
+ iter->SeekToFirst();
+ ASSERT_FALSE(iter->Valid());
+ } else {
+ int keys = 0;
+ // Check that all values are updated to b.
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next(), keys++) {
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->value().ToString(), "b");
+ }
+ ASSERT_EQ(keys, 100);
+ }
+ ASSERT_OK(iter->status());
+
+ delete iter;
+ delete txn;
+ }
+}
+
+// Test that using an iterator after transaction clear is not supported
+TEST_P(WriteUnpreparedTransactionTest, IterateAfterClear) {
+ WriteOptions woptions;
+ TransactionOptions txn_options;
+ txn_options.write_batch_flush_threshold = 1;
+
+ enum Action { kCommit, kRollback };
+
+ for (Action a : {kCommit, kRollback}) {
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(db->Put(woptions, std::to_string(i), std::to_string(i)));
+ }
+
+ Transaction* txn = db->BeginTransaction(woptions, txn_options);
+ ASSERT_OK(txn->Put("9", "a"));
+
+ ReadOptions roptions;
+ auto iter1 = txn->GetIterator(roptions);
+ auto iter2 = txn->GetIterator(roptions);
+ iter1->SeekToFirst();
+ iter2->Seek("9");
+
+ // Check that iterators are valid before transaction finishes.
+ ASSERT_TRUE(iter1->Valid());
+ ASSERT_TRUE(iter2->Valid());
+ ASSERT_OK(iter1->status());
+ ASSERT_OK(iter2->status());
+
+ if (a == kCommit) {
+ ASSERT_OK(txn->Commit());
+ } else {
+ ASSERT_OK(txn->Rollback());
+ }
+
+ // Check that iterators are invalidated after transaction finishes.
+ ASSERT_FALSE(iter1->Valid());
+ ASSERT_FALSE(iter2->Valid());
+ ASSERT_TRUE(iter1->status().IsInvalidArgument());
+ ASSERT_TRUE(iter2->status().IsInvalidArgument());
+
+ delete iter1;
+ delete iter2;
+ delete txn;
+ }
+}
+
+TEST_P(WriteUnpreparedTransactionTest, SavePoint) {
+ WriteOptions woptions;
+ TransactionOptions txn_options;
+ txn_options.write_batch_flush_threshold = 1;
+
+ Transaction* txn = db->BeginTransaction(woptions, txn_options);
+ txn->SetSavePoint();
+ ASSERT_OK(txn->Put("a", "a"));
+ ASSERT_OK(txn->Put("b", "b"));
+ ASSERT_OK(txn->Commit());
+
+ ReadOptions roptions;
+ std::string value;
+ ASSERT_OK(txn->Get(roptions, "a", &value));
+ ASSERT_EQ(value, "a");
+ ASSERT_OK(txn->Get(roptions, "b", &value));
+ ASSERT_EQ(value, "b");
+ delete txn;
+}
+
+TEST_P(WriteUnpreparedTransactionTest, UntrackedKeys) {
+ WriteOptions woptions;
+ TransactionOptions txn_options;
+ txn_options.write_batch_flush_threshold = 1;
+
+ Transaction* txn = db->BeginTransaction(woptions, txn_options);
+ auto wb = txn->GetWriteBatch()->GetWriteBatch();
+ ASSERT_OK(txn->Put("a", "a"));
+ ASSERT_OK(wb->Put("a_untrack", "a_untrack"));
+ txn->SetSavePoint();
+ ASSERT_OK(txn->Put("b", "b"));
+ ASSERT_OK(txn->Put("b_untrack", "b_untrack"));
+
+ ReadOptions roptions;
+ std::string value;
+ ASSERT_OK(txn->Get(roptions, "a", &value));
+ ASSERT_EQ(value, "a");
+ ASSERT_OK(txn->Get(roptions, "a_untrack", &value));
+ ASSERT_EQ(value, "a_untrack");
+ ASSERT_OK(txn->Get(roptions, "b", &value));
+ ASSERT_EQ(value, "b");
+ ASSERT_OK(txn->Get(roptions, "b_untrack", &value));
+ ASSERT_EQ(value, "b_untrack");
+
+ // b and b_untrack should be rolled back.
+ ASSERT_OK(txn->RollbackToSavePoint());
+ ASSERT_OK(txn->Get(roptions, "a", &value));
+ ASSERT_EQ(value, "a");
+ ASSERT_OK(txn->Get(roptions, "a_untrack", &value));
+ ASSERT_EQ(value, "a_untrack");
+ auto s = txn->Get(roptions, "b", &value);
+ ASSERT_TRUE(s.IsNotFound());
+ s = txn->Get(roptions, "b_untrack", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ // Everything should be rolled back.
+ ASSERT_OK(txn->Rollback());
+ s = txn->Get(roptions, "a", &value);
+ ASSERT_TRUE(s.IsNotFound());
+ s = txn->Get(roptions, "a_untrack", &value);
+ ASSERT_TRUE(s.IsNotFound());
+ s = txn->Get(roptions, "b", &value);
+ ASSERT_TRUE(s.IsNotFound());
+ s = txn->Get(roptions, "b_untrack", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ delete txn;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as Transactions are not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_unprepared_txn.cc b/src/rocksdb/utilities/transactions/write_unprepared_txn.cc
new file mode 100644
index 000000000..6e04d3344
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/write_unprepared_txn.cc
@@ -0,0 +1,1053 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/write_unprepared_txn.h"
+
+#include "db/db_impl/db_impl.h"
+#include "util/cast_util.h"
+#include "utilities/transactions/write_unprepared_txn_db.h"
+#include "utilities/write_batch_with_index/write_batch_with_index_internal.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool WriteUnpreparedTxnReadCallback::IsVisibleFullCheck(SequenceNumber seq) {
+ // Since unprep_seqs maps prep_seq => prepare_batch_cnt, to check if seq is
+ // in unprep_seqs, we have to check if seq is equal to prep_seq or any of
+ // the prepare_batch_cnt seq nums after it.
+ //
+ // TODO(lth): Can be optimized with std::lower_bound if unprep_seqs is
+ // large.
+ for (const auto& it : unprep_seqs_) {
+ if (it.first <= seq && seq < it.first + it.second) {
+ return true;
+ }
+ }
+
+ bool snap_released = false;
+ auto ret =
+ db_->IsInSnapshot(seq, wup_snapshot_, min_uncommitted_, &snap_released);
+ assert(!snap_released || backed_by_snapshot_ == kUnbackedByDBSnapshot);
+ snap_released_ |= snap_released;
+ return ret;
+}
+
+WriteUnpreparedTxn::WriteUnpreparedTxn(WriteUnpreparedTxnDB* txn_db,
+ const WriteOptions& write_options,
+ const TransactionOptions& txn_options)
+ : WritePreparedTxn(txn_db, write_options, txn_options),
+ wupt_db_(txn_db),
+ last_log_number_(0),
+ recovered_txn_(false),
+ largest_validated_seq_(0) {
+ if (txn_options.write_batch_flush_threshold < 0) {
+ write_batch_flush_threshold_ =
+ txn_db_impl_->GetTxnDBOptions().default_write_batch_flush_threshold;
+ } else {
+ write_batch_flush_threshold_ = txn_options.write_batch_flush_threshold;
+ }
+}
+
+WriteUnpreparedTxn::~WriteUnpreparedTxn() {
+ if (!unprep_seqs_.empty()) {
+ assert(log_number_ > 0);
+ assert(GetId() > 0);
+ assert(!name_.empty());
+
+ // We should rollback regardless of GetState, but some unit tests that
+ // test crash recovery run the destructor assuming that rollback does not
+ // happen, so that rollback during recovery can be exercised.
+ if (GetState() == STARTED || GetState() == LOCKS_STOLEN) {
+ auto s = RollbackInternal();
+ assert(s.ok());
+ if (!s.ok()) {
+ ROCKS_LOG_FATAL(
+ wupt_db_->info_log_,
+ "Rollback of WriteUnprepared transaction failed in destructor: %s",
+ s.ToString().c_str());
+ }
+ dbimpl_->logs_with_prep_tracker()->MarkLogAsHavingPrepSectionFlushed(
+ log_number_);
+ }
+ }
+
+ // Clear the tracked locks so that ~PessimisticTransaction does not
+ // try to unlock keys for recovered transactions.
+ if (recovered_txn_) {
+ tracked_locks_->Clear();
+ }
+}
+
+void WriteUnpreparedTxn::Initialize(const TransactionOptions& txn_options) {
+ PessimisticTransaction::Initialize(txn_options);
+ if (txn_options.write_batch_flush_threshold < 0) {
+ write_batch_flush_threshold_ =
+ txn_db_impl_->GetTxnDBOptions().default_write_batch_flush_threshold;
+ } else {
+ write_batch_flush_threshold_ = txn_options.write_batch_flush_threshold;
+ }
+
+ unprep_seqs_.clear();
+ flushed_save_points_.reset(nullptr);
+ unflushed_save_points_.reset(nullptr);
+ recovered_txn_ = false;
+ largest_validated_seq_ = 0;
+ assert(active_iterators_.empty());
+ active_iterators_.clear();
+ untracked_keys_.clear();
+}
+
+Status WriteUnpreparedTxn::HandleWrite(std::function<Status()> do_write) {
+ Status s;
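+  // Flushing the batch to the DB would invalidate iterators layered on top of
+  // the local write batch, so only flush when no iterators are active.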
+ if (active_iterators_.empty()) {
+ s = MaybeFlushWriteBatchToDB();
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ s = do_write();
+ if (s.ok()) {
+ if (snapshot_) {
+ largest_validated_seq_ =
+ std::max(largest_validated_seq_, snapshot_->GetSequenceNumber());
+ } else {
+ // TODO(lth): We should use the same number as tracked_at_seq in TryLock,
+      // because what is actually being tracked is the sequence number at
+      // which this key was locked.
+ largest_validated_seq_ = db_impl_->GetLastPublishedSequence();
+ }
+ }
+ return s;
+}
+
+Status WriteUnpreparedTxn::Put(ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value,
+ const bool assume_tracked) {
+ return HandleWrite([&]() {
+ return TransactionBaseImpl::Put(column_family, key, value, assume_tracked);
+ });
+}
+
+Status WriteUnpreparedTxn::Put(ColumnFamilyHandle* column_family,
+ const SliceParts& key, const SliceParts& value,
+ const bool assume_tracked) {
+ return HandleWrite([&]() {
+ return TransactionBaseImpl::Put(column_family, key, value, assume_tracked);
+ });
+}
+
+Status WriteUnpreparedTxn::Merge(ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value,
+ const bool assume_tracked) {
+ return HandleWrite([&]() {
+ return TransactionBaseImpl::Merge(column_family, key, value,
+ assume_tracked);
+ });
+}
+
+Status WriteUnpreparedTxn::Delete(ColumnFamilyHandle* column_family,
+ const Slice& key, const bool assume_tracked) {
+ return HandleWrite([&]() {
+ return TransactionBaseImpl::Delete(column_family, key, assume_tracked);
+ });
+}
+
+Status WriteUnpreparedTxn::Delete(ColumnFamilyHandle* column_family,
+ const SliceParts& key,
+ const bool assume_tracked) {
+ return HandleWrite([&]() {
+ return TransactionBaseImpl::Delete(column_family, key, assume_tracked);
+ });
+}
+
+Status WriteUnpreparedTxn::SingleDelete(ColumnFamilyHandle* column_family,
+ const Slice& key,
+ const bool assume_tracked) {
+ return HandleWrite([&]() {
+ return TransactionBaseImpl::SingleDelete(column_family, key,
+ assume_tracked);
+ });
+}
+
+Status WriteUnpreparedTxn::SingleDelete(ColumnFamilyHandle* column_family,
+ const SliceParts& key,
+ const bool assume_tracked) {
+ return HandleWrite([&]() {
+ return TransactionBaseImpl::SingleDelete(column_family, key,
+ assume_tracked);
+ });
+}
+
+// WriteUnpreparedTxn::RebuildFromWriteBatch is only called on recovery. For
+// WriteUnprepared, the write batches have already been written into the
+// database during WAL replay, so all we have to do is "retrack" the keys
+// so that rollbacks are possible.
+//
+// Calling TryLock instead of TrackKey is also possible, but as an optimization,
+// recovered transactions do not hold locks on their keys. This follows the
+// implementation in PessimisticTransactionDB::Initialize where we set
+// skip_concurrency_control to true.
+Status WriteUnpreparedTxn::RebuildFromWriteBatch(WriteBatch* wb) {
+ struct TrackKeyHandler : public WriteBatch::Handler {
+ WriteUnpreparedTxn* txn_;
+ bool rollback_merge_operands_;
+
+ TrackKeyHandler(WriteUnpreparedTxn* txn, bool rollback_merge_operands)
+ : txn_(txn), rollback_merge_operands_(rollback_merge_operands) {}
+
+ Status PutCF(uint32_t cf, const Slice& key, const Slice&) override {
+ txn_->TrackKey(cf, key.ToString(), kMaxSequenceNumber,
+ false /* read_only */, true /* exclusive */);
+ return Status::OK();
+ }
+
+ Status DeleteCF(uint32_t cf, const Slice& key) override {
+ txn_->TrackKey(cf, key.ToString(), kMaxSequenceNumber,
+ false /* read_only */, true /* exclusive */);
+ return Status::OK();
+ }
+
+ Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+ txn_->TrackKey(cf, key.ToString(), kMaxSequenceNumber,
+ false /* read_only */, true /* exclusive */);
+ return Status::OK();
+ }
+
+ Status MergeCF(uint32_t cf, const Slice& key, const Slice&) override {
+ if (rollback_merge_operands_) {
+ txn_->TrackKey(cf, key.ToString(), kMaxSequenceNumber,
+ false /* read_only */, true /* exclusive */);
+ }
+ return Status::OK();
+ }
+
+ // Recovered batches do not contain 2PC markers.
+ Status MarkBeginPrepare(bool) override { return Status::InvalidArgument(); }
+
+ Status MarkEndPrepare(const Slice&) override {
+ return Status::InvalidArgument();
+ }
+
+ Status MarkNoop(bool) override { return Status::InvalidArgument(); }
+
+ Status MarkCommit(const Slice&) override {
+ return Status::InvalidArgument();
+ }
+
+ Status MarkRollback(const Slice&) override {
+ return Status::InvalidArgument();
+ }
+ };
+
+ TrackKeyHandler handler(this,
+ wupt_db_->txn_db_options_.rollback_merge_operands);
+ return wb->Iterate(&handler);
+}
+
+Status WriteUnpreparedTxn::MaybeFlushWriteBatchToDB() {
+ const bool kPrepared = true;
+ Status s;
+ if (write_batch_flush_threshold_ > 0 &&
+ write_batch_.GetWriteBatch()->Count() > 0 &&
+ write_batch_.GetDataSize() >
+ static_cast<size_t>(write_batch_flush_threshold_)) {
+ assert(GetState() != PREPARED);
+ s = FlushWriteBatchToDB(!kPrepared);
+ }
+ return s;
+}
+
+Status WriteUnpreparedTxn::FlushWriteBatchToDB(bool prepared) {
+ // If the current write batch contains savepoints, then some special handling
+ // is required so that RollbackToSavepoint can work.
+ //
+ // RollbackToSavepoint is not supported after Prepare() is called, so only do
+ // this for unprepared batches.
+ if (!prepared && unflushed_save_points_ != nullptr &&
+ !unflushed_save_points_->empty()) {
+ return FlushWriteBatchWithSavePointToDB();
+ }
+
+ return FlushWriteBatchToDBInternal(prepared);
+}
+
+Status WriteUnpreparedTxn::FlushWriteBatchToDBInternal(bool prepared) {
+ if (name_.empty()) {
+ assert(!prepared);
+#ifndef NDEBUG
+ static std::atomic_ullong autogen_id{0};
+ // To avoid changing all tests to call SetName, just autogenerate one.
+ if (wupt_db_->txn_db_options_.autogenerate_name) {
+ auto s = SetName(std::string("autoxid") +
+ std::to_string(autogen_id.fetch_add(1)));
+ assert(s.ok());
+ } else
+#endif
+ {
+ return Status::InvalidArgument("Cannot write to DB without SetName.");
+ }
+ }
+
+ struct UntrackedKeyHandler : public WriteBatch::Handler {
+ WriteUnpreparedTxn* txn_;
+ bool rollback_merge_operands_;
+
+ UntrackedKeyHandler(WriteUnpreparedTxn* txn, bool rollback_merge_operands)
+ : txn_(txn), rollback_merge_operands_(rollback_merge_operands) {}
+
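+    // Record keys that are about to be written to the DB without a point lock
+    // held, so that RollbackInternal can later restore their old values.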
+ Status AddUntrackedKey(uint32_t cf, const Slice& key) {
+ auto str = key.ToString();
+ PointLockStatus lock_status =
+ txn_->tracked_locks_->GetPointLockStatus(cf, str);
+ if (!lock_status.locked) {
+ txn_->untracked_keys_[cf].push_back(str);
+ }
+ return Status::OK();
+ }
+
+ Status PutCF(uint32_t cf, const Slice& key, const Slice&) override {
+ return AddUntrackedKey(cf, key);
+ }
+
+ Status DeleteCF(uint32_t cf, const Slice& key) override {
+ return AddUntrackedKey(cf, key);
+ }
+
+ Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+ return AddUntrackedKey(cf, key);
+ }
+
+ Status MergeCF(uint32_t cf, const Slice& key, const Slice&) override {
+ if (rollback_merge_operands_) {
+ return AddUntrackedKey(cf, key);
+ }
+ return Status::OK();
+ }
+
+ // The only expected 2PC marker is the initial Noop marker.
+ Status MarkNoop(bool empty_batch) override {
+ return empty_batch ? Status::OK() : Status::InvalidArgument();
+ }
+
+ Status MarkBeginPrepare(bool) override { return Status::InvalidArgument(); }
+
+ Status MarkEndPrepare(const Slice&) override {
+ return Status::InvalidArgument();
+ }
+
+ Status MarkCommit(const Slice&) override {
+ return Status::InvalidArgument();
+ }
+
+ Status MarkRollback(const Slice&) override {
+ return Status::InvalidArgument();
+ }
+ };
+
+ UntrackedKeyHandler handler(
+ this, wupt_db_->txn_db_options_.rollback_merge_operands);
+ auto s = GetWriteBatch()->GetWriteBatch()->Iterate(&handler);
+ assert(s.ok());
+
+ // TODO(lth): Reduce duplicate code with WritePrepared prepare logic.
+ WriteOptions write_options = write_options_;
+ write_options.disableWAL = false;
+ const bool WRITE_AFTER_COMMIT = true;
+ const bool first_prepare_batch = log_number_ == 0;
+ // MarkEndPrepare will change Noop marker to the appropriate marker.
+ s = WriteBatchInternal::MarkEndPrepare(GetWriteBatch()->GetWriteBatch(),
+ name_, !WRITE_AFTER_COMMIT, !prepared);
+ assert(s.ok());
+ // For each duplicate key we account for a new sub-batch
+ prepare_batch_cnt_ = GetWriteBatch()->SubBatchCnt();
+  // AddPrepared is better called in the pre-release callback; otherwise there
+  // is a non-zero chance of max advancing past prepare_seq and readers
+  // assuming the data is committed.
+ // Also having it in the PreReleaseCallback allows in-order addition of
+ // prepared entries to PreparedHeap and hence enables an optimization. Refer
+ // to SmallestUnCommittedSeq for more details.
+ AddPreparedCallback add_prepared_callback(
+ wpt_db_, db_impl_, prepare_batch_cnt_,
+ db_impl_->immutable_db_options().two_write_queues, first_prepare_batch);
+ const bool DISABLE_MEMTABLE = true;
+ uint64_t seq_used = kMaxSequenceNumber;
+  // log_number_ should refer to the oldest log containing uncommitted data
+  // from the current transaction. This means that if log_number_ is already
+  // set, it must not be overwritten: WriteImpl reports the log it used via
+  // last_log_number_, which is copied into log_number_ only when log_number_
+  // is still unset.
+ s = db_impl_->WriteImpl(write_options, GetWriteBatch()->GetWriteBatch(),
+ /*callback*/ nullptr, &last_log_number_,
+ /*log ref*/ 0, !DISABLE_MEMTABLE, &seq_used,
+ prepare_batch_cnt_, &add_prepared_callback);
+ if (log_number_ == 0) {
+ log_number_ = last_log_number_;
+ }
+ assert(!s.ok() || seq_used != kMaxSequenceNumber);
+ auto prepare_seq = seq_used;
+
+ // Only call SetId if it hasn't been set yet.
+ if (GetId() == 0) {
+ SetId(prepare_seq);
+ }
+ // unprep_seqs_ will also contain prepared seqnos since they are treated in
+ // the same way in the prepare/commit callbacks. See the comment on the
+ // definition of unprep_seqs_.
+ unprep_seqs_[prepare_seq] = prepare_batch_cnt_;
+
+ // Reset transaction state.
+ if (!prepared) {
+ prepare_batch_cnt_ = 0;
+ const bool kClear = true;
+ TransactionBaseImpl::InitWriteBatch(kClear);
+ }
+
+ return s;
+}
+
+Status WriteUnpreparedTxn::FlushWriteBatchWithSavePointToDB() {
+ assert(unflushed_save_points_ != nullptr &&
+ unflushed_save_points_->size() > 0);
+ assert(save_points_ != nullptr && save_points_->size() > 0);
+ assert(save_points_->size() >= unflushed_save_points_->size());
+
+ // Handler class for creating an unprepared batch from a savepoint.
+ struct SavePointBatchHandler : public WriteBatch::Handler {
+ WriteBatchWithIndex* wb_;
+ const std::map<uint32_t, ColumnFamilyHandle*>& handles_;
+
+ SavePointBatchHandler(
+ WriteBatchWithIndex* wb,
+ const std::map<uint32_t, ColumnFamilyHandle*>& handles)
+ : wb_(wb), handles_(handles) {}
+
+ Status PutCF(uint32_t cf, const Slice& key, const Slice& value) override {
+ return wb_->Put(handles_.at(cf), key, value);
+ }
+
+ Status DeleteCF(uint32_t cf, const Slice& key) override {
+ return wb_->Delete(handles_.at(cf), key);
+ }
+
+ Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+ return wb_->SingleDelete(handles_.at(cf), key);
+ }
+
+ Status MergeCF(uint32_t cf, const Slice& key, const Slice& value) override {
+ return wb_->Merge(handles_.at(cf), key, value);
+ }
+
+ // The only expected 2PC marker is the initial Noop marker.
+ Status MarkNoop(bool empty_batch) override {
+ return empty_batch ? Status::OK() : Status::InvalidArgument();
+ }
+
+ Status MarkBeginPrepare(bool) override { return Status::InvalidArgument(); }
+
+ Status MarkEndPrepare(const Slice&) override {
+ return Status::InvalidArgument();
+ }
+
+ Status MarkCommit(const Slice&) override {
+ return Status::InvalidArgument();
+ }
+
+ Status MarkRollback(const Slice&) override {
+ return Status::InvalidArgument();
+ }
+ };
+
+  // The comparator of the default cf is passed in, similar to the
+  // initialization of TransactionBaseImpl::write_batch_. This comparator is
+  // only used as a fallback when the write batch encounters an invalid cf id.
+ WriteBatchWithIndex wb(wpt_db_->DefaultColumnFamily()->GetComparator(), 0,
+ true, 0, write_options_.protection_bytes_per_key);
+ // Swap with write_batch_ so that wb contains the complete write batch. The
+ // actual write batch that will be flushed to DB will be built in
+ // write_batch_, and will be read by FlushWriteBatchToDBInternal.
+ std::swap(wb, write_batch_);
+ TransactionBaseImpl::InitWriteBatch();
+
+ size_t prev_boundary = WriteBatchInternal::kHeader;
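+  // Save point boundaries are byte offsets into the swapped-out batch; the
+  // first sub-range starts right after the WriteBatch header.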
+ const bool kPrepared = true;
+ for (size_t i = 0; i < unflushed_save_points_->size() + 1; i++) {
+ bool trailing_batch = i == unflushed_save_points_->size();
+ SavePointBatchHandler sp_handler(&write_batch_,
+ *wupt_db_->GetCFHandleMap().get());
+ size_t curr_boundary = trailing_batch ? wb.GetWriteBatch()->GetDataSize()
+ : (*unflushed_save_points_)[i];
+
+ // Construct the partial write batch up to the savepoint.
+ //
+ // Theoretically, a memcpy between the write batches should be sufficient
+ // since the rewriting into the batch should produce the exact same byte
+    // representation. Rebuilding the WriteBatchWithIndex index is still
+    // necessary, though, and would imply doing two passes over the batch.
+ Status s = WriteBatchInternal::Iterate(wb.GetWriteBatch(), &sp_handler,
+ prev_boundary, curr_boundary);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (write_batch_.GetWriteBatch()->Count() > 0) {
+ // Flush the write batch.
+ s = FlushWriteBatchToDBInternal(!kPrepared);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ if (!trailing_batch) {
+ if (flushed_save_points_ == nullptr) {
+ flushed_save_points_.reset(
+ new autovector<WriteUnpreparedTxn::SavePoint>());
+ }
+ flushed_save_points_->emplace_back(
+ unprep_seqs_, new ManagedSnapshot(db_impl_, wupt_db_->GetSnapshot()));
+ }
+
+ prev_boundary = curr_boundary;
+ const bool kClear = true;
+ TransactionBaseImpl::InitWriteBatch(kClear);
+ }
+
+ unflushed_save_points_->clear();
+ return Status::OK();
+}
+
+Status WriteUnpreparedTxn::PrepareInternal() {
+ const bool kPrepared = true;
+ return FlushWriteBatchToDB(kPrepared);
+}
+
+Status WriteUnpreparedTxn::CommitWithoutPrepareInternal() {
+ if (unprep_seqs_.empty()) {
+ assert(log_number_ == 0);
+ assert(GetId() == 0);
+ return WritePreparedTxn::CommitWithoutPrepareInternal();
+ }
+
+ // TODO(lth): We should optimize commit without prepare to not perform
+ // a prepare under the hood.
+ auto s = PrepareInternal();
+ if (!s.ok()) {
+ return s;
+ }
+ return CommitInternal();
+}
+
+Status WriteUnpreparedTxn::CommitInternal() {
+ // TODO(lth): Reduce duplicate code with WritePrepared commit logic.
+
+ // We take the commit-time batch and append the Commit marker. The Memtable
+  // will ignore the Commit marker in non-recovery mode.
+ WriteBatch* working_batch = GetCommitTimeWriteBatch();
+ const bool empty = working_batch->Count() == 0;
+ auto s = WriteBatchInternal::MarkCommit(working_batch, name_);
+ assert(s.ok());
+
+ const bool for_recovery = use_only_the_last_commit_time_batch_for_recovery_;
+ if (!empty) {
+ // When not writing to memtable, we can still cache the latest write batch.
+ // The cached batch will be written to memtable in WriteRecoverableState
+ // during FlushMemTable
+ if (for_recovery) {
+ WriteBatchInternal::SetAsLatestPersistentState(working_batch);
+ } else {
+ return Status::InvalidArgument(
+ "Commit-time-batch can only be used if "
+ "use_only_the_last_commit_time_batch_for_recovery is true");
+ }
+ }
+
+ const bool includes_data = !empty && !for_recovery;
+ size_t commit_batch_cnt = 0;
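+  // A commit-time batch written to the memtable may contain duplicate keys, so
+  // split it into duplicate-free sub-batches and count them here.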
+ if (UNLIKELY(includes_data)) {
+ ROCKS_LOG_WARN(db_impl_->immutable_db_options().info_log,
+ "Duplicate key overhead");
+ SubBatchCounter counter(*wpt_db_->GetCFComparatorMap());
+ s = working_batch->Iterate(&counter);
+ assert(s.ok());
+ commit_batch_cnt = counter.BatchCount();
+ }
+ const bool disable_memtable = !includes_data;
+ const bool do_one_write =
+ !db_impl_->immutable_db_options().two_write_queues || disable_memtable;
+
+ WriteUnpreparedCommitEntryPreReleaseCallback update_commit_map(
+ wpt_db_, db_impl_, unprep_seqs_, commit_batch_cnt);
+ const bool kFirstPrepareBatch = true;
+ AddPreparedCallback add_prepared_callback(
+ wpt_db_, db_impl_, commit_batch_cnt,
+ db_impl_->immutable_db_options().two_write_queues, !kFirstPrepareBatch);
+ PreReleaseCallback* pre_release_callback;
+ if (do_one_write) {
+ pre_release_callback = &update_commit_map;
+ } else {
+ pre_release_callback = &add_prepared_callback;
+ }
+ uint64_t seq_used = kMaxSequenceNumber;
+ // Since the prepared batch is directly written to memtable, there is
+ // already a connection between the memtable and its WAL, so there is no
+ // need to redundantly reference the log that contains the prepared data.
+ const uint64_t zero_log_number = 0ull;
+ size_t batch_cnt = UNLIKELY(commit_batch_cnt) ? commit_batch_cnt : 1;
+ s = db_impl_->WriteImpl(write_options_, working_batch, nullptr, nullptr,
+ zero_log_number, disable_memtable, &seq_used,
+ batch_cnt, pre_release_callback);
+ assert(!s.ok() || seq_used != kMaxSequenceNumber);
+ const SequenceNumber commit_batch_seq = seq_used;
+ if (LIKELY(do_one_write || !s.ok())) {
+ if (LIKELY(s.ok())) {
+      // Note RemovePrepared should be called after the WriteImpl that published
+      // the seq. Otherwise the SmallestUnCommittedSeq optimization breaks.
+ for (const auto& seq : unprep_seqs_) {
+ wpt_db_->RemovePrepared(seq.first, seq.second);
+ }
+ }
+ if (UNLIKELY(!do_one_write)) {
+ wpt_db_->RemovePrepared(commit_batch_seq, commit_batch_cnt);
+ }
+ unprep_seqs_.clear();
+ flushed_save_points_.reset(nullptr);
+ unflushed_save_points_.reset(nullptr);
+ return s;
+ } // else do the 2nd write to publish seq
+
+ // Populate unprep_seqs_ with commit_batch_seq, since we treat data in the
+ // commit write batch as just another "unprepared" batch. This will also
+ // update the unprep_seqs_ in the update_commit_map callback.
+ unprep_seqs_[commit_batch_seq] = commit_batch_cnt;
+ WriteUnpreparedCommitEntryPreReleaseCallback
+ update_commit_map_with_commit_batch(wpt_db_, db_impl_, unprep_seqs_, 0);
+
+  // Note: the 2nd write comes with a performance penalty. So if we have too
+  // many commits accompanied by a CommitTimeWriteBatch and yet we cannot
+  // enable the use_only_the_last_commit_time_batch_for_recovery_ optimization,
+  // two_write_queues should be disabled to avoid many additional writes here.
+
+ // Update commit map only from the 2nd queue
+ WriteBatch empty_batch;
+ s = empty_batch.PutLogData(Slice());
+ assert(s.ok());
+ // In the absence of Prepare markers, use Noop as a batch separator
+ s = WriteBatchInternal::InsertNoop(&empty_batch);
+ assert(s.ok());
+ const bool DISABLE_MEMTABLE = true;
+ const size_t ONE_BATCH = 1;
+ const uint64_t NO_REF_LOG = 0;
+ s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr,
+ NO_REF_LOG, DISABLE_MEMTABLE, &seq_used, ONE_BATCH,
+ &update_commit_map_with_commit_batch);
+ assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  // Note RemovePrepared should be called after the WriteImpl that published
+  // the seq. Otherwise the SmallestUnCommittedSeq optimization breaks.
+ for (const auto& seq : unprep_seqs_) {
+ wpt_db_->RemovePrepared(seq.first, seq.second);
+ }
+ unprep_seqs_.clear();
+ flushed_save_points_.reset(nullptr);
+ unflushed_save_points_.reset(nullptr);
+ return s;
+}
+
+Status WriteUnpreparedTxn::WriteRollbackKeys(
+ const LockTracker& lock_tracker, WriteBatchWithIndex* rollback_batch,
+ ReadCallback* callback, const ReadOptions& roptions) {
+ // This assertion can be removed when range lock is supported.
+ assert(lock_tracker.IsPointLockSupported());
+ const auto& cf_map = *wupt_db_->GetCFHandleMap();
+ auto WriteRollbackKey = [&](const std::string& key, uint32_t cfid) {
+ const auto& cf_handle = cf_map.at(cfid);
+ PinnableSlice pinnable_val;
+ bool not_used;
+ DBImpl::GetImplOptions get_impl_options;
+ get_impl_options.column_family = cf_handle;
+ get_impl_options.value = &pinnable_val;
+ get_impl_options.value_found = &not_used;
+ get_impl_options.callback = callback;
+ auto s = db_impl_->GetImpl(roptions, key, get_impl_options);
+
+ if (s.ok()) {
+ s = rollback_batch->Put(cf_handle, key, pinnable_val);
+ assert(s.ok());
+ } else if (s.IsNotFound()) {
+ if (wupt_db_->ShouldRollbackWithSingleDelete(cf_handle, key)) {
+ s = rollback_batch->SingleDelete(cf_handle, key);
+ } else {
+ s = rollback_batch->Delete(cf_handle, key);
+ }
+ assert(s.ok());
+ } else {
+ return s;
+ }
+
+ return Status::OK();
+ };
+
+ std::unique_ptr<LockTracker::ColumnFamilyIterator> cf_it(
+ lock_tracker.GetColumnFamilyIterator());
+ assert(cf_it != nullptr);
+ while (cf_it->HasNext()) {
+ ColumnFamilyId cf = cf_it->Next();
+ std::unique_ptr<LockTracker::KeyIterator> key_it(
+ lock_tracker.GetKeyIterator(cf));
+ assert(key_it != nullptr);
+ while (key_it->HasNext()) {
+ const std::string& key = key_it->Next();
+ auto s = WriteRollbackKey(key, cf);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ }
+
+ for (const auto& cfkey : untracked_keys_) {
+ const auto cfid = cfkey.first;
+ const auto& keys = cfkey.second;
+ for (const auto& key : keys) {
+ auto s = WriteRollbackKey(key, cfid);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ }
+
+ return Status::OK();
+}
+
+Status WriteUnpreparedTxn::RollbackInternal() {
+ // TODO(lth): Reduce duplicate code with WritePrepared rollback logic.
+ WriteBatchWithIndex rollback_batch(
+ wpt_db_->DefaultColumnFamily()->GetComparator(), 0, true, 0,
+ write_options_.protection_bytes_per_key);
+ assert(GetId() != kMaxSequenceNumber);
+ assert(GetId() > 0);
+ Status s;
+ auto read_at_seq = kMaxSequenceNumber;
+ ReadOptions roptions;
+  // to prevent the callback's seq from being overridden inside DBImpl::Get
+ roptions.snapshot = wpt_db_->GetMaxSnapshot();
+ // Note that we do not use WriteUnpreparedTxnReadCallback because we do not
+ // need to read our own writes when reading prior versions of the key for
+ // rollback.
+ WritePreparedTxnReadCallback callback(wpt_db_, read_at_seq);
+  // TODO(lth): We write the rollback batch all in a single batch here, but this
+  // should be subdivided into multiple batches as well. In phase 2, when key
+  // sets are read from the WAL, this will happen naturally.
+ s = WriteRollbackKeys(*tracked_locks_, &rollback_batch, &callback, roptions);
+ if (!s.ok()) {
+ return s;
+ }
+
+ // The Rollback marker will be used as a batch separator
+ s = WriteBatchInternal::MarkRollback(rollback_batch.GetWriteBatch(), name_);
+ assert(s.ok());
+ bool do_one_write = !db_impl_->immutable_db_options().two_write_queues;
+ const bool DISABLE_MEMTABLE = true;
+ const uint64_t NO_REF_LOG = 0;
+ uint64_t seq_used = kMaxSequenceNumber;
+ // Rollback batch may contain duplicate keys, because tracked_keys_ is not
+ // comparator aware.
+ auto rollback_batch_cnt = rollback_batch.SubBatchCnt();
+  // We commit the rolled back prepared batches. Although this is
+  // counter-intuitive, i) it is safe to do so, since the prepared batches are
+  // already canceled out by the rollback batch, and ii) adding the commit
+  // entry to the CommitCache lets us benefit from the existing mechanism in
+  // CommitCache that keeps around an entry that was evicted due to max advance
+  // yet overlaps with a live snapshot, so that the live snapshot properly
+  // skips the entry even if its prepare seq is lower than max_evicted_seq_.
+ //
+ // TODO(lth): RollbackInternal is conceptually very similar to
+ // CommitInternal, with the rollback batch simply taking on the role of
+ // CommitTimeWriteBatch. We should be able to merge the two code paths.
+ WriteUnpreparedCommitEntryPreReleaseCallback update_commit_map(
+ wpt_db_, db_impl_, unprep_seqs_, rollback_batch_cnt);
+ // Note: the rollback batch does not need AddPrepared since it is written to
+ // DB in one shot. min_uncommitted still works since it requires capturing
+ // data that is written to DB but not yet committed, while the rollback
+ // batch commits with PreReleaseCallback.
+ s = db_impl_->WriteImpl(write_options_, rollback_batch.GetWriteBatch(),
+ nullptr, nullptr, NO_REF_LOG, !DISABLE_MEMTABLE,
+ &seq_used, rollback_batch_cnt,
+ do_one_write ? &update_commit_map : nullptr);
+ assert(!s.ok() || seq_used != kMaxSequenceNumber);
+ if (!s.ok()) {
+ return s;
+ }
+ if (do_one_write) {
+ for (const auto& seq : unprep_seqs_) {
+ wpt_db_->RemovePrepared(seq.first, seq.second);
+ }
+ unprep_seqs_.clear();
+ flushed_save_points_.reset(nullptr);
+ unflushed_save_points_.reset(nullptr);
+ return s;
+ } // else do the 2nd write for commit
+
+ uint64_t& prepare_seq = seq_used;
+ // Populate unprep_seqs_ with rollback_batch_cnt, since we treat data in the
+ // rollback write batch as just another "unprepared" batch. This will also
+ // update the unprep_seqs_ in the update_commit_map callback.
+ unprep_seqs_[prepare_seq] = rollback_batch_cnt;
+ WriteUnpreparedCommitEntryPreReleaseCallback
+ update_commit_map_with_rollback_batch(wpt_db_, db_impl_, unprep_seqs_, 0);
+
+ ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log,
+ "RollbackInternal 2nd write prepare_seq: %" PRIu64,
+ prepare_seq);
+ WriteBatch empty_batch;
+ const size_t ONE_BATCH = 1;
+ s = empty_batch.PutLogData(Slice());
+ assert(s.ok());
+ // In the absence of Prepare markers, use Noop as a batch separator
+ s = WriteBatchInternal::InsertNoop(&empty_batch);
+ assert(s.ok());
+ s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr,
+ NO_REF_LOG, DISABLE_MEMTABLE, &seq_used, ONE_BATCH,
+ &update_commit_map_with_rollback_batch);
+ assert(!s.ok() || seq_used != kMaxSequenceNumber);
+ // Mark the txn as rolled back
+ if (s.ok()) {
+ for (const auto& seq : unprep_seqs_) {
+ wpt_db_->RemovePrepared(seq.first, seq.second);
+ }
+ }
+
+ unprep_seqs_.clear();
+ flushed_save_points_.reset(nullptr);
+ unflushed_save_points_.reset(nullptr);
+ return s;
+}
+
+void WriteUnpreparedTxn::Clear() {
+ if (!recovered_txn_) {
+ txn_db_impl_->UnLock(this, *tracked_locks_);
+ }
+ unprep_seqs_.clear();
+ flushed_save_points_.reset(nullptr);
+ unflushed_save_points_.reset(nullptr);
+ recovered_txn_ = false;
+ largest_validated_seq_ = 0;
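+  // Iterators still open on this transaction reference the write batch that is
+  // being cleared, so invalidate them rather than leave them pointing at stale
+  // deltas.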
+ for (auto& it : active_iterators_) {
+ auto bdit = static_cast<BaseDeltaIterator*>(it);
+ bdit->Invalidate(Status::InvalidArgument(
+ "Cannot use iterator after transaction has finished"));
+ }
+ active_iterators_.clear();
+ untracked_keys_.clear();
+ TransactionBaseImpl::Clear();
+}
+
+void WriteUnpreparedTxn::SetSavePoint() {
+ assert((unflushed_save_points_ ? unflushed_save_points_->size() : 0) +
+ (flushed_save_points_ ? flushed_save_points_->size() : 0) ==
+ (save_points_ ? save_points_->size() : 0));
+ PessimisticTransaction::SetSavePoint();
+ if (unflushed_save_points_ == nullptr) {
+ unflushed_save_points_.reset(new autovector<size_t>());
+ }
+ unflushed_save_points_->push_back(write_batch_.GetDataSize());
+}
+
+Status WriteUnpreparedTxn::RollbackToSavePoint() {
+ assert((unflushed_save_points_ ? unflushed_save_points_->size() : 0) +
+ (flushed_save_points_ ? flushed_save_points_->size() : 0) ==
+ (save_points_ ? save_points_->size() : 0));
+ if (unflushed_save_points_ != nullptr && unflushed_save_points_->size() > 0) {
+ Status s = PessimisticTransaction::RollbackToSavePoint();
+ assert(!s.IsNotFound());
+ unflushed_save_points_->pop_back();
+ return s;
+ }
+
+ if (flushed_save_points_ != nullptr && !flushed_save_points_->empty()) {
+ return RollbackToSavePointInternal();
+ }
+
+ return Status::NotFound();
+}
+
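+// Roll back to a savepoint whose write batch has already been flushed to the
+// DB: read the pre-savepoint value of every key tracked since that savepoint,
+// as of the savepoint's snapshot (with the savepoint's own unprepared seqs
+// treated as visible), stage those values into write_batch_, and flush that
+// batch to the DB as yet another unprepared batch.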
+Status WriteUnpreparedTxn::RollbackToSavePointInternal() {
+ Status s;
+
+ const bool kClear = true;
+ TransactionBaseImpl::InitWriteBatch(kClear);
+
+ assert(flushed_save_points_->size() > 0);
+ WriteUnpreparedTxn::SavePoint& top = flushed_save_points_->back();
+
+ assert(save_points_ != nullptr && save_points_->size() > 0);
+ const LockTracker& tracked_keys = *save_points_->top().new_locks_;
+
+ ReadOptions roptions;
+ roptions.snapshot = top.snapshot_->snapshot();
+ SequenceNumber min_uncommitted =
+ static_cast_with_check<const SnapshotImpl>(roptions.snapshot)
+ ->min_uncommitted_;
+ SequenceNumber snap_seq = roptions.snapshot->GetSequenceNumber();
+ WriteUnpreparedTxnReadCallback callback(wupt_db_, snap_seq, min_uncommitted,
+ top.unprep_seqs_,
+ kBackedByDBSnapshot);
+ s = WriteRollbackKeys(tracked_keys, &write_batch_, &callback, roptions);
+ if (!s.ok()) {
+ return s;
+ }
+
+ const bool kPrepared = true;
+ s = FlushWriteBatchToDBInternal(!kPrepared);
+ if (!s.ok()) {
+ return s;
+ }
+
+  // PessimisticTransaction::RollbackToSavePoint will also call
+ // RollbackToSavepoint on write_batch_. However, write_batch_ is empty and has
+ // no savepoints because this savepoint has already been flushed. Work around
+ // this by setting a fake savepoint.
+ write_batch_.SetSavePoint();
+ s = PessimisticTransaction::RollbackToSavePoint();
+ assert(s.ok());
+ if (!s.ok()) {
+ return s;
+ }
+
+ flushed_save_points_->pop_back();
+ return s;
+}
+
+Status WriteUnpreparedTxn::PopSavePoint() {
+ assert((unflushed_save_points_ ? unflushed_save_points_->size() : 0) +
+ (flushed_save_points_ ? flushed_save_points_->size() : 0) ==
+ (save_points_ ? save_points_->size() : 0));
+ if (unflushed_save_points_ != nullptr && unflushed_save_points_->size() > 0) {
+ Status s = PessimisticTransaction::PopSavePoint();
+ assert(!s.IsNotFound());
+ unflushed_save_points_->pop_back();
+ return s;
+ }
+
+ if (flushed_save_points_ != nullptr && !flushed_save_points_->empty()) {
+    // PessimisticTransaction::PopSavePoint will also call PopSavePoint on
+ // write_batch_. However, write_batch_ is empty and has no savepoints
+ // because this savepoint has already been flushed. Work around this by
+ // setting a fake savepoint.
+ write_batch_.SetSavePoint();
+ Status s = PessimisticTransaction::PopSavePoint();
+ assert(!s.IsNotFound());
+ flushed_save_points_->pop_back();
+ return s;
+ }
+
+ return Status::NotFound();
+}
+
+void WriteUnpreparedTxn::MultiGet(const ReadOptions& options,
+ ColumnFamilyHandle* column_family,
+ const size_t num_keys, const Slice* keys,
+ PinnableSlice* values, Status* statuses,
+ const bool sorted_input) {
+ SequenceNumber min_uncommitted, snap_seq;
+ const SnapshotBackup backed_by_snapshot =
+ wupt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq);
+ WriteUnpreparedTxnReadCallback callback(wupt_db_, snap_seq, min_uncommitted,
+ unprep_seqs_, backed_by_snapshot);
+ write_batch_.MultiGetFromBatchAndDB(db_, options, column_family, num_keys,
+ keys, values, statuses, sorted_input,
+ &callback);
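+  // A failed callback.valid() or ValidateSnapshot() check means the read may
+  // have observed an inconsistent view (e.g. the backing snapshot was released
+  // or invalidated during the read), so report TryAgain for every key rather
+  // than return possibly incorrect values.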
+ if (UNLIKELY(!callback.valid() ||
+ !wupt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) {
+ wupt_db_->WPRecordTick(TXN_GET_TRY_AGAIN);
+ for (size_t i = 0; i < num_keys; i++) {
+ statuses[i] = Status::TryAgain();
+ }
+ }
+}
+
+Status WriteUnpreparedTxn::Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* value) {
+ SequenceNumber min_uncommitted, snap_seq;
+ const SnapshotBackup backed_by_snapshot =
+ wupt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq);
+ WriteUnpreparedTxnReadCallback callback(wupt_db_, snap_seq, min_uncommitted,
+ unprep_seqs_, backed_by_snapshot);
+ auto res = write_batch_.GetFromBatchAndDB(db_, options, column_family, key,
+ value, &callback);
+ if (LIKELY(callback.valid() &&
+ wupt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) {
+ return res;
+ } else {
+ res.PermitUncheckedError();
+ wupt_db_->WPRecordTick(TXN_GET_TRY_AGAIN);
+ return Status::TryAgain();
+ }
+}
+
+namespace {
+static void CleanupWriteUnpreparedWBWIIterator(void* arg1, void* arg2) {
+ auto txn = reinterpret_cast<WriteUnpreparedTxn*>(arg1);
+ auto iter = reinterpret_cast<Iterator*>(arg2);
+ txn->RemoveActiveIterator(iter);
+}
+} // anonymous namespace
+
+Iterator* WriteUnpreparedTxn::GetIterator(const ReadOptions& options) {
+ return GetIterator(options, wupt_db_->DefaultColumnFamily());
+}
+
+Iterator* WriteUnpreparedTxn::GetIterator(const ReadOptions& options,
+ ColumnFamilyHandle* column_family) {
+  // Make sure to get the iterator from WriteUnpreparedTxnDB, not the root db.
+ Iterator* db_iter = wupt_db_->NewIterator(options, column_family, this);
+ assert(db_iter);
+
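+  // Track the iterator so it can be invalidated in Clear(); flushing the write
+  // batch while iterators created from this transaction are still live is
+  // unsafe (see active_iterators_ in write_unprepared_txn.h).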
+ auto iter = write_batch_.NewIteratorWithBase(column_family, db_iter);
+ active_iterators_.push_back(iter);
+ iter->RegisterCleanup(CleanupWriteUnpreparedWBWIIterator, this, iter);
+ return iter;
+}
+
+Status WriteUnpreparedTxn::ValidateSnapshot(ColumnFamilyHandle* column_family,
+ const Slice& key,
+ SequenceNumber* tracked_at_seq) {
+ // TODO(lth): Reduce duplicate code with WritePrepared ValidateSnapshot logic.
+ assert(snapshot_);
+
+ SequenceNumber min_uncommitted =
+ static_cast_with_check<const SnapshotImpl>(snapshot_.get())
+ ->min_uncommitted_;
+ SequenceNumber snap_seq = snapshot_->GetSequenceNumber();
+  // tracked_at_seq is either max or the last snapshot with which this key was
+  // tracked, so there is no need to apply IsInSnapshot to this comparison
+  // here, as tracked_at_seq is not a prepare seq.
+ if (*tracked_at_seq <= snap_seq) {
+    // If the key has been previously validated at a sequence number earlier
+    // than the current snapshot's sequence number, we already know it has not
+    // been modified.
+ return Status::OK();
+ }
+
+ *tracked_at_seq = snap_seq;
+
+ ColumnFamilyHandle* cfh =
+ column_family ? column_family : db_impl_->DefaultColumnFamily();
+
+ WriteUnpreparedTxnReadCallback snap_checker(
+ wupt_db_, snap_seq, min_uncommitted, unprep_seqs_, kBackedByDBSnapshot);
+ // TODO(yanqin): Support user-defined timestamp.
+ return TransactionUtil::CheckKeyForConflicts(
+ db_impl_, cfh, key.ToString(), snap_seq, /*ts=*/nullptr,
+ false /* cache_only */, &snap_checker, min_uncommitted);
+}
+
+const std::map<SequenceNumber, size_t>&
+WriteUnpreparedTxn::GetUnpreparedSequenceNumbers() {
+ return unprep_seqs_;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_unprepared_txn.h b/src/rocksdb/utilities/transactions/write_unprepared_txn.h
new file mode 100644
index 000000000..5a3227f4e
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/write_unprepared_txn.h
@@ -0,0 +1,341 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <set>
+
+#include "utilities/transactions/write_prepared_txn.h"
+#include "utilities/transactions/write_unprepared_txn_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteUnpreparedTxnDB;
+class WriteUnpreparedTxn;
+
+// WriteUnprepared transactions need to be able to read their own uncommitted
+// writes, and supporting this requires some careful consideration. Because
+// writes in the current transaction may be flushed to DB already, we cannot
+// rely on the contents of WriteBatchWithIndex to determine whether a key should
+// be visible or not, so we have to remember to check the DB for any uncommitted
+// keys that should be visible to us. First, we will need to change the seek to
+// snapshot logic, to seek to max_visible_seq = max(snap_seq, max_unprep_seq).
+// Any key with a sequence number greater than max_visible_seq should not be
+// visible, because it cannot be an unprepared key of the current transaction
+// and it is not in its snapshot.
+//
+// When we seek to max_visible_seq, one of these cases will happen:
+// 1. We hit an unprepared key from the current transaction.
+// 2. We hit an unprepared key from another transaction.
+// 3. We hit a committed key with snap_seq < seq < max_unprep_seq.
+// 4. We hit a committed key with seq <= snap_seq.
+//
+// IsVisibleFullCheck handles all cases correctly.
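+//
+// As an illustrative (hypothetical) example: if snap_seq = 10 and this
+// transaction has unprepared batches at seqs {12 (cnt 2), 15 (cnt 1)}, then
+// max_unprep_seq = 15 and max_visible_seq = max(10, 15) = 15. A key committed
+// at seq 14 by another transaction falls under case 3 and must be filtered out
+// by IsVisibleFullCheck, while the unprepared keys at seqs 12-13 belong to this
+// transaction and are visible.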
+//
+// Other notes:
+// Note that max_visible_seq is only calculated once at iterator construction
+// time, meaning if the same transaction is adding more unprep seqs through
+// writes during iteration, these newer writes may not be visible. This is not a
+// problem for MySQL though because it avoids modifying the index as it is
+// scanning through it to avoid the Halloween Problem. Instead, it scans the
+// index once up front, and modifies based on a temporary copy.
+//
+// In DBIter, there is a "reseek" optimization if the iterator skips over too
+// many keys. However, this assumes that the reseek seeks exactly to the
+// required key. In write unprepared, even after seeking directly to
+// max_visible_seq, some iteration may be required before hitting a visible key,
+// and special precautions must be taken to avoid performing another reseek,
+// leading to an infinite loop.
+//
+class WriteUnpreparedTxnReadCallback : public ReadCallback {
+ public:
+ WriteUnpreparedTxnReadCallback(
+ WritePreparedTxnDB* db, SequenceNumber snapshot,
+ SequenceNumber min_uncommitted,
+ const std::map<SequenceNumber, size_t>& unprep_seqs,
+ SnapshotBackup backed_by_snapshot)
+      // Pass our last uncommitted seq as the snapshot to the parent class to
+      // ensure that the parent will not prematurely filter out our own writes.
+      // We do the exact comparison against snapshots in the IsVisibleFullCheck
+      // override.
+ : ReadCallback(CalcMaxVisibleSeq(unprep_seqs, snapshot), min_uncommitted),
+ db_(db),
+ unprep_seqs_(unprep_seqs),
+ wup_snapshot_(snapshot),
+ backed_by_snapshot_(backed_by_snapshot) {
+ (void)backed_by_snapshot_; // to silence unused private field warning
+ }
+
+ virtual ~WriteUnpreparedTxnReadCallback() {
+ // If it is not backed by snapshot, the caller must check validity
+ assert(valid_checked_ || backed_by_snapshot_ == kBackedByDBSnapshot);
+ }
+
+ virtual bool IsVisibleFullCheck(SequenceNumber seq) override;
+
+ inline bool valid() {
+ valid_checked_ = true;
+ return snap_released_ == false;
+ }
+
+ void Refresh(SequenceNumber seq) override {
+ max_visible_seq_ = std::max(max_visible_seq_, seq);
+ wup_snapshot_ = seq;
+ }
+
+ static SequenceNumber CalcMaxVisibleSeq(
+ const std::map<SequenceNumber, size_t>& unprep_seqs,
+ SequenceNumber snapshot_seq) {
+ SequenceNumber max_unprepared = 0;
+ if (unprep_seqs.size()) {
+ max_unprepared =
+ unprep_seqs.rbegin()->first + unprep_seqs.rbegin()->second - 1;
+ }
+ return std::max(max_unprepared, snapshot_seq);
+ }
+
+ private:
+ WritePreparedTxnDB* db_;
+ const std::map<SequenceNumber, size_t>& unprep_seqs_;
+ SequenceNumber wup_snapshot_;
+ // Whether max_visible_seq_ is backed by a snapshot
+ const SnapshotBackup backed_by_snapshot_;
+ bool snap_released_ = false;
+ // Safety check to ensure that the caller has checked invalid statuses
+ bool valid_checked_ = false;
+};
+
+class WriteUnpreparedTxn : public WritePreparedTxn {
+ public:
+ WriteUnpreparedTxn(WriteUnpreparedTxnDB* db,
+ const WriteOptions& write_options,
+ const TransactionOptions& txn_options);
+
+ virtual ~WriteUnpreparedTxn();
+
+ using TransactionBaseImpl::Put;
+ virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value,
+ const bool assume_tracked = false) override;
+ virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+ const SliceParts& value,
+ const bool assume_tracked = false) override;
+
+ using TransactionBaseImpl::Merge;
+ virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value,
+ const bool assume_tracked = false) override;
+
+ using TransactionBaseImpl::Delete;
+ virtual Status Delete(ColumnFamilyHandle* column_family, const Slice& key,
+ const bool assume_tracked = false) override;
+ virtual Status Delete(ColumnFamilyHandle* column_family,
+ const SliceParts& key,
+ const bool assume_tracked = false) override;
+
+ using TransactionBaseImpl::SingleDelete;
+ virtual Status SingleDelete(ColumnFamilyHandle* column_family,
+ const Slice& key,
+ const bool assume_tracked = false) override;
+ virtual Status SingleDelete(ColumnFamilyHandle* column_family,
+ const SliceParts& key,
+ const bool assume_tracked = false) override;
+
+ // In WriteUnprepared, untracked writes will break snapshot validation logic.
+ // Snapshot validation will only check the largest sequence number of a key to
+ // see if it was committed or not. However, an untracked unprepared write will
+ // hide smaller committed sequence numbers.
+ //
+ // TODO(lth): Investigate whether it is worth having snapshot validation
+ // validate all values larger than snap_seq. Otherwise, we should return
+ // Status::NotSupported for untracked writes.
+
+ virtual Status RebuildFromWriteBatch(WriteBatch*) override;
+
+ virtual uint64_t GetLastLogNumber() const override {
+ return last_log_number_;
+ }
+
+ void RemoveActiveIterator(Iterator* iter) {
+ active_iterators_.erase(
+ std::remove(active_iterators_.begin(), active_iterators_.end(), iter),
+ active_iterators_.end());
+ }
+
+ protected:
+ void Initialize(const TransactionOptions& txn_options) override;
+
+ Status PrepareInternal() override;
+
+ Status CommitWithoutPrepareInternal() override;
+ Status CommitInternal() override;
+
+ Status RollbackInternal() override;
+
+ void Clear() override;
+
+ void SetSavePoint() override;
+ Status RollbackToSavePoint() override;
+ Status PopSavePoint() override;
+
+  // Get and GetIterator need to be overridden so that a ReadCallback that
+  // handles read-your-own-writes is used.
+ using Transaction::Get;
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) override;
+
+ using Transaction::MultiGet;
+ virtual void MultiGet(const ReadOptions& options,
+ ColumnFamilyHandle* column_family,
+ const size_t num_keys, const Slice* keys,
+ PinnableSlice* values, Status* statuses,
+ const bool sorted_input = false) override;
+
+ using Transaction::GetIterator;
+ virtual Iterator* GetIterator(const ReadOptions& options) override;
+ virtual Iterator* GetIterator(const ReadOptions& options,
+ ColumnFamilyHandle* column_family) override;
+
+ virtual Status ValidateSnapshot(ColumnFamilyHandle* column_family,
+ const Slice& key,
+ SequenceNumber* tracked_at_seq) override;
+
+ private:
+ friend class WriteUnpreparedTransactionTest_ReadYourOwnWrite_Test;
+ friend class WriteUnpreparedTransactionTest_RecoveryTest_Test;
+ friend class WriteUnpreparedTransactionTest_UnpreparedBatch_Test;
+ friend class WriteUnpreparedTxnDB;
+
+ const std::map<SequenceNumber, size_t>& GetUnpreparedSequenceNumbers();
+ Status WriteRollbackKeys(const LockTracker& tracked_keys,
+ WriteBatchWithIndex* rollback_batch,
+ ReadCallback* callback, const ReadOptions& roptions);
+
+ Status MaybeFlushWriteBatchToDB();
+ Status FlushWriteBatchToDB(bool prepared);
+ Status FlushWriteBatchToDBInternal(bool prepared);
+ Status FlushWriteBatchWithSavePointToDB();
+ Status RollbackToSavePointInternal();
+ Status HandleWrite(std::function<Status()> do_write);
+
+  // For write unprepared, we check on every write batch append to see if
+ // write_batch_flush_threshold_ has been exceeded, and then call
+ // FlushWriteBatchToDB if so. This logic is encapsulated in
+ // MaybeFlushWriteBatchToDB.
+ int64_t write_batch_flush_threshold_;
+ WriteUnpreparedTxnDB* wupt_db_;
+
+ // Ordered list of unprep_seq sequence numbers that we have already written
+ // to DB.
+ //
+ // This maps unprep_seq => prepare_batch_cnt for each unprepared batch
+ // written by this transaction.
+ //
+ // Note that this contains both prepared and unprepared batches, since they
+  // are treated similarly in the prepare heap/commit map, which simplifies the
+  // commit callbacks.
+ std::map<SequenceNumber, size_t> unprep_seqs_;
+
+ uint64_t last_log_number_;
+
+ // Recovered transactions have tracked_keys_ populated, but are not actually
+ // locked for efficiency reasons. For recovered transactions, skip unlocking
+  // keys when the transaction ends.
+ bool recovered_txn_;
+
+ // Track the largest sequence number at which we performed snapshot
+ // validation. If snapshot validation was skipped because no snapshot was set,
+ // then this is set to GetLastPublishedSequence. This value is useful because
+ // it means that for keys that have unprepared seqnos, we can guarantee that
+ // no committed keys by other transactions can exist between
+ // largest_validated_seq_ and max_unprep_seq. See
+ // WriteUnpreparedTxnDB::NewIterator for an explanation for why this is
+ // necessary for iterator Prev().
+ //
+ // Currently this value only increases during the lifetime of a transaction,
+ // but in some cases, we should be able to restore the previously largest
+ // value when calling RollbackToSavepoint.
+ SequenceNumber largest_validated_seq_;
+
+ struct SavePoint {
+ // Record of unprep_seqs_ at this savepoint. The set of unprep_seq is
+ // used during RollbackToSavepoint to determine visibility when restoring
+ // old values.
+ //
+ // TODO(lth): Since all unprep_seqs_ sets further down the stack must be
+ // subsets, this can potentially be deduplicated by just storing set
+ // difference. Investigate if this is worth it.
+ std::map<SequenceNumber, size_t> unprep_seqs_;
+
+ // This snapshot will be used to read keys at this savepoint if we call
+ // RollbackToSavePoint.
+ std::unique_ptr<ManagedSnapshot> snapshot_;
+
+ SavePoint(const std::map<SequenceNumber, size_t>& seqs,
+ ManagedSnapshot* snapshot)
+        : unprep_seqs_(seqs), snapshot_(snapshot) {}
+ };
+
+ // We have 3 data structures holding savepoint information:
+ // 1. TransactionBaseImpl::save_points_
+ // 2. WriteUnpreparedTxn::flushed_save_points_
+  // 3. WriteUnpreparedTxn::unflushed_save_points_
+ //
+ // TransactionBaseImpl::save_points_ holds information about all write
+ // batches, including the current in-memory write_batch_, or unprepared
+ // batches that have been written out. Its responsibility is just to track
+ // which keys have been modified in every savepoint.
+ //
+ // WriteUnpreparedTxn::flushed_save_points_ holds information about savepoints
+ // set on unprepared batches that have already flushed. It holds the snapshot
+ // and unprep_seqs at that savepoint, so that the rollback process can
+ // determine which keys were visible at that point in time.
+ //
+  // WriteUnpreparedTxn::unflushed_save_points_ holds information about
+ // savepoints on the current in-memory write_batch_. It simply records the
+ // size of the write batch at every savepoint.
+ //
+ // TODO(lth): Remove the redundancy between save_point_boundaries_ and
+ // write_batch_.save_points_.
+ //
+ // Based on this information, here are some invariants:
+ // size(unflushed_save_points_) = size(write_batch_.save_points_)
+ // size(flushed_save_points_) + size(unflushed_save_points_)
+ // = size(save_points_)
+ //
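+  // As an illustrative (hypothetical) walkthrough: two SetSavePoint() calls
+  // followed by a flush of write_batch_ leave two entries in
+  // flushed_save_points_; a third SetSavePoint() after the flush lands in
+  // unflushed_save_points_, so 2 + 1 = 3 = size(save_points_), matching the
+  // invariant above.
+  //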
+ std::unique_ptr<autovector<WriteUnpreparedTxn::SavePoint>>
+ flushed_save_points_;
+ std::unique_ptr<autovector<size_t>> unflushed_save_points_;
+
+ // It is currently unsafe to flush a write batch if there are active iterators
+ // created from this transaction. This is because we use WriteBatchWithIndex
+ // to do merging reads from the DB and the write batch. If we flush the write
+  // batch, it is possible that the delta iterator inside the BaseDeltaIterator
+  // will point to invalid memory.
+ std::vector<Iterator*> active_iterators_;
+
+ // Untracked keys that we have to rollback.
+ //
+  // TODO(lth): Currently we do not record untracked keys per savepoint.
+  // This means that when rolling back to savepoints, we have to check all
+  // keys in the current transaction for rollback. Note that this is merely
+  // inefficient, but still correct, because we take a snapshot at every
+  // savepoint, and we will use that snapshot to construct the rollback batch.
+  // The rollback batch will then contain a reissue of the same marker.
+ //
+ // A more optimal solution would be to only check keys changed since the
+ // last savepoint. Also, it may make sense to merge this into tracked_keys_
+ // and differentiate between tracked but not locked keys to avoid having two
+ // very similar data structures.
+ using KeySet = std::unordered_map<uint32_t, std::vector<std::string>>;
+ KeySet untracked_keys_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_unprepared_txn_db.cc b/src/rocksdb/utilities/transactions/write_unprepared_txn_db.cc
new file mode 100644
index 000000000..2ed2d5c59
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/write_unprepared_txn_db.cc
@@ -0,0 +1,473 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/write_unprepared_txn_db.h"
+
+#include "db/arena_wrapped_db_iter.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Instead of reconstructing a Transaction object and calling rollback on it,
+// we can be more efficient with RollbackRecoveredTransaction by skipping
+// unnecessary steps (e.g., updating the CommitMap, reconstructing the keyset).
+Status WriteUnpreparedTxnDB::RollbackRecoveredTransaction(
+ const DBImpl::RecoveredTransaction* rtxn) {
+ // TODO(lth): Reduce duplicate code with WritePrepared rollback logic.
+ assert(rtxn->unprepared_);
+ auto cf_map_shared_ptr = WritePreparedTxnDB::GetCFHandleMap();
+ auto cf_comp_map_shared_ptr = WritePreparedTxnDB::GetCFComparatorMap();
+ // In theory we could write with disableWAL = true during recovery, and
+ // assume that if we crash again during recovery, we can just replay from
+ // the very beginning. Unfortunately, the XIDs from the application may not
+ // necessarily be unique across restarts, potentially leading to situations
+ // like this:
+ //
+ // BEGIN_PREPARE(unprepared) Put(a) END_PREPARE(xid = 1)
+ // -- crash and recover with Put(a) rolled back as it was not prepared
+ // BEGIN_PREPARE(prepared) Put(b) END_PREPARE(xid = 1)
+ // COMMIT(xid = 1)
+ // -- crash and recover with both a, b
+ //
+ // We could just write the rollback marker, but then we would have to extend
+ // MemTableInserter during recovery to actually do writes into the DB
+ // instead of just dropping the in-memory write batch.
+ //
+ WriteOptions w_options;
+
+ class InvalidSnapshotReadCallback : public ReadCallback {
+ public:
+ InvalidSnapshotReadCallback(SequenceNumber snapshot)
+ : ReadCallback(snapshot) {}
+
+ inline bool IsVisibleFullCheck(SequenceNumber) override {
+      // The seq provided as the snapshot is the seq right before we locked and
+      // wrote to the key, so whatever is there is committed.
+ return true;
+ }
+
+ // Ignore the refresh request since we are confident that our snapshot seq
+    // is not going to be affected by concurrent compactions (not enabled yet).
+ void Refresh(SequenceNumber) override {}
+ };
+
+ // Iterate starting with largest sequence number.
+ for (auto it = rtxn->batches_.rbegin(); it != rtxn->batches_.rend(); ++it) {
+ auto last_visible_txn = it->first - 1;
+ const auto& batch = it->second.batch_;
+ WriteBatch rollback_batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+ w_options.protection_bytes_per_key,
+ 0 /* default_cf_ts_sz */);
+
+ struct RollbackWriteBatchBuilder : public WriteBatch::Handler {
+ DBImpl* db_;
+ ReadOptions roptions;
+ InvalidSnapshotReadCallback callback;
+ WriteBatch* rollback_batch_;
+ std::map<uint32_t, const Comparator*>& comparators_;
+ std::map<uint32_t, ColumnFamilyHandle*>& handles_;
+ using CFKeys = std::set<Slice, SetComparator>;
+ std::map<uint32_t, CFKeys> keys_;
+ bool rollback_merge_operands_;
+ RollbackWriteBatchBuilder(
+ DBImpl* db, SequenceNumber snap_seq, WriteBatch* dst_batch,
+ std::map<uint32_t, const Comparator*>& comparators,
+ std::map<uint32_t, ColumnFamilyHandle*>& handles,
+ bool rollback_merge_operands)
+ : db_(db),
+ callback(snap_seq),
+ // disable min_uncommitted optimization
+ rollback_batch_(dst_batch),
+ comparators_(comparators),
+ handles_(handles),
+ rollback_merge_operands_(rollback_merge_operands) {}
+
+ Status Rollback(uint32_t cf, const Slice& key) {
+ Status s;
+ CFKeys& cf_keys = keys_[cf];
+ if (cf_keys.size() == 0) { // just inserted
+ auto cmp = comparators_[cf];
+ keys_[cf] = CFKeys(SetComparator(cmp));
+ }
+ auto res = cf_keys.insert(key);
+        if (res.second ==
+            false) {  // second is false if an element already existed.
+ return s;
+ }
+
+ PinnableSlice pinnable_val;
+ bool not_used;
+ auto cf_handle = handles_[cf];
+ DBImpl::GetImplOptions get_impl_options;
+ get_impl_options.column_family = cf_handle;
+ get_impl_options.value = &pinnable_val;
+ get_impl_options.value_found = &not_used;
+ get_impl_options.callback = &callback;
+ s = db_->GetImpl(roptions, key, get_impl_options);
+ assert(s.ok() || s.IsNotFound());
+ if (s.ok()) {
+ s = rollback_batch_->Put(cf_handle, key, pinnable_val);
+ assert(s.ok());
+ } else if (s.IsNotFound()) {
+ // There has been no readable value before txn. By adding a delete we
+ // make sure that there will be none afterwards either.
+ s = rollback_batch_->Delete(cf_handle, key);
+ assert(s.ok());
+ } else {
+ // Unexpected status. Return it to the user.
+ }
+ return s;
+ }
+
+ Status PutCF(uint32_t cf, const Slice& key,
+ const Slice& /*val*/) override {
+ return Rollback(cf, key);
+ }
+
+ Status DeleteCF(uint32_t cf, const Slice& key) override {
+ return Rollback(cf, key);
+ }
+
+ Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+ return Rollback(cf, key);
+ }
+
+ Status MergeCF(uint32_t cf, const Slice& key,
+ const Slice& /*val*/) override {
+ if (rollback_merge_operands_) {
+ return Rollback(cf, key);
+ } else {
+ return Status::OK();
+ }
+ }
+
+ // Recovered batches do not contain 2PC markers.
+ Status MarkNoop(bool) override { return Status::InvalidArgument(); }
+ Status MarkBeginPrepare(bool) override {
+ return Status::InvalidArgument();
+ }
+ Status MarkEndPrepare(const Slice&) override {
+ return Status::InvalidArgument();
+ }
+ Status MarkCommit(const Slice&) override {
+ return Status::InvalidArgument();
+ }
+ Status MarkRollback(const Slice&) override {
+ return Status::InvalidArgument();
+ }
+ } rollback_handler(db_impl_, last_visible_txn, &rollback_batch,
+ *cf_comp_map_shared_ptr.get(), *cf_map_shared_ptr.get(),
+ txn_db_options_.rollback_merge_operands);
+
+ auto s = batch->Iterate(&rollback_handler);
+ if (!s.ok()) {
+ return s;
+ }
+
+ // The Rollback marker will be used as a batch separator
+ s = WriteBatchInternal::MarkRollback(&rollback_batch, rtxn->name_);
+ if (!s.ok()) {
+ return s;
+ }
+
+ const uint64_t kNoLogRef = 0;
+ const bool kDisableMemtable = true;
+ const size_t kOneBatch = 1;
+ uint64_t seq_used = kMaxSequenceNumber;
+ s = db_impl_->WriteImpl(w_options, &rollback_batch, nullptr, nullptr,
+ kNoLogRef, !kDisableMemtable, &seq_used, kOneBatch);
+ if (!s.ok()) {
+ return s;
+ }
+
+    // If two_write_queues is enabled, we must manually release the sequence
+    // number to readers.
+ if (db_impl_->immutable_db_options().two_write_queues) {
+ db_impl_->SetLastPublishedSequence(seq_used);
+ }
+ }
+
+ return Status::OK();
+}
+
+Status WriteUnpreparedTxnDB::Initialize(
+ const std::vector<size_t>& compaction_enabled_cf_indices,
+ const std::vector<ColumnFamilyHandle*>& handles) {
+ // TODO(lth): Reduce code duplication in this function.
+ auto dbimpl = static_cast_with_check<DBImpl>(GetRootDB());
+ assert(dbimpl != nullptr);
+
+ db_impl_->SetSnapshotChecker(new WritePreparedSnapshotChecker(this));
+ // A callback to commit a single sub-batch
+ class CommitSubBatchPreReleaseCallback : public PreReleaseCallback {
+ public:
+ explicit CommitSubBatchPreReleaseCallback(WritePreparedTxnDB* db)
+ : db_(db) {}
+ Status Callback(SequenceNumber commit_seq,
+ bool is_mem_disabled __attribute__((__unused__)), uint64_t,
+ size_t /*index*/, size_t /*total*/) override {
+ assert(!is_mem_disabled);
+ db_->AddCommitted(commit_seq, commit_seq);
+ return Status::OK();
+ }
+
+ private:
+ WritePreparedTxnDB* db_;
+ };
+ db_impl_->SetRecoverableStatePreReleaseCallback(
+ new CommitSubBatchPreReleaseCallback(this));
+
+ // PessimisticTransactionDB::Initialize
+ for (auto cf_ptr : handles) {
+ AddColumnFamily(cf_ptr);
+ }
+ // Verify cf options
+ for (auto handle : handles) {
+ ColumnFamilyDescriptor cfd;
+ Status s = handle->GetDescriptor(&cfd);
+ if (!s.ok()) {
+ return s;
+ }
+ s = VerifyCFOptions(cfd.options);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ // Re-enable compaction for the column families that initially had
+ // compaction enabled.
+ std::vector<ColumnFamilyHandle*> compaction_enabled_cf_handles;
+ compaction_enabled_cf_handles.reserve(compaction_enabled_cf_indices.size());
+ for (auto index : compaction_enabled_cf_indices) {
+ compaction_enabled_cf_handles.push_back(handles[index]);
+ }
+
+ // create 'real' transactions from recovered shell transactions
+ auto rtxns = dbimpl->recovered_transactions();
+ std::map<SequenceNumber, SequenceNumber> ordered_seq_cnt;
+ for (auto rtxn : rtxns) {
+ auto recovered_trx = rtxn.second;
+ assert(recovered_trx);
+ assert(recovered_trx->batches_.size() >= 1);
+ assert(recovered_trx->name_.length());
+
+    // We can only roll back transactions after AdvanceMaxEvictedSeq is called,
+    // but AddPrepared must occur before AdvanceMaxEvictedSeq, which is why
+    // two iterations are required.
+ if (recovered_trx->unprepared_) {
+ continue;
+ }
+
+ WriteOptions w_options;
+ w_options.sync = true;
+ TransactionOptions t_options;
+
+ auto first_log_number = recovered_trx->batches_.begin()->second.log_number_;
+ auto first_seq = recovered_trx->batches_.begin()->first;
+ auto last_prepare_batch_cnt =
+ recovered_trx->batches_.begin()->second.batch_cnt_;
+
+ Transaction* real_trx = BeginTransaction(w_options, t_options, nullptr);
+ assert(real_trx);
+ auto wupt = static_cast_with_check<WriteUnpreparedTxn>(real_trx);
+ wupt->recovered_txn_ = true;
+
+ real_trx->SetLogNumber(first_log_number);
+ real_trx->SetId(first_seq);
+ Status s = real_trx->SetName(recovered_trx->name_);
+ if (!s.ok()) {
+ return s;
+ }
+ wupt->prepare_batch_cnt_ = last_prepare_batch_cnt;
+
+ for (auto batch : recovered_trx->batches_) {
+ const auto& seq = batch.first;
+ const auto& batch_info = batch.second;
+ auto cnt = batch_info.batch_cnt_ ? batch_info.batch_cnt_ : 1;
+ assert(batch_info.log_number_);
+
+ ordered_seq_cnt[seq] = cnt;
+ assert(wupt->unprep_seqs_.count(seq) == 0);
+ wupt->unprep_seqs_[seq] = cnt;
+
+ s = wupt->RebuildFromWriteBatch(batch_info.batch_);
+ assert(s.ok());
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ const bool kClear = true;
+ wupt->InitWriteBatch(kClear);
+
+ real_trx->SetState(Transaction::PREPARED);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ // AddPrepared must be called in order
+ for (auto seq_cnt : ordered_seq_cnt) {
+ auto seq = seq_cnt.first;
+ auto cnt = seq_cnt.second;
+ for (size_t i = 0; i < cnt; i++) {
+ AddPrepared(seq + i);
+ }
+ }
+
+ SequenceNumber prev_max = max_evicted_seq_;
+ SequenceNumber last_seq = db_impl_->GetLatestSequenceNumber();
+ AdvanceMaxEvictedSeq(prev_max, last_seq);
+ // Create a gap between max and the next snapshot. This simplifies the logic
+ // in IsInSnapshot by not having to consider the special case of max ==
+ // snapshot after recovery. This is tested in IsInSnapshotEmptyMapTest.
+ if (last_seq) {
+ db_impl_->versions_->SetLastAllocatedSequence(last_seq + 1);
+ db_impl_->versions_->SetLastSequence(last_seq + 1);
+ db_impl_->versions_->SetLastPublishedSequence(last_seq + 1);
+ }
+
+ Status s;
+ // Rollback unprepared transactions.
+ for (auto rtxn : rtxns) {
+ auto recovered_trx = rtxn.second;
+ if (recovered_trx->unprepared_) {
+ s = RollbackRecoveredTransaction(recovered_trx);
+ if (!s.ok()) {
+ return s;
+ }
+ continue;
+ }
+ }
+
+ if (s.ok()) {
+ dbimpl->DeleteAllRecoveredTransactions();
+
+ // Compaction should start only after max_evicted_seq_ is set AND recovered
+ // transactions are either added to PrepareHeap or rolled back.
+ s = EnableAutoCompaction(compaction_enabled_cf_handles);
+ }
+
+ return s;
+}
+
+Transaction* WriteUnpreparedTxnDB::BeginTransaction(
+ const WriteOptions& write_options, const TransactionOptions& txn_options,
+ Transaction* old_txn) {
+ if (old_txn != nullptr) {
+ ReinitializeTransaction(old_txn, write_options, txn_options);
+ return old_txn;
+ } else {
+ return new WriteUnpreparedTxn(this, write_options, txn_options);
+ }
+}
+
+// Struct to hold ownership of snapshot and read callback for iterator cleanup.
+struct WriteUnpreparedTxnDB::IteratorState {
+ IteratorState(WritePreparedTxnDB* txn_db, SequenceNumber sequence,
+ std::shared_ptr<ManagedSnapshot> s,
+ SequenceNumber min_uncommitted, WriteUnpreparedTxn* txn)
+ : callback(txn_db, sequence, min_uncommitted, txn->unprep_seqs_,
+ kBackedByDBSnapshot),
+ snapshot(s) {}
+ SequenceNumber MaxVisibleSeq() { return callback.max_visible_seq(); }
+
+ WriteUnpreparedTxnReadCallback callback;
+ std::shared_ptr<ManagedSnapshot> snapshot;
+};
+
+namespace {
+static void CleanupWriteUnpreparedTxnDBIterator(void* arg1, void* /*arg2*/) {
+ delete reinterpret_cast<WriteUnpreparedTxnDB::IteratorState*>(arg1);
+}
+} // anonymous namespace
+
+Iterator* WriteUnpreparedTxnDB::NewIterator(const ReadOptions& options,
+ ColumnFamilyHandle* column_family,
+ WriteUnpreparedTxn* txn) {
+ // TODO(lth): Refactor so that this logic is shared with WritePrepared.
+ constexpr bool expose_blob_index = false;
+ constexpr bool allow_refresh = false;
+ std::shared_ptr<ManagedSnapshot> own_snapshot = nullptr;
+ SequenceNumber snapshot_seq = kMaxSequenceNumber;
+ SequenceNumber min_uncommitted = 0;
+
+ // Currently, the Prev() iterator logic does not work well without snapshot
+ // validation. The logic simply iterates through values of a key in
+ // ascending seqno order, stopping at the first non-visible value and
+ // returning the last visible value.
+ //
+ // For example, if snapshot sequence is 3, and we have the following keys:
+ // foo: v1 1
+ // foo: v2 2
+ // foo: v3 3
+ // foo: v4 4
+ // foo: v5 5
+ //
+ // Then 1, 2, 3 will be visible, but 4 will be non-visible, so we return v3,
+ // which is the last visible value.
+ //
+ // For unprepared transactions, if we have snap_seq = 3, but the current
+ // transaction has unprep_seq 5, then returning the first non-visible value
+ // would be incorrect, as we should return v5, and not v3. The problem is that
+ // there are committed values at snapshot_seq < commit_seq < unprep_seq.
+ //
+ // Snapshot validation can prevent this problem by ensuring that no committed
+ // values exist at snapshot_seq < commit_seq, and thus any value with a
+ // sequence number greater than snapshot_seq must be unprepared values. For
+ // example, if the transaction had a snapshot at 3, then snapshot validation
+ // would be performed during the Put(v5) call. It would find v4, and the Put
+ // would fail with snapshot validation failure.
+ //
+ // TODO(lth): Improve Prev() logic to continue iterating until
+ // max_visible_seq, and then return the last visible value, so that this
+ // restriction can be lifted.
+ const Snapshot* snapshot = nullptr;
+ if (options.snapshot == nullptr) {
+ snapshot = GetSnapshot();
+ own_snapshot = std::make_shared<ManagedSnapshot>(db_impl_, snapshot);
+ } else {
+ snapshot = options.snapshot;
+ }
+
+ snapshot_seq = snapshot->GetSequenceNumber();
+ assert(snapshot_seq != kMaxSequenceNumber);
+ // Iteration is safe as long as largest_validated_seq <= snapshot_seq. We are
+ // guaranteed that for keys that were modified by this transaction (and thus
+ // might have unprepared values), no committed values exist at
+ // largest_validated_seq < commit_seq (or the contrapositive: any committed
+ // value must exist at commit_seq <= largest_validated_seq). This implies
+ // that commit_seq <= largest_validated_seq <= snapshot_seq or commit_seq <=
+ // snapshot_seq. As explained above, the problem with Prev() only happens when
+ // snapshot_seq < commit_seq.
+ //
+ // For keys that were not modified by this transaction, largest_validated_seq_
+ // is meaningless, and Prev() should just work with the existing visibility
+ // logic.
+ if (txn->largest_validated_seq_ > snapshot->GetSequenceNumber() &&
+ !txn->unprep_seqs_.empty()) {
+ ROCKS_LOG_ERROR(info_log_,
+ "WriteUnprepared iterator creation failed since the "
+ "transaction has performed unvalidated writes");
+ return nullptr;
+ }
+ min_uncommitted =
+ static_cast_with_check<const SnapshotImpl>(snapshot)->min_uncommitted_;
+
+ auto* cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+ auto* state =
+ new IteratorState(this, snapshot_seq, own_snapshot, min_uncommitted, txn);
+ auto* db_iter = db_impl_->NewIteratorImpl(
+ options, cfd, state->MaxVisibleSeq(), &state->callback, expose_blob_index,
+ allow_refresh);
+ db_iter->RegisterCleanup(CleanupWriteUnpreparedTxnDBIterator, state, nullptr);
+ return db_iter;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_unprepared_txn_db.h b/src/rocksdb/utilities/transactions/write_unprepared_txn_db.h
new file mode 100644
index 000000000..c40e96d49
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/write_unprepared_txn_db.h
@@ -0,0 +1,108 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/write_prepared_txn_db.h"
+#include "utilities/transactions/write_unprepared_txn.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteUnpreparedTxn;
+
+class WriteUnpreparedTxnDB : public WritePreparedTxnDB {
+ public:
+ using WritePreparedTxnDB::WritePreparedTxnDB;
+
+ Status Initialize(const std::vector<size_t>& compaction_enabled_cf_indices,
+ const std::vector<ColumnFamilyHandle*>& handles) override;
+
+ Transaction* BeginTransaction(const WriteOptions& write_options,
+ const TransactionOptions& txn_options,
+ Transaction* old_txn) override;
+
+ // Struct to hold ownership of snapshot and read callback for cleanup.
+ struct IteratorState;
+
+ using WritePreparedTxnDB::NewIterator;
+ Iterator* NewIterator(const ReadOptions& options,
+ ColumnFamilyHandle* column_family,
+ WriteUnpreparedTxn* txn);
+
+ private:
+ Status RollbackRecoveredTransaction(const DBImpl::RecoveredTransaction* rtxn);
+};
+
+class WriteUnpreparedCommitEntryPreReleaseCallback : public PreReleaseCallback {
+ // TODO(lth): Reduce code duplication with
+ // WritePreparedCommitEntryPreReleaseCallback
+ public:
+  // includes_data indicates that the commit also writes a non-empty
+  // CommitTimeWriteBatch to the memtable, which needs to be committed
+  // separately.
+ WriteUnpreparedCommitEntryPreReleaseCallback(
+ WritePreparedTxnDB* db, DBImpl* db_impl,
+ const std::map<SequenceNumber, size_t>& unprep_seqs,
+ size_t data_batch_cnt = 0, bool publish_seq = true)
+ : db_(db),
+ db_impl_(db_impl),
+ unprep_seqs_(unprep_seqs),
+ data_batch_cnt_(data_batch_cnt),
+ includes_data_(data_batch_cnt_ > 0),
+ publish_seq_(publish_seq) {
+ assert(unprep_seqs.size() > 0);
+ }
+
+ virtual Status Callback(SequenceNumber commit_seq,
+ bool is_mem_disabled __attribute__((__unused__)),
+ uint64_t, size_t /*index*/,
+ size_t /*total*/) override {
+ const uint64_t last_commit_seq = LIKELY(data_batch_cnt_ <= 1)
+ ? commit_seq
+ : commit_seq + data_batch_cnt_ - 1;
+ // Recall that unprep_seqs maps (un)prepared_seq => prepare_batch_cnt.
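+    // As a hypothetical example: with unprep_seqs_ = {5: 2, 9: 1},
+    // data_batch_cnt_ = 2 and commit_seq = 12, last_commit_seq is 13, and
+    // seqs 5, 6, 9 (below) plus 12 and 13 (the commit-time batch) are all
+    // recorded as committed at 13.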
+ for (const auto& s : unprep_seqs_) {
+ for (size_t i = 0; i < s.second; i++) {
+ db_->AddCommitted(s.first + i, last_commit_seq);
+ }
+ }
+
+ if (includes_data_) {
+ assert(data_batch_cnt_);
+ // Commit the data that is accompanied with the commit request
+ for (size_t i = 0; i < data_batch_cnt_; i++) {
+        // For the commit seq of each batch, use the commit seq of the last
+        // batch. This makes debugging easier by having all the batches share
+        // the same sequence number.
+ db_->AddCommitted(commit_seq + i, last_commit_seq);
+ }
+ }
+ if (db_impl_->immutable_db_options().two_write_queues && publish_seq_) {
+ assert(is_mem_disabled); // implies the 2nd queue
+      // Publish the sequence number. We can do that here assuming the callback
+      // is invoked from only one write queue, which guarantees that the
+      // published sequence numbers will be in order, i.e., once a seq is
+      // published, all the seqs prior to it are also publishable.
+ db_impl_->SetLastPublishedSequence(last_commit_seq);
+ }
+    // Otherwise, the SequenceNumber that is updated as part of the write
+    // already does the publishing.
+ return Status::OK();
+ }
+
+ private:
+ WritePreparedTxnDB* db_;
+ DBImpl* db_impl_;
+ const std::map<SequenceNumber, size_t>& unprep_seqs_;
+ size_t data_batch_cnt_;
+  // Either because it is a commit without prepare or it has a
+  // CommitTimeWriteBatch
+ bool includes_data_;
+  // Whether the callback should also publish the commit seq number
+ bool publish_seq_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/ttl/db_ttl_impl.cc b/src/rocksdb/utilities/ttl/db_ttl_impl.cc
new file mode 100644
index 000000000..6ec9d87b0
--- /dev/null
+++ b/src/rocksdb/utilities/ttl/db_ttl_impl.cc
@@ -0,0 +1,609 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#ifndef ROCKSDB_LITE
+
+#include "utilities/ttl/db_ttl_impl.h"
+
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+static std::unordered_map<std::string, OptionTypeInfo> ttl_merge_op_type_info =
+ {{"user_operator",
+ OptionTypeInfo::AsCustomSharedPtr<MergeOperator>(
+ 0, OptionVerificationType::kByName, OptionTypeFlags::kNone)}};
+
+TtlMergeOperator::TtlMergeOperator(
+ const std::shared_ptr<MergeOperator>& merge_op, SystemClock* clock)
+ : user_merge_op_(merge_op), clock_(clock) {
+ RegisterOptions("TtlMergeOptions", &user_merge_op_, &ttl_merge_op_type_info);
+}
+
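+// Values stored by DBWithTTL carry a kTSLength-byte creation timestamp as a
+// suffix. Strip that suffix from the existing value and from every operand
+// before invoking the user's merge operator, then append a fresh timestamp to
+// the merge result.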
+bool TtlMergeOperator::FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const {
+ const uint32_t ts_len = DBWithTTLImpl::kTSLength;
+ if (merge_in.existing_value && merge_in.existing_value->size() < ts_len) {
+ ROCKS_LOG_ERROR(merge_in.logger,
+ "Error: Could not remove timestamp from existing value.");
+ return false;
+ }
+
+  // Strip the time-stamp from each operand before passing to user_merge_op_
+ std::vector<Slice> operands_without_ts;
+ for (const auto& operand : merge_in.operand_list) {
+ if (operand.size() < ts_len) {
+ ROCKS_LOG_ERROR(merge_in.logger,
+ "Error: Could not remove timestamp from operand value.");
+ return false;
+ }
+ operands_without_ts.push_back(operand);
+ operands_without_ts.back().remove_suffix(ts_len);
+ }
+
+ // Apply the user merge operator (store result in *new_value)
+ bool good = true;
+ MergeOperationOutput user_merge_out(merge_out->new_value,
+ merge_out->existing_operand);
+ if (merge_in.existing_value) {
+ Slice existing_value_without_ts(merge_in.existing_value->data(),
+ merge_in.existing_value->size() - ts_len);
+ good = user_merge_op_->FullMergeV2(
+ MergeOperationInput(merge_in.key, &existing_value_without_ts,
+ operands_without_ts, merge_in.logger),
+ &user_merge_out);
+ } else {
+ good = user_merge_op_->FullMergeV2(
+ MergeOperationInput(merge_in.key, nullptr, operands_without_ts,
+ merge_in.logger),
+ &user_merge_out);
+ }
+
+ // Return false if the user merge operator returned false
+ if (!good) {
+ return false;
+ }
+
+ if (merge_out->existing_operand.data()) {
+ merge_out->new_value.assign(merge_out->existing_operand.data(),
+ merge_out->existing_operand.size());
+ merge_out->existing_operand = Slice(nullptr, 0);
+ }
+
+ // Augment the *new_value with the ttl time-stamp
+ int64_t curtime;
+ if (!clock_->GetCurrentTime(&curtime).ok()) {
+ ROCKS_LOG_ERROR(
+ merge_in.logger,
+ "Error: Could not get current time to be attached internally "
+ "to the new value.");
+ return false;
+ } else {
+ char ts_string[ts_len];
+ EncodeFixed32(ts_string, (int32_t)curtime);
+ merge_out->new_value.append(ts_string, ts_len);
+ return true;
+ }
+}
+
+bool TtlMergeOperator::PartialMergeMulti(const Slice& key,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value,
+ Logger* logger) const {
+ const uint32_t ts_len = DBWithTTLImpl::kTSLength;
+ std::deque<Slice> operands_without_ts;
+
+ for (const auto& operand : operand_list) {
+ if (operand.size() < ts_len) {
+ ROCKS_LOG_ERROR(logger, "Error: Could not remove timestamp from value.");
+ return false;
+ }
+
+ operands_without_ts.push_back(
+ Slice(operand.data(), operand.size() - ts_len));
+ }
+
+ // Apply the user partial-merge operator (store result in *new_value)
+ assert(new_value);
+ if (!user_merge_op_->PartialMergeMulti(key, operands_without_ts, new_value,
+ logger)) {
+ return false;
+ }
+
+ // Augment the *new_value with the ttl time-stamp
+ int64_t curtime;
+ if (!clock_->GetCurrentTime(&curtime).ok()) {
+ ROCKS_LOG_ERROR(
+ logger,
+ "Error: Could not get current time to be attached internally "
+ "to the new value.");
+ return false;
+ } else {
+ char ts_string[ts_len];
+ EncodeFixed32(ts_string, (int32_t)curtime);
+ new_value->append(ts_string, ts_len);
+ return true;
+ }
+}
+
+Status TtlMergeOperator::PrepareOptions(const ConfigOptions& config_options) {
+ if (clock_ == nullptr) {
+ clock_ = config_options.env->GetSystemClock().get();
+ }
+ return MergeOperator::PrepareOptions(config_options);
+}
+
+Status TtlMergeOperator::ValidateOptions(
+ const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const {
+ if (user_merge_op_ == nullptr) {
+ return Status::InvalidArgument(
+ "UserMergeOperator required by TtlMergeOperator");
+ } else if (clock_ == nullptr) {
+ return Status::InvalidArgument("SystemClock required by TtlMergeOperator");
+ } else {
+ return MergeOperator::ValidateOptions(db_opts, cf_opts);
+ }
+}
+
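+// Wrap the user-provided compaction filter (or filter factory) and merge
+// operator with TTL-aware versions so that stale entries are dropped during
+// compaction and merge results keep a timestamp suffix.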
+void DBWithTTLImpl::SanitizeOptions(int32_t ttl, ColumnFamilyOptions* options,
+ SystemClock* clock) {
+ if (options->compaction_filter) {
+ options->compaction_filter =
+ new TtlCompactionFilter(ttl, clock, options->compaction_filter);
+ } else {
+ options->compaction_filter_factory =
+ std::shared_ptr<CompactionFilterFactory>(new TtlCompactionFilterFactory(
+ ttl, clock, options->compaction_filter_factory));
+ }
+
+ if (options->merge_operator) {
+ options->merge_operator.reset(
+ new TtlMergeOperator(options->merge_operator, clock));
+ }
+}
+
+static std::unordered_map<std::string, OptionTypeInfo> ttl_type_info = {
+ {"ttl", {0, OptionType::kInt32T}},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> ttl_cff_type_info = {
+ {"user_filter_factory",
+ OptionTypeInfo::AsCustomSharedPtr<CompactionFilterFactory>(
+ 0, OptionVerificationType::kByNameAllowFromNull,
+ OptionTypeFlags::kNone)}};
+static std::unordered_map<std::string, OptionTypeInfo> user_cf_type_info = {
+ {"user_filter",
+ OptionTypeInfo::AsCustomRawPtr<const CompactionFilter>(
+ 0, OptionVerificationType::kByName, OptionTypeFlags::kAllowNull)}};
+
+TtlCompactionFilter::TtlCompactionFilter(
+ int32_t ttl, SystemClock* clock, const CompactionFilter* _user_comp_filter,
+ std::unique_ptr<const CompactionFilter> _user_comp_filter_from_factory)
+ : LayeredCompactionFilterBase(_user_comp_filter,
+ std::move(_user_comp_filter_from_factory)),
+ ttl_(ttl),
+ clock_(clock) {
+ RegisterOptions("TTL", &ttl_, &ttl_type_info);
+ RegisterOptions("UserFilter", &user_comp_filter_, &user_cf_type_info);
+}
+
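+// Drop the entry if its timestamp marks it as stale; otherwise strip the
+// timestamp suffix, delegate to the user's compaction filter (if any), and
+// re-append the original timestamp when the user filter changes the value.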
+bool TtlCompactionFilter::Filter(int level, const Slice& key,
+ const Slice& old_val, std::string* new_val,
+ bool* value_changed) const {
+ if (DBWithTTLImpl::IsStale(old_val, ttl_, clock_)) {
+ return true;
+ }
+ if (user_comp_filter() == nullptr) {
+ return false;
+ }
+ assert(old_val.size() >= DBWithTTLImpl::kTSLength);
+ Slice old_val_without_ts(old_val.data(),
+ old_val.size() - DBWithTTLImpl::kTSLength);
+ if (user_comp_filter()->Filter(level, key, old_val_without_ts, new_val,
+ value_changed)) {
+ return true;
+ }
+ if (*value_changed) {
+ new_val->append(old_val.data() + old_val.size() - DBWithTTLImpl::kTSLength,
+ DBWithTTLImpl::kTSLength);
+ }
+ return false;
+}
+
+Status TtlCompactionFilter::PrepareOptions(
+ const ConfigOptions& config_options) {
+ if (clock_ == nullptr) {
+ clock_ = config_options.env->GetSystemClock().get();
+ }
+ return LayeredCompactionFilterBase::PrepareOptions(config_options);
+}
+
+Status TtlCompactionFilter::ValidateOptions(
+ const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const {
+ if (clock_ == nullptr) {
+ return Status::InvalidArgument(
+ "SystemClock required by TtlCompactionFilter");
+ } else {
+ return LayeredCompactionFilterBase::ValidateOptions(db_opts, cf_opts);
+ }
+}
+
+TtlCompactionFilterFactory::TtlCompactionFilterFactory(
+ int32_t ttl, SystemClock* clock,
+ std::shared_ptr<CompactionFilterFactory> comp_filter_factory)
+ : ttl_(ttl), clock_(clock), user_comp_filter_factory_(comp_filter_factory) {
+ RegisterOptions("UserOptions", &user_comp_filter_factory_,
+ &ttl_cff_type_info);
+ RegisterOptions("TTL", &ttl_, &ttl_type_info);
+}
+
+std::unique_ptr<CompactionFilter>
+TtlCompactionFilterFactory::CreateCompactionFilter(
+ const CompactionFilter::Context& context) {
+ std::unique_ptr<const CompactionFilter> user_comp_filter_from_factory =
+ nullptr;
+ if (user_comp_filter_factory_) {
+ user_comp_filter_from_factory =
+ user_comp_filter_factory_->CreateCompactionFilter(context);
+ }
+
+ return std::unique_ptr<TtlCompactionFilter>(new TtlCompactionFilter(
+ ttl_, clock_, nullptr, std::move(user_comp_filter_from_factory)));
+}
+
+Status TtlCompactionFilterFactory::PrepareOptions(
+ const ConfigOptions& config_options) {
+ if (clock_ == nullptr) {
+ clock_ = config_options.env->GetSystemClock().get();
+ }
+ return CompactionFilterFactory::PrepareOptions(config_options);
+}
+
+Status TtlCompactionFilterFactory::ValidateOptions(
+ const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const {
+ if (clock_ == nullptr) {
+ return Status::InvalidArgument(
+ "SystemClock required by TtlCompactionFilterFactory");
+ } else {
+ return CompactionFilterFactory::ValidateOptions(db_opts, cf_opts);
+ }
+}
+
+int RegisterTtlObjects(ObjectLibrary& library, const std::string& /*arg*/) {
+ library.AddFactory<MergeOperator>(
+ TtlMergeOperator::kClassName(),
+ [](const std::string& /*uri*/, std::unique_ptr<MergeOperator>* guard,
+ std::string* /* errmsg */) {
+ guard->reset(new TtlMergeOperator(nullptr, nullptr));
+ return guard->get();
+ });
+ library.AddFactory<CompactionFilterFactory>(
+ TtlCompactionFilterFactory::kClassName(),
+ [](const std::string& /*uri*/,
+ std::unique_ptr<CompactionFilterFactory>* guard,
+ std::string* /* errmsg */) {
+ guard->reset(new TtlCompactionFilterFactory(0, nullptr, nullptr));
+ return guard->get();
+ });
+ library.AddFactory<CompactionFilter>(
+ TtlCompactionFilter::kClassName(),
+ [](const std::string& /*uri*/,
+ std::unique_ptr<CompactionFilter>* /*guard*/,
+ std::string* /* errmsg */) {
+ return new TtlCompactionFilter(0, nullptr, nullptr);
+ });
+ size_t num_types;
+ return static_cast<int>(library.GetFactoryCount(&num_types));
+}
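+// Illustrative sketch (not part of the upstream file): once RegisterTtlObjects
+// has been added to an ObjectRegistry, the TTL wrappers can be created from a
+// textual config string (as exercised in ttl_test.cc), e.g.:
+//
+//   ConfigOptions config_options;
+//   config_options.registry->AddLibrary("TTL", RegisterTtlObjects, "");
+//   std::shared_ptr<CompactionFilterFactory> cff;
+//   Status s = CompactionFilterFactory::CreateFromString(
+//       config_options, "id=TtlCompactionFilterFactory; ttl=123", &cff);
+//
+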
+// Open the db inside DBWithTTLImpl because the options need a pointer to its
+// ttl
+DBWithTTLImpl::DBWithTTLImpl(DB* db) : DBWithTTL(db), closed_(false) {}
+
+DBWithTTLImpl::~DBWithTTLImpl() {
+ if (!closed_) {
+ Close().PermitUncheckedError();
+ }
+}
+
+Status DBWithTTLImpl::Close() {
+ Status ret = Status::OK();
+ if (!closed_) {
+ Options default_options = GetOptions();
+ // Need to stop background compaction before getting rid of the filter
+ CancelAllBackgroundWork(db_, /* wait = */ true);
+ ret = db_->Close();
+ delete default_options.compaction_filter;
+ closed_ = true;
+ }
+ return ret;
+}
+
+void DBWithTTLImpl::RegisterTtlClasses() {
+ static std::once_flag once;
+ std::call_once(once, [&]() {
+ ObjectRegistry::Default()->AddLibrary("TTL", RegisterTtlObjects, "");
+ });
+}
+
+Status DBWithTTL::Open(const Options& options, const std::string& dbname,
+ DBWithTTL** dbptr, int32_t ttl, bool read_only) {
+ DBOptions db_options(options);
+ ColumnFamilyOptions cf_options(options);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+ std::vector<ColumnFamilyHandle*> handles;
+ Status s = DBWithTTL::Open(db_options, dbname, column_families, &handles,
+ dbptr, {ttl}, read_only);
+ if (s.ok()) {
+ assert(handles.size() == 1);
+    // We can delete the handle since DBImpl always holds a reference to the
+    // default column family
+ delete handles[0];
+ }
+ return s;
+}
+
+Status DBWithTTL::Open(
+ const DBOptions& db_options, const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DBWithTTL** dbptr,
+ const std::vector<int32_t>& ttls, bool read_only) {
+ DBWithTTLImpl::RegisterTtlClasses();
+ if (ttls.size() != column_families.size()) {
+ return Status::InvalidArgument(
+ "ttls size has to be the same as number of column families");
+ }
+
+ SystemClock* clock = (db_options.env == nullptr)
+ ? SystemClock::Default().get()
+ : db_options.env->GetSystemClock().get();
+
+ std::vector<ColumnFamilyDescriptor> column_families_sanitized =
+ column_families;
+ for (size_t i = 0; i < column_families_sanitized.size(); ++i) {
+ DBWithTTLImpl::SanitizeOptions(
+ ttls[i], &column_families_sanitized[i].options, clock);
+ }
+ DB* db;
+
+ Status st;
+ if (read_only) {
+ st = DB::OpenForReadOnly(db_options, dbname, column_families_sanitized,
+ handles, &db);
+ } else {
+ st = DB::Open(db_options, dbname, column_families_sanitized, handles, &db);
+ }
+ if (st.ok()) {
+ *dbptr = new DBWithTTLImpl(db);
+ } else {
+ *dbptr = nullptr;
+ }
+ return st;
+}
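+
+// Illustrative usage sketch (not part of the upstream file); the db path is a
+// placeholder. Note that stale entries are only removed when compaction runs.
+//
+//   Options options;
+//   options.create_if_missing = true;
+//   DBWithTTL* db = nullptr;
+//   Status s = DBWithTTL::Open(options, "/tmp/ttl_example_db", &db,
+//                              /*ttl=*/3600);  // entries expire after ~1 hour
+//   if (s.ok()) {
+//     s = db->Put(WriteOptions(), "key", "value");  // 4-byte TS appended
+//     std::string value;
+//     s = db->Get(ReadOptions(), "key", &value);    // TS stripped on read
+//     s = db->Close();
+//     delete db;
+//   }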
+
+Status DBWithTTLImpl::CreateColumnFamilyWithTtl(
+ const ColumnFamilyOptions& options, const std::string& column_family_name,
+ ColumnFamilyHandle** handle, int ttl) {
+ RegisterTtlClasses();
+ ColumnFamilyOptions sanitized_options = options;
+ DBWithTTLImpl::SanitizeOptions(ttl, &sanitized_options,
+ GetEnv()->GetSystemClock().get());
+
+ return DBWithTTL::CreateColumnFamily(sanitized_options, column_family_name,
+ handle);
+}
+
+Status DBWithTTLImpl::CreateColumnFamily(const ColumnFamilyOptions& options,
+ const std::string& column_family_name,
+ ColumnFamilyHandle** handle) {
+ return CreateColumnFamilyWithTtl(options, column_family_name, handle, 0);
+}
+
+// Appends the current timestamp to the value string.
+// Returns a non-OK status if the current time could not be obtained
+Status DBWithTTLImpl::AppendTS(const Slice& val, std::string* val_with_ts,
+ SystemClock* clock) {
+ val_with_ts->reserve(kTSLength + val.size());
+ char ts_string[kTSLength];
+ int64_t curtime;
+ Status st = clock->GetCurrentTime(&curtime);
+ if (!st.ok()) {
+ return st;
+ }
+ EncodeFixed32(ts_string, (int32_t)curtime);
+ val_with_ts->append(val.data(), val.size());
+ val_with_ts->append(ts_string, kTSLength);
+ return st;
+}
+
+// Returns Corruption if the string is shorter than the timestamp, or if the
+// timestamp refers to a time earlier than the ttl-feature release time
+Status DBWithTTLImpl::SanityCheckTimestamp(const Slice& str) {
+ if (str.size() < kTSLength) {
+ return Status::Corruption("Error: value's length less than timestamp's\n");
+ }
+  // Checks that the TS is not less than kMinTimestamp
+  // Guards against corruption and against a normal database opened
+  // incorrectly in ttl mode
+ int32_t timestamp_value = DecodeFixed32(str.data() + str.size() - kTSLength);
+ if (timestamp_value < kMinTimestamp) {
+ return Status::Corruption("Error: Timestamp < ttl feature release time!\n");
+ }
+ return Status::OK();
+}
+
+// Checks whether the value is stale according to the provided TTL
+bool DBWithTTLImpl::IsStale(const Slice& value, int32_t ttl,
+ SystemClock* clock) {
+ if (ttl <= 0) { // Data is fresh if TTL is non-positive
+ return false;
+ }
+ int64_t curtime;
+ if (!clock->GetCurrentTime(&curtime).ok()) {
+ return false; // Treat the data as fresh if could not get current time
+ }
+ int32_t timestamp_value =
+ DecodeFixed32(value.data() + value.size() - kTSLength);
+ return (timestamp_value + ttl) < curtime;
+}
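+
+// Worked example (illustrative, not part of the upstream file): a Put of value
+// "abc" at curtime=1700000000 stores "abc" followed by the 4-byte fixed-width
+// encoding of 1700000000, i.e. 3 + kTSLength = 7 bytes in total. With ttl=100,
+// IsStale() starts reporting the entry as stale once GetCurrentTime() returns
+// a value greater than 1700000100, and the entry is then dropped by
+// TtlCompactionFilter during the next compaction.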
+
+// Strips the TS from the end of the slice
+Status DBWithTTLImpl::StripTS(PinnableSlice* pinnable_val) {
+ if (pinnable_val->size() < kTSLength) {
+ return Status::Corruption("Bad timestamp in key-value");
+ }
+ // Erasing characters which hold the TS
+ pinnable_val->remove_suffix(kTSLength);
+ return Status::OK();
+}
+
+// Strips the TS from the end of the string
+Status DBWithTTLImpl::StripTS(std::string* str) {
+ if (str->length() < kTSLength) {
+ return Status::Corruption("Bad timestamp in key-value");
+ }
+ // Erasing characters which hold the TS
+ str->erase(str->length() - kTSLength, kTSLength);
+ return Status::OK();
+}
+
+Status DBWithTTLImpl::Put(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& val) {
+ WriteBatch batch;
+ Status st = batch.Put(column_family, key, val);
+ if (st.ok()) {
+ st = Write(options, &batch);
+ }
+ return st;
+}
+
+Status DBWithTTLImpl::Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) {
+ Status st = db_->Get(options, column_family, key, value);
+ if (!st.ok()) {
+ return st;
+ }
+ st = SanityCheckTimestamp(*value);
+ if (!st.ok()) {
+ return st;
+ }
+ return StripTS(value);
+}
+
+std::vector<Status> DBWithTTLImpl::MultiGet(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys, std::vector<std::string>* values) {
+ auto statuses = db_->MultiGet(options, column_family, keys, values);
+ for (size_t i = 0; i < keys.size(); ++i) {
+ if (!statuses[i].ok()) {
+ continue;
+ }
+ statuses[i] = SanityCheckTimestamp((*values)[i]);
+ if (!statuses[i].ok()) {
+ continue;
+ }
+ statuses[i] = StripTS(&(*values)[i]);
+ }
+ return statuses;
+}
+
+bool DBWithTTLImpl::KeyMayExist(const ReadOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key, std::string* value,
+ bool* value_found) {
+ bool ret = db_->KeyMayExist(options, column_family, key, value, value_found);
+ if (ret && value != nullptr && value_found != nullptr && *value_found) {
+ if (!SanityCheckTimestamp(*value).ok() || !StripTS(value).ok()) {
+ return false;
+ }
+ }
+ return ret;
+}
+
+Status DBWithTTLImpl::Merge(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) {
+ WriteBatch batch;
+ Status st = batch.Merge(column_family, key, value);
+ if (st.ok()) {
+ st = Write(options, &batch);
+ }
+ return st;
+}
+
+Status DBWithTTLImpl::Write(const WriteOptions& opts, WriteBatch* updates) {
+ class Handler : public WriteBatch::Handler {
+ public:
+ explicit Handler(SystemClock* clock) : clock_(clock) {}
+ WriteBatch updates_ttl;
+ Status PutCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ std::string value_with_ts;
+ Status st = AppendTS(value, &value_with_ts, clock_);
+ if (!st.ok()) {
+ return st;
+ }
+ return WriteBatchInternal::Put(&updates_ttl, column_family_id, key,
+ value_with_ts);
+ }
+ Status MergeCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ std::string value_with_ts;
+ Status st = AppendTS(value, &value_with_ts, clock_);
+ if (!st.ok()) {
+ return st;
+ }
+ return WriteBatchInternal::Merge(&updates_ttl, column_family_id, key,
+ value_with_ts);
+ }
+ Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
+ return WriteBatchInternal::Delete(&updates_ttl, column_family_id, key);
+ }
+ Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key,
+ const Slice& end_key) override {
+ return WriteBatchInternal::DeleteRange(&updates_ttl, column_family_id,
+ begin_key, end_key);
+ }
+ void LogData(const Slice& blob) override { updates_ttl.PutLogData(blob); }
+
+ private:
+ SystemClock* clock_;
+ };
+ Handler handler(GetEnv()->GetSystemClock().get());
+ Status st = updates->Iterate(&handler);
+ if (!st.ok()) {
+ return st;
+ } else {
+ return db_->Write(opts, &(handler.updates_ttl));
+ }
+}
+
+Iterator* DBWithTTLImpl::NewIterator(const ReadOptions& opts,
+ ColumnFamilyHandle* column_family) {
+ return new TtlIterator(db_->NewIterator(opts, column_family));
+}
+
+void DBWithTTLImpl::SetTtl(ColumnFamilyHandle* h, int32_t ttl) {
+ std::shared_ptr<TtlCompactionFilterFactory> filter;
+ Options opts;
+ opts = GetOptions(h);
+ filter = std::static_pointer_cast<TtlCompactionFilterFactory>(
+ opts.compaction_filter_factory);
+ if (!filter) return;
+ filter->SetTtl(ttl);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/ttl/db_ttl_impl.h b/src/rocksdb/utilities/ttl/db_ttl_impl.h
new file mode 100644
index 000000000..dd67a6ddc
--- /dev/null
+++ b/src/rocksdb/utilities/ttl/db_ttl_impl.h
@@ -0,0 +1,245 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+#include <deque>
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "utilities/compaction_filters/layered_compaction_filter_base.h"
+
+#ifdef _WIN32
+// Windows API macro interference
+#undef GetCurrentTime
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+struct ConfigOptions;
+class ObjectLibrary;
+class ObjectRegistry;
+class DBWithTTLImpl : public DBWithTTL {
+ public:
+ static void SanitizeOptions(int32_t ttl, ColumnFamilyOptions* options,
+ SystemClock* clock);
+
+ static void RegisterTtlClasses();
+ explicit DBWithTTLImpl(DB* db);
+
+ virtual ~DBWithTTLImpl();
+
+ virtual Status Close() override;
+
+ Status CreateColumnFamilyWithTtl(const ColumnFamilyOptions& options,
+ const std::string& column_family_name,
+ ColumnFamilyHandle** handle,
+ int ttl) override;
+
+ Status CreateColumnFamily(const ColumnFamilyOptions& options,
+ const std::string& column_family_name,
+ ColumnFamilyHandle** handle) override;
+
+ using StackableDB::Put;
+ virtual Status Put(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& val) override;
+
+ using StackableDB::Get;
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) override;
+
+ using StackableDB::MultiGet;
+ virtual std::vector<Status> MultiGet(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys,
+ std::vector<std::string>* values) override;
+
+ using StackableDB::KeyMayExist;
+ virtual bool KeyMayExist(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ std::string* value,
+ bool* value_found = nullptr) override;
+
+ using StackableDB::Merge;
+ virtual Status Merge(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) override;
+
+ virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override;
+
+ using StackableDB::NewIterator;
+ virtual Iterator* NewIterator(const ReadOptions& opts,
+ ColumnFamilyHandle* column_family) override;
+
+ virtual DB* GetBaseDB() override { return db_; }
+
+ static bool IsStale(const Slice& value, int32_t ttl, SystemClock* clock);
+
+ static Status AppendTS(const Slice& val, std::string* val_with_ts,
+ SystemClock* clock);
+
+ static Status SanityCheckTimestamp(const Slice& str);
+
+ static Status StripTS(std::string* str);
+
+ static Status StripTS(PinnableSlice* str);
+
+ static const uint32_t kTSLength = sizeof(int32_t); // size of timestamp
+
+ static const int32_t kMinTimestamp = 1368146402; // 05/09/2013:5:40PM GMT-8
+
+ static const int32_t kMaxTimestamp = 2147483647; // 01/18/2038:7:14PM GMT-8
+
+ void SetTtl(int32_t ttl) override { SetTtl(DefaultColumnFamily(), ttl); }
+
+ void SetTtl(ColumnFamilyHandle* h, int32_t ttl) override;
+
+ private:
+  // Remembers whether Close() has completed
+ bool closed_;
+};
+
+class TtlIterator : public Iterator {
+ public:
+ explicit TtlIterator(Iterator* iter) : iter_(iter) { assert(iter_); }
+
+ ~TtlIterator() { delete iter_; }
+
+ bool Valid() const override { return iter_->Valid(); }
+
+ void SeekToFirst() override { iter_->SeekToFirst(); }
+
+ void SeekToLast() override { iter_->SeekToLast(); }
+
+ void Seek(const Slice& target) override { iter_->Seek(target); }
+
+ void SeekForPrev(const Slice& target) override { iter_->SeekForPrev(target); }
+
+ void Next() override { iter_->Next(); }
+
+ void Prev() override { iter_->Prev(); }
+
+ Slice key() const override { return iter_->key(); }
+
+ int32_t ttl_timestamp() const {
+ return DecodeFixed32(iter_->value().data() + iter_->value().size() -
+ DBWithTTLImpl::kTSLength);
+ }
+
+ Slice value() const override {
+ // TODO: handle timestamp corruption like in general iterator semantics
+ assert(DBWithTTLImpl::SanityCheckTimestamp(iter_->value()).ok());
+ Slice trimmed_value = iter_->value();
+ trimmed_value.size_ -= DBWithTTLImpl::kTSLength;
+ return trimmed_value;
+ }
+
+ Status status() const override { return iter_->status(); }
+
+ private:
+ Iterator* iter_;
+};
+
+class TtlCompactionFilter : public LayeredCompactionFilterBase {
+ public:
+ TtlCompactionFilter(int32_t ttl, SystemClock* clock,
+ const CompactionFilter* _user_comp_filter,
+ std::unique_ptr<const CompactionFilter>
+ _user_comp_filter_from_factory = nullptr);
+
+ virtual bool Filter(int level, const Slice& key, const Slice& old_val,
+ std::string* new_val, bool* value_changed) const override;
+
+ const char* Name() const override { return kClassName(); }
+ static const char* kClassName() { return "TtlCompactionFilter"; }
+ bool IsInstanceOf(const std::string& name) const override {
+ if (name == "Delete By TTL") {
+ return true;
+ } else {
+ return LayeredCompactionFilterBase::IsInstanceOf(name);
+ }
+ }
+
+ Status PrepareOptions(const ConfigOptions& config_options) override;
+ Status ValidateOptions(const DBOptions& db_opts,
+ const ColumnFamilyOptions& cf_opts) const override;
+
+ private:
+ int32_t ttl_;
+ SystemClock* clock_;
+};
+
+class TtlCompactionFilterFactory : public CompactionFilterFactory {
+ public:
+ TtlCompactionFilterFactory(
+ int32_t ttl, SystemClock* clock,
+ std::shared_ptr<CompactionFilterFactory> comp_filter_factory);
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override;
+ void SetTtl(int32_t ttl) { ttl_ = ttl; }
+
+ const char* Name() const override { return kClassName(); }
+ static const char* kClassName() { return "TtlCompactionFilterFactory"; }
+ Status PrepareOptions(const ConfigOptions& config_options) override;
+ Status ValidateOptions(const DBOptions& db_opts,
+ const ColumnFamilyOptions& cf_opts) const override;
+ const Customizable* Inner() const override {
+ return user_comp_filter_factory_.get();
+ }
+
+ private:
+ int32_t ttl_;
+ SystemClock* clock_;
+ std::shared_ptr<CompactionFilterFactory> user_comp_filter_factory_;
+};
+
+class TtlMergeOperator : public MergeOperator {
+ public:
+ explicit TtlMergeOperator(const std::shared_ptr<MergeOperator>& merge_op,
+ SystemClock* clock);
+
+ bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override;
+
+ bool PartialMergeMulti(const Slice& key,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value, Logger* logger) const override;
+
+ static const char* kClassName() { return "TtlMergeOperator"; }
+
+ const char* Name() const override { return kClassName(); }
+ bool IsInstanceOf(const std::string& name) const override {
+ if (name == "Merge By TTL") {
+ return true;
+ } else {
+ return MergeOperator::IsInstanceOf(name);
+ }
+ }
+
+ Status PrepareOptions(const ConfigOptions& config_options) override;
+ Status ValidateOptions(const DBOptions& db_opts,
+ const ColumnFamilyOptions& cf_opts) const override;
+ const Customizable* Inner() const override { return user_merge_op_.get(); }
+
+ private:
+ std::shared_ptr<MergeOperator> user_merge_op_;
+ SystemClock* clock_;
+};
+extern "C" {
+int RegisterTtlObjects(ObjectLibrary& library, const std::string& /*arg*/);
+} // extern "C"
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/ttl/ttl_test.cc b/src/rocksdb/utilities/ttl/ttl_test.cc
new file mode 100644
index 000000000..a42e0acb4
--- /dev/null
+++ b/src/rocksdb/utilities/ttl/ttl_test.cc
@@ -0,0 +1,912 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include <map>
+#include <memory>
+
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "test_util/testharness.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators/bytesxor.h"
+#include "utilities/ttl/db_ttl_impl.h"
+#ifndef OS_WIN
+#include <unistd.h>
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+using KVMap = std::map<std::string, std::string>;
+
+enum BatchOperation { OP_PUT = 0, OP_DELETE = 1 };
+} // namespace
+
+class SpecialTimeEnv : public EnvWrapper {
+ public:
+ explicit SpecialTimeEnv(Env* base) : EnvWrapper(base) {
+ EXPECT_OK(base->GetCurrentTime(&current_time_));
+ }
+ const char* Name() const override { return "SpecialTimeEnv"; }
+ void Sleep(int64_t sleep_time) { current_time_ += sleep_time; }
+ Status GetCurrentTime(int64_t* current_time) override {
+ *current_time = current_time_;
+ return Status::OK();
+ }
+
+ private:
+ int64_t current_time_ = 0;
+};
+
+class TtlTest : public testing::Test {
+ public:
+ TtlTest() {
+ env_.reset(new SpecialTimeEnv(Env::Default()));
+ dbname_ = test::PerThreadDBPath("db_ttl");
+ options_.create_if_missing = true;
+ options_.env = env_.get();
+    // Ensure that compaction kicks in so that timestamps are always stripped
+    // from kvs
+    options_.max_compaction_bytes = 1;
+    // Compaction should always take place from level0 for determinism
+ db_ttl_ = nullptr;
+ EXPECT_OK(DestroyDB(dbname_, Options()));
+ }
+
+ ~TtlTest() override {
+ CloseTtl();
+ EXPECT_OK(DestroyDB(dbname_, Options()));
+ }
+
+  // Opens the database with TTL support when no TTL is provided
+ void OpenTtl() {
+ ASSERT_TRUE(db_ttl_ ==
+ nullptr); // db should be closed before opening again
+ ASSERT_OK(DBWithTTL::Open(options_, dbname_, &db_ttl_));
+ }
+
+  // Opens the database with TTL support with the given ttl
+ void OpenTtl(int32_t ttl) {
+ ASSERT_TRUE(db_ttl_ == nullptr);
+ ASSERT_OK(DBWithTTL::Open(options_, dbname_, &db_ttl_, ttl));
+ }
+
+ // Open with TestFilter compaction filter
+ void OpenTtlWithTestCompaction(int32_t ttl) {
+ options_.compaction_filter_factory =
+ std::shared_ptr<CompactionFilterFactory>(
+ new TestFilterFactory(kSampleSize_, kNewValue_));
+ OpenTtl(ttl);
+ }
+
+ // Open database with TTL support in read_only mode
+ void OpenReadOnlyTtl(int32_t ttl) {
+ ASSERT_TRUE(db_ttl_ == nullptr);
+ ASSERT_OK(DBWithTTL::Open(options_, dbname_, &db_ttl_, ttl, true));
+ }
+
+  // Calls db_ttl_->Close() before deleting db_ttl_
+ void CloseTtl() { CloseTtlHelper(true); }
+
+  // Deletes db_ttl_ without calling db_ttl_->Close() first
+ void CloseTtlNoDBClose() { CloseTtlHelper(false); }
+
+ void CloseTtlHelper(bool close_db) {
+ if (db_ttl_ != nullptr) {
+ if (close_db) {
+ EXPECT_OK(db_ttl_->Close());
+ }
+ delete db_ttl_;
+ db_ttl_ = nullptr;
+ }
+ }
+
+  // Populates kvmap_ with num_entries key-value pairs
+ void MakeKVMap(int64_t num_entries) {
+ kvmap_.clear();
+ int digits = 1;
+ for (int64_t dummy = num_entries; dummy /= 10; ++digits) {
+ }
+ int digits_in_i = 1;
+ for (int64_t i = 0; i < num_entries; i++) {
+ std::string key = "key";
+ std::string value = "value";
+ if (i % 10 == 0) {
+ digits_in_i++;
+ }
+ for (int j = digits_in_i; j < digits; j++) {
+ key.append("0");
+ value.append("0");
+ }
+ AppendNumberTo(&key, i);
+ AppendNumberTo(&value, i);
+ kvmap_[key] = value;
+ }
+ ASSERT_EQ(static_cast<int64_t>(kvmap_.size()),
+ num_entries); // check all insertions done
+ }
+
+  // Makes a write batch with key-values from kvmap_ and Write()s it
+ void MakePutWriteBatch(const BatchOperation* batch_ops, int64_t num_ops) {
+ ASSERT_LE(num_ops, static_cast<int64_t>(kvmap_.size()));
+ static WriteOptions wopts;
+ static FlushOptions flush_opts;
+ WriteBatch batch;
+ kv_it_ = kvmap_.begin();
+ for (int64_t i = 0; i < num_ops && kv_it_ != kvmap_.end(); i++, ++kv_it_) {
+ switch (batch_ops[i]) {
+ case OP_PUT:
+ ASSERT_OK(batch.Put(kv_it_->first, kv_it_->second));
+ break;
+ case OP_DELETE:
+ ASSERT_OK(batch.Delete(kv_it_->first));
+ break;
+ default:
+ FAIL();
+ }
+ }
+ ASSERT_OK(db_ttl_->Write(wopts, &batch));
+ ASSERT_OK(db_ttl_->Flush(flush_opts));
+ }
+
+  // Puts num_entries entries from kvmap_, starting at start_pos_map, into the
+  // database
+ void PutValues(int64_t start_pos_map, int64_t num_entries, bool flush = true,
+ ColumnFamilyHandle* cf = nullptr) {
+ ASSERT_TRUE(db_ttl_);
+ ASSERT_LE(start_pos_map + num_entries, static_cast<int64_t>(kvmap_.size()));
+ static WriteOptions wopts;
+ static FlushOptions flush_opts;
+ kv_it_ = kvmap_.begin();
+ advance(kv_it_, start_pos_map);
+ for (int64_t i = 0; kv_it_ != kvmap_.end() && i < num_entries;
+ i++, ++kv_it_) {
+ ASSERT_OK(cf == nullptr
+ ? db_ttl_->Put(wopts, kv_it_->first, kv_it_->second)
+ : db_ttl_->Put(wopts, cf, kv_it_->first, kv_it_->second));
+ }
+ // Put a mock kv at the end because CompactionFilter doesn't delete last key
+ ASSERT_OK(cf == nullptr ? db_ttl_->Put(wopts, "keymock", "valuemock")
+ : db_ttl_->Put(wopts, cf, "keymock", "valuemock"));
+ if (flush) {
+ if (cf == nullptr) {
+ ASSERT_OK(db_ttl_->Flush(flush_opts));
+ } else {
+ ASSERT_OK(db_ttl_->Flush(flush_opts, cf));
+ }
+ }
+ }
+
+ // Runs a manual compaction
+ Status ManualCompact(ColumnFamilyHandle* cf = nullptr) {
+ assert(db_ttl_);
+ if (cf == nullptr) {
+ return db_ttl_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ } else {
+ return db_ttl_->CompactRange(CompactRangeOptions(), cf, nullptr, nullptr);
+ }
+ }
+
+ // Runs a DeleteRange
+ void MakeDeleteRange(std::string start, std::string end,
+ ColumnFamilyHandle* cf = nullptr) {
+ ASSERT_TRUE(db_ttl_);
+ static WriteOptions wops;
+ WriteBatch wb;
+ ASSERT_OK(cf == nullptr
+ ? wb.DeleteRange(db_ttl_->DefaultColumnFamily(), start, end)
+ : wb.DeleteRange(cf, start, end));
+ ASSERT_OK(db_ttl_->Write(wops, &wb));
+ }
+
+  // Checks the whole kvmap_ to verify that KeyMayExist returns correct values
+ void SimpleKeyMayExistCheck() {
+ static ReadOptions ropts;
+ bool value_found;
+ std::string val;
+ for (auto& kv : kvmap_) {
+ bool ret = db_ttl_->KeyMayExist(ropts, kv.first, &val, &value_found);
+ if (ret == false || value_found == false) {
+ fprintf(stderr,
+ "KeyMayExist could not find key=%s in the database but"
+ " should have\n",
+ kv.first.c_str());
+ FAIL();
+ } else if (val.compare(kv.second) != 0) {
+ fprintf(stderr,
+ " value for key=%s present in database is %s but"
+ " should be %s\n",
+ kv.first.c_str(), val.c_str(), kv.second.c_str());
+ FAIL();
+ }
+ }
+ }
+
+  // Checks the whole kvmap_ to verify that MultiGet returns correct values
+ void SimpleMultiGetTest() {
+ static ReadOptions ropts;
+ std::vector<Slice> keys;
+ std::vector<std::string> values;
+
+ for (auto& kv : kvmap_) {
+ keys.emplace_back(kv.first);
+ }
+
+ auto statuses = db_ttl_->MultiGet(ropts, keys, &values);
+ size_t i = 0;
+ for (auto& kv : kvmap_) {
+ ASSERT_OK(statuses[i]);
+ ASSERT_EQ(values[i], kv.second);
+ ++i;
+ }
+ }
+
+ void CompactCheck(int64_t st_pos, int64_t span, bool check = true,
+ bool test_compaction_change = false,
+ ColumnFamilyHandle* cf = nullptr) {
+ static ReadOptions ropts;
+ kv_it_ = kvmap_.begin();
+ advance(kv_it_, st_pos);
+ std::string v;
+ for (int64_t i = 0; kv_it_ != kvmap_.end() && i < span; i++, ++kv_it_) {
+ Status s = (cf == nullptr) ? db_ttl_->Get(ropts, kv_it_->first, &v)
+ : db_ttl_->Get(ropts, cf, kv_it_->first, &v);
+ if (s.ok() != check) {
+ fprintf(stderr, "key=%s ", kv_it_->first.c_str());
+ if (!s.ok()) {
+ fprintf(stderr, "is absent from db but was expected to be present\n");
+ } else {
+ fprintf(stderr, "is present in db but was expected to be absent\n");
+ }
+ FAIL();
+ } else if (s.ok()) {
+ if (test_compaction_change && v.compare(kNewValue_) != 0) {
+ fprintf(stderr,
+ " value for key=%s present in database is %s but "
+ " should be %s\n",
+ kv_it_->first.c_str(), v.c_str(), kNewValue_.c_str());
+ FAIL();
+ } else if (!test_compaction_change && v.compare(kv_it_->second) != 0) {
+ fprintf(stderr,
+ " value for key=%s present in database is %s but "
+ " should be %s\n",
+ kv_it_->first.c_str(), v.c_str(), kv_it_->second.c_str());
+ FAIL();
+ }
+ }
+ }
+ }
+  // Sleeps for slp_tim seconds, then runs a manual compaction.
+  // Checks the span of kvmap_ starting at st_pos in the db: Gets should
+  // succeed if check is true and fail otherwise.
+  // Also checks that the value we got matches the one inserted (or equals
+  // kNewValue if test_compaction_change is true)
+ void SleepCompactCheck(int slp_tim, int64_t st_pos, int64_t span,
+ bool check = true, bool test_compaction_change = false,
+ ColumnFamilyHandle* cf = nullptr) {
+ ASSERT_TRUE(db_ttl_);
+
+ env_->Sleep(slp_tim);
+ ASSERT_OK(ManualCompact(cf));
+ CompactCheck(st_pos, span, check, test_compaction_change, cf);
+ }
+
+  // Similar to SleepCompactCheck but uses TtlIterator to read from the db
+ void SleepCompactCheckIter(int slp, int st_pos, int64_t span,
+ bool check = true) {
+ ASSERT_TRUE(db_ttl_);
+ env_->Sleep(slp);
+ ASSERT_OK(ManualCompact());
+ static ReadOptions ropts;
+ Iterator* dbiter = db_ttl_->NewIterator(ropts);
+ kv_it_ = kvmap_.begin();
+ advance(kv_it_, st_pos);
+
+ dbiter->Seek(kv_it_->first);
+ if (!check) {
+ if (dbiter->Valid()) {
+ ASSERT_NE(dbiter->value().compare(kv_it_->second), 0);
+ }
+    } else {  // dbiter should have found kvmap_[st_pos]
+ for (int64_t i = st_pos; kv_it_ != kvmap_.end() && i < st_pos + span;
+ i++, ++kv_it_) {
+ ASSERT_TRUE(dbiter->Valid());
+ ASSERT_EQ(dbiter->value().compare(kv_it_->second), 0);
+ dbiter->Next();
+ }
+ }
+ ASSERT_OK(dbiter->status());
+ delete dbiter;
+ }
+
+  // Sets the ttl on an open db
+ void SetTtl(int32_t ttl, ColumnFamilyHandle* cf = nullptr) {
+ ASSERT_TRUE(db_ttl_);
+ cf == nullptr ? db_ttl_->SetTtl(ttl) : db_ttl_->SetTtl(cf, ttl);
+ }
+
+ class TestFilter : public CompactionFilter {
+ public:
+ TestFilter(const int64_t kSampleSize, const std::string& kNewValue)
+ : kSampleSize_(kSampleSize), kNewValue_(kNewValue) {}
+
+    // Works on keys of the form "key<number>"
+    // Drops the key if the number at the end of the key is in
+    // [0, kSampleSize_/3),
+    // Keeps the key if it is in [kSampleSize_/3, 2*kSampleSize_/3),
+    // Changes the value if it is in [2*kSampleSize_/3, kSampleSize_)
+    // E.g. kSampleSize_=6. Drop:key0-1...Keep:key2-3...Change:key4-5...
+ bool Filter(int /*level*/, const Slice& key, const Slice& /*value*/,
+ std::string* new_value, bool* value_changed) const override {
+ assert(new_value != nullptr);
+
+ std::string search_str = "0123456789";
+ std::string key_string = key.ToString();
+ size_t pos = key_string.find_first_of(search_str);
+ int num_key_end;
+ if (pos != std::string::npos) {
+ auto key_substr = key_string.substr(pos, key.size() - pos);
+#ifndef CYGWIN
+ num_key_end = std::stoi(key_substr);
+#else
+ num_key_end = std::strtol(key_substr.c_str(), 0, 10);
+#endif
+
+ } else {
+ return false; // Keep keys not matching the format "key<NUMBER>"
+ }
+
+ int64_t partition = kSampleSize_ / 3;
+ if (num_key_end < partition) {
+ return true;
+ } else if (num_key_end < partition * 2) {
+ return false;
+ } else {
+ *new_value = kNewValue_;
+ *value_changed = true;
+ return false;
+ }
+ }
+
+ const char* Name() const override { return "TestFilter"; }
+
+ private:
+ const int64_t kSampleSize_;
+ const std::string kNewValue_;
+ };
+
+ class TestFilterFactory : public CompactionFilterFactory {
+ public:
+ TestFilterFactory(const int64_t kSampleSize, const std::string& kNewValue)
+ : kSampleSize_(kSampleSize), kNewValue_(kNewValue) {}
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& /*context*/) override {
+ return std::unique_ptr<CompactionFilter>(
+ new TestFilter(kSampleSize_, kNewValue_));
+ }
+
+ const char* Name() const override { return "TestFilterFactory"; }
+
+ private:
+ const int64_t kSampleSize_;
+ const std::string kNewValue_;
+ };
+
+  // Chosen carefully so that Puts, Gets, and Compaction complete within the
+  // 1-second buffer
+ static const int64_t kSampleSize_ = 100;
+ std::string dbname_;
+ DBWithTTL* db_ttl_;
+ std::unique_ptr<SpecialTimeEnv> env_;
+
+ private:
+ Options options_;
+ KVMap kvmap_;
+ KVMap::iterator kv_it_;
+ const std::string kNewValue_ = "new_value";
+ std::unique_ptr<CompactionFilter> test_comp_filter_;
+}; // class TtlTest
+
+// If TTL is non-positive or not provided, the behavior is TTL = infinity.
+// This test opens the db 3 times with such default behavior and inserts a
+// bunch of kvs each time. All kvs should accumulate in the db till the end.
+// Partitions the provided sample size into 3 sets over boundary1 and boundary2
+TEST_F(TtlTest, NoEffect) {
+ MakeKVMap(kSampleSize_);
+ int64_t boundary1 = kSampleSize_ / 3;
+ int64_t boundary2 = 2 * boundary1;
+
+ OpenTtl();
+ PutValues(0, boundary1); // T=0: Set1 never deleted
+ SleepCompactCheck(1, 0, boundary1); // T=1: Set1 still there
+ CloseTtl();
+
+ OpenTtl(0);
+ PutValues(boundary1, boundary2 - boundary1); // T=1: Set2 never deleted
+ SleepCompactCheck(1, 0, boundary2); // T=2: Sets1 & 2 still there
+ CloseTtl();
+
+ OpenTtl(-1);
+ PutValues(boundary2, kSampleSize_ - boundary2); // T=3: Set3 never deleted
+ SleepCompactCheck(1, 0, kSampleSize_, true); // T=4: Sets 1,2,3 still there
+ CloseTtl();
+}
+
+// Reruns the NoEffect test with a different version of the CloseTtl
+// function, where the db is deleted directly without being closed.
+TEST_F(TtlTest, DestructWithoutClose) {
+ MakeKVMap(kSampleSize_);
+ int64_t boundary1 = kSampleSize_ / 3;
+ int64_t boundary2 = 2 * boundary1;
+
+ OpenTtl();
+ PutValues(0, boundary1); // T=0: Set1 never deleted
+ SleepCompactCheck(1, 0, boundary1); // T=1: Set1 still there
+ CloseTtlNoDBClose();
+
+ OpenTtl(0);
+ PutValues(boundary1, boundary2 - boundary1); // T=1: Set2 never deleted
+ SleepCompactCheck(1, 0, boundary2); // T=2: Sets1 & 2 still there
+ CloseTtlNoDBClose();
+
+ OpenTtl(-1);
+ PutValues(boundary2, kSampleSize_ - boundary2); // T=3: Set3 never deleted
+ SleepCompactCheck(1, 0, kSampleSize_, true); // T=4: Sets 1,2,3 still there
+ CloseTtlNoDBClose();
+}
+
+// Puts a set of values and checks its presence using Get during ttl
+TEST_F(TtlTest, PresentDuringTTL) {
+ MakeKVMap(kSampleSize_);
+
+ OpenTtl(2); // T=0:Open the db with ttl = 2
+ PutValues(0, kSampleSize_); // T=0:Insert Set1. Delete at t=2
+ SleepCompactCheck(1, 0, kSampleSize_,
+ true); // T=1:Set1 should still be there
+ CloseTtl();
+}
+
+// Puts a set of values and checks its absence using Get after ttl
+TEST_F(TtlTest, AbsentAfterTTL) {
+ MakeKVMap(kSampleSize_);
+
+  OpenTtl(1);                  // T=0:Open the db with ttl = 1
+  PutValues(0, kSampleSize_);  // T=0:Insert Set1. Delete at t=1
+ SleepCompactCheck(2, 0, kSampleSize_, false); // T=2:Set1 should not be there
+ CloseTtl();
+}
+
+// Resets the timestamp of a set of kvs by updating them and checks that they
+// are not deleted according to the old timestamp
+TEST_F(TtlTest, ResetTimestamp) {
+ MakeKVMap(kSampleSize_);
+
+ OpenTtl(3);
+ PutValues(0, kSampleSize_); // T=0: Insert Set1. Delete at t=3
+ env_->Sleep(2); // T=2
+ PutValues(0, kSampleSize_); // T=2: Insert Set1. Delete at t=5
+ SleepCompactCheck(2, 0, kSampleSize_); // T=4: Set1 should still be there
+ CloseTtl();
+}
+
+// Similar to PresentDuringTTL but uses Iterator
+TEST_F(TtlTest, IterPresentDuringTTL) {
+ MakeKVMap(kSampleSize_);
+
+ OpenTtl(2);
+ PutValues(0, kSampleSize_); // T=0: Insert. Delete at t=2
+ SleepCompactCheckIter(1, 0, kSampleSize_); // T=1: Set should be there
+ CloseTtl();
+}
+
+// Similar to AbsentAfterTTL but uses Iterator
+TEST_F(TtlTest, IterAbsentAfterTTL) {
+ MakeKVMap(kSampleSize_);
+
+ OpenTtl(1);
+ PutValues(0, kSampleSize_); // T=0: Insert. Delete at t=1
+ SleepCompactCheckIter(2, 0, kSampleSize_, false); // T=2: Should not be there
+ CloseTtl();
+}
+
+// Checks presence while opening the same db more than once with the same ttl
+// Note: The second open will open the same db
+TEST_F(TtlTest, MultiOpenSamePresent) {
+ MakeKVMap(kSampleSize_);
+
+ OpenTtl(2);
+ PutValues(0, kSampleSize_); // T=0: Insert. Delete at t=2
+ CloseTtl();
+
+ OpenTtl(2); // T=0. Delete at t=2
+ SleepCompactCheck(1, 0, kSampleSize_); // T=1: Set should be there
+ CloseTtl();
+}
+
+// Checks absence while opening the same db more than once with the same ttl
+// Note: The second open will open the same db
+TEST_F(TtlTest, MultiOpenSameAbsent) {
+ MakeKVMap(kSampleSize_);
+
+ OpenTtl(1);
+ PutValues(0, kSampleSize_); // T=0: Insert. Delete at t=1
+ CloseTtl();
+
+ OpenTtl(1); // T=0.Delete at t=1
+ SleepCompactCheck(2, 0, kSampleSize_, false); // T=2: Set should not be there
+ CloseTtl();
+}
+
+// Checks presence while opening the same db more than once with bigger ttl
+TEST_F(TtlTest, MultiOpenDifferent) {
+ MakeKVMap(kSampleSize_);
+
+ OpenTtl(1);
+ PutValues(0, kSampleSize_); // T=0: Insert. Delete at t=1
+ CloseTtl();
+
+ OpenTtl(3); // T=0: Set deleted at t=3
+ SleepCompactCheck(2, 0, kSampleSize_); // T=2: Set should be there
+ CloseTtl();
+}
+
+// Checks presence during ttl in read_only mode
+TEST_F(TtlTest, ReadOnlyPresentForever) {
+ MakeKVMap(kSampleSize_);
+
+ OpenTtl(1); // T=0:Open the db normally
+ PutValues(0, kSampleSize_); // T=0:Insert Set1. Delete at t=1
+ CloseTtl();
+
+ OpenReadOnlyTtl(1);
+ ASSERT_TRUE(db_ttl_);
+
+ env_->Sleep(2);
+ Status s = ManualCompact(); // T=2:Set1 should still be there
+ ASSERT_TRUE(s.IsNotSupported());
+ CompactCheck(0, kSampleSize_);
+ CloseTtl();
+}
+
+// Checks whether WriteBatch works well with TTL
+// Puts all kvs in kvmap_ in a batch and writes it, then deletes the first half
+TEST_F(TtlTest, WriteBatchTest) {
+ MakeKVMap(kSampleSize_);
+ BatchOperation batch_ops[kSampleSize_];
+ for (int i = 0; i < kSampleSize_; i++) {
+ batch_ops[i] = OP_PUT;
+ }
+
+ OpenTtl(2);
+ MakePutWriteBatch(batch_ops, kSampleSize_);
+ for (int i = 0; i < kSampleSize_ / 2; i++) {
+ batch_ops[i] = OP_DELETE;
+ }
+ MakePutWriteBatch(batch_ops, kSampleSize_ / 2);
+ SleepCompactCheck(0, 0, kSampleSize_ / 2, false);
+ SleepCompactCheck(0, kSampleSize_ / 2, kSampleSize_ - kSampleSize_ / 2);
+ CloseTtl();
+}
+
+// Checks user's compaction filter for correctness with TTL logic
+TEST_F(TtlTest, CompactionFilter) {
+ MakeKVMap(kSampleSize_);
+
+ OpenTtlWithTestCompaction(1);
+ PutValues(0, kSampleSize_); // T=0:Insert Set1. Delete at t=1
+ // T=2: TTL logic takes precedence over TestFilter:-Set1 should not be there
+ SleepCompactCheck(2, 0, kSampleSize_, false);
+ CloseTtl();
+
+ OpenTtlWithTestCompaction(3);
+ PutValues(0, kSampleSize_); // T=0:Insert Set1.
+ int64_t partition = kSampleSize_ / 3;
+ SleepCompactCheck(1, 0, partition, false); // Part dropped
+ SleepCompactCheck(0, partition, partition); // Part kept
+ SleepCompactCheck(0, 2 * partition, partition, true, true); // Part changed
+ CloseTtl();
+}
+
+// Inserts some key-values that KeyMayExist should be able to get and checks
+// that the returned values are correct
+TEST_F(TtlTest, KeyMayExist) {
+ MakeKVMap(kSampleSize_);
+
+ OpenTtl();
+ PutValues(0, kSampleSize_, false);
+
+ SimpleKeyMayExistCheck();
+
+ CloseTtl();
+}
+
+TEST_F(TtlTest, MultiGetTest) {
+ MakeKVMap(kSampleSize_);
+
+ OpenTtl();
+ PutValues(0, kSampleSize_, false);
+
+ SimpleMultiGetTest();
+
+ CloseTtl();
+}
+
+TEST_F(TtlTest, ColumnFamiliesTest) {
+ DB* db;
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_.get();
+
+  ASSERT_OK(DB::Open(options, dbname_, &db));
+ ColumnFamilyHandle* handle;
+ ASSERT_OK(db->CreateColumnFamily(ColumnFamilyOptions(options),
+ "ttl_column_family", &handle));
+
+ delete handle;
+ delete db;
+
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.push_back(ColumnFamilyDescriptor(
+ kDefaultColumnFamilyName, ColumnFamilyOptions(options)));
+ column_families.push_back(ColumnFamilyDescriptor(
+ "ttl_column_family", ColumnFamilyOptions(options)));
+
+ std::vector<ColumnFamilyHandle*> handles;
+
+ ASSERT_OK(DBWithTTL::Open(DBOptions(options), dbname_, column_families,
+ &handles, &db_ttl_, {3, 5}, false));
+ ASSERT_EQ(handles.size(), 2U);
+ ColumnFamilyHandle* new_handle;
+ ASSERT_OK(db_ttl_->CreateColumnFamilyWithTtl(options, "ttl_column_family_2",
+ &new_handle, 2));
+ handles.push_back(new_handle);
+
+ MakeKVMap(kSampleSize_);
+ PutValues(0, kSampleSize_, false, handles[0]);
+ PutValues(0, kSampleSize_, false, handles[1]);
+ PutValues(0, kSampleSize_, false, handles[2]);
+
+ // everything should be there after 1 second
+ SleepCompactCheck(1, 0, kSampleSize_, true, false, handles[0]);
+ SleepCompactCheck(0, 0, kSampleSize_, true, false, handles[1]);
+ SleepCompactCheck(0, 0, kSampleSize_, true, false, handles[2]);
+
+ // only column family 1 should be alive after 4 seconds
+ SleepCompactCheck(3, 0, kSampleSize_, false, false, handles[0]);
+ SleepCompactCheck(0, 0, kSampleSize_, true, false, handles[1]);
+ SleepCompactCheck(0, 0, kSampleSize_, false, false, handles[2]);
+
+ // nothing should be there after 6 seconds
+ SleepCompactCheck(2, 0, kSampleSize_, false, false, handles[0]);
+ SleepCompactCheck(0, 0, kSampleSize_, false, false, handles[1]);
+ SleepCompactCheck(0, 0, kSampleSize_, false, false, handles[2]);
+
+ for (auto h : handles) {
+ delete h;
+ }
+ delete db_ttl_;
+ db_ttl_ = nullptr;
+}
+
+// Changes the ttl on an open db and checks that the values are still present
+// before the new, larger ttl expires
+TEST_F(TtlTest, ChangeTtlOnOpenDb) {
+ MakeKVMap(kSampleSize_);
+
+  OpenTtl(1);  // T=0:Open the db with ttl = 1
+ SetTtl(3);
+  PutValues(0, kSampleSize_);  // T=0:Insert Set1. Delete at t=3
+ SleepCompactCheck(2, 0, kSampleSize_, true); // T=2:Set1 should be there
+ CloseTtl();
+}
+
+// Tests DeleteRange for DBWithTTL
+TEST_F(TtlTest, DeleteRangeTest) {
+ OpenTtl();
+ ASSERT_OK(db_ttl_->Put(WriteOptions(), "a", "val"));
+ MakeDeleteRange("a", "b");
+ ASSERT_OK(db_ttl_->Put(WriteOptions(), "c", "val"));
+ MakeDeleteRange("b", "d");
+ ASSERT_OK(db_ttl_->Put(WriteOptions(), "e", "val"));
+ MakeDeleteRange("d", "e");
+ // first iteration verifies query correctness in memtable, second verifies
+ // query correctness for a single SST file
+ for (int i = 0; i < 2; i++) {
+ if (i > 0) {
+ ASSERT_OK(db_ttl_->Flush(FlushOptions()));
+ }
+ std::string value;
+ ASSERT_TRUE(db_ttl_->Get(ReadOptions(), "a", &value).IsNotFound());
+ ASSERT_TRUE(db_ttl_->Get(ReadOptions(), "c", &value).IsNotFound());
+ ASSERT_OK(db_ttl_->Get(ReadOptions(), "e", &value));
+ }
+ CloseTtl();
+}
+
+class DummyFilter : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ return false;
+ }
+
+ const char* Name() const override { return kClassName(); }
+ static const char* kClassName() { return "DummyFilter"; }
+};
+
+class DummyFilterFactory : public CompactionFilterFactory {
+ public:
+ const char* Name() const override { return kClassName(); }
+ static const char* kClassName() { return "DummyFilterFactory"; }
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context&) override {
+ std::unique_ptr<CompactionFilter> f(new DummyFilter());
+ return f;
+ }
+};
+
+static int RegisterTestObjects(ObjectLibrary& library,
+ const std::string& /*arg*/) {
+ library.AddFactory<CompactionFilter>(
+ "DummyFilter", [](const std::string& /*uri*/,
+ std::unique_ptr<CompactionFilter>* /*guard*/,
+ std::string* /* errmsg */) {
+ static DummyFilter dummy;
+ return &dummy;
+ });
+ library.AddFactory<CompactionFilterFactory>(
+ "DummyFilterFactory", [](const std::string& /*uri*/,
+ std::unique_ptr<CompactionFilterFactory>* guard,
+ std::string* /* errmsg */) {
+ guard->reset(new DummyFilterFactory());
+ return guard->get();
+ });
+ return 2;
+}
+
+class TtlOptionsTest : public testing::Test {
+ public:
+ TtlOptionsTest() {
+ config_options_.registry->AddLibrary("RegisterTtlObjects",
+ RegisterTtlObjects, "");
+ config_options_.registry->AddLibrary("RegisterTtlTestObjects",
+ RegisterTestObjects, "");
+ }
+ ConfigOptions config_options_;
+};
+
+TEST_F(TtlOptionsTest, LoadTtlCompactionFilter) {
+ const CompactionFilter* filter = nullptr;
+
+ ASSERT_OK(CompactionFilter::CreateFromString(
+ config_options_, TtlCompactionFilter::kClassName(), &filter));
+ ASSERT_NE(filter, nullptr);
+ ASSERT_STREQ(filter->Name(), TtlCompactionFilter::kClassName());
+ auto ttl = filter->GetOptions<int32_t>("TTL");
+ ASSERT_NE(ttl, nullptr);
+ ASSERT_EQ(*ttl, 0);
+ ASSERT_OK(filter->ValidateOptions(DBOptions(), ColumnFamilyOptions()));
+ delete filter;
+ filter = nullptr;
+
+ ASSERT_OK(CompactionFilter::CreateFromString(
+ config_options_, "id=TtlCompactionFilter; ttl=123", &filter));
+ ASSERT_NE(filter, nullptr);
+ ttl = filter->GetOptions<int32_t>("TTL");
+ ASSERT_NE(ttl, nullptr);
+ ASSERT_EQ(*ttl, 123);
+ ASSERT_OK(filter->ValidateOptions(DBOptions(), ColumnFamilyOptions()));
+ delete filter;
+ filter = nullptr;
+
+ ASSERT_OK(CompactionFilter::CreateFromString(
+ config_options_,
+ "id=TtlCompactionFilter; ttl=456; user_filter=DummyFilter;", &filter));
+ ASSERT_NE(filter, nullptr);
+ auto inner = filter->CheckedCast<DummyFilter>();
+ ASSERT_NE(inner, nullptr);
+ ASSERT_OK(filter->ValidateOptions(DBOptions(), ColumnFamilyOptions()));
+ std::string mismatch;
+ std::string opts_str = filter->ToString(config_options_);
+ const CompactionFilter* copy = nullptr;
+ ASSERT_OK(
+ CompactionFilter::CreateFromString(config_options_, opts_str, &copy));
+ ASSERT_TRUE(filter->AreEquivalent(config_options_, copy, &mismatch));
+ delete filter;
+ delete copy;
+}
+
+TEST_F(TtlOptionsTest, LoadTtlCompactionFilterFactory) {
+ std::shared_ptr<CompactionFilterFactory> cff;
+
+ ASSERT_OK(CompactionFilterFactory::CreateFromString(
+ config_options_, TtlCompactionFilterFactory::kClassName(), &cff));
+ ASSERT_NE(cff.get(), nullptr);
+ ASSERT_STREQ(cff->Name(), TtlCompactionFilterFactory::kClassName());
+ auto ttl = cff->GetOptions<int32_t>("TTL");
+ ASSERT_NE(ttl, nullptr);
+ ASSERT_EQ(*ttl, 0);
+ ASSERT_OK(cff->ValidateOptions(DBOptions(), ColumnFamilyOptions()));
+
+ ASSERT_OK(CompactionFilterFactory::CreateFromString(
+ config_options_, "id=TtlCompactionFilterFactory; ttl=123", &cff));
+ ASSERT_NE(cff.get(), nullptr);
+ ASSERT_STREQ(cff->Name(), TtlCompactionFilterFactory::kClassName());
+ ttl = cff->GetOptions<int32_t>("TTL");
+ ASSERT_NE(ttl, nullptr);
+ ASSERT_EQ(*ttl, 123);
+ ASSERT_OK(cff->ValidateOptions(DBOptions(), ColumnFamilyOptions()));
+
+ ASSERT_OK(CompactionFilterFactory::CreateFromString(
+ config_options_,
+ "id=TtlCompactionFilterFactory; ttl=456; "
+ "user_filter_factory=DummyFilterFactory;",
+ &cff));
+ ASSERT_NE(cff.get(), nullptr);
+ auto filter = cff->CreateCompactionFilter(CompactionFilter::Context());
+ ASSERT_NE(filter.get(), nullptr);
+ auto ttlf = filter->CheckedCast<TtlCompactionFilter>();
+ ASSERT_EQ(filter.get(), ttlf);
+ auto user = filter->CheckedCast<DummyFilter>();
+ ASSERT_NE(user, nullptr);
+ ASSERT_OK(cff->ValidateOptions(DBOptions(), ColumnFamilyOptions()));
+
+ std::string opts_str = cff->ToString(config_options_);
+ std::string mismatch;
+ std::shared_ptr<CompactionFilterFactory> copy;
+ ASSERT_OK(CompactionFilterFactory::CreateFromString(config_options_, opts_str,
+ &copy));
+ ASSERT_TRUE(cff->AreEquivalent(config_options_, copy.get(), &mismatch));
+}
+
+TEST_F(TtlOptionsTest, LoadTtlMergeOperator) {
+ std::shared_ptr<MergeOperator> mo;
+
+ config_options_.invoke_prepare_options = false;
+ ASSERT_OK(MergeOperator::CreateFromString(
+ config_options_, TtlMergeOperator::kClassName(), &mo));
+ ASSERT_NE(mo.get(), nullptr);
+ ASSERT_STREQ(mo->Name(), TtlMergeOperator::kClassName());
+ ASSERT_NOK(mo->ValidateOptions(DBOptions(), ColumnFamilyOptions()));
+
+ config_options_.invoke_prepare_options = true;
+ ASSERT_OK(MergeOperator::CreateFromString(
+ config_options_, "id=TtlMergeOperator; user_operator=bytesxor", &mo));
+ ASSERT_NE(mo.get(), nullptr);
+ ASSERT_STREQ(mo->Name(), TtlMergeOperator::kClassName());
+ ASSERT_OK(mo->ValidateOptions(DBOptions(), ColumnFamilyOptions()));
+ auto ttl_mo = mo->CheckedCast<TtlMergeOperator>();
+ ASSERT_EQ(mo.get(), ttl_mo);
+ auto user = ttl_mo->CheckedCast<BytesXOROperator>();
+ ASSERT_NE(user, nullptr);
+
+ std::string mismatch;
+ std::string opts_str = mo->ToString(config_options_);
+ std::shared_ptr<MergeOperator> copy;
+ ASSERT_OK(MergeOperator::CreateFromString(config_options_, opts_str, &copy));
+ ASSERT_TRUE(mo->AreEquivalent(config_options_, copy.get(), &mismatch));
+}
+} // namespace ROCKSDB_NAMESPACE
+
+// A black-box test for the ttl wrapper around rocksdb
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as DBWithTTL is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/util_merge_operators_test.cc b/src/rocksdb/utilities/util_merge_operators_test.cc
new file mode 100644
index 000000000..fed6f1a75
--- /dev/null
+++ b/src/rocksdb/utilities/util_merge_operators_test.cc
@@ -0,0 +1,100 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class UtilMergeOperatorTest : public testing::Test {
+ public:
+ UtilMergeOperatorTest() {}
+
+ std::string FullMergeV2(std::string existing_value,
+ std::vector<std::string> operands,
+ std::string key = "") {
+ std::string result;
+ Slice result_operand(nullptr, 0);
+
+ Slice existing_value_slice(existing_value);
+ std::vector<Slice> operands_slice(operands.begin(), operands.end());
+
+ const MergeOperator::MergeOperationInput merge_in(
+ key, &existing_value_slice, operands_slice, nullptr);
+ MergeOperator::MergeOperationOutput merge_out(result, result_operand);
+ merge_operator_->FullMergeV2(merge_in, &merge_out);
+
+ if (result_operand.data()) {
+ result.assign(result_operand.data(), result_operand.size());
+ }
+ return result;
+ }
+
+ std::string FullMergeV2(std::vector<std::string> operands,
+ std::string key = "") {
+ std::string result;
+ Slice result_operand(nullptr, 0);
+
+ std::vector<Slice> operands_slice(operands.begin(), operands.end());
+
+ const MergeOperator::MergeOperationInput merge_in(key, nullptr,
+ operands_slice, nullptr);
+ MergeOperator::MergeOperationOutput merge_out(result, result_operand);
+ merge_operator_->FullMergeV2(merge_in, &merge_out);
+
+ if (result_operand.data()) {
+ result.assign(result_operand.data(), result_operand.size());
+ }
+ return result;
+ }
+
+ std::string PartialMerge(std::string left, std::string right,
+ std::string key = "") {
+ std::string result;
+
+ merge_operator_->PartialMerge(key, left, right, &result, nullptr);
+ return result;
+ }
+
+ std::string PartialMergeMulti(std::deque<std::string> operands,
+ std::string key = "") {
+ std::string result;
+ std::deque<Slice> operands_slice(operands.begin(), operands.end());
+
+ merge_operator_->PartialMergeMulti(key, operands_slice, &result, nullptr);
+ return result;
+ }
+
+ protected:
+ std::shared_ptr<MergeOperator> merge_operator_;
+};
+
+TEST_F(UtilMergeOperatorTest, MaxMergeOperator) {
+ merge_operator_ = MergeOperators::CreateMaxOperator();
+
+ EXPECT_EQ("B", FullMergeV2("B", {"A"}));
+ EXPECT_EQ("B", FullMergeV2("A", {"B"}));
+ EXPECT_EQ("", FullMergeV2({"", "", ""}));
+ EXPECT_EQ("A", FullMergeV2({"A"}));
+ EXPECT_EQ("ABC", FullMergeV2({"ABC"}));
+ EXPECT_EQ("Z", FullMergeV2({"ABC", "Z", "C", "AXX"}));
+ EXPECT_EQ("ZZZ", FullMergeV2({"ABC", "CC", "Z", "ZZZ"}));
+ EXPECT_EQ("a", FullMergeV2("a", {"ABC", "CC", "Z", "ZZZ"}));
+
+ EXPECT_EQ("z", PartialMergeMulti({"a", "z", "efqfqwgwew", "aaz", "hhhhh"}));
+
+ EXPECT_EQ("b", PartialMerge("a", "b"));
+ EXPECT_EQ("z", PartialMerge("z", "azzz"));
+ EXPECT_EQ("a", PartialMerge("a", ""));
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/utilities/wal_filter.cc b/src/rocksdb/utilities/wal_filter.cc
new file mode 100644
index 000000000..98bba3610
--- /dev/null
+++ b/src/rocksdb/utilities/wal_filter.cc
@@ -0,0 +1,23 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/wal_filter.h"
+
+#include <memory>
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/customizable_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+Status WalFilter::CreateFromString(const ConfigOptions& config_options,
+ const std::string& value,
+ WalFilter** filter) {
+ Status s =
+ LoadStaticObject<WalFilter>(config_options, value, nullptr, filter);
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc
new file mode 100644
index 000000000..408243b3f
--- /dev/null
+++ b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc
@@ -0,0 +1,695 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/write_batch_with_index.h"
+
+#include <memory>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "memory/arena.h"
+#include "memtable/skiplist.h"
+#include "options/db_options.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/iterator.h"
+#include "util/cast_util.h"
+#include "util/string_util.h"
+#include "utilities/write_batch_with_index/write_batch_with_index_internal.h"
+
+namespace ROCKSDB_NAMESPACE {
+struct WriteBatchWithIndex::Rep {
+ explicit Rep(const Comparator* index_comparator, size_t reserved_bytes = 0,
+ size_t max_bytes = 0, bool _overwrite_key = false,
+ size_t protection_bytes_per_key = 0)
+ : write_batch(reserved_bytes, max_bytes, protection_bytes_per_key,
+ index_comparator ? index_comparator->timestamp_size() : 0),
+ comparator(index_comparator, &write_batch),
+ skip_list(comparator, &arena),
+ overwrite_key(_overwrite_key),
+ last_entry_offset(0),
+ last_sub_batch_offset(0),
+ sub_batch_cnt(1) {}
+ ReadableWriteBatch write_batch;
+ WriteBatchEntryComparator comparator;
+ Arena arena;
+ WriteBatchEntrySkipList skip_list;
+ bool overwrite_key;
+ size_t last_entry_offset;
+ // The starting offset of the last sub-batch. A sub-batch starts right before
+ // inserting a key that is a duplicate of a key in the last sub-batch. Zero,
+ // the default, means that no duplicate key is detected so far.
+ size_t last_sub_batch_offset;
+ // Total number of sub-batches in the write batch. Default is 1.
+ size_t sub_batch_cnt;
+
+ // Remember current offset of internal write batch, which is used as
+ // the starting offset of the next record.
+ void SetLastEntryOffset() { last_entry_offset = write_batch.GetDataSize(); }
+
+ // In overwrite mode, find the existing entry for the same key and update it
+ // to point to the current entry.
+ // Return true if the key is found and updated.
+ bool UpdateExistingEntry(ColumnFamilyHandle* column_family, const Slice& key,
+ WriteType type);
+ bool UpdateExistingEntryWithCfId(uint32_t column_family_id, const Slice& key,
+ WriteType type);
+
+  // Adds the most recent entry to the index.
+  // In overwrite mode, if the key already exists in the index, updates it.
+ void AddOrUpdateIndex(ColumnFamilyHandle* column_family, const Slice& key,
+ WriteType type);
+ void AddOrUpdateIndex(const Slice& key, WriteType type);
+
+  // Allocate an index entry pointing to the last entry in the write batch and
+  // put it into the skip list.
+ void AddNewEntry(uint32_t column_family_id);
+
+ // Clear all updates buffered in this batch.
+ void Clear();
+ void ClearIndex();
+
+ // Rebuild index by reading all records from the batch.
+ // Returns non-ok status on corruption.
+ Status ReBuildIndex();
+};
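+
+// Illustrative example (assumes overwrite_key == true): the sequence
+//   Put("k", "v1"); Put("j", "v2"); Put("k", "v3");
+// starts a second sub-batch at the third write, because "k" duplicates a key
+// already present in the current sub-batch, so sub_batch_cnt becomes 2 and
+// last_sub_batch_offset moves to the offset of the third record.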
+
+bool WriteBatchWithIndex::Rep::UpdateExistingEntry(
+ ColumnFamilyHandle* column_family, const Slice& key, WriteType type) {
+ uint32_t cf_id = GetColumnFamilyID(column_family);
+ return UpdateExistingEntryWithCfId(cf_id, key, type);
+}
+
+bool WriteBatchWithIndex::Rep::UpdateExistingEntryWithCfId(
+ uint32_t column_family_id, const Slice& key, WriteType type) {
+ if (!overwrite_key) {
+ return false;
+ }
+
+ WBWIIteratorImpl iter(column_family_id, &skip_list, &write_batch,
+ &comparator);
+ iter.Seek(key);
+ if (!iter.Valid()) {
+ return false;
+ } else if (!iter.MatchesKey(column_family_id, key)) {
+ return false;
+ } else {
+ // Move to the end of this key (NextKey-Prev)
+ iter.NextKey(); // Move to the next key
+ if (iter.Valid()) {
+ iter.Prev(); // Move back one entry
+ } else {
+ iter.SeekToLast();
+ }
+ }
+ WriteBatchIndexEntry* non_const_entry =
+ const_cast<WriteBatchIndexEntry*>(iter.GetRawEntry());
+ if (LIKELY(last_sub_batch_offset <= non_const_entry->offset)) {
+ last_sub_batch_offset = last_entry_offset;
+ sub_batch_cnt++;
+ }
+ if (type == kMergeRecord) {
+ return false;
+ } else {
+ non_const_entry->offset = last_entry_offset;
+ return true;
+ }
+}
+
+void WriteBatchWithIndex::Rep::AddOrUpdateIndex(
+ ColumnFamilyHandle* column_family, const Slice& key, WriteType type) {
+ if (!UpdateExistingEntry(column_family, key, type)) {
+ uint32_t cf_id = GetColumnFamilyID(column_family);
+ const auto* cf_cmp = GetColumnFamilyUserComparator(column_family);
+ if (cf_cmp != nullptr) {
+ comparator.SetComparatorForCF(cf_id, cf_cmp);
+ }
+ AddNewEntry(cf_id);
+ }
+}
+
+void WriteBatchWithIndex::Rep::AddOrUpdateIndex(const Slice& key,
+ WriteType type) {
+ if (!UpdateExistingEntryWithCfId(0, key, type)) {
+ AddNewEntry(0);
+ }
+}
+
+void WriteBatchWithIndex::Rep::AddNewEntry(uint32_t column_family_id) {
+ const std::string& wb_data = write_batch.Data();
+ Slice entry_ptr = Slice(wb_data.data() + last_entry_offset,
+ wb_data.size() - last_entry_offset);
+ // Extract key
+ Slice key;
+ bool success =
+ ReadKeyFromWriteBatchEntry(&entry_ptr, &key, column_family_id != 0);
+#ifdef NDEBUG
+ (void)success;
+#endif
+ assert(success);
+
+ const Comparator* const ucmp = comparator.GetComparator(column_family_id);
+ size_t ts_sz = ucmp ? ucmp->timestamp_size() : 0;
+
+ if (ts_sz > 0) {
+ key.remove_suffix(ts_sz);
+ }
+
+ auto* mem = arena.Allocate(sizeof(WriteBatchIndexEntry));
+ auto* index_entry =
+ new (mem) WriteBatchIndexEntry(last_entry_offset, column_family_id,
+ key.data() - wb_data.data(), key.size());
+ skip_list.Insert(index_entry);
+}
+
+void WriteBatchWithIndex::Rep::Clear() {
+ write_batch.Clear();
+ ClearIndex();
+}
+
+void WriteBatchWithIndex::Rep::ClearIndex() {
+ skip_list.~WriteBatchEntrySkipList();
+ arena.~Arena();
+ new (&arena) Arena();
+ new (&skip_list) WriteBatchEntrySkipList(comparator, &arena);
+ last_entry_offset = 0;
+ last_sub_batch_offset = 0;
+ sub_batch_cnt = 1;
+}
+
+Status WriteBatchWithIndex::Rep::ReBuildIndex() {
+ Status s;
+
+ ClearIndex();
+
+ if (write_batch.Count() == 0) {
+ // Nothing to re-index
+ return s;
+ }
+
+ size_t offset = WriteBatchInternal::GetFirstOffset(&write_batch);
+
+ Slice input(write_batch.Data());
+ input.remove_prefix(offset);
+
+ // Loop through all entries in Rep and add each one to the index
+ uint32_t found = 0;
+ while (s.ok() && !input.empty()) {
+ Slice key, value, blob, xid;
+ uint32_t column_family_id = 0; // default
+ char tag = 0;
+
+ // set offset of current entry for call to AddNewEntry()
+ last_entry_offset = input.data() - write_batch.Data().data();
+
+ s = ReadRecordFromWriteBatch(&input, &tag, &column_family_id, &key, &value,
+ &blob, &xid);
+ if (!s.ok()) {
+ break;
+ }
+
+ switch (tag) {
+ case kTypeColumnFamilyValue:
+ case kTypeValue:
+ found++;
+ if (!UpdateExistingEntryWithCfId(column_family_id, key, kPutRecord)) {
+ AddNewEntry(column_family_id);
+ }
+ break;
+ case kTypeColumnFamilyDeletion:
+ case kTypeDeletion:
+ found++;
+ if (!UpdateExistingEntryWithCfId(column_family_id, key,
+ kDeleteRecord)) {
+ AddNewEntry(column_family_id);
+ }
+ break;
+ case kTypeColumnFamilySingleDeletion:
+ case kTypeSingleDeletion:
+ found++;
+ if (!UpdateExistingEntryWithCfId(column_family_id, key,
+ kSingleDeleteRecord)) {
+ AddNewEntry(column_family_id);
+ }
+ break;
+ case kTypeColumnFamilyMerge:
+ case kTypeMerge:
+ found++;
+ if (!UpdateExistingEntryWithCfId(column_family_id, key, kMergeRecord)) {
+ AddNewEntry(column_family_id);
+ }
+ break;
+ case kTypeLogData:
+ case kTypeBeginPrepareXID:
+ case kTypeBeginPersistedPrepareXID:
+ case kTypeBeginUnprepareXID:
+ case kTypeEndPrepareXID:
+ case kTypeCommitXID:
+ case kTypeCommitXIDAndTimestamp:
+ case kTypeRollbackXID:
+ case kTypeNoop:
+ break;
+ default:
+ return Status::Corruption(
+ "unknown WriteBatch tag in ReBuildIndex",
+ std::to_string(static_cast<unsigned int>(tag)));
+ }
+ }
+
+ if (s.ok() && found != write_batch.Count()) {
+ s = Status::Corruption("WriteBatch has wrong count");
+ }
+
+ return s;
+}
+
+WriteBatchWithIndex::WriteBatchWithIndex(
+ const Comparator* default_index_comparator, size_t reserved_bytes,
+ bool overwrite_key, size_t max_bytes, size_t protection_bytes_per_key)
+ : rep(new Rep(default_index_comparator, reserved_bytes, max_bytes,
+ overwrite_key, protection_bytes_per_key)) {}
+
+WriteBatchWithIndex::~WriteBatchWithIndex() {}
+
+WriteBatchWithIndex::WriteBatchWithIndex(WriteBatchWithIndex&&) = default;
+
+WriteBatchWithIndex& WriteBatchWithIndex::operator=(WriteBatchWithIndex&&) =
+ default;
+
+WriteBatch* WriteBatchWithIndex::GetWriteBatch() { return &rep->write_batch; }
+
+size_t WriteBatchWithIndex::SubBatchCnt() { return rep->sub_batch_cnt; }
+
+WBWIIterator* WriteBatchWithIndex::NewIterator() {
+ return new WBWIIteratorImpl(0, &(rep->skip_list), &rep->write_batch,
+ &(rep->comparator));
+}
+
+WBWIIterator* WriteBatchWithIndex::NewIterator(
+ ColumnFamilyHandle* column_family) {
+ return new WBWIIteratorImpl(GetColumnFamilyID(column_family),
+ &(rep->skip_list), &rep->write_batch,
+ &(rep->comparator));
+}
+
+Iterator* WriteBatchWithIndex::NewIteratorWithBase(
+ ColumnFamilyHandle* column_family, Iterator* base_iterator,
+ const ReadOptions* read_options) {
+ auto wbwiii =
+ new WBWIIteratorImpl(GetColumnFamilyID(column_family), &(rep->skip_list),
+ &rep->write_batch, &rep->comparator);
+ return new BaseDeltaIterator(column_family, base_iterator, wbwiii,
+ GetColumnFamilyUserComparator(column_family),
+ read_options);
+}
+
+Iterator* WriteBatchWithIndex::NewIteratorWithBase(Iterator* base_iterator) {
+ // default column family's comparator
+ auto wbwiii = new WBWIIteratorImpl(0, &(rep->skip_list), &rep->write_batch,
+ &rep->comparator);
+ return new BaseDeltaIterator(nullptr, base_iterator, wbwiii,
+ rep->comparator.default_comparator());
+}
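+
+// Illustrative usage sketch (assumption; `db` and `cf` are hypothetical
+// caller-owned handles): layering the batch over a DB iterator gives
+// read-your-own-writes semantics before the batch is committed.
+//
+//   WriteBatchWithIndex batch(BytewiseComparator(), 0, /*overwrite_key=*/true);
+//   batch.Put(cf, "k1", "v1");
+//   ReadOptions read_options;
+//   std::unique_ptr<Iterator> iter(batch.NewIteratorWithBase(
+//       cf, db->NewIterator(read_options, cf), &read_options));
+//   iter->Seek("k1");  // sees "v1" from the batch even before a db->Write()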
+
+Status WriteBatchWithIndex::Put(ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value) {
+ rep->SetLastEntryOffset();
+ auto s = rep->write_batch.Put(column_family, key, value);
+ if (s.ok()) {
+ rep->AddOrUpdateIndex(column_family, key, kPutRecord);
+ }
+ return s;
+}
+
+Status WriteBatchWithIndex::Put(const Slice& key, const Slice& value) {
+ rep->SetLastEntryOffset();
+ auto s = rep->write_batch.Put(key, value);
+ if (s.ok()) {
+ rep->AddOrUpdateIndex(key, kPutRecord);
+ }
+ return s;
+}
+
+Status WriteBatchWithIndex::Put(ColumnFamilyHandle* column_family,
+ const Slice& /*key*/, const Slice& /*ts*/,
+ const Slice& /*value*/) {
+ if (!column_family) {
+ return Status::InvalidArgument("column family handle cannot be nullptr");
+ }
+ // TODO: support WBWI::Put() with timestamp.
+ return Status::NotSupported();
+}
+
+Status WriteBatchWithIndex::Delete(ColumnFamilyHandle* column_family,
+ const Slice& key) {
+ rep->SetLastEntryOffset();
+ auto s = rep->write_batch.Delete(column_family, key);
+ if (s.ok()) {
+ rep->AddOrUpdateIndex(column_family, key, kDeleteRecord);
+ }
+ return s;
+}
+
+Status WriteBatchWithIndex::Delete(const Slice& key) {
+ rep->SetLastEntryOffset();
+ auto s = rep->write_batch.Delete(key);
+ if (s.ok()) {
+ rep->AddOrUpdateIndex(key, kDeleteRecord);
+ }
+ return s;
+}
+
+Status WriteBatchWithIndex::Delete(ColumnFamilyHandle* column_family,
+ const Slice& /*key*/, const Slice& /*ts*/) {
+ if (!column_family) {
+ return Status::InvalidArgument("column family handle cannot be nullptr");
+ }
+ // TODO: support WBWI::Delete() with timestamp.
+ return Status::NotSupported();
+}
+
+Status WriteBatchWithIndex::SingleDelete(ColumnFamilyHandle* column_family,
+ const Slice& key) {
+ rep->SetLastEntryOffset();
+ auto s = rep->write_batch.SingleDelete(column_family, key);
+ if (s.ok()) {
+ rep->AddOrUpdateIndex(column_family, key, kSingleDeleteRecord);
+ }
+ return s;
+}
+
+Status WriteBatchWithIndex::SingleDelete(const Slice& key) {
+ rep->SetLastEntryOffset();
+ auto s = rep->write_batch.SingleDelete(key);
+ if (s.ok()) {
+ rep->AddOrUpdateIndex(key, kSingleDeleteRecord);
+ }
+ return s;
+}
+
+Status WriteBatchWithIndex::SingleDelete(ColumnFamilyHandle* column_family,
+ const Slice& /*key*/,
+ const Slice& /*ts*/) {
+ if (!column_family) {
+ return Status::InvalidArgument("column family handle cannot be nullptr");
+ }
+ // TODO: support WBWI::SingleDelete() with timestamp.
+ return Status::NotSupported();
+}
+
+Status WriteBatchWithIndex::Merge(ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value) {
+ rep->SetLastEntryOffset();
+ auto s = rep->write_batch.Merge(column_family, key, value);
+ if (s.ok()) {
+ rep->AddOrUpdateIndex(column_family, key, kMergeRecord);
+ }
+ return s;
+}
+
+Status WriteBatchWithIndex::Merge(const Slice& key, const Slice& value) {
+ rep->SetLastEntryOffset();
+ auto s = rep->write_batch.Merge(key, value);
+ if (s.ok()) {
+ rep->AddOrUpdateIndex(key, kMergeRecord);
+ }
+ return s;
+}
+
+Status WriteBatchWithIndex::PutLogData(const Slice& blob) {
+ return rep->write_batch.PutLogData(blob);
+}
+
+void WriteBatchWithIndex::Clear() { rep->Clear(); }
+
+Status WriteBatchWithIndex::GetFromBatch(ColumnFamilyHandle* column_family,
+ const DBOptions& options,
+ const Slice& key, std::string* value) {
+ Status s;
+ WriteBatchWithIndexInternal wbwii(&options, column_family);
+ auto result = wbwii.GetFromBatch(this, key, value, &s);
+
+ switch (result) {
+ case WBWIIteratorImpl::kFound:
+ case WBWIIteratorImpl::kError:
+ // use returned status
+ break;
+ case WBWIIteratorImpl::kDeleted:
+ case WBWIIteratorImpl::kNotFound:
+ s = Status::NotFound();
+ break;
+ case WBWIIteratorImpl::kMergeInProgress:
+ s = Status::MergeInProgress();
+ break;
+ default:
+ assert(false);
+ }
+
+ return s;
+}
+
+Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db,
+ const ReadOptions& read_options,
+ const Slice& key,
+ std::string* value) {
+ assert(value != nullptr);
+ PinnableSlice pinnable_val(value);
+ assert(!pinnable_val.IsPinned());
+ auto s = GetFromBatchAndDB(db, read_options, db->DefaultColumnFamily(), key,
+ &pinnable_val);
+ if (s.ok() && pinnable_val.IsPinned()) {
+ value->assign(pinnable_val.data(), pinnable_val.size());
+ } // else value is already assigned
+ return s;
+}
+
+Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db,
+ const ReadOptions& read_options,
+ const Slice& key,
+ PinnableSlice* pinnable_val) {
+ return GetFromBatchAndDB(db, read_options, db->DefaultColumnFamily(), key,
+ pinnable_val);
+}
+
+Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db,
+ const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key,
+ std::string* value) {
+ assert(value != nullptr);
+ PinnableSlice pinnable_val(value);
+ assert(!pinnable_val.IsPinned());
+ auto s =
+ GetFromBatchAndDB(db, read_options, column_family, key, &pinnable_val);
+ if (s.ok() && pinnable_val.IsPinned()) {
+ value->assign(pinnable_val.data(), pinnable_val.size());
+ } // else value is already assigned
+ return s;
+}
+
+Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db,
+ const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key,
+ PinnableSlice* pinnable_val) {
+ return GetFromBatchAndDB(db, read_options, column_family, key, pinnable_val,
+ nullptr);
+}
+
+Status WriteBatchWithIndex::GetFromBatchAndDB(
+ DB* db, const ReadOptions& read_options, ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* pinnable_val, ReadCallback* callback) {
+ const Comparator* const ucmp = rep->comparator.GetComparator(column_family);
+ size_t ts_sz = ucmp ? ucmp->timestamp_size() : 0;
+ if (ts_sz > 0 && !read_options.timestamp) {
+ return Status::InvalidArgument("Must specify timestamp");
+ }
+
+ Status s;
+ WriteBatchWithIndexInternal wbwii(db, column_family);
+
+  // Since the lifetime of the WriteBatch is the same as that of the
+  // transaction, we cannot pin it; otherwise the returned value will not be
+  // available after the transaction finishes.
+ std::string& batch_value = *pinnable_val->GetSelf();
+ auto result = wbwii.GetFromBatch(this, key, &batch_value, &s);
+
+ if (result == WBWIIteratorImpl::kFound) {
+ pinnable_val->PinSelf();
+ return s;
+ } else if (!s.ok() || result == WBWIIteratorImpl::kError) {
+ return s;
+ } else if (result == WBWIIteratorImpl::kDeleted) {
+ return Status::NotFound();
+ }
+ assert(result == WBWIIteratorImpl::kMergeInProgress ||
+ result == WBWIIteratorImpl::kNotFound);
+
+ // Did not find key in batch OR could not resolve Merges. Try DB.
+ if (!callback) {
+ s = db->Get(read_options, column_family, key, pinnable_val);
+ } else {
+ DBImpl::GetImplOptions get_impl_options;
+ get_impl_options.column_family = column_family;
+ get_impl_options.value = pinnable_val;
+ get_impl_options.callback = callback;
+ s = static_cast_with_check<DBImpl>(db->GetRootDB())
+ ->GetImpl(read_options, key, get_impl_options);
+ }
+
+ if (s.ok() || s.IsNotFound()) { // DB Get Succeeded
+ if (result == WBWIIteratorImpl::kMergeInProgress) {
+ // Merge result from DB with merges in Batch
+ std::string merge_result;
+ if (s.ok()) {
+ s = wbwii.MergeKey(key, pinnable_val, &merge_result);
+ } else { // Key not present in db (s.IsNotFound())
+ s = wbwii.MergeKey(key, nullptr, &merge_result);
+ }
+ if (s.ok()) {
+ pinnable_val->Reset();
+ *pinnable_val->GetSelf() = std::move(merge_result);
+ pinnable_val->PinSelf();
+ }
+ }
+ }
+
+ return s;
+}
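+
+// Illustrative usage sketch (assumption; `db` and `cf` are hypothetical
+// caller-owned handles, and `cf` is assumed to be configured with a merge
+// operator): a key is resolved against the batch first and, if necessary,
+// against the DB.
+//
+//   WriteBatchWithIndex batch;
+//   batch.Merge(cf, "counter", "1");
+//   PinnableSlice value;
+//   Status s =
+//       batch.GetFromBatchAndDB(db, ReadOptions(), cf, "counter", &value);
+//   // A kMergeInProgress result in the batch is combined with the DB value
+//   // (if any) using the column family's merge operator.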
+
+void WriteBatchWithIndex::MultiGetFromBatchAndDB(
+ DB* db, const ReadOptions& read_options, ColumnFamilyHandle* column_family,
+ const size_t num_keys, const Slice* keys, PinnableSlice* values,
+ Status* statuses, bool sorted_input) {
+ MultiGetFromBatchAndDB(db, read_options, column_family, num_keys, keys,
+ values, statuses, sorted_input, nullptr);
+}
+
+void WriteBatchWithIndex::MultiGetFromBatchAndDB(
+ DB* db, const ReadOptions& read_options, ColumnFamilyHandle* column_family,
+ const size_t num_keys, const Slice* keys, PinnableSlice* values,
+ Status* statuses, bool sorted_input, ReadCallback* callback) {
+ const Comparator* const ucmp = rep->comparator.GetComparator(column_family);
+ size_t ts_sz = ucmp ? ucmp->timestamp_size() : 0;
+ if (ts_sz > 0 && !read_options.timestamp) {
+ for (size_t i = 0; i < num_keys; ++i) {
+ statuses[i] = Status::InvalidArgument("Must specify timestamp");
+ }
+ return;
+ }
+
+ WriteBatchWithIndexInternal wbwii(db, column_family);
+
+ autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE> key_context;
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE> sorted_keys;
+ // To hold merges from the write batch
+ autovector<std::pair<WBWIIteratorImpl::Result, MergeContext>,
+ MultiGetContext::MAX_BATCH_SIZE>
+ merges;
+  // Since the lifetime of the WriteBatch is the same as that of the
+  // transaction, we cannot pin it; otherwise the returned value will not be
+  // available after the transaction finishes.
+ for (size_t i = 0; i < num_keys; ++i) {
+ MergeContext merge_context;
+ std::string batch_value;
+ Status* s = &statuses[i];
+ PinnableSlice* pinnable_val = &values[i];
+ pinnable_val->Reset();
+ auto result =
+ wbwii.GetFromBatch(this, keys[i], &merge_context, &batch_value, s);
+
+ if (result == WBWIIteratorImpl::kFound) {
+ *pinnable_val->GetSelf() = std::move(batch_value);
+ pinnable_val->PinSelf();
+ continue;
+ }
+ if (result == WBWIIteratorImpl::kDeleted) {
+ *s = Status::NotFound();
+ continue;
+ }
+ if (result == WBWIIteratorImpl::kError) {
+ continue;
+ }
+ assert(result == WBWIIteratorImpl::kMergeInProgress ||
+ result == WBWIIteratorImpl::kNotFound);
+ key_context.emplace_back(column_family, keys[i], &values[i],
+ /*timestamp*/ nullptr, &statuses[i]);
+ merges.emplace_back(result, std::move(merge_context));
+ }
+
+ for (KeyContext& key : key_context) {
+ sorted_keys.emplace_back(&key);
+ }
+
+ // Did not find key in batch OR could not resolve Merges. Try DB.
+ static_cast_with_check<DBImpl>(db->GetRootDB())
+ ->PrepareMultiGetKeys(key_context.size(), sorted_input, &sorted_keys);
+ static_cast_with_check<DBImpl>(db->GetRootDB())
+ ->MultiGetWithCallback(read_options, column_family, callback,
+ &sorted_keys);
+
+ for (auto iter = key_context.begin(); iter != key_context.end(); ++iter) {
+ KeyContext& key = *iter;
+ if (key.s->ok() || key.s->IsNotFound()) { // DB Get Succeeded
+ size_t index = iter - key_context.begin();
+ std::pair<WBWIIteratorImpl::Result, MergeContext>& merge_result =
+ merges[index];
+ if (merge_result.first == WBWIIteratorImpl::kMergeInProgress) {
+ std::string merged_value;
+ // Merge result from DB with merges in Batch
+ if (key.s->ok()) {
+ *key.s = wbwii.MergeKey(*key.key, iter->value, merge_result.second,
+ &merged_value);
+ } else { // Key not present in db (s.IsNotFound())
+ *key.s = wbwii.MergeKey(*key.key, nullptr, merge_result.second,
+ &merged_value);
+ }
+ if (key.s->ok()) {
+ key.value->Reset();
+ *key.value->GetSelf() = std::move(merged_value);
+ key.value->PinSelf();
+ }
+ }
+ }
+ }
+}
+
+void WriteBatchWithIndex::SetSavePoint() { rep->write_batch.SetSavePoint(); }
+
+Status WriteBatchWithIndex::RollbackToSavePoint() {
+ Status s = rep->write_batch.RollbackToSavePoint();
+
+ if (s.ok()) {
+ rep->sub_batch_cnt = 1;
+ rep->last_sub_batch_offset = 0;
+ s = rep->ReBuildIndex();
+ }
+
+ return s;
+}
+
+Status WriteBatchWithIndex::PopSavePoint() {
+ return rep->write_batch.PopSavePoint();
+}
+
+void WriteBatchWithIndex::SetMaxBytes(size_t max_bytes) {
+ rep->write_batch.SetMaxBytes(max_bytes);
+}
+
+size_t WriteBatchWithIndex::GetDataSize() const {
+ return rep->write_batch.GetDataSize();
+}
+
+const Comparator* WriteBatchWithIndexInternal::GetUserComparator(
+ const WriteBatchWithIndex& wbwi, uint32_t cf_id) {
+ const WriteBatchEntryComparator& ucmps = wbwi.rep->comparator;
+ return ucmps.GetComparator(cf_id);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc
new file mode 100644
index 000000000..3c9205bf7
--- /dev/null
+++ b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc
@@ -0,0 +1,735 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/write_batch_with_index/write_batch_with_index_internal.h"
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+BaseDeltaIterator::BaseDeltaIterator(ColumnFamilyHandle* column_family,
+ Iterator* base_iterator,
+ WBWIIteratorImpl* delta_iterator,
+ const Comparator* comparator,
+ const ReadOptions* read_options)
+ : forward_(true),
+ current_at_base_(true),
+ equal_keys_(false),
+ status_(Status::OK()),
+ base_iterator_(base_iterator),
+ delta_iterator_(delta_iterator),
+ comparator_(comparator),
+ iterate_upper_bound_(read_options ? read_options->iterate_upper_bound
+ : nullptr) {
+ assert(comparator_);
+ wbwii_.reset(new WriteBatchWithIndexInternal(column_family));
+}
+
+bool BaseDeltaIterator::Valid() const {
+ return status_.ok() ? (current_at_base_ ? BaseValid() : DeltaValid()) : false;
+}
+
+void BaseDeltaIterator::SeekToFirst() {
+ forward_ = true;
+ base_iterator_->SeekToFirst();
+ delta_iterator_->SeekToFirst();
+ UpdateCurrent();
+}
+
+void BaseDeltaIterator::SeekToLast() {
+ forward_ = false;
+ base_iterator_->SeekToLast();
+ delta_iterator_->SeekToLast();
+ UpdateCurrent();
+}
+
+void BaseDeltaIterator::Seek(const Slice& k) {
+ forward_ = true;
+ base_iterator_->Seek(k);
+ delta_iterator_->Seek(k);
+ UpdateCurrent();
+}
+
+void BaseDeltaIterator::SeekForPrev(const Slice& k) {
+ forward_ = false;
+ base_iterator_->SeekForPrev(k);
+ delta_iterator_->SeekForPrev(k);
+ UpdateCurrent();
+}
+
+void BaseDeltaIterator::Next() {
+ if (!Valid()) {
+ status_ = Status::NotSupported("Next() on invalid iterator");
+ return;
+ }
+
+ if (!forward_) {
+    // Need to change direction.
+    // If our direction was backward and the keys are not equal, there are two
+    // states:
+    // * both iterators are valid: we're already in a good state (current
+    //   points to the smaller key)
+    // * only one iterator is valid: we need to advance that iterator
+ forward_ = true;
+ equal_keys_ = false;
+ if (!BaseValid()) {
+ assert(DeltaValid());
+ base_iterator_->SeekToFirst();
+ } else if (!DeltaValid()) {
+ delta_iterator_->SeekToFirst();
+ } else if (current_at_base_) {
+ // Change delta from larger than base to smaller
+ AdvanceDelta();
+ } else {
+ // Change base from larger than delta to smaller
+ AdvanceBase();
+ }
+ if (DeltaValid() && BaseValid()) {
+ if (0 == comparator_->CompareWithoutTimestamp(
+ delta_iterator_->Entry().key, /*a_has_ts=*/false,
+ base_iterator_->key(), /*b_has_ts=*/false)) {
+ equal_keys_ = true;
+ }
+ }
+ }
+ Advance();
+}
+
+void BaseDeltaIterator::Prev() {
+ if (!Valid()) {
+ status_ = Status::NotSupported("Prev() on invalid iterator");
+ return;
+ }
+
+ if (forward_) {
+    // Need to change direction.
+    // If our direction was forward and the keys are not equal, there are two
+    // states:
+    // * both iterators are valid: we're already in a good state (current
+    //   points to the larger key)
+    // * only one iterator is valid: we need to advance that iterator
+ forward_ = false;
+ equal_keys_ = false;
+ if (!BaseValid()) {
+ assert(DeltaValid());
+ base_iterator_->SeekToLast();
+ } else if (!DeltaValid()) {
+ delta_iterator_->SeekToLast();
+ } else if (current_at_base_) {
+ // Change delta from less advanced than base to more advanced
+ AdvanceDelta();
+ } else {
+ // Change base from less advanced than delta to more advanced
+ AdvanceBase();
+ }
+ if (DeltaValid() && BaseValid()) {
+ if (0 == comparator_->CompareWithoutTimestamp(
+ delta_iterator_->Entry().key, /*a_has_ts=*/false,
+ base_iterator_->key(), /*b_has_ts=*/false)) {
+ equal_keys_ = true;
+ }
+ }
+ }
+
+ Advance();
+}
+
+Slice BaseDeltaIterator::key() const {
+ return current_at_base_ ? base_iterator_->key()
+ : delta_iterator_->Entry().key;
+}
+
+Slice BaseDeltaIterator::value() const {
+ if (current_at_base_) {
+ return base_iterator_->value();
+ } else {
+ WriteEntry delta_entry = delta_iterator_->Entry();
+ if (wbwii_->GetNumOperands() == 0) {
+ return delta_entry.value;
+ } else if (delta_entry.type == kDeleteRecord ||
+ delta_entry.type == kSingleDeleteRecord) {
+ status_ =
+ wbwii_->MergeKey(delta_entry.key, nullptr, merge_result_.GetSelf());
+ } else if (delta_entry.type == kPutRecord) {
+ status_ = wbwii_->MergeKey(delta_entry.key, &delta_entry.value,
+ merge_result_.GetSelf());
+ } else if (delta_entry.type == kMergeRecord) {
+ if (equal_keys_) {
+ Slice base_value = base_iterator_->value();
+ status_ = wbwii_->MergeKey(delta_entry.key, &base_value,
+ merge_result_.GetSelf());
+ } else {
+ status_ =
+ wbwii_->MergeKey(delta_entry.key, nullptr, merge_result_.GetSelf());
+ }
+ }
+ merge_result_.PinSelf();
+ return merge_result_;
+ }
+}
+
+Status BaseDeltaIterator::status() const {
+ if (!status_.ok()) {
+ return status_;
+ }
+ if (!base_iterator_->status().ok()) {
+ return base_iterator_->status();
+ }
+ return delta_iterator_->status();
+}
+
+void BaseDeltaIterator::Invalidate(Status s) { status_ = s; }
+
+void BaseDeltaIterator::AssertInvariants() {
+#ifndef NDEBUG
+ bool not_ok = false;
+ if (!base_iterator_->status().ok()) {
+ assert(!base_iterator_->Valid());
+ not_ok = true;
+ }
+ if (!delta_iterator_->status().ok()) {
+ assert(!delta_iterator_->Valid());
+ not_ok = true;
+ }
+ if (not_ok) {
+ assert(!Valid());
+ assert(!status().ok());
+ return;
+ }
+
+ if (!Valid()) {
+ return;
+ }
+ if (!BaseValid()) {
+ assert(!current_at_base_ && delta_iterator_->Valid());
+ return;
+ }
+ if (!DeltaValid()) {
+ assert(current_at_base_ && base_iterator_->Valid());
+ return;
+ }
+ // we don't support those yet
+ assert(delta_iterator_->Entry().type != kMergeRecord &&
+ delta_iterator_->Entry().type != kLogDataRecord);
+ int compare = comparator_->CompareWithoutTimestamp(
+ delta_iterator_->Entry().key, /*a_has_ts=*/false, base_iterator_->key(),
+ /*b_has_ts=*/false);
+  if (forward_) {
+    // current_at_base -> compare > 0
+    assert(!current_at_base_ || compare > 0);
+    // !current_at_base -> compare <= 0
+    assert(current_at_base_ || compare <= 0);
+  } else {
+    // current_at_base -> compare < 0
+    assert(!current_at_base_ || compare < 0);
+    // !current_at_base -> compare >= 0
+    assert(current_at_base_ || compare >= 0);
+  }
+ // equal_keys_ <=> compare == 0
+ assert((equal_keys_ || compare != 0) && (!equal_keys_ || compare == 0));
+#endif
+}
+
+void BaseDeltaIterator::Advance() {
+ if (equal_keys_) {
+ assert(BaseValid() && DeltaValid());
+ AdvanceBase();
+ AdvanceDelta();
+ } else {
+ if (current_at_base_) {
+ assert(BaseValid());
+ AdvanceBase();
+ } else {
+ assert(DeltaValid());
+ AdvanceDelta();
+ }
+ }
+ UpdateCurrent();
+}
+
+void BaseDeltaIterator::AdvanceDelta() {
+ if (forward_) {
+ delta_iterator_->NextKey();
+ } else {
+ delta_iterator_->PrevKey();
+ }
+}
+void BaseDeltaIterator::AdvanceBase() {
+ if (forward_) {
+ base_iterator_->Next();
+ } else {
+ base_iterator_->Prev();
+ }
+}
+
+bool BaseDeltaIterator::BaseValid() const { return base_iterator_->Valid(); }
+bool BaseDeltaIterator::DeltaValid() const { return delta_iterator_->Valid(); }
+void BaseDeltaIterator::UpdateCurrent() {
+// Suppress false positive clang analyzer warnings.
+#ifndef __clang_analyzer__
+ status_ = Status::OK();
+ while (true) {
+ auto delta_result = WBWIIteratorImpl::kNotFound;
+ WriteEntry delta_entry;
+ if (DeltaValid()) {
+ assert(delta_iterator_->status().ok());
+ delta_result =
+ delta_iterator_->FindLatestUpdate(wbwii_->GetMergeContext());
+ delta_entry = delta_iterator_->Entry();
+ } else if (!delta_iterator_->status().ok()) {
+ // Expose the error status and stop.
+ current_at_base_ = false;
+ return;
+ }
+ equal_keys_ = false;
+ if (!BaseValid()) {
+ if (!base_iterator_->status().ok()) {
+ // Expose the error status and stop.
+ current_at_base_ = true;
+ return;
+ }
+
+ // Base has finished.
+ if (!DeltaValid()) {
+ // Finished
+ return;
+ }
+ if (iterate_upper_bound_) {
+ if (comparator_->CompareWithoutTimestamp(
+ delta_entry.key, /*a_has_ts=*/false, *iterate_upper_bound_,
+ /*b_has_ts=*/false) >= 0) {
+ // out of upper bound -> finished.
+ return;
+ }
+ }
+ if (delta_result == WBWIIteratorImpl::kDeleted &&
+ wbwii_->GetNumOperands() == 0) {
+ AdvanceDelta();
+ } else {
+ current_at_base_ = false;
+ return;
+ }
+ } else if (!DeltaValid()) {
+ // Delta has finished.
+ current_at_base_ = true;
+ return;
+ } else {
+ int compare =
+ (forward_ ? 1 : -1) * comparator_->CompareWithoutTimestamp(
+ delta_entry.key, /*a_has_ts=*/false,
+ base_iterator_->key(), /*b_has_ts=*/false);
+      if (compare <= 0) {  // delta is at or before base in iteration order
+ if (compare == 0) {
+ equal_keys_ = true;
+ }
+ if (delta_result != WBWIIteratorImpl::kDeleted ||
+ wbwii_->GetNumOperands() > 0) {
+ current_at_base_ = false;
+ return;
+ }
+        // Delta is less advanced and is a delete.
+ AdvanceDelta();
+ if (equal_keys_) {
+ AdvanceBase();
+ }
+ } else {
+ current_at_base_ = true;
+ return;
+ }
+ }
+ }
+
+ AssertInvariants();
+#endif // __clang_analyzer__
+}
+
+void WBWIIteratorImpl::AdvanceKey(bool forward) {
+ if (Valid()) {
+ Slice key = Entry().key;
+ do {
+ if (forward) {
+ Next();
+ } else {
+ Prev();
+ }
+ } while (MatchesKey(column_family_id_, key));
+ }
+}
+
+void WBWIIteratorImpl::NextKey() { AdvanceKey(true); }
+
+void WBWIIteratorImpl::PrevKey() {
+ AdvanceKey(false); // Move to the tail of the previous key
+ if (Valid()) {
+ AdvanceKey(false); // Move back another key. Now we are at the start of
+ // the previous key
+    if (Valid()) {      // Still valid
+ Next(); // Move forward one onto this key
+ } else {
+ SeekToFirst(); // Not valid, move to the start
+ }
+ }
+}
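+
+// Illustrative example (assumption): with index entries k1@offset0, k1@offset5
+// and k2@offset9 in the same column family, NextKey() from either k1 entry
+// lands on k2@offset9, and PrevKey() from k2@offset9 lands on k1@offset0, the
+// first entry of the previous key.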
+
+WBWIIteratorImpl::Result WBWIIteratorImpl::FindLatestUpdate(
+ MergeContext* merge_context) {
+ if (Valid()) {
+ Slice key = Entry().key;
+ return FindLatestUpdate(key, merge_context);
+ } else {
+ merge_context->Clear(); // Clear any entries in the MergeContext
+ return WBWIIteratorImpl::kNotFound;
+ }
+}
+
+WBWIIteratorImpl::Result WBWIIteratorImpl::FindLatestUpdate(
+ const Slice& key, MergeContext* merge_context) {
+ Result result = WBWIIteratorImpl::kNotFound;
+ merge_context->Clear(); // Clear any entries in the MergeContext
+ // TODO(agiardullo): consider adding support for reverse iteration
+ if (!Valid()) {
+ return result;
+ } else if (comparator_->CompareKey(column_family_id_, Entry().key, key) !=
+ 0) {
+ return result;
+ } else {
+ // We want to iterate in the reverse order that the writes were added to the
+ // batch. Since we don't have a reverse iterator, we must seek past the
+ // end. We do this by seeking to the next key, and then back one step
+ NextKey();
+ if (Valid()) {
+ Prev();
+ } else {
+ SeekToLast();
+ }
+
+ // We are at the end of the iterator for this key. Search backwards for the
+ // last Put or Delete, accumulating merges along the way.
+ while (Valid()) {
+ const WriteEntry entry = Entry();
+ if (comparator_->CompareKey(column_family_id_, entry.key, key) != 0) {
+        break;  // Unexpected error or we've reached a different key
+ }
+
+ switch (entry.type) {
+ case kPutRecord:
+ return WBWIIteratorImpl::kFound;
+ case kDeleteRecord:
+ return WBWIIteratorImpl::kDeleted;
+ case kSingleDeleteRecord:
+ return WBWIIteratorImpl::kDeleted;
+ case kMergeRecord:
+ result = WBWIIteratorImpl::kMergeInProgress;
+ merge_context->PushOperand(entry.value);
+ break;
+ case kLogDataRecord:
+ break; // ignore
+ case kXIDRecord:
+ break; // ignore
+ default:
+ return WBWIIteratorImpl::kError;
+ } // end switch statement
+ Prev();
+ } // End while Valid()
+ // At this point, we have been through the whole list and found no Puts or
+ // Deletes. The iterator points to the previous key. Move the iterator back
+ // onto this one.
+ if (Valid()) {
+ Next();
+ } else {
+ SeekToFirst();
+ }
+ }
+ return result;
+}
+
+Status ReadableWriteBatch::GetEntryFromDataOffset(size_t data_offset,
+ WriteType* type, Slice* Key,
+ Slice* value, Slice* blob,
+ Slice* xid) const {
+ if (type == nullptr || Key == nullptr || value == nullptr ||
+ blob == nullptr || xid == nullptr) {
+ return Status::InvalidArgument("Output parameters cannot be null");
+ }
+
+ if (data_offset == GetDataSize()) {
+ // reached end of batch.
+ return Status::NotFound();
+ }
+
+ if (data_offset > GetDataSize()) {
+    return Status::InvalidArgument("data offset exceeds write batch size");
+ }
+ Slice input = Slice(rep_.data() + data_offset, rep_.size() - data_offset);
+ char tag;
+ uint32_t column_family;
+ Status s = ReadRecordFromWriteBatch(&input, &tag, &column_family, Key, value,
+ blob, xid);
+ if (!s.ok()) {
+ return s;
+ }
+
+ switch (tag) {
+ case kTypeColumnFamilyValue:
+ case kTypeValue:
+ *type = kPutRecord;
+ break;
+ case kTypeColumnFamilyDeletion:
+ case kTypeDeletion:
+ *type = kDeleteRecord;
+ break;
+ case kTypeColumnFamilySingleDeletion:
+ case kTypeSingleDeletion:
+ *type = kSingleDeleteRecord;
+ break;
+ case kTypeColumnFamilyRangeDeletion:
+ case kTypeRangeDeletion:
+ *type = kDeleteRangeRecord;
+ break;
+ case kTypeColumnFamilyMerge:
+ case kTypeMerge:
+ *type = kMergeRecord;
+ break;
+ case kTypeLogData:
+ *type = kLogDataRecord;
+ break;
+ case kTypeNoop:
+ case kTypeBeginPrepareXID:
+ case kTypeBeginPersistedPrepareXID:
+ case kTypeBeginUnprepareXID:
+ case kTypeEndPrepareXID:
+ case kTypeCommitXID:
+ case kTypeRollbackXID:
+ *type = kXIDRecord;
+ break;
+ default:
+ return Status::Corruption("unknown WriteBatch tag ",
+ std::to_string(static_cast<unsigned int>(tag)));
+ }
+ return Status::OK();
+}
+
+// If both `entry1` and `entry2` point to real entries in the write batch, we
+// compare the entries as follows:
+// 1. first compare the column family; the entry with the larger CF is larger;
+// 2. within the same CF, decode each entry to find its key; the entry with the
+//    larger key is larger;
+// 3. if two entries have the same CF and key, the one with the larger offset
+//    is larger.
+// Sometimes either `entry1` or `entry2` is a dummy entry, which is actually
+// a search key. In that case, in step 2 we don't decode the entry but use the
+// value in WriteBatchIndexEntry::search_key instead.
+// One special case is when WriteBatchIndexEntry::key_size is kFlagMinInCf.
+// This indicates that we are going to seek to the first entry of the column
+// family. Once we see this, this entry is smaller than all the real entries of
+// the column family.
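+// For example (illustrative, assuming the default bytewise comparator):
+//   {cf=1, key="a", offset=10} < {cf=2, key="a", offset=5}   (rule 1)
+//   {cf=1, key="a", offset=10} < {cf=1, key="b", offset=5}   (rule 2)
+//   {cf=1, key="a", offset=5}  < {cf=1, key="a", offset=10}  (rule 3)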
+int WriteBatchEntryComparator::operator()(
+ const WriteBatchIndexEntry* entry1,
+ const WriteBatchIndexEntry* entry2) const {
+ if (entry1->column_family > entry2->column_family) {
+ return 1;
+ } else if (entry1->column_family < entry2->column_family) {
+ return -1;
+ }
+
+ // Deal with special case of seeking to the beginning of a column family
+ if (entry1->is_min_in_cf()) {
+ return -1;
+ } else if (entry2->is_min_in_cf()) {
+ return 1;
+ }
+
+ Slice key1, key2;
+ if (entry1->search_key == nullptr) {
+ key1 = Slice(write_batch_->Data().data() + entry1->key_offset,
+ entry1->key_size);
+ } else {
+ key1 = *(entry1->search_key);
+ }
+ if (entry2->search_key == nullptr) {
+ key2 = Slice(write_batch_->Data().data() + entry2->key_offset,
+ entry2->key_size);
+ } else {
+ key2 = *(entry2->search_key);
+ }
+
+ int cmp = CompareKey(entry1->column_family, key1, key2);
+ if (cmp != 0) {
+ return cmp;
+ } else if (entry1->offset > entry2->offset) {
+ return 1;
+ } else if (entry1->offset < entry2->offset) {
+ return -1;
+ }
+ return 0;
+}
+
+int WriteBatchEntryComparator::CompareKey(uint32_t column_family,
+ const Slice& key1,
+ const Slice& key2) const {
+ if (column_family < cf_comparators_.size() &&
+ cf_comparators_[column_family] != nullptr) {
+ return cf_comparators_[column_family]->CompareWithoutTimestamp(
+ key1, /*a_has_ts=*/false, key2, /*b_has_ts=*/false);
+ } else {
+ return default_comparator_->CompareWithoutTimestamp(
+ key1, /*a_has_ts=*/false, key2, /*b_has_ts=*/false);
+ }
+}
+
+const Comparator* WriteBatchEntryComparator::GetComparator(
+ const ColumnFamilyHandle* column_family) const {
+ return column_family ? column_family->GetComparator() : default_comparator_;
+}
+
+const Comparator* WriteBatchEntryComparator::GetComparator(
+ uint32_t column_family) const {
+ if (column_family < cf_comparators_.size() &&
+ cf_comparators_[column_family]) {
+ return cf_comparators_[column_family];
+ }
+ return default_comparator_;
+}
+
+WriteEntry WBWIIteratorImpl::Entry() const {
+ WriteEntry ret;
+ Slice blob, xid;
+ const WriteBatchIndexEntry* iter_entry = skip_list_iter_.key();
+ // this is guaranteed with Valid()
+ assert(iter_entry != nullptr &&
+ iter_entry->column_family == column_family_id_);
+ auto s = write_batch_->GetEntryFromDataOffset(
+ iter_entry->offset, &ret.type, &ret.key, &ret.value, &blob, &xid);
+ assert(s.ok());
+ assert(ret.type == kPutRecord || ret.type == kDeleteRecord ||
+ ret.type == kSingleDeleteRecord || ret.type == kDeleteRangeRecord ||
+ ret.type == kMergeRecord);
+ // Make sure entry.key does not include user-defined timestamp.
+ const Comparator* const ucmp = comparator_->GetComparator(column_family_id_);
+ size_t ts_sz = ucmp->timestamp_size();
+ if (ts_sz > 0) {
+ ret.key = StripTimestampFromUserKey(ret.key, ts_sz);
+ }
+ return ret;
+}
+
+bool WBWIIteratorImpl::MatchesKey(uint32_t cf_id, const Slice& key) {
+ if (Valid()) {
+ return comparator_->CompareKey(cf_id, key, Entry().key) == 0;
+ } else {
+ return false;
+ }
+}
+
+WriteBatchWithIndexInternal::WriteBatchWithIndexInternal(
+ ColumnFamilyHandle* column_family)
+ : db_(nullptr), db_options_(nullptr), column_family_(column_family) {}
+
+WriteBatchWithIndexInternal::WriteBatchWithIndexInternal(
+ DB* db, ColumnFamilyHandle* column_family)
+ : db_(db), db_options_(nullptr), column_family_(column_family) {
+ if (db_ != nullptr && column_family_ == nullptr) {
+ column_family_ = db_->DefaultColumnFamily();
+ }
+}
+
+WriteBatchWithIndexInternal::WriteBatchWithIndexInternal(
+ const DBOptions* db_options, ColumnFamilyHandle* column_family)
+ : db_(nullptr), db_options_(db_options), column_family_(column_family) {}
+
+Status WriteBatchWithIndexInternal::MergeKey(const Slice& key,
+ const Slice* value,
+ const MergeContext& context,
+ std::string* result) const {
+ if (column_family_ != nullptr) {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family_);
+ const auto merge_operator = cfh->cfd()->ioptions()->merge_operator.get();
+ if (merge_operator == nullptr) {
+ return Status::InvalidArgument(
+ "Merge_operator must be set for column_family");
+ } else if (db_ != nullptr) {
+ const ImmutableDBOptions& immutable_db_options =
+ static_cast_with_check<DBImpl>(db_->GetRootDB())
+ ->immutable_db_options();
+ Statistics* statistics = immutable_db_options.statistics.get();
+ Logger* logger = immutable_db_options.info_log.get();
+ SystemClock* clock = immutable_db_options.clock;
+ return MergeHelper::TimedFullMerge(
+ merge_operator, key, value, context.GetOperands(), result, logger,
+ statistics, clock, /* result_operand */ nullptr,
+ /* update_num_ops_stats */ false);
+ } else if (db_options_ != nullptr) {
+ Statistics* statistics = db_options_->statistics.get();
+ Env* env = db_options_->env;
+ Logger* logger = db_options_->info_log.get();
+ SystemClock* clock = env->GetSystemClock().get();
+ return MergeHelper::TimedFullMerge(
+ merge_operator, key, value, context.GetOperands(), result, logger,
+ statistics, clock, /* result_operand */ nullptr,
+ /* update_num_ops_stats */ false);
+ } else {
+ const auto cf_opts = cfh->cfd()->ioptions();
+ return MergeHelper::TimedFullMerge(
+ merge_operator, key, value, context.GetOperands(), result,
+ cf_opts->logger, cf_opts->stats, cf_opts->clock,
+ /* result_operand */ nullptr, /* update_num_ops_stats */ false);
+ }
+ } else {
+ return Status::InvalidArgument("Must provide a column_family");
+ }
+}
+
+WBWIIteratorImpl::Result WriteBatchWithIndexInternal::GetFromBatch(
+ WriteBatchWithIndex* batch, const Slice& key, MergeContext* context,
+ std::string* value, Status* s) {
+ *s = Status::OK();
+
+ std::unique_ptr<WBWIIteratorImpl> iter(
+ static_cast_with_check<WBWIIteratorImpl>(
+ batch->NewIterator(column_family_)));
+
+  // Seek the iterator to this key and find any updates/merges to it.
+ iter->Seek(key);
+ auto result = iter->FindLatestUpdate(key, context);
+ if (result == WBWIIteratorImpl::kError) {
+ (*s) = Status::Corruption("Unexpected entry in WriteBatchWithIndex:",
+ std::to_string(iter->Entry().type));
+ return result;
+ } else if (result == WBWIIteratorImpl::kNotFound) {
+ return result;
+ } else if (result == WBWIIteratorImpl::Result::kFound) { // PUT
+ Slice entry_value = iter->Entry().value;
+ if (context->GetNumOperands() > 0) {
+ *s = MergeKey(key, &entry_value, *context, value);
+ if (!s->ok()) {
+ result = WBWIIteratorImpl::Result::kError;
+ }
+ } else {
+ value->assign(entry_value.data(), entry_value.size());
+ }
+ } else if (result == WBWIIteratorImpl::kDeleted) {
+ if (context->GetNumOperands() > 0) {
+ *s = MergeKey(key, nullptr, *context, value);
+ if (s->ok()) {
+ result = WBWIIteratorImpl::Result::kFound;
+ } else {
+ result = WBWIIteratorImpl::Result::kError;
+ }
+ }
+ }
+ return result;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h
new file mode 100644
index 000000000..edabc95bc
--- /dev/null
+++ b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h
@@ -0,0 +1,344 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "db/merge_context.h"
+#include "memtable/skiplist.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MergeContext;
+class WBWIIteratorImpl;
+class WriteBatchWithIndexInternal;
+struct Options;
+
+// when direction == forward
+// * current_at_base_ <=> base_iterator > delta_iterator
+// when direction == backwards
+// * current_at_base_ <=> base_iterator < delta_iterator
+// always:
+// * equal_keys_ <=> base_iterator == delta_iterator
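+//
+// For example (illustrative): iterating forward with base keys {a, c} and
+// delta keys {b, c}, the entry for "a" comes from the base iterator, the entry
+// for "b" comes from the delta iterator, and at "c" the keys are equal and the
+// delta entry decides the visible value.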
+class BaseDeltaIterator : public Iterator {
+ public:
+ BaseDeltaIterator(ColumnFamilyHandle* column_family, Iterator* base_iterator,
+ WBWIIteratorImpl* delta_iterator,
+ const Comparator* comparator,
+ const ReadOptions* read_options = nullptr);
+
+ ~BaseDeltaIterator() override {}
+
+ bool Valid() const override;
+ void SeekToFirst() override;
+ void SeekToLast() override;
+ void Seek(const Slice& k) override;
+ void SeekForPrev(const Slice& k) override;
+ void Next() override;
+ void Prev() override;
+ Slice key() const override;
+ Slice value() const override;
+ Status status() const override;
+ void Invalidate(Status s);
+
+ private:
+ void AssertInvariants();
+ void Advance();
+ void AdvanceDelta();
+ void AdvanceBase();
+ bool BaseValid() const;
+ bool DeltaValid() const;
+ void UpdateCurrent();
+
+ std::unique_ptr<WriteBatchWithIndexInternal> wbwii_;
+ bool forward_;
+ bool current_at_base_;
+ bool equal_keys_;
+ mutable Status status_;
+ std::unique_ptr<Iterator> base_iterator_;
+ std::unique_ptr<WBWIIteratorImpl> delta_iterator_;
+ const Comparator* comparator_; // not owned
+ const Slice* iterate_upper_bound_;
+ mutable PinnableSlice merge_result_;
+};
+
+// Key used by the skip list that serves as the binary-searchable index of
+// WriteBatchWithIndex.
+struct WriteBatchIndexEntry {
+ WriteBatchIndexEntry(size_t o, uint32_t c, size_t ko, size_t ksz)
+ : offset(o),
+ column_family(c),
+ key_offset(ko),
+ key_size(ksz),
+ search_key(nullptr) {}
+  // Create a dummy entry to be used as a search key. This index entry is not
+  // backed by an entry in the write batch; instead it carries a pointer to the
+  // search key. Alternatively, a special key_size flag can indicate that we
+  // are seeking to the first entry of the column family.
+  // @_search_key: the search key
+  // @_column_family: column family
+  // @is_forward_direction: true for Seek(), false for SeekForPrev()
+  // @is_seek_to_first: true if we seek to the beginning of the column family;
+  //                    _search_key should be null in this case.
+ WriteBatchIndexEntry(const Slice* _search_key, uint32_t _column_family,
+ bool is_forward_direction, bool is_seek_to_first)
+      // For SeekForPrev(), we need to make the dummy entry larger than any
+      // entry that has the same search key. Otherwise, we would miss those
+      // entries.
+ : offset(is_forward_direction ? 0 : std::numeric_limits<size_t>::max()),
+ column_family(_column_family),
+ key_offset(0),
+ key_size(is_seek_to_first ? kFlagMinInCf : 0),
+ search_key(_search_key) {
+ assert(_search_key != nullptr || is_seek_to_first);
+ }
+
+  // If this flag appears in key_size, it indicates a dummy key that is smaller
+  // than any other entry for the same column family.
+ static const size_t kFlagMinInCf = std::numeric_limits<size_t>::max();
+
+ bool is_min_in_cf() const {
+ assert(key_size != kFlagMinInCf ||
+ (key_offset == 0 && search_key == nullptr));
+ return key_size == kFlagMinInCf;
+ }
+
+  // Offset of an entry in the write batch's string buffer. If this is a dummy
+  // lookup key (search_key != nullptr), offset is set to either 0 or max, for
+  // comparison purposes only. Because entries with the same key are ordered by
+  // offset, offset = 0 makes a seek key smaller than or equal to all entries
+  // with that key, so Seek() will find all entries of the same key. Similarly,
+  // offset = max makes the dummy entry just larger than all entries with the
+  // search key, so SeekForPrev() will see all entries with the same key.
+ size_t offset;
+ uint32_t column_family; // column family of the entry.
+ size_t key_offset; // offset of the key in write batch's string buffer.
+  size_t key_size;        // size of the key. kFlagMinInCf indicates
+                          // that this is a dummy lookup entry used by
+                          // SeekToFirst() to position at the beginning of the
+                          // column family. We use the flag here to save a
+                          // boolean in the struct.
+
+ const Slice* search_key; // if not null, instead of reading keys from
+ // write batch, use it to compare. This is used
+ // for lookup key.
+};
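+
+// Illustrative sketch (assumption): how the iterator code builds dummy lookup
+// entries for the two seek directions.
+//
+//   Slice target("foo");
+//   // Seek(): offset = 0 sorts before every real entry with key "foo".
+//   WriteBatchIndexEntry forward_lookup(&target, /*_column_family=*/1,
+//                                       /*is_forward_direction=*/true,
+//                                       /*is_seek_to_first=*/false);
+//   // SeekForPrev(): offset = max sorts after every real entry with key "foo".
+//   WriteBatchIndexEntry backward_lookup(&target, /*_column_family=*/1,
+//                                        /*is_forward_direction=*/false,
+//                                        /*is_seek_to_first=*/false);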
+
+class ReadableWriteBatch : public WriteBatch {
+ public:
+ explicit ReadableWriteBatch(size_t reserved_bytes = 0, size_t max_bytes = 0,
+ size_t protection_bytes_per_key = 0,
+ size_t default_cf_ts_sz = 0)
+ : WriteBatch(reserved_bytes, max_bytes, protection_bytes_per_key,
+ default_cf_ts_sz) {}
+ // Retrieve some information from a write entry in the write batch, given
+ // the start offset of the write entry.
+ Status GetEntryFromDataOffset(size_t data_offset, WriteType* type, Slice* Key,
+ Slice* value, Slice* blob, Slice* xid) const;
+};
+
+class WriteBatchEntryComparator {
+ public:
+ WriteBatchEntryComparator(const Comparator* _default_comparator,
+ const ReadableWriteBatch* write_batch)
+ : default_comparator_(_default_comparator), write_batch_(write_batch) {}
+  // Compare entry1 and entry2. Return a negative value if entry1 is less than
+  // entry2, 0 if they are equal, and a positive value if entry1 is greater
+  // than entry2.
+ int operator()(const WriteBatchIndexEntry* entry1,
+ const WriteBatchIndexEntry* entry2) const;
+
+ int CompareKey(uint32_t column_family, const Slice& key1,
+ const Slice& key2) const;
+
+ void SetComparatorForCF(uint32_t column_family_id,
+ const Comparator* comparator) {
+ if (column_family_id >= cf_comparators_.size()) {
+ cf_comparators_.resize(column_family_id + 1, nullptr);
+ }
+ cf_comparators_[column_family_id] = comparator;
+ }
+
+ const Comparator* default_comparator() { return default_comparator_; }
+
+ const Comparator* GetComparator(
+ const ColumnFamilyHandle* column_family) const;
+
+ const Comparator* GetComparator(uint32_t column_family) const;
+
+ private:
+ const Comparator* const default_comparator_;
+ std::vector<const Comparator*> cf_comparators_;
+ const ReadableWriteBatch* const write_batch_;
+};
+
+using WriteBatchEntrySkipList =
+ SkipList<WriteBatchIndexEntry*, const WriteBatchEntryComparator&>;
+
+class WBWIIteratorImpl : public WBWIIterator {
+ public:
+ enum Result : uint8_t {
+ kFound,
+ kDeleted,
+ kNotFound,
+ kMergeInProgress,
+ kError
+ };
+ WBWIIteratorImpl(uint32_t column_family_id,
+ WriteBatchEntrySkipList* skip_list,
+ const ReadableWriteBatch* write_batch,
+ WriteBatchEntryComparator* comparator)
+ : column_family_id_(column_family_id),
+ skip_list_iter_(skip_list),
+ write_batch_(write_batch),
+ comparator_(comparator) {}
+
+ ~WBWIIteratorImpl() override {}
+
+ bool Valid() const override {
+ if (!skip_list_iter_.Valid()) {
+ return false;
+ }
+ const WriteBatchIndexEntry* iter_entry = skip_list_iter_.key();
+ return (iter_entry != nullptr &&
+ iter_entry->column_family == column_family_id_);
+ }
+
+ void SeekToFirst() override {
+ WriteBatchIndexEntry search_entry(
+ nullptr /* search_key */, column_family_id_,
+ true /* is_forward_direction */, true /* is_seek_to_first */);
+ skip_list_iter_.Seek(&search_entry);
+ }
+
+ void SeekToLast() override {
+ WriteBatchIndexEntry search_entry(
+ nullptr /* search_key */, column_family_id_ + 1,
+ true /* is_forward_direction */, true /* is_seek_to_first */);
+ skip_list_iter_.Seek(&search_entry);
+ if (!skip_list_iter_.Valid()) {
+ skip_list_iter_.SeekToLast();
+ } else {
+ skip_list_iter_.Prev();
+ }
+ }
+
+ void Seek(const Slice& key) override {
+ WriteBatchIndexEntry search_entry(&key, column_family_id_,
+ true /* is_forward_direction */,
+ false /* is_seek_to_first */);
+ skip_list_iter_.Seek(&search_entry);
+ }
+
+ void SeekForPrev(const Slice& key) override {
+ WriteBatchIndexEntry search_entry(&key, column_family_id_,
+ false /* is_forward_direction */,
+ false /* is_seek_to_first */);
+ skip_list_iter_.SeekForPrev(&search_entry);
+ }
+
+ void Next() override { skip_list_iter_.Next(); }
+
+ void Prev() override { skip_list_iter_.Prev(); }
+
+ WriteEntry Entry() const override;
+
+ Status status() const override {
+    // This is an in-memory data structure, so the only way status can be
+    // non-OK is through memory corruption.
+ }
+
+ const WriteBatchIndexEntry* GetRawEntry() const {
+ return skip_list_iter_.key();
+ }
+
+ bool MatchesKey(uint32_t cf_id, const Slice& key);
+
+ // Moves the iterator to first entry of the previous key.
+ void PrevKey();
+ // Moves the iterator to first entry of the next key.
+ void NextKey();
+
+  // Moves the iterator to the Update (Put or Delete) for the current key.
+  // If there is no Put/Delete, the iterator will point to the first entry for
+  // this key.
+  // @return kFound if a Put was found for the key
+  // @return kDeleted if a Delete or SingleDelete was found for the key
+  // @return kMergeInProgress if only merges were found for the key
+  // @return kError if an unsupported operation was found for the key
+  // @return kNotFound if no operations were found for this key
+ //
+ Result FindLatestUpdate(const Slice& key, MergeContext* merge_context);
+ Result FindLatestUpdate(MergeContext* merge_context);
+
+ protected:
+ void AdvanceKey(bool forward);
+
+ private:
+ uint32_t column_family_id_;
+ WriteBatchEntrySkipList::Iterator skip_list_iter_;
+ const ReadableWriteBatch* write_batch_;
+ WriteBatchEntryComparator* comparator_;
+};
+
+class WriteBatchWithIndexInternal {
+ public:
+ static const Comparator* GetUserComparator(const WriteBatchWithIndex& wbwi,
+ uint32_t cf_id);
+
+ // For GetFromBatchAndDB or similar
+ explicit WriteBatchWithIndexInternal(DB* db,
+ ColumnFamilyHandle* column_family);
+ // For GetFromBatchAndDB or similar
+ explicit WriteBatchWithIndexInternal(ColumnFamilyHandle* column_family);
+ // For GetFromBatch or similar
+ explicit WriteBatchWithIndexInternal(const DBOptions* db_options,
+ ColumnFamilyHandle* column_family);
+
+  // If batch contains a value for key, store it in *value and return kFound.
+  // If batch contains a deletion for key, return kDeleted.
+  // If batch contains Merge operations as the most recent entry for a key,
+  // and the merge process does not stop (does not reach a value or delete),
+  // prepend the current merge operands to the merge context and return
+  // kMergeInProgress.
+  // If batch does not contain this key, return kNotFound.
+  // Else, return kError on error, with the error Status stored in *s.
+ WBWIIteratorImpl::Result GetFromBatch(WriteBatchWithIndex* batch,
+ const Slice& key, std::string* value,
+ Status* s) {
+ return GetFromBatch(batch, key, &merge_context_, value, s);
+ }
+ WBWIIteratorImpl::Result GetFromBatch(WriteBatchWithIndex* batch,
+ const Slice& key,
+ MergeContext* merge_context,
+ std::string* value, Status* s);
+ Status MergeKey(const Slice& key, const Slice* value,
+ std::string* result) const {
+ return MergeKey(key, value, merge_context_, result);
+ }
+ Status MergeKey(const Slice& key, const Slice* value,
+ const MergeContext& context, std::string* result) const;
+ size_t GetNumOperands() const { return merge_context_.GetNumOperands(); }
+ MergeContext* GetMergeContext() { return &merge_context_; }
+ Slice GetOperand(int index) const { return merge_context_.GetOperand(index); }
+
+ private:
+ DB* db_;
+ const DBOptions* db_options_;
+ ColumnFamilyHandle* column_family_;
+ MergeContext merge_context_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc
new file mode 100644
index 000000000..350dcc881
--- /dev/null
+++ b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc
@@ -0,0 +1,2419 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/write_batch_with_index.h"
+
+#include <map>
+#include <memory>
+
+#include "db/column_family.h"
+#include "port/stack_trace.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/string_append/stringappend.h"
+#include "utilities/write_batch_with_index/write_batch_with_index_internal.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl {
+ public:
+ explicit ColumnFamilyHandleImplDummy(int id, const Comparator* comparator)
+ : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr),
+ id_(id),
+ comparator_(comparator) {}
+ uint32_t GetID() const override { return id_; }
+ const Comparator* GetComparator() const override { return comparator_; }
+
+ private:
+ uint32_t id_;
+ const Comparator* comparator_;
+};
+
+struct Entry {
+ std::string key;
+ std::string value;
+ WriteType type;
+};
+
+struct TestHandler : public WriteBatch::Handler {
+ std::map<uint32_t, std::vector<Entry>> seen;
+ Status PutCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ Entry e;
+ e.key = key.ToString();
+ e.value = value.ToString();
+ e.type = kPutRecord;
+ seen[column_family_id].push_back(e);
+ return Status::OK();
+ }
+ Status MergeCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ Entry e;
+ e.key = key.ToString();
+ e.value = value.ToString();
+ e.type = kMergeRecord;
+ seen[column_family_id].push_back(e);
+ return Status::OK();
+ }
+ void LogData(const Slice& /*blob*/) override {}
+ Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
+ Entry e;
+ e.key = key.ToString();
+ e.value = "";
+ e.type = kDeleteRecord;
+ seen[column_family_id].push_back(e);
+ return Status::OK();
+ }
+};
+
+using KVMap = std::map<std::string, std::string>;
+
+class KVIter : public Iterator {
+ public:
+ explicit KVIter(const KVMap* map) : map_(map), iter_(map_->end()) {}
+ bool Valid() const override { return iter_ != map_->end(); }
+ void SeekToFirst() override { iter_ = map_->begin(); }
+ void SeekToLast() override {
+ if (map_->empty()) {
+ iter_ = map_->end();
+ } else {
+ iter_ = map_->find(map_->rbegin()->first);
+ }
+ }
+ void Seek(const Slice& k) override {
+ iter_ = map_->lower_bound(k.ToString());
+ }
+ void SeekForPrev(const Slice& k) override {
+ iter_ = map_->upper_bound(k.ToString());
+ Prev();
+ }
+ void Next() override { ++iter_; }
+ void Prev() override {
+ if (iter_ == map_->begin()) {
+ iter_ = map_->end();
+ return;
+ }
+ --iter_;
+ }
+ Slice key() const override { return iter_->first; }
+ Slice value() const override { return iter_->second; }
+ Status status() const override { return Status::OK(); }
+
+ private:
+ const KVMap* const map_;
+ KVMap::const_iterator iter_;
+};
+
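+// Renders the indexed entries of the batch for one column family as a string
+// of the form "PUT(k):v,MERGE(k):v,DEL(k)," that the tests compare against.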
+static std::string PrintContents(WriteBatchWithIndex* batch,
+ ColumnFamilyHandle* column_family,
+ bool hex = false) {
+ std::string result;
+
+ WBWIIterator* iter;
+ if (column_family == nullptr) {
+ iter = batch->NewIterator();
+ } else {
+ iter = batch->NewIterator(column_family);
+ }
+
+ iter->SeekToFirst();
+ while (iter->Valid()) {
+ WriteEntry e = iter->Entry();
+
+ if (e.type == kPutRecord) {
+ result.append("PUT(");
+ result.append(e.key.ToString(hex));
+ result.append("):");
+ result.append(e.value.ToString(hex));
+ } else if (e.type == kMergeRecord) {
+ result.append("MERGE(");
+ result.append(e.key.ToString(hex));
+ result.append("):");
+ result.append(e.value.ToString(hex));
+ } else if (e.type == kSingleDeleteRecord) {
+ result.append("SINGLE-DEL(");
+ result.append(e.key.ToString(hex));
+ result.append(")");
+ } else {
+ assert(e.type == kDeleteRecord);
+ result.append("DEL(");
+ result.append(e.key.ToString(hex));
+ result.append(")");
+ }
+
+ result.append(",");
+ iter->Next();
+ }
+
+ delete iter;
+ return result;
+}
+
+static std::string PrintContents(WriteBatchWithIndex* batch, KVMap* base_map,
+ ColumnFamilyHandle* column_family) {
+ std::string result;
+
+ Iterator* iter;
+ if (column_family == nullptr) {
+ iter = batch->NewIteratorWithBase(new KVIter(base_map));
+ } else {
+ iter = batch->NewIteratorWithBase(column_family, new KVIter(base_map));
+ }
+
+ iter->SeekToFirst();
+ while (iter->Valid()) {
+ assert(iter->status().ok());
+
+ Slice key = iter->key();
+ Slice value = iter->value();
+
+ result.append(key.ToString());
+ result.append(":");
+ result.append(value.ToString());
+ result.append(",");
+
+ iter->Next();
+ }
+
+ delete iter;
+ return result;
+}
+
+void AssertIter(Iterator* iter, const std::string& key,
+ const std::string& value) {
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(key, iter->key().ToString());
+ ASSERT_EQ(value, iter->value().ToString());
+}
+
+void AssertItersMatch(Iterator* iter1, Iterator* iter2) {
+ ASSERT_EQ(iter1->Valid(), iter2->Valid());
+ if (iter1->Valid()) {
+ ASSERT_EQ(iter1->key().ToString(), iter2->key().ToString());
+ ASSERT_EQ(iter1->value().ToString(), iter2->value().ToString());
+ }
+}
+
+void AssertItersEqual(Iterator* iter1, Iterator* iter2) {
+ iter1->SeekToFirst();
+ iter2->SeekToFirst();
+ while (iter1->Valid()) {
+ ASSERT_EQ(iter1->Valid(), iter2->Valid());
+ ASSERT_EQ(iter1->key().ToString(), iter2->key().ToString());
+ ASSERT_EQ(iter1->value().ToString(), iter2->value().ToString());
+ iter1->Next();
+ iter2->Next();
+ }
+ ASSERT_EQ(iter1->Valid(), iter2->Valid());
+}
+
+void AssertIterEqual(WBWIIteratorImpl* wbwii,
+ const std::vector<std::string>& keys) {
+ wbwii->SeekToFirst();
+ for (auto k : keys) {
+ ASSERT_TRUE(wbwii->Valid());
+ ASSERT_EQ(wbwii->Entry().key, k);
+ wbwii->NextKey();
+ }
+ ASSERT_FALSE(wbwii->Valid());
+ wbwii->SeekToLast();
+ for (auto kit = keys.rbegin(); kit != keys.rend(); ++kit) {
+ ASSERT_TRUE(wbwii->Valid());
+ ASSERT_EQ(wbwii->Entry().key, *kit);
+ wbwii->PrevKey();
+ }
+ ASSERT_FALSE(wbwii->Valid());
+}
+} // namespace
+
+class WBWIBaseTest : public testing::Test {
+ public:
+ explicit WBWIBaseTest(bool overwrite) : db_(nullptr) {
+ options_.merge_operator =
+ MergeOperators::CreateFromStringId("stringappend");
+ options_.create_if_missing = true;
+ dbname_ = test::PerThreadDBPath("write_batch_with_index_test");
+ EXPECT_OK(DestroyDB(dbname_, options_));
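+ // The third constructor argument toggles overwrite mode: when true, a later
+ // update to a key replaces its earlier index entry; when false, every update
+ // is kept and exposed by the WBWI iterators (see WBWIKeepTest/WBWIOverwriteTest).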
+ batch_.reset(new WriteBatchWithIndex(BytewiseComparator(), 20, overwrite));
+ }
+
+ virtual ~WBWIBaseTest() {
+ if (db_ != nullptr) {
+ ReleaseSnapshot();
+ delete db_;
+ EXPECT_OK(DestroyDB(dbname_, options_));
+ }
+ }
+
+ std::string AddToBatch(ColumnFamilyHandle* cf, const std::string& key) {
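+ // Each character of `key` drives one operation on that key: 'd' -> Delete,
+ // 'p' -> Put, 'm' -> Merge. The returned string is the value the
+ // "stringappend" merge operator should produce after replaying them in order.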
+ std::string result;
+ for (size_t i = 0; i < key.size(); i++) {
+ if (key[i] == 'd') {
+ batch_->Delete(cf, key);
+ result = "";
+ } else if (key[i] == 'p') {
+ result = key + std::to_string(i);
+ batch_->Put(cf, key, result);
+ } else if (key[i] == 'm') {
+ std::string value = key + std::to_string(i);
+ batch_->Merge(cf, key, value);
+ if (result.empty()) {
+ result = value;
+ } else {
+ result = result + "," + value;
+ }
+ }
+ }
+ return result;
+ }
+
+ virtual Status OpenDB() { return DB::Open(options_, dbname_, &db_); }
+
+ void ReleaseSnapshot() {
+ if (read_opts_.snapshot != nullptr) {
+ EXPECT_NE(db_, nullptr);
+ db_->ReleaseSnapshot(read_opts_.snapshot);
+ read_opts_.snapshot = nullptr;
+ }
+ }
+
+ public:
+ DB* db_;
+ std::string dbname_;
+ Options options_;
+ WriteOptions write_opts_;
+ ReadOptions read_opts_;
+ std::unique_ptr<WriteBatchWithIndex> batch_;
+};
+
+class WBWIKeepTest : public WBWIBaseTest {
+ public:
+ WBWIKeepTest() : WBWIBaseTest(false) {}
+};
+
+class WBWIOverwriteTest : public WBWIBaseTest {
+ public:
+ WBWIOverwriteTest() : WBWIBaseTest(true) {}
+};
+
+class WriteBatchWithIndexTest : public WBWIBaseTest,
+ public testing::WithParamInterface<bool> {
+ public:
+ WriteBatchWithIndexTest() : WBWIBaseTest(GetParam()) {}
+};
+
+void TestValueAsSecondaryIndexHelper(std::vector<Entry> entries,
+ WriteBatchWithIndex* batch) {
+ // In this test, we insert <key, value> to column family `data`, and
+ // <value, key> to column family `index`. Then iterate over them in order
+ // and seek them by key.
+
+ // Sort entries by key
+ std::map<std::string, std::vector<Entry*>> data_map;
+ // Sort entries by value
+ std::map<std::string, std::vector<Entry*>> index_map;
+ for (auto& e : entries) {
+ data_map[e.key].push_back(&e);
+ index_map[e.value].push_back(&e);
+ }
+
+ ColumnFamilyHandleImplDummy data(6, BytewiseComparator());
+ ColumnFamilyHandleImplDummy index(8, BytewiseComparator());
+ for (auto& e : entries) {
+ if (e.type == kPutRecord) {
+ ASSERT_OK(batch->Put(&data, e.key, e.value));
+ ASSERT_OK(batch->Put(&index, e.value, e.key));
+ } else if (e.type == kMergeRecord) {
+ ASSERT_OK(batch->Merge(&data, e.key, e.value));
+ ASSERT_OK(batch->Put(&index, e.value, e.key));
+ } else {
+ assert(e.type == kDeleteRecord);
+ std::unique_ptr<WBWIIterator> iter(batch->NewIterator(&data));
+ iter->Seek(e.key);
+ ASSERT_OK(iter->status());
+ auto write_entry = iter->Entry();
+ ASSERT_EQ(e.key, write_entry.key.ToString());
+ ASSERT_EQ(e.value, write_entry.value.ToString());
+ ASSERT_OK(batch->Delete(&data, e.key));
+ ASSERT_OK(batch->Put(&index, e.value, ""));
+ }
+ }
+
+ // Iterate over all keys
+ {
+ std::unique_ptr<WBWIIterator> iter(batch->NewIterator(&data));
+ for (int seek_to_first : {0, 1}) {
+ if (seek_to_first) {
+ iter->SeekToFirst();
+ } else {
+ iter->Seek("");
+ }
+ for (auto pair : data_map) {
+ for (auto v : pair.second) {
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ auto write_entry = iter->Entry();
+ ASSERT_EQ(pair.first, write_entry.key.ToString());
+ ASSERT_EQ(v->type, write_entry.type);
+ if (write_entry.type != kDeleteRecord) {
+ ASSERT_EQ(v->value, write_entry.value.ToString());
+ }
+ iter->Next();
+ }
+ }
+ ASSERT_TRUE(!iter->Valid());
+ }
+ iter->SeekToLast();
+ for (auto pair = data_map.rbegin(); pair != data_map.rend(); ++pair) {
+ for (auto v = pair->second.rbegin(); v != pair->second.rend(); v++) {
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ auto write_entry = iter->Entry();
+ ASSERT_EQ(pair->first, write_entry.key.ToString());
+ ASSERT_EQ((*v)->type, write_entry.type);
+ if (write_entry.type != kDeleteRecord) {
+ ASSERT_EQ((*v)->value, write_entry.value.ToString());
+ }
+ iter->Prev();
+ }
+ }
+ ASSERT_TRUE(!iter->Valid());
+ }
+
+ // Iterate over all indexes
+ {
+ std::unique_ptr<WBWIIterator> iter(batch->NewIterator(&index));
+ for (int seek_to_first : {0, 1}) {
+ if (seek_to_first) {
+ iter->SeekToFirst();
+ } else {
+ iter->Seek("");
+ }
+ for (auto pair : index_map) {
+ for (auto v : pair.second) {
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ auto write_entry = iter->Entry();
+ ASSERT_EQ(pair.first, write_entry.key.ToString());
+ if (v->type != kDeleteRecord) {
+ ASSERT_EQ(v->key, write_entry.value.ToString());
+ ASSERT_EQ(v->value, write_entry.key.ToString());
+ }
+ iter->Next();
+ }
+ }
+ ASSERT_TRUE(!iter->Valid());
+ }
+
+ iter->SeekToLast();
+ for (auto pair = index_map.rbegin(); pair != index_map.rend(); ++pair) {
+ for (auto v = pair->second.rbegin(); v != pair->second.rend(); v++) {
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ auto write_entry = iter->Entry();
+ ASSERT_EQ(pair->first, write_entry.key.ToString());
+ if ((*v)->type != kDeleteRecord) {
+ ASSERT_EQ((*v)->key, write_entry.value.ToString());
+ ASSERT_EQ((*v)->value, write_entry.key.ToString());
+ }
+ iter->Prev();
+ }
+ }
+ ASSERT_TRUE(!iter->Valid());
+ }
+
+ // Seek to every key
+ {
+ std::unique_ptr<WBWIIterator> iter(batch->NewIterator(&data));
+
+ // Seek the keys one by one in reverse order
+ for (auto pair = data_map.rbegin(); pair != data_map.rend(); ++pair) {
+ iter->Seek(pair->first);
+ ASSERT_OK(iter->status());
+ for (auto v : pair->second) {
+ ASSERT_TRUE(iter->Valid());
+ auto write_entry = iter->Entry();
+ ASSERT_EQ(pair->first, write_entry.key.ToString());
+ ASSERT_EQ(v->type, write_entry.type);
+ if (write_entry.type != kDeleteRecord) {
+ ASSERT_EQ(v->value, write_entry.value.ToString());
+ }
+ iter->Next();
+ ASSERT_OK(iter->status());
+ }
+ }
+ }
+
+ // Seek to every index
+ {
+ std::unique_ptr<WBWIIterator> iter(batch->NewIterator(&index));
+
+ // Seek the keys one by one in reverse order
+ for (auto pair = index_map.rbegin(); pair != index_map.rend(); ++pair) {
+ iter->Seek(pair->first);
+ ASSERT_OK(iter->status());
+ for (auto v : pair->second) {
+ ASSERT_TRUE(iter->Valid());
+ auto write_entry = iter->Entry();
+ ASSERT_EQ(pair->first, write_entry.key.ToString());
+ ASSERT_EQ(v->value, write_entry.key.ToString());
+ if (v->type != kDeleteRecord) {
+ ASSERT_EQ(v->key, write_entry.value.ToString());
+ }
+ iter->Next();
+ ASSERT_OK(iter->status());
+ }
+ }
+ }
+
+ // Verify WriteBatch can be iterated
+ TestHandler handler;
+ ASSERT_OK(batch->GetWriteBatch()->Iterate(&handler));
+
+ // Verify data column family
+ {
+ ASSERT_EQ(entries.size(), handler.seen[data.GetID()].size());
+ size_t i = 0;
+ for (auto e : handler.seen[data.GetID()]) {
+ auto write_entry = entries[i++];
+ ASSERT_EQ(e.type, write_entry.type);
+ ASSERT_EQ(e.key, write_entry.key);
+ if (e.type != kDeleteRecord) {
+ ASSERT_EQ(e.value, write_entry.value);
+ }
+ }
+ }
+
+ // Verify index column family
+ {
+ ASSERT_EQ(entries.size(), handler.seen[index.GetID()].size());
+ size_t i = 0;
+ for (auto e : handler.seen[index.GetID()]) {
+ auto write_entry = entries[i++];
+ ASSERT_EQ(e.key, write_entry.value);
+ if (write_entry.type != kDeleteRecord) {
+ ASSERT_EQ(e.value, write_entry.key);
+ }
+ }
+ }
+}
+
+TEST_F(WBWIKeepTest, TestValueAsSecondaryIndex) {
+ Entry entries[] = {
+ {"aaa", "0005", kPutRecord}, {"b", "0002", kPutRecord},
+ {"cdd", "0002", kMergeRecord}, {"aab", "00001", kPutRecord},
+ {"cc", "00005", kPutRecord}, {"cdd", "0002", kPutRecord},
+ {"aab", "0003", kPutRecord}, {"cc", "00005", kDeleteRecord},
+ };
+ std::vector<Entry> entries_list(entries, entries + 8);
+
+ batch_.reset(new WriteBatchWithIndex(nullptr, 20, false));
+
+ TestValueAsSecondaryIndexHelper(entries_list, batch_.get());
+
+ // Clear batch and re-run test with new values
+ batch_->Clear();
+
+ Entry new_entries[] = {
+ {"aaa", "0005", kPutRecord}, {"e", "0002", kPutRecord},
+ {"add", "0002", kMergeRecord}, {"aab", "00001", kPutRecord},
+ {"zz", "00005", kPutRecord}, {"add", "0002", kPutRecord},
+ {"aab", "0003", kPutRecord}, {"zz", "00005", kDeleteRecord},
+ };
+
+ entries_list = std::vector<Entry>(new_entries, new_entries + 8);
+
+ TestValueAsSecondaryIndexHelper(entries_list, batch_.get());
+}
+
+TEST_P(WriteBatchWithIndexTest, TestComparatorForCF) {
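+ // cf1 passes a null comparator; the assertions below expect its entries in
+ // the batch's default bytewise order, while reverse_cf is iterated in
+ // reverse bytewise order.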
+ ColumnFamilyHandleImplDummy cf1(6, nullptr);
+ ColumnFamilyHandleImplDummy reverse_cf(66, ReverseBytewiseComparator());
+ ColumnFamilyHandleImplDummy cf2(88, BytewiseComparator());
+
+ ASSERT_OK(batch_->Put(&cf1, "ddd", ""));
+ ASSERT_OK(batch_->Put(&cf2, "aaa", ""));
+ ASSERT_OK(batch_->Put(&cf2, "eee", ""));
+ ASSERT_OK(batch_->Put(&cf1, "ccc", ""));
+ ASSERT_OK(batch_->Put(&reverse_cf, "a11", ""));
+ ASSERT_OK(batch_->Put(&cf1, "bbb", ""));
+
+ Slice key_slices[] = {"a", "3", "3"};
+ Slice value_slice = "";
+ ASSERT_OK(batch_->Put(&reverse_cf, SliceParts(key_slices, 3),
+ SliceParts(&value_slice, 1)));
+ ASSERT_OK(batch_->Put(&reverse_cf, "a22", ""));
+
+ {
+ std::unique_ptr<WBWIIterator> iter(batch_->NewIterator(&cf1));
+ iter->Seek("");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bbb", iter->Entry().key.ToString());
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("ccc", iter->Entry().key.ToString());
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("ddd", iter->Entry().key.ToString());
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+ }
+
+ {
+ std::unique_ptr<WBWIIterator> iter(batch_->NewIterator(&cf2));
+ iter->Seek("");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("aaa", iter->Entry().key.ToString());
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("eee", iter->Entry().key.ToString());
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+ }
+
+ {
+ std::unique_ptr<WBWIIterator> iter(batch_->NewIterator(&reverse_cf));
+ iter->Seek("");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("z");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("a33", iter->Entry().key.ToString());
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("a22", iter->Entry().key.ToString());
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("a11", iter->Entry().key.ToString());
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("a22");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("a22", iter->Entry().key.ToString());
+
+ iter->Seek("a13");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("a11", iter->Entry().key.ToString());
+ }
+}
+
+TEST_F(WBWIOverwriteTest, TestOverwriteKey) {
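+ // In overwrite mode only the most recent operation per key is kept in the
+ // index, so the iterators below see exactly one entry per key reflecting the
+ // last Put or Delete issued.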
+ ColumnFamilyHandleImplDummy cf1(6, nullptr);
+ ColumnFamilyHandleImplDummy reverse_cf(66, ReverseBytewiseComparator());
+ ColumnFamilyHandleImplDummy cf2(88, BytewiseComparator());
+
+ ASSERT_OK(batch_->Merge(&cf1, "ddd", ""));
+ ASSERT_OK(batch_->Put(&cf1, "ddd", ""));
+ ASSERT_OK(batch_->Delete(&cf1, "ddd"));
+ ASSERT_OK(batch_->Put(&cf2, "aaa", ""));
+ ASSERT_OK(batch_->Delete(&cf2, "aaa"));
+ ASSERT_OK(batch_->Put(&cf2, "aaa", "aaa"));
+ ASSERT_OK(batch_->Put(&cf2, "eee", "eee"));
+ ASSERT_OK(batch_->Put(&cf1, "ccc", ""));
+ ASSERT_OK(batch_->Put(&reverse_cf, "a11", ""));
+ ASSERT_OK(batch_->Delete(&cf1, "ccc"));
+ ASSERT_OK(batch_->Put(&reverse_cf, "a33", "a33"));
+ ASSERT_OK(batch_->Put(&reverse_cf, "a11", "a11"));
+ Slice slices[] = {"a", "3", "3"};
+ ASSERT_OK(batch_->Delete(&reverse_cf, SliceParts(slices, 3)));
+
+ {
+ std::unique_ptr<WBWIIterator> iter(batch_->NewIterator(&cf1));
+ iter->Seek("");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("ccc", iter->Entry().key.ToString());
+ ASSERT_TRUE(iter->Entry().type == WriteType::kDeleteRecord);
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("ddd", iter->Entry().key.ToString());
+ ASSERT_TRUE(iter->Entry().type == WriteType::kDeleteRecord);
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+ }
+
+ {
+ std::unique_ptr<WBWIIterator> iter(batch_->NewIterator(&cf2));
+ iter->SeekToLast();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("eee", iter->Entry().key.ToString());
+ ASSERT_EQ("eee", iter->Entry().value.ToString());
+ iter->Prev();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("aaa", iter->Entry().key.ToString());
+ ASSERT_EQ("aaa", iter->Entry().value.ToString());
+ iter->Prev();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->SeekToFirst();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("aaa", iter->Entry().key.ToString());
+ ASSERT_EQ("aaa", iter->Entry().value.ToString());
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("eee", iter->Entry().key.ToString());
+ ASSERT_EQ("eee", iter->Entry().value.ToString());
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+ }
+
+ {
+ std::unique_ptr<WBWIIterator> iter(batch_->NewIterator(&reverse_cf));
+ iter->Seek("");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("z");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("a33", iter->Entry().key.ToString());
+ ASSERT_TRUE(iter->Entry().type == WriteType::kDeleteRecord);
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("a11", iter->Entry().key.ToString());
+ ASSERT_EQ("a11", iter->Entry().value.ToString());
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->SeekToLast();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("a11", iter->Entry().key.ToString());
+ ASSERT_EQ("a11", iter->Entry().value.ToString());
+ iter->Prev();
+
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("a33", iter->Entry().key.ToString());
+ ASSERT_TRUE(iter->Entry().type == WriteType::kDeleteRecord);
+ iter->Prev();
+ ASSERT_TRUE(!iter->Valid());
+ }
+}
+
+TEST_P(WriteBatchWithIndexTest, TestWBWIIterator) {
+ ColumnFamilyHandleImplDummy cf1(1, BytewiseComparator());
+ ColumnFamilyHandleImplDummy cf2(2, BytewiseComparator());
+ ASSERT_OK(batch_->Put(&cf1, "a", "a1"));
+ ASSERT_OK(batch_->Put(&cf1, "c", "c1"));
+ ASSERT_OK(batch_->Put(&cf1, "c", "c2"));
+ ASSERT_OK(batch_->Put(&cf1, "e", "e1"));
+ ASSERT_OK(batch_->Put(&cf1, "e", "e2"));
+ ASSERT_OK(batch_->Put(&cf1, "e", "e3"));
+ std::unique_ptr<WBWIIteratorImpl> iter1(
+ static_cast<WBWIIteratorImpl*>(batch_->NewIterator(&cf1)));
+ std::unique_ptr<WBWIIteratorImpl> iter2(
+ static_cast<WBWIIteratorImpl*>(batch_->NewIterator(&cf2)));
+ AssertIterEqual(iter1.get(), {"a", "c", "e"});
+ AssertIterEqual(iter2.get(), {});
+ ASSERT_OK(batch_->Put(&cf2, "a", "a2"));
+ ASSERT_OK(batch_->Merge(&cf2, "b", "b1"));
+ ASSERT_OK(batch_->Merge(&cf2, "b", "b2"));
+ ASSERT_OK(batch_->Delete(&cf2, "d"));
+ ASSERT_OK(batch_->Merge(&cf2, "d", "d2"));
+ ASSERT_OK(batch_->Merge(&cf2, "d", "d3"));
+ ASSERT_OK(batch_->Delete(&cf2, "f"));
+ AssertIterEqual(iter1.get(), {"a", "c", "e"});
+ AssertIterEqual(iter2.get(), {"a", "b", "d", "f"});
+}
+
+TEST_P(WriteBatchWithIndexTest, TestRandomIteraratorWithBase) {
+ std::vector<std::string> source_strings = {"a", "b", "c", "d", "e",
+ "f", "g", "h", "i", "j"};
+ for (int rand_seed = 301; rand_seed < 366; rand_seed++) {
+ Random rnd(rand_seed);
+
+ ColumnFamilyHandleImplDummy cf1(6, BytewiseComparator());
+ ColumnFamilyHandleImplDummy cf2(2, BytewiseComparator());
+ ColumnFamilyHandleImplDummy cf3(8, BytewiseComparator());
+ batch_->Clear();
+
+ if (rand_seed % 2 == 0) {
+ ASSERT_OK(batch_->Put(&cf2, "zoo", "bar"));
+ }
+ if (rand_seed % 4 == 1) {
+ ASSERT_OK(batch_->Put(&cf3, "zoo", "bar"));
+ }
+
+ KVMap map;
+ KVMap merged_map;
+ for (auto key : source_strings) {
+ std::string value = key + key;
+ int type = rnd.Uniform(6);
+ switch (type) {
+ case 0:
+ // only base has it
+ map[key] = value;
+ merged_map[key] = value;
+ break;
+ case 1:
+ // only delta has it
+ ASSERT_OK(batch_->Put(&cf1, key, value));
+ map[key] = value;
+ merged_map[key] = value;
+ break;
+ case 2:
+ // both have it; the delta should win
+ ASSERT_OK(batch_->Put(&cf1, key, value));
+ map[key] = "wrong_value";
+ merged_map[key] = value;
+ break;
+ case 3:
+ // both have it; the delta is a delete
+ ASSERT_OK(batch_->Delete(&cf1, key));
+ map[key] = "wrong_value";
+ break;
+ case 4:
+ // only the delta has it, and it is a delete
+ ASSERT_OK(batch_->Delete(&cf1, key));
+ map[key] = "wrong_value";
+ break;
+ default:
+ // Neither iterator has it.
+ break;
+ }
+ }
+
+ std::unique_ptr<Iterator> iter(
+ batch_->NewIteratorWithBase(&cf1, new KVIter(&map)));
+ std::unique_ptr<Iterator> result_iter(new KVIter(&merged_map));
+
+ bool is_valid = false;
+ for (int i = 0; i < 128; i++) {
+ // Random walk and make sure iter and result_iter return the
+ // same key and value
+ int type = rnd.Uniform(6);
+ ASSERT_OK(iter->status());
+ switch (type) {
+ case 0:
+ // Seek to First
+ iter->SeekToFirst();
+ result_iter->SeekToFirst();
+ break;
+ case 1:
+ // Seek to last
+ iter->SeekToLast();
+ result_iter->SeekToLast();
+ break;
+ case 2: {
+ // Seek to random key
+ auto key_idx = rnd.Uniform(static_cast<int>(source_strings.size()));
+ auto key = source_strings[key_idx];
+ iter->Seek(key);
+ result_iter->Seek(key);
+ break;
+ }
+ case 3: {
+ // SeekForPrev to random key
+ auto key_idx = rnd.Uniform(static_cast<int>(source_strings.size()));
+ auto key = source_strings[key_idx];
+ iter->SeekForPrev(key);
+ result_iter->SeekForPrev(key);
+ break;
+ }
+ case 4:
+ // Next
+ if (is_valid) {
+ iter->Next();
+ result_iter->Next();
+ } else {
+ continue;
+ }
+ break;
+ default:
+ assert(type == 5);
+ // Prev
+ if (is_valid) {
+ iter->Prev();
+ result_iter->Prev();
+ } else {
+ continue;
+ }
+ break;
+ }
+ AssertItersMatch(iter.get(), result_iter.get());
+ is_valid = iter->Valid();
+ }
+
+ ASSERT_OK(iter->status());
+ }
+}
+
+TEST_P(WriteBatchWithIndexTest, TestIteraratorWithBase) {
+ ColumnFamilyHandleImplDummy cf1(6, BytewiseComparator());
+ ColumnFamilyHandleImplDummy cf2(2, BytewiseComparator());
+ {
+ KVMap map;
+ map["a"] = "aa";
+ map["c"] = "cc";
+ map["e"] = "ee";
+ std::unique_ptr<Iterator> iter(
+ batch_->NewIteratorWithBase(&cf1, new KVIter(&map)));
+
+ iter->SeekToFirst();
+ AssertIter(iter.get(), "a", "aa");
+ iter->Next();
+ AssertIter(iter.get(), "c", "cc");
+ iter->Next();
+ AssertIter(iter.get(), "e", "ee");
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->SeekToLast();
+ AssertIter(iter.get(), "e", "ee");
+ iter->Prev();
+ AssertIter(iter.get(), "c", "cc");
+ iter->Prev();
+ AssertIter(iter.get(), "a", "aa");
+ iter->Prev();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("b");
+ AssertIter(iter.get(), "c", "cc");
+
+ iter->Prev();
+ AssertIter(iter.get(), "a", "aa");
+
+ iter->Seek("a");
+ AssertIter(iter.get(), "a", "aa");
+ }
+
+ // Test the case that there is one element in the write batch
+ ASSERT_OK(batch_->Put(&cf2, "zoo", "bar"));
+ ASSERT_OK(batch_->Put(&cf1, "a", "aa"));
+ {
+ KVMap empty_map;
+ std::unique_ptr<Iterator> iter(
+ batch_->NewIteratorWithBase(&cf1, new KVIter(&empty_map)));
+
+ iter->SeekToFirst();
+ AssertIter(iter.get(), "a", "aa");
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+ }
+
+ ASSERT_OK(batch_->Delete(&cf1, "b"));
+ ASSERT_OK(batch_->Put(&cf1, "c", "cc"));
+ ASSERT_OK(batch_->Put(&cf1, "d", "dd"));
+ ASSERT_OK(batch_->Delete(&cf1, "e"));
+
+ {
+ KVMap map;
+ map["b"] = "";
+ map["cc"] = "cccc";
+ map["f"] = "ff";
+ std::unique_ptr<Iterator> iter(
+ batch_->NewIteratorWithBase(&cf1, new KVIter(&map)));
+
+ iter->SeekToFirst();
+ AssertIter(iter.get(), "a", "aa");
+ iter->Next();
+ AssertIter(iter.get(), "c", "cc");
+ iter->Next();
+ AssertIter(iter.get(), "cc", "cccc");
+ iter->Next();
+ AssertIter(iter.get(), "d", "dd");
+ iter->Next();
+ AssertIter(iter.get(), "f", "ff");
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->SeekToLast();
+ AssertIter(iter.get(), "f", "ff");
+ iter->Prev();
+ AssertIter(iter.get(), "d", "dd");
+ iter->Prev();
+ AssertIter(iter.get(), "cc", "cccc");
+ iter->Prev();
+ AssertIter(iter.get(), "c", "cc");
+ iter->Next();
+ AssertIter(iter.get(), "cc", "cccc");
+ iter->Prev();
+ AssertIter(iter.get(), "c", "cc");
+ iter->Prev();
+ AssertIter(iter.get(), "a", "aa");
+ iter->Prev();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("c");
+ AssertIter(iter.get(), "c", "cc");
+
+ iter->Seek("cb");
+ AssertIter(iter.get(), "cc", "cccc");
+
+ iter->Seek("cc");
+ AssertIter(iter.get(), "cc", "cccc");
+ iter->Next();
+ AssertIter(iter.get(), "d", "dd");
+
+ iter->Seek("e");
+ AssertIter(iter.get(), "f", "ff");
+
+ iter->Prev();
+ AssertIter(iter.get(), "d", "dd");
+
+ iter->Next();
+ AssertIter(iter.get(), "f", "ff");
+ }
+
+ {
+ KVMap empty_map;
+ std::unique_ptr<Iterator> iter(
+ batch_->NewIteratorWithBase(&cf1, new KVIter(&empty_map)));
+
+ iter->SeekToFirst();
+ AssertIter(iter.get(), "a", "aa");
+ iter->Next();
+ AssertIter(iter.get(), "c", "cc");
+ iter->Next();
+ AssertIter(iter.get(), "d", "dd");
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->SeekToLast();
+ AssertIter(iter.get(), "d", "dd");
+ iter->Prev();
+ AssertIter(iter.get(), "c", "cc");
+ iter->Prev();
+ AssertIter(iter.get(), "a", "aa");
+
+ iter->Prev();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("aa");
+ AssertIter(iter.get(), "c", "cc");
+ iter->Next();
+ AssertIter(iter.get(), "d", "dd");
+
+ iter->Seek("ca");
+ AssertIter(iter.get(), "d", "dd");
+
+ iter->Prev();
+ AssertIter(iter.get(), "c", "cc");
+ }
+}
+
+TEST_P(WriteBatchWithIndexTest, TestIteraratorWithBaseReverseCmp) {
+ ColumnFamilyHandleImplDummy cf1(6, ReverseBytewiseComparator());
+ ColumnFamilyHandleImplDummy cf2(2, ReverseBytewiseComparator());
+
+ // Test the case that there is one element in the write batch
+ ASSERT_OK(batch_->Put(&cf2, "zoo", "bar"));
+ ASSERT_OK(batch_->Put(&cf1, "a", "aa"));
+ {
+ KVMap empty_map;
+ std::unique_ptr<Iterator> iter(
+ batch_->NewIteratorWithBase(&cf1, new KVIter(&empty_map)));
+
+ iter->SeekToFirst();
+ AssertIter(iter.get(), "a", "aa");
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+ }
+
+ ASSERT_OK(batch_->Put(&cf1, "c", "cc"));
+ {
+ KVMap map;
+ std::unique_ptr<Iterator> iter(
+ batch_->NewIteratorWithBase(&cf1, new KVIter(&map)));
+
+ iter->SeekToFirst();
+ AssertIter(iter.get(), "c", "cc");
+ iter->Next();
+ AssertIter(iter.get(), "a", "aa");
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->SeekToLast();
+ AssertIter(iter.get(), "a", "aa");
+ iter->Prev();
+ AssertIter(iter.get(), "c", "cc");
+ iter->Prev();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("b");
+ AssertIter(iter.get(), "a", "aa");
+
+ iter->Prev();
+ AssertIter(iter.get(), "c", "cc");
+
+ iter->Seek("a");
+ AssertIter(iter.get(), "a", "aa");
+ }
+
+ // default column family
+ ASSERT_OK(batch_->Put("a", "b"));
+ {
+ KVMap map;
+ map["b"] = "";
+ std::unique_ptr<Iterator> iter(
+ batch_->NewIteratorWithBase(new KVIter(&map)));
+
+ iter->SeekToFirst();
+ AssertIter(iter.get(), "a", "b");
+ iter->Next();
+ AssertIter(iter.get(), "b", "");
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->SeekToLast();
+ AssertIter(iter.get(), "b", "");
+ iter->Prev();
+ AssertIter(iter.get(), "a", "b");
+ iter->Prev();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("b");
+ AssertIter(iter.get(), "b", "");
+
+ iter->Prev();
+ AssertIter(iter.get(), "a", "b");
+
+ iter->Seek("0");
+ AssertIter(iter.get(), "a", "b");
+ }
+}
+
+TEST_P(WriteBatchWithIndexTest, TestGetFromBatch) {
+ Options options;
+ Status s;
+ std::string value;
+
+ s = batch_->GetFromBatch(options_, "b", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ ASSERT_OK(batch_->Put("a", "a"));
+ ASSERT_OK(batch_->Put("b", "b"));
+ ASSERT_OK(batch_->Put("c", "c"));
+ ASSERT_OK(batch_->Put("a", "z"));
+ ASSERT_OK(batch_->Delete("c"));
+ ASSERT_OK(batch_->Delete("d"));
+ ASSERT_OK(batch_->Delete("e"));
+ ASSERT_OK(batch_->Put("e", "e"));
+
+ s = batch_->GetFromBatch(options_, "b", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("b", value);
+
+ s = batch_->GetFromBatch(options_, "a", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("z", value);
+
+ s = batch_->GetFromBatch(options_, "c", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = batch_->GetFromBatch(options_, "d", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = batch_->GetFromBatch(options_, "x", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = batch_->GetFromBatch(options_, "e", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("e", value);
+
+ ASSERT_OK(batch_->Merge("z", "z"));
+
+ s = batch_->GetFromBatch(options_, "z", &value);
+ ASSERT_NOK(s); // No merge operator specified.
+
+ s = batch_->GetFromBatch(options_, "b", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("b", value);
+}
+
+TEST_P(WriteBatchWithIndexTest, TestGetFromBatchMerge) {
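+ // GetFromBatch can resolve merges stacked on top of a Put in the batch, but a
+ // key that only has Merge records (like "z") yields Status::MergeInProgress
+ // because the base value would have to come from the DB.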
+ Status s = OpenDB();
+ ASSERT_OK(s);
+
+ ColumnFamilyHandle* column_family = db_->DefaultColumnFamily();
+ std::string value;
+
+ s = batch_->GetFromBatch(options_, "x", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ ASSERT_OK(batch_->Put("x", "X"));
+ std::string expected = "X";
+
+ for (int i = 0; i < 5; i++) {
+ ASSERT_OK(batch_->Merge("x", std::to_string(i)));
+ expected = expected + "," + std::to_string(i);
+
+ if (i % 2 == 0) {
+ ASSERT_OK(batch_->Put("y", std::to_string(i / 2)));
+ }
+
+ ASSERT_OK(batch_->Merge("z", "z"));
+
+ s = batch_->GetFromBatch(column_family, options_, "x", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(expected, value);
+
+ s = batch_->GetFromBatch(column_family, options_, "y", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(std::to_string(i / 2), value);
+
+ s = batch_->GetFromBatch(column_family, options_, "z", &value);
+ ASSERT_TRUE(s.IsMergeInProgress());
+ }
+}
+
+TEST_F(WBWIOverwriteTest, TestGetFromBatchMerge2) {
+ Status s = OpenDB();
+ ASSERT_OK(s);
+
+ ColumnFamilyHandle* column_family = db_->DefaultColumnFamily();
+ std::string value;
+
+ s = batch_->GetFromBatch(column_family, options_, "X", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ ASSERT_OK(batch_->Put(column_family, "X", "x"));
+ ASSERT_OK(batch_->GetFromBatch(column_family, options_, "X", &value));
+ ASSERT_EQ("x", value);
+
+ ASSERT_OK(batch_->Put(column_family, "X", "x2"));
+ ASSERT_OK(batch_->GetFromBatch(column_family, options_, "X", &value));
+ ASSERT_EQ("x2", value);
+
+ ASSERT_OK(batch_->Merge(column_family, "X", "aaa"));
+ ASSERT_OK(batch_->GetFromBatch(column_family, options_, "X", &value));
+ ASSERT_EQ("x2,aaa", value);
+
+ ASSERT_OK(batch_->Merge(column_family, "X", "bbb"));
+ ASSERT_OK(batch_->GetFromBatch(column_family, options_, "X", &value));
+ ASSERT_EQ("x2,aaa,bbb", value);
+
+ ASSERT_OK(batch_->Put(column_family, "X", "x3"));
+ ASSERT_OK(batch_->GetFromBatch(column_family, options_, "X", &value));
+ ASSERT_EQ("x3", value);
+
+ ASSERT_OK(batch_->Merge(column_family, "X", "ccc"));
+ ASSERT_OK(batch_->GetFromBatch(column_family, options_, "X", &value));
+ ASSERT_EQ("x3,ccc", value);
+
+ ASSERT_OK(batch_->Delete(column_family, "X"));
+ s = batch_->GetFromBatch(column_family, options_, "X", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ ASSERT_OK(batch_->Merge(column_family, "X", "ddd"));
+ ASSERT_OK(batch_->GetFromBatch(column_family, options_, "X", &value));
+ ASSERT_EQ("ddd", value);
+}
+
+TEST_P(WriteBatchWithIndexTest, TestGetFromBatchAndDB) {
+ ASSERT_OK(OpenDB());
+
+ std::string value;
+
+ ASSERT_OK(db_->Put(write_opts_, "a", "a"));
+ ASSERT_OK(db_->Put(write_opts_, "b", "b"));
+ ASSERT_OK(db_->Put(write_opts_, "c", "c"));
+
+ ASSERT_OK(batch_->Put("a", "batch_->a"));
+ ASSERT_OK(batch_->Delete("b"));
+
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "a", &value));
+ ASSERT_EQ("batch_->a", value);
+
+ Status s = batch_->GetFromBatchAndDB(db_, read_opts_, "b", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "c", &value));
+ ASSERT_EQ("c", value);
+
+ s = batch_->GetFromBatchAndDB(db_, read_opts_, "x", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ ASSERT_OK(db_->Delete(write_opts_, "x"));
+
+ s = batch_->GetFromBatchAndDB(db_, read_opts_, "x", &value);
+ ASSERT_TRUE(s.IsNotFound());
+}
+
+TEST_P(WriteBatchWithIndexTest, TestGetFromBatchAndDBMerge) {
+ Status s = OpenDB();
+ ASSERT_OK(s);
+
+ std::string value;
+
+ ASSERT_OK(db_->Put(write_opts_, "a", "a0"));
+ ASSERT_OK(db_->Put(write_opts_, "b", "b0"));
+ ASSERT_OK(db_->Merge(write_opts_, "b", "b1"));
+ ASSERT_OK(db_->Merge(write_opts_, "c", "c0"));
+ ASSERT_OK(db_->Merge(write_opts_, "d", "d0"));
+
+ ASSERT_OK(batch_->Merge("a", "a1"));
+ ASSERT_OK(batch_->Merge("a", "a2"));
+ ASSERT_OK(batch_->Merge("b", "b2"));
+ ASSERT_OK(batch_->Merge("d", "d1"));
+ ASSERT_OK(batch_->Merge("e", "e0"));
+
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "a", &value));
+ ASSERT_EQ("a0,a1,a2", value);
+
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "b", &value));
+ ASSERT_EQ("b0,b1,b2", value);
+
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "c", &value));
+ ASSERT_EQ("c0", value);
+
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "d", &value));
+ ASSERT_EQ("d0,d1", value);
+
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "e", &value));
+ ASSERT_EQ("e0", value);
+
+ ASSERT_OK(db_->Delete(write_opts_, "x"));
+
+ s = batch_->GetFromBatchAndDB(db_, read_opts_, "x", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ReadOptions snapshot_read_options;
+ snapshot_read_options.snapshot = snapshot;
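+ // Reads through snapshot_read_options keep seeing the DB state captured at
+ // GetSnapshot(), while reads with read_opts_ observe later DB mutations; the
+ // batch contents are merged on top in both cases.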
+
+ ASSERT_OK(db_->Delete(write_opts_, "a"));
+
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "a", &value));
+ ASSERT_EQ("a1,a2", value);
+
+ ASSERT_OK(
+ s = batch_->GetFromBatchAndDB(db_, snapshot_read_options, "a", &value));
+ ASSERT_EQ("a0,a1,a2", value);
+
+ ASSERT_OK(batch_->Delete("a"));
+
+ s = batch_->GetFromBatchAndDB(db_, read_opts_, "a", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = batch_->GetFromBatchAndDB(db_, snapshot_read_options, "a", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ ASSERT_OK(s = db_->Merge(write_opts_, "c", "c1"));
+
+ ASSERT_OK(s = batch_->GetFromBatchAndDB(db_, read_opts_, "c", &value));
+ ASSERT_EQ("c0,c1", value);
+
+ ASSERT_OK(
+ s = batch_->GetFromBatchAndDB(db_, snapshot_read_options, "c", &value));
+ ASSERT_EQ("c0", value);
+
+ ASSERT_OK(db_->Put(write_opts_, "e", "e1"));
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "e", &value));
+ ASSERT_EQ("e1,e0", value);
+
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, snapshot_read_options, "e", &value));
+ ASSERT_EQ("e0", value);
+
+ ASSERT_OK(s = db_->Delete(write_opts_, "e"));
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "e", &value));
+ ASSERT_EQ("e0", value);
+
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, snapshot_read_options, "e", &value));
+ ASSERT_EQ("e0", value);
+
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(WBWIOverwriteTest, TestGetFromBatchAndDBMerge2) {
+ Status s = OpenDB();
+ ASSERT_OK(s);
+
+ std::string value;
+
+ s = batch_->GetFromBatchAndDB(db_, read_opts_, "A", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ ASSERT_OK(batch_->Merge("A", "xxx"));
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "A", &value));
+ ASSERT_EQ(value, "xxx");
+
+ ASSERT_OK(batch_->Merge("A", "yyy"));
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "A", &value));
+ ASSERT_EQ(value, "xxx,yyy");
+
+ ASSERT_OK(db_->Put(write_opts_, "A", "a0"));
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "A", &value));
+ ASSERT_EQ(value, "a0,xxx,yyy");
+
+ ASSERT_OK(batch_->Delete("A"));
+
+ s = batch_->GetFromBatchAndDB(db_, read_opts_, "A", &value);
+ ASSERT_TRUE(s.IsNotFound());
+}
+
+TEST_P(WriteBatchWithIndexTest, TestGetFromBatchAndDBMerge3) {
+ Status s = OpenDB();
+ ASSERT_OK(s);
+
+ FlushOptions flush_options;
+ std::string value;
+
+ ASSERT_OK(db_->Put(write_opts_, "A", "1"));
+ ASSERT_OK(db_->Flush(flush_options, db_->DefaultColumnFamily()));
+ ASSERT_OK(batch_->Merge("A", "2"));
+
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "A", &value));
+ ASSERT_EQ(value, "1,2");
+}
+
+TEST_P(WriteBatchWithIndexTest, TestPinnedGetFromBatchAndDB) {
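+ // Exercises the PinnableSlice overload of GetFromBatchAndDB; the second loop
+ // iteration repeats the lookups after a Flush so the DB values are read back
+ // from an SST file rather than the memtable.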
+ Status s = OpenDB();
+ ASSERT_OK(s);
+
+ PinnableSlice value;
+
+ ASSERT_OK(db_->Put(write_opts_, "a", "a0"));
+ ASSERT_OK(db_->Put(write_opts_, "b", "b0"));
+ ASSERT_OK(db_->Merge(write_opts_, "b", "b1"));
+ ASSERT_OK(db_->Merge(write_opts_, "c", "c0"));
+ ASSERT_OK(db_->Merge(write_opts_, "d", "d0"));
+ ASSERT_OK(batch_->Merge("a", "a1"));
+ ASSERT_OK(batch_->Merge("a", "a2"));
+ ASSERT_OK(batch_->Merge("b", "b2"));
+ ASSERT_OK(batch_->Merge("d", "d1"));
+ ASSERT_OK(batch_->Merge("e", "e0"));
+
+ for (int i = 0; i < 2; i++) {
+ if (i == 1) {
+ // Do it again with a flushed DB...
+ ASSERT_OK(db_->Flush(FlushOptions(), db_->DefaultColumnFamily()));
+ }
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "a", &value));
+ ASSERT_EQ("a0,a1,a2", value.ToString());
+
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "b", &value));
+ ASSERT_EQ("b0,b1,b2", value.ToString());
+
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "c", &value));
+ ASSERT_EQ("c0", value.ToString());
+
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "d", &value));
+ ASSERT_EQ("d0,d1", value.ToString());
+
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "e", &value));
+ ASSERT_EQ("e0", value.ToString());
+ ASSERT_OK(db_->Delete(write_opts_, "x"));
+
+ s = batch_->GetFromBatchAndDB(db_, read_opts_, "x", &value);
+ ASSERT_TRUE(s.IsNotFound());
+ }
+}
+
+void AssertKey(std::string key, WBWIIterator* iter) {
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(key, iter->Entry().key.ToString());
+}
+
+void AssertValue(std::string value, WBWIIterator* iter) {
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(value, iter->Entry().value.ToString());
+}
+
+// Tests that we can write to the WBWI while we iterate (from a single thread).
+// Iteration should see the newest writes.
+TEST_F(WBWIOverwriteTest, MutateWhileIteratingCorrectnessTest) {
+ for (char c = 'a'; c <= 'z'; ++c) {
+ ASSERT_OK(batch_->Put(std::string(1, c), std::string(1, c)));
+ }
+
+ std::unique_ptr<WBWIIterator> iter(batch_->NewIterator());
+ iter->Seek("k");
+ AssertKey("k", iter.get());
+ iter->Next();
+ AssertKey("l", iter.get());
+ ASSERT_OK(batch_->Put("ab", "cc"));
+ iter->Next();
+ AssertKey("m", iter.get());
+ ASSERT_OK(batch_->Put("mm", "kk"));
+ iter->Next();
+ AssertKey("mm", iter.get());
+ AssertValue("kk", iter.get());
+ ASSERT_OK(batch_->Delete("mm"));
+
+ iter->Next();
+ AssertKey("n", iter.get());
+ iter->Prev();
+ AssertKey("mm", iter.get());
+ ASSERT_EQ(kDeleteRecord, iter->Entry().type);
+
+ iter->Seek("ab");
+ AssertKey("ab", iter.get());
+ ASSERT_OK(batch_->Delete("x"));
+ iter->Seek("x");
+ AssertKey("x", iter.get());
+ ASSERT_EQ(kDeleteRecord, iter->Entry().type);
+ iter->Prev();
+ AssertKey("w", iter.get());
+}
+
+void AssertIterKey(std::string key, Iterator* iter) {
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(key, iter->key().ToString());
+}
+
+void AssertIterValue(std::string value, Iterator* iter) {
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(value, iter->value().ToString());
+}
+
+// Same as above, but exercising the iterator created with NewIteratorWithBase.
+TEST_F(WBWIOverwriteTest, MutateWhileIteratingBaseCorrectnessTest) {
+ WriteBatchWithIndex batch(BytewiseComparator(), 0, true);
+ for (char c = 'a'; c <= 'z'; ++c) {
+ ASSERT_OK(batch_->Put(std::string(1, c), std::string(1, c)));
+ }
+
+ KVMap map;
+ map["aa"] = "aa";
+ map["cc"] = "cc";
+ map["ee"] = "ee";
+ map["em"] = "me";
+
+ std::unique_ptr<Iterator> iter(batch_->NewIteratorWithBase(new KVIter(&map)));
+ iter->Seek("k");
+ AssertIterKey("k", iter.get());
+ iter->Next();
+ AssertIterKey("l", iter.get());
+ ASSERT_OK(batch_->Put("ab", "cc"));
+ iter->Next();
+ AssertIterKey("m", iter.get());
+ ASSERT_OK(batch_->Put("mm", "kk"));
+ iter->Next();
+ AssertIterKey("mm", iter.get());
+ AssertIterValue("kk", iter.get());
+ ASSERT_OK(batch_->Delete("mm"));
+ iter->Next();
+ AssertIterKey("n", iter.get());
+ iter->Prev();
+ // "mm" is deleted, so we're back at "m"
+ AssertIterKey("m", iter.get());
+
+ iter->Seek("ab");
+ AssertIterKey("ab", iter.get());
+ iter->Prev();
+ AssertIterKey("aa", iter.get());
+ iter->Prev();
+ AssertIterKey("a", iter.get());
+ ASSERT_OK(batch_->Delete("aa"));
+ iter->Next();
+ AssertIterKey("ab", iter.get());
+ iter->Prev();
+ AssertIterKey("a", iter.get());
+
+ ASSERT_OK(batch_->Delete("x"));
+ iter->Seek("x");
+ AssertIterKey("y", iter.get());
+ iter->Next();
+ AssertIterKey("z", iter.get());
+ iter->Prev();
+ iter->Prev();
+ AssertIterKey("w", iter.get());
+
+ ASSERT_OK(batch_->Delete("e"));
+ iter->Seek("e");
+ AssertIterKey("ee", iter.get());
+ AssertIterValue("ee", iter.get());
+ ASSERT_OK(batch_->Put("ee", "xx"));
+ // still the same value
+ AssertIterValue("ee", iter.get());
+ iter->Next();
+ AssertIterKey("em", iter.get());
+ iter->Prev();
+ // new value
+ AssertIterValue("xx", iter.get());
+
+ ASSERT_OK(iter->status());
+}
+
+// stress testing mutations with IteratorWithBase
+TEST_F(WBWIOverwriteTest, MutateWhileIteratingBaseStressTest) {
+ for (char c = 'a'; c <= 'z'; ++c) {
+ ASSERT_OK(batch_->Put(std::string(1, c), std::string(1, c)));
+ }
+
+ KVMap map;
+ for (char c = 'a'; c <= 'z'; ++c) {
+ map[std::string(2, c)] = std::string(2, c);
+ }
+
+ std::unique_ptr<Iterator> iter(batch_->NewIteratorWithBase(new KVIter(&map)));
+
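+ // Random walk: cases 0-3 mutate the batch (Put/Delete on one- and
+ // two-character keys) while cases 4-7 seek or step the iterator; the test
+ // only checks that no operation leaves the iterator in an error state.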
+ Random rnd(301);
+ for (int i = 0; i < 1000000; ++i) {
+ int random = rnd.Uniform(8);
+ char c = static_cast<char>(rnd.Uniform(26) + 'a');
+ switch (random) {
+ case 0:
+ ASSERT_OK(batch_->Put(std::string(1, c), "xxx"));
+ break;
+ case 1:
+ ASSERT_OK(batch_->Put(std::string(2, c), "xxx"));
+ break;
+ case 2:
+ ASSERT_OK(batch_->Delete(std::string(1, c)));
+ break;
+ case 3:
+ ASSERT_OK(batch_->Delete(std::string(2, c)));
+ break;
+ case 4:
+ iter->Seek(std::string(1, c));
+ break;
+ case 5:
+ iter->Seek(std::string(2, c));
+ break;
+ case 6:
+ if (iter->Valid()) {
+ iter->Next();
+ }
+ break;
+ case 7:
+ if (iter->Valid()) {
+ iter->Prev();
+ }
+ break;
+ default:
+ assert(false);
+ }
+ }
+ ASSERT_OK(iter->status());
+}
+
+TEST_P(WriteBatchWithIndexTest, TestNewIteratorWithBaseFromWbwi) {
+ ColumnFamilyHandleImplDummy cf1(6, BytewiseComparator());
+ KVMap map;
+ map["a"] = "aa";
+ map["c"] = "cc";
+ map["e"] = "ee";
+ std::unique_ptr<Iterator> iter(
+ batch_->NewIteratorWithBase(&cf1, new KVIter(&map)));
+ ASSERT_NE(nullptr, iter);
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+}
+
+TEST_P(WriteBatchWithIndexTest, SavePointTest) {
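+ // Interleaves writes with SetSavePoint()/RollbackToSavePoint() and checks,
+ // via iterators over an empty base, that each rollback restores the batch
+ // contents recorded at the corresponding numbered savepoint.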
+ ColumnFamilyHandleImplDummy cf1(1, BytewiseComparator());
+ KVMap empty_map;
+ std::unique_ptr<Iterator> cf0_iter(
+ batch_->NewIteratorWithBase(new KVIter(&empty_map)));
+ std::unique_ptr<Iterator> cf1_iter(
+ batch_->NewIteratorWithBase(&cf1, new KVIter(&empty_map)));
+ Status s;
+ KVMap kvm_cf0_0 = {{"A", "aa"}, {"B", "b"}};
+ KVMap kvm_cf1_0 = {{"A", "a1"}, {"C", "c1"}, {"E", "e1"}};
+ KVIter kvi_cf0_0(&kvm_cf0_0);
+ KVIter kvi_cf1_0(&kvm_cf1_0);
+
+ ASSERT_OK(batch_->Put("A", "a"));
+ ASSERT_OK(batch_->Put("B", "b"));
+ ASSERT_OK(batch_->Put("A", "aa"));
+ ASSERT_OK(batch_->Put(&cf1, "A", "a1"));
+ ASSERT_OK(batch_->Delete(&cf1, "B"));
+ ASSERT_OK(batch_->Put(&cf1, "C", "c1"));
+ ASSERT_OK(batch_->Put(&cf1, "E", "e1"));
+
+ AssertItersEqual(cf0_iter.get(), &kvi_cf0_0);
+ AssertItersEqual(cf1_iter.get(), &kvi_cf1_0);
+ batch_->SetSavePoint(); // 1
+
+ KVMap kvm_cf0_1 = {{"B", "bb"}, {"C", "cc"}};
+ KVMap kvm_cf1_1 = {{"B", "b1"}, {"C", "c1"}};
+ KVIter kvi_cf0_1(&kvm_cf0_1);
+ KVIter kvi_cf1_1(&kvm_cf1_1);
+
+ ASSERT_OK(batch_->Put("C", "cc"));
+ ASSERT_OK(batch_->Put("B", "bb"));
+ ASSERT_OK(batch_->Delete("A"));
+ ASSERT_OK(batch_->Put(&cf1, "B", "b1"));
+ ASSERT_OK(batch_->Delete(&cf1, "A"));
+ ASSERT_OK(batch_->SingleDelete(&cf1, "E"));
+ batch_->SetSavePoint(); // 2
+ AssertItersEqual(cf0_iter.get(), &kvi_cf0_1);
+ AssertItersEqual(cf1_iter.get(), &kvi_cf1_1);
+
+ KVMap kvm_cf0_2 = {{"A", "xxx"}, {"C", "cc"}};
+ KVMap kvm_cf1_2 = {{"B", "b2"}};
+ KVIter kvi_cf0_2(&kvm_cf0_2);
+ KVIter kvi_cf1_2(&kvm_cf1_2);
+
+ ASSERT_OK(batch_->Put("A", "aaa"));
+ ASSERT_OK(batch_->Put("A", "xxx"));
+ ASSERT_OK(batch_->Delete("B"));
+ ASSERT_OK(batch_->Put(&cf1, "B", "b2"));
+ ASSERT_OK(batch_->Delete(&cf1, "C"));
+ batch_->SetSavePoint(); // 3
+ batch_->SetSavePoint(); // 4
+ AssertItersEqual(cf0_iter.get(), &kvi_cf0_2);
+ AssertItersEqual(cf1_iter.get(), &kvi_cf1_2);
+
+ KVMap kvm_cf0_4 = {{"A", "xxx"}, {"C", "cc"}};
+ KVMap kvm_cf1_4 = {{"B", "b2"}};
+ KVIter kvi_cf0_4(&kvm_cf0_4);
+ KVIter kvi_cf1_4(&kvm_cf1_4);
+ ASSERT_OK(batch_->SingleDelete("D"));
+ ASSERT_OK(batch_->Delete(&cf1, "D"));
+ ASSERT_OK(batch_->Delete(&cf1, "E"));
+ AssertItersEqual(cf0_iter.get(), &kvi_cf0_4);
+ AssertItersEqual(cf1_iter.get(), &kvi_cf1_4);
+
+ ASSERT_OK(batch_->RollbackToSavePoint()); // rollback to 4
+ AssertItersEqual(cf0_iter.get(), &kvi_cf0_2);
+ AssertItersEqual(cf1_iter.get(), &kvi_cf1_2);
+
+ ASSERT_OK(batch_->RollbackToSavePoint()); // rollback to 3
+ AssertItersEqual(cf0_iter.get(), &kvi_cf0_2);
+ AssertItersEqual(cf1_iter.get(), &kvi_cf1_2);
+
+ ASSERT_OK(batch_->RollbackToSavePoint()); // rollback to 2
+ AssertItersEqual(cf0_iter.get(), &kvi_cf0_1);
+ AssertItersEqual(cf1_iter.get(), &kvi_cf1_1);
+
+ batch_->SetSavePoint(); // 5
+ ASSERT_OK(batch_->Put("X", "x"));
+
+ KVMap kvm_cf0_5 = {{"B", "bb"}, {"C", "cc"}, {"X", "x"}};
+ KVIter kvi_cf0_5(&kvm_cf0_5);
+ KVIter kvi_cf1_5(&kvm_cf1_1);
+ AssertItersEqual(cf0_iter.get(), &kvi_cf0_5);
+ AssertItersEqual(cf1_iter.get(), &kvi_cf1_5);
+
+ ASSERT_OK(batch_->RollbackToSavePoint()); // rollback to 5
+ AssertItersEqual(cf0_iter.get(), &kvi_cf0_1);
+ AssertItersEqual(cf1_iter.get(), &kvi_cf1_1);
+
+ ASSERT_OK(batch_->RollbackToSavePoint()); // rollback to 1
+ AssertItersEqual(cf0_iter.get(), &kvi_cf0_0);
+ AssertItersEqual(cf1_iter.get(), &kvi_cf1_0);
+
+ s = batch_->RollbackToSavePoint(); // no savepoint found
+ ASSERT_TRUE(s.IsNotFound());
+ AssertItersEqual(cf0_iter.get(), &kvi_cf0_0);
+ AssertItersEqual(cf1_iter.get(), &kvi_cf1_0);
+
+ batch_->SetSavePoint(); // 6
+
+ batch_->Clear();
+ ASSERT_EQ("", PrintContents(batch_.get(), nullptr));
+ ASSERT_EQ("", PrintContents(batch_.get(), &cf1));
+
+ s = batch_->RollbackToSavePoint(); // rollback to 6
+ ASSERT_TRUE(s.IsNotFound());
+}
+
+TEST_P(WriteBatchWithIndexTest, SingleDeleteTest) {
+ Status s;
+ std::string value;
+
+ ASSERT_OK(batch_->SingleDelete("A"));
+
+ s = batch_->GetFromBatch(options_, "A", &value);
+ ASSERT_TRUE(s.IsNotFound());
+ s = batch_->GetFromBatch(options_, "B", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ batch_->Clear();
+ ASSERT_OK(batch_->Put("A", "a"));
+ ASSERT_OK(batch_->Put("A", "a2"));
+ ASSERT_OK(batch_->Put("B", "b"));
+ ASSERT_OK(batch_->SingleDelete("A"));
+
+ s = batch_->GetFromBatch(options_, "A", &value);
+ ASSERT_TRUE(s.IsNotFound());
+ s = batch_->GetFromBatch(options_, "B", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("b", value);
+
+ ASSERT_OK(batch_->Put("C", "c"));
+ ASSERT_OK(batch_->Put("A", "a3"));
+ ASSERT_OK(batch_->Delete("B"));
+ ASSERT_OK(batch_->SingleDelete("B"));
+ ASSERT_OK(batch_->SingleDelete("C"));
+
+ s = batch_->GetFromBatch(options_, "A", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("a3", value);
+ s = batch_->GetFromBatch(options_, "B", &value);
+ ASSERT_TRUE(s.IsNotFound());
+ s = batch_->GetFromBatch(options_, "C", &value);
+ ASSERT_TRUE(s.IsNotFound());
+ s = batch_->GetFromBatch(options_, "D", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ ASSERT_OK(batch_->Put("B", "b4"));
+ ASSERT_OK(batch_->Put("C", "c4"));
+ ASSERT_OK(batch_->Put("D", "d4"));
+ ASSERT_OK(batch_->SingleDelete("D"));
+ ASSERT_OK(batch_->SingleDelete("D"));
+ ASSERT_OK(batch_->Delete("A"));
+
+ s = batch_->GetFromBatch(options_, "A", &value);
+ ASSERT_TRUE(s.IsNotFound());
+ s = batch_->GetFromBatch(options_, "B", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("b4", value);
+ s = batch_->GetFromBatch(options_, "C", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("c4", value);
+ s = batch_->GetFromBatch(options_, "D", &value);
+ ASSERT_TRUE(s.IsNotFound());
+}
+
+TEST_P(WriteBatchWithIndexTest, SingleDeleteDeltaIterTest) {
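+ // PrintContents(batch, &map, nullptr) renders the merged view of the batch
+ // applied on top of the base map, i.e. what an iterator-with-base would see.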
+ std::string value;
+ ASSERT_OK(batch_->Put("A", "a"));
+ ASSERT_OK(batch_->Put("A", "a2"));
+ ASSERT_OK(batch_->Put("B", "b"));
+ ASSERT_OK(batch_->SingleDelete("A"));
+ ASSERT_OK(batch_->Delete("B"));
+
+ KVMap map;
+ value = PrintContents(batch_.get(), &map, nullptr);
+ ASSERT_EQ("", value);
+
+ map["A"] = "aa";
+ map["C"] = "cc";
+ map["D"] = "dd";
+
+ ASSERT_OK(batch_->SingleDelete("B"));
+ ASSERT_OK(batch_->SingleDelete("C"));
+ ASSERT_OK(batch_->SingleDelete("Z"));
+
+ value = PrintContents(batch_.get(), &map, nullptr);
+ ASSERT_EQ("D:dd,", value);
+
+ ASSERT_OK(batch_->Put("A", "a3"));
+ ASSERT_OK(batch_->Put("B", "b3"));
+ ASSERT_OK(batch_->SingleDelete("A"));
+ ASSERT_OK(batch_->SingleDelete("A"));
+ ASSERT_OK(batch_->SingleDelete("D"));
+ ASSERT_OK(batch_->SingleDelete("D"));
+ ASSERT_OK(batch_->Delete("D"));
+
+ map["E"] = "ee";
+
+ value = PrintContents(batch_.get(), &map, nullptr);
+ ASSERT_EQ("B:b3,E:ee,", value);
+}
+
+TEST_P(WriteBatchWithIndexTest, MultiGetTest) {
+ // MultiGet a lot of keys in order to force std::vector reallocations
+ std::vector<std::string> keys;
+ for (int i = 0; i < 100; ++i) {
+ keys.emplace_back(std::to_string(i));
+ }
+
+ ASSERT_OK(OpenDB());
+ ColumnFamilyHandle* cf0 = db_->DefaultColumnFamily();
+
+ // Write some data to the db for the even numbered keys
+ {
+ WriteBatch wb;
+ for (size_t i = 0; i < keys.size(); i += 2) {
+ std::string val = "val" + std::to_string(i);
+ ASSERT_OK(wb.Put(cf0, keys[i], val));
+ }
+ ASSERT_OK(db_->Write(write_opts_, &wb));
+ for (size_t i = 0; i < keys.size(); i += 2) {
+ std::string value;
+ ASSERT_OK(db_->Get(read_opts_, cf0, keys[i], &value));
+ }
+ }
+
+ // Write some data to the batch
+ for (size_t i = 0; i < keys.size(); ++i) {
+ if ((i % 5) == 0) {
+ ASSERT_OK(batch_->Delete(cf0, keys[i]));
+ } else if ((i % 7) == 0) {
+ std::string val = "new" + std::to_string(i);
+ ASSERT_OK(batch_->Put(cf0, keys[i], val));
+ }
+ if (i > 0 && (i % 3) == 0) {
+ ASSERT_OK(batch_->Merge(cf0, keys[i], "merge"));
+ }
+ }
+
+ std::vector<Slice> key_slices;
+ for (size_t i = 0; i < keys.size(); ++i) {
+ key_slices.emplace_back(keys[i]);
+ }
+ std::vector<PinnableSlice> values(keys.size());
+ std::vector<Status> statuses(keys.size());
+
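+ // MultiGetFromBatchAndDB consults the batch first and falls back to the DB,
+ // applying pending merges on top of the DB value; the trailing `false` flags
+ // the input keys as unsorted.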
+ batch_->MultiGetFromBatchAndDB(db_, read_opts_, cf0, key_slices.size(),
+ key_slices.data(), values.data(),
+ statuses.data(), false);
+ for (size_t i = 0; i < keys.size(); ++i) {
+ if (i == 0) {
+ ASSERT_TRUE(statuses[i].IsNotFound());
+ } else if ((i % 3) == 0) {
+ ASSERT_OK(statuses[i]);
+ if ((i % 5) == 0) { // Merge after Delete
+ ASSERT_EQ(values[i], "merge");
+ } else if ((i % 7) == 0) { // Merge after Put
+ std::string val = "new" + std::to_string(i);
+ ASSERT_EQ(values[i], val + ",merge");
+ } else if ((i % 2) == 0) {
+ std::string val = "val" + std::to_string(i);
+ ASSERT_EQ(values[i], val + ",merge");
+ } else {
+ ASSERT_EQ(values[i], "merge");
+ }
+ } else if ((i % 5) == 0) {
+ ASSERT_TRUE(statuses[i].IsNotFound());
+ } else if ((i % 7) == 0) {
+ ASSERT_OK(statuses[i]);
+ ASSERT_EQ(values[i], "new" + std::to_string(i));
+ } else if ((i % 2) == 0) {
+ ASSERT_OK(statuses[i]);
+ ASSERT_EQ(values[i], "val" + std::to_string(i));
+ } else {
+ ASSERT_TRUE(statuses[i].IsNotFound());
+ }
+ }
+}
+
+TEST_P(WriteBatchWithIndexTest, MultiGetTest2) {
+ // MultiGet a lot of keys in order to force std::vector reallocations
+ const int num_keys = 700;
+ const int keys_per_pass = 100;
+ std::vector<std::string> keys;
+ for (size_t i = 0; i < num_keys; ++i) {
+ keys.emplace_back(std::to_string(i));
+ }
+ ASSERT_OK(OpenDB());
+ ColumnFamilyHandle* cf0 = db_->DefaultColumnFamily();
+
+ // Keys 0-99 have a PUT in the batch only
+ // Keys 100-199 have a PUT in the DB only
+ // Keys 200-299 have a PUT in the DB and a DELETE in the batch
+ // Keys 300-399 have a PUT in the DB and a DELETE then MERGE in the batch
+ // Keys 400-499 have a PUT in the DB and a MERGE in the batch
+ // Keys 500-599 have a MERGE in the batch only
+ // Keys 600-699 were never written
+ {
+ WriteBatch wb;
+ for (size_t i = 100; i < 500; i++) {
+ std::string val = std::to_string(i);
+ ASSERT_OK(wb.Put(cf0, keys[i], val));
+ }
+ ASSERT_OK(db_->Write(write_opts_, &wb));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions(), cf0));
+ for (size_t i = 0; i < 100; i++) {
+ ASSERT_OK(batch_->Put(cf0, keys[i], keys[i]));
+ }
+ for (size_t i = 200; i < 400; i++) {
+ ASSERT_OK(batch_->Delete(cf0, keys[i]));
+ }
+ for (size_t i = 300; i < 600; i++) {
+ std::string val = std::to_string(i) + "m";
+ ASSERT_OK(batch_->Merge(cf0, keys[i], val));
+ }
+
+ Random rnd(301);
+ std::vector<PinnableSlice> values(keys_per_pass);
+ std::vector<Status> statuses(keys_per_pass);
+ for (int pass = 0; pass < 40; pass++) {
+ std::vector<Slice> key_slices;
+ for (size_t i = 0; i < keys_per_pass; i++) {
+ int random = rnd.Uniform(num_keys);
+ key_slices.emplace_back(keys[random]);
+ }
+ batch_->MultiGetFromBatchAndDB(db_, read_opts_, cf0, keys_per_pass,
+ key_slices.data(), values.data(),
+ statuses.data(), false);
+ for (size_t i = 0; i < keys_per_pass; i++) {
+ int key = ParseInt(key_slices[i].ToString());
+ switch (key / 100) {
+ case 0: // 0-99 PUT in the batch only
+ ASSERT_OK(statuses[i]);
+ ASSERT_EQ(values[i], key_slices[i].ToString());
+ break;
+ case 1: // 100-199 PUT in the DB only
+ ASSERT_OK(statuses[i]);
+ ASSERT_EQ(values[i], key_slices[i].ToString());
+ break;
+ case 2: // 200-299 Deleted
+ ASSERT_TRUE(statuses[i].IsNotFound());
+ break;
+ case 3: // 300-399 Delete+Merge
+ ASSERT_OK(statuses[i]);
+ ASSERT_EQ(values[i], key_slices[i].ToString() + "m");
+ break;
+ case 4: // 400-499 Put + Merge
+ ASSERT_OK(statuses[i]);
+ ASSERT_EQ(values[i], key_slices[i].ToString() + "," +
+ key_slices[i].ToString() + "m");
+ break;
+ case 5: // Merge only
+ ASSERT_OK(statuses[i]);
+ ASSERT_EQ(values[i], key_slices[i].ToString() + "m");
+ break;
+ case 6: // Never written
+ ASSERT_TRUE(statuses[i].IsNotFound());
+ break;
+ default:
+ assert(false);
+ } // end switch
+ } // End for each key
+ } // end for passes
+}
+
+// This test has merges, but the merges do not affect the final result.
+TEST_P(WriteBatchWithIndexTest, FakeMergeWithIteratorTest) {
+ ASSERT_OK(OpenDB());
+ ColumnFamilyHandle* cf0 = db_->DefaultColumnFamily();
+
+ // The map we are starting with
+ KVMap input = {
+ {"odm", "odm0"},
+ {"omd", "omd0"},
+ {"omp", "omp0"},
+ };
+ KVMap result = {
+ {"odm", "odm2"}, // Orig, Delete, Merge
+ {"mp", "mp1"}, // Merge, Put
+ {"omp", "omp2"}, // Origi, Merge, Put
+ {"mmp", "mmp2"} // Merge, Merge, Put
+ };
+
+ for (auto& iter : result) {
+ EXPECT_EQ(AddToBatch(cf0, iter.first), iter.second);
+ }
+ AddToBatch(cf0, "md"); // Merge, Delete
+ AddToBatch(cf0, "mmd"); // Merge, Merge, Delete
+ AddToBatch(cf0, "omd"); // Orig, Merge, Delete
+
+ KVIter kvi(&result);
+ // First try just the batch
+ std::unique_ptr<Iterator> iter(
+ batch_->NewIteratorWithBase(cf0, new KVIter(&input)));
+ AssertItersEqual(iter.get(), &kvi);
+}
+
+TEST_P(WriteBatchWithIndexTest, IteratorMergeTest) {
+ ASSERT_OK(OpenDB());
+ ColumnFamilyHandle* cf0 = db_->DefaultColumnFamily();
+
+ KVMap result = {
+ {"m", "m0"}, // Merge
+ {"mm", "mm0,mm1"}, // Merge, Merge
+ {"dm", "dm1"}, // Delete, Merge
+ {"dmm", "dmm1,dmm2"}, // Delete, Merge, Merge
+ {"mdm", "mdm2"}, // Merge, Delete, Merge
+ {"mpm", "mpm1,mpm2"}, // Merge, Put, Merge
+ {"pm", "pm0,pm1"}, // Put, Merge
+ {"pmm", "pmm0,pmm1,pmm2"}, // Put, Merge, Merge
+ };
+
+ for (auto& iter : result) {
+ EXPECT_EQ(AddToBatch(cf0, iter.first), iter.second);
+ }
+
+ KVIter kvi(&result);
+ // First try just the batch
+ KVMap empty_map;
+ std::unique_ptr<Iterator> iter(
+ batch_->NewIteratorWithBase(cf0, new KVIter(&empty_map)));
+ AssertItersEqual(iter.get(), &kvi);
+}
+
+TEST_P(WriteBatchWithIndexTest, IteratorMergeTestWithOrig) {
+ ASSERT_OK(OpenDB());
+ ColumnFamilyHandle* cf0 = db_->DefaultColumnFamily();
+ KVMap original;
+ KVMap results = {
+ {"m", "om,m0"}, // Merge
+ {"mm", "omm,mm0,mm1"}, // Merge, Merge
+ {"dm", "dm1"}, // Delete, Merge
+ {"dmm", "dmm1,dmm2"}, // Delete, Merge, Merge
+ {"mdm", "mdm2"}, // Merge, Delete, Merge
+ {"mpm", "mpm1,mpm2"}, // Merge, Put, Merge
+ {"pm", "pm0,pm1"}, // Put, Merge
+ {"pmm", "pmm0,pmm1,pmm2"}, // Put, Merge, Merge
+ };
+
+ for (auto& iter : results) {
+ AddToBatch(cf0, iter.first);
+ original[iter.first] = "o" + iter.first;
+ }
+
+ KVIter kvi(&results);
+ // First try just the batch
+ std::unique_ptr<Iterator> iter(
+ batch_->NewIteratorWithBase(cf0, new KVIter(&original)));
+ AssertItersEqual(iter.get(), &kvi);
+}
+
+TEST_P(WriteBatchWithIndexTest, GetFromBatchAfterMerge) {
+ std::string value;
+ Status s;
+
+ ASSERT_OK(OpenDB());
+ ASSERT_OK(db_->Put(write_opts_, "o", "aa"));
+ batch_->Merge("o", "bb"); // Merging bb under key "o"
+ batch_->Merge("m", "cc"); // Merging bc under key "m"
+ s = batch_->GetFromBatch(options_, "m", &value);
+ ASSERT_EQ(s.code(), Status::Code::kMergeInProgress);
+ s = batch_->GetFromBatch(options_, "o", &value);
+ ASSERT_EQ(s.code(), Status::Code::kMergeInProgress);
+
+ ASSERT_OK(db_->Write(write_opts_, batch_->GetWriteBatch()));
+ ASSERT_OK(db_->Get(read_opts_, "o", &value));
+ ASSERT_EQ(value, "aa,bb");
+ ASSERT_OK(db_->Get(read_opts_, "m", &value));
+ ASSERT_EQ(value, "cc");
+}
+
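+// GetFromBatchAndDB combines the batch's merge operands with the DB's base
+// value, so merges resolve without writing the batch to the DB.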
+TEST_P(WriteBatchWithIndexTest, GetFromBatchAndDBAfterMerge) {
+ std::string value;
+
+ ASSERT_OK(OpenDB());
+ ASSERT_OK(db_->Put(write_opts_, "o", "aa"));
+ ASSERT_OK(batch_->Merge("o", "bb")); // Merging bb under key "o"
+ ASSERT_OK(batch_->Merge("m", "cc")); // Merging bc under key "m"
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "o", &value));
+ ASSERT_EQ(value, "aa,bb");
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "m", &value));
+ ASSERT_EQ(value, "cc");
+}
+
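+// A Put in the batch supersedes the value stored in the DB, and subsequent
+// merges in the batch apply on top of that Put rather than the DB's "orig".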
+TEST_F(WBWIKeepTest, GetAfterPut) {
+ std::string value;
+ ASSERT_OK(OpenDB());
+ ColumnFamilyHandle* cf0 = db_->DefaultColumnFamily();
+
+ ASSERT_OK(db_->Put(write_opts_, "key", "orig"));
+
+ ASSERT_OK(batch_->Put("key", "aa")); // Writing aa under key
+ ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
+ ASSERT_EQ(value, "aa");
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+ ASSERT_EQ(value, "aa");
+
+ ASSERT_OK(batch_->Merge("key", "bb")); // Merging bb under key
+ ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
+ ASSERT_EQ(value, "aa,bb");
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+ ASSERT_EQ(value, "aa,bb");
+
+ ASSERT_OK(batch_->Merge("key", "cc")); // Merging cc under key
+ ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
+ ASSERT_EQ(value, "aa,bb,cc");
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+ ASSERT_EQ(value, "aa,bb,cc");
+}
+
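+// Merges with no base entry in the batch resolve only via GetFromBatchAndDB
+// (against the DB's "orig"); a later Put establishes a base inside the batch,
+// after which GetFromBatch can resolve further merges on its own.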
+TEST_P(WriteBatchWithIndexTest, GetAfterMergePut) {
+ std::string value;
+ ASSERT_OK(OpenDB());
+ ColumnFamilyHandle* cf0 = db_->DefaultColumnFamily();
+ ASSERT_OK(db_->Put(write_opts_, "key", "orig"));
+
+ ASSERT_OK(batch_->Merge("key", "aa")); // Merging aa under key
+ Status s = batch_->GetFromBatch(cf0, options_, "key", &value);
+ ASSERT_EQ(s.code(), Status::Code::kMergeInProgress);
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+ ASSERT_EQ(value, "orig,aa");
+
+ ASSERT_OK(batch_->Merge("key", "bb")); // Merging bb under key
+ s = batch_->GetFromBatch(cf0, options_, "key", &value);
+ ASSERT_EQ(s.code(), Status::Code::kMergeInProgress);
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+ ASSERT_EQ(value, "orig,aa,bb");
+
+ ASSERT_OK(batch_->Put("key", "cc")); // Writing cc under key
+ ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
+ ASSERT_EQ(value, "cc");
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+ ASSERT_EQ(value, "cc");
+
+ ASSERT_OK(batch_->Merge("key", "dd")); // Merging dd under key
+ ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
+ ASSERT_EQ(value, "cc,dd");
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+ ASSERT_EQ(value, "cc,dd");
+}
+
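+// A Delete in the batch discards earlier merge operands and hides any DB
+// value, so merges issued after the Delete start from an empty base.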
+TEST_P(WriteBatchWithIndexTest, GetAfterMergeDelete) {
+ std::string value;
+ ASSERT_OK(OpenDB());
+ ColumnFamilyHandle* cf0 = db_->DefaultColumnFamily();
+
+ ASSERT_OK(batch_->Merge("key", "aa")); // Merging aa under key
+ Status s = batch_->GetFromBatch(cf0, options_, "key", &value);
+ ASSERT_EQ(s.code(), Status::Code::kMergeInProgress);
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+ ASSERT_EQ(value, "aa");
+
+ ASSERT_OK(batch_->Merge("key", "bb")); // Merging bb under key
+ s = batch_->GetFromBatch(cf0, options_, "key", &value);
+ ASSERT_EQ(s.code(), Status::Code::kMergeInProgress);
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+ ASSERT_EQ(value, "aa,bb");
+
+ ASSERT_OK(batch_->Delete("key")); // Delete key from batch
+ s = batch_->GetFromBatch(cf0, options_, "key", &value);
+ ASSERT_TRUE(s.IsNotFound());
+ s = batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ ASSERT_OK(batch_->Merge("key", "cc")); // Merging cc under key
+ ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
+ ASSERT_EQ(value, "cc");
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+ ASSERT_EQ(value, "cc");
+ ASSERT_OK(batch_->Merge("key", "dd")); // Merging dd under key
+ ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
+ ASSERT_EQ(value, "cc,dd");
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+ ASSERT_EQ(value, "cc,dd");
+}
+
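+// With a merge operator whose FullMergeV2 always fails, reads of a key with a
+// pending merge return a non-OK status, while keys without merges stay readable.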
+TEST_F(WBWIOverwriteTest, TestBadMergeOperator) {
+ class FailingMergeOperator : public MergeOperator {
+ public:
+ FailingMergeOperator() {}
+
+ bool FullMergeV2(const MergeOperationInput& /*merge_in*/,
+ MergeOperationOutput* /*merge_out*/) const override {
+ return false;
+ }
+
+ const char* Name() const override { return "Failing"; }
+ };
+ options_.merge_operator.reset(new FailingMergeOperator());
+ ASSERT_OK(OpenDB());
+
+ ColumnFamilyHandle* column_family = db_->DefaultColumnFamily();
+ std::string value;
+
+ ASSERT_OK(db_->Put(write_opts_, "a", "a0"));
+ ASSERT_OK(batch_->Put("b", "b0"));
+
+ ASSERT_OK(batch_->Merge("a", "a1"));
+ ASSERT_NOK(batch_->GetFromBatchAndDB(db_, read_opts_, "a", &value));
+ ASSERT_NOK(batch_->GetFromBatch(column_family, options_, "a", &value));
+ ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "b", &value));
+ ASSERT_OK(batch_->GetFromBatch(column_family, options_, "b", &value));
+}
+
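+// Exercises a column family whose comparator carries a 64-bit user timestamp:
+// the timestamped Put/Delete/SingleDelete overloads are rejected on the WBWI,
+// null arguments are rejected, and timestamps are attached afterwards via
+// WriteBatch::UpdateTimestamps on the underlying batch.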
+TEST_P(WriteBatchWithIndexTest, ColumnFamilyWithTimestamp) {
+ ColumnFamilyHandleImplDummy cf2(2,
+ test::BytewiseComparatorWithU64TsWrapper());
+
+ // Sanity checks
+ ASSERT_TRUE(batch_->Put(&cf2, "key", "ts", "value").IsNotSupported());
+ ASSERT_TRUE(batch_->Put(/*column_family=*/nullptr, "key", "ts", "value")
+ .IsInvalidArgument());
+ ASSERT_TRUE(batch_->Delete(&cf2, "key", "ts").IsNotSupported());
+ ASSERT_TRUE(batch_->Delete(/*column_family=*/nullptr, "key", "ts")
+ .IsInvalidArgument());
+ ASSERT_TRUE(batch_->SingleDelete(&cf2, "key", "ts").IsNotSupported());
+ ASSERT_TRUE(batch_->SingleDelete(/*column_family=*/nullptr, "key", "ts")
+ .IsInvalidArgument());
+ {
+ std::string value;
+ ASSERT_TRUE(batch_
+ ->GetFromBatchAndDB(
+ /*db=*/nullptr, ReadOptions(), &cf2, "key", &value)
+ .IsInvalidArgument());
+ }
+ {
+ constexpr size_t num_keys = 2;
+ std::array<Slice, num_keys> keys{{Slice(), Slice()}};
+ std::array<PinnableSlice, num_keys> pinnable_vals{
+ {PinnableSlice(), PinnableSlice()}};
+ std::array<Status, num_keys> statuses{{Status(), Status()}};
+ constexpr bool sorted_input = false;
+ batch_->MultiGetFromBatchAndDB(/*db=*/nullptr, ReadOptions(), &cf2,
+ num_keys, keys.data(), pinnable_vals.data(),
+ statuses.data(), sorted_input);
+ for (const auto& s : statuses) {
+ ASSERT_TRUE(s.IsInvalidArgument());
+ }
+ }
+
+ constexpr uint32_t kMaxKey = 10;
+
+ const auto ts_sz_lookup = [&cf2](uint32_t id) {
+ if (cf2.GetID() == id) {
+ return sizeof(uint64_t);
+ } else {
+ return std::numeric_limits<size_t>::max();
+ }
+ };
+
+ // Put keys
+ for (uint32_t i = 0; i < kMaxKey; ++i) {
+ std::string key;
+ PutFixed32(&key, i);
+ Status s = batch_->Put(&cf2, key, "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+
+ WriteBatch* wb = batch_->GetWriteBatch();
+ assert(wb);
+ ASSERT_OK(
+ wb->UpdateTimestamps(std::string(sizeof(uint64_t), '\0'), ts_sz_lookup));
+
+ // Point lookup
+ for (uint32_t i = 0; i < kMaxKey; ++i) {
+ std::string value;
+ std::string key;
+ PutFixed32(&key, i);
+ Status s = batch_->GetFromBatch(&cf2, Options(), key, &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("value" + std::to_string(i), value);
+ }
+
+ // Iterator
+ {
+ std::unique_ptr<WBWIIterator> it(batch_->NewIterator(&cf2));
+ uint32_t start = 0;
+ for (it->SeekToFirst(); it->Valid(); it->Next(), ++start) {
+ std::string key;
+ PutFixed32(&key, start);
+ ASSERT_OK(it->status());
+ ASSERT_EQ(key, it->Entry().key);
+ ASSERT_EQ("value" + std::to_string(start), it->Entry().value);
+ ASSERT_EQ(WriteType::kPutRecord, it->Entry().type);
+ }
+ ASSERT_EQ(kMaxKey, start);
+ }
+
+ // Delete the keys with Delete() or SingleDelete()
+ for (uint32_t i = 0; i < kMaxKey; ++i) {
+ std::string key;
+ PutFixed32(&key, i);
+ Status s;
+ if (0 == (i % 2)) {
+ s = batch_->Delete(&cf2, key);
+ } else {
+ s = batch_->SingleDelete(&cf2, key);
+ }
+ ASSERT_OK(s);
+ }
+
+ ASSERT_OK(wb->UpdateTimestamps(std::string(sizeof(uint64_t), '\xfe'),
+ ts_sz_lookup));
+
+ for (uint32_t i = 0; i < kMaxKey; ++i) {
+ std::string value;
+ std::string key;
+ PutFixed32(&key, i);
+ Status s = batch_->GetFromBatch(&cf2, Options(), key, &value);
+ ASSERT_TRUE(s.IsNotFound());
+ }
+
+ // Iterator
+ {
+ const bool overwrite = GetParam();
+ std::unique_ptr<WBWIIterator> it(batch_->NewIterator(&cf2));
+ uint32_t start = 0;
+ for (it->SeekToFirst(); it->Valid(); it->Next(), ++start) {
+ std::string key;
+ PutFixed32(&key, start);
+ ASSERT_EQ(key, it->Entry().key);
+ if (!overwrite) {
+ ASSERT_EQ(WriteType::kPutRecord, it->Entry().type);
+ it->Next();
+ ASSERT_TRUE(it->Valid());
+ }
+ if (0 == (start % 2)) {
+ ASSERT_EQ(WriteType::kDeleteRecord, it->Entry().type);
+ } else {
+ ASSERT_EQ(WriteType::kSingleDeleteRecord, it->Entry().type);
+ }
+ }
+ }
+}
+
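+// The WBWI index keys carry no timestamp, so calling UpdateTimestamps on the
+// underlying WriteBatch afterwards does not break GetFromBatch lookups: the
+// most recent Put for "a" is still found.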
+TEST_P(WriteBatchWithIndexTest, IndexNoTs) {
+ const Comparator* const ucmp = test::BytewiseComparatorWithU64TsWrapper();
+ ColumnFamilyHandleImplDummy cf(1, ucmp);
+ WriteBatchWithIndex wbwi;
+ ASSERT_OK(wbwi.Put(&cf, "a", "a0"));
+ ASSERT_OK(wbwi.Put(&cf, "a", "a1"));
+ {
+ std::string ts;
+ PutFixed64(&ts, 10000);
+ ASSERT_OK(wbwi.GetWriteBatch()->UpdateTimestamps(
+ ts, [](uint32_t cf_id) { return cf_id == 1 ? 8 : 0; }));
+ }
+ {
+ std::string value;
+ Status s = wbwi.GetFromBatch(&cf, options_, "a", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("a1", value);
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(WBWI, WriteBatchWithIndexTest, testing::Bool());
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main() {
+ fprintf(stderr, "SKIPPED\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE