Diffstat (limited to 'src/rocksdb/utilities/transactions')
89 files changed, 39432 insertions, 0 deletions
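A note before the diff: in lock_manager.cc below, when a custom lock manager is supplied via TransactionDBOptions::lock_mgr_handle, NewLockManager hands it back through the aliasing constructor of std::shared_ptr — the returned pointer addresses the LockManager exposed by the handle while sharing ownership of the handle itself, so the handle stays alive for as long as the LockManager is referenced. A minimal self-contained sketch of that idiom (Handle and Inner are illustrative names, not RocksDB types):

#include <cassert>
#include <memory>

struct Inner {
  int value = 42;
};

struct Handle {
  Inner inner;
  Inner* getInner() { return &inner; }
};

int main() {
  std::shared_ptr<Inner> alias;
  {
    auto handle = std::make_shared<Handle>();
    // Aliasing constructor: alias points at handle->inner but shares the
    // control block of handle, so *handle cannot be destroyed while alias
    // is alive.
    alias = std::shared_ptr<Inner>(handle, handle->getInner());
  }  // handle goes out of scope here, yet the Handle object survives.
  assert(alias->value == 42);
  return 0;
}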
diff --git a/src/rocksdb/utilities/transactions/lock/lock_manager.cc b/src/rocksdb/utilities/transactions/lock/lock_manager.cc new file mode 100644 index 000000000..df16b32ad --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/lock_manager.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "utilities/transactions/lock/lock_manager.h" + +#include "utilities/transactions/lock/point/point_lock_manager.h" + +namespace ROCKSDB_NAMESPACE { + +std::shared_ptr<LockManager> NewLockManager(PessimisticTransactionDB* db, + const TransactionDBOptions& opt) { + assert(db); + if (opt.lock_mgr_handle) { + // A custom lock manager was provided in options + auto mgr = opt.lock_mgr_handle->getLockManager(); + return std::shared_ptr<LockManager>(opt.lock_mgr_handle, mgr); + } else { + // Use a point lock manager by default + return std::shared_ptr<LockManager>(new PointLockManager(db, opt)); + } +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/lock_manager.h b/src/rocksdb/utilities/transactions/lock/lock_manager.h new file mode 100644 index 000000000..a5ce1948c --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/lock_manager.h @@ -0,0 +1,82 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include "rocksdb/types.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/transaction_db.h" +#include "utilities/transactions/lock/lock_tracker.h" +#include "utilities/transactions/pessimistic_transaction.h" + +namespace ROCKSDB_NAMESPACE { + +class PessimisticTransactionDB; + +class LockManager { + public: + virtual ~LockManager() {} + + // Whether this lock manager supports locking a specific key. + virtual bool IsPointLockSupported() const = 0; + + // Whether this lock manager supports locking a range of keys. + virtual bool IsRangeLockSupported() const = 0; + + // Locks acquired through this LockManager should be tracked by + // the LockTrackers created through the returned factory. + virtual const LockTrackerFactory& GetLockTrackerFactory() const = 0; + + // Enable locking for the specified column family. + // Caller should guarantee that this column family is not already enabled. + virtual void AddColumnFamily(const ColumnFamilyHandle* cf) = 0; + + // Disable locking for the specified column family. + // Caller should guarantee that this column family is no longer used. + virtual void RemoveColumnFamily(const ColumnFamilyHandle* cf) = 0; + + // Attempt to lock a key or a key range. If OK status is returned, the caller + // is responsible for calling UnLock() on this key. + virtual Status TryLock(PessimisticTransaction* txn, + ColumnFamilyId column_family_id, + const std::string& key, Env* env, bool exclusive) = 0; + // The range [start, end] is inclusive at both ends. + virtual Status TryLock(PessimisticTransaction* txn, + ColumnFamilyId column_family_id, const Endpoint& start, + const Endpoint& end, Env* env, bool exclusive) = 0; + + // Unlock a key or a range locked by TryLock(). txn must be the same + // transaction that locked this key.
+ virtual void UnLock(PessimisticTransaction* txn, const LockTracker& tracker, + Env* env) = 0; + virtual void UnLock(PessimisticTransaction* txn, + ColumnFamilyId column_family_id, const std::string& key, + Env* env) = 0; + virtual void UnLock(PessimisticTransaction* txn, + ColumnFamilyId column_family_id, const Endpoint& start, + const Endpoint& end, Env* env) = 0; + + using PointLockStatus = std::unordered_multimap<ColumnFamilyId, KeyLockInfo>; + virtual PointLockStatus GetPointLockStatus() = 0; + + using RangeLockStatus = + std::unordered_multimap<ColumnFamilyId, RangeLockInfo>; + virtual RangeLockStatus GetRangeLockStatus() = 0; + + virtual std::vector<DeadlockPath> GetDeadlockInfoBuffer() = 0; + + virtual void Resize(uint32_t new_size) = 0; +}; + +// LockManager should always be constructed through this factory method, +// instead of constructing through concrete implementations' constructor. +// Ownership of the returned object is shared via the returned shared_ptr. +std::shared_ptr<LockManager> NewLockManager(PessimisticTransactionDB* db, + const TransactionDBOptions& opt); + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/lock_tracker.h b/src/rocksdb/utilities/transactions/lock/lock_tracker.h new file mode 100644 index 000000000..5fa228a82 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/lock_tracker.h @@ -0,0 +1,209 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include <memory> + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" +#include "rocksdb/utilities/transaction_db.h" + +namespace ROCKSDB_NAMESPACE { + +// Request for locking a single key. +struct PointLockRequest { + // The id of the key's column family. + ColumnFamilyId column_family_id = 0; + // The key to lock. + std::string key; + // The sequence number from which there is no concurrent update to key. + SequenceNumber seq = 0; + // Whether the lock is acquired only for read. + bool read_only = false; + // Whether the lock is in exclusive mode. + bool exclusive = true; +}; + +// Request for locking a range of keys. +struct RangeLockRequest { + // The id of the key's column family. + ColumnFamilyId column_family_id; + + // The range to be locked + Endpoint start_endp; + Endpoint end_endp; +}; + +struct PointLockStatus { + // Whether the key is locked. + bool locked = false; + // Whether the key is locked in exclusive mode. + bool exclusive = true; + // The sequence number in the tracked PointLockRequest. + SequenceNumber seq = 0; +}; + +// Return status when calling LockTracker::Untrack. +enum class UntrackStatus { + // The lock is not tracked at all, so no lock to untrack. + NOT_TRACKED, + // The lock is untracked but not removed from the tracker. + UNTRACKED, + // The lock is removed from the tracker. + REMOVED, +}; + +// Tracks the lock requests. +// In PessimisticTransaction, it tracks the locks acquired through LockMgr; +// In OptimisticTransaction, since there is no LockMgr, it tracks the lock +// intention. Not thread-safe. +class LockTracker { + public: + virtual ~LockTracker() {} + + // Whether this tracker supports locking a specific key.
+ virtual bool IsPointLockSupported() const = 0; + + // Whether this tracker supports locking a range of keys. + virtual bool IsRangeLockSupported() const = 0; + + // Tracks the acquisition of a lock on a key. + // + // If this method is not supported, leave it as a no-op. + virtual void Track(const PointLockRequest& /*lock_request*/) = 0; + + // Untracks the lock on a key. + // seq and exclusive in lock_request are not used. + // + // If this method is not supported, leave it as a no-op and + // return NOT_TRACKED. + virtual UntrackStatus Untrack(const PointLockRequest& /*lock_request*/) = 0; + + // Counterpart of Track(const PointLockRequest&) for RangeLockRequest. + virtual void Track(const RangeLockRequest& /*lock_request*/) = 0; + + // Counterpart of Untrack(const PointLockRequest&) for RangeLockRequest. + virtual UntrackStatus Untrack(const RangeLockRequest& /*lock_request*/) = 0; + + // Merges lock requests tracked in the specified tracker into the current + // tracker. + // + // E.g. for point lock, if a key in tracker is not yet tracked, + // track this new key; otherwise, merge the tracked information of the key + // such as lock's exclusiveness, read/write statistics. + // + // If this method is not supported, leave it as a no-op. + // + // REQUIRED: the specified tracker must be of the same concrete class type as + // the current tracker. + virtual void Merge(const LockTracker& /*tracker*/) = 0; + + // This is the reverse operation of Merge. + // + // E.g. for point lock, if a key exists in both the current and the specified + // tracker, then subtract the information (such as read/write statistics) of + // the key in the specified tracker from the current tracker. + // + // If this method is not supported, leave it as a no-op. + // + // REQUIRED: + // The specified tracker must be of the same concrete class type as + // the current tracker. + // The tracked locks in the specified tracker must be a subset of those + // tracked by the current tracker. + virtual void Subtract(const LockTracker& /*tracker*/) = 0; + + // Clears all tracked locks. + virtual void Clear() = 0; + + // Gets the new locks (excluding the locks that have been tracked before the + // save point) tracked since the specified save point; the result is stored + // in an internally constructed LockTracker and returned. + // + // save_point_tracker is the tracker used by a SavePoint to track locks + // tracked after creating the SavePoint. + // + // The implementation should document whether point lock, or range lock, or + // both are considered in this method. + // If this method is not supported, returns nullptr. + // + // REQUIRED: + // The save_point_tracker must be of the same concrete class type as the + // current tracker. + // The tracked locks in the specified tracker must be a subset of those + // tracked by the current tracker. + virtual LockTracker* GetTrackedLocksSinceSavePoint( + const LockTracker& /*save_point_tracker*/) const = 0; + + // Gets lock-related information on the key. + // + // If point lock is not supported, always returns LockStatus with + // locked=false. + virtual PointLockStatus GetPointLockStatus( + ColumnFamilyId /*column_family_id*/, + const std::string& /*key*/) const = 0; + + // Gets number of tracked point locks. + // + // If point lock is not supported, always returns 0. + virtual uint64_t GetNumPointLocks() const = 0; + + class ColumnFamilyIterator { + public: + virtual ~ColumnFamilyIterator() {} + + // Whether there are remaining column families.
+ virtual bool HasNext() const = 0; + + // Gets next column family id. + // + // If HasNext is false, calling this method has undefined behavior. + virtual ColumnFamilyId Next() = 0; + }; + + // Gets an iterator for column families. + // + // Returned iterator must not be nullptr. + // If there is no column family to iterate, + // returns an empty non-null iterator. + // Caller owns the returned pointer. + virtual ColumnFamilyIterator* GetColumnFamilyIterator() const = 0; + + class KeyIterator { + public: + virtual ~KeyIterator() {} + + // Whether there are remaining keys. + virtual bool HasNext() const = 0; + + // Gets the next key. + // + // If HasNext is false, calling this method has undefined behavior. + virtual const std::string& Next() = 0; + }; + + // Gets an iterator for keys with tracked point locks in the column family. + // + // The column family must exist. + // Returned iterator must not be nullptr. + // Caller owns the returned pointer. + virtual KeyIterator* GetKeyIterator( + ColumnFamilyId /*column_family_id*/) const = 0; +}; + +// LockTracker should always be constructed through this factory. +// Each LockManager owns a LockTrackerFactory. +class LockTrackerFactory { + public: + // Caller owns the returned pointer. + virtual LockTracker* Create() const = 0; + virtual ~LockTrackerFactory() {} +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc new file mode 100644 index 000000000..b362a164d --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc @@ -0,0 +1,721 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
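For context on the file that follows: point_lock_manager.cc shards each column family's lock table into num_stripes_ stripes, and LockMap::GetStripe picks a stripe by reducing a 64-bit key hash with FastRange64, the multiply-shift alternative to `hash % range`. A rough standalone equivalent, with std::hash standing in for RocksDB's GetSliceNPHash64 and assuming a compiler with __int128 support:

#include <cstdint>
#include <functional>
#include <string>

// Multiply-shift reduction: maps a 64-bit hash uniformly onto [0, range)
// without the division implied by hash % range.
inline uint64_t FastRangeSketch(uint64_t hash, uint64_t range) {
  return static_cast<uint64_t>(
      (static_cast<unsigned __int128>(hash) * range) >> 64);
}

// Rough equivalent of LockMap::GetStripe (std::hash is only a stand-in for
// RocksDB's GetSliceNPHash64).
size_t GetStripeSketch(const std::string& key, size_t num_stripes) {
  uint64_t h = std::hash<std::string>{}(key);
  return static_cast<size_t>(FastRangeSketch(h, num_stripes));
}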
+ +#ifndef ROCKSDB_LITE + +#include "utilities/transactions/lock/point/point_lock_manager.h" + +#include <algorithm> +#include <cinttypes> +#include <mutex> + +#include "monitoring/perf_context_imp.h" +#include "rocksdb/slice.h" +#include "rocksdb/utilities/transaction_db_mutex.h" +#include "test_util/sync_point.h" +#include "util/cast_util.h" +#include "util/hash.h" +#include "util/thread_local.h" +#include "utilities/transactions/pessimistic_transaction_db.h" +#include "utilities/transactions/transaction_db_mutex_impl.h" + +namespace ROCKSDB_NAMESPACE { + +struct LockInfo { + bool exclusive; + autovector<TransactionID> txn_ids; + + // Transaction locks are not valid after this time, in microseconds + uint64_t expiration_time; + + LockInfo(TransactionID id, uint64_t time, bool ex) + : exclusive(ex), expiration_time(time) { + txn_ids.push_back(id); + } + LockInfo(const LockInfo& lock_info) + : exclusive(lock_info.exclusive), + txn_ids(lock_info.txn_ids), + expiration_time(lock_info.expiration_time) {} + void operator=(const LockInfo& lock_info) { + exclusive = lock_info.exclusive; + txn_ids = lock_info.txn_ids; + expiration_time = lock_info.expiration_time; + } + DECLARE_DEFAULT_MOVES(LockInfo); +}; + +struct LockMapStripe { + explicit LockMapStripe(std::shared_ptr<TransactionDBMutexFactory> factory) { + stripe_mutex = factory->AllocateMutex(); + stripe_cv = factory->AllocateCondVar(); + assert(stripe_mutex); + assert(stripe_cv); + } + + // Mutex must be held before modifying keys map + std::shared_ptr<TransactionDBMutex> stripe_mutex; + + // Condition Variable per stripe for waiting on a lock + std::shared_ptr<TransactionDBCondVar> stripe_cv; + + // Locked keys mapped to the info about the transactions that locked them. + // TODO(agiardullo): Explore performance of other data structures. + UnorderedMap<std::string, LockInfo> keys; +}; + +// Map of #num_stripes LockMapStripes +struct LockMap { + explicit LockMap(size_t num_stripes, + std::shared_ptr<TransactionDBMutexFactory> factory) + : num_stripes_(num_stripes) { + lock_map_stripes_.reserve(num_stripes); + for (size_t i = 0; i < num_stripes; i++) { + LockMapStripe* stripe = new LockMapStripe(factory); + lock_map_stripes_.push_back(stripe); + } + } + + ~LockMap() { + for (auto stripe : lock_map_stripes_) { + delete stripe; + } + } + + // Number of separate LockMapStripes to create, each with its own mutex + const size_t num_stripes_; + + // Count of keys that are currently locked in this column family. + // (Only maintained if PointLockManager::max_num_locks_ is positive.) + std::atomic<int64_t> lock_cnt{0}; + + std::vector<LockMapStripe*> lock_map_stripes_; + + size_t GetStripe(const std::string& key) const; +}; + +namespace { +void UnrefLockMapsCache(void* ptr) { + // Called when a thread exits or a ThreadLocalPtr gets destroyed. + auto lock_maps_cache = + static_cast<UnorderedMap<uint32_t, std::shared_ptr<LockMap>>*>(ptr); + delete lock_maps_cache; +} +} // anonymous namespace + +PointLockManager::PointLockManager(PessimisticTransactionDB* txn_db, + const TransactionDBOptions& opt) + : txn_db_impl_(txn_db), + default_num_stripes_(opt.num_stripes), + max_num_locks_(opt.max_num_locks), + lock_maps_cache_(new ThreadLocalPtr(&UnrefLockMapsCache)), + dlock_buffer_(opt.max_num_deadlocks), + mutex_factory_(opt.custom_mutex_factory + ?
opt.custom_mutex_factory + : std::make_shared<TransactionDBMutexFactoryImpl>()) {} + +size_t LockMap::GetStripe(const std::string& key) const { + assert(num_stripes_ > 0); + return FastRange64(GetSliceNPHash64(key), num_stripes_); +} + +void PointLockManager::AddColumnFamily(const ColumnFamilyHandle* cf) { + InstrumentedMutexLock l(&lock_map_mutex_); + + if (lock_maps_.find(cf->GetID()) == lock_maps_.end()) { + lock_maps_.emplace(cf->GetID(), std::make_shared<LockMap>( + default_num_stripes_, mutex_factory_)); + } else { + // column_family already exists in lock map + assert(false); + } +} + +void PointLockManager::RemoveColumnFamily(const ColumnFamilyHandle* cf) { + // Remove lock_map for this column family. Since the lock map is stored + // as a shared ptr, concurrent transactions can still keep using it + // until they release their references to it. + { + InstrumentedMutexLock l(&lock_map_mutex_); + + auto lock_maps_iter = lock_maps_.find(cf->GetID()); + if (lock_maps_iter == lock_maps_.end()) { + return; + } + + lock_maps_.erase(lock_maps_iter); + } // lock_map_mutex_ + + // Clear all thread-local caches + autovector<void*> local_caches; + lock_maps_cache_->Scrape(&local_caches, nullptr); + for (auto cache : local_caches) { + delete static_cast<LockMaps*>(cache); + } +} + +// Look up the LockMap std::shared_ptr for a given column_family_id. +// Note: The LockMap is only valid as long as the caller is still holding on +// to the returned std::shared_ptr. +std::shared_ptr<LockMap> PointLockManager::GetLockMap( + ColumnFamilyId column_family_id) { + // First check thread-local cache + if (lock_maps_cache_->Get() == nullptr) { + lock_maps_cache_->Reset(new LockMaps()); + } + + auto lock_maps_cache = static_cast<LockMaps*>(lock_maps_cache_->Get()); + + auto lock_map_iter = lock_maps_cache->find(column_family_id); + if (lock_map_iter != lock_maps_cache->end()) { + // Found lock map for this column family. + return lock_map_iter->second; + } + + // Not found in local cache, grab mutex and check shared LockMaps + InstrumentedMutexLock l(&lock_map_mutex_); + + lock_map_iter = lock_maps_.find(column_family_id); + if (lock_map_iter == lock_maps_.end()) { + return std::shared_ptr<LockMap>(nullptr); + } else { + // Found lock map. Store in thread-local cache and return. + std::shared_ptr<LockMap>& lock_map = lock_map_iter->second; + lock_maps_cache->insert({column_family_id, lock_map}); + + return lock_map; + } +} + +// Returns true if this lock has expired and can be acquired by another +// transaction. +// If false, sets *expire_time to the expiration time of the lock according +// to Env->GetMicros() or 0 if no expiration. 
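+// If the lock has expired but is still held by other transactions, expiry +// takes effect only after TryStealingExpiredTransactionLocks() succeeds for +// every other holder; if any steal fails, the lock is reported as not expired.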
+bool PointLockManager::IsLockExpired(TransactionID txn_id, + const LockInfo& lock_info, Env* env, + uint64_t* expire_time) { + if (lock_info.expiration_time == 0) { + *expire_time = 0; + return false; + } + + auto now = env->NowMicros(); + bool expired = lock_info.expiration_time <= now; + if (!expired) { + // return how many microseconds until lock will be expired + *expire_time = lock_info.expiration_time; + } else { + for (auto id : lock_info.txn_ids) { + if (txn_id == id) { + continue; + } + + bool success = txn_db_impl_->TryStealingExpiredTransactionLocks(id); + if (!success) { + expired = false; + *expire_time = 0; + break; + } + } + } + + return expired; +} + +Status PointLockManager::TryLock(PessimisticTransaction* txn, + ColumnFamilyId column_family_id, + const std::string& key, Env* env, + bool exclusive) { + // Lookup lock map for this column family id + std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(column_family_id); + LockMap* lock_map = lock_map_ptr.get(); + if (lock_map == nullptr) { + char msg[255]; + snprintf(msg, sizeof(msg), "Column family id not found: %" PRIu32, + column_family_id); + + return Status::InvalidArgument(msg); + } + + // Need to lock the mutex for the stripe that this key hashes to + size_t stripe_num = lock_map->GetStripe(key); + assert(lock_map->lock_map_stripes_.size() > stripe_num); + LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num); + + LockInfo lock_info(txn->GetID(), txn->GetExpirationTime(), exclusive); + int64_t timeout = txn->GetLockTimeout(); + + return AcquireWithTimeout(txn, lock_map, stripe, column_family_id, key, env, + timeout, lock_info); +} + +// Helper function for TryLock(). +Status PointLockManager::AcquireWithTimeout( + PessimisticTransaction* txn, LockMap* lock_map, LockMapStripe* stripe, + ColumnFamilyId column_family_id, const std::string& key, Env* env, + int64_t timeout, const LockInfo& lock_info) { + Status result; + uint64_t end_time = 0; + + if (timeout > 0) { + uint64_t start_time = env->NowMicros(); + end_time = start_time + timeout; + } + + if (timeout < 0) { + // If timeout is negative, we wait indefinitely to acquire the lock + result = stripe->stripe_mutex->Lock(); + } else { + result = stripe->stripe_mutex->TryLockFor(timeout); + } + + if (!result.ok()) { + // failed to acquire mutex + return result; + } + + // Acquire lock if we are able to + uint64_t expire_time_hint = 0; + autovector<TransactionID> wait_ids; + result = AcquireLocked(lock_map, stripe, key, env, lock_info, + &expire_time_hint, &wait_ids); + + if (!result.ok() && timeout != 0) { + PERF_TIMER_GUARD(key_lock_wait_time); + PERF_COUNTER_ADD(key_lock_wait_count, 1); + // If we weren't able to acquire the lock, we will keep retrying as long + // as the timeout allows. + bool timed_out = false; + do { + // Decide how long to wait + int64_t cv_end_time = -1; + if (expire_time_hint > 0 && end_time > 0) { + cv_end_time = std::min(expire_time_hint, end_time); + } else if (expire_time_hint > 0) { + cv_end_time = expire_time_hint; + } else if (end_time > 0) { + cv_end_time = end_time; + } + + assert(result.IsBusy() || wait_ids.size() != 0); + + // We are dependent on a transaction to finish, so perform deadlock + // detection. 
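+ // wait_ids names the transactions currently holding this lock. + // IncrementWaiters() records this txn in the wait-for graph and walks the + // graph breadth-first (bounded by deadlock_detect_depth) looking for a + // path that leads back to this txn, i.e. a cycle.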
+ if (wait_ids.size() != 0) { + if (txn->IsDeadlockDetect()) { + if (IncrementWaiters(txn, wait_ids, key, column_family_id, + lock_info.exclusive, env)) { + result = Status::Busy(Status::SubCode::kDeadlock); + stripe->stripe_mutex->UnLock(); + return result; + } + } + txn->SetWaitingTxn(wait_ids, column_family_id, &key); + } + + TEST_SYNC_POINT("PointLockManager::AcquireWithTimeout:WaitingTxn"); + if (cv_end_time < 0) { + // Wait indefinitely + result = stripe->stripe_cv->Wait(stripe->stripe_mutex); + } else { + uint64_t now = env->NowMicros(); + if (static_cast<uint64_t>(cv_end_time) > now) { + result = stripe->stripe_cv->WaitFor(stripe->stripe_mutex, + cv_end_time - now); + } + } + + if (wait_ids.size() != 0) { + txn->ClearWaitingTxn(); + if (txn->IsDeadlockDetect()) { + DecrementWaiters(txn, wait_ids); + } + } + + if (result.IsTimedOut()) { + timed_out = true; + // Even though we timed out, we will still make one more attempt to + // acquire lock below (it is possible the lock expired and we + // were never signaled). + } + + if (result.ok() || result.IsTimedOut()) { + result = AcquireLocked(lock_map, stripe, key, env, lock_info, + &expire_time_hint, &wait_ids); + } + } while (!result.ok() && !timed_out); + } + + stripe->stripe_mutex->UnLock(); + + return result; +} + +void PointLockManager::DecrementWaiters( + const PessimisticTransaction* txn, + const autovector<TransactionID>& wait_ids) { + std::lock_guard<std::mutex> lock(wait_txn_map_mutex_); + DecrementWaitersImpl(txn, wait_ids); +} + +void PointLockManager::DecrementWaitersImpl( + const PessimisticTransaction* txn, + const autovector<TransactionID>& wait_ids) { + auto id = txn->GetID(); + assert(wait_txn_map_.Contains(id)); + wait_txn_map_.Delete(id); + + for (auto wait_id : wait_ids) { + rev_wait_txn_map_.Get(wait_id)--; + if (rev_wait_txn_map_.Get(wait_id) == 0) { + rev_wait_txn_map_.Delete(wait_id); + } + } +} + +bool PointLockManager::IncrementWaiters( + const PessimisticTransaction* txn, + const autovector<TransactionID>& wait_ids, const std::string& key, + const uint32_t& cf_id, const bool& exclusive, Env* const env) { + auto id = txn->GetID(); + std::vector<int> queue_parents( + static_cast<size_t>(txn->GetDeadlockDetectDepth())); + std::vector<TransactionID> queue_values( + static_cast<size_t>(txn->GetDeadlockDetectDepth())); + std::lock_guard<std::mutex> lock(wait_txn_map_mutex_); + assert(!wait_txn_map_.Contains(id)); + + wait_txn_map_.Insert(id, {wait_ids, cf_id, exclusive, key}); + + for (auto wait_id : wait_ids) { + if (rev_wait_txn_map_.Contains(wait_id)) { + rev_wait_txn_map_.Get(wait_id)++; + } else { + rev_wait_txn_map_.Insert(wait_id, 1); + } + } + + // No deadlock if nobody is waiting on self. + if (!rev_wait_txn_map_.Contains(id)) { + return false; + } + + const auto* next_ids = &wait_ids; + int parent = -1; + int64_t deadlock_time = 0; + for (int tail = 0, head = 0; head < txn->GetDeadlockDetectDepth(); head++) { + int i = 0; + if (next_ids) { + for (; i < static_cast<int>(next_ids->size()) && + tail + i < txn->GetDeadlockDetectDepth(); + i++) { + queue_values[tail + i] = (*next_ids)[i]; + queue_parents[tail + i] = parent; + } + tail += i; + } + + // No more items in the list, meaning no deadlock. 
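+ // (head catching up with tail means the whole frontier was expanded + // without ever reaching this txn's own id, so no cycle passes through it.)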
+ if (tail == head) { + return false; + } + + auto next = queue_values[head]; + if (next == id) { + std::vector<DeadlockInfo> path; + while (head != -1) { + assert(wait_txn_map_.Contains(queue_values[head])); + + auto extracted_info = wait_txn_map_.Get(queue_values[head]); + path.push_back({queue_values[head], extracted_info.m_cf_id, + extracted_info.m_exclusive, + extracted_info.m_waiting_key}); + head = queue_parents[head]; + } + if (!env->GetCurrentTime(&deadlock_time).ok()) { + /* + TODO(AR) this preserves the current behaviour whilst checking the + status of env->GetCurrentTime to ensure that ASSERT_STATUS_CHECKED + passes. Should we instead raise an error if !ok() ? + */ + deadlock_time = 0; + } + std::reverse(path.begin(), path.end()); + dlock_buffer_.AddNewPath(DeadlockPath(path, deadlock_time)); + deadlock_time = 0; + DecrementWaitersImpl(txn, wait_ids); + return true; + } else if (!wait_txn_map_.Contains(next)) { + next_ids = nullptr; + continue; + } else { + parent = head; + next_ids = &(wait_txn_map_.Get(next).m_neighbors); + } + } + + // Wait cycle too big, just assume deadlock. + if (!env->GetCurrentTime(&deadlock_time).ok()) { + /* + TODO(AR) this preserves the current behaviour whilst checking the status + of env->GetCurrentTime to ensure that ASSERT_STATUS_CHECKED passes. + Should we instead raise an error if !ok() ? + */ + deadlock_time = 0; + } + dlock_buffer_.AddNewPath(DeadlockPath(deadlock_time, true)); + DecrementWaitersImpl(txn, wait_ids); + return true; +} + +// Try to lock this key after we have acquired the mutex. +// Sets *expire_time to the expiration time in microseconds +// or 0 if no expiration. +// REQUIRED: Stripe mutex must be held. +Status PointLockManager::AcquireLocked(LockMap* lock_map, LockMapStripe* stripe, + const std::string& key, Env* env, + const LockInfo& txn_lock_info, + uint64_t* expire_time, + autovector<TransactionID>* txn_ids) { + assert(txn_lock_info.txn_ids.size() == 1); + + Status result; + // Check if this key is already locked + auto stripe_iter = stripe->keys.find(key); + if (stripe_iter != stripe->keys.end()) { + // Lock already held + LockInfo& lock_info = stripe_iter->second; + assert(lock_info.txn_ids.size() == 1 || !lock_info.exclusive); + + if (lock_info.exclusive || txn_lock_info.exclusive) { + if (lock_info.txn_ids.size() == 1 && + lock_info.txn_ids[0] == txn_lock_info.txn_ids[0]) { + // The list contains one txn and we're it, so just take it. + lock_info.exclusive = txn_lock_info.exclusive; + lock_info.expiration_time = txn_lock_info.expiration_time; + } else { + // Check if it's expired. Skips over txn_lock_info.txn_ids[0] in case + // it's there for a shared lock with multiple holders which was not + // caught in the first case. + if (IsLockExpired(txn_lock_info.txn_ids[0], lock_info, env, + expire_time)) { + // lock is expired, can steal it + lock_info.txn_ids = txn_lock_info.txn_ids; + lock_info.exclusive = txn_lock_info.exclusive; + lock_info.expiration_time = txn_lock_info.expiration_time; + // lock_cnt does not change + } else { + result = Status::TimedOut(Status::SubCode::kLockTimeout); + *txn_ids = lock_info.txn_ids; + } + } + } else { + // We are requesting shared access to a shared lock, so just grant it. + lock_info.txn_ids.push_back(txn_lock_info.txn_ids[0]); + // Using std::max means that expiration time never goes down even when + // a transaction is removed from the list. The correct solution would be + // to track expiry for every transaction, but this would also work for + // now. 
+ lock_info.expiration_time = + std::max(lock_info.expiration_time, txn_lock_info.expiration_time); + } + } else { // Lock not held. + // Check lock limit + if (max_num_locks_ > 0 && + lock_map->lock_cnt.load(std::memory_order_acquire) >= max_num_locks_) { + result = Status::Busy(Status::SubCode::kLockLimit); + } else { + // acquire lock + stripe->keys.emplace(key, txn_lock_info); + + // Maintain lock count if there is a limit on the number of locks + if (max_num_locks_) { + lock_map->lock_cnt++; + } + } + } + + return result; +} + +void PointLockManager::UnLockKey(PessimisticTransaction* txn, + const std::string& key, LockMapStripe* stripe, + LockMap* lock_map, Env* env) { +#ifdef NDEBUG + (void)env; +#endif + TransactionID txn_id = txn->GetID(); + + auto stripe_iter = stripe->keys.find(key); + if (stripe_iter != stripe->keys.end()) { + auto& txns = stripe_iter->second.txn_ids; + auto txn_it = std::find(txns.begin(), txns.end(), txn_id); + // Found the key we locked. unlock it. + if (txn_it != txns.end()) { + if (txns.size() == 1) { + stripe->keys.erase(stripe_iter); + } else { + auto last_it = txns.end() - 1; + if (txn_it != last_it) { + *txn_it = *last_it; + } + txns.pop_back(); + } + + if (max_num_locks_ > 0) { + // Maintain lock count if there is a limit on the number of locks. + assert(lock_map->lock_cnt.load(std::memory_order_relaxed) > 0); + lock_map->lock_cnt--; + } + } + } else { + // This key is either not locked or locked by someone else. This should + // only happen if the unlocking transaction has expired. + assert(txn->GetExpirationTime() > 0 && + txn->GetExpirationTime() < env->NowMicros()); + } +} + +void PointLockManager::UnLock(PessimisticTransaction* txn, + ColumnFamilyId column_family_id, + const std::string& key, Env* env) { + std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(column_family_id); + LockMap* lock_map = lock_map_ptr.get(); + if (lock_map == nullptr) { + // Column Family must have been dropped. + return; + } + + // Lock the mutex for the stripe that this key hashes to + size_t stripe_num = lock_map->GetStripe(key); + assert(lock_map->lock_map_stripes_.size() > stripe_num); + LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num); + + stripe->stripe_mutex->Lock().PermitUncheckedError(); + UnLockKey(txn, key, stripe, lock_map, env); + stripe->stripe_mutex->UnLock(); + + // Signal waiting threads to retry locking + stripe->stripe_cv->NotifyAll(); +} + +void PointLockManager::UnLock(PessimisticTransaction* txn, + const LockTracker& tracker, Env* env) { + std::unique_ptr<LockTracker::ColumnFamilyIterator> cf_it( + tracker.GetColumnFamilyIterator()); + assert(cf_it != nullptr); + while (cf_it->HasNext()) { + ColumnFamilyId cf = cf_it->Next(); + std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(cf); + LockMap* lock_map = lock_map_ptr.get(); + if (!lock_map) { + // Column Family must have been dropped. 
+ return; + } + + // Bucket keys by lock_map_ stripe + UnorderedMap<size_t, std::vector<const std::string*>> keys_by_stripe( + lock_map->num_stripes_); + std::unique_ptr<LockTracker::KeyIterator> key_it( + tracker.GetKeyIterator(cf)); + assert(key_it != nullptr); + while (key_it->HasNext()) { + const std::string& key = key_it->Next(); + size_t stripe_num = lock_map->GetStripe(key); + keys_by_stripe[stripe_num].push_back(&key); + } + + // For each stripe, grab the stripe mutex and unlock all keys in this stripe + for (auto& stripe_iter : keys_by_stripe) { + size_t stripe_num = stripe_iter.first; + auto& stripe_keys = stripe_iter.second; + + assert(lock_map->lock_map_stripes_.size() > stripe_num); + LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num); + + stripe->stripe_mutex->Lock().PermitUncheckedError(); + + for (const std::string* key : stripe_keys) { + UnLockKey(txn, *key, stripe, lock_map, env); + } + + stripe->stripe_mutex->UnLock(); + + // Signal waiting threads to retry locking + stripe->stripe_cv->NotifyAll(); + } + } +} + +PointLockManager::PointLockStatus PointLockManager::GetPointLockStatus() { + PointLockStatus data; + // Lock order here is important. The correct order is lock_map_mutex_, then + // for every column family ID in ascending order lock every stripe in + // ascending order. + InstrumentedMutexLock l(&lock_map_mutex_); + + std::vector<uint32_t> cf_ids; + for (const auto& map : lock_maps_) { + cf_ids.push_back(map.first); + } + std::sort(cf_ids.begin(), cf_ids.end()); + + for (auto i : cf_ids) { + const auto& stripes = lock_maps_[i]->lock_map_stripes_; + // Iterate and lock all stripes in ascending order. + for (const auto& j : stripes) { + j->stripe_mutex->Lock().PermitUncheckedError(); + for (const auto& it : j->keys) { + struct KeyLockInfo info; + info.exclusive = it.second.exclusive; + info.key = it.first; + for (const auto& id : it.second.txn_ids) { + info.ids.push_back(id); + } + data.insert({i, info}); + } + } + } + + // Unlock everything. Unlocking order is not important. + for (auto i : cf_ids) { + const auto& stripes = lock_maps_[i]->lock_map_stripes_; + for (const auto& j : stripes) { + j->stripe_mutex->UnLock(); + } + } + + return data; +} + +std::vector<DeadlockPath> PointLockManager::GetDeadlockInfoBuffer() { + return dlock_buffer_.PrepareBuffer(); +} + +void PointLockManager::Resize(uint32_t target_size) { + dlock_buffer_.Resize(target_size); +} + +PointLockManager::RangeLockStatus PointLockManager::GetRangeLockStatus() { + return {}; +} + +Status PointLockManager::TryLock(PessimisticTransaction* /* txn */, + ColumnFamilyId /* cf_id */, + const Endpoint& /* start */, + const Endpoint& /* end */, Env* /* env */, + bool /* exclusive */) { + return Status::NotSupported( + "PointLockManager does not support range locking"); +} + +void PointLockManager::UnLock(PessimisticTransaction* /* txn */, + ColumnFamilyId /* cf_id */, + const Endpoint& /* start */, + const Endpoint& /* end */, Env* /* env */) { + // no-op +} + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/point/point_lock_manager.h b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager.h new file mode 100644 index 000000000..eeb34f3be --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager.h @@ -0,0 +1,224 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include <memory> +#include <string> +#include <unordered_map> +#include <utility> +#include <vector> + +#include "monitoring/instrumented_mutex.h" +#include "rocksdb/utilities/transaction.h" +#include "util/autovector.h" +#include "util/hash_containers.h" +#include "util/hash_map.h" +#include "util/thread_local.h" +#include "utilities/transactions/lock/lock_manager.h" +#include "utilities/transactions/lock/point/point_lock_tracker.h" + +namespace ROCKSDB_NAMESPACE { + +class ColumnFamilyHandle; +struct LockInfo; +struct LockMap; +struct LockMapStripe; + +template <class Path> +class DeadlockInfoBufferTempl { + private: + std::vector<Path> paths_buffer_; + uint32_t buffer_idx_; + std::mutex paths_buffer_mutex_; + + std::vector<Path> Normalize() { + auto working = paths_buffer_; + + if (working.empty()) { + return working; + } + + // Next write occurs at a nonexistent path's slot + if (paths_buffer_[buffer_idx_].empty()) { + working.resize(buffer_idx_); + } else { + std::rotate(working.begin(), working.begin() + buffer_idx_, + working.end()); + } + + return working; + } + + public: + explicit DeadlockInfoBufferTempl(uint32_t n_latest_dlocks) + : paths_buffer_(n_latest_dlocks), buffer_idx_(0) {} + + void AddNewPath(Path path) { + std::lock_guard<std::mutex> lock(paths_buffer_mutex_); + + if (paths_buffer_.empty()) { + return; + } + + paths_buffer_[buffer_idx_] = std::move(path); + buffer_idx_ = (buffer_idx_ + 1) % paths_buffer_.size(); + } + + void Resize(uint32_t target_size) { + std::lock_guard<std::mutex> lock(paths_buffer_mutex_); + + paths_buffer_ = Normalize(); + + // Drop the deadlocks that will no longer be needed after the normalization + if (target_size < paths_buffer_.size()) { + paths_buffer_.erase( + paths_buffer_.begin(), + paths_buffer_.begin() + (paths_buffer_.size() - target_size)); + buffer_idx_ = 0; + } + // Resize the buffer to the target size and restore the buffer's idx + else { + auto prev_size = paths_buffer_.size(); + paths_buffer_.resize(target_size); + buffer_idx_ = (uint32_t)prev_size; + } + } + + std::vector<Path> PrepareBuffer() { + std::lock_guard<std::mutex> lock(paths_buffer_mutex_); + + // Reversing the normalized vector returns the latest deadlocks first + auto working = Normalize(); + std::reverse(working.begin(), working.end()); + + return working; + } +}; + +using DeadlockInfoBuffer = DeadlockInfoBufferTempl<DeadlockPath>; + +struct TrackedTrxInfo { + autovector<TransactionID> m_neighbors; + uint32_t m_cf_id; + bool m_exclusive; + std::string m_waiting_key; +}; + +class PointLockManager : public LockManager { + public: + PointLockManager(PessimisticTransactionDB* db, + const TransactionDBOptions& opt); + // No copying allowed + PointLockManager(const PointLockManager&) = delete; + PointLockManager& operator=(const PointLockManager&) = delete; + + ~PointLockManager() override {} + + bool IsPointLockSupported() const override { return true; } + + bool IsRangeLockSupported() const override { return false; } + + const LockTrackerFactory& GetLockTrackerFactory() const override { + return PointLockTrackerFactory::Get(); + } + + // Creates a new LockMap for this column family. Caller should guarantee + // that this column family does not already exist.
+ void AddColumnFamily(const ColumnFamilyHandle* cf) override; + // Deletes the LockMap for this column family. Caller should guarantee that + // this column family is no longer in use. + void RemoveColumnFamily(const ColumnFamilyHandle* cf) override; + + Status TryLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id, + const std::string& key, Env* env, bool exclusive) override; + Status TryLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id, + const Endpoint& start, const Endpoint& end, Env* env, + bool exclusive) override; + + void UnLock(PessimisticTransaction* txn, const LockTracker& tracker, + Env* env) override; + void UnLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id, + const std::string& key, Env* env) override; + void UnLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id, + const Endpoint& start, const Endpoint& end, Env* env) override; + + PointLockStatus GetPointLockStatus() override; + + RangeLockStatus GetRangeLockStatus() override; + + std::vector<DeadlockPath> GetDeadlockInfoBuffer() override; + + void Resize(uint32_t new_size) override; + + private: + PessimisticTransactionDB* txn_db_impl_; + + // Default number of lock map stripes per column family + const size_t default_num_stripes_; + + // Limit on number of keys locked per column family + const int64_t max_num_locks_; + + // The following lock order must be satisfied in order to avoid deadlocking + // ourselves. + // - lock_map_mutex_ + // - stripe mutexes in ascending cf id, ascending stripe order + // - wait_txn_map_mutex_ + // + // Must be held when accessing/modifying lock_maps_. + InstrumentedMutex lock_map_mutex_; + + // Map of ColumnFamilyId to locked key info + using LockMaps = UnorderedMap<uint32_t, std::shared_ptr<LockMap>>; + LockMaps lock_maps_; + + // Thread-local cache of entries in lock_maps_. This is an optimization + // to avoid acquiring a mutex in order to look up a LockMap + std::unique_ptr<ThreadLocalPtr> lock_maps_cache_; + + // Must be held when modifying wait_txn_map_ and rev_wait_txn_map_. + std::mutex wait_txn_map_mutex_; + + // Maps from waitee -> number of waiters. + HashMap<TransactionID, int> rev_wait_txn_map_; + // Maps from waiter -> waitee. 
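+ // Each TrackedTrxInfo value also records the waiting key, column family + // and lock mode, so deadlock paths can be reported with full context.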
+ HashMap<TransactionID, TrackedTrxInfo> wait_txn_map_; + DeadlockInfoBuffer dlock_buffer_; + + // Used to allocate mutexes/condvars to use when locking keys + std::shared_ptr<TransactionDBMutexFactory> mutex_factory_; + + bool IsLockExpired(TransactionID txn_id, const LockInfo& lock_info, Env* env, + uint64_t* wait_time); + + std::shared_ptr<LockMap> GetLockMap(uint32_t column_family_id); + + Status AcquireWithTimeout(PessimisticTransaction* txn, LockMap* lock_map, + LockMapStripe* stripe, uint32_t column_family_id, + const std::string& key, Env* env, int64_t timeout, + const LockInfo& lock_info); + + Status AcquireLocked(LockMap* lock_map, LockMapStripe* stripe, + const std::string& key, Env* env, + const LockInfo& lock_info, uint64_t* wait_time, + autovector<TransactionID>* txn_ids); + + void UnLockKey(PessimisticTransaction* txn, const std::string& key, + LockMapStripe* stripe, LockMap* lock_map, Env* env); + + bool IncrementWaiters(const PessimisticTransaction* txn, + const autovector<TransactionID>& wait_ids, + const std::string& key, const uint32_t& cf_id, + const bool& exclusive, Env* const env); + void DecrementWaiters(const PessimisticTransaction* txn, + const autovector<TransactionID>& wait_ids); + void DecrementWaitersImpl(const PessimisticTransaction* txn, + const autovector<TransactionID>& wait_ids); +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc new file mode 100644 index 000000000..525fdea71 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc @@ -0,0 +1,181 @@ +// Copyright (c) 2020-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "utilities/transactions/lock/point/point_lock_manager_test.h" + +namespace ROCKSDB_NAMESPACE { + +// This test is not applicable for Range Lock manager as Range Lock Manager +// operates on Column Families, not their ids. 
+TEST_F(PointLockManagerTest, LockNonExistingColumnFamily) { + MockColumnFamilyHandle cf(1024); + locker_->RemoveColumnFamily(&cf); + auto txn = NewTxn(); + auto s = locker_->TryLock(txn, 1024, "k", env_, true); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_STREQ(s.getState(), "Column family id not found: 1024"); + delete txn; +} + +TEST_F(PointLockManagerTest, LockStatus) { + MockColumnFamilyHandle cf1(1024), cf2(2048); + locker_->AddColumnFamily(&cf1); + locker_->AddColumnFamily(&cf2); + + auto txn1 = NewTxn(); + ASSERT_OK(locker_->TryLock(txn1, 1024, "k1", env_, true)); + ASSERT_OK(locker_->TryLock(txn1, 2048, "k1", env_, true)); + + auto txn2 = NewTxn(); + ASSERT_OK(locker_->TryLock(txn2, 1024, "k2", env_, false)); + ASSERT_OK(locker_->TryLock(txn2, 2048, "k2", env_, false)); + + auto s = locker_->GetPointLockStatus(); + ASSERT_EQ(s.size(), 4u); + for (uint32_t cf_id : {1024, 2048}) { + ASSERT_EQ(s.count(cf_id), 2u); + auto range = s.equal_range(cf_id); + for (auto it = range.first; it != range.second; it++) { + ASSERT_TRUE(it->second.key == "k1" || it->second.key == "k2"); + if (it->second.key == "k1") { + ASSERT_EQ(it->second.exclusive, true); + ASSERT_EQ(it->second.ids.size(), 1u); + ASSERT_EQ(it->second.ids[0], txn1->GetID()); + } else if (it->second.key == "k2") { + ASSERT_EQ(it->second.exclusive, false); + ASSERT_EQ(it->second.ids.size(), 1u); + ASSERT_EQ(it->second.ids[0], txn2->GetID()); + } + } + } + + // Cleanup + locker_->UnLock(txn1, 1024, "k1", env_); + locker_->UnLock(txn1, 2048, "k1", env_); + locker_->UnLock(txn2, 1024, "k2", env_); + locker_->UnLock(txn2, 2048, "k2", env_); + + delete txn1; + delete txn2; +} + +TEST_F(PointLockManagerTest, UnlockExclusive) { + MockColumnFamilyHandle cf(1); + locker_->AddColumnFamily(&cf); + + auto txn1 = NewTxn(); + ASSERT_OK(locker_->TryLock(txn1, 1, "k", env_, true)); + locker_->UnLock(txn1, 1, "k", env_); + + auto txn2 = NewTxn(); + ASSERT_OK(locker_->TryLock(txn2, 1, "k", env_, true)); + + // Cleanup + locker_->UnLock(txn2, 1, "k", env_); + + delete txn1; + delete txn2; +} + +TEST_F(PointLockManagerTest, UnlockShared) { + MockColumnFamilyHandle cf(1); + locker_->AddColumnFamily(&cf); + + auto txn1 = NewTxn(); + ASSERT_OK(locker_->TryLock(txn1, 1, "k", env_, false)); + locker_->UnLock(txn1, 1, "k", env_); + + auto txn2 = NewTxn(); + ASSERT_OK(locker_->TryLock(txn2, 1, "k", env_, true)); + + // Cleanup + locker_->UnLock(txn2, 1, "k", env_); + + delete txn1; + delete txn2; +} + +// This test doesn't work with Range Lock Manager, because Range Lock Manager +// doesn't support deadlock_detect_depth. + +TEST_F(PointLockManagerTest, DeadlockDepthExceeded) { + // Tests that when detecting deadlock, if the detection depth is exceeded, + // it's also viewed as deadlock. + MockColumnFamilyHandle cf(1); + locker_->AddColumnFamily(&cf); + TransactionOptions txn_opt; + txn_opt.deadlock_detect = true; + txn_opt.deadlock_detect_depth = 1; + txn_opt.lock_timeout = 1000000; + auto txn1 = NewTxn(txn_opt); + auto txn2 = NewTxn(txn_opt); + auto txn3 = NewTxn(txn_opt); + auto txn4 = NewTxn(txn_opt); + // "a ->(k) b" means transaction a is waiting for transaction b to release + // the held lock on key k. + // txn4 ->(k3) -> txn3 ->(k2) txn2 ->(k1) txn1 + // txn3's deadlock detection will exceed the detection depth 1, + // which will be viewed as a deadlock. 
+ // NOTE: + // txn4 ->(k3) -> txn3 must be set up before + // txn3 ->(k2) -> txn2, because to trigger deadlock detection for txn3, + // it must have another txn waiting on it, which is txn4 in this case. + ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true)); + + port::Thread t1 = BlockUntilWaitingTxn(wait_sync_point_name_, [&]() { + ASSERT_OK(locker_->TryLock(txn2, 1, "k2", env_, true)); + // block because txn1 is holding a lock on k1. + locker_->TryLock(txn2, 1, "k1", env_, true); + }); + + ASSERT_OK(locker_->TryLock(txn3, 1, "k3", env_, true)); + + port::Thread t2 = BlockUntilWaitingTxn(wait_sync_point_name_, [&]() { + // block because txn3 is holding a lock on k3. + locker_->TryLock(txn4, 1, "k3", env_, true); + }); + + auto s = locker_->TryLock(txn3, 1, "k2", env_, true); + ASSERT_TRUE(s.IsBusy()); + ASSERT_EQ(s.subcode(), Status::SubCode::kDeadlock); + + std::vector<DeadlockPath> deadlock_paths = locker_->GetDeadlockInfoBuffer(); + ASSERT_EQ(deadlock_paths.size(), 1u); + ASSERT_TRUE(deadlock_paths[0].limit_exceeded); + + locker_->UnLock(txn1, 1, "k1", env_); + locker_->UnLock(txn3, 1, "k3", env_); + t1.join(); + t2.join(); + + delete txn4; + delete txn3; + delete txn2; + delete txn1; +} + +INSTANTIATE_TEST_CASE_P(PointLockManager, AnyLockManagerTest, + ::testing::Values(nullptr)); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include <stdio.h> + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "SKIPPED because Transactions are not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h new file mode 100644 index 000000000..ca9f46bf9 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h @@ -0,0 +1,324 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory).
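The fixture header that follows parameterizes AnyLockManagerTest with an init_func_t, and declares PointLockManagerTestExternalSetup as a friend of the fixture — the intended hook for running the same test battery against a different LockManager. A hedged sketch of how another implementation might be wired in (MyLockManager and the sync point name are illustrative, not part of this diff):

// Hypothetical: reuse AnyLockManagerTest against another implementation.
void PointLockManagerTestExternalSetup(PointLockManagerTest* t) {
  // The friend declaration in the fixture grants access to these members.
  t->env_ = Env::Default();
  t->locker_ = std::make_shared<MyLockManager>();  // illustrative type
  t->wait_sync_point_name_ = "MyLockManager::TryLock:WaitingTxn";
}

INSTANTIATE_TEST_CASE_P(MyLockManager, AnyLockManagerTest,
                        ::testing::Values(&PointLockManagerTestExternalSetup));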
+ +#include "file/file_util.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/utilities/transaction_db.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "utilities/transactions/lock/point/point_lock_manager.h" +#include "utilities/transactions/pessimistic_transaction_db.h" +#include "utilities/transactions/transaction_db_mutex_impl.h" + +namespace ROCKSDB_NAMESPACE { + +class MockColumnFamilyHandle : public ColumnFamilyHandle { + public: + explicit MockColumnFamilyHandle(ColumnFamilyId cf_id) : cf_id_(cf_id) {} + + ~MockColumnFamilyHandle() override {} + + const std::string& GetName() const override { return name_; } + + ColumnFamilyId GetID() const override { return cf_id_; } + + Status GetDescriptor(ColumnFamilyDescriptor*) override { + return Status::OK(); + } + + const Comparator* GetComparator() const override { + return BytewiseComparator(); + } + + private: + ColumnFamilyId cf_id_; + std::string name_ = "MockCF"; +}; + +class PointLockManagerTest : public testing::Test { + public: + void SetUp() override { + env_ = Env::Default(); + db_dir_ = test::PerThreadDBPath("point_lock_manager_test"); + ASSERT_OK(env_->CreateDir(db_dir_)); + + Options opt; + opt.create_if_missing = true; + TransactionDBOptions txn_opt; + txn_opt.transaction_lock_timeout = 0; + + ASSERT_OK(TransactionDB::Open(opt, txn_opt, db_dir_, &db_)); + + // CAUTION: This test creates a separate lock manager object (right, NOT + // the one that the TransactionDB is using!), and runs tests on it. + locker_.reset(new PointLockManager( + static_cast<PessimisticTransactionDB*>(db_), txn_opt)); + + wait_sync_point_name_ = "PointLockManager::AcquireWithTimeout:WaitingTxn"; + } + + void TearDown() override { + delete db_; + EXPECT_OK(DestroyDir(env_, db_dir_)); + } + + PessimisticTransaction* NewTxn( + TransactionOptions txn_opt = TransactionOptions()) { + Transaction* txn = db_->BeginTransaction(WriteOptions(), txn_opt); + return reinterpret_cast<PessimisticTransaction*>(txn); + } + + protected: + Env* env_; + std::shared_ptr<LockManager> locker_; + const char* wait_sync_point_name_; + friend void PointLockManagerTestExternalSetup(PointLockManagerTest*); + + private: + std::string db_dir_; + TransactionDB* db_; +}; + +using init_func_t = void (*)(PointLockManagerTest*); + +class AnyLockManagerTest : public PointLockManagerTest, + public testing::WithParamInterface<init_func_t> { + public: + void SetUp() override { + // If a custom setup function was provided, use it. Otherwise, use what we + // have inherited. + auto init_func = GetParam(); + if (init_func) + (*init_func)(this); + else + PointLockManagerTest::SetUp(); + } +}; + +TEST_P(AnyLockManagerTest, ReentrantExclusiveLock) { + // Tests that a txn can acquire exclusive lock on the same key repeatedly. + MockColumnFamilyHandle cf(1); + locker_->AddColumnFamily(&cf); + auto txn = NewTxn(); + ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true)); + ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true)); + + // Cleanup + locker_->UnLock(txn, 1, "k", env_); + + delete txn; +} + +TEST_P(AnyLockManagerTest, ReentrantSharedLock) { + // Tests that a txn can acquire shared lock on the same key repeatedly. 
+ MockColumnFamilyHandle cf(1); + locker_->AddColumnFamily(&cf); + auto txn = NewTxn(); + ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false)); + ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false)); + + // Cleanup + locker_->UnLock(txn, 1, "k", env_); + + delete txn; +} + +TEST_P(AnyLockManagerTest, LockUpgrade) { + // Tests that a txn can upgrade from a shared lock to an exclusive lock. + MockColumnFamilyHandle cf(1); + locker_->AddColumnFamily(&cf); + auto txn = NewTxn(); + ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false)); + ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true)); + + // Cleanup + locker_->UnLock(txn, 1, "k", env_); + delete txn; +} + +TEST_P(AnyLockManagerTest, LockDowngrade) { + // Tests that a txn can acquire a shared lock after acquiring an exclusive + // lock on the same key. + MockColumnFamilyHandle cf(1); + locker_->AddColumnFamily(&cf); + auto txn = NewTxn(); + ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true)); + ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false)); + + // Cleanup + locker_->UnLock(txn, 1, "k", env_); + delete txn; +} + +TEST_P(AnyLockManagerTest, LockConflict) { + // Tests that lock conflicts lead to lock timeout. + MockColumnFamilyHandle cf(1); + locker_->AddColumnFamily(&cf); + auto txn1 = NewTxn(); + auto txn2 = NewTxn(); + + { + // exclusive-exclusive conflict. + ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true)); + auto s = locker_->TryLock(txn2, 1, "k1", env_, true); + ASSERT_TRUE(s.IsTimedOut()); + } + + { + // exclusive-shared conflict. + ASSERT_OK(locker_->TryLock(txn1, 1, "k2", env_, true)); + auto s = locker_->TryLock(txn2, 1, "k2", env_, false); + ASSERT_TRUE(s.IsTimedOut()); + } + + { + // shared-exclusive conflict. + ASSERT_OK(locker_->TryLock(txn1, 1, "k2", env_, false)); + auto s = locker_->TryLock(txn2, 1, "k2", env_, true); + ASSERT_TRUE(s.IsTimedOut()); + } + + // Cleanup + locker_->UnLock(txn1, 1, "k1", env_); + locker_->UnLock(txn1, 1, "k2", env_); + + delete txn1; + delete txn2; +} + +port::Thread BlockUntilWaitingTxn(const char* sync_point_name, + std::function<void()> f) { + std::atomic<bool> reached(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + sync_point_name, [&](void* /*arg*/) { reached.store(true); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + port::Thread t(f); + + while (!reached.load()) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + + return t; +} + +TEST_P(AnyLockManagerTest, SharedLocks) { + // Tests that shared locks can be concurrently held by multiple transactions. + MockColumnFamilyHandle cf(1); + locker_->AddColumnFamily(&cf); + auto txn1 = NewTxn(); + auto txn2 = NewTxn(); + ASSERT_OK(locker_->TryLock(txn1, 1, "k", env_, false)); + ASSERT_OK(locker_->TryLock(txn2, 1, "k", env_, false)); + + // Cleanup + locker_->UnLock(txn1, 1, "k", env_); + locker_->UnLock(txn2, 1, "k", env_); + + delete txn1; + delete txn2; +} + +TEST_P(AnyLockManagerTest, Deadlock) { + // Tests that deadlock can be detected. + // Deadlock scenario: + // txn1 exclusively locks k1, and wants to lock k2; + // txn2 exclusively locks k2, and wants to lock k1. 
+ MockColumnFamilyHandle cf(1); + locker_->AddColumnFamily(&cf); + TransactionOptions txn_opt; + txn_opt.deadlock_detect = true; + txn_opt.lock_timeout = 1000000; + auto txn1 = NewTxn(txn_opt); + auto txn2 = NewTxn(txn_opt); + + ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true)); + ASSERT_OK(locker_->TryLock(txn2, 1, "k2", env_, true)); + + // txn1 tries to lock k2, will block forever. + port::Thread t = BlockUntilWaitingTxn(wait_sync_point_name_, [&]() { + // block because txn2 is holding a lock on k2. + locker_->TryLock(txn1, 1, "k2", env_, true); + }); + + auto s = locker_->TryLock(txn2, 1, "k1", env_, true); + ASSERT_TRUE(s.IsBusy()); + ASSERT_EQ(s.subcode(), Status::SubCode::kDeadlock); + + std::vector<DeadlockPath> deadlock_paths = locker_->GetDeadlockInfoBuffer(); + ASSERT_EQ(deadlock_paths.size(), 1u); + ASSERT_FALSE(deadlock_paths[0].limit_exceeded); + + std::vector<DeadlockInfo> deadlocks = deadlock_paths[0].path; + ASSERT_EQ(deadlocks.size(), 2u); + + ASSERT_EQ(deadlocks[0].m_txn_id, txn1->GetID()); + ASSERT_EQ(deadlocks[0].m_cf_id, 1u); + ASSERT_TRUE(deadlocks[0].m_exclusive); + ASSERT_EQ(deadlocks[0].m_waiting_key, "k2"); + + ASSERT_EQ(deadlocks[1].m_txn_id, txn2->GetID()); + ASSERT_EQ(deadlocks[1].m_cf_id, 1u); + ASSERT_TRUE(deadlocks[1].m_exclusive); + ASSERT_EQ(deadlocks[1].m_waiting_key, "k1"); + + locker_->UnLock(txn2, 1, "k2", env_); + t.join(); + + // Cleanup + locker_->UnLock(txn1, 1, "k1", env_); + locker_->UnLock(txn1, 1, "k2", env_); + delete txn2; + delete txn1; +} + +TEST_P(AnyLockManagerTest, GetWaitingTxns_MultipleTxns) { + MockColumnFamilyHandle cf(1); + locker_->AddColumnFamily(&cf); + + auto txn1 = NewTxn(); + ASSERT_OK(locker_->TryLock(txn1, 1, "k", env_, false)); + + auto txn2 = NewTxn(); + ASSERT_OK(locker_->TryLock(txn2, 1, "k", env_, false)); + + auto txn3 = NewTxn(); + txn3->SetLockTimeout(10000); + port::Thread t1 = BlockUntilWaitingTxn(wait_sync_point_name_, [&]() { + ASSERT_OK(locker_->TryLock(txn3, 1, "k", env_, true)); + locker_->UnLock(txn3, 1, "k", env_); + }); + + // Ok, now txn3 is waiting for lock on "k", which is owned by two + // transactions. Check that GetWaitingTxns reports this correctly + uint32_t wait_cf_id; + std::string wait_key; + auto waiters = txn3->GetWaitingTxns(&wait_cf_id, &wait_key); + + ASSERT_EQ(wait_cf_id, 1u); + ASSERT_EQ(wait_key, "k"); + ASSERT_EQ(waiters.size(), 2); + bool waits_correct = + (waiters[0] == txn1->GetID() && waiters[1] == txn2->GetID()) || + (waiters[1] == txn1->GetID() && waiters[0] == txn2->GetID()); + ASSERT_EQ(waits_correct, true); + + // Release locks so txn3 can proceed with execution + locker_->UnLock(txn1, 1, "k", env_); + locker_->UnLock(txn2, 1, "k", env_); + + // Wait until txn3 finishes + t1.join(); + + delete txn1; + delete txn2; + delete txn3; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/transactions/lock/point/point_lock_tracker.cc b/src/rocksdb/utilities/transactions/lock/point/point_lock_tracker.cc new file mode 100644 index 000000000..6204a8f02 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/point/point_lock_tracker.cc @@ -0,0 +1,257 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
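Before the tracker implementation below, a note on the idiom its Track() method relies on: unordered_map::try_emplace inserts a freshly constructed value only when the key is absent, and in either case hands back an iterator to the entry, so the tracked sequence number can be lowered in place when a stronger guarantee arrives. A tiny self-contained illustration (a bare uint64_t stands in for TrackedKeyInfo):

#include <cassert>
#include <cstdint>
#include <string>
#include <unordered_map>

int main() {
  std::unordered_map<std::string, uint64_t> tracked;  // key -> smallest seq
  auto track = [&](const std::string& key, uint64_t seq) {
    // try_emplace inserts {key, seq} only if key is absent; `inserted`
    // reports which case happened, and `it` is valid either way.
    auto [it, inserted] = tracked.try_emplace(key, seq);
    if (!inserted && seq < it->second) {
      it->second = seq;  // keep the smaller (stronger) sequence number
    }
  };
  track("k", 100);
  track("k", 50);   // lowers the tracked seq
  track("k", 200);  // ignored: weaker guarantee
  assert(tracked.at("k") == 50);
  return 0;
}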
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/lock/point/point_lock_tracker.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+class TrackedKeysColumnFamilyIterator
+    : public LockTracker::ColumnFamilyIterator {
+ public:
+  explicit TrackedKeysColumnFamilyIterator(const TrackedKeys& keys)
+      : tracked_keys_(keys), it_(keys.begin()) {}
+
+  bool HasNext() const override { return it_ != tracked_keys_.end(); }
+
+  ColumnFamilyId Next() override { return (it_++)->first; }
+
+ private:
+  const TrackedKeys& tracked_keys_;
+  TrackedKeys::const_iterator it_;
+};
+
+class TrackedKeysIterator : public LockTracker::KeyIterator {
+ public:
+  TrackedKeysIterator(const TrackedKeys& keys, ColumnFamilyId id)
+      : key_infos_(keys.at(id)), it_(key_infos_.begin()) {}
+
+  bool HasNext() const override { return it_ != key_infos_.end(); }
+
+  const std::string& Next() override { return (it_++)->first; }
+
+ private:
+  const TrackedKeyInfos& key_infos_;
+  TrackedKeyInfos::const_iterator it_;
+};
+
+}  // namespace
+
+void PointLockTracker::Track(const PointLockRequest& r) {
+  auto& keys = tracked_keys_[r.column_family_id];
+  auto result = keys.try_emplace(r.key, r.seq);
+  auto it = result.first;
+  if (!result.second && r.seq < it->second.seq) {
+    // Now tracking this key with an earlier sequence number
+    it->second.seq = r.seq;
+  }
+  // else we do not update the seq. The smaller the tracked seq, the stronger
+  // the guarantee, since it implies that from that seq onward there has been
+  // no concurrent update to the key. So we update the seq only if doing so
+  // implies a stronger guarantee, i.e., if the new seq is smaller than the
+  // existing tracked seq.
+
+  if (r.read_only) {
+    it->second.num_reads++;
+  } else {
+    it->second.num_writes++;
+  }
+
+  it->second.exclusive = it->second.exclusive || r.exclusive;
+}
+
+UntrackStatus PointLockTracker::Untrack(const PointLockRequest& r) {
+  auto cf_keys = tracked_keys_.find(r.column_family_id);
+  if (cf_keys == tracked_keys_.end()) {
+    return UntrackStatus::NOT_TRACKED;
+  }
+
+  auto& keys = cf_keys->second;
+  auto it = keys.find(r.key);
+  if (it == keys.end()) {
+    return UntrackStatus::NOT_TRACKED;
+  }
+
+  bool untracked = false;
+  auto& info = it->second;
+  if (r.read_only) {
+    if (info.num_reads > 0) {
+      info.num_reads--;
+      untracked = true;
+    }
+  } else {
+    if (info.num_writes > 0) {
+      info.num_writes--;
+      untracked = true;
+    }
+  }
+
+  bool removed = false;
+  if (info.num_reads == 0 && info.num_writes == 0) {
+    keys.erase(it);
+    if (keys.empty()) {
+      tracked_keys_.erase(cf_keys);
+    }
+    removed = true;
+  }
+
+  if (removed) {
+    return UntrackStatus::REMOVED;
+  }
+  if (untracked) {
+    return UntrackStatus::UNTRACKED;
+  }
+  return UntrackStatus::NOT_TRACKED;
+}
+
+void PointLockTracker::Merge(const LockTracker& tracker) {
+  const PointLockTracker& t = static_cast<const PointLockTracker&>(tracker);
+  for (const auto& cf_keys : t.tracked_keys_) {
+    ColumnFamilyId cf = cf_keys.first;
+    const auto& keys = cf_keys.second;
+
+    auto current_cf_keys = tracked_keys_.find(cf);
+    if (current_cf_keys == tracked_keys_.end()) {
+      tracked_keys_.emplace(cf_keys);
+    } else {
+      auto& current_keys = current_cf_keys->second;
+      for (const auto& key_info : keys) {
+        const std::string& key = key_info.first;
+        const TrackedKeyInfo& info = key_info.second;
+        // If the key was not previously tracked, just copy the whole struct
+        // over. Otherwise, some merging needs to occur.
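+        // Merging sums the read/write counts, ORs the exclusive flags, and
+        // keeps the existing seq; TrackedKeyInfo::Merge() asserts it is the
+        // smaller (stronger) of the two.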
+ auto current_info = current_keys.find(key); + if (current_info == current_keys.end()) { + current_keys.emplace(key_info); + } else { + current_info->second.Merge(info); + } + } + } + } +} + +void PointLockTracker::Subtract(const LockTracker& tracker) { + const PointLockTracker& t = static_cast<const PointLockTracker&>(tracker); + for (const auto& cf_keys : t.tracked_keys_) { + ColumnFamilyId cf = cf_keys.first; + const auto& keys = cf_keys.second; + + auto& current_keys = tracked_keys_.at(cf); + for (const auto& key_info : keys) { + const std::string& key = key_info.first; + const TrackedKeyInfo& info = key_info.second; + uint32_t num_reads = info.num_reads; + uint32_t num_writes = info.num_writes; + + auto current_key_info = current_keys.find(key); + assert(current_key_info != current_keys.end()); + + // Decrement the total reads/writes of this key by the number of + // reads/writes done since the last SavePoint. + if (num_reads > 0) { + assert(current_key_info->second.num_reads >= num_reads); + current_key_info->second.num_reads -= num_reads; + } + if (num_writes > 0) { + assert(current_key_info->second.num_writes >= num_writes); + current_key_info->second.num_writes -= num_writes; + } + if (current_key_info->second.num_reads == 0 && + current_key_info->second.num_writes == 0) { + current_keys.erase(current_key_info); + } + } + } +} + +LockTracker* PointLockTracker::GetTrackedLocksSinceSavePoint( + const LockTracker& save_point_tracker) const { + // Examine the number of reads/writes performed on all keys written + // since the last SavePoint and compare to the total number of reads/writes + // for each key. + LockTracker* t = new PointLockTracker(); + const PointLockTracker& save_point_t = + static_cast<const PointLockTracker&>(save_point_tracker); + for (const auto& cf_keys : save_point_t.tracked_keys_) { + ColumnFamilyId cf = cf_keys.first; + const auto& keys = cf_keys.second; + + auto& current_keys = tracked_keys_.at(cf); + for (const auto& key_info : keys) { + const std::string& key = key_info.first; + const TrackedKeyInfo& info = key_info.second; + uint32_t num_reads = info.num_reads; + uint32_t num_writes = info.num_writes; + + auto current_key_info = current_keys.find(key); + assert(current_key_info != current_keys.end()); + assert(current_key_info->second.num_reads >= num_reads); + assert(current_key_info->second.num_writes >= num_writes); + + if (current_key_info->second.num_reads == num_reads && + current_key_info->second.num_writes == num_writes) { + // All the reads/writes to this key were done in the last savepoint. 
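+        // Re-track the key in the result tracker, so the caller gets exactly
+        // the set of locks that were acquired after the savepoint was set.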
+ PointLockRequest r; + r.column_family_id = cf; + r.key = key; + r.seq = info.seq; + r.read_only = (num_writes == 0); + r.exclusive = info.exclusive; + t->Track(r); + } + } + } + return t; +} + +PointLockStatus PointLockTracker::GetPointLockStatus( + ColumnFamilyId column_family_id, const std::string& key) const { + assert(IsPointLockSupported()); + PointLockStatus status; + auto it = tracked_keys_.find(column_family_id); + if (it == tracked_keys_.end()) { + return status; + } + + const auto& keys = it->second; + auto key_it = keys.find(key); + if (key_it == keys.end()) { + return status; + } + + const TrackedKeyInfo& key_info = key_it->second; + status.locked = true; + status.exclusive = key_info.exclusive; + status.seq = key_info.seq; + return status; +} + +uint64_t PointLockTracker::GetNumPointLocks() const { + uint64_t num_keys = 0; + for (const auto& cf_keys : tracked_keys_) { + num_keys += cf_keys.second.size(); + } + return num_keys; +} + +LockTracker::ColumnFamilyIterator* PointLockTracker::GetColumnFamilyIterator() + const { + return new TrackedKeysColumnFamilyIterator(tracked_keys_); +} + +LockTracker::KeyIterator* PointLockTracker::GetKeyIterator( + ColumnFamilyId column_family_id) const { + assert(tracked_keys_.find(column_family_id) != tracked_keys_.end()); + return new TrackedKeysIterator(tracked_keys_, column_family_id); +} + +void PointLockTracker::Clear() { tracked_keys_.clear(); } + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/point/point_lock_tracker.h b/src/rocksdb/utilities/transactions/lock/point/point_lock_tracker.h new file mode 100644 index 000000000..daf6f9aa2 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/point/point_lock_tracker.h @@ -0,0 +1,99 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include <memory> +#include <string> +#include <unordered_map> + +#include "utilities/transactions/lock/lock_tracker.h" + +namespace ROCKSDB_NAMESPACE { + +struct TrackedKeyInfo { + // Earliest sequence number that is relevant to this transaction for this key + SequenceNumber seq; + + uint32_t num_writes; + uint32_t num_reads; + + bool exclusive; + + explicit TrackedKeyInfo(SequenceNumber seq_no) + : seq(seq_no), num_writes(0), num_reads(0), exclusive(false) {} + + void Merge(const TrackedKeyInfo& info) { + assert(seq <= info.seq); + num_reads += info.num_reads; + num_writes += info.num_writes; + exclusive = exclusive || info.exclusive; + } +}; + +using TrackedKeyInfos = std::unordered_map<std::string, TrackedKeyInfo>; + +using TrackedKeys = std::unordered_map<ColumnFamilyId, TrackedKeyInfos>; + +// Tracks point locks on single keys. 
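+//
+// A minimal, illustrative usage sketch (field values are made up):
+//
+//   PointLockTracker tracker;
+//   PointLockRequest req;
+//   req.column_family_id = 1;
+//   req.key = "k";
+//   req.seq = 100;           // earliest sequence number relevant to "k"
+//   req.read_only = false;   // counts as a write...
+//   req.exclusive = true;    // ...under an exclusive lock
+//   tracker.Track(req);      // refcounted; a matching Untrack(req) undoes it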
+class PointLockTracker : public LockTracker {
+ public:
+  PointLockTracker() = default;
+
+  PointLockTracker(const PointLockTracker&) = delete;
+  PointLockTracker& operator=(const PointLockTracker&) = delete;
+
+  bool IsPointLockSupported() const override { return true; }
+
+  bool IsRangeLockSupported() const override { return false; }
+
+  void Track(const PointLockRequest& lock_request) override;
+
+  UntrackStatus Untrack(const PointLockRequest& lock_request) override;
+
+  void Track(const RangeLockRequest& /*lock_request*/) override {}
+
+  UntrackStatus Untrack(const RangeLockRequest& /*lock_request*/) override {
+    return UntrackStatus::NOT_TRACKED;
+  }
+
+  void Merge(const LockTracker& tracker) override;
+
+  void Subtract(const LockTracker& tracker) override;
+
+  void Clear() override;
+
+  virtual LockTracker* GetTrackedLocksSinceSavePoint(
+      const LockTracker& save_point_tracker) const override;
+
+  PointLockStatus GetPointLockStatus(ColumnFamilyId column_family_id,
+                                     const std::string& key) const override;
+
+  uint64_t GetNumPointLocks() const override;
+
+  ColumnFamilyIterator* GetColumnFamilyIterator() const override;
+
+  KeyIterator* GetKeyIterator(ColumnFamilyId column_family_id) const override;
+
+ private:
+  TrackedKeys tracked_keys_;
+};
+
+class PointLockTrackerFactory : public LockTrackerFactory {
+ public:
+  static const PointLockTrackerFactory& Get() {
+    static const PointLockTrackerFactory instance;
+    return instance;
+  }
+
+  LockTracker* Create() const override { return new PointLockTracker(); }
+
+ private:
+  PointLockTrackerFactory() {}
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_lock_manager.h b/src/rocksdb/utilities/transactions/lock/range/range_lock_manager.h
new file mode 100644
index 000000000..01899542e
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_lock_manager.h
@@ -0,0 +1,36 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+//
+// Generic definitions for a Range-based Lock Manager
+//
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/lock/lock_manager.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+/*
+  A base class for all Range-based lock managers
+
+  See also class RangeLockManagerHandle in
+  include/rocksdb/utilities/transaction_db.h
+*/
+class RangeLockManagerBase : public LockManager {
+ public:
+  // Getting a point lock is reduced to getting a range lock on a single-point
+  // range
+  using LockManager::TryLock;
+  Status TryLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
+                 const std::string& key, Env* env, bool exclusive) override {
+    Endpoint endp(key.data(), key.size(), false);
+    return TryLock(txn, column_family_id, endp, endp, env, exclusive);
+  }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_locking_test.cc b/src/rocksdb/utilities/transactions/lock/range/range_locking_test.cc
new file mode 100644
index 000000000..bce66c1f3
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_locking_test.cc
@@ -0,0 +1,459 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+
+#include <algorithm>
+#include <functional>
+#include <string>
+#include <thread>
+
+#include "db/db_impl/db_impl.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "utilities/transactions/lock/point/point_lock_manager_test.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+#include "utilities/transactions/transaction_test.h"
+
+using std::string;
+
+namespace ROCKSDB_NAMESPACE {
+
+class RangeLockingTest : public ::testing::Test {
+ public:
+  TransactionDB* db;
+  std::string dbname;
+  Options options;
+
+  std::shared_ptr<RangeLockManagerHandle> range_lock_mgr;
+  TransactionDBOptions txn_db_options;
+
+  RangeLockingTest() : db(nullptr) {
+    options.create_if_missing = true;
+    dbname = test::PerThreadDBPath("range_locking_testdb");
+
+    EXPECT_OK(DestroyDB(dbname, options));
+
+    range_lock_mgr.reset(NewRangeLockManager(nullptr));
+    txn_db_options.lock_mgr_handle = range_lock_mgr;
+
+    auto s = TransactionDB::Open(options, txn_db_options, dbname, &db);
+    assert(s.ok());
+  }
+
+  ~RangeLockingTest() {
+    delete db;
+    db = nullptr;
+    // This is to skip the assert statement in FaultInjectionTestEnv. There
+    // seems to be a bug in btrfs that makes readdir return recently
+    // unlink-ed files. By using the default fs we simply ignore errors
+    // resulting from attempting to delete such files in DestroyDB.
+    EXPECT_OK(DestroyDB(dbname, options));
+  }
+
+  PessimisticTransaction* NewTxn(
+      TransactionOptions txn_opt = TransactionOptions()) {
+    Transaction* txn = db->BeginTransaction(WriteOptions(), txn_opt);
+    return reinterpret_cast<PessimisticTransaction*>(txn);
+  }
+};
+
+// TODO: set a smaller lock wait timeout so that the test runs faster.
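+// Note on the fixture above: the RangeLockManagerHandle created by
+// NewRangeLockManager() is handed to the DB through
+// TransactionDBOptions::lock_mgr_handle, so every transaction opened on
+// `db` in the tests below acquires range locks through that manager.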
+TEST_F(RangeLockingTest, BasicRangeLocking) { + WriteOptions write_options; + TransactionOptions txn_options; + std::string value; + ReadOptions read_options; + auto cf = db->DefaultColumnFamily(); + + Transaction* txn0 = db->BeginTransaction(write_options, txn_options); + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + + // Get a range lock + ASSERT_OK(txn0->GetRangeLock(cf, Endpoint("a"), Endpoint("c"))); + + // Check that range Lock inhibits an overlapping range lock + { + auto s = txn1->GetRangeLock(cf, Endpoint("b"), Endpoint("z")); + ASSERT_TRUE(s.IsTimedOut()); + } + + // Check that range Lock inhibits an overlapping point lock + { + auto s = txn1->GetForUpdate(read_options, cf, Slice("b"), &value); + ASSERT_TRUE(s.IsTimedOut()); + } + + // Get a point lock, check that it inhibits range locks + ASSERT_OK(txn0->Put(cf, Slice("n"), Slice("value"))); + { + auto s = txn1->GetRangeLock(cf, Endpoint("m"), Endpoint("p")); + ASSERT_TRUE(s.IsTimedOut()); + } + + ASSERT_OK(txn0->Commit()); + txn1->Rollback(); + + delete txn0; + delete txn1; +} + +TEST_F(RangeLockingTest, MyRocksLikeUpdate) { + WriteOptions write_options; + TransactionOptions txn_options; + Transaction* txn0 = db->BeginTransaction(write_options, txn_options); + auto cf = db->DefaultColumnFamily(); + Status s; + + // Get a range lock for the range we are about to update + ASSERT_OK(txn0->GetRangeLock(cf, Endpoint("a"), Endpoint("c"))); + + bool try_range_lock_called = false; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "RangeTreeLockManager::TryRangeLock:enter", + [&](void* /*arg*/) { try_range_lock_called = true; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // For performance reasons, the following must NOT call lock_mgr->TryLock(): + // We verify that by checking the value of try_range_lock_called. 
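+  // assume_tracked=true is the caller's promise that a lock covering the key
+  // is already held (here via the GetRangeLock() call above), so Put() can
+  // skip the per-key locking path.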
+  ASSERT_OK(txn0->Put(cf, Slice("b"), Slice("value"),
+                      /*assume_tracked=*/true));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  ASSERT_FALSE(try_range_lock_called);
+
+  txn0->Rollback();
+
+  delete txn0;
+}
+
+TEST_F(RangeLockingTest, UpgradeLockAndGetConflict) {
+  WriteOptions write_options;
+  TransactionOptions txn_options;
+  auto cf = db->DefaultColumnFamily();
+  Status s;
+  std::string value;
+  txn_options.lock_timeout = 10;
+
+  Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+
+  // Get the shared lock in txn0
+  s = txn0->GetForUpdate(ReadOptions(), cf, Slice("a"), &value,
+                         false /*exclusive*/);
+  ASSERT_TRUE(s.IsNotFound());
+
+  // Get the shared lock on the same key in txn1
+  s = txn1->GetForUpdate(ReadOptions(), cf, Slice("a"), &value,
+                         false /*exclusive*/);
+  ASSERT_TRUE(s.IsNotFound());
+
+  // Now, try getting an exclusive lock that overlaps with the above
+  s = txn0->GetRangeLock(cf, Endpoint("a"), Endpoint("b"));
+  ASSERT_TRUE(s.IsTimedOut());
+
+  txn0->Rollback();
+  txn1->Rollback();
+
+  delete txn0;
+  delete txn1;
+}
+
+TEST_F(RangeLockingTest, SnapshotValidation) {
+  Status s;
+  Slice key_slice = Slice("k");
+  ColumnFamilyHandle* cfh = db->DefaultColumnFamily();
+
+  auto txn0 = NewTxn();
+  txn0->Put(key_slice, Slice("initial"));
+  txn0->Commit();
+
+  // txn1
+  auto txn1 = NewTxn();
+  txn1->SetSnapshot();
+  std::string val1;
+  ASSERT_OK(txn1->Get(ReadOptions(), cfh, key_slice, &val1));
+  ASSERT_EQ(val1, "initial");
+  val1 = val1 + std::string("-txn1");
+
+  ASSERT_OK(txn1->Put(cfh, key_slice, Slice(val1)));
+
+  // txn2
+  auto txn2 = NewTxn();
+  txn2->SetSnapshot();
+  std::string val2;
+  // This will see the original value, as nothing is committed yet.
+  // This is also a Get, so it doesn't acquire any locks.
+  ASSERT_OK(txn2->Get(ReadOptions(), cfh, key_slice, &val2));
+  ASSERT_EQ(val2, "initial");
+
+  // txn1
+  ASSERT_OK(txn1->Commit());
+
+  // txn2
+  val2 = val2 + std::string("-txn2");
+  // Now, this call should do Snapshot Validation and fail:
+  s = txn2->Put(cfh, key_slice, Slice(val2));
+  ASSERT_TRUE(s.IsBusy());
+
+  ASSERT_OK(txn2->Commit());
+
+  delete txn0;
+  delete txn1;
+  delete txn2;
+}
+
+TEST_F(RangeLockingTest, MultipleTrxLockStatusData) {
+  WriteOptions write_options;
+  TransactionOptions txn_options;
+  auto cf = db->DefaultColumnFamily();
+
+  Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+
+  // Get a range lock
+  ASSERT_OK(txn0->GetRangeLock(cf, Endpoint("z"), Endpoint("z")));
+  ASSERT_OK(txn1->GetRangeLock(cf, Endpoint("b"), Endpoint("e")));
+
+  auto s = range_lock_mgr->GetRangeLockStatusData();
+  ASSERT_EQ(s.size(), 2);
+  for (auto it = s.begin(); it != s.end(); ++it) {
+    ASSERT_EQ(it->first, cf->GetID());
+    auto val = it->second;
+    ASSERT_FALSE(val.start.inf_suffix);
+    ASSERT_FALSE(val.end.inf_suffix);
+    ASSERT_TRUE(val.exclusive);
+    ASSERT_EQ(val.ids.size(), 1);
+    if (val.ids[0] == txn0->GetID()) {
+      ASSERT_EQ(val.start.slice, "z");
+      ASSERT_EQ(val.end.slice, "z");
+    } else if (val.ids[0] == txn1->GetID()) {
+      ASSERT_EQ(val.start.slice, "b");
+      ASSERT_EQ(val.end.slice, "e");
+    } else {
+      FAIL();  // Unknown transaction ID.
+ } + } + + delete txn0; + delete txn1; +} + +#if defined(__has_feature) +#if __has_feature(thread_sanitizer) +#define SKIP_LOCK_ESCALATION_TEST 1 +#endif +#else +#define SKIP_LOCK_ESCALATION_TEST 1 +#endif + +#ifndef SKIP_LOCK_ESCALATION_TEST +TEST_F(RangeLockingTest, BasicLockEscalation) { + auto cf = db->DefaultColumnFamily(); + + auto counters = range_lock_mgr->GetStatus(); + + // Initially not using any lock memory + ASSERT_EQ(counters.current_lock_memory, 0); + ASSERT_EQ(counters.escalation_count, 0); + + ASSERT_EQ(0, range_lock_mgr->SetMaxLockMemory(2000)); + + // Insert until we see lock escalations + auto txn = NewTxn(); + + // Get the locks until we hit an escalation + for (int i = 0; i < 2020; i++) { + std::ostringstream buf; + buf << std::setw(8) << std::setfill('0') << i; + std::string buf_str = buf.str(); + ASSERT_OK(txn->GetRangeLock(cf, Endpoint(buf_str), Endpoint(buf_str))); + } + counters = range_lock_mgr->GetStatus(); + ASSERT_GT(counters.escalation_count, 0); + ASSERT_LE(counters.current_lock_memory, 2000); + + delete txn; +} + +// An escalation barrier function. Allow escalation iff the first two bytes are +// identical. +static bool escalation_barrier(const Endpoint& a, const Endpoint& b) { + assert(a.slice.size() > 2); + assert(b.slice.size() > 2); + if (memcmp(a.slice.data(), b.slice.data(), 2)) { + return true; // This is a barrier + } else { + return false; // No barrier + } +} + +TEST_F(RangeLockingTest, LockEscalationBarrier) { + auto cf = db->DefaultColumnFamily(); + + auto counters = range_lock_mgr->GetStatus(); + + // Initially not using any lock memory + ASSERT_EQ(counters.escalation_count, 0); + + range_lock_mgr->SetMaxLockMemory(8000); + range_lock_mgr->SetEscalationBarrierFunc(escalation_barrier); + + // Insert enough locks to cause lock escalations to happen + auto txn = NewTxn(); + const int N = 2000; + for (int i = 0; i < N; i++) { + std::ostringstream buf; + buf << std::setw(4) << std::setfill('0') << i; + std::string buf_str = buf.str(); + ASSERT_OK(txn->GetRangeLock(cf, Endpoint(buf_str), Endpoint(buf_str))); + } + counters = range_lock_mgr->GetStatus(); + ASSERT_GT(counters.escalation_count, 0); + + // Check that lock escalation was not performed across escalation barriers: + // Use another txn to acquire locks near the barriers. 
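+  // Keys are 4-digit strings, so at every multiple of 100 the neighboring
+  // keys (e.g. "0099" and "0100") differ in their first two bytes, which the
+  // barrier function above treats as an escalation boundary. Locks such as
+  // "0099-a" must therefore still be grantable to another transaction.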
+ auto txn2 = NewTxn(); + range_lock_mgr->SetMaxLockMemory(500000); + for (int i = 100; i < N; i += 100) { + std::ostringstream buf; + buf << std::setw(4) << std::setfill('0') << i - 1 << "-a"; + std::string buf_str = buf.str(); + // Check that we CAN get a lock near the escalation barrier + ASSERT_OK(txn2->GetRangeLock(cf, Endpoint(buf_str), Endpoint(buf_str))); + } + + txn->Rollback(); + txn2->Rollback(); + delete txn; + delete txn2; +} + +#endif + +TEST_F(RangeLockingTest, LockWaitCount) { + TransactionOptions txn_options; + auto cf = db->DefaultColumnFamily(); + txn_options.lock_timeout = 50; + Transaction* txn0 = db->BeginTransaction(WriteOptions(), txn_options); + Transaction* txn1 = db->BeginTransaction(WriteOptions(), txn_options); + + // Get a range lock + ASSERT_OK(txn0->GetRangeLock(cf, Endpoint("a"), Endpoint("c"))); + + uint64_t lock_waits1 = range_lock_mgr->GetStatus().lock_wait_count; + // Attempt to get a conflicting lock + auto s = txn1->GetRangeLock(cf, Endpoint("b"), Endpoint("z")); + ASSERT_TRUE(s.IsTimedOut()); + + // Check that the counter was incremented + uint64_t lock_waits2 = range_lock_mgr->GetStatus().lock_wait_count; + ASSERT_EQ(lock_waits1 + 1, lock_waits2); + + txn0->Rollback(); + txn1->Rollback(); + + delete txn0; + delete txn1; +} + +TEST_F(RangeLockingTest, LockWaiteeAccess) { + TransactionOptions txn_options; + auto cf = db->DefaultColumnFamily(); + txn_options.lock_timeout = 60; + Transaction* txn0 = db->BeginTransaction(WriteOptions(), txn_options); + Transaction* txn1 = db->BeginTransaction(WriteOptions(), txn_options); + + // Get a range lock + ASSERT_OK(txn0->GetRangeLock(cf, Endpoint("a"), Endpoint("c"))); + + std::atomic<bool> reached(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "RangeTreeLockManager::TryRangeLock:EnterWaitingTxn", [&](void* /*arg*/) { + reached.store(true); + std::this_thread::sleep_for(std::chrono::milliseconds(2000)); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + port::Thread t([&]() { + // Attempt to get a conflicting lock + auto s = txn1->GetRangeLock(cf, Endpoint("b"), Endpoint("z")); + ASSERT_TRUE(s.ok()); + txn1->Rollback(); + }); + + while (!reached.load()) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Release locks and free the transaction + txn0->Rollback(); + delete txn0; + + t.join(); + + delete txn1; +} + +void PointLockManagerTestExternalSetup(PointLockManagerTest* self) { + self->env_ = Env::Default(); + self->db_dir_ = test::PerThreadDBPath("point_lock_manager_test"); + ASSERT_OK(self->env_->CreateDir(self->db_dir_)); + + Options opt; + opt.create_if_missing = true; + TransactionDBOptions txn_opt; + txn_opt.transaction_lock_timeout = 0; + + auto mutex_factory = std::make_shared<TransactionDBMutexFactoryImpl>(); + self->locker_.reset(NewRangeLockManager(mutex_factory)->getLockManager()); + std::shared_ptr<RangeLockManagerHandle> range_lock_mgr = + std::dynamic_pointer_cast<RangeLockManagerHandle>(self->locker_); + txn_opt.lock_mgr_handle = range_lock_mgr; + + ASSERT_OK(TransactionDB::Open(opt, txn_opt, self->db_dir_, &self->db_)); + self->wait_sync_point_name_ = "RangeTreeLockManager::TryRangeLock:WaitingTxn"; +} + +INSTANTIATE_TEST_CASE_P(RangeLockManager, AnyLockManagerTest, + ::testing::Values(PointLockManagerTestExternalSetup)); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** 
argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else // OS_WIN + +#include <stdio.h> +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "skipped as Range Locking is not supported on Windows\n"); + return 0; +} + +#endif // OS_WIN + +#else +#include <stdio.h> + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "skipped as transactions are not supported in rocksdb_lite\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.AGPLv3 b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.AGPLv3 new file mode 100644 index 000000000..dba13ed2d --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.AGPLv3 @@ -0,0 +1,661 @@ + GNU AFFERO GENERAL PUBLIC LICENSE + Version 3, 19 November 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU Affero General Public License is a free, copyleft license for +software and other kinds of works, specifically designed to ensure +cooperation with the community in the case of network server software. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +our General Public Licenses are intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + Developers that use our General Public Licenses protect your rights +with two steps: (1) assert copyright on the software, and (2) offer +you this License which gives you legal permission to copy, distribute +and/or modify the software. + + A secondary benefit of defending all users' freedom is that +improvements made in alternate versions of the program, if they +receive widespread use, become available for other developers to +incorporate. Many developers of free software are heartened and +encouraged by the resulting cooperation. However, in the case of +software used on network servers, this result may fail to come about. +The GNU General Public License permits making a modified version and +letting the public access it on a server without ever releasing its +source code to the public. + + The GNU Affero General Public License is designed specifically to +ensure that, in such cases, the modified source code becomes available +to the community. It requires the operator of a network server to +provide the source code of the modified version running there to the +users of that server. Therefore, public use of a modified version, on +a publicly accessible server, gives the public access to the source +code of the modified version. + + An older license, called the Affero General Public License and +published by Affero, was designed to accomplish similar goals. 
This is +a different license, not a version of the Affero GPL, but Affero has +released a new version of the Affero GPL which permits relicensing under +this license. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU Affero General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. 
+ + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. 
If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). 
To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Remote Network Interaction; Use with the GNU General Public License. 
+ + Notwithstanding any other provision of this License, if you modify the +Program, your modified version must prominently offer all users +interacting with it remotely through a computer network (if your version +supports such interaction) an opportunity to receive the Corresponding +Source of your version by providing access to the Corresponding Source +from a network server at no charge, through some standard or customary +means of facilitating copying of software. This Corresponding Source +shall include the Corresponding Source for any work covered by version 3 +of the GNU General Public License that is incorporated pursuant to the +following paragraph. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the work with which it is combined will remain governed by version +3 of the GNU General Public License. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU Affero General Public License from time to time. Such new versions +will be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU Affero General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU Affero General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU Affero General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. 
+ + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If your software can interact with users remotely through a computer +network, you should also make sure that it provides a way for users to +get its source. For example, if your program is a web application, its +interface could display a "Source" link that leads users to an archive +of the code. There are many ways you could offer source, and different +solutions will be better for different programs; see section 13 for the +specific requirements. + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU AGPL, see +<http://www.gnu.org/licenses/>. 
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.APACHEv2 b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.APACHEv2 new file mode 100644 index 000000000..ecbfc770f --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.APACHEv2 @@ -0,0 +1,174 @@ +Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.GPLv2 b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.GPLv2 new file mode 100644 index 000000000..d511905c1 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.GPLv2 @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. 
The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. 
+ +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. 
Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/README b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/README
new file mode 100644
index 000000000..2ea86bf46
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/README
@@ -0,0 +1,13 @@
+The files in this directory originally come from
+https://github.com/percona/PerconaFT/.
+
+This directory only includes the "locktree" part of PerconaFT, and its
+dependencies.
+
+The following modifications were made:
+- Make locktree usable outside of PerconaFT library
+- Add shared read-only lock support
+
+The files named *_subst.* are substitutes for PerconaFT's files; they
+contain replacements for PerconaFT's functionality.
+
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/db.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/db.h
new file mode 100644
index 000000000..5aa826c8e
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/db.h
@@ -0,0 +1,76 @@
+#ifndef _DB_H
+#define _DB_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+typedef struct __toku_dbt DBT;
+
+// port: this is currently not used
+struct simple_dbt {
+  uint32_t len;
+  void *data;
+};
+
+// engine status info
+// engine status is passed to handlerton as an array of
+// TOKU_ENGINE_STATUS_ROW_S[]
+typedef enum {
+  STATUS_FS_STATE = 0,  // interpret as file system state (redzone) enum
+  STATUS_UINT64,        // interpret as uint64_t
+  STATUS_CHARSTR,       // interpret as char *
+  STATUS_UNIXTIME,      // interpret as time_t
+  STATUS_TOKUTIME,      // interpret as tokutime_t
+  STATUS_PARCOUNT,      // interpret as PARTITIONED_COUNTER
+  STATUS_DOUBLE         // interpret as double
+} toku_engine_status_display_type;
+
+typedef enum {
+  TOKU_ENGINE_STATUS = (1ULL << 0),  // Include when asking for engine status
+  TOKU_GLOBAL_STATUS =
+      (1ULL << 1),  // Include when asking for information_schema.global_status
+} toku_engine_status_include_type;
+
+typedef struct __toku_engine_status_row {
+  const char *keyname;  // info schema key, should not change across revisions
+                        // without good reason
+  const char
+      *columnname;  // column for mysql, e.g. information_schema.global_status.
+                    // TOKUDB_ will automatically be prefixed.
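+                    // (hypothetical illustration: a row whose columnname is
+                    // "LOCK_MEMORY" would surface in
+                    // information_schema.global_status as "TOKUDB_LOCK_MEMORY")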
+  const char *legend;  // the text that will appear in the user interface
+  toku_engine_status_display_type type;  // how to interpret the value
+  toku_engine_status_include_type
+      include;  // which kinds of callers should get to read this row?
+  union {
+    double dnum;
+    uint64_t num;
+    const char *str;
+    char datebuf[26];
+    struct partitioned_counter *parcount;
+  } value;
+} * TOKU_ENGINE_STATUS_ROW, TOKU_ENGINE_STATUS_ROW_S;
+
+#define DB_BUFFER_SMALL -30999
+#define DB_LOCK_DEADLOCK -30995
+#define DB_LOCK_NOTGRANTED -30994
+#define DB_NOTFOUND -30989
+#define DB_KEYEXIST -30996
+#define DB_DBT_MALLOC 8
+#define DB_DBT_REALLOC 64
+#define DB_DBT_USERMEM 256
+
+/* PerconaFT specific error codes */
+#define TOKUDB_OUT_OF_LOCKS -100000
+
+typedef void (*lock_wait_callback)(void *arg, uint64_t requesting_txnid,
+                                   uint64_t blocking_txnid);
+
+struct __toku_dbt {
+  void *data;
+  size_t size;
+  size_t ulen;
+  // One of DB_DBT_XXX flags
+  uint32_t flags;
+};
+
+#endif
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/comparator.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/comparator.h
new file mode 100644
index 000000000..718efc623
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/comparator.h
@@ -0,0 +1,138 @@
+/* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
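+
+// Usage sketch (illustrative only -- `my_cmp' and the DBT variables are
+// hypothetical; a real comparison function would implement its own ordering):
+//
+//   static int my_cmp(void *arg, const DBT *a, const DBT *b) {
+//     (void)arg;                    // no comparison state needed here
+//     return toku_builtin_compare_fun(a, b);
+//   }
+//
+//   toku::comparator cmp;
+//   cmp.create(my_cmp, nullptr);    // no cmp_arg, no memcmp magic
+//   int r = cmp(&a_dbt, &b_dbt);    // negative/zero/positive ordering
+//   cmp.destroy();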
+
+#pragma once
+
+#include <string.h>
+
+#include "../db.h"
+#include "../portability/memory.h"
+#include "../util/dbt.h"
+
+typedef int (*ft_compare_func)(void *arg, const DBT *a, const DBT *b);
+
+int toku_keycompare(const void *key1, size_t key1len, const void *key2,
+                    size_t key2len);
+
+int toku_builtin_compare_fun(const DBT *, const DBT *)
+    __attribute__((__visibility__("default")));
+
+namespace toku {
+
+// a comparator object encapsulates the data necessary for
+// comparing two keys in a fractal tree. it further understands
+// that points may be positive or negative infinity.
+
+class comparator {
+  void init(ft_compare_func cmp, void *cmp_arg, uint8_t memcmp_magic) {
+    _cmp = cmp;
+    _cmp_arg = cmp_arg;
+    _memcmp_magic = memcmp_magic;
+  }
+
+ public:
+  // This magic value is reserved to mean that the magic has not been set.
+  static const uint8_t MEMCMP_MAGIC_NONE = 0;
+
+  void create(ft_compare_func cmp, void *cmp_arg,
+              uint8_t memcmp_magic = MEMCMP_MAGIC_NONE) {
+    init(cmp, cmp_arg, memcmp_magic);
+  }
+
+  // inherit the attributes of another comparator, taking on its
+  // comparison function, argument, and memcmp magic.
+  void inherit(const comparator &cmp) {
+    invariant_notnull(cmp._cmp);
+    init(cmp._cmp, cmp._cmp_arg, cmp._memcmp_magic);
+  }
+
+  // like inherit, but doesn't require that this comparator
+  // was already created
+  void create_from(const comparator &cmp) { inherit(cmp); }
+
+  void destroy() {}
+
+  ft_compare_func get_compare_func() const { return _cmp; }
+
+  uint8_t get_memcmp_magic() const { return _memcmp_magic; }
+
+  bool valid() const { return _cmp != nullptr; }
+
+  inline bool dbt_has_memcmp_magic(const DBT *dbt) const {
+    return *reinterpret_cast<const char *>(dbt->data) == _memcmp_magic;
+  }
+
+  int operator()(const DBT *a, const DBT *b) const {
+    if (__builtin_expect(toku_dbt_is_infinite(a) || toku_dbt_is_infinite(b),
+                         0)) {
+      return toku_dbt_infinite_compare(a, b);
+    } else if (_memcmp_magic != MEMCMP_MAGIC_NONE
+               // If `a' has the memcmp magic..
+               && dbt_has_memcmp_magic(a)
+               // ..then we expect `b' to also have the memcmp magic
+               && __builtin_expect(dbt_has_memcmp_magic(b), 1)) {
+      assert(0);  // psergey: this branch should not be taken.
+      return toku_builtin_compare_fun(a, b);
+    } else {
+      // yikes, const sadness here
+      return _cmp(_cmp_arg, a, b);
+    }
+  }
+
+ private:
+  ft_compare_func _cmp;
+  void *_cmp_arg;
+
+  uint8_t _memcmp_magic;
+};
+
+} /* namespace toku */
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/ft-status.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/ft-status.h
new file mode 100644
index 000000000..1b4511172
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/ft-status.h
@@ -0,0 +1,102 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+ + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include "../db.h" +#include "../portability/toku_race_tools.h" +#include "../util/status.h" + +// +// Lock Tree Manager statistics +// +class LTM_STATUS_S { + public: + enum { + LTM_SIZE_CURRENT = 0, + LTM_SIZE_LIMIT, + LTM_ESCALATION_COUNT, + LTM_ESCALATION_TIME, + LTM_ESCALATION_LATEST_RESULT, + LTM_NUM_LOCKTREES, + LTM_LOCK_REQUESTS_PENDING, + LTM_STO_NUM_ELIGIBLE, + LTM_STO_END_EARLY_COUNT, + LTM_STO_END_EARLY_TIME, + LTM_WAIT_COUNT, + LTM_WAIT_TIME, + LTM_LONG_WAIT_COUNT, + LTM_LONG_WAIT_TIME, + LTM_TIMEOUT_COUNT, + LTM_WAIT_ESCALATION_COUNT, + LTM_WAIT_ESCALATION_TIME, + LTM_LONG_WAIT_ESCALATION_COUNT, + LTM_LONG_WAIT_ESCALATION_TIME, + LTM_STATUS_NUM_ROWS // must be last + }; + + void init(void); + void destroy(void); + + TOKU_ENGINE_STATUS_ROW_S status[LTM_STATUS_NUM_ROWS]; + + private: + bool m_initialized = false; +}; +typedef LTM_STATUS_S* LTM_STATUS; +extern LTM_STATUS_S ltm_status; + +#define LTM_STATUS_VAL(x) ltm_status.status[LTM_STATUS_S::x].value.num + +void toku_status_init(void); // just call ltm_status.init(); +void toku_status_destroy(void); // just call ltm_status.destroy(); diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc new file mode 100644 index 000000000..5110cd482 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc @@ -0,0 +1,139 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ifndef ROCKSDB_LITE +#ifndef OS_WIN +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. 
+ + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include "concurrent_tree.h" + +// PORT #include <toku_assert.h> +namespace toku { + +void concurrent_tree::create(const comparator *cmp) { + // start with an empty root node. we do this instead of + // setting m_root to null so there's always a root to lock + m_root.create_root(cmp); +} + +void concurrent_tree::destroy(void) { m_root.destroy_root(); } + +bool concurrent_tree::is_empty(void) { return m_root.is_empty(); } + +uint64_t concurrent_tree::get_insertion_memory_overhead(void) { + return sizeof(treenode); +} + +void concurrent_tree::locked_keyrange::prepare(concurrent_tree *tree) { + // the first step in acquiring a locked keyrange is locking the root + treenode *const root = &tree->m_root; + m_tree = tree; + m_subtree = root; + m_range = keyrange::get_infinite_range(); + root->mutex_lock(); +} + +void concurrent_tree::locked_keyrange::acquire(const keyrange &range) { + treenode *const root = &m_tree->m_root; + + treenode *subtree; + if (root->is_empty() || root->range_overlaps(range)) { + subtree = root; + } else { + // we do not have a precomputed comparison hint, so pass null + const keyrange::comparison *cmp_hint = nullptr; + subtree = root->find_node_with_overlapping_child(range, cmp_hint); + } + + // subtree is locked. 
it will be unlocked when this is release()'d + invariant_notnull(subtree); + m_range = range; + m_subtree = subtree; +} + +bool concurrent_tree::locked_keyrange::add_shared_owner(const keyrange &range, + TXNID new_owner) { + return m_subtree->insert(range, new_owner, /*is_shared*/ true); +} + +void concurrent_tree::locked_keyrange::release(void) { + m_subtree->mutex_unlock(); +} + +void concurrent_tree::locked_keyrange::insert(const keyrange &range, + TXNID txnid, bool is_shared) { + // empty means no children, and only the root should ever be empty + if (m_subtree->is_empty()) { + m_subtree->set_range_and_txnid(range, txnid, is_shared); + } else { + m_subtree->insert(range, txnid, is_shared); + } +} + +void concurrent_tree::locked_keyrange::remove(const keyrange &range, + TXNID txnid) { + invariant(!m_subtree->is_empty()); + treenode *new_subtree = m_subtree->remove(range, txnid); + // if removing range changed the root of the subtree, + // then the subtree must be the root of the entire tree. + if (new_subtree == nullptr) { + invariant(m_subtree->is_root()); + invariant(m_subtree->is_empty()); + } +} + +void concurrent_tree::locked_keyrange::remove_all(void) { + m_subtree->recursive_remove(); +} + +} /* namespace toku */ +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.h new file mode 100644 index 000000000..e1bfb86c5 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.h @@ -0,0 +1,174 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include "../ft/comparator.h"
+#include "keyrange.h"
+#include "treenode.h"
+
+namespace toku {
+
+// A concurrent_tree stores non-overlapping ranges.
+// Access to disjoint parts of the tree usually occurs concurrently.
+
+class concurrent_tree {
+ public:
+  // A locked_keyrange gives you exclusive access to read and write
+  // operations that occur on any keys in that range. You only have
+  // the right to operate on keys in that range or keys that were read
+  // from the keyrange using iterate()
+  //
+  // Access model:
+  // - user prepares a locked keyrange. all threads serialize behind prepare().
+  // - user breaks the serialization point by acquiring a range, or releasing.
+  // - one thread operates on a certain locked_keyrange object at a time.
+  // - when the thread is finished, it releases
+
+  class locked_keyrange {
+   public:
+    // effect: prepare to acquire a locked keyrange over the given
+    //         concurrent_tree, preventing other threads from preparing
+    //         until this thread either does acquire() or release().
+    // note: operations performed on a prepared keyrange are equivalent
+    //       to ones performed on an acquired keyrange over -inf, +inf.
+    // rationale: this provides the user with a serialization point for
+    // descending
+    //            or modifying the tree. it also provides a convenient way of
+    //            doing serializable operations on the tree.
+    // There are two valid sequences of calls:
+    // - prepare, acquire, [operations], release
+    // - prepare, [operations], release
+    void prepare(concurrent_tree *tree);
+
+    // requires: the locked keyrange was prepare()'d
+    // effect: acquire a locked keyrange over the given concurrent_tree.
+    //         the locked keyrange represents the range of keys overlapped
+    //         by the given range
+    void acquire(const keyrange &range);
+
+    // effect: releases a locked keyrange and the mutex it holds
+    void release(void);
+
+    // effect: iterate over each range this locked_keyrange represents,
+    //         calling function->fn() on each node's keyrange and txnid
+    //         until there are no more or the function returns false
+    template <class F>
+    void iterate(F *function) const {
+      // if the subtree is non-empty, traverse it by calling the given
+      // function on each range, txnid pair found that overlaps.
+      if (!m_subtree->is_empty()) {
+        m_subtree->traverse_overlaps(m_range, function);
+      }
+    }
+
+    // Adds another owner to the lock on the specified keyrange.
+    // requires: the keyrange contains one treenode whose bounds are
+    //           exactly equal to the specified range (no sub/supersets)
+    bool add_shared_owner(const keyrange &range, TXNID new_owner);
+
+    // inserts the given range into the tree, with an associated txnid.
+    // requires: range does not overlap with anything in this locked_keyrange
+    // rationale: caller is responsible for only inserting unique ranges
+    void insert(const keyrange &range, TXNID txnid, bool is_shared);
+
+    // effect: removes the given range from the tree.
+ // - txnid=TXNID_ANY means remove the range no matter what its + // owners are + // - Other value means remove the specified txnid from + // ownership (if the range has other owners, it will remain + // in the tree) + // requires: range exists exactly in this locked_keyrange + // rationale: caller is responsible for only removing existing ranges + void remove(const keyrange &range, TXNID txnid); + + // effect: removes all of the keys represented by this locked keyrange + // rationale: we'd like a fast way to empty out a tree + void remove_all(void); + + private: + // the concurrent tree this locked keyrange is for + concurrent_tree *m_tree; + + // the range of keys this locked keyrange represents + keyrange m_range; + + // the subtree under which all overlapping ranges exist + treenode *m_subtree; + + friend class concurrent_tree_unit_test; + }; + + // effect: initialize the tree to an empty state + void create(const comparator *cmp); + + // effect: destroy the tree. + // requires: tree is empty + void destroy(void); + + // returns: true iff the tree is empty + bool is_empty(void); + + // returns: the memory overhead of a single insertion into the tree + static uint64_t get_insertion_memory_overhead(void); + + private: + // the root needs to always exist so there's a lock to grab + // even if the tree is empty. that's why we store a treenode + // here and not a pointer to one. + treenode m_root; + + friend class concurrent_tree_unit_test; +}; + +} /* namespace toku */ diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc new file mode 100644 index 000000000..e50ace5a9 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc @@ -0,0 +1,222 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ifndef ROCKSDB_LITE +#ifndef OS_WIN +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include "keyrange.h" + +#include "../util/dbt.h" + +namespace toku { + +// create a keyrange by borrowing the left and right dbt +// pointers. no memory is copied. no checks for infinity needed. +void keyrange::create(const DBT *left, const DBT *right) { + init_empty(); + m_left_key = left; + m_right_key = right; +} + +// destroy the key copies. if they were never set, then destroy does nothing. +void keyrange::destroy(void) { + toku_destroy_dbt(&m_left_key_copy); + toku_destroy_dbt(&m_right_key_copy); +} + +// create a keyrange by copying the keys from the given range. +void keyrange::create_copy(const keyrange &range) { + // start with an initialized, empty range + init_empty(); + + // optimize the case where the left and right keys are the same. + // we'd like to only have one copy of the data. + if (toku_dbt_equals(range.get_left_key(), range.get_right_key())) { + set_both_keys(range.get_left_key()); + } else { + // replace our empty left and right keys with + // copies of the range's left and right keys + replace_left_key(range.get_left_key()); + replace_right_key(range.get_right_key()); + } +} + +// extend this keyrange by choosing the leftmost and rightmost +// endpoints between this range and the given. replaced keys +// in this range are freed and inherited keys are copied. +void keyrange::extend(const comparator &cmp, const keyrange &range) { + const DBT *range_left = range.get_left_key(); + const DBT *range_right = range.get_right_key(); + if (cmp(range_left, get_left_key()) < 0) { + replace_left_key(range_left); + } + if (cmp(range_right, get_right_key()) > 0) { + replace_right_key(range_right); + } +} + +// how much memory does this keyrange take? +// - the size of the left and right keys +// --- ignore the fact that we may have optimized the point case. +// it complicates things for little gain. +// - the size of the keyrange class itself +uint64_t keyrange::get_memory_size(void) const { + const DBT *left_key = get_left_key(); + const DBT *right_key = get_right_key(); + return left_key->size + right_key->size + sizeof(keyrange); +} + +// compare ranges. +keyrange::comparison keyrange::compare(const comparator &cmp, + const keyrange &range) const { + if (cmp(get_right_key(), range.get_left_key()) < 0) { + return comparison::LESS_THAN; + } else if (cmp(get_left_key(), range.get_right_key()) > 0) { + return comparison::GREATER_THAN; + } else if (cmp(get_left_key(), range.get_left_key()) == 0 && + cmp(get_right_key(), range.get_right_key()) == 0) { + return comparison::EQUALS; + } else { + return comparison::OVERLAPS; + } +} + +bool keyrange::overlaps(const comparator &cmp, const keyrange &range) const { + // equality is a stronger form of overlapping. + // so two ranges "overlap" if they're either equal or just overlapping. 
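+  // for example, [1,5] and [3,8] overlap, and so do [1,5] and [5,9]
+  // (endpoints are inclusive), while [1,5] and [6,9] do not.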
+ comparison c = compare(cmp, range); + return c == comparison::EQUALS || c == comparison::OVERLAPS; +} + +keyrange keyrange::get_infinite_range(void) { + keyrange range; + range.create(toku_dbt_negative_infinity(), toku_dbt_positive_infinity()); + return range; +} + +void keyrange::init_empty(void) { + m_left_key = nullptr; + m_right_key = nullptr; + toku_init_dbt(&m_left_key_copy); + toku_init_dbt(&m_right_key_copy); + m_point_range = false; +} + +const DBT *keyrange::get_left_key(void) const { + if (m_left_key) { + return m_left_key; + } else { + return &m_left_key_copy; + } +} + +const DBT *keyrange::get_right_key(void) const { + if (m_right_key) { + return m_right_key; + } else { + return &m_right_key_copy; + } +} + +// copy the given once and set both the left and right pointers. +// optimization for point ranges, so the left and right ranges +// are not copied twice. +void keyrange::set_both_keys(const DBT *key) { + if (toku_dbt_is_infinite(key)) { + m_left_key = key; + m_right_key = key; + } else { + toku_clone_dbt(&m_left_key_copy, *key); + toku_copyref_dbt(&m_right_key_copy, m_left_key_copy); + } + m_point_range = true; +} + +// destroy the current left key. set and possibly copy the new one +void keyrange::replace_left_key(const DBT *key) { + // a little magic: + // + // if this is a point range, then the left and right keys share + // one copy of the data, and it lives in the left key copy. so + // if we're replacing the left key, move the real data to the + // right key copy instead of destroying it. now, the memory is + // owned by the right key and the left key may be replaced. + if (m_point_range) { + m_right_key_copy = m_left_key_copy; + } else { + toku_destroy_dbt(&m_left_key_copy); + } + + if (toku_dbt_is_infinite(key)) { + m_left_key = key; + } else { + toku_clone_dbt(&m_left_key_copy, *key); + m_left_key = nullptr; + } + m_point_range = false; +} + +// destroy the current right key. set and possibly copy the new one +void keyrange::replace_right_key(const DBT *key) { + toku_destroy_dbt(&m_right_key_copy); + if (toku_dbt_is_infinite(key)) { + m_right_key = key; + } else { + toku_clone_dbt(&m_right_key_copy, *key); + m_right_key = nullptr; + } + m_point_range = false; +} + +} /* namespace toku */ +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.h new file mode 100644 index 000000000..f9aeea0c4 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.h @@ -0,0 +1,141 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. 
+ +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include "../ft/comparator.h" + +namespace toku { + +// A keyrange has a left and right key as endpoints. +// +// When a keyrange is created it owns no memory, but when it copies +// or extends another keyrange, it copies memory as necessary. This +// means it is cheap in the common case. + +class keyrange { + public: + // effect: constructor that borrows left and right key pointers. + // no memory is allocated or copied. + void create(const DBT *left_key, const DBT *right_key); + + // effect: constructor that allocates and copies another keyrange's points. + void create_copy(const keyrange &range); + + // effect: destroys the keyrange, freeing any allocated memory + void destroy(void); + + // effect: extends the keyrange by choosing the leftmost and rightmost + // endpoints from this range and the given range. + // replaced keys in this range are freed, new keys are copied. + void extend(const comparator &cmp, const keyrange &range); + + // returns: the amount of memory this keyrange takes. does not account + // for point optimizations or malloc overhead. + uint64_t get_memory_size(void) const; + + // returns: pointer to the left key of this range + const DBT *get_left_key(void) const; + + // returns: pointer to the right key of this range + const DBT *get_right_key(void) const; + + // two ranges are either equal, lt, gt, or overlapping + enum comparison { EQUALS, LESS_THAN, GREATER_THAN, OVERLAPS }; + + // effect: compares this range to the given range + // returns: LESS_THAN if given range is strictly to the left + // GREATER_THAN if given range is strictly to the right + // EQUALS if given range has the same left and right endpoints + // OVERLAPS if at least one of the given range's endpoints falls + // between this range's endpoints + comparison compare(const comparator &cmp, const keyrange &range) const; + + // returns: true if the range and the given range are equal or overlapping + bool overlaps(const comparator &cmp, const keyrange &range) const; + + // returns: a keyrange representing -inf, +inf + static keyrange get_infinite_range(void); + + private: + // some keys should be copied, some keys should not be. 
+ // + // to support both, we use two DBTs for copies and two pointers + // for temporaries. the access rule is: + // - if a pointer is non-null, then it reprsents the key. + // - otherwise the pointer is null, and the key is in the copy. + DBT m_left_key_copy; + DBT m_right_key_copy; + const DBT *m_left_key; + const DBT *m_right_key; + + // if this range is a point range, then m_left_key == m_right_key + // and the actual data is stored exactly once in m_left_key_copy. + bool m_point_range; + + // effect: initializes a keyrange to be empty + void init_empty(void); + + // effect: copies the given key once into the left key copy + // and sets the right key copy to share the left. + // rationale: optimization for point ranges to only do one malloc + void set_both_keys(const DBT *key); + + // effect: destroys the current left key. sets and copies the new one. + void replace_left_key(const DBT *key); + + // effect: destroys the current right key. sets and copies the new one. + void replace_right_key(const DBT *key); +}; + +} /* namespace toku */ diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc new file mode 100644 index 000000000..3d217be70 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc @@ -0,0 +1,527 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2: +#ifndef ROCKSDB_LITE +#ifndef OS_WIN +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include "lock_request.h" + +#include "../portability/toku_race_tools.h" +#include "../portability/txn_subst.h" +#include "../util/dbt.h" +#include "locktree.h" + +namespace toku { + +// initialize a lock request's internals +void lock_request::create(toku_external_mutex_factory_t mutex_factory) { + m_txnid = TXNID_NONE; + m_conflicting_txnid = TXNID_NONE; + m_start_time = 0; + m_left_key = nullptr; + m_right_key = nullptr; + toku_init_dbt(&m_left_key_copy); + toku_init_dbt(&m_right_key_copy); + + m_type = type::UNKNOWN; + m_lt = nullptr; + + m_complete_r = 0; + m_state = state::UNINITIALIZED; + m_info = nullptr; + + // psergey-todo: this condition is for interruptible wait + // note: moved to here from lock_request::create: + toku_external_cond_init(mutex_factory, &m_wait_cond); + + m_start_test_callback = nullptr; + m_start_before_pending_test_callback = nullptr; + m_retry_test_callback = nullptr; +} + +// destroy a lock request. +void lock_request::destroy(void) { + invariant(m_state != state::PENDING); + invariant(m_state != state::DESTROYED); + m_state = state::DESTROYED; + toku_destroy_dbt(&m_left_key_copy); + toku_destroy_dbt(&m_right_key_copy); + toku_external_cond_destroy(&m_wait_cond); +} + +// set the lock request parameters. this API allows a lock request to be reused. +void lock_request::set(locktree *lt, TXNID txnid, const DBT *left_key, + const DBT *right_key, lock_request::type lock_type, + bool big_txn, void *extra) { + invariant(m_state != state::PENDING); + m_lt = lt; + + m_txnid = txnid; + m_left_key = left_key; + m_right_key = right_key; + toku_destroy_dbt(&m_left_key_copy); + toku_destroy_dbt(&m_right_key_copy); + m_type = lock_type; + m_state = state::INITIALIZED; + m_info = lt ? lt->get_lock_request_info() : nullptr; + m_big_txn = big_txn; + m_extra = extra; +} + +// get rid of any stored left and right key copies and +// replace them with copies of the given left and right key +void lock_request::copy_keys() { + if (!toku_dbt_is_infinite(m_left_key)) { + toku_clone_dbt(&m_left_key_copy, *m_left_key); + m_left_key = &m_left_key_copy; + } + if (!toku_dbt_is_infinite(m_right_key)) { + toku_clone_dbt(&m_right_key_copy, *m_right_key); + m_right_key = &m_right_key_copy; + } +} + +// what are the conflicts for this pending lock request? +void lock_request::get_conflicts(txnid_set *conflicts) { + invariant(m_state == state::PENDING); + const bool is_write_request = m_type == type::WRITE; + m_lt->get_conflicts(is_write_request, m_txnid, m_left_key, m_right_key, + conflicts); +} + +// build a wait-for-graph for this lock request and the given conflict set +// for each transaction B that blocks A's lock request +// if B is blocked then +// add (A,T) to the WFG and if B is new, fill in the WFG from B +void lock_request::build_wait_graph(wfg *wait_graph, + const txnid_set &conflicts) { + uint32_t num_conflicts = conflicts.size(); + for (uint32_t i = 0; i < num_conflicts; i++) { + TXNID conflicting_txnid = conflicts.get(i); + lock_request *conflicting_request = find_lock_request(conflicting_txnid); + invariant(conflicting_txnid != m_txnid); + invariant(conflicting_request != this); + if (conflicting_request) { + bool already_exists = wait_graph->node_exists(conflicting_txnid); + wait_graph->add_edge(m_txnid, conflicting_txnid); + if (!already_exists) { + // recursively build the wait for graph rooted at the conflicting + // request, given its set of lock conflicts. 
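+        // e.g., if this request (A) conflicts with B and B's own pending
+        // request conflicts with C, the edges (A,B) and (B,C) end up in
+        // the graph, so a cycle check from A can detect A -> B -> C -> A.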
+ txnid_set other_conflicts; + other_conflicts.create(); + conflicting_request->get_conflicts(&other_conflicts); + conflicting_request->build_wait_graph(wait_graph, other_conflicts); + other_conflicts.destroy(); + } + } + } +} + +// returns: true if the current set of lock requests contains +// a deadlock, false otherwise. +bool lock_request::deadlock_exists(const txnid_set &conflicts) { + wfg wait_graph; + wait_graph.create(); + + build_wait_graph(&wait_graph, conflicts); + + std::function<void(TXNID)> reporter; + if (m_deadlock_cb) { + reporter = [this](TXNID a) { + lock_request *req = find_lock_request(a); + if (req) { + m_deadlock_cb(req->m_txnid, (req->m_type == lock_request::WRITE), + req->m_left_key, req->m_right_key); + } + }; + } + + bool deadlock = wait_graph.cycle_exists_from_txnid(m_txnid, reporter); + wait_graph.destroy(); + return deadlock; +} + +// try to acquire a lock described by this lock request. +int lock_request::start(void) { + int r; + + txnid_set conflicts; + conflicts.create(); + if (m_type == type::WRITE) { + r = m_lt->acquire_write_lock(m_txnid, m_left_key, m_right_key, &conflicts, + m_big_txn); + } else { + invariant(m_type == type::READ); + r = m_lt->acquire_read_lock(m_txnid, m_left_key, m_right_key, &conflicts, + m_big_txn); + } + + // if the lock is not granted, save it to the set of lock requests + // and check for a deadlock. if there is one, complete it as failed + if (r == DB_LOCK_NOTGRANTED) { + copy_keys(); + m_state = state::PENDING; + m_start_time = toku_current_time_microsec() / 1000; + m_conflicting_txnid = conflicts.get(0); + if (m_start_before_pending_test_callback) + m_start_before_pending_test_callback(); + toku_external_mutex_lock(&m_info->mutex); + insert_into_lock_requests(); + if (deadlock_exists(conflicts)) { + remove_from_lock_requests(); + r = DB_LOCK_DEADLOCK; + } + toku_external_mutex_unlock(&m_info->mutex); + if (m_start_test_callback) m_start_test_callback(); // test callback + } + + if (r != DB_LOCK_NOTGRANTED) { + complete(r); + } + + conflicts.destroy(); + return r; +} + +// sleep on the lock request until it becomes resolved or the wait time has +// elapsed. 
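+// if killed_time_ms is nonzero, the killed_callback is polled roughly every
+// killed_time_ms milliseconds, so the wait can be interrupted before the
+// full wait_time_ms elapses.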
+int lock_request::wait(uint64_t wait_time_ms) { + return wait(wait_time_ms, 0, nullptr); +} + +int lock_request::wait(uint64_t wait_time_ms, uint64_t killed_time_ms, + int (*killed_callback)(void), + void (*lock_wait_callback)(void *, lock_wait_infos *), + void *callback_arg) { + uint64_t t_now = toku_current_time_microsec(); + uint64_t t_start = t_now; + uint64_t t_end = t_start + wait_time_ms * 1000; + + toku_external_mutex_lock(&m_info->mutex); + + // check again, this time locking out other retry calls + if (m_state == state::PENDING) { + lock_wait_infos conflicts_collector; + retry(&conflicts_collector); + if (m_state == state::PENDING) { + report_waits(&conflicts_collector, lock_wait_callback, callback_arg); + } + } + + while (m_state == state::PENDING) { + // check if this thread is killed + if (killed_callback && killed_callback()) { + remove_from_lock_requests(); + complete(DB_LOCK_NOTGRANTED); + continue; + } + + // compute the time until we should wait + uint64_t t_wait; + if (killed_time_ms == 0) { + t_wait = t_end; + } else { + t_wait = t_now + killed_time_ms * 1000; + if (t_wait > t_end) t_wait = t_end; + } + + int r = toku_external_cond_timedwait(&m_wait_cond, &m_info->mutex, + (int64_t)(t_wait - t_now)); + invariant(r == 0 || r == ETIMEDOUT); + + t_now = toku_current_time_microsec(); + if (m_state == state::PENDING && (t_now >= t_end)) { + m_info->counters.timeout_count += 1; + + // if we're still pending and we timed out, then remove our + // request from the set of lock requests and fail. + remove_from_lock_requests(); + + // complete sets m_state to COMPLETE, breaking us out of the loop + complete(DB_LOCK_NOTGRANTED); + } + } + + uint64_t t_real_end = toku_current_time_microsec(); + uint64_t duration = t_real_end - t_start; + m_info->counters.wait_count += 1; + m_info->counters.wait_time += duration; + if (duration >= 1000000) { + m_info->counters.long_wait_count += 1; + m_info->counters.long_wait_time += duration; + } + toku_external_mutex_unlock(&m_info->mutex); + + invariant(m_state == state::COMPLETE); + return m_complete_r; +} + +// complete this lock request with the given return value +void lock_request::complete(int complete_r) { + m_complete_r = complete_r; + m_state = state::COMPLETE; +} + +const DBT *lock_request::get_left_key(void) const { return m_left_key; } + +const DBT *lock_request::get_right_key(void) const { return m_right_key; } + +TXNID lock_request::get_txnid(void) const { return m_txnid; } + +uint64_t lock_request::get_start_time(void) const { return m_start_time; } + +TXNID lock_request::get_conflicting_txnid(void) const { + return m_conflicting_txnid; +} + +int lock_request::retry(lock_wait_infos *conflicts_collector) { + invariant(m_state == state::PENDING); + int r; + txnid_set conflicts; + conflicts.create(); + + if (m_type == type::WRITE) { + r = m_lt->acquire_write_lock(m_txnid, m_left_key, m_right_key, &conflicts, + m_big_txn); + } else { + r = m_lt->acquire_read_lock(m_txnid, m_left_key, m_right_key, &conflicts, + m_big_txn); + } + + // if the acquisition succeeded then remove ourselves from the + // set of lock requests, complete, and signal the waiting thread. 
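+  // otherwise, record the first conflicting txnid and collect the whole
+  // conflict set so it can be reported to the lock wait callback.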
+  if (r == 0) {
+    remove_from_lock_requests();
+    complete(r);
+    if (m_retry_test_callback) m_retry_test_callback();  // test callback
+    toku_external_cond_broadcast(&m_wait_cond);
+  } else {
+    m_conflicting_txnid = conflicts.get(0);
+    add_conflicts_to_waits(&conflicts, conflicts_collector);
+  }
+  conflicts.destroy();
+
+  return r;
+}
+
+void lock_request::retry_all_lock_requests(
+    locktree *lt, void (*lock_wait_callback)(void *, lock_wait_infos *),
+    void *callback_arg, void (*after_retry_all_test_callback)(void)) {
+  lt_lock_request_info *info = lt->get_lock_request_info();
+
+  // if there are no pending lock requests then there is nothing to do.
+  // the unlocked data race on pending_is_empty is OK since lock requests
+  // are retried after being added to the pending set.
+  if (info->pending_is_empty) return;
+
+  // get my retry generation (post increment of retry_want)
+  unsigned long long my_retry_want = (info->retry_want += 1);
+
+  toku_mutex_lock(&info->retry_mutex);
+
+  // here is the group retry algorithm.
+  // get the latest retry_want count and use it as the generation number of
+  // this retry operation. if this retry generation is > the last retry
+  // generation, then do the lock retries. otherwise, no lock retries
+  // are needed.
+  if ((my_retry_want - 1) == info->retry_done) {
+    for (;;) {
+      if (!info->running_retry) {
+        info->running_retry = true;
+        info->retry_done = info->retry_want;
+        toku_mutex_unlock(&info->retry_mutex);
+        retry_all_lock_requests_info(info, lock_wait_callback, callback_arg);
+        if (after_retry_all_test_callback) after_retry_all_test_callback();
+        toku_mutex_lock(&info->retry_mutex);
+        info->running_retry = false;
+        toku_cond_broadcast(&info->retry_cv);
+        break;
+      } else {
+        toku_cond_wait(&info->retry_cv, &info->retry_mutex);
+      }
+    }
+  }
+  toku_mutex_unlock(&info->retry_mutex);
+}
+
+void lock_request::retry_all_lock_requests_info(
+    lt_lock_request_info *info,
+    void (*lock_wait_callback)(void *, lock_wait_infos *), void *callback_arg) {
+  toku_external_mutex_lock(&info->mutex);
+  // retry all of the pending lock requests.
+  lock_wait_infos conflicts_collector;
+  for (uint32_t i = 0; i < info->pending_lock_requests.size();) {
+    lock_request *request;
+    int r = info->pending_lock_requests.fetch(i, &request);
+    invariant_zero(r);
+
+    // retry the lock request. if it didn't succeed,
+    // move on to the next lock request. otherwise
+    // the request is gone from the list so we may
+    // read the i'th entry for the next one.
+ r = request->retry(&conflicts_collector); + if (r != 0) { + i++; + } + } + + // call report_waits while holding the pending queue lock since + // the waiter object is still valid while it's in the queue + report_waits(&conflicts_collector, lock_wait_callback, callback_arg); + + // future threads should only retry lock requests if some still exist + info->should_retry_lock_requests = info->pending_lock_requests.size() > 0; + toku_external_mutex_unlock(&info->mutex); +} + +void lock_request::add_conflicts_to_waits(txnid_set *conflicts, + lock_wait_infos *wait_conflicts) { + wait_conflicts->push_back({m_lt, get_txnid(), m_extra, {}}); + uint32_t num_conflicts = conflicts->size(); + for (uint32_t i = 0; i < num_conflicts; i++) { + wait_conflicts->back().waitees.push_back(conflicts->get(i)); + } +} + +void lock_request::report_waits(lock_wait_infos *wait_conflicts, + void (*lock_wait_callback)(void *, + lock_wait_infos *), + void *callback_arg) { + if (lock_wait_callback) (*lock_wait_callback)(callback_arg, wait_conflicts); +} + +void *lock_request::get_extra(void) const { return m_extra; } + +void lock_request::kill_waiter(void) { + remove_from_lock_requests(); + complete(DB_LOCK_NOTGRANTED); + toku_external_cond_broadcast(&m_wait_cond); +} + +void lock_request::kill_waiter(locktree *lt, void *extra) { + lt_lock_request_info *info = lt->get_lock_request_info(); + toku_external_mutex_lock(&info->mutex); + for (uint32_t i = 0; i < info->pending_lock_requests.size(); i++) { + lock_request *request; + int r = info->pending_lock_requests.fetch(i, &request); + if (r == 0 && request->get_extra() == extra) { + request->kill_waiter(); + break; + } + } + toku_external_mutex_unlock(&info->mutex); +} + +// find another lock request by txnid. must hold the mutex. +lock_request *lock_request::find_lock_request(const TXNID &txnid) { + lock_request *request; + int r = m_info->pending_lock_requests.find_zero<TXNID, find_by_txnid>( + txnid, &request, nullptr); + if (r != 0) { + request = nullptr; + } + return request; +} + +// insert this lock request into the locktree's set. must hold the mutex. +void lock_request::insert_into_lock_requests(void) { + uint32_t idx; + lock_request *request; + int r = m_info->pending_lock_requests.find_zero<TXNID, find_by_txnid>( + m_txnid, &request, &idx); + invariant(r == DB_NOTFOUND); + r = m_info->pending_lock_requests.insert_at(this, idx); + invariant_zero(r); + m_info->pending_is_empty = false; +} + +// remove this lock request from the locktree's set. must hold the mutex. 
+void lock_request::remove_from_lock_requests(void) { + uint32_t idx; + lock_request *request; + int r = m_info->pending_lock_requests.find_zero<TXNID, find_by_txnid>( + m_txnid, &request, &idx); + invariant_zero(r); + invariant(request == this); + r = m_info->pending_lock_requests.delete_at(idx); + invariant_zero(r); + if (m_info->pending_lock_requests.size() == 0) + m_info->pending_is_empty = true; +} + +int lock_request::find_by_txnid(lock_request *const &request, + const TXNID &txnid) { + TXNID request_txnid = request->m_txnid; + if (request_txnid < txnid) { + return -1; + } else if (request_txnid == txnid) { + return 0; + } else { + return 1; + } +} + +void lock_request::set_start_test_callback(void (*f)(void)) { + m_start_test_callback = f; +} + +void lock_request::set_start_before_pending_test_callback(void (*f)(void)) { + m_start_before_pending_test_callback = f; +} + +void lock_request::set_retry_test_callback(void (*f)(void)) { + m_retry_test_callback = f; +} + +} /* namespace toku */ +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h new file mode 100644 index 000000000..d30e1e2ca --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h @@ -0,0 +1,255 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." 
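+
+// For orientation, a minimal sketch of the intended call sequence
+// (create / set / start / wait / destroy). It assumes a locktree *lt, a
+// txnid, a mutex factory, DBT endpoints and a wait_time_ms are already
+// available; those names are illustrative, not part of this API:
+//
+//   lock_request req;
+//   req.create(mutex_factory);
+//   req.set(lt, txnid, &left_key, &right_key, lock_request::WRITE,
+//           false /* big_txn */);
+//   int r = req.start();  // 0, DB_LOCK_NOTGRANTED or DB_LOCK_DEADLOCK
+//   if (r == DB_LOCK_NOTGRANTED)
+//     r = req.wait(wait_time_ms);  // DB_LOCK_NOTGRANTED again on timeout
+//   req.destroy();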
+
+#pragma once
+
+#include "../db.h"
+#include "../ft/comparator.h"
+#include "../portability/toku_pthread.h"
+#include "locktree.h"
+#include "txnid_set.h"
+#include "wfg.h"
+
+namespace toku {
+
+// Information about a lock wait
+struct lock_wait_info {
+  locktree *ltree;  // the tree where the wait happens
+  TXNID waiter;     // the waiting transaction
+  void *m_extra;    // lock_request's m_extra
+
+  // The transactions that are waited for.
+  std::vector<TXNID> waitees;
+};
+
+typedef std::vector<lock_wait_info> lock_wait_infos;
+
+// A lock request contains the db, the key range, the lock type, and
+// the transaction id that describes a potential row range lock.
+//
+// the typical use case is:
+// - initialize a lock request
+// - start to try to acquire the lock
+// - do something else
+// - wait for the lock request to be resolved on a timed condition
+// - destroy the lock request
+// a lock request is resolved when it is granted, times out, or deadlocks,
+// that is, when its state is no longer pending. when resolved, the
+// state of the lock request is changed and any waiting threads are awakened.
+
+class lock_request {
+ public:
+  enum type { UNKNOWN, READ, WRITE };
+
+  // effect: Initializes a lock request.
+  void create(toku_external_mutex_factory_t mutex_factory);
+
+  // effect: Destroys a lock request.
+  void destroy(void);
+
+  // effect: Resets the lock request parameters, allowing it to be reused.
+  // requires: Lock request was already created at some point
+  void set(locktree *lt, TXNID txnid, const DBT *left_key, const DBT *right_key,
+           type lock_type, bool big_txn, void *extra = nullptr);
+
+  // effect: Tries to acquire a lock described by this lock request.
+  // returns: The return code of locktree::acquire_[write,read]_lock()
+  //          or DB_LOCK_DEADLOCK if this request would end up deadlocked.
+  int start(void);
+
+  // effect: Sleeps until either the request is granted or the wait time
+  //         expires.
+  // returns: The return code of locktree::acquire_[write,read]_lock()
+  //          or simply DB_LOCK_NOTGRANTED if the wait time expired.
+  int wait(uint64_t wait_time_ms);
+  int wait(uint64_t wait_time_ms, uint64_t killed_time_ms,
+           int (*killed_callback)(void),
+           void (*lock_wait_callback)(void *, lock_wait_infos *) = nullptr,
+           void *callback_arg = nullptr);
+
+  // return: left end-point of the lock range
+  const DBT *get_left_key(void) const;
+
+  // return: right end-point of the lock range
+  const DBT *get_right_key(void) const;
+
+  // return: the txnid waiting for a lock
+  TXNID get_txnid(void) const;
+
+  // return: when this lock request started, as milliseconds from epoch
+  uint64_t get_start_time(void) const;
+
+  // return: which txnid is blocking this request (there may be more, though)
+  TXNID get_conflicting_txnid(void) const;
+
+  // effect: Retries all of the lock requests for the given locktree.
+  //         Any lock requests successfully restarted are completed and
+  //         woken up.
+  //         The rest remain pending.
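+  //         Typically called after a transaction releases its locks, so
+  //         that requests blocked on those locks can be re-evaluated.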
+ static void retry_all_lock_requests( + locktree *lt, + void (*lock_wait_callback)(void *, lock_wait_infos *) = nullptr, + void *callback_arg = nullptr, + void (*after_retry_test_callback)(void) = nullptr); + static void retry_all_lock_requests_info( + lt_lock_request_info *info, + void (*lock_wait_callback)(void *, lock_wait_infos *), + void *callback_arg); + + void set_start_test_callback(void (*f)(void)); + void set_start_before_pending_test_callback(void (*f)(void)); + void set_retry_test_callback(void (*f)(void)); + + void *get_extra(void) const; + + void kill_waiter(void); + static void kill_waiter(locktree *lt, void *extra); + + private: + enum state { + UNINITIALIZED, + INITIALIZED, + PENDING, + COMPLETE, + DESTROYED, + }; + + // The keys for a lock request are stored "unowned" in m_left_key + // and m_right_key. When the request is about to go to sleep, it + // copies these keys and stores them in m_left_key_copy etc and + // sets the temporary pointers to null. + TXNID m_txnid; + TXNID m_conflicting_txnid; + uint64_t m_start_time; + const DBT *m_left_key; + const DBT *m_right_key; + DBT m_left_key_copy; + DBT m_right_key_copy; + + // The lock request type and associated locktree + type m_type; + locktree *m_lt; + + // If the lock request is in the completed state, then its + // final return value is stored in m_complete_r + int m_complete_r; + state m_state; + + toku_external_cond_t m_wait_cond; + + bool m_big_txn; + + // the lock request info state stored in the + // locktree that this lock request is for. + struct lt_lock_request_info *m_info; + + void *m_extra; + + // effect: tries again to acquire the lock described by this lock request + // returns: 0 if retrying the request succeeded and is now complete + int retry(lock_wait_infos *collector); + + void complete(int complete_r); + + // effect: Finds another lock request by txnid. + // requires: The lock request info mutex is held + lock_request *find_lock_request(const TXNID &txnid); + + // effect: Insert this lock request into the locktree's set. + // requires: the locktree's mutex is held + void insert_into_lock_requests(void); + + // effect: Removes this lock request from the locktree's set. + // requires: The lock request info mutex is held + void remove_from_lock_requests(void); + + // effect: Asks this request's locktree which txnids are preventing + // us from getting the lock described by this request. + // returns: conflicts is populated with the txnid's that this request + // is blocked on + void get_conflicts(txnid_set *conflicts); + + // effect: Builds a wait-for-graph for this lock request and the given + // conflict set + void build_wait_graph(wfg *wait_graph, const txnid_set &conflicts); + + // returns: True if this lock request is in deadlock with the given conflicts + // set + bool deadlock_exists(const txnid_set &conflicts); + + void copy_keys(void); + + static int find_by_txnid(lock_request *const &request, const TXNID &txnid); + + // Report list of conflicts to lock wait callback. 
+ static void report_waits(lock_wait_infos *wait_conflicts, + void (*lock_wait_callback)(void *, + lock_wait_infos *), + void *callback_arg); + void add_conflicts_to_waits(txnid_set *conflicts, + lock_wait_infos *wait_conflicts); + + void (*m_start_test_callback)(void); + void (*m_start_before_pending_test_callback)(void); + void (*m_retry_test_callback)(void); + + public: + std::function<void(TXNID, bool, const DBT *, const DBT *)> m_deadlock_cb; + + friend class lock_request_unit_test; +}; +// PORT: lock_request is not a POD anymore due to use of toku_external_cond_t +// This is ok as the PODness is not really required: lock_request objects are +// not moved in memory or anything. +// ENSURE_POD(lock_request); + +} /* namespace toku */ diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc new file mode 100644 index 000000000..3d6a590c7 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc @@ -0,0 +1,1023 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2: +#ifndef ROCKSDB_LITE +#ifndef OS_WIN +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." 
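+
+// For orientation, a minimal sketch of driving a locktree directly,
+// assuming a created locktree *lt, a txnid and DBT endpoints (the names
+// are illustrative; production callers normally go through lock_request):
+//
+//   txnid_set conflicts;
+//   conflicts.create();
+//   int r = lt->acquire_write_lock(txnid, &left_key, &right_key,
+//                                  &conflicts, false /* big_txn */);
+//   // on DB_LOCK_NOTGRANTED, 'conflicts' holds the blocking txnids
+//
+//   range_buffer buffer;
+//   buffer.create();
+//   buffer.append(&left_key, &right_key, true /* exclusive */);
+//   lt->release_locks(txnid, &buffer, false /* all_trx_locks_hint */);
+//   buffer.destroy();
+//   conflicts.destroy();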
+ +#include "locktree.h" + +#include <memory.h> + +#include "../portability/toku_pthread.h" +#include "../portability/toku_time.h" +#include "../util/growable_array.h" +#include "range_buffer.h" + +// including the concurrent_tree here expands the templates +// and "defines" the implementation, so we do it here in +// the locktree source file instead of the header. +#include "concurrent_tree.h" + +namespace toku { +// A locktree represents the set of row locks owned by all transactions +// over an open dictionary. Read and write ranges are represented as +// a left and right key which are compared with the given descriptor +// and comparison fn. +// +// Each locktree has a reference count which it manages +// but does nothing based on the value of the reference count - it is +// up to the user of the locktree to destroy it when it sees fit. + +void locktree::create(locktree_manager *mgr, DICTIONARY_ID dict_id, + const comparator &cmp, + toku_external_mutex_factory_t mutex_factory) { + m_mgr = mgr; + m_dict_id = dict_id; + + m_cmp.create_from(cmp); + m_reference_count = 1; + m_userdata = nullptr; + + XCALLOC(m_rangetree); + m_rangetree->create(&m_cmp); + + m_sto_txnid = TXNID_NONE; + m_sto_buffer.create(); + m_sto_score = STO_SCORE_THRESHOLD; + m_sto_end_early_count = 0; + m_sto_end_early_time = 0; + + m_escalation_barrier = [](const DBT *, const DBT *, void *) -> bool { + return false; + }; + + m_lock_request_info.init(mutex_factory); +} + +void locktree::set_escalation_barrier_func( + lt_escalation_barrier_check_func func, void *extra) { + m_escalation_barrier = func; + m_escalation_barrier_arg = extra; +} + +void lt_lock_request_info::init(toku_external_mutex_factory_t mutex_factory) { + pending_lock_requests.create(); + pending_is_empty = true; + toku_external_mutex_init(mutex_factory, &mutex); + retry_want = retry_done = 0; + ZERO_STRUCT(counters); + ZERO_STRUCT(retry_mutex); + toku_mutex_init(locktree_request_info_retry_mutex_key, &retry_mutex, nullptr); + toku_cond_init(locktree_request_info_retry_cv_key, &retry_cv, nullptr); + running_retry = false; + + TOKU_VALGRIND_HG_DISABLE_CHECKING(&pending_is_empty, + sizeof(pending_is_empty)); + TOKU_DRD_IGNORE_VAR(pending_is_empty); +} + +void locktree::destroy(void) { + invariant(m_reference_count == 0); + invariant(m_lock_request_info.pending_lock_requests.size() == 0); + m_cmp.destroy(); + m_rangetree->destroy(); + toku_free(m_rangetree); + m_sto_buffer.destroy(); + m_lock_request_info.destroy(); +} + +void lt_lock_request_info::destroy(void) { + pending_lock_requests.destroy(); + toku_external_mutex_destroy(&mutex); + toku_mutex_destroy(&retry_mutex); + toku_cond_destroy(&retry_cv); +} + +void locktree::add_reference(void) { + (void)toku_sync_add_and_fetch(&m_reference_count, 1); +} + +uint32_t locktree::release_reference(void) { + return toku_sync_sub_and_fetch(&m_reference_count, 1); +} + +uint32_t locktree::get_reference_count(void) { return m_reference_count; } + +// a container for a range/txnid pair +struct row_lock { + keyrange range; + TXNID txnid; + bool is_shared; + TxnidVector *owners; +}; + +// iterate over a locked keyrange and copy out all of the data, +// storing each row lock into the given growable array. the +// caller does not own the range inside the returned row locks, +// so remove from the tree with care using them as keys. 
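+// note that each row_lock is copied out by value; its owners pointer, when
+// non-null, still points at the TxnidVector owned by the tree.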
+static void iterate_and_get_overlapping_row_locks( + const concurrent_tree::locked_keyrange *lkr, + GrowableArray<row_lock> *row_locks) { + struct copy_fn_obj { + GrowableArray<row_lock> *row_locks; + bool fn(const keyrange &range, TXNID txnid, bool is_shared, + TxnidVector *owners) { + row_lock lock = {.range = range, + .txnid = txnid, + .is_shared = is_shared, + .owners = owners}; + row_locks->push(lock); + return true; + } + } copy_fn; + copy_fn.row_locks = row_locks; + lkr->iterate(©_fn); +} + +// given a txnid and a set of overlapping row locks, determine +// which txnids are conflicting, and store them in the conflicts +// set, if given. +static bool determine_conflicting_txnids( + const GrowableArray<row_lock> &row_locks, const TXNID &txnid, + txnid_set *conflicts) { + bool conflicts_exist = false; + const size_t num_overlaps = row_locks.get_size(); + for (size_t i = 0; i < num_overlaps; i++) { + const row_lock lock = row_locks.fetch_unchecked(i); + const TXNID other_txnid = lock.txnid; + if (other_txnid != txnid) { + if (conflicts) { + if (other_txnid == TXNID_SHARED) { + // Add all shared lock owners, except this transaction. + for (TXNID shared_id : *lock.owners) { + if (shared_id != txnid) conflicts->add(shared_id); + } + } else { + conflicts->add(other_txnid); + } + } + conflicts_exist = true; + } + } + return conflicts_exist; +} + +// how much memory does a row lock take up in a concurrent tree? +static uint64_t row_lock_size_in_tree(const row_lock &lock) { + const uint64_t overhead = concurrent_tree::get_insertion_memory_overhead(); + return lock.range.get_memory_size() + overhead; +} + +// remove and destroy the given row lock from the locked keyrange, +// then notify the memory tracker of the newly freed lock. +static void remove_row_lock_from_tree(concurrent_tree::locked_keyrange *lkr, + const row_lock &lock, TXNID txnid, + locktree_manager *mgr) { + const uint64_t mem_released = row_lock_size_in_tree(lock); + lkr->remove(lock.range, txnid); + if (mgr != nullptr) { + mgr->note_mem_released(mem_released); + } +} + +// insert a row lock into the locked keyrange, then notify +// the memory tracker of this newly acquired lock. 
+static void insert_row_lock_into_tree(concurrent_tree::locked_keyrange *lkr, + const row_lock &lock, + locktree_manager *mgr) { + uint64_t mem_used = row_lock_size_in_tree(lock); + lkr->insert(lock.range, lock.txnid, lock.is_shared); + if (mgr != nullptr) { + mgr->note_mem_used(mem_used); + } +} + +void locktree::sto_begin(TXNID txnid) { + invariant(m_sto_txnid == TXNID_NONE); + invariant(m_sto_buffer.is_empty()); + m_sto_txnid = txnid; +} + +void locktree::sto_append(const DBT *left_key, const DBT *right_key, + bool is_write_request) { + uint64_t buffer_mem, delta; + + // psergey: the below two lines do not make any sense + // (and it's the same in upstream TokuDB) + keyrange range; + range.create(left_key, right_key); + + buffer_mem = m_sto_buffer.total_memory_size(); + m_sto_buffer.append(left_key, right_key, is_write_request); + delta = m_sto_buffer.total_memory_size() - buffer_mem; + if (m_mgr != nullptr) { + m_mgr->note_mem_used(delta); + } +} + +void locktree::sto_end(void) { + uint64_t mem_size = m_sto_buffer.total_memory_size(); + if (m_mgr != nullptr) { + m_mgr->note_mem_released(mem_size); + } + m_sto_buffer.destroy(); + m_sto_buffer.create(); + m_sto_txnid = TXNID_NONE; +} + +void locktree::sto_end_early_no_accounting(void *prepared_lkr) { + sto_migrate_buffer_ranges_to_tree(prepared_lkr); + sto_end(); + toku_unsafe_set(m_sto_score, 0); +} + +void locktree::sto_end_early(void *prepared_lkr) { + m_sto_end_early_count++; + + tokutime_t t0 = toku_time_now(); + sto_end_early_no_accounting(prepared_lkr); + tokutime_t t1 = toku_time_now(); + + m_sto_end_early_time += (t1 - t0); +} + +void locktree::sto_migrate_buffer_ranges_to_tree(void *prepared_lkr) { + // There should be something to migrate, and nothing in the rangetree. + invariant(!m_sto_buffer.is_empty()); + invariant(m_rangetree->is_empty()); + + concurrent_tree sto_rangetree; + concurrent_tree::locked_keyrange sto_lkr; + sto_rangetree.create(&m_cmp); + + // insert all of the ranges from the single txnid buffer into a new rangtree + range_buffer::iterator iter(&m_sto_buffer); + range_buffer::iterator::record rec; + while (iter.current(&rec)) { + sto_lkr.prepare(&sto_rangetree); + int r = acquire_lock_consolidated(&sto_lkr, m_sto_txnid, rec.get_left_key(), + rec.get_right_key(), + rec.get_exclusive_flag(), nullptr); + invariant_zero(r); + sto_lkr.release(); + iter.next(); + } + + // Iterate the newly created rangetree and insert each range into the + // locktree's rangetree, on behalf of the old single txnid. + struct migrate_fn_obj { + concurrent_tree::locked_keyrange *dst_lkr; + bool fn(const keyrange &range, TXNID txnid, bool is_shared, + TxnidVector *owners) { + // There can't be multiple owners in STO mode + invariant_zero(owners); + dst_lkr->insert(range, txnid, is_shared); + return true; + } + } migrate_fn; + migrate_fn.dst_lkr = + static_cast<concurrent_tree::locked_keyrange *>(prepared_lkr); + sto_lkr.prepare(&sto_rangetree); + sto_lkr.iterate(&migrate_fn); + sto_lkr.remove_all(); + sto_lkr.release(); + sto_rangetree.destroy(); + invariant(!m_rangetree->is_empty()); +} + +bool locktree::sto_try_acquire(void *prepared_lkr, TXNID txnid, + const DBT *left_key, const DBT *right_key, + bool is_write_request) { + if (m_rangetree->is_empty() && m_sto_buffer.is_empty() && + toku_unsafe_fetch(m_sto_score) >= STO_SCORE_THRESHOLD) { + // We can do the optimization because the rangetree is empty, and + // we know its worth trying because the sto score is big enough. 
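+    // (STO is the single txnid optimization: while a single transaction
+    // has the locktree to itself, its locks are kept in a simple buffer
+    // instead of being inserted into the rangetree.)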
+ sto_begin(txnid); + } else if (m_sto_txnid != TXNID_NONE) { + // We are currently doing the optimization. Check if we need to cancel + // it because a new txnid appeared, or if the current single txnid has + // taken too many locks already. + if (m_sto_txnid != txnid || + m_sto_buffer.get_num_ranges() > STO_BUFFER_MAX_SIZE) { + sto_end_early(prepared_lkr); + } + } + + // At this point the sto txnid is properly set. If it is valid, then + // this txnid can append its lock to the sto buffer successfully. + if (m_sto_txnid != TXNID_NONE) { + invariant(m_sto_txnid == txnid); + sto_append(left_key, right_key, is_write_request); + return true; + } else { + invariant(m_sto_buffer.is_empty()); + return false; + } +} + +/* + Do the same as iterate_and_get_overlapping_row_locks does, but also check for + this: + The set of overlapping rows locks consists of just one read-only shared + lock with the same endpoints as specified (in that case, we can just add + ourselves into that list) + + @return true - One compatible shared lock + false - Otherwise +*/ +static bool iterate_and_get_overlapping_row_locks2( + const concurrent_tree::locked_keyrange *lkr, const DBT *left_key, + const DBT *right_key, comparator *cmp, TXNID, + GrowableArray<row_lock> *row_locks) { + struct copy_fn_obj { + GrowableArray<row_lock> *row_locks; + bool first_call = true; + bool matching_lock_found = false; + const DBT *left_key, *right_key; + comparator *cmp; + + bool fn(const keyrange &range, TXNID txnid, bool is_shared, + TxnidVector *owners) { + if (first_call) { + first_call = false; + if (is_shared && !(*cmp)(left_key, range.get_left_key()) && + !(*cmp)(right_key, range.get_right_key())) { + matching_lock_found = true; + } + } else { + // if we see multiple matching locks, it doesn't matter whether + // the first one was matching. + matching_lock_found = false; + } + row_lock lock = {.range = range, + .txnid = txnid, + .is_shared = is_shared, + .owners = owners}; + row_locks->push(lock); + return true; + } + } copy_fn; + copy_fn.row_locks = row_locks; + copy_fn.left_key = left_key; + copy_fn.right_key = right_key; + copy_fn.cmp = cmp; + lkr->iterate(©_fn); + return copy_fn.matching_lock_found; +} + +// try to acquire a lock and consolidate it with existing locks if possible +// param: lkr, a prepared locked keyrange +// return: 0 on success, DB_LOCK_NOTGRANTED if conflicting locks exist. +int locktree::acquire_lock_consolidated(void *prepared_lkr, TXNID txnid, + const DBT *left_key, + const DBT *right_key, + bool is_write_request, + txnid_set *conflicts) { + int r = 0; + concurrent_tree::locked_keyrange *lkr; + + keyrange requested_range; + requested_range.create(left_key, right_key); + lkr = static_cast<concurrent_tree::locked_keyrange *>(prepared_lkr); + lkr->acquire(requested_range); + + // copy out the set of overlapping row locks. + GrowableArray<row_lock> overlapping_row_locks; + overlapping_row_locks.init(); + bool matching_shared_lock_found = false; + + if (is_write_request) + iterate_and_get_overlapping_row_locks(lkr, &overlapping_row_locks); + else { + matching_shared_lock_found = iterate_and_get_overlapping_row_locks2( + lkr, left_key, right_key, &m_cmp, txnid, &overlapping_row_locks); + // psergey-todo: what to do now? So, we have figured we have just one + // shareable lock. Need to add us into it as an owner but the lock + // pointer cannot be kept? + // A: use find_node_with_overlapping_child(key_range, nullptr); + // then, add ourselves to the owner list. 
+      // Don't forget to release the subtree after that.
+  }
+
+  if (matching_shared_lock_found) {
+    // there is just one non-conflicting matching shared lock.
+    // we are holding a lock on it (see the acquire() call above).
+    // we need to modify it to indicate there is another owner...
+    if (lkr->add_shared_owner(requested_range, txnid)) {
+      // For memory accounting, pretend the shared lock uses as much
+      // memory as a regular row lock.
+      row_lock new_lock = {.range = requested_range,
+                           .txnid = txnid,
+                           .is_shared = false,
+                           .owners = nullptr};
+      uint64_t mem_used = row_lock_size_in_tree(new_lock);
+      if (m_mgr) {
+        m_mgr->note_mem_used(mem_used);
+      }
+    }
+    requested_range.destroy();
+    overlapping_row_locks.deinit();
+    return 0;
+  }
+
+  size_t num_overlapping_row_locks = overlapping_row_locks.get_size();
+
+  // if any overlapping row locks conflict with this request, bail out.
+  bool conflicts_exist =
+      determine_conflicting_txnids(overlapping_row_locks, txnid, conflicts);
+  if (!conflicts_exist) {
+    // there are no conflicts, so all of the overlaps are for the requesting
+    // txnid. so, we must consolidate all existing overlapping ranges and the
+    // requested range into one dominating range. then we insert the dominating
+    // range.
+    bool all_shared = !is_write_request;
+    for (size_t i = 0; i < num_overlapping_row_locks; i++) {
+      row_lock overlapping_lock = overlapping_row_locks.fetch_unchecked(i);
+      invariant(overlapping_lock.txnid == txnid);
+      requested_range.extend(m_cmp, overlapping_lock.range);
+      remove_row_lock_from_tree(lkr, overlapping_lock, TXNID_ANY, m_mgr);
+      all_shared = all_shared && overlapping_lock.is_shared;
+    }
+
+    row_lock new_lock = {.range = requested_range,
+                         .txnid = txnid,
+                         .is_shared = all_shared,
+                         .owners = nullptr};
+    insert_row_lock_into_tree(lkr, new_lock, m_mgr);
+  } else {
+    r = DB_LOCK_NOTGRANTED;
+  }
+
+  requested_range.destroy();
+  overlapping_row_locks.deinit();
+  return r;
+}
+
+// acquire a lock in the given key range, inclusive. if successful,
+// return 0. otherwise, populate the conflicts txnid_set with the set of
+// transactions that conflict with this request.
+int locktree::acquire_lock(bool is_write_request, TXNID txnid,
+                           const DBT *left_key, const DBT *right_key,
+                           txnid_set *conflicts) {
+  int r = 0;
+
+  // we are only supporting write locks for simplicity
+  // invariant(is_write_request);
+
+  // acquire and prepare a locked keyrange over the requested range.
+  // prepare is a serialization point, so we take the opportunity to
+  // try the single txnid optimization first.
+  concurrent_tree::locked_keyrange lkr;
+  lkr.prepare(m_rangetree);
+
+  bool acquired =
+      sto_try_acquire(&lkr, txnid, left_key, right_key, is_write_request);
+  if (!acquired) {
+    r = acquire_lock_consolidated(&lkr, txnid, left_key, right_key,
+                                  is_write_request, conflicts);
+  }
+
+  lkr.release();
+  return r;
+}
+
+int locktree::try_acquire_lock(bool is_write_request, TXNID txnid,
+                               const DBT *left_key, const DBT *right_key,
+                               txnid_set *conflicts, bool big_txn) {
+  // All ranges in the locktree must have left endpoints <= right endpoints.
+  // Range comparisons rely on this fact, so we make a paranoid invariant here.
+  paranoid_invariant(m_cmp(left_key, right_key) <= 0);
+  int r = m_mgr == nullptr ?
0 : m_mgr->check_current_lock_constraints(big_txn); + if (r == 0) { + r = acquire_lock(is_write_request, txnid, left_key, right_key, conflicts); + } + return r; +} + +// the locktree silently upgrades read locks to write locks for simplicity +int locktree::acquire_read_lock(TXNID txnid, const DBT *left_key, + const DBT *right_key, txnid_set *conflicts, + bool big_txn) { + return try_acquire_lock(false, txnid, left_key, right_key, conflicts, + big_txn); +} + +int locktree::acquire_write_lock(TXNID txnid, const DBT *left_key, + const DBT *right_key, txnid_set *conflicts, + bool big_txn) { + return try_acquire_lock(true, txnid, left_key, right_key, conflicts, big_txn); +} + +// typedef void (*dump_callback)(void *cdata, const DBT *left, const DBT *right, +// TXNID txnid); +void locktree::dump_locks(void *cdata, dump_callback cb) { + concurrent_tree::locked_keyrange lkr; + keyrange range; + range.create(toku_dbt_negative_infinity(), toku_dbt_positive_infinity()); + + lkr.prepare(m_rangetree); + lkr.acquire(range); + + TXNID sto_txn; + if ((sto_txn = toku_unsafe_fetch(m_sto_txnid)) != TXNID_NONE) { + // insert all of the ranges from the single txnid buffer into a new rangtree + range_buffer::iterator iter(&m_sto_buffer); + range_buffer::iterator::record rec; + while (iter.current(&rec)) { + (*cb)(cdata, rec.get_left_key(), rec.get_right_key(), sto_txn, + !rec.get_exclusive_flag(), nullptr); + iter.next(); + } + } else { + GrowableArray<row_lock> all_locks; + all_locks.init(); + iterate_and_get_overlapping_row_locks(&lkr, &all_locks); + + const size_t n_locks = all_locks.get_size(); + for (size_t i = 0; i < n_locks; i++) { + const row_lock lock = all_locks.fetch_unchecked(i); + (*cb)(cdata, lock.range.get_left_key(), lock.range.get_right_key(), + lock.txnid, lock.is_shared, lock.owners); + } + all_locks.deinit(); + } + lkr.release(); + range.destroy(); +} + +void locktree::get_conflicts(bool is_write_request, TXNID txnid, + const DBT *left_key, const DBT *right_key, + txnid_set *conflicts) { + // because we only support write locks, ignore this bit for now. + (void)is_write_request; + + // preparing and acquire a locked keyrange over the range + keyrange range; + range.create(left_key, right_key); + concurrent_tree::locked_keyrange lkr; + lkr.prepare(m_rangetree); + lkr.acquire(range); + + // copy out the set of overlapping row locks and determine the conflicts + GrowableArray<row_lock> overlapping_row_locks; + overlapping_row_locks.init(); + iterate_and_get_overlapping_row_locks(&lkr, &overlapping_row_locks); + + // we don't care if conflicts exist. we just want the conflicts set populated. + (void)determine_conflicting_txnids(overlapping_row_locks, txnid, conflicts); + + lkr.release(); + overlapping_row_locks.deinit(); + range.destroy(); +} + +// Effect: +// For each range in the lock tree that overlaps the given range and has +// the given txnid, remove it. +// Rationale: +// In the common case, there is only the range [left_key, right_key] and +// it is associated with txnid, so this is a single tree delete. +// +// However, consolidation and escalation change the objects in the tree +// without telling the txn anything. In this case, the txn may own a +// large range lock that represents its ownership of many smaller range +// locks. For example, the txn may think it owns point locks on keys 1, +// 2, and 3, but due to escalation, only the object [1,3] exists in the +// tree. 
+// +// The first call for a small lock will remove the large range lock, and +// the rest of the calls should do nothing. After the first release, +// another thread can acquire one of the locks that the txn thinks it +// still owns. That's ok, because the txn doesn't want it anymore (it +// unlocks everything at once), but it may find a lock that it does not +// own. +// +// In our example, the txn unlocks key 1, which actually removes the +// whole lock [1,3]. Now, someone else can lock 2 before our txn gets +// around to unlocking 2, so we should not remove that lock. +void locktree::remove_overlapping_locks_for_txnid(TXNID txnid, + const DBT *left_key, + const DBT *right_key) { + keyrange release_range; + release_range.create(left_key, right_key); + + // acquire and prepare a locked keyrange over the release range + concurrent_tree::locked_keyrange lkr; + lkr.prepare(m_rangetree); + lkr.acquire(release_range); + + // copy out the set of overlapping row locks. + GrowableArray<row_lock> overlapping_row_locks; + overlapping_row_locks.init(); + iterate_and_get_overlapping_row_locks(&lkr, &overlapping_row_locks); + size_t num_overlapping_row_locks = overlapping_row_locks.get_size(); + + for (size_t i = 0; i < num_overlapping_row_locks; i++) { + row_lock lock = overlapping_row_locks.fetch_unchecked(i); + // If this isn't our lock, that's ok, just don't remove it. + // See rationale above. + // psergey-todo: for shared locks, just remove ourselves from the + // owners. + if (lock.txnid == txnid || (lock.owners && lock.owners->contains(txnid))) { + remove_row_lock_from_tree(&lkr, lock, txnid, m_mgr); + } + } + + lkr.release(); + overlapping_row_locks.deinit(); + release_range.destroy(); +} + +bool locktree::sto_txnid_is_valid_unsafe(void) const { + return toku_unsafe_fetch(m_sto_txnid) != TXNID_NONE; +} + +int locktree::sto_get_score_unsafe(void) const { + return toku_unsafe_fetch(m_sto_score); +} + +bool locktree::sto_try_release(TXNID txnid) { + bool released = false; + if (toku_unsafe_fetch(m_sto_txnid) != TXNID_NONE) { + // check the bit again with a prepared locked keyrange, + // which protects the optimization bits and rangetree data + concurrent_tree::locked_keyrange lkr; + lkr.prepare(m_rangetree); + if (m_sto_txnid != TXNID_NONE) { + // this txnid better be the single txnid on this locktree, + // or else we are in big trouble (meaning the logic is broken) + invariant(m_sto_txnid == txnid); + invariant(m_rangetree->is_empty()); + sto_end(); + released = true; + } + lkr.release(); + } + return released; +} + +// release all of the locks for a txnid whose endpoints are pairs +// in the given range buffer. +void locktree::release_locks(TXNID txnid, const range_buffer *ranges, + bool all_trx_locks_hint) { + // try the single txn optimization. if it worked, then all of the + // locks are already released, otherwise we need to do it here. + bool released; + if (all_trx_locks_hint) { + // This will release all of the locks the transaction is holding + released = sto_try_release(txnid); + } else { + /* + psergey: we are asked to release *Some* of the locks the transaction + is holding. + We could try doing that without leaving the STO mode, but right now, + the easiest way is to exit the STO mode and let the non-STO code path + handle it. 
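+
+     As an illustration (editorial sketch, not part of the original source),
+     a caller typically builds the set of ranges to release with the
+     range_buffer API from range_buffer.h and hands it to release_locks();
+     lt, txnid, left_key and right_key below are placeholders:
+
+       range_buffer buffer;
+       buffer.create();
+       buffer.append(left_key, right_key, true /* is_write_request */);
+       lt->release_locks(txnid, &buffer, false /* not all of txn's locks */);
+       buffer.destroy();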
+ */ + if (toku_unsafe_fetch(m_sto_txnid) != TXNID_NONE) { + // check the bit again with a prepared locked keyrange, + // which protects the optimization bits and rangetree data + concurrent_tree::locked_keyrange lkr; + lkr.prepare(m_rangetree); + if (m_sto_txnid != TXNID_NONE) { + sto_end_early(&lkr); + } + lkr.release(); + } + released = false; + } + if (!released) { + range_buffer::iterator iter(ranges); + range_buffer::iterator::record rec; + while (iter.current(&rec)) { + const DBT *left_key = rec.get_left_key(); + const DBT *right_key = rec.get_right_key(); + // All ranges in the locktree must have left endpoints <= right endpoints. + // Range comparisons rely on this fact, so we make a paranoid invariant + // here. + paranoid_invariant(m_cmp(left_key, right_key) <= 0); + remove_overlapping_locks_for_txnid(txnid, left_key, right_key); + iter.next(); + } + // Increase the sto score slightly. Eventually it will hit + // the threshold and we'll try the optimization again. This + // is how a previously multithreaded system transitions into + // a single threaded system that benefits from the optimization. + if (toku_unsafe_fetch(m_sto_score) < STO_SCORE_THRESHOLD) { + toku_sync_fetch_and_add(&m_sto_score, 1); + } + } +} + +// iterate over a locked keyrange and extract copies of the first N +// row locks, storing each one into the given array of size N, +// then removing each extracted lock from the locked keyrange. +static int extract_first_n_row_locks(concurrent_tree::locked_keyrange *lkr, + locktree_manager *mgr, row_lock *row_locks, + int num_to_extract) { + struct extract_fn_obj { + int num_extracted; + int num_to_extract; + row_lock *row_locks; + bool fn(const keyrange &range, TXNID txnid, bool is_shared, + TxnidVector *owners) { + if (num_extracted < num_to_extract) { + row_lock lock; + lock.range.create_copy(range); + lock.txnid = txnid; + lock.is_shared = is_shared; + // deep-copy the set of owners: + if (owners) + lock.owners = new TxnidVector(*owners); + else + lock.owners = nullptr; + row_locks[num_extracted++] = lock; + return true; + } else { + return false; + } + } + } extract_fn; + + extract_fn.row_locks = row_locks; + extract_fn.num_to_extract = num_to_extract; + extract_fn.num_extracted = 0; + lkr->iterate(&extract_fn); + + // now that the ranges have been copied out, complete + // the extraction by removing the ranges from the tree. + // use remove_row_lock_from_tree() so we properly track the + // amount of memory and number of locks freed. + int num_extracted = extract_fn.num_extracted; + invariant(num_extracted <= num_to_extract); + for (int i = 0; i < num_extracted; i++) { + remove_row_lock_from_tree(lkr, row_locks[i], TXNID_ANY, mgr); + } + + return num_extracted; +} + +// Store each newly escalated lock in a range buffer for appropriate txnid. +// We'll rebuild the locktree by iterating over these ranges, and then we +// can pass back each txnid/buffer pair individually through a callback +// to notify higher layers that locks have changed. +struct txnid_range_buffer { + TXNID txnid; + range_buffer buffer; + + static int find_by_txnid(struct txnid_range_buffer *const &other_buffer, + const TXNID &txnid) { + if (txnid < other_buffer->txnid) { + return -1; + } else if (other_buffer->txnid == txnid) { + return 0; + } else { + return 1; + } + } +}; + +// escalate the locks in the locktree by merging adjacent +// locks that have the same txnid into one larger lock. +// +// if there's only one txnid in the locktree then this +// approach works well. 
+// if there are many txnids and each
+// has locks in a random/alternating order, then this does
+// not work so well.
+void locktree::escalate(lt_escalate_cb after_escalate_callback,
+                        void *after_escalate_callback_extra) {
+  omt<struct txnid_range_buffer *, struct txnid_range_buffer *> range_buffers;
+  range_buffers.create();
+
+  // prepare and acquire a locked keyrange on the entire locktree
+  concurrent_tree::locked_keyrange lkr;
+  keyrange infinite_range = keyrange::get_infinite_range();
+  lkr.prepare(m_rangetree);
+  lkr.acquire(infinite_range);
+
+  // if we're in the single txnid optimization, simply call it off.
+  // if you have to run escalation, you probably don't care about
+  // the optimization anyway, and this makes things easier.
+  if (m_sto_txnid != TXNID_NONE) {
+    // We are already accounting for this escalation time and
+    // count, so don't do it for sto_end_early too.
+    sto_end_early_no_accounting(&lkr);
+  }
+
+  // extract and remove batches of row locks from the locktree
+  int num_extracted;
+  const int num_row_locks_per_batch = 128;
+  row_lock *XCALLOC_N(num_row_locks_per_batch, extracted_buf);
+
+  // we always remove the "first" n because we are removing n
+  // each time we do an extraction. so this loops until it's empty.
+  while ((num_extracted = extract_first_n_row_locks(
+              &lkr, m_mgr, extracted_buf, num_row_locks_per_batch)) > 0) {
+    int current_index = 0;
+    while (current_index < num_extracted) {
+      // every batch of extracted locks is in range-sorted order. search
+      // through them and merge adjacent locks with the same txnid into
+      // one dominating lock and save it to a set of escalated locks.
+      //
+      // first, find the index of the next row lock that
+      // - belongs to a different txnid, or
+      // - belongs to several txnids, or
+      // - is a shared lock (we could potentially merge those but
+      //   currently we don't), or
+      // - is across a lock escalation barrier.
+      int next_txnid_index = current_index + 1;
+
+      while (next_txnid_index < num_extracted &&
+             (extracted_buf[current_index].txnid ==
+              extracted_buf[next_txnid_index].txnid) &&
+             !extracted_buf[next_txnid_index].is_shared &&
+             !extracted_buf[next_txnid_index].owners &&
+             !m_escalation_barrier(
+                 extracted_buf[current_index].range.get_right_key(),
+                 extracted_buf[next_txnid_index].range.get_left_key(),
+                 m_escalation_barrier_arg)) {
+        next_txnid_index++;
+      }
+
+      // Create an escalated range for the current txnid that dominates
+      // each range between the current index and the next txnid's index.
+      // const TXNID current_txnid = extracted_buf[current_index].txnid;
+      const DBT *escalated_left_key =
+          extracted_buf[current_index].range.get_left_key();
+      const DBT *escalated_right_key =
+          extracted_buf[next_txnid_index - 1].range.get_right_key();
+
+      // Try to find a range buffer for the current txnid. Create one if it
+      // doesn't exist. Then, append the new escalated range to the buffer.
+      // (If a lock is shared by multiple txnids, append it to each txnid's
+      // list.)
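+      //
+      // Editorial illustration (not part of the original source): suppose
+      // this batch holds exclusive locks [1,1], [2,2] and [5,5], all owned
+      // by one txnid, and m_escalation_barrier() returns true only for the
+      // gap between 2 and 5. The merge scan above stops at the barrier, so
+      // [1,1] and [2,2] are consolidated into the dominating range [1,2],
+      // while [5,5] is escalated separately on the next pass of this loop.
+      //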
+      TxnidVector *owners_ptr;
+      TxnidVector singleton_owner;
+      if (extracted_buf[current_index].owners)
+        owners_ptr = extracted_buf[current_index].owners;
+      else {
+        singleton_owner.insert(extracted_buf[current_index].txnid);
+        owners_ptr = &singleton_owner;
+      }
+
+      for (auto cur_txnid : *owners_ptr) {
+        uint32_t idx;
+        struct txnid_range_buffer *existing_range_buffer;
+        int r =
+            range_buffers.find_zero<TXNID, txnid_range_buffer::find_by_txnid>(
+                cur_txnid, &existing_range_buffer, &idx);
+        if (r == DB_NOTFOUND) {
+          struct txnid_range_buffer *XMALLOC(new_range_buffer);
+          new_range_buffer->txnid = cur_txnid;
+          new_range_buffer->buffer.create();
+          new_range_buffer->buffer.append(
+              escalated_left_key, escalated_right_key,
+              !extracted_buf[current_index].is_shared);
+          range_buffers.insert_at(new_range_buffer, idx);
+        } else {
+          invariant_zero(r);
+          invariant(existing_range_buffer->txnid == cur_txnid);
+          existing_range_buffer->buffer.append(
+              escalated_left_key, escalated_right_key,
+              !extracted_buf[current_index].is_shared);
+        }
+      }
+
+      current_index = next_txnid_index;
+    }
+
+    // destroy the ranges copied during the extraction
+    for (int i = 0; i < num_extracted; i++) {
+      delete extracted_buf[i].owners;
+      extracted_buf[i].range.destroy();
+    }
+  }
+  toku_free(extracted_buf);
+
+  // Rebuild the locktree from each range in each range buffer,
+  // then notify higher layers that the txnid's locks have changed.
+  //
+  // (shared locks: if a lock was initially shared between transactions TRX1,
+  // TRX2, etc, we will now try to acquire it acting on behalf of TRX1, on
+  // TRX2, etc. This will succeed and an identical shared lock will be
+  // constructed)
+
+  invariant(m_rangetree->is_empty());
+  const uint32_t num_range_buffers = range_buffers.size();
+  for (uint32_t i = 0; i < num_range_buffers; i++) {
+    struct txnid_range_buffer *current_range_buffer;
+    int r = range_buffers.fetch(i, &current_range_buffer);
+    invariant_zero(r);
+    if (r == EINVAL)  // Shouldn't happen, avoid compiler warning
+      continue;
+
+    const TXNID current_txnid = current_range_buffer->txnid;
+    range_buffer::iterator iter(&current_range_buffer->buffer);
+    range_buffer::iterator::record rec;
+    while (iter.current(&rec)) {
+      keyrange range;
+      range.create(rec.get_left_key(), rec.get_right_key());
+      row_lock lock = {.range = range,
+                       .txnid = current_txnid,
+                       .is_shared = !rec.get_exclusive_flag(),
+                       .owners = nullptr};
+      insert_row_lock_into_tree(&lkr, lock, m_mgr);
+      iter.next();
+    }
+
+    // Notify higher layers that locks have changed for the current txnid
+    if (after_escalate_callback) {
+      after_escalate_callback(current_txnid, this, current_range_buffer->buffer,
+                              after_escalate_callback_extra);
+    }
+    current_range_buffer->buffer.destroy();
+  }
+
+  while (range_buffers.size() > 0) {
+    struct txnid_range_buffer *buffer;
+    int r = range_buffers.fetch(0, &buffer);
+    invariant_zero(r);
+    r = range_buffers.delete_at(0);
+    invariant_zero(r);
+    toku_free(buffer);
+  }
+  range_buffers.destroy();
+
+  lkr.release();
+}
+
+void *locktree::get_userdata(void) const { return m_userdata; }
+
+void locktree::set_userdata(void *userdata) { m_userdata = userdata; }
+
+struct lt_lock_request_info *locktree::get_lock_request_info(void) {
+  return &m_lock_request_info;
+}
+
+void locktree::set_comparator(const comparator &cmp) { m_cmp.inherit(cmp); }
+
+locktree_manager *locktree::get_manager(void) const { return m_mgr; }
+
+int locktree::compare(const locktree *lt) const {
+  if
(m_dict_id.dictid < lt->m_dict_id.dictid) { + return -1; + } else if (m_dict_id.dictid == lt->m_dict_id.dictid) { + return 0; + } else { + return 1; + } +} + +DICTIONARY_ID locktree::get_dict_id() const { return m_dict_id; } + +} /* namespace toku */ +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h new file mode 100644 index 000000000..f0f4b042d --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h @@ -0,0 +1,580 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include <atomic> + +#include "../db.h" +#include "../ft/comparator.h" +#include "../portability/toku_external_pthread.h" +#include "../portability/toku_pthread.h" +#include "../portability/toku_time.h" +// PORT #include <ft/ft-ops.h> // just for DICTIONARY_ID.. 
+// PORT: ft-status for LTM_STATUS:
+#include "../ft/ft-status.h"
+
+struct DICTIONARY_ID {
+  uint64_t dictid;
+};
+
+#include "../util/omt.h"
+#include "range_buffer.h"
+#include "txnid_set.h"
+#include "wfg.h"
+
+namespace toku {
+
+class locktree;
+class locktree_manager;
+class lock_request;
+class concurrent_tree;
+
+typedef int (*lt_create_cb)(locktree *lt, void *extra);
+typedef void (*lt_destroy_cb)(locktree *lt);
+typedef void (*lt_escalate_cb)(TXNID txnid, const locktree *lt,
+                               const range_buffer &buffer, void *extra);
+
+typedef bool (*lt_escalation_barrier_check_func)(const DBT *a, const DBT *b,
+                                                 void *extra);
+
+struct lt_counters {
+  uint64_t wait_count, wait_time;
+  uint64_t long_wait_count, long_wait_time;
+  uint64_t timeout_count;
+
+  void add(const lt_counters &rhs) {
+    wait_count += rhs.wait_count;
+    wait_time += rhs.wait_time;
+    long_wait_count += rhs.long_wait_count;
+    long_wait_time += rhs.long_wait_time;
+    timeout_count += rhs.timeout_count;
+  }
+};
+
+// Lock request state for some locktree
+struct lt_lock_request_info {
+  omt<lock_request *> pending_lock_requests;
+  std::atomic_bool pending_is_empty;
+  toku_external_mutex_t mutex;
+  bool should_retry_lock_requests;
+  lt_counters counters;
+  std::atomic_ullong retry_want;
+  unsigned long long retry_done;
+  toku_mutex_t retry_mutex;
+  toku_cond_t retry_cv;
+  bool running_retry;
+
+  void init(toku_external_mutex_factory_t mutex_factory);
+  void destroy(void);
+};
+
+// The locktree manager manages a set of locktrees, one for each open
+// dictionary. Locktrees are retrieved from the manager. When they are no
+// longer needed, they are released by the user.
+class locktree_manager {
+ public:
+  // param: create_cb, called just after a locktree is first created.
+  //        destroy_cb, called just before a locktree is destroyed.
+  //        escalate_cb, called after a locktree is escalated (with extra
+  //        param)
+  void create(lt_create_cb create_cb, lt_destroy_cb destroy_cb,
+              lt_escalate_cb escalate_cb, void *extra,
+              toku_external_mutex_factory_t mutex_factory_arg);
+
+  void destroy(void);
+
+  size_t get_max_lock_memory(void);
+
+  int set_max_lock_memory(size_t max_lock_memory);
+
+  // effect: Get a locktree from the manager. If a locktree exists with the
+  //         given dict_id, it is referenced and then returned. If one did
+  //         not exist, it is created. It will use the comparator for
+  //         comparing keys. The on_create callback (passed to
+  //         locktree_manager::create()) will be called with the given extra
+  //         parameter.
+  locktree *get_lt(DICTIONARY_ID dict_id, const comparator &cmp,
+                   void *on_create_extra);
+
+  void reference_lt(locktree *lt);
+
+  // effect: Releases one reference on a locktree. If the reference count
+  //         transitions to zero, the on_destroy callback is called before
+  //         it gets destroyed.
+  void release_lt(locktree *lt);
+
+  void get_status(LTM_STATUS status);
+
+  // effect: calls the iterate function on each pending lock request
+  // note: holds the manager's mutex
+  typedef int (*lock_request_iterate_callback)(DICTIONARY_ID dict_id,
+                                               TXNID txnid,
+                                               const DBT *left_key,
+                                               const DBT *right_key,
+                                               TXNID blocking_txnid,
+                                               uint64_t start_time,
+                                               void *extra);
+  int iterate_pending_lock_requests(lock_request_iterate_callback cb,
+                                    void *extra);
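+
+  // Editorial sketch (not part of the original source): a typical embedder
+  // sizes the lock budget once at startup. Per the implementation below,
+  // set_max_lock_memory() fails with EDOM if the new limit is smaller than
+  // the lock memory currently in use; `mgr` is a placeholder.
+  //
+  //   int r = mgr->set_max_lock_memory(128ULL * 1024 * 1024);
+  //   invariant(r == 0 || r == EDOM);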
+
+  // effect: Determines if too many locks or too much memory is being used,
+  //         Runs escalation on the manager if so.
+  // param: big_txn, if the current transaction is 'big' (has spilled
+  //        rollback logs)
+  // returns: 0 if there are enough resources to create a new lock, or
+  //          TOKUDB_OUT_OF_LOCKS if there are not enough resources and lock
+  //          escalation failed to free up enough resources for a new lock.
+  int check_current_lock_constraints(bool big_txn);
+
+  bool over_big_threshold(void);
+
+  void note_mem_used(uint64_t mem_used);
+
+  void note_mem_released(uint64_t mem_freed);
+
+  bool out_of_locks(void) const;
+
+  // Escalate all locktrees
+  void escalate_all_locktrees(void);
+
+  // Escalate a set of locktrees
+  void escalate_locktrees(locktree **locktrees, int num_locktrees);
+
+  // effect: calls the private function run_escalation(), only ok to
+  //         do for tests.
+  // rationale: to get better stress test coverage, we want a way to
+  //            deterministically trigger lock escalation.
+  void run_escalation_for_test(void);
+  void run_escalation(void);
+
+  // Add time t to the escalator's wait time statistics
+  void add_escalator_wait_time(uint64_t t);
+
+  void kill_waiter(void *extra);
+
+ private:
+  static const uint64_t DEFAULT_MAX_LOCK_MEMORY = 64L * 1024 * 1024;
+
+  // tracks the current number of locks and lock memory
+  uint64_t m_max_lock_memory;
+  uint64_t m_current_lock_memory;
+
+  struct lt_counters m_lt_counters;
+
+  // the create and destroy callbacks for the locktrees
+  lt_create_cb m_lt_create_callback;
+  lt_destroy_cb m_lt_destroy_callback;
+  lt_escalate_cb m_lt_escalate_callback;
+  void *m_lt_escalate_callback_extra;
+
+  omt<locktree *> m_locktree_map;
+
+  toku_external_mutex_factory_t mutex_factory;
+
+  // the manager's mutex protects the locktree map
+  toku_mutex_t m_mutex;
+
+  void mutex_lock(void);
+
+  void mutex_unlock(void);
+
+  // Manage the set of open locktrees
+  locktree *locktree_map_find(const DICTIONARY_ID &dict_id);
+  void locktree_map_put(locktree *lt);
+  void locktree_map_remove(locktree *lt);
+
+  static int find_by_dict_id(locktree *const &lt, const DICTIONARY_ID &dict_id);
+
+  void escalator_init(void);
+  void escalator_destroy(void);
+
+  // statistics about lock escalation.
+  toku_mutex_t m_escalation_mutex;
+  uint64_t m_escalation_count;
+  tokutime_t m_escalation_time;
+  uint64_t m_escalation_latest_result;
+  uint64_t m_wait_escalation_count;
+  uint64_t m_wait_escalation_time;
+  uint64_t m_long_wait_escalation_count;
+  uint64_t m_long_wait_escalation_time;
+
+  // the escalator coordinates escalation on a set of locktrees for a bunch of
+  // threads
+  class locktree_escalator {
+   public:
+    void create(void);
+    void destroy(void);
+    void run(locktree_manager *mgr, void (*escalate_locktrees_fun)(void *extra),
+             void *extra);
+
+   private:
+    toku_mutex_t m_escalator_mutex;
+    toku_cond_t m_escalator_done;
+    bool m_escalator_running;
+  };
+
+  locktree_escalator m_escalator;
+
+  friend class manager_unit_test;
+};
+
+// A locktree represents the set of row locks owned by all transactions
+// over an open dictionary. Read and write ranges are represented as
+// a left and right key which are compared with the given comparator.
+//
+// Locktrees are not created and destroyed by the user. Instead, they are
+// referenced and released using the locktree manager.
+//
+// A sample workflow looks like this:
+// - Create a manager.
+// - Get a locktree by dictionary id from the manager.
+// - Perform read/write lock acquisition on the locktree, add references to
+//   the locktree using the manager, release locks, release references, etc.
+// - ...
+// - Release the final reference to the locktree. It will be destroyed.
+// - Destroy the manager.
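+//
+// A hedged sketch of that workflow (editorial, not part of the original
+// source; the comparator, mutex factory, keys and txnid values below are
+// placeholders):
+//
+//   locktree_manager mgr;
+//   mgr.create(nullptr, nullptr, nullptr, nullptr, mutex_factory);
+//   DICTIONARY_ID dict_id = {.dictid = 1};
+//   locktree *lt = mgr.get_lt(dict_id, cmp, nullptr);
+//
+//   txnid_set conflicts;
+//   conflicts.create();
+//   int r = lt->acquire_write_lock(txnid, left_key, right_key, &conflicts,
+//                                  false /* big_txn */);
+//   // ... do work, then release ranges via release_locks() ...
+//   conflicts.destroy();
+//
+//   mgr.release_lt(lt);  // final reference: lt is destroyed
+//   mgr.destroy();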
+class locktree {
+ public:
+  // effect: Creates a locktree
+  void create(locktree_manager *mgr, DICTIONARY_ID dict_id,
+              const comparator &cmp,
+              toku_external_mutex_factory_t mutex_factory);
+
+  void destroy(void);
+
+  // For thread-safe, external reference counting
+  void add_reference(void);
+
+  // requires: the reference count is > 0
+  // returns: the reference count, after decrementing it by one
+  uint32_t release_reference(void);
+
+  // returns: the current reference count
+  uint32_t get_reference_count(void);
+
+  // effect: Attempts to grant a read lock for the range of keys between
+  //         [left_key, right_key].
+  // returns: If the lock cannot be granted, return DB_LOCK_NOTGRANTED, and
+  //          populate the given conflicts set with the txnids that hold
+  //          conflicting locks in the range. If the locktree cannot create
+  //          more locks, return TOKUDB_OUT_OF_LOCKS.
+  // note: Read locks cannot be shared between txnids, as one would expect.
+  //       This is for simplicity since read locks are rare in MySQL.
+  int acquire_read_lock(TXNID txnid, const DBT *left_key, const DBT *right_key,
+                        txnid_set *conflicts, bool big_txn);
+
+  // effect: Attempts to grant a write lock for the range of keys between
+  //         [left_key, right_key].
+  // returns: If the lock cannot be granted, return DB_LOCK_NOTGRANTED, and
+  //          populate the given conflicts set with the txnids that hold
+  //          conflicting locks in the range. If the locktree cannot create
+  //          more locks, return TOKUDB_OUT_OF_LOCKS.
+  int acquire_write_lock(TXNID txnid, const DBT *left_key, const DBT *right_key,
+                         txnid_set *conflicts, bool big_txn);
+
+  // effect: populate the conflicts set with the txnids that would prevent
+  //         the given txnid from getting a lock on [left_key, right_key]
+  void get_conflicts(bool is_write_request, TXNID txnid, const DBT *left_key,
+                     const DBT *right_key, txnid_set *conflicts);
+
+  // effect: Release all of the lock ranges represented by the range buffer
+  //         for a txnid.
+  void release_locks(TXNID txnid, const range_buffer *ranges,
+                     bool all_trx_locks_hint = false);
+
+  // effect: Runs escalation on this locktree
+  void escalate(lt_escalate_cb after_escalate_callback, void *extra);
+
+  // returns: The userdata associated with this locktree, or null if it has
+  //          not been set.
+  void *get_userdata(void) const;
+
+  void set_userdata(void *userdata);
+
+  locktree_manager *get_manager(void) const;
+
+  void set_comparator(const comparator &cmp);
+
+  // Set the user-provided Lock Escalation Barrier check function and its
+  // argument
+  //
+  // Lock Escalation Barrier limits the scope of Lock Escalation.
+  // For two keys A and B (such that A < B),
+  // escalation_barrier_check_func(A, B)==true means that there's a lock
+  // escalation barrier between A and B, and lock escalation is not allowed to
+  // bridge the gap between A and B.
+  //
+  // This method sets the user-provided barrier check function and its
+  // parameter.
+  void set_escalation_barrier_func(lt_escalation_barrier_check_func func,
+                                   void *extra);
+
+  int compare(const locktree *lt) const;
+
+  DICTIONARY_ID get_dict_id() const;
+
+  // Private info struct for storing pending lock request state.
+  // Only to be used by lock requests. We store it here as
+  // something less opaque than usual to strike a tradeoff between
+  // abstraction and code complexity. It is still fairly abstract
+  // since the lock_request object is opaque.
+  struct lt_lock_request_info *get_lock_request_info(void);
+
+  typedef void (*dump_callback)(void *cdata, const DBT *left, const DBT *right,
+                                TXNID txnid, bool is_shared,
+                                TxnidVector *owners);
+  void dump_locks(void *cdata, dump_callback cb);
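+
+  // Editorial sketch (not part of the original source): a minimal
+  // dump_callback that counts locks; `cdata` carries the counter. For a
+  // shared lock with multiple holders, `owners` is non-null and `txnid`
+  // alone does not identify every holder.
+  //
+  //   static void count_locks_cb(void *cdata, const DBT *left,
+  //                              const DBT *right, TXNID txnid,
+  //                              bool is_shared, TxnidVector *owners) {
+  //     (void)left; (void)right; (void)txnid; (void)is_shared; (void)owners;
+  //     ++*static_cast<size_t *>(cdata);
+  //   }
+  //   size_t n = 0;
+  //   lt->dump_locks(&n, count_locks_cb);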
+
+ private:
+  locktree_manager *m_mgr;
+  DICTIONARY_ID m_dict_id;
+  uint32_t m_reference_count;
+
+  // Since the memory referenced by this comparator is not owned by the
+  // locktree, the user must guarantee it will outlive the locktree.
+  //
+  // The ydb API accomplishes this by opening an ft_handle in the on_create
+  // callback, which will keep the underlying FT (and its descriptor) in memory
+  // for as long as the handle is open. The ft_handle is stored opaquely in the
+  // userdata pointer below. see locktree_manager::get_lt w/ on_create_extra
+  comparator m_cmp;
+
+  lt_escalation_barrier_check_func m_escalation_barrier;
+  void *m_escalation_barrier_arg;
+
+  concurrent_tree *m_rangetree;
+
+  void *m_userdata;
+  struct lt_lock_request_info m_lock_request_info;
+
+  // psergey-todo:
+  // Each transaction also keeps a list of ranges it has locked.
+  // So, when a transaction is running in STO mode, two identical
+  // lists are kept: the STO lock list and transaction's owned locks
+  // list. Why can't we do with just one list?
+
+  // The following fields and members prefixed with "sto_" are for
+  // the single txnid optimization, intended to speed up the case
+  // when only one transaction is using the locktree. If we know
+  // the locktree has only one transaction, then acquiring locks
+  // takes O(1) work and releasing all locks takes O(1) work.
+  //
+  // How do we know that the locktree only has a single txnid?
+  // What do we do if it does?
+  //
+  // When a txn with txnid T requests a lock:
+  // - If the tree is empty, the optimization is possible. Set the single
+  //   txnid to T, and insert the lock range into the buffer.
+  // - If the tree is not empty, check if the single txnid is T. If so,
+  //   append the lock range to the buffer. Otherwise, migrate all of
+  //   the locks in the buffer into the rangetree on behalf of txnid T,
+  //   and invalidate the single txnid.
+  //
+  // When a txn with txnid T releases its locks:
+  // - If the single txnid is valid, it must be for T. Destroy the buffer.
+  // - If it's not valid, release locks the normal way in the rangetree.
+  //
+  // To carry out the optimization we need to record a single txnid
+  // and a range buffer for each locktree, each protected by the root
+  // lock of the locktree's rangetree. The root lock for a rangetree
+  // is grabbed by preparing a locked keyrange on the rangetree.
+  TXNID m_sto_txnid;
+  range_buffer m_sto_buffer;
+
+  // The single txnid optimization speeds up the case when only one
+  // transaction is using the locktree. But it has the potential to
+  // hurt the case when more than one txnid exists.
+  //
+  // There are two things we need to do to make the optimization only
+  // optimize the case we care about, and not hurt the general case.
+  //
+  // Bound the worst-case latency for lock migration when the
+  // optimization stops working:
+  // - Idea: Stop the optimization and migrate immediately if we notice
+  //   the single txnid has taken many locks in the range buffer.
+  // - Implementation: Enforce a max size on the single txnid range buffer.
+  // - Analysis: Choosing the perfect max value, M, is difficult to do
+  //   without some feedback from the field.
+  //   Intuition tells us that M should
+  //   not be so small that the optimization is worthless, and it should not
+  //   be so big that it's unreasonable to have to wait behind a thread doing
+  //   the work of converting M buffer locks into rangetree locks.
+  //
+  // Prevent concurrent-transaction workloads from trying the optimization
+  // in vain:
+  // - Idea: Don't even bother trying the optimization if we think the
+  //   system is in a concurrent-transaction state.
+  // - Implementation: Do something even simpler than detecting whether the
+  //   system is in a concurrent-transaction state. Just keep a "score" value
+  //   and some threshold. If at any time the locktree is eligible for the
+  //   optimization, only do it if the score is at this threshold. When you
+  //   actually do the optimization but someone has to migrate locks in the
+  //   buffer (expensive), then reset the score back to zero. Each time a txn
+  //   releases locks, the score is incremented by 1.
+  // - Analysis: If you let the threshold be "C", then at most 1 / C txns will
+  //   do the optimization in vain in a concurrent-transaction system.
+  //   Similarly, it takes at most C txns to start using the single txnid
+  //   optimization, which is good when the system transitions from
+  //   multithreaded to single threaded.
+  //
+  // STO_BUFFER_MAX_SIZE:
+  //
+  // We choose the max value to be 1 million since most transactions are
+  // smaller than 1 million and we can create a rangetree of 1 million
+  // elements in less than a second. So we can be pretty confident that this
+  // threshold enables the optimization almost always, and prevents super
+  // pathological latency issues for the first lock taken by a second thread.
+  //
+  // STO_SCORE_THRESHOLD:
+  //
+  // A simple first guess at a good value for the score threshold is 100.
+  // By our analysis, we'd end up doing the optimization in vain for
+  // around 1% of all transactions, which seems reasonable. Further,
+  // if the system goes single threaded, it ought to be pretty quick
+  // for 100 transactions to go by, so we won't have to wait long before
+  // we start doing the single txnid optimization again.
+  static const int STO_BUFFER_MAX_SIZE = 50 * 1024;
+  static const int STO_SCORE_THRESHOLD = 100;
+  int m_sto_score;
+
+  // statistics about time spent ending the STO early
+  uint64_t m_sto_end_early_count;
+  tokutime_t m_sto_end_early_time;
+
+  // effect: begins the single txnid optimization, setting m_sto_txnid
+  //         to the given txnid.
+  // requires: m_sto_txnid is invalid
+  void sto_begin(TXNID txnid);
+
+  // effect: append a range to the sto buffer
+  // requires: m_sto_txnid is valid
+  void sto_append(const DBT *left_key, const DBT *right_key,
+                  bool is_write_request);
+
+  // effect: ends the single txnid optimization, releasing any memory
+  //         stored in the sto buffer, notifying the tracker, and
+  //         invalidating m_sto_txnid.
+  // requires: m_sto_txnid is valid
+  void sto_end(void);
+
+  // params: prepared_lkr is a void * to a prepared locked keyrange. see below.
+  // effect: ends the single txnid optimization early, migrating buffer locks
+  //         into the rangetree, calling sto_end(), and then setting the
+  //         sto_score back to zero.
+  // requires: m_sto_txnid is valid
+  void sto_end_early(void *prepared_lkr);
+  void sto_end_early_no_accounting(void *prepared_lkr);
+
+  // params: prepared_lkr is a void * to a prepared locked keyrange.
we can't + // use + // the real type because the compiler won't allow us to forward + // declare concurrent_tree::locked_keyrange without including + // concurrent_tree.h, which we cannot do here because it is a template + // implementation. + // requires: the prepared locked keyrange is for the locktree's rangetree + // requires: m_sto_txnid is valid + // effect: migrates each lock in the single txnid buffer into the locktree's + // rangetree, notifying the memory tracker as necessary. + void sto_migrate_buffer_ranges_to_tree(void *prepared_lkr); + + // effect: If m_sto_txnid is valid, then release the txnid's locks + // by ending the optimization. + // requires: If m_sto_txnid is valid, it is equal to the given txnid + // returns: True if locks were released for this txnid + bool sto_try_release(TXNID txnid); + + // params: prepared_lkr is a void * to a prepared locked keyrange. see above. + // requires: the prepared locked keyrange is for the locktree's rangetree + // effect: If m_sto_txnid is valid and equal to the given txnid, then + // append a range onto the buffer. Otherwise, if m_sto_txnid is valid + // but not equal to this txnid, then migrate the buffer's locks + // into the rangetree and end the optimization, setting the score + // back to zero. + // returns: true if the lock was acquired for this txnid + bool sto_try_acquire(void *prepared_lkr, TXNID txnid, const DBT *left_key, + const DBT *right_key, bool is_write_request); + + // Effect: + // Provides a hook for a helgrind suppression. + // Returns: + // true if m_sto_txnid is not TXNID_NONE + bool sto_txnid_is_valid_unsafe(void) const; + + // Effect: + // Provides a hook for a helgrind suppression. + // Returns: + // m_sto_score + int sto_get_score_unsafe(void) const; + + void remove_overlapping_locks_for_txnid(TXNID txnid, const DBT *left_key, + const DBT *right_key); + + int acquire_lock_consolidated(void *prepared_lkr, TXNID txnid, + const DBT *left_key, const DBT *right_key, + bool is_write_request, txnid_set *conflicts); + + int acquire_lock(bool is_write_request, TXNID txnid, const DBT *left_key, + const DBT *right_key, txnid_set *conflicts); + + int try_acquire_lock(bool is_write_request, TXNID txnid, const DBT *left_key, + const DBT *right_key, txnid_set *conflicts, + bool big_txn); + + friend class locktree_unit_test; + friend class manager_unit_test; + friend class lock_request_unit_test; + + // engine status reaches into the locktree to read some stats + friend void locktree_manager::get_status(LTM_STATUS status); +}; + +} /* namespace toku */ diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc new file mode 100644 index 000000000..4186182be --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc @@ -0,0 +1,527 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ifndef ROCKSDB_LITE +#ifndef OS_WIN +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. 
+
+  PerconaFT is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+  PerconaFT is free software: you can redistribute it and/or modify
+  it under the terms of the GNU Affero General Public License, version 3,
+  as published by the Free Software Foundation.
+
+  PerconaFT is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+  GNU Affero General Public License for more details.
+
+  You should have received a copy of the GNU Affero General Public License
+  along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "../portability/toku_pthread.h"
+#include "../util/status.h"
+#include "lock_request.h"
+#include "locktree.h"
+
+namespace toku {
+
+void locktree_manager::create(lt_create_cb create_cb, lt_destroy_cb destroy_cb,
+                              lt_escalate_cb escalate_cb, void *escalate_extra,
+                              toku_external_mutex_factory_t mutex_factory_arg) {
+  mutex_factory = mutex_factory_arg;
+  m_max_lock_memory = DEFAULT_MAX_LOCK_MEMORY;
+  m_current_lock_memory = 0;
+
+  m_locktree_map.create();
+  m_lt_create_callback = create_cb;
+  m_lt_destroy_callback = destroy_cb;
+  m_lt_escalate_callback = escalate_cb;
+  m_lt_escalate_callback_extra = escalate_extra;
+  ZERO_STRUCT(m_mutex);
+  toku_mutex_init(manager_mutex_key, &m_mutex, nullptr);
+
+  ZERO_STRUCT(m_lt_counters);
+
+  escalator_init();
+}
+
+void locktree_manager::destroy(void) {
+  escalator_destroy();
+  invariant(m_current_lock_memory == 0);
+  invariant(m_locktree_map.size() == 0);
+  m_locktree_map.destroy();
+  toku_mutex_destroy(&m_mutex);
+}
+
+void locktree_manager::mutex_lock(void) { toku_mutex_lock(&m_mutex); }
+
+void locktree_manager::mutex_unlock(void) { toku_mutex_unlock(&m_mutex); }
+
+size_t locktree_manager::get_max_lock_memory(void) { return m_max_lock_memory; }
+
+int locktree_manager::set_max_lock_memory(size_t max_lock_memory) {
+  int r = 0;
+  mutex_lock();
+  if (max_lock_memory < m_current_lock_memory) {
+    r = EDOM;
+  } else {
+    m_max_lock_memory = max_lock_memory;
+  }
+  mutex_unlock();
+  return r;
+}
+
+int locktree_manager::find_by_dict_id(locktree *const &lt,
+                                      const DICTIONARY_ID &dict_id) {
+  if (lt->get_dict_id().dictid < dict_id.dictid) {
+    return -1;
+  } else if (lt->get_dict_id().dictid == dict_id.dictid) {
+    return 0;
+  } else {
+    return 1;
+  }
+}
+
+locktree *locktree_manager::locktree_map_find(const DICTIONARY_ID &dict_id) {
+  locktree *lt;
+  int r = m_locktree_map.find_zero<DICTIONARY_ID, find_by_dict_id>(dict_id,
+                                                                   &lt, nullptr);
+  return r == 0 ? lt : nullptr;
+}
+
+void locktree_manager::locktree_map_put(locktree *lt) {
+  int r = m_locktree_map.insert<DICTIONARY_ID, find_by_dict_id>(
+      lt, lt->get_dict_id(), nullptr);
+  invariant_zero(r);
+}
+
+void locktree_manager::locktree_map_remove(locktree *lt) {
+  uint32_t idx;
+  locktree *found_lt;
+  int r = m_locktree_map.find_zero<DICTIONARY_ID, find_by_dict_id>(
+      lt->get_dict_id(), &found_lt, &idx);
+  invariant_zero(r);
+  invariant(found_lt == lt);
+  r = m_locktree_map.delete_at(idx);
+  invariant_zero(r);
+}
+
+locktree *locktree_manager::get_lt(DICTIONARY_ID dict_id, const comparator &cmp,
+                                   void *on_create_extra) {
+  // hold the mutex around searching and maybe
+  // inserting into the locktree map
+  mutex_lock();
+
+  locktree *lt = locktree_map_find(dict_id);
+  if (lt == nullptr) {
+    XCALLOC(lt);
+    lt->create(this, dict_id, cmp, mutex_factory);
+
+    // new locktree created - call the on_create callback
+    // and put it in the locktree map
+    if (m_lt_create_callback) {
+      int r = m_lt_create_callback(lt, on_create_extra);
+      if (r != 0) {
+        lt->release_reference();
+        lt->destroy();
+        toku_free(lt);
+        lt = nullptr;
+      }
+    }
+    if (lt) {
+      locktree_map_put(lt);
+    }
+  } else {
+    reference_lt(lt);
+  }
+
+  mutex_unlock();
+
+  return lt;
+}
+
+void locktree_manager::reference_lt(locktree *lt) {
+  // increment using a sync fetch and add.
+  // the caller guarantees that the lt won't be
+  // destroyed while we increment the count here.
+  //
+  // the caller can do this by already having an lt
+  // reference or by holding the manager mutex.
+  //
+  // if the manager's mutex is held, it is ok for the
+  // reference count to transition from 0 to 1 (no race),
+  // since we're serialized with other opens and closes.
+  lt->add_reference();
+}
+
+void locktree_manager::release_lt(locktree *lt) {
+  bool do_destroy = false;
+  DICTIONARY_ID dict_id = lt->get_dict_id();
+
+  // Release a reference on the locktree. If the count transitions to zero,
+  // then we *may* need to do the cleanup.
+  //
+  // Grab the manager's mutex and look for a locktree with this locktree's
+  // dictionary id. Since dictionary id's never get reused, any locktree
+  // found must be the one we just released a reference on.
+  //
+  // At least two things could have happened since we got the mutex:
+  // - Another thread gets a locktree with the same dict_id, increments
+  //   the reference count. In this case, we shouldn't destroy it.
+  // - Another thread gets a locktree with the same dict_id and then
+  //   releases it quickly, transitioning the reference count from zero to
+  //   one and back to zero. In this case, only one of us should destroy it.
+  //   It doesn't matter which. We originally missed this case, see #5776.
+  //
+  // After 5776, the high level rule for release is described below.
+  //
+  // If a thread releases a locktree and notices the reference count transition
+  // to zero, then that thread must immediately:
+  // - assume the locktree object is invalid
+  // - grab the manager's mutex
+  // - search the locktree map for a locktree with the same dict_id and remove
+  //   it, if it exists. the destroy may be deferred.
+  // - release the manager's mutex
+  //
+  // This way, if many threads transition the same locktree's reference count
+  // from 1 to zero and wait behind the manager's mutex, only one of them will
+  // do the actual destroy and the others will happily do nothing.
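+  //
+  // Editorial sketch (not part of the original source): the expected
+  // pairing is one release_lt() per successful get_lt() or reference_lt();
+  // mgr, dict_id and cmp below are placeholders.
+  //
+  //   locktree *lt = mgr->get_lt(dict_id, cmp, nullptr);  // ref becomes 1
+  //   // ... use lt ...
+  //   mgr->release_lt(lt);  // ref drops to 0; lt may be destroyed here
+  //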
+  uint32_t refs = lt->release_reference();
+  if (refs == 0) {
+    mutex_lock();
+    // lt may have already been destroyed by another thread, so look it up.
+    locktree *find_lt = locktree_map_find(dict_id);
+    if (find_lt != nullptr) {
+      // A locktree is still in the map with that dict_id, so it must be
+      // equal to lt. This is true because dictionary ids are never reused.
+      // If the reference count is zero, it's our responsibility to remove
+      // it and do the destroy. Otherwise, someone still wants it.
+      // If the locktree is still valid then check if it should be deleted.
+      if (find_lt == lt) {
+        if (lt->get_reference_count() == 0) {
+          locktree_map_remove(lt);
+          do_destroy = true;
+        }
+        m_lt_counters.add(lt->get_lock_request_info()->counters);
+      }
+    }
+    mutex_unlock();
+  }
+
+  // if necessary, do the destroy without holding the mutex
+  if (do_destroy) {
+    if (m_lt_destroy_callback) {
+      m_lt_destroy_callback(lt);
+    }
+    lt->destroy();
+    toku_free(lt);
+  }
+}
+
+void locktree_manager::run_escalation(void) {
+  struct escalation_fn {
+    static void run(void *extra) {
+      locktree_manager *mgr = (locktree_manager *)extra;
+      mgr->escalate_all_locktrees();
+    };
+  };
+  m_escalator.run(this, escalation_fn::run, this);
+}
+
+// test-only version of lock escalation
+void locktree_manager::run_escalation_for_test(void) { run_escalation(); }
+
+void locktree_manager::escalate_all_locktrees(void) {
+  uint64_t t0 = toku_current_time_microsec();
+
+  // get all locktrees
+  mutex_lock();
+  int num_locktrees = m_locktree_map.size();
+  locktree **locktrees = new locktree *[num_locktrees];
+  for (int i = 0; i < num_locktrees; i++) {
+    int r = m_locktree_map.fetch(i, &locktrees[i]);
+    invariant_zero(r);
+    reference_lt(locktrees[i]);
+  }
+  mutex_unlock();
+
+  // escalate them
+  escalate_locktrees(locktrees, num_locktrees);
+
+  delete[] locktrees;
+
+  uint64_t t1 = toku_current_time_microsec();
+  add_escalator_wait_time(t1 - t0);
+}
+
+void locktree_manager::note_mem_used(uint64_t mem_used) {
+  (void)toku_sync_fetch_and_add(&m_current_lock_memory, mem_used);
+}
+
+void locktree_manager::note_mem_released(uint64_t mem_released) {
+  uint64_t old_mem_used =
+      toku_sync_fetch_and_sub(&m_current_lock_memory, mem_released);
+  invariant(old_mem_used >= mem_released);
+}
+
+bool locktree_manager::out_of_locks(void) const {
+  return m_current_lock_memory >= m_max_lock_memory;
+}
+
+bool locktree_manager::over_big_threshold(void) {
+  return m_current_lock_memory >= m_max_lock_memory / 2;
+}
+
+int locktree_manager::iterate_pending_lock_requests(
+    lock_request_iterate_callback callback, void *extra) {
+  mutex_lock();
+  int r = 0;
+  uint32_t num_locktrees = m_locktree_map.size();
+  for (uint32_t i = 0; i < num_locktrees && r == 0; i++) {
+    locktree *lt;
+    r = m_locktree_map.fetch(i, &lt);
+    invariant_zero(r);
+    if (r == EINVAL)  // Shouldn't happen, avoid compiler warning
+      continue;
+
+    struct lt_lock_request_info *info = lt->get_lock_request_info();
+    toku_external_mutex_lock(&info->mutex);
+
+    uint32_t num_requests = info->pending_lock_requests.size();
+    for (uint32_t k = 0; k < num_requests && r == 0; k++) {
+      lock_request *req;
+      r = info->pending_lock_requests.fetch(k, &req);
+      invariant_zero(r);
+      if (r == EINVAL) /* Shouldn't happen, avoid compiler warning */
+        continue;
+      r = callback(lt->get_dict_id(), req->get_txnid(), req->get_left_key(),
+                   req->get_right_key(), req->get_conflicting_txnid(),
+                   req->get_start_time(), extra);
+    }
+
+    toku_external_mutex_unlock(&info->mutex);
+  }
+  mutex_unlock();
+  return r;
+}
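+
+// Editorial sketch (not part of the original source): a minimal
+// lock_request_iterate_callback that counts pending requests; `extra`
+// carries the counter. Returning nonzero from the callback stops the
+// iteration, per the loop conditions above; `mgr` is a placeholder.
+//
+//   static int count_pending_cb(DICTIONARY_ID dict_id, TXNID txnid,
+//                               const DBT *left_key, const DBT *right_key,
+//                               TXNID blocking_txnid, uint64_t start_time,
+//                               void *extra) {
+//     (void)dict_id; (void)txnid; (void)left_key; (void)right_key;
+//     (void)blocking_txnid; (void)start_time;
+//     ++*static_cast<uint64_t *>(extra);
+//     return 0;
+//   }
+//   uint64_t n = 0;
+//   mgr->iterate_pending_lock_requests(count_pending_cb, &n);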
+
+int locktree_manager::check_current_lock_constraints(bool big_txn) {
+  int r = 0;
+  if (big_txn && over_big_threshold()) {
+    run_escalation();
+    if (over_big_threshold()) {
+      r = TOKUDB_OUT_OF_LOCKS;
+    }
+  }
+  if (r == 0 && out_of_locks()) {
+    run_escalation();
+    if (out_of_locks()) {
+      // return an error if we're still out of locks after escalation.
+      r = TOKUDB_OUT_OF_LOCKS;
+    }
+  }
+  return r;
+}
+
+void locktree_manager::escalator_init(void) {
+  ZERO_STRUCT(m_escalation_mutex);
+  toku_mutex_init(manager_escalation_mutex_key, &m_escalation_mutex, nullptr);
+  m_escalation_count = 0;
+  m_escalation_time = 0;
+  m_wait_escalation_count = 0;
+  m_wait_escalation_time = 0;
+  m_long_wait_escalation_count = 0;
+  m_long_wait_escalation_time = 0;
+  m_escalation_latest_result = 0;
+  m_escalator.create();
+}
+
+void locktree_manager::escalator_destroy(void) {
+  m_escalator.destroy();
+  toku_mutex_destroy(&m_escalation_mutex);
+}
+
+void locktree_manager::add_escalator_wait_time(uint64_t t) {
+  toku_mutex_lock(&m_escalation_mutex);
+  m_wait_escalation_count += 1;
+  m_wait_escalation_time += t;
+  if (t >= 1000000) {
+    m_long_wait_escalation_count += 1;
+    m_long_wait_escalation_time += t;
+  }
+  toku_mutex_unlock(&m_escalation_mutex);
+}
+
+void locktree_manager::escalate_locktrees(locktree **locktrees,
+                                          int num_locktrees) {
+  // there are too many row locks in the system and we need to tidy up.
+  //
+  // a simple implementation of escalation does not attempt
+  // to reduce the memory footprint of each txn's range buffer.
+  // doing so would require some layering hackery (or a callback)
+  // and more complicated locking. for now, just escalate each
+  // locktree individually, in-place.
+  tokutime_t t0 = toku_time_now();
+  for (int i = 0; i < num_locktrees; i++) {
+    locktrees[i]->escalate(m_lt_escalate_callback,
+                           m_lt_escalate_callback_extra);
+    release_lt(locktrees[i]);
+  }
+  tokutime_t t1 = toku_time_now();
+
+  toku_mutex_lock(&m_escalation_mutex);
+  m_escalation_count++;
+  m_escalation_time += (t1 - t0);
+  m_escalation_latest_result = m_current_lock_memory;
+  toku_mutex_unlock(&m_escalation_mutex);
+}
+
+struct escalate_args {
+  locktree_manager *mgr;
+  locktree **locktrees;
+  int num_locktrees;
+};
+
+void locktree_manager::locktree_escalator::create(void) {
+  ZERO_STRUCT(m_escalator_mutex);
+  toku_mutex_init(manager_escalator_mutex_key, &m_escalator_mutex, nullptr);
+  toku_cond_init(manager_m_escalator_done_key, &m_escalator_done, nullptr);
+  m_escalator_running = false;
+}
+
+void locktree_manager::locktree_escalator::destroy(void) {
+  toku_cond_destroy(&m_escalator_done);
+  toku_mutex_destroy(&m_escalator_mutex);
+}
+
+void locktree_manager::locktree_escalator::run(
+    locktree_manager *mgr, void (*escalate_locktrees_fun)(void *extra),
+    void *extra) {
+  uint64_t t0 = toku_current_time_microsec();
+  toku_mutex_lock(&m_escalator_mutex);
+  if (!m_escalator_running) {
+    // run escalation on this thread
+    m_escalator_running = true;
+    toku_mutex_unlock(&m_escalator_mutex);
+    escalate_locktrees_fun(extra);
+    toku_mutex_lock(&m_escalator_mutex);
+    m_escalator_running = false;
+    toku_cond_broadcast(&m_escalator_done);
+  } else {
+    toku_cond_wait(&m_escalator_done, &m_escalator_mutex);
+  }
+  toku_mutex_unlock(&m_escalator_mutex);
+  uint64_t t1 = toku_current_time_microsec();
+  mgr->add_escalator_wait_time(t1 - t0);
+}
+
+void locktree_manager::get_status(LTM_STATUS statp) {
+  ltm_status.init();
+  LTM_STATUS_VAL(LTM_SIZE_CURRENT) = m_current_lock_memory;
+  LTM_STATUS_VAL(LTM_SIZE_LIMIT) = m_max_lock_memory;
+  LTM_STATUS_VAL(LTM_ESCALATION_COUNT) = m_escalation_count;
+  LTM_STATUS_VAL(LTM_ESCALATION_TIME) = m_escalation_time;
+  LTM_STATUS_VAL(LTM_ESCALATION_LATEST_RESULT) = m_escalation_latest_result;
+  LTM_STATUS_VAL(LTM_WAIT_ESCALATION_COUNT) = m_wait_escalation_count;
+  LTM_STATUS_VAL(LTM_WAIT_ESCALATION_TIME) = m_wait_escalation_time;
+  LTM_STATUS_VAL(LTM_LONG_WAIT_ESCALATION_COUNT) = m_long_wait_escalation_count;
+  LTM_STATUS_VAL(LTM_LONG_WAIT_ESCALATION_TIME) = m_long_wait_escalation_time;
+
+  uint64_t lock_requests_pending = 0;
+  uint64_t sto_num_eligible = 0;
+  uint64_t sto_end_early_count = 0;
+  tokutime_t sto_end_early_time = 0;
+  uint32_t num_locktrees = 0;
+  struct lt_counters lt_counters;
+  ZERO_STRUCT(lt_counters);  // PORT: instead of ={}.
+
+  if (toku_mutex_trylock(&m_mutex) == 0) {
+    lt_counters = m_lt_counters;
+    num_locktrees = m_locktree_map.size();
+    for (uint32_t i = 0; i < num_locktrees; i++) {
+      locktree *lt;
+      int r = m_locktree_map.fetch(i, &lt);
+      invariant_zero(r);
+      if (r == EINVAL)  // Shouldn't happen, avoid compiler warning
+        continue;
+      if (toku_external_mutex_trylock(&lt->m_lock_request_info.mutex) == 0) {
+        lock_requests_pending +=
+            lt->m_lock_request_info.pending_lock_requests.size();
+        lt_counters.add(lt->get_lock_request_info()->counters);
+        toku_external_mutex_unlock(&lt->m_lock_request_info.mutex);
+      }
+      sto_num_eligible += lt->sto_txnid_is_valid_unsafe() ? 1 : 0;
+      sto_end_early_count += lt->m_sto_end_early_count;
+      sto_end_early_time += lt->m_sto_end_early_time;
+    }
+    mutex_unlock();
+  }
+
+  LTM_STATUS_VAL(LTM_NUM_LOCKTREES) = num_locktrees;
+  LTM_STATUS_VAL(LTM_LOCK_REQUESTS_PENDING) = lock_requests_pending;
+  LTM_STATUS_VAL(LTM_STO_NUM_ELIGIBLE) = sto_num_eligible;
+  LTM_STATUS_VAL(LTM_STO_END_EARLY_COUNT) = sto_end_early_count;
+  LTM_STATUS_VAL(LTM_STO_END_EARLY_TIME) = sto_end_early_time;
+  LTM_STATUS_VAL(LTM_WAIT_COUNT) = lt_counters.wait_count;
+  LTM_STATUS_VAL(LTM_WAIT_TIME) = lt_counters.wait_time;
+  LTM_STATUS_VAL(LTM_LONG_WAIT_COUNT) = lt_counters.long_wait_count;
+  LTM_STATUS_VAL(LTM_LONG_WAIT_TIME) = lt_counters.long_wait_time;
+  LTM_STATUS_VAL(LTM_TIMEOUT_COUNT) = lt_counters.timeout_count;
+  *statp = ltm_status;
+}
+
+void locktree_manager::kill_waiter(void *extra) {
+  mutex_lock();
+  int r = 0;
+  uint32_t num_locktrees = m_locktree_map.size();
+  for (uint32_t i = 0; i < num_locktrees; i++) {
+    locktree *lt;
+    r = m_locktree_map.fetch(i, &lt);
+    invariant_zero(r);
+    if (r) continue;  // Get rid of "may be used uninitialized" warning
+    lock_request::kill_waiter(lt, extra);
+  }
+  mutex_unlock();
+}
+
+} /* namespace toku */
+#endif  // OS_WIN
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc
new file mode 100644
index 000000000..1e1d23ef8
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc
@@ -0,0 +1,265 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+  PerconaFT is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2,
+  as published by the Free Software Foundation.
+ + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include "range_buffer.h" + +#include <string.h> + +#include "../portability/memory.h" +#include "../util/dbt.h" + +namespace toku { + +bool range_buffer::record_header::left_is_infinite(void) const { + return left_neg_inf || left_pos_inf; +} + +bool range_buffer::record_header::right_is_infinite(void) const { + return right_neg_inf || right_pos_inf; +} + +void range_buffer::record_header::init(const DBT *left_key, + const DBT *right_key, + bool is_exclusive) { + is_exclusive_lock = is_exclusive; + left_neg_inf = left_key == toku_dbt_negative_infinity(); + left_pos_inf = left_key == toku_dbt_positive_infinity(); + left_key_size = toku_dbt_is_infinite(left_key) ? 0 : left_key->size; + if (right_key) { + right_neg_inf = right_key == toku_dbt_negative_infinity(); + right_pos_inf = right_key == toku_dbt_positive_infinity(); + right_key_size = toku_dbt_is_infinite(right_key) ? 
0 : right_key->size; + } else { + right_neg_inf = left_neg_inf; + right_pos_inf = left_pos_inf; + right_key_size = 0; + } +} + +const DBT *range_buffer::iterator::record::get_left_key(void) const { + if (_header.left_neg_inf) { + return toku_dbt_negative_infinity(); + } else if (_header.left_pos_inf) { + return toku_dbt_positive_infinity(); + } else { + return &_left_key; + } +} + +const DBT *range_buffer::iterator::record::get_right_key(void) const { + if (_header.right_neg_inf) { + return toku_dbt_negative_infinity(); + } else if (_header.right_pos_inf) { + return toku_dbt_positive_infinity(); + } else { + return &_right_key; + } +} + +size_t range_buffer::iterator::record::size(void) const { + return sizeof(record_header) + _header.left_key_size + _header.right_key_size; +} + +void range_buffer::iterator::record::deserialize(const char *buf) { + size_t current = 0; + + // deserialize the header + memcpy(&_header, buf, sizeof(record_header)); + current += sizeof(record_header); + + // deserialize the left key if necessary + if (!_header.left_is_infinite()) { + // point the left DBT's buffer into ours + toku_fill_dbt(&_left_key, buf + current, _header.left_key_size); + current += _header.left_key_size; + } + + // deserialize the right key if necessary + if (!_header.right_is_infinite()) { + if (_header.right_key_size == 0) { + toku_copyref_dbt(&_right_key, _left_key); + } else { + toku_fill_dbt(&_right_key, buf + current, _header.right_key_size); + } + } +} + +toku::range_buffer::iterator::iterator() + : _ma_chunk_iterator(nullptr), + _current_chunk_base(nullptr), + _current_chunk_offset(0), + _current_chunk_max(0), + _current_rec_size(0) {} + +toku::range_buffer::iterator::iterator(const range_buffer *buffer) + : _ma_chunk_iterator(&buffer->_arena), + _current_chunk_base(nullptr), + _current_chunk_offset(0), + _current_chunk_max(0), + _current_rec_size(0) { + reset_current_chunk(); +} + +void range_buffer::iterator::reset_current_chunk() { + _current_chunk_base = _ma_chunk_iterator.current(&_current_chunk_max); + _current_chunk_offset = 0; +} + +bool range_buffer::iterator::current(record *rec) { + if (_current_chunk_offset < _current_chunk_max) { + const char *buf = reinterpret_cast<const char *>(_current_chunk_base); + rec->deserialize(buf + _current_chunk_offset); + _current_rec_size = rec->size(); + return true; + } else { + return false; + } +} + +// move the iterator to the next record in the buffer +void range_buffer::iterator::next(void) { + invariant(_current_chunk_offset < _current_chunk_max); + invariant(_current_rec_size > 0); + + // the next record is _current_rec_size bytes forward + _current_chunk_offset += _current_rec_size; + // now, we don't know how big the current is, set it to 0. + _current_rec_size = 0; + + if (_current_chunk_offset >= _current_chunk_max) { + // current chunk is exhausted, try moving to the next one + if (_ma_chunk_iterator.more()) { + _ma_chunk_iterator.next(); + reset_current_chunk(); + } + } +} + +void range_buffer::create(void) { + // allocate buffer space lazily instead of on creation. this way, + // no malloc/free is done if the transaction ends up taking no locks. + _arena.create(0); + _num_ranges = 0; +} + +void range_buffer::append(const DBT *left_key, const DBT *right_key, + bool is_write_request) { + // if the keys are equal, then only one copy is stored. 
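The record layout that deserialize() walks above is just a fixed header followed by the raw key bytes, with the right key omitted entirely in the point case. A minimal standalone sketch of that layout and its round trip, using hypothetical simplified types (demo_header is not the real record_header, which also carries the infinity and exclusivity flags):

    #include <cassert>
    #include <cstdint>
    #include <cstring>
    #include <string>
    #include <vector>

    // Simplified stand-in for record_header: a fixed-size header followed by
    // variable-length key payloads, mirroring the layout deserialize() expects.
    struct demo_header {
      uint16_t left_size;
      uint16_t right_size;  // 0 means "same as left" for point ranges
    };

    static void demo_append(std::vector<char> *buf, const std::string &l,
                            const std::string &r) {
      demo_header h{static_cast<uint16_t>(l.size()),
                    static_cast<uint16_t>(l == r ? 0 : r.size())};
      buf->insert(buf->end(), reinterpret_cast<char *>(&h),
                  reinterpret_cast<char *>(&h) + sizeof(h));
      buf->insert(buf->end(), l.begin(), l.end());
      if (h.right_size) buf->insert(buf->end(), r.begin(), r.end());
    }

    int main() {
      std::vector<char> buf;
      demo_append(&buf, "a", "m");  // a real range: both keys stored
      demo_append(&buf, "x", "x");  // a point range: one copy stored
      size_t off = 0;
      while (off < buf.size()) {
        demo_header h;
        memcpy(&h, buf.data() + off, sizeof(h));
        std::string left(buf.data() + off + sizeof(h), h.left_size);
        std::string right =
            h.right_size
                ? std::string(buf.data() + off + sizeof(h) + h.left_size,
                              h.right_size)
                : left;  // point case: right aliases left
        off += sizeof(h) + h.left_size + h.right_size;
        assert(left <= right);
      }
      return 0;
    }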
+ if (toku_dbt_equals(left_key, right_key)) { + invariant(left_key->size <= MAX_KEY_SIZE); + append_point(left_key, is_write_request); + } else { + invariant(left_key->size <= MAX_KEY_SIZE); + invariant(right_key->size <= MAX_KEY_SIZE); + append_range(left_key, right_key, is_write_request); + } + _num_ranges++; +} + +bool range_buffer::is_empty(void) const { return total_memory_size() == 0; } + +uint64_t range_buffer::total_memory_size(void) const { + return _arena.total_size_in_use(); +} + +int range_buffer::get_num_ranges(void) const { return _num_ranges; } + +void range_buffer::destroy(void) { _arena.destroy(); } + +void range_buffer::append_range(const DBT *left_key, const DBT *right_key, + bool is_exclusive) { + size_t record_length = + sizeof(record_header) + left_key->size + right_key->size; + char *buf = reinterpret_cast<char *>(_arena.malloc_from_arena(record_length)); + + record_header h; + h.init(left_key, right_key, is_exclusive); + + // serialize the header + memcpy(buf, &h, sizeof(record_header)); + buf += sizeof(record_header); + + // serialize the left key if necessary + if (!h.left_is_infinite()) { + memcpy(buf, left_key->data, left_key->size); + buf += left_key->size; + } + + // serialize the right key if necessary + if (!h.right_is_infinite()) { + memcpy(buf, right_key->data, right_key->size); + } +} + +void range_buffer::append_point(const DBT *key, bool is_exclusive) { + size_t record_length = sizeof(record_header) + key->size; + char *buf = reinterpret_cast<char *>(_arena.malloc_from_arena(record_length)); + + record_header h; + h.init(key, nullptr, is_exclusive); + + // serialize the header + memcpy(buf, &h, sizeof(record_header)); + buf += sizeof(record_header); + + // serialize the key if necessary + if (!h.left_is_infinite()) { + memcpy(buf, key->data, key->size); + } +} + +} /* namespace toku */ +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.h new file mode 100644 index 000000000..76e28d747 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.h @@ -0,0 +1,178 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include <inttypes.h> +#include <stdint.h> + +#include "../util/dbt.h" +#include "../util/memarena.h" + +namespace toku { + +// a key range buffer represents a set of key ranges that can +// be stored, iterated over, and then destroyed all at once. +class range_buffer { + private: + // the key range buffer is a bunch of records in a row. + // each record has the following header, followed by the + // left key and right key data payload, if applicable. + // we limit keys to be 2^16, since we store lengths as 2 bytes. + static const size_t MAX_KEY_SIZE = 1 << 16; + + struct record_header { + bool left_neg_inf; + bool left_pos_inf; + bool right_pos_inf; + bool right_neg_inf; + uint16_t left_key_size; + uint16_t right_key_size; + bool is_exclusive_lock; + + bool left_is_infinite(void) const; + + bool right_is_infinite(void) const; + + void init(const DBT *left_key, const DBT *right_key, bool is_exclusive); + }; + // PORT static_assert(sizeof(record_header) == 8, "record header format is + // off"); + + public: + // the iterator abstracts reading over a buffer of variable length + // records one by one until there are no more left. + class iterator { + public: + iterator(); + iterator(const range_buffer *buffer); + + // a record represents the user-view of a serialized key range. + // it handles positive and negative infinity and the optimized + // point range case, where left and right points share memory. + class record { + public: + // get a read-only pointer to the left key of this record's range + const DBT *get_left_key(void) const; + + // get a read-only pointer to the right key of this record's range + const DBT *get_right_key(void) const; + + // how big is this record? this tells us where the next record is + size_t size(void) const; + + bool get_exclusive_flag() const { return _header.is_exclusive_lock; } + + // populate a record header and point our DBT's + // buffers into ours if they are not infinite. + void deserialize(const char *buf); + + private: + record_header _header; + DBT _left_key; + DBT _right_key; + }; + + // populate the given record object with the current + // the memory referred to by record is valid for only + // as long as the record exists. + bool current(record *rec); + + // move the iterator to the next record in the buffer + void next(void); + + private: + void reset_current_chunk(); + + // the key range buffer we are iterating over, the current + // offset in that buffer, and the size of the current record. 
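Taken together, the interface declared here gives a simple create/append/iterate/destroy lifecycle. A usage sketch based only on the declarations above, assuming the surrounding PerconaFT headers are available and that DBTs are prepared with toku_fill_dbt() as elsewhere in this library:

    #include "range_buffer.h"

    // Walk every range recorded in a buffer. The DBTs returned by the record
    // point into the buffer's arena and are valid only while `buf` lives.
    static void dump_ranges(const toku::range_buffer &buf) {
      toku::range_buffer::iterator iter(&buf);
      toku::range_buffer::iterator::record rec;
      while (iter.current(&rec)) {
        const DBT *left = rec.get_left_key();
        const DBT *right = rec.get_right_key();
        (void)left;
        (void)right;
        iter.next();
      }
    }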
+ memarena::chunk_iterator _ma_chunk_iterator; + const void *_current_chunk_base; + size_t _current_chunk_offset; + size_t _current_chunk_max; + size_t _current_rec_size; + }; + + // allocate buffer space lazily instead of on creation. this way, + // no malloc/free is done if the transaction ends up taking no locks. + void create(void); + + // append a left/right key range to the buffer. + // if the keys are equal, then only one copy is stored. + void append(const DBT *left_key, const DBT *right_key, + bool is_write_request = false); + + // is this range buffer empty? + bool is_empty(void) const; + + // how much memory is being used by this range buffer? + uint64_t total_memory_size(void) const; + + // how many ranges are stored in this range buffer? + int get_num_ranges(void) const; + + void destroy(void); + + private: + memarena _arena; + int _num_ranges; + + void append_range(const DBT *left_key, const DBT *right_key, + bool is_write_request); + + // append a point to the buffer. this is the space/time saving + // optimization for key ranges where left == right. + void append_point(const DBT *key, bool is_write_request); +}; + +} /* namespace toku */ diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc new file mode 100644 index 000000000..8997f634b --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc @@ -0,0 +1,520 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ifndef ROCKSDB_LITE +#ifndef OS_WIN +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include "treenode.h" + +#include "../portability/toku_race_tools.h" + +namespace toku { + +// TODO: source location info might have to be pulled up one caller +// to be useful +void treenode::mutex_lock(void) { toku_mutex_lock(&m_mutex); } + +void treenode::mutex_unlock(void) { toku_mutex_unlock(&m_mutex); } + +void treenode::init(const comparator *cmp) { + m_txnid = TXNID_NONE; + m_is_root = false; + m_is_empty = true; + m_cmp = cmp; + + m_is_shared = false; + m_owners = nullptr; + + // use an adaptive mutex at each node since we expect the time the + // lock is held to be relatively short compared to a context switch. + // indeed, this improves performance at high thread counts considerably. + memset(&m_mutex, 0, sizeof(toku_mutex_t)); + toku_pthread_mutexattr_t attr; + toku_mutexattr_init(&attr); + toku_mutexattr_settype(&attr, TOKU_MUTEX_ADAPTIVE); + toku_mutex_init(treenode_mutex_key, &m_mutex, &attr); + toku_mutexattr_destroy(&attr); + m_left_child.set(nullptr); + m_right_child.set(nullptr); +} + +void treenode::create_root(const comparator *cmp) { + init(cmp); + m_is_root = true; +} + +void treenode::destroy_root(void) { + invariant(is_root()); + invariant(is_empty()); + toku_mutex_destroy(&m_mutex); + m_cmp = nullptr; +} + +void treenode::set_range_and_txnid(const keyrange &range, TXNID txnid, + bool is_shared) { + // allocates a new copy of the range for this node + m_range.create_copy(range); + m_txnid = txnid; + m_is_shared = is_shared; + m_is_empty = false; +} + +bool treenode::is_root(void) { return m_is_root; } + +bool treenode::is_empty(void) { return m_is_empty; } + +bool treenode::range_overlaps(const keyrange &range) { + return m_range.overlaps(*m_cmp, range); +} + +treenode *treenode::alloc(const comparator *cmp, const keyrange &range, + TXNID txnid, bool is_shared) { + treenode *XCALLOC(node); + node->init(cmp); + node->set_range_and_txnid(range, txnid, is_shared); + return node; +} + +void treenode::swap_in_place(treenode *node1, treenode *node2) { + keyrange tmp_range = node1->m_range; + TXNID tmp_txnid = node1->m_txnid; + node1->m_range = node2->m_range; + node1->m_txnid = node2->m_txnid; + node2->m_range = tmp_range; + node2->m_txnid = tmp_txnid; + + bool tmp_is_shared = node1->m_is_shared; + node1->m_is_shared = node2->m_is_shared; + node2->m_is_shared = tmp_is_shared; + + auto tmp_m_owners = node1->m_owners; + node1->m_owners = node2->m_owners; + node2->m_owners = tmp_m_owners; +} + +bool treenode::add_shared_owner(TXNID txnid) { + assert(m_is_shared); + if (txnid == m_txnid) + return false; // acquiring a lock on the same range by the same trx + + if (m_txnid != TXNID_SHARED) { + m_owners = new TxnidVector; + m_owners->insert(m_txnid); + m_txnid = TXNID_SHARED; + } + m_owners->insert(txnid); + return true; +} + +void treenode::free(treenode *node) { + // destroy the range, freeing any copied keys + node->m_range.destroy(); + + if (node->m_owners) { + delete node->m_owners; + node->m_owners = nullptr; // need this? + } + + // the root is simply marked as empty. 
+  if (node->is_root()) {
+    // PORT toku_mutex_assert_locked(&node->m_mutex);
+    node->m_is_empty = true;
+  } else {
+    // PORT toku_mutex_assert_unlocked(&node->m_mutex);
+    toku_mutex_destroy(&node->m_mutex);
+    toku_free(node);
+  }
+}
+
+uint32_t treenode::get_depth_estimate(void) const {
+  const uint32_t left_est = m_left_child.depth_est;
+  const uint32_t right_est = m_right_child.depth_est;
+  return (left_est > right_est ? left_est : right_est) + 1;
+}
+
+treenode *treenode::find_node_with_overlapping_child(
+    const keyrange &range, const keyrange::comparison *cmp_hint) {
+  // determine which child to look at based on a comparison. if we were
+  // given a comparison hint, use that. otherwise, compare them now.
+  keyrange::comparison c =
+      cmp_hint ? *cmp_hint : range.compare(*m_cmp, m_range);
+
+  treenode *child;
+  if (c == keyrange::comparison::LESS_THAN) {
+    child = lock_and_rebalance_left();
+  } else {
+    // The caller (locked_keyrange::acquire) handles the case where
+    // the root of the locked_keyrange is the node that overlaps.
+    // range is guaranteed not to overlap this node.
+    invariant(c == keyrange::comparison::GREATER_THAN);
+    child = lock_and_rebalance_right();
+  }
+
+  // if the search would lead us to an empty subtree (child == nullptr),
+  // or the child overlaps, then we know this node is the parent we want.
+  // otherwise we need to recur into that child.
+  if (child == nullptr) {
+    return this;
+  } else {
+    c = range.compare(*m_cmp, child->m_range);
+    if (c == keyrange::comparison::EQUALS ||
+        c == keyrange::comparison::OVERLAPS) {
+      child->mutex_unlock();
+      return this;
+    } else {
+      // unlock this node before recurring into the locked child,
+      // passing in a comparison hint since we just compared range
+      // to the child's range.
+      mutex_unlock();
+      return child->find_node_with_overlapping_child(range, &c);
+    }
+  }
+}
+
+bool treenode::insert(const keyrange &range, TXNID txnid, bool is_shared) {
+  bool rc = true;
+  // choose a child to check. if that child is null, then insert the new node
+  // there. otherwise recur down that child's subtree
+  keyrange::comparison c = range.compare(*m_cmp, m_range);
+  if (c == keyrange::comparison::LESS_THAN) {
+    treenode *left_child = lock_and_rebalance_left();
+    if (left_child == nullptr) {
+      left_child = treenode::alloc(m_cmp, range, txnid, is_shared);
+      m_left_child.set(left_child);
+    } else {
+      left_child->insert(range, txnid, is_shared);
+      left_child->mutex_unlock();
+    }
+  } else if (c == keyrange::comparison::GREATER_THAN) {
+    treenode *right_child = lock_and_rebalance_right();
+    if (right_child == nullptr) {
+      right_child = treenode::alloc(m_cmp, range, txnid, is_shared);
+      m_right_child.set(right_child);
+    } else {
+      right_child->insert(range, txnid, is_shared);
+      right_child->mutex_unlock();
+    }
+  } else if (c == keyrange::comparison::EQUALS) {
+    invariant(is_shared);
+    invariant(m_is_shared);
+    rc = add_shared_owner(txnid);
+  } else {
+    invariant(0);
+  }
+  return rc;
+}
+
+treenode *treenode::find_child_at_extreme(int direction, treenode **parent) {
+  treenode *child =
+      direction > 0 ?
m_right_child.get_locked() : m_left_child.get_locked(); + + if (child) { + *parent = this; + treenode *child_extreme = child->find_child_at_extreme(direction, parent); + child->mutex_unlock(); + return child_extreme; + } else { + return this; + } +} + +treenode *treenode::find_leftmost_child(treenode **parent) { + return find_child_at_extreme(-1, parent); +} + +treenode *treenode::find_rightmost_child(treenode **parent) { + return find_child_at_extreme(1, parent); +} + +treenode *treenode::remove_root_of_subtree() { + // if this node has no children, just free it and return null + if (m_left_child.ptr == nullptr && m_right_child.ptr == nullptr) { + // treenode::free requires that non-root nodes are unlocked + if (!is_root()) { + mutex_unlock(); + } + treenode::free(this); + return nullptr; + } + + // we have a child, so get either the in-order successor or + // predecessor of this node to be our replacement. + // replacement_parent is updated by the find functions as + // they recur down the tree, so initialize it to this. + treenode *child, *replacement; + treenode *replacement_parent = this; + if (m_left_child.ptr != nullptr) { + child = m_left_child.get_locked(); + replacement = child->find_rightmost_child(&replacement_parent); + invariant(replacement == child || replacement_parent != this); + + // detach the replacement from its parent + if (replacement_parent == this) { + m_left_child = replacement->m_left_child; + } else { + replacement_parent->m_right_child = replacement->m_left_child; + } + } else { + child = m_right_child.get_locked(); + replacement = child->find_leftmost_child(&replacement_parent); + invariant(replacement == child || replacement_parent != this); + + // detach the replacement from its parent + if (replacement_parent == this) { + m_right_child = replacement->m_right_child; + } else { + replacement_parent->m_left_child = replacement->m_right_child; + } + } + child->mutex_unlock(); + + // swap in place with the detached replacement, then destroy it + treenode::swap_in_place(replacement, this); + treenode::free(replacement); + + return this; +} + +void treenode::recursive_remove(void) { + treenode *left = m_left_child.ptr; + if (left) { + left->recursive_remove(); + } + m_left_child.set(nullptr); + + treenode *right = m_right_child.ptr; + if (right) { + right->recursive_remove(); + } + m_right_child.set(nullptr); + + // we do not take locks on the way down, so we know non-root nodes + // are unlocked here and the caller is required to pass a locked + // root, so this free is correct. + treenode::free(this); +} + +void treenode::remove_shared_owner(TXNID txnid) { + assert(m_owners->size() > 1); + m_owners->erase(txnid); + assert(m_owners->size() > 0); + /* if there is just one owner left, move it to m_txnid */ + if (m_owners->size() == 1) { + m_txnid = *m_owners->begin(); + delete m_owners; + m_owners = nullptr; + } +} + +treenode *treenode::remove(const keyrange &range, TXNID txnid) { + treenode *child; + // if the range is equal to this node's range, then just remove + // the root of this subtree. otherwise search down the tree + // in either the left or right children. + keyrange::comparison c = range.compare(*m_cmp, m_range); + switch (c) { + case keyrange::comparison::EQUALS: { + // if we are the only owners, remove. Otherwise, just remove + // us from the owners list. 
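remove_root_of_subtree() above is the classic BST deletion-by-replacement, complicated here by hand-over-hand locking. A single-threaded sketch of just the structural part, on a hypothetical plain node type (demo_node is not the real treenode):

    // Replace the root's payload with its in-order predecessor's (or
    // successor's, when there is no left subtree), then splice that
    // replacement node out of the tree, as the locked version does.
    struct demo_node {
      int key;
      demo_node *left, *right;
    };

    static demo_node *demo_remove_root(demo_node *root) {
      if (!root->left && !root->right) {
        delete root;
        return nullptr;
      }
      demo_node *parent = root;
      if (root->left) {
        // rightmost node of the left subtree is the in-order predecessor
        demo_node *repl = root->left;
        while (repl->right) { parent = repl; repl = repl->right; }
        // detach the replacement from its parent
        if (parent == root) parent->left = repl->left;
        else parent->right = repl->left;
        root->key = repl->key;  // "swap in place", then free the replacement
        delete repl;
      } else {
        // leftmost node of the right subtree is the in-order successor
        demo_node *repl = root->right;
        while (repl->left) { parent = repl; repl = repl->left; }
        if (parent == root) parent->right = repl->right;
        else parent->left = repl->right;
        root->key = repl->key;
        delete repl;
      }
      return root;
    }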
+      if (txnid != TXNID_ANY && has_multiple_owners()) {
+        remove_shared_owner(txnid);
+        return this;
+      } else {
+        return remove_root_of_subtree();
+      }
+    }
+    case keyrange::comparison::LESS_THAN:
+      child = m_left_child.get_locked();
+      invariant_notnull(child);
+      child = child->remove(range, txnid);
+
+      // unlock the child if there still is one.
+      // regardless, set the left child pointer
+      if (child) {
+        child->mutex_unlock();
+      }
+      m_left_child.set(child);
+      break;
+    case keyrange::comparison::GREATER_THAN:
+      child = m_right_child.get_locked();
+      invariant_notnull(child);
+      child = child->remove(range, txnid);
+
+      // unlock the child if there still is one.
+      // regardless, set the right child pointer
+      if (child) {
+        child->mutex_unlock();
+      }
+      m_right_child.set(child);
+      break;
+    case keyrange::comparison::OVERLAPS:
+      // shouldn't be overlapping, since the tree is
+      // non-overlapping and this range must exist
+      abort();
+  }
+
+  return this;
+}
+
+bool treenode::left_imbalanced(int threshold) const {
+  uint32_t left_depth = m_left_child.depth_est;
+  uint32_t right_depth = m_right_child.depth_est;
+  return m_left_child.ptr != nullptr && left_depth > threshold + right_depth;
+}
+
+bool treenode::right_imbalanced(int threshold) const {
+  uint32_t left_depth = m_left_child.depth_est;
+  uint32_t right_depth = m_right_child.depth_est;
+  return m_right_child.ptr != nullptr && right_depth > threshold + left_depth;
+}
+
+// effect: rebalances the subtree rooted at this node
+//         using AVL style O(1) rotations. unlocks this
+//         node if it is not the new root of the subtree.
+// requires: node is locked by this thread, children are not
+// returns: locked root node of the rebalanced tree
+treenode *treenode::maybe_rebalance(void) {
+  // if we end up not rotating at all, the new root is this
+  treenode *new_root = this;
+  treenode *child = nullptr;
+
+  if (left_imbalanced(IMBALANCE_THRESHOLD)) {
+    child = m_left_child.get_locked();
+    if (child->right_imbalanced(0)) {
+      treenode *grandchild = child->m_right_child.get_locked();
+
+      child->m_right_child = grandchild->m_left_child;
+      grandchild->m_left_child.set(child);
+
+      m_left_child = grandchild->m_right_child;
+      grandchild->m_right_child.set(this);
+
+      new_root = grandchild;
+    } else {
+      m_left_child = child->m_right_child;
+      child->m_right_child.set(this);
+      new_root = child;
+    }
+  } else if (right_imbalanced(IMBALANCE_THRESHOLD)) {
+    child = m_right_child.get_locked();
+    if (child->left_imbalanced(0)) {
+      treenode *grandchild = child->m_left_child.get_locked();
+
+      child->m_left_child = grandchild->m_right_child;
+      grandchild->m_right_child.set(child);
+
+      m_right_child = grandchild->m_left_child;
+      grandchild->m_left_child.set(this);
+
+      new_root = grandchild;
+    } else {
+      m_right_child = child->m_left_child;
+      child->m_left_child.set(this);
+      new_root = child;
+    }
+  }
+
+  // up to three nodes may be locked.
+  // - this
+  // - child
+  // - grandchild (but if it is locked, it's the new root)
+  //
+  // one of them is the new root. we unlock everything except the new root.
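For the rotation cases above, here is a lock-free sketch of the simple (non-double) right rotation on a hypothetical node type; the locked version differs only in taking the child's mutex first and unlocking whichever nodes did not become the subtree root:

    // When the left subtree is too deep and its own left side dominates,
    // the left child becomes the new subtree root in O(1).
    struct rot_node {
      int depth;  // analogue of child_ptr::depth_est
      rot_node *left, *right;
    };

    static rot_node *rotate_right(rot_node *root) {
      rot_node *child = root->left;  // must exist when left-imbalanced
      root->left = child->right;     // child's right subtree moves across
      child->right = root;           // old root becomes the right child
      return child;                  // caller re-links this as subtree root
    }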
+ if (child && child != new_root) { + TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(&child->m_mutex); + child->mutex_unlock(); + } + if (this != new_root) { + TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(&m_mutex); + mutex_unlock(); + } + TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(&new_root->m_mutex); + return new_root; +} + +treenode *treenode::lock_and_rebalance_left(void) { + treenode *child = m_left_child.get_locked(); + if (child) { + treenode *new_root = child->maybe_rebalance(); + m_left_child.set(new_root); + child = new_root; + } + return child; +} + +treenode *treenode::lock_and_rebalance_right(void) { + treenode *child = m_right_child.get_locked(); + if (child) { + treenode *new_root = child->maybe_rebalance(); + m_right_child.set(new_root); + child = new_root; + } + return child; +} + +void treenode::child_ptr::set(treenode *node) { + ptr = node; + depth_est = ptr ? ptr->get_depth_estimate() : 0; +} + +treenode *treenode::child_ptr::get_locked(void) { + if (ptr) { + ptr->mutex_lock(); + depth_est = ptr->get_depth_estimate(); + } + return ptr; +} + +} /* namespace toku */ +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.h new file mode 100644 index 000000000..ec25a8c58 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.h @@ -0,0 +1,302 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <string.h>
+
+#include "../ft/comparator.h"
+#include "../portability/memory.h"
+#include "../portability/toku_pthread.h"
+// PORT: we need LTM_STATUS
+#include "../ft/ft-status.h"
+#include "../portability/txn_subst.h"
+#include "keyrange.h"
+
+namespace toku {
+
+// a node in a tree with its own mutex
+// - range is the "key" of this node
+// - txnid is the single txnid associated with this node
+// - left and right children may be null
+//
+// to build a tree on top of this abstraction, the user:
+// - provides memory for a root node, initializes it via create_root()
+// - performs tree operations on the root node. memory management
+//   below the root node is handled by the abstraction, not the user.
+// this pattern:
+// - guarantees a root node always exists.
+// - does not allow for rebalances on the root node
+
+class treenode {
+ public:
+  // every treenode function has some common requirements:
+  // - node is locked and children are never locked
+  // - node may be unlocked if no other thread has visibility
+
+  // effect: create the root node
+  void create_root(const comparator *cmp);
+
+  // effect: destroys the root node
+  void destroy_root(void);
+
+  // effect: sets the txnid and copies the given range for this node
+  void set_range_and_txnid(const keyrange &range, TXNID txnid, bool is_shared);
+
+  // returns: true iff this node is marked as empty
+  bool is_empty(void);
+
+  // returns: true if this is the root node, denoted by a null parent
+  bool is_root(void);
+
+  // returns: true if the given range overlaps with this node's range
+  bool range_overlaps(const keyrange &range);
+
+  // effect: locks the node
+  void mutex_lock(void);
+
+  // effect: unlocks the node
+  void mutex_unlock(void);
+
+  // return: node whose child overlaps, or a child that is empty
+  //         and would contain range if it existed
+  // given: if cmp_hint is non-null, then it is a precomputed
+  //        comparison of this node's range to the given range.
+  treenode *find_node_with_overlapping_child(
+      const keyrange &range, const keyrange::comparison *cmp_hint);
+
+  // effect: performs an in-order traversal of the ranges that overlap the
+  //         given range, calling function->fn() on each node that does
+  // requires: function signature is:
+  //             bool fn(const keyrange &range, TXNID txnid,
+  //                     bool is_shared, TxnidVector *owners)
+  // requires: fn returns true to keep iterating, false to stop iterating
+  // requires: fn does not attempt to use any ranges read out by value
+  //           after removing a node with an overlapping range from the tree.
+  template <class F>
+  void traverse_overlaps(const keyrange &range, F *function) {
+    keyrange::comparison c = range.compare(*m_cmp, m_range);
+    if (c == keyrange::comparison::EQUALS) {
+      // Doesn't matter if fn wants to keep going, there
+      // is nothing left, so return.
+      function->fn(m_range, m_txnid, m_is_shared, m_owners);
+      return;
+    }
+
+    treenode *left = m_left_child.get_locked();
+    if (left) {
+      if (c != keyrange::comparison::GREATER_THAN) {
+        // Target range is less than this node, or it overlaps this
+        // node. There may be something on the left.
+        left->traverse_overlaps(range, function);
+      }
+      left->mutex_unlock();
+    }
+
+    if (c == keyrange::comparison::OVERLAPS) {
+      bool keep_going = function->fn(m_range, m_txnid, m_is_shared, m_owners);
+      if (!keep_going) {
+        return;
+      }
+    }
+
+    treenode *right = m_right_child.get_locked();
+    if (right) {
+      if (c != keyrange::comparison::LESS_THAN) {
+        // Target range is greater than this node, or it overlaps this
+        // node. There may be something on the right.
+        right->traverse_overlaps(range, function);
+      }
+      right->mutex_unlock();
+    }
+  }
+
+  // effect: inserts the given range and txnid into a subtree, recursively
+  // requires: range does not overlap with any node below the subtree
+  bool insert(const keyrange &range, TXNID txnid, bool is_shared);
+
+  // effect: removes the given range from the subtree
+  // requires: range exists in the subtree
+  // returns: the root of the resulting subtree
+  treenode *remove(const keyrange &range, TXNID txnid);
+
+  // effect: removes this node and all of its children, recursively
+  // requires: every node at and below this node is unlocked
+  void recursive_remove(void);
+
+ private:
+  // the child_ptr is a light abstraction for the locking of
+  // a child and the maintenance of its depth estimate.
+
+  struct child_ptr {
+    // set the child pointer
+    void set(treenode *node);
+
+    // get and lock this child if it exists
+    treenode *get_locked(void);
+
+    treenode *ptr;
+    uint32_t depth_est;
+  };
+
+  // the balance factor at which a node is considered imbalanced
+  static const int32_t IMBALANCE_THRESHOLD = 2;
+
+  // node-level mutex
+  toku_mutex_t m_mutex;
+
+  // the range and txnid for this node. the range contains a copy
+  // of the keys originally inserted into the tree. nodes may
+  // swap ranges. but at the end of the day, when a node is
+  // destroyed, it frees the memory associated with whatever range
+  // it has at the time of destruction.
+  keyrange m_range;
+
+  void remove_shared_owner(TXNID txnid);
+
+  bool has_multiple_owners() { return (m_txnid == TXNID_SHARED); }
+
+ private:
+  // Owner transaction id.
+  // A value of TXNID_SHARED means this node has multiple owners
+  TXNID m_txnid;
+
+  // If true, this lock is a non-exclusive lock, and it can have either
+  // one or several owners.
+  bool m_is_shared;
+
+  // List of the owners, or nullptr if there's just one owner.
+  TxnidVector *m_owners;
+
+  // two child pointers
+  child_ptr m_left_child;
+  child_ptr m_right_child;
+
+  // comparator for ranges
+  // psergey-todo: Is there any sense to store the comparator in each tree
+  // node?
+  const comparator *m_cmp;
+
+  // marked for the root node. the root node is never free()'d
+  // when removed, but instead marked as empty.
+  bool m_is_root;
+
+  // marked for an empty node. only valid for the root.
+  bool m_is_empty;
+
+  // effect: initializes an empty node with the given comparator
+  void init(const comparator *cmp);
+
+  // requires: this is a shared node (m_is_shared==true)
+  // effect: another transaction is added as an owner.
+  // returns: true <=> added another owner
+  //          false <=> this transaction is already an owner
+  bool add_shared_owner(TXNID txnid);
+
+  // requires: *parent is initialized to something meaningful.
+ // requires: subtree is non-empty + // returns: the rightmost child of the given subtree + // returns: a pointer to the parent of said child in *parent, only + // if this function recurred, otherwise it is untouched. + treenode *find_rightmost_child(treenode **parent); + + // effect: remove the root of this subtree, destroying the old root + // returns: the new root of the subtree + treenode *remove_root_of_subtree(void); + + // requires: subtree is non-empty, direction is not 0 + // returns: the child of the subtree at either the left or rightmost extreme + treenode *find_child_at_extreme(int direction, treenode **parent); + + // effect: retrieves and possibly rebalances the left child + // returns: a locked left child, if it exists + treenode *lock_and_rebalance_left(void); + + // effect: retrieves and possibly rebalances the right child + // returns: a locked right child, if it exists + treenode *lock_and_rebalance_right(void); + + // returns: the estimated depth of this subtree + uint32_t get_depth_estimate(void) const; + + // returns: true iff left subtree depth is sufficiently less than the right + bool left_imbalanced(int threshold) const; + + // returns: true iff right subtree depth is sufficiently greater than the left + bool right_imbalanced(int threshold) const; + + // effect: performs an O(1) rebalance, which will "heal" an imbalance by at + // most 1. effect: if the new root is not this node, then this node is + // unlocked. returns: locked node representing the new root of the rebalanced + // subtree + treenode *maybe_rebalance(void); + + // returns: allocated treenode populated with a copy of the range and txnid + static treenode *alloc(const comparator *cmp, const keyrange &range, + TXNID txnid, bool is_shared); + + // requires: node is a locked root node, or an unlocked non-root node + static void free(treenode *node); + + // effect: swaps the range/txnid pairs for node1 and node2. + static void swap_in_place(treenode *node1, treenode *node2); + + friend class concurrent_tree_unit_test; +}; + +} /* namespace toku */ diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc new file mode 100644 index 000000000..4caf1e26f --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc @@ -0,0 +1,120 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ifndef ROCKSDB_LITE +#ifndef OS_WIN +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. 
+ + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include "txnid_set.h" + +#include "../db.h" + +namespace toku { + +int find_by_txnid(const TXNID &txnid_a, const TXNID &txnid_b); +int find_by_txnid(const TXNID &txnid_a, const TXNID &txnid_b) { + if (txnid_a < txnid_b) { + return -1; + } else if (txnid_a == txnid_b) { + return 0; + } else { + return 1; + } +} + +void txnid_set::create(void) { + // lazily allocate the underlying omt, since it is common + // to create a txnid set and never put anything in it. + m_txnids.create_no_array(); +} + +void txnid_set::destroy(void) { m_txnids.destroy(); } + +// Return true if the given transaction id is a member of the set. +// Otherwise, return false. +bool txnid_set::contains(TXNID txnid) const { + TXNID find_txnid; + int r = m_txnids.find_zero<TXNID, find_by_txnid>(txnid, &find_txnid, nullptr); + return r == 0 ? true : false; +} + +// Add a given txnid to the set +void txnid_set::add(TXNID txnid) { + int r = m_txnids.insert<TXNID, find_by_txnid>(txnid, txnid, nullptr); + invariant(r == 0 || r == DB_KEYEXIST); +} + +// Delete a given txnid from the set. +void txnid_set::remove(TXNID txnid) { + uint32_t idx; + int r = m_txnids.find_zero<TXNID, find_by_txnid>(txnid, nullptr, &idx); + if (r == 0) { + r = m_txnids.delete_at(idx); + invariant_zero(r); + } +} + +// Return the size of the set +uint32_t txnid_set::size(void) const { return m_txnids.size(); } + +// Get the ith id in the set, assuming that the set is sorted. +TXNID txnid_set::get(uint32_t i) const { + TXNID txnid; + int r = m_txnids.fetch(i, &txnid); + if (r == EINVAL) /* Shouldn't happen, avoid compiler warning */ + return TXNID_NONE; + invariant_zero(r); + return txnid; +} + +} /* namespace toku */ +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.h new file mode 100644 index 000000000..d79c24fb0 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.h @@ -0,0 +1,92 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. 
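txnid_set above is an order-maintained set keyed by transaction id: find_zero() is a binary search through the sorted omt, add() tolerates DB_KEYEXIST, and remove() only erases when the id is found. A standalone analogue over a sorted std::vector (demo_* names are illustrative, not the toku::omt API):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <vector>

    using demo_txnid = uint64_t;

    // Sorted-vector analogue of txnid_set: contains() is a binary search,
    // add() inserts at the search position, remove() erases if present.
    struct demo_txnid_set {
      std::vector<demo_txnid> ids;  // kept sorted and unique

      bool contains(demo_txnid id) const {
        return std::binary_search(ids.begin(), ids.end(), id);
      }
      void add(demo_txnid id) {
        auto it = std::lower_bound(ids.begin(), ids.end(), id);
        if (it == ids.end() || *it != id) ids.insert(it, id);  // else: the DB_KEYEXIST case
      }
      void remove(demo_txnid id) {
        auto it = std::lower_bound(ids.begin(), ids.end(), id);
        if (it != ids.end() && *it == id) ids.erase(it);
      }
    };

    int main() {
      demo_txnid_set s;
      s.add(42); s.add(7); s.add(42);  // duplicate add is a no-op
      assert(s.contains(7) && s.contains(42));
      s.remove(7);
      assert(!s.contains(7) && s.ids.size() == 1);
      return 0;
    }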
+ + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include "../portability/txn_subst.h" +#include "../util/omt.h" + +namespace toku { + +class txnid_set { + public: + // effect: Creates an empty set. Does not malloc space for + // any entries yet. That is done lazily on add(). + void create(void); + + // effect: Destroy the set's internals. + void destroy(void); + + // returns: True if the given txnid is a member of the set. + bool contains(TXNID id) const; + + // effect: Adds a given txnid to the set if it did not exist + void add(TXNID txnid); + + // effect: Deletes a txnid from the set if it exists. + void remove(TXNID txnid); + + // returns: Size of the set + uint32_t size(void) const; + + // returns: The "i'th" id in the set, as if it were sorted. + TXNID get(uint32_t i) const; + + private: + toku::omt<TXNID> m_txnids; + + friend class txnid_set_unit_test; +}; +ENSURE_POD(txnid_set); + +} /* namespace toku */ diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc new file mode 100644 index 000000000..24536c88e --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc @@ -0,0 +1,213 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ifndef ROCKSDB_LITE +#ifndef OS_WIN +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. 
+ + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include "../db.h" +#include "../portability/memory.h" +// PORT #include <toku_assert.h> +#include <memory.h> +#include <string.h> + +#include "txnid_set.h" +#include "wfg.h" + +namespace toku { + +// Create a lock request graph +void wfg::create(void) { m_nodes.create(); } + +// Destroy the internals of the lock request graph +void wfg::destroy(void) { + uint32_t n_nodes = m_nodes.size(); + for (uint32_t i = 0; i < n_nodes; i++) { + node *n; + int r = m_nodes.fetch(i, &n); + invariant_zero(r); + invariant_notnull(n); + if (r) continue; // Get rid of "may be used uninitialized" warning + node::free(n); + } + m_nodes.destroy(); +} + +// Add an edge (a_id, b_id) to the graph +void wfg::add_edge(TXNID a_txnid, TXNID b_txnid) { + node *a_node = find_create_node(a_txnid); + node *b_node = find_create_node(b_txnid); + a_node->edges.add(b_node->txnid); +} + +// Return true if a node with the given transaction id exists in the graph. +// Return false otherwise. 
+bool wfg::node_exists(TXNID txnid) { + node *n = find_node(txnid); + return n != NULL; +} + +bool wfg::cycle_exists_from_node(node *target, node *head, + std::function<void(TXNID)> reporter) { + bool cycle_found = false; + head->visited = true; + uint32_t n_edges = head->edges.size(); + for (uint32_t i = 0; i < n_edges && !cycle_found; i++) { + TXNID edge_id = head->edges.get(i); + if (target->txnid == edge_id) { + cycle_found = true; + if (reporter) reporter(edge_id); + } else { + node *new_head = find_node(edge_id); + if (new_head && !new_head->visited) { + cycle_found = cycle_exists_from_node(target, new_head, reporter); + if (cycle_found && reporter) reporter(edge_id); + } + } + } + head->visited = false; + return cycle_found; +} + +// Return true if there exists a cycle from a given transaction id in the graph. +// Return false otherwise. +bool wfg::cycle_exists_from_txnid(TXNID txnid, + std::function<void(TXNID)> reporter) { + node *a_node = find_node(txnid); + bool cycles_found = false; + if (a_node) { + cycles_found = cycle_exists_from_node(a_node, a_node, reporter); + } + return cycles_found; +} + +// Apply a given function f to all of the nodes in the graph. The apply +// function returns when the function f is called for all of the nodes in the +// graph, or the function f returns non-zero. +void wfg::apply_nodes(int (*fn)(TXNID id, void *extra), void *extra) { + int r = 0; + uint32_t n_nodes = m_nodes.size(); + for (uint32_t i = 0; i < n_nodes && r == 0; i++) { + node *n; + r = m_nodes.fetch(i, &n); + invariant_zero(r); + if (r) continue; // Get rid of "may be used uninitialized" warning + r = fn(n->txnid, extra); + } +} + +// Apply a given function f to all of the edges whose origin is a given node id. +// The apply function returns when the function f is called for all edges in the +// graph rooted at node id, or the function f returns non-zero. +void wfg::apply_edges(TXNID txnid, + int (*fn)(TXNID txnid, TXNID edge_txnid, void *extra), + void *extra) { + node *n = find_node(txnid); + if (n) { + int r = 0; + uint32_t n_edges = n->edges.size(); + for (uint32_t i = 0; i < n_edges && r == 0; i++) { + r = fn(txnid, n->edges.get(i), extra); + } + } +} + +// find node by id +wfg::node *wfg::find_node(TXNID txnid) { + node *n = nullptr; + int r = m_nodes.find_zero<TXNID, find_by_txnid>(txnid, &n, nullptr); + invariant(r == 0 || r == DB_NOTFOUND); + return n; +} + +// this is the omt comparison function +// nodes are compared by their txnid. 
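cycle_exists_from_node() above is a bounded depth-first search: it marks nodes visited on the way down, unmarks them on the way back up, and reports the edge ids of any path that returns to the target. A self-contained analogue using standard containers (the demo_* names are hypothetical stand-ins for the omt-based graph):

    #include <cassert>
    #include <cstdint>
    #include <functional>
    #include <unordered_map>
    #include <vector>

    using demo_txnid = uint64_t;
    using demo_wfg = std::unordered_map<demo_txnid, std::vector<demo_txnid>>;

    // DFS from `head` looking for a path back to `target`; on success the
    // edges of the cycle are reported in reverse order, as in the original.
    static bool demo_cycle_from(const demo_wfg &g, demo_txnid target,
                                demo_txnid head,
                                std::unordered_map<demo_txnid, bool> &visited,
                                const std::function<void(demo_txnid)> &reporter) {
      visited[head] = true;
      bool found = false;
      auto it = g.find(head);
      if (it != g.end()) {
        for (demo_txnid edge : it->second) {
          if (found) break;
          if (edge == target) {
            found = true;
            if (reporter) reporter(edge);
          } else if (!visited[edge]) {
            found = demo_cycle_from(g, target, edge, visited, reporter);
            if (found && reporter) reporter(edge);
          }
        }
      }
      visited[head] = false;  // backtrack, as the omt-based version does
      return found;
    }

    int main() {
      demo_wfg g;
      g[1] = {2};  // txn 1 waits for txn 2
      g[2] = {3};
      g[3] = {1};  // txn 3 waits for txn 1: deadlock cycle 1 -> 2 -> 3 -> 1
      std::unordered_map<demo_txnid, bool> visited;
      assert(demo_cycle_from(g, 1, 1, visited, nullptr));
      return 0;
    }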
+int wfg::find_by_txnid(node *const &node_a, const TXNID &txnid_b) {
+  TXNID txnid_a = node_a->txnid;
+  if (txnid_a < txnid_b) {
+    return -1;
+  } else if (txnid_a == txnid_b) {
+    return 0;
+  } else {
+    return 1;
+  }
+}
+
+// insert a new node
+wfg::node *wfg::find_create_node(TXNID txnid) {
+  node *n;
+  uint32_t idx;
+  int r = m_nodes.find_zero<TXNID, find_by_txnid>(txnid, &n, &idx);
+  if (r == DB_NOTFOUND) {
+    n = node::alloc(txnid);
+    r = m_nodes.insert_at(n, idx);
+    invariant_zero(r);
+  }
+  invariant_notnull(n);
+  return n;
+}
+
+wfg::node *wfg::node::alloc(TXNID txnid) {
+  node *XCALLOC(n);
+  n->txnid = txnid;
+  n->visited = false;
+  n->edges.create();
+  return n;
+}
+
+void wfg::node::free(wfg::node *n) {
+  n->edges.destroy();
+  toku_free(n);
+}
+
+} /* namespace toku */
+#endif  // OS_WIN
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.h
new file mode 100644
index 000000000..804202170
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.h
@@ -0,0 +1,124 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <functional>
+
+#include "../util/omt.h"
+#include "txnid_set.h"
+
+namespace toku {
+
+// A wfg is a 'wait-for' graph. A directed edge in it represents one
+// txn waiting for another to finish before it can acquire a lock.
+ +class wfg { + public: + // Create a lock request graph + void create(void); + + // Destroy the internals of the lock request graph + void destroy(void); + + // Add an edge (a_id, b_id) to the graph + void add_edge(TXNID a_txnid, TXNID b_txnid); + + // Return true if a node with the given transaction id exists in the graph. + // Return false otherwise. + bool node_exists(TXNID txnid); + + // Return true if there exists a cycle from a given transaction id in the + // graph. Return false otherwise. + bool cycle_exists_from_txnid(TXNID txnid, + std::function<void(TXNID)> reporter); + + // Apply a given function f to all of the nodes in the graph. The apply + // function returns when the function f is called for all of the nodes in the + // graph, or the function f returns non-zero. + void apply_nodes(int (*fn)(TXNID txnid, void *extra), void *extra); + + // Apply a given function f to all of the edges whose origin is a given node + // id. The apply function returns when the function f is called for all edges + // in the graph rooted at node id, or the function f returns non-zero. + void apply_edges(TXNID txnid, + int (*fn)(TXNID txnid, TXNID edge_txnid, void *extra), + void *extra); + + private: + struct node { + // txnid for this node and the associated set of edges + TXNID txnid; + txnid_set edges; + bool visited; + + static node *alloc(TXNID txnid); + + static void free(node *n); + }; + ENSURE_POD(node); + + toku::omt<node *> m_nodes; + + node *find_node(TXNID txnid); + + node *find_create_node(TXNID txnid); + + bool cycle_exists_from_node(node *target, node *head, + std::function<void(TXNID)> reporter); + + static int find_by_txnid(node *const &node_a, const TXNID &txnid_b); +}; +ENSURE_POD(wfg); + +} /* namespace toku */ diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/memory.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/memory.h new file mode 100644 index 000000000..0a621f8e0 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/memory.h @@ -0,0 +1,215 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. 
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <stdlib.h>
+
+#include "toku_portability.h"
+
+/* Percona memory allocation functions and macros.
+ * These are functions for malloc and free */
+
+int toku_memory_startup(void) __attribute__((constructor));
+void toku_memory_shutdown(void) __attribute__((destructor));
+
+/* Generally: errno is set to 0 or a value to indicate problems. */
+
+// Everything should call toku_malloc() instead of malloc(), and toku_calloc()
+// instead of calloc(). That way the tests can, e.g., replace the malloc
+// function using toku_set_func_malloc().
+void *toku_calloc(size_t nmemb, size_t size)
+    __attribute__((__visibility__("default")));
+void *toku_xcalloc(size_t nmemb, size_t size)
+    __attribute__((__visibility__("default")));
+void *toku_malloc(size_t size) __attribute__((__visibility__("default")));
+void *toku_malloc_aligned(size_t alignment, size_t size)
+    __attribute__((__visibility__("default")));
+
+// xmalloc aborts instead of returning NULL if we run out of memory
+void *toku_xmalloc(size_t size) __attribute__((__visibility__("default")));
+void *toku_xrealloc(void *, size_t size)
+    __attribute__((__visibility__("default")));
+void *toku_xmalloc_aligned(size_t alignment, size_t size)
+    __attribute__((__visibility__("default")));
+// Effect: Perform an os_malloc_aligned(size) with the additional property that
+// the returned pointer is a multiple of ALIGNMENT.
+// Fail with a resource_assert if the allocation fails (don't return an error
+// code). If the alloc_aligned function has been set then call it instead.
+// Requires: alignment is a power of two.
+
+void toku_free(void *) __attribute__((__visibility__("default")));
+
+size_t toku_malloc_usable_size(void *p)
+    __attribute__((__visibility__("default")));
+
+/* MALLOC is a macro that helps avoid a common error:
+ * Suppose I write
+ *    struct foo *x = malloc(sizeof(struct foo));
+ * That works fine. But if I change it to this, I've probably made a mistake:
+ *    struct foo *x = malloc(sizeof(struct bar));
+ * It can get worse, since one might have something like
+ *    struct foo *x = malloc(sizeof(struct foo *))
+ * which looks reasonable, but it allocates enough to hold a pointer instead of
+ * the amount needed for the struct. So instead, write
+ *    struct foo *MALLOC(x);
+ * and you cannot go wrong.
+ */
+#define MALLOC(v) CAST_FROM_VOIDP(v, toku_malloc(sizeof(*v)))
+/* MALLOC_N is like calloc (except no 0ing of data): It makes an array. Write
+ *    int *MALLOC_N(5,x);
+ * to make an array of 5 integers.
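+ *
+ * Illustrative only: the element size comes from the declared type of the
+ * pointer, so with
+ *    struct foo **v;
+ *    MALLOC_N(5, v);   // allocates 5 * sizeof(struct foo *) bytes
+ * the sizeof(struct bar) mistake shown above cannot recur.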
+ */ +#define MALLOC_N(n, v) CAST_FROM_VOIDP(v, toku_malloc((n) * sizeof(*v))) +#define MALLOC_N_ALIGNED(align, n, v) \ + CAST_FROM_VOIDP(v, toku_malloc_aligned((align), (n) * sizeof(*v))) + +// CALLOC_N is like calloc with auto-figuring out size of members +#define CALLOC_N(n, v) CAST_FROM_VOIDP(v, toku_calloc((n), sizeof(*v))) + +#define CALLOC(v) CALLOC_N(1, v) + +// XMALLOC macros are like MALLOC except they abort if the operation fails +#define XMALLOC(v) CAST_FROM_VOIDP(v, toku_xmalloc(sizeof(*v))) +#define XMALLOC_N(n, v) CAST_FROM_VOIDP(v, toku_xmalloc((n) * sizeof(*v))) +#define XCALLOC_N(n, v) CAST_FROM_VOIDP(v, toku_xcalloc((n), (sizeof(*v)))) +#define XCALLOC(v) XCALLOC_N(1, v) +#define XREALLOC(v, s) CAST_FROM_VOIDP(v, toku_xrealloc(v, s)) +#define XREALLOC_N(n, v) CAST_FROM_VOIDP(v, toku_xrealloc(v, (n) * sizeof(*v))) + +#define XMALLOC_N_ALIGNED(align, n, v) \ + CAST_FROM_VOIDP(v, toku_xmalloc_aligned((align), (n) * sizeof(*v))) + +#define XMEMDUP(dst, src) CAST_FROM_VOIDP(dst, toku_xmemdup(src, sizeof(*src))) +#define XMEMDUP_N(dst, src, len) CAST_FROM_VOIDP(dst, toku_xmemdup(src, len)) + +// ZERO_ARRAY writes zeroes to a stack-allocated array +#define ZERO_ARRAY(o) \ + do { \ + memset((o), 0, sizeof(o)); \ + } while (0) +// ZERO_STRUCT writes zeroes to a stack-allocated struct +#define ZERO_STRUCT(o) \ + do { \ + memset(&(o), 0, sizeof(o)); \ + } while (0) + +/* Copy memory. Analogous to strdup() */ +void *toku_memdup(const void *v, size_t len); +/* Toku-version of strdup. Use this so that it calls toku_malloc() */ +char *toku_strdup(const char *s) __attribute__((__visibility__("default"))); +/* Toku-version of strndup. Use this so that it calls toku_malloc() */ +char *toku_strndup(const char *s, size_t n) + __attribute__((__visibility__("default"))); +/* Copy memory. Analogous to strdup() Crashes instead of returning NULL */ +void *toku_xmemdup(const void *v, size_t len) + __attribute__((__visibility__("default"))); +/* Toku-version of strdup. Use this so that it calls toku_xmalloc() Crashes + * instead of returning NULL */ +char *toku_xstrdup(const char *s) __attribute__((__visibility__("default"))); + +void toku_malloc_cleanup( + void); /* Before exiting, call this function to free up any internal data + structures from toku_malloc. Otherwise valgrind will complain of + memory leaks. */ + +/* Check to see if everything malloc'd was free. Might be a no-op depending on + * how memory.c is configured. */ +void toku_memory_check_all_free(void); +/* Check to see if memory is "sane". Might be a no-op. Probably better to + * simply use valgrind. 
+ */
+void toku_do_memory_check(void);
+
+typedef void *(*malloc_fun_t)(size_t);
+typedef void (*free_fun_t)(void *);
+typedef void *(*realloc_fun_t)(void *, size_t);
+typedef void *(*malloc_aligned_fun_t)(size_t /*alignment*/, size_t /*size*/);
+typedef void *(*realloc_aligned_fun_t)(size_t /*alignment*/, void * /*pointer*/,
+                                       size_t /*size*/);
+
+void toku_set_func_malloc(malloc_fun_t f);
+void toku_set_func_xmalloc_only(malloc_fun_t f);
+void toku_set_func_malloc_only(malloc_fun_t f);
+void toku_set_func_realloc(realloc_fun_t f);
+void toku_set_func_xrealloc_only(realloc_fun_t f);
+void toku_set_func_realloc_only(realloc_fun_t f);
+void toku_set_func_free(free_fun_t f);
+
+typedef struct memory_status {
+  uint64_t malloc_count;   // number of malloc operations
+  uint64_t free_count;     // number of free operations
+  uint64_t realloc_count;  // number of realloc operations
+  uint64_t malloc_fail;    // number of malloc operations that failed
+  uint64_t realloc_fail;   // number of realloc operations that failed
+  uint64_t requested;      // number of bytes requested
+  uint64_t used;  // number of bytes used (requested + overhead), obtained from
+                  // malloc_usable_size()
+  uint64_t freed;               // number of bytes freed
+  uint64_t max_requested_size;  // largest attempted allocation size
+  uint64_t last_failed_size;    // size of the last failed allocation attempt
+  volatile uint64_t
+      max_in_use;  // maximum memory footprint (used - freed), approximate (not
+                   // worth thread-safety overhead for exact)
+  const char *mallocator_version;
+  uint64_t mmap_threshold;
+} LOCAL_MEMORY_STATUS_S, *LOCAL_MEMORY_STATUS;
+
+void toku_memory_get_status(LOCAL_MEMORY_STATUS s);
+
+// Effect: Like toku_memory_footprint, except instead of passing p,
+// we pass toku_malloc_usable_size(p).
+size_t toku_memory_footprint_given_usable_size(size_t touched, size_t usable);
+
+// Effect: Return an estimate of how much space an object is using, possibly by
+// using toku_malloc_usable_size(p).
+// If p is NULL then returns 0.
+size_t toku_memory_footprint(void *p, size_t touched);
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_assert_subst.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_assert_subst.h
new file mode 100644
index 000000000..af47800fb
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_assert_subst.h
@@ -0,0 +1,39 @@
+//
+// A replacement for toku_assert.h
+//
+#pragma once
+
+#include <assert.h>
+#include <errno.h>
+
+#include <type_traits>  // ENSURE_POD uses std::is_standard_layout/is_trivial
+
+#ifdef NDEBUG
+
+#define assert_zero(a) ((void)(a))
+#define invariant(a) ((void)(a))
+#define invariant_notnull(a) ((void)(a))
+#define invariant_zero(a) ((void)(a))
+
+#else
+
+#define assert_zero(a) assert((a) == 0)
+#define invariant(a) assert(a)
+#define invariant_notnull(a) assert(a)
+#define invariant_zero(a) assert_zero(a)
+
+#endif
+
+#define lazy_assert_zero(a) assert_zero(a)
+
+#define paranoid_invariant_zero(a) assert_zero(a)
+#define paranoid_invariant_notnull(a) assert(a)
+#define paranoid_invariant(a) assert(a)
+
+#define ENSURE_POD(type)                                                      \
+  static_assert(                                                              \
+      std::is_standard_layout<type>::value && std::is_trivial<type>::value,  \
+      #type " isn't POD")
+
+inline int get_error_errno(void) {
+  invariant(errno);
+  return errno;
+}
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_atomic.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_atomic.h
new file mode 100644
index 000000000..aaa2298fa
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_atomic.h
@@ -0,0 +1,130 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +// PORT2: #include <portability/toku_config.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +#include "toku_assert_subst.h" + +__attribute__((const, always_inline)) static inline intptr_t which_cache_line( + intptr_t addr) { + static const size_t assumed_cache_line_size = 64; + return addr / assumed_cache_line_size; +} +template <typename T> +__attribute__((const, always_inline)) static inline bool crosses_boundary( + T *addr, size_t width) { + const intptr_t int_addr = reinterpret_cast<intptr_t>(addr); + const intptr_t last_byte = int_addr + width - 1; + return which_cache_line(int_addr) != which_cache_line(last_byte); +} + +template <typename T, typename U> +__attribute__((always_inline)) static inline T toku_sync_fetch_and_add(T *addr, + U diff) { + paranoid_invariant(!crosses_boundary(addr, sizeof *addr)); + return __sync_fetch_and_add(addr, diff); +} +template <typename T, typename U> +__attribute__((always_inline)) static inline T toku_sync_add_and_fetch(T *addr, + U diff) { + paranoid_invariant(!crosses_boundary(addr, sizeof *addr)); + return __sync_add_and_fetch(addr, diff); +} +template <typename T, typename U> +__attribute__((always_inline)) static inline T toku_sync_fetch_and_sub(T *addr, + U diff) { + paranoid_invariant(!crosses_boundary(addr, sizeof *addr)); + return __sync_fetch_and_sub(addr, diff); +} +template <typename T, typename U> +__attribute__((always_inline)) static inline T toku_sync_sub_and_fetch(T *addr, + U diff) { + paranoid_invariant(!crosses_boundary(addr, sizeof *addr)); + return __sync_sub_and_fetch(addr, diff); +} +template <typename T, typename U, typename V> +__attribute__((always_inline)) static inline T toku_sync_val_compare_and_swap( + T *addr, U oldval, V newval) { + paranoid_invariant(!crosses_boundary(addr, sizeof *addr)); + return __sync_val_compare_and_swap(addr, oldval, newval); +} +template <typename T, typename U, typename V> +__attribute__((always_inline)) static inline bool +toku_sync_bool_compare_and_swap(T *addr, U oldval, V newval) { + paranoid_invariant(!crosses_boundary(addr, sizeof *addr)); + return __sync_bool_compare_and_swap(addr, oldval, newval); +} + +// in case you include this but not toku_portability.h +#pragma GCC poison __sync_fetch_and_add +#pragma GCC poison __sync_fetch_and_sub +#pragma GCC poison __sync_fetch_and_or +#pragma GCC poison __sync_fetch_and_and +#pragma GCC poison __sync_fetch_and_xor +#pragma GCC poison __sync_fetch_and_nand +#pragma GCC poison __sync_add_and_fetch +#pragma GCC poison __sync_sub_and_fetch +#pragma GCC poison __sync_or_and_fetch +#pragma GCC poison __sync_and_and_fetch +#pragma GCC poison __sync_xor_and_fetch +#pragma GCC poison __sync_nand_and_fetch +#pragma GCC poison __sync_bool_compare_and_swap +#pragma GCC poison __sync_val_compare_and_swap +#pragma GCC poison __sync_synchronize +#pragma GCC poison __sync_lock_test_and_set +#pragma GCC poison __sync_release diff --git 
a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_external_pthread.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_external_pthread.h new file mode 100644 index 000000000..eb8291c1d --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_external_pthread.h @@ -0,0 +1,83 @@ +/* + A wrapper around ROCKSDB_NAMESPACE::TransactionDBMutexFactory-provided + condition and mutex that provides toku_pthread_*-like interface. The functions + are named + + toku_external_{mutex|cond}_XXX + + Lock Tree uses this mutex and condition for interruptible (long) lock waits. + + (It also still uses toku_pthread_XXX calls for mutexes/conditions for + shorter waits on internal objects) +*/ + +#pragma once + +#include <pthread.h> +#include <stdint.h> +#include <time.h> + +#include "rocksdb/utilities/transaction_db.h" +#include "rocksdb/utilities/transaction_db_mutex.h" +#include "toku_portability.h" + +using ROCKSDB_NAMESPACE::TransactionDBCondVar; +using ROCKSDB_NAMESPACE::TransactionDBMutex; + +typedef std::shared_ptr<ROCKSDB_NAMESPACE::TransactionDBMutexFactory> + toku_external_mutex_factory_t; + +typedef std::shared_ptr<TransactionDBMutex> toku_external_mutex_t; +typedef std::shared_ptr<TransactionDBCondVar> toku_external_cond_t; + +static inline void toku_external_cond_init( + toku_external_mutex_factory_t mutex_factory, toku_external_cond_t *cond) { + *cond = mutex_factory->AllocateCondVar(); +} + +inline void toku_external_cond_destroy(toku_external_cond_t *cond) { + cond->reset(); // this will destroy the managed object +} + +inline void toku_external_cond_signal(toku_external_cond_t *cond) { + (*cond)->Notify(); +} + +inline void toku_external_cond_broadcast(toku_external_cond_t *cond) { + (*cond)->NotifyAll(); +} + +inline int toku_external_cond_timedwait(toku_external_cond_t *cond, + toku_external_mutex_t *mutex, + int64_t timeout_microsec) { + auto res = (*cond)->WaitFor(*mutex, timeout_microsec); + if (res.ok()) + return 0; + else + return ETIMEDOUT; +} + +inline void toku_external_mutex_init(toku_external_mutex_factory_t factory, + toku_external_mutex_t *mutex) { + // Use placement new: the memory has been allocated but constructor wasn't + // called + new (mutex) toku_external_mutex_t; + *mutex = factory->AllocateMutex(); +} + +inline void toku_external_mutex_lock(toku_external_mutex_t *mutex) { + (*mutex)->Lock(); +} + +inline int toku_external_mutex_trylock(toku_external_mutex_t *mutex) { + (*mutex)->Lock(); + return 0; +} + +inline void toku_external_mutex_unlock(toku_external_mutex_t *mutex) { + (*mutex)->UnLock(); +} + +inline void toku_external_mutex_destroy(toku_external_mutex_t *mutex) { + mutex->reset(); // this will destroy the managed object +} diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_instrumentation.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_instrumentation.h new file mode 100644 index 000000000..c967e7177 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_instrumentation.h @@ -0,0 +1,286 @@ +/*====== +This file is part of PerconaFT. + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. 
+ + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#pragma once + +#include <stdio.h> // FILE + +// Performance instrumentation object identifier type +typedef unsigned int pfs_key_t; + +enum class toku_instr_object_type { mutex, rwlock, cond, thread, file }; + +struct PSI_file; + +struct TOKU_FILE { + /** The real file. 
*/ + FILE *file; + struct PSI_file *key; + TOKU_FILE() : file(nullptr), key(nullptr) {} +}; + +struct PSI_mutex; +struct PSI_cond; +struct PSI_rwlock; + +struct toku_mutex_t; +struct toku_cond_t; +struct toku_pthread_rwlock_t; + +class toku_instr_key; + +class toku_instr_probe_empty { + public: + explicit toku_instr_probe_empty(UU(const toku_instr_key &key)) {} + + void start_with_source_location(UU(const char *src_file), UU(int src_line)) {} + + void stop() {} +}; + +#define TOKU_PROBE_START(p) p->start_with_source_location(__FILE__, __LINE__) +#define TOKU_PROBE_STOP(p) p->stop + +extern toku_instr_key toku_uninstrumented; + +#ifndef MYSQL_TOKUDB_ENGINE + +#include <pthread.h> + +class toku_instr_key { + public: + toku_instr_key(UU(toku_instr_object_type type), UU(const char *group), + UU(const char *name)) {} + + explicit toku_instr_key(UU(pfs_key_t key_id)) {} + // No-instrumentation constructor: + toku_instr_key() {} + ~toku_instr_key() {} +}; + +typedef toku_instr_probe_empty toku_instr_probe; + +enum class toku_instr_file_op { + file_stream_open, + file_create, + file_open, + file_delete, + file_rename, + file_read, + file_write, + file_sync, + file_stream_close, + file_close, + file_stat +}; + +struct PSI_file {}; +struct PSI_mutex {}; + +struct toku_io_instrumentation {}; + +inline int toku_pthread_create(UU(const toku_instr_key &key), pthread_t *thread, + const pthread_attr_t *attr, + void *(*start_routine)(void *), void *arg) { + return pthread_create(thread, attr, start_routine, arg); +} + +inline void toku_instr_register_current_thread() {} + +inline void toku_instr_delete_current_thread() {} + +// Instrument file creation, opening, closing, and renaming +inline void toku_instr_file_open_begin(UU(toku_io_instrumentation &io_instr), + UU(const toku_instr_key &key), + UU(toku_instr_file_op op), + UU(const char *name), + UU(const char *src_file), + UU(int src_line)) {} + +inline void toku_instr_file_stream_open_end( + UU(toku_io_instrumentation &io_instr), UU(TOKU_FILE &file)) {} + +inline void toku_instr_file_open_end(UU(toku_io_instrumentation &io_instr), + UU(int fd)) {} + +inline void toku_instr_file_name_close_begin( + UU(toku_io_instrumentation &io_instr), UU(const toku_instr_key &key), + UU(toku_instr_file_op op), UU(const char *name), UU(const char *src_file), + UU(int src_line)) {} + +inline void toku_instr_file_stream_close_begin( + UU(toku_io_instrumentation &io_instr), UU(toku_instr_file_op op), + UU(TOKU_FILE &file), UU(const char *src_file), UU(int src_line)) {} + +inline void toku_instr_file_fd_close_begin( + UU(toku_io_instrumentation &io_instr), UU(toku_instr_file_op op), + UU(int fd), UU(const char *src_file), UU(int src_line)) {} + +inline void toku_instr_file_close_end(UU(toku_io_instrumentation &io_instr), + UU(int result)) {} + +inline void toku_instr_file_io_begin(UU(toku_io_instrumentation &io_instr), + UU(toku_instr_file_op op), UU(int fd), + UU(unsigned int count), + UU(const char *src_file), + UU(int src_line)) {} + +inline void toku_instr_file_name_io_begin( + UU(toku_io_instrumentation &io_instr), UU(const toku_instr_key &key), + UU(toku_instr_file_op op), UU(const char *name), UU(unsigned int count), + UU(const char *src_file), UU(int src_line)) {} + +inline void toku_instr_file_stream_io_begin( + UU(toku_io_instrumentation &io_instr), UU(toku_instr_file_op op), + UU(TOKU_FILE &file), UU(unsigned int count), UU(const char *src_file), + UU(int src_line)) {} + +inline void toku_instr_file_io_end(UU(toku_io_instrumentation &io_instr), + UU(unsigned int 
count)) {} + +struct toku_mutex_t; + +struct toku_mutex_instrumentation {}; + +inline PSI_mutex *toku_instr_mutex_init(UU(const toku_instr_key &key), + UU(toku_mutex_t &mutex)) { + return nullptr; +} + +inline void toku_instr_mutex_destroy(UU(PSI_mutex *&mutex_instr)) {} + +inline void toku_instr_mutex_lock_start( + UU(toku_mutex_instrumentation &mutex_instr), UU(toku_mutex_t &mutex), + UU(const char *src_file), UU(int src_line)) {} + +inline void toku_instr_mutex_trylock_start( + UU(toku_mutex_instrumentation &mutex_instr), UU(toku_mutex_t &mutex), + UU(const char *src_file), UU(int src_line)) {} + +inline void toku_instr_mutex_lock_end( + UU(toku_mutex_instrumentation &mutex_instr), + UU(int pthread_mutex_lock_result)) {} + +inline void toku_instr_mutex_unlock(UU(PSI_mutex *mutex_instr)) {} + +struct toku_cond_instrumentation {}; + +enum class toku_instr_cond_op { + cond_wait, + cond_timedwait, +}; + +inline PSI_cond *toku_instr_cond_init(UU(const toku_instr_key &key), + UU(toku_cond_t &cond)) { + return nullptr; +} + +inline void toku_instr_cond_destroy(UU(PSI_cond *&cond_instr)) {} + +inline void toku_instr_cond_wait_start( + UU(toku_cond_instrumentation &cond_instr), UU(toku_instr_cond_op op), + UU(toku_cond_t &cond), UU(toku_mutex_t &mutex), UU(const char *src_file), + UU(int src_line)) {} + +inline void toku_instr_cond_wait_end(UU(toku_cond_instrumentation &cond_instr), + UU(int pthread_cond_wait_result)) {} + +inline void toku_instr_cond_signal(UU(toku_cond_t &cond)) {} + +inline void toku_instr_cond_broadcast(UU(toku_cond_t &cond)) {} + +#if 0 +// rw locks are not used +// rwlock instrumentation +struct toku_rwlock_instrumentation {}; + +inline PSI_rwlock *toku_instr_rwlock_init(UU(const toku_instr_key &key), + UU(toku_pthread_rwlock_t &rwlock)) { + return nullptr; +} + +inline void toku_instr_rwlock_destroy(UU(PSI_rwlock *&rwlock_instr)) {} + +inline void toku_instr_rwlock_rdlock_wait_start( + UU(toku_rwlock_instrumentation &rwlock_instr), + UU(toku_pthread_rwlock_t &rwlock), + UU(const char *src_file), + UU(int src_line)) {} + +inline void toku_instr_rwlock_wrlock_wait_start( + UU(toku_rwlock_instrumentation &rwlock_instr), + UU(toku_pthread_rwlock_t &rwlock), + UU(const char *src_file), + UU(int src_line)) {} + +inline void toku_instr_rwlock_rdlock_wait_end( + UU(toku_rwlock_instrumentation &rwlock_instr), + UU(int pthread_rwlock_wait_result)) {} + +inline void toku_instr_rwlock_wrlock_wait_end( + UU(toku_rwlock_instrumentation &rwlock_instr), + UU(int pthread_rwlock_wait_result)) {} + +inline void toku_instr_rwlock_unlock(UU(toku_pthread_rwlock_t &rwlock)) {} +#endif + +#else // MYSQL_TOKUDB_ENGINE +// There can be not only mysql but also mongodb or any other PFS stuff +#include <toku_instr_mysql.h> +#endif // MYSQL_TOKUDB_ENGINE + +// Mutexes +extern toku_instr_key manager_escalation_mutex_key; +extern toku_instr_key manager_escalator_mutex_key; +extern toku_instr_key manager_mutex_key; +extern toku_instr_key treenode_mutex_key; +extern toku_instr_key locktree_request_info_mutex_key; +extern toku_instr_key locktree_request_info_retry_mutex_key; + +// condition vars +extern toku_instr_key lock_request_m_wait_cond_key; +extern toku_instr_key locktree_request_info_retry_cv_key; +extern toku_instr_key manager_m_escalator_done_key; // unused diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_portability.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_portability.h new file mode 100644 index 
000000000..9a95b38bd --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_portability.h @@ -0,0 +1,87 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." 
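+
+// Illustrative only (hypothetical variable names): CAST_FROM_VOIDP, defined
+// below, assigns a void * result to an already-declared typed pointer,
+// deducing the cast from the variable itself:
+//
+//   struct foo *f;
+//   CAST_FROM_VOIDP(f, toku_xmalloc(sizeof(*f)));
+//   // in C++ this expands to: f = static_cast<struct foo *>(toku_xmalloc(...))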
+ +#pragma once + +#if defined(__clang__) +#define constexpr_static_assert(a, b) +#else +#define constexpr_static_assert(a, b) static_assert(a, b) +#endif + +// include here, before they get deprecated +#include <inttypes.h> +#include <stdint.h> +#include <stdio.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/types.h> +#include <unistd.h> + +#include "toku_atomic.h" + +#if defined(__cplusplus) +#include <type_traits> +#endif + +#if defined(__cplusplus) +// decltype() here gives a reference-to-pointer instead of just a pointer, +// just use __typeof__ +#define CAST_FROM_VOIDP(name, value) name = static_cast<__typeof__(name)>(value) +#else +#define CAST_FROM_VOIDP(name, value) name = cast_to_typeof(name)(value) +#endif + +#define UU(x) x __attribute__((__unused__)) + +#include "toku_instrumentation.h" diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_pthread.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_pthread.h new file mode 100644 index 000000000..571b950e1 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_pthread.h @@ -0,0 +1,520 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." 
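+
+// Illustrative only (hypothetical mutex/cond/flag names, not part of this
+// diff): the wrappers declared below mirror the pthread API while routing
+// through the PFS-style instrumentation hooks. A typical guarded wait,
+// assuming a `ready` flag protected by the mutex:
+//
+//   toku_mutex_t m;
+//   toku_cond_t c;
+//   toku_mutex_init(toku_uninstrumented, &m, nullptr);
+//   toku_cond_init(toku_uninstrumented, &c, nullptr);
+//   toku_mutex_lock(&m);  // expands to toku_mutex_lock_with_source_location
+//   while (!ready) toku_cond_wait(&c, &m);
+//   toku_mutex_unlock(&m);
+//   toku_cond_destroy(&c);
+//   toku_mutex_destroy(&m);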
+ +#pragma once + +#include <pthread.h> +#include <stdint.h> +#include <time.h> + +#include "toku_portability.h" +// PORT2: #include "toku_assert.h" + +// TODO: some things moved toku_instrumentation.h, not necessarily the best +// place +typedef pthread_attr_t toku_pthread_attr_t; +typedef pthread_t toku_pthread_t; +typedef pthread_mutex_t toku_pthread_mutex_t; +typedef pthread_condattr_t toku_pthread_condattr_t; +typedef pthread_cond_t toku_pthread_cond_t; +typedef pthread_rwlockattr_t toku_pthread_rwlockattr_t; +typedef pthread_key_t toku_pthread_key_t; +typedef struct timespec toku_timespec_t; + +// TODO: break this include loop +#include <pthread.h> +typedef pthread_mutexattr_t toku_pthread_mutexattr_t; + +struct toku_mutex_t { + pthread_mutex_t pmutex; + struct PSI_mutex *psi_mutex; /* The performance schema instrumentation hook */ +#if defined(TOKU_PTHREAD_DEBUG) + pthread_t owner; // = pthread_self(); // for debugging + bool locked; + bool valid; + pfs_key_t instr_key_id; +#endif // defined(TOKU_PTHREAD_DEBUG) +}; + +struct toku_cond_t { + pthread_cond_t pcond; + struct PSI_cond *psi_cond; +#if defined(TOKU_PTHREAD_DEBUG) + pfs_key_t instr_key_id; +#endif // defined(TOKU_PTHREAD_DEBUG) +}; + +#if defined(TOKU_PTHREAD_DEBUG) +#define TOKU_COND_INITIALIZER \ + { .pcond = PTHREAD_COND_INITIALIZER, .psi_cond = nullptr, .instr_key_id = 0 } +#else +#define TOKU_COND_INITIALIZER \ + { .pcond = PTHREAD_COND_INITIALIZER, .psi_cond = nullptr } +#endif // defined(TOKU_PTHREAD_DEBUG) + +struct toku_pthread_rwlock_t { + pthread_rwlock_t rwlock; + struct PSI_rwlock *psi_rwlock; +#if defined(TOKU_PTHREAD_DEBUG) + pfs_key_t instr_key_id; +#endif // defined(TOKU_PTHREAD_DEBUG) +}; + +typedef struct toku_mutex_aligned { + toku_mutex_t aligned_mutex __attribute__((__aligned__(64))); +} toku_mutex_aligned_t; + +// Initializing with {} will fill in a struct with all zeros. +// But you may also need a pragma to suppress the warnings, as follows +// +// #pragma GCC diagnostic push +// #pragma GCC diagnostic ignored "-Wmissing-field-initializers" +// toku_mutex_t foo = ZERO_MUTEX_INITIALIZER; +// #pragma GCC diagnostic pop +// +// In general it will be a lot of busy work to make this codebase compile +// cleanly with -Wmissing-field-initializers + +#define ZERO_MUTEX_INITIALIZER \ + {} + +#if defined(TOKU_PTHREAD_DEBUG) +#define TOKU_MUTEX_INITIALIZER \ + { \ + .pmutex = PTHREAD_MUTEX_INITIALIZER, .psi_mutex = nullptr, .owner = 0, \ + .locked = false, .valid = true, .instr_key_id = 0 \ + } +#else +#define TOKU_MUTEX_INITIALIZER \ + { .pmutex = PTHREAD_MUTEX_INITIALIZER, .psi_mutex = nullptr } +#endif // defined(TOKU_PTHREAD_DEBUG) + +// Darwin doesn't provide adaptive mutexes +#if defined(__APPLE__) +#define TOKU_MUTEX_ADAPTIVE PTHREAD_MUTEX_DEFAULT +#if defined(TOKU_PTHREAD_DEBUG) +#define TOKU_ADAPTIVE_MUTEX_INITIALIZER \ + { \ + .pmutex = PTHREAD_MUTEX_INITIALIZER, .psi_mutex = nullptr, .owner = 0, \ + .locked = false, .valid = true, .instr_key_id = 0 \ + } +#else +#define TOKU_ADAPTIVE_MUTEX_INITIALIZER \ + { .pmutex = PTHREAD_MUTEX_INITIALIZER, .psi_mutex = nullptr } +#endif // defined(TOKU_PTHREAD_DEBUG) +#else // __FreeBSD__, __linux__, at least +#if defined(__GLIBC__) +#define TOKU_MUTEX_ADAPTIVE PTHREAD_MUTEX_ADAPTIVE_NP +#else +// not all libc (e.g. 
musl) implement NP (Non-POSIX) attributes
+#define TOKU_MUTEX_ADAPTIVE PTHREAD_MUTEX_DEFAULT
+#endif
+#if defined(TOKU_PTHREAD_DEBUG)
+#define TOKU_ADAPTIVE_MUTEX_INITIALIZER                                     \
+  {                                                                         \
+    .pmutex = PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP, .psi_mutex = nullptr,  \
+    .owner = 0, .locked = false, .valid = true, .instr_key_id = 0           \
+  }
+#else
+#define TOKU_ADAPTIVE_MUTEX_INITIALIZER \
+  { .pmutex = PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP, .psi_mutex = nullptr }
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+#endif  // defined(__APPLE__)
+
+// Different OSes implement mutexes as different amounts of nested structs.
+// C++ will fill out all missing values with zeroes if you provide at least one
+// zero, but it needs the right amount of nesting.
+#if defined(__FreeBSD__)
+#define ZERO_COND_INITIALIZER \
+  { 0 }
+#elif defined(__APPLE__)
+#define ZERO_COND_INITIALIZER \
+  {                           \
+    { 0 }                     \
+  }
+#else  // __linux__, at least
+#define ZERO_COND_INITIALIZER \
+  {}
+#endif
+
+static inline void toku_mutexattr_init(toku_pthread_mutexattr_t *attr) {
+  int r = pthread_mutexattr_init(attr);
+  assert_zero(r);
+}
+
+static inline void toku_mutexattr_settype(toku_pthread_mutexattr_t *attr,
+                                          int type) {
+  int r = pthread_mutexattr_settype(attr, type);
+  assert_zero(r);
+}
+
+static inline void toku_mutexattr_destroy(toku_pthread_mutexattr_t *attr) {
+  int r = pthread_mutexattr_destroy(attr);
+  assert_zero(r);
+}
+
+#if defined(TOKU_PTHREAD_DEBUG)
+static inline void toku_mutex_assert_locked(const toku_mutex_t *mutex) {
+  invariant(mutex->locked);
+  invariant(mutex->owner == pthread_self());
+}
+#else
+static inline void toku_mutex_assert_locked(const toku_mutex_t *mutex
+                                            __attribute__((unused))) {}
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+
+// asserting that a mutex is unlocked only makes sense
+// if the calling thread can guarantee that no other threads
+// are trying to lock this mutex at the time of the assertion
+//
+// a good example of this is a tree with mutexes on each node.
+// when a node is locked the caller knows that no other threads
+// can be trying to lock its children's mutexes. the children
+// are in one of two fixed states: locked or unlocked.
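+//
+// illustrative only (hypothetical tree fields): with the parent held, the
+// assertion below is sound because no other thread can be taking a child's
+// mutex.
+//
+//   toku_mutex_lock(&parent->mutex);
+//   for (node *child = parent->first_child; child != nullptr;
+//        child = child->next)
+//     toku_mutex_assert_unlocked(&child->mutex);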
+#if defined(TOKU_PTHREAD_DEBUG)
+static inline void toku_mutex_assert_unlocked(toku_mutex_t *mutex) {
+  invariant(mutex->owner == 0);
+  invariant(!mutex->locked);
+}
+#else
+static inline void toku_mutex_assert_unlocked(toku_mutex_t *mutex
+                                              __attribute__((unused))) {}
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+
+#define toku_mutex_lock(M) \
+  toku_mutex_lock_with_source_location(M, __FILE__, __LINE__)
+
+static inline void toku_cond_init(toku_cond_t *cond,
+                                  const toku_pthread_condattr_t *attr) {
+  int r = pthread_cond_init(&cond->pcond, attr);
+  assert_zero(r);
+}
+
+#define toku_mutex_trylock(M) \
+  toku_mutex_trylock_with_source_location(M, __FILE__, __LINE__)
+
+inline void toku_mutex_unlock(toku_mutex_t *mutex) {
+#if defined(TOKU_PTHREAD_DEBUG)
+  invariant(mutex->owner == pthread_self());
+  invariant(mutex->valid);
+  invariant(mutex->locked);
+  mutex->locked = false;
+  mutex->owner = 0;
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+  toku_instr_mutex_unlock(mutex->psi_mutex);
+  int r = pthread_mutex_unlock(&mutex->pmutex);
+  assert_zero(r);
+}
+
+inline void toku_mutex_lock_with_source_location(toku_mutex_t *mutex,
+                                                 const char *src_file,
+                                                 int src_line) {
+  toku_mutex_instrumentation mutex_instr;
+  toku_instr_mutex_lock_start(mutex_instr, *mutex, src_file, src_line);
+
+  const int r = pthread_mutex_lock(&mutex->pmutex);
+  toku_instr_mutex_lock_end(mutex_instr, r);
+
+  assert_zero(r);
+#if defined(TOKU_PTHREAD_DEBUG)
+  invariant(mutex->valid);
+  invariant(!mutex->locked);
+  invariant(mutex->owner == 0);
+  mutex->locked = true;
+  mutex->owner = pthread_self();
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+}
+
+inline int toku_mutex_trylock_with_source_location(toku_mutex_t *mutex,
+                                                   const char *src_file,
+                                                   int src_line) {
+  toku_mutex_instrumentation mutex_instr;
+  toku_instr_mutex_trylock_start(mutex_instr, *mutex, src_file, src_line);
+
+  // A trylock must not block; it also has to be able to return EBUSY for
+  // the r == 0 bookkeeping below, so call pthread_mutex_trylock() rather
+  // than pthread_mutex_lock().
+  const int r = pthread_mutex_trylock(&mutex->pmutex);
+  toku_instr_mutex_lock_end(mutex_instr, r);
+
+#if defined(TOKU_PTHREAD_DEBUG)
+  if (r == 0) {
+    invariant(mutex->valid);
+    invariant(!mutex->locked);
+    invariant(mutex->owner == 0);
+    mutex->locked = true;
+    mutex->owner = pthread_self();
+  }
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+  return r;
+}
+
+#define toku_cond_wait(C, M) \
+  toku_cond_wait_with_source_location(C, M, __FILE__, __LINE__)
+
+#define toku_cond_timedwait(C, M, W) \
+  toku_cond_timedwait_with_source_location(C, M, W, __FILE__, __LINE__)
+
+inline void toku_cond_init(const toku_instr_key &key, toku_cond_t *cond,
+                           const pthread_condattr_t *attr) {
+  toku_instr_cond_init(key, *cond);
+  int r = pthread_cond_init(&cond->pcond, attr);
+  assert_zero(r);
+}
+
+inline void toku_cond_destroy(toku_cond_t *cond) {
+  toku_instr_cond_destroy(cond->psi_cond);
+  int r = pthread_cond_destroy(&cond->pcond);
+  assert_zero(r);
+}
+
+inline void toku_cond_wait_with_source_location(toku_cond_t *cond,
+                                                toku_mutex_t *mutex,
+                                                const char *src_file,
+                                                int src_line) {
+#if defined(TOKU_PTHREAD_DEBUG)
+  invariant(mutex->locked);
+  mutex->locked = false;
+  mutex->owner = 0;
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+
+  /* Instrumentation start */
+  toku_cond_instrumentation cond_instr;
+  toku_instr_cond_wait_start(cond_instr, toku_instr_cond_op::cond_wait, *cond,
+                             *mutex, src_file, src_line);
+
+  /* Instrumented code */
+  const int r = pthread_cond_wait(&cond->pcond, &mutex->pmutex);
+
+  /* Instrumentation end */
+  toku_instr_cond_wait_end(cond_instr, r);
+
+  assert_zero(r);
+#if defined(TOKU_PTHREAD_DEBUG)
+  invariant(!mutex->locked);
+  mutex->locked = true;
+
mutex->owner = pthread_self(); +#endif // defined(TOKU_PTHREAD_DEBUG) +} + +inline int toku_cond_timedwait_with_source_location(toku_cond_t *cond, + toku_mutex_t *mutex, + toku_timespec_t *wakeup_at, + const char *src_file, + int src_line) { +#if defined(TOKU_PTHREAD_DEBUG) + invariant(mutex->locked); + mutex->locked = false; + mutex->owner = 0; +#endif // defined(TOKU_PTHREAD_DEBUG) + + /* Instrumentation start */ + toku_cond_instrumentation cond_instr; + toku_instr_cond_wait_start(cond_instr, toku_instr_cond_op::cond_timedwait, + *cond, *mutex, src_file, src_line); + + /* Instrumented code */ + const int r = pthread_cond_timedwait(&cond->pcond, &mutex->pmutex, wakeup_at); + + /* Instrumentation end */ + toku_instr_cond_wait_end(cond_instr, r); + +#if defined(TOKU_PTHREAD_DEBUG) + invariant(!mutex->locked); + mutex->locked = true; + mutex->owner = pthread_self(); +#endif // defined(TOKU_PTHREAD_DEBUG) + return r; +} + +inline void toku_cond_signal(toku_cond_t *cond) { + toku_instr_cond_signal(*cond); + const int r = pthread_cond_signal(&cond->pcond); + assert_zero(r); +} + +inline void toku_cond_broadcast(toku_cond_t *cond) { + toku_instr_cond_broadcast(*cond); + const int r = pthread_cond_broadcast(&cond->pcond); + assert_zero(r); +} + +inline void toku_mutex_init(const toku_instr_key &key, toku_mutex_t *mutex, + const toku_pthread_mutexattr_t *attr) { +#if defined(TOKU_PTHREAD_DEBUG) + mutex->valid = true; +#endif // defined(TOKU_PTHREAD_DEBUG) + toku_instr_mutex_init(key, *mutex); + const int r = pthread_mutex_init(&mutex->pmutex, attr); + assert_zero(r); +#if defined(TOKU_PTHREAD_DEBUG) + mutex->locked = false; + invariant(mutex->valid); + mutex->valid = true; + mutex->owner = 0; +#endif // defined(TOKU_PTHREAD_DEBUG) +} + +inline void toku_mutex_destroy(toku_mutex_t *mutex) { +#if defined(TOKU_PTHREAD_DEBUG) + invariant(mutex->valid); + mutex->valid = false; + invariant(!mutex->locked); +#endif // defined(TOKU_PTHREAD_DEBUG) + toku_instr_mutex_destroy(mutex->psi_mutex); + int r = pthread_mutex_destroy(&mutex->pmutex); + assert_zero(r); +} + +#define toku_pthread_rwlock_rdlock(RW) \ + toku_pthread_rwlock_rdlock_with_source_location(RW, __FILE__, __LINE__) + +#define toku_pthread_rwlock_wrlock(RW) \ + toku_pthread_rwlock_wrlock_with_source_location(RW, __FILE__, __LINE__) + +#if 0 +inline void toku_pthread_rwlock_init( + const toku_instr_key &key, + toku_pthread_rwlock_t *__restrict rwlock, + const toku_pthread_rwlockattr_t *__restrict attr) { + toku_instr_rwlock_init(key, *rwlock); + int r = pthread_rwlock_init(&rwlock->rwlock, attr); + assert_zero(r); +} + +inline void toku_pthread_rwlock_destroy(toku_pthread_rwlock_t *rwlock) { + toku_instr_rwlock_destroy(rwlock->psi_rwlock); + int r = pthread_rwlock_destroy(&rwlock->rwlock); + assert_zero(r); +} + +inline void toku_pthread_rwlock_rdlock_with_source_location( + toku_pthread_rwlock_t *rwlock, + const char *src_file, + uint src_line) { + + /* Instrumentation start */ + toku_rwlock_instrumentation rwlock_instr; + toku_instr_rwlock_rdlock_wait_start( + rwlock_instr, *rwlock, src_file, src_line); + /* Instrumented code */ + const int r = pthread_rwlock_rdlock(&rwlock->rwlock); + + /* Instrumentation end */ + toku_instr_rwlock_rdlock_wait_end(rwlock_instr, r); + + assert_zero(r); +} + +inline void toku_pthread_rwlock_wrlock_with_source_location( + toku_pthread_rwlock_t *rwlock, + const char *src_file, + uint src_line) { + + /* Instrumentation start */ + toku_rwlock_instrumentation rwlock_instr; + toku_instr_rwlock_wrlock_wait_start( + 
rwlock_instr, *rwlock, src_file, src_line); + /* Instrumented code */ + const int r = pthread_rwlock_wrlock(&rwlock->rwlock); + + /* Instrumentation end */ + toku_instr_rwlock_wrlock_wait_end(rwlock_instr, r); + + assert_zero(r); +} + +inline void toku_pthread_rwlock_rdunlock(toku_pthread_rwlock_t *rwlock) { + toku_instr_rwlock_unlock(*rwlock); + const int r = pthread_rwlock_unlock(&rwlock->rwlock); + assert_zero(r); +} + +inline void toku_pthread_rwlock_wrunlock(toku_pthread_rwlock_t *rwlock) { + toku_instr_rwlock_unlock(*rwlock); + const int r = pthread_rwlock_unlock(&rwlock->rwlock); + assert_zero(r); +} +#endif + +static inline int toku_pthread_join(toku_pthread_t thread, void **value_ptr) { + return pthread_join(thread, value_ptr); +} + +static inline int toku_pthread_detach(toku_pthread_t thread) { + return pthread_detach(thread); +} + +static inline int toku_pthread_key_create(toku_pthread_key_t *key, + void (*destroyf)(void *)) { + return pthread_key_create(key, destroyf); +} + +static inline int toku_pthread_key_delete(toku_pthread_key_t key) { + return pthread_key_delete(key); +} + +static inline void *toku_pthread_getspecific(toku_pthread_key_t key) { + return pthread_getspecific(key); +} + +static inline int toku_pthread_setspecific(toku_pthread_key_t key, void *data) { + return pthread_setspecific(key, data); +} + +int toku_pthread_yield(void) __attribute__((__visibility__("default"))); + +static inline toku_pthread_t toku_pthread_self(void) { return pthread_self(); } + +static inline void *toku_pthread_done(void *exit_value) { + toku_instr_delete_current_thread(); + pthread_exit(exit_value); + return nullptr; // Avoid compiler warning +} diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_race_tools.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_race_tools.h new file mode 100644 index 000000000..3cb5b5790 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_race_tools.h @@ -0,0 +1,179 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. 
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+// PORT2: #include <portability/toku_config.h>
+
+#ifdef HAVE_valgrind
+#undef USE_VALGRIND
+#define USE_VALGRIND 1
+#endif
+
+#if defined(__linux__) && USE_VALGRIND
+
+#include <valgrind/drd.h>
+#include <valgrind/helgrind.h>
+
+#define TOKU_ANNOTATE_NEW_MEMORY(p, size) ANNOTATE_NEW_MEMORY(p, size)
+#define TOKU_VALGRIND_HG_ENABLE_CHECKING(p, size) \
+  VALGRIND_HG_ENABLE_CHECKING(p, size)
+#define TOKU_VALGRIND_HG_DISABLE_CHECKING(p, size) \
+  VALGRIND_HG_DISABLE_CHECKING(p, size)
+#define TOKU_DRD_IGNORE_VAR(v) DRD_IGNORE_VAR(v)
+#define TOKU_DRD_STOP_IGNORING_VAR(v) DRD_STOP_IGNORING_VAR(v)
+#define TOKU_ANNOTATE_IGNORE_READS_BEGIN() ANNOTATE_IGNORE_READS_BEGIN()
+#define TOKU_ANNOTATE_IGNORE_READS_END() ANNOTATE_IGNORE_READS_END()
+#define TOKU_ANNOTATE_IGNORE_WRITES_BEGIN() ANNOTATE_IGNORE_WRITES_BEGIN()
+#define TOKU_ANNOTATE_IGNORE_WRITES_END() ANNOTATE_IGNORE_WRITES_END()
+
+/*
+ * How to make helgrind happy about tree rotations and new mutex orderings:
+ *
+ * // Tell helgrind that we unlocked it so that the next call doesn't get a
+ * "destroyed a locked mutex" error.
+ * // Tell helgrind that we destroyed the mutex.
+ * VALGRIND_HG_MUTEX_UNLOCK_PRE(&locka);
+ * VALGRIND_HG_MUTEX_DESTROY_PRE(&locka);
+ *
+ * // And recreate it. It would be better to simply be able to say that the
+ * order on these two can now be reversed, because this code forgets all the
+ * ordering information for this mutex.
+ * // Then tell helgrind that we have locked it again.
+ * VALGRIND_HG_MUTEX_INIT_POST(&locka, 0);
+ * VALGRIND_HG_MUTEX_LOCK_POST(&locka);
+ *
+ * When the ordering of two locks changes, we don't need to tell Helgrind
+ * about both locks. Just one is good enough.
+ */
+
+#define TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(mutex) \
+  VALGRIND_HG_MUTEX_UNLOCK_PRE(mutex);                 \
+  VALGRIND_HG_MUTEX_DESTROY_PRE(mutex);                \
+  VALGRIND_HG_MUTEX_INIT_POST(mutex, 0);               \
+  VALGRIND_HG_MUTEX_LOCK_POST(mutex);
+
+#else  // !defined(__linux__) || !USE_VALGRIND
+
+#define NVALGRIND 1
+#define TOKU_ANNOTATE_NEW_MEMORY(p, size) ((void)0)
+#define TOKU_VALGRIND_HG_ENABLE_CHECKING(p, size) ((void)0)
+#define TOKU_VALGRIND_HG_DISABLE_CHECKING(p, size) ((void)0)
+#define TOKU_DRD_IGNORE_VAR(v)
+#define TOKU_DRD_STOP_IGNORING_VAR(v)
+#define TOKU_ANNOTATE_IGNORE_READS_BEGIN() ((void)0)
+#define TOKU_ANNOTATE_IGNORE_READS_END() ((void)0)
+#define TOKU_ANNOTATE_IGNORE_WRITES_BEGIN() ((void)0)
+#define TOKU_ANNOTATE_IGNORE_WRITES_END() ((void)0)
+#define TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(mutex)
+#undef RUNNING_ON_VALGRIND
+#define RUNNING_ON_VALGRIND (0U)
+#endif
+
+// Valgrind 3.10.1 (and previous versions).
+// Problems with VALGRIND_HG_DISABLE_CHECKING and VALGRIND_HG_ENABLE_CHECKING.
+// Helgrind's implementation of disable and enable checking causes false races
+// to be reported.
In addition, the race report does not include ANY +// information about the code that uses the helgrind disable and enable +// functions. Therefore, it is very difficult to figure out the cause of the +// race. DRD does implement the disable and enable functions. + +// Problems with ANNOTATE_IGNORE_READS. +// Helgrind does not implement ignore reads. +// Annotate ignore reads is the way to inform DRD to ignore racy reads. + +// FT code uses unsafe reads in several places. These unsafe reads have been +// noted as valid since they use the toku_unsafe_fetch function. Unfortunately, +// this causes helgrind to report erroneous data races which makes use of +// helgrind problematic. + +// Unsafely fetch and return a `T' from src, telling drd to ignore +// racey access to src for the next sizeof(*src) bytes +template <typename T> +T toku_unsafe_fetch(T *src) { + if (0) + TOKU_VALGRIND_HG_DISABLE_CHECKING(src, + sizeof *src); // disabled, see comment + TOKU_ANNOTATE_IGNORE_READS_BEGIN(); + T r = *src; + TOKU_ANNOTATE_IGNORE_READS_END(); + if (0) + TOKU_VALGRIND_HG_ENABLE_CHECKING(src, + sizeof *src); // disabled, see comment + return r; +} + +template <typename T> +T toku_unsafe_fetch(T &src) { + return toku_unsafe_fetch(&src); +} + +// Unsafely set a `T' value into *dest from src, telling drd to ignore +// racey access to dest for the next sizeof(*dest) bytes +template <typename T> +void toku_unsafe_set(T *dest, const T src) { + if (0) + TOKU_VALGRIND_HG_DISABLE_CHECKING(dest, + sizeof *dest); // disabled, see comment + TOKU_ANNOTATE_IGNORE_WRITES_BEGIN(); + *dest = src; + TOKU_ANNOTATE_IGNORE_WRITES_END(); + if (0) + TOKU_VALGRIND_HG_ENABLE_CHECKING(dest, + sizeof *dest); // disabled, see comment +} + +template <typename T> +void toku_unsafe_set(T &dest, const T src) { + toku_unsafe_set(&dest, src); +} diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h new file mode 100644 index 000000000..46111e7f0 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h @@ -0,0 +1,193 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. 
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+// PORT2: #include "toku_config.h"
+
+#include <stdint.h>
+#include <sys/time.h>
+#include <time.h>
+#if defined(__powerpc__)
+#include <sys/platform/ppc.h>
+#endif
+
+#if 0
+static inline float toku_tdiff (struct timeval *a, struct timeval *b) {
+    return (float)((a->tv_sec - b->tv_sec) + 1e-6 * (a->tv_usec - b->tv_usec));
+}
+// PORT2: temporary:
+#define HAVE_CLOCK_REALTIME
+#if !defined(HAVE_CLOCK_REALTIME)
+// OS X does not have clock_gettime, we fake clockid_t for the interface, and we'll implement it with clock_get_time.
+typedef int clockid_t;
+// just something bogus, it doesn't matter, we just want to make sure we're
+// only supporting this mode because we're not sure we can support other modes
+// without a real clock_gettime()
+#define CLOCK_REALTIME 0x01867234
+#endif
+int toku_clock_gettime(clockid_t clk_id, struct timespec *ts) __attribute__((__visibility__("default")));
+#endif
+
+// *************** Performance timers ************************
+// What do you really want from a performance timer:
+//  (1) Can determine actual time of day from the performance time.
+//  (2) Time goes forward, never backward.
+//  (3) Same time on different processors (or even different machines).
+//  (4) Time goes forward at a constant rate (doesn't get faster and slower)
+//  (5) Portable.
+//  (6) Getting the time is cheap.
+// Unfortunately it seems tough to get Properties 1-5. So we go for Property 6,
+// but we abstract it. We offer a type tokutime_t which can hold the time. This
+// type can be subtracted to get a time difference. We can get the present time
+// cheaply. We can convert this type to seconds (but that can be expensive). The
+// implementation is to use RDTSC (hence we lose property 3: not portable).
+// Recent machines have constant_tsc in which case we get property (4).
+// Recent OSs on recent machines (that have RDTSCP) fix the per-processor clock
+// skew, so we get property (3). We get property 2 with RDTSC (as long as
+// there's not any skew). We don't even try to get property 1, since we don't
+// need it. The decision here is that these times are really accurate only on
+// modern machines with modern OSs.
+typedef uint64_t tokutime_t;  // Time type used by tokutek timers.
+
+#if 0
+// The value of tokutime_t is not specified here.
+// It might be microseconds since 1/1/1970 (if gettimeofday() is
+// used), or clock cycles since boot (if rdtsc is used). Or something
+// else.
+// Two tokutime_t values can be subtracted to get a time difference.
+// Use tokutime_to_seconds to convert that difference to seconds.
+// We want get_tokutime() to be fast, but don't care so much about tokutime_to_seconds();
+//
+// For accurate time calculations do the subtraction in the right order:
+//   Right: tokutime_to_seconds(t1-t2);
+//   Wrong: tokutime_to_seconds(t1)-tokutime_to_seconds(t2);
+// Doing it the wrong way is likely to result in loss of precision.
+// A double can hold numbers up to about 53 bits. RDTSC uses about 33 bits
+// every second, so that leaves 2^20 seconds from booting (about 2 weeks)
+// before the RDTSC value cannot be represented accurately as a double.
+//
+double tokutime_to_seconds(tokutime_t) __attribute__((__visibility__("default")));  // Convert tokutime to seconds.
+
+#endif
+
+// Get the value of tokutime for right now. We want this to be fast, so we
+// expose the implementation as RDTSC.
+static inline tokutime_t toku_time_now(void) {
+#if defined(__x86_64__) || defined(__i386__)
+  uint32_t lo, hi;
+  __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
+  return (uint64_t)hi << 32 | lo;
+#elif defined(__aarch64__)
+  uint64_t result;
+  __asm __volatile__("mrs %[rt], cntvct_el0" : [rt] "=r"(result));
+  return result;
+#elif defined(__powerpc__)
+  return __ppc_get_timebase();
+#elif defined(__s390x__)
+  uint64_t result;
+  asm volatile("stckf %0" : "=Q"(result) : : "cc");
+  return result;
+#elif defined(__riscv) && __riscv_xlen == 32
+  uint32_t cycles_lo, cycles_hi0, cycles_hi1;
+  // Implemented in assembly because Clang insisted on branching.
+  asm volatile(
+      "rdcycleh %0\n"
+      "rdcycle %1\n"
+      "rdcycleh %2\n"
+      "sub %0, %0, %2\n"
+      "seqz %0, %0\n"
+      "sub %0, zero, %0\n"
+      "and %1, %1, %0\n"
+      : "=r"(cycles_hi0), "=r"(cycles_lo), "=r"(cycles_hi1));
+  return (static_cast<uint64_t>(cycles_hi1) << 32) | cycles_lo;
+#elif defined(__riscv) && __riscv_xlen == 64
+  uint64_t cycles;
+  asm volatile("rdcycle %0" : "=r"(cycles));
+  return cycles;
+#else
+#error No timer implementation for this platform
+#endif
+}
+
+static inline uint64_t toku_current_time_microsec(void) {
+  struct timeval t;
+  gettimeofday(&t, NULL);
+  return t.tv_sec * (1UL * 1000 * 1000) + t.tv_usec;
+}
+
+#if 0
+// sleep microseconds
+static inline void toku_sleep_microsec(uint64_t ms) {
+  struct timeval t;
+
+  t.tv_sec = ms / 1000000;
+  t.tv_usec = ms % 1000000;
+
+  select(0, NULL, NULL, NULL, &t);
+}
+#endif
+
+/*
+  PORT: Usage of this file:
+
+  uint64_t toku_current_time_microsec()  // uses gettimeofday
+    is used to track how much time various operations took (for example, lock
+    escalation). (TODO: it is not clear why these operations are tracked with
+    microsecond precision while others use nanoseconds)
+
+  tokutime_t toku_time_now()  // uses rdtsc
+    seems to be used for a very similar purpose. This has greater precision.
+
+  RocksDB environment provides Env::Default()->NowMicros() and NowNanos() which
+  should be adequate substitutes.
+*/
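A short sketch of how these two clocks are meant to be used (do_some_work() and record_wait_stats() are hypothetical stand-ins, not part of this diff). Note that a raw tokutime_t delta is a tick count (TSC cycles on x86), not seconds; since tokutime_to_seconds() is compiled out in this port, the delta is kept as an opaque tick count, which is how the lock tree's STATUS_TOKUTIME counters treat it:

void example_timed_section(void) {
  tokutime_t t_start = toku_time_now();              // cheap, high resolution
  uint64_t us_start = toku_current_time_microsec();  // wall-clock microseconds

  do_some_work();  // hypothetical workload being measured

  tokutime_t ticks_waited = toku_time_now() - t_start;
  uint64_t us_waited = toku_current_time_microsec() - us_start;
  record_wait_stats(ticks_waited, us_waited);  // hypothetical sink
}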
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/txn_subst.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/txn_subst.h
new file mode 100644
index 000000000..803914862
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/txn_subst.h
@@ -0,0 +1,27 @@
+//
+// A substitute for ft/txn/txn.h
+//
+#pragma once
+
+#include <set>
+
+#include "../util/omt.h"
+
+typedef uint64_t TXNID;
+#define TXNID_NONE ((TXNID)0)
+
+// A set of transactions
+// (TODO: consider using class toku::txnid_set. The reason for using an STL
+// container was that its API is easier)
+class TxnidVector : public std::set<TXNID> {
+ public:
+  bool contains(TXNID txnid) { return find(txnid) != end(); }
+};
+
+// A value for lock structures with the meaning "the lock is owned by multiple
+// transactions (and one has to check the TxnidVector to get their ids)"
+#define TXNID_SHARED (TXNID(-1))
+
+// Auxiliary value meaning "any transaction id will do". No real transaction
+// may have this as its id.
+#define TXNID_ANY (TXNID(-2))
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/standalone_port.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/standalone_port.cc
new file mode 100644
index 000000000..50dc879ce
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/standalone_port.cc
@@ -0,0 +1,132 @@
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+/*
+  This is a dumping ground to make the Lock Tree work without the rest of
+  TokuDB.
+*/
+#include <string.h>
+
+#include "db.h"
+#include "ft/ft-status.h"
+#include "portability/memory.h"
+#include "util/dbt.h"
+
+// portability/os_malloc.cc
+
+void toku_free(void *p) { free(p); }
+
+void *toku_xmalloc(size_t size) { return malloc(size); }
+
+void *toku_xrealloc(void *v, size_t size) { return realloc(v, size); }
+
+void *toku_xmemdup(const void *v, size_t len) {
+  void *p = toku_xmalloc(len);
+  memcpy(p, v, len);
+  return p;
+}
+
+// TODO: what are the X-functions? Xcalloc, Xrealloc?
+void *toku_xcalloc(size_t nmemb, size_t size) { return calloc(nmemb, size); }
+
+// ft-ft-opts.cc:
+
+// locktree
+toku_instr_key lock_request_m_wait_cond_key;
+toku_instr_key manager_m_escalator_done_key;
+toku_instr_key locktree_request_info_mutex_key;
+toku_instr_key locktree_request_info_retry_mutex_key;
+toku_instr_key locktree_request_info_retry_cv_key;
+
+toku_instr_key treenode_mutex_key;
+toku_instr_key manager_mutex_key;
+toku_instr_key manager_escalation_mutex_key;
+toku_instr_key manager_escalator_mutex_key;
+
+// portability/memory.cc
+size_t toku_memory_footprint(void *, size_t touched) { return touched; }
+
+// ft/ft-status.c
+// PORT2: note: the @c parameter to TOKUFT_STATUS_INIT must not start with
+// "TOKU"
+LTM_STATUS_S ltm_status;
+void LTM_STATUS_S::init() {
+  if (m_initialized) return;
+#define LTM_STATUS_INIT(k, c, t, l)                    \
+  TOKUFT_STATUS_INIT((*this), k, c, t, "locktree: " l, \
+                     TOKU_ENGINE_STATUS | TOKU_GLOBAL_STATUS)
+  LTM_STATUS_INIT(LTM_SIZE_CURRENT, LOCKTREE_MEMORY_SIZE, STATUS_UINT64,
+                  "memory size");
+  LTM_STATUS_INIT(LTM_SIZE_LIMIT, LOCKTREE_MEMORY_SIZE_LIMIT, STATUS_UINT64,
+                  "memory size limit");
+  LTM_STATUS_INIT(LTM_ESCALATION_COUNT, LOCKTREE_ESCALATION_NUM, STATUS_UINT64,
+                  "number of times lock escalation ran");
+  LTM_STATUS_INIT(LTM_ESCALATION_TIME, LOCKTREE_ESCALATION_SECONDS,
+                  STATUS_TOKUTIME, "time spent running escalation (seconds)");
+  LTM_STATUS_INIT(LTM_ESCALATION_LATEST_RESULT,
+                  LOCKTREE_LATEST_POST_ESCALATION_MEMORY_SIZE, STATUS_UINT64,
+                  "latest post-escalation memory size");
+  LTM_STATUS_INIT(LTM_NUM_LOCKTREES, LOCKTREE_OPEN_CURRENT, STATUS_UINT64,
+                  "number of locktrees open now");
+  LTM_STATUS_INIT(LTM_LOCK_REQUESTS_PENDING, LOCKTREE_PENDING_LOCK_REQUESTS,
+                  STATUS_UINT64, "number of pending lock requests");
+  LTM_STATUS_INIT(LTM_STO_NUM_ELIGIBLE, LOCKTREE_STO_ELIGIBLE_NUM,
+                  STATUS_UINT64, "number of locktrees eligible for the STO");
+  LTM_STATUS_INIT(LTM_STO_END_EARLY_COUNT, LOCKTREE_STO_ENDED_NUM,
+                  STATUS_UINT64,
+                  "number of times a locktree ended the STO early");
+
LTM_STATUS_INIT(LTM_STO_END_EARLY_TIME, LOCKTREE_STO_ENDED_SECONDS, + STATUS_TOKUTIME, "time spent ending the STO early (seconds)"); + LTM_STATUS_INIT(LTM_WAIT_COUNT, LOCKTREE_WAIT_COUNT, STATUS_UINT64, + "number of wait locks"); + LTM_STATUS_INIT(LTM_WAIT_TIME, LOCKTREE_WAIT_TIME, STATUS_UINT64, + "time waiting for locks"); + LTM_STATUS_INIT(LTM_LONG_WAIT_COUNT, LOCKTREE_LONG_WAIT_COUNT, STATUS_UINT64, + "number of long wait locks"); + LTM_STATUS_INIT(LTM_LONG_WAIT_TIME, LOCKTREE_LONG_WAIT_TIME, STATUS_UINT64, + "long time waiting for locks"); + LTM_STATUS_INIT(LTM_TIMEOUT_COUNT, LOCKTREE_TIMEOUT_COUNT, STATUS_UINT64, + "number of lock timeouts"); + LTM_STATUS_INIT(LTM_WAIT_ESCALATION_COUNT, LOCKTREE_WAIT_ESCALATION_COUNT, + STATUS_UINT64, "number of waits on lock escalation"); + LTM_STATUS_INIT(LTM_WAIT_ESCALATION_TIME, LOCKTREE_WAIT_ESCALATION_TIME, + STATUS_UINT64, "time waiting on lock escalation"); + LTM_STATUS_INIT(LTM_LONG_WAIT_ESCALATION_COUNT, + LOCKTREE_LONG_WAIT_ESCALATION_COUNT, STATUS_UINT64, + "number of long waits on lock escalation"); + LTM_STATUS_INIT(LTM_LONG_WAIT_ESCALATION_TIME, + LOCKTREE_LONG_WAIT_ESCALATION_TIME, STATUS_UINT64, + "long time waiting on lock escalation"); + + m_initialized = true; +#undef LTM_STATUS_INIT +} +void LTM_STATUS_S::destroy() { + if (!m_initialized) return; + for (int i = 0; i < LTM_STATUS_NUM_ROWS; ++i) { + if (status[i].type == STATUS_PARCOUNT) { + // PORT: TODO?? destroy_partitioned_counter(status[i].value.parcount); + } + } +} + +int toku_keycompare(const void *key1, size_t key1len, const void *key2, + size_t key2len) { + size_t comparelen = key1len < key2len ? key1len : key2len; + int c = memcmp(key1, key2, comparelen); + if (__builtin_expect(c != 0, 1)) { + return c; + } else { + if (key1len < key2len) { + return -1; + } else if (key1len > key2len) { + return 1; + } else { + return 0; + } + } +} + +int toku_builtin_compare_fun(const DBT *a, const DBT *b) { + return toku_keycompare(a->data, a->size, b->data, b->size); +} +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc new file mode 100644 index 000000000..63cc3a267 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc @@ -0,0 +1,153 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ifndef ROCKSDB_LITE +#ifndef OS_WIN +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. 
+ + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include "dbt.h" + +#include <string.h> + +#include "../db.h" +#include "../portability/memory.h" + +DBT *toku_init_dbt(DBT *dbt) { + memset(dbt, 0, sizeof(*dbt)); + return dbt; +} + +DBT toku_empty_dbt(void) { + static const DBT empty_dbt = {.data = 0, .size = 0, .ulen = 0, .flags = 0}; + return empty_dbt; +} + +DBT *toku_init_dbt_flags(DBT *dbt, uint32_t flags) { + toku_init_dbt(dbt); + dbt->flags = flags; + return dbt; +} + +void toku_destroy_dbt(DBT *dbt) { + switch (dbt->flags) { + case DB_DBT_MALLOC: + case DB_DBT_REALLOC: + toku_free(dbt->data); + toku_init_dbt(dbt); + break; + } +} + +DBT *toku_fill_dbt(DBT *dbt, const void *k, size_t len) { + toku_init_dbt(dbt); + dbt->size = len; + dbt->data = (char *)k; + return dbt; +} + +DBT *toku_memdup_dbt(DBT *dbt, const void *k, size_t len) { + toku_init_dbt_flags(dbt, DB_DBT_MALLOC); + dbt->size = len; + dbt->data = toku_xmemdup(k, len); + return dbt; +} + +DBT *toku_copyref_dbt(DBT *dst, const DBT src) { + dst->flags = 0; + dst->ulen = 0; + dst->size = src.size; + dst->data = src.data; + return dst; +} + +DBT *toku_clone_dbt(DBT *dst, const DBT &src) { + return toku_memdup_dbt(dst, src.data, src.size); +} + +void toku_sdbt_cleanup(struct simple_dbt *sdbt) { + if (sdbt->data) toku_free(sdbt->data); + memset(sdbt, 0, sizeof(*sdbt)); +} + +const DBT *toku_dbt_positive_infinity(void) { + static DBT positive_infinity_dbt = { + .data = 0, .size = 0, .ulen = 0, .flags = 0}; // port + return &positive_infinity_dbt; +} + +const DBT *toku_dbt_negative_infinity(void) { + static DBT negative_infinity_dbt = { + .data = 0, .size = 0, .ulen = 0, .flags = 0}; // port + return &negative_infinity_dbt; +} + +bool toku_dbt_is_infinite(const DBT *dbt) { + return dbt == toku_dbt_positive_infinity() || + dbt == toku_dbt_negative_infinity(); +} + +bool toku_dbt_is_empty(const DBT *dbt) { + // can't have a null data field with a non-zero size + paranoid_invariant(dbt->data != nullptr || dbt->size == 0); + return dbt->data == nullptr; +} + +int toku_dbt_infinite_compare(const DBT *a, const DBT *b) { + if (a == b) { + return 0; + } else if (a == toku_dbt_positive_infinity()) { + return 1; + } else if (b == toku_dbt_positive_infinity()) { + return -1; + } else if (a == toku_dbt_negative_infinity()) { + return -1; + } else { + invariant(b == toku_dbt_negative_infinity()); + return 1; + } +} + +bool toku_dbt_equals(const DBT *a, const DBT *b) { + if (!toku_dbt_is_infinite(a) && !toku_dbt_is_infinite(b)) { + return a->data == b->data && a->size == b->size; + } else { + // a or b is infinite, so they're equal if they are the same infinite + return a == b ? 
true : false;
+  }
+}
+#endif  // OS_WIN
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.h
new file mode 100644
index 000000000..d86c440f8
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.h
@@ -0,0 +1,98 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include "../db.h"
+
+// TODO: John
+// Document this API a little better so that DBT
+// memory management can be more widely understood.
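Toward that TODO, a usage sketch of the memory-management contract implied by the flags (the example function is illustrative, not part of the diff): a DBT filled with toku_fill_dbt() merely borrows the caller's buffer, while one filled with toku_memdup_dbt() owns a malloc'd copy (flags = DB_DBT_MALLOC) that must be released with toku_destroy_dbt():

void dbt_ownership_example(void) {
  const char key[] = "some key";

  DBT borrowed;
  toku_fill_dbt(&borrowed, key, sizeof(key));  // no allocation, no cleanup

  DBT owned;
  toku_memdup_dbt(&owned, key, sizeof(key));  // private malloc'd copy of key
  // ... `owned` stays valid even after `key` goes out of scope ...
  toku_destroy_dbt(&owned);  // frees the copy and re-initializes the DBT
}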
+ +DBT *toku_init_dbt(DBT *); + +// returns: an initialized but empty dbt (for which toku_dbt_is_empty() is true) +DBT toku_empty_dbt(void); + +DBT *toku_init_dbt_flags(DBT *, uint32_t flags); + +void toku_destroy_dbt(DBT *); + +DBT *toku_fill_dbt(DBT *dbt, const void *k, size_t len); + +DBT *toku_memdup_dbt(DBT *dbt, const void *k, size_t len); + +DBT *toku_copyref_dbt(DBT *dst, const DBT src); + +DBT *toku_clone_dbt(DBT *dst, const DBT &src); + +void toku_sdbt_cleanup(struct simple_dbt *sdbt); + +// returns: special DBT pointer representing positive infinity +const DBT *toku_dbt_positive_infinity(void); + +// returns: special DBT pointer representing negative infinity +const DBT *toku_dbt_negative_infinity(void); + +// returns: true if the given dbt is either positive or negative infinity +bool toku_dbt_is_infinite(const DBT *dbt); + +// returns: true if the given dbt has no data (ie: dbt->data == nullptr) +bool toku_dbt_is_empty(const DBT *dbt); + +// effect: compares two potentially infinity-valued dbts +// requires: at least one is infinite (assert otherwise) +int toku_dbt_infinite_compare(const DBT *a, const DBT *b); + +// returns: true if the given dbts have the same data pointer and size +bool toku_dbt_equals(const DBT *a, const DBT *b); diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/growable_array.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/growable_array.h new file mode 100644 index 000000000..158750fdb --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/growable_array.h @@ -0,0 +1,144 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include <memory.h> + +//****************************************************************************** +// +// Overview: A growable array is a little bit like std::vector except that +// it doesn't have constructors (hence can be used in static constructs, since +// the google style guide says no constructors), and it's a little simpler. +// Operations: +// init and deinit (we don't have constructors and destructors). +// fetch_unchecked to get values out. +// store_unchecked to put values in. +// push to add an element at the end +// get_size to find out the size +// get_memory_size to find out how much memory the data stucture is using. +// +//****************************************************************************** + +namespace toku { + +template <typename T> +class GrowableArray { + public: + void init(void) + // Effect: Initialize the array to contain no elements. + { + m_array = NULL; + m_size = 0; + m_size_limit = 0; + } + + void deinit(void) + // Effect: Deinitialize the array (freeing any memory it uses, for example). + { + toku_free(m_array); + m_array = NULL; + m_size = 0; + m_size_limit = 0; + } + + T fetch_unchecked(size_t i) const + // Effect: Fetch the ith element. If i is out of range, the system asserts. + { + return m_array[i]; + } + + void store_unchecked(size_t i, T v) + // Effect: Store v in the ith element. If i is out of range, the system + // asserts. + { + paranoid_invariant(i < m_size); + m_array[i] = v; + } + + void push(T v) + // Effect: Add v to the end of the array (increasing the size). The amortized + // cost of this operation is constant. Implementation hint: Double the size + // of the array when it gets too big so that the amortized cost stays + // constant. + { + if (m_size >= m_size_limit) { + if (m_array == NULL) { + m_size_limit = 1; + } else { + m_size_limit *= 2; + } + XREALLOC_N(m_size_limit, m_array); + } + m_array[m_size++] = v; + } + + size_t get_size(void) const + // Effect: Return the number of elements in the array. + { + return m_size; + } + size_t memory_size(void) const + // Effect: Return the size (in bytes) that the array occupies in memory. This + // is really only an estimate. + { + return sizeof(*this) + sizeof(T) * m_size_limit; + } + + private: + T *m_array; + size_t m_size; + size_t m_size_limit; // How much space is allocated in array. +}; + +} // namespace toku diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.cc new file mode 100644 index 000000000..0e7a9880b --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.cc @@ -0,0 +1,201 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ifndef ROCKSDB_LITE +#ifndef OS_WIN +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. 
+ + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include "memarena.h" + +#include <string.h> + +#include <algorithm> + +#include "../portability/memory.h" + +void memarena::create(size_t initial_size) { + _current_chunk = arena_chunk(); + _other_chunks = nullptr; + _size_of_other_chunks = 0; + _footprint_of_other_chunks = 0; + _n_other_chunks = 0; + + _current_chunk.size = initial_size; + if (_current_chunk.size > 0) { + XMALLOC_N(_current_chunk.size, _current_chunk.buf); + } +} + +void memarena::destroy(void) { + if (_current_chunk.buf) { + toku_free(_current_chunk.buf); + } + for (int i = 0; i < _n_other_chunks; i++) { + toku_free(_other_chunks[i].buf); + } + if (_other_chunks) { + toku_free(_other_chunks); + } + _current_chunk = arena_chunk(); + _other_chunks = nullptr; + _n_other_chunks = 0; +} + +static size_t round_to_page(size_t size) { + const size_t page_size = 4096; + const size_t r = page_size + ((size - 1) & ~(page_size - 1)); + assert((r & (page_size - 1)) == 0); // make sure it's aligned + assert(r >= size); // make sure it's not too small + assert(r < + size + page_size); // make sure we didn't grow by more than a page. + return r; +} + +static const size_t MEMARENA_MAX_CHUNK_SIZE = 64 * 1024 * 1024; + +void *memarena::malloc_from_arena(size_t size) { + if (_current_chunk.buf == nullptr || + _current_chunk.size < _current_chunk.used + size) { + // The existing block isn't big enough. + // Add the block to the vector of blocks. + if (_current_chunk.buf) { + invariant(_current_chunk.size > 0); + int old_n = _n_other_chunks; + XREALLOC_N(old_n + 1, _other_chunks); + _other_chunks[old_n] = _current_chunk; + _n_other_chunks = old_n + 1; + _size_of_other_chunks += _current_chunk.size; + _footprint_of_other_chunks += + toku_memory_footprint(_current_chunk.buf, _current_chunk.used); + } + + // Make a new one. 
Grow the buffer size exponentially until we hit + // the max chunk size, but make it at least `size' bytes so the + // current allocation always fit. + size_t new_size = + std::min(MEMARENA_MAX_CHUNK_SIZE, 2 * _current_chunk.size); + if (new_size < size) { + new_size = size; + } + new_size = round_to_page( + new_size); // at least size, but round to the next page size + XMALLOC_N(new_size, _current_chunk.buf); + _current_chunk.used = 0; + _current_chunk.size = new_size; + } + invariant(_current_chunk.buf != nullptr); + + // allocate in the existing block. + char *p = _current_chunk.buf + _current_chunk.used; + _current_chunk.used += size; + return p; +} + +void memarena::move_memory(memarena *dest) { + // Move memory to dest + XREALLOC_N(dest->_n_other_chunks + _n_other_chunks + 1, dest->_other_chunks); + dest->_size_of_other_chunks += _size_of_other_chunks + _current_chunk.size; + dest->_footprint_of_other_chunks += + _footprint_of_other_chunks + + toku_memory_footprint(_current_chunk.buf, _current_chunk.used); + for (int i = 0; i < _n_other_chunks; i++) { + dest->_other_chunks[dest->_n_other_chunks++] = _other_chunks[i]; + } + dest->_other_chunks[dest->_n_other_chunks++] = _current_chunk; + + // Clear out this memarena's memory + toku_free(_other_chunks); + _current_chunk = arena_chunk(); + _other_chunks = nullptr; + _size_of_other_chunks = 0; + _footprint_of_other_chunks = 0; + _n_other_chunks = 0; +} + +size_t memarena::total_memory_size(void) const { + return sizeof(*this) + total_size_in_use() + + _n_other_chunks * sizeof(*_other_chunks); +} + +size_t memarena::total_size_in_use(void) const { + return _size_of_other_chunks + _current_chunk.used; +} + +size_t memarena::total_footprint(void) const { + return sizeof(*this) + _footprint_of_other_chunks + + toku_memory_footprint(_current_chunk.buf, _current_chunk.used) + + _n_other_chunks * sizeof(*_other_chunks); +} + +//////////////////////////////////////////////////////////////////////////////// + +const void *memarena::chunk_iterator::current(size_t *used) const { + if (_chunk_idx < 0) { + *used = _ma->_current_chunk.used; + return _ma->_current_chunk.buf; + } else if (_chunk_idx < _ma->_n_other_chunks) { + *used = _ma->_other_chunks[_chunk_idx].used; + return _ma->_other_chunks[_chunk_idx].buf; + } + *used = 0; + return nullptr; +} + +void memarena::chunk_iterator::next() { _chunk_idx++; } + +bool memarena::chunk_iterator::more() const { + if (_chunk_idx < 0) { + return _ma->_current_chunk.buf != nullptr; + } + return _chunk_idx < _ma->_n_other_chunks; +} +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.h new file mode 100644 index 000000000..ddcc1144f --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.h @@ -0,0 +1,141 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. 
+ + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include <stdlib.h> + +/* + * A memarena is used to efficiently store a collection of objects that never + * move The pattern is allocate more and more stuff and free all of the items at + * once. The underlying memory will store 1 or more objects per chunk. Each + * chunk is contiguously laid out in memory but chunks are not necessarily + * contiguous with each other. + */ +class memarena { + public: + memarena() + : _current_chunk(arena_chunk()), + _other_chunks(nullptr), + _n_other_chunks(0), + _size_of_other_chunks(0), + _footprint_of_other_chunks(0) {} + + // Effect: Create a memarena with the specified initial size + void create(size_t initial_size); + + void destroy(void); + + // Effect: Allocate some memory. The returned value remains valid until the + // memarena is cleared or closed. + // In case of ENOMEM, aborts. + void *malloc_from_arena(size_t size); + + // Effect: Move all the memory from this memarena into DEST. + // When SOURCE is closed the memory won't be freed. + // When DEST is closed, the memory will be freed, unless DEST moves + // its memory to another memarena... + void move_memory(memarena *dest); + + // Effect: Calculate the amount of memory used by a memory arena. + size_t total_memory_size(void) const; + + // Effect: Calculate the used space of the memory arena (ie: excludes unused + // space) + size_t total_size_in_use(void) const; + + // Effect: Calculate the amount of memory used, according to + // toku_memory_footprint(), + // which is a more expensive but more accurate count of memory used. + size_t total_footprint(void) const; + + // iterator over the underlying chunks that store objects in the memarena. + // a chunk is represented by a pointer to const memory and a usable byte + // count. 
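A usage sketch of the allocate-many/free-once pattern this class implements, including a walk over the backing chunks with the iterator declared just below (the example function is illustrative, not part of the diff):

void memarena_example(void) {
  memarena ma;
  ma.create(4096);  // start with one initial chunk

  // Many small allocations; none is ever freed individually.
  char *a = static_cast<char *>(ma.malloc_from_arena(16));
  char *b = static_cast<char *>(ma.malloc_from_arena(1024));
  (void)a; (void)b;

  // Visit every chunk backing the allocations made so far.
  for (memarena::chunk_iterator it(&ma); it.more(); it.next()) {
    size_t used;
    const void *buf = it.current(&used);
    (void)buf;  // e.g. serialize or checksum `used` bytes starting at buf
  }

  ma.destroy();  // releases every chunk at once
}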
+ class chunk_iterator { + public: + chunk_iterator(const memarena *ma) : _ma(ma), _chunk_idx(-1) {} + + // returns: base pointer to the current chunk + // *used set to the number of usable bytes + // if more() is false, returns nullptr and *used = 0 + const void *current(size_t *used) const; + + // requires: more() is true + void next(); + + bool more() const; + + private: + // -1 represents the 'initial' chunk in a memarena, ie: ma->_current_chunk + // >= 0 represents the i'th chunk in the ma->_other_chunks array + const memarena *_ma; + int _chunk_idx; + }; + + private: + struct arena_chunk { + arena_chunk() : buf(nullptr), used(0), size(0) {} + char *buf; + size_t used; + size_t size; + }; + + struct arena_chunk _current_chunk; + struct arena_chunk *_other_chunks; + int _n_other_chunks; + size_t _size_of_other_chunks; // the buf_size of all the other chunks. + size_t _footprint_of_other_chunks; // the footprint of all the other chunks. + + friend class memarena_unit_test; +}; diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt.h new file mode 100644 index 000000000..f208002d3 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt.h @@ -0,0 +1,794 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." 
+
+#pragma once
+
+#include <memory.h>
+#include <stdint.h>
+
+#include "../portability/toku_portability.h"
+#include "../portability/toku_race_tools.h"
+#include "growable_array.h"
+
+namespace toku {
+
+/**
+ * Order Maintenance Tree (OMT)
+ *
+ * Maintains a collection of totally ordered values, where each value has an
+ * integer weight. The OMT is a mutable datatype.
+ *
+ * The Abstraction:
+ *
+ * An OMT is a vector of values, $V$, where $|V|$ is the length of the vector.
+ * The vector is numbered from $0$ to $|V|-1$.
+ * Each value has a weight. The weight of the $i$th element is denoted
+ * $w(V_i)$.
+ *
+ * We can create a new OMT, which is the empty vector.
+ *
+ * We can insert a new element $x$ into slot $i$, changing $V$ into $V'$ where
+ *  $|V'|=1+|V|$ and
+ *
+ *   V'_j = V_j      if $j<i$
+ *          x        if $j=i$
+ *          V_{j-1}  if $j>i$.
+ *
+ * We can specify $i$ using a kind of function instead of as an integer.
+ * Let $b$ be a function mapping from values to nonzero integers, such that
+ * the signum of $b$ is monotonically increasing.
+ * We can specify $i$ as the minimum integer such that $b(V_i)>0$.
+ *
+ * We look up a value using its index, or using a Heaviside function.
+ * For lookups, we allow $b$ to be zero for some values, and again the signum
+ * of $b$ must be monotonically increasing. When looking up values, we can
+ * look up
+ *  - $V_i$ where $i$ is the minimum integer such that $b(V_i)=0$. (With a
+ *    special return code if no such value exists.) (Rationale: Ordinarily we
+ *    want $i$ to be unique. But for various reasons we want to allow multiple
+ *    zeros, and we want the smallest $i$ in that case.)
+ *  - $V_i$ where $i$ is the minimum integer such that $b(V_i)>0$. (Or an
+ *    indication that no such value exists.)
+ *  - $V_i$ where $i$ is the maximum integer such that $b(V_i)<0$. (Or an
+ *    indication that no such value exists.)
+ *
+ * When looking up a value using a Heaviside function, we get the value and its
+ * index.
+ *
+ * We can also split an OMT into two OMTs, splitting the weight of the values
+ * evenly. Find a value $j$ such that the values to the left of $j$ have about
+ * the same total weight as the values to the right of $j$. The resulting two
+ * OMTs contain the values to the left of $j$ and the values to the right of
+ * $j$ respectively. All of the values from the original OMT go into one of the
+ * new OMTs. If the weights of the values don't split exactly evenly, then the
+ * implementation has the freedom to choose whether the new left OMT or the new
+ * right OMT is larger.
+ *
+ * Performance:
+ *  Insertion and deletion should run with $O(\log |V|)$ time and $O(\log |V|)$
+ *  calls to the Heaviside function. The memory required is O(|V|).
+ *
+ * Usage:
+ *  The omt is templated by two parameters:
+ *   - omtdata_t is what will be stored within the omt. These could be pointers
+ *     or real data types (ints, structs).
+ *   - omtdataout_t is what will be returned by find and related functions. By
+ *     default, it is the same as omtdata_t, but you can set it to
+ *     (omtdata_t *). To create an omt which will store "TXNID"s, for example,
+ *     it is a good idea to typedef the template: typedef omt<TXNID>
+ *     txnid_omt_t; If you are storing structs, you may want to be able to get
+ *     a pointer to the data actually stored in the omt (see find_zero).
To do this, use the second template parameter: + * typedef omt<struct foo, struct foo *> foo_omt_t; + */ + +namespace omt_internal { + +template <bool subtree_supports_marks> +class subtree_templated { + private: + uint32_t m_index; + + public: + static const uint32_t NODE_NULL = UINT32_MAX; + inline void set_to_null(void) { m_index = NODE_NULL; } + + inline bool is_null(void) const { return NODE_NULL == this->get_index(); } + + inline uint32_t get_index(void) const { return m_index; } + + inline void set_index(uint32_t index) { + paranoid_invariant(index != NODE_NULL); + m_index = index; + } +} __attribute__((__packed__, aligned(4))); + +template <> +class subtree_templated<true> { + private: + uint32_t m_bitfield; + static const uint32_t MASK_INDEX = ~(((uint32_t)1) << 31); + static const uint32_t MASK_BIT = ((uint32_t)1) << 31; + + inline void set_index_internal(uint32_t new_index) { + m_bitfield = (m_bitfield & MASK_BIT) | new_index; + } + + public: + static const uint32_t NODE_NULL = INT32_MAX; + inline void set_to_null(void) { this->set_index_internal(NODE_NULL); } + + inline bool is_null(void) const { return NODE_NULL == this->get_index(); } + + inline uint32_t get_index(void) const { + TOKU_DRD_IGNORE_VAR(m_bitfield); + const uint32_t bits = m_bitfield; + TOKU_DRD_STOP_IGNORING_VAR(m_bitfield); + return bits & MASK_INDEX; + } + + inline void set_index(uint32_t index) { + paranoid_invariant(index < NODE_NULL); + this->set_index_internal(index); + } + + inline bool get_bit(void) const { + TOKU_DRD_IGNORE_VAR(m_bitfield); + const uint32_t bits = m_bitfield; + TOKU_DRD_STOP_IGNORING_VAR(m_bitfield); + return (bits & MASK_BIT) != 0; + } + + inline void enable_bit(void) { + // These bits may be set by a thread with a write lock on some + // leaf, and the index can be read by another thread with a (read + // or write) lock on another thread. Also, the has_marks_below + // bit can be set by two threads simultaneously. Neither of these + // are real races, so if we are using DRD we should tell it to + // ignore these bits just while we set this bit. If there were a + // race in setting the index, that would be a real race. + TOKU_DRD_IGNORE_VAR(m_bitfield); + m_bitfield |= MASK_BIT; + TOKU_DRD_STOP_IGNORING_VAR(m_bitfield); + } + + inline void disable_bit(void) { m_bitfield &= MASK_INDEX; } +} __attribute__((__packed__)); + +template <typename omtdata_t, bool subtree_supports_marks> +class omt_node_templated { + public: + omtdata_t value; + uint32_t weight; + subtree_templated<subtree_supports_marks> left; + subtree_templated<subtree_supports_marks> right; + + // this needs to be in both implementations because we don't have + // a "static if" the caller can use + inline void clear_stolen_bits(void) {} +}; // note: originally this class had __attribute__((__packed__, aligned(4))) + +template <typename omtdata_t> +class omt_node_templated<omtdata_t, true> { + public: + omtdata_t value; + uint32_t weight; + subtree_templated<true> left; + subtree_templated<true> right; + inline bool get_marked(void) const { return left.get_bit(); } + inline void set_marked_bit(void) { return left.enable_bit(); } + inline void unset_marked_bit(void) { return left.disable_bit(); } + + inline bool get_marks_below(void) const { return right.get_bit(); } + inline void set_marks_below_bit(void) { + // This function can be called by multiple threads. + // Checking first reduces cache invalidation. 
+ if (!this->get_marks_below()) { + right.enable_bit(); + } + } + inline void unset_marks_below_bit(void) { right.disable_bit(); } + + inline void clear_stolen_bits(void) { + this->unset_marked_bit(); + this->unset_marks_below_bit(); + } +}; // note: originally this class had __attribute__((__packed__, aligned(4))) + +} // namespace omt_internal + +template <typename omtdata_t, typename omtdataout_t = omtdata_t, + bool supports_marks = false> +class omt { + public: + /** + * Effect: Create an empty OMT. + * Performance: constant time. + */ + void create(void); + + /** + * Effect: Create an empty OMT with no internal allocated space. + * Performance: constant time. + * Rationale: In some cases we need a valid omt but don't want to malloc. + */ + void create_no_array(void); + + /** + * Effect: Create a OMT containing values. The number of values is in + * numvalues. Stores the new OMT in *omtp. Requires: this has not been created + * yet Requires: values != NULL Requires: values is sorted Performance: + * time=O(numvalues) Rationale: Normally to insert N values takes O(N lg N) + * amortized time. If the N values are known in advance, are sorted, and the + * structure is empty, we can batch insert them much faster. + */ + __attribute__((nonnull)) void create_from_sorted_array( + const omtdata_t *const values, const uint32_t numvalues); + + /** + * Effect: Create an OMT containing values. The number of values is in + * numvalues. On success the OMT takes ownership of *values array, and sets + * values=NULL. Requires: this has not been created yet Requires: values != + * NULL Requires: *values is sorted Requires: *values was allocated with + * toku_malloc Requires: Capacity of the *values array is <= new_capacity + * Requires: On success, *values may not be accessed again by the caller. + * Performance: time=O(1) + * Rational: create_from_sorted_array takes O(numvalues) time. + * By taking ownership of the array, we save a malloc and + * memcpy, and possibly a free (if the caller is done with the array). + */ + void create_steal_sorted_array(omtdata_t **const values, + const uint32_t numvalues, + const uint32_t new_capacity); + + /** + * Effect: Create a new OMT, storing it in *newomt. + * The values to the right of index (starting at index) are moved to *newomt. + * Requires: newomt != NULL + * Returns + * 0 success, + * EINVAL if index > toku_omt_size(omt) + * On nonzero return, omt and *newomt are unmodified. + * Performance: time=O(n) + * Rationale: We don't need a split-evenly operation. We need to split items + * so that their total sizes are even, and other similar splitting criteria. + * It's easy to split evenly by calling size(), and dividing by two. + */ + __attribute__((nonnull)) int split_at(omt *const newomt, const uint32_t idx); + + /** + * Effect: Appends leftomt and rightomt to produce a new omt. + * Creates this as the new omt. + * leftomt and rightomt are destroyed. + * Performance: time=O(n) is acceptable, but one can imagine implementations + * that are O(\log n) worst-case. + */ + __attribute__((nonnull)) void merge(omt *const leftomt, omt *const rightomt); + + /** + * Effect: Creates a copy of an omt. + * Creates this as the clone. + * Each element is copied directly. If they are pointers, the underlying + * data is not duplicated. Performance: O(n) or the running time of + * fill_array_with_subtree_values() + */ + void clone(const omt &src); + + /** + * Effect: Set the tree to be empty. + * Note: Will not reallocate or resize any memory. 
+ * Performance: time=O(1) + */ + void clear(void); + + /** + * Effect: Destroy an OMT, freeing all its memory. + * If the values being stored are pointers, their underlying data is not + * freed. See free_items() Those values may be freed before or after calling + * toku_omt_destroy. Rationale: Returns no values since free() cannot fail. + * Rationale: Does not free the underlying pointers to reduce complexity. + * Performance: time=O(1) + */ + void destroy(void); + + /** + * Effect: return |this|. + * Performance: time=O(1) + */ + uint32_t size(void) const; + + /** + * Effect: Insert value into the OMT. + * If there is some i such that $h(V_i, v)=0$ then returns DB_KEYEXIST. + * Otherwise, let i be the minimum value such that $h(V_i, v)>0$. + * If no such i exists, then let i be |V| + * Then this has the same effect as + * insert_at(tree, value, i); + * If idx!=NULL then i is stored in *idx + * Requires: The signum of h must be monotonically increasing. + * Returns: + * 0 success + * DB_KEYEXIST the key is present (h was equal to zero for some value) + * On nonzero return, omt is unchanged. + * Performance: time=O(\log N) amortized. + * Rationale: Some future implementation may be O(\log N) worst-case time, but + * O(\log N) amortized is good enough for now. + */ + template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)> + int insert(const omtdata_t &value, const omtcmp_t &v, uint32_t *const idx); + + /** + * Effect: Increases indexes of all items at slot >= idx by 1. + * Insert value into the position at idx. + * Returns: + * 0 success + * EINVAL if idx > this->size() + * On error, omt is unchanged. + * Performance: time=O(\log N) amortized time. + * Rationale: Some future implementation may be O(\log N) worst-case time, but + * O(\log N) amortized is good enough for now. + */ + int insert_at(const omtdata_t &value, const uint32_t idx); + + /** + * Effect: Replaces the item at idx with value. + * Returns: + * 0 success + * EINVAL if idx>=this->size() + * On error, omt is unchanged. + * Performance: time=O(\log N) + * Rationale: The FT needs to be able to replace a value with another copy of + * the same value (allocated in a different location) + * + */ + int set_at(const omtdata_t &value, const uint32_t idx); + + /** + * Effect: Delete the item in slot idx. + * Decreases indexes of all items at slot > idx by 1. + * Returns + * 0 success + * EINVAL if idx>=this->size() + * On error, omt is unchanged. + * Rationale: To delete an item, first find its index using find or find_zero, + * then delete it. Performance: time=O(\log N) amortized. + */ + int delete_at(const uint32_t idx); + + /** + * Effect: Iterate over the values of the omt, from left to right, calling f + * on each value. The first argument passed to f is a ref-to-const of the + * value stored in the omt. The second argument passed to f is the index of + * the value. The third argument passed to f is iterate_extra. The indices run + * from 0 (inclusive) to this->size() (exclusive). Requires: f != NULL + * Returns: + * If f ever returns nonzero, then the iteration stops, and the value + * returned by f is returned by iterate. If f always returns zero, then + * iterate returns 0. Requires: Don't modify the omt while running. (E.g., f + * may not insert or delete values from the omt.) Performance: time=O(i+\log + * N) where i is the number of times f is called, and N is the number of + * elements in the omt. 
Rationale: Although the functional iterator requires + * defining another function (as opposed to C++ style iterator), it is much + * easier to read. Rationale: We may at some point use functors, but for now + * this is a smaller change from the old OMT. + */ + template <typename iterate_extra_t, + int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)> + int iterate(iterate_extra_t *const iterate_extra) const; + + /** + * Effect: Iterate over the values of the omt, from left to right, calling f + * on each value. The first argument passed to f is a ref-to-const of the + * value stored in the omt. The second argument passed to f is the index of + * the value. The third argument passed to f is iterate_extra. The indices run + * from 0 (inclusive) to this->size() (exclusive). We will iterate only over + * [left,right) + * + * Requires: left <= right + * Requires: f != NULL + * Returns: + * EINVAL if right > this->size() + * If f ever returns nonzero, then the iteration stops, and the value + * returned by f is returned by iterate_on_range. If f always returns zero, + * then iterate_on_range returns 0. Requires: Don't modify the omt while + * running. (E.g., f may not insert or delete values from the omt.) + * Performance: time=O(i+\log N) where i is the number of times f is called, + * and N is the number of elements in the omt. Rational: Although the + * functional iterator requires defining another function (as opposed to C++ + * style iterator), it is much easier to read. + */ + template <typename iterate_extra_t, + int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)> + int iterate_on_range(const uint32_t left, const uint32_t right, + iterate_extra_t *const iterate_extra) const; + + /** + * Effect: Iterate over the values of the omt, and mark the nodes that are + * visited. Other than the marks, this behaves the same as iterate_on_range. + * Requires: supports_marks == true + * Performance: time=O(i+\log N) where i is the number of times f is called, + * and N is the number of elements in the omt. Notes: This function MAY be + * called concurrently by multiple threads, but not concurrently with any + * other non-const function. + */ + template <typename iterate_extra_t, + int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)> + int iterate_and_mark_range(const uint32_t left, const uint32_t right, + iterate_extra_t *const iterate_extra); + + /** + * Effect: Iterate over the values of the omt, from left to right, calling f + * on each value whose node has been marked. Other than the marks, this + * behaves the same as iterate. Requires: supports_marks == true Performance: + * time=O(i+\log N) where i is the number of times f is called, and N is the + * number of elements in the omt. + */ + template <typename iterate_extra_t, + int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)> + int iterate_over_marked(iterate_extra_t *const iterate_extra) const; + + /** + * Effect: Delete all elements from the omt, whose nodes have been marked. + * Requires: supports_marks == true + * Performance: time=O(N + i\log N) where i is the number of marked elements, + * {c,sh}ould be faster + */ + void delete_all_marked(void); + + /** + * Effect: Verify that the internal state of the marks in the tree are + * self-consistent. Crashes the system if the marks are in a bad state. + * Requires: supports_marks == true + * Performance: time=O(N) + * Notes: + * Even though this is a const function, it requires exclusive access. 
+ * Rationale: + * The current implementation of the marks relies on a sort of + * "cache" bit representing the state of bits below it in the tree. + * This allows glass-box testing that these bits are correct. + */ + void verify_marks_consistent(void) const; + + /** + * Effect: None + * Returns whether there are any marks in the tree. + */ + bool has_marks(void) const; + + /** + * Effect: Iterate over the values of the omt, from left to right, calling f + * on each value. The first argument passed to f is a pointer to the value + * stored in the omt. The second argument passed to f is the index of the + * value. The third argument passed to f is iterate_extra. The indices run + * from 0 (inclusive) to this->size() (exclusive). Requires: same as for + * iterate() Returns: same as for iterate() Performance: same as for iterate() + * Rationale: In general, most iterators should use iterate() since they + * should not modify the data stored in the omt. This function is for + * iterators which need to modify values (for example, free_items). Rationale: + * We assume if you are transforming the data in place, you want to do it to + * everything at once, so there is not yet an iterate_on_range_ptr (but there + * could be). + */ + template <typename iterate_extra_t, + int (*f)(omtdata_t *, const uint32_t, iterate_extra_t *const)> + void iterate_ptr(iterate_extra_t *const iterate_extra); + + /** + * Effect: Set *value=V_idx + * Returns + * 0 success + * EINVAL if index>=toku_omt_size(omt) + * On nonzero return, *value is unchanged + * Performance: time=O(\log N) + */ + int fetch(const uint32_t idx, omtdataout_t *const value) const; + + /** + * Effect: Find the smallest i such that h(V_i, extra)>=0 + * If there is such an i and h(V_i,extra)==0 then set *idxp=i, set *value = + * V_i, and return 0. If there is such an i and h(V_i,extra)>0 then set + * *idxp=i and return DB_NOTFOUND. If there is no such i then set + * *idx=this->size() and return DB_NOTFOUND. Note: value is of type + * omtdataout_t, which may be of type (omtdata_t) or (omtdata_t *) but is + * fixed by the instantiation. If it is the value type, then the value is + * copied out (even if the value type is a pointer to something else) If it is + * the pointer type, then *value is set to a pointer to the data within the + * omt. This is determined by the type of the omt as initially declared. If + * the omt is declared as omt<foo_t>, then foo_t's will be stored and foo_t's + * will be returned by find and related functions. If the omt is declared as + * omt<foo_t, foo_t *>, then foo_t's will be stored, and pointers to the + * stored items will be returned by find and related functions. Rationale: + * Structs too small for malloc should be stored directly in the omt. + * These structs may need to be edited as they exist inside the omt, so we + * need a way to get a pointer within the omt. Using separate functions for + * returning pointers and values increases code duplication and reduces + * type-checking. That also reduces the ability of the creator of a data + * structure to give advice to its future users. Slight overloading in this + * case seemed to provide a better API and better type checking. + */ + template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)> + int find_zero(const omtcmp_t &extra, omtdataout_t *const value, + uint32_t *const idxp) const; + + /** + * Effect: + * If direction >0 then find the smallest i such that h(V_i,extra)>0. + * If direction <0 then find the largest i such that h(V_i,extra)<0. 
+ * (Direction may not be equal to zero.) + * If value!=NULL then store V_i in *value + * If idxp!=NULL then store i in *idxp. + * Requires: The signum of h is monotonically increasing. + * Returns + * 0 success + * DB_NOTFOUND no such value is found. + * On nonzero return, *value and *idxp are unchanged + * Performance: time=O(\log N) + * Rationale: + * Here's how to use the find function to find various things + * Cases for find: + * find first value: ( h(v)=+1, direction=+1 ) + * find last value ( h(v)=-1, direction=-1 ) + * find first X ( h(v)=(v< x) ? -1 : 1 direction=+1 ) + * find last X ( h(v)=(v<=x) ? -1 : 1 direction=-1 ) + * find X or successor to X ( same as find first X. ) + * + * Rationale: To help understand heaviside functions and the behavior of find: + * There are 7 kinds of heaviside functions. + * The signum of h must be monotonically increasing. + * Given a function of the following form, A is the element + * returned for direction>0, B is the element returned + * for direction<0, C is the element returned for + * direction==0 (see find_zero) (with a return of 0), and D is the element + * returned for direction==0 (see find_zero) with a return of DB_NOTFOUND. + * If any of A, B, or C are not found, then asking for the + * associated direction will return DB_NOTFOUND. + * See find_zero for more information. + * + * Let the following represent the signum of the heaviside function. + * + * -...- + * A + * D + * + * +...+ + * B + * D + * + * 0...0 + * C + * + * -...-0...0 + * AC + * + * 0...0+...+ + * C B + * + * -...-+...+ + * AB + * D + * + * -...-0...0+...+ + * AC B + */ + template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)> + int find(const omtcmp_t &extra, int direction, omtdataout_t *const value, + uint32_t *const idxp) const; + + /** + * Effect: Return the size (in bytes) of the omt, as it resides in main + * memory. If the data stored are pointers, this does not include the size + * of what they point to.
+ */ + size_t memory_size(void); + + private: + typedef uint32_t node_idx; + typedef omt_internal::subtree_templated<supports_marks> subtree; + typedef omt_internal::omt_node_templated<omtdata_t, supports_marks> omt_node; + ENSURE_POD(subtree); + + struct omt_array { + uint32_t start_idx; + uint32_t num_values; + omtdata_t *values; + }; + + struct omt_tree { + subtree root; + uint32_t free_idx; + omt_node *nodes; + }; + + bool is_array; + uint32_t capacity; + union { + struct omt_array a; + struct omt_tree t; + } d; + + __attribute__((nonnull)) void unmark(const subtree &subtree, + const uint32_t index, + GrowableArray<node_idx> *const indexes); + + void create_internal_no_array(const uint32_t new_capacity); + + void create_internal(const uint32_t new_capacity); + + uint32_t nweight(const subtree &subtree) const; + + node_idx node_malloc(void); + + void node_free(const node_idx idx); + + void maybe_resize_array(const uint32_t n); + + __attribute__((nonnull)) void fill_array_with_subtree_values( + omtdata_t *const array, const subtree &subtree) const; + + void convert_to_array(void); + + __attribute__((nonnull)) void rebuild_from_sorted_array( + subtree *const subtree, const omtdata_t *const values, + const uint32_t numvalues); + + void convert_to_tree(void); + + void maybe_resize_or_convert(const uint32_t n); + + bool will_need_rebalance(const subtree &subtree, const int leftmod, + const int rightmod) const; + + __attribute__((nonnull)) void insert_internal( + subtree *const subtreep, const omtdata_t &value, const uint32_t idx, + subtree **const rebalance_subtree); + + void set_at_internal_array(const omtdata_t &value, const uint32_t idx); + + void set_at_internal(const subtree &subtree, const omtdata_t &value, + const uint32_t idx); + + void delete_internal(subtree *const subtreep, const uint32_t idx, + omt_node *const copyn, + subtree **const rebalance_subtree); + + template <typename iterate_extra_t, + int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)> + int iterate_internal_array(const uint32_t left, const uint32_t right, + iterate_extra_t *const iterate_extra) const; + + template <typename iterate_extra_t, + int (*f)(omtdata_t *, const uint32_t, iterate_extra_t *const)> + void iterate_ptr_internal(const uint32_t left, const uint32_t right, + const subtree &subtree, const uint32_t idx, + iterate_extra_t *const iterate_extra); + + template <typename iterate_extra_t, + int (*f)(omtdata_t *, const uint32_t, iterate_extra_t *const)> + void iterate_ptr_internal_array(const uint32_t left, const uint32_t right, + iterate_extra_t *const iterate_extra); + + template <typename iterate_extra_t, + int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)> + int iterate_internal(const uint32_t left, const uint32_t right, + const subtree &subtree, const uint32_t idx, + iterate_extra_t *const iterate_extra) const; + + template <typename iterate_extra_t, + int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)> + int iterate_and_mark_range_internal(const uint32_t left, const uint32_t right, + const subtree &subtree, + const uint32_t idx, + iterate_extra_t *const iterate_extra); + + template <typename iterate_extra_t, + int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)> + int iterate_over_marked_internal(const subtree &subtree, const uint32_t idx, + iterate_extra_t *const iterate_extra) const; + + uint32_t verify_marks_consistent_internal(const subtree &subtree, + const bool allow_marks) const; + + void fetch_internal_array(const uint32_t i, 
omtdataout_t *const value) const; + + void fetch_internal(const subtree &subtree, const uint32_t i, + omtdataout_t *const value) const; + + __attribute__((nonnull)) void fill_array_with_subtree_idxs( + node_idx *const array, const subtree &subtree) const; + + __attribute__((nonnull)) void rebuild_subtree_from_idxs( + subtree *const subtree, const node_idx *const idxs, + const uint32_t numvalues); + + __attribute__((nonnull)) void rebalance(subtree *const subtree); + + __attribute__((nonnull)) static void copyout(omtdata_t *const out, + const omt_node *const n); + + __attribute__((nonnull)) static void copyout(omtdata_t **const out, + omt_node *const n); + + __attribute__((nonnull)) static void copyout( + omtdata_t *const out, const omtdata_t *const stored_value_ptr); + + __attribute__((nonnull)) static void copyout( + omtdata_t **const out, omtdata_t *const stored_value_ptr); + + template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)> + int find_internal_zero_array(const omtcmp_t &extra, omtdataout_t *const value, + uint32_t *const idxp) const; + + template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)> + int find_internal_zero(const subtree &subtree, const omtcmp_t &extra, + omtdataout_t *const value, uint32_t *const idxp) const; + + template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)> + int find_internal_plus_array(const omtcmp_t &extra, omtdataout_t *const value, + uint32_t *const idxp) const; + + template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)> + int find_internal_plus(const subtree &subtree, const omtcmp_t &extra, + omtdataout_t *const value, uint32_t *const idxp) const; + + template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)> + int find_internal_minus_array(const omtcmp_t &extra, + omtdataout_t *const value, + uint32_t *const idxp) const; + + template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)> + int find_internal_minus(const subtree &subtree, const omtcmp_t &extra, + omtdataout_t *const value, + uint32_t *const idxp) const; +}; + +} // namespace toku + +// include the implementation here +#include "omt_impl.h" diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt_impl.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt_impl.h new file mode 100644 index 000000000..e77986716 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt_impl.h @@ -0,0 +1,1295 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. 
+ +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include <string.h> + +#include "../db.h" +#include "../portability/memory.h" + +namespace toku { + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::create(void) { + this->create_internal(2); + if (supports_marks) { + this->convert_to_tree(); + } +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::create_no_array(void) { + if (!supports_marks) { + this->create_internal_no_array(0); + } else { + this->is_array = false; + this->capacity = 0; + this->d.t.nodes = nullptr; + this->d.t.root.set_to_null(); + this->d.t.free_idx = 0; + } +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::create_from_sorted_array( + const omtdata_t *const values, const uint32_t numvalues) { + this->create_internal(numvalues); + memcpy(this->d.a.values, values, numvalues * (sizeof values[0])); + this->d.a.num_values = numvalues; + if (supports_marks) { + this->convert_to_tree(); + } +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::create_steal_sorted_array( + omtdata_t **const values, const uint32_t numvalues, + const uint32_t new_capacity) { + paranoid_invariant_notnull(values); + this->create_internal_no_array(new_capacity); + this->d.a.num_values = numvalues; + this->d.a.values = *values; + *values = nullptr; + if (supports_marks) { + this->convert_to_tree(); + } +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +int omt<omtdata_t, omtdataout_t, supports_marks>::split_at(omt *const newomt, + const uint32_t idx) { + barf_if_marked(*this); + paranoid_invariant_notnull(newomt); + if (idx > this->size()) { + return EINVAL; + } + this->convert_to_array(); + const uint32_t newsize = this->size() - idx; + newomt->create_from_sorted_array(&this->d.a.values[this->d.a.start_idx + idx], + newsize); + this->d.a.num_values = idx; + this->maybe_resize_array(idx); + if (supports_marks) { + this->convert_to_tree(); + } + return 0; +} + +template <typename omtdata_t, typename 
omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::merge(omt *const leftomt, + omt *const rightomt) { + barf_if_marked(*this); + paranoid_invariant_notnull(leftomt); + paranoid_invariant_notnull(rightomt); + const uint32_t leftsize = leftomt->size(); + const uint32_t rightsize = rightomt->size(); + const uint32_t newsize = leftsize + rightsize; + + if (leftomt->is_array) { + if (leftomt->capacity - + (leftomt->d.a.start_idx + leftomt->d.a.num_values) >= + rightsize) { + this->create_steal_sorted_array( + &leftomt->d.a.values, leftomt->d.a.num_values, leftomt->capacity); + this->d.a.start_idx = leftomt->d.a.start_idx; + } else { + this->create_internal(newsize); + memcpy(&this->d.a.values[0], &leftomt->d.a.values[leftomt->d.a.start_idx], + leftomt->d.a.num_values * (sizeof this->d.a.values[0])); + } + } else { + this->create_internal(newsize); + leftomt->fill_array_with_subtree_values(&this->d.a.values[0], + leftomt->d.t.root); + } + leftomt->destroy(); + this->d.a.num_values = leftsize; + + if (rightomt->is_array) { + memcpy(&this->d.a.values[this->d.a.start_idx + this->d.a.num_values], + &rightomt->d.a.values[rightomt->d.a.start_idx], + rightomt->d.a.num_values * (sizeof this->d.a.values[0])); + } else { + rightomt->fill_array_with_subtree_values( + &this->d.a.values[this->d.a.start_idx + this->d.a.num_values], + rightomt->d.t.root); + } + rightomt->destroy(); + this->d.a.num_values += rightsize; + paranoid_invariant(this->size() == newsize); + if (supports_marks) { + this->convert_to_tree(); + } +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::clone(const omt &src) { + barf_if_marked(*this); + this->create_internal(src.size()); + if (src.is_array) { + memcpy(&this->d.a.values[0], &src.d.a.values[src.d.a.start_idx], + src.d.a.num_values * (sizeof this->d.a.values[0])); + } else { + src.fill_array_with_subtree_values(&this->d.a.values[0], src.d.t.root); + } + this->d.a.num_values = src.size(); + if (supports_marks) { + this->convert_to_tree(); + } +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::clear(void) { + if (this->is_array) { + this->d.a.start_idx = 0; + this->d.a.num_values = 0; + } else { + this->d.t.root.set_to_null(); + this->d.t.free_idx = 0; + } +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::destroy(void) { + this->clear(); + this->capacity = 0; + if (this->is_array) { + if (this->d.a.values != nullptr) { + toku_free(this->d.a.values); + } + this->d.a.values = nullptr; + } else { + if (this->d.t.nodes != nullptr) { + toku_free(this->d.t.nodes); + } + this->d.t.nodes = nullptr; + } +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +uint32_t omt<omtdata_t, omtdataout_t, supports_marks>::size(void) const { + if (this->is_array) { + return this->d.a.num_values; + } else { + return this->nweight(this->d.t.root); + } +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)> +int omt<omtdata_t, omtdataout_t, supports_marks>::insert(const omtdata_t &value, + const omtcmp_t &v, + uint32_t *const idx) { + int r; + uint32_t insert_idx; + + r = this->find_zero<omtcmp_t, h>(v, nullptr, &insert_idx); + if (r == 0) { + if (idx) *idx = insert_idx; + return DB_KEYEXIST; + } + if (r != 
DB_NOTFOUND) return r; + + if ((r = this->insert_at(value, insert_idx))) return r; + if (idx) *idx = insert_idx; + + return 0; +} + +// The following 3 functions implement a static if for us. +template <typename omtdata_t, typename omtdataout_t> +static void barf_if_marked(const omt<omtdata_t, omtdataout_t, false> &UU(omt)) { +} + +template <typename omtdata_t, typename omtdataout_t> +static void barf_if_marked(const omt<omtdata_t, omtdataout_t, true> &omt) { + invariant(!omt.has_marks()); +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +bool omt<omtdata_t, omtdataout_t, supports_marks>::has_marks(void) const { + static_assert(supports_marks, "Does not support marks"); + if (this->d.t.root.is_null()) { + return false; + } + const omt_node &node = this->d.t.nodes[this->d.t.root.get_index()]; + return node.get_marks_below() || node.get_marked(); +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +int omt<omtdata_t, omtdataout_t, supports_marks>::insert_at( + const omtdata_t &value, const uint32_t idx) { + barf_if_marked(*this); + if (idx > this->size()) { + return EINVAL; + } + + this->maybe_resize_or_convert(this->size() + 1); + if (this->is_array && idx != this->d.a.num_values && + (idx != 0 || this->d.a.start_idx == 0)) { + this->convert_to_tree(); + } + if (this->is_array) { + if (idx == this->d.a.num_values) { + this->d.a.values[this->d.a.start_idx + this->d.a.num_values] = value; + } else { + this->d.a.values[--this->d.a.start_idx] = value; + } + this->d.a.num_values++; + } else { + subtree *rebalance_subtree = nullptr; + this->insert_internal(&this->d.t.root, value, idx, &rebalance_subtree); + if (rebalance_subtree != nullptr) { + this->rebalance(rebalance_subtree); + } + } + return 0; +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +int omt<omtdata_t, omtdataout_t, supports_marks>::set_at(const omtdata_t &value, + const uint32_t idx) { + barf_if_marked(*this); + if (idx >= this->size()) { + return EINVAL; + } + + if (this->is_array) { + this->set_at_internal_array(value, idx); + } else { + this->set_at_internal(this->d.t.root, value, idx); + } + return 0; +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +int omt<omtdata_t, omtdataout_t, supports_marks>::delete_at( + const uint32_t idx) { + barf_if_marked(*this); + if (idx >= this->size()) { + return EINVAL; + } + + this->maybe_resize_or_convert(this->size() - 1); + if (this->is_array && idx != 0 && idx != this->d.a.num_values - 1) { + this->convert_to_tree(); + } + if (this->is_array) { + // Testing for 0 does not rule out it being the last entry. 
+ // Test explicitly for num_values-1 + if (idx != this->d.a.num_values - 1) { + this->d.a.start_idx++; + } + this->d.a.num_values--; + } else { + subtree *rebalance_subtree = nullptr; + this->delete_internal(&this->d.t.root, idx, nullptr, &rebalance_subtree); + if (rebalance_subtree != nullptr) { + this->rebalance(rebalance_subtree); + } + } + return 0; +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +template <typename iterate_extra_t, + int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)> +int omt<omtdata_t, omtdataout_t, supports_marks>::iterate( + iterate_extra_t *const iterate_extra) const { + return this->iterate_on_range<iterate_extra_t, f>(0, this->size(), + iterate_extra); +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +template <typename iterate_extra_t, + int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)> +int omt<omtdata_t, omtdataout_t, supports_marks>::iterate_on_range( + const uint32_t left, const uint32_t right, + iterate_extra_t *const iterate_extra) const { + if (right > this->size()) { + return EINVAL; + } + if (left == right) { + return 0; + } + if (this->is_array) { + return this->iterate_internal_array<iterate_extra_t, f>(left, right, + iterate_extra); + } + return this->iterate_internal<iterate_extra_t, f>(left, right, this->d.t.root, + 0, iterate_extra); +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +template <typename iterate_extra_t, + int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)> +int omt<omtdata_t, omtdataout_t, supports_marks>::iterate_and_mark_range( + const uint32_t left, const uint32_t right, + iterate_extra_t *const iterate_extra) { + static_assert(supports_marks, "does not support marks"); + if (right > this->size()) { + return EINVAL; + } + if (left == right) { + return 0; + } + paranoid_invariant(!this->is_array); + return this->iterate_and_mark_range_internal<iterate_extra_t, f>( + left, right, this->d.t.root, 0, iterate_extra); +} + +// TODO: We can optimize this if we steal 3 bits. 1 bit: this node is +// marked. 1 bit: left subtree has marks. 1 bit: right subtree has marks. 
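+// ---------------------------------------------------------------------
+// Editorial usage sketch, not part of the original source: it shows how the
+// function-pointer template parameters of iterate(), find_zero() and find()
+// are used. The names example_sum_cb, example_cmp and omt_usage_example are
+// hypothetical, and the sketch assumes the template defaults
+// omtdataout_t == omtdata_t and supports_marks == false.
+static inline int example_sum_cb(const int &value, const uint32_t idx,
+                                 int *const total) {
+  (void)idx;        // index unused here
+  *total += value;  // accumulate; a nonzero return would stop the iteration
+  return 0;
+}
+static inline int example_cmp(const int &stored, const int &target) {
+  // Heaviside function: its signum is monotonically increasing over the
+  // (sorted) contents of the omt, as find()/find_zero() require.
+  return (stored < target) ? -1 : (stored > target) ? +1 : 0;
+}
+static inline void omt_usage_example(void) {
+  omt<int> m;
+  m.create();
+  int r = m.insert_at(10, 0);  // the caller keeps the omt sorted
+  lazy_assert_zero(r);
+  r = m.insert_at(20, 1);
+  lazy_assert_zero(r);
+
+  int total = 0;  // the callback is a template argument, not a runtime one
+  r = m.iterate<int, example_sum_cb>(&total);
+  lazy_assert_zero(r);  // total == 30
+
+  uint32_t idx;
+  r = m.find_zero<int, example_cmp>(20, nullptr, &idx);
+  lazy_assert_zero(r);  // exact match: r == 0, idx == 1
+
+  r = m.find<int, example_cmp>(25, -1, nullptr, &idx);
+  lazy_assert_zero(r);  // largest value < 25 is at idx == 1
+
+  m.destroy();
+}
+// End of editorial sketch.
+// ---------------------------------------------------------------------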
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +template <typename iterate_extra_t, + int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)> +int omt<omtdata_t, omtdataout_t, supports_marks>::iterate_over_marked( + iterate_extra_t *const iterate_extra) const { + static_assert(supports_marks, "does not support marks"); + paranoid_invariant(!this->is_array); + return this->iterate_over_marked_internal<iterate_extra_t, f>( + this->d.t.root, 0, iterate_extra); +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::unmark( + const subtree &st, const uint32_t index, + GrowableArray<node_idx> *const indexes) { + if (st.is_null()) { + return; + } + omt_node &n = this->d.t.nodes[st.get_index()]; + const uint32_t index_root = index + this->nweight(n.left); + + const bool below = n.get_marks_below(); + if (below) { + this->unmark(n.left, index, indexes); + } + if (n.get_marked()) { + indexes->push(index_root); + } + n.clear_stolen_bits(); + if (below) { + this->unmark(n.right, index_root + 1, indexes); + } +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::delete_all_marked(void) { + static_assert(supports_marks, "does not support marks"); + if (!this->has_marks()) { + return; + } + paranoid_invariant(!this->is_array); + GrowableArray<node_idx> marked_indexes; + marked_indexes.init(); + + // Remove all marks. + // We need to delete all the stolen bits before calling delete_at to + // prevent barfing. + this->unmark(this->d.t.root, 0, &marked_indexes); + + for (uint32_t i = 0; i < marked_indexes.get_size(); i++) { + // Delete from left to right, shift by number already deleted. + // Alternative is delete from right to left. 
+ int r = this->delete_at(marked_indexes.fetch_unchecked(i) - i); + lazy_assert_zero(r); + } + marked_indexes.deinit(); + barf_if_marked(*this); +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +uint32_t +omt<omtdata_t, omtdataout_t, supports_marks>::verify_marks_consistent_internal( + const subtree &st, const bool UU(allow_marks)) const { + if (st.is_null()) { + return 0; + } + const omt_node &node = this->d.t.nodes[st.get_index()]; + uint32_t num_marks = + verify_marks_consistent_internal(node.left, node.get_marks_below()); + num_marks += + verify_marks_consistent_internal(node.right, node.get_marks_below()); + if (node.get_marks_below()) { + paranoid_invariant(allow_marks); + paranoid_invariant(num_marks > 0); + } else { + // redundant with invariant below, but nice to have explicitly + paranoid_invariant(num_marks == 0); + } + if (node.get_marked()) { + paranoid_invariant(allow_marks); + ++num_marks; + } + return num_marks; +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::verify_marks_consistent( + void) const { + static_assert(supports_marks, "does not support marks"); + paranoid_invariant(!this->is_array); + this->verify_marks_consistent_internal(this->d.t.root, true); +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +template <typename iterate_extra_t, + int (*f)(omtdata_t *, const uint32_t, iterate_extra_t *const)> +void omt<omtdata_t, omtdataout_t, supports_marks>::iterate_ptr( + iterate_extra_t *const iterate_extra) { + if (this->is_array) { + this->iterate_ptr_internal_array<iterate_extra_t, f>(0, this->size(), + iterate_extra); + } else { + this->iterate_ptr_internal<iterate_extra_t, f>( + 0, this->size(), this->d.t.root, 0, iterate_extra); + } +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +int omt<omtdata_t, omtdataout_t, supports_marks>::fetch( + const uint32_t idx, omtdataout_t *const value) const { + if (idx >= this->size()) { + return EINVAL; + } + if (this->is_array) { + this->fetch_internal_array(idx, value); + } else { + this->fetch_internal(this->d.t.root, idx, value); + } + return 0; +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)> +int omt<omtdata_t, omtdataout_t, supports_marks>::find_zero( + const omtcmp_t &extra, omtdataout_t *const value, + uint32_t *const idxp) const { + uint32_t tmp_index; + uint32_t *const child_idxp = (idxp != nullptr) ? idxp : &tmp_index; + int r; + if (this->is_array) { + r = this->find_internal_zero_array<omtcmp_t, h>(extra, value, child_idxp); + } else { + r = this->find_internal_zero<omtcmp_t, h>(this->d.t.root, extra, value, + child_idxp); + } + return r; +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)> +int omt<omtdata_t, omtdataout_t, supports_marks>::find( + const omtcmp_t &extra, int direction, omtdataout_t *const value, + uint32_t *const idxp) const { + uint32_t tmp_index; + uint32_t *const child_idxp = (idxp != nullptr) ? 
idxp : &tmp_index; + paranoid_invariant(direction != 0); + if (direction < 0) { + if (this->is_array) { + return this->find_internal_minus_array<omtcmp_t, h>(extra, value, + child_idxp); + } else { + return this->find_internal_minus<omtcmp_t, h>(this->d.t.root, extra, + value, child_idxp); + } + } else { + if (this->is_array) { + return this->find_internal_plus_array<omtcmp_t, h>(extra, value, + child_idxp); + } else { + return this->find_internal_plus<omtcmp_t, h>(this->d.t.root, extra, value, + child_idxp); + } + } +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +size_t omt<omtdata_t, omtdataout_t, supports_marks>::memory_size(void) { + if (this->is_array) { + return (sizeof *this) + this->capacity * (sizeof this->d.a.values[0]); + } + return (sizeof *this) + this->capacity * (sizeof this->d.t.nodes[0]); +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::create_internal_no_array( + const uint32_t new_capacity) { + this->is_array = true; + this->d.a.start_idx = 0; + this->d.a.num_values = 0; + this->d.a.values = nullptr; + this->capacity = new_capacity; +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::create_internal( + const uint32_t new_capacity) { + this->create_internal_no_array(new_capacity); + XMALLOC_N(this->capacity, this->d.a.values); +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +uint32_t omt<omtdata_t, omtdataout_t, supports_marks>::nweight( + const subtree &st) const { + if (st.is_null()) { + return 0; + } else { + return this->d.t.nodes[st.get_index()].weight; + } +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +typename omt<omtdata_t, omtdataout_t, supports_marks>::node_idx +omt<omtdata_t, omtdataout_t, supports_marks>::node_malloc(void) { + paranoid_invariant(this->d.t.free_idx < this->capacity); + omt_node &n = this->d.t.nodes[this->d.t.free_idx]; + n.clear_stolen_bits(); + return this->d.t.free_idx++; +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::node_free( + const node_idx UU(idx)) { + paranoid_invariant(idx < this->capacity); +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::maybe_resize_array( + const uint32_t n) { + const uint32_t new_size = n <= 2 ? 
4 : 2 * n; + const uint32_t room = this->capacity - this->d.a.start_idx; + + if (room < n || this->capacity / 2 >= new_size) { + omtdata_t *XMALLOC_N(new_size, tmp_values); + if (this->d.a.num_values) { + memcpy(tmp_values, &this->d.a.values[this->d.a.start_idx], + this->d.a.num_values * (sizeof tmp_values[0])); + } + this->d.a.start_idx = 0; + this->capacity = new_size; + toku_free(this->d.a.values); + this->d.a.values = tmp_values; + } +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, + supports_marks>::fill_array_with_subtree_values(omtdata_t *const array, + const subtree &st) + const { + if (st.is_null()) return; + const omt_node &tree = this->d.t.nodes[st.get_index()]; + this->fill_array_with_subtree_values(&array[0], tree.left); + array[this->nweight(tree.left)] = tree.value; + this->fill_array_with_subtree_values(&array[this->nweight(tree.left) + 1], + tree.right); +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::convert_to_array(void) { + if (!this->is_array) { + const uint32_t num_values = this->size(); + uint32_t new_size = 2 * num_values; + new_size = new_size < 4 ? 4 : new_size; + + omtdata_t *XMALLOC_N(new_size, tmp_values); + this->fill_array_with_subtree_values(tmp_values, this->d.t.root); + toku_free(this->d.t.nodes); + this->is_array = true; + this->capacity = new_size; + this->d.a.num_values = num_values; + this->d.a.values = tmp_values; + this->d.a.start_idx = 0; + } +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::rebuild_from_sorted_array( + subtree *const st, const omtdata_t *const values, + const uint32_t numvalues) { + if (numvalues == 0) { + st->set_to_null(); + } else { + const uint32_t halfway = numvalues / 2; + const node_idx newidx = this->node_malloc(); + omt_node *const newnode = &this->d.t.nodes[newidx]; + newnode->weight = numvalues; + newnode->value = values[halfway]; + st->set_index(newidx); + // update everything before the recursive calls so the second call + // can be a tail call. + this->rebuild_from_sorted_array(&newnode->left, &values[0], halfway); + this->rebuild_from_sorted_array(&newnode->right, &values[halfway + 1], + numvalues - (halfway + 1)); + } +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::convert_to_tree(void) { + if (this->is_array) { + const uint32_t num_nodes = this->size(); + uint32_t new_size = num_nodes * 2; + new_size = new_size < 4 ? 4 : new_size; + + omt_node *XMALLOC_N(new_size, new_nodes); + omtdata_t *const values = this->d.a.values; + omtdata_t *const tmp_values = &values[this->d.a.start_idx]; + this->is_array = false; + this->d.t.nodes = new_nodes; + this->capacity = new_size; + this->d.t.free_idx = 0; + this->d.t.root.set_to_null(); + this->rebuild_from_sorted_array(&this->d.t.root, tmp_values, num_nodes); + toku_free(values); + } +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::maybe_resize_or_convert( + const uint32_t n) { + if (this->is_array) { + this->maybe_resize_array(n); + } else { + const uint32_t new_size = n <= 2 ? 
4 : 2 * n; + const uint32_t num_nodes = this->nweight(this->d.t.root); + if ((this->capacity / 2 >= new_size) || + (this->d.t.free_idx >= this->capacity && num_nodes < n) || + (this->capacity < n)) { + this->convert_to_array(); + // if we had a free list, the "supports_marks" version could + // just resize, as it is now, we have to convert to and back + // from an array. + if (supports_marks) { + this->convert_to_tree(); + } + } + } +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +bool omt<omtdata_t, omtdataout_t, supports_marks>::will_need_rebalance( + const subtree &st, const int leftmod, const int rightmod) const { + if (st.is_null()) { + return false; + } + const omt_node &n = this->d.t.nodes[st.get_index()]; + // one of the 1's is for the root. + // the other is to take ceil(n/2) + const uint32_t weight_left = this->nweight(n.left) + leftmod; + const uint32_t weight_right = this->nweight(n.right) + rightmod; + return ((1 + weight_left < (1 + 1 + weight_right) / 2) || + (1 + weight_right < (1 + 1 + weight_left) / 2)); +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::insert_internal( + subtree *const subtreep, const omtdata_t &value, const uint32_t idx, + subtree **const rebalance_subtree) { + if (subtreep->is_null()) { + paranoid_invariant_zero(idx); + const node_idx newidx = this->node_malloc(); + omt_node *const newnode = &this->d.t.nodes[newidx]; + newnode->weight = 1; + newnode->left.set_to_null(); + newnode->right.set_to_null(); + newnode->value = value; + subtreep->set_index(newidx); + } else { + omt_node &n = this->d.t.nodes[subtreep->get_index()]; + n.weight++; + if (idx <= this->nweight(n.left)) { + if (*rebalance_subtree == nullptr && + this->will_need_rebalance(*subtreep, 1, 0)) { + *rebalance_subtree = subtreep; + } + this->insert_internal(&n.left, value, idx, rebalance_subtree); + } else { + if (*rebalance_subtree == nullptr && + this->will_need_rebalance(*subtreep, 0, 1)) { + *rebalance_subtree = subtreep; + } + const uint32_t sub_index = idx - this->nweight(n.left) - 1; + this->insert_internal(&n.right, value, sub_index, rebalance_subtree); + } + } +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::set_at_internal_array( + const omtdata_t &value, const uint32_t idx) { + this->d.a.values[this->d.a.start_idx + idx] = value; +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::set_at_internal( + const subtree &st, const omtdata_t &value, const uint32_t idx) { + paranoid_invariant(!st.is_null()); + omt_node &n = this->d.t.nodes[st.get_index()]; + const uint32_t leftweight = this->nweight(n.left); + if (idx < leftweight) { + this->set_at_internal(n.left, value, idx); + } else if (idx == leftweight) { + n.value = value; + } else { + this->set_at_internal(n.right, value, idx - leftweight - 1); + } +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::delete_internal( + subtree *const subtreep, const uint32_t idx, omt_node *const copyn, + subtree **const rebalance_subtree) { + paranoid_invariant_notnull(subtreep); + paranoid_invariant_notnull(rebalance_subtree); + paranoid_invariant(!subtreep->is_null()); + omt_node &n = this->d.t.nodes[subtreep->get_index()]; + const uint32_t leftweight = this->nweight(n.left); + if (idx < 
leftweight) { + n.weight--; + if (*rebalance_subtree == nullptr && + this->will_need_rebalance(*subtreep, -1, 0)) { + *rebalance_subtree = subtreep; + } + this->delete_internal(&n.left, idx, copyn, rebalance_subtree); + } else if (idx == leftweight) { + if (n.left.is_null()) { + const uint32_t oldidx = subtreep->get_index(); + *subtreep = n.right; + if (copyn != nullptr) { + copyn->value = n.value; + } + this->node_free(oldidx); + } else if (n.right.is_null()) { + const uint32_t oldidx = subtreep->get_index(); + *subtreep = n.left; + if (copyn != nullptr) { + copyn->value = n.value; + } + this->node_free(oldidx); + } else { + if (*rebalance_subtree == nullptr && + this->will_need_rebalance(*subtreep, 0, -1)) { + *rebalance_subtree = subtreep; + } + // don't need to copy up value, it's only used by this + // next call, and when that gets to the bottom there + // won't be any more recursion + n.weight--; + this->delete_internal(&n.right, 0, &n, rebalance_subtree); + } + } else { + n.weight--; + if (*rebalance_subtree == nullptr && + this->will_need_rebalance(*subtreep, 0, -1)) { + *rebalance_subtree = subtreep; + } + this->delete_internal(&n.right, idx - leftweight - 1, copyn, + rebalance_subtree); + } +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +template <typename iterate_extra_t, + int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)> +int omt<omtdata_t, omtdataout_t, supports_marks>::iterate_internal_array( + const uint32_t left, const uint32_t right, + iterate_extra_t *const iterate_extra) const { + int r; + for (uint32_t i = left; i < right; ++i) { + r = f(this->d.a.values[this->d.a.start_idx + i], i, iterate_extra); + if (r != 0) { + return r; + } + } + return 0; +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +template <typename iterate_extra_t, + int (*f)(omtdata_t *, const uint32_t, iterate_extra_t *const)> +void omt<omtdata_t, omtdataout_t, supports_marks>::iterate_ptr_internal( + const uint32_t left, const uint32_t right, const subtree &st, + const uint32_t idx, iterate_extra_t *const iterate_extra) { + if (!st.is_null()) { + omt_node &n = this->d.t.nodes[st.get_index()]; + const uint32_t idx_root = idx + this->nweight(n.left); + if (left < idx_root) { + this->iterate_ptr_internal<iterate_extra_t, f>(left, right, n.left, idx, + iterate_extra); + } + if (left <= idx_root && idx_root < right) { + int r = f(&n.value, idx_root, iterate_extra); + lazy_assert_zero(r); + } + if (idx_root + 1 < right) { + this->iterate_ptr_internal<iterate_extra_t, f>( + left, right, n.right, idx_root + 1, iterate_extra); + } + } +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +template <typename iterate_extra_t, + int (*f)(omtdata_t *, const uint32_t, iterate_extra_t *const)> +void omt<omtdata_t, omtdataout_t, supports_marks>::iterate_ptr_internal_array( + const uint32_t left, const uint32_t right, + iterate_extra_t *const iterate_extra) { + for (uint32_t i = left; i < right; ++i) { + int r = f(&this->d.a.values[this->d.a.start_idx + i], i, iterate_extra); + lazy_assert_zero(r); + } +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +template <typename iterate_extra_t, + int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)> +int omt<omtdata_t, omtdataout_t, supports_marks>::iterate_internal( + const uint32_t left, const uint32_t right, const subtree &st, + const uint32_t idx, iterate_extra_t *const iterate_extra) const { + if (st.is_null()) { + 
return 0; + } + int r; + const omt_node &n = this->d.t.nodes[st.get_index()]; + const uint32_t idx_root = idx + this->nweight(n.left); + if (left < idx_root) { + r = this->iterate_internal<iterate_extra_t, f>(left, right, n.left, idx, + iterate_extra); + if (r != 0) { + return r; + } + } + if (left <= idx_root && idx_root < right) { + r = f(n.value, idx_root, iterate_extra); + if (r != 0) { + return r; + } + } + if (idx_root + 1 < right) { + return this->iterate_internal<iterate_extra_t, f>( + left, right, n.right, idx_root + 1, iterate_extra); + } + return 0; +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +template <typename iterate_extra_t, + int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)> +int omt<omtdata_t, omtdataout_t, supports_marks>:: + iterate_and_mark_range_internal(const uint32_t left, const uint32_t right, + const subtree &st, const uint32_t idx, + iterate_extra_t *const iterate_extra) { + paranoid_invariant(!st.is_null()); + int r; + omt_node &n = this->d.t.nodes[st.get_index()]; + const uint32_t idx_root = idx + this->nweight(n.left); + if (left < idx_root && !n.left.is_null()) { + n.set_marks_below_bit(); + r = this->iterate_and_mark_range_internal<iterate_extra_t, f>( + left, right, n.left, idx, iterate_extra); + if (r != 0) { + return r; + } + } + if (left <= idx_root && idx_root < right) { + n.set_marked_bit(); + r = f(n.value, idx_root, iterate_extra); + if (r != 0) { + return r; + } + } + if (idx_root + 1 < right && !n.right.is_null()) { + n.set_marks_below_bit(); + return this->iterate_and_mark_range_internal<iterate_extra_t, f>( + left, right, n.right, idx_root + 1, iterate_extra); + } + return 0; +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +template <typename iterate_extra_t, + int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)> +int omt<omtdata_t, omtdataout_t, supports_marks>::iterate_over_marked_internal( + const subtree &st, const uint32_t idx, + iterate_extra_t *const iterate_extra) const { + if (st.is_null()) { + return 0; + } + int r; + const omt_node &n = this->d.t.nodes[st.get_index()]; + const uint32_t idx_root = idx + this->nweight(n.left); + if (n.get_marks_below()) { + r = this->iterate_over_marked_internal<iterate_extra_t, f>(n.left, idx, + iterate_extra); + if (r != 0) { + return r; + } + } + if (n.get_marked()) { + r = f(n.value, idx_root, iterate_extra); + if (r != 0) { + return r; + } + } + if (n.get_marks_below()) { + return this->iterate_over_marked_internal<iterate_extra_t, f>( + n.right, idx_root + 1, iterate_extra); + } + return 0; +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::fetch_internal_array( + const uint32_t i, omtdataout_t *const value) const { + if (value != nullptr) { + copyout(value, &this->d.a.values[this->d.a.start_idx + i]); + } +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::fetch_internal( + const subtree &st, const uint32_t i, omtdataout_t *const value) const { + omt_node &n = this->d.t.nodes[st.get_index()]; + const uint32_t leftweight = this->nweight(n.left); + if (i < leftweight) { + this->fetch_internal(n.left, i, value); + } else if (i == leftweight) { + if (value != nullptr) { + copyout(value, &n); + } + } else { + this->fetch_internal(n.right, i - leftweight - 1, value); + } +} + +template <typename omtdata_t, typename omtdataout_t, bool 
supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::fill_array_with_subtree_idxs( + node_idx *const array, const subtree &st) const { + if (!st.is_null()) { + const omt_node &tree = this->d.t.nodes[st.get_index()]; + this->fill_array_with_subtree_idxs(&array[0], tree.left); + array[this->nweight(tree.left)] = st.get_index(); + this->fill_array_with_subtree_idxs(&array[this->nweight(tree.left) + 1], + tree.right); + } +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::rebuild_subtree_from_idxs( + subtree *const st, const node_idx *const idxs, const uint32_t numvalues) { + if (numvalues == 0) { + st->set_to_null(); + } else { + uint32_t halfway = numvalues / 2; + st->set_index(idxs[halfway]); + // node_idx newidx = idxs[halfway]; + omt_node &newnode = this->d.t.nodes[st->get_index()]; + newnode.weight = numvalues; + // value is already in there. + this->rebuild_subtree_from_idxs(&newnode.left, &idxs[0], halfway); + this->rebuild_subtree_from_idxs(&newnode.right, &idxs[halfway + 1], + numvalues - (halfway + 1)); + // n_idx = newidx; + } +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::rebalance( + subtree *const st) { + node_idx idx = st->get_index(); + if (idx == this->d.t.root.get_index()) { + // Try to convert to an array. + // If this fails, (malloc) nothing will have changed. + // In the failure case we continue on to the standard rebalance + // algorithm. + this->convert_to_array(); + if (supports_marks) { + this->convert_to_tree(); + } + } else { + const omt_node &n = this->d.t.nodes[idx]; + node_idx *tmp_array; + size_t mem_needed = n.weight * (sizeof tmp_array[0]); + size_t mem_free = + (this->capacity - this->d.t.free_idx) * (sizeof this->d.t.nodes[0]); + bool malloced; + if (mem_needed <= mem_free) { + // There is sufficient free space at the end of the nodes array + // to hold enough node indexes to rebalance. 
+ malloced = false; + tmp_array = + reinterpret_cast<node_idx *>(&this->d.t.nodes[this->d.t.free_idx]); + } else { + malloced = true; + XMALLOC_N(n.weight, tmp_array); + } + this->fill_array_with_subtree_idxs(tmp_array, *st); + this->rebuild_subtree_from_idxs(st, tmp_array, n.weight); + if (malloced) toku_free(tmp_array); + } +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::copyout( + omtdata_t *const out, const omt_node *const n) { + *out = n->value; +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::copyout( + omtdata_t **const out, omt_node *const n) { + *out = &n->value; +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::copyout( + omtdata_t *const out, const omtdata_t *const stored_value_ptr) { + *out = *stored_value_ptr; +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +void omt<omtdata_t, omtdataout_t, supports_marks>::copyout( + omtdata_t **const out, omtdata_t *const stored_value_ptr) { + *out = stored_value_ptr; +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)> +int omt<omtdata_t, omtdataout_t, supports_marks>::find_internal_zero_array( + const omtcmp_t &extra, omtdataout_t *const value, + uint32_t *const idxp) const { + paranoid_invariant_notnull(idxp); + uint32_t min = this->d.a.start_idx; + uint32_t limit = this->d.a.start_idx + this->d.a.num_values; + uint32_t best_pos = subtree::NODE_NULL; + uint32_t best_zero = subtree::NODE_NULL; + + while (min != limit) { + uint32_t mid = (min + limit) / 2; + int hv = h(this->d.a.values[mid], extra); + if (hv < 0) { + min = mid + 1; + } else if (hv > 0) { + best_pos = mid; + limit = mid; + } else { + best_zero = mid; + limit = mid; + } + } + if (best_zero != subtree::NODE_NULL) { + // Found a zero + if (value != nullptr) { + copyout(value, &this->d.a.values[best_zero]); + } + *idxp = best_zero - this->d.a.start_idx; + return 0; + } + if (best_pos != subtree::NODE_NULL) + *idxp = best_pos - this->d.a.start_idx; + else + *idxp = this->d.a.num_values; + return DB_NOTFOUND; +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)> +int omt<omtdata_t, omtdataout_t, supports_marks>::find_internal_zero( + const subtree &st, const omtcmp_t &extra, omtdataout_t *const value, + uint32_t *const idxp) const { + paranoid_invariant_notnull(idxp); + if (st.is_null()) { + *idxp = 0; + return DB_NOTFOUND; + } + omt_node &n = this->d.t.nodes[st.get_index()]; + int hv = h(n.value, extra); + if (hv < 0) { + int r = this->find_internal_zero<omtcmp_t, h>(n.right, extra, value, idxp); + *idxp += this->nweight(n.left) + 1; + return r; + } else if (hv > 0) { + return this->find_internal_zero<omtcmp_t, h>(n.left, extra, value, idxp); + } else { + int r = this->find_internal_zero<omtcmp_t, h>(n.left, extra, value, idxp); + if (r == DB_NOTFOUND) { + *idxp = this->nweight(n.left); + if (value != nullptr) { + copyout(value, &n); + } + r = 0; + } + return r; + } +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)> +int omt<omtdata_t, omtdataout_t, supports_marks>::find_internal_plus_array( + 
const omtcmp_t &extra, omtdataout_t *const value, + uint32_t *const idxp) const { + paranoid_invariant_notnull(idxp); + uint32_t min = this->d.a.start_idx; + uint32_t limit = this->d.a.start_idx + this->d.a.num_values; + uint32_t best = subtree::NODE_NULL; + + while (min != limit) { + const uint32_t mid = (min + limit) / 2; + const int hv = h(this->d.a.values[mid], extra); + if (hv > 0) { + best = mid; + limit = mid; + } else { + min = mid + 1; + } + } + if (best == subtree::NODE_NULL) { + return DB_NOTFOUND; + } + if (value != nullptr) { + copyout(value, &this->d.a.values[best]); + } + *idxp = best - this->d.a.start_idx; + return 0; +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)> +int omt<omtdata_t, omtdataout_t, supports_marks>::find_internal_plus( + const subtree &st, const omtcmp_t &extra, omtdataout_t *const value, + uint32_t *const idxp) const { + paranoid_invariant_notnull(idxp); + if (st.is_null()) { + return DB_NOTFOUND; + } + omt_node *const n = &this->d.t.nodes[st.get_index()]; + int hv = h(n->value, extra); + int r; + if (hv > 0) { + r = this->find_internal_plus<omtcmp_t, h>(n->left, extra, value, idxp); + if (r == DB_NOTFOUND) { + *idxp = this->nweight(n->left); + if (value != nullptr) { + copyout(value, n); + } + r = 0; + } + } else { + r = this->find_internal_plus<omtcmp_t, h>(n->right, extra, value, idxp); + if (r == 0) { + *idxp += this->nweight(n->left) + 1; + } + } + return r; +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)> +int omt<omtdata_t, omtdataout_t, supports_marks>::find_internal_minus_array( + const omtcmp_t &extra, omtdataout_t *const value, + uint32_t *const idxp) const { + paranoid_invariant_notnull(idxp); + uint32_t min = this->d.a.start_idx; + uint32_t limit = this->d.a.start_idx + this->d.a.num_values; + uint32_t best = subtree::NODE_NULL; + + while (min != limit) { + const uint32_t mid = (min + limit) / 2; + const int hv = h(this->d.a.values[mid], extra); + if (hv < 0) { + best = mid; + min = mid + 1; + } else { + limit = mid; + } + } + if (best == subtree::NODE_NULL) { + return DB_NOTFOUND; + } + if (value != nullptr) { + copyout(value, &this->d.a.values[best]); + } + *idxp = best - this->d.a.start_idx; + return 0; +} + +template <typename omtdata_t, typename omtdataout_t, bool supports_marks> +template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)> +int omt<omtdata_t, omtdataout_t, supports_marks>::find_internal_minus( + const subtree &st, const omtcmp_t &extra, omtdataout_t *const value, + uint32_t *const idxp) const { + paranoid_invariant_notnull(idxp); + if (st.is_null()) { + return DB_NOTFOUND; + } + omt_node *const n = &this->d.t.nodes[st.get_index()]; + int hv = h(n->value, extra); + if (hv < 0) { + int r = + this->find_internal_minus<omtcmp_t, h>(n->right, extra, value, idxp); + if (r == 0) { + *idxp += this->nweight(n->left) + 1; + } else if (r == DB_NOTFOUND) { + *idxp = this->nweight(n->left); + if (value != nullptr) { + copyout(value, n); + } + r = 0; + } + return r; + } else { + return this->find_internal_minus<omtcmp_t, h>(n->left, extra, value, idxp); + } +} +} // namespace toku diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/partitioned_counter.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/partitioned_counter.h new file mode 100644 index 
000000000..f20eeedf2 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/partitioned_counter.h @@ -0,0 +1,165 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +// Overview: A partitioned_counter provides a counter that can be incremented +// and the running sum can be read at any time. +// We assume that increments are frequent, whereas reading is infrequent. +// Implementation hint: Use thread-local storage so each thread increments its +// own data. The increment does not require a lock or atomic operation. +// Reading the data can be performed by iterating over the thread-local +// versions, summing them up. The data structure also includes a sum for all +// the threads that have died. Use a pthread_key to create the thread-local +// versions. When a thread finishes, the system calls pthread_key destructor +// which can add that thread's copy into the sum_of_dead counter. +// Rationale: For statistics such as are found in engine status, we need a +// counter that requires no cache misses to increment. We've seen significant +// performance speedups by removing certain counters. Rather than removing +// those statistics, we would like to just make the counter fast. We generally +// increment the counters frequently, and want to fetch the values +// infrequently. The counters are monotonic. The counters can be split into +// many counters, which can be summed up at the end. 
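+// (Editorial usage sketch, not part of the original source; it uses only the
+// C interface declared below, and assumes partitioned_counters_init() has
+// already been called during startup:
+//
+//     PARTITIONED_COUNTER pc = create_partitioned_counter();
+//     increment_partitioned_counter(pc, 1);         // frequent; no lock needed
+//     uint64_t sum = read_partitioned_counter(pc);  // infrequent; sums parts
+//     destroy_partitioned_counter(pc);
+//
+// End of editorial sketch.)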
We don't care if we get +// slightly out-of-date counter sums when we read the counter. We don't care +// if there is a race between reading a counter +// variable and incrementing it. +// See tests/test_partitioned_counter.c for some performance measurements. +// Operations: +// create_partitioned_counter Create a counter initialized to zero. +// destroy_partitioned_counter Destroy it. +// increment_partitioned_counter Increment it. This is the frequent operation. +// read_partitioned_counter Get the current value. This is infrequent. +// See partitioned_counter.cc for the abstraction function and representation +// invariant. +// +// The Google style guide says to avoid using constructors, and it appears that +// constructors may have broken all the tests, because they called +// pthread_key_create before the key was actually created. So the Google style +// guide may have some wisdom there... +// +// This version does not use constructors, essentially reverting to the Google +// C++ style guide. +// + +// The old C interface. This required a bunch of explicit +// __attribute__((__destructor__)) functions to remember to destroy counters at +// the end. +#if defined(__cplusplus) +extern "C" { +#endif + +typedef struct partitioned_counter *PARTITIONED_COUNTER; +PARTITIONED_COUNTER create_partitioned_counter(void); +// Effect: Create a counter, initialized to zero. + +void destroy_partitioned_counter(PARTITIONED_COUNTER); +// Effect: Destroy the counter. No operations on that counter are permitted +// after this. + +void increment_partitioned_counter(PARTITIONED_COUNTER, uint64_t amount); +// Effect: Increment the counter by amount. +// Requires: No overflows. This is a 64-bit unsigned counter. + +uint64_t read_partitioned_counter(PARTITIONED_COUNTER) + __attribute__((__visibility__("default"))); +// Effect: Return the current value of the counter. + +void partitioned_counters_init(void); +// Effect: Initialize any partitioned counters data structures that must be set +// up before any partitioned counters run. + +void partitioned_counters_destroy(void); +// Effect: Destroy any partitioned counters data structures. + +#if defined(__cplusplus) +}; +#endif + +#if 0 +#include <pthread.h> + +#include "fttypes.h" + +// Used inside the PARTITIONED_COUNTER. +struct linked_list_head { + struct linked_list_element *first; +}; + + +class PARTITIONED_COUNTER { +public: + PARTITIONED_COUNTER(void); + // Effect: Construct a counter, initialized to zero. + + ~PARTITIONED_COUNTER(void); + // Effect: Destruct the counter. + + void increment(uint64_t amount); + // Effect: Increment the counter by amount. This is a 64-bit unsigned counter, and if you overflow it, you will get overflowed results (that is mod 2^64). + // Requires: Don't use this from a static constructor or destructor. + + uint64_t read(void); + // Effect: Read the sum. + // Requires: Don't use this from a static constructor or destructor. + +private: + uint64_t _sum_of_dead; // The sum of all thread-local counts from threads that have terminated. + pthread_key_t _key; // The pthread_key which gives us the hook to construct and destruct thread-local storage. + struct linked_list_head _ll_counter_head; // A linked list of all the thread-local information for this counter. + + // This function is used to destroy the thread-local part of the state when a thread terminates. + // But it's not the destructor for the local part of the counter, it's a destructor on a "dummy" key just so that we get a notification when a thread ends.
+ friend void destroy_thread_local_part_of_partitioned_counters (void *); +}; +#endif diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/status.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/status.h new file mode 100644 index 000000000..3fd0095d0 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/status.h @@ -0,0 +1,76 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." 
+
+#pragma once
+
+#include "partitioned_counter.h"
+// PORT2: #include <util/constexpr.h>
+
+#define TOKUFT_STATUS_INIT(array, k, c, t, l, inc)                       \
+  do {                                                                   \
+    array.status[k].keyname = #k;                                        \
+    array.status[k].columnname = #c;                                     \
+    array.status[k].type = t;                                            \
+    array.status[k].legend = l;                                          \
+    constexpr_static_assert(                                             \
+        strcmp(#c, "NULL") && strcmp(#c, "0"),                           \
+        "Use nullptr for no column name instead of NULL, 0, etc...");    \
+    constexpr_static_assert(                                             \
+        (inc) == TOKU_ENGINE_STATUS || strcmp(#c, "nullptr"),            \
+        "Missing column name.");                                         \
+    array.status[k].include =                                            \
+        static_cast<toku_engine_status_include_type>(inc);               \
+    if (t == STATUS_PARCOUNT) {                                          \
+      array.status[k].value.parcount = create_partitioned_counter();     \
+    }                                                                    \
+  } while (0)
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc new file mode 100644 index 000000000..531165dea --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc @@ -0,0 +1,503 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+
+#include "utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <mutex>
+
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/transaction_db_mutex.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/hash.h"
+#include "util/thread_local.h"
+#include "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+#include "utilities/transactions/transaction_db_mutex_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+RangeLockManagerHandle* NewRangeLockManager(
+    std::shared_ptr<TransactionDBMutexFactory> mutex_factory) {
+  std::shared_ptr<TransactionDBMutexFactory> use_factory;
+
+  if (mutex_factory) {
+    use_factory = mutex_factory;
+  } else {
+    use_factory.reset(new TransactionDBMutexFactoryImpl());
+  }
+  return new RangeTreeLockManager(use_factory);
+}
+
+static const char SUFFIX_INFIMUM = 0x0;
+static const char SUFFIX_SUPREMUM = 0x1;
+
+// Convert an Endpoint into the internal format used for storing it in the
+// locktree (the DBT structure is used for passing endpoints to the locktree
+// and getting them back). E.g. key "abc" with inf_suffix=false is encoded as
+// {0x0,'a','b','c'}, and with inf_suffix=true as {0x1,'a','b','c'}.
+void serialize_endpoint(const Endpoint& endp, std::string* buf) {
+  buf->push_back(endp.inf_suffix ?
SUFFIX_SUPREMUM : SUFFIX_INFIMUM);
+  buf->append(endp.slice.data(), endp.slice.size());
+}
+
+// Decode the endpoint from the format in which it is stored in the locktree
+// (a DBT) to the one used outside: either Endpoint or EndpointWithString
+template <typename EndpointStruct>
+void deserialize_endpoint(const DBT* dbt, EndpointStruct* endp) {
+  assert(dbt->size >= 1);
+  const char* dbt_data = (const char*)dbt->data;
+  char suffix = dbt_data[0];
+  assert(suffix == SUFFIX_INFIMUM || suffix == SUFFIX_SUPREMUM);
+  endp->inf_suffix = (suffix == SUFFIX_SUPREMUM);
+  endp->slice = decltype(EndpointStruct::slice)(dbt_data + 1, dbt->size - 1);
+}
+
+// Get a range lock on the [start_key; end_key] range
+Status RangeTreeLockManager::TryLock(PessimisticTransaction* txn,
+                                     uint32_t column_family_id,
+                                     const Endpoint& start_endp,
+                                     const Endpoint& end_endp, Env*,
+                                     bool exclusive) {
+  toku::lock_request request;
+  request.create(mutex_factory_);
+  DBT start_key_dbt, end_key_dbt;
+
+  TEST_SYNC_POINT("RangeTreeLockManager::TryRangeLock:enter");
+  std::string start_key;
+  std::string end_key;
+  serialize_endpoint(start_endp, &start_key);
+  serialize_endpoint(end_endp, &end_key);
+
+  toku_fill_dbt(&start_key_dbt, start_key.data(), start_key.size());
+  toku_fill_dbt(&end_key_dbt, end_key.data(), end_key.size());
+
+  auto lt = GetLockTreeForCF(column_family_id);
+
+  // Put the key waited on into request's m_extra. See
+  // wait_callback_for_locktree for details.
+  std::string wait_key(start_endp.slice.data(), start_endp.slice.size());
+
+  request.set(lt.get(), (TXNID)txn, &start_key_dbt, &end_key_dbt,
+              exclusive ? toku::lock_request::WRITE : toku::lock_request::READ,
+              false /* not a big txn */, &wait_key);
+
+  // This is for the "periodically wake up and check if the wait is killed"
+  // feature, which we are not using.
+  uint64_t killed_time_msec = 0;
+  uint64_t wait_time_msec = txn->GetLockTimeout();
+
+  if (wait_time_msec == static_cast<uint64_t>(-1)) {
+    // The transaction has no wait timeout. lock_request::wait doesn't support
+    // this, it needs a number of milliseconds to wait. Pass it one year to
+    // be safe.
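+    // (1000*60*60*24*365 is about 3.15e10 msec, comfortably below the
+    // ~1.8e19 maximum of uint64_t, so this sentinel cannot overflow.)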
+ wait_time_msec = uint64_t(1000) * 60 * 60 * 24 * 365; + } else { + // convert microseconds to milliseconds + wait_time_msec = (wait_time_msec + 500) / 1000; + } + + std::vector<RangeDeadlockInfo> di_path; + request.m_deadlock_cb = [&](TXNID txnid, bool is_exclusive, + const DBT* start_dbt, const DBT* end_dbt) { + EndpointWithString start; + EndpointWithString end; + deserialize_endpoint(start_dbt, &start); + deserialize_endpoint(end_dbt, &end); + + di_path.push_back({txnid, column_family_id, is_exclusive, std::move(start), + std::move(end)}); + }; + + request.start(); + + const int r = request.wait(wait_time_msec, killed_time_msec, + nullptr, // killed_callback + wait_callback_for_locktree, nullptr); + + // Inform the txn that we are no longer waiting: + txn->ClearWaitingTxn(); + + request.destroy(); + switch (r) { + case 0: + break; // fall through + case DB_LOCK_NOTGRANTED: + return Status::TimedOut(Status::SubCode::kLockTimeout); + case TOKUDB_OUT_OF_LOCKS: + return Status::Busy(Status::SubCode::kLockLimit); + case DB_LOCK_DEADLOCK: { + std::reverse(di_path.begin(), di_path.end()); + dlock_buffer_.AddNewPath( + RangeDeadlockPath(di_path, request.get_start_time())); + return Status::Busy(Status::SubCode::kDeadlock); + } + default: + assert(0); + return Status::Busy(Status::SubCode::kLockLimit); + } + + return Status::OK(); +} + +// Wait callback that locktree library will call to inform us about +// the lock waits that are in progress. +void wait_callback_for_locktree(void*, toku::lock_wait_infos* infos) { + TEST_SYNC_POINT("RangeTreeLockManager::TryRangeLock:EnterWaitingTxn"); + for (auto wait_info : *infos) { + // As long as we hold the lock on the locktree's pending request queue + // this should be safe. + auto txn = (PessimisticTransaction*)wait_info.waiter; + auto cf_id = (ColumnFamilyId)wait_info.ltree->get_dict_id().dictid; + + autovector<TransactionID> waitee_ids; + for (auto waitee : wait_info.waitees) { + waitee_ids.push_back(waitee); + } + txn->SetWaitingTxn(waitee_ids, cf_id, (std::string*)wait_info.m_extra); + } + + // Here we can assume that the locktree code will now wait for some lock + TEST_SYNC_POINT("RangeTreeLockManager::TryRangeLock:WaitingTxn"); +} + +void RangeTreeLockManager::UnLock(PessimisticTransaction* txn, + ColumnFamilyId column_family_id, + const std::string& key, Env*) { + auto locktree = GetLockTreeForCF(column_family_id); + std::string endp_image; + serialize_endpoint({key.data(), key.size(), false}, &endp_image); + + DBT key_dbt; + toku_fill_dbt(&key_dbt, endp_image.data(), endp_image.size()); + + toku::range_buffer range_buf; + range_buf.create(); + range_buf.append(&key_dbt, &key_dbt); + + locktree->release_locks((TXNID)txn, &range_buf); + range_buf.destroy(); + + toku::lock_request::retry_all_lock_requests( + locktree.get(), wait_callback_for_locktree, nullptr); +} + +void RangeTreeLockManager::UnLock(PessimisticTransaction* txn, + const LockTracker& tracker, Env*) { + const RangeTreeLockTracker* range_tracker = + static_cast<const RangeTreeLockTracker*>(&tracker); + + RangeTreeLockTracker* range_trx_tracker = + static_cast<RangeTreeLockTracker*>(&txn->GetTrackedLocks()); + bool all_keys = (range_trx_tracker == range_tracker); + + // tracked_locks_->range_list may hold nullptr if the transaction has never + // acquired any locks. 
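+  // all_keys == true means the tracker passed in is the transaction's own
+  // tracker, i.e. everything the transaction holds is being released (commit
+  // or full rollback); a different tracker (e.g. one describing the locks
+  // acquired since a savepoint) releases only the subset it records.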
+ ((RangeTreeLockTracker*)range_tracker)->ReleaseLocks(this, txn, all_keys); +} + +int RangeTreeLockManager::CompareDbtEndpoints(void* arg, const DBT* a_key, + const DBT* b_key) { + const char* a = (const char*)a_key->data; + const char* b = (const char*)b_key->data; + + size_t a_len = a_key->size; + size_t b_len = b_key->size; + + size_t min_len = std::min(a_len, b_len); + + // Compare the values. The first byte encodes the endpoint type, its value + // is either SUFFIX_INFIMUM or SUFFIX_SUPREMUM. + Comparator* cmp = (Comparator*)arg; + int res = cmp->Compare(Slice(a + 1, min_len - 1), Slice(b + 1, min_len - 1)); + if (!res) { + if (b_len > min_len) { + // a is shorter; + if (a[0] == SUFFIX_INFIMUM) { + return -1; //"a is smaller" + } else { + // a is considered padded with 0xFF:FF:FF:FF... + return 1; // "a" is bigger + } + } else if (a_len > min_len) { + // the opposite of the above: b is shorter. + if (b[0] == SUFFIX_INFIMUM) { + return 1; //"b is smaller" + } else { + // b is considered padded with 0xFF:FF:FF:FF... + return -1; // "b" is bigger + } + } else { + // the lengths are equal (and the key values, too) + if (a[0] < b[0]) { + return -1; + } else if (a[0] > b[0]) { + return 1; + } else { + return 0; + } + } + } else { + return res; + } +} + +namespace { +void UnrefLockTreeMapsCache(void* ptr) { + // Called when a thread exits or a ThreadLocalPtr gets destroyed. + auto lock_tree_map_cache = static_cast< + std::unordered_map<ColumnFamilyId, std::shared_ptr<toku::locktree>>*>( + ptr); + delete lock_tree_map_cache; +} +} // anonymous namespace + +RangeTreeLockManager::RangeTreeLockManager( + std::shared_ptr<TransactionDBMutexFactory> mutex_factory) + : mutex_factory_(mutex_factory), + ltree_lookup_cache_(new ThreadLocalPtr(&UnrefLockTreeMapsCache)), + dlock_buffer_(10) { + ltm_.create(on_create, on_destroy, on_escalate, nullptr, mutex_factory_); +} + +int RangeTreeLockManager::on_create(toku::locktree* lt, void* arg) { + // arg is a pointer to RangeTreeLockManager + lt->set_escalation_barrier_func(&OnEscalationBarrierCheck, arg); + return 0; +} + +bool RangeTreeLockManager::OnEscalationBarrierCheck(const DBT* a, const DBT* b, + void* extra) { + Endpoint a_endp, b_endp; + deserialize_endpoint(a, &a_endp); + deserialize_endpoint(b, &b_endp); + auto self = static_cast<RangeTreeLockManager*>(extra); + return self->barrier_func_(a_endp, b_endp); +} + +void RangeTreeLockManager::SetRangeDeadlockInfoBufferSize( + uint32_t target_size) { + dlock_buffer_.Resize(target_size); +} + +void RangeTreeLockManager::Resize(uint32_t target_size) { + SetRangeDeadlockInfoBufferSize(target_size); +} + +std::vector<RangeDeadlockPath> +RangeTreeLockManager::GetRangeDeadlockInfoBuffer() { + return dlock_buffer_.PrepareBuffer(); +} + +std::vector<DeadlockPath> RangeTreeLockManager::GetDeadlockInfoBuffer() { + std::vector<DeadlockPath> res; + std::vector<RangeDeadlockPath> data = GetRangeDeadlockInfoBuffer(); + // report left endpoints + for (auto it = data.begin(); it != data.end(); ++it) { + std::vector<DeadlockInfo> path; + + for (auto it2 = it->path.begin(); it2 != it->path.end(); ++it2) { + path.push_back( + {it2->m_txn_id, it2->m_cf_id, it2->m_exclusive, it2->m_start.slice}); + } + res.push_back(DeadlockPath(path, it->deadlock_time)); + } + return res; +} + +// @brief Lock Escalation Callback function +// +// @param txnid Transaction whose locks got escalated +// @param lt Lock Tree where escalation is happening +// @param buffer Escalation result: list of locks that this transaction now +// owns in 
this lock tree.
+// @param void* Callback context
+void RangeTreeLockManager::on_escalate(TXNID txnid, const toku::locktree* lt,
+                                       const toku::range_buffer& buffer,
+                                       void*) {
+  auto txn = (PessimisticTransaction*)txnid;
+  ((RangeTreeLockTracker*)&txn->GetTrackedLocks())->ReplaceLocks(lt, buffer);
+}
+
+RangeTreeLockManager::~RangeTreeLockManager() {
+  autovector<void*> local_caches;
+  ltree_lookup_cache_->Scrape(&local_caches, nullptr);
+  for (auto cache : local_caches) {
+    delete static_cast<LockTreeMap*>(cache);
+  }
+  ltree_map_.clear();  // this will call release_lt() for all locktrees
+  ltm_.destroy();
+}
+
+RangeLockManagerHandle::Counters RangeTreeLockManager::GetStatus() {
+  LTM_STATUS_S ltm_status_test;
+  ltm_.get_status(&ltm_status_test);
+  Counters res;
+
+  // Searching for a status variable by its string name is how Toku's unit
+  // tests do it (why didn't they make the LTM_ESCALATION_COUNT constant
+  // visible?)
+  // lookup keyname in status
+  for (int i = 0; i < LTM_STATUS_S::LTM_STATUS_NUM_ROWS; i++) {
+    TOKU_ENGINE_STATUS_ROW status = &ltm_status_test.status[i];
+    if (strcmp(status->keyname, "LTM_ESCALATION_COUNT") == 0) {
+      res.escalation_count = status->value.num;
+      continue;
+    }
+    if (strcmp(status->keyname, "LTM_WAIT_COUNT") == 0) {
+      res.lock_wait_count = status->value.num;
+      continue;
+    }
+    if (strcmp(status->keyname, "LTM_SIZE_CURRENT") == 0) {
+      res.current_lock_memory = status->value.num;
+    }
+  }
+  return res;
+}
+
+std::shared_ptr<toku::locktree> RangeTreeLockManager::MakeLockTreePtr(
+    toku::locktree* lt) {
+  toku::locktree_manager* ltm = &ltm_;
+  return std::shared_ptr<toku::locktree>(
+      lt, [ltm](toku::locktree* p) { ltm->release_lt(p); });
+}
+
+void RangeTreeLockManager::AddColumnFamily(const ColumnFamilyHandle* cfh) {
+  uint32_t column_family_id = cfh->GetID();
+
+  InstrumentedMutexLock l(&ltree_map_mutex_);
+  if (ltree_map_.find(column_family_id) == ltree_map_.end()) {
+    DICTIONARY_ID dict_id = {.dictid = column_family_id};
+    toku::comparator cmp;
+    cmp.create(CompareDbtEndpoints, (void*)cfh->GetComparator());
+    toku::locktree* ltree =
+        ltm_.get_lt(dict_id, cmp,
+                    /* on_create_extra*/ static_cast<void*>(this));
+    // This is ok because get_lt has copied the comparator:
+    cmp.destroy();
+
+    ltree_map_.insert({column_family_id, MakeLockTreePtr(ltree)});
+  }
+}
+
+void RangeTreeLockManager::RemoveColumnFamily(const ColumnFamilyHandle* cfh) {
+  uint32_t column_family_id = cfh->GetID();
+  // Remove the lock map for this column family. Since the lock map is stored
+  // as a shared ptr, concurrent transactions can still keep using it
+  // until they release their references to it.
+
+  // TODO: what if one drops a column family while transaction(s) still have
+  // locks in it?
+  // locktree uses the column family's Comparator* as the criteria to do tree
+  // ordering. If the comparator is gone, we won't even be able to remove the
+  // elements from the locktree.
+  // A possible solution might be to remove everything right now:
+  //  - wait until everyone traversing the locktree is gone
+  //  - remove everything from the locktree.
+  //  - some transactions may have acquired locks in their LockTracker objects.
+  //    Arrange something so we don't blow up when they try to release them.
+  //  - ...
+  // This use case (drop column family while somebody is using it) doesn't
+  // seem to be the priority, though.
+
+  {
+    InstrumentedMutexLock l(&ltree_map_mutex_);
+
+    auto lock_maps_iter = ltree_map_.find(column_family_id);
+    assert(lock_maps_iter != ltree_map_.end());
+    ltree_map_.erase(lock_maps_iter);
+  }  // ltree_map_mutex_
+
+  autovector<void*> local_caches;
+  ltree_lookup_cache_->Scrape(&local_caches, nullptr);
+  for (auto cache : local_caches) {
+    delete static_cast<LockTreeMap*>(cache);
+  }
+}
+
+std::shared_ptr<toku::locktree> RangeTreeLockManager::GetLockTreeForCF(
+    ColumnFamilyId column_family_id) {
+  // First check thread-local cache
+  if (ltree_lookup_cache_->Get() == nullptr) {
+    ltree_lookup_cache_->Reset(new LockTreeMap());
+  }
+
+  auto ltree_map_cache = static_cast<LockTreeMap*>(ltree_lookup_cache_->Get());
+
+  auto it = ltree_map_cache->find(column_family_id);
+  if (it != ltree_map_cache->end()) {
+    // Found lock map for this column family.
+    return it->second;
+  }
+
+  // Not found in local cache, grab mutex and check shared LockMaps
+  InstrumentedMutexLock l(&ltree_map_mutex_);
+
+  it = ltree_map_.find(column_family_id);
+  if (it == ltree_map_.end()) {
+    return nullptr;
+  } else {
+    // Found lock map. Store in thread-local cache and return.
+    ltree_map_cache->insert({column_family_id, it->second});
+    return it->second;
+  }
+}
+
+struct LOCK_PRINT_CONTEXT {
+  RangeLockManagerHandle::RangeLockStatus* data;  // Save locks here
+  uint32_t cfh_id;  // Column Family whose tree we are traversing
+};
+
+// Report left endpoints of the acquired locks
+LockManager::PointLockStatus RangeTreeLockManager::GetPointLockStatus() {
+  PointLockStatus res;
+  LockManager::RangeLockStatus data = GetRangeLockStatus();
+  // report left endpoints
+  for (auto it = data.begin(); it != data.end(); ++it) {
+    auto& val = it->second;
+    res.insert({it->first, {val.start.slice, val.ids, val.exclusive}});
+  }
+  return res;
+}
+
+static void push_into_lock_status_data(void* param, const DBT* left,
+                                       const DBT* right, TXNID txnid_arg,
+                                       bool is_shared, TxnidVector* owners) {
+  struct LOCK_PRINT_CONTEXT* ctx = (LOCK_PRINT_CONTEXT*)param;
+  struct RangeLockInfo info;
+
+  info.exclusive = !is_shared;
+
+  deserialize_endpoint(left, &info.start);
+  deserialize_endpoint(right, &info.end);
+
+  if (txnid_arg != TXNID_SHARED) {
+    info.ids.push_back(txnid_arg);
+  } else {
+    for (auto it : *owners) {
+      info.ids.push_back(it);
+    }
+  }
+  ctx->data->insert({ctx->cfh_id, info});
+}
+
+LockManager::RangeLockStatus RangeTreeLockManager::GetRangeLockStatus() {
+  LockManager::RangeLockStatus data;
+  {
+    InstrumentedMutexLock l(&ltree_map_mutex_);
+    for (auto it : ltree_map_) {
+      LOCK_PRINT_CONTEXT ctx = {&data, it.first};
+      it.second->dump_locks((void*)&ctx, push_into_lock_status_data);
+    }
+  }
+  return data;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // OS_WIN
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h new file mode 100644 index 000000000..e4236d600 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h @@ -0,0 +1,137 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
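+
+// Typical wiring, as a minimal sketch. It assumes the public range-locking
+// API (RangeLockManagerHandle, NewRangeLockManager(), and
+// Transaction::GetRangeLock()); exact option and type names should be
+// checked against rocksdb/utilities/transaction_db.h:
+//
+//   TransactionDBOptions txn_db_options;
+//   std::shared_ptr<RangeLockManagerHandle> mgr(
+//       NewRangeLockManager(nullptr /* use the default mutex factory */));
+//   txn_db_options.lock_mgr_handle = mgr;
+//   // ... open the TransactionDB with txn_db_options, then inside a txn:
+//   // txn->GetRangeLock(cfh, Endpoint("a"), Endpoint("m"));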
+ +#pragma once +#ifndef ROCKSDB_LITE +#ifndef OS_WIN + +// For DeadlockInfoBuffer: +#include "util/thread_local.h" +#include "utilities/transactions/lock/point/point_lock_manager.h" +#include "utilities/transactions/lock/range/range_lock_manager.h" + +// Lock Tree library: +#include "utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h" +#include "utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h" +#include "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h" + +namespace ROCKSDB_NAMESPACE { + +typedef DeadlockInfoBufferTempl<RangeDeadlockPath> RangeDeadlockInfoBuffer; + +// A Range Lock Manager that uses PerconaFT's locktree library +class RangeTreeLockManager : public RangeLockManagerBase, + public RangeLockManagerHandle { + public: + LockManager* getLockManager() override { return this; } + + void AddColumnFamily(const ColumnFamilyHandle* cfh) override; + void RemoveColumnFamily(const ColumnFamilyHandle* cfh) override; + + void Resize(uint32_t) override; + std::vector<DeadlockPath> GetDeadlockInfoBuffer() override; + + std::vector<RangeDeadlockPath> GetRangeDeadlockInfoBuffer() override; + void SetRangeDeadlockInfoBufferSize(uint32_t target_size) override; + + // Get a lock on a range + // @note only exclusive locks are currently supported (requesting a + // non-exclusive lock will get an exclusive one) + using LockManager::TryLock; + Status TryLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id, + const Endpoint& start_endp, const Endpoint& end_endp, Env* env, + bool exclusive) override; + + void UnLock(PessimisticTransaction* txn, const LockTracker& tracker, + Env* env) override; + void UnLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id, + const std::string& key, Env* env) override; + void UnLock(PessimisticTransaction*, ColumnFamilyId, const Endpoint&, + const Endpoint&, Env*) override { + // TODO: range unlock does nothing... + } + + explicit RangeTreeLockManager( + std::shared_ptr<TransactionDBMutexFactory> mutex_factory); + + ~RangeTreeLockManager() override; + + int SetMaxLockMemory(size_t max_lock_memory) override { + return ltm_.set_max_lock_memory(max_lock_memory); + } + + size_t GetMaxLockMemory() override { return ltm_.get_max_lock_memory(); } + + Counters GetStatus() override; + + bool IsPointLockSupported() const override { + // One could have acquired a point lock (it is reduced to range lock) + return true; + } + + PointLockStatus GetPointLockStatus() override; + + // This is from LockManager + LockManager::RangeLockStatus GetRangeLockStatus() override; + + // This has the same meaning as GetRangeLockStatus but is from + // RangeLockManagerHandle + RangeLockManagerHandle::RangeLockStatus GetRangeLockStatusData() override { + return GetRangeLockStatus(); + } + + bool IsRangeLockSupported() const override { return true; } + + const LockTrackerFactory& GetLockTrackerFactory() const override { + return RangeTreeLockTrackerFactory::Get(); + } + + // Get the locktree which stores locks for the Column Family with given cf_id + std::shared_ptr<toku::locktree> GetLockTreeForCF(ColumnFamilyId cf_id); + + void SetEscalationBarrierFunc(EscalationBarrierFunc func) override { + barrier_func_ = func; + } + + private: + toku::locktree_manager ltm_; + + EscalationBarrierFunc barrier_func_ = + [](const Endpoint&, const Endpoint&) -> bool { return false; }; + + std::shared_ptr<TransactionDBMutexFactory> mutex_factory_; + + // Map from cf_id to locktree*. 
Can only be accessed while holding the + // ltree_map_mutex_. Must use a custom deleter that calls ltm_.release_lt + using LockTreeMap = + std::unordered_map<ColumnFamilyId, std::shared_ptr<toku::locktree>>; + LockTreeMap ltree_map_; + + InstrumentedMutex ltree_map_mutex_; + + // Per-thread cache of ltree_map_. + // (uses the same approach as TransactionLockMgr::lock_maps_cache_) + std::unique_ptr<ThreadLocalPtr> ltree_lookup_cache_; + + RangeDeadlockInfoBuffer dlock_buffer_; + + std::shared_ptr<toku::locktree> MakeLockTreePtr(toku::locktree* lt); + static int CompareDbtEndpoints(void* arg, const DBT* a_key, const DBT* b_key); + + // Callbacks + static int on_create(toku::locktree*, void*); + static void on_destroy(toku::locktree*) {} + static void on_escalate(TXNID txnid, const toku::locktree* lt, + const toku::range_buffer& buffer, void* extra); + + static bool OnEscalationBarrierCheck(const DBT* a, const DBT* b, void* extra); +}; + +void serialize_endpoint(const Endpoint& endp, std::string* buf); +void wait_callback_for_locktree(void* cdata, toku::lock_wait_infos* infos); + +} // namespace ROCKSDB_NAMESPACE +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc new file mode 100644 index 000000000..be1e1478b --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc @@ -0,0 +1,156 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE +#ifndef OS_WIN + +#include "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h" + +#include "utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h" + +namespace ROCKSDB_NAMESPACE { + +RangeLockList *RangeTreeLockTracker::getOrCreateList() { + if (range_list_) return range_list_.get(); + + // Doesn't exist, create + range_list_.reset(new RangeLockList()); + return range_list_.get(); +} + +void RangeTreeLockTracker::Track(const PointLockRequest &lock_req) { + DBT key_dbt; + std::string key; + serialize_endpoint(Endpoint(lock_req.key, false), &key); + toku_fill_dbt(&key_dbt, key.data(), key.size()); + RangeLockList *rl = getOrCreateList(); + rl->Append(lock_req.column_family_id, &key_dbt, &key_dbt); +} + +void RangeTreeLockTracker::Track(const RangeLockRequest &lock_req) { + DBT start_dbt, end_dbt; + std::string start_key, end_key; + + serialize_endpoint(lock_req.start_endp, &start_key); + serialize_endpoint(lock_req.end_endp, &end_key); + + toku_fill_dbt(&start_dbt, start_key.data(), start_key.size()); + toku_fill_dbt(&end_dbt, end_key.data(), end_key.size()); + + RangeLockList *rl = getOrCreateList(); + rl->Append(lock_req.column_family_id, &start_dbt, &end_dbt); +} + +PointLockStatus RangeTreeLockTracker::GetPointLockStatus( + ColumnFamilyId /*cf_id*/, const std::string & /*key*/) const { + // This function is not expected to be called as RangeTreeLockTracker:: + // IsPointLockSupported() returns false. Return the status which indicates + // the point is not locked. 
+  PointLockStatus p;
+  p.locked = false;
+  p.exclusive = true;
+  p.seq = 0;
+  return p;
+}
+
+void RangeTreeLockTracker::Clear() { range_list_.reset(); }
+
+void RangeLockList::Append(ColumnFamilyId cf_id, const DBT *left_key,
+                           const DBT *right_key) {
+  MutexLock l(&mutex_);
+  // Only the transaction owner thread calls this function.
+  // The same thread does the lock release, so we can be certain nobody is
+  // releasing the locks concurrently.
+  assert(!releasing_locks_.load());
+  auto it = buffers_.find(cf_id);
+  if (it == buffers_.end()) {
+    // create a new one
+    it = buffers_.emplace(cf_id, std::make_shared<toku::range_buffer>()).first;
+    it->second->create();
+  }
+  it->second->append(left_key, right_key);
+}
+
+void RangeLockList::ReleaseLocks(RangeTreeLockManager *mgr,
+                                 PessimisticTransaction *txn,
+                                 bool all_trx_locks) {
+  {
+    MutexLock l(&mutex_);
+    // The lt->release_locks() call below will walk range_list->buffer_. We
+    // need to prevent the lock escalation callback from replacing
+    // range_list->buffer_ while we are doing that.
+    //
+    // An additional complication here is the internal mutex(es) in the
+    // locktree (let's call them latches):
+    // - Lock escalation first obtains latches on the lock tree
+    // - Then, it calls RangeTreeLockManager::on_escalate to replace the
+    //   transaction's range_list->buffer_. Access to that buffer must be
+    //   synchronized, so it will want to acquire the range_list->mutex_.
+    //
+    // While in this function we would want to do the reverse:
+    // - Acquire range_list->mutex_ to prevent access to the range_list.
+    // - Then, the lt->release_locks() call will walk through the range_list
+    // - and acquire latches on parts of the lock tree to remove locks from
+    //   it.
+    //
+    // How do we avoid the deadlock? The idea is that here we set
+    // releasing_locks_=true, and release the mutex.
+    // All other users of the range_list must:
+    // - Acquire the mutex, then check that releasing_locks_=false.
+    //   (the code in this function doesn't do that, as there's only one
+    //   thread that releases the transaction's locks)
+    releasing_locks_.store(true);
+  }
+
+  for (auto it : buffers_) {
+    // Don't try to call release_locks() if the buffer is empty! If we are
+    // not holding any locks, the lock tree might be in STO mode (Single
+    // Transaction Optimization) with another transaction, and our attempt to
+    // release an empty set of locks will cause an assertion failure.
+    if (it.second->get_num_ranges()) {
+      auto lt_ptr = mgr->GetLockTreeForCF(it.first);
+      toku::locktree *lt = lt_ptr.get();
+
+      lt->release_locks((TXNID)txn, it.second.get(), all_trx_locks);
+
+      it.second->destroy();
+      it.second->create();
+
+      toku::lock_request::retry_all_lock_requests(lt,
+                                                  wait_callback_for_locktree);
+    }
+  }
+
+  Clear();
+  releasing_locks_.store(false);
+}
+
+void RangeLockList::ReplaceLocks(const toku::locktree *lt,
+                                 const toku::range_buffer &buffer) {
+  MutexLock l(&mutex_);
+  if (releasing_locks_.load()) {
+    // Do nothing. The transaction is releasing its locks, so it will not
+    // care about having a correct list of ranges.
+    // (In TokuDB, toku_db_txn_escalate_callback() makes use of this
+    // property, too)
+    return;
+  }
+
+  ColumnFamilyId cf_id = (ColumnFamilyId)lt->get_dict_id().dictid;
+
+  // Escalation only happens for column families in which this transaction
+  // already holds locks, so cf_id must be present in buffers_.
+  auto it = buffers_.find(cf_id);
+  it->second->destroy();
+  it->second->create();
+
+  toku::range_buffer::iterator iter(&buffer);
+  toku::range_buffer::iterator::record rec;
+  while (iter.current(&rec)) {
+    it->second->append(rec.get_left_key(), rec.get_right_key());
+    iter.next();
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // OS_WIN
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h new file mode 100644 index 000000000..4ef48d252 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h @@ -0,0 +1,146 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "util/mutexlock.h"
+#include "utilities/transactions/lock/lock_tracker.h"
+#include "utilities/transactions/pessimistic_transaction.h"
+
+// Range Locking:
+#include "lib/locktree/lock_request.h"
+#include "lib/locktree/locktree.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class RangeTreeLockManager;
+
+// Storage for locks that are currently held by a transaction.
+//
+// Locks are kept in toku::range_buffer because toku::locktree::release_locks()
+// accepts that as an argument.
+//
+// Note: the list of locks may differ slightly from the contents of the lock
+// tree, due to concurrency between lock acquisition, lock release, and lock
+// escalation. See MDEV-18227 and RangeTreeLockManager::UnLock for details.
+// This property is currently harmless.
+//
+// Append() and ReleaseLocks() are not thread-safe, as they are expected to be
+// called only by the owner transaction. ReplaceLocks() is safe to call from
+// other threads.
+class RangeLockList {
+ public:
+  ~RangeLockList() { Clear(); }
+
+  RangeLockList() : releasing_locks_(false) {}
+
+  void Append(ColumnFamilyId cf_id, const DBT* left_key, const DBT* right_key);
+  void ReleaseLocks(RangeTreeLockManager* mgr, PessimisticTransaction* txn,
+                    bool all_trx_locks);
+  void ReplaceLocks(const toku::locktree* lt, const toku::range_buffer& buffer);
+
+ private:
+  void Clear() {
+    for (auto it : buffers_) {
+      it.second->destroy();
+    }
+    buffers_.clear();
+  }
+
+  std::unordered_map<ColumnFamilyId, std::shared_ptr<toku::range_buffer>>
+      buffers_;
+  port::Mutex mutex_;
+  std::atomic<bool> releasing_locks_;
+};
+
+// A LockTracker-based object that is used together with RangeTreeLockManager.
+class RangeTreeLockTracker : public LockTracker {
+ public:
+  RangeTreeLockTracker() : range_list_(nullptr) {}
+
+  RangeTreeLockTracker(const RangeTreeLockTracker&) = delete;
+  RangeTreeLockTracker& operator=(const RangeTreeLockTracker&) = delete;
+
+  void Track(const PointLockRequest&) override;
+  void Track(const RangeLockRequest&) override;
+
+  bool IsPointLockSupported() const override {
+    // This indicates that we don't implement GetPointLockStatus()
+    return false;
+  }
+  bool IsRangeLockSupported() const override { return true; }
+
+  // A not-supported dummy implementation.
+ UntrackStatus Untrack(const RangeLockRequest& /*lock_request*/) override { + return UntrackStatus::NOT_TRACKED; + } + + UntrackStatus Untrack(const PointLockRequest& /*lock_request*/) override { + return UntrackStatus::NOT_TRACKED; + } + + // "If this method is not supported, leave it as a no-op." + void Merge(const LockTracker&) override {} + + // "If this method is not supported, leave it as a no-op." + void Subtract(const LockTracker&) override {} + + void Clear() override; + + // "If this method is not supported, returns nullptr." + virtual LockTracker* GetTrackedLocksSinceSavePoint( + const LockTracker&) const override { + return nullptr; + } + + PointLockStatus GetPointLockStatus(ColumnFamilyId column_family_id, + const std::string& key) const override; + + // The return value is only used for tests + uint64_t GetNumPointLocks() const override { return 0; } + + ColumnFamilyIterator* GetColumnFamilyIterator() const override { + return nullptr; + } + + KeyIterator* GetKeyIterator( + ColumnFamilyId /*column_family_id*/) const override { + return nullptr; + } + + void ReleaseLocks(RangeTreeLockManager* mgr, PessimisticTransaction* txn, + bool all_trx_locks) { + if (range_list_) range_list_->ReleaseLocks(mgr, txn, all_trx_locks); + } + + void ReplaceLocks(const toku::locktree* lt, + const toku::range_buffer& buffer) { + // range_list_ cannot be NULL here + range_list_->ReplaceLocks(lt, buffer); + } + + private: + RangeLockList* getOrCreateList(); + std::unique_ptr<RangeLockList> range_list_; +}; + +class RangeTreeLockTrackerFactory : public LockTrackerFactory { + public: + static const RangeTreeLockTrackerFactory& Get() { + static const RangeTreeLockTrackerFactory instance; + return instance; + } + + LockTracker* Create() const override { return new RangeTreeLockTracker(); } + + private: + RangeTreeLockTrackerFactory() {} +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/transactions/optimistic_transaction.cc b/src/rocksdb/utilities/transactions/optimistic_transaction.cc new file mode 100644 index 000000000..0ee0f28b6 --- /dev/null +++ b/src/rocksdb/utilities/transactions/optimistic_transaction.cc @@ -0,0 +1,196 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
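+
+// Overview of the optimistic flow implemented in this file: writes are
+// buffered in the transaction's WriteBatchWithIndex, TryLock() merely
+// records each accessed key with the sequence number visible at that time,
+// and Commit() re-validates those (key, sequence) pairs against the DB,
+// either serially via a write callback or in parallel under striped bucket
+// locks, before applying the batch. A rough usage sketch:
+//
+//   Transaction* txn = txn_db->BeginTransaction(WriteOptions());
+//   txn->Put("k", "v");        // buffered; key "k" is tracked
+//   Status s = txn->Commit();  // validates; returns Busy on conflict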
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/optimistic_transaction.h"
+
+#include <mutex>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "util/cast_util.h"
+#include "util/string_util.h"
+#include "utilities/transactions/lock/point/point_lock_tracker.h"
+#include "utilities/transactions/optimistic_transaction_db_impl.h"
+#include "utilities/transactions/transaction_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct WriteOptions;
+
+OptimisticTransaction::OptimisticTransaction(
+    OptimisticTransactionDB* txn_db, const WriteOptions& write_options,
+    const OptimisticTransactionOptions& txn_options)
+    : TransactionBaseImpl(txn_db->GetBaseDB(), write_options,
+                          PointLockTrackerFactory::Get()),
+      txn_db_(txn_db) {
+  Initialize(txn_options);
+}
+
+void OptimisticTransaction::Initialize(
+    const OptimisticTransactionOptions& txn_options) {
+  if (txn_options.set_snapshot) {
+    SetSnapshot();
+  }
+}
+
+void OptimisticTransaction::Reinitialize(
+    OptimisticTransactionDB* txn_db, const WriteOptions& write_options,
+    const OptimisticTransactionOptions& txn_options) {
+  TransactionBaseImpl::Reinitialize(txn_db->GetBaseDB(), write_options);
+  Initialize(txn_options);
+}
+
+OptimisticTransaction::~OptimisticTransaction() {}
+
+void OptimisticTransaction::Clear() { TransactionBaseImpl::Clear(); }
+
+Status OptimisticTransaction::Prepare() {
+  return Status::InvalidArgument(
+      "Two phase commit not supported for optimistic transactions.");
+}
+
+Status OptimisticTransaction::Commit() {
+  auto txn_db_impl = static_cast_with_check<OptimisticTransactionDBImpl,
+                                            OptimisticTransactionDB>(txn_db_);
+  assert(txn_db_impl);
+  switch (txn_db_impl->GetValidatePolicy()) {
+    case OccValidationPolicy::kValidateParallel:
+      return CommitWithParallelValidate();
+    case OccValidationPolicy::kValidateSerial:
+      return CommitWithSerialValidate();
+    default:
+      assert(0);
+  }
+  // unreachable; the return just silences a compiler warning
+  return Status::OK();
+}
+
+Status OptimisticTransaction::CommitWithSerialValidate() {
+  // Set up a callback which will call CheckTransactionForConflicts() to
+  // check whether this transaction is safe to be committed.
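+  // WriteWithCallback() invokes the callback from the write queue, so the
+  // check is serialized with all other writes; the batch is applied only if
+  // the callback returns OK.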
+  OptimisticTransactionCallback callback(this);
+
+  DBImpl* db_impl = static_cast_with_check<DBImpl>(db_->GetRootDB());
+
+  Status s = db_impl->WriteWithCallback(
+      write_options_, GetWriteBatch()->GetWriteBatch(), &callback);
+
+  if (s.ok()) {
+    Clear();
+  }
+
+  return s;
+}
+
+Status OptimisticTransaction::CommitWithParallelValidate() {
+  auto txn_db_impl = static_cast_with_check<OptimisticTransactionDBImpl,
+                                            OptimisticTransactionDB>(txn_db_);
+  assert(txn_db_impl);
+  DBImpl* db_impl = static_cast_with_check<DBImpl>(db_->GetRootDB());
+  assert(db_impl);
+  const size_t space = txn_db_impl->GetLockBucketsSize();
+  std::set<size_t> lk_idxes;
+  std::vector<std::unique_lock<std::mutex>> lks;
+  std::unique_ptr<LockTracker::ColumnFamilyIterator> cf_it(
+      tracked_locks_->GetColumnFamilyIterator());
+  assert(cf_it != nullptr);
+  while (cf_it->HasNext()) {
+    ColumnFamilyId cf = cf_it->Next();
+    std::unique_ptr<LockTracker::KeyIterator> key_it(
+        tracked_locks_->GetKeyIterator(cf));
+    assert(key_it != nullptr);
+    while (key_it->HasNext()) {
+      const std::string& key = key_it->Next();
+      lk_idxes.insert(FastRange64(GetSliceNPHash64(key), space));
+    }
+  }
+  // NOTE: in a single txn, all bucket-locks are taken in ascending index
+  // order. Since every transaction obeys this ordering, the bucket locks
+  // cannot produce a deadlock among committing threads.
+  for (auto v : lk_idxes) {
+    lks.emplace_back(txn_db_impl->LockBucket(v));
+  }
+
+  Status s = TransactionUtil::CheckKeysForConflicts(db_impl, *tracked_locks_,
+                                                    true /* cache_only */);
+  if (!s.ok()) {
+    return s;
+  }
+
+  s = db_impl->Write(write_options_, GetWriteBatch()->GetWriteBatch());
+  if (s.ok()) {
+    Clear();
+  }
+
+  return s;
+}
+
+Status OptimisticTransaction::Rollback() {
+  Clear();
+  return Status::OK();
+}
+
+// Record this key so that we can check it for conflicts at commit time.
+//
+// 'exclusive' is unused for OptimisticTransaction.
+Status OptimisticTransaction::TryLock(ColumnFamilyHandle* column_family,
+                                      const Slice& key, bool read_only,
+                                      bool exclusive, const bool do_validate,
+                                      const bool assume_tracked) {
+  assert(!assume_tracked);  // not supported
+  (void)assume_tracked;
+  if (!do_validate) {
+    return Status::OK();
+  }
+  uint32_t cfh_id = GetColumnFamilyID(column_family);
+
+  SetSnapshotIfNeeded();
+
+  SequenceNumber seq;
+  if (snapshot_) {
+    seq = snapshot_->GetSequenceNumber();
+  } else {
+    seq = db_->GetLatestSequenceNumber();
+  }
+
+  std::string key_str = key.ToString();
+
+  TrackKey(cfh_id, key_str, seq, read_only, exclusive);
+
+  // Always return OK. Conflict checking will happen at commit time.
+  return Status::OK();
+}
+
+// Returns OK if it is safe to commit this transaction. Returns Status::Busy
+// if there are read or write conflicts that would prevent us from committing,
+// OR if we cannot determine whether there would be any such conflicts.
+//
+// Should only be called on writer thread in order to avoid any race conditions
+// in detecting write conflicts.
+Status OptimisticTransaction::CheckTransactionForConflicts(DB* db) {
+  auto db_impl = static_cast_with_check<DBImpl>(db);
+
+  // Since we are on the write thread and do not want to block other writers,
+  // we will do a cache-only conflict check. This can result in TryAgain
+  // getting returned if there is not sufficient memtable history to check
+  // for conflicts.
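+  // (cache_only == true limits the check to the memtables; if flushes have
+  // already dropped the history needed to cover this transaction's reads,
+  // the result is the inconclusive TryAgain rather than a false OK.)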
+ return TransactionUtil::CheckKeysForConflicts(db_impl, *tracked_locks_, + true /* cache_only */); +} + +Status OptimisticTransaction::SetName(const TransactionName& /* unused */) { + return Status::InvalidArgument("Optimistic transactions cannot be named."); +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/optimistic_transaction.h b/src/rocksdb/utilities/transactions/optimistic_transaction.h new file mode 100644 index 000000000..de23233d5 --- /dev/null +++ b/src/rocksdb/utilities/transactions/optimistic_transaction.h @@ -0,0 +1,101 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include <stack> +#include <string> +#include <unordered_map> +#include <vector> + +#include "db/write_callback.h" +#include "rocksdb/db.h" +#include "rocksdb/slice.h" +#include "rocksdb/snapshot.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/write_batch_with_index.h" +#include "utilities/transactions/transaction_base.h" +#include "utilities/transactions/transaction_util.h" + +namespace ROCKSDB_NAMESPACE { + +class OptimisticTransaction : public TransactionBaseImpl { + public: + OptimisticTransaction(OptimisticTransactionDB* db, + const WriteOptions& write_options, + const OptimisticTransactionOptions& txn_options); + // No copying allowed + OptimisticTransaction(const OptimisticTransaction&) = delete; + void operator=(const OptimisticTransaction&) = delete; + + virtual ~OptimisticTransaction(); + + void Reinitialize(OptimisticTransactionDB* txn_db, + const WriteOptions& write_options, + const OptimisticTransactionOptions& txn_options); + + Status Prepare() override; + + Status Commit() override; + + Status Rollback() override; + + Status SetName(const TransactionName& name) override; + + protected: + Status TryLock(ColumnFamilyHandle* column_family, const Slice& key, + bool read_only, bool exclusive, const bool do_validate = true, + const bool assume_tracked = false) override; + + private: + ROCKSDB_FIELD_UNUSED OptimisticTransactionDB* const txn_db_; + + friend class OptimisticTransactionCallback; + + void Initialize(const OptimisticTransactionOptions& txn_options); + + // Returns OK if it is safe to commit this transaction. Returns Status::Busy + // if there are read or write conflicts that would prevent us from committing + // OR if we can not determine whether there would be any such conflicts. + // + // Should only be called on writer thread. + Status CheckTransactionForConflicts(DB* db); + + void Clear() override; + + void UnlockGetForUpdate(ColumnFamilyHandle* /* unused */, + const Slice& /* unused */) override { + // Nothing to unlock. 
+  }
+
+  Status CommitWithSerialValidate();
+
+  Status CommitWithParallelValidate();
+};
+
+// Used at commit time to trigger transaction validation
+class OptimisticTransactionCallback : public WriteCallback {
+ public:
+  explicit OptimisticTransactionCallback(OptimisticTransaction* txn)
+      : txn_(txn) {}
+
+  Status Callback(DB* db) override {
+    return txn_->CheckTransactionForConflicts(db);
+  }
+
+  bool AllowWriteBatching() override { return false; }
+
+ private:
+  OptimisticTransaction* txn_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc b/src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc new file mode 100644 index 000000000..bffb3d5ed --- /dev/null +++ b/src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc @@ -0,0 +1,111 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/optimistic_transaction_db_impl.h"
+
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "utilities/transactions/optimistic_transaction.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Transaction* OptimisticTransactionDBImpl::BeginTransaction(
+    const WriteOptions& write_options,
+    const OptimisticTransactionOptions& txn_options, Transaction* old_txn) {
+  if (old_txn != nullptr) {
+    ReinitializeTransaction(old_txn, write_options, txn_options);
+    return old_txn;
+  } else {
+    return new OptimisticTransaction(this, write_options, txn_options);
+  }
+}
+
+std::unique_lock<std::mutex> OptimisticTransactionDBImpl::LockBucket(
+    size_t idx) {
+  assert(idx < bucketed_locks_.size());
+  return std::unique_lock<std::mutex>(*bucketed_locks_[idx]);
+}
+
+Status OptimisticTransactionDB::Open(const Options& options,
+                                     const std::string& dbname,
+                                     OptimisticTransactionDB** dbptr) {
+  DBOptions db_options(options);
+  ColumnFamilyOptions cf_options(options);
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+  std::vector<ColumnFamilyHandle*> handles;
+  Status s = Open(db_options, dbname, column_families, &handles, dbptr);
+  if (s.ok()) {
+    assert(handles.size() == 1);
+    // We can delete the handle since DBImpl always holds a reference to
+    // the default column family
+    delete handles[0];
+  }
+
+  return s;
+}
+
+Status OptimisticTransactionDB::Open(
+    const DBOptions& db_options, const std::string& dbname,
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    std::vector<ColumnFamilyHandle*>* handles,
+    OptimisticTransactionDB** dbptr) {
+  return OptimisticTransactionDB::Open(db_options,
+                                       OptimisticTransactionDBOptions(), dbname,
+                                       column_families, handles, dbptr);
+}
+
+Status OptimisticTransactionDB::Open(
+    const DBOptions& db_options,
+    const OptimisticTransactionDBOptions& occ_options,
+    const std::string& dbname,
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    std::vector<ColumnFamilyHandle*>* handles,
+    OptimisticTransactionDB** dbptr) {
+  Status s;
+  DB* db;
+
+  std::vector<ColumnFamilyDescriptor> column_families_copy = column_families;
+
+  // Enable MemTable History if not
already enabled + for (auto& column_family : column_families_copy) { + ColumnFamilyOptions* options = &column_family.options; + + if (options->max_write_buffer_size_to_maintain == 0 && + options->max_write_buffer_number_to_maintain == 0) { + // Setting to -1 will set the History size to + // max_write_buffer_number * write_buffer_size. + options->max_write_buffer_size_to_maintain = -1; + } + } + + s = DB::Open(db_options, dbname, column_families_copy, handles, &db); + + if (s.ok()) { + *dbptr = new OptimisticTransactionDBImpl(db, occ_options); + } + + return s; +} + +void OptimisticTransactionDBImpl::ReinitializeTransaction( + Transaction* txn, const WriteOptions& write_options, + const OptimisticTransactionOptions& txn_options) { + assert(dynamic_cast<OptimisticTransaction*>(txn) != nullptr); + auto txn_impl = reinterpret_cast<OptimisticTransaction*>(txn); + + txn_impl->Reinitialize(this, write_options, txn_options); +} + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h b/src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h new file mode 100644 index 000000000..88e86ea4a --- /dev/null +++ b/src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h @@ -0,0 +1,88 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include <algorithm> +#include <mutex> +#include <vector> + +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" + +namespace ROCKSDB_NAMESPACE { + +class OptimisticTransactionDBImpl : public OptimisticTransactionDB { + public: + explicit OptimisticTransactionDBImpl( + DB* db, const OptimisticTransactionDBOptions& occ_options, + bool take_ownership = true) + : OptimisticTransactionDB(db), + db_owner_(take_ownership), + validate_policy_(occ_options.validate_policy) { + if (validate_policy_ == OccValidationPolicy::kValidateParallel) { + uint32_t bucket_size = std::max(16u, occ_options.occ_lock_buckets); + bucketed_locks_.reserve(bucket_size); + for (size_t i = 0; i < bucket_size; ++i) { + bucketed_locks_.emplace_back( + std::unique_ptr<std::mutex>(new std::mutex)); + } + } + } + + ~OptimisticTransactionDBImpl() { + // Prevent this stackable from destroying + // base db + if (!db_owner_) { + db_ = nullptr; + } + } + + Transaction* BeginTransaction(const WriteOptions& write_options, + const OptimisticTransactionOptions& txn_options, + Transaction* old_txn) override; + + // Transactional `DeleteRange()` is not yet supported. + using StackableDB::DeleteRange; + virtual Status DeleteRange(const WriteOptions&, ColumnFamilyHandle*, + const Slice&, const Slice&) override { + return Status::NotSupported(); + } + + // Range deletions also must not be snuck into `WriteBatch`es as they are + // incompatible with `OptimisticTransactionDB`. 
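+  // (Validation is per-key: a range deletion cannot be checked against the
+  // tracked (key, sequence) pairs, hence the blanket rejection here,
+  // matching DeleteRange() above.)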
+ virtual Status Write(const WriteOptions& write_opts, + WriteBatch* batch) override { + if (batch->HasDeleteRange()) { + return Status::NotSupported(); + } + return OptimisticTransactionDB::Write(write_opts, batch); + } + + size_t GetLockBucketsSize() const { return bucketed_locks_.size(); } + + OccValidationPolicy GetValidatePolicy() const { return validate_policy_; } + + std::unique_lock<std::mutex> LockBucket(size_t idx); + + private: + // NOTE: used in validation phase. Each key is hashed into some + // bucket. We then take the lock in the hash value order to avoid deadlock. + std::vector<std::unique_ptr<std::mutex>> bucketed_locks_; + + bool db_owner_; + + const OccValidationPolicy validate_policy_; + + void ReinitializeTransaction(Transaction* txn, + const WriteOptions& write_options, + const OptimisticTransactionOptions& txn_options = + OptimisticTransactionOptions()); +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/optimistic_transaction_test.cc b/src/rocksdb/utilities/transactions/optimistic_transaction_test.cc new file mode 100644 index 000000000..aa8192c32 --- /dev/null +++ b/src/rocksdb/utilities/transactions/optimistic_transaction_test.cc @@ -0,0 +1,1491 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include <functional> +#include <string> +#include <thread> + +#include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" +#include "port/port.h" +#include "rocksdb/db.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" +#include "rocksdb/utilities/transaction.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/transaction_test_util.h" +#include "util/crc32c.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +class OptimisticTransactionTest + : public testing::Test, + public testing::WithParamInterface<OccValidationPolicy> { + public: + OptimisticTransactionDB* txn_db; + std::string dbname; + Options options; + + OptimisticTransactionTest() { + options.create_if_missing = true; + options.max_write_buffer_number = 2; + options.max_write_buffer_size_to_maintain = 2 * Arena::kInlineSize; + options.merge_operator.reset(new TestPutOperator()); + dbname = test::PerThreadDBPath("optimistic_transaction_testdb"); + + EXPECT_OK(DestroyDB(dbname, options)); + Open(); + } + ~OptimisticTransactionTest() override { + delete txn_db; + EXPECT_OK(DestroyDB(dbname, options)); + } + + void Reopen() { + delete txn_db; + txn_db = nullptr; + Open(); + } + + private: + void Open() { + ColumnFamilyOptions cf_options(options); + OptimisticTransactionDBOptions occ_opts; + occ_opts.validate_policy = GetParam(); + std::vector<ColumnFamilyDescriptor> column_families; + std::vector<ColumnFamilyHandle*> handles; + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + Status s = + OptimisticTransactionDB::Open(DBOptions(options), occ_opts, dbname, + column_families, &handles, &txn_db); + + ASSERT_OK(s); + ASSERT_NE(txn_db, nullptr); + ASSERT_EQ(handles.size(), 1); + delete handles[0]; + } +}; + +TEST_P(OptimisticTransactionTest, SuccessTest) { + WriteOptions write_options; + ReadOptions read_options; + std::string value; + + ASSERT_OK(txn_db->Put(write_options, 
Slice("foo"), Slice("bar"))); + ASSERT_OK(txn_db->Put(write_options, Slice("foo2"), Slice("bar"))); + + Transaction* txn = txn_db->BeginTransaction(write_options); + ASSERT_NE(txn, nullptr); + + ASSERT_OK(txn->GetForUpdate(read_options, "foo", &value)); + ASSERT_EQ(value, "bar"); + + ASSERT_OK(txn->Put(Slice("foo"), Slice("bar2"))); + + ASSERT_OK(txn->GetForUpdate(read_options, "foo", &value)); + ASSERT_EQ(value, "bar2"); + + ASSERT_OK(txn->Commit()); + + ASSERT_OK(txn_db->Get(read_options, "foo", &value)); + ASSERT_EQ(value, "bar2"); + + delete txn; +} + +TEST_P(OptimisticTransactionTest, WriteConflictTest) { + WriteOptions write_options; + ReadOptions read_options; + std::string value; + + ASSERT_OK(txn_db->Put(write_options, "foo", "bar")); + ASSERT_OK(txn_db->Put(write_options, "foo2", "bar")); + + Transaction* txn = txn_db->BeginTransaction(write_options); + ASSERT_NE(txn, nullptr); + + ASSERT_OK(txn->Put("foo", "bar2")); + + // This Put outside of a transaction will conflict with the previous write + ASSERT_OK(txn_db->Put(write_options, "foo", "barz")); + + ASSERT_OK(txn_db->Get(read_options, "foo", &value)); + ASSERT_EQ(value, "barz"); + ASSERT_EQ(1, txn->GetNumKeys()); + + Status s = txn->Commit(); + ASSERT_TRUE(s.IsBusy()); // Txn should not commit + + // Verify that transaction did not write anything + ASSERT_OK(txn_db->Get(read_options, "foo", &value)); + ASSERT_EQ(value, "barz"); + ASSERT_OK(txn_db->Get(read_options, "foo2", &value)); + ASSERT_EQ(value, "bar"); + + delete txn; +} + +TEST_P(OptimisticTransactionTest, WriteConflictTest2) { + WriteOptions write_options; + ReadOptions read_options; + OptimisticTransactionOptions txn_options; + std::string value; + + ASSERT_OK(txn_db->Put(write_options, "foo", "bar")); + ASSERT_OK(txn_db->Put(write_options, "foo2", "bar")); + + txn_options.set_snapshot = true; + Transaction* txn = txn_db->BeginTransaction(write_options, txn_options); + ASSERT_NE(txn, nullptr); + + // This Put outside of a transaction will conflict with a later write + ASSERT_OK(txn_db->Put(write_options, "foo", "barz")); + + ASSERT_OK(txn->Put( + "foo", "bar2")); // Conflicts with write done after snapshot taken + + ASSERT_OK(txn_db->Get(read_options, "foo", &value)); + ASSERT_EQ(value, "barz"); + + Status s = txn->Commit(); + ASSERT_TRUE(s.IsBusy()); // Txn should not commit + + // Verify that transaction did not write anything + ASSERT_OK(txn_db->Get(read_options, "foo", &value)); + ASSERT_EQ(value, "barz"); + ASSERT_OK(txn_db->Get(read_options, "foo2", &value)); + ASSERT_EQ(value, "bar"); + + delete txn; +} + +TEST_P(OptimisticTransactionTest, WriteConflictTest3) { + ASSERT_OK(txn_db->Put(WriteOptions(), "foo", "bar")); + + Transaction* txn = txn_db->BeginTransaction(WriteOptions()); + ASSERT_NE(txn, nullptr); + + std::string value; + ASSERT_OK(txn->GetForUpdate(ReadOptions(), "foo", &value)); + ASSERT_EQ(value, "bar"); + ASSERT_OK(txn->Merge("foo", "bar3")); + + // Merge outside of a transaction should conflict with the previous merge + ASSERT_OK(txn_db->Merge(WriteOptions(), "foo", "bar2")); + ASSERT_OK(txn_db->Get(ReadOptions(), "foo", &value)); + ASSERT_EQ(value, "bar2"); + + ASSERT_EQ(1, txn->GetNumKeys()); + + Status s = txn->Commit(); + EXPECT_TRUE(s.IsBusy()); // Txn should not commit + + // Verify that transaction did not write anything + ASSERT_OK(txn_db->Get(ReadOptions(), "foo", &value)); + ASSERT_EQ(value, "bar2"); + + delete txn; +} + +TEST_P(OptimisticTransactionTest, WriteConflict4) { + ASSERT_OK(txn_db->Put(WriteOptions(), "foo", "bar")); + + 
Transaction* txn = txn_db->BeginTransaction(WriteOptions()); + ASSERT_NE(txn, nullptr); + + std::string value; + ASSERT_OK(txn->GetForUpdate(ReadOptions(), "foo", &value)); + ASSERT_EQ(value, "bar"); + ASSERT_OK(txn->Merge("foo", "bar3")); + + // Range delete outside of a transaction should conflict with the previous + // merge inside txn + auto* dbimpl = static_cast_with_check<DBImpl>(txn_db->GetRootDB()); + ColumnFamilyHandle* default_cf = dbimpl->DefaultColumnFamily(); + ASSERT_OK(dbimpl->DeleteRange(WriteOptions(), default_cf, "foo", "foo1")); + Status s = txn_db->Get(ReadOptions(), "foo", &value); + ASSERT_TRUE(s.IsNotFound()); + + ASSERT_EQ(1, txn->GetNumKeys()); + + s = txn->Commit(); + EXPECT_TRUE(s.IsBusy()); // Txn should not commit + + // Verify that transaction did not write anything + s = txn_db->Get(ReadOptions(), "foo", &value); + ASSERT_TRUE(s.IsNotFound()); + + delete txn; +} + +TEST_P(OptimisticTransactionTest, ReadConflictTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + OptimisticTransactionOptions txn_options; + std::string value; + + ASSERT_OK(txn_db->Put(write_options, "foo", "bar")); + ASSERT_OK(txn_db->Put(write_options, "foo2", "bar")); + + txn_options.set_snapshot = true; + Transaction* txn = txn_db->BeginTransaction(write_options, txn_options); + ASSERT_NE(txn, nullptr); + + txn->SetSnapshot(); + snapshot_read_options.snapshot = txn->GetSnapshot(); + + ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value)); + ASSERT_EQ(value, "bar"); + + // This Put outside of a transaction will conflict with the previous read + ASSERT_OK(txn_db->Put(write_options, "foo", "barz")); + + ASSERT_OK(txn_db->Get(read_options, "foo", &value)); + ASSERT_EQ(value, "barz"); + + Status s = txn->Commit(); + ASSERT_TRUE(s.IsBusy()); // Txn should not commit + + // Verify that transaction did not write anything + ASSERT_OK(txn->GetForUpdate(read_options, "foo", &value)); + ASSERT_EQ(value, "barz"); + ASSERT_OK(txn->GetForUpdate(read_options, "foo2", &value)); + ASSERT_EQ(value, "bar"); + + delete txn; +} + +TEST_P(OptimisticTransactionTest, TxnOnlyTest) { + // Test to make sure transactions work when there are no other writes in an + // empty db. 
+ + WriteOptions write_options; + ReadOptions read_options; + std::string value; + + Transaction* txn = txn_db->BeginTransaction(write_options); + ASSERT_NE(txn, nullptr); + + ASSERT_OK(txn->Put("x", "y")); + + ASSERT_OK(txn->Commit()); + + delete txn; +} + +TEST_P(OptimisticTransactionTest, FlushTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + std::string value; + + ASSERT_OK(txn_db->Put(write_options, Slice("foo"), Slice("bar"))); + ASSERT_OK(txn_db->Put(write_options, Slice("foo2"), Slice("bar"))); + + Transaction* txn = txn_db->BeginTransaction(write_options); + ASSERT_NE(txn, nullptr); + + snapshot_read_options.snapshot = txn->GetSnapshot(); + + ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value)); + ASSERT_EQ(value, "bar"); + + ASSERT_OK(txn->Put(Slice("foo"), Slice("bar2"))); + + ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value)); + ASSERT_EQ(value, "bar2"); + + // Put a random key so we have a memtable to flush + ASSERT_OK(txn_db->Put(write_options, "dummy", "dummy")); + + // force a memtable flush + FlushOptions flush_ops; + ASSERT_OK(txn_db->Flush(flush_ops)); + + // txn should commit since the flushed table is still in MemtableList History + ASSERT_OK(txn->Commit()); + + ASSERT_OK(txn_db->Get(read_options, "foo", &value)); + ASSERT_EQ(value, "bar2"); + + delete txn; +} + +TEST_P(OptimisticTransactionTest, FlushTest2) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + std::string value; + + ASSERT_OK(txn_db->Put(write_options, Slice("foo"), Slice("bar"))); + ASSERT_OK(txn_db->Put(write_options, Slice("foo2"), Slice("bar"))); + + Transaction* txn = txn_db->BeginTransaction(write_options); + ASSERT_NE(txn, nullptr); + + snapshot_read_options.snapshot = txn->GetSnapshot(); + + ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value)); + ASSERT_EQ(value, "bar"); + + ASSERT_OK(txn->Put(Slice("foo"), Slice("bar2"))); + + ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value)); + ASSERT_EQ(value, "bar2"); + + // Put a random key so we have a MemTable to flush + ASSERT_OK(txn_db->Put(write_options, "dummy", "dummy")); + + // force a memtable flush + FlushOptions flush_ops; + ASSERT_OK(txn_db->Flush(flush_ops)); + + // Put a random key so we have a MemTable to flush + ASSERT_OK(txn_db->Put(write_options, "dummy", "dummy2")); + + // force a memtable flush + ASSERT_OK(txn_db->Flush(flush_ops)); + + ASSERT_OK(txn_db->Put(write_options, "dummy", "dummy3")); + + // force a memtable flush + // Since our test db has max_write_buffer_number=2, this flush will cause + // the first memtable to get purged from the MemtableList history. + ASSERT_OK(txn_db->Flush(flush_ops)); + + Status s = txn->Commit(); + // txn should not commit since MemTableList History is not large enough + ASSERT_TRUE(s.IsTryAgain()); + + ASSERT_OK(txn_db->Get(read_options, "foo", &value)); + ASSERT_EQ(value, "bar"); + + delete txn; +} + +// Trigger the condition where some old memtables are skipped when doing +// TransactionUtil::CheckKey(), and make sure the result is still correct. 
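+//
+// Two variants are exercised below (see the `attempt` loop): in the first,
+// Flush() moves the memtable into the MemtableList history before the
+// commit-time conflict check runs; in the second, TEST_SwitchMemtable()
+// leaves it as an immutable, not-yet-flushed memtable. In both variants
+// the conflict check must still consult that older memtable.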
+TEST_P(OptimisticTransactionTest, CheckKeySkipOldMemtable) {
+  const int kAttemptHistoryMemtable = 0;
+  const int kAttemptImmMemTable = 1;
+  for (int attempt = kAttemptHistoryMemtable; attempt <= kAttemptImmMemTable;
+       attempt++) {
+    Reopen();
+
+    WriteOptions write_options;
+    ReadOptions read_options;
+    ReadOptions snapshot_read_options;
+    ReadOptions snapshot_read_options2;
+    std::string value;
+
+    ASSERT_OK(txn_db->Put(write_options, Slice("foo"), Slice("bar")));
+    ASSERT_OK(txn_db->Put(write_options, Slice("foo2"), Slice("bar")));
+
+    Transaction* txn = txn_db->BeginTransaction(write_options);
+    ASSERT_TRUE(txn != nullptr);
+
+    Transaction* txn2 = txn_db->BeginTransaction(write_options);
+    ASSERT_TRUE(txn2 != nullptr);
+
+    snapshot_read_options.snapshot = txn->GetSnapshot();
+    ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
+    ASSERT_EQ(value, "bar");
+    ASSERT_OK(txn->Put(Slice("foo"), Slice("bar2")));
+
+    snapshot_read_options2.snapshot = txn2->GetSnapshot();
+    ASSERT_OK(txn2->GetForUpdate(snapshot_read_options2, "foo2", &value));
+    ASSERT_EQ(value, "bar");
+    ASSERT_OK(txn2->Put(Slice("foo2"), Slice("bar2")));
+
+    // txn updates "foo" and txn2 updates "foo2", and now a write is
+    // issued for "foo", which conflicts with txn but not txn2
+    ASSERT_OK(txn_db->Put(write_options, "foo", "bar"));
+
+    if (attempt == kAttemptImmMemTable) {
+      // For the second attempt, hold flush from the beginning. The memtable
+      // will be switched to immutable after calling TEST_SwitchMemtable()
+      // while CheckKey() is called.
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+          {{"OptimisticTransactionTest.CheckKeySkipOldMemtable",
+            "FlushJob::Start"}});
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+    }
+
+    // force a memtable flush. The memtable should still be kept
+    FlushOptions flush_ops;
+    if (attempt == kAttemptHistoryMemtable) {
+      ASSERT_OK(txn_db->Flush(flush_ops));
+    } else {
+      ASSERT_EQ(attempt, kAttemptImmMemTable);
+      DBImpl* db_impl = static_cast<DBImpl*>(txn_db->GetRootDB());
+      ASSERT_OK(db_impl->TEST_SwitchMemtable());
+    }
+    uint64_t num_imm_mems;
+    ASSERT_TRUE(txn_db->GetIntProperty(DB::Properties::kNumImmutableMemTable,
+                                       &num_imm_mems));
+    if (attempt == kAttemptHistoryMemtable) {
+      ASSERT_EQ(0, num_imm_mems);
+    } else {
+      ASSERT_EQ(attempt, kAttemptImmMemTable);
+      ASSERT_EQ(1, num_imm_mems);
+    }
+
+    // Put something in the active memtable
+    ASSERT_OK(txn_db->Put(write_options, Slice("foo3"), Slice("bar")));
+
+    // Create txn3 after flushing; when it is committed, only the active
+    // memtable needs to be checked.
+    Transaction* txn3 = txn_db->BeginTransaction(write_options);
+    ASSERT_TRUE(txn3 != nullptr);
+
+    // Commit both txn and txn2. txn will conflict but txn2 will pass.
+    // In both cases, both memtables are queried.
+    SetPerfLevel(PerfLevel::kEnableCount);
+
+    get_perf_context()->Reset();
+    Status s = txn->Commit();
+    // We should have checked two memtables
+    ASSERT_EQ(2, get_perf_context()->get_from_memtable_count);
+    // txn should fail because of the conflict, even though the memtable
+    // has been flushed, because it is still preserved in history.
+ ASSERT_TRUE(s.IsBusy()); + + get_perf_context()->Reset(); + s = txn2->Commit(); + // We should have checked two memtables + ASSERT_EQ(2, get_perf_context()->get_from_memtable_count); + ASSERT_TRUE(s.ok()); + + ASSERT_OK(txn3->Put(Slice("foo2"), Slice("bar2"))); + get_perf_context()->Reset(); + s = txn3->Commit(); + // txn3 is created after the active memtable is created, so that is the only + // memtable to check. + ASSERT_EQ(1, get_perf_context()->get_from_memtable_count); + ASSERT_TRUE(s.ok()); + + TEST_SYNC_POINT("OptimisticTransactionTest.CheckKeySkipOldMemtable"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + SetPerfLevel(PerfLevel::kDisable); + + delete txn; + delete txn2; + delete txn3; + } +} + +TEST_P(OptimisticTransactionTest, NoSnapshotTest) { + WriteOptions write_options; + ReadOptions read_options; + std::string value; + + ASSERT_OK(txn_db->Put(write_options, "AAA", "bar")); + + Transaction* txn = txn_db->BeginTransaction(write_options); + ASSERT_NE(txn, nullptr); + + // Modify key after transaction start + ASSERT_OK(txn_db->Put(write_options, "AAA", "bar1")); + + // Read and write without a snapshot + ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value)); + ASSERT_EQ(value, "bar1"); + ASSERT_OK(txn->Put("AAA", "bar2")); + + // Should commit since read/write was done after data changed + ASSERT_OK(txn->Commit()); + + ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value)); + ASSERT_EQ(value, "bar2"); + + delete txn; +} + +TEST_P(OptimisticTransactionTest, MultipleSnapshotTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + std::string value; + + ASSERT_OK(txn_db->Put(write_options, "AAA", "bar")); + ASSERT_OK(txn_db->Put(write_options, "BBB", "bar")); + ASSERT_OK(txn_db->Put(write_options, "CCC", "bar")); + + Transaction* txn = txn_db->BeginTransaction(write_options); + ASSERT_NE(txn, nullptr); + + ASSERT_OK(txn_db->Put(write_options, "AAA", "bar1")); + + // Read and write without a snapshot + ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value)); + ASSERT_EQ(value, "bar1"); + ASSERT_OK(txn->Put("AAA", "bar2")); + + // Modify BBB before snapshot is taken + ASSERT_OK(txn_db->Put(write_options, "BBB", "bar1")); + + txn->SetSnapshot(); + snapshot_read_options.snapshot = txn->GetSnapshot(); + + // Read and write with snapshot + ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "BBB", &value)); + ASSERT_EQ(value, "bar1"); + ASSERT_OK(txn->Put("BBB", "bar2")); + + ASSERT_OK(txn_db->Put(write_options, "CCC", "bar1")); + + // Set a new snapshot + txn->SetSnapshot(); + snapshot_read_options.snapshot = txn->GetSnapshot(); + + // Read and write with snapshot + ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "CCC", &value)); + ASSERT_EQ(value, "bar1"); + ASSERT_OK(txn->Put("CCC", "bar2")); + + ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value)); + ASSERT_EQ(value, "bar2"); + ASSERT_OK(txn->GetForUpdate(read_options, "BBB", &value)); + ASSERT_EQ(value, "bar2"); + ASSERT_OK(txn->GetForUpdate(read_options, "CCC", &value)); + ASSERT_EQ(value, "bar2"); + + ASSERT_OK(txn_db->Get(read_options, "AAA", &value)); + ASSERT_EQ(value, "bar1"); + ASSERT_OK(txn_db->Get(read_options, "BBB", &value)); + ASSERT_EQ(value, "bar1"); + ASSERT_OK(txn_db->Get(read_options, "CCC", &value)); + ASSERT_EQ(value, "bar1"); + + ASSERT_OK(txn->Commit()); + + ASSERT_OK(txn_db->Get(read_options, "AAA", &value)); + ASSERT_EQ(value, "bar2"); + ASSERT_OK(txn_db->Get(read_options, "BBB", &value)); + ASSERT_EQ(value, "bar2"); + 
ASSERT_OK(txn_db->Get(read_options, "CCC", &value));
+  ASSERT_EQ(value, "bar2");
+
+  // Verify that we track multiple writes to the same key at different
+  // snapshots.
+  delete txn;
+  txn = txn_db->BeginTransaction(write_options);
+
+  // Potentially conflicting writes
+  ASSERT_OK(txn_db->Put(write_options, "ZZZ", "zzz"));
+  ASSERT_OK(txn_db->Put(write_options, "XXX", "xxx"));
+
+  txn->SetSnapshot();
+
+  OptimisticTransactionOptions txn_options;
+  txn_options.set_snapshot = true;
+  Transaction* txn2 = txn_db->BeginTransaction(write_options, txn_options);
+  txn2->SetSnapshot();
+
+  // This should not conflict in txn since the snapshot is later than the
+  // previous write (spoiler alert: it will later conflict with txn2).
+  ASSERT_OK(txn->Put("ZZZ", "zzzz"));
+  ASSERT_OK(txn->Commit());
+
+  delete txn;
+
+  // This will conflict since the snapshot is earlier than another write to ZZZ
+  ASSERT_OK(txn2->Put("ZZZ", "xxxxx"));
+
+  Status s = txn2->Commit();
+  ASSERT_TRUE(s.IsBusy());
+
+  delete txn2;
+}
+
+TEST_P(OptimisticTransactionTest, ColumnFamiliesTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  OptimisticTransactionOptions txn_options;
+  std::string value;
+
+  ColumnFamilyHandle *cfa, *cfb;
+  ColumnFamilyOptions cf_options;
+
+  // Create 2 new column families
+  ASSERT_OK(txn_db->CreateColumnFamily(cf_options, "CFA", &cfa));
+  ASSERT_OK(txn_db->CreateColumnFamily(cf_options, "CFB", &cfb));
+
+  delete cfa;
+  delete cfb;
+  delete txn_db;
+  txn_db = nullptr;
+
+  // open DB with three column families
+  std::vector<ColumnFamilyDescriptor> column_families;
+  // have to open default column family
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions()));
+  // open the new column families
+  column_families.push_back(
+      ColumnFamilyDescriptor("CFA", ColumnFamilyOptions()));
+  column_families.push_back(
+      ColumnFamilyDescriptor("CFB", ColumnFamilyOptions()));
+  std::vector<ColumnFamilyHandle*> handles;
+  ASSERT_OK(OptimisticTransactionDB::Open(options, dbname, column_families,
+                                          &handles, &txn_db));
+  assert(txn_db != nullptr);
+  ASSERT_NE(txn_db, nullptr);
+
+  Transaction* txn = txn_db->BeginTransaction(write_options);
+  ASSERT_NE(txn, nullptr);
+
+  txn->SetSnapshot();
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  txn_options.set_snapshot = true;
+  Transaction* txn2 = txn_db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn2);
+
+  // Write some data to the db
+  WriteBatch batch;
+  ASSERT_OK(batch.Put("foo", "foo"));
+  ASSERT_OK(batch.Put(handles[1], "AAA", "bar"));
+  ASSERT_OK(batch.Put(handles[1], "AAAZZZ", "bar"));
+  ASSERT_OK(txn_db->Write(write_options, &batch));
+  ASSERT_OK(txn_db->Delete(write_options, handles[1], "AAAZZZ"));
+
+  // These keys do not conflict with existing writes since they're in
+  // different column families
+  ASSERT_OK(txn->Delete("AAA"));
+  Status s =
+      txn->GetForUpdate(snapshot_read_options, handles[1], "foo", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  Slice key_slice("AAAZZZ");
+  Slice value_slices[2] = {Slice("bar"), Slice("bar")};
+  ASSERT_OK(txn->Put(handles[2], SliceParts(&key_slice, 1),
+                     SliceParts(value_slices, 2)));
+
+  ASSERT_EQ(3, txn->GetNumKeys());
+
+  // Txn should commit
+  ASSERT_OK(txn->Commit());
+  s = txn_db->Get(read_options, "AAA", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  s = txn_db->Get(read_options, handles[2], "AAAZZZ", &value);
+  ASSERT_EQ(value, "barbar");
+
+  Slice key_slices[3] = {Slice("AAA"), Slice("ZZ"), Slice("Z")};
+  Slice 
value_slice("barbarbar"); + // This write will cause a conflict with the earlier batch write + ASSERT_OK(txn2->Put(handles[1], SliceParts(key_slices, 3), + SliceParts(&value_slice, 1))); + + ASSERT_OK(txn2->Delete(handles[2], "XXX")); + ASSERT_OK(txn2->Delete(handles[1], "XXX")); + s = txn2->GetForUpdate(snapshot_read_options, handles[1], "AAA", &value); + ASSERT_TRUE(s.IsNotFound()); + + // Verify txn did not commit + s = txn2->Commit(); + ASSERT_TRUE(s.IsBusy()); + s = txn_db->Get(read_options, handles[1], "AAAZZZ", &value); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_EQ(value, "barbar"); + + delete txn; + delete txn2; + + txn = txn_db->BeginTransaction(write_options, txn_options); + snapshot_read_options.snapshot = txn->GetSnapshot(); + + txn2 = txn_db->BeginTransaction(write_options, txn_options); + ASSERT_NE(txn, nullptr); + + std::vector<ColumnFamilyHandle*> multiget_cfh = {handles[1], handles[2], + handles[0], handles[2]}; + std::vector<Slice> multiget_keys = {"AAA", "AAAZZZ", "foo", "foo"}; + std::vector<std::string> values(4); + + std::vector<Status> results = txn->MultiGetForUpdate( + snapshot_read_options, multiget_cfh, multiget_keys, &values); + ASSERT_OK(results[0]); + ASSERT_OK(results[1]); + ASSERT_OK(results[2]); + ASSERT_TRUE(results[3].IsNotFound()); + ASSERT_EQ(values[0], "bar"); + ASSERT_EQ(values[1], "barbar"); + ASSERT_EQ(values[2], "foo"); + + ASSERT_OK(txn->Delete(handles[2], "ZZZ")); + ASSERT_OK(txn->Put(handles[2], "ZZZ", "YYY")); + ASSERT_OK(txn->Put(handles[2], "ZZZ", "YYYY")); + ASSERT_OK(txn->Delete(handles[2], "ZZZ")); + ASSERT_OK(txn->Put(handles[2], "AAAZZZ", "barbarbar")); + + ASSERT_EQ(5, txn->GetNumKeys()); + + // Txn should commit + ASSERT_OK(txn->Commit()); + s = txn_db->Get(read_options, handles[2], "ZZZ", &value); + ASSERT_TRUE(s.IsNotFound()); + + // Put a key which will conflict with the next txn using the previous snapshot + ASSERT_OK(txn_db->Put(write_options, handles[2], "foo", "000")); + + results = txn2->MultiGetForUpdate(snapshot_read_options, multiget_cfh, + multiget_keys, &values); + ASSERT_OK(results[0]); + ASSERT_OK(results[1]); + ASSERT_OK(results[2]); + ASSERT_TRUE(results[3].IsNotFound()); + ASSERT_EQ(values[0], "bar"); + ASSERT_EQ(values[1], "barbar"); + ASSERT_EQ(values[2], "foo"); + + // Verify Txn Did not Commit + s = txn2->Commit(); + ASSERT_TRUE(s.IsBusy()); + + s = txn_db->DropColumnFamily(handles[1]); + ASSERT_OK(s); + s = txn_db->DropColumnFamily(handles[2]); + ASSERT_OK(s); + + delete txn; + delete txn2; + + for (auto handle : handles) { + delete handle; + } +} + +TEST_P(OptimisticTransactionTest, EmptyTest) { + WriteOptions write_options; + ReadOptions read_options; + std::string value; + + ASSERT_OK(txn_db->Put(write_options, "aaa", "aaa")); + + Transaction* txn = txn_db->BeginTransaction(write_options); + ASSERT_OK(txn->Commit()); + delete txn; + + txn = txn_db->BeginTransaction(write_options); + ASSERT_OK(txn->Rollback()); + delete txn; + + txn = txn_db->BeginTransaction(write_options); + ASSERT_OK(txn->GetForUpdate(read_options, "aaa", &value)); + ASSERT_EQ(value, "aaa"); + + ASSERT_OK(txn->Commit()); + delete txn; + + txn = txn_db->BeginTransaction(write_options); + txn->SetSnapshot(); + ASSERT_OK(txn->GetForUpdate(read_options, "aaa", &value)); + ASSERT_EQ(value, "aaa"); + + ASSERT_OK(txn_db->Put(write_options, "aaa", "xxx")); + Status s = txn->Commit(); + ASSERT_TRUE(s.IsBusy()); + delete txn; +} + +TEST_P(OptimisticTransactionTest, PredicateManyPreceders) { + WriteOptions write_options; + ReadOptions read_options1, 
read_options2; + OptimisticTransactionOptions txn_options; + std::string value; + + txn_options.set_snapshot = true; + Transaction* txn1 = txn_db->BeginTransaction(write_options, txn_options); + read_options1.snapshot = txn1->GetSnapshot(); + + Transaction* txn2 = txn_db->BeginTransaction(write_options); + txn2->SetSnapshot(); + read_options2.snapshot = txn2->GetSnapshot(); + + std::vector<Slice> multiget_keys = {"1", "2", "3"}; + std::vector<std::string> multiget_values; + + std::vector<Status> results = + txn1->MultiGetForUpdate(read_options1, multiget_keys, &multiget_values); + ASSERT_TRUE(results[0].IsNotFound()); + ASSERT_TRUE(results[1].IsNotFound()); + ASSERT_TRUE(results[2].IsNotFound()); + + ASSERT_OK(txn2->Put("2", "x")); + + ASSERT_OK(txn2->Commit()); + + multiget_values.clear(); + results = + txn1->MultiGetForUpdate(read_options1, multiget_keys, &multiget_values); + ASSERT_TRUE(results[0].IsNotFound()); + ASSERT_TRUE(results[1].IsNotFound()); + ASSERT_TRUE(results[2].IsNotFound()); + + // should not commit since txn2 wrote a key txn has read + Status s = txn1->Commit(); + ASSERT_TRUE(s.IsBusy()); + + delete txn1; + delete txn2; + + txn1 = txn_db->BeginTransaction(write_options, txn_options); + read_options1.snapshot = txn1->GetSnapshot(); + + txn2 = txn_db->BeginTransaction(write_options, txn_options); + read_options2.snapshot = txn2->GetSnapshot(); + + ASSERT_OK(txn1->Put("4", "x")); + + ASSERT_OK(txn2->Delete("4")); + + // txn1 can commit since txn2's delete hasn't happened yet (it's just batched) + ASSERT_OK(txn1->Commit()); + + s = txn2->GetForUpdate(read_options2, "4", &value); + ASSERT_TRUE(s.IsNotFound()); + + // txn2 cannot commit since txn1 changed "4" + s = txn2->Commit(); + ASSERT_TRUE(s.IsBusy()); + + delete txn1; + delete txn2; +} + +TEST_P(OptimisticTransactionTest, LostUpdate) { + WriteOptions write_options; + ReadOptions read_options, read_options1, read_options2; + OptimisticTransactionOptions txn_options; + std::string value; + + // Test 2 transactions writing to the same key in multiple orders and + // with/without snapshots + + Transaction* txn1 = txn_db->BeginTransaction(write_options); + Transaction* txn2 = txn_db->BeginTransaction(write_options); + + ASSERT_OK(txn1->Put("1", "1")); + ASSERT_OK(txn2->Put("1", "2")); + + ASSERT_OK(txn1->Commit()); + + Status s = txn2->Commit(); + ASSERT_TRUE(s.IsBusy()); + + delete txn1; + delete txn2; + + txn_options.set_snapshot = true; + txn1 = txn_db->BeginTransaction(write_options, txn_options); + read_options1.snapshot = txn1->GetSnapshot(); + + txn2 = txn_db->BeginTransaction(write_options, txn_options); + read_options2.snapshot = txn2->GetSnapshot(); + + ASSERT_OK(txn1->Put("1", "3")); + ASSERT_OK(txn2->Put("1", "4")); + + ASSERT_OK(txn1->Commit()); + + s = txn2->Commit(); + ASSERT_TRUE(s.IsBusy()); + + delete txn1; + delete txn2; + + txn1 = txn_db->BeginTransaction(write_options, txn_options); + read_options1.snapshot = txn1->GetSnapshot(); + + txn2 = txn_db->BeginTransaction(write_options, txn_options); + read_options2.snapshot = txn2->GetSnapshot(); + + ASSERT_OK(txn1->Put("1", "5")); + ASSERT_OK(txn1->Commit()); + + ASSERT_OK(txn2->Put("1", "6")); + s = txn2->Commit(); + ASSERT_TRUE(s.IsBusy()); + + delete txn1; + delete txn2; + + txn1 = txn_db->BeginTransaction(write_options, txn_options); + read_options1.snapshot = txn1->GetSnapshot(); + + txn2 = txn_db->BeginTransaction(write_options, txn_options); + read_options2.snapshot = txn2->GetSnapshot(); + + ASSERT_OK(txn1->Put("1", "5")); + ASSERT_OK(txn1->Commit()); 
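+  // Taking a fresh snapshot after txn1 commits moves txn2's conflict window
+  // forward, so this time txn2's write to "1" will not conflict.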
+ + txn2->SetSnapshot(); + ASSERT_OK(txn2->Put("1", "6")); + ASSERT_OK(txn2->Commit()); + + delete txn1; + delete txn2; + + txn1 = txn_db->BeginTransaction(write_options); + txn2 = txn_db->BeginTransaction(write_options); + + ASSERT_OK(txn1->Put("1", "7")); + ASSERT_OK(txn1->Commit()); + + ASSERT_OK(txn2->Put("1", "8")); + ASSERT_OK(txn2->Commit()); + + delete txn1; + delete txn2; + + ASSERT_OK(txn_db->Get(read_options, "1", &value)); + ASSERT_EQ(value, "8"); +} + +TEST_P(OptimisticTransactionTest, UntrackedWrites) { + WriteOptions write_options; + ReadOptions read_options; + std::string value; + Status s; + + // Verify transaction rollback works for untracked keys. + Transaction* txn = txn_db->BeginTransaction(write_options); + ASSERT_OK(txn->PutUntracked("untracked", "0")); + ASSERT_OK(txn->Rollback()); + s = txn_db->Get(read_options, "untracked", &value); + ASSERT_TRUE(s.IsNotFound()); + + delete txn; + txn = txn_db->BeginTransaction(write_options); + + ASSERT_OK(txn->Put("tracked", "1")); + ASSERT_OK(txn->PutUntracked("untracked", "1")); + ASSERT_OK(txn->MergeUntracked("untracked", "2")); + ASSERT_OK(txn->DeleteUntracked("untracked")); + + // Write to the untracked key outside of the transaction and verify + // it doesn't prevent the transaction from committing. + ASSERT_OK(txn_db->Put(write_options, "untracked", "x")); + + ASSERT_OK(txn->Commit()); + + s = txn_db->Get(read_options, "untracked", &value); + ASSERT_TRUE(s.IsNotFound()); + + delete txn; + txn = txn_db->BeginTransaction(write_options); + + ASSERT_OK(txn->Put("tracked", "10")); + ASSERT_OK(txn->PutUntracked("untracked", "A")); + + // Write to tracked key outside of the transaction and verify that the + // untracked keys are not written when the commit fails. + ASSERT_OK(txn_db->Delete(write_options, "tracked")); + + s = txn->Commit(); + ASSERT_TRUE(s.IsBusy()); + + s = txn_db->Get(read_options, "untracked", &value); + ASSERT_TRUE(s.IsNotFound()); + + delete txn; +} + +TEST_P(OptimisticTransactionTest, IteratorTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + OptimisticTransactionOptions txn_options; + std::string value; + + // Write some keys to the db + ASSERT_OK(txn_db->Put(write_options, "A", "a")); + ASSERT_OK(txn_db->Put(write_options, "G", "g")); + ASSERT_OK(txn_db->Put(write_options, "F", "f")); + ASSERT_OK(txn_db->Put(write_options, "C", "c")); + ASSERT_OK(txn_db->Put(write_options, "D", "d")); + + Transaction* txn = txn_db->BeginTransaction(write_options); + ASSERT_NE(txn, nullptr); + + // Write some keys in a txn + ASSERT_OK(txn->Put("B", "b")); + ASSERT_OK(txn->Put("H", "h")); + ASSERT_OK(txn->Delete("D")); + ASSERT_OK(txn->Put("E", "e")); + + txn->SetSnapshot(); + const Snapshot* snapshot = txn->GetSnapshot(); + + // Write some keys to the db after the snapshot + ASSERT_OK(txn_db->Put(write_options, "BB", "xx")); + ASSERT_OK(txn_db->Put(write_options, "C", "xx")); + + read_options.snapshot = snapshot; + Iterator* iter = txn->GetIterator(read_options); + ASSERT_OK(iter->status()); + iter->SeekToFirst(); + + // Read all keys via iter and lock them all + std::string results[] = {"a", "b", "c", "e", "f", "g", "h"}; + for (int i = 0; i < 7; i++) { + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(results[i], iter->value().ToString()); + + ASSERT_OK(txn->GetForUpdate(read_options, iter->key(), nullptr)); + + iter->Next(); + } + ASSERT_FALSE(iter->Valid()); + + iter->Seek("G"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("g", 
iter->value().ToString()); + + iter->Prev(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("f", iter->value().ToString()); + + iter->Seek("D"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("e", iter->value().ToString()); + + iter->Seek("C"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("c", iter->value().ToString()); + + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("e", iter->value().ToString()); + + iter->Seek(""); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a", iter->value().ToString()); + + iter->Seek("X"); + ASSERT_OK(iter->status()); + ASSERT_FALSE(iter->Valid()); + + iter->SeekToLast(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("h", iter->value().ToString()); + + // key "C" was modified in the db after txn's snapshot. txn will not commit. + Status s = txn->Commit(); + ASSERT_TRUE(s.IsBusy()); + + delete iter; + delete txn; +} + +TEST_P(OptimisticTransactionTest, DeleteRangeSupportTest) { + // `OptimisticTransactionDB` does not allow range deletion in any API. + ASSERT_TRUE( + txn_db + ->DeleteRange(WriteOptions(), txn_db->DefaultColumnFamily(), "a", "b") + .IsNotSupported()); + WriteBatch wb; + ASSERT_OK(wb.DeleteRange("a", "b")); + ASSERT_NOK(txn_db->Write(WriteOptions(), &wb)); +} + +TEST_P(OptimisticTransactionTest, SavepointTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + OptimisticTransactionOptions txn_options; + std::string value; + + Transaction* txn = txn_db->BeginTransaction(write_options); + ASSERT_NE(txn, nullptr); + + Status s = txn->RollbackToSavePoint(); + ASSERT_TRUE(s.IsNotFound()); + + txn->SetSavePoint(); // 1 + + ASSERT_OK(txn->RollbackToSavePoint()); // Rollback to beginning of txn + s = txn->RollbackToSavePoint(); + ASSERT_TRUE(s.IsNotFound()); + + ASSERT_OK(txn->Put("B", "b")); + + ASSERT_OK(txn->Commit()); + + ASSERT_OK(txn_db->Get(read_options, "B", &value)); + ASSERT_EQ("b", value); + + delete txn; + txn = txn_db->BeginTransaction(write_options); + ASSERT_NE(txn, nullptr); + + ASSERT_OK(txn->Put("A", "a")); + ASSERT_OK(txn->Put("B", "bb")); + ASSERT_OK(txn->Put("C", "c")); + + txn->SetSavePoint(); // 2 + + ASSERT_OK(txn->Delete("B")); + ASSERT_OK(txn->Put("C", "cc")); + ASSERT_OK(txn->Put("D", "d")); + + ASSERT_OK(txn->RollbackToSavePoint()); // Rollback to 2 + + ASSERT_OK(txn->Get(read_options, "A", &value)); + ASSERT_EQ("a", value); + ASSERT_OK(txn->Get(read_options, "B", &value)); + ASSERT_EQ("bb", value); + ASSERT_OK(txn->Get(read_options, "C", &value)); + ASSERT_EQ("c", value); + s = txn->Get(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + + ASSERT_OK(txn->Put("A", "a")); + ASSERT_OK(txn->Put("E", "e")); + + // Rollback to beginning of txn + s = txn->RollbackToSavePoint(); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_OK(txn->Rollback()); + + s = txn->Get(read_options, "A", &value); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_OK(txn->Get(read_options, "B", &value)); + ASSERT_EQ("b", value); + s = txn->Get(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + s = txn->Get(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + s = txn->Get(read_options, "E", &value); + ASSERT_TRUE(s.IsNotFound()); + + ASSERT_OK(txn->Put("A", "aa")); + ASSERT_OK(txn->Put("F", "f")); + + txn->SetSavePoint(); // 3 + txn->SetSavePoint(); // 4 + + ASSERT_OK(txn->Put("G", "g")); + ASSERT_OK(txn->Delete("F")); + ASSERT_OK(txn->Delete("B")); + + 
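+  // Expected view on top of savepoint 4: A="aa" and G="g"; F and B are
+  // deleted.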
ASSERT_OK(txn->Get(read_options, "A", &value));
+  ASSERT_EQ("aa", value);
+
+  s = txn->Get(read_options, "F", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Get(read_options, "B", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  ASSERT_OK(txn->RollbackToSavePoint());  // Rollback to 3
+
+  ASSERT_OK(txn->Get(read_options, "F", &value));
+  ASSERT_EQ("f", value);
+
+  s = txn->Get(read_options, "G", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  ASSERT_OK(txn->Commit());
+
+  ASSERT_OK(txn_db->Get(read_options, "F", &value));
+  ASSERT_EQ("f", value);
+
+  s = txn_db->Get(read_options, "G", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  ASSERT_OK(txn_db->Get(read_options, "A", &value));
+  ASSERT_EQ("aa", value);
+
+  ASSERT_OK(txn_db->Get(read_options, "B", &value));
+  ASSERT_EQ("b", value);
+
+  s = txn_db->Get(read_options, "C", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn_db->Get(read_options, "D", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn_db->Get(read_options, "E", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn;
+}
+
+TEST_P(OptimisticTransactionTest, UndoGetForUpdateTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  OptimisticTransactionOptions txn_options;
+  std::string value;
+
+  ASSERT_OK(txn_db->Put(write_options, "A", ""));
+
+  Transaction* txn1 = txn_db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn1);
+
+  ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
+
+  txn1->UndoGetForUpdate("A");
+
+  Transaction* txn2 = txn_db->BeginTransaction(write_options);
+  ASSERT_OK(txn2->Put("A", "x"));
+  ASSERT_OK(txn2->Commit());
+  delete txn2;
+
+  // Verify that txn1 can commit since A isn't conflict checked
+  ASSERT_OK(txn1->Commit());
+  delete txn1;
+
+  txn1 = txn_db->BeginTransaction(write_options);
+  ASSERT_OK(txn1->Put("A", "a"));
+
+  ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
+
+  txn1->UndoGetForUpdate("A");
+
+  txn2 = txn_db->BeginTransaction(write_options);
+  ASSERT_OK(txn2->Put("A", "x"));
+  ASSERT_OK(txn2->Commit());
+  delete txn2;
+
+  // Verify that txn1 cannot commit since A will still be conflict checked
+  Status s = txn1->Commit();
+  ASSERT_TRUE(s.IsBusy());
+  delete txn1;
+
+  txn1 = txn_db->BeginTransaction(write_options);
+
+  ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
+  ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
+
+  txn1->UndoGetForUpdate("A");
+
+  txn2 = txn_db->BeginTransaction(write_options);
+  ASSERT_OK(txn2->Put("A", "x"));
+  ASSERT_OK(txn2->Commit());
+  delete txn2;
+
+  // Verify that txn1 cannot commit since A will still be conflict checked
+  s = txn1->Commit();
+  ASSERT_TRUE(s.IsBusy());
+  delete txn1;
+
+  txn1 = txn_db->BeginTransaction(write_options);
+
+  ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
+  ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
+
+  txn1->UndoGetForUpdate("A");
+  txn1->UndoGetForUpdate("A");
+
+  txn2 = txn_db->BeginTransaction(write_options);
+  ASSERT_OK(txn2->Put("A", "x"));
+  ASSERT_OK(txn2->Commit());
+  delete txn2;
+
+  // Verify that txn1 can commit since A isn't conflict checked
+  ASSERT_OK(txn1->Commit());
+  delete txn1;
+
+  txn1 = txn_db->BeginTransaction(write_options);
+
+  ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
+
+  txn1->SetSavePoint();
+  txn1->UndoGetForUpdate("A");
+
+  txn2 = txn_db->BeginTransaction(write_options);
+  ASSERT_OK(txn2->Put("A", "x"));
+  ASSERT_OK(txn2->Commit());
+  delete txn2;
+
+  // Verify that txn1 cannot commit since A will still be conflict checked
+  s = txn1->Commit();
+  
ASSERT_TRUE(s.IsBusy()); + delete txn1; + + txn1 = txn_db->BeginTransaction(write_options); + + ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value)); + + txn1->SetSavePoint(); + ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value)); + txn1->UndoGetForUpdate("A"); + + txn2 = txn_db->BeginTransaction(write_options); + ASSERT_OK(txn2->Put("A", "x")); + ASSERT_OK(txn2->Commit()); + delete txn2; + + // Verify that txn1 cannot commit since A will still be conflict checked + s = txn1->Commit(); + ASSERT_TRUE(s.IsBusy()); + delete txn1; + + txn1 = txn_db->BeginTransaction(write_options); + + ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value)); + + txn1->SetSavePoint(); + ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value)); + txn1->UndoGetForUpdate("A"); + + ASSERT_OK(txn1->RollbackToSavePoint()); + txn1->UndoGetForUpdate("A"); + + txn2 = txn_db->BeginTransaction(write_options); + ASSERT_OK(txn2->Put("A", "x")); + ASSERT_OK(txn2->Commit()); + delete txn2; + + // Verify that txn1 can commit since A isn't conflict checked + ASSERT_OK(txn1->Commit()); + delete txn1; +} + +namespace { +Status OptimisticTransactionStressTestInserter(OptimisticTransactionDB* db, + const size_t num_transactions, + const size_t num_sets, + const size_t num_keys_per_set) { + size_t seed = std::hash<std::thread::id>()(std::this_thread::get_id()); + Random64 _rand(seed); + WriteOptions write_options; + ReadOptions read_options; + OptimisticTransactionOptions txn_options; + txn_options.set_snapshot = true; + + RandomTransactionInserter inserter(&_rand, write_options, read_options, + num_keys_per_set, + static_cast<uint16_t>(num_sets)); + + for (size_t t = 0; t < num_transactions; t++) { + bool success = inserter.OptimisticTransactionDBInsert(db, txn_options); + if (!success) { + // unexpected failure + return inserter.GetLastStatus(); + } + } + + inserter.GetLastStatus().PermitUncheckedError(); + + // Make sure at least some of the transactions succeeded. It's ok if + // some failed due to write-conflicts. + if (inserter.GetFailureCount() > num_transactions / 2) { + return Status::TryAgain("Too many transactions failed! " + + std::to_string(inserter.GetFailureCount()) + " / " + + std::to_string(num_transactions)); + } + + return Status::OK(); +} +} // namespace + +TEST_P(OptimisticTransactionTest, OptimisticTransactionStressTest) { + const size_t num_threads = 4; + const size_t num_transactions_per_thread = 10000; + const size_t num_sets = 3; + const size_t num_keys_per_set = 100; + // Setting the key-space to be 100 keys should cause enough write-conflicts + // to make this test interesting. + + std::vector<port::Thread> threads; + + std::function<void()> call_inserter = [&] { + ASSERT_OK(OptimisticTransactionStressTestInserter( + txn_db, num_transactions_per_thread, num_sets, num_keys_per_set)); + }; + + // Create N threads that use RandomTransactionInserter to write + // many transactions. 
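+  // Each thread runs num_transactions_per_thread optimistic transactions;
+  // the helper above tolerates conflict-induced failures and only reports
+  // an error if more than half of a thread's transactions fail.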
+ for (uint32_t i = 0; i < num_threads; i++) { + threads.emplace_back(call_inserter); + } + + // Wait for all threads to run + for (auto& t : threads) { + t.join(); + } + + // Verify that data is consistent + Status s = RandomTransactionInserter::Verify(txn_db, num_sets); + ASSERT_OK(s); +} + +TEST_P(OptimisticTransactionTest, SequenceNumberAfterRecoverTest) { + WriteOptions write_options; + OptimisticTransactionOptions transaction_options; + + Transaction* transaction( + txn_db->BeginTransaction(write_options, transaction_options)); + Status s = transaction->Put("foo", "val"); + ASSERT_OK(s); + s = transaction->Put("foo2", "val"); + ASSERT_OK(s); + s = transaction->Put("foo3", "val"); + ASSERT_OK(s); + s = transaction->Commit(); + ASSERT_OK(s); + delete transaction; + + Reopen(); + transaction = txn_db->BeginTransaction(write_options, transaction_options); + s = transaction->Put("bar", "val"); + ASSERT_OK(s); + s = transaction->Put("bar2", "val"); + ASSERT_OK(s); + s = transaction->Commit(); + ASSERT_OK(s); + + delete transaction; +} + +TEST_P(OptimisticTransactionTest, TimestampedSnapshotMissingCommitTs) { + std::unique_ptr<Transaction> txn(txn_db->BeginTransaction(WriteOptions())); + ASSERT_OK(txn->Put("a", "v")); + Status s = txn->CommitAndTryCreateSnapshot(); + ASSERT_TRUE(s.IsInvalidArgument()); +} + +TEST_P(OptimisticTransactionTest, TimestampedSnapshotSetCommitTs) { + std::unique_ptr<Transaction> txn(txn_db->BeginTransaction(WriteOptions())); + ASSERT_OK(txn->Put("a", "v")); + std::shared_ptr<const Snapshot> snapshot; + Status s = txn->CommitAndTryCreateSnapshot(nullptr, /*ts=*/100, &snapshot); + ASSERT_TRUE(s.IsNotSupported()); +} + +INSTANTIATE_TEST_CASE_P( + InstanceOccGroup, OptimisticTransactionTest, + testing::Values(OccValidationPolicy::kValidateSerial, + OccValidationPolicy::kValidateParallel)); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include <stdio.h> + +int main(int /*argc*/, char** /*argv*/) { + fprintf( + stderr, + "SKIPPED as optimistic_transaction is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/pessimistic_transaction.cc b/src/rocksdb/utilities/transactions/pessimistic_transaction.cc new file mode 100644 index 000000000..cb8fd3bb6 --- /dev/null +++ b/src/rocksdb/utilities/transactions/pessimistic_transaction.cc @@ -0,0 +1,1175 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
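+//
+// A minimal usage sketch for the write-committed pessimistic transactions
+// implemented in this file (public API declared in
+// rocksdb/utilities/transaction_db.h; dbname is a placeholder and error
+// handling is elided):
+//
+//   TransactionDB* db = nullptr;
+//   TransactionDB::Open(Options(), TransactionDBOptions(), dbname, &db);
+//   Transaction* txn = db->BeginTransaction(WriteOptions());
+//   std::string value;
+//   txn->GetForUpdate(ReadOptions(), "key", &value);  // locks "key"
+//   txn->Put("key", "new_value");
+//   txn->Commit();  // locks are released on commit/rollback/destruction
+//   delete txn;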
+ +#ifndef ROCKSDB_LITE + +#include "utilities/transactions/pessimistic_transaction.h" + +#include <map> +#include <set> +#include <string> +#include <vector> + +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" +#include "logging/logging.h" +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/snapshot.h" +#include "rocksdb/status.h" +#include "rocksdb/utilities/transaction_db.h" +#include "test_util/sync_point.h" +#include "util/cast_util.h" +#include "util/string_util.h" +#include "utilities/transactions/pessimistic_transaction_db.h" +#include "utilities/transactions/transaction_util.h" +#include "utilities/write_batch_with_index/write_batch_with_index_internal.h" + +namespace ROCKSDB_NAMESPACE { + +struct WriteOptions; + +std::atomic<TransactionID> PessimisticTransaction::txn_id_counter_(1); + +TransactionID PessimisticTransaction::GenTxnID() { + return txn_id_counter_.fetch_add(1); +} + +PessimisticTransaction::PessimisticTransaction( + TransactionDB* txn_db, const WriteOptions& write_options, + const TransactionOptions& txn_options, const bool init) + : TransactionBaseImpl( + txn_db->GetRootDB(), write_options, + static_cast_with_check<PessimisticTransactionDB>(txn_db) + ->GetLockTrackerFactory()), + txn_db_impl_(nullptr), + expiration_time_(0), + txn_id_(0), + waiting_cf_id_(0), + waiting_key_(nullptr), + lock_timeout_(0), + deadlock_detect_(false), + deadlock_detect_depth_(0), + skip_concurrency_control_(false) { + txn_db_impl_ = static_cast_with_check<PessimisticTransactionDB>(txn_db); + db_impl_ = static_cast_with_check<DBImpl>(db_); + if (init) { + Initialize(txn_options); + } +} + +void PessimisticTransaction::Initialize(const TransactionOptions& txn_options) { + // Range lock manager uses address of transaction object as TXNID + const TransactionDBOptions& db_options = txn_db_impl_->GetTxnDBOptions(); + if (db_options.lock_mgr_handle && + db_options.lock_mgr_handle->getLockManager()->IsRangeLockSupported()) { + txn_id_ = reinterpret_cast<TransactionID>(this); + } else { + txn_id_ = GenTxnID(); + } + + txn_state_ = STARTED; + + deadlock_detect_ = txn_options.deadlock_detect; + deadlock_detect_depth_ = txn_options.deadlock_detect_depth; + write_batch_.SetMaxBytes(txn_options.max_write_batch_size); + skip_concurrency_control_ = txn_options.skip_concurrency_control; + + lock_timeout_ = txn_options.lock_timeout * 1000; + if (lock_timeout_ < 0) { + // Lock timeout not set, use default + lock_timeout_ = + txn_db_impl_->GetTxnDBOptions().transaction_lock_timeout * 1000; + } + + if (txn_options.expiration >= 0) { + expiration_time_ = start_time_ + txn_options.expiration * 1000; + } else { + expiration_time_ = 0; + } + + if (txn_options.set_snapshot) { + SetSnapshot(); + } + + if (expiration_time_ > 0) { + txn_db_impl_->InsertExpirableTransaction(txn_id_, this); + } + use_only_the_last_commit_time_batch_for_recovery_ = + txn_options.use_only_the_last_commit_time_batch_for_recovery; + skip_prepare_ = txn_options.skip_prepare; + + read_timestamp_ = kMaxTxnTimestamp; + commit_timestamp_ = kMaxTxnTimestamp; +} + +PessimisticTransaction::~PessimisticTransaction() { + txn_db_impl_->UnLock(this, *tracked_locks_); + if (expiration_time_ > 0) { + txn_db_impl_->RemoveExpirableTransaction(txn_id_); + } + if (!name_.empty() && txn_state_ != COMMITTED) { + txn_db_impl_->UnregisterTransaction(this); + } +} + +void PessimisticTransaction::Clear() { + txn_db_impl_->UnLock(this, *tracked_locks_); + TransactionBaseImpl::Clear(); +} + +void 
PessimisticTransaction::Reinitialize( + TransactionDB* txn_db, const WriteOptions& write_options, + const TransactionOptions& txn_options) { + if (!name_.empty() && txn_state_ != COMMITTED) { + txn_db_impl_->UnregisterTransaction(this); + } + TransactionBaseImpl::Reinitialize(txn_db->GetRootDB(), write_options); + Initialize(txn_options); +} + +bool PessimisticTransaction::IsExpired() const { + if (expiration_time_ > 0) { + if (dbimpl_->GetSystemClock()->NowMicros() >= expiration_time_) { + // Transaction is expired. + return true; + } + } + + return false; +} + +WriteCommittedTxn::WriteCommittedTxn(TransactionDB* txn_db, + const WriteOptions& write_options, + const TransactionOptions& txn_options) + : PessimisticTransaction(txn_db, write_options, txn_options) {} + +Status WriteCommittedTxn::GetForUpdate(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, std::string* value, + bool exclusive, const bool do_validate) { + return GetForUpdateImpl(read_options, column_family, key, value, exclusive, + do_validate); +} + +Status WriteCommittedTxn::GetForUpdate(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, + PinnableSlice* pinnable_val, + bool exclusive, const bool do_validate) { + return GetForUpdateImpl(read_options, column_family, key, pinnable_val, + exclusive, do_validate); +} + +template <typename TValue> +inline Status WriteCommittedTxn::GetForUpdateImpl( + const ReadOptions& read_options, ColumnFamilyHandle* column_family, + const Slice& key, TValue* value, bool exclusive, const bool do_validate) { + column_family = + column_family ? column_family : db_impl_->DefaultColumnFamily(); + assert(column_family); + if (!read_options.timestamp) { + const Comparator* const ucmp = column_family->GetComparator(); + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); + if (0 == ts_sz) { + return TransactionBaseImpl::GetForUpdate(read_options, column_family, key, + value, exclusive, do_validate); + } + } else { + Status s = db_impl_->FailIfTsMismatchCf( + column_family, *(read_options.timestamp), /*ts_for_read=*/true); + if (!s.ok()) { + return s; + } + } + + if (!do_validate) { + return Status::InvalidArgument( + "If do_validate is false then GetForUpdate with read_timestamp is not " + "defined."); + } else if (kMaxTxnTimestamp == read_timestamp_) { + return Status::InvalidArgument("read_timestamp must be set for validation"); + } + + if (!read_options.timestamp) { + ReadOptions read_opts_copy = read_options; + char ts_buf[sizeof(kMaxTxnTimestamp)]; + EncodeFixed64(ts_buf, read_timestamp_); + Slice ts(ts_buf, sizeof(ts_buf)); + read_opts_copy.timestamp = &ts; + return TransactionBaseImpl::GetForUpdate(read_opts_copy, column_family, key, + value, exclusive, do_validate); + } + assert(read_options.timestamp); + const char* const ts_buf = read_options.timestamp->data(); + assert(read_options.timestamp->size() == sizeof(kMaxTxnTimestamp)); + TxnTimestamp ts = DecodeFixed64(ts_buf); + if (ts != read_timestamp_) { + return Status::InvalidArgument("Must read from the same read_timestamp"); + } + return TransactionBaseImpl::GetForUpdate(read_options, column_family, key, + value, exclusive, do_validate); +} + +Status WriteCommittedTxn::Put(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + return Operate(column_family, key, do_validate, assume_tracked, + [column_family, &key, &value, this]() { + Status s = + 
GetBatchForWrite()->Put(column_family, key, value); + if (s.ok()) { + ++num_puts_; + } + return s; + }); +} + +Status WriteCommittedTxn::Put(ColumnFamilyHandle* column_family, + const SliceParts& key, const SliceParts& value, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + return Operate(column_family, key, do_validate, assume_tracked, + [column_family, &key, &value, this]() { + Status s = + GetBatchForWrite()->Put(column_family, key, value); + if (s.ok()) { + ++num_puts_; + } + return s; + }); +} + +Status WriteCommittedTxn::PutUntracked(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) { + return Operate( + column_family, key, /*do_validate=*/false, + /*assume_tracked=*/false, [column_family, &key, &value, this]() { + Status s = GetBatchForWrite()->Put(column_family, key, value); + if (s.ok()) { + ++num_puts_; + } + return s; + }); +} + +Status WriteCommittedTxn::PutUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key, + const SliceParts& value) { + return Operate( + column_family, key, /*do_validate=*/false, + /*assume_tracked=*/false, [column_family, &key, &value, this]() { + Status s = GetBatchForWrite()->Put(column_family, key, value); + if (s.ok()) { + ++num_puts_; + } + return s; + }); +} + +Status WriteCommittedTxn::Delete(ColumnFamilyHandle* column_family, + const Slice& key, const bool assume_tracked) { + const bool do_validate = !assume_tracked; + return Operate(column_family, key, do_validate, assume_tracked, + [column_family, &key, this]() { + Status s = GetBatchForWrite()->Delete(column_family, key); + if (s.ok()) { + ++num_deletes_; + } + return s; + }); +} + +Status WriteCommittedTxn::Delete(ColumnFamilyHandle* column_family, + const SliceParts& key, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + return Operate(column_family, key, do_validate, assume_tracked, + [column_family, &key, this]() { + Status s = GetBatchForWrite()->Delete(column_family, key); + if (s.ok()) { + ++num_deletes_; + } + return s; + }); +} + +Status WriteCommittedTxn::DeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) { + return Operate(column_family, key, /*do_validate=*/false, + /*assume_tracked=*/false, [column_family, &key, this]() { + Status s = GetBatchForWrite()->Delete(column_family, key); + if (s.ok()) { + ++num_deletes_; + } + return s; + }); +} + +Status WriteCommittedTxn::DeleteUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key) { + return Operate(column_family, key, /*do_validate=*/false, + /*assume_tracked=*/false, [column_family, &key, this]() { + Status s = GetBatchForWrite()->Delete(column_family, key); + if (s.ok()) { + ++num_deletes_; + } + return s; + }); +} + +Status WriteCommittedTxn::SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + return Operate(column_family, key, do_validate, assume_tracked, + [column_family, &key, this]() { + Status s = + GetBatchForWrite()->SingleDelete(column_family, key); + if (s.ok()) { + ++num_deletes_; + } + return s; + }); +} + +Status WriteCommittedTxn::SingleDelete(ColumnFamilyHandle* column_family, + const SliceParts& key, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + return Operate(column_family, key, do_validate, assume_tracked, + [column_family, &key, this]() { + Status s = + GetBatchForWrite()->SingleDelete(column_family, key); + if (s.ok()) { + ++num_deletes_; + } + return s; + }); +} 
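+
+// Note: the *Untracked write variants below differ from their tracked
+// counterparts only in passing do_validate=false and assume_tracked=false
+// down to Operate()/TryLock(); the key is still locked, but validation
+// against the transaction's snapshot is skipped, so callers take
+// responsibility for conflicts on such keys.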
+
+Status WriteCommittedTxn::SingleDeleteUntracked(
+    ColumnFamilyHandle* column_family, const Slice& key) {
+  return Operate(column_family, key, /*do_validate=*/false,
+                 /*assume_tracked=*/false, [column_family, &key, this]() {
+                   Status s =
+                       GetBatchForWrite()->SingleDelete(column_family, key);
+                   if (s.ok()) {
+                     ++num_deletes_;
+                   }
+                   return s;
+                 });
+}
+
+Status WriteCommittedTxn::Merge(ColumnFamilyHandle* column_family,
+                                const Slice& key, const Slice& value,
+                                const bool assume_tracked) {
+  const bool do_validate = !assume_tracked;
+  return Operate(column_family, key, do_validate, assume_tracked,
+                 [column_family, &key, &value, this]() {
+                   Status s =
+                       GetBatchForWrite()->Merge(column_family, key, value);
+                   if (s.ok()) {
+                     ++num_merges_;
+                   }
+                   return s;
+                 });
+}
+
+template <typename TKey, typename TOperation>
+Status WriteCommittedTxn::Operate(ColumnFamilyHandle* column_family,
+                                  const TKey& key, const bool do_validate,
+                                  const bool assume_tracked,
+                                  TOperation&& operation) {
+  Status s;
+  if constexpr (std::is_same_v<Slice, TKey>) {
+    s = TryLock(column_family, key, /*read_only=*/false, /*exclusive=*/true,
+                do_validate, assume_tracked);
+  } else if constexpr (std::is_same_v<SliceParts, TKey>) {
+    std::string key_buf;
+    Slice contiguous_key(key, &key_buf);
+    s = TryLock(column_family, contiguous_key, /*read_only=*/false,
+                /*exclusive=*/true, do_validate, assume_tracked);
+  }
+  if (!s.ok()) {
+    return s;
+  }
+  column_family =
+      column_family ? column_family : db_impl_->DefaultColumnFamily();
+  assert(column_family);
+  const Comparator* const ucmp = column_family->GetComparator();
+  assert(ucmp);
+  size_t ts_sz = ucmp->timestamp_size();
+  if (ts_sz > 0) {
+    assert(ts_sz == sizeof(TxnTimestamp));
+    if (!IndexingEnabled()) {
+      cfs_with_ts_tracked_when_indexing_disabled_.insert(
+          column_family->GetID());
+    }
+  }
+  return operation();
+}
+
+Status WriteCommittedTxn::SetReadTimestampForValidation(TxnTimestamp ts) {
+  if (read_timestamp_ < kMaxTxnTimestamp && ts < read_timestamp_) {
+    return Status::InvalidArgument(
+        "Cannot decrease read timestamp for validation");
+  }
+  read_timestamp_ = ts;
+  return Status::OK();
+}
+
+Status WriteCommittedTxn::SetCommitTimestamp(TxnTimestamp ts) {
+  if (read_timestamp_ < kMaxTxnTimestamp && ts <= read_timestamp_) {
+    return Status::InvalidArgument(
+        "Cannot commit at timestamp smaller than or equal to read timestamp");
+  }
+  commit_timestamp_ = ts;
+  return Status::OK();
+}
+
+Status PessimisticTransaction::CommitBatch(WriteBatch* batch) {
+  if (batch && WriteBatchInternal::HasKeyWithTimestamp(*batch)) {
+    // CommitBatch() needs to lock the keys in the batch.
+    // However, the application also needs to specify the timestamp for the
+    // keys in the batch before calling this API.
+    // This means timestamp order may violate the order of locking, thus
+    // violating the sequence-number order for the same user key.
+    // Therefore, we disallow this operation for now.
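+    // For example (hypothetical interleaving): txn A stamps key K with
+    // ts=10 and txn B stamps K with ts=20, but B acquires K's lock first
+    // and commits first. K would then carry (seq1, ts=20) followed by
+    // (seq2 > seq1, ts=10), i.e. timestamps decreasing in sequence order.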
+    return Status::NotSupported(
+        "Batch to commit includes timestamp assigned before locking");
+  }
+
+  std::unique_ptr<LockTracker> keys_to_unlock(lock_tracker_factory_.Create());
+  Status s = LockBatch(batch, keys_to_unlock.get());
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  bool can_commit = false;
+
+  if (IsExpired()) {
+    s = Status::Expired();
+  } else if (expiration_time_ > 0) {
+    TransactionState expected = STARTED;
+    can_commit = std::atomic_compare_exchange_strong(&txn_state_, &expected,
+                                                     AWAITING_COMMIT);
+  } else if (txn_state_ == STARTED) {
+    // lock stealing is not a concern
+    can_commit = true;
+  }
+
+  if (can_commit) {
+    txn_state_.store(AWAITING_COMMIT);
+    s = CommitBatchInternal(batch);
+    if (s.ok()) {
+      txn_state_.store(COMMITTED);
+    }
+  } else if (txn_state_ == LOCKS_STOLEN) {
+    s = Status::Expired();
+  } else {
+    s = Status::InvalidArgument("Transaction is not in state for commit.");
+  }
+
+  txn_db_impl_->UnLock(this, *keys_to_unlock);
+
+  return s;
+}
+
+Status PessimisticTransaction::Prepare() {
+  if (name_.empty()) {
+    return Status::InvalidArgument(
+        "Cannot prepare a transaction that has not been named.");
+  }
+
+  if (IsExpired()) {
+    return Status::Expired();
+  }
+
+  Status s;
+  bool can_prepare = false;
+
+  if (expiration_time_ > 0) {
+    // must concern ourselves with expiration and/or lock stealing
+    // need to compare/exchange because locks could be stolen out from
+    // under us here
+    TransactionState expected = STARTED;
+    can_prepare = std::atomic_compare_exchange_strong(&txn_state_, &expected,
+                                                      AWAITING_PREPARE);
+  } else if (txn_state_ == STARTED) {
+    // expiration and lock stealing are not possible
+    txn_state_.store(AWAITING_PREPARE);
+    can_prepare = true;
+  }
+
+  if (can_prepare) {
+    // transaction can't expire after preparation
+    expiration_time_ = 0;
+    assert(log_number_ == 0 ||
+           txn_db_impl_->GetTxnDBOptions().write_policy == WRITE_UNPREPARED);
+
+    s = PrepareInternal();
+    if (s.ok()) {
+      txn_state_.store(PREPARED);
+    }
+  } else if (txn_state_ == LOCKS_STOLEN) {
+    s = Status::Expired();
+  } else if (txn_state_ == PREPARED) {
+    s = Status::InvalidArgument("Transaction has already been prepared.");
+  } else if (txn_state_ == COMMITTED) {
+    s = Status::InvalidArgument("Transaction has already been committed.");
+  } else if (txn_state_ == ROLLEDBACK) {
+    s = Status::InvalidArgument("Transaction has already been rolledback.");
+  } else {
+    s = Status::InvalidArgument("Transaction is not in state for prepare.");
+  }
+
+  return s;
+}
+
+Status WriteCommittedTxn::PrepareInternal() {
+  WriteOptions write_options = write_options_;
+  write_options.disableWAL = false;
+  auto s = WriteBatchInternal::MarkEndPrepare(GetWriteBatch()->GetWriteBatch(),
+                                              name_);
+  assert(s.ok());
+  class MarkLogCallback : public PreReleaseCallback {
+   public:
+    MarkLogCallback(DBImpl* db, bool two_write_queues)
+        : db_(db), two_write_queues_(two_write_queues) {
+      (void)two_write_queues_;  // to silence unused private field warning
+    }
+    virtual Status Callback(SequenceNumber, bool is_mem_disabled,
+                            uint64_t log_number, size_t /*index*/,
+                            size_t /*total*/) override {
+#ifdef NDEBUG
+      (void)is_mem_disabled;
+#endif
+      assert(log_number != 0);
+      assert(!two_write_queues_ || is_mem_disabled);  // implies the 2nd queue
+      db_->logs_with_prep_tracker()->MarkLogAsContainingPrepSection(log_number);
+      return Status::OK();
+    }
+
+   private:
+    DBImpl* db_;
+    bool two_write_queues_;
+  } mark_log_callback(db_impl_,
+                      db_impl_->immutable_db_options().two_write_queues);
+
+  WriteCallback* const kNoWriteCallback = nullptr;
+  
const uint64_t kRefNoLog = 0;
+  const bool kDisableMemtable = true;
+  SequenceNumber* const kIgnoreSeqUsed = nullptr;
+  const size_t kNoBatchCount = 0;
+  s = db_impl_->WriteImpl(write_options, GetWriteBatch()->GetWriteBatch(),
+                          kNoWriteCallback, &log_number_, kRefNoLog,
+                          kDisableMemtable, kIgnoreSeqUsed, kNoBatchCount,
+                          &mark_log_callback);
+  return s;
+}
+
+Status PessimisticTransaction::Commit() {
+  bool commit_without_prepare = false;
+  bool commit_prepared = false;
+
+  if (IsExpired()) {
+    return Status::Expired();
+  }
+
+  if (expiration_time_ > 0) {
+    // We must atomically compare and exchange the state here because at
+    // this point in the transaction it is possible for another thread to
+    // change our state out from under us in the event that we expire and
+    // have our locks stolen. In this case the only valid state is STARTED
+    // because a state of PREPARED would have a cleared expiration_time_.
+    TransactionState expected = STARTED;
+    commit_without_prepare = std::atomic_compare_exchange_strong(
+        &txn_state_, &expected, AWAITING_COMMIT);
+    TEST_SYNC_POINT("TransactionTest::ExpirableTransactionDataRace:1");
+  } else if (txn_state_ == PREPARED) {
+    // expiration and lock stealing are not a concern
+    commit_prepared = true;
+  } else if (txn_state_ == STARTED) {
+    // expiration and lock stealing are not a concern
+    if (skip_prepare_) {
+      commit_without_prepare = true;
+    } else {
+      return Status::TxnNotPrepared();
+    }
+  }
+
+  Status s;
+  if (commit_without_prepare) {
+    assert(!commit_prepared);
+    if (WriteBatchInternal::Count(GetCommitTimeWriteBatch()) > 0) {
+      s = Status::InvalidArgument(
+          "Commit-time batch contains values that will not be committed.");
+    } else {
+      txn_state_.store(AWAITING_COMMIT);
+      if (log_number_ > 0) {
+        dbimpl_->logs_with_prep_tracker()->MarkLogAsHavingPrepSectionFlushed(
+            log_number_);
+      }
+      s = CommitWithoutPrepareInternal();
+      if (!name_.empty()) {
+        txn_db_impl_->UnregisterTransaction(this);
+      }
+      Clear();
+      if (s.ok()) {
+        txn_state_.store(COMMITTED);
+      }
+    }
+  } else if (commit_prepared) {
+    txn_state_.store(AWAITING_COMMIT);
+
+    s = CommitInternal();
+
+    if (!s.ok()) {
+      ROCKS_LOG_WARN(db_impl_->immutable_db_options().info_log,
+                     "Commit write failed");
+      return s;
+    }
+
+    // FindObsoleteFiles must now look to the memtables
+    // to determine what prep logs must be kept around,
+    // not the prep section heap.
+Status PessimisticTransaction::Commit() {
+  bool commit_without_prepare = false;
+  bool commit_prepared = false;
+
+  if (IsExpired()) {
+    return Status::Expired();
+  }
+
+  if (expiration_time_ > 0) {
+    // We must atomically compare and exchange the state here because at
+    // this point in the transaction it is possible for another thread
+    // to change our state out from under us in the event that we expire and
+    // have our locks stolen. In this case the only valid state is STARTED
+    // because a state of PREPARED would have a cleared expiration_time_.
+    TransactionState expected = STARTED;
+    commit_without_prepare = std::atomic_compare_exchange_strong(
+        &txn_state_, &expected, AWAITING_COMMIT);
+    TEST_SYNC_POINT("TransactionTest::ExpirableTransactionDataRace:1");
+  } else if (txn_state_ == PREPARED) {
+    // expiration and lock stealing are not a concern
+    commit_prepared = true;
+  } else if (txn_state_ == STARTED) {
+    // expiration and lock stealing are not a concern
+    if (skip_prepare_) {
+      commit_without_prepare = true;
+    } else {
+      return Status::TxnNotPrepared();
+    }
+  }
+
+  Status s;
+  if (commit_without_prepare) {
+    assert(!commit_prepared);
+    if (WriteBatchInternal::Count(GetCommitTimeWriteBatch()) > 0) {
+      s = Status::InvalidArgument(
+          "Commit-time batch contains values that will not be committed.");
+    } else {
+      txn_state_.store(AWAITING_COMMIT);
+      if (log_number_ > 0) {
+        db_impl_->logs_with_prep_tracker()->MarkLogAsHavingPrepSectionFlushed(
+            log_number_);
+      }
+      s = CommitWithoutPrepareInternal();
+      if (!name_.empty()) {
+        txn_db_impl_->UnregisterTransaction(this);
+      }
+      Clear();
+      if (s.ok()) {
+        txn_state_.store(COMMITTED);
+      }
+    }
+  } else if (commit_prepared) {
+    txn_state_.store(AWAITING_COMMIT);
+
+    s = CommitInternal();
+
+    if (!s.ok()) {
+      ROCKS_LOG_WARN(db_impl_->immutable_db_options().info_log,
+                     "Commit write failed");
+      return s;
+    }
+
+    // FindObsoleteFiles must now look to the memtables
+    // to determine what prep logs must be kept around,
+    // not the prep section heap.
+    assert(log_number_ > 0);
+    db_impl_->logs_with_prep_tracker()->MarkLogAsHavingPrepSectionFlushed(
+        log_number_);
+    txn_db_impl_->UnregisterTransaction(this);
+
+    Clear();
+    txn_state_.store(COMMITTED);
+  } else if (txn_state_ == LOCKS_STOLEN) {
+    s = Status::Expired();
+  } else if (txn_state_ == COMMITTED) {
+    s = Status::InvalidArgument("Transaction has already been committed.");
+  } else if (txn_state_ == ROLLEDBACK) {
+    s = Status::InvalidArgument("Transaction has already been rolled back.");
+  } else {
+    s = Status::InvalidArgument("Transaction is not in state for commit.");
+  }
+
+  return s;
+}
+
+Status WriteCommittedTxn::CommitWithoutPrepareInternal() {
+  WriteBatchWithIndex* wbwi = GetWriteBatch();
+  assert(wbwi);
+  WriteBatch* wb = wbwi->GetWriteBatch();
+  assert(wb);
+
+  const bool needs_ts = WriteBatchInternal::HasKeyWithTimestamp(*wb);
+  if (needs_ts && commit_timestamp_ == kMaxTxnTimestamp) {
+    return Status::InvalidArgument("Must assign a commit timestamp");
+  }
+
+  if (needs_ts) {
+    assert(commit_timestamp_ != kMaxTxnTimestamp);
+    char commit_ts_buf[sizeof(kMaxTxnTimestamp)];
+    EncodeFixed64(commit_ts_buf, commit_timestamp_);
+    Slice commit_ts(commit_ts_buf, sizeof(commit_ts_buf));
+
+    Status s =
+        wb->UpdateTimestamps(commit_ts, [wbwi, this](uint32_t cf) -> size_t {
+          auto cf_iter = cfs_with_ts_tracked_when_indexing_disabled_.find(cf);
+          if (cf_iter != cfs_with_ts_tracked_when_indexing_disabled_.end()) {
+            return sizeof(kMaxTxnTimestamp);
+          }
+          const Comparator* ucmp =
+              WriteBatchWithIndexInternal::GetUserComparator(*wbwi, cf);
+          return ucmp ? ucmp->timestamp_size()
+                      : std::numeric_limits<uint64_t>::max();
+        });
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  uint64_t seq_used = kMaxSequenceNumber;
+  SnapshotCreationCallback snapshot_creation_cb(db_impl_, commit_timestamp_,
+                                                snapshot_notifier_, snapshot_);
+  PostMemTableCallback* post_mem_cb = nullptr;
+  if (snapshot_needed_) {
+    if (commit_timestamp_ == kMaxTxnTimestamp) {
+      return Status::InvalidArgument("Must set transaction commit timestamp");
+    } else {
+      post_mem_cb = &snapshot_creation_cb;
+    }
+  }
+  auto s = db_impl_->WriteImpl(write_options_, wb,
+                               /*callback*/ nullptr, /*log_used*/ nullptr,
+                               /*log_ref*/ 0, /*disable_memtable*/ false,
+                               &seq_used, /*batch_cnt=*/0,
+                               /*pre_release_callback=*/nullptr, post_mem_cb);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  if (s.ok()) {
+    SetId(seq_used);
+  }
+  return s;
+}
+
+Status WriteCommittedTxn::CommitBatchInternal(WriteBatch* batch, size_t) {
+  uint64_t seq_used = kMaxSequenceNumber;
+  auto s = db_impl_->WriteImpl(write_options_, batch, /*callback*/ nullptr,
+                               /*log_used*/ nullptr, /*log_ref*/ 0,
+                               /*disable_memtable*/ false, &seq_used);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  if (s.ok()) {
+    SetId(seq_used);
+  }
+  return s;
+}
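CommitInternal(), which follows, appends the commit marker to the commit-time write batch and replays the prepared batch into the memtable. From the client's side, the commit-time batch carries writes that should be persisted only at commit and are not conflict-checked; as Commit() above enforces, a non-empty commit-time batch requires the Prepare() path under this write policy. A hedged sketch (key and transaction names are illustrative):

#include "rocksdb/utilities/transaction.h"
#include "rocksdb/utilities/transaction_db.h"

using namespace ROCKSDB_NAMESPACE;

Status CommitWithAuditRecord(TransactionDB* db) {
  Transaction* txn = db->BeginTransaction(WriteOptions());
  Status s = txn->SetName("xid-audit-1");
  if (s.ok()) s = txn->Put("account", "42");
  if (s.ok()) {
    // Written only if/when the transaction commits; never conflict-checked.
    s = txn->GetCommitTimeWriteBatch()->Put("last_committed_xid",
                                            "xid-audit-1");
  }
  if (s.ok()) s = txn->Prepare();
  if (s.ok()) s = txn->Commit();
  delete txn;
  return s;
}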
+Status WriteCommittedTxn::CommitInternal() {
+  WriteBatchWithIndex* wbwi = GetWriteBatch();
+  assert(wbwi);
+  WriteBatch* wb = wbwi->GetWriteBatch();
+  assert(wb);
+
+  const bool needs_ts = WriteBatchInternal::HasKeyWithTimestamp(*wb);
+  if (needs_ts && commit_timestamp_ == kMaxTxnTimestamp) {
+    return Status::InvalidArgument("Must assign a commit timestamp");
+  }
+  // We take the commit-time batch and append the Commit marker.
+  // The memtable will ignore the Commit marker in non-recovery mode.
+  WriteBatch* working_batch = GetCommitTimeWriteBatch();
+
+  Status s;
+  if (!needs_ts) {
+    s = WriteBatchInternal::MarkCommit(working_batch, name_);
+  } else {
+    assert(commit_timestamp_ != kMaxTxnTimestamp);
+    char commit_ts_buf[sizeof(kMaxTxnTimestamp)];
+    EncodeFixed64(commit_ts_buf, commit_timestamp_);
+    Slice commit_ts(commit_ts_buf, sizeof(commit_ts_buf));
+    s = WriteBatchInternal::MarkCommitWithTimestamp(working_batch, name_,
+                                                    commit_ts);
+    if (s.ok()) {
+      s = wb->UpdateTimestamps(commit_ts, [wbwi, this](uint32_t cf) -> size_t {
+        if (cfs_with_ts_tracked_when_indexing_disabled_.find(cf) !=
+            cfs_with_ts_tracked_when_indexing_disabled_.end()) {
+          return sizeof(kMaxTxnTimestamp);
+        }
+        const Comparator* ucmp =
+            WriteBatchWithIndexInternal::GetUserComparator(*wbwi, cf);
+        return ucmp ? ucmp->timestamp_size()
+                    : std::numeric_limits<uint64_t>::max();
+      });
+    }
+  }
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Any operations appended to this working_batch will be excluded from the
+  // WAL.
+  working_batch->MarkWalTerminationPoint();
+
+  // Insert the prepared batch into the memtable while skipping the WAL.
+  // The memtable will ignore the BeginPrepare/EndPrepare markers
+  // in non-recovery mode and simply insert the values.
+  s = WriteBatchInternal::Append(working_batch, wb);
+  assert(s.ok());
+
+  uint64_t seq_used = kMaxSequenceNumber;
+  SnapshotCreationCallback snapshot_creation_cb(db_impl_, commit_timestamp_,
+                                                snapshot_notifier_, snapshot_);
+  PostMemTableCallback* post_mem_cb = nullptr;
+  if (snapshot_needed_) {
+    if (commit_timestamp_ == kMaxTxnTimestamp) {
+      s = Status::InvalidArgument("Must set transaction commit timestamp");
+      return s;
+    } else {
+      post_mem_cb = &snapshot_creation_cb;
+    }
+  }
+  s = db_impl_->WriteImpl(write_options_, working_batch, /*callback*/ nullptr,
+                          /*log_used*/ nullptr, /*log_ref*/ log_number_,
+                          /*disable_memtable*/ false, &seq_used,
+                          /*batch_cnt=*/0, /*pre_release_callback=*/nullptr,
+                          post_mem_cb);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  if (s.ok()) {
+    SetId(seq_used);
+  }
+  return s;
+}
+
+Status PessimisticTransaction::Rollback() {
+  Status s;
+  if (txn_state_ == PREPARED) {
+    txn_state_.store(AWAITING_ROLLBACK);
+
+    s = RollbackInternal();
+
+    if (s.ok()) {
+      // we do not need to keep our prepared section around
+      assert(log_number_ > 0);
+      db_impl_->logs_with_prep_tracker()->MarkLogAsHavingPrepSectionFlushed(
+          log_number_);
+      Clear();
+      txn_state_.store(ROLLEDBACK);
+    }
+  } else if (txn_state_ == STARTED) {
+    if (log_number_ > 0) {
+      assert(txn_db_impl_->GetTxnDBOptions().write_policy == WRITE_UNPREPARED);
+      assert(GetId() > 0);
+      s = RollbackInternal();
+
+      if (s.ok()) {
+        db_impl_->logs_with_prep_tracker()->MarkLogAsHavingPrepSectionFlushed(
+            log_number_);
+      }
+    }
+    // prepare couldn't have taken place
+    Clear();
+  } else if (txn_state_ == COMMITTED) {
+    s = Status::InvalidArgument("This transaction has already been committed.");
+  } else {
+    s = Status::InvalidArgument(
+        "Two phase transaction is not in state for rollback.");
+  }
+
+  return s;
+}
+
+Status WriteCommittedTxn::RollbackInternal() {
+  WriteBatch rollback_marker;
+  auto s = WriteBatchInternal::MarkRollback(&rollback_marker, name_);
+  assert(s.ok());
+  s = db_impl_->WriteImpl(write_options_, &rollback_marker);
+  return s;
+}
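RollbackToSavePoint(), defined next, releases only the locks acquired since the most recent savepoint, using the per-savepoint lock tracker. A usage sketch:

#include "rocksdb/utilities/transaction.h"
#include "rocksdb/utilities/transaction_db.h"

using namespace ROCKSDB_NAMESPACE;

Status PartialRollback(TransactionDB* db) {
  Transaction* txn = db->BeginTransaction(WriteOptions());
  Status s = txn->Put("kept", "v1");  // survives the partial rollback
  if (s.ok()) {
    txn->SetSavePoint();
    s = txn->Put("discarded", "v2");  // locked and indexed after the savepoint
  }
  if (s.ok()) {
    s = txn->RollbackToSavePoint();   // undoes "discarded" and unlocks it
  }
  if (s.ok()) {
    s = txn->Commit();                // commits only "kept"
  }
  delete txn;
  return s;
}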
+Status PessimisticTransaction::RollbackToSavePoint() {
+  if (txn_state_ != STARTED) {
+    return Status::InvalidArgument("Transaction is beyond state for rollback.");
+  }
+
+  if (save_points_ != nullptr && !save_points_->empty()) {
+    // Unlock any keys locked since the last savepoint
+    auto& save_point_tracker = *save_points_->top().new_locks_;
+    std::unique_ptr<LockTracker> t(
+        tracked_locks_->GetTrackedLocksSinceSavePoint(save_point_tracker));
+    if (t) {
+      txn_db_impl_->UnLock(this, *t);
+    }
+  }
+
+  return TransactionBaseImpl::RollbackToSavePoint();
+}
+
+// Lock all keys in this batch.
+// On success, caller should unlock keys_to_unlock
+Status PessimisticTransaction::LockBatch(WriteBatch* batch,
+                                         LockTracker* keys_to_unlock) {
+  if (!batch) {
+    return Status::InvalidArgument("batch is nullptr");
+  }
+
+  class Handler : public WriteBatch::Handler {
+   public:
+    // Sorted map of column_family_id to sorted set of keys.
+    // Since LockBatch() always locks keys in sorted order, it cannot deadlock
+    // with itself. We're not using a comparator here since it doesn't matter
+    // what the sorting is as long as it's consistent.
+    std::map<uint32_t, std::set<std::string>> keys_;
+
+    Handler() {}
+
+    void RecordKey(uint32_t column_family_id, const Slice& key) {
+      std::string key_str = key.ToString();
+
+      auto& cfh_keys = keys_[column_family_id];
+      auto iter = cfh_keys.find(key_str);
+      if (iter == cfh_keys.end()) {
+        // key not yet seen, store it.
+        cfh_keys.insert({std::move(key_str)});
+      }
+    }
+
+    Status PutCF(uint32_t column_family_id, const Slice& key,
+                 const Slice& /* unused */) override {
+      RecordKey(column_family_id, key);
+      return Status::OK();
+    }
+    Status MergeCF(uint32_t column_family_id, const Slice& key,
+                   const Slice& /* unused */) override {
+      RecordKey(column_family_id, key);
+      return Status::OK();
+    }
+    Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
+      RecordKey(column_family_id, key);
+      return Status::OK();
+    }
+  };
+
+  // Iterating on this handler will add all keys in this batch into keys_
+  Handler handler;
+  Status s = batch->Iterate(&handler);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Attempt to lock all keys
+  for (const auto& cf_iter : handler.keys_) {
+    uint32_t cfh_id = cf_iter.first;
+    auto& cfh_keys = cf_iter.second;
+
+    for (const auto& key_iter : cfh_keys) {
+      const std::string& key = key_iter;
+
+      s = txn_db_impl_->TryLock(this, cfh_id, key, true /* exclusive */);
+      if (!s.ok()) {
+        break;
+      }
+      PointLockRequest r;
+      r.column_family_id = cfh_id;
+      r.key = key;
+      r.seq = kMaxSequenceNumber;
+      r.read_only = false;
+      r.exclusive = true;
+      keys_to_unlock->Track(r);
+    }
+
+    if (!s.ok()) {
+      break;
+    }
+  }
+
+  if (!s.ok()) {
+    txn_db_impl_->UnLock(this, *keys_to_unlock);
+  }
+
+  return s;
+}
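LockBatch() above relies on a classic lock-ordering argument: every batch acquires its (cf, key) pairs in the same sorted order, so two concurrent batches can never each hold a lock the other is waiting for. A toy illustration of the same discipline with plain mutexes (all names here are illustrative):

#include <map>
#include <mutex>
#include <set>
#include <string>
#include <vector>

// One mutex per key; std::map and std::set keep everything globally ordered.
// Assume key_mutexes is fully populated before any thread calls LockAll().
std::map<std::string, std::mutex> key_mutexes;

void LockAll(const std::set<std::string>& keys,
             std::vector<std::unique_lock<std::mutex>>* held) {
  for (const auto& k : keys) {              // iterates in sorted key order
    held->emplace_back(key_mutexes.at(k));  // blocks until the key is free
  }
}

A deadlock with a concurrent Transaction is still possible, which is why LockBatch() acquires each key through the same lock manager, with its timeouts.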
+// Attempt to lock this key.
+// Returns OK if the key has been successfully locked; a non-OK status
+// otherwise.
+// If check_snapshot is true and this transaction has a snapshot set,
+// this key will only be locked if there have been no writes to this key since
+// the snapshot time.
+Status PessimisticTransaction::TryLock(ColumnFamilyHandle* column_family,
+                                       const Slice& key, bool read_only,
+                                       bool exclusive, const bool do_validate,
+                                       const bool assume_tracked) {
+  assert(!assume_tracked || !do_validate);
+  Status s;
+  if (UNLIKELY(skip_concurrency_control_)) {
+    return s;
+  }
+  uint32_t cfh_id = GetColumnFamilyID(column_family);
+  std::string key_str = key.ToString();
+
+  PointLockStatus status;
+  bool lock_upgrade;
+  bool previously_locked;
+  if (tracked_locks_->IsPointLockSupported()) {
+    status = tracked_locks_->GetPointLockStatus(cfh_id, key_str);
+    previously_locked = status.locked;
+    lock_upgrade = previously_locked && exclusive && !status.exclusive;
+  } else {
+    // If the record is tracked, we can assume it was locked, too.
+    previously_locked = assume_tracked;
+    status.locked = false;
+    lock_upgrade = false;
+  }
+
+  // Lock this key if this transaction hasn't already locked it or we require
+  // an upgrade.
+  if (!previously_locked || lock_upgrade) {
+    s = txn_db_impl_->TryLock(this, cfh_id, key_str, exclusive);
+  }
+
+  const ColumnFamilyHandle* const cfh =
+      column_family ? column_family : db_impl_->DefaultColumnFamily();
+  assert(cfh);
+  const Comparator* const ucmp = cfh->GetComparator();
+  assert(ucmp);
+  size_t ts_sz = ucmp->timestamp_size();
+
+  SetSnapshotIfNeeded();
+
+  // Even though we do not care about doing conflict checking for this write,
+  // we still need to take a lock to make sure we do not cause a conflict with
+  // some other write. However, we do not need to check if there have been
+  // any writes since this transaction's snapshot.
+  // TODO(agiardullo): could optimize by supporting shared txn locks in the
+  // future.
+  SequenceNumber tracked_at_seq =
+      status.locked ? status.seq : kMaxSequenceNumber;
+  if (!do_validate || (snapshot_ == nullptr &&
+                       (0 == ts_sz || kMaxTxnTimestamp == read_timestamp_))) {
+    if (assume_tracked && !previously_locked &&
+        tracked_locks_->IsPointLockSupported()) {
+      s = Status::InvalidArgument(
+          "assume_tracked is set but it is not tracked yet");
+    }
+    // Need to remember the earliest sequence number that we know that this
+    // key has not been modified after. This is useful if this same
+    // transaction later tries to lock this key again.
+    if (tracked_at_seq == kMaxSequenceNumber) {
+      // Since we haven't checked a snapshot, we only know this key has not
+      // been modified since after we locked it.
+      // Note: when last_seq_same_as_publish_seq_ == false this is less than
+      // the latest allocated seq, but it is ok since (i) this is just a
+      // heuristic used only as a hint to avoid an actual check for conflicts,
+      // and (ii) this would cause a false positive only if the snapshot is
+      // taken right after the lock, which would be an unusual sequence.
+      tracked_at_seq = db_->GetLatestSequenceNumber();
+    }
+  } else if (s.ok()) {
+    // If a snapshot is set, we need to make sure the key hasn't been modified
+    // since the snapshot. This must be done after we locked the key.
+    // If we have already validated an earlier snapshot, it must have been
+    // reflected in tracked_at_seq and ValidateSnapshot will return OK.
+    s = ValidateSnapshot(column_family, key, &tracked_at_seq);
+
+    if (!s.ok()) {
+      // Failed to validate key
+      // Unlock key we just locked
+      if (lock_upgrade) {
+        s = txn_db_impl_->TryLock(this, cfh_id, key_str, false /* exclusive */);
+        assert(s.ok());
+      } else if (!previously_locked) {
+        txn_db_impl_->UnLock(this, cfh_id, key.ToString());
+      }
+    }
+  }
+
+  if (s.ok()) {
+    // We must track all the locked keys so that we can unlock them later. If
+    // the key is already locked, this func will update some stats on the
+    // tracked key. It could also update the tracked_at_seq if it is lower
+    // than the existing tracked key seq. These stats are necessary for
+    // RollbackToSavePoint to determine whether a key can be safely removed
+    // from tracked_keys_. Removal can only be done if a key was only locked
+    // during the current savepoint.
+    //
+    // Recall that if assume_tracked is true, we assume that TrackKey has been
+    // called previously since the last savepoint, with the same exclusive
+    // setting, and at a lower sequence number, so skipping here should be
+    // safe.
+    if (!assume_tracked) {
+      TrackKey(cfh_id, key_str, tracked_at_seq, read_only, exclusive);
+    } else {
+#ifndef NDEBUG
+      if (tracked_locks_->IsPointLockSupported()) {
+        PointLockStatus lock_status =
+            tracked_locks_->GetPointLockStatus(cfh_id, key_str);
+        assert(lock_status.locked);
+        assert(lock_status.seq <= tracked_at_seq);
+        assert(lock_status.exclusive == exclusive);
+      }
+#endif
+    }
+  }
+
+  return s;
+}
+
+Status PessimisticTransaction::GetRangeLock(ColumnFamilyHandle* column_family,
+                                            const Endpoint& start_endp,
+                                            const Endpoint& end_endp) {
+  ColumnFamilyHandle* cfh =
+      column_family ? column_family : db_impl_->DefaultColumnFamily();
+  uint32_t cfh_id = GetColumnFamilyID(cfh);
+
+  Status s = txn_db_impl_->TryRangeLock(this, cfh_id, start_endp, end_endp);
+
+  if (s.ok()) {
+    RangeLockRequest req{cfh_id, start_endp, end_endp};
+    tracked_locks_->Track(req);
+  }
+  return s;
+}
+
+// Return OK() if this key has not been modified more recently than the
+// transaction snapshot_.
+// tracked_at_seq is the global seq at which we either locked the key or last
+// called ValidateSnapshot.
+Status PessimisticTransaction::ValidateSnapshot(
+    ColumnFamilyHandle* column_family, const Slice& key,
+    SequenceNumber* tracked_at_seq) {
+  assert(snapshot_ || read_timestamp_ < kMaxTxnTimestamp);
+
+  SequenceNumber snap_seq = 0;
+  if (snapshot_) {
+    snap_seq = snapshot_->GetSequenceNumber();
+    if (*tracked_at_seq <= snap_seq) {
+      // If the key has been previously validated (or locked) at a sequence
+      // number earlier than the current snapshot's sequence number, we already
+      // know it has not been modified after snap_seq either.
+      return Status::OK();
+    }
+  } else {
+    snap_seq = db_impl_->GetLatestSequenceNumber();
+  }
+
+  // Otherwise we have either
+  // 1: tracked_at_seq == kMaxSequenceNumber, i.e., first time tracking the key
+  // 2: snap_seq < tracked_at_seq: the last time we locked the key was via
+  // do_validate=false, which means we skipped ValidateSnapshot. In both
+  // cases we should do ValidateSnapshot now.
+
+  *tracked_at_seq = snap_seq;
+
+  ColumnFamilyHandle* cfh =
+      column_family ?
column_family : db_impl_->DefaultColumnFamily(); + + assert(cfh); + const Comparator* const ucmp = cfh->GetComparator(); + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); + std::string ts_buf; + if (ts_sz > 0 && read_timestamp_ < kMaxTxnTimestamp) { + assert(ts_sz == sizeof(read_timestamp_)); + PutFixed64(&ts_buf, read_timestamp_); + } + + return TransactionUtil::CheckKeyForConflicts( + db_impl_, cfh, key.ToString(), snap_seq, ts_sz == 0 ? nullptr : &ts_buf, + false /* cache_only */); +} + +bool PessimisticTransaction::TryStealingLocks() { + assert(IsExpired()); + TransactionState expected = STARTED; + return std::atomic_compare_exchange_strong(&txn_state_, &expected, + LOCKS_STOLEN); +} + +void PessimisticTransaction::UnlockGetForUpdate( + ColumnFamilyHandle* column_family, const Slice& key) { + txn_db_impl_->UnLock(this, GetColumnFamilyID(column_family), key.ToString()); +} + +Status PessimisticTransaction::SetName(const TransactionName& name) { + Status s; + if (txn_state_ == STARTED) { + if (name_.length()) { + s = Status::InvalidArgument("Transaction has already been named."); + } else if (txn_db_impl_->GetTransactionByName(name) != nullptr) { + s = Status::InvalidArgument("Transaction name must be unique."); + } else if (name.length() < 1 || name.length() > 512) { + s = Status::InvalidArgument( + "Transaction name length must be between 1 and 512 chars."); + } else { + name_ = name; + txn_db_impl_->RegisterTransaction(this); + } + } else { + s = Status::InvalidArgument("Transaction is beyond state for naming."); + } + return s; +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/pessimistic_transaction.h b/src/rocksdb/utilities/transactions/pessimistic_transaction.h new file mode 100644 index 000000000..d43d1d3ac --- /dev/null +++ b/src/rocksdb/utilities/transactions/pessimistic_transaction.h @@ -0,0 +1,313 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include <algorithm> +#include <atomic> +#include <mutex> +#include <stack> +#include <string> +#include <unordered_map> +#include <vector> + +#include "db/write_callback.h" +#include "rocksdb/db.h" +#include "rocksdb/slice.h" +#include "rocksdb/snapshot.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/transaction_db.h" +#include "rocksdb/utilities/write_batch_with_index.h" +#include "util/autovector.h" +#include "utilities/transactions/transaction_base.h" +#include "utilities/transactions/transaction_util.h" + +namespace ROCKSDB_NAMESPACE { + +class PessimisticTransactionDB; + +// A transaction under pessimistic concurrency control. This class implements +// the locking API and interfaces with the lock manager as well as the +// pessimistic transactional db. 
+class PessimisticTransaction : public TransactionBaseImpl {
+ public:
+  PessimisticTransaction(TransactionDB* db, const WriteOptions& write_options,
+                         const TransactionOptions& txn_options,
+                         const bool init = true);
+  // No copying allowed
+  PessimisticTransaction(const PessimisticTransaction&) = delete;
+  void operator=(const PessimisticTransaction&) = delete;
+
+  ~PessimisticTransaction() override;
+
+  void Reinitialize(TransactionDB* txn_db, const WriteOptions& write_options,
+                    const TransactionOptions& txn_options);
+
+  Status Prepare() override;
+
+  Status Commit() override;
+
+  // This is essentially Commit() without going through the Prepare phase.
+  // The write batch is provided directly instead of being accumulated in the
+  // transaction's internal write batch.
+  Status CommitBatch(WriteBatch* batch);
+
+  Status Rollback() override;
+
+  Status RollbackToSavePoint() override;
+
+  Status SetName(const TransactionName& name) override;
+
+  // Generate a new unique transaction identifier
+  static TransactionID GenTxnID();
+
+  TransactionID GetID() const override { return txn_id_; }
+
+  std::vector<TransactionID> GetWaitingTxns(uint32_t* column_family_id,
+                                            std::string* key) const override {
+    std::lock_guard<std::mutex> lock(wait_mutex_);
+    std::vector<TransactionID> ids(waiting_txn_ids_.size());
+    if (key) *key = waiting_key_ ? *waiting_key_ : "";
+    if (column_family_id) *column_family_id = waiting_cf_id_;
+    std::copy(waiting_txn_ids_.begin(), waiting_txn_ids_.end(), ids.begin());
+    return ids;
+  }
+
+  void SetWaitingTxn(autovector<TransactionID> ids, uint32_t column_family_id,
+                     const std::string* key) {
+    std::lock_guard<std::mutex> lock(wait_mutex_);
+    waiting_txn_ids_ = ids;
+    waiting_cf_id_ = column_family_id;
+    waiting_key_ = key;
+  }
+
+  void ClearWaitingTxn() {
+    std::lock_guard<std::mutex> lock(wait_mutex_);
+    waiting_txn_ids_.clear();
+    waiting_cf_id_ = 0;
+    waiting_key_ = nullptr;
+  }
+
+  // Returns the time (in microseconds according to Env->NowMicros()) at which
+  // this transaction will expire. Returns 0 if this transaction does not
+  // expire.
+  uint64_t GetExpirationTime() const { return expiration_time_; }
+
+  // Returns true if this transaction has an expiration_time and has expired.
+  bool IsExpired() const;
+
+  // Returns the number of microseconds a transaction can wait on acquiring a
+  // lock or -1 if there is no timeout.
+  int64_t GetLockTimeout() const { return lock_timeout_; }
+  void SetLockTimeout(int64_t timeout) override {
+    lock_timeout_ = timeout * 1000;
+  }
+
+  // Returns true if locks were stolen successfully, false otherwise.
+  bool TryStealingLocks();
+
+  bool IsDeadlockDetect() const override { return deadlock_detect_; }
+
+  int64_t GetDeadlockDetectDepth() const { return deadlock_detect_depth_; }
+
+  virtual Status GetRangeLock(ColumnFamilyHandle* column_family,
+                              const Endpoint& start_key,
+                              const Endpoint& end_key) override;
+
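The expiration, lock-timeout, and deadlock-detection accessors above are all driven by TransactionOptions at BeginTransaction() time. A sketch with illustrative values (the public options are expressed in milliseconds; internally the class stores microseconds, as SetLockTimeout()'s multiplication by 1000 shows):

#include "rocksdb/utilities/transaction_db.h"

using namespace ROCKSDB_NAMESPACE;

Transaction* BeginExpirableTxn(TransactionDB* db) {
  TransactionOptions txn_options;
  txn_options.expiration = 500;        // expires after 500ms; once expired,
                                       // its locks may be stolen
  txn_options.lock_timeout = 100;      // wait at most 100ms per lock
  txn_options.deadlock_detect = true;  // walk the wait-for graph on conflict
  txn_options.deadlock_detect_depth = 50;
  return db->BeginTransaction(WriteOptions(), txn_options);
}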
+ protected:
+  // Refer to
+  // TransactionOptions::use_only_the_last_commit_time_batch_for_recovery
+  bool use_only_the_last_commit_time_batch_for_recovery_ = false;
+  // Refer to
+  // TransactionOptions::skip_prepare
+  bool skip_prepare_ = false;
+
+  virtual Status PrepareInternal() = 0;
+
+  virtual Status CommitWithoutPrepareInternal() = 0;
+
+  // batch_cnt, if non-zero, is the number of sub-batches. A sub-batch is a
+  // batch with no duplicate keys. If zero, then the number of sub-batches is
+  // unknown.
+  virtual Status CommitBatchInternal(WriteBatch* batch,
+                                     size_t batch_cnt = 0) = 0;
+
+  virtual Status CommitInternal() = 0;
+
+  virtual Status RollbackInternal() = 0;
+
+  virtual void Initialize(const TransactionOptions& txn_options);
+
+  Status LockBatch(WriteBatch* batch, LockTracker* keys_to_unlock);
+
+  Status TryLock(ColumnFamilyHandle* column_family, const Slice& key,
+                 bool read_only, bool exclusive, const bool do_validate = true,
+                 const bool assume_tracked = false) override;
+
+  void Clear() override;
+
+  PessimisticTransactionDB* txn_db_impl_;
+  DBImpl* db_impl_;
+
+  // If non-zero, this transaction should not be committed after this time (in
+  // microseconds according to Env->NowMicros())
+  uint64_t expiration_time_;
+
+  // Timestamp used by the transaction to perform all GetForUpdate.
+  // Use this timestamp for conflict checking.
+  // read_timestamp_ == kMaxTxnTimestamp means this transaction has not
+  // performed any GetForUpdate. It is possible that the transaction has
+  // performed blind writes or Get, though.
+  TxnTimestamp read_timestamp_{kMaxTxnTimestamp};
+  TxnTimestamp commit_timestamp_{kMaxTxnTimestamp};
+
+ private:
+  friend class TransactionTest_ValidateSnapshotTest_Test;
+  // Used to create unique ids for transactions.
+  static std::atomic<TransactionID> txn_id_counter_;
+
+  // Unique ID for this transaction
+  TransactionID txn_id_;
+
+  // IDs for the transactions that are blocking the current transaction.
+  //
+  // Empty if the current transaction is not waiting.
+  autovector<TransactionID> waiting_txn_ids_;
+
+  // The following two fields represent the (cf, key) that a transaction is
+  // waiting on.
+  //
+  // If waiting_key_ is not null, then the pointer should always point to
+  // a valid string object. The reason is that it is only non-null when the
+  // transaction is blocked in the PointLockManager::AcquireWithTimeout
+  // function. At that point, the key string object is one of the function
+  // parameters.
+  uint32_t waiting_cf_id_;
+  const std::string* waiting_key_;
+
+  // Mutex protecting waiting_txn_ids_, waiting_cf_id_ and waiting_key_.
+  mutable std::mutex wait_mutex_;
+
+  // Timeout in microseconds when locking a key or -1 if there is no timeout.
+  int64_t lock_timeout_;
+
+  // Whether to perform deadlock detection or not.
+  bool deadlock_detect_;
+
+  // The number of traversals to make during deadlock detection.
+ int64_t deadlock_detect_depth_; + + // Refer to TransactionOptions::skip_concurrency_control + bool skip_concurrency_control_; + + virtual Status ValidateSnapshot(ColumnFamilyHandle* column_family, + const Slice& key, + SequenceNumber* tracked_at_seq); + + void UnlockGetForUpdate(ColumnFamilyHandle* column_family, + const Slice& key) override; +}; + +class WriteCommittedTxn : public PessimisticTransaction { + public: + WriteCommittedTxn(TransactionDB* db, const WriteOptions& write_options, + const TransactionOptions& txn_options); + // No copying allowed + WriteCommittedTxn(const WriteCommittedTxn&) = delete; + void operator=(const WriteCommittedTxn&) = delete; + + ~WriteCommittedTxn() override {} + + using TransactionBaseImpl::GetForUpdate; + Status GetForUpdate(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, bool exclusive, + const bool do_validate) override; + Status GetForUpdate(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* pinnable_val, bool exclusive, + const bool do_validate) override; + + using TransactionBaseImpl::Put; + // `key` does NOT include timestamp even when it's enabled. + Status Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, const bool assume_tracked = false) override; + Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value, + const bool assume_tracked = false) override; + + using TransactionBaseImpl::PutUntracked; + Status PutUntracked(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + Status PutUntracked(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value) override; + + using TransactionBaseImpl::Delete; + // `key` does NOT include timestamp even when it's enabled. + Status Delete(ColumnFamilyHandle* column_family, const Slice& key, + const bool assume_tracked = false) override; + Status Delete(ColumnFamilyHandle* column_family, const SliceParts& key, + const bool assume_tracked = false) override; + + using TransactionBaseImpl::DeleteUntracked; + Status DeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) override; + Status DeleteUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key) override; + + using TransactionBaseImpl::SingleDelete; + // `key` does NOT include timestamp even when it's enabled. 
+  Status SingleDelete(ColumnFamilyHandle* column_family, const Slice& key,
+                      const bool assume_tracked = false) override;
+  Status SingleDelete(ColumnFamilyHandle* column_family, const SliceParts& key,
+                      const bool assume_tracked = false) override;
+
+  using TransactionBaseImpl::SingleDeleteUntracked;
+  Status SingleDeleteUntracked(ColumnFamilyHandle* column_family,
+                               const Slice& key) override;
+
+  using TransactionBaseImpl::Merge;
+  Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
+               const Slice& value, const bool assume_tracked = false) override;
+
+  Status SetReadTimestampForValidation(TxnTimestamp ts) override;
+  Status SetCommitTimestamp(TxnTimestamp ts) override;
+  TxnTimestamp GetCommitTimestamp() const override { return commit_timestamp_; }
+
+ private:
+  template <typename TValue>
+  Status GetForUpdateImpl(const ReadOptions& read_options,
+                          ColumnFamilyHandle* column_family, const Slice& key,
+                          TValue* value, bool exclusive,
+                          const bool do_validate);
+
+  template <typename TKey, typename TOperation>
+  Status Operate(ColumnFamilyHandle* column_family, const TKey& key,
+                 const bool do_validate, const bool assume_tracked,
+                 TOperation&& operation);
+
+  Status PrepareInternal() override;
+
+  Status CommitWithoutPrepareInternal() override;
+
+  Status CommitBatchInternal(WriteBatch* batch, size_t batch_cnt) override;
+
+  Status CommitInternal() override;
+
+  Status RollbackInternal() override;
+
+  // Column families that enable timestamps and whose data are written when
+  // indexing_enabled_ is false. If a key is written when indexing_enabled_ is
+  // true, then the corresponding column family is not added to this set even
+  // if it enables timestamps.
+  std::unordered_set<uint32_t> cfs_with_ts_tracked_when_indexing_disabled_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/pessimistic_transaction_db.cc b/src/rocksdb/utilities/transactions/pessimistic_transaction_db.cc
new file mode 100644
index 000000000..950ef8042
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/pessimistic_transaction_db.cc
@@ -0,0 +1,782 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/pessimistic_transaction_db.h"
+
+#include <cinttypes>
+#include <sstream>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "logging/logging.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/mutexlock.h"
+#include "utilities/transactions/pessimistic_transaction.h"
+#include "utilities/transactions/transaction_db_mutex_impl.h"
+#include "utilities/transactions/write_prepared_txn_db.h"
+#include "utilities/transactions/write_unprepared_txn_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+PessimisticTransactionDB::PessimisticTransactionDB(
+    DB* db, const TransactionDBOptions& txn_db_options)
+    : TransactionDB(db),
+      db_impl_(static_cast_with_check<DBImpl>(db)),
+      txn_db_options_(txn_db_options),
+      lock_manager_(NewLockManager(this, txn_db_options)) {
+  assert(db_impl_ != nullptr);
+  info_log_ = db_impl_->GetDBOptions().info_log;
+}
+
+// Support initializing PessimisticTransactionDB from a stackable DB
+//
+//    PessimisticTransactionDB
+//     ^        ^
+//     |        |
+//     |        +
+//     |   StackableDB
+//     |   ^
+//     |   |
+//     +   +
+//     DBImpl
+//       ^
+//       |(inherit)
+//       +
+//       DB
+//
+PessimisticTransactionDB::PessimisticTransactionDB(
+    StackableDB* db, const TransactionDBOptions& txn_db_options)
+    : TransactionDB(db),
+      db_impl_(static_cast_with_check<DBImpl>(db->GetRootDB())),
+      txn_db_options_(txn_db_options),
+      lock_manager_(NewLockManager(this, txn_db_options)) {
+  assert(db_impl_ != nullptr);
+}
+
+PessimisticTransactionDB::~PessimisticTransactionDB() {
+  while (!transactions_.empty()) {
+    delete transactions_.begin()->second;
+    // TODO(myabandeh): this seems to be an unsafe approach as it is not quite
+    // clear whether delete would also remove the entry from transactions_.
+  }
+}
+
+Status PessimisticTransactionDB::VerifyCFOptions(
+    const ColumnFamilyOptions& cf_options) {
+  const Comparator* const ucmp = cf_options.comparator;
+  assert(ucmp);
+  size_t ts_sz = ucmp->timestamp_size();
+  if (0 == ts_sz) {
+    return Status::OK();
+  }
+  if (ts_sz != sizeof(TxnTimestamp)) {
+    std::ostringstream oss;
+    oss << "Timestamp of transaction must have " << sizeof(TxnTimestamp)
+        << " bytes. CF comparator " << std::string(ucmp->Name())
+        << " timestamp size is " << ts_sz << " bytes";
+    return Status::InvalidArgument(oss.str());
+  }
+  if (txn_db_options_.write_policy != WRITE_COMMITTED) {
+    return Status::NotSupported("Only WriteCommittedTxn supports timestamp");
+  }
+  return Status::OK();
+}
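VerifyCFOptions() above admits a column family into a transactional DB only if its comparator carries either no timestamp or an 8-byte one, and then only under WRITE_COMMITTED. A sketch of column family options that pass the check, assuming the built-in BytewiseComparatorWithU64Ts() is available in this build:

#include "rocksdb/comparator.h"
#include "rocksdb/utilities/transaction_db.h"

using namespace ROCKSDB_NAMESPACE;

ColumnFamilyOptions TimestampedCFOptions() {
  ColumnFamilyOptions cf_options;
  // timestamp_size() is 8 == sizeof(TxnTimestamp), satisfying the check.
  cf_options.comparator = BytewiseComparatorWithU64Ts();
  return cf_options;
}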
+Status PessimisticTransactionDB::Initialize(
+    const std::vector<size_t>& compaction_enabled_cf_indices,
+    const std::vector<ColumnFamilyHandle*>& handles) {
+  for (auto cf_ptr : handles) {
+    AddColumnFamily(cf_ptr);
+  }
+  // Verify cf options
+  for (auto handle : handles) {
+    ColumnFamilyDescriptor cfd;
+    Status s = handle->GetDescriptor(&cfd);
+    if (!s.ok()) {
+      return s;
+    }
+    s = VerifyCFOptions(cfd.options);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // Re-enable compaction for the column families that initially had
+  // compaction enabled.
+  std::vector<ColumnFamilyHandle*> compaction_enabled_cf_handles;
+  compaction_enabled_cf_handles.reserve(compaction_enabled_cf_indices.size());
+  for (auto index : compaction_enabled_cf_indices) {
+    compaction_enabled_cf_handles.push_back(handles[index]);
+  }
+
+  Status s = EnableAutoCompaction(compaction_enabled_cf_handles);
+
+  // Create 'real' transactions from recovered shell transactions
+  auto dbimpl = static_cast_with_check<DBImpl>(GetRootDB());
+  assert(dbimpl != nullptr);
+  auto rtrxs = dbimpl->recovered_transactions();
+
+  for (auto it = rtrxs.begin(); it != rtrxs.end(); ++it) {
+    auto recovered_trx = it->second;
+    assert(recovered_trx);
+    assert(recovered_trx->batches_.size() == 1);
+    const auto& seq = recovered_trx->batches_.begin()->first;
+    const auto& batch_info = recovered_trx->batches_.begin()->second;
+    assert(batch_info.log_number_);
+    assert(recovered_trx->name_.length());
+
+    WriteOptions w_options;
+    w_options.sync = true;
+    TransactionOptions t_options;
+    // This helps avoid deadlocks on keys that, although present in the WAL,
+    // did not go through concurrency control. This includes the merge that
+    // MyRocks uses for auto-inc columns. It is safe to do so, since (i) if
+    // there is a conflict between the keys of two transactions that must be
+    // avoided, it is already avoided by the application, MyRocks, before the
+    // restart, and (ii) the application, MyRocks, guarantees to roll back or
+    // commit the recovered transactions before new transactions start.
+    t_options.skip_concurrency_control = true;
+
+    Transaction* real_trx = BeginTransaction(w_options, t_options, nullptr);
+    assert(real_trx);
+    real_trx->SetLogNumber(batch_info.log_number_);
+    assert(seq != kMaxSequenceNumber);
+    if (GetTxnDBOptions().write_policy != WRITE_COMMITTED) {
+      real_trx->SetId(seq);
+    }
+
+    s = real_trx->SetName(recovered_trx->name_);
+    if (!s.ok()) {
+      break;
+    }
+
+    s = real_trx->RebuildFromWriteBatch(batch_info.batch_);
+    // WriteCommitted sets batch_cnt_ to 0 to disable this check, which is
+    // specific to WritePrepared txns
+    assert(batch_info.batch_cnt_ == 0 ||
+           real_trx->GetWriteBatch()->SubBatchCnt() == batch_info.batch_cnt_);
+    real_trx->SetState(Transaction::PREPARED);
+    if (!s.ok()) {
+      break;
+    }
+  }
+  if (s.ok()) {
+    dbimpl->DeleteAllRecoveredTransactions();
+  }
+  return s;
+}
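Initialize() above turns the shell transactions recovered from the WAL back into real PREPARED transactions; resolving them is then the application's job. A hedged sketch of the resolving side (a real system would consult its transaction coordinator instead of unconditionally rolling back):

#include <vector>
#include "rocksdb/utilities/transaction.h"
#include "rocksdb/utilities/transaction_db.h"

using namespace ROCKSDB_NAMESPACE;

void ResolveRecoveredTransactions(TransactionDB* db) {
  std::vector<Transaction*> recovered;
  db->GetAllPreparedTransactions(&recovered);  // single-threaded use only
  for (Transaction* txn : recovered) {
    txn->Rollback().PermitUncheckedError();  // or txn->Commit()
    delete txn;
  }
}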
+Transaction* WriteCommittedTxnDB::BeginTransaction(
+    const WriteOptions& write_options, const TransactionOptions& txn_options,
+    Transaction* old_txn) {
+  if (old_txn != nullptr) {
+    ReinitializeTransaction(old_txn, write_options, txn_options);
+    return old_txn;
+  } else {
+    return new WriteCommittedTxn(this, write_options, txn_options);
+  }
+}
+
+TransactionDBOptions PessimisticTransactionDB::ValidateTxnDBOptions(
+    const TransactionDBOptions& txn_db_options) {
+  TransactionDBOptions validated = txn_db_options;
+
+  if (txn_db_options.num_stripes == 0) {
+    validated.num_stripes = 1;
+  }
+
+  return validated;
+}
+
+Status TransactionDB::Open(const Options& options,
+                           const TransactionDBOptions& txn_db_options,
+                           const std::string& dbname, TransactionDB** dbptr) {
+  DBOptions db_options(options);
+  ColumnFamilyOptions cf_options(options);
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+  std::vector<ColumnFamilyHandle*> handles;
+  Status s = TransactionDB::Open(db_options, txn_db_options, dbname,
+                                 column_families, &handles, dbptr);
+  if (s.ok()) {
+    assert(handles.size() == 1);
+    // We can delete the handle since DBImpl always holds a reference to the
+    // default column family
+    delete handles[0];
+  }
+
+  return s;
+}
+
+Status TransactionDB::Open(
+    const DBOptions& db_options, const TransactionDBOptions& txn_db_options,
+    const std::string& dbname,
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    std::vector<ColumnFamilyHandle*>* handles, TransactionDB** dbptr) {
+  Status s;
+  DB* db = nullptr;
+  if (txn_db_options.write_policy == WRITE_COMMITTED &&
+      db_options.unordered_write) {
+    return Status::NotSupported(
+        "WRITE_COMMITTED is incompatible with unordered_writes");
+  }
+  if (txn_db_options.write_policy == WRITE_UNPREPARED &&
+      db_options.unordered_write) {
+    // TODO(lth): support it
+    return Status::NotSupported(
+        "WRITE_UNPREPARED is currently incompatible with unordered_writes");
+  }
+  if (txn_db_options.write_policy == WRITE_PREPARED &&
+      db_options.unordered_write && !db_options.two_write_queues) {
+    return Status::NotSupported(
+        "WRITE_PREPARED is incompatible with unordered_writes if "
+        "two_write_queues is not enabled.");
+  }
+
+  std::vector<ColumnFamilyDescriptor> column_families_copy = column_families;
+  std::vector<size_t> compaction_enabled_cf_indices;
+  DBOptions db_options_2pc = db_options;
+  PrepareWrap(&db_options_2pc, &column_families_copy,
+              &compaction_enabled_cf_indices);
+  const bool use_seq_per_batch =
+      txn_db_options.write_policy == WRITE_PREPARED ||
+      txn_db_options.write_policy == WRITE_UNPREPARED;
+  const bool use_batch_per_txn =
+      txn_db_options.write_policy == WRITE_COMMITTED ||
+      txn_db_options.write_policy == WRITE_PREPARED;
+  s = DBImpl::Open(db_options_2pc, dbname, column_families_copy, handles, &db,
+                   use_seq_per_batch, use_batch_per_txn);
+  if (s.ok()) {
+    ROCKS_LOG_WARN(db->GetDBOptions().info_log,
+                   "Transaction write_policy is %" PRId32,
+                   static_cast<int>(txn_db_options.write_policy));
+    // If WrapDB returns non-OK, db will be deleted in WrapDB() via
+    // ~StackableDB().
+    s = WrapDB(db, txn_db_options, compaction_enabled_cf_indices, *handles,
+               dbptr);
+  }
+  return s;
+}
+
+void TransactionDB::PrepareWrap(
+    DBOptions* db_options, std::vector<ColumnFamilyDescriptor>* column_families,
+    std::vector<size_t>* compaction_enabled_cf_indices) {
+  compaction_enabled_cf_indices->clear();
+
+  // Enable MemTable History if not already enabled
+  for (size_t i = 0; i < column_families->size(); i++) {
+    ColumnFamilyOptions* cf_options = &(*column_families)[i].options;
+
+    if (cf_options->max_write_buffer_size_to_maintain == 0 &&
+        cf_options->max_write_buffer_number_to_maintain == 0) {
+      // Setting to -1 will set the History size to
+      // max_write_buffer_number * write_buffer_size.
+      cf_options->max_write_buffer_size_to_maintain = -1;
+    }
+    if (!cf_options->disable_auto_compactions) {
+      // Disable compactions momentarily to prevent race with DB::Open
+      cf_options->disable_auto_compactions = true;
+      compaction_enabled_cf_indices->push_back(i);
+    }
+  }
+  db_options->allow_2pc = true;
+}
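PrepareWrap() and WrapDB() together support the manual variant of TransactionDB::Open(): open the base DB yourself, then wrap it. A sketch under assumed paths and options:

#include <string>
#include <vector>
#include "rocksdb/db.h"
#include "rocksdb/utilities/transaction_db.h"

using namespace ROCKSDB_NAMESPACE;

Status OpenWrapped(const std::string& path, TransactionDB** txn_db) {
  DBOptions db_options;
  db_options.create_if_missing = true;
  std::vector<ColumnFamilyDescriptor> cfs = {
      {kDefaultColumnFamilyName, ColumnFamilyOptions()}};
  std::vector<size_t> compaction_enabled_cf_indices;
  // Applies the same adjustments TransactionDB::Open() would: 2PC enabled,
  // memtable history retained, auto-compaction temporarily disabled.
  TransactionDB::PrepareWrap(&db_options, &cfs,
                             &compaction_enabled_cf_indices);

  std::vector<ColumnFamilyHandle*> handles;
  DB* base_db = nullptr;
  Status s = DB::Open(db_options, path, cfs, &handles, &base_db);
  if (!s.ok()) {
    return s;
  }
  return TransactionDB::WrapDB(base_db, TransactionDBOptions(),
                               compaction_enabled_cf_indices, handles, txn_db);
}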
+namespace {
+template <typename DBType>
+Status WrapAnotherDBInternal(
+    DBType* db, const TransactionDBOptions& txn_db_options,
+    const std::vector<size_t>& compaction_enabled_cf_indices,
+    const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr) {
+  assert(db != nullptr);
+  assert(dbptr != nullptr);
+  *dbptr = nullptr;
+  std::unique_ptr<PessimisticTransactionDB> txn_db;
+  // txn_db owns object pointed to by the raw db pointer.
+  switch (txn_db_options.write_policy) {
+    case WRITE_UNPREPARED:
+      txn_db.reset(new WriteUnpreparedTxnDB(
+          db, PessimisticTransactionDB::ValidateTxnDBOptions(txn_db_options)));
+      break;
+    case WRITE_PREPARED:
+      txn_db.reset(new WritePreparedTxnDB(
+          db, PessimisticTransactionDB::ValidateTxnDBOptions(txn_db_options)));
+      break;
+    case WRITE_COMMITTED:
+    default:
+      txn_db.reset(new WriteCommittedTxnDB(
+          db, PessimisticTransactionDB::ValidateTxnDBOptions(txn_db_options)));
+  }
+  txn_db->UpdateCFComparatorMap(handles);
+  Status s = txn_db->Initialize(compaction_enabled_cf_indices, handles);
+  // In case of a failure at this point, db is deleted via the txn_db
+  // destructor and set to nullptr.
+  if (s.ok()) {
+    *dbptr = txn_db.release();
+  } else {
+    for (auto* h : handles) {
+      delete h;
+    }
+    // txn_db still owns db, and ~StackableDB() will be called when txn_db goes
+    // out of scope, deleting the input db pointer.
+    ROCKS_LOG_FATAL(db->GetDBOptions().info_log,
+                    "Failed to initialize txn_db: %s", s.ToString().c_str());
+  }
+  return s;
+}
+}  // namespace
+
+Status TransactionDB::WrapDB(
+    // Make sure this db is already opened with memtable history enabled,
+    // auto compaction disabled, and two-phase commit enabled.
+    DB* db, const TransactionDBOptions& txn_db_options,
+    const std::vector<size_t>& compaction_enabled_cf_indices,
+    const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr) {
+  return WrapAnotherDBInternal(db, txn_db_options,
+                               compaction_enabled_cf_indices, handles, dbptr);
+}
+
+Status TransactionDB::WrapStackableDB(
+    // Make sure this stackable_db is already opened with memtable history
+    // enabled, auto compaction disabled, and two-phase commit enabled.
+    StackableDB* db, const TransactionDBOptions& txn_db_options,
+    const std::vector<size_t>& compaction_enabled_cf_indices,
+    const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr) {
+  return WrapAnotherDBInternal(db, txn_db_options,
+                               compaction_enabled_cf_indices, handles, dbptr);
+}
+
+// Let LockManager know that this column family exists so it can
+// allocate a LockMap for it.
+void PessimisticTransactionDB::AddColumnFamily( + const ColumnFamilyHandle* handle) { + lock_manager_->AddColumnFamily(handle); +} + +Status PessimisticTransactionDB::CreateColumnFamily( + const ColumnFamilyOptions& options, const std::string& column_family_name, + ColumnFamilyHandle** handle) { + InstrumentedMutexLock l(&column_family_mutex_); + Status s = VerifyCFOptions(options); + if (!s.ok()) { + return s; + } + + s = db_->CreateColumnFamily(options, column_family_name, handle); + if (s.ok()) { + lock_manager_->AddColumnFamily(*handle); + UpdateCFComparatorMap(*handle); + } + + return s; +} + +Status PessimisticTransactionDB::CreateColumnFamilies( + const ColumnFamilyOptions& options, + const std::vector<std::string>& column_family_names, + std::vector<ColumnFamilyHandle*>* handles) { + InstrumentedMutexLock l(&column_family_mutex_); + + Status s = VerifyCFOptions(options); + if (!s.ok()) { + return s; + } + + s = db_->CreateColumnFamilies(options, column_family_names, handles); + if (s.ok()) { + for (auto* handle : *handles) { + lock_manager_->AddColumnFamily(handle); + UpdateCFComparatorMap(handle); + } + } + + return s; +} + +Status PessimisticTransactionDB::CreateColumnFamilies( + const std::vector<ColumnFamilyDescriptor>& column_families, + std::vector<ColumnFamilyHandle*>* handles) { + InstrumentedMutexLock l(&column_family_mutex_); + + for (auto& cf_desc : column_families) { + Status s = VerifyCFOptions(cf_desc.options); + if (!s.ok()) { + return s; + } + } + + Status s = db_->CreateColumnFamilies(column_families, handles); + if (s.ok()) { + for (auto* handle : *handles) { + lock_manager_->AddColumnFamily(handle); + UpdateCFComparatorMap(handle); + } + } + + return s; +} + +// Let LockManager know that it can deallocate the LockMap for this +// column family. 
+Status PessimisticTransactionDB::DropColumnFamily( + ColumnFamilyHandle* column_family) { + InstrumentedMutexLock l(&column_family_mutex_); + + Status s = db_->DropColumnFamily(column_family); + if (s.ok()) { + lock_manager_->RemoveColumnFamily(column_family); + } + + return s; +} + +Status PessimisticTransactionDB::DropColumnFamilies( + const std::vector<ColumnFamilyHandle*>& column_families) { + InstrumentedMutexLock l(&column_family_mutex_); + + Status s = db_->DropColumnFamilies(column_families); + if (s.ok()) { + for (auto* handle : column_families) { + lock_manager_->RemoveColumnFamily(handle); + } + } + + return s; +} + +Status PessimisticTransactionDB::TryLock(PessimisticTransaction* txn, + uint32_t cfh_id, + const std::string& key, + bool exclusive) { + return lock_manager_->TryLock(txn, cfh_id, key, GetEnv(), exclusive); +} + +Status PessimisticTransactionDB::TryRangeLock(PessimisticTransaction* txn, + uint32_t cfh_id, + const Endpoint& start_endp, + const Endpoint& end_endp) { + return lock_manager_->TryLock(txn, cfh_id, start_endp, end_endp, GetEnv(), + /*exclusive=*/true); +} + +void PessimisticTransactionDB::UnLock(PessimisticTransaction* txn, + const LockTracker& keys) { + lock_manager_->UnLock(txn, keys, GetEnv()); +} + +void PessimisticTransactionDB::UnLock(PessimisticTransaction* txn, + uint32_t cfh_id, const std::string& key) { + lock_manager_->UnLock(txn, cfh_id, key, GetEnv()); +} + +// Used when wrapping DB write operations in a transaction +Transaction* PessimisticTransactionDB::BeginInternalTransaction( + const WriteOptions& options) { + TransactionOptions txn_options; + Transaction* txn = BeginTransaction(options, txn_options, nullptr); + + // Use default timeout for non-transactional writes + txn->SetLockTimeout(txn_db_options_.default_lock_timeout); + return txn; +} + +// All user Put, Merge, Delete, and Write requests must be intercepted to make +// sure that they lock all keys that they are writing to avoid causing conflicts +// with any concurrent transactions. The easiest way to do this is to wrap all +// write operations in a transaction. +// +// Put(), Merge(), and Delete() only lock a single key per call. Write() will +// sort its keys before locking them. This guarantees that TransactionDB write +// methods cannot deadlock with each other (but still could deadlock with a +// Transaction). +Status PessimisticTransactionDB::Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, const Slice& val) { + Status s = FailIfCfEnablesTs(this, column_family); + if (!s.ok()) { + return s; + } + + Transaction* txn = BeginInternalTransaction(options); + txn->DisableIndexing(); + + // Since the client didn't create a transaction, they don't care about + // conflict checking for this write. So we just need to do PutUntracked(). + s = txn->PutUntracked(column_family, key, val); + + if (s.ok()) { + s = txn->Commit(); + } + + delete txn; + + return s; +} + +Status PessimisticTransactionDB::Delete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) { + Status s = FailIfCfEnablesTs(this, column_family); + if (!s.ok()) { + return s; + } + + Transaction* txn = BeginInternalTransaction(wopts); + txn->DisableIndexing(); + + // Since the client didn't create a transaction, they don't care about + // conflict checking for this write. So we just need to do + // DeleteUntracked(). 
+ s = txn->DeleteUntracked(column_family, key); + + if (s.ok()) { + s = txn->Commit(); + } + + delete txn; + + return s; +} + +Status PessimisticTransactionDB::SingleDelete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) { + Status s = FailIfCfEnablesTs(this, column_family); + if (!s.ok()) { + return s; + } + + Transaction* txn = BeginInternalTransaction(wopts); + txn->DisableIndexing(); + + // Since the client didn't create a transaction, they don't care about + // conflict checking for this write. So we just need to do + // SingleDeleteUntracked(). + s = txn->SingleDeleteUntracked(column_family, key); + + if (s.ok()) { + s = txn->Commit(); + } + + delete txn; + + return s; +} + +Status PessimisticTransactionDB::Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) { + Status s = FailIfCfEnablesTs(this, column_family); + if (!s.ok()) { + return s; + } + + Transaction* txn = BeginInternalTransaction(options); + txn->DisableIndexing(); + + // Since the client didn't create a transaction, they don't care about + // conflict checking for this write. So we just need to do + // MergeUntracked(). + s = txn->MergeUntracked(column_family, key, value); + + if (s.ok()) { + s = txn->Commit(); + } + + delete txn; + + return s; +} + +Status PessimisticTransactionDB::Write(const WriteOptions& opts, + WriteBatch* updates) { + return WriteWithConcurrencyControl(opts, updates); +} + +Status WriteCommittedTxnDB::Write(const WriteOptions& opts, + WriteBatch* updates) { + Status s = FailIfBatchHasTs(updates); + if (!s.ok()) { + return s; + } + if (txn_db_options_.skip_concurrency_control) { + return db_impl_->Write(opts, updates); + } else { + return WriteWithConcurrencyControl(opts, updates); + } +} + +Status WriteCommittedTxnDB::Write( + const WriteOptions& opts, + const TransactionDBWriteOptimizations& optimizations, WriteBatch* updates) { + Status s = FailIfBatchHasTs(updates); + if (!s.ok()) { + return s; + } + if (optimizations.skip_concurrency_control) { + return db_impl_->Write(opts, updates); + } else { + return WriteWithConcurrencyControl(opts, updates); + } +} + +void PessimisticTransactionDB::InsertExpirableTransaction( + TransactionID tx_id, PessimisticTransaction* tx) { + assert(tx->GetExpirationTime() > 0); + std::lock_guard<std::mutex> lock(map_mutex_); + expirable_transactions_map_.insert({tx_id, tx}); +} + +void PessimisticTransactionDB::RemoveExpirableTransaction(TransactionID tx_id) { + std::lock_guard<std::mutex> lock(map_mutex_); + expirable_transactions_map_.erase(tx_id); +} + +bool PessimisticTransactionDB::TryStealingExpiredTransactionLocks( + TransactionID tx_id) { + std::lock_guard<std::mutex> lock(map_mutex_); + + auto tx_it = expirable_transactions_map_.find(tx_id); + if (tx_it == expirable_transactions_map_.end()) { + return true; + } + PessimisticTransaction& tx = *(tx_it->second); + return tx.TryStealingLocks(); +} + +void PessimisticTransactionDB::ReinitializeTransaction( + Transaction* txn, const WriteOptions& write_options, + const TransactionOptions& txn_options) { + auto txn_impl = static_cast_with_check<PessimisticTransaction>(txn); + + txn_impl->Reinitialize(this, write_options, txn_options); +} + +Transaction* PessimisticTransactionDB::GetTransactionByName( + const TransactionName& name) { + std::lock_guard<std::mutex> lock(name_map_mutex_); + auto it = transactions_.find(name); + if (it == transactions_.end()) { + return nullptr; + } else { + return it->second; + } +} + +void 
PessimisticTransactionDB::GetAllPreparedTransactions( + std::vector<Transaction*>* transv) { + assert(transv); + transv->clear(); + std::lock_guard<std::mutex> lock(name_map_mutex_); + for (auto it = transactions_.begin(); it != transactions_.end(); ++it) { + if (it->second->GetState() == Transaction::PREPARED) { + transv->push_back(it->second); + } + } +} + +LockManager::PointLockStatus PessimisticTransactionDB::GetLockStatusData() { + return lock_manager_->GetPointLockStatus(); +} + +std::vector<DeadlockPath> PessimisticTransactionDB::GetDeadlockInfoBuffer() { + return lock_manager_->GetDeadlockInfoBuffer(); +} + +void PessimisticTransactionDB::SetDeadlockInfoBufferSize(uint32_t target_size) { + lock_manager_->Resize(target_size); +} + +void PessimisticTransactionDB::RegisterTransaction(Transaction* txn) { + assert(txn); + assert(txn->GetName().length() > 0); + assert(GetTransactionByName(txn->GetName()) == nullptr); + assert(txn->GetState() == Transaction::STARTED); + std::lock_guard<std::mutex> lock(name_map_mutex_); + transactions_[txn->GetName()] = txn; +} + +void PessimisticTransactionDB::UnregisterTransaction(Transaction* txn) { + assert(txn); + std::lock_guard<std::mutex> lock(name_map_mutex_); + auto it = transactions_.find(txn->GetName()); + assert(it != transactions_.end()); + transactions_.erase(it); +} + +std::pair<Status, std::shared_ptr<const Snapshot>> +PessimisticTransactionDB::CreateTimestampedSnapshot(TxnTimestamp ts) { + if (kMaxTxnTimestamp == ts) { + return std::make_pair(Status::InvalidArgument("invalid ts"), nullptr); + } + assert(db_impl_); + return db_impl_->CreateTimestampedSnapshot(kMaxSequenceNumber, ts); +} + +std::shared_ptr<const Snapshot> +PessimisticTransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const { + assert(db_impl_); + return db_impl_->GetTimestampedSnapshot(ts); +} + +void PessimisticTransactionDB::ReleaseTimestampedSnapshotsOlderThan( + TxnTimestamp ts) { + assert(db_impl_); + db_impl_->ReleaseTimestampedSnapshotsOlderThan(ts); +} + +Status PessimisticTransactionDB::GetTimestampedSnapshots( + TxnTimestamp ts_lb, TxnTimestamp ts_ub, + std::vector<std::shared_ptr<const Snapshot>>& timestamped_snapshots) const { + assert(db_impl_); + return db_impl_->GetTimestampedSnapshots(ts_lb, ts_ub, timestamped_snapshots); +} + +Status SnapshotCreationCallback::operator()(SequenceNumber seq, + bool disable_memtable) { + assert(db_impl_); + assert(commit_ts_ != kMaxTxnTimestamp); + + const bool two_write_queues = + db_impl_->immutable_db_options().two_write_queues; + assert(!two_write_queues || !disable_memtable); +#ifdef NDEBUG + (void)two_write_queues; + (void)disable_memtable; +#endif + + const bool seq_per_batch = db_impl_->seq_per_batch(); + if (!seq_per_batch) { + assert(db_impl_->GetLastPublishedSequence() <= seq); + } else { + assert(db_impl_->GetLastPublishedSequence() < seq); + } + + // Create a snapshot which can also be used for write conflict checking. 
+ auto ret = db_impl_->CreateTimestampedSnapshot(seq, commit_ts_); + snapshot_creation_status_ = ret.first; + snapshot_ = ret.second; + if (snapshot_creation_status_.ok()) { + assert(snapshot_); + } else { + assert(!snapshot_); + } + if (snapshot_ && snapshot_notifier_) { + snapshot_notifier_->SnapshotCreated(snapshot_.get()); + } + return Status::OK(); +} + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/pessimistic_transaction_db.h b/src/rocksdb/utilities/transactions/pessimistic_transaction_db.h new file mode 100644 index 000000000..25cd11054 --- /dev/null +++ b/src/rocksdb/utilities/transactions/pessimistic_transaction_db.h @@ -0,0 +1,318 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include <mutex> +#include <queue> +#include <set> +#include <string> +#include <unordered_map> +#include <vector> + +#include "db/db_iter.h" +#include "db/read_callback.h" +#include "db/snapshot_checker.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/utilities/transaction_db.h" +#include "util/cast_util.h" +#include "utilities/transactions/lock/lock_manager.h" +#include "utilities/transactions/lock/range/range_lock_manager.h" +#include "utilities/transactions/pessimistic_transaction.h" +#include "utilities/transactions/write_prepared_txn.h" + +namespace ROCKSDB_NAMESPACE { + +class PessimisticTransactionDB : public TransactionDB { + public: + explicit PessimisticTransactionDB(DB* db, + const TransactionDBOptions& txn_db_options); + + explicit PessimisticTransactionDB(StackableDB* db, + const TransactionDBOptions& txn_db_options); + + virtual ~PessimisticTransactionDB(); + + virtual const Snapshot* GetSnapshot() override { return db_->GetSnapshot(); } + + virtual Status Initialize( + const std::vector<size_t>& compaction_enabled_cf_indices, + const std::vector<ColumnFamilyHandle*>& handles); + + Transaction* BeginTransaction(const WriteOptions& write_options, + const TransactionOptions& txn_options, + Transaction* old_txn) override = 0; + + using StackableDB::Put; + virtual Status Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& val) override; + + using StackableDB::Delete; + virtual Status Delete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) override; + + using StackableDB::SingleDelete; + virtual Status SingleDelete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) override; + + using StackableDB::Merge; + virtual Status Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + + using TransactionDB::Write; + virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; + inline Status WriteWithConcurrencyControl(const WriteOptions& opts, + WriteBatch* updates) { + Status s; + if (opts.protection_bytes_per_key > 0) { + s = WriteBatchInternal::UpdateProtectionInfo( + updates, opts.protection_bytes_per_key); + } + if (s.ok()) { + // Need to lock all keys in this batch to prevent write conflicts with + // concurrent transactions. 
+ Transaction* txn = BeginInternalTransaction(opts); + txn->DisableIndexing(); + + auto txn_impl = static_cast_with_check<PessimisticTransaction>(txn); + + // Since CommitBatch sorts the keys before locking, concurrent Write() + // operations will not cause a deadlock. + // In order to avoid a deadlock with a concurrent Transaction, + // Transactions should use a lock timeout. + s = txn_impl->CommitBatch(updates); + + delete txn; + } + + return s; + } + + using StackableDB::CreateColumnFamily; + virtual Status CreateColumnFamily(const ColumnFamilyOptions& options, + const std::string& column_family_name, + ColumnFamilyHandle** handle) override; + + Status CreateColumnFamilies( + const ColumnFamilyOptions& options, + const std::vector<std::string>& column_family_names, + std::vector<ColumnFamilyHandle*>* handles) override; + + Status CreateColumnFamilies( + const std::vector<ColumnFamilyDescriptor>& column_families, + std::vector<ColumnFamilyHandle*>* handles) override; + + using StackableDB::DropColumnFamily; + virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override; + + Status DropColumnFamilies( + const std::vector<ColumnFamilyHandle*>& column_families) override; + + Status TryLock(PessimisticTransaction* txn, uint32_t cfh_id, + const std::string& key, bool exclusive); + Status TryRangeLock(PessimisticTransaction* txn, uint32_t cfh_id, + const Endpoint& start_endp, const Endpoint& end_endp); + + void UnLock(PessimisticTransaction* txn, const LockTracker& keys); + void UnLock(PessimisticTransaction* txn, uint32_t cfh_id, + const std::string& key); + + void AddColumnFamily(const ColumnFamilyHandle* handle); + + static TransactionDBOptions ValidateTxnDBOptions( + const TransactionDBOptions& txn_db_options); + + const TransactionDBOptions& GetTxnDBOptions() const { + return txn_db_options_; + } + + void InsertExpirableTransaction(TransactionID tx_id, + PessimisticTransaction* tx); + void RemoveExpirableTransaction(TransactionID tx_id); + + // If the transaction is no longer available, locks can be stolen. + // If the transaction is available, try stealing locks directly from it. + // It is the caller's responsibility to ensure that the referred transaction + // is expirable (GetExpirationTime() > 0) and that it is expired. + bool TryStealingExpiredTransactionLocks(TransactionID tx_id); + + Transaction* GetTransactionByName(const TransactionName& name) override; + + void RegisterTransaction(Transaction* txn); + void UnregisterTransaction(Transaction* txn); + + // Not thread safe. The current use case is during recovery (single thread). + void GetAllPreparedTransactions(std::vector<Transaction*>* trans) override; + + LockManager::PointLockStatus GetLockStatusData() override; + + std::vector<DeadlockPath> GetDeadlockInfoBuffer() override; + void SetDeadlockInfoBufferSize(uint32_t target_size) override; + + // The default implementation does nothing. The actual implementation is moved + // to the child classes that actually need this information. This was due to + // an odd performance drop we observed when we added a std::atomic member to + // the base class, even when the subclasses do not read it in the fast path. + virtual void UpdateCFComparatorMap(const std::vector<ColumnFamilyHandle*>&) {} + virtual void UpdateCFComparatorMap(ColumnFamilyHandle*) {} + + // Use the returned factory to create LockTrackers in transactions.
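+ // For example (a sketch; this mirrors how TransactionBaseImpl allocates its
+ // own tracker from the factory):
+ //
+ //   std::unique_ptr<LockTracker> tracker(
+ //       txn_db->GetLockTrackerFactory().Create());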
+ const LockTrackerFactory& GetLockTrackerFactory() const { + return lock_manager_->GetLockTrackerFactory(); + } + + std::pair<Status, std::shared_ptr<const Snapshot>> CreateTimestampedSnapshot( + TxnTimestamp ts) override; + + std::shared_ptr<const Snapshot> GetTimestampedSnapshot( + TxnTimestamp ts) const override; + + void ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts) override; + + Status GetTimestampedSnapshots(TxnTimestamp ts_lb, TxnTimestamp ts_ub, + std::vector<std::shared_ptr<const Snapshot>>& + timestamped_snapshots) const override; + + protected: + DBImpl* db_impl_; + std::shared_ptr<Logger> info_log_; + const TransactionDBOptions txn_db_options_; + + static Status FailIfBatchHasTs(const WriteBatch* wb); + + static Status FailIfCfEnablesTs(const DB* db, + const ColumnFamilyHandle* column_family); + + void ReinitializeTransaction( + Transaction* txn, const WriteOptions& write_options, + const TransactionOptions& txn_options = TransactionOptions()); + + virtual Status VerifyCFOptions(const ColumnFamilyOptions& cf_options); + + private: + friend class WritePreparedTxnDB; + friend class WritePreparedTxnDBMock; + friend class WriteUnpreparedTxn; + friend class TransactionTest_DoubleCrashInRecovery_Test; + friend class TransactionTest_DoubleEmptyWrite_Test; + friend class TransactionTest_DuplicateKeys_Test; + friend class TransactionTest_PersistentTwoPhaseTransactionTest_Test; + friend class TransactionTest_TwoPhaseDoubleRecoveryTest_Test; + friend class TransactionTest_TwoPhaseOutOfOrderDelete_Test; + friend class TransactionStressTest_TwoPhaseLongPrepareTest_Test; + friend class WriteUnpreparedTransactionTest_RecoveryTest_Test; + friend class WriteUnpreparedTransactionTest_MarkLogWithPrepSection_Test; + + Transaction* BeginInternalTransaction(const WriteOptions& options); + + std::shared_ptr<LockManager> lock_manager_; + + // Must be held when adding/dropping column families. + InstrumentedMutex column_family_mutex_; + + // Used to ensure that no locks are stolen from an expirable transaction + // that has started a commit. Only transactions with an expiration time + // should be in this map. + std::mutex map_mutex_; + std::unordered_map<TransactionID, PessimisticTransaction*> + expirable_transactions_map_; + + // map from name to two phase transaction instance + std::mutex name_map_mutex_; + std::unordered_map<TransactionName, Transaction*> transactions_; + + // Signal that we are testing a crash scenario. Some asserts could be relaxed + // in such cases. + virtual void TEST_Crash() {} +}; + +// A PessimisticTransactionDB that writes the data to the DB after the commit. +// In this way the DB only contains the committed data. 
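+ // A hedged usage sketch (the path is a placeholder): WRITE_COMMITTED is the
+ // default TxnDBWritePolicy, so opening a TransactionDB as below yields a
+ // WriteCommittedTxnDB under the hood:
+ //
+ //   Options options;
+ //   options.create_if_missing = true;
+ //   TransactionDBOptions txn_db_options;
+ //   txn_db_options.write_policy = WRITE_COMMITTED;
+ //   TransactionDB* txn_db = nullptr;
+ //   Status s = TransactionDB::Open(options, txn_db_options,
+ //                                  "/tmp/txn_db_example", &txn_db);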
+class WriteCommittedTxnDB : public PessimisticTransactionDB { + public: + explicit WriteCommittedTxnDB(DB* db, + const TransactionDBOptions& txn_db_options) + : PessimisticTransactionDB(db, txn_db_options) {} + + explicit WriteCommittedTxnDB(StackableDB* db, + const TransactionDBOptions& txn_db_options) + : PessimisticTransactionDB(db, txn_db_options) {} + + virtual ~WriteCommittedTxnDB() {} + + Transaction* BeginTransaction(const WriteOptions& write_options, + const TransactionOptions& txn_options, + Transaction* old_txn) override; + + // Optimized version of ::Write that makes use of skip_concurrency_control + // hint + using TransactionDB::Write; + virtual Status Write(const WriteOptions& opts, + const TransactionDBWriteOptimizations& optimizations, + WriteBatch* updates) override; + virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; +}; + +inline Status PessimisticTransactionDB::FailIfBatchHasTs( + const WriteBatch* batch) { + if (batch != nullptr && WriteBatchInternal::HasKeyWithTimestamp(*batch)) { + return Status::NotSupported( + "Writes with timestamp must go through transaction API instead of " + "TransactionDB."); + } + return Status::OK(); +} + +inline Status PessimisticTransactionDB::FailIfCfEnablesTs( + const DB* db, const ColumnFamilyHandle* column_family) { + assert(db); + column_family = column_family ? column_family : db->DefaultColumnFamily(); + assert(column_family); + const Comparator* const ucmp = column_family->GetComparator(); + assert(ucmp); + if (ucmp->timestamp_size() > 0) { + return Status::NotSupported( + "Write operation with user timestamp must go through the transaction " + "API instead of TransactionDB."); + } + return Status::OK(); +} + +class SnapshotCreationCallback : public PostMemTableCallback { + public: + explicit SnapshotCreationCallback( + DBImpl* dbi, TxnTimestamp commit_ts, + const std::shared_ptr<TransactionNotifier>& notifier, + std::shared_ptr<const Snapshot>& snapshot) + : db_impl_(dbi), + commit_ts_(commit_ts), + snapshot_notifier_(notifier), + snapshot_(snapshot) { + assert(db_impl_); + } + + ~SnapshotCreationCallback() override { + snapshot_creation_status_.PermitUncheckedError(); + } + + Status operator()(SequenceNumber seq, bool disable_memtable) override; + + private: + DBImpl* const db_impl_; + const TxnTimestamp commit_ts_; + std::shared_ptr<TransactionNotifier> snapshot_notifier_; + std::shared_ptr<const Snapshot>& snapshot_; + + Status snapshot_creation_status_; +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/snapshot_checker.cc b/src/rocksdb/utilities/transactions/snapshot_checker.cc new file mode 100644 index 000000000..76d16681a --- /dev/null +++ b/src/rocksdb/utilities/transactions/snapshot_checker.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/snapshot_checker.h" + +#ifdef ROCKSDB_LITE +#include <assert.h> +#endif // ROCKSDB_LITE + +#include "port/lang.h" +#include "utilities/transactions/write_prepared_txn_db.h" + +namespace ROCKSDB_NAMESPACE { + +#ifdef ROCKSDB_LITE +WritePreparedSnapshotChecker::WritePreparedSnapshotChecker( + WritePreparedTxnDB* /*txn_db*/) {} + +SnapshotCheckerResult WritePreparedSnapshotChecker::CheckInSnapshot( + SequenceNumber /*sequence*/, SequenceNumber /*snapshot_sequence*/) const { + // Should never be called in LITE mode. + assert(false); + return SnapshotCheckerResult::kInSnapshot; +} + +#else + +WritePreparedSnapshotChecker::WritePreparedSnapshotChecker( + WritePreparedTxnDB* txn_db) + : txn_db_(txn_db){}; + +SnapshotCheckerResult WritePreparedSnapshotChecker::CheckInSnapshot( + SequenceNumber sequence, SequenceNumber snapshot_sequence) const { + bool snapshot_released = false; + // TODO(myabandeh): set min_uncommitted + bool in_snapshot = txn_db_->IsInSnapshot( + sequence, snapshot_sequence, kMinUnCommittedSeq, &snapshot_released); + if (snapshot_released) { + return SnapshotCheckerResult::kSnapshotReleased; + } + return in_snapshot ? SnapshotCheckerResult::kInSnapshot + : SnapshotCheckerResult::kNotInSnapshot; +} + +#endif // ROCKSDB_LITE + +DisableGCSnapshotChecker* DisableGCSnapshotChecker::Instance() { + STATIC_AVOID_DESTRUCTION(DisableGCSnapshotChecker, instance); + return &instance; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/utilities/transactions/timestamped_snapshot_test.cc b/src/rocksdb/utilities/transactions/timestamped_snapshot_test.cc new file mode 100644 index 000000000..e9b474415 --- /dev/null +++ b/src/rocksdb/utilities/transactions/timestamped_snapshot_test.cc @@ -0,0 +1,466 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifdef ROCKSDB_LITE +#include <cstdio> + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED as Transactions are not supported in LITE mode\n"); + return 0; +} +#else // ROCKSDB_LITE +#include <cassert> + +#include "util/cast_util.h" +#include "utilities/transactions/transaction_test.h" + +namespace ROCKSDB_NAMESPACE { +INSTANTIATE_TEST_CASE_P( + Unsupported, TimestampedSnapshotWithTsSanityCheck, + ::testing::Values( + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite), + std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite))); + +INSTANTIATE_TEST_CASE_P(WriteCommitted, TransactionTest, + ::testing::Combine(::testing::Bool(), ::testing::Bool(), + ::testing::Values(WRITE_COMMITTED), + ::testing::Values(kOrderedWrite))); + +namespace { +// Not thread-safe. Caller needs to provide external synchronization. 
+class TsCheckingTxnNotifier : public TransactionNotifier { + public: + explicit TsCheckingTxnNotifier() = default; + + ~TsCheckingTxnNotifier() override {} + + void SnapshotCreated(const Snapshot* new_snapshot) override { + assert(new_snapshot); + if (prev_snapshot_seq_ != kMaxSequenceNumber) { + assert(prev_snapshot_seq_ <= new_snapshot->GetSequenceNumber()); + } + prev_snapshot_seq_ = new_snapshot->GetSequenceNumber(); + if (prev_snapshot_ts_ != kMaxTxnTimestamp) { + assert(prev_snapshot_ts_ <= new_snapshot->GetTimestamp()); + } + prev_snapshot_ts_ = new_snapshot->GetTimestamp(); + } + + TxnTimestamp prev_snapshot_ts() const { return prev_snapshot_ts_; } + + private: + SequenceNumber prev_snapshot_seq_ = kMaxSequenceNumber; + TxnTimestamp prev_snapshot_ts_ = kMaxTxnTimestamp; +}; +} // anonymous namespace + +TEST_P(TimestampedSnapshotWithTsSanityCheck, WithoutCommitTs) { + std::unique_ptr<Transaction> txn( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + assert(txn); + ASSERT_OK(txn->SetName("txn0")); + ASSERT_OK(txn->Put("a", "v")); + ASSERT_OK(txn->Prepare()); + Status s = txn->CommitAndTryCreateSnapshot(); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_OK(txn->Rollback()); + + txn.reset(db->BeginTransaction(WriteOptions(), TransactionOptions())); + assert(txn); + ASSERT_OK(txn->SetName("txn0")); + ASSERT_OK(txn->Put("a", "v")); + s = txn->CommitAndTryCreateSnapshot(); + ASSERT_TRUE(s.IsInvalidArgument()); +} + +TEST_P(TimestampedSnapshotWithTsSanityCheck, SetCommitTs) { + std::unique_ptr<Transaction> txn( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + assert(txn); + ASSERT_OK(txn->SetName("txn0")); + ASSERT_OK(txn->Put("a", "v")); + ASSERT_OK(txn->Prepare()); + std::shared_ptr<const Snapshot> snapshot; + Status s = txn->CommitAndTryCreateSnapshot(nullptr, 10, &snapshot); + ASSERT_TRUE(s.IsNotSupported()); + ASSERT_OK(txn->Rollback()); + + txn.reset(db->BeginTransaction(WriteOptions(), TransactionOptions())); + assert(txn); + ASSERT_OK(txn->SetName("txn0")); + ASSERT_OK(txn->Put("a", "v")); + s = txn->CommitAndTryCreateSnapshot(nullptr, 10, &snapshot); + ASSERT_TRUE(s.IsNotSupported()); +} + +TEST_P(TransactionTest, WithoutCommitTs) { + std::unique_ptr<Transaction> txn( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + assert(txn); + ASSERT_OK(txn->SetName("txn0")); + ASSERT_OK(txn->Put("a", "v")); + ASSERT_OK(txn->Prepare()); + Status s = txn->CommitAndTryCreateSnapshot(); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_OK(txn->Rollback()); + + txn.reset(db->BeginTransaction(WriteOptions(), TransactionOptions())); + assert(txn); + ASSERT_OK(txn->SetName("txn0")); + ASSERT_OK(txn->Put("a", "v")); + s = txn->CommitAndTryCreateSnapshot(); + ASSERT_TRUE(s.IsInvalidArgument()); +} + +TEST_P(TransactionTest, ReuseExistingTxn) { + Transaction* txn = db->BeginTransaction(WriteOptions(), TransactionOptions()); + assert(txn); + ASSERT_OK(txn->SetName("txn0")); + ASSERT_OK(txn->Put("a", "v1")); + ASSERT_OK(txn->Prepare()); + + auto notifier = std::make_shared<TsCheckingTxnNotifier>(); + std::shared_ptr<const Snapshot> snapshot1; + Status s = + txn->CommitAndTryCreateSnapshot(notifier, /*commit_ts=*/100, &snapshot1); + ASSERT_OK(s); + ASSERT_EQ(100, snapshot1->GetTimestamp()); + + Transaction* txn1 = + db->BeginTransaction(WriteOptions(), TransactionOptions(), txn); + assert(txn1 == txn); + ASSERT_OK(txn1->SetName("txn1")); + ASSERT_OK(txn->Put("a", "v2")); + ASSERT_OK(txn->Prepare()); + std::shared_ptr<const Snapshot> snapshot2; + s = 
txn->CommitAndTryCreateSnapshot(notifier, /*commit_ts=*/110, &snapshot2); + ASSERT_OK(s); + ASSERT_EQ(110, snapshot2->GetTimestamp()); + delete txn; + + { + std::string value; + ReadOptions read_opts; + read_opts.snapshot = snapshot1.get(); + ASSERT_OK(db->Get(read_opts, "a", &value)); + ASSERT_EQ("v1", value); + + read_opts.snapshot = snapshot2.get(); + ASSERT_OK(db->Get(read_opts, "a", &value)); + ASSERT_EQ("v2", value); + } +} + +TEST_P(TransactionTest, CreateSnapshotWhenCommit) { + std::unique_ptr<Transaction> txn( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + assert(txn); + + constexpr int batch_size = 10; + for (int i = 0; i < batch_size; ++i) { + ASSERT_OK(db->Put(WriteOptions(), "k" + std::to_string(i), "v0")); + } + const SequenceNumber seq0 = db->GetLatestSequenceNumber(); + ASSERT_EQ(static_cast<SequenceNumber>(batch_size), seq0); + + txn->SetSnapshot(); + { + const Snapshot* const snapshot = txn->GetSnapshot(); + assert(snapshot); + ASSERT_EQ(seq0, snapshot->GetSequenceNumber()); + } + + for (int i = 0; i < batch_size; ++i) { + ASSERT_OK(txn->Put("k" + std::to_string(i), "v1")); + } + ASSERT_OK(txn->SetName("txn0")); + ASSERT_OK(txn->Prepare()); + + std::shared_ptr<const Snapshot> snapshot; + constexpr TxnTimestamp timestamp = 1; + auto notifier = std::make_shared<TsCheckingTxnNotifier>(); + Status s = txn->CommitAndTryCreateSnapshot(notifier, timestamp, &snapshot); + ASSERT_OK(s); + ASSERT_LT(notifier->prev_snapshot_ts(), kMaxTxnTimestamp); + assert(snapshot); + ASSERT_EQ(timestamp, snapshot->GetTimestamp()); + ASSERT_EQ(seq0 + batch_size, snapshot->GetSequenceNumber()); + const Snapshot* const raw_snapshot_ptr = txn->GetSnapshot(); + ASSERT_EQ(raw_snapshot_ptr, snapshot.get()); + ASSERT_EQ(snapshot, txn->GetTimestampedSnapshot()); + + { + std::shared_ptr<const Snapshot> snapshot1 = + db->GetLatestTimestampedSnapshot(); + ASSERT_EQ(snapshot, snapshot1); + } + { + std::shared_ptr<const Snapshot> snapshot1 = + db->GetTimestampedSnapshot(timestamp); + ASSERT_EQ(snapshot, snapshot1); + } + { + std::vector<std::shared_ptr<const Snapshot> > snapshots; + s = db->GetAllTimestampedSnapshots(snapshots); + ASSERT_OK(s); + ASSERT_EQ(std::vector<std::shared_ptr<const Snapshot> >{snapshot}, + snapshots); + } +} + +TEST_P(TransactionTest, CreateSnapshot) { + // First create a non-timestamped snapshot + ManagedSnapshot snapshot_guard(db); + for (int i = 0; i < 10; ++i) { + ASSERT_OK(db->Put(WriteOptions(), "k" + std::to_string(i), + "v0_" + std::to_string(i))); + } + { + auto ret = db->CreateTimestampedSnapshot(kMaxTxnTimestamp); + ASSERT_TRUE(ret.first.IsInvalidArgument()); + auto snapshot = ret.second; + ASSERT_EQ(nullptr, snapshot.get()); + } + constexpr TxnTimestamp timestamp = 100; + Status s; + std::shared_ptr<const Snapshot> ts_snap0; + std::tie(s, ts_snap0) = db->CreateTimestampedSnapshot(timestamp); + ASSERT_OK(s); + assert(ts_snap0); + ASSERT_EQ(timestamp, ts_snap0->GetTimestamp()); + for (int i = 0; i < 10; ++i) { + ASSERT_OK(db->Delete(WriteOptions(), "k" + std::to_string(i))); + } + { + ReadOptions read_opts; + read_opts.snapshot = ts_snap0.get(); + for (int i = 0; i < 10; ++i) { + std::string value; + s = db->Get(read_opts, "k" + std::to_string(i), &value); + ASSERT_OK(s); + ASSERT_EQ("v0_" + std::to_string(i), value); + } + } + { + std::shared_ptr<const Snapshot> snapshot = + db->GetLatestTimestampedSnapshot(); + ASSERT_EQ(ts_snap0, snapshot); + } + { + std::shared_ptr<const Snapshot> snapshot = + db->GetTimestampedSnapshot(timestamp); + ASSERT_OK(s); + 
ASSERT_EQ(ts_snap0, snapshot); + } + { + std::vector<std::shared_ptr<const Snapshot> > snapshots; + s = db->GetAllTimestampedSnapshots(snapshots); + ASSERT_OK(s); + ASSERT_EQ(std::vector<std::shared_ptr<const Snapshot> >{ts_snap0}, + snapshots); + } +} + +TEST_P(TransactionTest, SequenceAndTsOrder) { + Status s; + std::shared_ptr<const Snapshot> snapshot; + std::tie(s, snapshot) = db->CreateTimestampedSnapshot(100); + ASSERT_OK(s); + assert(snapshot); + { + // Cannot request a smaller timestamp for the new timestamped snapshot. + std::shared_ptr<const Snapshot> tmp_snapshot; + std::tie(s, tmp_snapshot) = db->CreateTimestampedSnapshot(50); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_EQ(nullptr, tmp_snapshot.get()); + } + + // If requesting a new timestamped snapshot with the same timestamp and + // sequence number, we avoid creating a new snapshot object and reuse the + // existing one. + std::shared_ptr<const Snapshot> snapshot1; + std::tie(s, snapshot1) = db->CreateTimestampedSnapshot(100); + ASSERT_OK(s); + ASSERT_EQ(snapshot.get(), snapshot1.get()); + + // If there is no write, but we request a larger timestamp, we still create + // a new snapshot object. + std::shared_ptr<const Snapshot> snapshot2; + std::tie(s, snapshot2) = db->CreateTimestampedSnapshot(200); + ASSERT_OK(s); + assert(snapshot2); + ASSERT_NE(snapshot.get(), snapshot2.get()); + ASSERT_EQ(snapshot2->GetSequenceNumber(), snapshot->GetSequenceNumber()); + ASSERT_EQ(200, snapshot2->GetTimestamp()); + + // Increase sequence number. + ASSERT_OK(db->Put(WriteOptions(), "foo", "v0")); + { + // We are requesting the same timestamp for a larger sequence number, thus + // we cannot create a timestamped snapshot. + std::shared_ptr<const Snapshot> tmp_snapshot; + std::tie(s, tmp_snapshot) = db->CreateTimestampedSnapshot(200); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_EQ(nullptr, tmp_snapshot.get()); + } + { + std::unique_ptr<Transaction> txn1( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + ASSERT_OK(txn1->Put("bar", "v0")); + std::shared_ptr<const Snapshot> ss; + ASSERT_OK(txn1->CommitAndTryCreateSnapshot(nullptr, 200, &ss)); + // Cannot create snapshot because requested timestamp is the same as the + // latest timestamped snapshot while sequence number is strictly higher. + ASSERT_EQ(nullptr, ss); + } + { + std::unique_ptr<Transaction> txn2( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + ASSERT_OK(txn2->Put("bar", "v0")); + std::shared_ptr<const Snapshot> ss; + // Application should never do this. This is just to demonstrate error + // handling. + ASSERT_OK(txn2->CommitAndTryCreateSnapshot(nullptr, 100, &ss)); + // Cannot create snapshot because requested timestamp is smaller than + // latest timestamped snapshot.
+ ASSERT_EQ(nullptr, ss); + } +} + +TEST_P(TransactionTest, CloseDbWithSnapshots) { + std::unique_ptr<Transaction> txn( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + ASSERT_OK(txn->SetName("txn0")); + ASSERT_OK(txn->Put("foo", "v")); + ASSERT_OK(txn->Prepare()); + std::shared_ptr<const Snapshot> snapshot; + constexpr TxnTimestamp timestamp = 121; + auto notifier = std::make_shared<TsCheckingTxnNotifier>(); + ASSERT_OK(txn->CommitAndTryCreateSnapshot(notifier, timestamp, &snapshot)); + assert(snapshot); + ASSERT_LT(notifier->prev_snapshot_ts(), kMaxTxnTimestamp); + ASSERT_EQ(timestamp, snapshot->GetTimestamp()); + ASSERT_TRUE(db->Close().IsAborted()); +} + +TEST_P(TransactionTest, MultipleTimestampedSnapshots) { + auto* dbimpl = static_cast_with_check<DBImpl>(db->GetRootDB()); + assert(dbimpl); + const bool seq_per_batch = dbimpl->seq_per_batch(); + // TODO: remove the following assert(!seq_per_batch) once timestamped snapshot + // is supported in write-prepared/write-unprepared transactions. + assert(!seq_per_batch); + constexpr size_t txn_size = 10; + constexpr TxnTimestamp ts_delta = 10; + constexpr size_t num_txns = 100; + std::vector<std::shared_ptr<const Snapshot> > snapshots(num_txns); + constexpr TxnTimestamp start_ts = 10000; + auto notifier = std::make_shared<TsCheckingTxnNotifier>(); + for (size_t i = 0; i < num_txns; ++i) { + std::unique_ptr<Transaction> txn( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + ASSERT_OK(txn->SetName("txn" + std::to_string(i))); + for (size_t j = 0; j < txn_size; ++j) { + ASSERT_OK(txn->Put("k" + std::to_string(j), + "v" + std::to_string(j) + "_" + std::to_string(i))); + } + if (0 == (i % 2)) { + ASSERT_OK(txn->Prepare()); + } + ASSERT_OK(txn->CommitAndTryCreateSnapshot(notifier, start_ts + i * ts_delta, + &snapshots[i])); + assert(snapshots[i]); + ASSERT_LT(notifier->prev_snapshot_ts(), kMaxTxnTimestamp); + ASSERT_EQ(start_ts + i * ts_delta, snapshots[i]->GetTimestamp()); + } + + { + auto snapshot = db->GetTimestampedSnapshot(start_ts + 1); + ASSERT_EQ(nullptr, snapshot); + } + + constexpr TxnTimestamp max_ts = start_ts + num_txns * ts_delta; + for (size_t i = 0; i < num_txns; ++i) { + auto snapshot = db->GetTimestampedSnapshot(start_ts + i * ts_delta); + ASSERT_EQ(snapshots[i], snapshot); + + std::vector<std::shared_ptr<const Snapshot> > tmp_snapshots; + Status s = db->GetTimestampedSnapshots(max_ts, start_ts + i * ts_delta, + tmp_snapshots); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_TRUE(tmp_snapshots.empty()); + + for (size_t j = i; j < num_txns; ++j) { + std::vector<std::shared_ptr<const Snapshot> > expected_snapshots( + snapshots.begin() + i, snapshots.begin() + j); + tmp_snapshots.clear(); + s = db->GetTimestampedSnapshots(start_ts + i * ts_delta, + start_ts + j * ts_delta, tmp_snapshots); + if (i < j) { + ASSERT_OK(s); + } else { + ASSERT_TRUE(s.IsInvalidArgument()); + } + ASSERT_EQ(expected_snapshots, tmp_snapshots); + } + } + + { + std::vector<std::shared_ptr<const Snapshot> > tmp_snapshots; + const Status s = db->GetAllTimestampedSnapshots(tmp_snapshots); + ASSERT_OK(s); + ASSERT_EQ(snapshots, tmp_snapshots); + + const std::shared_ptr<const Snapshot> latest_snapshot = + db->GetLatestTimestampedSnapshot(); + ASSERT_EQ(snapshots.back(), latest_snapshot); + } + + for (size_t i = 0; i <= num_txns; ++i) { + std::vector<std::shared_ptr<const Snapshot> > snapshots1( + snapshots.begin() + i, snapshots.end()); + if (i > 0) { + auto snapshot1 = + db->GetTimestampedSnapshot(start_ts + (i - 1) * ts_delta); + 
assert(snapshot1); + ASSERT_EQ(start_ts + (i - 1) * ts_delta, snapshot1->GetTimestamp()); + } + + db->ReleaseTimestampedSnapshotsOlderThan(start_ts + i * ts_delta); + + if (i > 0) { + auto snapshot1 = + db->GetTimestampedSnapshot(start_ts + (i - 1) * ts_delta); + ASSERT_EQ(nullptr, snapshot1); + } + + std::vector<std::shared_ptr<const Snapshot> > tmp_snapshots; + const Status s = db->GetAllTimestampedSnapshots(tmp_snapshots); + ASSERT_OK(s); + ASSERT_EQ(snapshots1, tmp_snapshots); + } + + // Even after being released by the db, applications still hold references + // to the shared snapshots. + for (size_t i = 0; i < num_txns; ++i) { + assert(snapshots[i]); + ASSERT_EQ(start_ts + i * ts_delta, snapshots[i]->GetTimestamp()); + } + + snapshots.clear(); + ASSERT_OK(db->Close()); + delete db; + db = nullptr; +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/transaction_base.cc b/src/rocksdb/utilities/transactions/transaction_base.cc new file mode 100644 index 000000000..83fd94ac8 --- /dev/null +++ b/src/rocksdb/utilities/transactions/transaction_base.cc @@ -0,0 +1,731 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "utilities/transactions/transaction_base.h" + +#include <cinttypes> + +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" +#include "logging/logging.h" +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/status.h" +#include "util/cast_util.h" +#include "util/string_util.h" +#include "utilities/transactions/lock/lock_tracker.h" + +namespace ROCKSDB_NAMESPACE { + +Status Transaction::CommitAndTryCreateSnapshot( + std::shared_ptr<TransactionNotifier> notifier, TxnTimestamp ts, + std::shared_ptr<const Snapshot>* snapshot) { + if (snapshot) { + snapshot->reset(); + } + TxnTimestamp commit_ts = GetCommitTimestamp(); + if (commit_ts == kMaxTxnTimestamp) { + if (ts == kMaxTxnTimestamp) { + return Status::InvalidArgument("Commit timestamp unset"); + } else { + const Status s = SetCommitTimestamp(ts); + if (!s.ok()) { + return s; + } + } + } else if (ts != kMaxTxnTimestamp) { + if (ts != commit_ts) { + // For now we treat this as an error. + return Status::InvalidArgument("Different commit ts specified"); + } + } + SetSnapshotOnNextOperation(notifier); + Status s = Commit(); + if (!s.ok()) { + return s; + } + assert(s.ok()); + // If we reach here, we must return ok status for this function.
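+ // Caller-side sketch of this method (mirrors the tests in this directory;
+ // txn and notifier are illustrative):
+ //
+ //   std::shared_ptr<const Snapshot> snap;
+ //   Status s = txn->CommitAndTryCreateSnapshot(notifier, /*ts=*/100, &snap);
+ //   // On success, snap (when requested) is the snapshot taken at commit.
+ //   // Note that snap may legitimately be null even with an OK status, e.g.
+ //   // when the DB declines to create a snapshot for this commit.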
+ std::shared_ptr<const Snapshot> new_snapshot = GetTimestampedSnapshot(); + + if (snapshot) { + *snapshot = new_snapshot; + } + return Status::OK(); +} + +TransactionBaseImpl::TransactionBaseImpl( + DB* db, const WriteOptions& write_options, + const LockTrackerFactory& lock_tracker_factory) + : db_(db), + dbimpl_(static_cast_with_check<DBImpl>(db)), + write_options_(write_options), + cmp_(GetColumnFamilyUserComparator(db->DefaultColumnFamily())), + lock_tracker_factory_(lock_tracker_factory), + start_time_(dbimpl_->GetSystemClock()->NowMicros()), + write_batch_(cmp_, 0, true, 0, write_options.protection_bytes_per_key), + tracked_locks_(lock_tracker_factory_.Create()), + commit_time_batch_(0 /* reserved_bytes */, 0 /* max_bytes */, + write_options.protection_bytes_per_key, + 0 /* default_cf_ts_sz */), + indexing_enabled_(true) { + assert(dynamic_cast<DBImpl*>(db_) != nullptr); + log_number_ = 0; + if (dbimpl_->allow_2pc()) { + InitWriteBatch(); + } +} + +TransactionBaseImpl::~TransactionBaseImpl() { + // Release snapshot if snapshot is set + SetSnapshotInternal(nullptr); +} + +void TransactionBaseImpl::Clear() { + save_points_.reset(nullptr); + write_batch_.Clear(); + commit_time_batch_.Clear(); + tracked_locks_->Clear(); + num_puts_ = 0; + num_deletes_ = 0; + num_merges_ = 0; + + if (dbimpl_->allow_2pc()) { + InitWriteBatch(); + } +} + +void TransactionBaseImpl::Reinitialize(DB* db, + const WriteOptions& write_options) { + Clear(); + ClearSnapshot(); + id_ = 0; + db_ = db; + name_.clear(); + log_number_ = 0; + write_options_ = write_options; + start_time_ = dbimpl_->GetSystemClock()->NowMicros(); + indexing_enabled_ = true; + cmp_ = GetColumnFamilyUserComparator(db_->DefaultColumnFamily()); + WriteBatchInternal::UpdateProtectionInfo( + write_batch_.GetWriteBatch(), write_options_.protection_bytes_per_key) + .PermitUncheckedError(); + WriteBatchInternal::UpdateProtectionInfo( + &commit_time_batch_, write_options_.protection_bytes_per_key) + .PermitUncheckedError(); +} + +void TransactionBaseImpl::SetSnapshot() { + const Snapshot* snapshot = dbimpl_->GetSnapshotForWriteConflictBoundary(); + SetSnapshotInternal(snapshot); +} + +void TransactionBaseImpl::SetSnapshotInternal(const Snapshot* snapshot) { + // Set a custom deleter for the snapshot_ SharedPtr as the snapshot needs to + // be released, not deleted when it is no longer referenced. 
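+ // Generic sketch of the shared_ptr custom-deleter technique used here: the
+ // last owner hands the snapshot back to the DB instead of deleting it.
+ //
+ //   std::shared_ptr<const Snapshot> sp(
+ //       raw_snapshot, [db](const Snapshot* s) { db->ReleaseSnapshot(s); });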
+ snapshot_.reset(snapshot, std::bind(&TransactionBaseImpl::ReleaseSnapshot, + this, std::placeholders::_1, db_)); + snapshot_needed_ = false; + snapshot_notifier_ = nullptr; +} + +void TransactionBaseImpl::SetSnapshotOnNextOperation( + std::shared_ptr<TransactionNotifier> notifier) { + snapshot_needed_ = true; + snapshot_notifier_ = notifier; +} + +void TransactionBaseImpl::SetSnapshotIfNeeded() { + if (snapshot_needed_) { + std::shared_ptr<TransactionNotifier> notifier = snapshot_notifier_; + SetSnapshot(); + if (notifier != nullptr) { + notifier->SnapshotCreated(GetSnapshot()); + } + } +} + +Status TransactionBaseImpl::TryLock(ColumnFamilyHandle* column_family, + const SliceParts& key, bool read_only, + bool exclusive, const bool do_validate, + const bool assume_tracked) { + size_t key_size = 0; + for (int i = 0; i < key.num_parts; ++i) { + key_size += key.parts[i].size(); + } + + std::string str; + str.reserve(key_size); + + for (int i = 0; i < key.num_parts; ++i) { + str.append(key.parts[i].data(), key.parts[i].size()); + } + + return TryLock(column_family, str, read_only, exclusive, do_validate, + assume_tracked); +} + +void TransactionBaseImpl::SetSavePoint() { + if (save_points_ == nullptr) { + save_points_.reset( + new std::stack<TransactionBaseImpl::SavePoint, + autovector<TransactionBaseImpl::SavePoint>>()); + } + save_points_->emplace(snapshot_, snapshot_needed_, snapshot_notifier_, + num_puts_, num_deletes_, num_merges_, + lock_tracker_factory_); + write_batch_.SetSavePoint(); +} + +Status TransactionBaseImpl::RollbackToSavePoint() { + if (save_points_ != nullptr && save_points_->size() > 0) { + // Restore saved SavePoint + TransactionBaseImpl::SavePoint& save_point = save_points_->top(); + snapshot_ = save_point.snapshot_; + snapshot_needed_ = save_point.snapshot_needed_; + snapshot_notifier_ = save_point.snapshot_notifier_; + num_puts_ = save_point.num_puts_; + num_deletes_ = save_point.num_deletes_; + num_merges_ = save_point.num_merges_; + + // Rollback batch + Status s = write_batch_.RollbackToSavePoint(); + assert(s.ok()); + + // Rollback any keys that were tracked since the last savepoint + tracked_locks_->Subtract(*save_point.new_locks_); + + save_points_->pop(); + + return s; + } else { + assert(write_batch_.RollbackToSavePoint().IsNotFound()); + return Status::NotFound(); + } +} + +Status TransactionBaseImpl::PopSavePoint() { + if (save_points_ == nullptr || save_points_->empty()) { + // No SavePoint yet. + assert(write_batch_.PopSavePoint().IsNotFound()); + return Status::NotFound(); + } + + assert(!save_points_->empty()); + // If there is another savepoint A below the current savepoint B, then A needs + // to inherit tracked_keys in B so that if we rollback to savepoint A, we + // remember to unlock keys in B. If there is no other savepoint below, then we + // can safely discard savepoint info. 
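+ // Sketch of the scenario that requires the merge below:
+ //
+ //   txn->SetSavePoint();          // savepoint A
+ //   txn->Put("k1", "v1");         // lock on "k1" recorded under A
+ //   txn->SetSavePoint();          // savepoint B
+ //   txn->Put("k2", "v2");         // lock on "k2" recorded under B
+ //   txn->PopSavePoint();          // B's new locks must fold into A...
+ //   txn->RollbackToSavePoint();   // ...so rolling back to A unlocks "k2"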
+ if (save_points_->size() == 1) { + save_points_->pop(); + } else { + TransactionBaseImpl::SavePoint top(lock_tracker_factory_); + std::swap(top, save_points_->top()); + save_points_->pop(); + + save_points_->top().new_locks_->Merge(*top.new_locks_); + } + + return write_batch_.PopSavePoint(); +} + +Status TransactionBaseImpl::Get(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, std::string* value) { + assert(value != nullptr); + PinnableSlice pinnable_val(value); + assert(!pinnable_val.IsPinned()); + auto s = Get(read_options, column_family, key, &pinnable_val); + if (s.ok() && pinnable_val.IsPinned()) { + value->assign(pinnable_val.data(), pinnable_val.size()); + } // else value is already assigned + return s; +} + +Status TransactionBaseImpl::Get(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* pinnable_val) { + return write_batch_.GetFromBatchAndDB(db_, read_options, column_family, key, + pinnable_val); +} + +Status TransactionBaseImpl::GetForUpdate(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, std::string* value, + bool exclusive, + const bool do_validate) { + if (!do_validate && read_options.snapshot != nullptr) { + return Status::InvalidArgument( + "If do_validate is false then GetForUpdate with snapshot is not " + "defined."); + } + Status s = + TryLock(column_family, key, true /* read_only */, exclusive, do_validate); + + if (s.ok() && value != nullptr) { + assert(value != nullptr); + PinnableSlice pinnable_val(value); + assert(!pinnable_val.IsPinned()); + s = Get(read_options, column_family, key, &pinnable_val); + if (s.ok() && pinnable_val.IsPinned()) { + value->assign(pinnable_val.data(), pinnable_val.size()); + } // else value is already assigned + } + return s; +} + +Status TransactionBaseImpl::GetForUpdate(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, + PinnableSlice* pinnable_val, + bool exclusive, + const bool do_validate) { + if (!do_validate && read_options.snapshot != nullptr) { + return Status::InvalidArgument( + "If do_validate is false then GetForUpdate with snapshot is not " + "defined."); + } + Status s = + TryLock(column_family, key, true /* read_only */, exclusive, do_validate); + + if (s.ok() && pinnable_val != nullptr) { + s = Get(read_options, column_family, key, pinnable_val); + } + return s; +} + +std::vector<Status> TransactionBaseImpl::MultiGet( + const ReadOptions& read_options, + const std::vector<ColumnFamilyHandle*>& column_family, + const std::vector<Slice>& keys, std::vector<std::string>* values) { + size_t num_keys = keys.size(); + values->resize(num_keys); + + std::vector<Status> stat_list(num_keys); + for (size_t i = 0; i < num_keys; ++i) { + stat_list[i] = Get(read_options, column_family[i], keys[i], &(*values)[i]); + } + + return stat_list; +} + +void TransactionBaseImpl::MultiGet(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool sorted_input) { + write_batch_.MultiGetFromBatchAndDB(db_, read_options, column_family, + num_keys, keys, values, statuses, + sorted_input); +} + +std::vector<Status> TransactionBaseImpl::MultiGetForUpdate( + const ReadOptions& read_options, + const std::vector<ColumnFamilyHandle*>& column_family, + const std::vector<Slice>& keys, std::vector<std::string>* values) { + // Regardless of whether the MultiGet succeeded, 
track these keys. + size_t num_keys = keys.size(); + values->resize(num_keys); + + // Lock all keys + for (size_t i = 0; i < num_keys; ++i) { + Status s = TryLock(column_family[i], keys[i], true /* read_only */, + true /* exclusive */); + if (!s.ok()) { + // Fail entire multiget if we cannot lock all keys + return std::vector<Status>(num_keys, s); + } + } + + // TODO(agiardullo): optimize multiget? + std::vector<Status> stat_list(num_keys); + for (size_t i = 0; i < num_keys; ++i) { + stat_list[i] = Get(read_options, column_family[i], keys[i], &(*values)[i]); + } + + return stat_list; +} + +Iterator* TransactionBaseImpl::GetIterator(const ReadOptions& read_options) { + Iterator* db_iter = db_->NewIterator(read_options); + assert(db_iter); + + return write_batch_.NewIteratorWithBase(db_->DefaultColumnFamily(), db_iter, + &read_options); +} + +Iterator* TransactionBaseImpl::GetIterator(const ReadOptions& read_options, + ColumnFamilyHandle* column_family) { + Iterator* db_iter = db_->NewIterator(read_options, column_family); + assert(db_iter); + + return write_batch_.NewIteratorWithBase(column_family, db_iter, + &read_options); +} + +Status TransactionBaseImpl::Put(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, do_validate, assume_tracked); + + if (s.ok()) { + s = GetBatchForWrite()->Put(column_family, key, value); + if (s.ok()) { + num_puts_++; + } + } + + return s; +} + +Status TransactionBaseImpl::Put(ColumnFamilyHandle* column_family, + const SliceParts& key, const SliceParts& value, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, do_validate, assume_tracked); + + if (s.ok()) { + s = GetBatchForWrite()->Put(column_family, key, value); + if (s.ok()) { + num_puts_++; + } + } + + return s; +} + +Status TransactionBaseImpl::Merge(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, do_validate, assume_tracked); + + if (s.ok()) { + s = GetBatchForWrite()->Merge(column_family, key, value); + if (s.ok()) { + num_merges_++; + } + } + + return s; +} + +Status TransactionBaseImpl::Delete(ColumnFamilyHandle* column_family, + const Slice& key, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, do_validate, assume_tracked); + + if (s.ok()) { + s = GetBatchForWrite()->Delete(column_family, key); + if (s.ok()) { + num_deletes_++; + } + } + + return s; +} + +Status TransactionBaseImpl::Delete(ColumnFamilyHandle* column_family, + const SliceParts& key, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, do_validate, assume_tracked); + + if (s.ok()) { + s = GetBatchForWrite()->Delete(column_family, key); + if (s.ok()) { + num_deletes_++; + } + } + + return s; +} + +Status TransactionBaseImpl::SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + Status s = TryLock(column_family, key, false /* read_only */, + true /* 
exclusive */, do_validate, assume_tracked); + + if (s.ok()) { + s = GetBatchForWrite()->SingleDelete(column_family, key); + if (s.ok()) { + num_deletes_++; + } + } + + return s; +} + +Status TransactionBaseImpl::SingleDelete(ColumnFamilyHandle* column_family, + const SliceParts& key, + const bool assume_tracked) { + const bool do_validate = !assume_tracked; + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, do_validate, assume_tracked); + + if (s.ok()) { + s = GetBatchForWrite()->SingleDelete(column_family, key); + if (s.ok()) { + num_deletes_++; + } + } + + return s; +} + +Status TransactionBaseImpl::PutUntracked(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) { + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, false /* do_validate */); + + if (s.ok()) { + s = GetBatchForWrite()->Put(column_family, key, value); + if (s.ok()) { + num_puts_++; + } + } + + return s; +} + +Status TransactionBaseImpl::PutUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key, + const SliceParts& value) { + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, false /* do_validate */); + + if (s.ok()) { + s = GetBatchForWrite()->Put(column_family, key, value); + if (s.ok()) { + num_puts_++; + } + } + + return s; +} + +Status TransactionBaseImpl::MergeUntracked(ColumnFamilyHandle* column_family, + const Slice& key, + const Slice& value) { + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, false /* do_validate */); + + if (s.ok()) { + s = GetBatchForWrite()->Merge(column_family, key, value); + if (s.ok()) { + num_merges_++; + } + } + + return s; +} + +Status TransactionBaseImpl::DeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) { + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, false /* do_validate */); + + if (s.ok()) { + s = GetBatchForWrite()->Delete(column_family, key); + if (s.ok()) { + num_deletes_++; + } + } + + return s; +} + +Status TransactionBaseImpl::DeleteUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key) { + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, false /* do_validate */); + + if (s.ok()) { + s = GetBatchForWrite()->Delete(column_family, key); + if (s.ok()) { + num_deletes_++; + } + } + + return s; +} + +Status TransactionBaseImpl::SingleDeleteUntracked( + ColumnFamilyHandle* column_family, const Slice& key) { + Status s = TryLock(column_family, key, false /* read_only */, + true /* exclusive */, false /* do_validate */); + + if (s.ok()) { + s = GetBatchForWrite()->SingleDelete(column_family, key); + if (s.ok()) { + num_deletes_++; + } + } + + return s; +} + +void TransactionBaseImpl::PutLogData(const Slice& blob) { + auto s = write_batch_.PutLogData(blob); + (void)s; + assert(s.ok()); +} + +WriteBatchWithIndex* TransactionBaseImpl::GetWriteBatch() { + return &write_batch_; +} + +uint64_t TransactionBaseImpl::GetElapsedTime() const { + return (dbimpl_->GetSystemClock()->NowMicros() - start_time_) / 1000; +} + +uint64_t TransactionBaseImpl::GetNumPuts() const { return num_puts_; } + +uint64_t TransactionBaseImpl::GetNumDeletes() const { return num_deletes_; } + +uint64_t TransactionBaseImpl::GetNumMerges() const { return num_merges_; } + +uint64_t TransactionBaseImpl::GetNumKeys() const { + return tracked_locks_->GetNumPointLocks(); +} + +void TransactionBaseImpl::TrackKey(uint32_t cfh_id, const 
std::string& key, + SequenceNumber seq, bool read_only, + bool exclusive) { + PointLockRequest r; + r.column_family_id = cfh_id; + r.key = key; + r.seq = seq; + r.read_only = read_only; + r.exclusive = exclusive; + + // Update map of all tracked keys for this transaction + tracked_locks_->Track(r); + + if (save_points_ != nullptr && !save_points_->empty()) { + // Update map of tracked keys in this SavePoint + save_points_->top().new_locks_->Track(r); + } +} + +// Gets the write batch that should be used for Put/Merge/Deletes. +// +// Returns either a WriteBatch or WriteBatchWithIndex depending on whether +// DisableIndexing() has been called. +WriteBatchBase* TransactionBaseImpl::GetBatchForWrite() { + if (indexing_enabled_) { + // Use WriteBatchWithIndex + return &write_batch_; + } else { + // Don't use WriteBatchWithIndex. Return base WriteBatch. + return write_batch_.GetWriteBatch(); + } +} + +void TransactionBaseImpl::ReleaseSnapshot(const Snapshot* snapshot, DB* db) { + if (snapshot != nullptr) { + ROCKS_LOG_DETAILS(dbimpl_->immutable_db_options().info_log, + "ReleaseSnapshot %" PRIu64 " Set", + snapshot->GetSequenceNumber()); + db->ReleaseSnapshot(snapshot); + } +} + +void TransactionBaseImpl::UndoGetForUpdate(ColumnFamilyHandle* column_family, + const Slice& key) { + PointLockRequest r; + r.column_family_id = GetColumnFamilyID(column_family); + r.key = key.ToString(); + r.read_only = true; + + bool can_untrack = false; + if (save_points_ != nullptr && !save_points_->empty()) { + // If there is no GetForUpdate of the key in this save point, + // then cannot untrack from the global lock tracker. + UntrackStatus s = save_points_->top().new_locks_->Untrack(r); + can_untrack = (s != UntrackStatus::NOT_TRACKED); + } else { + // No save point, so can untrack from the global lock tracker. + can_untrack = true; + } + + if (can_untrack) { + // If erased from the global tracker, then can unlock the key. + UntrackStatus s = tracked_locks_->Untrack(r); + bool can_unlock = (s == UntrackStatus::REMOVED); + if (can_unlock) { + UnlockGetForUpdate(column_family, key); + } + } +} + +Status TransactionBaseImpl::RebuildFromWriteBatch(WriteBatch* src_batch) { + struct IndexedWriteBatchBuilder : public WriteBatch::Handler { + Transaction* txn_; + DBImpl* db_; + IndexedWriteBatchBuilder(Transaction* txn, DBImpl* db) + : txn_(txn), db_(db) { + assert(dynamic_cast<TransactionBaseImpl*>(txn_) != nullptr); + } + + Status PutCF(uint32_t cf, const Slice& key, const Slice& val) override { + return txn_->Put(db_->GetColumnFamilyHandle(cf), key, val); + } + + Status DeleteCF(uint32_t cf, const Slice& key) override { + return txn_->Delete(db_->GetColumnFamilyHandle(cf), key); + } + + Status SingleDeleteCF(uint32_t cf, const Slice& key) override { + return txn_->SingleDelete(db_->GetColumnFamilyHandle(cf), key); + } + + Status MergeCF(uint32_t cf, const Slice& key, const Slice& val) override { + return txn_->Merge(db_->GetColumnFamilyHandle(cf), key, val); + } + + // this is used for reconstructing prepared transactions upon + // recovery. there should not be any meta markers in the batches + // we are processing. 
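+ // Intended call pattern (a sketch; the batch name is illustrative):
+ //
+ //   WriteBatch recovered;  // parsed back from the WAL during recovery
+ //   Status s = txn->RebuildFromWriteBatch(&recovered);
+ //   // Any meta marker in the batch surfaces as InvalidArgument below.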
+ Status MarkBeginPrepare(bool) override { return Status::InvalidArgument(); } + + Status MarkEndPrepare(const Slice&) override { + return Status::InvalidArgument(); + } + + Status MarkCommit(const Slice&) override { + return Status::InvalidArgument(); + } + + Status MarkCommitWithTimestamp(const Slice&, const Slice&) override { + return Status::InvalidArgument(); + } + + Status MarkRollback(const Slice&) override { + return Status::InvalidArgument(); + } + }; + + IndexedWriteBatchBuilder copycat(this, dbimpl_); + return src_batch->Iterate(&copycat); +} + +WriteBatch* TransactionBaseImpl::GetCommitTimeWriteBatch() { + return &commit_time_batch_; +} +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/transaction_base.h b/src/rocksdb/utilities/transactions/transaction_base.h new file mode 100644 index 000000000..1bcb20ca9 --- /dev/null +++ b/src/rocksdb/utilities/transactions/transaction_base.h @@ -0,0 +1,384 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include <stack> +#include <string> +#include <vector> + +#include "db/write_batch_internal.h" +#include "rocksdb/db.h" +#include "rocksdb/slice.h" +#include "rocksdb/snapshot.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/transaction_db.h" +#include "rocksdb/utilities/write_batch_with_index.h" +#include "util/autovector.h" +#include "utilities/transactions/lock/lock_tracker.h" +#include "utilities/transactions/transaction_util.h" + +namespace ROCKSDB_NAMESPACE { + +class TransactionBaseImpl : public Transaction { + public: + TransactionBaseImpl(DB* db, const WriteOptions& write_options, + const LockTrackerFactory& lock_tracker_factory); + + ~TransactionBaseImpl() override; + + // Remove pending operations queued in this transaction. + virtual void Clear(); + + void Reinitialize(DB* db, const WriteOptions& write_options); + + // Called before executing Put, Merge, Delete, and GetForUpdate. If TryLock + // returns non-OK, the Put/Merge/Delete/GetForUpdate will fail.
+ // do_validate will be false if called from PutUntracked, DeleteUntracked, + // MergeUntracked, or GetForUpdate(do_validate=false) + virtual Status TryLock(ColumnFamilyHandle* column_family, const Slice& key, + bool read_only, bool exclusive, + const bool do_validate = true, + const bool assume_tracked = false) = 0; + + void SetSavePoint() override; + + Status RollbackToSavePoint() override; + + Status PopSavePoint() override; + + using Transaction::Get; + Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, std::string* value) override; + + Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value) override; + + Status Get(const ReadOptions& options, const Slice& key, + std::string* value) override { + return Get(options, db_->DefaultColumnFamily(), key, value); + } + + using Transaction::GetForUpdate; + Status GetForUpdate(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, bool exclusive, + const bool do_validate) override; + + Status GetForUpdate(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* pinnable_val, bool exclusive, + const bool do_validate) override; + + Status GetForUpdate(const ReadOptions& options, const Slice& key, + std::string* value, bool exclusive, + const bool do_validate) override { + return GetForUpdate(options, db_->DefaultColumnFamily(), key, value, + exclusive, do_validate); + } + + using Transaction::MultiGet; + std::vector<Status> MultiGet( + const ReadOptions& options, + const std::vector<ColumnFamilyHandle*>& column_family, + const std::vector<Slice>& keys, + std::vector<std::string>* values) override; + + std::vector<Status> MultiGet(const ReadOptions& options, + const std::vector<Slice>& keys, + std::vector<std::string>* values) override { + return MultiGet(options, + std::vector<ColumnFamilyHandle*>( + keys.size(), db_->DefaultColumnFamily()), + keys, values); + } + + void MultiGet(const ReadOptions& options, ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, PinnableSlice* values, + Status* statuses, const bool sorted_input = false) override; + + using Transaction::MultiGetForUpdate; + std::vector<Status> MultiGetForUpdate( + const ReadOptions& options, + const std::vector<ColumnFamilyHandle*>& column_family, + const std::vector<Slice>& keys, + std::vector<std::string>* values) override; + + std::vector<Status> MultiGetForUpdate( + const ReadOptions& options, const std::vector<Slice>& keys, + std::vector<std::string>* values) override { + return MultiGetForUpdate(options, + std::vector<ColumnFamilyHandle*>( + keys.size(), db_->DefaultColumnFamily()), + keys, values); + } + + Iterator* GetIterator(const ReadOptions& read_options) override; + Iterator* GetIterator(const ReadOptions& read_options, + ColumnFamilyHandle* column_family) override; + + Status Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, const bool assume_tracked = false) override; + Status Put(const Slice& key, const Slice& value) override { + return Put(nullptr, key, value); + } + + Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value, + const bool assume_tracked = false) override; + Status Put(const SliceParts& key, const SliceParts& value) override { + return Put(nullptr, key, value); + } + + Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, const bool 
assume_tracked = false) override; + Status Merge(const Slice& key, const Slice& value) override { + return Merge(nullptr, key, value); + } + + Status Delete(ColumnFamilyHandle* column_family, const Slice& key, + const bool assume_tracked = false) override; + Status Delete(const Slice& key) override { return Delete(nullptr, key); } + Status Delete(ColumnFamilyHandle* column_family, const SliceParts& key, + const bool assume_tracked = false) override; + Status Delete(const SliceParts& key) override { return Delete(nullptr, key); } + + Status SingleDelete(ColumnFamilyHandle* column_family, const Slice& key, + const bool assume_tracked = false) override; + Status SingleDelete(const Slice& key) override { + return SingleDelete(nullptr, key); + } + Status SingleDelete(ColumnFamilyHandle* column_family, const SliceParts& key, + const bool assume_tracked = false) override; + Status SingleDelete(const SliceParts& key) override { + return SingleDelete(nullptr, key); + } + + Status PutUntracked(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + Status PutUntracked(const Slice& key, const Slice& value) override { + return PutUntracked(nullptr, key, value); + } + + Status PutUntracked(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value) override; + Status PutUntracked(const SliceParts& key, const SliceParts& value) override { + return PutUntracked(nullptr, key, value); + } + + Status MergeUntracked(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + Status MergeUntracked(const Slice& key, const Slice& value) override { + return MergeUntracked(nullptr, key, value); + } + + Status DeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) override; + Status DeleteUntracked(const Slice& key) override { + return DeleteUntracked(nullptr, key); + } + Status DeleteUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key) override; + Status DeleteUntracked(const SliceParts& key) override { + return DeleteUntracked(nullptr, key); + } + + Status SingleDeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) override; + Status SingleDeleteUntracked(const Slice& key) override { + return SingleDeleteUntracked(nullptr, key); + } + + void PutLogData(const Slice& blob) override; + + WriteBatchWithIndex* GetWriteBatch() override; + + virtual void SetLockTimeout(int64_t /*timeout*/) override { /* Do nothing */ + } + + const Snapshot* GetSnapshot() const override { + // will return nullptr when there is no snapshot + return snapshot_.get(); + } + + std::shared_ptr<const Snapshot> GetTimestampedSnapshot() const override { + return snapshot_; + } + + virtual void SetSnapshot() override; + void SetSnapshotOnNextOperation( + std::shared_ptr<TransactionNotifier> notifier = nullptr) override; + + void ClearSnapshot() override { + snapshot_.reset(); + snapshot_needed_ = false; + snapshot_notifier_ = nullptr; + } + + void DisableIndexing() override { indexing_enabled_ = false; } + + void EnableIndexing() override { indexing_enabled_ = true; } + + bool IndexingEnabled() const { return indexing_enabled_; } + + uint64_t GetElapsedTime() const override; + + uint64_t GetNumPuts() const override; + + uint64_t GetNumDeletes() const override; + + uint64_t GetNumMerges() const override; + + uint64_t GetNumKeys() const override; + + void UndoGetForUpdate(ColumnFamilyHandle* column_family, + const Slice& key) override; + void UndoGetForUpdate(const Slice& key) override { + return 
UndoGetForUpdate(nullptr, key); } + + WriteOptions* GetWriteOptions() override { return &write_options_; } + + void SetWriteOptions(const WriteOptions& write_options) override { + write_options_ = write_options; + } + + // Used for memory management for snapshot_ + void ReleaseSnapshot(const Snapshot* snapshot, DB* db); + + // iterates over the given batch and makes the appropriate inserts. + // used for rebuilding prepared transactions after recovery. + virtual Status RebuildFromWriteBatch(WriteBatch* src_batch) override; + + WriteBatch* GetCommitTimeWriteBatch() override; + + LockTracker& GetTrackedLocks() { return *tracked_locks_; } + + protected: + // Add a key to the list of tracked keys. + // + // seqno is the earliest seqno this key was involved with in this transaction. + // readonly should be set to true if no data was written for this key. + void TrackKey(uint32_t cfh_id, const std::string& key, SequenceNumber seqno, + bool readonly, bool exclusive); + + // Called when UndoGetForUpdate determines that this key can be unlocked. + virtual void UnlockGetForUpdate(ColumnFamilyHandle* column_family, + const Slice& key) = 0; + + // Sets a snapshot if SetSnapshotOnNextOperation() has been called. + void SetSnapshotIfNeeded(); + + // Initialize write_batch_ for 2PC by inserting Noop. + inline void InitWriteBatch(bool clear = false) { + if (clear) { + write_batch_.Clear(); + } + assert(write_batch_.GetDataSize() == WriteBatchInternal::kHeader); + auto s = WriteBatchInternal::InsertNoop(write_batch_.GetWriteBatch()); + assert(s.ok()); + } + + WriteBatchBase* GetBatchForWrite(); + + DB* db_; + DBImpl* dbimpl_; + + WriteOptions write_options_; + + const Comparator* cmp_; + + const LockTrackerFactory& lock_tracker_factory_; + + // Stores the time the txn was constructed, in microseconds. + uint64_t start_time_; + + // Stores the current snapshot that was set by SetSnapshot or null if + // no snapshot is currently set. + std::shared_ptr<const Snapshot> snapshot_; + + // Count of various operations pending in this transaction + uint64_t num_puts_ = 0; + uint64_t num_deletes_ = 0; + uint64_t num_merges_ = 0; + + struct SavePoint { + std::shared_ptr<const Snapshot> snapshot_; + bool snapshot_needed_ = false; + std::shared_ptr<TransactionNotifier> snapshot_notifier_; + uint64_t num_puts_ = 0; + uint64_t num_deletes_ = 0; + uint64_t num_merges_ = 0; + + // Record all locks tracked since the last savepoint + std::shared_ptr<LockTracker> new_locks_; + + SavePoint(std::shared_ptr<const Snapshot> snapshot, bool snapshot_needed, + std::shared_ptr<TransactionNotifier> snapshot_notifier, + uint64_t num_puts, uint64_t num_deletes, uint64_t num_merges, + const LockTrackerFactory& lock_tracker_factory) + : snapshot_(snapshot), + snapshot_needed_(snapshot_needed), + snapshot_notifier_(snapshot_notifier), + num_puts_(num_puts), + num_deletes_(num_deletes), + num_merges_(num_merges), + new_locks_(lock_tracker_factory.Create()) {} + + explicit SavePoint(const LockTrackerFactory& lock_tracker_factory) + : new_locks_(lock_tracker_factory.Create()) {} + }; + + // Records writes pending in this transaction + WriteBatchWithIndex write_batch_; + + // For Pessimistic Transactions this is the set of acquired locks. + // Optimistic Transactions will keep note of the requested locks (not actually + // locked), and do conflict checking at commit time based on the tracked + // lock requests. + std::unique_ptr<LockTracker> tracked_locks_; + + // Stack of the Snapshot saved at each save point.
Saved snapshots may be + // nullptr if there was no snapshot at the time SetSavePoint() was called. + std::unique_ptr<std::stack<TransactionBaseImpl::SavePoint, + autovector<TransactionBaseImpl::SavePoint>>> + save_points_; + + private: + friend class WriteCommittedTxn; + friend class WritePreparedTxn; + + // Extra data to be persisted with the commit. Note this is only used when + // prepare phase is not skipped. + WriteBatch commit_time_batch_; + + // If true, future Put/Merge/Deletes will be indexed in the + // WriteBatchWithIndex. + // If false, future Put/Merge/Deletes will be inserted directly into the + // underlying WriteBatch and not indexed in the WriteBatchWithIndex. + bool indexing_enabled_; + + // SetSnapshotOnNextOperation() has been called and the snapshot has not yet + // been reset. + bool snapshot_needed_ = false; + + // SetSnapshotOnNextOperation() has been called and the caller would like + // a notification through the TransactionNotifier interface + std::shared_ptr<TransactionNotifier> snapshot_notifier_ = nullptr; + + Status TryLock(ColumnFamilyHandle* column_family, const SliceParts& key, + bool read_only, bool exclusive, const bool do_validate = true, + const bool assume_tracked = false); + + void SetSnapshotInternal(const Snapshot* snapshot); +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/transaction_db_mutex_impl.cc b/src/rocksdb/utilities/transactions/transaction_db_mutex_impl.cc new file mode 100644 index 000000000..345c4be90 --- /dev/null +++ b/src/rocksdb/utilities/transactions/transaction_db_mutex_impl.cc @@ -0,0 +1,135 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
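The mutex and condition-variable classes in this file are only RocksDB's defaults; as the companion header notes, a user can swap in their own implementation through TransactionDBOptions.custom_mutex_factory. A minimal sketch of wiring in a replacement, where MyInstrumentedMutexFactory is a hypothetical TransactionDBMutexFactory subclass used purely for illustration:

  // Hypothetical factory, e.g. one that records lock-wait statistics; it
  // only has to implement AllocateMutex() and AllocateCondVar().
  std::shared_ptr<TransactionDBMutexFactory> factory =
      std::make_shared<MyInstrumentedMutexFactory>();

  Options options;
  options.create_if_missing = true;
  TransactionDBOptions txn_db_options;
  txn_db_options.custom_mutex_factory = factory;  // when left null, the defaults below are used

  TransactionDB* txn_db = nullptr;
  Status s = TransactionDB::Open(options, txn_db_options, "/tmp/txn_db", &txn_db);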
+ +#ifndef ROCKSDB_LITE + +#include "utilities/transactions/transaction_db_mutex_impl.h" + +#include <chrono> +#include <condition_variable> +#include <functional> +#include <mutex> + +#include "rocksdb/utilities/transaction_db_mutex.h" + +namespace ROCKSDB_NAMESPACE { + +class TransactionDBMutexImpl : public TransactionDBMutex { + public: + TransactionDBMutexImpl() {} + ~TransactionDBMutexImpl() override {} + + Status Lock() override; + + Status TryLockFor(int64_t timeout_time) override; + + void UnLock() override { mutex_.unlock(); } + + friend class TransactionDBCondVarImpl; + + private: + std::mutex mutex_; +}; + +class TransactionDBCondVarImpl : public TransactionDBCondVar { + public: + TransactionDBCondVarImpl() {} + ~TransactionDBCondVarImpl() override {} + + Status Wait(std::shared_ptr<TransactionDBMutex> mutex) override; + + Status WaitFor(std::shared_ptr<TransactionDBMutex> mutex, + int64_t timeout_time) override; + + void Notify() override { cv_.notify_one(); } + + void NotifyAll() override { cv_.notify_all(); } + + private: + std::condition_variable cv_; +}; + +std::shared_ptr<TransactionDBMutex> +TransactionDBMutexFactoryImpl::AllocateMutex() { + return std::shared_ptr<TransactionDBMutex>(new TransactionDBMutexImpl()); +} + +std::shared_ptr<TransactionDBCondVar> +TransactionDBMutexFactoryImpl::AllocateCondVar() { + return std::shared_ptr<TransactionDBCondVar>(new TransactionDBCondVarImpl()); +} + +Status TransactionDBMutexImpl::Lock() { + mutex_.lock(); + return Status::OK(); +} + +Status TransactionDBMutexImpl::TryLockFor(int64_t timeout_time) { + bool locked = true; + + if (timeout_time == 0) { + locked = mutex_.try_lock(); + } else { + // Previously, this code used a std::timed_mutex. However, this was changed + // due to known bugs in gcc versions < 4.9. + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54562 + // + // Since this mutex isn't held for long and only a single mutex is ever + // held at a time, it is reasonable to ignore the lock timeout_time here + // and only check it when waiting on the condition_variable. + mutex_.lock(); + } + + if (!locked) { + // timeout acquiring mutex + return Status::TimedOut(Status::SubCode::kMutexTimeout); + } + + return Status::OK(); +} + +Status TransactionDBCondVarImpl::Wait( + std::shared_ptr<TransactionDBMutex> mutex) { + auto mutex_impl = reinterpret_cast<TransactionDBMutexImpl*>(mutex.get()); + + std::unique_lock<std::mutex> lock(mutex_impl->mutex_, std::adopt_lock); + cv_.wait(lock); + + // Make sure unique_lock doesn't unlock mutex when it destructs + lock.release(); + + return Status::OK(); +} + +Status TransactionDBCondVarImpl::WaitFor( + std::shared_ptr<TransactionDBMutex> mutex, int64_t timeout_time) { + Status s; + + auto mutex_impl = reinterpret_cast<TransactionDBMutexImpl*>(mutex.get()); + std::unique_lock<std::mutex> lock(mutex_impl->mutex_, std::adopt_lock); + + if (timeout_time < 0) { + // If timeout is negative, do not use a timeout + cv_.wait(lock); + } else { + auto duration = std::chrono::microseconds(timeout_time); + auto cv_status = cv_.wait_for(lock, duration); + + // Check if the wait stopped due to timing out. 
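+ // Note: cv_status::no_timeout covers both a genuine Notify()/NotifyAll()
+ // and a spurious wakeup; both fall through and return OK, so callers must
+ // re-check their predicate after WaitFor() returns.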
+ if (cv_status == std::cv_status::timeout) { + s = Status::TimedOut(Status::SubCode::kMutexTimeout); + } + } + + // Make sure unique_lock doesn't unlock mutex when it destructs + lock.release(); + + // CV was signaled, or we spuriously woke up (but didn't time out) + return s; +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/transaction_db_mutex_impl.h b/src/rocksdb/utilities/transactions/transaction_db_mutex_impl.h new file mode 100644 index 000000000..fbee92832 --- /dev/null +++ b/src/rocksdb/utilities/transactions/transaction_db_mutex_impl.h @@ -0,0 +1,26 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include "rocksdb/utilities/transaction_db_mutex.h" + +namespace ROCKSDB_NAMESPACE { + +class TransactionDBMutex; +class TransactionDBCondVar; + +// Default implementation of TransactionDBMutexFactory. May be overridden +// by TransactionDBOptions.custom_mutex_factory. +class TransactionDBMutexFactoryImpl : public TransactionDBMutexFactory { + public: + std::shared_ptr<TransactionDBMutex> AllocateMutex() override; + std::shared_ptr<TransactionDBCondVar> AllocateCondVar() override; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/transaction_test.cc b/src/rocksdb/utilities/transactions/transaction_test.cc new file mode 100644 index 000000000..caf1566b9 --- /dev/null +++ b/src/rocksdb/utilities/transactions/transaction_test.cc @@ -0,0 +1,6550 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#ifndef ROCKSDB_LITE + +#include "utilities/transactions/transaction_test.h" + +#include <algorithm> +#include <functional> +#include <string> +#include <thread> + +#include "db/db_impl/db_impl.h" +#include "port/port.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/transaction_db.h" +#include "table/mock_table.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "test_util/transaction_test_util.h" +#include "util/random.h" +#include "util/string_util.h" +#include "utilities/fault_injection_env.h" +#include "utilities/merge_operators.h" +#include "utilities/merge_operators/string_append/stringappend.h" +#include "utilities/transactions/pessimistic_transaction_db.h" + +namespace ROCKSDB_NAMESPACE { + +INSTANTIATE_TEST_CASE_P( + DBAsBaseDB, TransactionTest, + ::testing::Values( + std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite), + std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite), + std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite), + std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite))); +INSTANTIATE_TEST_CASE_P( + DBAsBaseDB, TransactionStressTest, + ::testing::Values( + std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite), + std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite), + std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite), + std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite))); +INSTANTIATE_TEST_CASE_P( + StackableDBAsBaseDB, TransactionTest, + ::testing::Values( + std::make_tuple(true, true, WRITE_COMMITTED, kOrderedWrite), + std::make_tuple(true, true, WRITE_PREPARED, kOrderedWrite), + std::make_tuple(true, true, WRITE_UNPREPARED, kOrderedWrite))); + +// MySQLStyleTransactionTest takes far too long for valgrind to run. Only do it +// in full mode (`ROCKSDB_FULL_VALGRIND_RUN` compiler flag is set). 
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +INSTANTIATE_TEST_CASE_P( + MySQLStyleTransactionTest, MySQLStyleTransactionTest, + ::testing::Values( + std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite, false), + std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite, false), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, false), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, true), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, false), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, true), + std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite, false), + std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite, true), + std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite, false), + std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite, true), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, false), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, true))); +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +TEST_P(TransactionTest, DoubleEmptyWrite) { + WriteOptions write_options; + write_options.sync = true; + write_options.disableWAL = false; + + WriteBatch batch; + + ASSERT_OK(db->Write(write_options, &batch)); + ASSERT_OK(db->Write(write_options, &batch)); + + // Also test committing empty transactions in 2PC + TransactionOptions txn_options; + Transaction* txn0 = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn0->SetName("xid")); + ASSERT_OK(txn0->Prepare()); + ASSERT_OK(txn0->Commit()); + delete txn0; + + // Also test that it works during recovery + txn0 = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn0->SetName("xid2")); + ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0a"))); + ASSERT_OK(txn0->Prepare()); + delete txn0; + reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash(); + ASSERT_OK(ReOpenNoDelete()); + assert(db != nullptr); + txn0 = db->GetTransactionByName("xid2"); + ASSERT_OK(txn0->Commit()); + delete txn0; +} + +TEST_P(TransactionTest, SuccessTest) { + ASSERT_OK(db->ResetStats()); + + WriteOptions write_options; + ReadOptions read_options; + std::string value; + + ASSERT_OK(db->Put(write_options, Slice("foo"), Slice("bar"))); + ASSERT_OK(db->Put(write_options, Slice("foo2"), Slice("bar"))); + + Transaction* txn = db->BeginTransaction(write_options, TransactionOptions()); + ASSERT_TRUE(txn); + + ASSERT_EQ(0, txn->GetNumPuts()); + ASSERT_LE(0, txn->GetID()); + + ASSERT_OK(txn->GetForUpdate(read_options, "foo", &value)); + ASSERT_EQ(value, "bar"); + + ASSERT_OK(txn->Put(Slice("foo"), Slice("bar2"))); + + ASSERT_EQ(1, txn->GetNumPuts()); + + ASSERT_OK(txn->GetForUpdate(read_options, "foo", &value)); + ASSERT_EQ(value, "bar2"); + + ASSERT_OK(txn->Commit()); + + ASSERT_OK(db->Get(read_options, "foo", &value)); + ASSERT_EQ(value, "bar2"); + + delete txn; +} + +TEST_P(TransactionTest, SwitchMemtableDuringPrepareAndCommit_WC) { + const TxnDBWritePolicy write_policy = std::get<2>(GetParam()); + + if (write_policy != TxnDBWritePolicy::WRITE_COMMITTED) { + ROCKSDB_GTEST_BYPASS("Test applies to write-committed only"); + return; + } + + ASSERT_OK(db->Put(WriteOptions(), "key0", "value")); + + TransactionOptions txn_opts; + txn_opts.use_only_the_last_commit_time_batch_for_recovery = true; + Transaction* txn = db->BeginTransaction(WriteOptions(), txn_opts); + assert(txn); + + SyncPoint::GetInstance()->DisableProcessing(); + 
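+ // Start from a clean sync-point state before installing the callback
+ // below, which commits the transaction from inside the memtable flush.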
SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "FlushJob::WriteLevel0Table", [&](void* arg) { + // db mutex not held. + auto* mems = reinterpret_cast<autovector<MemTable*>*>(arg); + assert(mems); + ASSERT_EQ(1, mems->size()); + auto* ctwb = txn->GetCommitTimeWriteBatch(); + ASSERT_OK(ctwb->Put("gtid", "123")); + ASSERT_OK(txn->Commit()); + delete txn; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(txn->Put("key1", "value")); + ASSERT_OK(txn->SetName("txn1")); + + ASSERT_OK(txn->Prepare()); + + auto dbimpl = static_cast_with_check<DBImpl>(db->GetRootDB()); + ASSERT_OK(dbimpl->TEST_SwitchMemtable(nullptr)); + ASSERT_OK(dbimpl->TEST_FlushMemTable( + /*wait=*/false, /*allow_write_stall=*/true, /*cfh=*/nullptr)); + + ASSERT_OK(dbimpl->TEST_WaitForFlushMemTable()); + + { + std::string value; + ASSERT_OK(db->Get(ReadOptions(), "key1", &value)); + ASSERT_EQ("value", value); + } + + delete db; + db = nullptr; + Status s; + if (use_stackable_db_ == false) { + s = TransactionDB::Open(options, txn_db_options, dbname, &db); + } else { + s = OpenWithStackableDB(); + } + ASSERT_OK(s); + assert(db); + + { + std::string value; + ASSERT_OK(db->Get(ReadOptions(), "gtid", &value)); + ASSERT_EQ("123", value); + + ASSERT_OK(db->Get(ReadOptions(), "key1", &value)); + ASSERT_EQ("value", value); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +// The test clarifies the contract of do_validate and assume_tracked +// in GetForUpdate and Put/Merge/Delete +TEST_P(TransactionTest, AssumeExclusiveTracked) { + WriteOptions write_options; + ReadOptions read_options; + std::string value; + Status s; + TransactionOptions txn_options; + txn_options.lock_timeout = 1; + const bool EXCLUSIVE = true; + const bool DO_VALIDATE = true; + const bool ASSUME_LOCKED = true; + + Transaction* txn = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn); + txn->SetSnapshot(); + + // commit a value after the snapshot is taken + ASSERT_OK(db->Put(write_options, Slice("foo"), Slice("bar"))); + + // By default write should fail to the commit after our snapshot + s = txn->GetForUpdate(read_options, "foo", &value, EXCLUSIVE); + ASSERT_TRUE(s.IsBusy()); + // But the user could direct the db to skip validating the snapshot. The read + // value then should be the most recently committed + ASSERT_OK( + txn->GetForUpdate(read_options, "foo", &value, EXCLUSIVE, !DO_VALIDATE)); + ASSERT_EQ(value, "bar"); + + // Although ValidateSnapshot is skipped the key must have still got locked + s = db->Put(write_options, Slice("foo"), Slice("bar")); + ASSERT_TRUE(s.IsTimedOut()); + + // By default the write operations should fail due to the commit after the + // snapshot + s = txn->Put(Slice("foo"), Slice("bar1")); + ASSERT_TRUE(s.IsBusy()); + s = txn->Put(db->DefaultColumnFamily(), Slice("foo"), Slice("bar1"), + !ASSUME_LOCKED); + ASSERT_TRUE(s.IsBusy()); + // But the user could direct the db that it already assumes exclusive lock on + // the key due to the previous GetForUpdate call. 
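+ // With assume_tracked=true the write skips both locking and snapshot
+ // validation; this is only safe here because the earlier GetForUpdate took
+ // the exclusive lock on "foo" and that lock is still held.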
+ ASSERT_OK(txn->Put(db->DefaultColumnFamily(), Slice("foo"), Slice("bar1"), + ASSUME_LOCKED)); + ASSERT_OK(txn->Merge(db->DefaultColumnFamily(), Slice("foo"), Slice("bar2"), + ASSUME_LOCKED)); + ASSERT_OK( + txn->Delete(db->DefaultColumnFamily(), Slice("foo"), ASSUME_LOCKED)); + ASSERT_OK(txn->SingleDelete(db->DefaultColumnFamily(), Slice("foo"), + ASSUME_LOCKED)); + + ASSERT_OK(txn->Rollback()); + delete txn; +} + +// This test clarifies the contract of ValidateSnapshot +TEST_P(TransactionTest, ValidateSnapshotTest) { + for (bool with_flush : {true}) { + for (bool with_2pc : {true}) { + ASSERT_OK(ReOpen()); + WriteOptions write_options; + ReadOptions read_options; + std::string value; + + assert(db != nullptr); + Transaction* txn1 = + db->BeginTransaction(write_options, TransactionOptions()); + ASSERT_TRUE(txn1); + ASSERT_OK(txn1->Put(Slice("foo"), Slice("bar1"))); + if (with_2pc) { + ASSERT_OK(txn1->SetName("xid1")); + ASSERT_OK(txn1->Prepare()); + } + + if (with_flush) { + auto db_impl = static_cast_with_check<DBImpl>(db->GetRootDB()); + ASSERT_OK(db_impl->TEST_FlushMemTable(true)); + // Make sure the flushed memtable is not kept in memory + int max_memtable_in_history = + std::max( + options.max_write_buffer_number, + static_cast<int>(options.max_write_buffer_size_to_maintain) / + static_cast<int>(options.write_buffer_size)) + + 1; + for (int i = 0; i < max_memtable_in_history; i++) { + ASSERT_OK(db->Put(write_options, Slice("key"), Slice("value"))); + ASSERT_OK(db_impl->TEST_FlushMemTable(true)); + } + } + + Transaction* txn2 = + db->BeginTransaction(write_options, TransactionOptions()); + ASSERT_TRUE(txn2); + txn2->SetSnapshot(); + + ASSERT_OK(txn1->Commit()); + delete txn1; + + auto pes_txn2 = dynamic_cast<PessimisticTransaction*>(txn2); + // Test the simple case where the key is not tracked yet + auto trakced_seq = kMaxSequenceNumber; + auto s = pes_txn2->ValidateSnapshot(db->DefaultColumnFamily(), "foo", + &trakced_seq); + ASSERT_TRUE(s.IsBusy()); + delete txn2; + } + } +} + +TEST_P(TransactionTest, WaitingTxn) { + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + std::string value; + Status s; + + txn_options.lock_timeout = 1; + s = db->Put(write_options, Slice("foo"), Slice("bar")); + ASSERT_OK(s); + + /* create second cf */ + ColumnFamilyHandle* cfa; + ColumnFamilyOptions cf_options; + s = db->CreateColumnFamily(cf_options, "CFA", &cfa); + ASSERT_OK(s); + s = db->Put(write_options, cfa, Slice("foo"), Slice("bar")); + ASSERT_OK(s); + + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + TransactionID id1 = txn1->GetID(); + ASSERT_TRUE(txn1); + ASSERT_TRUE(txn2); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "PointLockManager::AcquireWithTimeout:WaitingTxn", [&](void* /*arg*/) { + std::string key; + uint32_t cf_id; + std::vector<TransactionID> wait = txn2->GetWaitingTxns(&cf_id, &key); + ASSERT_EQ(key, "foo"); + ASSERT_EQ(wait.size(), 1); + ASSERT_EQ(wait[0], id1); + ASSERT_EQ(cf_id, 0U); + }); + + get_perf_context()->Reset(); + // lock key in default cf + s = txn1->GetForUpdate(read_options, "foo", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar"); + ASSERT_EQ(get_perf_context()->key_lock_wait_count, 0); + + // lock key in cfa + s = txn1->GetForUpdate(read_options, cfa, "foo", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar"); + ASSERT_EQ(get_perf_context()->key_lock_wait_count, 0); + + auto lock_data = 
db->GetLockStatusData(); + // Locked keys exist in both column families. + ASSERT_EQ(lock_data.size(), 2); + + auto cf_iterator = lock_data.begin(); + + // The iterator points into an unordered_multimap, + // thus the test cannot assume any particular order. + + // Column family is 1 or 0 (cfa). + if (cf_iterator->first != 1 && cf_iterator->first != 0) { + FAIL(); + } + // The locked key is "foo" and is locked by txn1 + ASSERT_EQ(cf_iterator->second.key, "foo"); + ASSERT_EQ(cf_iterator->second.ids.size(), 1); + ASSERT_EQ(cf_iterator->second.ids[0], txn1->GetID()); + + cf_iterator++; + + // Column family is 0 (default) or 1. + if (cf_iterator->first != 1 && cf_iterator->first != 0) { + FAIL(); + } + // The locked key is "foo" and is locked by txn1 + ASSERT_EQ(cf_iterator->second.key, "foo"); + ASSERT_EQ(cf_iterator->second.ids.size(), 1); + ASSERT_EQ(cf_iterator->second.ids[0], txn1->GetID()); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + s = txn2->GetForUpdate(read_options, "foo", &value); + ASSERT_TRUE(s.IsTimedOut()); + ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key"); + ASSERT_EQ(get_perf_context()->key_lock_wait_count, 1); + ASSERT_GE(get_perf_context()->key_lock_wait_time, 0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + + delete cfa; + delete txn1; + delete txn2; +} + +TEST_P(TransactionTest, SharedLocks) { + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + Status s; + + txn_options.lock_timeout = 1; + s = db->Put(write_options, Slice("foo"), Slice("bar")); + ASSERT_OK(s); + + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + Transaction* txn3 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn1); + ASSERT_TRUE(txn2); + ASSERT_TRUE(txn3); + + // Test shared access between txns + s = txn1->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + ASSERT_OK(s); + + s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + ASSERT_OK(s); + + s = txn3->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + ASSERT_OK(s); + + auto lock_data = db->GetLockStatusData(); + ASSERT_EQ(lock_data.size(), 1); + + auto cf_iterator = lock_data.begin(); + ASSERT_EQ(cf_iterator->second.key, "foo"); + + // We compare whether the set of txns locking this key is the same. To do + // this, we need to sort both vectors so that the comparison is done + // correctly. + std::vector<TransactionID> expected_txns = {txn1->GetID(), txn2->GetID(), + txn3->GetID()}; + std::vector<TransactionID> lock_txns = cf_iterator->second.ids; + ASSERT_EQ(expected_txns, lock_txns); + ASSERT_FALSE(cf_iterator->second.exclusive); + + ASSERT_OK(txn1->Rollback()); + ASSERT_OK(txn2->Rollback()); + ASSERT_OK(txn3->Rollback()); + + // Test txn1 and txn2 sharing a lock and txn3 trying to obtain it.
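+ // txn3 now asks for the default exclusive lock, which is incompatible with
+ // the two shared holders; it keeps timing out until both of them undo
+ // their shared locks.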
+ s = txn1->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + ASSERT_OK(s); + + s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + ASSERT_OK(s); + + s = txn3->GetForUpdate(read_options, "foo", nullptr); + ASSERT_TRUE(s.IsTimedOut()); + ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key"); + + txn1->UndoGetForUpdate("foo"); + s = txn3->GetForUpdate(read_options, "foo", nullptr); + ASSERT_TRUE(s.IsTimedOut()); + ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key"); + + txn2->UndoGetForUpdate("foo"); + s = txn3->GetForUpdate(read_options, "foo", nullptr); + ASSERT_OK(s); + + ASSERT_OK(txn1->Rollback()); + ASSERT_OK(txn2->Rollback()); + ASSERT_OK(txn3->Rollback()); + + // Test txn1 and txn2 sharing a lock and txn2 trying to upgrade lock. + s = txn1->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + ASSERT_OK(s); + + s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + ASSERT_OK(s); + + s = txn2->GetForUpdate(read_options, "foo", nullptr); + ASSERT_TRUE(s.IsTimedOut()); + ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key"); + + txn1->UndoGetForUpdate("foo"); + s = txn2->GetForUpdate(read_options, "foo", nullptr); + ASSERT_OK(s); + + ASSERT_OK(txn1->Rollback()); + ASSERT_OK(txn2->Rollback()); + + // Test txn1 trying to downgrade its lock. + s = txn1->GetForUpdate(read_options, "foo", nullptr, true /* exclusive */); + ASSERT_OK(s); + + s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + ASSERT_TRUE(s.IsTimedOut()); + ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key"); + + // Should still fail after "downgrading". + s = txn1->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + ASSERT_OK(s); + + s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + ASSERT_TRUE(s.IsTimedOut()); + ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key"); + + ASSERT_OK(txn1->Rollback()); + ASSERT_OK(txn2->Rollback()); + + // Test txn1 holding an exclusive lock and txn2 trying to obtain shared + // access. + s = txn1->GetForUpdate(read_options, "foo", nullptr); + ASSERT_OK(s); + + s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + ASSERT_TRUE(s.IsTimedOut()); + ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key"); + + txn1->UndoGetForUpdate("foo"); + s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */); + ASSERT_OK(s); + + delete txn1; + delete txn2; + delete txn3; +} + +TEST_P(TransactionTest, DeadlockCycleShared) { + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + + txn_options.lock_timeout = 1000000; + txn_options.deadlock_detect = true; + + // Set up a wait for chain like this: + // + // Tn -> T(n*2) + // Tn -> T(n*2 + 1) + // + // So we have: + // T1 -> T2 -> T4 ... + // | |> T5 ... + // |> T3 -> T6 ... + // |> T7 ... + // up to T31, then T[16 - 31] -> T1. + // Note that Tn holds lock on floor(n / 2). 
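+ // Concretely: T2 holds the shared lock on key "1" (= floor(2 / 2)) and
+ // will request key "2", which T4 and T5 hold, so every internal node ends
+ // up waiting on both of its children.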
+ + std::vector<Transaction*> txns(31); + + for (uint32_t i = 0; i < 31; i++) { + txns[i] = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txns[i]); + auto s = txns[i]->GetForUpdate(read_options, std::to_string((i + 1) / 2), + nullptr, false /* exclusive */); + ASSERT_OK(s); + } + + std::atomic<uint32_t> checkpoints(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "PointLockManager::AcquireWithTimeout:WaitingTxn", + [&](void* /*arg*/) { checkpoints.fetch_add(1); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // We want the leaf transactions to block and hold everyone back. + std::vector<port::Thread> threads; + for (uint32_t i = 0; i < 15; i++) { + std::function<void()> blocking_thread = [&, i] { + auto s = txns[i]->GetForUpdate(read_options, std::to_string(i + 1), + nullptr, true /* exclusive */); + ASSERT_OK(s); + ASSERT_OK(txns[i]->Rollback()); + delete txns[i]; + }; + threads.emplace_back(blocking_thread); + } + + // Wait until all threads are waiting on each other. + while (checkpoints.load() != 15) { + /* sleep override */ + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Complete the cycle T[16 - 31] -> T1 + for (uint32_t i = 15; i < 31; i++) { + auto s = + txns[i]->GetForUpdate(read_options, "0", nullptr, true /* exclusive */); + ASSERT_TRUE(s.IsDeadlock()); + + // Calculate next buffer len, plateau at 5 when 5 records are inserted. + const uint32_t curr_dlock_buffer_len_ = + (i - 14 > kInitialMaxDeadlocks) ? kInitialMaxDeadlocks : (i - 14); + + auto dlock_buffer = db->GetDeadlockInfoBuffer(); + ASSERT_EQ(dlock_buffer.size(), curr_dlock_buffer_len_); + auto dlock_entry = dlock_buffer[0].path; + ASSERT_EQ(dlock_entry.size(), kInitialMaxDeadlocks); + int64_t pre_deadlock_time = dlock_buffer[0].deadlock_time; + int64_t cur_deadlock_time = 0; + for (auto const& dl_path_rec : dlock_buffer) { + cur_deadlock_time = dl_path_rec.deadlock_time; + ASSERT_NE(cur_deadlock_time, 0); + ASSERT_TRUE(cur_deadlock_time <= pre_deadlock_time); + pre_deadlock_time = cur_deadlock_time; + } + + int64_t curr_waiting_key = 0; + + // Offset of each txn id from the root of the shared dlock tree's txn id. + int64_t offset_root = dlock_entry[0].m_txn_id - 1; + // Offset of the final entry in the dlock path from the root's txn id. + TransactionID leaf_id = + dlock_entry[dlock_entry.size() - 1].m_txn_id - offset_root; + + for (auto it = dlock_entry.rbegin(); it != dlock_entry.rend(); ++it) { + auto dl_node = *it; + ASSERT_EQ(dl_node.m_txn_id, offset_root + leaf_id); + ASSERT_EQ(dl_node.m_cf_id, 0U); + ASSERT_EQ(dl_node.m_waiting_key, std::to_string(curr_waiting_key)); + ASSERT_EQ(dl_node.m_exclusive, true); + + if (curr_waiting_key == 0) { + curr_waiting_key = leaf_id; + } + curr_waiting_key /= 2; + leaf_id /= 2; + } + } + + // Rollback the leaf transaction. + for (uint32_t i = 15; i < 31; i++) { + ASSERT_OK(txns[i]->Rollback()); + delete txns[i]; + } + + for (auto& t : threads) { + t.join(); + } + + // Downsize the buffer and verify the 3 latest deadlocks are preserved. 
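+ // SetDeadlockInfoBufferSize() keeps only the most recent entries when
+ // shrinking, and growing the buffer afterwards does not bring evicted
+ // entries back, as the four resize checks below demonstrate.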
+ auto dlock_buffer_before_resize = db->GetDeadlockInfoBuffer(); + db->SetDeadlockInfoBufferSize(3); + auto dlock_buffer_after_resize = db->GetDeadlockInfoBuffer(); + ASSERT_EQ(dlock_buffer_after_resize.size(), 3); + + for (uint32_t i = 0; i < dlock_buffer_after_resize.size(); i++) { + for (uint32_t j = 0; j < dlock_buffer_after_resize[i].path.size(); j++) { + ASSERT_EQ(dlock_buffer_after_resize[i].path[j].m_txn_id, + dlock_buffer_before_resize[i].path[j].m_txn_id); + } + } + + // Upsize the buffer and verify the 3 latest deadlocks are preserved. + dlock_buffer_before_resize = db->GetDeadlockInfoBuffer(); + db->SetDeadlockInfoBufferSize(5); + dlock_buffer_after_resize = db->GetDeadlockInfoBuffer(); + ASSERT_EQ(dlock_buffer_after_resize.size(), 3); + + for (uint32_t i = 0; i < dlock_buffer_before_resize.size(); i++) { + for (uint32_t j = 0; j < dlock_buffer_before_resize[i].path.size(); j++) { + ASSERT_EQ(dlock_buffer_after_resize[i].path[j].m_txn_id, + dlock_buffer_before_resize[i].path[j].m_txn_id); + } + } + + // Downsize to 0 and verify the buffer is emptied. + dlock_buffer_before_resize = db->GetDeadlockInfoBuffer(); + db->SetDeadlockInfoBufferSize(0); + dlock_buffer_after_resize = db->GetDeadlockInfoBuffer(); + ASSERT_EQ(dlock_buffer_after_resize.size(), 0); + + // Upsize from 0 and verify that previously dropped entries are not restored. + dlock_buffer_before_resize = db->GetDeadlockInfoBuffer(); + db->SetDeadlockInfoBufferSize(3); + dlock_buffer_after_resize = db->GetDeadlockInfoBuffer(); + ASSERT_EQ(dlock_buffer_after_resize.size(), 0); + + // Contrived case of shared lock of cycle size 2 to verify that a shared + // lock causing a deadlock is correctly reported as "shared" in the buffer. + std::vector<Transaction*> txns_shared(2); + + // Create a cycle of size 2. + for (uint32_t i = 0; i < 2; i++) { + txns_shared[i] = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txns_shared[i]); + auto s = + txns_shared[i]->GetForUpdate(read_options, std::to_string(i), nullptr); + ASSERT_OK(s); + } + + std::atomic<uint32_t> checkpoints_shared(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "PointLockManager::AcquireWithTimeout:WaitingTxn", + [&](void* /*arg*/) { checkpoints_shared.fetch_add(1); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + std::vector<port::Thread> threads_shared; + for (uint32_t i = 0; i < 1; i++) { + std::function<void()> blocking_thread = [&, i] { + auto s = txns_shared[i]->GetForUpdate(read_options, std::to_string(i + 1), + nullptr); + ASSERT_OK(s); + ASSERT_OK(txns_shared[i]->Rollback()); + delete txns_shared[i]; + }; + threads_shared.emplace_back(blocking_thread); + } + + // Wait until all threads are waiting on each other. + while (checkpoints_shared.load() != 1) { + /* sleep override */ + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Complete the cycle T2 -> T1 with a shared lock. + auto s = txns_shared[1]->GetForUpdate(read_options, "0", nullptr, false); + ASSERT_TRUE(s.IsDeadlock()); + + auto dlock_buffer = db->GetDeadlockInfoBuffer(); + + // Verify the size of the buffer and the single path. + ASSERT_EQ(dlock_buffer.size(), 1); + ASSERT_EQ(dlock_buffer[0].path.size(), 2); + + // Verify the exclusivity field of the transactions in the deadlock path.
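+ // Expected order: path[0] is T1's exclusive request that started the wait,
+ // path[1] is T2's shared request that closed the cycle.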
+ ASSERT_TRUE(dlock_buffer[0].path[0].m_exclusive); + ASSERT_FALSE(dlock_buffer[0].path[1].m_exclusive); + ASSERT_OK(txns_shared[1]->Rollback()); + delete txns_shared[1]; + + for (auto& t : threads_shared) { + t.join(); + } +} + +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +TEST_P(TransactionStressTest, DeadlockCycle) { + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + + // offset by 2 from the max depth to test edge case + const uint32_t kMaxCycleLength = 52; + + txn_options.lock_timeout = 1000000; + txn_options.deadlock_detect = true; + + for (uint32_t len = 2; len < kMaxCycleLength; len++) { + // Set up a long wait for chain like this: + // + // T1 -> T2 -> T3 -> ... -> Tlen + + std::vector<Transaction*> txns(len); + + for (uint32_t i = 0; i < len; i++) { + txns[i] = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txns[i]); + auto s = txns[i]->GetForUpdate(read_options, std::to_string(i), nullptr); + ASSERT_OK(s); + } + + std::atomic<uint32_t> checkpoints(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "PointLockManager::AcquireWithTimeout:WaitingTxn", + [&](void* /*arg*/) { checkpoints.fetch_add(1); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // We want the last transaction in the chain to block and hold everyone + // back. + std::vector<port::Thread> threads; + for (uint32_t i = 0; i + 1 < len; i++) { + std::function<void()> blocking_thread = [&, i] { + auto s = + txns[i]->GetForUpdate(read_options, std::to_string(i + 1), nullptr); + ASSERT_OK(s); + ASSERT_OK(txns[i]->Rollback()); + delete txns[i]; + }; + threads.emplace_back(blocking_thread); + } + + // Wait until all threads are waiting on each other. + while (checkpoints.load() != len - 1) { + /* sleep override */ + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Complete the cycle Tlen -> T1 + auto s = txns[len - 1]->GetForUpdate(read_options, "0", nullptr); + ASSERT_TRUE(s.IsDeadlock()); + + const uint32_t dlock_buffer_size_ = (len - 1 > 5) ? 5 : (len - 1); + uint32_t curr_waiting_key = 0; + TransactionID curr_txn_id = txns[0]->GetID(); + + auto dlock_buffer = db->GetDeadlockInfoBuffer(); + ASSERT_EQ(dlock_buffer.size(), dlock_buffer_size_); + uint32_t check_len = len; + bool check_limit_flag = false; + + // Special case for a deadlock path that exceeds the maximum depth. + if (len > 50) { + check_len = 0; + check_limit_flag = true; + } + auto dlock_entry = dlock_buffer[0].path; + ASSERT_EQ(dlock_entry.size(), check_len); + ASSERT_EQ(dlock_buffer[0].limit_exceeded, check_limit_flag); + + int64_t pre_deadlock_time = dlock_buffer[0].deadlock_time; + int64_t cur_deadlock_time = 0; + for (auto const& dl_path_rec : dlock_buffer) { + cur_deadlock_time = dl_path_rec.deadlock_time; + ASSERT_NE(cur_deadlock_time, 0); + ASSERT_TRUE(cur_deadlock_time <= pre_deadlock_time); + pre_deadlock_time = cur_deadlock_time; + } + + // Iterates backwards over path verifying decreasing txn_ids. 
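+ // Walking the recorded path in reverse starts at the transaction that
+ // closed the cycle (id len + first_id - 1, waiting on key "0"); the ids
+ // then step down by one per node while the waiting key wraps from 0 to
+ // len - 1 and decreases.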
+ for (auto it = dlock_entry.rbegin(); it != dlock_entry.rend(); ++it) { + auto dl_node = *it; + ASSERT_EQ(dl_node.m_txn_id, len + curr_txn_id - 1); + ASSERT_EQ(dl_node.m_cf_id, 0u); + ASSERT_EQ(dl_node.m_waiting_key, std::to_string(curr_waiting_key)); + ASSERT_EQ(dl_node.m_exclusive, true); + + curr_txn_id--; + if (curr_waiting_key == 0) { + curr_waiting_key = len; + } + curr_waiting_key--; + } + + // Rollback the last transaction. + ASSERT_OK(txns[len - 1]->Rollback()); + delete txns[len - 1]; + + for (auto& t : threads) { + t.join(); + } + } +} + +TEST_P(TransactionStressTest, DeadlockStress) { + const uint32_t NUM_TXN_THREADS = 10; + const uint32_t NUM_KEYS = 100; + const uint32_t NUM_ITERS = 1000; + + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + + txn_options.lock_timeout = 1000000; + txn_options.deadlock_detect = true; + std::vector<std::string> keys; + + for (uint32_t i = 0; i < NUM_KEYS; i++) { + ASSERT_OK(db->Put(write_options, Slice(std::to_string(i)), Slice(""))); + keys.push_back(std::to_string(i)); + } + + size_t tid = std::hash<std::thread::id>()(std::this_thread::get_id()); + Random rnd(static_cast<uint32_t>(tid)); + std::function<void(uint32_t)> stress_thread = [&](uint32_t seed) { + std::default_random_engine g(seed); + + Transaction* txn; + for (uint32_t i = 0; i < NUM_ITERS; i++) { + txn = db->BeginTransaction(write_options, txn_options); + auto random_keys = keys; + std::shuffle(random_keys.begin(), random_keys.end(), g); + + // Lock keys in random order. + for (const auto& k : random_keys) { + // Lock mostly for shared access, but exclusive 1/4 of the time. + auto s = + txn->GetForUpdate(read_options, k, nullptr, txn->GetID() % 4 == 0); + if (!s.ok()) { + ASSERT_TRUE(s.IsDeadlock()); + ASSERT_OK(txn->Rollback()); + break; + } + } + + delete txn; + } + }; + + std::vector<port::Thread> threads; + for (uint32_t i = 0; i < NUM_TXN_THREADS; i++) { + threads.emplace_back(stress_thread, rnd.Next()); + } + + for (auto& t : threads) { + t.join(); + } +} +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +TEST_P(TransactionTest, CommitTimeBatchFailTest) { + WriteOptions write_options; + TransactionOptions txn_options; + + std::string value; + Status s; + + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn1); + + ASSERT_OK(txn1->GetCommitTimeWriteBatch()->Put("cat", "dog")); + + s = txn1->Put("foo", "bar"); + ASSERT_OK(s); + + // fails due to non-empty commit-time batch + s = txn1->Commit(); + ASSERT_EQ(s, Status::InvalidArgument()); + + delete txn1; +} + +TEST_P(TransactionTest, LogMarkLeakTest) { + TransactionOptions txn_options; + WriteOptions write_options; + options.write_buffer_size = 1024; + ASSERT_OK(ReOpenNoDelete()); + assert(db != nullptr); + Random rnd(47); + std::vector<Transaction*> txns; + DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB()); + // At the beginning there should be no log containing prepare data + ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0); + for (size_t i = 0; i < 100; i++) { + Transaction* txn = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn->SetName("xid" + std::to_string(i))); + ASSERT_OK(txn->Put(Slice("foo" + std::to_string(i)), Slice("bar"))); + ASSERT_OK(txn->Prepare()); + ASSERT_GT(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0); + if (rnd.OneIn(5)) { + txns.push_back(txn); + } else { + ASSERT_OK(txn->Commit()); + delete txn; + } + 
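+ // Flush every round so that, once a prepared section is committed,
+ // nothing but an outstanding prepare can keep its WAL file alive.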
ASSERT_OK(db_impl->TEST_FlushMemTable(true)); + } + for (auto txn : txns) { + ASSERT_OK(txn->Commit()); + delete txn; + } + // At the end there should be no log left containing prepare data + ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0); + // Make sure that the underlying data structures are properly truncated and + // do not leak + ASSERT_EQ(db_impl->TEST_PreparedSectionCompletedSize(), 0); + ASSERT_EQ(db_impl->TEST_LogsWithPrepSize(), 0); +} + +TEST_P(TransactionTest, SimpleTwoPhaseTransactionTest) { + for (bool cwb4recovery : {true, false}) { + ASSERT_OK(ReOpen()); + WriteOptions write_options; + ReadOptions read_options; + + TransactionOptions txn_options; + txn_options.use_only_the_last_commit_time_batch_for_recovery = cwb4recovery; + + std::string value; + Status s; + + DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB()); + + Transaction* txn = db->BeginTransaction(write_options, txn_options); + s = txn->SetName("xid"); + ASSERT_OK(s); + + ASSERT_EQ(db->GetTransactionByName("xid"), txn); + + // transaction put + s = txn->Put(Slice("foo"), Slice("bar")); + ASSERT_OK(s); + ASSERT_EQ(1, txn->GetNumPuts()); + + // regular db put + s = db->Put(write_options, Slice("foo2"), Slice("bar2")); + ASSERT_OK(s); + ASSERT_EQ(1, txn->GetNumPuts()); + + // regular db read + ASSERT_OK(db->Get(read_options, "foo2", &value)); + ASSERT_EQ(value, "bar2"); + + // commit time put + if (cwb4recovery) { + ASSERT_OK( + txn->GetCommitTimeWriteBatch()->Put(Slice("gtid"), Slice("dogs"))); + ASSERT_OK( + txn->GetCommitTimeWriteBatch()->Put(Slice("gtid2"), Slice("cats"))); + } + + // nothing has been prepped yet + ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0); + + s = txn->Prepare(); + ASSERT_OK(s); + + // data not in mem yet + s = db->Get(read_options, Slice("foo"), &value); + ASSERT_TRUE(s.IsNotFound()); + s = db->Get(read_options, Slice("gtid"), &value); + ASSERT_TRUE(s.IsNotFound()); + + // find trans in list of prepared transactions + std::vector<Transaction*> prepared_trans; + db->GetAllPreparedTransactions(&prepared_trans); + ASSERT_EQ(prepared_trans.size(), 1); + ASSERT_EQ(prepared_trans.front()->GetName(), "xid"); + + auto log_containing_prep = + db_impl->TEST_FindMinLogContainingOutstandingPrep(); + ASSERT_GT(log_containing_prep, 0); + + // make commit + s = txn->Commit(); + ASSERT_OK(s); + + // value is now available + s = db->Get(read_options, "foo", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar"); + + // we already committed + s = txn->Commit(); + ASSERT_EQ(s, Status::InvalidArgument()); + + // no longer in prepared results + db->GetAllPreparedTransactions(&prepared_trans); + ASSERT_EQ(prepared_trans.size(), 0); + ASSERT_EQ(db->GetTransactionByName("xid"), nullptr); + + // heap should not care about prepared section anymore + ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0); + + switch (txn_db_options.write_policy) { + case WRITE_COMMITTED: + // but now our memtable should be referencing the prep section + ASSERT_GE(log_containing_prep, db_impl->MinLogNumberToKeep()); + ASSERT_EQ(log_containing_prep, + db_impl->TEST_FindMinPrepLogReferencedByMemTable()); + break; + case WRITE_PREPARED: + case WRITE_UNPREPARED: + // In these modes memtables do not ref the prep sections + ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable()); + break; + default: + assert(false); + } + + ASSERT_OK(db_impl->TEST_FlushMemTable(true)); + // After flush the recoverable state must be visible + if (cwb4recovery) { + s =
db->Get(read_options, "gtid", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "dogs"); + + s = db->Get(read_options, "gtid2", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "cats"); + } + + // after memtable flush we can now release the log + ASSERT_GT(db_impl->MinLogNumberToKeep(), log_containing_prep); + ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable()); + + delete txn; + + if (cwb4recovery) { + // kill and reopen to trigger recovery + s = ReOpenNoDelete(); + ASSERT_OK(s); + assert(db != nullptr); + s = db->Get(read_options, "gtid", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "dogs"); + + s = db->Get(read_options, "gtid2", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "cats"); + } + } +} + +TEST_P(TransactionTest, TwoPhaseNameTest) { + Status s; + + WriteOptions write_options; + TransactionOptions txn_options; + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + Transaction* txn3 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn3); + delete txn3; + + // can't prepare txn without name + s = txn1->Prepare(); + ASSERT_EQ(s, Status::InvalidArgument()); + + // name too short + s = txn1->SetName(""); + ASSERT_EQ(s, Status::InvalidArgument()); + + // name too long + s = txn1->SetName(std::string(513, 'x')); + ASSERT_EQ(s, Status::InvalidArgument()); + + // valid name set + s = txn1->SetName("name1"); + ASSERT_OK(s); + + // can't have duplicate name + s = txn2->SetName("name1"); + ASSERT_EQ(s, Status::InvalidArgument()); + + // shouldn't be able to prepare + s = txn2->Prepare(); + ASSERT_EQ(s, Status::InvalidArgument()); + + // valid name set + s = txn2->SetName("name2"); + ASSERT_OK(s); + + // can't reset name + s = txn2->SetName("name3"); + ASSERT_EQ(s, Status::InvalidArgument()); + + ASSERT_EQ(txn1->GetName(), "name1"); + ASSERT_EQ(txn2->GetName(), "name2"); + + s = txn1->Prepare(); + ASSERT_OK(s); + + // can't rename after prepare + s = txn1->SetName("name4"); + ASSERT_EQ(s, Status::InvalidArgument()); + + ASSERT_OK(txn1->Rollback()); + ASSERT_OK(txn2->Rollback()); + delete txn1; + delete txn2; +} + +TEST_P(TransactionTest, TwoPhaseEmptyWriteTest) { + for (bool cwb4recovery : {true, false}) { + for (bool test_with_empty_wal : {true, false}) { + if (!cwb4recovery && test_with_empty_wal) { + continue; + } + ASSERT_OK(ReOpen()); + Status s; + std::string value; + + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + txn_options.use_only_the_last_commit_time_batch_for_recovery = + cwb4recovery; + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn1); + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn2); + + s = txn1->SetName("joe"); + ASSERT_OK(s); + + s = txn2->SetName("bob"); + ASSERT_OK(s); + + s = txn1->Prepare(); + ASSERT_OK(s); + + s = txn1->Commit(); + ASSERT_OK(s); + + delete txn1; + + if (cwb4recovery) { + ASSERT_OK( + txn2->GetCommitTimeWriteBatch()->Put(Slice("foo"), Slice("bar"))); + } + + s = txn2->Prepare(); + ASSERT_OK(s); + + s = txn2->Commit(); + ASSERT_OK(s); + + delete txn2; + if (cwb4recovery) { + if (test_with_empty_wal) { + DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB()); + ASSERT_OK(db_impl->TEST_FlushMemTable(true)); + // After flush the state must be visible + s = db->Get(read_options, "foo", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar"); + } + ASSERT_OK(db->FlushWAL(true)); + // kill and reopen to trigger recovery + s =
ReOpenNoDelete(); + ASSERT_OK(s); + assert(db != nullptr); + s = db->Get(read_options, "foo", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar"); + } + } + } +} + +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +TEST_P(TransactionStressTest, TwoPhaseExpirationTest) { + Status s; + + WriteOptions write_options; + TransactionOptions txn_options; + txn_options.expiration = 500; // 500ms + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn1); + ASSERT_TRUE(txn2); + + s = txn1->SetName("joe"); + ASSERT_OK(s); + s = txn2->SetName("bob"); + ASSERT_OK(s); + + s = txn1->Prepare(); + ASSERT_OK(s); + + /* sleep override */ + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + + s = txn1->Commit(); + ASSERT_OK(s); + + s = txn2->Prepare(); + ASSERT_EQ(s, Status::Expired()); + + delete txn1; + delete txn2; +} + +TEST_P(TransactionTest, TwoPhaseRollbackTest) { + WriteOptions write_options; + ReadOptions read_options; + + TransactionOptions txn_options; + + std::string value; + Status s; + + DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB()); + Transaction* txn = db->BeginTransaction(write_options, txn_options); + s = txn->SetName("xid"); + ASSERT_OK(s); + + // transaction put + s = txn->Put(Slice("tfoo"), Slice("tbar")); + ASSERT_OK(s); + + // value is readable from txn + s = txn->Get(read_options, Slice("tfoo"), &value); + ASSERT_OK(s); + ASSERT_EQ(value, "tbar"); + + // issue rollback + s = txn->Rollback(); + ASSERT_OK(s); + + // value is no longer readable + s = txn->Get(read_options, Slice("tfoo"), &value); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_EQ(txn->GetNumPuts(), 0); + + // put new txn values + s = txn->Put(Slice("tfoo2"), Slice("tbar2")); + ASSERT_OK(s); + + // new value is readable from txn + s = txn->Get(read_options, Slice("tfoo2"), &value); + ASSERT_OK(s); + ASSERT_EQ(value, "tbar2"); + + s = txn->Prepare(); + ASSERT_OK(s); + + // flush to next wal + s = db->Put(write_options, Slice("foo"), Slice("bar")); + ASSERT_OK(s); + ASSERT_OK(db_impl->TEST_FlushMemTable(true)); + + // issue rollback (marker written to WAL) + s = txn->Rollback(); + ASSERT_OK(s); + + // value is no longer readable + s = txn->Get(read_options, Slice("tfoo2"), &value); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_EQ(txn->GetNumPuts(), 0); + + // make commit + s = txn->Commit(); + ASSERT_EQ(s, Status::InvalidArgument()); + + // try rollback again + s = txn->Rollback(); + ASSERT_EQ(s, Status::InvalidArgument()); + + delete txn; +} + +TEST_P(TransactionTest, PersistentTwoPhaseTransactionTest) { + WriteOptions write_options; + write_options.sync = true; + write_options.disableWAL = false; + ReadOptions read_options; + + TransactionOptions txn_options; + + std::string value; + Status s; + + DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB()); + + Transaction* txn = db->BeginTransaction(write_options, txn_options); + s = txn->SetName("xid"); + ASSERT_OK(s); + + ASSERT_EQ(db->GetTransactionByName("xid"), txn); + + // transaction put + s = txn->Put(Slice("foo"), Slice("bar")); + ASSERT_OK(s); + ASSERT_EQ(1, txn->GetNumPuts()); + + // txn read + s = txn->Get(read_options, "foo", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar"); + + // regular db put + s = db->Put(write_options, Slice("foo2"), Slice("bar2")); + ASSERT_OK(s); + ASSERT_EQ(1, txn->GetNumPuts()); + + ASSERT_OK(db_impl->TEST_FlushMemTable(true)); + + // regular db read + db->Get(read_options, "foo2",
&value); + ASSERT_EQ(value, "bar2"); + + // nothing has been prepped yet + ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0); + + // prepare + s = txn->Prepare(); + ASSERT_OK(s); + + // still not available to db + s = db->Get(read_options, Slice("foo"), &value); + ASSERT_TRUE(s.IsNotFound()); + + ASSERT_OK(db->FlushWAL(false)); + delete txn; + // kill and reopen + reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash(); + s = ReOpenNoDelete(); + ASSERT_OK(s); + assert(db != nullptr); + db_impl = static_cast_with_check<DBImpl>(db->GetRootDB()); + + // find trans in list of prepared transactions + std::vector<Transaction*> prepared_trans; + db->GetAllPreparedTransactions(&prepared_trans); + ASSERT_EQ(prepared_trans.size(), 1); + + txn = prepared_trans.front(); + ASSERT_TRUE(txn); + ASSERT_EQ(txn->GetName(), "xid"); + ASSERT_EQ(db->GetTransactionByName("xid"), txn); + + // log has been marked + auto log_containing_prep = + db_impl->TEST_FindMinLogContainingOutstandingPrep(); + ASSERT_GT(log_containing_prep, 0); + + // value is readable from txn + s = txn->Get(read_options, "foo", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar"); + + // make commit + s = txn->Commit(); + ASSERT_OK(s); + + // value is now available + db->Get(read_options, "foo", &value); + ASSERT_EQ(value, "bar"); + + // we already committed + s = txn->Commit(); + ASSERT_EQ(s, Status::InvalidArgument()); + + // no longer is prepared results + prepared_trans.clear(); + db->GetAllPreparedTransactions(&prepared_trans); + ASSERT_EQ(prepared_trans.size(), 0); + + // transaction should no longer be visible + ASSERT_EQ(db->GetTransactionByName("xid"), nullptr); + + // heap should not care about prepared section anymore + ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0); + + switch (txn_db_options.write_policy) { + case WRITE_COMMITTED: + // but now our memtable should be referencing the prep section + ASSERT_EQ(log_containing_prep, + db_impl->TEST_FindMinPrepLogReferencedByMemTable()); + ASSERT_GE(log_containing_prep, db_impl->MinLogNumberToKeep()); + + break; + case WRITE_PREPARED: + case WRITE_UNPREPARED: + // In these modes memtable do not ref the prep sections + ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable()); + break; + default: + assert(false); + } + + // Add a dummy record to memtable before a flush. Otherwise, the + // memtable will be empty and flush will be skipped. 
+ s = db->Put(write_options, Slice("foo3"), Slice("bar3")); + ASSERT_OK(s); + + ASSERT_OK(db_impl->TEST_FlushMemTable(true)); + + // after memtable flush we can now release the log + ASSERT_GT(db_impl->MinLogNumberToKeep(), log_containing_prep); + ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable()); + + delete txn; + + // deleting transaction should unregister transaction + ASSERT_EQ(db->GetTransactionByName("xid"), nullptr); +} +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +// TODO this test needs to be updated with serial commits +TEST_P(TransactionTest, DISABLED_TwoPhaseMultiThreadTest) { + // mix transaction writes and regular writes + const uint32_t NUM_TXN_THREADS = 50; + std::atomic<uint32_t> txn_thread_num(0); + + std::function<void()> txn_write_thread = [&]() { + uint32_t id = txn_thread_num.fetch_add(1); + + WriteOptions write_options; + write_options.sync = true; + write_options.disableWAL = false; + TransactionOptions txn_options; + txn_options.lock_timeout = 1000000; + if (id % 2 == 0) { + txn_options.expiration = 1000000; + } + TransactionName name("xid_" + std::string(1, 'A' + static_cast<char>(id))); + Transaction* txn = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn->SetName(name)); + for (int i = 0; i < 10; i++) { + std::string key(name + "_" + std::string(1, static_cast<char>('A' + i))); + ASSERT_OK(txn->Put(key, "val")); + } + ASSERT_OK(txn->Prepare()); + ASSERT_OK(txn->Commit()); + delete txn; + }; + + // assure that all thread are in the same write group + std::atomic<uint32_t> t_wait_on_prepare(0); + std::atomic<uint32_t> t_wait_on_commit(0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::JoinBatchGroup:Wait", [&](void* arg) { + auto* writer = reinterpret_cast<WriteThread::Writer*>(arg); + + if (writer->ShouldWriteToWAL()) { + t_wait_on_prepare.fetch_add(1); + // wait for friends + while (t_wait_on_prepare.load() < NUM_TXN_THREADS) { + env->SleepForMicroseconds(10); + } + } else if (writer->ShouldWriteToMemtable()) { + t_wait_on_commit.fetch_add(1); + // wait for friends + while (t_wait_on_commit.load() < NUM_TXN_THREADS) { + env->SleepForMicroseconds(10); + } + } else { + FAIL(); + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // do all the writes + std::vector<port::Thread> threads; + for (uint32_t i = 0; i < NUM_TXN_THREADS; i++) { + threads.emplace_back(txn_write_thread); + } + for (auto& t : threads) { + t.join(); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + + ReadOptions read_options; + std::string value; + Status s; + for (uint32_t t = 0; t < NUM_TXN_THREADS; t++) { + TransactionName name("xid_" + std::string(1, 'A' + static_cast<char>(t))); + for (int i = 0; i < 10; i++) { + std::string key(name + "_" + std::string(1, static_cast<char>('A' + i))); + s = db->Get(read_options, key, &value); + ASSERT_OK(s); + ASSERT_EQ(value, "val"); + } + } +} + +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +TEST_P(TransactionStressTest, TwoPhaseLongPrepareTest) { + WriteOptions write_options; + write_options.sync = true; + write_options.disableWAL = false; + ReadOptions read_options; + TransactionOptions txn_options; + + std::string value; + Status s; + + Transaction* txn = db->BeginTransaction(write_options, txn_options); + s = txn->SetName("bob"); + ASSERT_OK(s); + + // transaction put + s = txn->Put(Slice("foo"), 
Slice("bar")); + ASSERT_OK(s); + + // prepare + s = txn->Prepare(); + ASSERT_OK(s); + + delete txn; + + for (int i = 0; i < 1000; i++) { + std::string key(i, 'k'); + std::string val(1000, 'v'); + assert(db != nullptr); + s = db->Put(write_options, key, val); + ASSERT_OK(s); + + if (i % 29 == 0) { + // crash + env->SetFilesystemActive(false); + reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash(); + ReOpenNoDelete(); + } else if (i % 37 == 0) { + // close + ReOpenNoDelete(); + } + } + + // commit old txn + txn = db->GetTransactionByName("bob"); + ASSERT_TRUE(txn); + s = txn->Commit(); + ASSERT_OK(s); + + // verify data txn data + s = db->Get(read_options, "foo", &value); + ASSERT_EQ(s, Status::OK()); + ASSERT_EQ(value, "bar"); + + // verify non txn data + for (int i = 0; i < 1000; i++) { + std::string key(i, 'k'); + std::string val(1000, 'v'); + s = db->Get(read_options, key, &value); + ASSERT_EQ(s, Status::OK()); + ASSERT_EQ(value, val); + } + + delete txn; +} + +TEST_P(TransactionTest, TwoPhaseSequenceTest) { + WriteOptions write_options; + write_options.sync = true; + write_options.disableWAL = false; + ReadOptions read_options; + + TransactionOptions txn_options; + + std::string value; + Status s; + + Transaction* txn = db->BeginTransaction(write_options, txn_options); + s = txn->SetName("xid"); + ASSERT_OK(s); + + // transaction put + s = txn->Put(Slice("foo"), Slice("bar")); + ASSERT_OK(s); + s = txn->Put(Slice("foo2"), Slice("bar2")); + ASSERT_OK(s); + s = txn->Put(Slice("foo3"), Slice("bar3")); + ASSERT_OK(s); + s = txn->Put(Slice("foo4"), Slice("bar4")); + ASSERT_OK(s); + + // prepare + s = txn->Prepare(); + ASSERT_OK(s); + + // make commit + s = txn->Commit(); + ASSERT_OK(s); + + delete txn; + + // kill and reopen + env->SetFilesystemActive(false); + ReOpenNoDelete(); + assert(db != nullptr); + + // value is now available + s = db->Get(read_options, "foo4", &value); + ASSERT_EQ(s, Status::OK()); + ASSERT_EQ(value, "bar4"); +} +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +TEST_P(TransactionTest, TwoPhaseDoubleRecoveryTest) { + WriteOptions write_options; + write_options.sync = true; + write_options.disableWAL = false; + ReadOptions read_options; + + TransactionOptions txn_options; + + std::string value; + Status s; + + Transaction* txn = db->BeginTransaction(write_options, txn_options); + s = txn->SetName("a"); + ASSERT_OK(s); + + // transaction put + s = txn->Put(Slice("foo"), Slice("bar")); + ASSERT_OK(s); + + // prepare + s = txn->Prepare(); + ASSERT_OK(s); + + delete txn; + + // kill and reopen + env->SetFilesystemActive(false); + reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash(); + ReOpenNoDelete(); + + // commit old txn + assert(db != nullptr); // Make clang analyze happy. 
+  txn = db->GetTransactionByName("a");
+  assert(txn != nullptr);
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "foo", &value);
+  ASSERT_EQ(s, Status::OK());
+  ASSERT_EQ(value, "bar");
+
+  delete txn;
+
+  txn = db->BeginTransaction(write_options, txn_options);
+  s = txn->SetName("b");
+  ASSERT_OK(s);
+
+  s = txn->Put(Slice("foo2"), Slice("bar2"));
+  ASSERT_OK(s);
+
+  s = txn->Prepare();
+  ASSERT_OK(s);
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  delete txn;
+
+  // kill and reopen
+  env->SetFilesystemActive(false);
+  ASSERT_OK(ReOpenNoDelete());
+  assert(db != nullptr);
+
+  // value is now available
+  s = db->Get(read_options, "foo", &value);
+  ASSERT_EQ(s, Status::OK());
+  ASSERT_EQ(value, "bar");
+
+  s = db->Get(read_options, "foo2", &value);
+  ASSERT_EQ(s, Status::OK());
+  ASSERT_EQ(value, "bar2");
+}
+
+TEST_P(TransactionTest, TwoPhaseLogRollingTest) {
+  DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+
+  Status s;
+  std::string v;
+  ColumnFamilyHandle *cfa, *cfb;
+
+  // Create 2 new column families
+  ColumnFamilyOptions cf_options;
+  s = db->CreateColumnFamily(cf_options, "CFA", &cfa);
+  ASSERT_OK(s);
+  s = db->CreateColumnFamily(cf_options, "CFB", &cfb);
+  ASSERT_OK(s);
+
+  WriteOptions wopts;
+  wopts.disableWAL = false;
+  wopts.sync = true;
+
+  TransactionOptions topts1;
+  Transaction* txn1 = db->BeginTransaction(wopts, topts1);
+  s = txn1->SetName("xid1");
+  ASSERT_OK(s);
+
+  TransactionOptions topts2;
+  Transaction* txn2 = db->BeginTransaction(wopts, topts2);
+  s = txn2->SetName("xid2");
+  ASSERT_OK(s);
+
+  // txn1 puts in one column family
+  s = txn1->Put(cfa, "ka1", "va1");
+  ASSERT_OK(s);
+
+  // txn2 puts in two column families
+  s = txn2->Put(cfa, "ka2", "va2");
+  ASSERT_OK(s);
+  s = txn2->Put(cfb, "kb2", "vb2");
+  ASSERT_OK(s);
+
+  // write prep section to wal
+  s = txn1->Prepare();
+  ASSERT_OK(s);
+
+  // our log should be in the heap
+  ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(),
+            txn1->GetLogNumber());
+  ASSERT_EQ(db_impl->TEST_LogfileNumber(), txn1->GetLastLogNumber());
+
+  // flush default cf to create new log
+  s = db->Put(wopts, "foo", "bar");
+  ASSERT_OK(s);
+  s = db_impl->TEST_FlushMemTable(true);
+  ASSERT_OK(s);
+
+  // make sure we are on a new log
+  ASSERT_GT(db_impl->TEST_LogfileNumber(), txn1->GetLastLogNumber());
+
+  // put txn2 prep section in this log
+  s = txn2->Prepare();
+  ASSERT_OK(s);
+  ASSERT_EQ(db_impl->TEST_LogfileNumber(), txn2->GetLastLogNumber());
+
+  // heap should still see first log
+  ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(),
+            txn1->GetLogNumber());
+
+  // commit txn1
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  // heap should now show txn2's log
+  ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(),
+            txn2->GetLogNumber());
+
+  switch (txn_db_options.write_policy) {
+    case WRITE_COMMITTED:
+      // we should see txn1's log referenced by the memtables
+      ASSERT_EQ(txn1->GetLogNumber(),
+                db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+      break;
+    case WRITE_PREPARED:
+    case WRITE_UNPREPARED:
+      // In these modes memtables do not ref the prep sections
+      ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+      break;
+    default:
+      assert(false);
+  }
+
+  // flush default cf to create new log
+  s = db->Put(wopts, "foo", "bar2");
+  ASSERT_OK(s);
+  s = db_impl->TEST_FlushMemTable(true);
+  ASSERT_OK(s);
+
+  // make sure we are on a new log
+  ASSERT_GT(db_impl->TEST_LogfileNumber(), txn2->GetLastLogNumber());
+
+  // commit txn2
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  // heap should not show any logs
+  ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
+
+  switch (txn_db_options.write_policy) {
+    case WRITE_COMMITTED:
+      // should still show txn1's log
+      ASSERT_EQ(txn1->GetLogNumber(),
+                db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+      break;
+    case WRITE_PREPARED:
+    case WRITE_UNPREPARED:
+      // In these modes memtables do not ref the prep sections
+      ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+      break;
+    default:
+      assert(false);
+  }
+
+  // flush only cfa memtable
+  s = db_impl->TEST_FlushMemTable(true, false, cfa);
+  ASSERT_OK(s);
+
+  switch (txn_db_options.write_policy) {
+    case WRITE_COMMITTED:
+      // should now show txn2's log
+      ASSERT_EQ(txn2->GetLogNumber(),
+                db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+      break;
+    case WRITE_PREPARED:
+    case WRITE_UNPREPARED:
+      // In these modes memtables do not ref the prep sections
+      ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+      break;
+    default:
+      assert(false);
+  }
+
+  // flush only cfb memtable
+  s = db_impl->TEST_FlushMemTable(true, false, cfb);
+  ASSERT_OK(s);
+
+  // should show no dependency on logs
+  ASSERT_EQ(db_impl->TEST_FindMinPrepLogReferencedByMemTable(), 0);
+  ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
+
+  delete txn1;
+  delete txn2;
+  delete cfa;
+  delete cfb;
+}
+
+TEST_P(TransactionTest, TwoPhaseLogRollingTest2) {
+  DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+
+  Status s;
+  ColumnFamilyHandle *cfa, *cfb;
+
+  ColumnFamilyOptions cf_options;
+  s = db->CreateColumnFamily(cf_options, "CFA", &cfa);
+  ASSERT_OK(s);
+  s = db->CreateColumnFamily(cf_options, "CFB", &cfb);
+  ASSERT_OK(s);
+
+  WriteOptions wopts;
+  wopts.disableWAL = false;
+  wopts.sync = true;
+
+  auto cfh_a = static_cast_with_check<ColumnFamilyHandleImpl>(cfa);
+  auto cfh_b = static_cast_with_check<ColumnFamilyHandleImpl>(cfb);
+
+  TransactionOptions topts1;
+  Transaction* txn1 = db->BeginTransaction(wopts, topts1);
+  s = txn1->SetName("xid1");
+  ASSERT_OK(s);
+  s = txn1->Put(cfa, "boys", "girls1");
+  ASSERT_OK(s);
+
+  Transaction* txn2 = db->BeginTransaction(wopts, topts1);
+  s = txn2->SetName("xid2");
+  ASSERT_OK(s);
+  s = txn2->Put(cfb, "up", "down1");
+  ASSERT_OK(s);
+
+  // prepare transaction in LOG A
+  s = txn1->Prepare();
+  ASSERT_OK(s);
+
+  // prepare transaction in LOG A
+  s = txn2->Prepare();
+  ASSERT_OK(s);
+
+  // regular put so that the memtable can actually be flushed for log rolling
+  s = db->Put(wopts, "cats", "dogs1");
+  ASSERT_OK(s);
+
+  auto prepare_log_no = txn1->GetLastLogNumber();
+
+  // roll to LOG B
+  s = db_impl->TEST_FlushMemTable(true);
+  ASSERT_OK(s);
+
+  // now we pause background work so that
+  // imm()s are not flushed before we can check their status
+  s = db_impl->PauseBackgroundWork();
+  ASSERT_OK(s);
+
+  ASSERT_GT(db_impl->TEST_LogfileNumber(), prepare_log_no);
+  switch (txn_db_options.write_policy) {
+    case WRITE_COMMITTED:
+      // This cf is empty and should ref the latest log
+      ASSERT_GT(cfh_a->cfd()->GetLogNumber(), prepare_log_no);
+      ASSERT_EQ(cfh_a->cfd()->GetLogNumber(), db_impl->TEST_LogfileNumber());
+      break;
+    case WRITE_PREPARED:
+    case WRITE_UNPREPARED:
+      // This cf is not flushed yet and should ref the log that has its data
+      ASSERT_EQ(cfh_a->cfd()->GetLogNumber(), prepare_log_no);
+      break;
+    default:
+      assert(false);
+  }
+  ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(),
+            txn1->GetLogNumber());
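+  // Two separate trackers gate WAL reclamation in these tests: the minimum
+  // log still containing an outstanding (prepared but uncommitted) prep
+  // section, checked above, and the minimum prep log still referenced by a
+  // memtable, checked next. The latter is only populated under
+  // WRITE_COMMITTED, where commit writes the prepared data into the
+  // memtable. A log can be released only when neither needs it, roughly:
+  //
+  //   min_log_to_keep = min(min_log_with_outstanding_prep,
+  //                         min_prep_log_referenced_by_memtable,
+  //                         min_log_with_unflushed_data);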
+  ASSERT_EQ(db_impl->TEST_FindMinPrepLogReferencedByMemTable(), 0);
+
+  // commit in LOG B
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  switch (txn_db_options.write_policy) {
+    case WRITE_COMMITTED:
+      ASSERT_EQ(db_impl->TEST_FindMinPrepLogReferencedByMemTable(),
+                prepare_log_no);
+      break;
+    case WRITE_PREPARED:
+    case WRITE_UNPREPARED:
+      // In these modes memtables do not ref the prep sections
+      ASSERT_EQ(db_impl->TEST_FindMinPrepLogReferencedByMemTable(), 0);
+      break;
+    default:
+      assert(false);
+  }
+
+  ASSERT_TRUE(!db_impl->TEST_UnableToReleaseOldestLog());
+
+  // request a flush for all column families such that the earliest
+  // alive log file can be killed
+  ASSERT_OK(db_impl->TEST_SwitchWAL());
+  // log cannot be flushed because txn2 has not been committed
+  ASSERT_TRUE(!db_impl->TEST_IsLogGettingFlushed());
+  ASSERT_TRUE(db_impl->TEST_UnableToReleaseOldestLog());
+
+  // assert that cfa has a flush requested
+  ASSERT_TRUE(cfh_a->cfd()->imm()->HasFlushRequested());
+
+  switch (txn_db_options.write_policy) {
+    case WRITE_COMMITTED:
+      // cfb should not be flushed because it has no data from LOG A
+      ASSERT_TRUE(!cfh_b->cfd()->imm()->HasFlushRequested());
+      break;
+    case WRITE_PREPARED:
+    case WRITE_UNPREPARED:
+      // cfb should be flushed because it has prepared data from LOG A
+      ASSERT_TRUE(cfh_b->cfd()->imm()->HasFlushRequested());
+      break;
+    default:
+      assert(false);
+  }
+
+  // cfb now has data from LOG A
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  ASSERT_OK(db_impl->TEST_SwitchWAL());
+  ASSERT_TRUE(!db_impl->TEST_UnableToReleaseOldestLog());
+
+  // we should see that cfb now has a flush requested
+  ASSERT_TRUE(cfh_b->cfd()->imm()->HasFlushRequested());
+
+  // all data in LOG A resides in a memtable that has been
+  // requested for a flush
+  ASSERT_TRUE(db_impl->TEST_IsLogGettingFlushed());
+
+  delete txn1;
+  delete txn2;
+  delete cfa;
+  delete cfb;
+}
+/*
+ * 1) use prepare to keep first log around to determine starting sequence
+ * during recovery.
+ * 2) insert many values, skipping wal, to increase seqid.
+ * 3) insert final value into wal + * 4) recover and see that final value was properly recovered - not + * hidden behind improperly summed sequence ids + */ +TEST_P(TransactionTest, TwoPhaseOutOfOrderDelete) { + DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB()); + WriteOptions wal_on, wal_off; + wal_on.sync = true; + wal_on.disableWAL = false; + wal_off.disableWAL = true; + ReadOptions read_options; + TransactionOptions txn_options; + + std::string value; + Status s; + + Transaction* txn1 = db->BeginTransaction(wal_on, txn_options); + + s = txn1->SetName("1"); + ASSERT_OK(s); + + s = db->Put(wal_on, "first", "first"); + ASSERT_OK(s); + + s = txn1->Put(Slice("dummy"), Slice("dummy")); + ASSERT_OK(s); + s = txn1->Prepare(); + ASSERT_OK(s); + + s = db->Put(wal_off, "cats", "dogs1"); + ASSERT_OK(s); + s = db->Put(wal_off, "cats", "dogs2"); + ASSERT_OK(s); + s = db->Put(wal_off, "cats", "dogs3"); + ASSERT_OK(s); + + s = db_impl->TEST_FlushMemTable(true); + ASSERT_OK(s); + + s = db->Put(wal_on, "cats", "dogs4"); + ASSERT_OK(s); + + ASSERT_OK(db->FlushWAL(false)); + + // kill and reopen + env->SetFilesystemActive(false); + reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash(); + ASSERT_OK(ReOpenNoDelete()); + assert(db != nullptr); + + s = db->Get(read_options, "first", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "first"); + + s = db->Get(read_options, "cats", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "dogs4"); +} + +TEST_P(TransactionTest, FirstWriteTest) { + WriteOptions write_options; + + // Test conflict checking against the very first write to a db. + // The transaction's snapshot will have seq 1 and the following write + // will have sequence 1. + Status s = db->Put(write_options, "A", "a"); + + Transaction* txn = db->BeginTransaction(write_options); + txn->SetSnapshot(); + + ASSERT_OK(s); + + s = txn->Put("A", "b"); + ASSERT_OK(s); + + delete txn; +} + +TEST_P(TransactionTest, FirstWriteTest2) { + WriteOptions write_options; + + Transaction* txn = db->BeginTransaction(write_options); + txn->SetSnapshot(); + + // Test conflict checking against the very first write to a db. + // The transaction's snapshot is a seq 0 while the following write + // will have sequence 1. 
+ Status s = db->Put(write_options, "A", "a"); + ASSERT_OK(s); + + s = txn->Put("A", "b"); + ASSERT_TRUE(s.IsBusy()); + + delete txn; +} + +TEST_P(TransactionTest, WriteOptionsTest) { + WriteOptions write_options; + write_options.sync = true; + write_options.disableWAL = true; + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + ASSERT_TRUE(txn->GetWriteOptions()->sync); + + write_options.sync = false; + txn->SetWriteOptions(write_options); + ASSERT_FALSE(txn->GetWriteOptions()->sync); + ASSERT_TRUE(txn->GetWriteOptions()->disableWAL); + + delete txn; +} + +TEST_P(TransactionTest, WriteConflictTest) { + WriteOptions write_options; + ReadOptions read_options; + std::string value; + Status s; + + ASSERT_OK(db->Put(write_options, "foo", "A")); + ASSERT_OK(db->Put(write_options, "foo2", "B")); + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + s = txn->Put("foo", "A2"); + ASSERT_OK(s); + + s = txn->Put("foo2", "B2"); + ASSERT_OK(s); + + // This Put outside of a transaction will conflict with the previous write + s = db->Put(write_options, "foo", "xxx"); + ASSERT_TRUE(s.IsTimedOut()); + + s = db->Get(read_options, "foo", &value); + ASSERT_EQ(value, "A"); + + s = txn->Commit(); + ASSERT_OK(s); + + db->Get(read_options, "foo", &value); + ASSERT_EQ(value, "A2"); + db->Get(read_options, "foo2", &value); + ASSERT_EQ(value, "B2"); + + delete txn; +} + +TEST_P(TransactionTest, WriteConflictTest2) { + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + std::string value; + Status s; + + ASSERT_OK(db->Put(write_options, "foo", "bar")); + + txn_options.set_snapshot = true; + Transaction* txn = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn); + + // This Put outside of a transaction will conflict with a later write + s = db->Put(write_options, "foo", "barz"); + ASSERT_OK(s); + + s = txn->Put("foo2", "X"); + ASSERT_OK(s); + + s = txn->Put("foo", + "bar2"); // Conflicts with write done after snapshot taken + ASSERT_TRUE(s.IsBusy()); + + s = txn->Put("foo3", "Y"); + ASSERT_OK(s); + + s = db->Get(read_options, "foo", &value); + ASSERT_EQ(value, "barz"); + + ASSERT_EQ(2, txn->GetNumKeys()); + + s = txn->Commit(); + ASSERT_OK(s); // Txn should commit, but only write foo2 and foo3 + + // Verify that transaction wrote foo2 and foo3 but not foo + db->Get(read_options, "foo", &value); + ASSERT_EQ(value, "barz"); + + db->Get(read_options, "foo2", &value); + ASSERT_EQ(value, "X"); + + db->Get(read_options, "foo3", &value); + ASSERT_EQ(value, "Y"); + + delete txn; +} + +TEST_P(TransactionTest, ReadConflictTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + TransactionOptions txn_options; + std::string value; + Status s; + + ASSERT_OK(db->Put(write_options, "foo", "bar")); + ASSERT_OK(db->Put(write_options, "foo2", "bar")); + + txn_options.set_snapshot = true; + Transaction* txn = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn); + + txn->SetSnapshot(); + snapshot_read_options.snapshot = txn->GetSnapshot(); + + ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value)); + ASSERT_EQ(value, "bar"); + + // This Put outside of a transaction will conflict with the previous read + s = db->Put(write_options, "foo", "barz"); + ASSERT_TRUE(s.IsTimedOut()); + + s = db->Get(read_options, "foo", &value); + ASSERT_EQ(value, "bar"); + + s = txn->Get(read_options, "foo", &value); + ASSERT_EQ(value, "bar"); + + s = txn->Commit(); + ASSERT_OK(s); 
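+  // The IsTimedOut() results above come from lock contention: GetForUpdate
+  // takes the same exclusive lock a Put would, so the non-transactional
+  // db->Put blocked until the lock timeout elapsed. Both waits are
+  // tunable at DB open; a sketch (values in milliseconds):
+  //
+  //   TransactionDBOptions tdb_opts;
+  //   tdb_opts.transaction_lock_timeout = 1;  // txn-vs-txn lock waits
+  //   tdb_opts.default_lock_timeout = 1;      // non-txn writes vs txn locks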
+ + delete txn; +} + +TEST_P(TransactionTest, TxnOnlyTest) { + // Test to make sure transactions work when there are no other writes in an + // empty db. + + WriteOptions write_options; + ReadOptions read_options; + std::string value; + Status s; + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + s = txn->Put("x", "y"); + ASSERT_OK(s); + + s = txn->Commit(); + ASSERT_OK(s); + + delete txn; +} + +TEST_P(TransactionTest, FlushTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + std::string value; + Status s; + + ASSERT_OK(db->Put(write_options, Slice("foo"), Slice("bar"))); + ASSERT_OK(db->Put(write_options, Slice("foo2"), Slice("bar"))); + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + snapshot_read_options.snapshot = txn->GetSnapshot(); + + ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value)); + ASSERT_EQ(value, "bar"); + + s = txn->Put(Slice("foo"), Slice("bar2")); + ASSERT_OK(s); + + ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value)); + ASSERT_EQ(value, "bar2"); + + // Put a random key so we have a memtable to flush + s = db->Put(write_options, "dummy", "dummy"); + ASSERT_OK(s); + + // force a memtable flush + FlushOptions flush_ops; + db->Flush(flush_ops); + + s = txn->Commit(); + // txn should commit since the flushed table is still in MemtableList History + ASSERT_OK(s); + + db->Get(read_options, "foo", &value); + ASSERT_EQ(value, "bar2"); + + delete txn; +} + +TEST_P(TransactionTest, FlushTest2) { + const size_t num_tests = 3; + + for (size_t n = 0; n < num_tests; n++) { + // Test different table factories + switch (n) { + case 0: + break; + case 1: + options.table_factory.reset(new mock::MockTableFactory()); + break; + case 2: { + PlainTableOptions pt_opts; + pt_opts.hash_table_ratio = 0; + options.table_factory.reset(NewPlainTableFactory(pt_opts)); + break; + } + } + + Status s = ReOpen(); + ASSERT_OK(s); + assert(db != nullptr); + + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + TransactionOptions txn_options; + std::string value; + + DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB()); + + ASSERT_OK(db->Put(write_options, Slice("foo"), Slice("bar"))); + ASSERT_OK(db->Put(write_options, Slice("foo2"), Slice("bar2"))); + ASSERT_OK(db->Put(write_options, Slice("foo3"), Slice("bar3"))); + + txn_options.set_snapshot = true; + Transaction* txn = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn); + + snapshot_read_options.snapshot = txn->GetSnapshot(); + + ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value)); + ASSERT_EQ(value, "bar"); + + s = txn->Put(Slice("foo"), Slice("bar2")); + ASSERT_OK(s); + + ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value)); + ASSERT_EQ(value, "bar2"); + // verify foo is locked by txn + s = db->Delete(write_options, "foo"); + ASSERT_TRUE(s.IsTimedOut()); + + s = db->Put(write_options, "Z", "z"); + ASSERT_OK(s); + s = db->Put(write_options, "dummy", "dummy"); + ASSERT_OK(s); + + s = db->Put(write_options, "S", "s"); + ASSERT_OK(s); + s = db->SingleDelete(write_options, "S"); + ASSERT_OK(s); + + s = txn->Delete("S"); + // Should fail after encountering a write to S in memtable + ASSERT_TRUE(s.IsBusy()); + + // force a memtable flush + s = db_impl->TEST_FlushMemTable(true); + ASSERT_OK(s); + + // Put a random key so we have a MemTable to flush + s = db->Put(write_options, "dummy", "dummy2"); + ASSERT_OK(s); + + // force a memtable 
flush + ASSERT_OK(db_impl->TEST_FlushMemTable(true)); + + s = db->Put(write_options, "dummy", "dummy3"); + ASSERT_OK(s); + + // force a memtable flush + // Since our test db has max_write_buffer_number=2, this flush will cause + // the first memtable to get purged from the MemtableList history. + ASSERT_OK(db_impl->TEST_FlushMemTable(true)); + + s = txn->Put("X", "Y"); + // Should succeed after verifying there is no write to X in SST file + ASSERT_OK(s); + + s = txn->Put("Z", "zz"); + // Should fail after encountering a write to Z in SST file + ASSERT_TRUE(s.IsBusy()); + + s = txn->GetForUpdate(read_options, "foo2", &value); + // should succeed since key was written before txn started + ASSERT_OK(s); + // verify foo2 is locked by txn + s = db->Delete(write_options, "foo2"); + ASSERT_TRUE(s.IsTimedOut()); + + s = txn->Delete("S"); + // Should fail after encountering a write to S in SST file + ASSERT_TRUE(s.IsBusy()); + + // Write a bunch of keys to db to force a compaction + Random rnd(47); + for (int i = 0; i < 1000; i++) { + s = db->Put(write_options, std::to_string(i), + test::CompressibleString(&rnd, 0.8, 100, &value)); + ASSERT_OK(s); + } + + s = txn->Put("X", "yy"); + // Should succeed after verifying there is no write to X in SST file + ASSERT_OK(s); + + s = txn->Put("Z", "zzz"); + // Should fail after encountering a write to Z in SST file + ASSERT_TRUE(s.IsBusy()); + + s = txn->Delete("S"); + // Should fail after encountering a write to S in SST file + ASSERT_TRUE(s.IsBusy()); + + s = txn->GetForUpdate(read_options, "foo3", &value); + // should succeed since key was written before txn started + ASSERT_OK(s); + // verify foo3 is locked by txn + s = db->Delete(write_options, "foo3"); + ASSERT_TRUE(s.IsTimedOut()); + + ASSERT_OK(db_impl->TEST_WaitForCompact()); + + s = txn->Commit(); + ASSERT_OK(s); + + // Transaction should only write the keys that succeeded. 
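+  // Conceptually, each Busy result above came from snapshot validation
+  // against the newest version of the key, whether that version still
+  // lived in a memtable or had been flushed/compacted into an SST file,
+  // roughly:
+  //
+  //   if (latest_seq_for_key > seq_at_tracking_time) return Status::Busy();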
+ s = db->Get(read_options, "foo", &value); + ASSERT_EQ(value, "bar2"); + + s = db->Get(read_options, "X", &value); + ASSERT_OK(s); + ASSERT_EQ("yy", value); + + s = db->Get(read_options, "Z", &value); + ASSERT_OK(s); + ASSERT_EQ("z", value); + + delete txn; + } +} + +TEST_P(TransactionTest, NoSnapshotTest) { + WriteOptions write_options; + ReadOptions read_options; + std::string value; + Status s; + + ASSERT_OK(db->Put(write_options, "AAA", "bar")); + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + // Modify key after transaction start + ASSERT_OK(db->Put(write_options, "AAA", "bar1")); + + // Read and write without a snap + ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value)); + ASSERT_EQ(value, "bar1"); + s = txn->Put("AAA", "bar2"); + ASSERT_OK(s); + + // Should commit since read/write was done after data changed + s = txn->Commit(); + ASSERT_OK(s); + + ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value)); + ASSERT_EQ(value, "bar2"); + + delete txn; +} + +TEST_P(TransactionTest, MultipleSnapshotTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + std::string value; + Status s; + + ASSERT_OK(db->Put(write_options, "AAA", "bar")); + ASSERT_OK(db->Put(write_options, "BBB", "bar")); + ASSERT_OK(db->Put(write_options, "CCC", "bar")); + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + ASSERT_OK(db->Put(write_options, "AAA", "bar1")); + + // Read and write without a snapshot + ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value)); + ASSERT_EQ(value, "bar1"); + s = txn->Put("AAA", "bar2"); + ASSERT_OK(s); + + // Modify BBB before snapshot is taken + ASSERT_OK(db->Put(write_options, "BBB", "bar1")); + + txn->SetSnapshot(); + snapshot_read_options.snapshot = txn->GetSnapshot(); + + // Read and write with snapshot + ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "BBB", &value)); + ASSERT_EQ(value, "bar1"); + s = txn->Put("BBB", "bar2"); + ASSERT_OK(s); + + ASSERT_OK(db->Put(write_options, "CCC", "bar1")); + + // Set a new snapshot + txn->SetSnapshot(); + snapshot_read_options.snapshot = txn->GetSnapshot(); + + // Read and write with snapshot + ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "CCC", &value)); + ASSERT_EQ(value, "bar1"); + s = txn->Put("CCC", "bar2"); + ASSERT_OK(s); + + s = txn->GetForUpdate(read_options, "AAA", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar2"); + s = txn->GetForUpdate(read_options, "BBB", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar2"); + s = txn->GetForUpdate(read_options, "CCC", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar2"); + + s = db->Get(read_options, "AAA", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar1"); + s = db->Get(read_options, "BBB", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar1"); + s = db->Get(read_options, "CCC", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar1"); + + s = txn->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "AAA", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar2"); + s = db->Get(read_options, "BBB", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar2"); + s = db->Get(read_options, "CCC", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "bar2"); + + // verify that we track multiple writes to the same key at different snapshots + delete txn; + txn = db->BeginTransaction(write_options); + + // Potentially conflicting writes + ASSERT_OK(db->Put(write_options, "ZZZ", "zzz")); + ASSERT_OK(db->Put(write_options, "XXX", "xxx")); + + txn->SetSnapshot(); + + TransactionOptions txn_options; + 
txn_options.set_snapshot = true; + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + txn2->SetSnapshot(); + + // This should not conflict in txn since the snapshot is later than the + // previous write (spoiler alert: it will later conflict with txn2). + s = txn->Put("ZZZ", "zzzz"); + ASSERT_OK(s); + + s = txn->Commit(); + ASSERT_OK(s); + + delete txn; + + // This will conflict since the snapshot is earlier than another write to ZZZ + s = txn2->Put("ZZZ", "xxxxx"); + ASSERT_TRUE(s.IsBusy()); + + s = txn2->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "ZZZ", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "zzzz"); + + delete txn2; +} + +TEST_P(TransactionTest, ColumnFamiliesTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + TransactionOptions txn_options; + std::string value; + Status s; + + ColumnFamilyHandle *cfa, *cfb; + ColumnFamilyOptions cf_options; + + // Create 2 new column families + s = db->CreateColumnFamily(cf_options, "CFA", &cfa); + ASSERT_OK(s); + s = db->CreateColumnFamily(cf_options, "CFB", &cfb); + ASSERT_OK(s); + + delete cfa; + delete cfb; + delete db; + db = nullptr; + + // open DB with three column families + std::vector<ColumnFamilyDescriptor> column_families; + // have to open default column family + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions())); + // open the new column families + column_families.push_back( + ColumnFamilyDescriptor("CFA", ColumnFamilyOptions())); + column_families.push_back( + ColumnFamilyDescriptor("CFB", ColumnFamilyOptions())); + + std::vector<ColumnFamilyHandle*> handles; + + ASSERT_OK(ReOpenNoDelete(column_families, &handles)); + assert(db != nullptr); + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + txn->SetSnapshot(); + snapshot_read_options.snapshot = txn->GetSnapshot(); + + txn_options.set_snapshot = true; + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn2); + + // Write some data to the db + WriteBatch batch; + ASSERT_OK(batch.Put("foo", "foo")); + ASSERT_OK(batch.Put(handles[1], "AAA", "bar")); + ASSERT_OK(batch.Put(handles[1], "AAAZZZ", "bar")); + s = db->Write(write_options, &batch); + ASSERT_OK(s); + ASSERT_OK(db->Delete(write_options, handles[1], "AAAZZZ")); + + // These keys do not conflict with existing writes since they're in + // different column families + s = txn->Delete("AAA"); + ASSERT_OK(s); + s = txn->GetForUpdate(snapshot_read_options, handles[1], "foo", &value); + ASSERT_TRUE(s.IsNotFound()); + Slice key_slice("AAAZZZ"); + Slice value_slices[2] = {Slice("bar"), Slice("bar")}; + s = txn->Put(handles[2], SliceParts(&key_slice, 1), + SliceParts(value_slices, 2)); + ASSERT_OK(s); + ASSERT_EQ(3, txn->GetNumKeys()); + + s = txn->Commit(); + ASSERT_OK(s); + s = db->Get(read_options, "AAA", &value); + ASSERT_TRUE(s.IsNotFound()); + s = db->Get(read_options, handles[2], "AAAZZZ", &value); + ASSERT_EQ(value, "barbar"); + + Slice key_slices[3] = {Slice("AAA"), Slice("ZZ"), Slice("Z")}; + Slice value_slice("barbarbar"); + + s = txn2->Delete(handles[2], "XXX"); + ASSERT_OK(s); + s = txn2->Delete(handles[1], "XXX"); + ASSERT_OK(s); + + // This write will cause a conflict with the earlier batch write + s = txn2->Put(handles[1], SliceParts(key_slices, 3), + SliceParts(&value_slice, 1)); + ASSERT_TRUE(s.IsBusy()); + + s = txn2->Commit(); + ASSERT_OK(s); + // In the above the latest change to AAAZZZ in handles[1] is delete. 
+  s = db->Get(read_options, handles[1], "AAAZZZ", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn;
+  delete txn2;
+
+  txn = db->BeginTransaction(write_options, txn_options);
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  txn2 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn2);
+
+  std::vector<ColumnFamilyHandle*> multiget_cfh = {handles[1], handles[2],
+                                                   handles[0], handles[2]};
+  std::vector<Slice> multiget_keys = {"AAA", "AAAZZZ", "foo", "foo"};
+  std::vector<std::string> values(4);
+  std::vector<Status> results = txn->MultiGetForUpdate(
+      snapshot_read_options, multiget_cfh, multiget_keys, &values);
+  ASSERT_OK(results[0]);
+  ASSERT_OK(results[1]);
+  ASSERT_OK(results[2]);
+  ASSERT_TRUE(results[3].IsNotFound());
+  ASSERT_EQ(values[0], "bar");
+  ASSERT_EQ(values[1], "barbar");
+  ASSERT_EQ(values[2], "foo");
+
+  s = txn->SingleDelete(handles[2], "ZZZ");
+  ASSERT_OK(s);
+  s = txn->Put(handles[2], "ZZZ", "YYY");
+  ASSERT_OK(s);
+  s = txn->Put(handles[2], "ZZZ", "YYYY");
+  ASSERT_OK(s);
+  s = txn->Delete(handles[2], "ZZZ");
+  ASSERT_OK(s);
+  s = txn->Put(handles[2], "AAAZZZ", "barbarbar");
+  ASSERT_OK(s);
+
+  ASSERT_EQ(5, txn->GetNumKeys());
+
+  // Txn should commit
+  s = txn->Commit();
+  ASSERT_OK(s);
+  s = db->Get(read_options, handles[2], "ZZZ", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  // Put a key which will conflict with the next txn using the previous
+  // snapshot
+  ASSERT_OK(db->Put(write_options, handles[2], "foo", "000"));
+
+  results = txn2->MultiGetForUpdate(snapshot_read_options, multiget_cfh,
+                                    multiget_keys, &values);
+  // All results should fail since there was a conflict
+  ASSERT_TRUE(results[0].IsBusy());
+  ASSERT_TRUE(results[1].IsBusy());
+  ASSERT_TRUE(results[2].IsBusy());
+  ASSERT_TRUE(results[3].IsBusy());
+
+  s = db->Get(read_options, handles[2], "foo", &value);
+  ASSERT_EQ(value, "000");
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  s = db->DropColumnFamily(handles[1]);
+  ASSERT_OK(s);
+  s = db->DropColumnFamily(handles[2]);
+  ASSERT_OK(s);
+
+  delete txn;
+  delete txn2;
+
+  for (auto handle : handles) {
+    delete handle;
+  }
+}
+
+TEST_P(TransactionTest, MultiGetBatchedTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  TransactionOptions txn_options;
+  std::string value;
+  Status s;
+
+  ColumnFamilyHandle* cf;
+  ColumnFamilyOptions cf_options;
+
+  // Create a new column family
+  s = db->CreateColumnFamily(cf_options, "CF", &cf);
+  ASSERT_OK(s);
+
+  delete cf;
+  delete db;
+  db = nullptr;
+
+  // open DB with the two column families
+  std::vector<ColumnFamilyDescriptor> column_families;
+  // have to open default column family
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions()));
+  // open the new column family
+  cf_options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  column_families.push_back(ColumnFamilyDescriptor("CF", cf_options));
+
+  std::vector<ColumnFamilyHandle*> handles;
+
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  ASSERT_OK(ReOpenNoDelete(column_families, &handles));
+  assert(db != nullptr);
+
+  // Write some data to the db
+  WriteBatch batch;
+  ASSERT_OK(batch.Put(handles[1], "aaa", "val1"));
+  ASSERT_OK(batch.Put(handles[1], "bbb", "val2"));
+  ASSERT_OK(batch.Put(handles[1], "ccc", "val3"));
+  ASSERT_OK(batch.Put(handles[1], "ddd", "foo"));
+  ASSERT_OK(batch.Put(handles[1], "eee", "val5"));
+  ASSERT_OK(batch.Put(handles[1], "fff", "val6"));
+  ASSERT_OK(batch.Merge(handles[1], "ggg", "foo"));
"ggg", "foo")); + s = db->Write(write_options, &batch); + ASSERT_OK(s); + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + txn->SetSnapshot(); + snapshot_read_options.snapshot = txn->GetSnapshot(); + + txn_options.set_snapshot = true; + // Write some data to the db + s = txn->Delete(handles[1], "bbb"); + ASSERT_OK(s); + s = txn->Put(handles[1], "ccc", "val3_new"); + ASSERT_OK(s); + s = txn->Merge(handles[1], "ddd", "bar"); + ASSERT_OK(s); + + std::vector<Slice> keys = {"aaa", "bbb", "ccc", "ddd", "eee", "fff", "ggg"}; + std::vector<PinnableSlice> values(keys.size()); + std::vector<Status> statuses(keys.size()); + + txn->MultiGet(snapshot_read_options, handles[1], keys.size(), keys.data(), + values.data(), statuses.data()); + ASSERT_TRUE(statuses[0].ok()); + ASSERT_EQ(values[0], "val1"); + ASSERT_TRUE(statuses[1].IsNotFound()); + ASSERT_TRUE(statuses[2].ok()); + ASSERT_EQ(values[2], "val3_new"); + ASSERT_TRUE(statuses[3].ok()); + ASSERT_EQ(values[3], "foo,bar"); + ASSERT_TRUE(statuses[4].ok()); + ASSERT_EQ(values[4], "val5"); + ASSERT_TRUE(statuses[5].ok()); + ASSERT_EQ(values[5], "val6"); + ASSERT_TRUE(statuses[6].ok()); + ASSERT_EQ(values[6], "foo"); + delete txn; + for (auto handle : handles) { + delete handle; + } +} + +// This test calls WriteBatchWithIndex::MultiGetFromBatchAndDB with a large +// number of keys, i.e greater than MultiGetContext::MAX_BATCH_SIZE, which is +// is 32. This forces autovector allocations in the MultiGet code paths +// to use std::vector in addition to stack allocations. The MultiGet keys +// includes Merges, which are handled specially in MultiGetFromBatchAndDB by +// allocating an autovector of MergeContexts +TEST_P(TransactionTest, MultiGetLargeBatchedTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + std::string value; + Status s; + + ColumnFamilyHandle* cf; + ColumnFamilyOptions cf_options; + + std::vector<std::string> key_str; + for (int i = 0; i < 100; ++i) { + key_str.emplace_back(std::to_string(i)); + } + // Create a new column families + s = db->CreateColumnFamily(cf_options, "CF", &cf); + ASSERT_OK(s); + + delete cf; + delete db; + db = nullptr; + + // open DB with three column families + std::vector<ColumnFamilyDescriptor> column_families; + // have to open default column family + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions())); + // open the new column families + cf_options.merge_operator = MergeOperators::CreateStringAppendOperator(); + column_families.push_back(ColumnFamilyDescriptor("CF", cf_options)); + + std::vector<ColumnFamilyHandle*> handles; + + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + ASSERT_OK(ReOpenNoDelete(column_families, &handles)); + assert(db != nullptr); + + // Write some data to the db + WriteBatch batch; + for (int i = 0; i < 3 * MultiGetContext::MAX_BATCH_SIZE; ++i) { + std::string val = "val" + std::to_string(i); + ASSERT_OK(batch.Put(handles[1], key_str[i], val)); + } + s = db->Write(write_options, &batch); + ASSERT_OK(s); + + WriteBatchWithIndex wb; + // Write some data to the db + s = wb.Delete(handles[1], std::to_string(1)); + ASSERT_OK(s); + s = wb.Put(handles[1], std::to_string(2), "new_val" + std::to_string(2)); + ASSERT_OK(s); + // Write a lot of merges so when we call MultiGetFromBatchAndDB later on, + // it is forced to use std::vector in ROCKSDB_NAMESPACE::autovector to + // allocate MergeContexts. 
+  for (int i = 8; i < MultiGetContext::MAX_BATCH_SIZE + 24; ++i) {
+    s = wb.Merge(handles[1], std::to_string(i), "merge");
+    ASSERT_OK(s);
+  }
+
+  // MultiGet a lot of keys in order to force std::vector reallocations
+  std::vector<Slice> keys;
+  for (int i = 0; i < MultiGetContext::MAX_BATCH_SIZE + 32; ++i) {
+    keys.emplace_back(key_str[i]);
+  }
+  std::vector<PinnableSlice> values(keys.size());
+  std::vector<Status> statuses(keys.size());
+
+  wb.MultiGetFromBatchAndDB(db, snapshot_read_options, handles[1], keys.size(),
+                            keys.data(), values.data(), statuses.data(),
+                            false);
+  for (size_t i = 0; i < keys.size(); ++i) {
+    if (i == 1) {
+      ASSERT_TRUE(statuses[1].IsNotFound());
+    } else if (i == 2) {
+      ASSERT_TRUE(statuses[2].ok());
+      ASSERT_EQ(values[2], "new_val" + std::to_string(2));
+    } else if (i >= 8 && i < 56) {
+      ASSERT_TRUE(statuses[i].ok());
+      ASSERT_EQ(values[i], "val" + std::to_string(i) + ",merge");
+    } else {
+      ASSERT_TRUE(statuses[i].ok());
+      ASSERT_EQ(values[i], "val" + std::to_string(i));
+    }
+  }
+
+  for (auto handle : handles) {
+    delete handle;
+  }
+}
+
+TEST_P(TransactionTest, MultiGetSnapshot) {
+  WriteOptions write_options;
+  TransactionOptions transaction_options;
+  Transaction* txn1 = db->BeginTransaction(write_options, transaction_options);
+
+  Slice key = "foo";
+
+  Status s = txn1->Put(key, "bar");
+  ASSERT_OK(s);
+
+  s = txn1->SetName("test");
+  ASSERT_OK(s);
+
+  s = txn1->Prepare();
+  ASSERT_OK(s);
+
+  // Get snapshot between prepare and commit
+  // Un-committed data should be invisible to other transactions
+  const Snapshot* s1 = db->GetSnapshot();
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+  delete txn1;
+
+  Transaction* txn2 = db->BeginTransaction(write_options, transaction_options);
+  ReadOptions read_options;
+  read_options.snapshot = s1;
+
+  std::vector<Slice> keys;
+  std::vector<PinnableSlice> values(1);
+  std::vector<Status> statuses(1);
+  keys.push_back(key);
+  auto cfd = db->DefaultColumnFamily();
+  txn2->MultiGet(read_options, cfd, 1, keys.data(), values.data(),
+                 statuses.data());
+  ASSERT_TRUE(statuses[0].IsNotFound());
+  delete txn2;
+
+  db->ReleaseSnapshot(s1);
+}
+
+TEST_P(TransactionTest, ColumnFamiliesTest2) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  std::string value;
+  Status s;
+
+  ColumnFamilyHandle *one, *two;
+  ColumnFamilyOptions cf_options;
+
+  // Create 2 new column families
+  s = db->CreateColumnFamily(cf_options, "ONE", &one);
+  ASSERT_OK(s);
+  s = db->CreateColumnFamily(cf_options, "TWO", &two);
+  ASSERT_OK(s);
+
+  Transaction* txn1 = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn1);
+  Transaction* txn2 = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn2);
+
+  s = txn1->Put(one, "X", "1");
+  ASSERT_OK(s);
+  s = txn1->Put(two, "X", "2");
+  ASSERT_OK(s);
+  s = txn1->Put("X", "0");
+  ASSERT_OK(s);
+
+  s = txn2->Put(one, "X", "11");
+  ASSERT_TRUE(s.IsTimedOut());
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  // Drop first column family
+  s = db->DropColumnFamily(one);
+  ASSERT_OK(s);
+
+  // txn2's conflicting Put already failed, so its commit succeeds even
+  // though the column family has since been dropped.
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  delete txn1;
+  txn1 = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn1);
+
+  // Should fail since column family was dropped
+  s = txn1->Put(one, "X", "111");
+  ASSERT_TRUE(s.IsInvalidArgument());
+
+  s = txn1->Put(two, "X", "222");
+  ASSERT_OK(s);
+
+  s = txn1->Put("X", "000");
+  ASSERT_OK(s);
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, two, "X", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("222", value);
+
+  s = db->Get(read_options, "X", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("000", value);
+
+  s = db->DropColumnFamily(two);
+  ASSERT_OK(s);
+
+  delete txn1;
+  delete txn2;
+
+  delete one;
+  delete two;
+}
+
+TEST_P(TransactionTest, EmptyTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  std::string value;
+  Status s;
+
+  s = db->Put(write_options, "aaa", "aaa");
+  ASSERT_OK(s);
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  s = txn->Commit();
+  ASSERT_OK(s);
+  delete txn;
+
+  txn = db->BeginTransaction(write_options);
+  ASSERT_OK(txn->Rollback());
+  delete txn;
+
+  txn = db->BeginTransaction(write_options);
+  s = txn->GetForUpdate(read_options, "aaa", &value);
+  ASSERT_EQ(value, "aaa");
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+  delete txn;
+
+  txn = db->BeginTransaction(write_options);
+  txn->SetSnapshot();
+
+  s = txn->GetForUpdate(read_options, "aaa", &value);
+  ASSERT_EQ(value, "aaa");
+
+  // Conflicts with previous GetForUpdate
+  s = db->Put(write_options, "aaa", "xxx");
+  ASSERT_TRUE(s.IsTimedOut());
+
+  // the txn itself can still commit; only the conflicting Put failed
+  s = txn->Commit();
+  ASSERT_OK(s);
+  delete txn;
+}
+
+TEST_P(TransactionTest, PredicateManyPreceders) {
+  WriteOptions write_options;
+  ReadOptions read_options1, read_options2;
+  TransactionOptions txn_options;
+  std::string value;
+  Status s;
+
+  txn_options.set_snapshot = true;
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+  read_options1.snapshot = txn1->GetSnapshot();
+
+  Transaction* txn2 = db->BeginTransaction(write_options);
+  txn2->SetSnapshot();
+  read_options2.snapshot = txn2->GetSnapshot();
+
+  std::vector<Slice> multiget_keys = {"1", "2", "3"};
+  std::vector<std::string> multiget_values;
+
+  std::vector<Status> results =
+      txn1->MultiGetForUpdate(read_options1, multiget_keys, &multiget_values);
+  ASSERT_EQ(results.size(), 3);
+  ASSERT_TRUE(results[0].IsNotFound());
+  ASSERT_TRUE(results[1].IsNotFound());
+  ASSERT_TRUE(results[2].IsNotFound());
+
+  s = txn2->Put("2", "x");  // Conflicts with txn1's MultiGetForUpdate
+  ASSERT_TRUE(s.IsTimedOut());
+
+  ASSERT_OK(txn2->Rollback());
+
+  multiget_values.clear();
+  results =
+      txn1->MultiGetForUpdate(read_options1, multiget_keys, &multiget_values);
+  ASSERT_EQ(results.size(), 3);
+  ASSERT_TRUE(results[0].IsNotFound());
+  ASSERT_TRUE(results[1].IsNotFound());
+  ASSERT_TRUE(results[2].IsNotFound());
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  delete txn1;
+  delete txn2;
+
+  txn1 = db->BeginTransaction(write_options, txn_options);
+  read_options1.snapshot = txn1->GetSnapshot();
+
+  txn2 = db->BeginTransaction(write_options, txn_options);
+  read_options2.snapshot = txn2->GetSnapshot();
+
+  s = txn1->Put("4", "x");
+  ASSERT_OK(s);
+
+  s = txn2->Delete("4");  // conflict
+  ASSERT_TRUE(s.IsTimedOut());
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  s = txn2->GetForUpdate(read_options2, "4", &value);
+  ASSERT_TRUE(s.IsBusy());
+
+  ASSERT_OK(txn2->Rollback());
+
+  delete txn1;
+  delete txn2;
+}
+
+TEST_P(TransactionTest, LostUpdate) {
+  WriteOptions write_options;
+  ReadOptions read_options,
read_options1, read_options2; + TransactionOptions txn_options; + std::string value; + Status s; + + // Test 2 transactions writing to the same key in multiple orders and + // with/without snapshots + + Transaction* txn1 = db->BeginTransaction(write_options); + Transaction* txn2 = db->BeginTransaction(write_options); + + s = txn1->Put("1", "1"); + ASSERT_OK(s); + + s = txn2->Put("1", "2"); // conflict + ASSERT_TRUE(s.IsTimedOut()); + + s = txn2->Commit(); + ASSERT_OK(s); + + s = txn1->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "1", &value); + ASSERT_OK(s); + ASSERT_EQ("1", value); + + delete txn1; + delete txn2; + + txn_options.set_snapshot = true; + txn1 = db->BeginTransaction(write_options, txn_options); + read_options1.snapshot = txn1->GetSnapshot(); + + txn2 = db->BeginTransaction(write_options, txn_options); + read_options2.snapshot = txn2->GetSnapshot(); + + s = txn1->Put("1", "3"); + ASSERT_OK(s); + s = txn2->Put("1", "4"); // conflict + ASSERT_TRUE(s.IsTimedOut()); + + s = txn1->Commit(); + ASSERT_OK(s); + + s = txn2->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "1", &value); + ASSERT_OK(s); + ASSERT_EQ("3", value); + + delete txn1; + delete txn2; + + txn1 = db->BeginTransaction(write_options, txn_options); + read_options1.snapshot = txn1->GetSnapshot(); + + txn2 = db->BeginTransaction(write_options, txn_options); + read_options2.snapshot = txn2->GetSnapshot(); + + s = txn1->Put("1", "5"); + ASSERT_OK(s); + + s = txn1->Commit(); + ASSERT_OK(s); + + s = txn2->Put("1", "6"); + ASSERT_TRUE(s.IsBusy()); + s = txn2->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "1", &value); + ASSERT_OK(s); + ASSERT_EQ("5", value); + + delete txn1; + delete txn2; + + txn1 = db->BeginTransaction(write_options, txn_options); + read_options1.snapshot = txn1->GetSnapshot(); + + txn2 = db->BeginTransaction(write_options, txn_options); + read_options2.snapshot = txn2->GetSnapshot(); + + s = txn1->Put("1", "7"); + ASSERT_OK(s); + s = txn1->Commit(); + ASSERT_OK(s); + + txn2->SetSnapshot(); + s = txn2->Put("1", "8"); + ASSERT_OK(s); + s = txn2->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "1", &value); + ASSERT_OK(s); + ASSERT_EQ("8", value); + + delete txn1; + delete txn2; + + txn1 = db->BeginTransaction(write_options); + txn2 = db->BeginTransaction(write_options); + + s = txn1->Put("1", "9"); + ASSERT_OK(s); + s = txn1->Commit(); + ASSERT_OK(s); + + s = txn2->Put("1", "10"); + ASSERT_OK(s); + s = txn2->Commit(); + ASSERT_OK(s); + + delete txn1; + delete txn2; + + s = db->Get(read_options, "1", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "10"); +} + +TEST_P(TransactionTest, UntrackedWrites) { + if (txn_db_options.write_policy == WRITE_UNPREPARED) { + // TODO(lth): For WriteUnprepared, validate that untracked writes are + // not supported. + return; + } + + WriteOptions write_options; + ReadOptions read_options; + std::string value; + Status s; + + // Verify transaction rollback works for untracked keys. 
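+  // Per the Transaction API, the *Untracked variants queue the write in the
+  // transaction's batch without performing conflict checking on the key, so
+  // they never return Busy for a key written after the snapshot; they are
+  // still part of the batch, though, so the Rollback below discards them.
+  // Sketch of the contrast:
+  //
+  //   txn->Put("k", "v");           // tracked: snapshot-validated
+  //   txn->PutUntracked("k", "v");  // no conflict checking for "k"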
+  Transaction* txn = db->BeginTransaction(write_options);
+  txn->SetSnapshot();
+
+  s = txn->PutUntracked("untracked", "0");
+  ASSERT_OK(s);
+  ASSERT_OK(txn->Rollback());
+  s = db->Get(read_options, "untracked", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn;
+  txn = db->BeginTransaction(write_options);
+  txn->SetSnapshot();
+
+  s = db->Put(write_options, "untracked", "x");
+  ASSERT_OK(s);
+
+  // Untracked writes should succeed even though key was written after snapshot
+  s = txn->PutUntracked("untracked", "1");
+  ASSERT_OK(s);
+  s = txn->MergeUntracked("untracked", "2");
+  ASSERT_OK(s);
+  s = txn->DeleteUntracked("untracked");
+  ASSERT_OK(s);
+
+  // Conflict
+  s = txn->Put("untracked", "3");
+  ASSERT_TRUE(s.IsBusy());
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "untracked", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn;
+}
+
+TEST_P(TransactionTest, ExpiredTransaction) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+  std::string value;
+  Status s;
+
+  // Set txn expiration timeout to 0 milliseconds (expires instantly)
+  txn_options.expiration = 0;
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+
+  s = txn1->Put("X", "1");
+  ASSERT_OK(s);
+
+  s = txn1->Put("Y", "1");
+  ASSERT_OK(s);
+
+  Transaction* txn2 = db->BeginTransaction(write_options);
+
+  // txn2 should be able to write to X since txn1 has expired
+  s = txn2->Put("X", "2");
+  ASSERT_OK(s);
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+  s = db->Get(read_options, "X", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("2", value);
+
+  s = txn1->Put("Z", "1");
+  ASSERT_OK(s);
+
+  // txn1 should fail to commit since it is expired
+  s = txn1->Commit();
+  ASSERT_TRUE(s.IsExpired());
+
+  s = db->Get(read_options, "Y", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = db->Get(read_options, "Z", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn1;
+  delete txn2;
+}
+
+TEST_P(TransactionTest, ReinitializeTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+  std::string value;
+  Status s;
+
+  // Set txn expiration timeout to 0 milliseconds (expires instantly)
+  txn_options.expiration = 0;
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+
+  // Reinitialize transaction to no longer expire
+  txn_options.expiration = -1;
+  txn1 = db->BeginTransaction(write_options, txn_options, txn1);
+
+  s = txn1->Put("Z", "z");
+  ASSERT_OK(s);
+
+  // Should commit since not expired
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  txn1 = db->BeginTransaction(write_options, txn_options, txn1);
+
+  s = txn1->Put("Z", "zz");
+  ASSERT_OK(s);
+
+  // Reinitialize txn1 and verify that Z gets unlocked
+  txn1 = db->BeginTransaction(write_options, txn_options, txn1);
+
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options, nullptr);
+  s = txn2->Put("Z", "zzz");
+  ASSERT_OK(s);
+  s = txn2->Commit();
+  ASSERT_OK(s);
+  delete txn2;
+
+  s = db->Get(read_options, "Z", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "zzz");
+
+  // Verify snapshots get reinitialized correctly
+  txn1->SetSnapshot();
+  s = txn1->Put("Z", "zzzz");
+  ASSERT_OK(s);
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "Z", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "zzzz");
+
+  txn1 = db->BeginTransaction(write_options, txn_options, txn1);
+  const Snapshot* snapshot = txn1->GetSnapshot();
+  ASSERT_FALSE(snapshot);
+
+  txn_options.set_snapshot = true;
+  txn1 = db->BeginTransaction(write_options, txn_options, txn1);
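+  // Passing the old Transaction* as the third argument reinitializes that
+  // object in place rather than allocating a new one: pending writes are
+  // discarded, locks are released, and the returned pointer is the reused
+  // object, roughly:
+  //
+  //   txn1 = db->BeginTransaction(write_options, txn_options, txn1);
+  //   // txn1 now reflects txn_options; with set_snapshot = true it holds
+  //   // a fresh snapshot, as verified below.
+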
snapshot = txn1->GetSnapshot(); + ASSERT_TRUE(snapshot); + + s = txn1->Put("Z", "a"); + ASSERT_OK(s); + + ASSERT_OK(txn1->Rollback()); + + s = txn1->Put("Y", "y"); + ASSERT_OK(s); + + txn_options.set_snapshot = false; + txn1 = db->BeginTransaction(write_options, txn_options, txn1); + snapshot = txn1->GetSnapshot(); + ASSERT_FALSE(snapshot); + + s = txn1->Put("X", "x"); + ASSERT_OK(s); + + s = txn1->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "Z", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "zzzz"); + + s = db->Get(read_options, "Y", &value); + ASSERT_TRUE(s.IsNotFound()); + + txn1 = db->BeginTransaction(write_options, txn_options, txn1); + + s = txn1->SetName("name"); + ASSERT_OK(s); + + s = txn1->Prepare(); + ASSERT_OK(s); + s = txn1->Commit(); + ASSERT_OK(s); + + txn1 = db->BeginTransaction(write_options, txn_options, txn1); + + s = txn1->SetName("name"); + ASSERT_OK(s); + + delete txn1; +} + +TEST_P(TransactionTest, Rollback) { + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + std::string value; + Status s; + + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + + ASSERT_OK(s); + + s = txn1->Put("X", "1"); + ASSERT_OK(s); + + Transaction* txn2 = db->BeginTransaction(write_options); + + // txn2 should not be able to write to X since txn1 has it locked + s = txn2->Put("X", "2"); + ASSERT_TRUE(s.IsTimedOut()); + + ASSERT_OK(txn1->Rollback()); + delete txn1; + + // txn2 should now be able to write to X + s = txn2->Put("X", "3"); + ASSERT_OK(s); + + s = txn2->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "X", &value); + ASSERT_OK(s); + ASSERT_EQ("3", value); + + delete txn2; +} + +TEST_P(TransactionTest, LockLimitTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + TransactionOptions txn_options; + std::string value; + Status s; + + delete db; + db = nullptr; + + // Open DB with a lock limit of 3 + txn_db_options.max_num_locks = 3; + ASSERT_OK(ReOpen()); + assert(db != nullptr); + ASSERT_OK(s); + + // Create a txn and verify we can only lock up to 3 keys + Transaction* txn = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn); + + s = txn->Put("X", "x"); + ASSERT_OK(s); + + s = txn->Put("Y", "y"); + ASSERT_OK(s); + + s = txn->Put("Z", "z"); + ASSERT_OK(s); + + // lock limit reached + s = txn->Put("W", "w"); + ASSERT_TRUE(s.IsBusy()); + + // re-locking same key shouldn't put us over the limit + s = txn->Put("X", "xx"); + ASSERT_OK(s); + + s = txn->GetForUpdate(read_options, "W", &value); + ASSERT_TRUE(s.IsBusy()); + s = txn->GetForUpdate(read_options, "V", &value); + ASSERT_TRUE(s.IsBusy()); + + // re-locking same key shouldn't put us over the limit + s = txn->GetForUpdate(read_options, "Y", &value); + ASSERT_OK(s); + ASSERT_EQ("y", value); + + s = txn->Get(read_options, "W", &value); + ASSERT_TRUE(s.IsNotFound()); + + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn2); + + // "X" currently locked + s = txn2->Put("X", "x"); + ASSERT_TRUE(s.IsTimedOut()); + + // lock limit reached + s = txn2->Put("M", "m"); + ASSERT_TRUE(s.IsBusy()); + + s = txn->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "X", &value); + ASSERT_OK(s); + ASSERT_EQ("xx", value); + + s = db->Get(read_options, "W", &value); + ASSERT_TRUE(s.IsNotFound()); + + // Committing txn should release its locks and allow txn2 to proceed + s = txn2->Put("X", "x2"); + ASSERT_OK(s); + + s = txn2->Delete("X"); + ASSERT_OK(s); + + s = txn2->Put("M", "m"); + 
ASSERT_OK(s); + + s = txn2->Put("Z", "z2"); + ASSERT_OK(s); + + // lock limit reached + s = txn2->Delete("Y"); + ASSERT_TRUE(s.IsBusy()); + + s = txn2->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "Z", &value); + ASSERT_OK(s); + ASSERT_EQ("z2", value); + + s = db->Get(read_options, "Y", &value); + ASSERT_OK(s); + ASSERT_EQ("y", value); + + s = db->Get(read_options, "X", &value); + ASSERT_TRUE(s.IsNotFound()); + + delete txn; + delete txn2; +} + +TEST_P(TransactionTest, IteratorTest) { + // This test does writes without snapshot validation, and then tries to create + // iterator later, which is unsupported in write unprepared. + if (txn_db_options.write_policy == WRITE_UNPREPARED) { + return; + } + + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + std::string value; + Status s; + + // Write some keys to the db + s = db->Put(write_options, "A", "a"); + ASSERT_OK(s); + + s = db->Put(write_options, "G", "g"); + ASSERT_OK(s); + + s = db->Put(write_options, "F", "f"); + ASSERT_OK(s); + + s = db->Put(write_options, "C", "c"); + ASSERT_OK(s); + + s = db->Put(write_options, "D", "d"); + ASSERT_OK(s); + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + // Write some keys in a txn + s = txn->Put("B", "b"); + ASSERT_OK(s); + + s = txn->Put("H", "h"); + ASSERT_OK(s); + + s = txn->Delete("D"); + ASSERT_OK(s); + + s = txn->Put("E", "e"); + ASSERT_OK(s); + + txn->SetSnapshot(); + const Snapshot* snapshot = txn->GetSnapshot(); + + // Write some keys to the db after the snapshot + s = db->Put(write_options, "BB", "xx"); + ASSERT_OK(s); + + s = db->Put(write_options, "C", "xx"); + ASSERT_OK(s); + + read_options.snapshot = snapshot; + Iterator* iter = txn->GetIterator(read_options); + ASSERT_OK(iter->status()); + iter->SeekToFirst(); + + // Read all keys via iter and lock them all + std::string results[] = {"a", "b", "c", "e", "f", "g", "h"}; + for (int i = 0; i < 7; i++) { + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(results[i], iter->value().ToString()); + + s = txn->GetForUpdate(read_options, iter->key(), nullptr); + if (i == 2) { + // "C" was modified after txn's snapshot + ASSERT_TRUE(s.IsBusy()); + } else { + ASSERT_OK(s); + } + + iter->Next(); + } + ASSERT_FALSE(iter->Valid()); + + iter->Seek("G"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("g", iter->value().ToString()); + + iter->Prev(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("f", iter->value().ToString()); + + iter->Seek("D"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("e", iter->value().ToString()); + + iter->Seek("C"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("c", iter->value().ToString()); + + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("e", iter->value().ToString()); + + iter->Seek(""); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a", iter->value().ToString()); + + iter->Seek("X"); + ASSERT_OK(iter->status()); + ASSERT_FALSE(iter->Valid()); + + iter->SeekToLast(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("h", iter->value().ToString()); + + s = txn->Commit(); + ASSERT_OK(s); + + delete iter; + delete txn; +} + +TEST_P(TransactionTest, DisableIndexingTest) { + // Skip this test for write unprepared. 
It does not solely rely on WBWI for + // read your own writes, so depending on whether batches are flushed or not, + // only some writes will be visible. + // + // Also, write unprepared does not support creating iterators if there has + // been txn->Put() without snapshot validation. + if (txn_db_options.write_policy == WRITE_UNPREPARED) { + return; + } + + WriteOptions write_options; + ReadOptions read_options; + std::string value; + Status s; + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + s = txn->Put("A", "a"); + ASSERT_OK(s); + + s = txn->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("a", value); + + txn->DisableIndexing(); + + s = txn->Put("B", "b"); + ASSERT_OK(s); + + s = txn->Get(read_options, "B", &value); + ASSERT_TRUE(s.IsNotFound()); + + Iterator* iter = txn->GetIterator(read_options); + ASSERT_OK(iter->status()); + + iter->Seek("B"); + ASSERT_OK(iter->status()); + ASSERT_FALSE(iter->Valid()); + + s = txn->Delete("A"); + + s = txn->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("a", value); + + txn->EnableIndexing(); + + s = txn->Put("B", "bb"); + ASSERT_OK(s); + + iter->Seek("B"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bb", iter->value().ToString()); + + s = txn->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_EQ("bb", value); + + s = txn->Put("A", "aa"); + ASSERT_OK(s); + + s = txn->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("aa", value); + + delete iter; + delete txn; +} + +TEST_P(TransactionTest, SavepointTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + std::string value; + Status s; + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + ASSERT_EQ(0, txn->GetNumPuts()); + + s = txn->RollbackToSavePoint(); + ASSERT_TRUE(s.IsNotFound()); + + txn->SetSavePoint(); // 1 + + ASSERT_OK(txn->RollbackToSavePoint()); // Rollback to beginning of txn + s = txn->RollbackToSavePoint(); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Put("B", "b"); + ASSERT_OK(s); + + ASSERT_EQ(1, txn->GetNumPuts()); + ASSERT_EQ(0, txn->GetNumDeletes()); + + s = txn->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_EQ("b", value); + + delete txn; + txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + s = txn->Put("A", "a"); + ASSERT_OK(s); + + s = txn->Put("B", "bb"); + ASSERT_OK(s); + + s = txn->Put("C", "c"); + ASSERT_OK(s); + + txn->SetSavePoint(); // 2 + + s = txn->Delete("B"); + ASSERT_OK(s); + + s = txn->Put("C", "cc"); + ASSERT_OK(s); + + s = txn->Put("D", "d"); + ASSERT_OK(s); + + ASSERT_EQ(5, txn->GetNumPuts()); + ASSERT_EQ(1, txn->GetNumDeletes()); + + ASSERT_OK(txn->RollbackToSavePoint()); // Rollback to 2 + + ASSERT_EQ(3, txn->GetNumPuts()); + ASSERT_EQ(0, txn->GetNumDeletes()); + + s = txn->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("a", value); + + s = txn->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_EQ("bb", value); + + s = txn->Get(read_options, "C", &value); + ASSERT_OK(s); + ASSERT_EQ("c", value); + + s = txn->Get(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn->Put("A", "a"); + ASSERT_OK(s); + + s = txn->Put("E", "e"); + ASSERT_OK(s); + + ASSERT_EQ(5, txn->GetNumPuts()); + ASSERT_EQ(0, txn->GetNumDeletes()); + + // Rollback to beginning of txn + s = txn->RollbackToSavePoint(); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_OK(txn->Rollback()); + + ASSERT_EQ(0, txn->GetNumPuts()); + ASSERT_EQ(0, 
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Get(read_options, "B", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("b", value);
+
+  s = txn->Get(read_options, "C", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Get(read_options, "D", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Get(read_options, "E", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Put("A", "aa");
+  ASSERT_OK(s);
+
+  s = txn->Put("F", "f");
+  ASSERT_OK(s);
+
+  ASSERT_EQ(2, txn->GetNumPuts());
+  ASSERT_EQ(0, txn->GetNumDeletes());
+
+  txn->SetSavePoint();  // 3
+  txn->SetSavePoint();  // 4
+
+  s = txn->Put("G", "g");
+  ASSERT_OK(s);
+
+  s = txn->SingleDelete("F");
+  ASSERT_OK(s);
+
+  s = txn->Delete("B");
+  ASSERT_OK(s);
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("aa", value);
+
+  s = txn->Get(read_options, "F", &value);
+  // According to db.h, doing a SingleDelete on a key that has been
+  // overwritten will have undefined behavior. So it is unclear what the
+  // result of fetching "F" should be. The current implementation will
+  // return NotFound in this case.
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Get(read_options, "B", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  ASSERT_EQ(3, txn->GetNumPuts());
+  ASSERT_EQ(2, txn->GetNumDeletes());
+
+  ASSERT_OK(txn->RollbackToSavePoint());  // Rollback to 4 (same state as 3)
+
+  ASSERT_EQ(2, txn->GetNumPuts());
+  ASSERT_EQ(0, txn->GetNumDeletes());
+
+  s = txn->Get(read_options, "F", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("f", value);
+
+  s = txn->Get(read_options, "G", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "F", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("f", value);
+
+  s = db->Get(read_options, "G", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = db->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("aa", value);
+
+  s = db->Get(read_options, "B", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("b", value);
+
+  s = db->Get(read_options, "C", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = db->Get(read_options, "D", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = db->Get(read_options, "E", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn;
+}
+
+TEST_P(TransactionTest, SavepointTest2) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+  Status s;
+
+  txn_options.lock_timeout = 1;  // 1 ms
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn1);
+
+  s = txn1->Put("A", "");
+  ASSERT_OK(s);
+
+  txn1->SetSavePoint();  // 1
+
+  s = txn1->Put("A", "a");
+  ASSERT_OK(s);
+
+  s = txn1->Put("C", "c");
+  ASSERT_OK(s);
+
+  txn1->SetSavePoint();  // 2
+
+  s = txn1->Put("A", "a");
+  ASSERT_OK(s);
+  s = txn1->Put("B", "b");
+  ASSERT_OK(s);
+
+  ASSERT_OK(txn1->RollbackToSavePoint());  // Rollback to 2
+
+  // Verify that "A" and "C" are still locked while "B" is not
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn2);
+
+  s = txn2->Put("A", "a2");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("C", "c2");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("B", "b2");
+  ASSERT_OK(s);
+
+  s = txn1->Put("A", "aa");
+  ASSERT_OK(s);
+  s = txn1->Put("B", "bb");
+  ASSERT_TRUE(s.IsTimedOut());
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+  delete txn2;
+
+  s = txn1->Put("A", "aaa");
+  ASSERT_OK(s);
+  s = txn1->Put("B", "bbb");
+  ASSERT_OK(s);
+  s = txn1->Put("C", "ccc");
+  ASSERT_OK(s);
+
+  txn1->SetSavePoint();  // 3
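+
+  // Rolling back to a savepoint set after the most recent writes should
+  // release nothing: all three locks below were acquired before it.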
+  ASSERT_OK(txn1->RollbackToSavePoint());  // Rollback to 3
+
+  // Verify that "A", "B", "C" are still locked
+  txn2 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn2);
+
+  s = txn2->Put("A", "a2");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("B", "b2");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("C", "c2");
+  ASSERT_TRUE(s.IsTimedOut());
+
+  ASSERT_OK(txn1->RollbackToSavePoint());  // Rollback to 1
+
+  // Verify that only "A" is locked
+  s = txn2->Put("A", "a3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("B", "b3");
+  ASSERT_OK(s);
+  s = txn2->Put("C", "c3po");
+  ASSERT_OK(s);
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+  delete txn1;
+
+  // Verify that "A", "B", and "C" are no longer locked
+  s = txn2->Put("A", "a4");
+  ASSERT_OK(s);
+  s = txn2->Put("B", "b4");
+  ASSERT_OK(s);
+  s = txn2->Put("C", "c4");
+  ASSERT_OK(s);
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+  delete txn2;
+}
+
+TEST_P(TransactionTest, SavepointTest3) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+  Status s;
+
+  txn_options.lock_timeout = 1;  // 1 ms
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn1);
+
+  s = txn1->PopSavePoint();  // No SavePoint present
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn1->Put("A", "");
+  ASSERT_OK(s);
+
+  s = txn1->PopSavePoint();  // Still no SavePoint present
+  ASSERT_TRUE(s.IsNotFound());
+
+  txn1->SetSavePoint();  // 1
+
+  s = txn1->Put("A", "a");
+  ASSERT_OK(s);
+
+  s = txn1->PopSavePoint();  // Remove 1
+  ASSERT_TRUE(txn1->RollbackToSavePoint().IsNotFound());
+
+  // Verify that "A" is still locked
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn2);
+
+  s = txn2->Put("A", "a2");
+  ASSERT_TRUE(s.IsTimedOut());
+  delete txn2;
+
+  txn1->SetSavePoint();  // 2
+
+  s = txn1->Put("B", "b");
+  ASSERT_OK(s);
+
+  txn1->SetSavePoint();  // 3
+
+  s = txn1->Put("B", "b2");
+  ASSERT_OK(s);
+
+  ASSERT_OK(txn1->RollbackToSavePoint());  // Roll back to 3, the most recent
+
+  s = txn1->PopSavePoint();  // Remove 2
+  ASSERT_OK(s);
+
+  s = txn1->PopSavePoint();
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+  delete txn1;
+
+  std::string value;
+
+  // txn1 should have modified "A" to "a"
+  s = db->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a", value);
+
+  // txn1 should have set "B" to just "b"
+  s = db->Get(read_options, "B", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("b", value);
+
+  s = db->Get(read_options, "C", &value);
+  ASSERT_TRUE(s.IsNotFound());
+}
+
+TEST_P(TransactionTest, SavepointTest4) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+  Status s;
+
+  txn_options.lock_timeout = 1;  // 1 ms
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn1);
+
+  txn1->SetSavePoint();  // 1
+  s = txn1->Put("A", "a");
+  ASSERT_OK(s);
+
+  txn1->SetSavePoint();  // 2
+  s = txn1->Put("B", "b");
+  ASSERT_OK(s);
+
+  s = txn1->PopSavePoint();  // Remove 2
+  ASSERT_OK(s);
+
+  // Verify that A and B still exist.
+  std::string value;
+  ASSERT_OK(txn1->Get(read_options, "A", &value));
+  ASSERT_EQ("a", value);
+
+  ASSERT_OK(txn1->Get(read_options, "B", &value));
+  ASSERT_EQ("b", value);
+
+  ASSERT_OK(txn1->RollbackToSavePoint());  // Rollback to 1
+
+  // Verify that everything was rolled back.
+  s = txn1->Get(read_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn1->Get(read_options, "B", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  // Nothing should be locked
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn2);
+
+  s = txn2->Put("A", "");
+  ASSERT_OK(s);
+
+  s = txn2->Put("B", "");
+  ASSERT_OK(s);
+
+  delete txn2;
+  delete txn1;
+}
+
+TEST_P(TransactionTest, UndoGetForUpdateTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+  std::string value;
+  Status s;
+
+  txn_options.lock_timeout = 1;  // 1 ms
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn1);
+
+  txn1->UndoGetForUpdate("A");
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+  delete txn1;
+
+  txn1 = db->BeginTransaction(write_options, txn_options);
+
+  txn1->UndoGetForUpdate("A");
+  s = txn1->GetForUpdate(read_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  // Verify that A is locked
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+  s = txn2->Put("A", "a");
+  ASSERT_TRUE(s.IsTimedOut());
+
+  txn1->UndoGetForUpdate("A");
+
+  // Verify that A is now unlocked
+  s = txn2->Put("A", "a2");
+  ASSERT_OK(s);
+  ASSERT_OK(txn2->Commit());
+  delete txn2;
+  s = db->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a2", value);
+
+  s = txn1->Delete("A");
+  ASSERT_OK(s);
+  s = txn1->GetForUpdate(read_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn1->Put("B", "b3");
+  ASSERT_OK(s);
+  s = txn1->GetForUpdate(read_options, "B", &value);
+  ASSERT_OK(s);
+
+  txn1->UndoGetForUpdate("A");
+  txn1->UndoGetForUpdate("B");
+
+  // Verify that A and B are still locked
+  txn2 = db->BeginTransaction(write_options, txn_options);
+  s = txn2->Put("A", "a4");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("B", "b4");
+  ASSERT_TRUE(s.IsTimedOut());
+
+  ASSERT_OK(txn1->Rollback());
+  delete txn1;
+
+  // Verify that A and B are no longer locked
+  s = txn2->Put("A", "a5");
+  ASSERT_OK(s);
+  s = txn2->Put("B", "b5");
+  ASSERT_OK(s);
+  s = txn2->Commit();
+  delete txn2;
+  ASSERT_OK(s);
+
+  txn1 = db->BeginTransaction(write_options, txn_options);
+
+  s = txn1->GetForUpdate(read_options, "A", &value);
+  ASSERT_OK(s);
+  s = txn1->GetForUpdate(read_options, "A", &value);
+  ASSERT_OK(s);
+  s = txn1->GetForUpdate(read_options, "C", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  s = txn1->GetForUpdate(read_options, "A", &value);
+  ASSERT_OK(s);
+  s = txn1->GetForUpdate(read_options, "C", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  s = txn1->GetForUpdate(read_options, "B", &value);
+  ASSERT_OK(s);
+  s = txn1->Put("B", "b5");
+  ASSERT_OK(s);
+  s = txn1->GetForUpdate(read_options, "B", &value);
+  ASSERT_OK(s);
+
+  txn1->UndoGetForUpdate("A");
+  txn1->UndoGetForUpdate("B");
+  txn1->UndoGetForUpdate("C");
+  txn1->UndoGetForUpdate("X");
+
+  // Verify A, B, C are locked (X was never locked)
+  txn2 = db->BeginTransaction(write_options, txn_options);
+  s = txn2->Put("A", "a6");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Delete("B");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("C", "c6");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("X", "x6");
+  ASSERT_OK(s);
+
+  txn1->UndoGetForUpdate("A");
+  txn1->UndoGetForUpdate("B");
+  txn1->UndoGetForUpdate("C");
+  txn1->UndoGetForUpdate("X");
+
+  // Verify A and B are locked and C is not
+  s = txn2->Put("A", "a6");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Delete("B");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("C", "c6");
+  ASSERT_OK(s);
+  s = txn2->Put("X", "x6");
+  ASSERT_OK(s);
+
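+  // UndoGetForUpdate is reference-counted: each call cancels one outstanding
+  // GetForUpdate, and a key is unlocked only once its read count drops to
+  // zero and the transaction has no pending write to it.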
+ txn1->UndoGetForUpdate("A"); + txn1->UndoGetForUpdate("B"); + txn1->UndoGetForUpdate("C"); + txn1->UndoGetForUpdate("X"); + + // Verify B is locked and A and C are not + s = txn2->Put("A", "a7"); + ASSERT_OK(s); + s = txn2->Delete("B"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("C", "c7"); + ASSERT_OK(s); + s = txn2->Put("X", "x7"); + ASSERT_OK(s); + + s = txn2->Commit(); + ASSERT_OK(s); + delete txn2; + + s = txn1->Commit(); + ASSERT_OK(s); + delete txn1; +} + +TEST_P(TransactionTest, UndoGetForUpdateTest2) { + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + std::string value; + Status s; + + s = db->Put(write_options, "A", ""); + ASSERT_OK(s); + + txn_options.lock_timeout = 1; // 1 ms + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn1); + + s = txn1->GetForUpdate(read_options, "A", &value); + ASSERT_OK(s); + s = txn1->GetForUpdate(read_options, "B", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn1->Put("F", "f"); + ASSERT_OK(s); + + txn1->SetSavePoint(); // 1 + + txn1->UndoGetForUpdate("A"); + + s = txn1->GetForUpdate(read_options, "C", &value); + ASSERT_TRUE(s.IsNotFound()); + s = txn1->GetForUpdate(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn1->Put("E", "e"); + ASSERT_OK(s); + s = txn1->GetForUpdate(read_options, "E", &value); + ASSERT_OK(s); + + s = txn1->GetForUpdate(read_options, "F", &value); + ASSERT_OK(s); + + // Verify A,B,C,D,E,F are still locked + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + s = txn2->Put("A", "a1"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("B", "b1"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("C", "c1"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("D", "d1"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("E", "e1"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("F", "f1"); + ASSERT_TRUE(s.IsTimedOut()); + + txn1->UndoGetForUpdate("C"); + txn1->UndoGetForUpdate("E"); + + // Verify A,B,D,E,F are still locked and C is not. + s = txn2->Put("A", "a2"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("B", "b2"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("D", "d2"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("E", "e2"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("F", "f2"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("C", "c2"); + ASSERT_OK(s); + + txn1->SetSavePoint(); // 2 + + s = txn1->Put("H", "h"); + ASSERT_OK(s); + + txn1->UndoGetForUpdate("A"); + txn1->UndoGetForUpdate("B"); + txn1->UndoGetForUpdate("C"); + txn1->UndoGetForUpdate("D"); + txn1->UndoGetForUpdate("E"); + txn1->UndoGetForUpdate("F"); + txn1->UndoGetForUpdate("G"); + txn1->UndoGetForUpdate("H"); + + // Verify A,B,D,E,F,H are still locked and C,G are not. + s = txn2->Put("A", "a3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("B", "b3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("D", "d3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("E", "e3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("F", "f3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("H", "h3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("C", "c3"); + ASSERT_OK(s); + s = txn2->Put("G", "g3"); + ASSERT_OK(s); + + ASSERT_OK(txn1->RollbackToSavePoint()); // rollback to 2 + + // Verify A,B,D,E,F are still locked and C,G,H are not. 
+  s = txn2->Put("A", "a3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("B", "b3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("D", "d3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("E", "e3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("F", "f3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("C", "c3");
+  ASSERT_OK(s);
+  s = txn2->Put("G", "g3");
+  ASSERT_OK(s);
+  s = txn2->Put("H", "h3");
+  ASSERT_OK(s);
+
+  txn1->UndoGetForUpdate("A");
+  txn1->UndoGetForUpdate("B");
+  txn1->UndoGetForUpdate("C");
+  txn1->UndoGetForUpdate("D");
+  txn1->UndoGetForUpdate("E");
+  txn1->UndoGetForUpdate("F");
+  txn1->UndoGetForUpdate("G");
+  txn1->UndoGetForUpdate("H");
+
+  // Verify A,B,E,F are still locked and C,D,G,H are not.
+  s = txn2->Put("A", "a3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("B", "b3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("E", "e3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("F", "f3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("C", "c3");
+  ASSERT_OK(s);
+  s = txn2->Put("D", "d3");
+  ASSERT_OK(s);
+  s = txn2->Put("G", "g3");
+  ASSERT_OK(s);
+  s = txn2->Put("H", "h3");
+  ASSERT_OK(s);
+
+  ASSERT_OK(txn1->RollbackToSavePoint());  // rollback to 1
+
+  // Verify A,B,F are still locked and C,D,E,G,H are not.
+  s = txn2->Put("A", "a3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("B", "b3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("F", "f3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("C", "c3");
+  ASSERT_OK(s);
+  s = txn2->Put("D", "d3");
+  ASSERT_OK(s);
+  s = txn2->Put("E", "e3");
+  ASSERT_OK(s);
+  s = txn2->Put("G", "g3");
+  ASSERT_OK(s);
+  s = txn2->Put("H", "h3");
+  ASSERT_OK(s);
+
+  txn1->UndoGetForUpdate("A");
+  txn1->UndoGetForUpdate("B");
+  txn1->UndoGetForUpdate("C");
+  txn1->UndoGetForUpdate("D");
+  txn1->UndoGetForUpdate("E");
+  txn1->UndoGetForUpdate("F");
+  txn1->UndoGetForUpdate("G");
+  txn1->UndoGetForUpdate("H");
+
+  // Verify F is still locked and A,B,C,D,E,G,H are not.
+  s = txn2->Put("F", "f3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("A", "a3");
+  ASSERT_OK(s);
+  s = txn2->Put("B", "b3");
+  ASSERT_OK(s);
+  s = txn2->Put("C", "c3");
+  ASSERT_OK(s);
+  s = txn2->Put("D", "d3");
+  ASSERT_OK(s);
+  s = txn2->Put("E", "e3");
+  ASSERT_OK(s);
+  s = txn2->Put("G", "g3");
+  ASSERT_OK(s);
+  s = txn2->Put("H", "h3");
+  ASSERT_OK(s);
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  delete txn1;
+  delete txn2;
+}
+
+TEST_P(TransactionTest, TimeoutTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  std::string value;
+  Status s;
+
+  delete db;
+  db = nullptr;
+
+  // Transaction writes have an infinite lock timeout by default (we override
+  // this below when we begin a txn); db writes also have an infinite timeout.
+  txn_db_options.transaction_lock_timeout = -1;
+  txn_db_options.default_lock_timeout = -1;
+
+  s = TransactionDB::Open(options, txn_db_options, dbname, &db);
+  assert(db != nullptr);
+  ASSERT_OK(s);
+
+  s = db->Put(write_options, "aaa", "aaa");
+  ASSERT_OK(s);
+
+  TransactionOptions txn_options0;
+  txn_options0.expiration = 100;   // 100ms
+  txn_options0.lock_timeout = 50;  // txn lock timeout no longer infinite
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options0);
+
+  s = txn1->GetForUpdate(read_options, "aaa", nullptr);
+  ASSERT_OK(s);
+
+  // Conflicts with previous GetForUpdate.
+  // Since db writes do not have a timeout, this should eventually succeed when
+  // the transaction expires.
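+  // (The wait is resolved by lock stealing: once txn1's 100ms expiration has
+  // passed, the waiting write may take over the lock on "aaa".)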
+  s = db->Put(write_options, "aaa", "xxx");
+  ASSERT_OK(s);
+
+  ASSERT_GE(txn1->GetElapsedTime(),
+            static_cast<uint64_t>(txn_options0.expiration));
+
+  s = txn1->Commit();
+  ASSERT_TRUE(s.IsExpired());  // expired!
+
+  s = db->Get(read_options, "aaa", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("xxx", value);
+
+  delete txn1;
+  delete db;
+
+  // transaction writes have a 50ms lock timeout,
+  // db writes have infinite timeout
+  txn_db_options.transaction_lock_timeout = 50;
+  txn_db_options.default_lock_timeout = -1;
+
+  s = TransactionDB::Open(options, txn_db_options, dbname, &db);
+  ASSERT_OK(s);
+
+  s = db->Put(write_options, "aaa", "aaa");
+  ASSERT_OK(s);
+
+  TransactionOptions txn_options;
+  txn_options.expiration = 100;  // 100ms
+  txn1 = db->BeginTransaction(write_options, txn_options);
+
+  s = txn1->GetForUpdate(read_options, "aaa", nullptr);
+  ASSERT_OK(s);
+
+  // Conflicts with previous GetForUpdate.
+  // Since db writes do not have a timeout, this should eventually succeed when
+  // the transaction expires.
+  s = db->Put(write_options, "aaa", "xxx");
+  ASSERT_OK(s);
+
+  s = txn1->Commit();
+  ASSERT_NOK(s);  // expired!
+
+  s = db->Get(read_options, "aaa", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("xxx", value);
+
+  delete txn1;
+  txn_options.expiration = 6000000;  // 100 minutes
+  txn_options.lock_timeout = 1;      // 1ms
+  txn1 = db->BeginTransaction(write_options, txn_options);
+  txn1->SetLockTimeout(100);
+
+  TransactionOptions txn_options2;
+  txn_options2.expiration = 10;  // 10ms
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options2);
+  ASSERT_TRUE(txn2);
+
+  s = txn2->Put("a", "2");
+  ASSERT_OK(s);
+
+  // txn1 has a lock timeout longer than txn2's expiration, so it will win
+  s = txn1->Delete("a");
+  ASSERT_OK(s);
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  // txn2 should have expired: txn1 waited until txn2's expiration passed.
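+  // (An expired transaction's locks may be stolen by a waiter; once that
+  // happens, the expired transaction can no longer commit and gets
+  // Status::Expired back.)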
+  s = txn2->Commit();
+  ASSERT_TRUE(s.IsExpired());
+
+  delete txn1;
+  delete txn2;
+  txn_options.expiration = 6000000;  // 100 minutes
+  txn1 = db->BeginTransaction(write_options, txn_options);
+  txn_options2.expiration = 100000000;
+  txn2 = db->BeginTransaction(write_options, txn_options2);
+
+  s = txn1->Delete("asdf");
+  ASSERT_OK(s);
+
+  // txn2 has a smaller lock timeout than txn1's expiration, so it will time out
+  s = txn2->Delete("asdf");
+  ASSERT_TRUE(s.IsTimedOut());
+  ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key");
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  s = txn2->Put("asdf", "asdf");
+  ASSERT_OK(s);
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "asdf", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("asdf", value);
+
+  delete txn1;
+  delete txn2;
+}
+
+TEST_P(TransactionTest, SingleDeleteTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  std::string value;
+  Status s;
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  s = txn->SingleDelete("A");
+  ASSERT_OK(s);
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+  delete txn;
+
+  txn = db->BeginTransaction(write_options);
+
+  s = txn->SingleDelete("A");
+  ASSERT_OK(s);
+
+  s = txn->Put("A", "a");
+  ASSERT_OK(s);
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a", value);
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+  delete txn;
+
+  s = db->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a", value);
+
+  txn = db->BeginTransaction(write_options);
+
+  s = txn->SingleDelete("A");
+  ASSERT_OK(s);
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+  delete txn;
+
+  s = db->Get(read_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  txn = db->BeginTransaction(write_options);
+  Transaction* txn2 = db->BeginTransaction(write_options);
+  txn2->SetSnapshot();
+
+  s = txn->Put("A", "a");
+  ASSERT_OK(s);
+
+  s = txn->Put("A", "a2");
+  ASSERT_OK(s);
+
+  s = txn->SingleDelete("A");
+  ASSERT_OK(s);
+
+  s = txn->SingleDelete("B");
+  ASSERT_OK(s);
+
+  // According to db.h, doing a SingleDelete on a key that has been
+  // overwritten will have undefined behavior. So it is unclear what the
+  // result of fetching "A" should be. The current implementation will
+  // return NotFound in this case.
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn2->Put("B", "b");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Commit();
+  ASSERT_OK(s);
+  delete txn2;
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+  delete txn;
+
+  // According to db.h, doing a SingleDelete on a key that has been
+  // overwritten will have undefined behavior. So it is unclear what the
+  // result of fetching "A" should be. The current implementation will
+  // return NotFound in this case.
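+  // (For reference, the well-defined pattern is a single Put followed by a
+  // single SingleDelete with no intervening overwrite, e.g., with a
+  // hypothetical key "K":
+  //   ASSERT_OK(db->Put(write_options, "K", "k"));
+  //   ASSERT_OK(db->SingleDelete(write_options, "K"));
+  // after which fetching "K" returns NotFound.)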
+  s = db->Get(read_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = db->Get(read_options, "B", &value);
+  ASSERT_TRUE(s.IsNotFound());
+}
+
+TEST_P(TransactionTest, MergeTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  std::string value;
+  Status s;
+
+  Transaction* txn = db->BeginTransaction(write_options, TransactionOptions());
+  ASSERT_TRUE(txn);
+
+  s = db->Put(write_options, "A", "a0");
+  ASSERT_OK(s);
+
+  s = txn->Merge("A", "1");
+  ASSERT_OK(s);
+
+  s = txn->Merge("A", "2");
+  ASSERT_OK(s);
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a0,1,2", value);
+
+  s = txn->Put("A", "a");
+  ASSERT_OK(s);
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a", value);
+
+  s = txn->Merge("A", "3");
+  ASSERT_OK(s);
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a,3", value);
+
+  TransactionOptions txn_options;
+  txn_options.lock_timeout = 1;  // 1 ms
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn2);
+
+  // Verify that txn has "A" locked
+  s = txn2->Merge("A", "4");
+  ASSERT_TRUE(s.IsTimedOut());
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+  delete txn2;
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+  delete txn;
+
+  s = db->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a,3", value);
+}
+
+TEST_P(TransactionTest, DeleteRangeSupportTest) {
+  // The `DeleteRange()` API is banned everywhere.
+  ASSERT_TRUE(
+      db->DeleteRange(WriteOptions(), db->DefaultColumnFamily(), "a", "b")
+          .IsNotSupported());
+
+  // But range deletions can be added via the `Write()` API by specifying the
+  // proper flags to promise there are no conflicts according to the DB type
+  // (see `TransactionDB::DeleteRange()` API doc for details).
+  for (bool skip_concurrency_control : {false, true}) {
+    for (bool skip_duplicate_key_check : {false, true}) {
+      ASSERT_OK(db->Put(WriteOptions(), "a", "val"));
+      WriteBatch wb;
+      ASSERT_OK(wb.DeleteRange("a", "b"));
+      TransactionDBWriteOptimizations flags;
+      flags.skip_concurrency_control = skip_concurrency_control;
+      flags.skip_duplicate_key_check = skip_duplicate_key_check;
+      Status s = db->Write(WriteOptions(), flags, &wb);
+      std::string value;
+      switch (txn_db_options.write_policy) {
+        case WRITE_COMMITTED:
+          if (skip_concurrency_control) {
+            ASSERT_OK(s);
+            ASSERT_TRUE(db->Get(ReadOptions(), "a", &value).IsNotFound());
+          } else {
+            ASSERT_NOK(s);
+            ASSERT_OK(db->Get(ReadOptions(), "a", &value));
+          }
+          break;
+        case WRITE_PREPARED:
+          // Intentional fall-through
+        case WRITE_UNPREPARED:
+          if (skip_concurrency_control && skip_duplicate_key_check) {
+            ASSERT_OK(s);
+            ASSERT_TRUE(db->Get(ReadOptions(), "a", &value).IsNotFound());
+          } else {
+            ASSERT_NOK(s);
+            ASSERT_OK(db->Get(ReadOptions(), "a", &value));
+          }
+          break;
+      }
+      // Without any promises from the user, range deletion via the other
+      // `Write()` APIs is still banned.
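+      // (The plain two-argument Write() overload used below carries no
+      // TransactionDBWriteOptimizations flags, so the DeleteRange in the
+      // batch is rejected and "a" survives.)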
+ ASSERT_OK(db->Put(WriteOptions(), "a", "val")); + ASSERT_NOK(db->Write(WriteOptions(), &wb)); + ASSERT_OK(db->Get(ReadOptions(), "a", &value)); + } + } +} + +TEST_P(TransactionTest, DeferSnapshotTest) { + WriteOptions write_options; + ReadOptions read_options; + std::string value; + Status s; + + s = db->Put(write_options, "A", "a0"); + ASSERT_OK(s); + + Transaction* txn1 = db->BeginTransaction(write_options); + Transaction* txn2 = db->BeginTransaction(write_options); + + txn1->SetSnapshotOnNextOperation(); + auto snapshot = txn1->GetSnapshot(); + ASSERT_FALSE(snapshot); + + s = txn2->Put("A", "a2"); + ASSERT_OK(s); + s = txn2->Commit(); + ASSERT_OK(s); + delete txn2; + + s = txn1->GetForUpdate(read_options, "A", &value); + // Should not conflict with txn2 since snapshot wasn't set until + // GetForUpdate was called. + ASSERT_OK(s); + ASSERT_EQ("a2", value); + + s = txn1->Put("A", "a1"); + ASSERT_OK(s); + + s = db->Put(write_options, "B", "b0"); + ASSERT_OK(s); + + // Cannot lock B since it was written after the snapshot was set + s = txn1->Put("B", "b1"); + ASSERT_TRUE(s.IsBusy()); + + s = txn1->Commit(); + ASSERT_OK(s); + delete txn1; + + s = db->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("a1", value); + + s = db->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_EQ("b0", value); +} + +TEST_P(TransactionTest, DeferSnapshotTest2) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + std::string value; + Status s; + + Transaction* txn1 = db->BeginTransaction(write_options); + + txn1->SetSnapshot(); + + s = txn1->Put("A", "a1"); + ASSERT_OK(s); + + s = db->Put(write_options, "C", "c0"); + ASSERT_OK(s); + s = db->Put(write_options, "D", "d0"); + ASSERT_OK(s); + + snapshot_read_options.snapshot = txn1->GetSnapshot(); + + txn1->SetSnapshotOnNextOperation(); + + s = txn1->Get(snapshot_read_options, "C", &value); + // Snapshot was set before C was written + ASSERT_TRUE(s.IsNotFound()); + s = txn1->Get(snapshot_read_options, "D", &value); + // Snapshot was set before D was written + ASSERT_TRUE(s.IsNotFound()); + + // Snapshot should not have changed yet. 
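+  // (SetSnapshotOnNextOperation only arms the snapshot: a plain Get does not
+  // trigger it, so nothing changes until the GetForUpdate below.)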
+ snapshot_read_options.snapshot = txn1->GetSnapshot(); + + s = txn1->Get(snapshot_read_options, "C", &value); + // Snapshot was set before C was written + ASSERT_TRUE(s.IsNotFound()); + s = txn1->Get(snapshot_read_options, "D", &value); + // Snapshot was set before D was written + ASSERT_TRUE(s.IsNotFound()); + + s = txn1->GetForUpdate(read_options, "C", &value); + ASSERT_OK(s); + ASSERT_EQ("c0", value); + + s = db->Put(write_options, "D", "d00"); + ASSERT_OK(s); + + // Snapshot is now set + snapshot_read_options.snapshot = txn1->GetSnapshot(); + s = txn1->Get(snapshot_read_options, "D", &value); + ASSERT_OK(s); + ASSERT_EQ("d0", value); + + s = txn1->Commit(); + ASSERT_OK(s); + delete txn1; +} + +TEST_P(TransactionTest, DeferSnapshotSavePointTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + std::string value; + Status s; + + Transaction* txn1 = db->BeginTransaction(write_options); + + txn1->SetSavePoint(); // 1 + + s = db->Put(write_options, "T", "1"); + ASSERT_OK(s); + + txn1->SetSnapshotOnNextOperation(); + + s = db->Put(write_options, "T", "2"); + ASSERT_OK(s); + + txn1->SetSavePoint(); // 2 + + s = db->Put(write_options, "T", "3"); + ASSERT_OK(s); + + s = txn1->Put("A", "a"); + ASSERT_OK(s); + + txn1->SetSavePoint(); // 3 + + s = db->Put(write_options, "T", "4"); + ASSERT_OK(s); + + txn1->SetSnapshot(); + txn1->SetSnapshotOnNextOperation(); + + txn1->SetSavePoint(); // 4 + + s = db->Put(write_options, "T", "5"); + ASSERT_OK(s); + + snapshot_read_options.snapshot = txn1->GetSnapshot(); + s = txn1->Get(snapshot_read_options, "T", &value); + ASSERT_OK(s); + ASSERT_EQ("4", value); + + s = txn1->Put("A", "a1"); + ASSERT_OK(s); + + snapshot_read_options.snapshot = txn1->GetSnapshot(); + s = txn1->Get(snapshot_read_options, "T", &value); + ASSERT_OK(s); + ASSERT_EQ("5", value); + + s = txn1->RollbackToSavePoint(); // Rollback to 4 + ASSERT_OK(s); + + snapshot_read_options.snapshot = txn1->GetSnapshot(); + s = txn1->Get(snapshot_read_options, "T", &value); + ASSERT_OK(s); + ASSERT_EQ("4", value); + + s = txn1->RollbackToSavePoint(); // Rollback to 3 + ASSERT_OK(s); + + snapshot_read_options.snapshot = txn1->GetSnapshot(); + s = txn1->Get(snapshot_read_options, "T", &value); + ASSERT_OK(s); + ASSERT_EQ("3", value); + + s = txn1->Get(read_options, "T", &value); + ASSERT_OK(s); + ASSERT_EQ("5", value); + + s = txn1->RollbackToSavePoint(); // Rollback to 2 + ASSERT_OK(s); + + snapshot_read_options.snapshot = txn1->GetSnapshot(); + ASSERT_FALSE(snapshot_read_options.snapshot); + s = txn1->Get(snapshot_read_options, "T", &value); + ASSERT_OK(s); + ASSERT_EQ("5", value); + + s = txn1->Delete("A"); + ASSERT_OK(s); + + snapshot_read_options.snapshot = txn1->GetSnapshot(); + ASSERT_TRUE(snapshot_read_options.snapshot); + s = txn1->Get(snapshot_read_options, "T", &value); + ASSERT_OK(s); + ASSERT_EQ("5", value); + + s = txn1->RollbackToSavePoint(); // Rollback to 1 + ASSERT_OK(s); + + s = txn1->Delete("A"); + ASSERT_OK(s); + + snapshot_read_options.snapshot = txn1->GetSnapshot(); + ASSERT_FALSE(snapshot_read_options.snapshot); + s = txn1->Get(snapshot_read_options, "T", &value); + ASSERT_OK(s); + ASSERT_EQ("5", value); + + s = txn1->Commit(); + ASSERT_OK(s); + + delete txn1; +} + +TEST_P(TransactionTest, SetSnapshotOnNextOperationWithNotification) { + WriteOptions write_options; + ReadOptions read_options; + std::string value; + + class Notifier : public TransactionNotifier { + private: + const Snapshot** snapshot_ptr_; + + public: + explicit Notifier(const 
Snapshot** snapshot_ptr) + : snapshot_ptr_(snapshot_ptr) {} + + void SnapshotCreated(const Snapshot* newSnapshot) override { + *snapshot_ptr_ = newSnapshot; + } + }; + + std::shared_ptr<Notifier> notifier = + std::make_shared<Notifier>(&read_options.snapshot); + Status s; + + s = db->Put(write_options, "B", "0"); + ASSERT_OK(s); + + Transaction* txn1 = db->BeginTransaction(write_options); + + txn1->SetSnapshotOnNextOperation(notifier); + ASSERT_FALSE(read_options.snapshot); + + s = db->Put(write_options, "B", "1"); + ASSERT_OK(s); + + // A Get does not generate the snapshot + s = txn1->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_FALSE(read_options.snapshot); + ASSERT_EQ(value, "1"); + + // Any other operation does + s = txn1->Put("A", "0"); + ASSERT_OK(s); + + // Now change "B". + s = db->Put(write_options, "B", "2"); + ASSERT_OK(s); + + // The original value should still be read + s = txn1->Get(read_options, "B", &value); + ASSERT_OK(s); + ASSERT_TRUE(read_options.snapshot); + ASSERT_EQ(value, "1"); + + s = txn1->Commit(); + ASSERT_OK(s); + + delete txn1; +} + +TEST_P(TransactionTest, ClearSnapshotTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + std::string value; + Status s; + + s = db->Put(write_options, "foo", "0"); + ASSERT_OK(s); + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + s = db->Put(write_options, "foo", "1"); + ASSERT_OK(s); + + snapshot_read_options.snapshot = txn->GetSnapshot(); + ASSERT_FALSE(snapshot_read_options.snapshot); + + // No snapshot created yet + s = txn->Get(snapshot_read_options, "foo", &value); + ASSERT_EQ(value, "1"); + + txn->SetSnapshot(); + snapshot_read_options.snapshot = txn->GetSnapshot(); + ASSERT_TRUE(snapshot_read_options.snapshot); + + s = db->Put(write_options, "foo", "2"); + ASSERT_OK(s); + + // Snapshot was created before change to '2' + s = txn->Get(snapshot_read_options, "foo", &value); + ASSERT_EQ(value, "1"); + + txn->ClearSnapshot(); + snapshot_read_options.snapshot = txn->GetSnapshot(); + ASSERT_FALSE(snapshot_read_options.snapshot); + + // Snapshot has now been cleared + s = txn->Get(snapshot_read_options, "foo", &value); + ASSERT_EQ(value, "2"); + + s = txn->Commit(); + ASSERT_OK(s); + + delete txn; +} + +TEST_P(TransactionTest, ToggleAutoCompactionTest) { + Status s; + + ColumnFamilyHandle *cfa, *cfb; + ColumnFamilyOptions cf_options; + + // Create 2 new column families + s = db->CreateColumnFamily(cf_options, "CFA", &cfa); + ASSERT_OK(s); + s = db->CreateColumnFamily(cf_options, "CFB", &cfb); + ASSERT_OK(s); + + delete cfa; + delete cfb; + delete db; + + // open DB with three column families + std::vector<ColumnFamilyDescriptor> column_families; + // have to open default column family + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions())); + // open the new column families + column_families.push_back( + ColumnFamilyDescriptor("CFA", ColumnFamilyOptions())); + column_families.push_back( + ColumnFamilyDescriptor("CFB", ColumnFamilyOptions())); + + ColumnFamilyOptions* cf_opt_default = &column_families[0].options; + ColumnFamilyOptions* cf_opt_cfa = &column_families[1].options; + ColumnFamilyOptions* cf_opt_cfb = &column_families[2].options; + cf_opt_default->disable_auto_compactions = false; + cf_opt_cfa->disable_auto_compactions = true; + cf_opt_cfb->disable_auto_compactions = false; + + std::vector<ColumnFamilyHandle*> handles; + + s = TransactionDB::Open(options, txn_db_options, dbname, 
column_families,
+                          &handles, &db);
+  ASSERT_OK(s);
+
+  auto cfh_default = static_cast_with_check<ColumnFamilyHandleImpl>(handles[0]);
+  auto opt_default = *cfh_default->cfd()->GetLatestMutableCFOptions();
+
+  auto cfh_a = static_cast_with_check<ColumnFamilyHandleImpl>(handles[1]);
+  auto opt_a = *cfh_a->cfd()->GetLatestMutableCFOptions();
+
+  auto cfh_b = static_cast_with_check<ColumnFamilyHandleImpl>(handles[2]);
+  auto opt_b = *cfh_b->cfd()->GetLatestMutableCFOptions();
+
+  ASSERT_EQ(opt_default.disable_auto_compactions, false);
+  ASSERT_EQ(opt_a.disable_auto_compactions, true);
+  ASSERT_EQ(opt_b.disable_auto_compactions, false);
+
+  for (auto handle : handles) {
+    delete handle;
+  }
+}
+
+TEST_P(TransactionStressTest, ExpiredTransactionDataRace1) {
+  // In this test, txn1 should succeed in committing, as the callback runs
+  // only after txn1 has started committing.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"TransactionTest::ExpirableTransactionDataRace:1"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "TransactionTest::ExpirableTransactionDataRace:1", [&](void* /*arg*/) {
+        WriteOptions write_options;
+        TransactionOptions txn_options;
+
+        // Force txn1 to expire
+        /* sleep override */
+        std::this_thread::sleep_for(std::chrono::milliseconds(1500));
+
+        Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+        Status s;
+        s = txn2->Put("X", "2");
+        ASSERT_TRUE(s.IsTimedOut());
+        s = txn2->Commit();
+        ASSERT_OK(s);
+        delete txn2;
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  WriteOptions write_options;
+  TransactionOptions txn_options;
+
+  txn_options.expiration = 1000;  // 1 second
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+
+  Status s;
+  s = txn1->Put("X", "1");
+  ASSERT_OK(s);
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  ReadOptions read_options;
+  std::string value;
+  s = db->Get(read_options, "X", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("1", value);
+
+  delete txn1;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+namespace {
+// cmt_delay_ms is the delay between prepare and commit
+// first_id is the id of the first transaction
+Status TransactionStressTestInserter(
+    TransactionDB* db, const size_t num_transactions, const size_t num_sets,
+    const size_t num_keys_per_set, Random64* rand,
+    const uint64_t cmt_delay_ms = 0, const uint64_t first_id = 0) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+  txn_options.use_only_the_last_commit_time_batch_for_recovery = true;
+
+  // Inside the inserter we might also retake the snapshot. We test both
+  // cases, since a separate code path is exercised for each.
+  txn_options.set_snapshot = rand->OneIn(2);
+
+  RandomTransactionInserter inserter(
+      rand, write_options, read_options, num_keys_per_set,
+      static_cast<uint16_t>(num_sets), cmt_delay_ms, first_id);
+
+  for (size_t t = 0; t < num_transactions; t++) {
+    bool success = inserter.TransactionDBInsert(db, txn_options);
+    if (!success) {
+      // unexpected failure
+      return inserter.GetLastStatus();
+    }
+  }
+  inserter.GetLastStatus().PermitUncheckedError();
+
+  // Make sure at least some of the transactions succeeded. It's ok if
+  // some failed due to write-conflicts.
+  if (num_transactions != 1 &&
+      inserter.GetFailureCount() > num_transactions / 2) {
+    return Status::TryAgain("Too many transactions failed! " +
+                            std::to_string(inserter.GetFailureCount()) +
+                            " / " + std::to_string(num_transactions));
+  }
+
+  return Status::OK();
+}
+}  // namespace
+
+// Worker threads add a number to a key from each set of keys. The checker
+// threads verify that the sums of the keys in each set are all equal.
+TEST_P(MySQLStyleTransactionTest, TransactionStressTest) {
+  // Small write buffer to trigger more compactions
+  options.write_buffer_size = 1024;
+  txn_db_options.rollback_deletion_type_callback =
+      [](TransactionDB*, ColumnFamilyHandle*, const Slice& key) {
+        return RandomTransactionInserter::RollbackDeletionTypeCallback(key);
+      };
+  ASSERT_OK(ReOpenNoDelete());
+  constexpr size_t num_workers = 4;        // worker threads count
+  constexpr size_t num_checkers = 2;       // checker threads count
+  constexpr size_t num_slow_checkers = 2;  // checker threads emulating backups
+  constexpr size_t num_slow_workers = 1;   // slow worker threads count
+  constexpr size_t num_transactions_per_thread = 1000;
+  constexpr uint16_t num_sets = 3;
+  constexpr size_t num_keys_per_set = 100;
+  // Setting the key-space to be 100 keys should cause enough write-conflicts
+  // to make this test interesting.
+
+  std::vector<port::Thread> threads;
+  std::atomic<uint32_t> finished = {0};
+  constexpr bool TAKE_SNAPSHOT = true;
+  uint64_t time_seed = env->NowMicros();
+  printf("time_seed is %" PRIu64 "\n", time_seed);  // helps reproduce failures
+
+  std::function<void()> call_inserter = [&] {
+    size_t thd_seed = std::hash<std::thread::id>()(std::this_thread::get_id());
+    Random64 rand(time_seed * thd_seed);
+    ASSERT_OK(TransactionStressTestInserter(db, num_transactions_per_thread,
+                                            num_sets, num_keys_per_set,
+                                            &rand));
+    finished++;
+  };
+  std::function<void()> call_checker = [&] {
+    size_t thd_seed = std::hash<std::thread::id>()(std::this_thread::get_id());
+    Random64 rand(time_seed * thd_seed);
+    // Verify that data is consistent
+    while (finished < num_workers) {
+      ASSERT_OK(RandomTransactionInserter::Verify(
+          db, num_sets, num_keys_per_set, TAKE_SNAPSHOT, &rand));
+    }
+  };
+  std::function<void()> call_slow_checker = [&] {
+    size_t thd_seed = std::hash<std::thread::id>()(std::this_thread::get_id());
+    Random64 rand(time_seed * thd_seed);
+    // Verify that data is consistent
+    while (finished < num_workers) {
+      uint64_t delay_ms = rand.Uniform(100) + 1;
+      Status s = RandomTransactionInserter::Verify(
+          db, num_sets, num_keys_per_set, TAKE_SNAPSHOT, &rand, delay_ms);
+      ASSERT_OK(s);
+    }
+  };
+  std::function<void()> call_slow_inserter = [&] {
+    size_t thd_seed = std::hash<std::thread::id>()(std::this_thread::get_id());
+    Random64 rand(time_seed * thd_seed);
+    uint64_t id = 0;
+    // Keep inserting slowly while the fast workers are still running
+    while (finished < num_workers) {
+      uint64_t delay_ms = rand.Uniform(500) + 1;
+      ASSERT_OK(TransactionStressTestInserter(db, 1, num_sets,
+                                              num_keys_per_set, &rand,
+                                              delay_ms, id++));
+    }
+  };
+
+  for (uint32_t i = 0; i < num_workers; i++) {
+    threads.emplace_back(call_inserter);
+  }
+  for (uint32_t i = 0; i < num_checkers; i++) {
+    threads.emplace_back(call_checker);
+  }
+  if (with_slow_threads_) {
+    for (uint32_t i = 0; i < num_slow_checkers; i++) {
+      threads.emplace_back(call_slow_checker);
+    }
+    for (uint32_t i = 0; i < num_slow_workers; i++) {
+      threads.emplace_back(call_slow_inserter);
+    }
+  }
+
+  // Wait for all threads to finish
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  // Verify that data is consistent
+  Status s = RandomTransactionInserter::Verify(db, num_sets, num_keys_per_set,
+                                               !TAKE_SNAPSHOT);
+  ASSERT_OK(s);
+}
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_P(TransactionTest, MemoryLimitTest) {
+  TransactionOptions txn_options;
+  // Header (12 bytes) + NOOP (1 byte) + 2 * 8 bytes for data: each Put of a
+  // one-byte key and a four-byte value serializes to 8 bytes (1 type tag +
+  // 1 key length + 1 key byte + 1 value length + 4 value bytes).
+  txn_options.max_write_batch_size = 29;
+  // Set threshold to unlimited so that the write batch does not get flushed,
+  // and can hit the memory limit.
+  txn_options.write_batch_flush_threshold = 0;
+  std::string value;
+  Status s;
+
+  Transaction* txn = db->BeginTransaction(WriteOptions(), txn_options);
+  ASSERT_TRUE(txn);
+
+  ASSERT_EQ(0, txn->GetNumPuts());
+  ASSERT_LE(0, txn->GetID());
+
+  s = txn->Put(Slice("a"), Slice("...."));
+  ASSERT_OK(s);
+  ASSERT_EQ(1, txn->GetNumPuts());
+
+  s = txn->Put(Slice("b"), Slice("...."));
+  ASSERT_OK(s);
+  ASSERT_EQ(2, txn->GetNumPuts());
+
+  s = txn->Put(Slice("b"), Slice("...."));
+  ASSERT_TRUE(s.IsMemoryLimit());
+  ASSERT_EQ(2, txn->GetNumPuts());
+
+  ASSERT_OK(txn->Rollback());
+  delete txn;
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+// This test clarifies the existing expectation from the sequence number
+// algorithm. It could detect mistakes in updating the code, but it is not
+// necessarily the only acceptable behavior. If the algorithm is legitimately
+// changed, this unit test should be updated as well.
+TEST_P(TransactionStressTest, SeqAdvanceTest) {
+  // TODO(myabandeh): must be tested with false before new releases
+  const bool short_test = true;
+  WriteOptions wopts;
+  FlushOptions fopt;
+
+  options.disable_auto_compactions = true;
+  ASSERT_OK(ReOpen());
+
+  // Do the test with NUM_BRANCHES branches in it. Each run of the test takes
+  // some of the branches. This is the same as counting in binary, where the
+  // i-th bit of the number represents whether we take branch i in the run
+  // the number encodes.
+  const size_t NUM_BRANCHES = short_test ? 6 : 10;
+  // Helper function that shows if the branch is to be taken in the run
+  // represented by the number n.
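+  // For example, with NUM_BRANCHES == 6, n == 0b000101 takes the first and
+  // third branches encountered and skips the others.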
+  auto branch_do = [&](size_t n, size_t* branch) {
+    assert(*branch < NUM_BRANCHES);
+    // Advance to the next branch on every call.
+    const size_t filter = static_cast<size_t>(1) << (*branch)++;
+    return n & filter;
+  };
+  const size_t max_n = static_cast<size_t>(1) << NUM_BRANCHES;
+  for (size_t n = 0; n < max_n; n++) {
+    DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+    size_t branch = 0;
+    auto seq = db_impl->GetLatestSequenceNumber();
+    exp_seq = seq;
+    TestTxn0(0);
+    seq = db_impl->TEST_GetLastVisibleSequence();
+    ASSERT_EQ(exp_seq, seq);
+
+    if (branch_do(n, &branch)) {
+      ASSERT_OK(db_impl->Flush(fopt));
+      seq = db_impl->TEST_GetLastVisibleSequence();
+      ASSERT_EQ(exp_seq, seq);
+    }
+    if (!short_test && branch_do(n, &branch)) {
+      ASSERT_OK(db_impl->FlushWAL(true));
+      ASSERT_OK(ReOpenNoDelete());
+      db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+      seq = db_impl->GetLatestSequenceNumber();
+      ASSERT_EQ(exp_seq, seq);
+    }
+
+    // Doing it twice might detect some bugs
+    TestTxn0(1);
+    seq = db_impl->TEST_GetLastVisibleSequence();
+    ASSERT_EQ(exp_seq, seq);
+
+    TestTxn1(0);
+    seq = db_impl->TEST_GetLastVisibleSequence();
+    ASSERT_EQ(exp_seq, seq);
+
+    if (branch_do(n, &branch)) {
+      ASSERT_OK(db_impl->Flush(fopt));
+      seq = db_impl->TEST_GetLastVisibleSequence();
+      ASSERT_EQ(exp_seq, seq);
+    }
+    if (!short_test && branch_do(n, &branch)) {
+      ASSERT_OK(db_impl->FlushWAL(true));
+      ASSERT_OK(ReOpenNoDelete());
+      db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+      seq = db_impl->GetLatestSequenceNumber();
+      ASSERT_EQ(exp_seq, seq);
+    }
+
+    TestTxn3(0);
+    seq = db_impl->TEST_GetLastVisibleSequence();
+    ASSERT_EQ(exp_seq, seq);
+
+    if (branch_do(n, &branch)) {
+      ASSERT_OK(db_impl->Flush(fopt));
+      seq = db_impl->TEST_GetLastVisibleSequence();
+      ASSERT_EQ(exp_seq, seq);
+    }
+    if (!short_test && branch_do(n, &branch)) {
+      ASSERT_OK(db_impl->FlushWAL(true));
+      ASSERT_OK(ReOpenNoDelete());
+      db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+      seq = db_impl->GetLatestSequenceNumber();
+      ASSERT_EQ(exp_seq, seq);
+    }
+
+    TestTxn4(0);
+    seq = db_impl->TEST_GetLastVisibleSequence();
+    ASSERT_EQ(exp_seq, seq);
+
+    if (branch_do(n, &branch)) {
+      ASSERT_OK(db_impl->Flush(fopt));
+      seq = db_impl->TEST_GetLastVisibleSequence();
+      ASSERT_EQ(exp_seq, seq);
+    }
+    if (!short_test && branch_do(n, &branch)) {
+      ASSERT_OK(db_impl->FlushWAL(true));
+      ASSERT_OK(ReOpenNoDelete());
+      db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+      seq = db_impl->GetLatestSequenceNumber();
+      ASSERT_EQ(exp_seq, seq);
+    }
+
+    TestTxn2(0);
+    seq = db_impl->TEST_GetLastVisibleSequence();
+    ASSERT_EQ(exp_seq, seq);
+
+    if (branch_do(n, &branch)) {
+      ASSERT_OK(db_impl->Flush(fopt));
+      seq = db_impl->TEST_GetLastVisibleSequence();
+      ASSERT_EQ(exp_seq, seq);
+    }
+    if (!short_test && branch_do(n, &branch)) {
+      ASSERT_OK(db_impl->FlushWAL(true));
+      ASSERT_OK(ReOpenNoDelete());
+      db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+      seq = db_impl->GetLatestSequenceNumber();
+      ASSERT_EQ(exp_seq, seq);
+    }
+    ASSERT_OK(ReOpen());
+  }
+}
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+// Verify that the optimizations do not compromise correctness
+TEST_P(TransactionTest, Optimizations) {
+  size_t comb_cnt = size_t(1) << 2;  // 2 is number of optimization vars
+  for (size_t new_comb = 0; new_comb < comb_cnt; new_comb++) {
+    TransactionDBWriteOptimizations optimizations;
+    optimizations.skip_concurrency_control = IsInCombination(0, new_comb);
+    optimizations.skip_duplicate_key_check = IsInCombination(1, new_comb);
+
+    ASSERT_OK(ReOpen());
+    WriteOptions write_options;
+    WriteBatch batch;
+    ASSERT_OK(batch.Put(Slice("k"), Slice("v1")));
+    ASSERT_OK(db->Write(write_options, optimizations, &batch));
+
+    ReadOptions ropt;
+    PinnableSlice pinnable_val;
+    ASSERT_OK(db->Get(ropt, db->DefaultColumnFamily(), "k", &pinnable_val));
+    ASSERT_TRUE(pinnable_val == ("v1"));
+  }
+}
+
+// A comparator that uses only the first three bytes
+class ThreeBytewiseComparator : public Comparator {
+ public:
+  ThreeBytewiseComparator() {}
+  const char* Name() const override { return "test.ThreeBytewiseComparator"; }
+  int Compare(const Slice& a, const Slice& b) const override {
+    Slice na = Slice(a.data(), a.size() < 3 ? a.size() : 3);
+    Slice nb = Slice(b.data(), b.size() < 3 ? b.size() : 3);
+    return na.compare(nb);
+  }
+  bool Equal(const Slice& a, const Slice& b) const override {
+    Slice na = Slice(a.data(), a.size() < 3 ? a.size() : 3);
+    Slice nb = Slice(b.data(), b.size() < 3 ? b.size() : 3);
+    return na == nb;
+  }
+  // The methods below don't seem relevant to this test. Implement them if
+  // proven otherwise.
+  void FindShortestSeparator(std::string* start,
+                             const Slice& limit) const override {
+    const Comparator* bytewise_comp = BytewiseComparator();
+    bytewise_comp->FindShortestSeparator(start, limit);
+  }
+  void FindShortSuccessor(std::string* key) const override {
+    const Comparator* bytewise_comp = BytewiseComparator();
+    bytewise_comp->FindShortSuccessor(key);
+  }
+};
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+TEST_P(TransactionTest, GetWithoutSnapshot) {
+  WriteOptions write_options;
+  std::atomic<bool> finish = {false};
+  ASSERT_OK(db->Put(write_options, "key", "value"));
+  ROCKSDB_NAMESPACE::port::Thread commit_thread([&]() {
+    for (int i = 0; i < 100; i++) {
+      TransactionOptions txn_options;
+      Transaction* txn = db->BeginTransaction(write_options, txn_options);
+      ASSERT_OK(txn->SetName("xid"));
+      ASSERT_OK(txn->Put("key", "overridedvalue"));
+      ASSERT_OK(txn->Put("key", "value"));
+      ASSERT_OK(txn->Prepare());
+      ASSERT_OK(txn->Commit());
+      delete txn;
+    }
+    finish = true;
+  });
+  ROCKSDB_NAMESPACE::port::Thread read_thread([&]() {
+    while (!finish) {
+      ReadOptions ropt;
+      PinnableSlice pinnable_val;
+      ASSERT_OK(db->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val));
+      ASSERT_TRUE(pinnable_val == ("value"));
+    }
+  });
+  commit_thread.join();
+  read_thread.join();
+}
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+// Test that the transactional db can handle duplicate keys in the write batch
+TEST_P(TransactionTest, DuplicateKeys) {
+  ColumnFamilyOptions cf_options;
+  std::string cf_name = "two";
+  ColumnFamilyHandle* cf_handle = nullptr;
+  {
+    ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle));
+    WriteOptions write_options;
+    WriteBatch batch;
+    ASSERT_OK(batch.Put(Slice("key"), Slice("value")));
+    ASSERT_OK(batch.Put(Slice("key2"), Slice("value2")));
+    // duplicate the keys
+    ASSERT_OK(batch.Put(Slice("key"), Slice("value3")));
+    // duplicate the 2nd key. It should not be counted as a duplicate since a
+    // sub-patch is cut after the last duplicate.
+    ASSERT_OK(batch.Put(Slice("key2"), Slice("value4")));
+    // duplicate the keys but in a different cf. It should not be counted as
+    // duplicate keys
+    ASSERT_OK(batch.Put(cf_handle, Slice("key"), Slice("value5")));
+
+    ASSERT_OK(db->Write(write_options, &batch));
+
+    ReadOptions ropt;
+    PinnableSlice pinnable_val;
+    auto s = db->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val);
+    ASSERT_OK(s);
+    ASSERT_TRUE(pinnable_val == ("value3"));
+    s = db->Get(ropt, db->DefaultColumnFamily(), "key2", &pinnable_val);
+    ASSERT_OK(s);
+    ASSERT_TRUE(pinnable_val == ("value4"));
+    s = db->Get(ropt, cf_handle, "key", &pinnable_val);
+    ASSERT_OK(s);
+    ASSERT_TRUE(pinnable_val == ("value5"));
+
+    delete cf_handle;
+  }
+
+  // Test with non-bytewise comparator
+  {
+    ASSERT_OK(ReOpen());
+    std::unique_ptr<const Comparator> comp_gc(new ThreeBytewiseComparator());
+    cf_options.comparator = comp_gc.get();
+    ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle));
+    WriteOptions write_options;
+    WriteBatch batch;
+    ASSERT_OK(batch.Put(cf_handle, Slice("key"), Slice("value")));
+    // The first three bytes are the same, so it must be counted as a duplicate
+    ASSERT_OK(batch.Put(cf_handle, Slice("key2"), Slice("value2")));
+    // check for 2nd duplicate key in cf with non-default comparator
+    ASSERT_OK(batch.Put(cf_handle, Slice("key2b"), Slice("value2b")));
+    ASSERT_OK(db->Write(write_options, &batch));
+
+    // The value must be the most recent value for all the keys equal to "key",
+    // including "key2"
+    ReadOptions ropt;
+    PinnableSlice pinnable_val;
+    ASSERT_OK(db->Get(ropt, cf_handle, "key", &pinnable_val));
+    ASSERT_TRUE(pinnable_val == ("value2b"));
+
+    // Test duplicate keys with rollback
+    TransactionOptions txn_options;
+    Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+    ASSERT_OK(txn0->SetName("xid"));
+    ASSERT_OK(txn0->Put(cf_handle, Slice("key3"), Slice("value3")));
+    ASSERT_OK(txn0->Merge(cf_handle, Slice("key4"), Slice("value4")));
+    ASSERT_OK(txn0->Rollback());
+    ASSERT_OK(db->Get(ropt, cf_handle, "key5", &pinnable_val));
+    ASSERT_TRUE(pinnable_val == ("value2b"));
+    delete txn0;
+
+    delete cf_handle;
+    cf_options.comparator = BytewiseComparator();
+  }
+
+  for (bool do_prepare : {true, false}) {
+    for (bool do_rollback : {true, false}) {
+      for (bool with_commit_batch : {true, false}) {
+        if (with_commit_batch && !do_prepare) {
+          continue;
+        }
+        if (with_commit_batch && do_rollback) {
+          continue;
+        }
+        ASSERT_OK(ReOpen());
+        ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle));
+        TransactionOptions txn_options;
+        txn_options.use_only_the_last_commit_time_batch_for_recovery = true;
+        WriteOptions write_options;
+        Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+        auto s = txn0->SetName("xid");
+        ASSERT_OK(s);
+        s = txn0->Put(Slice("foo0"), Slice("bar0a"));
+        ASSERT_OK(s);
+        s = txn0->Put(Slice("foo0"), Slice("bar0b"));
+        ASSERT_OK(s);
+        s = txn0->Put(Slice("foo1"), Slice("bar1"));
+        ASSERT_OK(s);
+        s = txn0->Merge(Slice("foo2"), Slice("bar2a"));
+        ASSERT_OK(s);
+        // Repeat a key after the start of a sub-patch. This should not cause
+        // a duplicate in the most recent sub-patch and hence does not create
+        // a new sub-patch.
+        s = txn0->Put(Slice("foo0"), Slice("bar0c"));
+        ASSERT_OK(s);
+        s = txn0->Merge(Slice("foo2"), Slice("bar2b"));
+        ASSERT_OK(s);
+        // duplicate the keys but in a different cf. It should not be counted
+        // as a duplicate.
+ s = txn0->Put(cf_handle, Slice("foo0"), Slice("bar0-cf1")); + ASSERT_OK(s); + s = txn0->Put(Slice("foo3"), Slice("bar3")); + ASSERT_OK(s); + s = txn0->Merge(Slice("foo3"), Slice("bar3")); + ASSERT_OK(s); + s = txn0->Put(Slice("foo4"), Slice("bar4")); + ASSERT_OK(s); + s = txn0->Delete(Slice("foo4")); + ASSERT_OK(s); + s = txn0->SingleDelete(Slice("foo4")); + ASSERT_OK(s); + if (do_prepare) { + s = txn0->Prepare(); + ASSERT_OK(s); + } + if (do_rollback) { + // Test rolling back the batch with duplicates + s = txn0->Rollback(); + ASSERT_OK(s); + } else { + if (with_commit_batch) { + assert(do_prepare); + auto cb = txn0->GetCommitTimeWriteBatch(); + // duplicate a key in the original batch + // TODO(myabandeh): the behavior of GetCommitTimeWriteBatch + // conflicting with the prepared batch is currently undefined and + // gives different results in different implementations. + + // s = cb->Put(Slice("foo0"), Slice("bar0d")); + // ASSERT_OK(s); + // add a new duplicate key + s = cb->Put(Slice("foo6"), Slice("bar6a")); + ASSERT_OK(s); + s = cb->Put(Slice("foo6"), Slice("bar6b")); + ASSERT_OK(s); + // add a duplicate key that is removed in the same batch + s = cb->Put(Slice("foo7"), Slice("bar7a")); + ASSERT_OK(s); + s = cb->Delete(Slice("foo7")); + ASSERT_OK(s); + } + s = txn0->Commit(); + ASSERT_OK(s); + } + delete txn0; + ReadOptions ropt; + PinnableSlice pinnable_val; + + if (do_rollback) { + s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + s = db->Get(ropt, cf_handle, "foo0", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo1", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo2", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo3", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo4", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + } else { + s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar0c")); + s = db->Get(ropt, cf_handle, "foo0", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar0-cf1")); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo1", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar1")); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo2", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar2a,bar2b")); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo3", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar3,bar3")); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo4", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + if (with_commit_batch) { + s = db->Get(ropt, db->DefaultColumnFamily(), "foo6", &pinnable_val); + if (txn_db_options.write_policy == + TxnDBWritePolicy::WRITE_COMMITTED) { + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar6b")); + } else { + ASSERT_TRUE(s.IsNotFound()); + } + s = db->Get(ropt, db->DefaultColumnFamily(), "foo7", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + } + } + delete cf_handle; + } // with_commit_batch + } // do_rollback + } // do_prepare + + if (!options.unordered_write) { + // Also test with max_successive_merges > 0. max_successive_merges will not + // affect our algorithm for duplicate key insertion but we add the test to + // verify that. 
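+  // (max_successive_merges caps how many consecutive Merge operands may
+  // accumulate for a key in the memtable before they are folded eagerly into
+  // a full value; either way, the read below sees all four operands applied
+  // in order.)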
+    cf_options.max_successive_merges = 2;
+    cf_options.merge_operator = MergeOperators::CreateStringAppendOperator();
+    ASSERT_OK(ReOpen());
+    ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle));
+    WriteOptions write_options;
+    // Ensure one value for the key
+    ASSERT_OK(db->Put(write_options, cf_handle, Slice("key"), Slice("value")));
+    WriteBatch batch;
+    // Merge more than max_successive_merges times
+    ASSERT_OK(batch.Merge(cf_handle, Slice("key"), Slice("1")));
+    ASSERT_OK(batch.Merge(cf_handle, Slice("key"), Slice("2")));
+    ASSERT_OK(batch.Merge(cf_handle, Slice("key"), Slice("3")));
+    ASSERT_OK(batch.Merge(cf_handle, Slice("key"), Slice("4")));
+    ASSERT_OK(db->Write(write_options, &batch));
+    ReadOptions read_options;
+    std::string value;
+    ASSERT_OK(db->Get(read_options, cf_handle, "key", &value));
+    ASSERT_EQ(value, "value,1,2,3,4");
+    delete cf_handle;
+  }
+
+  {
+    // Test that the duplicate detection is not compromised after rolling back
+    // to a save point
+    TransactionOptions txn_options;
+    WriteOptions write_options;
+    Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+    ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0a")));
+    ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0b")));
+    txn0->SetSavePoint();
+    ASSERT_OK(txn0->RollbackToSavePoint());
+    ASSERT_OK(txn0->Commit());
+    delete txn0;
+  }
+
+  // Test successful recovery after a crash
+  {
+    ASSERT_OK(ReOpen());
+    TransactionOptions txn_options;
+    WriteOptions write_options;
+    ReadOptions ropt;
+    Transaction* txn0;
+    PinnableSlice pinnable_val;
+    Status s;
+
+    std::unique_ptr<const Comparator> comp_gc(new ThreeBytewiseComparator());
+    cf_options.comparator = comp_gc.get();
+    cf_options.merge_operator = MergeOperators::CreateStringAppendOperator();
+    ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle));
+    delete cf_handle;
+    std::vector<ColumnFamilyDescriptor> cfds{
+        ColumnFamilyDescriptor(kDefaultColumnFamilyName,
+                               ColumnFamilyOptions(options)),
+        ColumnFamilyDescriptor(cf_name, cf_options),
+    };
+    std::vector<ColumnFamilyHandle*> handles;
+    ASSERT_OK(ReOpenNoDelete(cfds, &handles));
+
+    assert(db != nullptr);
+    ASSERT_OK(db->Put(write_options, "foo0", "init"));
+    ASSERT_OK(db->Put(write_options, "foo1", "init"));
+    ASSERT_OK(db->Put(write_options, handles[1], "foo0", "init"));
+    ASSERT_OK(db->Put(write_options, handles[1], "foo1", "init"));
+
+    // one entry
+    txn0 = db->BeginTransaction(write_options, txn_options);
+    ASSERT_OK(txn0->SetName("xid"));
+    ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0a")));
+    ASSERT_OK(txn0->Prepare());
+    delete txn0;
+    // This will check the asserts inside recovery code
+    ASSERT_OK(db->FlushWAL(true));
+    reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
+    ASSERT_OK(ReOpenNoDelete(cfds, &handles));
+    txn0 = db->GetTransactionByName("xid");
+    ASSERT_TRUE(txn0 != nullptr);
+    ASSERT_OK(txn0->Commit());
+    delete txn0;
+    s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
+    ASSERT_OK(s);
+    ASSERT_TRUE(pinnable_val == ("bar0a"));
+
+    // two entries, no duplicate
+    txn0 = db->BeginTransaction(write_options, txn_options);
+    ASSERT_OK(txn0->SetName("xid"));
+    ASSERT_OK(txn0->Put(handles[1], Slice("foo0"), Slice("bar0b")));
+    ASSERT_OK(txn0->Put(handles[1], Slice("fol1"), Slice("bar1b")));
+    ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0b")));
+    ASSERT_OK(txn0->Put(Slice("foo1"), Slice("bar1b")));
+    ASSERT_OK(txn0->Prepare());
+    delete txn0;
+    // This will check the asserts inside recovery code
+    ASSERT_OK(db->FlushWAL(true));
+    // Flush only
cf 1 + ASSERT_OK(static_cast_with_check<DBImpl>(db->GetRootDB()) + ->TEST_FlushMemTable(true, false, handles[1])); + reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash(); + ASSERT_OK(ReOpenNoDelete(cfds, &handles)); + txn0 = db->GetTransactionByName("xid"); + ASSERT_TRUE(txn0 != nullptr); + ASSERT_OK(txn0->Commit()); + delete txn0; + pinnable_val.Reset(); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar0b")); + pinnable_val.Reset(); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo1", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar1b")); + pinnable_val.Reset(); + s = db->Get(ropt, handles[1], "foo0", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar0b")); + pinnable_val.Reset(); + s = db->Get(ropt, handles[1], "fol1", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar1b")); + + // one duplicate with ::Put + txn0 = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn0->SetName("xid")); + ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey0"), Slice("bar0c"))); + ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey1"), Slice("bar1d"))); + ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0c"))); + ASSERT_OK(txn0->Put(Slice("foo1"), Slice("bar1c"))); + ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0d"))); + ASSERT_OK(txn0->Prepare()); + delete txn0; + // This will check the asserts inside recovery code + ASSERT_OK(db->FlushWAL(true)); + // Flush only cf 1 + ASSERT_OK(static_cast_with_check<DBImpl>(db->GetRootDB()) + ->TEST_FlushMemTable(true, false, handles[1])); + reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash(); + ASSERT_OK(ReOpenNoDelete(cfds, &handles)); + txn0 = db->GetTransactionByName("xid"); + ASSERT_TRUE(txn0 != nullptr); + ASSERT_OK(txn0->Commit()); + delete txn0; + pinnable_val.Reset(); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar0d")); + pinnable_val.Reset(); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo1", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar1c")); + pinnable_val.Reset(); + s = db->Get(ropt, handles[1], "key-nonkey2", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar1d")); + + // Duplicate with ::Put, ::Delete + txn0 = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn0->SetName("xid")); + ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey0"), Slice("bar0e"))); + ASSERT_OK(txn0->Delete(handles[1], Slice("key-nonkey1"))); + ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0e"))); + ASSERT_OK(txn0->Delete(Slice("foo0"))); + ASSERT_OK(txn0->Prepare()); + delete txn0; + // This will check the asserts inside recovery code + ASSERT_OK(db->FlushWAL(true)); + // Flush only cf 1 + ASSERT_OK(static_cast_with_check<DBImpl>(db->GetRootDB()) + ->TEST_FlushMemTable(true, false, handles[1])); + reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash(); + ASSERT_OK(ReOpenNoDelete(cfds, &handles)); + txn0 = db->GetTransactionByName("xid"); + ASSERT_TRUE(txn0 != nullptr); + ASSERT_OK(txn0->Commit()); + delete txn0; + pinnable_val.Reset(); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + pinnable_val.Reset(); + s = db->Get(ropt, handles[1], "key-nonkey2", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + + // Duplicate with ::Put, ::SingleDelete + txn0 = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn0->SetName("xid")); + 
ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey0"), Slice("bar0g"))); + ASSERT_OK(txn0->SingleDelete(handles[1], Slice("key-nonkey1"))); + ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0e"))); + ASSERT_OK(txn0->SingleDelete(Slice("foo0"))); + ASSERT_OK(txn0->Prepare()); + delete txn0; + // This will check the asserts inside recovery code + ASSERT_OK(db->FlushWAL(true)); + // Flush only cf 1 + ASSERT_OK(static_cast_with_check<DBImpl>(db->GetRootDB()) + ->TEST_FlushMemTable(true, false, handles[1])); + reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash(); + ASSERT_OK(ReOpenNoDelete(cfds, &handles)); + txn0 = db->GetTransactionByName("xid"); + ASSERT_TRUE(txn0 != nullptr); + ASSERT_OK(txn0->Commit()); + delete txn0; + pinnable_val.Reset(); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + pinnable_val.Reset(); + s = db->Get(ropt, handles[1], "key-nonkey2", &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + + // Duplicate with ::Put, ::Merge + txn0 = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn0->SetName("xid")); + ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey0"), Slice("bar1i"))); + ASSERT_OK(txn0->Merge(handles[1], Slice("key-nonkey1"), Slice("bar1j"))); + ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0f"))); + ASSERT_OK(txn0->Merge(Slice("foo0"), Slice("bar0g"))); + ASSERT_OK(txn0->Prepare()); + delete txn0; + // This will check the asserts inside recovery code + ASSERT_OK(db->FlushWAL(true)); + // Flush only cf 1 + ASSERT_OK(static_cast_with_check<DBImpl>(db->GetRootDB()) + ->TEST_FlushMemTable(true, false, handles[1])); + reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash(); + ASSERT_OK(ReOpenNoDelete(cfds, &handles)); + txn0 = db->GetTransactionByName("xid"); + ASSERT_TRUE(txn0 != nullptr); + ASSERT_OK(txn0->Commit()); + delete txn0; + pinnable_val.Reset(); + s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar0f,bar0g")); + pinnable_val.Reset(); + s = db->Get(ropt, handles[1], "key-nonkey2", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == ("bar1i,bar1j")); + + for (auto h : handles) { + delete h; + } + delete db; + db = nullptr; + } +} + +// Test that the reseek optimization in iterators will not result in an infinite +// loop if there are too many uncommitted entries before the snapshot. 
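+// (read_options.max_skippable_internal_keys in the test below acts as a
+// backstop: if the reseek optimization regressed into an endless loop over
+// the uncommitted duplicates, the iterator would presumably abort with an
+// Incomplete status instead of hanging the test.)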
+TEST_P(TransactionTest, ReseekOptimization) {
+ WriteOptions write_options;
+ write_options.sync = true;
+ write_options.disableWAL = false;
+ ColumnFamilyDescriptor cfd;
+ ASSERT_OK(db->DefaultColumnFamily()->GetDescriptor(&cfd));
+ auto max_skip = cfd.options.max_sequential_skip_in_iterations;
+
+ ASSERT_OK(db->Put(write_options, Slice("foo0"), Slice("initv")));
+
+ TransactionOptions txn_options;
+ Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_OK(txn0->SetName("xid"));
+ // Duplicate keys will result in separate sequence numbers in WritePrepared
+ // and WriteUnPrepared
+ for (size_t i = 0; i < 2 * max_skip; i++) {
+ ASSERT_OK(txn0->Put(Slice("foo1"), Slice("bar")));
+ }
+ ASSERT_OK(txn0->Prepare());
+ ASSERT_OK(db->Put(write_options, Slice("foo2"), Slice("initv")));
+
+ ReadOptions read_options;
+ // To avoid loops
+ read_options.max_skippable_internal_keys = 10 * max_skip;
+ Iterator* iter = db->NewIterator(read_options);
+ ASSERT_OK(iter->status());
+ size_t cnt = 0;
+ iter->SeekToFirst();
+ while (iter->Valid()) {
+ iter->Next();
+ ASSERT_OK(iter->status());
+ cnt++;
+ }
+ ASSERT_EQ(cnt, 2);
+ cnt = 0;
+ iter->SeekToLast();
+ while (iter->Valid()) {
+ iter->Prev();
+ ASSERT_OK(iter->status());
+ cnt++;
+ }
+ ASSERT_EQ(cnt, 2);
+ delete iter;
+ ASSERT_OK(txn0->Rollback());
+ delete txn0;
+}
+
+// After recovery in kPointInTimeRecovery mode, the corrupted log file remains
+// there. The new log files should still be read successfully during recovery
+// after the 2nd crash.
+TEST_P(TransactionTest, DoubleCrashInRecovery) {
+ for (const bool manual_wal_flush : {false, true}) {
+ for (const bool write_after_recovery : {false, true}) {
+ options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+ options.manual_wal_flush = manual_wal_flush;
+ ASSERT_OK(ReOpen());
+ std::string cf_name = "two";
+ ColumnFamilyOptions cf_options;
+ ColumnFamilyHandle* cf_handle = nullptr;
+ ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle));
+
+ // Add a prepare entry to prevent the older logs from being deleted.
+ WriteOptions write_options;
+ TransactionOptions txn_options;
+ Transaction* txn = db->BeginTransaction(write_options, txn_options);
+ ASSERT_OK(txn->SetName("xid"));
+ ASSERT_OK(txn->Put(Slice("foo-prepare"), Slice("bar-prepare")));
+ ASSERT_OK(txn->Prepare());
+
+ FlushOptions flush_ops;
+ ASSERT_OK(db->Flush(flush_ops));
+ // Now we have a log that cannot be deleted
+
+ ASSERT_OK(db->Put(write_options, cf_handle, "foo1", "bar1"));
+ // Flush only the 2nd cf
+ ASSERT_OK(db->Flush(flush_ops, cf_handle));
+
+ // The value is large enough to be touched by the corruption we ingest
+ // below.
+ std::string large_value(400, ' ');
+ // key/value not touched by corruption
+ ASSERT_OK(db->Put(write_options, "foo2", "bar2"));
+ // key/value touched by corruption
+ ASSERT_OK(db->Put(write_options, "foo3", large_value));
+ // key/value not touched by corruption
+ ASSERT_OK(db->Put(write_options, "foo4", "bar4"));
+
+ ASSERT_OK(db->FlushWAL(true));
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ uint64_t wal_file_id = db_impl->TEST_LogfileNumber();
+ std::string fname = LogFileName(dbname, wal_file_id);
+ reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
+ delete txn;
+ delete cf_handle;
+ delete db;
+ db = nullptr;
+
+ // Corrupt the last log file in the middle, so that it is not corrupted
+ // in the tail.
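+ // (The 400-byte large_value written to "foo3" above is what should
+ // guarantee that offsets 400-401, patched below, fall inside a record
+ // body rather than in the log's unsynced tail.)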
+ std::string file_content; + ASSERT_OK(ReadFileToString(env, fname, &file_content)); + file_content[400] = 'h'; + file_content[401] = 'a'; + ASSERT_OK(env->DeleteFile(fname)); + ASSERT_OK(WriteStringToFile(env, file_content, fname, true)); + + // Recover from corruption + std::vector<ColumnFamilyHandle*> handles; + std::vector<ColumnFamilyDescriptor> column_families; + column_families.push_back(ColumnFamilyDescriptor(kDefaultColumnFamilyName, + ColumnFamilyOptions())); + column_families.push_back( + ColumnFamilyDescriptor("two", ColumnFamilyOptions())); + ASSERT_OK(ReOpenNoDelete(column_families, &handles)); + assert(db != nullptr); + + if (write_after_recovery) { + // Write data to the log right after the corrupted log + ASSERT_OK(db->Put(write_options, "foo5", large_value)); + } + + // Persist data written to WAL during recovery or by the last Put + ASSERT_OK(db->FlushWAL(true)); + // 2nd crash to recover while having a valid log after the corrupted one. + ASSERT_OK(ReOpenNoDelete(column_families, &handles)); + assert(db != nullptr); + txn = db->GetTransactionByName("xid"); + ASSERT_TRUE(txn != nullptr); + ASSERT_OK(txn->Commit()); + delete txn; + for (auto handle : handles) { + delete handle; + } + } + } +} + +TEST_P(TransactionTest, CommitWithoutPrepare) { + { + // skip_prepare = false. + WriteOptions write_options; + TransactionOptions txn_options; + txn_options.skip_prepare = false; + Transaction* txn = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn->Commit().IsTxnNotPrepared()); + delete txn; + } + + { + // skip_prepare = true. + WriteOptions write_options; + TransactionOptions txn_options; + txn_options.skip_prepare = true; + Transaction* txn = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn->Commit()); + delete txn; + } +} + +TEST_P(TransactionTest, OpenAndEnableU64Timestamp) { + ASSERT_OK(ReOpenNoDelete()); + + assert(db); + + const std::string test_cf_name = "test_cf"; + ColumnFamilyOptions cf_opts; + cf_opts.comparator = test::BytewiseComparatorWithU64TsWrapper(); + { + ColumnFamilyHandle* cfh = nullptr; + const Status s = db->CreateColumnFamily(cf_opts, test_cf_name, &cfh); + if (txn_db_options.write_policy == WRITE_COMMITTED) { + ASSERT_OK(s); + delete cfh; + } else { + ASSERT_TRUE(s.IsNotSupported()); + assert(!cfh); + } + } + + // Bypass transaction db layer. 
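+ // (Only the non-WRITE_COMMITTED policies take this path: they reject
+ // timestamped column families at the TransactionDB layer above, so the
+ // CF is created directly on the underlying DBImpl to set up the reopen
+ // check that follows.)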
+ if (txn_db_options.write_policy != WRITE_COMMITTED) {
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ assert(db_impl);
+ ColumnFamilyHandle* cfh = nullptr;
+ ASSERT_OK(db_impl->CreateColumnFamily(cf_opts, test_cf_name, &cfh));
+ delete cfh;
+ }
+
+ {
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+ cf_descs.emplace_back(test_cf_name, cf_opts);
+ std::vector<ColumnFamilyHandle*> handles;
+ const Status s = ReOpenNoDelete(cf_descs, &handles);
+ if (txn_db_options.write_policy == WRITE_COMMITTED) {
+ ASSERT_OK(s);
+ for (auto* h : handles) {
+ delete h;
+ }
+ } else {
+ ASSERT_TRUE(s.IsNotSupported());
+ }
+ }
+}
+
+TEST_P(TransactionTest, OpenAndEnableU32Timestamp) {
+ class DummyComparatorWithU32Ts : public Comparator {
+ public:
+ DummyComparatorWithU32Ts() : Comparator(sizeof(uint32_t)) {}
+ const char* Name() const override { return "DummyComparatorWithU32Ts"; }
+ void FindShortSuccessor(std::string*) const override {}
+ void FindShortestSeparator(std::string*, const Slice&) const override {}
+ int Compare(const Slice&, const Slice&) const override { return 0; }
+ };
+
+ std::unique_ptr<Comparator> dummy_ucmp(new DummyComparatorWithU32Ts());
+
+ ASSERT_OK(ReOpenNoDelete());
+
+ assert(db);
+
+ const std::string test_cf_name = "test_cf";
+
+ ColumnFamilyOptions cf_opts;
+ cf_opts.comparator = dummy_ucmp.get();
+ {
+ ColumnFamilyHandle* cfh = nullptr;
+ ASSERT_TRUE(db->CreateColumnFamily(cf_opts, test_cf_name, &cfh)
+ .IsInvalidArgument());
+ }
+
+ // Bypass transaction db layer.
+ {
+ ColumnFamilyHandle* cfh = nullptr;
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ assert(db_impl);
+ ASSERT_OK(db_impl->CreateColumnFamily(cf_opts, test_cf_name, &cfh));
+ delete cfh;
+ }
+
+ {
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+ cf_descs.emplace_back(test_cf_name, cf_opts);
+ std::vector<ColumnFamilyHandle*> handles;
+ ASSERT_TRUE(ReOpenNoDelete(cf_descs, &handles).IsInvalidArgument());
+ }
+}
+
+TEST_P(TransactionTest, WriteWithBulkCreatedColumnFamilies) {
+ ColumnFamilyOptions cf_options;
+ WriteOptions write_options;
+
+ std::vector<std::string> cf_names;
+ std::vector<ColumnFamilyHandle*> cf_handles;
+
+ cf_names.push_back("test_cf");
+
+ ASSERT_OK(db->CreateColumnFamilies(cf_options, cf_names, &cf_handles));
+ ASSERT_OK(db->Put(write_options, cf_handles[0], "foo", "bar"));
+ ASSERT_OK(db->DropColumnFamilies(cf_handles));
+
+ for (auto* h : cf_handles) {
+ delete h;
+ }
+ cf_handles.clear();
+
+ std::vector<ColumnFamilyDescriptor> cf_descriptors;
+
+ cf_descriptors.emplace_back("test_cf", ColumnFamilyOptions());
+
+ // Exercise the descriptor-based overload this time.
+ ASSERT_OK(db->CreateColumnFamilies(cf_descriptors, &cf_handles));
+ ASSERT_OK(db->Put(write_options, cf_handles[0], "foo", "bar"));
+ ASSERT_OK(db->DropColumnFamilies(cf_handles));
+ for (auto* h : cf_handles) {
+ delete h;
+ }
+ cf_handles.clear();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as Transactions are not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/transaction_test.h b/src/rocksdb/utilities/transactions/transaction_test.h new file mode 100644 index
000000000..0b86453a4 --- /dev/null +++ b/src/rocksdb/utilities/transactions/transaction_test.h @@ -0,0 +1,578 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <algorithm>
+#include <cinttypes>
+#include <functional>
+#include <string>
+#include <thread>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "table/mock_table.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "test_util/transaction_test_util.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/string_append/stringappend.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+#include "utilities/transactions/write_unprepared_txn_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Return true if the ith bit is set in the combination represented by comb
+bool IsInCombination(size_t i, size_t comb) { return comb & (size_t(1) << i); }
+
+enum WriteOrdering : bool { kOrderedWrite, kUnorderedWrite };
+
+class TransactionTestBase : public ::testing::Test {
+ public:
+ TransactionDB* db;
+ SpecialEnv special_env;
+ FaultInjectionTestEnv* env;
+ std::string dbname;
+ Options options;
+
+ TransactionDBOptions txn_db_options;
+ bool use_stackable_db_;
+
+ TransactionTestBase(bool use_stackable_db, bool two_write_queue,
+ TxnDBWritePolicy write_policy,
+ WriteOrdering write_ordering)
+ : db(nullptr),
+ special_env(Env::Default()),
+ env(nullptr),
+ use_stackable_db_(use_stackable_db) {
+ options.create_if_missing = true;
+ options.max_write_buffer_number = 2;
+ options.write_buffer_size = 4 * 1024;
+ options.unordered_write = write_ordering == kUnorderedWrite;
+ options.level0_file_num_compaction_trigger = 2;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+ special_env.skip_fsync_ = true;
+ env = new FaultInjectionTestEnv(&special_env);
+ options.env = env;
+ options.two_write_queues = two_write_queue;
+ dbname = test::PerThreadDBPath("transaction_testdb");
+
+ EXPECT_OK(DestroyDB(dbname, options));
+ txn_db_options.transaction_lock_timeout = 0;
+ txn_db_options.default_lock_timeout = 0;
+ txn_db_options.write_policy = write_policy;
+ txn_db_options.rollback_merge_operands = true;
+ // This will stress write unprepared, by forcing write batch flush on every
+ // write.
+ txn_db_options.default_write_batch_flush_threshold = 1;
+ // Write unprepared requires all transactions to be named. This setting
+ // autogenerates the name so that existing tests can pass.
+ txn_db_options.autogenerate_name = true;
+ Status s;
+ if (use_stackable_db == false) {
+ s = TransactionDB::Open(options, txn_db_options, dbname, &db);
+ } else {
+ s = OpenWithStackableDB();
+ }
+ EXPECT_OK(s);
+ }
+
+ ~TransactionTestBase() {
+ delete db;
+ db = nullptr;
+ // This is to skip the assert statement in FaultInjectionTestEnv. There
+ // seems to be a bug in btrfs that makes readdir return recently
+ // unlinked files. By using the default fs we simply ignore errors
+ // resulting from attempting to delete such files in DestroyDB.
+ if (getenv("KEEP_DB") == nullptr) {
+ options.env = Env::Default();
+ EXPECT_OK(DestroyDB(dbname, options));
+ } else {
+ fprintf(stdout, "db is still in %s\n", dbname.c_str());
+ }
+ delete env;
+ }
+
+ Status ReOpenNoDelete() {
+ delete db;
+ db = nullptr;
+ env->AssertNoOpenFile();
+ env->DropUnsyncedFileData();
+ env->ResetState();
+ Status s;
+ if (use_stackable_db_ == false) {
+ s = TransactionDB::Open(options, txn_db_options, dbname, &db);
+ } else {
+ s = OpenWithStackableDB();
+ }
+ assert(!s.ok() || db != nullptr);
+ return s;
+ }
+
+ Status ReOpenNoDelete(std::vector<ColumnFamilyDescriptor>& cfs,
+ std::vector<ColumnFamilyHandle*>* handles) {
+ for (auto h : *handles) {
+ delete h;
+ }
+ handles->clear();
+ delete db;
+ db = nullptr;
+ env->AssertNoOpenFile();
+ env->DropUnsyncedFileData();
+ env->ResetState();
+ Status s;
+ if (use_stackable_db_ == false) {
+ s = TransactionDB::Open(options, txn_db_options, dbname, cfs, handles,
+ &db);
+ } else {
+ s = OpenWithStackableDB(cfs, handles);
+ }
+ assert(!s.ok() || db != nullptr);
+ return s;
+ }
+
+ Status ReOpen() {
+ delete db;
+ db = nullptr;
+ DestroyDB(dbname, options);
+ Status s;
+ if (use_stackable_db_ == false) {
+ s = TransactionDB::Open(options, txn_db_options, dbname, &db);
+ } else {
+ s = OpenWithStackableDB();
+ }
+ assert(db != nullptr);
+ return s;
+ }
+
+ Status OpenWithStackableDB(std::vector<ColumnFamilyDescriptor>& cfs,
+ std::vector<ColumnFamilyHandle*>* handles) {
+ std::vector<size_t> compaction_enabled_cf_indices;
+ TransactionDB::PrepareWrap(&options, &cfs, &compaction_enabled_cf_indices);
+ DB* root_db = nullptr;
+ Options options_copy(options);
+ const bool use_seq_per_batch =
+ txn_db_options.write_policy == WRITE_PREPARED ||
+ txn_db_options.write_policy == WRITE_UNPREPARED;
+ const bool use_batch_per_txn =
+ txn_db_options.write_policy == WRITE_COMMITTED ||
+ txn_db_options.write_policy == WRITE_PREPARED;
+ Status s = DBImpl::Open(options_copy, dbname, cfs, handles, &root_db,
+ use_seq_per_batch, use_batch_per_txn);
+ auto stackable_db = std::make_unique<StackableDB>(root_db);
+ if (s.ok()) {
+ assert(root_db != nullptr);
+ // If WrapStackableDB() returns non-ok, then stackable_db is already
+ // deleted within WrapStackableDB().
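+ // (Hence the release() below: once called, WrapStackableDB() owns the
+ // raw pointer on both success and failure, so the unique_ptr must not
+ // delete it a second time.)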
+ s = TransactionDB::WrapStackableDB(stackable_db.release(), txn_db_options, + compaction_enabled_cf_indices, + *handles, &db); + } + return s; + } + + Status OpenWithStackableDB() { + std::vector<size_t> compaction_enabled_cf_indices; + std::vector<ColumnFamilyDescriptor> column_families{ColumnFamilyDescriptor( + kDefaultColumnFamilyName, ColumnFamilyOptions(options))}; + + TransactionDB::PrepareWrap(&options, &column_families, + &compaction_enabled_cf_indices); + std::vector<ColumnFamilyHandle*> handles; + DB* root_db = nullptr; + Options options_copy(options); + const bool use_seq_per_batch = + txn_db_options.write_policy == WRITE_PREPARED || + txn_db_options.write_policy == WRITE_UNPREPARED; + const bool use_batch_per_txn = + txn_db_options.write_policy == WRITE_COMMITTED || + txn_db_options.write_policy == WRITE_PREPARED; + Status s = DBImpl::Open(options_copy, dbname, column_families, &handles, + &root_db, use_seq_per_batch, use_batch_per_txn); + if (!s.ok()) { + delete root_db; + return s; + } + StackableDB* stackable_db = new StackableDB(root_db); + assert(root_db != nullptr); + assert(handles.size() == 1); + s = TransactionDB::WrapStackableDB(stackable_db, txn_db_options, + compaction_enabled_cf_indices, handles, + &db); + delete handles[0]; + if (!s.ok()) { + delete stackable_db; + } + return s; + } + + std::atomic<size_t> linked = {0}; + std::atomic<size_t> exp_seq = {0}; + std::atomic<size_t> commit_writes = {0}; + std::atomic<size_t> expected_commits = {0}; + // Without Prepare, the commit does not write to WAL + std::atomic<size_t> with_empty_commits = {0}; + void TestTxn0(size_t index) { + // Test DB's internal txn. It involves no prepare phase nor a commit marker. + auto s = db->Put(WriteOptions(), "key" + std::to_string(index), "value"); + ASSERT_OK(s); + if (txn_db_options.write_policy == TxnDBWritePolicy::WRITE_COMMITTED) { + // Consume one seq per key + exp_seq++; + } else { + // Consume one seq per batch + exp_seq++; + if (options.two_write_queues) { + // Consume one seq for commit + exp_seq++; + } + } + with_empty_commits++; + } + + void TestTxn1(size_t index) { + // Testing directly writing a write batch. Functionality-wise it is + // equivalent to commit without prepare. + WriteBatch wb; + auto istr = std::to_string(index); + ASSERT_OK(wb.Put("k1" + istr, "v1")); + ASSERT_OK(wb.Put("k2" + istr, "v2")); + ASSERT_OK(wb.Put("k3" + istr, "v3")); + auto s = db->Write(WriteOptions(), &wb); + if (txn_db_options.write_policy == TxnDBWritePolicy::WRITE_COMMITTED) { + // Consume one seq per key + exp_seq += 3; + } else { + // Consume one seq per batch + exp_seq++; + if (options.two_write_queues) { + // Consume one seq for commit + exp_seq++; + } + } + ASSERT_OK(s); + with_empty_commits++; + } + + void TestTxn2(size_t index) { + // Commit without prepare. It should write to DB without a commit marker. 
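+ // (Here "commit marker" loosely means the separate commit-time WAL entry
+ // written by the 2PC-style policies; compare TestTxn3 below, which goes
+ // through Prepare() and does consume one.)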
+ Transaction* txn =
+ db->BeginTransaction(WriteOptions(), TransactionOptions());
+ auto istr = std::to_string(index);
+ ASSERT_OK(txn->SetName("xid" + istr));
+ ASSERT_OK(txn->Put(Slice("foo" + istr), Slice("bar")));
+ ASSERT_OK(txn->Put(Slice("foo2" + istr), Slice("bar2")));
+ ASSERT_OK(txn->Put(Slice("foo3" + istr), Slice("bar3")));
+ ASSERT_OK(txn->Put(Slice("foo4" + istr), Slice("bar4")));
+ ASSERT_OK(txn->Commit());
+ if (txn_db_options.write_policy == TxnDBWritePolicy::WRITE_COMMITTED) {
+ // Consume one seq per key
+ exp_seq += 4;
+ } else if (txn_db_options.write_policy ==
+ TxnDBWritePolicy::WRITE_PREPARED) {
+ // Consume one seq per batch
+ exp_seq++;
+ if (options.two_write_queues) {
+ // Consume one seq for commit
+ exp_seq++;
+ }
+ } else {
+ // Flushed after each key, consume one seq per flushed batch
+ exp_seq += 4;
+ // WriteUnprepared implements CommitWithoutPrepareInternal by simply
+ // calling Prepare then Commit. Consume one seq for the prepare.
+ exp_seq++;
+ }
+ delete txn;
+ with_empty_commits++;
+ }
+
+ void TestTxn3(size_t index) {
+ // A full 2pc txn that also involves a commit marker.
+ Transaction* txn =
+ db->BeginTransaction(WriteOptions(), TransactionOptions());
+ auto istr = std::to_string(index);
+ ASSERT_OK(txn->SetName("xid" + istr));
+ ASSERT_OK(txn->Put(Slice("foo" + istr), Slice("bar")));
+ ASSERT_OK(txn->Put(Slice("foo2" + istr), Slice("bar2")));
+ ASSERT_OK(txn->Put(Slice("foo3" + istr), Slice("bar3")));
+ ASSERT_OK(txn->Put(Slice("foo4" + istr), Slice("bar4")));
+ ASSERT_OK(txn->Put(Slice("foo5" + istr), Slice("bar5")));
+ expected_commits++;
+ ASSERT_OK(txn->Prepare());
+ commit_writes++;
+ ASSERT_OK(txn->Commit());
+ if (txn_db_options.write_policy == TxnDBWritePolicy::WRITE_COMMITTED) {
+ // Consume one seq per key
+ exp_seq += 5;
+ } else if (txn_db_options.write_policy ==
+ TxnDBWritePolicy::WRITE_PREPARED) {
+ // Consume one seq per batch
+ exp_seq++;
+ // Consume one seq per commit marker
+ exp_seq++;
+ } else {
+ // Flushed after each key, consume one seq per flushed batch
+ exp_seq += 5;
+ // Consume one seq per commit marker
+ exp_seq++;
+ }
+ delete txn;
+ }
+
+ void TestTxn4(size_t index) {
+ // A full 2pc txn that is prepared but then rolled back instead of
+ // committed.
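+ // (Rolling back a prepared transaction must itself be made durable; the
+ // per-policy sequence accounting below reflects the extra rollback batch,
+ // plus a closing marker when two_write_queues is on.)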
+ Transaction* txn = + db->BeginTransaction(WriteOptions(), TransactionOptions()); + auto istr = std::to_string(index); + ASSERT_OK(txn->SetName("xid" + istr)); + ASSERT_OK(txn->Put(Slice("foo" + istr), Slice("bar"))); + ASSERT_OK(txn->Put(Slice("foo2" + istr), Slice("bar2"))); + ASSERT_OK(txn->Put(Slice("foo3" + istr), Slice("bar3"))); + ASSERT_OK(txn->Put(Slice("foo4" + istr), Slice("bar4"))); + ASSERT_OK(txn->Put(Slice("foo5" + istr), Slice("bar5"))); + expected_commits++; + ASSERT_OK(txn->Prepare()); + commit_writes++; + ASSERT_OK(txn->Rollback()); + if (txn_db_options.write_policy == TxnDBWritePolicy::WRITE_COMMITTED) { + // No seq is consumed for deleting the txn buffer + exp_seq += 0; + } else if (txn_db_options.write_policy == + TxnDBWritePolicy::WRITE_PREPARED) { + // Consume one seq per batch + exp_seq++; + // Consume one seq per rollback batch + exp_seq++; + if (options.two_write_queues) { + // Consume one seq for rollback commit + exp_seq++; + } + } else { + // Flushed after each key, consume one seq per flushed batch + exp_seq += 5; + // Consume one seq per rollback batch + exp_seq++; + if (options.two_write_queues) { + // Consume one seq for rollback commit + exp_seq++; + } + } + delete txn; + } + + // Test that we can change write policy after a clean shutdown (which would + // empty the WAL) + void CrossCompatibilityTest(TxnDBWritePolicy from_policy, + TxnDBWritePolicy to_policy, bool empty_wal) { + TransactionOptions txn_options; + ReadOptions read_options; + WriteOptions write_options; + uint32_t index = 0; + Random rnd(1103); + options.write_buffer_size = 1024; // To create more sst files + std::unordered_map<std::string, std::string> committed_kvs; + Transaction* txn; + + txn_db_options.write_policy = from_policy; + if (txn_db_options.write_policy == WRITE_COMMITTED) { + options.unordered_write = false; + } + ASSERT_OK(ReOpen()); + + for (int i = 0; i < 1024; i++) { + auto istr = std::to_string(index); + auto k = Slice("foo-" + istr).ToString(); + auto v = Slice("bar-" + istr).ToString(); + // For test the duplicate keys + auto v2 = Slice("bar2-" + istr).ToString(); + auto type = rnd.Uniform(4); + switch (type) { + case 0: + committed_kvs[k] = v; + ASSERT_OK(db->Put(write_options, k, v)); + committed_kvs[k] = v2; + ASSERT_OK(db->Put(write_options, k, v2)); + break; + case 1: { + WriteBatch wb; + committed_kvs[k] = v; + ASSERT_OK(wb.Put(k, v)); + committed_kvs[k] = v2; + ASSERT_OK(wb.Put(k, v2)); + ASSERT_OK(db->Write(write_options, &wb)); + + } break; + case 2: + case 3: + txn = db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn->SetName("xid" + istr)); + committed_kvs[k] = v; + ASSERT_OK(txn->Put(k, v)); + committed_kvs[k] = v2; + ASSERT_OK(txn->Put(k, v2)); + + if (type == 3) { + ASSERT_OK(txn->Prepare()); + } + ASSERT_OK(txn->Commit()); + delete txn; + break; + default: + FAIL(); + } + + index++; + } // for i + + txn_db_options.write_policy = to_policy; + if (txn_db_options.write_policy == WRITE_COMMITTED) { + options.unordered_write = false; + } + auto db_impl = static_cast_with_check<DBImpl>(db->GetRootDB()); + // Before upgrade/downgrade the WAL must be emptied + if (empty_wal) { + ASSERT_OK(db_impl->TEST_FlushMemTable()); + } else { + ASSERT_OK(db_impl->FlushWAL(true)); + } + auto s = ReOpenNoDelete(); + if (empty_wal) { + ASSERT_OK(s); + } else { + // Test that we can detect the WAL that is produced by an incompatible + // WritePolicy and fail fast before mis-interpreting the WAL. 
+ ASSERT_TRUE(s.IsNotSupported());
+ return;
+ }
+ db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+ // Check that WAL is empty
+ VectorLogPtr log_files;
+ ASSERT_OK(db_impl->GetSortedWalFiles(log_files));
+ ASSERT_EQ(0, log_files.size());
+
+ for (auto& kv : committed_kvs) {
+ std::string value;
+ s = db->Get(read_options, kv.first, &value);
+ if (s.IsNotFound()) {
+ printf("key = %s\n", kv.first.c_str());
+ }
+ ASSERT_OK(s);
+ if (kv.second != value) {
+ printf("key = %s\n", kv.first.c_str());
+ }
+ ASSERT_EQ(kv.second, value);
+ }
+ }
+};
+
+class TransactionTest
+ : public TransactionTestBase,
+ virtual public ::testing::WithParamInterface<
+ std::tuple<bool, bool, TxnDBWritePolicy, WriteOrdering>> {
+ public:
+ TransactionTest()
+ : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()),
+ std::get<2>(GetParam()), std::get<3>(GetParam())){};
+};
+
+class TransactionStressTest : public TransactionTest {};
+
+class MySQLStyleTransactionTest
+ : public TransactionTestBase,
+ virtual public ::testing::WithParamInterface<
+ std::tuple<bool, bool, TxnDBWritePolicy, WriteOrdering, bool>> {
+ public:
+ MySQLStyleTransactionTest()
+ : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()),
+ std::get<2>(GetParam()), std::get<3>(GetParam())),
+ with_slow_threads_(std::get<4>(GetParam())) {
+ if (with_slow_threads_ &&
+ (txn_db_options.write_policy == WRITE_PREPARED ||
+ txn_db_options.write_policy == WRITE_UNPREPARED)) {
+ // The corner case with slow threads involves the caches filling up,
+ // which would not happen even with artificial delays. To help such
+ // cases show up we lower the size of the cache-related data
+ // structures.
+ txn_db_options.wp_snapshot_cache_bits = 1;
+ txn_db_options.wp_commit_cache_bits = 10;
+ options.write_buffer_size = 1024;
+ EXPECT_OK(ReOpen());
+ }
+ };
+
+ protected:
+ // Also emulate slow threads by adding artificial delays
+ const bool with_slow_threads_;
+};
+
+class WriteCommittedTxnWithTsTest
+ : public TransactionTestBase,
+ public ::testing::WithParamInterface<std::tuple<bool, bool, bool>> {
+ public:
+ WriteCommittedTxnWithTsTest()
+ : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()),
+ WRITE_COMMITTED, kOrderedWrite) {}
+ ~WriteCommittedTxnWithTsTest() override {
+ for (auto* h : handles_) {
+ delete h;
+ }
+ }
+
+ Status GetFromDb(ReadOptions read_opts, ColumnFamilyHandle* column_family,
+ const Slice& key, TxnTimestamp ts, std::string* value) {
+ std::string ts_buf;
+ PutFixed64(&ts_buf, ts);
+ Slice ts_slc = ts_buf;
+ read_opts.timestamp = &ts_slc;
+ assert(db);
+ return db->Get(read_opts, column_family, key, value);
+ }
+
+ Transaction* NewTxn(WriteOptions write_opts, TransactionOptions txn_opts) {
+ assert(db);
+ auto* txn = db->BeginTransaction(write_opts, txn_opts);
+ assert(txn);
+ const bool enable_indexing = std::get<2>(GetParam());
+ if (enable_indexing) {
+ txn->EnableIndexing();
+ } else {
+ txn->DisableIndexing();
+ }
+ return txn;
+ }
+
+ protected:
+ std::vector<ColumnFamilyHandle*> handles_{};
+};
+
+class TimestampedSnapshotWithTsSanityCheck
+ : public TransactionTestBase,
+ public ::testing::WithParamInterface<
+ std::tuple<bool, bool, TxnDBWritePolicy, WriteOrdering>> {
+ public:
+ explicit TimestampedSnapshotWithTsSanityCheck()
+ : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()),
+ std::get<2>(GetParam()), std::get<3>(GetParam())) {}
+ ~TimestampedSnapshotWithTsSanityCheck() override {
+ for (auto* h : handles_) {
+ delete h;
+ }
+ }
+
+ protected:
+ std::vector<ColumnFamilyHandle*> handles_{};
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/transactions/transaction_util.cc b/src/rocksdb/utilities/transactions/transaction_util.cc new file mode 100644 index 000000000..360edc8ec --- /dev/null +++ b/src/rocksdb/utilities/transactions/transaction_util.cc @@ -0,0 +1,206 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/transaction_util.h"
+
+#include <cinttypes>
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "util/cast_util.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status TransactionUtil::CheckKeyForConflicts(
+    DBImpl* db_impl, ColumnFamilyHandle* column_family, const std::string& key,
+    SequenceNumber snap_seq, const std::string* const read_ts, bool cache_only,
+    ReadCallback* snap_checker, SequenceNumber min_uncommitted) {
+ Status result;
+
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ auto cfd = cfh->cfd();
+ SuperVersion* sv = db_impl->GetAndRefSuperVersion(cfd);
+
+ if (sv == nullptr) {
+ result = Status::InvalidArgument("Could not access column family " +
+ cfh->GetName());
+ }
+
+ if (result.ok()) {
+ SequenceNumber earliest_seq =
+ db_impl->GetEarliestMemTableSequenceNumber(sv, true);
+
+ result = CheckKey(db_impl, sv, earliest_seq, snap_seq, key, read_ts,
+ cache_only, snap_checker, min_uncommitted);
+
+ db_impl->ReturnAndCleanupSuperVersion(cfd, sv);
+ }
+
+ return result;
+}
+
+Status TransactionUtil::CheckKey(DBImpl* db_impl, SuperVersion* sv,
+ SequenceNumber earliest_seq,
+ SequenceNumber snap_seq,
+ const std::string& key,
+ const std::string* const read_ts,
+ bool cache_only, ReadCallback* snap_checker,
+ SequenceNumber min_uncommitted) {
+ // When `min_uncommitted` is provided, keys are not always committed
+ // in sequence number order, and `snap_checker` is used to check whether a
+ // specific sequence number in the database is visible to the transaction.
+ // So `snap_checker` must be provided.
+ assert(min_uncommitted == kMaxSequenceNumber || snap_checker != nullptr);
+
+ Status result;
+ bool need_to_read_sst = false;
+
+ // Since it would be too slow to check the SST files, we will only use
+ // the memtables to check whether there have been any recent writes
+ // to this key after it was accessed in this transaction. But if the
+ // Memtables do not contain a long enough history, we must fail the
+ // transaction.
+ if (earliest_seq == kMaxSequenceNumber) {
+ // The age of this memtable is unknown. Cannot rely on it to check
+ // for recent writes. This error shouldn't happen often in practice as
+ // the Memtable should have a valid earliest sequence number except in some
+ // corner cases (such as error cases during recovery).
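+    // In that case we fall back to reading the SST files below, unless the
+    // caller requested a cache-only check, in which case all we can do is
+    // return TryAgain.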
+ need_to_read_sst = true;
+
+ if (cache_only) {
+ result = Status::TryAgain(
+ "Transaction could not check for conflicts as the MemTable does not "
+ "contain a long enough history to check write at SequenceNumber: ",
+ std::to_string(snap_seq));
+ }
+ } else if (snap_seq < earliest_seq || min_uncommitted <= earliest_seq) {
+ // Use <= for min_uncommitted since earliest_seq is actually the largest
+ // seq before this memtable was created
+ need_to_read_sst = true;
+
+ if (cache_only) {
+ // The age of this memtable is too new to use to check for recent
+ // writes.
+ char msg[300];
+ snprintf(msg, sizeof(msg),
+ "Transaction could not check for conflicts for operation at "
+ "SequenceNumber %" PRIu64
+ " as the MemTable only contains changes newer than "
+ "SequenceNumber %" PRIu64
+ ". Increasing the value of the "
+ "max_write_buffer_size_to_maintain option could reduce the "
+ "frequency "
+ "of this error.",
+ snap_seq, earliest_seq);
+ result = Status::TryAgain(msg);
+ }
+ }
+
+ if (result.ok()) {
+ SequenceNumber seq = kMaxSequenceNumber;
+ std::string timestamp;
+ bool found_record_for_key = false;
+
+ // When min_uncommitted == kMaxSequenceNumber, writes are committed in
+ // sequence number order, so only keys larger than `snap_seq` can cause
+ // conflict.
+ // When min_uncommitted != kMaxSequenceNumber, keys lower than
+ // min_uncommitted will not trigger conflicts, while keys larger than
+ // min_uncommitted might create conflicts, so we need to read them out
+ // from the DB and invoke the snap_checker callback to determine. So only
+ // keys lower than min_uncommitted can be skipped.
+ SequenceNumber lower_bound_seq =
+ (min_uncommitted == kMaxSequenceNumber) ? snap_seq : min_uncommitted;
+ Status s = db_impl->GetLatestSequenceForKey(
+ sv, key, !need_to_read_sst, lower_bound_seq, &seq,
+ !read_ts ? nullptr : &timestamp, &found_record_for_key,
+ /*is_blob_index=*/nullptr);
+
+ if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+ result = s;
+ } else if (found_record_for_key) {
+ bool write_conflict = snap_checker == nullptr
+ ? snap_seq < seq
+ : !snap_checker->IsVisible(seq);
+ // Perform conflict checking based on timestamp if applicable.
+ if (!write_conflict && read_ts != nullptr) {
+ ColumnFamilyData* cfd = sv->cfd;
+ assert(cfd);
+ const Comparator* const ucmp = cfd->user_comparator();
+ assert(ucmp);
+ assert(read_ts->size() == ucmp->timestamp_size());
+ assert(read_ts->size() == timestamp.size());
+ // Write conflict if *ts < timestamp.
+ write_conflict = ucmp->CompareTimestamp(*read_ts, timestamp) < 0;
+ }
+ if (write_conflict) {
+ result = Status::Busy();
+ }
+ }
+ }
+
+ return result;
+}
+
+Status TransactionUtil::CheckKeysForConflicts(DBImpl* db_impl,
+ const LockTracker& tracker,
+ bool cache_only) {
+ Status result;
+
+ std::unique_ptr<LockTracker::ColumnFamilyIterator> cf_it(
+ tracker.GetColumnFamilyIterator());
+ assert(cf_it != nullptr);
+ while (cf_it->HasNext()) {
+ ColumnFamilyId cf = cf_it->Next();
+
+ SuperVersion* sv = db_impl->GetAndRefSuperVersion(cf);
+ if (sv == nullptr) {
+ result = Status::InvalidArgument("Could not access column family " +
+ std::to_string(cf));
+ break;
+ }
+
+ SequenceNumber earliest_seq =
+ db_impl->GetEarliestMemTableSequenceNumber(sv, true);
+
+ // For each of the keys in this transaction, check to see if someone has
+ // written to this key since the start of the transaction.
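+    // The sequence number recorded by the lock tracker when the key was
+    // first locked serves as the per-key snapshot for this check.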
+ std::unique_ptr<LockTracker::KeyIterator> key_it(
+ tracker.GetKeyIterator(cf));
+ assert(key_it != nullptr);
+ while (key_it->HasNext()) {
+ const std::string& key = key_it->Next();
+ PointLockStatus status = tracker.GetPointLockStatus(cf, key);
+ const SequenceNumber key_seq = status.seq;
+
+ // TODO: support timestamp-based conflict checking.
+ // CheckKeysForConflicts() is currently used only by optimistic
+ // transactions.
+ result = CheckKey(db_impl, sv, earliest_seq, key_seq, key,
+ /*read_ts=*/nullptr, cache_only);
+ if (!result.ok()) {
+ break;
+ }
+ }
+
+ db_impl->ReturnAndCleanupSuperVersion(cf, sv);
+
+ if (!result.ok()) {
+ break;
+ }
+ }
+
+ return result;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/transaction_util.h b/src/rocksdb/utilities/transactions/transaction_util.h new file mode 100644 index 000000000..a349ba87a --- /dev/null +++ b/src/rocksdb/utilities/transactions/transaction_util.h @@ -0,0 +1,85 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <unordered_map>
+
+#include "db/dbformat.h"
+#include "db/read_callback.h"
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "utilities/transactions/lock/lock_tracker.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+struct SuperVersion;
+class WriteBatchWithIndex;
+
+class TransactionUtil {
+ public:
+ // Verifies there have been no commits to this key in the db since this
+ // sequence number. If user-defined timestamp is enabled, then also check
+ // no commits to this key in the db since the given ts.
+ //
+ // If cache_only is true, then this function will not attempt to read any
+ // SST files. This makes it more likely that this function will return an
+ // error if it is unable to determine whether there are any conflicts.
+ //
+ // See the comment of CheckKey() for an explanation of `snap_seq`, `ts`,
+ // `snap_checker` and `min_uncommitted`.
+ //
+ // Returns OK on success, BUSY if there is a conflicting write, or other error
+ // status for any unexpected errors.
+ static Status CheckKeyForConflicts(
+ DBImpl* db_impl, ColumnFamilyHandle* column_family,
+ const std::string& key, SequenceNumber snap_seq,
+ const std::string* const ts, bool cache_only,
+ ReadCallback* snap_checker = nullptr,
+ SequenceNumber min_uncommitted = kMaxSequenceNumber);
+
+ // For each (key, SequenceNumber) pair tracked by the LockTracker, this
+ // function will verify there have been no writes to the key in the db since
+ // that sequence number.
+ //
+ // Returns OK on success, BUSY if there is a conflicting write, or other error
+ // status for any unexpected errors.
+ //
+ // REQUIRED:
+ // This function should only be called on the write thread or if the
+ // mutex is held.
+ // tracker must support point lock.
+ static Status CheckKeysForConflicts(DBImpl* db_impl,
+ const LockTracker& tracker,
+ bool cache_only);
+
+ private:
+ // If `snap_checker` == nullptr, writes are always committed in sequence
+ // number order. Any write to `key` with sequence number <= `snap_seq` does
+ // not conflict, and any write to `key` with sequence number > `snap_seq`
+ // triggers a conflict.
+ // If `snap_checker` != nullptr, writes may not commit in sequence number
+ // order. In this case `min_uncommitted` is a lower bound:
+ //   seq < `min_uncommitted`: no conflict
+ //   seq > `snap_seq`: a conflict
+ //   `min_uncommitted` <= seq <= `snap_seq`: call `snap_checker` to determine.
+ //
+ // If user-defined timestamp is enabled, a write conflict is detected if an
+ // operation for `key` with timestamp greater than `ts` exists.
+ static Status CheckKey(DBImpl* db_impl, SuperVersion* sv,
+ SequenceNumber earliest_seq, SequenceNumber snap_seq,
+ const std::string& key, const std::string* const ts,
+ bool cache_only, ReadCallback* snap_checker = nullptr,
+ SequenceNumber min_uncommitted = kMaxSequenceNumber);
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc b/src/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc new file mode 100644 index 000000000..94b8201f7 --- /dev/null +++ b/src/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc @@ -0,0 +1,588 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "utilities/merge_operators.h"
+#ifndef ROCKSDB_LITE
+
+#include "test_util/testutil.h"
+#include "utilities/transactions/transaction_test.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+INSTANTIATE_TEST_CASE_P(
+ DBAsBaseDB, WriteCommittedTxnWithTsTest,
+ ::testing::Values(std::make_tuple(false, /*two_write_queue=*/false,
+ /*enable_indexing=*/false),
+ std::make_tuple(false, /*two_write_queue=*/true,
+ /*enable_indexing=*/false),
+ std::make_tuple(false, /*two_write_queue=*/false,
+ /*enable_indexing=*/true),
+ std::make_tuple(false, /*two_write_queue=*/true,
+ /*enable_indexing=*/true)));
+
+INSTANTIATE_TEST_CASE_P(
+ DBAsStackableDB, WriteCommittedTxnWithTsTest,
+ ::testing::Values(std::make_tuple(true, /*two_write_queue=*/false,
+ /*enable_indexing=*/false),
+ std::make_tuple(true, /*two_write_queue=*/true,
+ /*enable_indexing=*/false),
+ std::make_tuple(true, /*two_write_queue=*/false,
+ /*enable_indexing=*/true),
+ std::make_tuple(true, /*two_write_queue=*/true,
+ /*enable_indexing=*/true)));
+
+TEST_P(WriteCommittedTxnWithTsTest, SanityChecks) {
+ ASSERT_OK(ReOpenNoDelete());
+
+ ColumnFamilyOptions cf_opts;
+ cf_opts.comparator = test::BytewiseComparatorWithU64TsWrapper();
+ const std::string test_cf_name = "test_cf";
+ ColumnFamilyHandle* cfh = nullptr;
+ assert(db);
+ ASSERT_OK(db->CreateColumnFamily(cf_opts, test_cf_name, &cfh));
+ delete cfh;
+ cfh = nullptr;
+
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+ cf_descs.emplace_back(test_cf_name, cf_opts);
+ ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_));
+
+ std::unique_ptr<Transaction> txn(
+ NewTxn(WriteOptions(), TransactionOptions()));
+ assert(txn);
+ ASSERT_OK(txn->Put(handles_[1], "foo", "value"));
+ ASSERT_TRUE(txn->Commit().IsInvalidArgument());
+
+ auto* pessimistic_txn =
+ static_cast_with_check<PessimisticTransaction>(txn.get());
+ ASSERT_TRUE(
+ pessimistic_txn->CommitBatch(/*batch=*/nullptr).IsInvalidArgument());
+
+ {
+ WriteBatchWithIndex* wbwi = txn->GetWriteBatch();
+ assert(wbwi);
WriteBatch* wb = wbwi->GetWriteBatch(); + assert(wb); + // Write a key to the batch for nonexisting cf. + ASSERT_OK(WriteBatchInternal::Put(wb, /*column_family_id=*/10, /*key=*/"", + /*value=*/"")); + } + + ASSERT_OK(txn->SetCommitTimestamp(20)); + + ASSERT_TRUE(txn->Commit().IsInvalidArgument()); + txn.reset(); + + std::unique_ptr<Transaction> txn1( + NewTxn(WriteOptions(), TransactionOptions())); + assert(txn1); + ASSERT_OK(txn1->SetName("txn1")); + ASSERT_OK(txn1->Put(handles_[1], "foo", "value")); + { + WriteBatchWithIndex* wbwi = txn1->GetWriteBatch(); + assert(wbwi); + WriteBatch* wb = wbwi->GetWriteBatch(); + assert(wb); + // Write a key to the batch for non-existing cf. + ASSERT_OK(WriteBatchInternal::Put(wb, /*column_family_id=*/10, /*key=*/"", + /*value=*/"")); + } + ASSERT_OK(txn1->Prepare()); + ASSERT_OK(txn1->SetCommitTimestamp(21)); + ASSERT_TRUE(txn1->Commit().IsInvalidArgument()); + txn1.reset(); +} + +TEST_P(WriteCommittedTxnWithTsTest, ReOpenWithTimestamp) { + options.merge_operator = MergeOperators::CreateUInt64AddOperator(); + ASSERT_OK(ReOpenNoDelete()); + + ColumnFamilyOptions cf_opts; + cf_opts.comparator = test::BytewiseComparatorWithU64TsWrapper(); + const std::string test_cf_name = "test_cf"; + ColumnFamilyHandle* cfh = nullptr; + assert(db); + ASSERT_OK(db->CreateColumnFamily(cf_opts, test_cf_name, &cfh)); + delete cfh; + cfh = nullptr; + + std::vector<ColumnFamilyDescriptor> cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + cf_descs.emplace_back(test_cf_name, cf_opts); + ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_)); + + std::unique_ptr<Transaction> txn0( + NewTxn(WriteOptions(), TransactionOptions())); + assert(txn0); + ASSERT_OK(txn0->Put(handles_[1], "foo", "value")); + ASSERT_OK(txn0->SetName("txn0")); + ASSERT_OK(txn0->Prepare()); + ASSERT_TRUE(txn0->Commit().IsInvalidArgument()); + txn0.reset(); + + std::unique_ptr<Transaction> txn1( + NewTxn(WriteOptions(), TransactionOptions())); + assert(txn1); + ASSERT_OK(txn1->Put(handles_[1], "foo", "value1")); + { + std::string buf; + PutFixed64(&buf, 23); + ASSERT_OK(txn1->Put("id", buf)); + ASSERT_OK(txn1->Merge("id", buf)); + } + ASSERT_OK(txn1->SetName("txn1")); + ASSERT_OK(txn1->Prepare()); + ASSERT_OK(txn1->SetCommitTimestamp(/*ts=*/23)); + ASSERT_OK(txn1->Commit()); + txn1.reset(); + + { + std::string value; + const Status s = + GetFromDb(ReadOptions(), handles_[1], "foo", /*ts=*/23, &value); + ASSERT_OK(s); + ASSERT_EQ("value1", value); + } + + { + std::string value; + const Status s = db->Get(ReadOptions(), handles_[0], "id", &value); + ASSERT_OK(s); + uint64_t ival = 0; + Slice value_slc = value; + bool result = GetFixed64(&value_slc, &ival); + assert(result); + ASSERT_EQ(46, ival); + } +} + +TEST_P(WriteCommittedTxnWithTsTest, RecoverFromWal) { + ASSERT_OK(ReOpenNoDelete()); + + ColumnFamilyOptions cf_opts; + cf_opts.comparator = test::BytewiseComparatorWithU64TsWrapper(); + const std::string test_cf_name = "test_cf"; + ColumnFamilyHandle* cfh = nullptr; + assert(db); + ASSERT_OK(db->CreateColumnFamily(cf_opts, test_cf_name, &cfh)); + delete cfh; + cfh = nullptr; + + std::vector<ColumnFamilyDescriptor> cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + cf_descs.emplace_back(test_cf_name, cf_opts); + options.avoid_flush_during_shutdown = true; + ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_)); + + std::unique_ptr<Transaction> txn0( + NewTxn(WriteOptions(), TransactionOptions())); + assert(txn0); + ASSERT_OK(txn0->Put(handles_[1], "foo", "foo_value")); + 
ASSERT_OK(txn0->SetName("txn0")); + ASSERT_OK(txn0->Prepare()); + + WriteOptions write_opts; + write_opts.sync = true; + std::unique_ptr<Transaction> txn1(NewTxn(write_opts, TransactionOptions())); + assert(txn1); + ASSERT_OK(txn1->Put("bar", "bar_value_1")); + ASSERT_OK(txn1->Put(handles_[1], "bar", "bar_value_1")); + ASSERT_OK(txn1->SetName("txn1")); + ASSERT_OK(txn1->Prepare()); + ASSERT_OK(txn1->SetCommitTimestamp(/*ts=*/23)); + ASSERT_OK(txn1->Commit()); + txn1.reset(); + + std::unique_ptr<Transaction> txn2(NewTxn(write_opts, TransactionOptions())); + assert(txn2); + ASSERT_OK(txn2->Put("key1", "value_3")); + ASSERT_OK(txn2->Put(handles_[1], "key1", "value_3")); + ASSERT_OK(txn2->SetCommitTimestamp(/*ts=*/24)); + ASSERT_OK(txn2->Commit()); + txn2.reset(); + + txn0.reset(); + + ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_)); + + { + std::string value; + Status s = GetFromDb(ReadOptions(), handles_[1], "foo", /*ts=*/23, &value); + ASSERT_TRUE(s.IsNotFound()); + + s = db->Get(ReadOptions(), handles_[0], "bar", &value); + ASSERT_OK(s); + ASSERT_EQ("bar_value_1", value); + + value.clear(); + s = GetFromDb(ReadOptions(), handles_[1], "bar", /*ts=*/23, &value); + ASSERT_OK(s); + ASSERT_EQ("bar_value_1", value); + + s = GetFromDb(ReadOptions(), handles_[1], "key1", /*ts=*/23, &value); + ASSERT_TRUE(s.IsNotFound()); + + s = db->Get(ReadOptions(), handles_[0], "key1", &value); + ASSERT_OK(s); + ASSERT_EQ("value_3", value); + + s = GetFromDb(ReadOptions(), handles_[1], "key1", /*ts=*/24, &value); + ASSERT_OK(s); + ASSERT_EQ("value_3", value); + } +} + +TEST_P(WriteCommittedTxnWithTsTest, TransactionDbLevelApi) { + ASSERT_OK(ReOpenNoDelete()); + + ColumnFamilyOptions cf_options; + cf_options.merge_operator = MergeOperators::CreateStringAppendOperator(); + cf_options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + const std::string test_cf_name = "test_cf"; + ColumnFamilyHandle* cfh = nullptr; + assert(db); + ASSERT_OK(db->CreateColumnFamily(cf_options, test_cf_name, &cfh)); + delete cfh; + cfh = nullptr; + + std::vector<ColumnFamilyDescriptor> cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + cf_descs.emplace_back(test_cf_name, cf_options); + + ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_)); + + std::string key_str = "tes_key"; + std::string ts_str; + std::string value_str = "test_value"; + PutFixed64(&ts_str, 100); + Slice value = value_str; + + assert(db); + ASSERT_TRUE( + db->Put(WriteOptions(), handles_[1], "foo", "bar").IsNotSupported()); + ASSERT_TRUE(db->Delete(WriteOptions(), handles_[1], "foo").IsNotSupported()); + ASSERT_TRUE( + db->SingleDelete(WriteOptions(), handles_[1], "foo").IsNotSupported()); + ASSERT_TRUE( + db->Merge(WriteOptions(), handles_[1], "foo", "+1").IsNotSupported()); + WriteBatch wb1(/*reserved_bytes=*/0, /*max_bytes=*/0, + /*protection_bytes_per_key=*/0, /*default_cf_ts_sz=*/0); + ASSERT_OK(wb1.Put(handles_[1], key_str, ts_str, value)); + ASSERT_TRUE(db->Write(WriteOptions(), &wb1).IsNotSupported()); + ASSERT_TRUE(db->Write(WriteOptions(), TransactionDBWriteOptimizations(), &wb1) + .IsNotSupported()); + auto* pessimistic_txn_db = + static_cast_with_check<PessimisticTransactionDB>(db); + assert(pessimistic_txn_db); + ASSERT_TRUE( + pessimistic_txn_db->WriteWithConcurrencyControl(WriteOptions(), &wb1) + .IsNotSupported()); + + ASSERT_OK(db->Put(WriteOptions(), "foo", "value")); + ASSERT_OK(db->Put(WriteOptions(), "bar", "value")); + ASSERT_OK(db->Delete(WriteOptions(), "bar")); + ASSERT_OK(db->SingleDelete(WriteOptions(), "foo")); + 
ASSERT_OK(db->Put(WriteOptions(), "key", "value")); + ASSERT_OK(db->Merge(WriteOptions(), "key", "_more")); + WriteBatch wb2(/*reserved_bytes=*/0, /*max_bytes=*/0, + /*protection_bytes_per_key=*/0, /*default_cf_ts_sz=*/0); + ASSERT_OK(wb2.Put(key_str, value)); + ASSERT_OK(db->Write(WriteOptions(), &wb2)); + ASSERT_OK(db->Write(WriteOptions(), TransactionDBWriteOptimizations(), &wb2)); + ASSERT_OK( + pessimistic_txn_db->WriteWithConcurrencyControl(WriteOptions(), &wb2)); + + std::unique_ptr<Transaction> txn( + NewTxn(WriteOptions(), TransactionOptions())); + assert(txn); + + WriteBatch wb3(/*reserved_bytes=*/0, /*max_bytes=*/0, + /*protection_bytes_per_key=*/0, /*default_cf_ts_sz=*/0); + + ASSERT_OK(wb3.Put(handles_[1], "key", "value")); + auto* pessimistic_txn = + static_cast_with_check<PessimisticTransaction>(txn.get()); + assert(pessimistic_txn); + ASSERT_TRUE(pessimistic_txn->CommitBatch(&wb3).IsNotSupported()); + + txn.reset(); +} + +TEST_P(WriteCommittedTxnWithTsTest, Merge) { + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + ASSERT_OK(ReOpenNoDelete()); + + ColumnFamilyOptions cf_options; + cf_options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + cf_options.merge_operator = MergeOperators::CreateStringAppendOperator(); + const std::string test_cf_name = "test_cf"; + ColumnFamilyHandle* cfh = nullptr; + assert(db); + ASSERT_OK(db->CreateColumnFamily(cf_options, test_cf_name, &cfh)); + delete cfh; + cfh = nullptr; + + std::vector<ColumnFamilyDescriptor> cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + cf_descs.emplace_back(test_cf_name, Options(DBOptions(), cf_options)); + options.avoid_flush_during_shutdown = true; + + ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_)); + + std::unique_ptr<Transaction> txn( + NewTxn(WriteOptions(), TransactionOptions())); + assert(txn); + ASSERT_OK(txn->Put(handles_[1], "foo", "bar")); + ASSERT_OK(txn->Merge(handles_[1], "foo", "1")); + ASSERT_OK(txn->SetCommitTimestamp(24)); + ASSERT_OK(txn->Commit()); + txn.reset(); + { + std::string value; + const Status s = + GetFromDb(ReadOptions(), handles_[1], "foo", /*ts=*/24, &value); + ASSERT_OK(s); + ASSERT_EQ("bar,1", value); + } +} + +TEST_P(WriteCommittedTxnWithTsTest, GetForUpdate) { + ASSERT_OK(ReOpenNoDelete()); + + ColumnFamilyOptions cf_options; + cf_options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + const std::string test_cf_name = "test_cf"; + ColumnFamilyHandle* cfh = nullptr; + assert(db); + ASSERT_OK(db->CreateColumnFamily(cf_options, test_cf_name, &cfh)); + delete cfh; + cfh = nullptr; + + std::vector<ColumnFamilyDescriptor> cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + cf_descs.emplace_back(test_cf_name, Options(DBOptions(), cf_options)); + options.avoid_flush_during_shutdown = true; + + ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_)); + + std::unique_ptr<Transaction> txn0( + NewTxn(WriteOptions(), TransactionOptions())); + + std::unique_ptr<Transaction> txn1( + NewTxn(WriteOptions(), TransactionOptions())); + ASSERT_OK(txn1->Put(handles_[1], "key", "value1")); + ASSERT_OK(txn1->SetCommitTimestamp(24)); + ASSERT_OK(txn1->Commit()); + txn1.reset(); + + std::string value; + ASSERT_OK(txn0->SetReadTimestampForValidation(23)); + ASSERT_TRUE( + txn0->GetForUpdate(ReadOptions(), handles_[1], "key", &value).IsBusy()); + ASSERT_OK(txn0->Rollback()); + txn0.reset(); + + std::unique_ptr<Transaction> txn2( + NewTxn(WriteOptions(), TransactionOptions())); + ASSERT_OK(txn2->SetReadTimestampForValidation(25)); 
+ ASSERT_OK(txn2->GetForUpdate(ReadOptions(), handles_[1], "key", &value)); + ASSERT_OK(txn2->SetCommitTimestamp(26)); + ASSERT_OK(txn2->Commit()); + txn2.reset(); +} + +TEST_P(WriteCommittedTxnWithTsTest, BlindWrite) { + ASSERT_OK(ReOpenNoDelete()); + + ColumnFamilyOptions cf_options; + cf_options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + const std::string test_cf_name = "test_cf"; + ColumnFamilyHandle* cfh = nullptr; + assert(db); + ASSERT_OK(db->CreateColumnFamily(cf_options, test_cf_name, &cfh)); + delete cfh; + cfh = nullptr; + + std::vector<ColumnFamilyDescriptor> cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + cf_descs.emplace_back(test_cf_name, Options(DBOptions(), cf_options)); + options.avoid_flush_during_shutdown = true; + ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_)); + + std::unique_ptr<Transaction> txn0( + NewTxn(WriteOptions(), TransactionOptions())); + assert(txn0); + std::unique_ptr<Transaction> txn1( + NewTxn(WriteOptions(), TransactionOptions())); + assert(txn1); + + { + std::string value; + ASSERT_OK(txn0->SetReadTimestampForValidation(100)); + // Lock "key". + ASSERT_TRUE(txn0->GetForUpdate(ReadOptions(), handles_[1], "key", &value) + .IsNotFound()); + } + + ASSERT_OK(txn0->Put(handles_[1], "key", "value0")); + ASSERT_OK(txn0->SetCommitTimestamp(101)); + ASSERT_OK(txn0->Commit()); + + ASSERT_OK(txn1->Put(handles_[1], "key", "value1")); + // In reality, caller needs to ensure commit_ts of txn1 is greater than the + // commit_ts of txn0, which is true for lock-based concurrency control. + ASSERT_OK(txn1->SetCommitTimestamp(102)); + ASSERT_OK(txn1->Commit()); + + txn0.reset(); + txn1.reset(); +} + +TEST_P(WriteCommittedTxnWithTsTest, RefineReadTimestamp) { + ASSERT_OK(ReOpenNoDelete()); + + ColumnFamilyOptions cf_options; + cf_options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + const std::string test_cf_name = "test_cf"; + ColumnFamilyHandle* cfh = nullptr; + assert(db); + ASSERT_OK(db->CreateColumnFamily(cf_options, test_cf_name, &cfh)); + delete cfh; + cfh = nullptr; + + std::vector<ColumnFamilyDescriptor> cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + cf_descs.emplace_back(test_cf_name, Options(DBOptions(), cf_options)); + options.avoid_flush_during_shutdown = true; + + ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_)); + + std::unique_ptr<Transaction> txn0( + NewTxn(WriteOptions(), TransactionOptions())); + assert(txn0); + + std::unique_ptr<Transaction> txn1( + NewTxn(WriteOptions(), TransactionOptions())); + assert(txn1); + + { + ASSERT_OK(txn0->SetReadTimestampForValidation(100)); + // Lock "key0", "key1", ..., "key4". 
+ for (int i = 0; i < 5; ++i) { + std::string value; + ASSERT_TRUE(txn0->GetForUpdate(ReadOptions(), handles_[1], + "key" + std::to_string(i), &value) + .IsNotFound()); + } + } + ASSERT_OK(txn1->Put(handles_[1], "key5", "value5_0")); + ASSERT_OK(txn1->SetName("txn1")); + ASSERT_OK(txn1->Prepare()); + ASSERT_OK(txn1->SetCommitTimestamp(101)); + ASSERT_OK(txn1->Commit()); + txn1.reset(); + + { + std::string value; + ASSERT_TRUE(txn0->GetForUpdate(ReadOptions(), handles_[1], "key5", &value) + .IsBusy()); + ASSERT_OK(txn0->SetReadTimestampForValidation(102)); + ASSERT_OK(txn0->GetForUpdate(ReadOptions(), handles_[1], "key5", &value)); + ASSERT_EQ("value5_0", value); + } + + for (int i = 0; i < 6; ++i) { + ASSERT_OK(txn0->Put(handles_[1], "key" + std::to_string(i), + "value" + std::to_string(i))); + } + ASSERT_OK(txn0->SetName("txn0")); + ASSERT_OK(txn0->Prepare()); + ASSERT_OK(txn0->SetCommitTimestamp(103)); + ASSERT_OK(txn0->Commit()); + txn0.reset(); +} + +TEST_P(WriteCommittedTxnWithTsTest, CheckKeysForConflicts) { + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + ASSERT_OK(ReOpen()); + + std::unique_ptr<Transaction> txn1( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + assert(txn1); + + std::unique_ptr<Transaction> txn2( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + assert(txn2); + ASSERT_OK(txn2->Put("foo", "v0")); + ASSERT_OK(txn2->SetCommitTimestamp(10)); + ASSERT_OK(txn2->Commit()); + txn2.reset(); + + // txn1 takes a snapshot after txn2 commits. The writes of txn2 have + // a smaller seqno than txn1's snapshot, thus should not affect conflict + // checking. + txn1->SetSnapshot(); + + std::unique_ptr<Transaction> txn3( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + assert(txn3); + ASSERT_OK(txn3->SetReadTimestampForValidation(20)); + std::string dontcare; + ASSERT_OK(txn3->GetForUpdate(ReadOptions(), "foo", &dontcare)); + ASSERT_OK(txn3->SingleDelete("foo")); + ASSERT_OK(txn3->SetName("txn3")); + ASSERT_OK(txn3->Prepare()); + ASSERT_OK(txn3->SetCommitTimestamp(30)); + // txn3 reads at ts=20 > txn2's commit timestamp, and commits at ts=30. + // txn3 can commit successfully, leaving a tombstone with ts=30. + ASSERT_OK(txn3->Commit()); + txn3.reset(); + + bool called = false; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::GetLatestSequenceForKey:mem", [&](void* arg) { + auto* const ts_ptr = reinterpret_cast<std::string*>(arg); + assert(ts_ptr); + Slice ts_slc = *ts_ptr; + uint64_t last_ts = 0; + ASSERT_TRUE(GetFixed64(&ts_slc, &last_ts)); + ASSERT_EQ(30, last_ts); + called = true; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + // txn1's read timestamp is 25 < 30 (commit timestamp of txn3). Therefore, + // the tombstone written by txn3 causes the conflict checking to fail. 
+ ASSERT_OK(txn1->SetReadTimestampForValidation(25));
+ ASSERT_TRUE(txn1->GetForUpdate(ReadOptions(), "foo", &dontcare).IsBusy());
+ ASSERT_TRUE(called);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <cstdio>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as Transactions not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_prepared_transaction_test.cc b/src/rocksdb/utilities/transactions/write_prepared_transaction_test.cc
new file mode 100644
index 000000000..86a9511a4
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/write_prepared_transaction_test.cc
@@ -0,0 +1,4078 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <atomic>
+#include <cinttypes>
+#include <functional>
+#include <string>
+#include <thread>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/debug.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "table/mock_table.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "test_util/transaction_test_util.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/string_append/stringappend.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+#include "utilities/transactions/transaction_test.h"
+#include "utilities/transactions/write_prepared_txn_db.h"
+
+using std::string;
+
+namespace ROCKSDB_NAMESPACE {
+
+using CommitEntry = WritePreparedTxnDB::CommitEntry;
+using CommitEntry64b = WritePreparedTxnDB::CommitEntry64b;
+using CommitEntry64bFormat = WritePreparedTxnDB::CommitEntry64bFormat;
+
+TEST(PreparedHeap, BasicsTest) {
+ WritePreparedTxnDB::PreparedHeap heap;
+ {
+ MutexLock ml(heap.push_pop_mutex());
+ heap.push(14l);
+ // Test with one element
+ ASSERT_EQ(14l, heap.top());
+ heap.push(24l);
+ heap.push(34l);
+ // Test that old min is still on top
+ ASSERT_EQ(14l, heap.top());
+ heap.push(44l);
+ heap.push(54l);
+ heap.push(64l);
+ heap.push(74l);
+ heap.push(84l);
+ }
+ // Test that old min is still on top
+ ASSERT_EQ(14l, heap.top());
+ heap.erase(24l);
+ // Test that old min is still on top
+ ASSERT_EQ(14l, heap.top());
+ heap.erase(14l);
+ // Test that the new min comes to the top after multiple erases
+ ASSERT_EQ(34l, heap.top());
+ heap.erase(34l);
+ // Test that the new min comes to the top after a single erase
+ ASSERT_EQ(44l, heap.top());
+ heap.erase(54l);
+ ASSERT_EQ(44l, heap.top());
+ heap.pop(); // pop 44l
+ // Test that the erased items are ignored after pop
+ ASSERT_EQ(64l, heap.top());
+ heap.erase(44l);
+ // Test that erasing an already popped item would work
+
ASSERT_EQ(64l, heap.top()); + heap.erase(84l); + ASSERT_EQ(64l, heap.top()); + { + MutexLock ml(heap.push_pop_mutex()); + heap.push(85l); + heap.push(86l); + heap.push(87l); + heap.push(88l); + heap.push(89l); + } + heap.erase(87l); + heap.erase(85l); + heap.erase(89l); + heap.erase(86l); + heap.erase(88l); + // Test top remains the same after a random order of many erases + ASSERT_EQ(64l, heap.top()); + heap.pop(); + // Test that pop works with a series of random pending erases + ASSERT_EQ(74l, heap.top()); + ASSERT_FALSE(heap.empty()); + heap.pop(); + // Test that empty works + ASSERT_TRUE(heap.empty()); +} + +// This is a scenario reconstructed from a buggy trace. Test that the bug does +// not resurface again. +TEST(PreparedHeap, EmptyAtTheEnd) { + WritePreparedTxnDB::PreparedHeap heap; + { + MutexLock ml(heap.push_pop_mutex()); + heap.push(40l); + } + ASSERT_EQ(40l, heap.top()); + // Although not a recommended scenario, we must be resilient against erase + // without a prior push. + heap.erase(50l); + ASSERT_EQ(40l, heap.top()); + { + MutexLock ml(heap.push_pop_mutex()); + heap.push(60l); + } + ASSERT_EQ(40l, heap.top()); + + heap.erase(60l); + ASSERT_EQ(40l, heap.top()); + heap.erase(40l); + ASSERT_TRUE(heap.empty()); + + { + MutexLock ml(heap.push_pop_mutex()); + heap.push(40l); + } + ASSERT_EQ(40l, heap.top()); + heap.erase(50l); + ASSERT_EQ(40l, heap.top()); + { + MutexLock ml(heap.push_pop_mutex()); + heap.push(60l); + } + ASSERT_EQ(40l, heap.top()); + + heap.erase(40l); + // Test that the erase has not emptied the heap (we had a bug doing that) + ASSERT_FALSE(heap.empty()); + ASSERT_EQ(60l, heap.top()); + heap.erase(60l); + ASSERT_TRUE(heap.empty()); +} + +// Generate random order of PreparedHeap access and test that the heap will be +// successfully emptied at the end. +TEST(PreparedHeap, Concurrent) { + const size_t t_cnt = 10; + ROCKSDB_NAMESPACE::port::Thread t[t_cnt + 1]; + WritePreparedTxnDB::PreparedHeap heap; + port::RWMutex prepared_mutex; + std::atomic<size_t> last; + + for (size_t n = 0; n < 100; n++) { + last = 0; + t[0] = ROCKSDB_NAMESPACE::port::Thread([&]() { + Random rnd(1103); + for (size_t seq = 1; seq <= t_cnt; seq++) { + // This is not recommended usage but we should be resilient against it. 
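+ // About 1 in 5 sequence numbers is erased below without ever having been
+ // pushed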
+ bool skip_push = rnd.OneIn(5);
+ if (!skip_push) {
+ MutexLock ml(heap.push_pop_mutex());
+ std::this_thread::yield();
+ heap.push(seq);
+ last.store(seq);
+ }
+ }
+ });
+ for (size_t i = 1; i <= t_cnt; i++) {
+ t[i] =
+ ROCKSDB_NAMESPACE::port::Thread([&heap, &prepared_mutex, &last, i]() {
+ auto seq = i;
+ do {
+ std::this_thread::yield();
+ } while (last.load() < seq);
+ WriteLock wl(&prepared_mutex);
+ heap.erase(seq);
+ });
+ }
+ for (size_t i = 0; i <= t_cnt; i++) {
+ t[i].join();
+ }
+ ASSERT_TRUE(heap.empty());
+ }
+}
+
+// Test that WriteBatchWithIndex correctly counts the number of sub-batches
+TEST(WriteBatchWithIndex, SubBatchCnt) {
+ ColumnFamilyOptions cf_options;
+ std::string cf_name = "two";
+ DB* db;
+ Options options;
+ options.create_if_missing = true;
+ const std::string dbname = test::PerThreadDBPath("transaction_testdb");
+ EXPECT_OK(DestroyDB(dbname, options));
+ ASSERT_OK(DB::Open(options, dbname, &db));
+ ColumnFamilyHandle* cf_handle = nullptr;
+ ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle));
+ WriteOptions write_options;
+ size_t batch_cnt = 1;
+ size_t save_points = 0;
+ std::vector<size_t> batch_cnt_at;
+ WriteBatchWithIndex batch(db->DefaultColumnFamily()->GetComparator(), 0, true,
+ 0);
+ ASSERT_EQ(batch_cnt, batch.SubBatchCnt());
+ batch_cnt_at.push_back(batch_cnt);
+ batch.SetSavePoint();
+ save_points++;
+ ASSERT_OK(batch.Put(Slice("key"), Slice("value")));
+ ASSERT_EQ(batch_cnt, batch.SubBatchCnt());
+ batch_cnt_at.push_back(batch_cnt);
+ batch.SetSavePoint();
+ save_points++;
+ ASSERT_OK(batch.Put(Slice("key2"), Slice("value2")));
+ ASSERT_EQ(batch_cnt, batch.SubBatchCnt());
+ // duplicate the keys
+ batch_cnt_at.push_back(batch_cnt);
+ batch.SetSavePoint();
+ save_points++;
+ ASSERT_OK(batch.Put(Slice("key"), Slice("value3")));
+ batch_cnt++;
+ ASSERT_EQ(batch_cnt, batch.SubBatchCnt());
+ // duplicate the 2nd key. It should not be counted as a duplicate since a
+ // sub-batch is cut after the last duplicate.
+ batch_cnt_at.push_back(batch_cnt);
+ batch.SetSavePoint();
+ save_points++;
+ ASSERT_OK(batch.Put(Slice("key2"), Slice("value4")));
+ ASSERT_EQ(batch_cnt, batch.SubBatchCnt());
+ // duplicate the keys but in a different cf. It should not be counted as
+ // duplicate keys
+ batch_cnt_at.push_back(batch_cnt);
+ batch.SetSavePoint();
+ save_points++;
+ ASSERT_OK(batch.Put(cf_handle, Slice("key"), Slice("value5")));
+ ASSERT_EQ(batch_cnt, batch.SubBatchCnt());
+
+ // Test that the number of sub-batches matches what we count with
+ // SubBatchCounter
+ std::map<uint32_t, const Comparator*> comparators;
+ comparators[0] = db->DefaultColumnFamily()->GetComparator();
+ comparators[cf_handle->GetID()] = cf_handle->GetComparator();
+ SubBatchCounter counter(comparators);
+ ASSERT_OK(batch.GetWriteBatch()->Iterate(&counter));
+ ASSERT_EQ(batch_cnt, counter.BatchCount());
+
+ // Test that RollbackToSavePoint will properly reset the number of
+ // sub-batches
+ for (size_t i = save_points; i > 0; i--) {
+ ASSERT_OK(batch.RollbackToSavePoint());
+ ASSERT_EQ(batch_cnt_at[i - 1], batch.SubBatchCnt());
+ }
+
+ // Test the count is right with random batches
+ {
+ const size_t TOTAL_KEYS = 20; // 20 keys vs. 10 picks per batch makes
+ // duplicate keys likely
+ Random rnd(1131);
+ std::string keys[TOTAL_KEYS];
+ for (size_t k = 0; k < TOTAL_KEYS; k++) {
+ int len = static_cast<int>(rnd.Uniform(50));
+ keys[k] = test::RandomKey(&rnd, len);
+ }
+ for (size_t i = 0; i < 1000; i++) { // 1000 random batches
+ WriteBatchWithIndex rndbatch(db->DefaultColumnFamily()->GetComparator(),
+ 0, true, 0);
+ for (size_t k = 0; k < 10; k++) { // 10 keys per batch
+ size_t ki = static_cast<size_t>(rnd.Uniform(TOTAL_KEYS));
+ Slice key = Slice(keys[ki]);
+ std::string tmp = rnd.RandomString(16);
+ Slice value = Slice(tmp);
+ ASSERT_OK(rndbatch.Put(key, value));
+ }
+ SubBatchCounter batch_counter(comparators);
+ ASSERT_OK(rndbatch.GetWriteBatch()->Iterate(&batch_counter));
+ ASSERT_EQ(rndbatch.SubBatchCnt(), batch_counter.BatchCount());
+ }
+ }
+
+ delete cf_handle;
+ delete db;
+}
+
+TEST(CommitEntry64b, BasicTest) {
+ const size_t INDEX_BITS = static_cast<size_t>(21);
+ const size_t INDEX_SIZE = static_cast<size_t>(1ull << INDEX_BITS);
+ const CommitEntry64bFormat FORMAT(static_cast<size_t>(INDEX_BITS));
+
+ // zero-initialized CommitEntry64b should indicate an empty entry
+ CommitEntry64b empty_entry64b;
+ uint64_t empty_index = 11ul;
+ CommitEntry empty_entry;
+ bool ok = empty_entry64b.Parse(empty_index, &empty_entry, FORMAT);
+ ASSERT_FALSE(ok);
+
+ // the zero entry is reserved for uninitialized entries
+ const size_t MAX_COMMIT = (1 << FORMAT.COMMIT_BITS) - 1 - 1;
+ // Samples over the numbers that are covered by that many index bits
+ std::array<uint64_t, 4> is = {{0, 1, INDEX_SIZE / 2 + 1, INDEX_SIZE - 1}};
+ // Samples over the numbers that are covered by that many commit bits
+ std::array<uint64_t, 4> ds = {{0, 1, MAX_COMMIT / 2 + 1, MAX_COMMIT}};
+ // Iterate over prepare numbers that i) cover all bits of a sequence
+ // number, and ii) include some bits that fall into the range of index or
+ // commit bits
+ for (uint64_t base = 1; base < kMaxSequenceNumber; base *= 2) {
+ for (uint64_t i : is) {
+ for (uint64_t d : ds) {
+ uint64_t p = base + i + d;
+ for (uint64_t c : {p, p + d / 2, p + d}) {
+ uint64_t index = p % INDEX_SIZE;
+ CommitEntry before(p, c), after;
+ CommitEntry64b entry64b(before, FORMAT);
+ ok = entry64b.Parse(index, &after, FORMAT);
+ ASSERT_TRUE(ok);
+ if (!(before == after)) {
+ printf("base %" PRIu64 " i %" PRIu64 " d %" PRIu64 " p %" PRIu64
+ " c %" PRIu64 " index %" PRIu64 "\n",
+ base, i, d, p, c, index);
+ }
+ ASSERT_EQ(before, after);
+ }
+ }
+ }
+ }
+}
+
+class WritePreparedTxnDBMock : public WritePreparedTxnDB {
+ public:
+ WritePreparedTxnDBMock(DBImpl* db_impl, TransactionDBOptions& opt)
+ : WritePreparedTxnDB(db_impl, opt) {}
+ void SetDBSnapshots(const std::vector<SequenceNumber>& snapshots) {
+ snapshots_ = snapshots;
+ }
+ void TakeSnapshot(SequenceNumber seq) { snapshots_.push_back(seq); }
+
+ protected:
+ const std::vector<SequenceNumber> GetSnapshotListFromDB(
+ SequenceNumber /* unused */) override {
+ return snapshots_;
+ }
+
+ private:
+ std::vector<SequenceNumber> snapshots_;
+};
+
+class WritePreparedTransactionTestBase : public TransactionTestBase {
+ public:
+ WritePreparedTransactionTestBase(bool use_stackable_db, bool two_write_queue,
+ TxnDBWritePolicy write_policy,
+ WriteOrdering write_ordering)
+ : TransactionTestBase(use_stackable_db, two_write_queue, write_policy,
+ write_ordering){};
+
+ protected:
+ void UpdateTransactionDBOptions(size_t snapshot_cache_bits,
+ size_t commit_cache_bits) {
+ txn_db_options.wp_snapshot_cache_bits = snapshot_cache_bits;
+ txn_db_options.wp_commit_cache_bits = commit_cache_bits;
+ }
+ void UpdateTransactionDBOptions(size_t snapshot_cache_bits) {
+ txn_db_options.wp_snapshot_cache_bits = snapshot_cache_bits;
+ }
+ // If expect_update is set, check if it actually updated old_commit_map_. If
+ // it did not and yet suggested not to check the next snapshot, do the
+ // opposite to check if it was not a bad suggestion.
+ void MaybeUpdateOldCommitMapTestWithNext(uint64_t prepare, uint64_t commit,
+ uint64_t snapshot,
+ uint64_t next_snapshot,
+ bool expect_update) {
+ WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+ // reset old_commit_map_empty_ so that its value indicates whether
+ // old_commit_map_ was updated
+ wp_db->old_commit_map_empty_ = true;
+ bool check_next = wp_db->MaybeUpdateOldCommitMap(prepare, commit, snapshot,
+ snapshot < next_snapshot);
+ if (expect_update == wp_db->old_commit_map_empty_) {
+ printf("prepare: %" PRIu64 " commit: %" PRIu64 " snapshot: %" PRIu64
+ " next: %" PRIu64 "\n",
+ prepare, commit, snapshot, next_snapshot);
+ }
+ EXPECT_EQ(!expect_update, wp_db->old_commit_map_empty_);
+ if (!check_next && wp_db->old_commit_map_empty_) {
+ // do the opposite to make sure it was not a bad suggestion
+ const bool dont_care_bool = true;
+ wp_db->MaybeUpdateOldCommitMap(prepare, commit, next_snapshot,
+ dont_care_bool);
+ if (!wp_db->old_commit_map_empty_) {
+ printf("prepare: %" PRIu64 " commit: %" PRIu64 " snapshot: %" PRIu64
+ " next: %" PRIu64 "\n",
+ prepare, commit, snapshot, next_snapshot);
+ }
+ EXPECT_TRUE(wp_db->old_commit_map_empty_);
+ }
+ }
+
+ // Test that a CheckAgainstSnapshots thread reading old_snapshots will not
+ // miss a snapshot because of a concurrent update by UpdateSnapshots that is
+ // writing new_snapshots. Both threads are broken at two points. The sync
+ // points to enforce them are specified by a1, a2, b1, and b2. CommitEntry
+ // entry is expected to be vital for one of the snapshots that is common
+ // between the old and new lists of snapshots.
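+ // The scenario is exercised twice below: first interrupting the reader at
+ // a1/a2 and the writer at b1/b2, then with the two sync-point roles swapped.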
+ void SnapshotConcurrentAccessTestInternal( + WritePreparedTxnDB* wp_db, + const std::vector<SequenceNumber>& old_snapshots, + const std::vector<SequenceNumber>& new_snapshots, CommitEntry& entry, + SequenceNumber& version, size_t a1, size_t a2, size_t b1, size_t b2) { + // First reset the snapshot list + const std::vector<SequenceNumber> empty_snapshots; + wp_db->old_commit_map_empty_ = true; + wp_db->UpdateSnapshots(empty_snapshots, ++version); + // Then initialize it with the old_snapshots + wp_db->UpdateSnapshots(old_snapshots, ++version); + + // Starting from the first thread, cut each thread at two points + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"WritePreparedTxnDB::CheckAgainstSnapshots:p:" + std::to_string(a1), + "WritePreparedTxnDB::UpdateSnapshots:s:start"}, + {"WritePreparedTxnDB::UpdateSnapshots:p:" + std::to_string(b1), + "WritePreparedTxnDB::CheckAgainstSnapshots:s:" + std::to_string(a1)}, + {"WritePreparedTxnDB::CheckAgainstSnapshots:p:" + std::to_string(a2), + "WritePreparedTxnDB::UpdateSnapshots:s:" + std::to_string(b1)}, + {"WritePreparedTxnDB::UpdateSnapshots:p:" + std::to_string(b2), + "WritePreparedTxnDB::CheckAgainstSnapshots:s:" + std::to_string(a2)}, + {"WritePreparedTxnDB::CheckAgainstSnapshots:p:end", + "WritePreparedTxnDB::UpdateSnapshots:s:" + std::to_string(b2)}, + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + { + ASSERT_TRUE(wp_db->old_commit_map_empty_); + ROCKSDB_NAMESPACE::port::Thread t1( + [&]() { wp_db->UpdateSnapshots(new_snapshots, version); }); + wp_db->CheckAgainstSnapshots(entry); + t1.join(); + ASSERT_FALSE(wp_db->old_commit_map_empty_); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + wp_db->old_commit_map_empty_ = true; + wp_db->UpdateSnapshots(empty_snapshots, ++version); + wp_db->UpdateSnapshots(old_snapshots, ++version); + // Starting from the second thread, cut each thread at two points + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"WritePreparedTxnDB::UpdateSnapshots:p:" + std::to_string(a1), + "WritePreparedTxnDB::CheckAgainstSnapshots:s:start"}, + {"WritePreparedTxnDB::CheckAgainstSnapshots:p:" + std::to_string(b1), + "WritePreparedTxnDB::UpdateSnapshots:s:" + std::to_string(a1)}, + {"WritePreparedTxnDB::UpdateSnapshots:p:" + std::to_string(a2), + "WritePreparedTxnDB::CheckAgainstSnapshots:s:" + std::to_string(b1)}, + {"WritePreparedTxnDB::CheckAgainstSnapshots:p:" + std::to_string(b2), + "WritePreparedTxnDB::UpdateSnapshots:s:" + std::to_string(a2)}, + {"WritePreparedTxnDB::UpdateSnapshots:p:end", + "WritePreparedTxnDB::CheckAgainstSnapshots:s:" + std::to_string(b2)}, + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + { + ASSERT_TRUE(wp_db->old_commit_map_empty_); + ROCKSDB_NAMESPACE::port::Thread t1( + [&]() { wp_db->UpdateSnapshots(new_snapshots, version); }); + wp_db->CheckAgainstSnapshots(entry); + t1.join(); + ASSERT_FALSE(wp_db->old_commit_map_empty_); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } + + // Verify value of keys. 
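+ // An expected value of "NOT_FOUND" means the key must be absent.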
+ void VerifyKeys(const std::unordered_map<std::string, std::string>& data, + const Snapshot* snapshot = nullptr) { + std::string value; + ReadOptions read_options; + read_options.snapshot = snapshot; + for (auto& kv : data) { + auto s = db->Get(read_options, kv.first, &value); + ASSERT_TRUE(s.ok() || s.IsNotFound()); + if (s.ok()) { + if (kv.second != value) { + printf("key = %s\n", kv.first.c_str()); + } + ASSERT_EQ(kv.second, value); + } else { + ASSERT_EQ(kv.second, "NOT_FOUND"); + } + + // Try with MultiGet API too + std::vector<std::string> values; + auto s_vec = db->MultiGet(read_options, {db->DefaultColumnFamily()}, + {kv.first}, &values); + ASSERT_EQ(1, values.size()); + ASSERT_EQ(1, s_vec.size()); + s = s_vec[0]; + ASSERT_TRUE(s.ok() || s.IsNotFound()); + if (s.ok()) { + ASSERT_TRUE(kv.second == values[0]); + } else { + ASSERT_EQ(kv.second, "NOT_FOUND"); + } + } + } + + // Verify all versions of keys. + void VerifyInternalKeys(const std::vector<KeyVersion>& expected_versions) { + std::vector<KeyVersion> versions; + const size_t kMaxKeys = 100000; + ASSERT_OK(GetAllKeyVersions(db, expected_versions.front().user_key, + expected_versions.back().user_key, kMaxKeys, + &versions)); + ASSERT_EQ(expected_versions.size(), versions.size()); + for (size_t i = 0; i < versions.size(); i++) { + ASSERT_EQ(expected_versions[i].user_key, versions[i].user_key); + ASSERT_EQ(expected_versions[i].sequence, versions[i].sequence); + ASSERT_EQ(expected_versions[i].type, versions[i].type); + if (versions[i].type != kTypeDeletion && + versions[i].type != kTypeSingleDeletion) { + ASSERT_EQ(expected_versions[i].value, versions[i].value); + } + // Range delete not supported. + ASSERT_NE(expected_versions[i].type, kTypeRangeDeletion); + } + } +}; + +class WritePreparedTransactionTest + : public WritePreparedTransactionTestBase, + virtual public ::testing::WithParamInterface< + std::tuple<bool, bool, TxnDBWritePolicy, WriteOrdering>> { + public: + WritePreparedTransactionTest() + : WritePreparedTransactionTestBase( + std::get<0>(GetParam()), std::get<1>(GetParam()), + std::get<2>(GetParam()), std::get<3>(GetParam())){}; +}; + +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +class SnapshotConcurrentAccessTest + : public WritePreparedTransactionTestBase, + virtual public ::testing::WithParamInterface<std::tuple< + bool, bool, TxnDBWritePolicy, WriteOrdering, size_t, size_t>> { + public: + SnapshotConcurrentAccessTest() + : WritePreparedTransactionTestBase( + std::get<0>(GetParam()), std::get<1>(GetParam()), + std::get<2>(GetParam()), std::get<3>(GetParam())), + split_id_(std::get<4>(GetParam())), + split_cnt_(std::get<5>(GetParam())){}; + + protected: + // A test is split into split_cnt_ tests, each identified with split_id_ where + // 0 <= split_id_ < split_cnt_ + size_t split_id_; + size_t split_cnt_; +}; +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +class SeqAdvanceConcurrentTest + : public WritePreparedTransactionTestBase, + virtual public ::testing::WithParamInterface<std::tuple< + bool, bool, TxnDBWritePolicy, WriteOrdering, size_t, size_t>> { + public: + SeqAdvanceConcurrentTest() + : WritePreparedTransactionTestBase( + std::get<0>(GetParam()), std::get<1>(GetParam()), + std::get<2>(GetParam()), std::get<3>(GetParam())), + split_id_(std::get<4>(GetParam())), + split_cnt_(std::get<5>(GetParam())) { + special_env.skip_fsync_ = true; + }; + + protected: + // A test is split into split_cnt_ tests, each identified with split_id_ where + // 0 <= 
split_id_ < split_cnt_ + size_t split_id_; + size_t split_cnt_; +}; + +INSTANTIATE_TEST_CASE_P( + WritePreparedTransaction, WritePreparedTransactionTest, + ::testing::Values( + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite))); + +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +INSTANTIATE_TEST_CASE_P( + TwoWriteQueues, SnapshotConcurrentAccessTest, + ::testing::Values( + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 0, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 1, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 2, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 3, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 4, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 5, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 6, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 7, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 8, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 9, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 10, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 11, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 12, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 13, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 14, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 15, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 16, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 17, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 18, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 19, 20), + + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 0, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 1, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 2, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 3, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 4, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 5, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 6, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 7, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 8, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 9, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 10, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 11, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 12, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 13, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 14, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 15, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 16, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 17, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 18, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 19, 20))); + +INSTANTIATE_TEST_CASE_P( + OneWriteQueue, SnapshotConcurrentAccessTest, + ::testing::Values( + std::make_tuple(false, false, WRITE_PREPARED, 
kOrderedWrite, 0, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 1, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 2, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 3, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 4, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 5, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 6, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 7, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 8, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 9, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 10, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 11, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 12, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 13, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 14, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 15, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 16, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 17, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 18, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 19, 20))); + +INSTANTIATE_TEST_CASE_P( + TwoWriteQueues, SeqAdvanceConcurrentTest, + ::testing::Values( + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 0, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 1, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 2, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 3, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 4, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 5, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 6, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 7, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 8, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 9, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 0, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 1, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 2, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 3, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 4, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 5, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 6, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 7, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 8, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 9, 10))); + +INSTANTIATE_TEST_CASE_P( + OneWriteQueue, SeqAdvanceConcurrentTest, + ::testing::Values( + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 0, 10), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 1, 10), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 2, 10), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 3, 10), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 4, 10), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 5, 10), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 6, 10), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 7, 
10),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 8, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 9, 10)));
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_P(WritePreparedTransactionTest, CommitMap) {
+ WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+ ASSERT_NE(wp_db, nullptr);
+ ASSERT_NE(wp_db->db_impl_, nullptr);
+ size_t size = wp_db->COMMIT_CACHE_SIZE;
+ CommitEntry c = {5, 12}, e;
+ bool evicted = wp_db->AddCommitEntry(c.prep_seq % size, c, &e);
+ ASSERT_FALSE(evicted);
+
+ // Should be able to read the same value
+ CommitEntry64b dont_care;
+ bool found = wp_db->GetCommitEntry(c.prep_seq % size, &dont_care, &e);
+ ASSERT_TRUE(found);
+ ASSERT_EQ(c, e);
+ // Should be able to distinguish between overlapping entries
+ found = wp_db->GetCommitEntry((c.prep_seq + size) % size, &dont_care, &e);
+ ASSERT_TRUE(found);
+ ASSERT_NE(c.prep_seq + size, e.prep_seq);
+ // Should be able to detect non-existent entry
+ found = wp_db->GetCommitEntry((c.prep_seq + 1) % size, &dont_care, &e);
+ ASSERT_FALSE(found);
+
+ // Reject an invalid exchange
+ CommitEntry e2 = {c.prep_seq + size, c.commit_seq + size};
+ CommitEntry64b e2_64b(e2, wp_db->FORMAT);
+ bool exchanged = wp_db->ExchangeCommitEntry(e2.prep_seq % size, e2_64b, e);
+ ASSERT_FALSE(exchanged);
+ // check whether it actually rejected it
+ found = wp_db->GetCommitEntry(e2.prep_seq % size, &dont_care, &e);
+ ASSERT_TRUE(found);
+ ASSERT_EQ(c, e);
+
+ // Accept a valid exchange
+ CommitEntry64b c_64b(c, wp_db->FORMAT);
+ CommitEntry e3 = {c.prep_seq + size, c.commit_seq + size + 1};
+ exchanged = wp_db->ExchangeCommitEntry(c.prep_seq % size, c_64b, e3);
+ ASSERT_TRUE(exchanged);
+ // check whether it actually accepted it
+ found = wp_db->GetCommitEntry(c.prep_seq % size, &dont_care, &e);
+ ASSERT_TRUE(found);
+ ASSERT_EQ(e3, e);
+
+ // Rewrite an entry
+ CommitEntry e4 = {e3.prep_seq + size, e3.commit_seq + size + 1};
+ evicted = wp_db->AddCommitEntry(e4.prep_seq % size, e4, &e);
+ ASSERT_TRUE(evicted);
+ ASSERT_EQ(e3, e);
+ found = wp_db->GetCommitEntry(e4.prep_seq % size, &dont_care, &e);
+ ASSERT_TRUE(found);
+ ASSERT_EQ(e4, e);
+}
+
+TEST_P(WritePreparedTransactionTest, MaybeUpdateOldCommitMap) {
+ // If prepare <= snapshot < commit we should keep the entry around since its
+ // nonexistence could be interpreted as committed in the snapshot while it is
+ // not true. We keep such entries around by adding them to the
+ // old_commit_map_.
+ uint64_t p /*prepare*/, c /*commit*/, s /*snapshot*/, ns /*next_snapshot*/;
+ p = 10l, c = 15l, s = 20l, ns = 21l;
+ MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false);
+ // If we do not expect the old commit map to be updated, try also with a next
+ // snapshot that is expected to update the old commit map. This would test
+ // that MaybeUpdateOldCommitMap would not prevent us from checking the next
+ // snapshot that must be checked.
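+ // Same prepare/commit/snapshot as above, but with a next snapshot that
+ // precedes the current one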
+ p = 10l, c = 15l, s = 20l, ns = 11l; + MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false); + + p = 10l, c = 20l, s = 20l, ns = 19l; + MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false); + p = 10l, c = 20l, s = 20l, ns = 21l; + MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false); + + p = 20l, c = 20l, s = 20l, ns = 21l; + MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false); + p = 20l, c = 20l, s = 20l, ns = 19l; + MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false); + + p = 10l, c = 25l, s = 20l, ns = 21l; + MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, true); + + p = 20l, c = 25l, s = 20l, ns = 21l; + MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, true); + + p = 21l, c = 25l, s = 20l, ns = 22l; + MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false); + p = 21l, c = 25l, s = 20l, ns = 19l; + MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false); +} + +// Trigger the condition where some old memtables are skipped when doing +// TransactionUtil::CheckKey(), and make sure the result is still correct. +TEST_P(WritePreparedTransactionTest, CheckKeySkipOldMemtable) { + const int kAttemptHistoryMemtable = 0; + const int kAttemptImmMemTable = 1; + for (int attempt = kAttemptHistoryMemtable; attempt <= kAttemptImmMemTable; + attempt++) { + options.max_write_buffer_number_to_maintain = 3; + ASSERT_OK(ReOpen()); + + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + txn_options.set_snapshot = true; + string value; + + ASSERT_OK(db->Put(write_options, Slice("foo"), Slice("bar"))); + ASSERT_OK(db->Put(write_options, Slice("foo2"), Slice("bar"))); + + Transaction* txn = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn != nullptr); + ASSERT_OK(txn->SetName("txn")); + + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn2 != nullptr); + ASSERT_OK(txn2->SetName("txn2")); + + // This transaction is created to cause potential conflict. + Transaction* txn_x = db->BeginTransaction(write_options); + ASSERT_OK(txn_x->SetName("txn_x")); + ASSERT_OK(txn_x->Put(Slice("foo"), Slice("bar3"))); + ASSERT_OK(txn_x->Prepare()); + + // Create snapshots after the prepare, but there should still + // be a conflict when trying to read "foo". + + if (attempt == kAttemptImmMemTable) { + // For the second attempt, hold flush from beginning. The memtable + // will be switched to immutable after calling TEST_SwitchMemtable() + // while CheckKey() is called. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"WritePreparedTransactionTest.CheckKeySkipOldMemtable", + "FlushJob::Start"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + } + + // force a memtable flush. 
The memtable should still be kept
+ FlushOptions flush_ops;
+ if (attempt == kAttemptHistoryMemtable) {
+ ASSERT_OK(db->Flush(flush_ops));
+ } else {
+ ASSERT_EQ(attempt, kAttemptImmMemTable);
+ DBImpl* db_impl = static_cast<DBImpl*>(db->GetRootDB());
+ ASSERT_OK(db_impl->TEST_SwitchMemtable());
+ }
+ uint64_t num_imm_mems;
+ ASSERT_TRUE(db->GetIntProperty(DB::Properties::kNumImmutableMemTable,
+ &num_imm_mems));
+ if (attempt == kAttemptHistoryMemtable) {
+ ASSERT_EQ(0, num_imm_mems);
+ } else {
+ ASSERT_EQ(attempt, kAttemptImmMemTable);
+ ASSERT_EQ(1, num_imm_mems);
+ }
+
+ // Put something in active memtable
+ ASSERT_OK(db->Put(write_options, Slice("foo3"), Slice("bar")));
+
+ // Create txn3 after flushing, but this transaction also needs to
+ // check all memtables because they contain uncommitted data.
+ Transaction* txn3 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn3 != nullptr);
+ ASSERT_OK(txn3->SetName("txn3"));
+
+ // Commit the pending write
+ ASSERT_OK(txn_x->Commit());
+
+ // Commit txn, txn2 and txn3. txn and txn3 will conflict but txn2 will
+ // pass. In all cases, both memtables are queried.
+ SetPerfLevel(PerfLevel::kEnableCount);
+ get_perf_context()->Reset();
+ ASSERT_TRUE(txn3->GetForUpdate(read_options, "foo", &value).IsBusy());
+ // We should have checked two memtables, active and either immutable
+ // or history memtable, depending on the test case.
+ ASSERT_EQ(2, get_perf_context()->get_from_memtable_count);
+
+ get_perf_context()->Reset();
+ ASSERT_TRUE(txn->GetForUpdate(read_options, "foo", &value).IsBusy());
+ // We should have checked two memtables, active and either immutable
+ // or history memtable, depending on the test case.
+ ASSERT_EQ(2, get_perf_context()->get_from_memtable_count);
+
+ get_perf_context()->Reset();
+ ASSERT_OK(txn2->GetForUpdate(read_options, "foo2", &value));
+ ASSERT_EQ(value, "bar");
+ // We should have checked two memtables, and since there is no
+ // conflict, another Get() will be made and fetch the data from
+ // DB. If it is in immutable memtable, two extra memtable reads
+ // will be issued. If it is not (in history), only one will
+ // be made, which is to the active memtable.
+ if (attempt == kAttemptHistoryMemtable) {
+ ASSERT_EQ(3, get_perf_context()->get_from_memtable_count);
+ } else {
+ ASSERT_EQ(attempt, kAttemptImmMemTable);
+ ASSERT_EQ(4, get_perf_context()->get_from_memtable_count);
+ }
+
+ Transaction* txn4 = db->BeginTransaction(write_options, txn_options);
+ ASSERT_TRUE(txn4 != nullptr);
+ ASSERT_OK(txn4->SetName("txn4"));
+ get_perf_context()->Reset();
+ ASSERT_OK(txn4->GetForUpdate(read_options, "foo", &value));
+ if (attempt == kAttemptHistoryMemtable) {
+ // Active memtable will be checked in snapshot validation and when
+ // getting the value.
+ ASSERT_EQ(2, get_perf_context()->get_from_memtable_count);
+ } else {
+ // Only the active memtable will be checked in snapshot validation but
+ // both the active and immutable memtables will be queried when
+ // getting the value.
+ ASSERT_EQ(attempt, kAttemptImmMemTable);
+ ASSERT_EQ(3, get_perf_context()->get_from_memtable_count);
+ }
+
+ ASSERT_OK(txn2->Commit());
+ ASSERT_OK(txn4->Commit());
+
+ TEST_SYNC_POINT("WritePreparedTransactionTest.CheckKeySkipOldMemtable");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ SetPerfLevel(PerfLevel::kDisable);
+
+ delete txn;
+ delete txn2;
+ delete txn3;
+ delete txn4;
+ delete txn_x;
+ }
+}
+
+// Reproduce the bug with two snapshots with the same sequence number and test
+// that the release of the first snapshot will not affect the reads by the other
+// snapshot
+TEST_P(WritePreparedTransactionTest, DoubleSnapshot) {
+ TransactionOptions txn_options;
+ Status s;
+
+ // Insert initial value
+ ASSERT_OK(db->Put(WriteOptions(), "key", "value1"));
+
+ WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+ Transaction* txn =
+ wp_db->BeginTransaction(WriteOptions(), txn_options, nullptr);
+ ASSERT_OK(txn->SetName("txn"));
+ ASSERT_OK(txn->Put("key", "value2"));
+ ASSERT_OK(txn->Prepare());
+ // Three snapshots with the same seq number
+ const Snapshot* snapshot0 = wp_db->GetSnapshot();
+ const Snapshot* snapshot1 = wp_db->GetSnapshot();
+ const Snapshot* snapshot2 = wp_db->GetSnapshot();
+ ASSERT_OK(txn->Commit());
+ SequenceNumber cache_size = wp_db->COMMIT_CACHE_SIZE;
+ SequenceNumber overlap_seq = txn->GetId() + cache_size;
+ delete txn;
+
+ // 4th snapshot with a larger seq
+ const Snapshot* snapshot3 = wp_db->GetSnapshot();
+ // Cause an eviction to advance max evicted seq number
+ // This also fetches the 4 snapshots from db since their seq is lower than the
+ // new max
+ wp_db->AddCommitted(overlap_seq, overlap_seq);
+
+ ReadOptions ropt;
+ // It should see the value before commit
+ ropt.snapshot = snapshot2;
+ PinnableSlice pinnable_val;
+ s = wp_db->Get(ropt, wp_db->DefaultColumnFamily(), "key", &pinnable_val);
+ ASSERT_OK(s);
+ ASSERT_TRUE(pinnable_val == "value1");
+ pinnable_val.Reset();
+
+ wp_db->ReleaseSnapshot(snapshot1);
+
+ // It should still see the value before commit
+ s = wp_db->Get(ropt, wp_db->DefaultColumnFamily(), "key", &pinnable_val);
+ ASSERT_OK(s);
+ ASSERT_TRUE(pinnable_val == "value1");
+ pinnable_val.Reset();
+
+ // Cause an eviction to advance max evicted seq number and trigger updating
+ // the snapshot list
+ overlap_seq += cache_size;
+ wp_db->AddCommitted(overlap_seq, overlap_seq);
+
+ // It should still see the value before commit
+ s = wp_db->Get(ropt, wp_db->DefaultColumnFamily(), "key", &pinnable_val);
+ ASSERT_OK(s);
+ ASSERT_TRUE(pinnable_val == "value1");
+ pinnable_val.Reset();
+
+ wp_db->ReleaseSnapshot(snapshot0);
+ wp_db->ReleaseSnapshot(snapshot2);
+ wp_db->ReleaseSnapshot(snapshot3);
+}
+
+size_t UniqueCnt(std::vector<SequenceNumber> vec) {
+ std::set<SequenceNumber> aset;
+ for (auto i : vec) {
+ aset.insert(i);
+ }
+ return aset.size();
+}
+// Test that the entries in old_commit_map_ get garbage collected properly
+TEST_P(WritePreparedTransactionTest, OldCommitMapGC) {
+ const size_t snapshot_cache_bits = 0;
+ const size_t commit_cache_bits = 0;
+ DBImpl* mock_db = new DBImpl(options, dbname);
+ UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+ std::unique_ptr<WritePreparedTxnDBMock> wp_db(
+ new WritePreparedTxnDBMock(mock_db, txn_db_options));
+
+ SequenceNumber seq = 0;
+ // Take the first snapshot that overlaps with two txns
+ auto prep_seq = ++seq;
+ wp_db->AddPrepared(prep_seq);
+ auto prep_seq2 = ++seq;
+ wp_db->AddPrepared(prep_seq2);
+ auto snap_seq1 = seq;
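+ // snap_seq1 equals the second prepare seq, so this snapshot overlaps both
+ // in-flight txns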
+ wp_db->TakeSnapshot(snap_seq1); + auto commit_seq = ++seq; + wp_db->AddCommitted(prep_seq, commit_seq); + wp_db->RemovePrepared(prep_seq); + auto commit_seq2 = ++seq; + wp_db->AddCommitted(prep_seq2, commit_seq2); + wp_db->RemovePrepared(prep_seq2); + // Take the 2nd and 3rd snapshot that overlap with the same txn + prep_seq = ++seq; + wp_db->AddPrepared(prep_seq); + auto snap_seq2 = seq; + wp_db->TakeSnapshot(snap_seq2); + seq++; + auto snap_seq3 = seq; + wp_db->TakeSnapshot(snap_seq3); + seq++; + commit_seq = ++seq; + wp_db->AddCommitted(prep_seq, commit_seq); + wp_db->RemovePrepared(prep_seq); + // Make sure max_evicted_seq_ will be larger than 2nd snapshot by evicting the + // only item in the commit_cache_ via another commit. + prep_seq = ++seq; + wp_db->AddPrepared(prep_seq); + commit_seq = ++seq; + wp_db->AddCommitted(prep_seq, commit_seq); + wp_db->RemovePrepared(prep_seq); + + // Verify that the evicted commit entries for all snapshots are in the + // old_commit_map_ + { + ASSERT_FALSE(wp_db->old_commit_map_empty_.load()); + ReadLock rl(&wp_db->old_commit_map_mutex_); + ASSERT_EQ(3, wp_db->old_commit_map_.size()); + ASSERT_EQ(2, UniqueCnt(wp_db->old_commit_map_[snap_seq1])); + ASSERT_EQ(1, UniqueCnt(wp_db->old_commit_map_[snap_seq2])); + ASSERT_EQ(1, UniqueCnt(wp_db->old_commit_map_[snap_seq3])); + } + + // Verify that the 2nd snapshot is cleaned up after the release + wp_db->ReleaseSnapshotInternal(snap_seq2); + { + ASSERT_FALSE(wp_db->old_commit_map_empty_.load()); + ReadLock rl(&wp_db->old_commit_map_mutex_); + ASSERT_EQ(2, wp_db->old_commit_map_.size()); + ASSERT_EQ(2, UniqueCnt(wp_db->old_commit_map_[snap_seq1])); + ASSERT_EQ(1, UniqueCnt(wp_db->old_commit_map_[snap_seq3])); + } + + // Verify that the 1st snapshot is cleaned up after the release + wp_db->ReleaseSnapshotInternal(snap_seq1); + { + ASSERT_FALSE(wp_db->old_commit_map_empty_.load()); + ReadLock rl(&wp_db->old_commit_map_mutex_); + ASSERT_EQ(1, wp_db->old_commit_map_.size()); + ASSERT_EQ(1, UniqueCnt(wp_db->old_commit_map_[snap_seq3])); + } + + // Verify that the 3rd snapshot is cleaned up after the release + wp_db->ReleaseSnapshotInternal(snap_seq3); + { + ASSERT_TRUE(wp_db->old_commit_map_empty_.load()); + ReadLock rl(&wp_db->old_commit_map_mutex_); + ASSERT_EQ(0, wp_db->old_commit_map_.size()); + } +} + +TEST_P(WritePreparedTransactionTest, CheckAgainstSnapshots) { + std::vector<SequenceNumber> snapshots = {100l, 200l, 300l, 400l, 500l, + 600l, 700l, 800l, 900l}; + const size_t snapshot_cache_bits = 2; + const uint64_t cache_size = 1ul << snapshot_cache_bits; + // Safety check to express the intended size in the test. Can be adjusted if + // the snapshots lists changed. + ASSERT_EQ((1ul << snapshot_cache_bits) * 2 + 1, snapshots.size()); + DBImpl* mock_db = new DBImpl(options, dbname); + UpdateTransactionDBOptions(snapshot_cache_bits); + std::unique_ptr<WritePreparedTxnDBMock> wp_db( + new WritePreparedTxnDBMock(mock_db, txn_db_options)); + SequenceNumber version = 1000l; + ASSERT_EQ(0, wp_db->snapshots_total_); + wp_db->UpdateSnapshots(snapshots, version); + ASSERT_EQ(snapshots.size(), wp_db->snapshots_total_); + // seq numbers are chosen so that we have two of them between each two + // snapshots. If the diff of two consecutive seq is more than 5, there is a + // snapshot between them. 
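+ // e.g. the pair (50, 55) spans no snapshot, while (55, 150) spans 100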
+ std::vector<SequenceNumber> seqs = {50l, 55l, 150l, 155l, 250l, 255l, 350l, + 355l, 450l, 455l, 550l, 555l, 650l, 655l, + 750l, 755l, 850l, 855l, 950l, 955l}; + ASSERT_GT(seqs.size(), 1); + for (size_t i = 0; i + 1 < seqs.size(); i++) { + wp_db->old_commit_map_empty_ = true; // reset + CommitEntry commit_entry = {seqs[i], seqs[i + 1]}; + wp_db->CheckAgainstSnapshots(commit_entry); + // Expect update if there is snapshot in between the prepare and commit + bool expect_update = commit_entry.commit_seq - commit_entry.prep_seq > 5 && + commit_entry.commit_seq >= snapshots.front() && + commit_entry.prep_seq <= snapshots.back(); + ASSERT_EQ(expect_update, !wp_db->old_commit_map_empty_); + } + + // Test that search will include multiple snapshot from snapshot cache + { + // exclude first and last item in the cache + CommitEntry commit_entry = {snapshots.front() + 1, + snapshots[cache_size - 1] - 1}; + wp_db->old_commit_map_empty_ = true; // reset + wp_db->old_commit_map_.clear(); + wp_db->CheckAgainstSnapshots(commit_entry); + ASSERT_EQ(wp_db->old_commit_map_.size(), cache_size - 2); + } + + // Test that search will include multiple snapshot from old snapshots + { + // include two in the middle + CommitEntry commit_entry = {snapshots[cache_size] + 1, + snapshots[cache_size + 2] + 1}; + wp_db->old_commit_map_empty_ = true; // reset + wp_db->old_commit_map_.clear(); + wp_db->CheckAgainstSnapshots(commit_entry); + ASSERT_EQ(wp_db->old_commit_map_.size(), 2); + } + + // Test that search will include both snapshot cache and old snapshots + // Case 1: includes all in snapshot cache + { + CommitEntry commit_entry = {snapshots.front() - 1, snapshots.back() + 1}; + wp_db->old_commit_map_empty_ = true; // reset + wp_db->old_commit_map_.clear(); + wp_db->CheckAgainstSnapshots(commit_entry); + ASSERT_EQ(wp_db->old_commit_map_.size(), snapshots.size()); + } + + // Case 2: includes all snapshot caches except the smallest + { + CommitEntry commit_entry = {snapshots.front() + 1, snapshots.back() + 1}; + wp_db->old_commit_map_empty_ = true; // reset + wp_db->old_commit_map_.clear(); + wp_db->CheckAgainstSnapshots(commit_entry); + ASSERT_EQ(wp_db->old_commit_map_.size(), snapshots.size() - 1); + } + + // Case 3: includes only the largest of snapshot cache + { + CommitEntry commit_entry = {snapshots[cache_size - 1] - 1, + snapshots.back() + 1}; + wp_db->old_commit_map_empty_ = true; // reset + wp_db->old_commit_map_.clear(); + wp_db->CheckAgainstSnapshots(commit_entry); + ASSERT_EQ(wp_db->old_commit_map_.size(), snapshots.size() - cache_size + 1); + } +} + +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +// Test that CheckAgainstSnapshots will not miss a live snapshot if it is run in +// parallel with UpdateSnapshots. +TEST_P(SnapshotConcurrentAccessTest, SnapshotConcurrentAccess) { + // We have a sync point in the method under test after checking each snapshot. + // If you increase the max number of snapshots in this test, more sync points + // in the methods must also be added. + const std::vector<SequenceNumber> snapshots = {10l, 20l, 30l, 40l, 50l, + 60l, 70l, 80l, 90l, 100l}; + const size_t snapshot_cache_bits = 2; + // Safety check to express the intended size in the test. Can be adjusted if + // the snapshots lists changed. + ASSERT_EQ((1ul << snapshot_cache_bits) * 2 + 2, snapshots.size()); + SequenceNumber version = 1000l; + // Choose the cache size so that the new snapshot list could replace all the + // existing items in the cache and also have some overflow. 
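+ // Here snapshot_cache_bits = 2 gives a 4-entry cache while the lists above
+ // hold up to 10 snapshots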
+ DBImpl* mock_db = new DBImpl(options, dbname);
+ UpdateTransactionDBOptions(snapshot_cache_bits);
+ std::unique_ptr<WritePreparedTxnDBMock> wp_db(
+ new WritePreparedTxnDBMock(mock_db, txn_db_options));
+ const size_t extra = 2;
+ size_t loop_id = 0;
+ // Add up to extra items that do not fit into the cache
+ for (size_t old_size = 1; old_size <= wp_db->SNAPSHOT_CACHE_SIZE + extra;
+ old_size++) {
+ const std::vector<SequenceNumber> old_snapshots(
+ snapshots.begin(), snapshots.begin() + old_size);
+
+ // Each member of the old snapshot list might or might not appear in the new
+ // list. We create a common_snapshots for each combination.
+ size_t new_comb_cnt = size_t(1) << old_size;
+ for (size_t new_comb = 0; new_comb < new_comb_cnt; new_comb++, loop_id++) {
+ if (loop_id % split_cnt_ != split_id_) continue;
+ printf("."); // To signal progress
+ fflush(stdout);
+ std::vector<SequenceNumber> common_snapshots;
+ for (size_t i = 0; i < old_snapshots.size(); i++) {
+ if (IsInCombination(i, new_comb)) {
+ common_snapshots.push_back(old_snapshots[i]);
+ }
+ }
+ // And add some new snapshots to the common list
+ for (size_t added_snapshots = 0;
+ added_snapshots <= snapshots.size() - old_snapshots.size();
+ added_snapshots++) {
+ std::vector<SequenceNumber> new_snapshots = common_snapshots;
+ for (size_t i = 0; i < added_snapshots; i++) {
+ new_snapshots.push_back(snapshots[old_snapshots.size() + i]);
+ }
+ for (auto it = common_snapshots.begin(); it != common_snapshots.end();
+ ++it) {
+ auto snapshot = *it;
+ // Create a commit entry that is around the snapshot and thus should
+ // not be discarded
+ CommitEntry entry = {static_cast<uint64_t>(snapshot - 1),
+ snapshot + 1};
+ // The critical part is when iterating the snapshot cache. Afterwards,
+ // we are operating under the lock
+ size_t a_range =
+ std::min(old_snapshots.size(), wp_db->SNAPSHOT_CACHE_SIZE) + 1;
+ size_t b_range =
+ std::min(new_snapshots.size(), wp_db->SNAPSHOT_CACHE_SIZE) + 1;
+ // Break each thread at two points
+ for (size_t a1 = 1; a1 <= a_range; a1++) {
+ for (size_t a2 = a1 + 1; a2 <= a_range; a2++) {
+ for (size_t b1 = 1; b1 <= b_range; b1++) {
+ for (size_t b2 = b1 + 1; b2 <= b_range; b2++) {
+ SnapshotConcurrentAccessTestInternal(
+ wp_db.get(), old_snapshots, new_snapshots, entry, version,
+ a1, a2, b1, b2);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ printf("\n");
+}
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+// This test clarifies the contract of the AdvanceMaxEvictedSeq method
+TEST_P(WritePreparedTransactionTest, AdvanceMaxEvictedSeqBasic) {
+ DBImpl* mock_db = new DBImpl(options, dbname);
+ std::unique_ptr<WritePreparedTxnDBMock> wp_db(
+ new WritePreparedTxnDBMock(mock_db, txn_db_options));
+
+ // 1. Set the initial values for max, prepared, and snapshots
+ SequenceNumber zero_max = 0l;
+ // Set the initial list of prepared txns
+ const std::vector<SequenceNumber> initial_prepared = {10, 30, 50, 100,
+ 150, 200, 250};
+ for (auto p : initial_prepared) {
+ wp_db->AddPrepared(p);
+ }
+ // This updates the max value and also sets old prepared
+ SequenceNumber init_max = 100;
+ wp_db->AdvanceMaxEvictedSeq(zero_max, init_max);
+ const std::vector<SequenceNumber> initial_snapshots = {20, 40};
+ wp_db->SetDBSnapshots(initial_snapshots);
+ // This will update the internal cache of snapshots from the DB
+ wp_db->UpdateSnapshots(initial_snapshots, init_max);
+
+ // 2. Invoke AdvanceMaxEvictedSeq
+ const std::vector<SequenceNumber> latest_snapshots = {20, 110, 220, 300};
+ wp_db->SetDBSnapshots(latest_snapshots);
+ SequenceNumber new_max = 200;
+ wp_db->AdvanceMaxEvictedSeq(init_max, new_max);
+
+ // 3. Verify that the state matches with AdvanceMaxEvictedSeq contract
+ // a. max should be updated to new_max
+ ASSERT_EQ(wp_db->max_evicted_seq_, new_max);
+ // b. delayed prepared should contain every txn <= max and prepared should
+ // only contain txns > max
+ auto it = initial_prepared.begin();
+ for (; it != initial_prepared.end() && *it <= new_max; ++it) {
+ ASSERT_EQ(1, wp_db->delayed_prepared_.erase(*it));
+ }
+ ASSERT_TRUE(wp_db->delayed_prepared_.empty());
+ for (; it != initial_prepared.end() && !wp_db->prepared_txns_.empty();
+ ++it, wp_db->prepared_txns_.pop()) {
+ ASSERT_EQ(*it, wp_db->prepared_txns_.top());
+ }
+ ASSERT_TRUE(it == initial_prepared.end());
+ ASSERT_TRUE(wp_db->prepared_txns_.empty());
+ // c. snapshots should contain everything below new_max
+ auto sit = latest_snapshots.begin();
+ for (size_t i = 0; sit != latest_snapshots.end() && *sit <= new_max &&
+ i < wp_db->snapshots_total_;
+ sit++, i++) {
+ ASSERT_TRUE(i < wp_db->snapshots_total_);
+ // This test is small in scale and the list of snapshots is assumed to be
+ // within the cache size limit. This is just a safety check to double-check
+ // that assumption.
+ ASSERT_TRUE(i < wp_db->SNAPSHOT_CACHE_SIZE);
+ ASSERT_EQ(*sit, wp_db->snapshot_cache_[i]);
+ }
+}
+
+// A new snapshot should always be larger than max_evicted_seq_
+// Otherwise the snapshot does not go through AdvanceMaxEvictedSeq
+TEST_P(WritePreparedTransactionTest, NewSnapshotLargerThanMax) {
+ WriteOptions woptions;
+ TransactionOptions txn_options;
+ WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+ Transaction* txn0 = db->BeginTransaction(woptions, txn_options);
+ ASSERT_OK(txn0->Put(Slice("key"), Slice("value")));
+ ASSERT_OK(txn0->Commit());
+ const SequenceNumber seq = txn0->GetId(); // is also prepare seq
+ delete txn0;
+ std::vector<Transaction*> txns;
+ // Inc seq without committing anything
+ for (int i = 0; i < 10; i++) {
+ Transaction* txn = db->BeginTransaction(woptions, txn_options);
+ ASSERT_OK(txn->SetName("xid" + std::to_string(i)));
+ ASSERT_OK(txn->Put(Slice("key" + std::to_string(i)), Slice("value")));
+ ASSERT_OK(txn->Prepare());
+ txns.push_back(txn);
+ }
+
+ // The new commit is seq + 10
+ ASSERT_OK(db->Put(woptions, "key", "value"));
+ auto snap = wp_db->GetSnapshot();
+ const SequenceNumber last_seq = snap->GetSequenceNumber();
+ wp_db->ReleaseSnapshot(snap);
+ ASSERT_LT(seq, last_seq);
+ // Otherwise our test is not effective
+ ASSERT_LT(last_seq - seq, wp_db->INC_STEP_FOR_MAX_EVICTED);
+
+ // Evict seq out of commit cache
+ const SequenceNumber overwrite_seq = seq + wp_db->COMMIT_CACHE_SIZE;
+ // Check that the next write could make max go beyond last
+ auto last_max = wp_db->max_evicted_seq_.load();
+ wp_db->AddCommitted(overwrite_seq, overwrite_seq);
+ // Check that eviction has advanced the max
+ ASSERT_LT(last_max, wp_db->max_evicted_seq_.load());
+ // Check that the new max has not advanced the last seq
+ ASSERT_LT(wp_db->max_evicted_seq_.load(), last_seq);
+ for (auto txn : txns) {
+ txn->Rollback();
+ delete txn;
+ }
+}
+
+// A new snapshot should always be larger than max_evicted_seq_
+// In very rare cases max could be below last published seq. Test that
+// taking snapshot will wait for max to catch up.
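+// The writer below evicts a commit entry on every write (the commit cache
+// has a single entry), and the evicted entries may not be published yet.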
+TEST_P(WritePreparedTransactionTest, MaxCatchupWithNewSnapshot) {
+  const size_t snapshot_cache_bits = 7;  // same as default
+  const size_t commit_cache_bits = 0;    // only 1 entry => frequent eviction
+  UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+  ASSERT_OK(ReOpen());
+  WriteOptions woptions;
+  WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+
+  const int writes = 50;
+  const int batch_cnt = 4;
+  ROCKSDB_NAMESPACE::port::Thread t1([&]() {
+    for (int i = 0; i < writes; i++) {
+      WriteBatch batch;
+      // The duplicate keys cause 4 commit entries per batch, each evicting an
+      // entry that is not yet published, thus causing max evicted seq to go
+      // higher than last published.
+      for (int b = 0; b < batch_cnt; b++) {
+        ASSERT_OK(batch.Put("foo", "foo"));
+      }
+      ASSERT_OK(db->Write(woptions, &batch));
+    }
+  });
+
+  ROCKSDB_NAMESPACE::port::Thread t2([&]() {
+    while (wp_db->max_evicted_seq_ == 0) {  // wait for insert thread
+      std::this_thread::yield();
+    }
+    for (int i = 0; i < 10; i++) {
+      SequenceNumber max_lower_bound = wp_db->max_evicted_seq_;
+      auto snap = db->GetSnapshot();
+      if (snap->GetSequenceNumber() != 0) {
+        // The value of max_evicted_seq_ when the snapshot was taken is
+        // unknown. We thus compare with the lower bound instead as an
+        // approximation.
+        ASSERT_LT(max_lower_bound, snap->GetSequenceNumber());
+      }  // seq 0 is ok to be less than max since nothing is visible to it
+      db->ReleaseSnapshot(snap);
+    }
+  });
+
+  t1.join();
+  t2.join();
+
+  // Make sure that the test has worked and the seq number has advanced as we
+  // expected
+  auto snap = db->GetSnapshot();
+  ASSERT_GT(snap->GetSequenceNumber(), batch_cnt * writes - 1);
+  db->ReleaseSnapshot(snap);
+}
+
+// Test that reads without snapshots would not hit an undefined state
+TEST_P(WritePreparedTransactionTest, MaxCatchupWithUnbackedSnapshot) {
+  const size_t snapshot_cache_bits = 7;  // same as default
+  const size_t commit_cache_bits = 0;    // only 1 entry => frequent eviction
+  UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+  ASSERT_OK(ReOpen());
+  WriteOptions woptions;
+  WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+
+  const int writes = 50;
+  ROCKSDB_NAMESPACE::port::Thread t1([&]() {
+    for (int i = 0; i < writes; i++) {
+      WriteBatch batch;
+      ASSERT_OK(batch.Put("key", "foo"));
+      ASSERT_OK(db->Write(woptions, &batch));
+    }
+  });
+
+  ROCKSDB_NAMESPACE::port::Thread t2([&]() {
+    while (wp_db->max_evicted_seq_ == 0) {  // wait for insert thread
+      std::this_thread::yield();
+    }
+    ReadOptions ropt;
+    PinnableSlice pinnable_val;
+    TransactionOptions txn_options;
+    for (int i = 0; i < 10; i++) {
+      auto s = db->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val);
+      ASSERT_TRUE(s.ok() || s.IsTryAgain());
+      pinnable_val.Reset();
+      Transaction* txn = db->BeginTransaction(woptions, txn_options);
+      s = txn->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val);
+      ASSERT_TRUE(s.ok() || s.IsTryAgain());
+      pinnable_val.Reset();
+      std::vector<std::string> values;
+      auto s_vec =
+          txn->MultiGet(ropt, {db->DefaultColumnFamily()}, {"key"}, &values);
+      ASSERT_EQ(1, values.size());
+      ASSERT_EQ(1, s_vec.size());
+      s = s_vec[0];
+      ASSERT_TRUE(s.ok() || s.IsTryAgain());
+      Slice key("key");
+      txn->MultiGet(ropt, db->DefaultColumnFamily(), 1, &key, &pinnable_val,
+                    &s, true);
+      ASSERT_TRUE(s.ok() || s.IsTryAgain());
+      delete txn;
+    }
+  });
+
+  t1.join();
+  t2.join();
+
+  // Make sure that the test has worked and the seq number has advanced as we
+  // expected
+  auto snap = db->GetSnapshot();
+  ASSERT_GT(snap->GetSequenceNumber(), writes - 1);
+  db->ReleaseSnapshot(snap);
+}
+
+// Check that old_commit_map_ cleanup works correctly if the snapshot equals
+// max_evicted_seq_.
+TEST_P(WritePreparedTransactionTest, CleanupSnapshotEqualToMax) {
+  const size_t snapshot_cache_bits = 7;  // same as default
+  const size_t commit_cache_bits = 0;    // only 1 entry => frequent eviction
+  UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+  ASSERT_OK(ReOpen());
+  WriteOptions woptions;
+  WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+  // Insert something to increase seq
+  ASSERT_OK(db->Put(woptions, "key", "value"));
+  auto snap = db->GetSnapshot();
+  auto snap_seq = snap->GetSequenceNumber();
+  // Another insert should trigger eviction + load snapshot from db
+  ASSERT_OK(db->Put(woptions, "key", "value"));
+  // This is the scenario that we check against
+  ASSERT_EQ(snap_seq, wp_db->max_evicted_seq_);
+  // old_commit_map_ now has some data that needs gc
+  ASSERT_EQ(1, wp_db->snapshots_total_);
+  ASSERT_EQ(1, wp_db->old_commit_map_.size());
+
+  db->ReleaseSnapshot(snap);
+
+  // Another insert should trigger eviction + load snapshot from db
+  ASSERT_OK(db->Put(woptions, "key", "value"));
+
+  // The snapshot and related metadata must be properly garbage collected
+  ASSERT_EQ(0, wp_db->snapshots_total_);
+  ASSERT_TRUE(wp_db->snapshots_all_.empty());
+  ASSERT_EQ(0, wp_db->old_commit_map_.size());
+}
+
+TEST_P(WritePreparedTransactionTest, AdvanceSeqByOne) {
+  auto snap = db->GetSnapshot();
+  auto seq1 = snap->GetSequenceNumber();
+  db->ReleaseSnapshot(snap);
+
+  WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+  wp_db->AdvanceSeqByOne();
+
+  snap = db->GetSnapshot();
+  auto seq2 = snap->GetSequenceNumber();
+  db->ReleaseSnapshot(snap);
+
+  ASSERT_LT(seq1, seq2);
+}
+
+// Test that the txn Initialize calls the overridden functions
+TEST_P(WritePreparedTransactionTest, TxnInitialize) {
+  TransactionOptions txn_options;
+  WriteOptions write_options;
+  ASSERT_OK(db->Put(write_options, "key", "value"));
+  Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_OK(txn0->SetName("xid"));
+  ASSERT_OK(txn0->Put(Slice("key"), Slice("value1")));
+  ASSERT_OK(txn0->Prepare());
+
+  // SetSnapshot is overridden to update min_uncommitted_
+  txn_options.set_snapshot = true;
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+  auto snap = txn1->GetSnapshot();
+  auto snap_impl = reinterpret_cast<const SnapshotImpl*>(snap);
+  // If ::Initialize calls the overridden SetSnapshot, min_uncommitted_ must
+  // be updated
+  ASSERT_GT(snap_impl->min_uncommitted_, kMinUnCommittedSeq);
+
+  ASSERT_OK(txn0->Rollback());
+  ASSERT_OK(txn1->Rollback());
+  delete txn0;
+  delete txn1;
+}
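+
+// [Editorial note] A minimal sketch, not part of the original change, of the
+// sub-batch bookkeeping the next test exercises. The names are hypothetical;
+// a std::set stands in for the PreparedHeap.
+namespace dup_key_sketch {
+// A write batch with duplicate keys is split into sub-batches, and each
+// sub-batch consumes its own sequence number on Prepare. Every one of those
+// numbers must be registered as prepared, not just the first.
+inline void AddPreparedSubBatches(std::set<uint64_t>& prepared,
+                                  uint64_t first_prepare_seq,
+                                  size_t sub_batch_cnt) {
+  for (size_t i = 0; i < sub_batch_cnt; i++) {
+    prepared.insert(first_prepare_seq + i);  // one entry per sub-batch
+  }
+}
+}  // namespace dup_key_sketch
+
+// This tests that transactions with duplicate keys perform correctly after
+// max advances past their prepared sequence numbers. This would not be the
+// case if, for example, the txn did not add the prepared seq for the second
+// sub-batch to the PreparedHeap structure.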
+TEST_P(WritePreparedTransactionTest, AdvanceMaxEvictedSeqWithDuplicates) {
+  const size_t snapshot_cache_bits = 7;  // same as default
+  const size_t commit_cache_bits = 1;    // disable commit cache
+  UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+  ASSERT_OK(ReOpen());
+
+  ReadOptions ropt;
+  PinnableSlice pinnable_val;
+  WriteOptions write_options;
+  TransactionOptions txn_options;
+  Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_OK(txn0->SetName("xid"));
+  ASSERT_OK(txn0->Put(Slice("key"), Slice("value1")));
+  ASSERT_OK(txn0->Put(Slice("key"), Slice("value2")));
+  ASSERT_OK(txn0->Prepare());
+
+  ASSERT_OK(db->Put(write_options, "key2", "value"));
+  // Will cause max to advance due to the disabled commit cache
+  ASSERT_OK(db->Put(write_options, "key3", "value"));
+
+  auto s = db->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val);
+  ASSERT_TRUE(s.IsNotFound());
+  delete txn0;
+
+  WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+  ASSERT_OK(wp_db->db_impl_->FlushWAL(true));
+  wp_db->TEST_Crash();
+  ASSERT_OK(ReOpenNoDelete());
+  ASSERT_NE(db, nullptr);
+  s = db->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val);
+  ASSERT_TRUE(s.IsNotFound());
+
+  txn0 = db->GetTransactionByName("xid");
+  ASSERT_OK(txn0->Rollback());
+  delete txn0;
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+// Stress SmallestUnCommittedSeq, which reads from both prepared_txns_ and
+// delayed_prepared_, when it runs concurrently with advancing
+// max_evicted_seq_, which moves prepared txns from prepared_txns_ to
+// delayed_prepared_.
+TEST_P(WritePreparedTransactionTest, SmallestUnCommittedSeq) {
+  const size_t snapshot_cache_bits = 7;  // same as default
+  const size_t commit_cache_bits = 1;    // disable commit cache
+  UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+  ASSERT_OK(ReOpen());
+  WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+  ReadOptions ropt;
+  PinnableSlice pinnable_val;
+  WriteOptions write_options;
+  TransactionOptions txn_options;
+  std::vector<Transaction*> txns, committed_txns;
+
+  const int cnt = 100;
+  for (int i = 0; i < cnt; i++) {
+    Transaction* txn = db->BeginTransaction(write_options, txn_options);
+    ASSERT_OK(txn->SetName("xid" + std::to_string(i)));
+    auto key = "key1" + std::to_string(i);
+    auto value = "value1" + std::to_string(i);
+    ASSERT_OK(txn->Put(Slice(key), Slice(value)));
+    ASSERT_OK(txn->Prepare());
+    txns.push_back(txn);
+  }
+
+  port::Mutex mutex;
+  Random rnd(1103);
+  ROCKSDB_NAMESPACE::port::Thread commit_thread([&]() {
+    for (int i = 0; i < cnt; i++) {
+      uint32_t index = rnd.Uniform(cnt - i);
+      Transaction* txn;
+      {
+        MutexLock l(&mutex);
+        txn = txns[index];
+        txns.erase(txns.begin() + index);
+      }
+      // Since the commit cache is practically disabled, a commit results in
+      // an immediate advance of max_evicted_seq_ and subsequently moves some
+      // prepared txns to delayed_prepared_.
+      ASSERT_OK(txn->Commit());
+      committed_txns.push_back(txn);
+    }
+  });
+  ROCKSDB_NAMESPACE::port::Thread read_thread([&]() {
+    while (1) {
+      MutexLock l(&mutex);
+      if (txns.empty()) {
+        break;
+      }
+      auto min_uncommitted = wp_db->SmallestUnCommittedSeq();
+      ASSERT_LE(min_uncommitted, (*txns.begin())->GetId());
+    }
+  });
+
+  commit_thread.join();
+  read_thread.join();
+  for (auto txn : committed_txns) {
+    delete txn;
+  }
+}
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_P(SeqAdvanceConcurrentTest, SeqAdvanceConcurrent) {
+  // Given the sequential run of txns, with this timeout we should never see a
+  // deadlock nor a timeout unless we have a key conflict, which should be
+  // almost infeasible.
+  txn_db_options.transaction_lock_timeout = 1000;
+  txn_db_options.default_lock_timeout = 1000;
+  ASSERT_OK(ReOpen());
+  FlushOptions fopt;
+
+  // Number of different txn types we use in this test
+  const size_t type_cnt = 5;
+  // The size of the first write group
+  // TODO(myabandeh): This should be increased for pre-release tests
+  const size_t first_group_size = 2;
+  // Total number of txns we run in each test
+  // TODO(myabandeh): This should be increased for pre-release tests
+  const size_t txn_cnt = first_group_size + 1;
+
+  size_t base[txn_cnt + 1] = {
+      1,
+  };
+  for (size_t bi = 1; bi <= txn_cnt; bi++) {
+    base[bi] = base[bi - 1] * type_cnt;
+  }
+  const size_t max_n = static_cast<size_t>(std::pow(type_cnt, txn_cnt));
+  printf("Number of cases being tested is %" ROCKSDB_PRIszt "\n", max_n);
+  for (size_t n = 0; n < max_n; n++) {
+    if (n > 0) {
+      ASSERT_OK(ReOpen());
+    }
+
+    if (n % split_cnt_ != split_id_) continue;
+    if (n % 1000 == 0) {
+      printf("Tested %" ROCKSDB_PRIszt " cases so far\n", n);
+    }
+    DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+    auto seq = db_impl->TEST_GetLastVisibleSequence();
+    with_empty_commits = 0;
+    exp_seq = seq;
+    // This is increased before writing the batch for commit
+    commit_writes = 0;
+    // This is increased before a txn starts linking if it expects to do a
+    // commit eventually
+    expected_commits = 0;
+    std::vector<port::Thread> threads;
+
+    linked.store(0, std::memory_order_release);
+    std::atomic<bool> batch_formed(false);
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "WriteThread::EnterAsBatchGroupLeader:End",
+        [&](void* /*arg*/) { batch_formed = true; });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "WriteThread::JoinBatchGroup:Wait", [&](void* /*arg*/) {
+          size_t orig_linked = linked.fetch_add(1, std::memory_order_acq_rel);
+          if (orig_linked == 0) {
+            // Wait until the others are linked too.
+            while (linked.load(std::memory_order_acquire) < first_group_size) {
+            }
+          } else if (orig_linked == first_group_size) {
+            // Make the 2nd batch of the rest of the writes plus any follow-up
+            // commits from the first batch
+            while (linked.load(std::memory_order_acquire) <
+                   txn_cnt + commit_writes) {
+            }
+          }
+          // Then we will have one or more batches consisting of follow-up
+          // commits from the 2nd batch. There is a bit of non-determinism
+          // here but it should be tolerable.
+ }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + for (size_t bi = 0; bi < txn_cnt; bi++) { + // get the bi-th digit in number system based on type_cnt + size_t d = (n % base[bi + 1]) / base[bi]; + switch (d) { + case 0: + threads.emplace_back(&TransactionTestBase::TestTxn0, this, bi); + break; + case 1: + threads.emplace_back(&TransactionTestBase::TestTxn1, this, bi); + break; + case 2: + threads.emplace_back(&TransactionTestBase::TestTxn2, this, bi); + break; + case 3: + threads.emplace_back(&TransactionTestBase::TestTxn3, this, bi); + break; + case 4: + threads.emplace_back(&TransactionTestBase::TestTxn3, this, bi); + break; + default: + FAIL(); + } + // wait to be linked + while (linked.load(std::memory_order_acquire) <= bi) { + } + // after a queue of size first_group_size + if (bi + 1 == first_group_size) { + while (!batch_formed) { + } + // to make it more deterministic, wait until the commits are linked + while (linked.load(std::memory_order_acquire) <= + bi + expected_commits) { + } + } + } + for (auto& t : threads) { + t.join(); + } + if (options.two_write_queues) { + // In this case none of the above scheduling tricks to deterministically + // form merged batches works because the writes go to separate queues. + // This would result in different write groups in each run of the test. We + // still keep the test since although non-deterministic and hard to debug, + // it is still useful to have. + // TODO(myabandeh): Add a deterministic unit test for two_write_queues + } + + // Check if memtable inserts advanced seq number as expected + seq = db_impl->TEST_GetLastVisibleSequence(); + ASSERT_EQ(exp_seq, seq); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Check if recovery preserves the last sequence number + ASSERT_OK(db_impl->FlushWAL(true)); + ASSERT_OK(ReOpenNoDelete()); + ASSERT_NE(db, nullptr); + db_impl = static_cast_with_check<DBImpl>(db->GetRootDB()); + seq = db_impl->TEST_GetLastVisibleSequence(); + ASSERT_LE(exp_seq, seq + with_empty_commits); + + // Check if flush preserves the last sequence number + ASSERT_OK(db_impl->Flush(fopt)); + seq = db_impl->GetLatestSequenceNumber(); + ASSERT_LE(exp_seq, seq + with_empty_commits); + + // Check if recovery after flush preserves the last sequence number + ASSERT_OK(db_impl->FlushWAL(true)); + ASSERT_OK(ReOpenNoDelete()); + ASSERT_NE(db, nullptr); + db_impl = static_cast_with_check<DBImpl>(db->GetRootDB()); + seq = db_impl->GetLatestSequenceNumber(); + ASSERT_LE(exp_seq, seq + with_empty_commits); + } +} + +// Run a couple of different txns among them some uncommitted. Restart the db at +// a couple points to check whether the list of uncommitted txns are recovered +// properly. 
+TEST_P(WritePreparedTransactionTest, BasicRecovery) { + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db); + + TestTxn0(0); + + TransactionOptions txn_options; + WriteOptions write_options; + size_t index = 1000; + Transaction* txn0 = db->BeginTransaction(write_options, txn_options); + auto istr0 = std::to_string(index); + auto s = txn0->SetName("xid" + istr0); + ASSERT_OK(s); + s = txn0->Put(Slice("foo0" + istr0), Slice("bar0" + istr0)); + ASSERT_OK(s); + s = txn0->Prepare(); + ASSERT_OK(s); + auto prep_seq_0 = txn0->GetId(); + + TestTxn1(0); + + index++; + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + auto istr1 = std::to_string(index); + s = txn1->SetName("xid" + istr1); + ASSERT_OK(s); + s = txn1->Put(Slice("foo1" + istr1), Slice("bar")); + ASSERT_OK(s); + s = txn1->Prepare(); + ASSERT_OK(s); + auto prep_seq_1 = txn1->GetId(); + + TestTxn2(0); + + ReadOptions ropt; + PinnableSlice pinnable_val; + // Check the value is not committed before restart + s = db->Get(ropt, db->DefaultColumnFamily(), "foo0" + istr0, &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + pinnable_val.Reset(); + + delete txn0; + delete txn1; + ASSERT_OK(wp_db->db_impl_->FlushWAL(true)); + wp_db->TEST_Crash(); + ASSERT_OK(ReOpenNoDelete()); + ASSERT_NE(db, nullptr); + wp_db = dynamic_cast<WritePreparedTxnDB*>(db); + // After recovery, all the uncommitted txns (0 and 1) should be inserted into + // delayed_prepared_ + ASSERT_TRUE(wp_db->prepared_txns_.empty()); + ASSERT_FALSE(wp_db->delayed_prepared_empty_); + ASSERT_LE(prep_seq_0, wp_db->max_evicted_seq_); + ASSERT_LE(prep_seq_1, wp_db->max_evicted_seq_); + { + ReadLock rl(&wp_db->prepared_mutex_); + ASSERT_EQ(2, wp_db->delayed_prepared_.size()); + ASSERT_TRUE(wp_db->delayed_prepared_.find(prep_seq_0) != + wp_db->delayed_prepared_.end()); + ASSERT_TRUE(wp_db->delayed_prepared_.find(prep_seq_1) != + wp_db->delayed_prepared_.end()); + } + + // Check the value is still not committed after restart + s = db->Get(ropt, db->DefaultColumnFamily(), "foo0" + istr0, &pinnable_val); + ASSERT_TRUE(s.IsNotFound()); + pinnable_val.Reset(); + + TestTxn3(0); + + // Test that a recovered txns will be properly marked committed for the next + // recovery + txn1 = db->GetTransactionByName("xid" + istr1); + ASSERT_NE(txn1, nullptr); + ASSERT_OK(txn1->Commit()); + delete txn1; + + index++; + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + auto istr2 = std::to_string(index); + s = txn2->SetName("xid" + istr2); + ASSERT_OK(s); + s = txn2->Put(Slice("foo2" + istr2), Slice("bar")); + ASSERT_OK(s); + s = txn2->Prepare(); + ASSERT_OK(s); + auto prep_seq_2 = txn2->GetId(); + + delete txn2; + ASSERT_OK(wp_db->db_impl_->FlushWAL(true)); + wp_db->TEST_Crash(); + ASSERT_OK(ReOpenNoDelete()); + ASSERT_NE(db, nullptr); + wp_db = dynamic_cast<WritePreparedTxnDB*>(db); + ASSERT_TRUE(wp_db->prepared_txns_.empty()); + ASSERT_FALSE(wp_db->delayed_prepared_empty_); + + // 0 and 2 are prepared and 1 is committed + { + ReadLock rl(&wp_db->prepared_mutex_); + ASSERT_EQ(2, wp_db->delayed_prepared_.size()); + const auto& end = wp_db->delayed_prepared_.end(); + ASSERT_NE(wp_db->delayed_prepared_.find(prep_seq_0), end); + ASSERT_EQ(wp_db->delayed_prepared_.find(prep_seq_1), end); + ASSERT_NE(wp_db->delayed_prepared_.find(prep_seq_2), end); + } + ASSERT_LE(prep_seq_0, wp_db->max_evicted_seq_); + ASSERT_LE(prep_seq_2, wp_db->max_evicted_seq_); + + // Commit all the remaining txns + txn0 = 
db->GetTransactionByName("xid" + istr0); + ASSERT_NE(txn0, nullptr); + ASSERT_OK(txn0->Commit()); + txn2 = db->GetTransactionByName("xid" + istr2); + ASSERT_NE(txn2, nullptr); + ASSERT_OK(txn2->Commit()); + + // Check the value is committed after commit + s = db->Get(ropt, db->DefaultColumnFamily(), "foo0" + istr0, &pinnable_val); + ASSERT_TRUE(s.ok()); + ASSERT_TRUE(pinnable_val == ("bar0" + istr0)); + pinnable_val.Reset(); + + delete txn0; + delete txn2; + ASSERT_OK(wp_db->db_impl_->FlushWAL(true)); + ASSERT_OK(ReOpenNoDelete()); + ASSERT_NE(db, nullptr); + wp_db = dynamic_cast<WritePreparedTxnDB*>(db); + ASSERT_TRUE(wp_db->prepared_txns_.empty()); + ASSERT_TRUE(wp_db->delayed_prepared_empty_); + + // Check the value is still committed after recovery + s = db->Get(ropt, db->DefaultColumnFamily(), "foo0" + istr0, &pinnable_val); + ASSERT_TRUE(s.ok()); + ASSERT_TRUE(pinnable_val == ("bar0" + istr0)); + pinnable_val.Reset(); +} + +// After recovery the commit map is empty while the max is set. The code would +// go through a different path which requires a separate test. Test that the +// committed data before the restart is visible to all snapshots. +TEST_P(WritePreparedTransactionTest, IsInSnapshotEmptyMap) { + for (bool end_with_prepare : {false, true}) { + ASSERT_OK(ReOpen()); + WriteOptions woptions; + ASSERT_OK(db->Put(woptions, "key", "value")); + ASSERT_OK(db->Put(woptions, "key", "value")); + ASSERT_OK(db->Put(woptions, "key", "value")); + SequenceNumber prepare_seq = kMaxSequenceNumber; + if (end_with_prepare) { + TransactionOptions txn_options; + Transaction* txn = db->BeginTransaction(woptions, txn_options); + ASSERT_OK(txn->SetName("xid0")); + ASSERT_OK(txn->Prepare()); + prepare_seq = txn->GetId(); + delete txn; + } + dynamic_cast<WritePreparedTxnDB*>(db)->TEST_Crash(); + auto db_impl = static_cast_with_check<DBImpl>(db->GetRootDB()); + ASSERT_OK(db_impl->FlushWAL(true)); + ASSERT_OK(ReOpenNoDelete()); + WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db); + ASSERT_NE(wp_db, nullptr); + ASSERT_GT(wp_db->max_evicted_seq_, 0); // max after recovery + // Take a snapshot right after recovery + const Snapshot* snap = db->GetSnapshot(); + auto snap_seq = snap->GetSequenceNumber(); + ASSERT_GT(snap_seq, 0); + + for (SequenceNumber seq = 0; + seq <= wp_db->max_evicted_seq_ && seq != prepare_seq; seq++) { + ASSERT_TRUE(wp_db->IsInSnapshot(seq, snap_seq)); + } + if (end_with_prepare) { + ASSERT_FALSE(wp_db->IsInSnapshot(prepare_seq, snap_seq)); + } + // trivial check + ASSERT_FALSE(wp_db->IsInSnapshot(snap_seq + 1, snap_seq)); + + db->ReleaseSnapshot(snap); + + ASSERT_OK(db->Put(woptions, "key", "value")); + // Take a snapshot after some writes + snap = db->GetSnapshot(); + snap_seq = snap->GetSequenceNumber(); + for (SequenceNumber seq = 0; + seq <= wp_db->max_evicted_seq_ && seq != prepare_seq; seq++) { + ASSERT_TRUE(wp_db->IsInSnapshot(seq, snap_seq)); + } + if (end_with_prepare) { + ASSERT_FALSE(wp_db->IsInSnapshot(prepare_seq, snap_seq)); + } + // trivial check + ASSERT_FALSE(wp_db->IsInSnapshot(snap_seq + 1, snap_seq)); + + db->ReleaseSnapshot(snap); + } +} + +// Shows the contract of IsInSnapshot when called on invalid/released snapshots +TEST_P(WritePreparedTransactionTest, IsInSnapshotReleased) { + WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db); + WriteOptions woptions; + ASSERT_OK(db->Put(woptions, "key", "value")); + // snap seq = 1 + const Snapshot* snap1 = db->GetSnapshot(); + ASSERT_OK(db->Put(woptions, "key", "value")); + 
ASSERT_OK(db->Put(woptions, "key", "value"));
+  // snap seq = 3
+  const Snapshot* snap2 = db->GetSnapshot();
+  const SequenceNumber seq = 1;
+  // Evict seq out of commit cache
+  size_t overwrite_seq = wp_db->COMMIT_CACHE_SIZE + seq;
+  wp_db->AddCommitted(overwrite_seq, overwrite_seq);
+  SequenceNumber snap_seq;
+  uint64_t min_uncommitted = kMinUnCommittedSeq;
+  bool released;
+
+  released = false;
+  snap_seq = snap1->GetSequenceNumber();
+  ASSERT_LE(seq, snap_seq);
+  // Valid snapshot lower than max
+  ASSERT_LE(snap_seq, wp_db->max_evicted_seq_);
+  ASSERT_TRUE(wp_db->IsInSnapshot(seq, snap_seq, min_uncommitted, &released));
+  ASSERT_FALSE(released);
+
+  released = false;
+  snap_seq = snap1->GetSequenceNumber();
+  // Invalid snapshot lower than max
+  ASSERT_LE(snap_seq + 1, wp_db->max_evicted_seq_);
+  ASSERT_TRUE(
+      wp_db->IsInSnapshot(seq, snap_seq + 1, min_uncommitted, &released));
+  ASSERT_TRUE(released);
+
+  db->ReleaseSnapshot(snap1);
+
+  released = false;
+  // Released snapshot lower than max
+  ASSERT_TRUE(wp_db->IsInSnapshot(seq, snap_seq, min_uncommitted, &released));
+  // The release does not take effect until the next max advance
+  ASSERT_FALSE(released);
+
+  released = false;
+  // Invalid snapshot lower than max
+  ASSERT_TRUE(
+      wp_db->IsInSnapshot(seq, snap_seq + 1, min_uncommitted, &released));
+  ASSERT_TRUE(released);
+
+  // This makes the snapshot release reflect in the txn db structures
+  wp_db->AdvanceMaxEvictedSeq(wp_db->max_evicted_seq_,
+                              wp_db->max_evicted_seq_ + 1);
+
+  released = false;
+  // Released snapshot lower than max
+  ASSERT_TRUE(wp_db->IsInSnapshot(seq, snap_seq, min_uncommitted, &released));
+  ASSERT_TRUE(released);
+
+  released = false;
+  // Invalid snapshot lower than max
+  ASSERT_TRUE(
+      wp_db->IsInSnapshot(seq, snap_seq + 1, min_uncommitted, &released));
+  ASSERT_TRUE(released);
+
+  snap_seq = snap2->GetSequenceNumber();
+
+  released = false;
+  // Unreleased snapshot lower than max
+  ASSERT_TRUE(wp_db->IsInSnapshot(seq, snap_seq, min_uncommitted, &released));
+  ASSERT_FALSE(released);
+
+  db->ReleaseSnapshot(snap2);
+}
+
+// Test WritePreparedTxnDB's IsInSnapshot against different orderings of
+// snapshot, max_committed_seq_, prepared, and commit entries.
+TEST_P(WritePreparedTransactionTest, IsInSnapshot) {
+  WriteOptions wo;
+  // Use a small commit cache to trigger lots of eviction and fast advance of
+  // max_evicted_seq_
+  const size_t commit_cache_bits = 3;
+  // Same for the snapshot cache size
+  const size_t snapshot_cache_bits = 2;
+
+  // Take some preliminary snapshots first. This is to stress the data
+  // structure that holds the old snapshots, as it is designed to be efficient
+  // when only a few snapshots are below the max_evicted_seq_.
+  for (int max_snapshots = 1; max_snapshots < 20; max_snapshots++) {
+    // Leave some gap between the preliminary snapshots and the final snapshot
+    // that we check. This also tests different overlapping scenarios between
+    // the last snapshot and the commits.
+    for (int max_gap = 1; max_gap < 10; max_gap++) {
+      // Since we do not actually write to the db, we mock the seq as it would
+      // be increased by the db. The only exception is that we need the db seq
+      // to advance for our snapshots, for which we apply a dummy put each
+      // time we increase our mock of seq.
+      uint64_t seq = 0;
+      // At each step we prepare a txn and then we commit it in the next txn.
+      // This emulates consecutive transactions that write to the same key
+      uint64_t cur_txn = 0;
+      // Number of snapshots taken so far
+      int num_snapshots = 0;
+      // Number of gaps applied so far
+      int gap_cnt = 0;
+      // The final snapshot that we will inspect
+      uint64_t snapshot = 0;
+      bool found_committed = false;
+      // To stress the data structure that maintains the prepared txns, at
+      // each cycle we add a new prepared txn. These are not meant to be
+      // committed for snapshot inspection.
+      std::set<uint64_t> prepared;
+      // We keep the list of txns committed before we take the last snapshot.
+      // These should be the only seq numbers that will be found in the
+      // snapshot
+      std::set<uint64_t> committed_before;
+      // The set of commit seq numbers to be excluded from IsInSnapshot
+      // queries
+      std::set<uint64_t> commit_seqs;
+      DBImpl* mock_db = new DBImpl(options, dbname);
+      UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+      std::unique_ptr<WritePreparedTxnDBMock> wp_db(
+          new WritePreparedTxnDBMock(mock_db, txn_db_options));
+      // We continue until max advances a bit beyond the snapshot.
+      while (!snapshot || wp_db->max_evicted_seq_ < snapshot + 100) {
+        // do prepare for a transaction
+        seq++;
+        wp_db->AddPrepared(seq);
+        prepared.insert(seq);
+
+        // If cur_txn is not started, do prepare for it.
+        if (!cur_txn) {
+          seq++;
+          cur_txn = seq;
+          wp_db->AddPrepared(cur_txn);
+        } else {  // else commit it
+          seq++;
+          wp_db->AddCommitted(cur_txn, seq);
+          wp_db->RemovePrepared(cur_txn);
+          commit_seqs.insert(seq);
+          if (!snapshot) {
+            committed_before.insert(cur_txn);
+          }
+          cur_txn = 0;
+        }
+
+        if (num_snapshots < max_snapshots - 1) {
+          // Take preliminary snapshots
+          wp_db->TakeSnapshot(seq);
+          num_snapshots++;
+        } else if (gap_cnt < max_gap) {
+          // Wait for some gap before taking the final snapshot
+          gap_cnt++;
+        } else if (!snapshot) {
+          // Take the final snapshot if it is not already taken
+          snapshot = seq;
+          wp_db->TakeSnapshot(snapshot);
+          num_snapshots++;
+        }
+
+        // If the snapshot is taken, verify seq numbers visible to it. We redo
+        // it at each cycle to test that the system is still sound when
+        // max_evicted_seq_ advances.
+        if (snapshot) {
+          for (uint64_t s = 1;
+               s <= seq && commit_seqs.find(s) == commit_seqs.end(); s++) {
+            bool was_committed =
+                (committed_before.find(s) != committed_before.end());
+            bool is_in_snapshot = wp_db->IsInSnapshot(s, snapshot);
+            if (was_committed != is_in_snapshot) {
+              printf("max_snapshots %d max_gap %d seq %" PRIu64 " max %" PRIu64
+                     " snapshot %" PRIu64
+                     " gap_cnt %d num_snapshots %d s %" PRIu64 "\n",
+                     max_snapshots, max_gap, seq,
+                     wp_db->max_evicted_seq_.load(), snapshot, gap_cnt,
+                     num_snapshots, s);
+            }
+            ASSERT_EQ(was_committed, is_in_snapshot);
+            found_committed = found_committed || is_in_snapshot;
+          }
+        }
+      }
+      // Safety check to make sure the test actually ran
+      ASSERT_TRUE(found_committed);
+      // As an extra check, check if the prepared set will be properly empty
+      // after they are committed.
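+      // (The AddCommitted + RemovePrepared calls below mirror what a real
+      // commit does, so both prepared_txns_ and delayed_prepared_ are
+      // expected to drain to empty.)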
+ if (cur_txn) { + wp_db->AddCommitted(cur_txn, seq); + wp_db->RemovePrepared(cur_txn); + } + for (auto p : prepared) { + wp_db->AddCommitted(p, seq); + wp_db->RemovePrepared(p); + } + ASSERT_TRUE(wp_db->delayed_prepared_.empty()); + ASSERT_TRUE(wp_db->prepared_txns_.empty()); + } + } +} + +void ASSERT_SAME(ReadOptions roptions, TransactionDB* db, Status exp_s, + PinnableSlice& exp_v, Slice key) { + Status s; + PinnableSlice v; + s = db->Get(roptions, db->DefaultColumnFamily(), key, &v); + ASSERT_EQ(exp_s, s); + ASSERT_TRUE(s.ok() || s.IsNotFound()); + if (s.ok()) { + ASSERT_TRUE(exp_v == v); + } + + // Try with MultiGet API too + std::vector<std::string> values; + auto s_vec = + db->MultiGet(roptions, {db->DefaultColumnFamily()}, {key}, &values); + ASSERT_EQ(1, values.size()); + ASSERT_EQ(1, s_vec.size()); + s = s_vec[0]; + ASSERT_EQ(exp_s, s); + ASSERT_TRUE(s.ok() || s.IsNotFound()); + if (s.ok()) { + ASSERT_TRUE(exp_v == values[0]); + } +} + +void ASSERT_SAME(TransactionDB* db, Status exp_s, PinnableSlice& exp_v, + Slice key) { + ASSERT_SAME(ReadOptions(), db, exp_s, exp_v, key); +} + +TEST_P(WritePreparedTransactionTest, Rollback) { + ReadOptions roptions; + WriteOptions woptions; + TransactionOptions txn_options; + const size_t num_keys = 4; + const size_t num_values = 5; + for (size_t ikey = 1; ikey <= num_keys; ikey++) { + for (size_t ivalue = 0; ivalue < num_values; ivalue++) { + for (bool crash : {false, true}) { + ASSERT_OK(ReOpen()); + WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db); + std::string key_str = "key" + std::to_string(ikey); + switch (ivalue) { + case 0: + break; + case 1: + ASSERT_OK(db->Put(woptions, key_str, "initvalue1")); + break; + case 2: + ASSERT_OK(db->Merge(woptions, key_str, "initvalue2")); + break; + case 3: + ASSERT_OK(db->Delete(woptions, key_str)); + break; + case 4: + ASSERT_OK(db->SingleDelete(woptions, key_str)); + break; + default: + FAIL(); + } + + PinnableSlice v1; + auto s1 = + db->Get(roptions, db->DefaultColumnFamily(), Slice("key1"), &v1); + PinnableSlice v2; + auto s2 = + db->Get(roptions, db->DefaultColumnFamily(), Slice("key2"), &v2); + PinnableSlice v3; + auto s3 = + db->Get(roptions, db->DefaultColumnFamily(), Slice("key3"), &v3); + PinnableSlice v4; + auto s4 = + db->Get(roptions, db->DefaultColumnFamily(), Slice("key4"), &v4); + Transaction* txn = db->BeginTransaction(woptions, txn_options); + auto s = txn->SetName("xid0"); + ASSERT_OK(s); + s = txn->Put(Slice("key1"), Slice("value1")); + ASSERT_OK(s); + s = txn->Merge(Slice("key2"), Slice("value2")); + ASSERT_OK(s); + s = txn->Delete(Slice("key3")); + ASSERT_OK(s); + s = txn->SingleDelete(Slice("key4")); + ASSERT_OK(s); + s = txn->Prepare(); + ASSERT_OK(s); + + { + ReadLock rl(&wp_db->prepared_mutex_); + ASSERT_FALSE(wp_db->prepared_txns_.empty()); + ASSERT_EQ(txn->GetId(), wp_db->prepared_txns_.top()); + } + + ASSERT_SAME(db, s1, v1, "key1"); + ASSERT_SAME(db, s2, v2, "key2"); + ASSERT_SAME(db, s3, v3, "key3"); + ASSERT_SAME(db, s4, v4, "key4"); + + if (crash) { + delete txn; + auto db_impl = static_cast_with_check<DBImpl>(db->GetRootDB()); + ASSERT_OK(db_impl->FlushWAL(true)); + dynamic_cast<WritePreparedTxnDB*>(db)->TEST_Crash(); + ASSERT_OK(ReOpenNoDelete()); + ASSERT_NE(db, nullptr); + wp_db = dynamic_cast<WritePreparedTxnDB*>(db); + txn = db->GetTransactionByName("xid0"); + ASSERT_FALSE(wp_db->delayed_prepared_empty_); + ReadLock rl(&wp_db->prepared_mutex_); + ASSERT_TRUE(wp_db->prepared_txns_.empty()); + ASSERT_FALSE(wp_db->delayed_prepared_.empty()); + 
ASSERT_TRUE(wp_db->delayed_prepared_.find(txn->GetId()) != + wp_db->delayed_prepared_.end()); + } + + ASSERT_SAME(db, s1, v1, "key1"); + ASSERT_SAME(db, s2, v2, "key2"); + ASSERT_SAME(db, s3, v3, "key3"); + ASSERT_SAME(db, s4, v4, "key4"); + + s = txn->Rollback(); + ASSERT_OK(s); + + { + ASSERT_TRUE(wp_db->delayed_prepared_empty_); + ReadLock rl(&wp_db->prepared_mutex_); + ASSERT_TRUE(wp_db->prepared_txns_.empty()); + ASSERT_TRUE(wp_db->delayed_prepared_.empty()); + } + + ASSERT_SAME(db, s1, v1, "key1"); + ASSERT_SAME(db, s2, v2, "key2"); + ASSERT_SAME(db, s3, v3, "key3"); + ASSERT_SAME(db, s4, v4, "key4"); + delete txn; + } + } + } +} + +TEST_P(WritePreparedTransactionTest, DisableGCDuringRecovery) { + // Use large buffer to avoid memtable flush after 1024 insertions + options.write_buffer_size = 1024 * 1024; + ASSERT_OK(ReOpen()); + std::vector<KeyVersion> versions; + uint64_t seq = 0; + for (uint64_t i = 1; i <= 1024; i++) { + std::string v = "bar" + std::to_string(i); + ASSERT_OK(db->Put(WriteOptions(), "foo", v)); + VerifyKeys({{"foo", v}}); + seq++; // one for the key/value + KeyVersion kv = {"foo", v, seq, kTypeValue}; + if (options.two_write_queues) { + seq++; // one for the commit + } + versions.emplace_back(kv); + } + std::reverse(std::begin(versions), std::end(versions)); + VerifyInternalKeys(versions); + DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB()); + ASSERT_OK(db_impl->FlushWAL(true)); + // Use small buffer to ensure memtable flush during recovery + options.write_buffer_size = 1024; + ASSERT_OK(ReOpenNoDelete()); + VerifyInternalKeys(versions); +} + +TEST_P(WritePreparedTransactionTest, SequenceNumberZero) { + ASSERT_OK(db->Put(WriteOptions(), "foo", "bar")); + VerifyKeys({{"foo", "bar"}}); + const Snapshot* snapshot = db->GetSnapshot(); + ASSERT_OK(db->Flush(FlushOptions())); + // Dummy keys to avoid compaction trivially move files and get around actual + // compaction logic. + ASSERT_OK(db->Put(WriteOptions(), "a", "dummy")); + ASSERT_OK(db->Put(WriteOptions(), "z", "dummy")); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + // Compaction will output keys with sequence number 0, if it is visible to + // earliest snapshot. Make sure IsInSnapshot() report sequence number 0 is + // visible to any snapshot. + VerifyKeys({{"foo", "bar"}}); + VerifyKeys({{"foo", "bar"}}, snapshot); + VerifyInternalKeys({{"foo", "bar", 0, kTypeValue}}); + db->ReleaseSnapshot(snapshot); +} + +// Compaction should not remove a key if it is not committed, and should +// proceed with older versions of the key as-if the new version doesn't exist. +TEST_P(WritePreparedTransactionTest, CompactionShouldKeepUncommittedKeys) { + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB()); + // Snapshots to avoid keys get evicted. + std::vector<const Snapshot*> snapshots; + // Keep track of expected sequence number. + SequenceNumber expected_seq = 0; + + auto add_key = [&](std::function<Status()> func) { + ASSERT_OK(func()); + expected_seq++; + if (options.two_write_queues) { + expected_seq++; // 1 for commit + } + ASSERT_EQ(expected_seq, db_impl->TEST_GetLastVisibleSequence()); + snapshots.push_back(db->GetSnapshot()); + }; + + // Each key here represent a standalone test case. 
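+  // (Each add_key call below bumps expected_seq, accounting for the extra
+  // commit entry under two_write_queues, and takes a snapshot so the written
+  // version is pinned until the compaction below runs.)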
+ add_key([&]() { return db->Put(WriteOptions(), "key1", "value1_1"); }); + add_key([&]() { return db->Put(WriteOptions(), "key2", "value2_1"); }); + add_key([&]() { return db->Put(WriteOptions(), "key3", "value3_1"); }); + add_key([&]() { return db->Put(WriteOptions(), "key4", "value4_1"); }); + add_key([&]() { return db->Merge(WriteOptions(), "key5", "value5_1"); }); + add_key([&]() { return db->Merge(WriteOptions(), "key5", "value5_2"); }); + add_key([&]() { return db->Put(WriteOptions(), "key6", "value6_1"); }); + add_key([&]() { return db->Put(WriteOptions(), "key7", "value7_1"); }); + ASSERT_OK(db->Flush(FlushOptions())); + add_key([&]() { return db->Delete(WriteOptions(), "key6"); }); + add_key([&]() { return db->SingleDelete(WriteOptions(), "key7"); }); + + auto* transaction = db->BeginTransaction(WriteOptions()); + ASSERT_OK(transaction->SetName("txn")); + ASSERT_OK(transaction->Put("key1", "value1_2")); + ASSERT_OK(transaction->Delete("key2")); + ASSERT_OK(transaction->SingleDelete("key3")); + ASSERT_OK(transaction->Merge("key4", "value4_2")); + ASSERT_OK(transaction->Merge("key5", "value5_3")); + ASSERT_OK(transaction->Put("key6", "value6_2")); + ASSERT_OK(transaction->Put("key7", "value7_2")); + // Prepare but not commit. + ASSERT_OK(transaction->Prepare()); + ASSERT_EQ(++expected_seq, db->GetLatestSequenceNumber()); + ASSERT_OK(db->Flush(FlushOptions())); + for (auto* s : snapshots) { + db->ReleaseSnapshot(s); + } + // Dummy keys to avoid compaction trivially move files and get around actual + // compaction logic. + ASSERT_OK(db->Put(WriteOptions(), "a", "dummy")); + ASSERT_OK(db->Put(WriteOptions(), "z", "dummy")); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + VerifyKeys({ + {"key1", "value1_1"}, + {"key2", "value2_1"}, + {"key3", "value3_1"}, + {"key4", "value4_1"}, + {"key5", "value5_1,value5_2"}, + {"key6", "NOT_FOUND"}, + {"key7", "NOT_FOUND"}, + }); + VerifyInternalKeys({ + {"key1", "value1_2", expected_seq, kTypeValue}, + {"key1", "value1_1", 0, kTypeValue}, + {"key2", "", expected_seq, kTypeDeletion}, + {"key2", "value2_1", 0, kTypeValue}, + {"key3", "", expected_seq, kTypeSingleDeletion}, + {"key3", "value3_1", 0, kTypeValue}, + {"key4", "value4_2", expected_seq, kTypeMerge}, + {"key4", "value4_1", 0, kTypeValue}, + {"key5", "value5_3", expected_seq, kTypeMerge}, + {"key5", "value5_1,value5_2", 0, kTypeValue}, + {"key6", "value6_2", expected_seq, kTypeValue}, + {"key7", "value7_2", expected_seq, kTypeValue}, + }); + ASSERT_OK(transaction->Commit()); + VerifyKeys({ + {"key1", "value1_2"}, + {"key2", "NOT_FOUND"}, + {"key3", "NOT_FOUND"}, + {"key4", "value4_1,value4_2"}, + {"key5", "value5_1,value5_2,value5_3"}, + {"key6", "value6_2"}, + {"key7", "value7_2"}, + }); + delete transaction; +} + +// Compaction should keep keys visible to a snapshot based on commit sequence, +// not just prepare sequence. +TEST_P(WritePreparedTransactionTest, CompactionShouldKeepSnapshotVisibleKeys) { + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + // Keep track of expected sequence number. 
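+  // (Under two_write_queues each write consumes two sequence numbers, one
+  // for the write itself and one for the commit marker, hence the extra
+  // expected_seq++ below when that option is set.)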
+  SequenceNumber expected_seq = 0;
+  auto* txn1 = db->BeginTransaction(WriteOptions());
+  ASSERT_OK(txn1->SetName("txn1"));
+  ASSERT_OK(txn1->Put("key1", "value1_1"));
+  ASSERT_OK(txn1->Prepare());
+  ASSERT_EQ(++expected_seq, db->GetLatestSequenceNumber());
+  ASSERT_OK(txn1->Commit());
+  DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+  ASSERT_EQ(++expected_seq, db_impl->TEST_GetLastVisibleSequence());
+  delete txn1;
+  // Take a snapshot to avoid keys getting evicted before compaction.
+  const Snapshot* snapshot1 = db->GetSnapshot();
+  auto* txn2 = db->BeginTransaction(WriteOptions());
+  ASSERT_OK(txn2->SetName("txn2"));
+  ASSERT_OK(txn2->Put("key2", "value2_1"));
+  ASSERT_OK(txn2->Prepare());
+  ASSERT_EQ(++expected_seq, db->GetLatestSequenceNumber());
+  // txn1 commits before snapshot2, so it is visible to snapshot2.
+  // txn2 commits after snapshot2, so it is not visible.
+  const Snapshot* snapshot2 = db->GetSnapshot();
+  ASSERT_OK(txn2->Commit());
+  ASSERT_EQ(++expected_seq, db_impl->TEST_GetLastVisibleSequence());
+  delete txn2;
+  // Take a snapshot to avoid keys getting evicted before compaction.
+  const Snapshot* snapshot3 = db->GetSnapshot();
+  ASSERT_OK(db->Put(WriteOptions(), "key1", "value1_2"));
+  expected_seq++;  // 1 for write
+  SequenceNumber seq1 = expected_seq;
+  if (options.two_write_queues) {
+    expected_seq++;  // 1 for commit
+  }
+  ASSERT_EQ(expected_seq, db_impl->TEST_GetLastVisibleSequence());
+  ASSERT_OK(db->Put(WriteOptions(), "key2", "value2_2"));
+  expected_seq++;  // 1 for write
+  SequenceNumber seq2 = expected_seq;
+  if (options.two_write_queues) {
+    expected_seq++;  // 1 for commit
+  }
+  ASSERT_EQ(expected_seq, db_impl->TEST_GetLastVisibleSequence());
+  ASSERT_OK(db->Flush(FlushOptions()));
+  db->ReleaseSnapshot(snapshot1);
+  db->ReleaseSnapshot(snapshot3);
+  // Dummy keys to avoid compaction trivially moving files and to get around
+  // the actual compaction logic.
+  ASSERT_OK(db->Put(WriteOptions(), "a", "dummy"));
+  ASSERT_OK(db->Put(WriteOptions(), "z", "dummy"));
+  ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  VerifyKeys({{"key1", "value1_2"}, {"key2", "value2_2"}});
+  VerifyKeys({{"key1", "value1_1"}, {"key2", "NOT_FOUND"}}, snapshot2);
+  VerifyInternalKeys({
+      {"key1", "value1_2", seq1, kTypeValue},
+      // "value1_1" is visible to snapshot2. Also keys at the bottom level
+      // visible to the earliest snapshot are output with seq = 0.
+      {"key1", "value1_1", 0, kTypeValue},
+      {"key2", "value2_2", seq2, kTypeValue},
+  });
+  db->ReleaseSnapshot(snapshot2);
+}
+
+TEST_P(WritePreparedTransactionTest, SmallestUncommittedOptimization) {
+  const size_t snapshot_cache_bits = 7;  // same as default
+  const size_t commit_cache_bits = 0;    // disable commit cache
+  for (bool has_recent_prepare : {true, false}) {
+    UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+    ASSERT_OK(ReOpen());
+
+    ASSERT_OK(db->Put(WriteOptions(), "key1", "value1"));
+    auto* transaction =
+        db->BeginTransaction(WriteOptions(), TransactionOptions(), nullptr);
+    ASSERT_OK(transaction->SetName("txn"));
+    ASSERT_OK(transaction->Delete("key1"));
+    ASSERT_OK(transaction->Prepare());
+    // snapshot1 should get min_uncommitted from the prepared_txns_ heap.
+    auto snapshot1 = db->GetSnapshot();
+    ASSERT_EQ(transaction->GetId(),
+              ((SnapshotImpl*)snapshot1)->min_uncommitted_);
+    // Add a commit to advance max_evicted_seq and move the prepared
+    // transaction into the delayed_prepared_ set.
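+    // (With commit_cache_bits == 0 the commit cache holds a single entry, so
+    // each committed write immediately evicts the previous entry and bumps
+    // max_evicted_seq_.)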
+    ASSERT_OK(db->Put(WriteOptions(), "key2", "value2"));
+    Transaction* txn2 = nullptr;
+    if (has_recent_prepare) {
+      txn2 =
+          db->BeginTransaction(WriteOptions(), TransactionOptions(), nullptr);
+      ASSERT_OK(txn2->SetName("txn2"));
+      ASSERT_OK(txn2->Put("key3", "value3"));
+      ASSERT_OK(txn2->Prepare());
+    }
+    // snapshot2 should get min_uncommitted from the delayed_prepared_ set.
+    auto snapshot2 = db->GetSnapshot();
+    ASSERT_EQ(transaction->GetId(),
+              ((SnapshotImpl*)snapshot2)->min_uncommitted_);
+    ASSERT_OK(transaction->Commit());
+    delete transaction;
+    if (has_recent_prepare) {
+      ASSERT_OK(txn2->Commit());
+      delete txn2;
+    }
+    VerifyKeys({{"key1", "NOT_FOUND"}});
+    VerifyKeys({{"key1", "value1"}}, snapshot1);
+    VerifyKeys({{"key1", "value1"}}, snapshot2);
+    db->ReleaseSnapshot(snapshot1);
+    db->ReleaseSnapshot(snapshot2);
+  }
+}
+
+// Insert two values, v1 and v2, for a key. Between prepare and commit of v2
+// take two snapshots, s1 and s2. Release s1 during compaction.
+// Test to make sure compaction doesn't get confused and think s1 can see both
+// values, and thus compact out the older value by mistake.
+TEST_P(WritePreparedTransactionTest, ReleaseSnapshotDuringCompaction) {
+  const size_t snapshot_cache_bits = 7;  // same as default
+  const size_t commit_cache_bits = 0;    // minimum commit cache
+  UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+  options.disable_auto_compactions = true;
+  ASSERT_OK(ReOpen());
+
+  ASSERT_OK(db->Put(WriteOptions(), "key1", "value1_1"));
+  auto* transaction =
+      db->BeginTransaction(WriteOptions(), TransactionOptions(), nullptr);
+  ASSERT_OK(transaction->SetName("txn"));
+  ASSERT_OK(transaction->Put("key1", "value1_2"));
+  ASSERT_OK(transaction->Prepare());
+  auto snapshot1 = db->GetSnapshot();
+  // Increment sequence number.
+  ASSERT_OK(db->Put(WriteOptions(), "key2", "value2"));
+  auto snapshot2 = db->GetSnapshot();
+  ASSERT_OK(transaction->Commit());
+  delete transaction;
+  VerifyKeys({{"key1", "value1_2"}});
+  VerifyKeys({{"key1", "value1_1"}}, snapshot1);
+  VerifyKeys({{"key1", "value1_1"}}, snapshot2);
+  // Add a flush to keep compaction from falling back to a trivial move.
+
+  // The callback might be called twice; record the calling state to
+  // prevent a double call.
+  bool callback_finished = false;
+  auto callback = [&](void*) {
+    if (callback_finished) {
+      return;
+    }
+    // Release snapshot1 after CompactionIterator init.
+    // CompactionIterator needs to figure out that the earliest snapshot
+    // that can see key1:value1_2 is kMaxSequenceNumber, not
+    // snapshot1 or snapshot2.
+    db->ReleaseSnapshot(snapshot1);
+    // Add some keys to advance max_evicted_seq.
+    ASSERT_OK(db->Put(WriteOptions(), "key3", "value3"));
+    ASSERT_OK(db->Put(WriteOptions(), "key4", "value4"));
+    callback_finished = true;
+  };
+  SyncPoint::GetInstance()->SetCallBack("CompactionIterator:AfterInit",
+                                        callback);
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(db->Flush(FlushOptions()));
+  VerifyKeys({{"key1", "value1_2"}});
+  VerifyKeys({{"key1", "value1_1"}}, snapshot2);
+  db->ReleaseSnapshot(snapshot2);
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
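+
+// [Editorial note] A minimal sketch, not part of the original change, of the
+// visibility rule the compaction tests around here depend on. The helper
+// name is hypothetical; sequence numbers refer to commit seqs.
+namespace visibility_sketch {
+// An older version of a key must be kept iff some live snapshot sees it but
+// not the newer version, i.e. the two versions straddle that snapshot.
+inline bool MustKeepOlderVersion(const std::vector<uint64_t>& snapshots,
+                                 uint64_t old_commit_seq,
+                                 uint64_t new_commit_seq) {
+  for (uint64_t snap : snapshots) {
+    if (old_commit_seq <= snap && snap < new_commit_seq) {
+      return true;  // this snapshot sees the old version only
+    }
+  }
+  return false;  // safe to compact the older version out
+}
+}  // namespace visibility_sketch
+
+// Insert two values, v1 and v2, for a key. Take two snapshots, s1 and s2,
+// after committing v2. Release s1 during compaction, right after compaction
+// processes v2 and before it processes v1. Test to make sure compaction
+// doesn't get confused, believe v1 and v2 are visible to different snapshots
+// (v1 by s2, v2 by s1), and refuse to compact out v1.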
+TEST_P(WritePreparedTransactionTest, ReleaseSnapshotDuringCompaction2) {
+  const size_t snapshot_cache_bits = 7;  // same as default
+  const size_t commit_cache_bits = 0;    // minimum commit cache
+  UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+  options.disable_auto_compactions = true;
+  ASSERT_OK(ReOpen());
+
+  ASSERT_OK(db->Put(WriteOptions(), "key1", "value1"));
+  ASSERT_OK(db->Put(WriteOptions(), "key1", "value2"));
+  SequenceNumber v2_seq = db->GetLatestSequenceNumber();
+  auto* s1 = db->GetSnapshot();
+  // Advance sequence number.
+  ASSERT_OK(db->Put(WriteOptions(), "key2", "dummy"));
+  auto* s2 = db->GetSnapshot();
+
+  int count_value = 0;
+  auto callback = [&](void* arg) {
+    auto* ikey = reinterpret_cast<ParsedInternalKey*>(arg);
+    if (ikey->user_key == "key1") {
+      count_value++;
+      if (count_value == 2) {
+        // Processing v1.
+        db->ReleaseSnapshot(s1);
+        // Add some keys to advance max_evicted_seq and update
+        // old_commit_map.
+        ASSERT_OK(db->Put(WriteOptions(), "key3", "dummy"));
+        ASSERT_OK(db->Put(WriteOptions(), "key4", "dummy"));
+      }
+    }
+  };
+  SyncPoint::GetInstance()->SetCallBack("CompactionIterator:ProcessKV",
+                                        callback);
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(db->Flush(FlushOptions()));
+  // value1 should be compacted out.
+  VerifyInternalKeys({{"key1", "value2", v2_seq, kTypeValue}});
+
+  // cleanup
+  db->ReleaseSnapshot(s2);
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// Insert two values, v1 and v2, for a key. Insert another dummy key so as to
+// evict the commit cache entry for v2, while v1 is still in the commit cache.
+// Take two snapshots, s1 and s2. Release s1 during compaction.
+// Since the commit cache entry for v2 is evicted, and old_commit_map does
+// not have s1 (it is released),
+// TODO(myabandeh): how can we be sure that v2's commit info is evicted
+// (and not v1's)? Instead of putting a dummy, we can directly call
+// AddCommitted(v2_seq + cache_size, ...) to evict v2's entry from the commit
+// cache.
+TEST_P(WritePreparedTransactionTest, ReleaseSnapshotDuringCompaction3) {
+  const size_t snapshot_cache_bits = 7;  // same as default
+  const size_t commit_cache_bits = 1;    // commit cache size = 2
+  UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+  options.disable_auto_compactions = true;
+  ASSERT_OK(ReOpen());
+
+  // Add a dummy key to evict the v2 commit cache entry, while keeping the v1
+  // entry. It also advances max_evicted_seq and can trigger old_commit_map
+  // cleanup.
+  auto add_dummy = [&]() {
+    auto* txn_dummy =
+        db->BeginTransaction(WriteOptions(), TransactionOptions(), nullptr);
+    ASSERT_OK(txn_dummy->SetName("txn_dummy"));
+    ASSERT_OK(txn_dummy->Put("dummy", "dummy"));
+    ASSERT_OK(txn_dummy->Prepare());
+    ASSERT_OK(txn_dummy->Commit());
+    delete txn_dummy;
+  };
+
+  ASSERT_OK(db->Put(WriteOptions(), "key1", "value1"));
+  auto* txn =
+      db->BeginTransaction(WriteOptions(), TransactionOptions(), nullptr);
+  ASSERT_OK(txn->SetName("txn"));
+  ASSERT_OK(txn->Put("key1", "value2"));
+  ASSERT_OK(txn->Prepare());
+  // TODO(myabandeh): replace it with GetId()?
+  auto v2_seq = db->GetLatestSequenceNumber();
+  ASSERT_OK(txn->Commit());
+  delete txn;
+  auto* s1 = db->GetSnapshot();
+  // Dummy key to advance the sequence number.
+  add_dummy();
+  auto* s2 = db->GetSnapshot();
+
+  // The callback might be called twice; record the calling state to
+  // prevent a double call.
+  bool callback_finished = false;
+  auto callback = [&](void*) {
+    if (callback_finished) {
+      return;
+    }
+    db->ReleaseSnapshot(s1);
+    // Add some dummy entries to trigger s1 being cleaned up from
+    // old_commit_map.
+    add_dummy();
+    add_dummy();
+    callback_finished = true;
+  };
+  SyncPoint::GetInstance()->SetCallBack("CompactionIterator:AfterInit",
+                                        callback);
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(db->Flush(FlushOptions()));
+  // value1 should be compacted out.
+  VerifyInternalKeys({{"key1", "value2", v2_seq, kTypeValue}});
+
+  db->ReleaseSnapshot(s2);
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(WritePreparedTransactionTest, ReleaseEarliestSnapshotDuringCompaction) {
+  const size_t snapshot_cache_bits = 7;  // same as default
+  const size_t commit_cache_bits = 0;    // minimum commit cache
+  UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+  options.disable_auto_compactions = true;
+  ASSERT_OK(ReOpen());
+
+  ASSERT_OK(db->Put(WriteOptions(), "key1", "value1"));
+  SequenceNumber put_seq = db->GetLatestSequenceNumber();
+  auto* transaction =
+      db->BeginTransaction(WriteOptions(), TransactionOptions(), nullptr);
+  ASSERT_OK(transaction->SetName("txn"));
+  ASSERT_OK(transaction->Delete("key1"));
+  ASSERT_OK(transaction->Prepare());
+  SequenceNumber del_seq = db->GetLatestSequenceNumber();
+  auto snapshot1 = db->GetSnapshot();
+  // Increment sequence number.
+  ASSERT_OK(db->Put(WriteOptions(), "key2", "value2"));
+  auto snapshot2 = db->GetSnapshot();
+  ASSERT_OK(transaction->Commit());
+  delete transaction;
+  VerifyKeys({{"key1", "NOT_FOUND"}});
+  VerifyKeys({{"key1", "value1"}}, snapshot1);
+  VerifyKeys({{"key1", "value1"}}, snapshot2);
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  auto callback = [&](void* compaction) {
+    // Release snapshot1 after CompactionIterator init.
+    // CompactionIterator needs to double-check and find out that snapshot2 is
+    // now the earliest existing snapshot.
+    if (compaction != nullptr) {
+      db->ReleaseSnapshot(snapshot1);
+      // Add some keys to advance max_evicted_seq.
+      ASSERT_OK(db->Put(WriteOptions(), "key3", "value3"));
+      ASSERT_OK(db->Put(WriteOptions(), "key4", "value4"));
+    }
+  };
+  SyncPoint::GetInstance()->SetCallBack("CompactionIterator:AfterInit",
+                                        callback);
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Dummy keys to avoid compaction trivially moving files and to get around
+  // the actual compaction logic.
+  ASSERT_OK(db->Put(WriteOptions(), "a", "dummy"));
+  ASSERT_OK(db->Put(WriteOptions(), "z", "dummy"));
+  ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  // Only verify key1. Both the put and the delete for the key should be kept.
+  // Since the delete tombstone is not visible to snapshot2, we need to keep
+  // at least one version of the key for the write-conflict check.
+ VerifyInternalKeys({{"key1", "", del_seq, kTypeDeletion}, + {"key1", "value1", put_seq, kTypeValue}}); + db->ReleaseSnapshot(snapshot2); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(WritePreparedTransactionTest, + ReleaseEarliestSnapshotDuringCompaction_WithSD) { + constexpr size_t kSnapshotCacheBits = 7; // same as default + constexpr size_t kCommitCacheBits = 0; // minimum commit cache + UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits); + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + + ASSERT_OK(db->Put(WriteOptions(), "key", "value")); + ASSERT_OK(db->Put(WriteOptions(), "foo", "value")); + ASSERT_OK(db->Flush(FlushOptions())); + + auto* txn = db->BeginTransaction(WriteOptions(), TransactionOptions(), + /*old_txn=*/nullptr); + ASSERT_OK(txn->SingleDelete("key")); + ASSERT_OK(txn->Put("wow", "value")); + ASSERT_OK(txn->SetName("txn")); + ASSERT_OK(txn->Prepare()); + ASSERT_OK(db->Flush(FlushOptions())); + + const bool two_write_queues = std::get<1>(GetParam()); + if (two_write_queues) { + // In the case of two queues, commit another txn just to bump + // last_published_seq so that a subsequent GetSnapshot() call can return + // a snapshot with higher sequence. + auto* dummy_txn = db->BeginTransaction(WriteOptions(), TransactionOptions(), + /*old_txn=*/nullptr); + ASSERT_OK(dummy_txn->Put("haha", "value")); + ASSERT_OK(dummy_txn->Commit()); + delete dummy_txn; + } + auto* snapshot = db->GetSnapshot(); + + ASSERT_OK(txn->Commit()); + delete txn; + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::NextFromInput:SingleDelete:1", [&](void* arg) { + if (!arg) { + return; + } + db->ReleaseSnapshot(snapshot); + + // Advance max_evicted_seq + ASSERT_OK(db->Put(WriteOptions(), "bar", "value")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(WritePreparedTransactionTest, + ReleaseEarliestSnapshotDuringCompaction_WithSD2) { + constexpr size_t kSnapshotCacheBits = 7; // same as default + constexpr size_t kCommitCacheBits = 0; // minimum commit cache + UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits); + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + + ASSERT_OK(db->Put(WriteOptions(), "foo", "value")); + ASSERT_OK(db->Put(WriteOptions(), "key", "value")); + ASSERT_OK(db->Flush(FlushOptions())); + + auto* txn = db->BeginTransaction(WriteOptions(), TransactionOptions(), + /*old_txn=*/nullptr); + ASSERT_OK(txn->Put("bar", "value")); + ASSERT_OK(txn->SingleDelete("key")); + ASSERT_OK(txn->SetName("txn")); + ASSERT_OK(txn->Prepare()); + ASSERT_OK(db->Flush(FlushOptions())); + + ASSERT_OK(txn->Commit()); + delete txn; + + ASSERT_OK(db->Put(WriteOptions(), "haha", "value")); + + // Create a dummy transaction to take a snapshot for ww-conflict detection. 
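+  // (With TransactionOptions::set_snapshot the txn pins a snapshot at Begin,
+  // and that snapshot is what the write-write conflict check reads from.)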
+ TransactionOptions txn_opts; + txn_opts.set_snapshot = true; + auto* dummy_txn = + db->BeginTransaction(WriteOptions(), txn_opts, /*old_txn=*/nullptr); + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::NextFromInput:SingleDelete:2", [&](void* /*arg*/) { + ASSERT_OK(dummy_txn->Rollback()); + delete dummy_txn; + + ASSERT_OK(db->Put(WriteOptions(), "dontcare", "value")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db->Put(WriteOptions(), "haha2", "value")); + auto* snapshot = db->GetSnapshot(); + + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + db->ReleaseSnapshot(snapshot); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(WritePreparedTransactionTest, + ReleaseEarliestSnapshotDuringCompaction_WithDelete) { + constexpr size_t kSnapshotCacheBits = 7; // same as default + constexpr size_t kCommitCacheBits = 0; // minimum commit cache + UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits); + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + + ASSERT_OK(db->Put(WriteOptions(), "a", "value")); + ASSERT_OK(db->Put(WriteOptions(), "b", "value")); + ASSERT_OK(db->Put(WriteOptions(), "c", "value")); + ASSERT_OK(db->Flush(FlushOptions())); + + auto* txn = db->BeginTransaction(WriteOptions(), TransactionOptions(), + /*old_txn=*/nullptr); + ASSERT_OK(txn->Delete("b")); + ASSERT_OK(txn->SetName("txn")); + ASSERT_OK(txn->Prepare()); + + const bool two_write_queues = std::get<1>(GetParam()); + if (two_write_queues) { + // In the case of two queues, commit another txn just to bump + // last_published_seq so that a subsequent GetSnapshot() call can return + // a snapshot with higher sequence. + auto* dummy_txn = db->BeginTransaction(WriteOptions(), TransactionOptions(), + /*old_txn=*/nullptr); + ASSERT_OK(dummy_txn->Put("haha", "value")); + ASSERT_OK(dummy_txn->Commit()); + delete dummy_txn; + } + auto* snapshot1 = db->GetSnapshot(); + ASSERT_OK(txn->Commit()); + delete txn; + auto* snapshot2 = db->GetSnapshot(); + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::NextFromInput:BottommostDelete:1", [&](void* arg) { + if (!arg) { + return; + } + db->ReleaseSnapshot(snapshot1); + + // Advance max_evicted_seq + ASSERT_OK(db->Put(WriteOptions(), "dummy1", "value")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + db->ReleaseSnapshot(snapshot2); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(WritePreparedTransactionTest, + ReleaseSnapshotBetweenSDAndPutDuringCompaction) { + constexpr size_t kSnapshotCacheBits = 7; // same as default + constexpr size_t kCommitCacheBits = 0; // minimum commit cache + UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits); + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + + // Create a dummy transaction to take a snapshot for ww-conflict detection. 
+ TransactionOptions txn_opts; + txn_opts.set_snapshot = true; + auto* dummy_txn = + db->BeginTransaction(WriteOptions(), txn_opts, /*old_txn=*/nullptr); + // Increment seq + ASSERT_OK(db->Put(WriteOptions(), "bar", "value")); + + ASSERT_OK(db->Put(WriteOptions(), "foo", "value")); + ASSERT_OK(db->SingleDelete(WriteOptions(), "foo")); + auto* snapshot1 = db->GetSnapshot(); + // Increment seq + ASSERT_OK(db->Put(WriteOptions(), "dontcare", "value")); + auto* snapshot2 = db->GetSnapshot(); + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::NextFromInput:KeepSDForWW", [&](void* /*arg*/) { + db->ReleaseSnapshot(snapshot1); + + ASSERT_OK(db->Put(WriteOptions(), "dontcare2", "value2")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db->Flush(FlushOptions())); + db->ReleaseSnapshot(snapshot2); + ASSERT_OK(dummy_txn->Commit()); + delete dummy_txn; + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(WritePreparedTransactionTest, + ReleaseEarliestWriteConflictSnapshot_SingleDelete) { + constexpr size_t kSnapshotCacheBits = 7; // same as default + constexpr size_t kCommitCacheBits = 0; // minimum commit cache + UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits); + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + + ASSERT_OK(db->Put(WriteOptions(), "a", "value")); + ASSERT_OK(db->Put(WriteOptions(), "b", "value")); + ASSERT_OK(db->Put(WriteOptions(), "c", "value")); + ASSERT_OK(db->Flush(FlushOptions())); + + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + ASSERT_OK(db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr)); + } + + std::unique_ptr<Transaction> txn; + txn.reset(db->BeginTransaction(WriteOptions(), TransactionOptions(), + /*old_txn=*/nullptr)); + ASSERT_OK(txn->SetName("txn1")); + ASSERT_OK(txn->SingleDelete("b")); + ASSERT_OK(txn->Prepare()); + ASSERT_OK(txn->Commit()); + + auto* snapshot1 = db->GetSnapshot(); + + // Bump seq of the db by performing writes so that + // earliest_snapshot_ < earliest_write_conflict_snapshot_ in + // CompactionIterator. + ASSERT_OK(db->Put(WriteOptions(), "z", "dontcare")); + + // Create another snapshot for write conflict checking + std::unique_ptr<Transaction> txn2; + { + TransactionOptions txn_opts; + txn_opts.set_snapshot = true; + txn2.reset( + db->BeginTransaction(WriteOptions(), txn_opts, /*old_txn=*/nullptr)); + } + + // Bump seq so that the subsequent bg flush won't create a snapshot with the + // same seq as the previous snapshot for conflict checking. + ASSERT_OK(db->Put(WriteOptions(), "y", "dont")); + + ASSERT_OK(db->Flush(FlushOptions())); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::NextFromInput:SingleDelete:1", [&](void* /*arg*/) { + // Rolling back txn2 should release its snapshot(for ww checking). 
+ ASSERT_OK(txn2->Rollback()); + txn2.reset(); + // Advance max_evicted_seq + ASSERT_OK(db->Put(WriteOptions(), "x", "value")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + db->ReleaseSnapshot(snapshot1); +} + +TEST_P(WritePreparedTransactionTest, ReleaseEarliestSnapshotAfterSeqZeroing) { + constexpr size_t kSnapshotCacheBits = 7; // same as default + constexpr size_t kCommitCacheBits = 0; // minimum commit cache + UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits); + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + + ASSERT_OK(db->Put(WriteOptions(), "a", "value")); + ASSERT_OK(db->Put(WriteOptions(), "b", "value")); + ASSERT_OK(db->Put(WriteOptions(), "c", "value")); + ASSERT_OK(db->Flush(FlushOptions())); + + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + ASSERT_OK(db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr)); + } + + ASSERT_OK(db->SingleDelete(WriteOptions(), "b")); + + // Take a snapshot so that the SD won't be dropped during flush. + auto* tmp_snapshot = db->GetSnapshot(); + + ASSERT_OK(db->Put(WriteOptions(), "b", "value2")); + auto* snapshot = db->GetSnapshot(); + ASSERT_OK(db->Flush(FlushOptions())); + + db->ReleaseSnapshot(tmp_snapshot); + + // Bump the sequence so that the below bg compaction job's snapshot will be + // different from snapshot's sequence. + ASSERT_OK(db->Put(WriteOptions(), "z", "foo")); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::PrepareOutput:ZeroingSeq", [&](void* arg) { + const auto* const ikey = + reinterpret_cast<const ParsedInternalKey*>(arg); + assert(ikey); + if (ikey->user_key == "b") { + assert(ikey->type == kTypeValue); + db->ReleaseSnapshot(snapshot); + + // Bump max_evicted_seq. + ASSERT_OK(db->Put(WriteOptions(), "z", "dontcare")); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(WritePreparedTransactionTest, ReleaseEarliestSnapshotAfterSeqZeroing2) { + constexpr size_t kSnapshotCacheBits = 7; // same as default + constexpr size_t kCommitCacheBits = 0; // minimum commit cache + UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits); + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + + // Generate an L0 with only SD for one key "b". + ASSERT_OK(db->Put(WriteOptions(), "a", "value")); + ASSERT_OK(db->Put(WriteOptions(), "b", "value")); + // Take a snapshot so that subsequent flush outputs the SD for "b". 
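+  // (Without a live snapshot between the Put and the SingleDelete, the flush
+  // job's compaction iterator may cancel the SD and the Put it covers against
+  // each other and emit neither; the snapshot pins both so the SD reaches L0.)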
+ auto* tmp_snapshot = db->GetSnapshot(); + ASSERT_OK(db->SingleDelete(WriteOptions(), "b")); + ASSERT_OK(db->Put(WriteOptions(), "c", "value")); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::NextFromInput:SingleDelete:3", [&](void* arg) { + if (!arg) { + db->ReleaseSnapshot(tmp_snapshot); + // Bump max_evicted_seq + ASSERT_OK(db->Put(WriteOptions(), "x", "dontcare")); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db->Flush(FlushOptions())); + // Finish generating L0 with only SD for "b". + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Move the L0 to L2. + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + ASSERT_OK(db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr)); + } + + ASSERT_OK(db->Put(WriteOptions(), "b", "value1")); + + auto* snapshot = db->GetSnapshot(); + + // Bump seq so that a subsequent flush/compaction job's snapshot is larger + // than the above snapshot's seq. + ASSERT_OK(db->Put(WriteOptions(), "x", "dontcare")); + + // Generate a second L0. + ASSERT_OK(db->Flush(FlushOptions())); + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::PrepareOutput:ZeroingSeq", [&](void* arg) { + const auto* const ikey = + reinterpret_cast<const ParsedInternalKey*>(arg); + assert(ikey); + if (ikey->user_key == "b") { + assert(ikey->type == kTypeValue); + db->ReleaseSnapshot(snapshot); + + // Bump max_evicted_seq. + ASSERT_OK(db->Put(WriteOptions(), "z", "dontcare")); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +// Although the user-contract indicates that a SD can only be issued for a key +// that exists and has not been overwritten, it is still possible for a Delete +// to be present when write-prepared transaction is rolled back. +TEST_P(WritePreparedTransactionTest, SingleDeleteAfterRollback) { + constexpr size_t kSnapshotCacheBits = 7; // same as default + constexpr size_t kCommitCacheBits = 0; // minimum commit cache + txn_db_options.rollback_deletion_type_callback = + [](TransactionDB*, ColumnFamilyHandle*, const Slice&) { return true; }; + UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits); + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + + // Get a write conflict snapshot by creating a transaction with + // set_snapshot=true. + TransactionOptions txn_opts; + txn_opts.set_snapshot = true; + std::unique_ptr<Transaction> dummy_txn( + db->BeginTransaction(WriteOptions(), txn_opts)); + + std::unique_ptr<Transaction> txn0( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + ASSERT_OK(txn0->Put("foo", "value")); + ASSERT_OK(txn0->SetName("xid0")); + ASSERT_OK(txn0->Prepare()); + + // Create an SST with only {"foo": "value"}. + ASSERT_OK(db->Flush(FlushOptions())); + + // Insert a Delete to cancel out the prior Put by txn0. + ASSERT_OK(txn0->Rollback()); + txn0.reset(); + + // Create a second SST. 
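+  // (The rollback above consults rollback_deletion_type_callback -- see
+  // RollbackWriteBatchBuilder::Rollback later in this patch. Since the
+  // callback returns true here, the deletion written to cancel the prepared
+  // Put is a SingleDelete.)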
+ ASSERT_OK(db->Flush(FlushOptions())); + + ASSERT_OK(db->Put(WriteOptions(), "foo", "value1")); + + auto* snapshot = db->GetSnapshot(); + + ASSERT_OK(db->SingleDelete(WriteOptions(), "foo")); + + int count = 0; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::NextFromInput:SingleDelete:1", [&](void* arg) { + const auto* const c = reinterpret_cast<const Compaction*>(arg); + assert(!c); + // Trigger once only for SingleDelete during flush. + if (0 == count) { + ++count; + db->ReleaseSnapshot(snapshot); + // Bump max_evicted_seq + ASSERT_OK(db->Put(WriteOptions(), "x", "dontcare")); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + // Create a third SST containing a SD without its matching PUT. + ASSERT_OK(db->Flush(FlushOptions())); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->EnableProcessing(); + + DBImpl* dbimpl = static_cast_with_check<DBImpl>(db->GetRootDB()); + assert(dbimpl); + ASSERT_OK(dbimpl->TEST_CompactRange( + /*level=*/0, /*begin=*/nullptr, /*end=*/nullptr, + /*column_family=*/nullptr, /*disallow_trivial_mode=*/true)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Release the conflict-checking snapshot. + ASSERT_OK(dummy_txn->Rollback()); +} + +// A more complex test to verify compaction/flush should keep keys visible +// to snapshots. +TEST_P(WritePreparedTransactionTest, + CompactionKeepSnapshotVisibleKeysRandomized) { + constexpr size_t kNumTransactions = 10; + constexpr size_t kNumIterations = 1000; + + std::vector<Transaction*> transactions(kNumTransactions, nullptr); + std::vector<size_t> versions(kNumTransactions, 0); + std::unordered_map<std::string, std::string> current_data; + std::vector<const Snapshot*> snapshots; + std::vector<std::unordered_map<std::string, std::string>> snapshot_data; + + Random rnd(1103); + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + + for (size_t i = 0; i < kNumTransactions; i++) { + std::string key = "key" + std::to_string(i); + std::string value = "value0"; + ASSERT_OK(db->Put(WriteOptions(), key, value)); + current_data[key] = value; + } + VerifyKeys(current_data); + + for (size_t iter = 0; iter < kNumIterations; iter++) { + auto r = rnd.Next() % (kNumTransactions + 1); + if (r < kNumTransactions) { + std::string key = "key" + std::to_string(r); + if (transactions[r] == nullptr) { + std::string value = "value" + std::to_string(versions[r] + 1); + auto* txn = db->BeginTransaction(WriteOptions()); + ASSERT_OK(txn->SetName("txn" + std::to_string(r))); + ASSERT_OK(txn->Put(key, value)); + ASSERT_OK(txn->Prepare()); + transactions[r] = txn; + } else { + std::string value = "value" + std::to_string(++versions[r]); + ASSERT_OK(transactions[r]->Commit()); + delete transactions[r]; + transactions[r] = nullptr; + current_data[key] = value; + } + } else { + auto* snapshot = db->GetSnapshot(); + VerifyKeys(current_data, snapshot); + snapshots.push_back(snapshot); + snapshot_data.push_back(current_data); + } + VerifyKeys(current_data); + } + // Take a last snapshot to test compaction with uncommitted prepared + // transaction. 
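+  // (At this point some transactions[i] may still be in the prepared state;
+  // the snapshot taken next must not see their writes, and the Flush and
+  // CompactRange below must preserve every version that each recorded
+  // snapshot observed.)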
+ snapshots.push_back(db->GetSnapshot()); + snapshot_data.push_back(current_data); + + ASSERT_EQ(snapshots.size(), snapshot_data.size()); + for (size_t i = 0; i < snapshots.size(); i++) { + VerifyKeys(snapshot_data[i], snapshots[i]); + } + ASSERT_OK(db->Flush(FlushOptions())); + for (size_t i = 0; i < snapshots.size(); i++) { + VerifyKeys(snapshot_data[i], snapshots[i]); + } + // Dummy keys to avoid compaction trivially move files and get around actual + // compaction logic. + ASSERT_OK(db->Put(WriteOptions(), "a", "dummy")); + ASSERT_OK(db->Put(WriteOptions(), "z", "dummy")); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + for (size_t i = 0; i < snapshots.size(); i++) { + VerifyKeys(snapshot_data[i], snapshots[i]); + } + // cleanup + for (size_t i = 0; i < kNumTransactions; i++) { + if (transactions[i] == nullptr) { + continue; + } + ASSERT_OK(transactions[i]->Commit()); + delete transactions[i]; + } + for (size_t i = 0; i < snapshots.size(); i++) { + db->ReleaseSnapshot(snapshots[i]); + } +} + +// Compaction should not apply the optimization to output key with sequence +// number equal to 0 if the key is not visible to earliest snapshot, based on +// commit sequence number. +TEST_P(WritePreparedTransactionTest, + CompactionShouldKeepSequenceForUncommittedKeys) { + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + // Keep track of expected sequence number. + SequenceNumber expected_seq = 0; + auto* transaction = db->BeginTransaction(WriteOptions()); + ASSERT_OK(transaction->SetName("txn")); + ASSERT_OK(transaction->Put("key1", "value1")); + ASSERT_OK(transaction->Prepare()); + ASSERT_EQ(++expected_seq, db->GetLatestSequenceNumber()); + SequenceNumber seq1 = expected_seq; + ASSERT_OK(db->Put(WriteOptions(), "key2", "value2")); + DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB()); + expected_seq++; // one for data + if (options.two_write_queues) { + expected_seq++; // one for commit + } + ASSERT_EQ(expected_seq, db_impl->TEST_GetLastVisibleSequence()); + ASSERT_OK(db->Flush(FlushOptions())); + // Dummy keys to avoid compaction trivially move files and get around actual + // compaction logic. + ASSERT_OK(db->Put(WriteOptions(), "a", "dummy")); + ASSERT_OK(db->Put(WriteOptions(), "z", "dummy")); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + VerifyKeys({ + {"key1", "NOT_FOUND"}, + {"key2", "value2"}, + }); + VerifyInternalKeys({ + // "key1" has not been committed. It keeps its sequence number. + {"key1", "value1", seq1, kTypeValue}, + // "key2" is committed and output with seq = 0. + {"key2", "value2", 0, kTypeValue}, + }); + ASSERT_OK(transaction->Commit()); + VerifyKeys({ + {"key1", "value1"}, + {"key2", "value2"}, + }); + delete transaction; +} + +TEST_P(WritePreparedTransactionTest, CommitAndSnapshotDuringCompaction) { + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + + const Snapshot* snapshot = nullptr; + ASSERT_OK(db->Put(WriteOptions(), "key1", "value1")); + auto* txn = db->BeginTransaction(WriteOptions()); + ASSERT_OK(txn->SetName("txn")); + ASSERT_OK(txn->Put("key1", "value2")); + ASSERT_OK(txn->Prepare()); + + auto callback = [&](void*) { + // Snapshot is taken after compaction start. It should be taken into + // consideration for whether to compact out value1. 
+    snapshot = db->GetSnapshot();
+    ASSERT_OK(txn->Commit());
+    delete txn;
+  };
+  SyncPoint::GetInstance()->SetCallBack("CompactionIterator:AfterInit",
+                                        callback);
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(db->Flush(FlushOptions()));
+  ASSERT_NE(nullptr, snapshot);
+  VerifyKeys({{"key1", "value2"}});
+  VerifyKeys({{"key1", "value1"}}, snapshot);
+  db->ReleaseSnapshot(snapshot);
+}
+
+TEST_P(WritePreparedTransactionTest, Iterate) {
+  auto verify_state = [](Iterator* iter, const std::string& key,
+                         const std::string& value) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_OK(iter->status());
+    ASSERT_EQ(key, iter->key().ToString());
+    ASSERT_EQ(value, iter->value().ToString());
+  };
+
+  auto verify_iter = [&](const std::string& expected_val) {
+    // Get an iterator from a concurrent transaction and make sure it has the
+    // same view as an iterator from the DB.
+    auto* txn = db->BeginTransaction(WriteOptions());
+
+    for (int i = 0; i < 2; i++) {
+      Iterator* iter = (i == 0) ? db->NewIterator(ReadOptions())
+                                : txn->GetIterator(ReadOptions());
+      // Seek
+      iter->Seek("foo");
+      verify_state(iter, "foo", expected_val);
+      // Next
+      iter->Seek("a");
+      verify_state(iter, "a", "va");
+      iter->Next();
+      verify_state(iter, "foo", expected_val);
+      // SeekForPrev
+      iter->SeekForPrev("y");
+      verify_state(iter, "foo", expected_val);
+      // Prev
+      iter->SeekForPrev("z");
+      verify_state(iter, "z", "vz");
+      iter->Prev();
+      verify_state(iter, "foo", expected_val);
+      delete iter;
+    }
+    delete txn;
+  };
+
+  ASSERT_OK(db->Put(WriteOptions(), "foo", "v1"));
+  auto* transaction = db->BeginTransaction(WriteOptions());
+  ASSERT_OK(transaction->SetName("txn"));
+  ASSERT_OK(transaction->Put("foo", "v2"));
+  ASSERT_OK(transaction->Prepare());
+  VerifyKeys({{"foo", "v1"}});
+  // dummy keys
+  ASSERT_OK(db->Put(WriteOptions(), "a", "va"));
+  ASSERT_OK(db->Put(WriteOptions(), "z", "vz"));
+  verify_iter("v1");
+  ASSERT_OK(transaction->Commit());
+  VerifyKeys({{"foo", "v2"}});
+  verify_iter("v2");
+  delete transaction;
+}
+
+TEST_P(WritePreparedTransactionTest, IteratorRefreshNotSupported) {
+  Iterator* iter = db->NewIterator(ReadOptions());
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Refresh().IsNotSupported());
+  delete iter;
+}
+
+// Committing a delayed prepared has two non-atomic steps: update the commit
+// cache, and remove the seq from delayed_prepared_. The read in IsInSnapshot
+// also involves two non-atomic steps of checking these two data structures.
+// This test breaks each step in the middle to ensure correctness in spite of
+// non-atomic execution.
+// Note: This test is limited to the case where the snapshot is larger than
+// max_evicted_seq_.
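+// For reference, the two racing step pairs look schematically like this
+// (illustrative pseudo-code; the real members live in WritePreparedTxnDB):
+//   committer:  commit_cache_.Update(prep_seq, commit_seq);          // C1
+//               delayed_prepared_.erase(prep_seq);                   // C2
+//   reader:     found = commit_cache_.Lookup(prep_seq);              // R1
+//               still_prepared = delayed_prepared_.count(prep_seq);  // R2
+// The sync-point dependencies below pause one side between its two steps
+// while the other side runs to completion.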
+TEST_P(WritePreparedTransactionTest, NonAtomicCommitOfDelayedPrepared) { + const size_t snapshot_cache_bits = 7; // same as default + const size_t commit_cache_bits = 3; // 8 entries + for (auto split_read : {true, false}) { + std::vector<bool> split_options = {false}; + if (split_read) { + // Also test for break before mutex + split_options.push_back(true); + } + for (auto split_before_mutex : split_options) { + UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits); + ASSERT_OK(ReOpen()); + WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db); + DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB()); + // Fill up the commit cache + std::string init_value("value1"); + for (int i = 0; i < 10; i++) { + ASSERT_OK(db->Put(WriteOptions(), Slice("key1"), Slice(init_value))); + } + // Prepare a transaction but do not commit it + Transaction* txn = + db->BeginTransaction(WriteOptions(), TransactionOptions()); + ASSERT_OK(txn->SetName("xid")); + ASSERT_OK(txn->Put(Slice("key1"), Slice("value2"))); + ASSERT_OK(txn->Prepare()); + // Commit a bunch of entries to advance max evicted seq and make the + // prepared a delayed prepared + for (int i = 0; i < 10; i++) { + ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3"))); + } + // The snapshot should not see the delayed prepared entry + auto snap = db->GetSnapshot(); + + if (split_read) { + if (split_before_mutex) { + // split before acquiring prepare_mutex_ + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"WritePreparedTxnDB::IsInSnapshot:prepared_mutex_:pause", + "AtomicCommitOfDelayedPrepared:Commit:before"}, + {"AtomicCommitOfDelayedPrepared:Commit:after", + "WritePreparedTxnDB::IsInSnapshot:prepared_mutex_:resume"}}); + } else { + // split right after reading from the commit cache + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"WritePreparedTxnDB::IsInSnapshot:GetCommitEntry:pause", + "AtomicCommitOfDelayedPrepared:Commit:before"}, + {"AtomicCommitOfDelayedPrepared:Commit:after", + "WritePreparedTxnDB::IsInSnapshot:GetCommitEntry:resume"}}); + } + } else { // split commit + // split right before removing from delayed_prepared_ + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"WritePreparedTxnDB::RemovePrepared:pause", + "AtomicCommitOfDelayedPrepared:Read:before"}, + {"AtomicCommitOfDelayedPrepared:Read:after", + "WritePreparedTxnDB::RemovePrepared:resume"}}); + } + SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::port::Thread commit_thread([&]() { + TEST_SYNC_POINT("AtomicCommitOfDelayedPrepared:Commit:before"); + ASSERT_OK(txn->Commit()); + if (split_before_mutex) { + // Do bunch of inserts to evict the commit entry from the cache. This + // would prevent the 2nd look into commit cache under prepare_mutex_ + // to see the commit entry. 
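+          // (Each Put below can evict an entry from the tiny 8-entry commit
+          // cache and thereby advance max_evicted_seq_; looping until it
+          // passes the last visible seq guarantees this txn's commit entry
+          // has left the cache before the paused reader resumes.)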
+ auto seq = db_impl->TEST_GetLastVisibleSequence(); + size_t tries = 0; + while (wp_db->max_evicted_seq_ < seq && tries < 50) { + ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3"))); + tries++; + }; + ASSERT_LT(tries, 50); + } + TEST_SYNC_POINT("AtomicCommitOfDelayedPrepared:Commit:after"); + delete txn; + }); + + ROCKSDB_NAMESPACE::port::Thread read_thread([&]() { + TEST_SYNC_POINT("AtomicCommitOfDelayedPrepared:Read:before"); + ReadOptions roptions; + roptions.snapshot = snap; + PinnableSlice value; + auto s = db->Get(roptions, db->DefaultColumnFamily(), "key1", &value); + ASSERT_OK(s); + // It should not see the commit of delayed prepared + ASSERT_TRUE(value == init_value); + TEST_SYNC_POINT("AtomicCommitOfDelayedPrepared:Read:after"); + db->ReleaseSnapshot(snap); + }); + + read_thread.join(); + commit_thread.join(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + } // for split_before_mutex + } // for split_read +} + +// When max evicted seq advances a prepared seq, it involves two updates: i) +// adding prepared seq to delayed_prepared_, ii) updating max_evicted_seq_. +// ::IsInSnapshot also reads these two values in a non-atomic way. This test +// ensures correctness if the update occurs after ::IsInSnapshot reads +// delayed_prepared_empty_ and before it reads max_evicted_seq_. +// Note: this test focuses on read snapshot larger than max_evicted_seq_. +TEST_P(WritePreparedTransactionTest, NonAtomicUpdateOfDelayedPrepared) { + const size_t snapshot_cache_bits = 7; // same as default + const size_t commit_cache_bits = 3; // 8 entries + UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits); + ASSERT_OK(ReOpen()); + WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db); + // Fill up the commit cache + std::string init_value("value1"); + for (int i = 0; i < 10; i++) { + ASSERT_OK(db->Put(WriteOptions(), Slice("key1"), Slice(init_value))); + } + // Prepare a transaction but do not commit it + Transaction* txn = db->BeginTransaction(WriteOptions(), TransactionOptions()); + ASSERT_OK(txn->SetName("xid")); + ASSERT_OK(txn->Put(Slice("key1"), Slice("value2"))); + ASSERT_OK(txn->Prepare()); + // Create a gap between prepare seq and snapshot seq + ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3"))); + ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3"))); + // The snapshot should not see the delayed prepared entry + auto snap = db->GetSnapshot(); + ASSERT_LT(txn->GetId(), snap->GetSequenceNumber()); + + // split right after reading delayed_prepared_empty_ + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"WritePreparedTxnDB::IsInSnapshot:delayed_prepared_empty_:pause", + "AtomicUpdateOfDelayedPrepared:before"}, + {"AtomicUpdateOfDelayedPrepared:after", + "WritePreparedTxnDB::IsInSnapshot:delayed_prepared_empty_:resume"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::port::Thread commit_thread([&]() { + TEST_SYNC_POINT("AtomicUpdateOfDelayedPrepared:before"); + // Commit a bunch of entries to advance max evicted seq and make the + // prepared a delayed prepared + size_t tries = 0; + while (wp_db->max_evicted_seq_ < txn->GetId() && tries < 50) { + ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3"))); + tries++; + }; + ASSERT_LT(tries, 50); + // This is the case on which the test focuses + ASSERT_LT(wp_db->max_evicted_seq_, snap->GetSequenceNumber()); + 
TEST_SYNC_POINT("AtomicUpdateOfDelayedPrepared:after"); + }); + + ROCKSDB_NAMESPACE::port::Thread read_thread([&]() { + ReadOptions roptions; + roptions.snapshot = snap; + PinnableSlice value; + auto s = db->Get(roptions, db->DefaultColumnFamily(), "key1", &value); + ASSERT_OK(s); + // It should not see the uncommitted value of delayed prepared + ASSERT_TRUE(value == init_value); + db->ReleaseSnapshot(snap); + }); + + read_thread.join(); + commit_thread.join(); + ASSERT_OK(txn->Commit()); + delete txn; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +// Eviction from commit cache and update of max evicted seq are two non-atomic +// steps. Similarly the read of max_evicted_seq_ in ::IsInSnapshot and reading +// from commit cache are two non-atomic steps. This tests if the update occurs +// after reading max_evicted_seq_ and before reading the commit cache. +// Note: the test focuses on snapshot larger than max_evicted_seq_ +TEST_P(WritePreparedTransactionTest, NonAtomicUpdateOfMaxEvictedSeq) { + const size_t snapshot_cache_bits = 7; // same as default + const size_t commit_cache_bits = 3; // 8 entries + UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits); + ASSERT_OK(ReOpen()); + WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db); + // Fill up the commit cache + std::string init_value("value1"); + std::string last_value("value_final"); + for (int i = 0; i < 10; i++) { + ASSERT_OK(db->Put(WriteOptions(), Slice("key1"), Slice(init_value))); + } + // Do an uncommitted write to prevent min_uncommitted optimization + Transaction* txn1 = + db->BeginTransaction(WriteOptions(), TransactionOptions()); + ASSERT_OK(txn1->SetName("xid1")); + ASSERT_OK(txn1->Put(Slice("key0"), last_value)); + ASSERT_OK(txn1->Prepare()); + // Do a write with prepare to get the prepare seq + Transaction* txn = db->BeginTransaction(WriteOptions(), TransactionOptions()); + ASSERT_OK(txn->SetName("xid")); + ASSERT_OK(txn->Put(Slice("key1"), last_value)); + ASSERT_OK(txn->Prepare()); + ASSERT_OK(txn->Commit()); + // Create a gap between commit entry and snapshot seq + ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3"))); + ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3"))); + // The snapshot should see the last commit + auto snap = db->GetSnapshot(); + ASSERT_LE(txn->GetId(), snap->GetSequenceNumber()); + + // split right after reading max_evicted_seq_ + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"WritePreparedTxnDB::IsInSnapshot:max_evicted_seq_:pause", + "NonAtomicUpdateOfMaxEvictedSeq:before"}, + {"NonAtomicUpdateOfMaxEvictedSeq:after", + "WritePreparedTxnDB::IsInSnapshot:max_evicted_seq_:resume"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::port::Thread commit_thread([&]() { + TEST_SYNC_POINT("NonAtomicUpdateOfMaxEvictedSeq:before"); + // Commit a bunch of entries to advance max evicted seq beyond txn->GetId() + size_t tries = 0; + while (wp_db->max_evicted_seq_ < txn->GetId() && tries < 50) { + ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3"))); + tries++; + }; + ASSERT_LT(tries, 50); + // This is the case on which the test focuses + ASSERT_LT(wp_db->max_evicted_seq_, snap->GetSequenceNumber()); + TEST_SYNC_POINT("NonAtomicUpdateOfMaxEvictedSeq:after"); + }); + + ROCKSDB_NAMESPACE::port::Thread read_thread([&]() { + ReadOptions roptions; + roptions.snapshot = snap; + PinnableSlice value; + auto s = 
        db->Get(roptions, db->DefaultColumnFamily(), "key1", &value);
+    ASSERT_OK(s);
+    // It should see the committed value of the evicted entry
+    ASSERT_TRUE(value == last_value);
+    db->ReleaseSnapshot(snap);
+  });
+
+  read_thread.join();
+  commit_thread.join();
+  delete txn;
+  ASSERT_OK(txn1->Commit());
+  delete txn1;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// Tests adding a prepared seq after max_evicted_seq_ has already advanced
+// beyond it. The test focuses on a race condition between the AddPrepared and
+// AdvanceMaxEvictedSeq functions.
+TEST_P(WritePreparedTransactionTest, AddPreparedBeforeMax) {
+  if (!options.two_write_queues) {
+    // This test is only for two write queues
+    return;
+  }
+  const size_t snapshot_cache_bits = 7;  // same as default
+  // 1 entry to advance max after the 2nd commit
+  const size_t commit_cache_bits = 0;
+  UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+  ASSERT_OK(ReOpen());
+  WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+  std::string some_value("value_some");
+  std::string uncommitted_value("value_uncommitted");
+  // Prepare two uncommitted transactions
+  Transaction* txn1 =
+      db->BeginTransaction(WriteOptions(), TransactionOptions());
+  ASSERT_OK(txn1->SetName("xid1"));
+  ASSERT_OK(txn1->Put(Slice("key1"), some_value));
+  ASSERT_OK(txn1->Prepare());
+  Transaction* txn2 =
+      db->BeginTransaction(WriteOptions(), TransactionOptions());
+  ASSERT_OK(txn2->SetName("xid2"));
+  ASSERT_OK(txn2->Put(Slice("key2"), some_value));
+  ASSERT_OK(txn2->Prepare());
+  // Start the txn here so the other thread can get its id
+  Transaction* txn = db->BeginTransaction(WriteOptions(), TransactionOptions());
+  ASSERT_OK(txn->SetName("xid"));
+  ASSERT_OK(txn->Put(Slice("key0"), uncommitted_value));
+  port::Mutex txn_mutex_;
+
+  // t1) Insert prepared entry, t2) commit other entries to advance max
+  // evicted seq and finish checking the existing prepared entries, t1)
+  // AddPrepared, t2) update max_evicted_seq_
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"AddPreparedCallback::AddPrepared::begin:pause",
+       "AddPreparedBeforeMax::read_thread:start"},
+      {"AdvanceMaxEvictedSeq::update_max:pause",
+       "AddPreparedCallback::AddPrepared::begin:resume"},
+      {"AddPreparedCallback::AddPrepared::end",
+       "AdvanceMaxEvictedSeq::update_max:resume"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ROCKSDB_NAMESPACE::port::Thread write_thread([&]() {
+    txn_mutex_.Lock();
+    ASSERT_OK(txn->Prepare());
+    txn_mutex_.Unlock();
+  });
+
+  ROCKSDB_NAMESPACE::port::Thread read_thread([&]() {
+    TEST_SYNC_POINT("AddPreparedBeforeMax::read_thread:start");
+    // Publish seq number with a commit
+    ASSERT_OK(txn1->Commit());
+    // Since the commit cache size is one, the 2nd commit evicts the 1st one
+    // and invokes AdvanceMaxEvictedSeq
+    ASSERT_OK(txn2->Commit());
+
+    ReadOptions roptions;
+    PinnableSlice value;
+    // The snapshot should not see the uncommitted value from write_thread
+    auto snap = db->GetSnapshot();
+    ASSERT_LT(wp_db->max_evicted_seq_, snap->GetSequenceNumber());
+    // This is the scenario that we test for
+    txn_mutex_.Lock();
+    ASSERT_GT(wp_db->max_evicted_seq_, txn->GetId());
+    txn_mutex_.Unlock();
+    roptions.snapshot = snap;
+    auto s = db->Get(roptions, db->DefaultColumnFamily(), "key0", &value);
+    ASSERT_TRUE(s.IsNotFound());
+    db->ReleaseSnapshot(snap);
+  });
+
+  read_thread.join();
+  write_thread.join();
+  delete txn1;
+  delete txn2;
+  ASSERT_OK(txn->Commit());
+  delete txn;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// When an old prepared entry gets committed, there is a gap between the time
+// that it is published and when it is cleaned up from delayed_prepared_. This
+// test stresses such cases.
+TEST_P(WritePreparedTransactionTest, CommitOfDelayedPrepared) {
+  const size_t snapshot_cache_bits = 7;  // same as default
+  for (const size_t commit_cache_bits : {0, 2, 3}) {
+    for (const size_t sub_batch_cnt : {1, 2, 3}) {
+      UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+      ASSERT_OK(ReOpen());
+      std::atomic<const Snapshot*> snap = {nullptr};
+      std::atomic<SequenceNumber> exp_prepare = {0};
+      ROCKSDB_NAMESPACE::port::Thread callback_thread;
+      // Value is synchronized via snap
+      PinnableSlice value;
+      // Take a snapshot after publish and before RemovePrepared:Start
+      auto snap_callback = [&]() {
+        ASSERT_EQ(nullptr, snap.load());
+        snap.store(db->GetSnapshot());
+        ReadOptions roptions;
+        roptions.snapshot = snap.load();
+        auto s = db->Get(roptions, db->DefaultColumnFamily(), "key2", &value);
+        ASSERT_OK(s);
+      };
+      auto callback = [&](void* param) {
+        SequenceNumber prep_seq = *((SequenceNumber*)param);
+        if (prep_seq == exp_prepare.load()) {  // only for write_thread
+          // We need to spawn a thread to avoid deadlock since getting a
+          // snapshot might end up calling AdvanceSeqByOne, which needs to
+          // join the write queue.
+          callback_thread = ROCKSDB_NAMESPACE::port::Thread(snap_callback);
+          TEST_SYNC_POINT("callback:end");
+        }
+      };
+      // Wait for the first snapshot to be taken in GetSnapshotInternal.
+      // Although it might be updated before GetSnapshotInternal finishes,
+      // this should cover most of the cases.
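+      // (LoadDependency enforces a happens-before edge between two named
+      // sync points: the second marker in each pair blocks until the first
+      // has been reached. SetCallBack, used just below, instead runs code
+      // inline at the named point.)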
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"WritePreparedTxnDB::GetSnapshotInternal:first", "callback:end"}, + }); + SyncPoint::GetInstance()->SetCallBack("RemovePrepared:Start", callback); + SyncPoint::GetInstance()->EnableProcessing(); + // Thread to cause frequent evictions + ROCKSDB_NAMESPACE::port::Thread eviction_thread([&]() { + // Too many txns might cause commit_seq - prepare_seq in another thread + // to go beyond DELTA_UPPERBOUND + for (int i = 0; i < 25 * (1 << commit_cache_bits); i++) { + ASSERT_OK(db->Put(WriteOptions(), Slice("key1"), Slice("value1"))); + } + }); + ROCKSDB_NAMESPACE::port::Thread write_thread([&]() { + for (int i = 0; i < 25 * (1 << commit_cache_bits); i++) { + Transaction* txn = + db->BeginTransaction(WriteOptions(), TransactionOptions()); + ASSERT_OK(txn->SetName("xid")); + std::string val_str = "value" + std::to_string(i); + for (size_t b = 0; b < sub_batch_cnt; b++) { + ASSERT_OK(txn->Put(Slice("key2"), val_str)); + } + ASSERT_OK(txn->Prepare()); + // Let an eviction to kick in + std::this_thread::yield(); + + exp_prepare.store(txn->GetId()); + ASSERT_OK(txn->Commit()); + delete txn; + // Wait for the snapshot taking that is triggered by + // RemovePrepared:Start callback + callback_thread.join(); + + // Read with the snapshot taken before delayed_prepared_ cleanup + ReadOptions roptions; + roptions.snapshot = snap.load(); + ASSERT_NE(nullptr, roptions.snapshot); + PinnableSlice value2; + auto s = + db->Get(roptions, db->DefaultColumnFamily(), "key2", &value2); + ASSERT_OK(s); + // It should see its own write + ASSERT_TRUE(val_str == value2); + // The value read by snapshot should not change + ASSERT_STREQ(value2.ToString().c_str(), value.ToString().c_str()); + + db->ReleaseSnapshot(roptions.snapshot); + snap.store(nullptr); + } + }); + write_thread.join(); + eviction_thread.join(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + } + } +} + +// Test that updating the commit map will not affect the existing snapshots +TEST_P(WritePreparedTransactionTest, AtomicCommit) { + for (bool skip_prepare : {true, false}) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"WritePreparedTxnDB::AddCommitted:start", + "AtomicCommit::GetSnapshot:start"}, + {"AtomicCommit::Get:end", + "WritePreparedTxnDB::AddCommitted:start:pause"}, + {"WritePreparedTxnDB::AddCommitted:end", "AtomicCommit::Get2:start"}, + {"AtomicCommit::Get2:end", + "WritePreparedTxnDB::AddCommitted:end:pause:"}, + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ROCKSDB_NAMESPACE::port::Thread write_thread([&]() { + if (skip_prepare) { + ASSERT_OK(db->Put(WriteOptions(), Slice("key"), Slice("value"))); + } else { + Transaction* txn = + db->BeginTransaction(WriteOptions(), TransactionOptions()); + ASSERT_OK(txn->SetName("xid")); + ASSERT_OK(txn->Put(Slice("key"), Slice("value"))); + ASSERT_OK(txn->Prepare()); + ASSERT_OK(txn->Commit()); + delete txn; + } + }); + ROCKSDB_NAMESPACE::port::Thread read_thread([&]() { + ReadOptions roptions; + TEST_SYNC_POINT("AtomicCommit::GetSnapshot:start"); + roptions.snapshot = db->GetSnapshot(); + PinnableSlice val; + auto s = db->Get(roptions, db->DefaultColumnFamily(), "key", &val); + TEST_SYNC_POINT("AtomicCommit::Get:end"); + TEST_SYNC_POINT("AtomicCommit::Get2:start"); + ASSERT_SAME(roptions, db, s, val, "key"); + TEST_SYNC_POINT("AtomicCommit::Get2:end"); + db->ReleaseSnapshot(roptions.snapshot); + }); + 
  read_thread.join();
+    write_thread.join();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+TEST_P(WritePreparedTransactionTest, BasicRollbackDeletionTypeCb) {
+  options.level0_file_num_compaction_trigger = 2;
+  // Always use SingleDelete to roll back a Put.
+  txn_db_options.rollback_deletion_type_callback =
+      [](TransactionDB*, ColumnFamilyHandle*, const Slice&) { return true; };
+
+  const auto write_to_db = [&]() {
+    assert(db);
+    std::unique_ptr<Transaction> txn0(
+        db->BeginTransaction(WriteOptions(), TransactionOptions()));
+    ASSERT_OK(txn0->SetName("txn0"));
+    ASSERT_OK(txn0->Put("a", "v0"));
+    ASSERT_OK(txn0->Prepare());
+
+    // Generate sst1: [PUT('a')]
+    ASSERT_OK(db->Flush(FlushOptions()));
+
+    {
+      CompactRangeOptions cro;
+      cro.change_level = true;
+      cro.target_level = options.num_levels - 1;
+      cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+      ASSERT_OK(db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr));
+    }
+
+    ASSERT_OK(txn0->Rollback());
+    txn0.reset();
+
+    ASSERT_OK(db->Put(WriteOptions(), "a", "v1"));
+
+    ASSERT_OK(db->SingleDelete(WriteOptions(), "a"));
+    // Generate another SST with a SD to cover the oldest PUT('a')
+    ASSERT_OK(db->Flush(FlushOptions()));
+
+    auto* dbimpl = static_cast_with_check<DBImpl>(db->GetRootDB());
+    assert(dbimpl);
+    ASSERT_OK(dbimpl->TEST_WaitForCompact());
+
+    {
+      CompactRangeOptions cro;
+      cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+      ASSERT_OK(db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr));
+    }
+
+    {
+      std::string value;
+      const Status s = db->Get(ReadOptions(), "a", &value);
+      ASSERT_TRUE(s.IsNotFound());
+    }
+  };
+
+  // Destroy and reopen
+  ASSERT_OK(ReOpen());
+  write_to_db();
+}
+
+// Test that we can change the write policy from WriteCommitted to
+// WritePrepared after a clean shutdown (which would empty the WAL)
+TEST_P(WritePreparedTransactionTest, WP_WC_DBBackwardCompatibility) {
+  bool empty_wal = true;
+  CrossCompatibilityTest(WRITE_COMMITTED, WRITE_PREPARED, empty_wal);
+}
+
+// Test that we fail fast if the WAL is not emptied between changing the write
+// policy from WriteCommitted to WritePrepared
+TEST_P(WritePreparedTransactionTest, WP_WC_WALBackwardIncompatibility) {
+  bool empty_wal = true;
+  CrossCompatibilityTest(WRITE_COMMITTED, WRITE_PREPARED, !empty_wal);
+}
+
+// Test that we can change the write policy from WritePrepared back to
+// WriteCommitted after a clean shutdown (which would empty the WAL)
+TEST_P(WritePreparedTransactionTest, WC_WP_ForwardCompatibility) {
+  bool empty_wal = true;
+  CrossCompatibilityTest(WRITE_PREPARED, WRITE_COMMITTED, empty_wal);
+}
+
+// Test that we fail fast if the WAL is not emptied between changing the write
+// policy from WritePrepared to WriteCommitted
+TEST_P(WritePreparedTransactionTest, WC_WP_WALForwardIncompatibility) {
+  bool empty_wal = true;
+  CrossCompatibilityTest(WRITE_PREPARED, WRITE_COMMITTED, !empty_wal);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  if (getenv("CIRCLECI")) {
+    // Looking for backtrace on "Resource temporarily unavailable" exceptions
+    ::testing::FLAGS_gtest_catch_exceptions = false;
+  }
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED as Transactions are not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_prepared_txn.cc b/src/rocksdb/utilities/transactions/write_prepared_txn.cc
new file mode 100644
index 000000000..16b5cc1cb
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/write_prepared_txn.cc
@@ -0,0 +1,512 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/write_prepared_txn.h"
+
+#include <cinttypes>
+#include <map>
+#include <set>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "util/cast_util.h"
+#include "utilities/transactions/pessimistic_transaction.h"
+#include "utilities/transactions/write_prepared_txn_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct WriteOptions;
+
+WritePreparedTxn::WritePreparedTxn(WritePreparedTxnDB* txn_db,
+                                   const WriteOptions& write_options,
+                                   const TransactionOptions& txn_options)
+    : PessimisticTransaction(txn_db, write_options, txn_options, false),
+      wpt_db_(txn_db) {
+  // Call Initialize outside the PessimisticTransaction constructor; otherwise
+  // it would skip overridden functions in WritePreparedTxn since they are not
+  // defined yet in the constructor of PessimisticTransaction
+  Initialize(txn_options);
+}
+
+void WritePreparedTxn::Initialize(const TransactionOptions& txn_options) {
+  PessimisticTransaction::Initialize(txn_options);
+  prepare_batch_cnt_ = 0;
+}
+
+void WritePreparedTxn::MultiGet(const ReadOptions& options,
+                                ColumnFamilyHandle* column_family,
+                                const size_t num_keys, const Slice* keys,
+                                PinnableSlice* values, Status* statuses,
+                                const bool sorted_input) {
+  SequenceNumber min_uncommitted, snap_seq;
+  const SnapshotBackup backed_by_snapshot =
+      wpt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq);
+  WritePreparedTxnReadCallback callback(wpt_db_, snap_seq, min_uncommitted,
+                                        backed_by_snapshot);
+  write_batch_.MultiGetFromBatchAndDB(db_, options, column_family, num_keys,
+                                      keys, values, statuses, sorted_input,
+                                      &callback);
+  if (UNLIKELY(!callback.valid() ||
+               !wpt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) {
+    wpt_db_->WPRecordTick(TXN_GET_TRY_AGAIN);
+    for (size_t i = 0; i < num_keys; i++) {
+      statuses[i] = Status::TryAgain();
+    }
+  }
+}
+
+Status WritePreparedTxn::Get(const ReadOptions& options,
+                             ColumnFamilyHandle* column_family,
+                             const Slice& key, PinnableSlice* pinnable_val) {
+  SequenceNumber min_uncommitted, snap_seq;
+  const SnapshotBackup backed_by_snapshot =
+      wpt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq);
+  WritePreparedTxnReadCallback callback(wpt_db_, snap_seq, min_uncommitted,
+                                        backed_by_snapshot);
+  Status res = write_batch_.GetFromBatchAndDB(db_, options, column_family, key,
+                                              pinnable_val, &callback);
+  const bool callback_valid =
+      callback.valid();  // NOTE: the validity of the callback must always be
+                         // checked before it is destructed
+  if (res.ok()) {
+    if (!LIKELY(callback_valid &&
+                wpt_db_->ValidateSnapshot(callback.max_visible_seq(),
+                                          backed_by_snapshot))) {
+      wpt_db_->WPRecordTick(TXN_GET_TRY_AGAIN);
+      res = Status::TryAgain();
+    }
+  }
+
+  return res;
+}
+
+Iterator* WritePreparedTxn::GetIterator(const ReadOptions& options) {
+  // Make sure to get the iterator from WritePreparedTxnDB, not the root db.
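+  // (The base iterator must come from WritePreparedTxnDB so reads go through
+  // the write-prepared visibility check; the wrapper below then overlays the
+  // transaction's own pending writes. Caller-side effect, roughly:
+  //   std::unique_ptr<Iterator> it(txn->GetIterator(ReadOptions()));
+  //   it->Seek("foo");  // sees the txn's uncommitted write for "foo" if any,
+  //                     // else the committed value visible to the snapshot
+  // )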
+  Iterator* db_iter = wpt_db_->NewIterator(options);
+  assert(db_iter);
+
+  return write_batch_.NewIteratorWithBase(db_iter);
+}
+
+Iterator* WritePreparedTxn::GetIterator(const ReadOptions& options,
+                                        ColumnFamilyHandle* column_family) {
+  // Make sure to get the iterator from WritePreparedTxnDB, not the root db.
+  Iterator* db_iter = wpt_db_->NewIterator(options, column_family);
+  assert(db_iter);
+
+  return write_batch_.NewIteratorWithBase(column_family, db_iter);
+}
+
+Status WritePreparedTxn::PrepareInternal() {
+  WriteOptions write_options = write_options_;
+  write_options.disableWAL = false;
+  const bool WRITE_AFTER_COMMIT = true;
+  const bool kFirstPrepareBatch = true;
+  auto s = WriteBatchInternal::MarkEndPrepare(GetWriteBatch()->GetWriteBatch(),
+                                              name_, !WRITE_AFTER_COMMIT);
+  assert(s.ok());
+  // For each duplicate key we account for a new sub-batch
+  prepare_batch_cnt_ = GetWriteBatch()->SubBatchCnt();
+  // Having AddPrepared in the PreReleaseCallback allows in-order addition of
+  // prepared entries to PreparedHeap and hence enables an optimization. Refer
+  // to SmallestUnCommittedSeq for more details.
+  AddPreparedCallback add_prepared_callback(
+      wpt_db_, db_impl_, prepare_batch_cnt_,
+      db_impl_->immutable_db_options().two_write_queues, kFirstPrepareBatch);
+  const bool DISABLE_MEMTABLE = true;
+  uint64_t seq_used = kMaxSequenceNumber;
+  s = db_impl_->WriteImpl(write_options, GetWriteBatch()->GetWriteBatch(),
+                          /*callback*/ nullptr, &log_number_, /*log ref*/ 0,
+                          !DISABLE_MEMTABLE, &seq_used, prepare_batch_cnt_,
+                          &add_prepared_callback);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  auto prepare_seq = seq_used;
+  SetId(prepare_seq);
+  return s;
+}
+
+Status WritePreparedTxn::CommitWithoutPrepareInternal() {
+  // For each duplicate key we account for a new sub-batch
+  const size_t batch_cnt = GetWriteBatch()->SubBatchCnt();
+  return CommitBatchInternal(GetWriteBatch()->GetWriteBatch(), batch_cnt);
+}
+
+Status WritePreparedTxn::CommitBatchInternal(WriteBatch* batch,
+                                             size_t batch_cnt) {
+  return wpt_db_->WriteInternal(write_options_, batch, batch_cnt, this);
+}
+
+Status WritePreparedTxn::CommitInternal() {
+  ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log,
+                    "CommitInternal prepare_seq: %" PRIu64, GetID());
+  // We take the commit-time batch and append the Commit marker.
+  // The memtable will ignore the Commit marker in non-recovery mode
+  WriteBatch* working_batch = GetCommitTimeWriteBatch();
+  const bool empty = working_batch->Count() == 0;
+  auto s = WriteBatchInternal::MarkCommit(working_batch, name_);
+  assert(s.ok());
+
+  const bool for_recovery = use_only_the_last_commit_time_batch_for_recovery_;
+  if (!empty) {
+    // When not writing to memtable, we can still cache the latest write batch.
+    // The cached batch will be written to memtable in WriteRecoverableState
+    // during FlushMemTable
+    if (for_recovery) {
+      WriteBatchInternal::SetAsLatestPersistentState(working_batch);
+    } else {
+      return Status::InvalidArgument(
+          "Commit-time-batch can only be used if "
+          "use_only_the_last_commit_time_batch_for_recovery is true");
+    }
+  }
+
+  auto prepare_seq = GetId();
+  const bool includes_data = !empty && !for_recovery;
+  assert(prepare_batch_cnt_);
+  size_t commit_batch_cnt = 0;
+  if (UNLIKELY(includes_data)) {
+    ROCKS_LOG_WARN(db_impl_->immutable_db_options().info_log,
+                   "Duplicate key overhead");
+    SubBatchCounter counter(*wpt_db_->GetCFComparatorMap());
+    s = working_batch->Iterate(&counter);
+    assert(s.ok());
+    commit_batch_cnt = counter.BatchCount();
+  }
+  const bool disable_memtable = !includes_data;
+  const bool do_one_write =
+      !db_impl_->immutable_db_options().two_write_queues || disable_memtable;
+  WritePreparedCommitEntryPreReleaseCallback update_commit_map(
+      wpt_db_, db_impl_, prepare_seq, prepare_batch_cnt_, commit_batch_cnt);
+  // This is to call AddPrepared on the CommitTimeWriteBatch
+  const bool kFirstPrepareBatch = true;
+  AddPreparedCallback add_prepared_callback(
+      wpt_db_, db_impl_, commit_batch_cnt,
+      db_impl_->immutable_db_options().two_write_queues, !kFirstPrepareBatch);
+  PreReleaseCallback* pre_release_callback;
+  if (do_one_write) {
+    pre_release_callback = &update_commit_map;
+  } else {
+    pre_release_callback = &add_prepared_callback;
+  }
+  uint64_t seq_used = kMaxSequenceNumber;
+  // Since the prepared batch is directly written to memtable, there is
+  // already a connection between the memtable and its WAL, so there is no
+  // need to redundantly reference the log that contains the prepared data.
+  const uint64_t zero_log_number = 0ull;
+  size_t batch_cnt = UNLIKELY(commit_batch_cnt) ? commit_batch_cnt : 1;
+  // If `two_write_queues && includes_data`, then `do_one_write` is false. The
+  // following `WriteImpl` will insert the data of the commit-time-batch into
+  // the database before updating the commit cache. Therefore, the data of the
+  // commit-time-batch is considered uncommitted. Furthermore, since data of
+  // the commit-time-batch are not locked, it is possible for two uncommitted
+  // versions of the same key to co-exist for a (short) period of time until
+  // the commit cache is updated by the second write. If the two uncommitted
+  // keys are compacted to the bottommost level in the meantime, it is
+  // possible that the compaction iterator will zero out the sequence numbers
+  // of both, thus violating the invariant that an SST does not have two
+  // identical internal keys. To prevent this situation, we should allow the
+  // usage of commit-time-batch only if the user sets
+  // TransactionOptions::use_only_the_last_commit_time_batch_for_recovery to
+  // true. See the comments about GetCommitTimeWriteBatch() in
+  // include/rocksdb/utilities/transaction.h.
+  s = db_impl_->WriteImpl(write_options_, working_batch, nullptr, nullptr,
+                          zero_log_number, disable_memtable, &seq_used,
+                          batch_cnt, pre_release_callback);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  const SequenceNumber commit_batch_seq = seq_used;
+  if (LIKELY(do_one_write || !s.ok())) {
+    if (UNLIKELY(!db_impl_->immutable_db_options().two_write_queues &&
+                 s.ok())) {
+      // Note: RemovePrepared should be called after the WriteImpl that
+      // published the seq. Otherwise the SmallestUnCommittedSeq optimization
+      // breaks.
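+      // (Roughly: SmallestUnCommittedSeq is derived from the prepared-txn
+      // structures, and snapshots use it as min_uncommitted. If this entry
+      // were removed before the commit became visible, a concurrent snapshot
+      // could compute a min_uncommitted above prepare_seq and skip the
+      // uncommitted check for data that is not yet published.)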
+      wpt_db_->RemovePrepared(prepare_seq, prepare_batch_cnt_);
+    }  // else RemovePrepared is called from within PreReleaseCallback
+    if (UNLIKELY(!do_one_write)) {
+      assert(!s.ok());
+      // Clean up the prepared entry we added with add_prepared_callback
+      wpt_db_->RemovePrepared(commit_batch_seq, commit_batch_cnt);
+    }
+    return s;
+  }  // else do the 2nd write to publish seq
+  // Note: the 2nd write comes with a performance penalty. So if we have too
+  // many commits accompanied by a CommitTimeWriteBatch and yet cannot enable
+  // the use_only_the_last_commit_time_batch_for_recovery_ optimization,
+  // two_write_queues should be disabled to avoid many additional writes here.
+  const size_t kZeroData = 0;
+  // Update the commit map only from the 2nd queue
+  WritePreparedCommitEntryPreReleaseCallback update_commit_map_with_aux_batch(
+      wpt_db_, db_impl_, prepare_seq, prepare_batch_cnt_, kZeroData,
+      commit_batch_seq, commit_batch_cnt);
+  WriteBatch empty_batch;
+  s = empty_batch.PutLogData(Slice());
+  assert(s.ok());
+  // In the absence of Prepare markers, use Noop as a batch separator
+  s = WriteBatchInternal::InsertNoop(&empty_batch);
+  assert(s.ok());
+  const bool DISABLE_MEMTABLE = true;
+  const size_t ONE_BATCH = 1;
+  const uint64_t NO_REF_LOG = 0;
+  s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr,
+                          NO_REF_LOG, DISABLE_MEMTABLE, &seq_used, ONE_BATCH,
+                          &update_commit_map_with_aux_batch);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  return s;
+}
+
+Status WritePreparedTxn::RollbackInternal() {
+  ROCKS_LOG_WARN(db_impl_->immutable_db_options().info_log,
+                 "RollbackInternal prepare_seq: %" PRIu64, GetId());
+
+  assert(db_impl_);
+  assert(wpt_db_);
+
+  WriteBatch rollback_batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+                            write_options_.protection_bytes_per_key,
+                            0 /* default_cf_ts_sz */);
+  assert(GetId() != kMaxSequenceNumber);
+  assert(GetId() > 0);
+  auto cf_map_shared_ptr = wpt_db_->GetCFHandleMap();
+  auto cf_comp_map_shared_ptr = wpt_db_->GetCFComparatorMap();
+  auto read_at_seq = kMaxSequenceNumber;
+  ReadOptions roptions;
+  // to prevent the callback's seq from being overridden inside DBImpl::Get
+  roptions.snapshot = wpt_db_->GetMaxSnapshot();
+  struct RollbackWriteBatchBuilder : public WriteBatch::Handler {
+    DBImpl* const db_;
+    WritePreparedTxnDB* const wpt_db_;
+    WritePreparedTxnReadCallback callback_;
+    WriteBatch* rollback_batch_;
+    std::map<uint32_t, const Comparator*>& comparators_;
+    std::map<uint32_t, ColumnFamilyHandle*>& handles_;
+    using CFKeys = std::set<Slice, SetComparator>;
+    std::map<uint32_t, CFKeys> keys_;
+    bool rollback_merge_operands_;
+    ReadOptions roptions_;
+
+    RollbackWriteBatchBuilder(
+        DBImpl* db, WritePreparedTxnDB* wpt_db, SequenceNumber snap_seq,
+        WriteBatch* dst_batch,
+        std::map<uint32_t, const Comparator*>& comparators,
+        std::map<uint32_t, ColumnFamilyHandle*>& handles,
+        bool rollback_merge_operands, const ReadOptions& _roptions)
+        : db_(db),
+          wpt_db_(wpt_db),
+          callback_(wpt_db, snap_seq),  // disable min_uncommitted optimization
+          rollback_batch_(dst_batch),
+          comparators_(comparators),
+          handles_(handles),
+          rollback_merge_operands_(rollback_merge_operands),
+          roptions_(_roptions) {}
+
+    Status Rollback(uint32_t cf, const Slice& key) {
+      Status s;
+      CFKeys& cf_keys = keys_[cf];
+      if (cf_keys.size() == 0) {  // just inserted
+        auto cmp = comparators_[cf];
+        keys_[cf] = CFKeys(SetComparator(cmp));
+      }
+      auto it = cf_keys.insert(key);
+      // it.second is false if the element already existed.
+      if (it.second == false) {
+        return s;
+      }
+
+      PinnableSlice pinnable_val;
+      bool not_used;
+      auto cf_handle = handles_[cf];
+      DBImpl::GetImplOptions get_impl_options;
+      get_impl_options.column_family = cf_handle;
+      get_impl_options.value = &pinnable_val;
+      get_impl_options.value_found = &not_used;
+      get_impl_options.callback = &callback_;
+      s = db_->GetImpl(roptions_, key, get_impl_options);
+      assert(s.ok() || s.IsNotFound());
+      if (s.ok()) {
+        s = rollback_batch_->Put(cf_handle, key, pinnable_val);
+        assert(s.ok());
+      } else if (s.IsNotFound()) {
+        // There has been no readable value before txn. By adding a delete we
+        // make sure that there will be none afterwards either.
+        if (wpt_db_->ShouldRollbackWithSingleDelete(cf_handle, key)) {
+          s = rollback_batch_->SingleDelete(cf_handle, key);
+        } else {
+          s = rollback_batch_->Delete(cf_handle, key);
+        }
+        assert(s.ok());
+      } else {
+        // Unexpected status. Return it to the user.
+      }
+      return s;
+    }
+
+    Status PutCF(uint32_t cf, const Slice& key,
+                 const Slice& /*val*/) override {
+      return Rollback(cf, key);
+    }
+
+    Status DeleteCF(uint32_t cf, const Slice& key) override {
+      return Rollback(cf, key);
+    }
+
+    Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+      return Rollback(cf, key);
+    }
+
+    Status MergeCF(uint32_t cf, const Slice& key,
+                   const Slice& /*val*/) override {
+      if (rollback_merge_operands_) {
+        return Rollback(cf, key);
+      } else {
+        return Status::OK();
+      }
+    }
+
+    Status MarkNoop(bool) override { return Status::OK(); }
+    Status MarkBeginPrepare(bool) override { return Status::OK(); }
+    Status MarkEndPrepare(const Slice&) override { return Status::OK(); }
+    Status MarkCommit(const Slice&) override { return Status::OK(); }
+    Status MarkRollback(const Slice&) override {
+      return Status::InvalidArgument();
+    }
+
+   protected:
+    Handler::OptionState WriteAfterCommit() const override {
+      return Handler::OptionState::kDisabled;
+    }
+  } rollback_handler(db_impl_, wpt_db_, read_at_seq, &rollback_batch,
+                     *cf_comp_map_shared_ptr.get(), *cf_map_shared_ptr.get(),
+                     wpt_db_->txn_db_options_.rollback_merge_operands,
+                     roptions);
+  auto s = GetWriteBatch()->GetWriteBatch()->Iterate(&rollback_handler);
+  if (!s.ok()) {
+    return s;
+  }
+  // The Rollback marker will be used as a batch separator
+  s = WriteBatchInternal::MarkRollback(&rollback_batch, name_);
+  assert(s.ok());
+  bool do_one_write = !db_impl_->immutable_db_options().two_write_queues;
+  const bool DISABLE_MEMTABLE = true;
+  const uint64_t NO_REF_LOG = 0;
+  uint64_t seq_used = kMaxSequenceNumber;
+  const size_t ONE_BATCH = 1;
+  const bool kFirstPrepareBatch = true;
+  // We commit the rolled-back prepared batches. Although this is
+  // counter-intuitive, i) it is safe to do so, since the prepared batches are
+  // already canceled out by the rollback batch, and ii) adding the commit
+  // entry to the CommitCache lets us benefit from the existing mechanism in
+  // CommitCache that keeps an entry, evicted due to max advance yet
+  // overlapping with a live snapshot, around so that the live snapshot
+  // properly skips the entry even if its prepare seq is lower than
+  // max_evicted_seq_.
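+  // (Illustrative timeline with made-up sequence numbers: prepare at seq 10,
+  // rollback batch written at seq 20, then the pair (prepare=10, commit=20)
+  // is added to the commit cache. A live snapshot at seq 15 keeps treating
+  // seq 10 as invisible even after max_evicted_seq_ advances past 10.)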
+ AddPreparedCallback add_prepared_callback( + wpt_db_, db_impl_, ONE_BATCH, + db_impl_->immutable_db_options().two_write_queues, !kFirstPrepareBatch); + WritePreparedCommitEntryPreReleaseCallback update_commit_map( + wpt_db_, db_impl_, GetId(), prepare_batch_cnt_, ONE_BATCH); + PreReleaseCallback* pre_release_callback; + if (do_one_write) { + pre_release_callback = &update_commit_map; + } else { + pre_release_callback = &add_prepared_callback; + } + // Note: the rollback batch does not need AddPrepared since it is written to + // DB in one shot. min_uncommitted still works since it requires capturing + // data that is written to DB but not yet committed, while + // the rollback batch commits with PreReleaseCallback. + s = db_impl_->WriteImpl(write_options_, &rollback_batch, nullptr, nullptr, + NO_REF_LOG, !DISABLE_MEMTABLE, &seq_used, ONE_BATCH, + pre_release_callback); + assert(!s.ok() || seq_used != kMaxSequenceNumber); + if (!s.ok()) { + return s; + } + if (do_one_write) { + assert(!db_impl_->immutable_db_options().two_write_queues); + wpt_db_->RemovePrepared(GetId(), prepare_batch_cnt_); + return s; + } // else do the 2nd write for commit + uint64_t rollback_seq = seq_used; + ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log, + "RollbackInternal 2nd write rollback_seq: %" PRIu64, + rollback_seq); + // Commit the batch by writing an empty batch to the queue that will release + // the commit sequence number to readers. + WritePreparedRollbackPreReleaseCallback update_commit_map_with_prepare( + wpt_db_, db_impl_, GetId(), rollback_seq, prepare_batch_cnt_); + WriteBatch empty_batch; + s = empty_batch.PutLogData(Slice()); + assert(s.ok()); + // In the absence of Prepare markers, use Noop as a batch separator + s = WriteBatchInternal::InsertNoop(&empty_batch); + assert(s.ok()); + s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr, + NO_REF_LOG, DISABLE_MEMTABLE, &seq_used, ONE_BATCH, + &update_commit_map_with_prepare); + assert(!s.ok() || seq_used != kMaxSequenceNumber); + ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log, + "RollbackInternal (status=%s) commit: %" PRIu64, + s.ToString().c_str(), GetId()); + // TODO(lth): For WriteUnPrepared, where rollback is called frequently, + // RemovePrepared could be moved to the callback to reduce lock contention. + if (s.ok()) { + wpt_db_->RemovePrepared(GetId(), prepare_batch_cnt_); + } + // Note: RemovePrepared for prepared batch is called from within + // PreReleaseCallback + wpt_db_->RemovePrepared(rollback_seq, ONE_BATCH); + + return s; +} + +Status WritePreparedTxn::ValidateSnapshot(ColumnFamilyHandle* column_family, + const Slice& key, + SequenceNumber* tracked_at_seq) { + assert(snapshot_); + + SequenceNumber min_uncommitted = + static_cast_with_check<const SnapshotImpl>(snapshot_.get()) + ->min_uncommitted_; + SequenceNumber snap_seq = snapshot_->GetSequenceNumber(); + // tracked_at_seq is either max or the last snapshot with which this key was + // tracked so there is no need to apply the IsInSnapshot to this comparison + // here as tracked_at_seq is not a prepare seq. + if (*tracked_at_seq <= snap_seq) { + // If the key has been previously validated at a sequence number earlier + // than the current snapshot's sequence number, we already know it has not + // been modified. + return Status::OK(); + } + + *tracked_at_seq = snap_seq; + + ColumnFamilyHandle* cfh = + column_family ?
column_family : db_impl_->DefaultColumnFamily(); + + WritePreparedTxnReadCallback snap_checker(wpt_db_, snap_seq, min_uncommitted, + kBackedByDBSnapshot); + // TODO(yanqin): support user-defined timestamp + return TransactionUtil::CheckKeyForConflicts( + db_impl_, cfh, key.ToString(), snap_seq, /*ts=*/nullptr, + false /* cache_only */, &snap_checker, min_uncommitted); +} + +void WritePreparedTxn::SetSnapshot() { + const bool kForWWConflictCheck = true; + SnapshotImpl* snapshot = wpt_db_->GetSnapshotInternal(kForWWConflictCheck); + SetSnapshotInternal(snapshot); +} + +Status WritePreparedTxn::RebuildFromWriteBatch(WriteBatch* src_batch) { + auto ret = PessimisticTransaction::RebuildFromWriteBatch(src_batch); + prepare_batch_cnt_ = GetWriteBatch()->SubBatchCnt(); + return ret; +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/write_prepared_txn.h b/src/rocksdb/utilities/transactions/write_prepared_txn.h new file mode 100644 index 000000000..30d9bdb99 --- /dev/null +++ b/src/rocksdb/utilities/transactions/write_prepared_txn.h @@ -0,0 +1,119 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include <algorithm> +#include <atomic> +#include <mutex> +#include <stack> +#include <string> +#include <unordered_map> +#include <vector> + +#include "db/write_callback.h" +#include "rocksdb/db.h" +#include "rocksdb/slice.h" +#include "rocksdb/snapshot.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/transaction_db.h" +#include "rocksdb/utilities/write_batch_with_index.h" +#include "util/autovector.h" +#include "utilities/transactions/pessimistic_transaction.h" +#include "utilities/transactions/pessimistic_transaction_db.h" +#include "utilities/transactions/transaction_base.h" +#include "utilities/transactions/transaction_util.h" + +namespace ROCKSDB_NAMESPACE { + +class WritePreparedTxnDB; + +// This impl can also write uncommitted data to the DB and then later tell +// apart committed data from uncommitted data. Uncommitted data could be after +// the Prepare phase in 2PC (WritePreparedTxn) or before that +// (WriteUnpreparedTxnImpl). +class WritePreparedTxn : public PessimisticTransaction { + public: + WritePreparedTxn(WritePreparedTxnDB* db, const WriteOptions& write_options, + const TransactionOptions& txn_options); + // No copying allowed + WritePreparedTxn(const WritePreparedTxn&) = delete; + void operator=(const WritePreparedTxn&) = delete; + + virtual ~WritePreparedTxn() {} + + // To make WAL commit markers visible, the snapshot will be based on the last + // seq in the WAL that is also published, LastPublishedSequence, as opposed to + // the last seq in the memtable. + using Transaction::Get; + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) override; + + using Transaction::MultiGet; + virtual void MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool sorted_input = false) override; + + // Note: The behavior is undefined in the presence of interleaved writes to + // the same transaction.
+ // To make WAL commit markers visible, the snapshot will be + // based on the last seq in the WAL that is also published, + // LastPublishedSequence, as opposed to the last seq in the memtable. + using Transaction::GetIterator; + virtual Iterator* GetIterator(const ReadOptions& options) override; + virtual Iterator* GetIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family) override; + + virtual void SetSnapshot() override; + + protected: + void Initialize(const TransactionOptions& txn_options) override; + // Override the protected SetId to make it visible to the friend class + // WritePreparedTxnDB + inline void SetId(uint64_t id) override { Transaction::SetId(id); } + + private: + friend class WritePreparedTransactionTest_BasicRecoveryTest_Test; + friend class WritePreparedTxnDB; + friend class WriteUnpreparedTxnDB; + friend class WriteUnpreparedTxn; + + Status PrepareInternal() override; + + Status CommitWithoutPrepareInternal() override; + + Status CommitBatchInternal(WriteBatch* batch, size_t batch_cnt) override; + + // Since the data is already written to memtables at the Prepare phase, the + // commit entails writing only a commit marker in the WAL. The sequence number + // of the commit marker is then the commit timestamp of the transaction. To + // make WAL commit markers visible, the snapshot will be based on the last seq + // in the WAL that is also published, LastPublishedSequence, as opposed to the + // last seq in the memtable. + Status CommitInternal() override; + + Status RollbackInternal() override; + + virtual Status ValidateSnapshot(ColumnFamilyHandle* column_family, + const Slice& key, + SequenceNumber* tracked_at_seq) override; + + virtual Status RebuildFromWriteBatch(WriteBatch* src_batch) override; + + WritePreparedTxnDB* wpt_db_; + // Number of sub-batches in prepare + size_t prepare_batch_cnt_ = 0; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/write_prepared_txn_db.cc b/src/rocksdb/utilities/transactions/write_prepared_txn_db.cc new file mode 100644 index 000000000..595c3df8f --- /dev/null +++ b/src/rocksdb/utilities/transactions/write_prepared_txn_db.cc @@ -0,0 +1,1030 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "utilities/transactions/write_prepared_txn_db.h" + +#include <algorithm> +#include <cinttypes> +#include <string> +#include <unordered_set> +#include <vector> + +#include "db/arena_wrapped_db_iter.h" +#include "db/db_impl/db_impl.h" +#include "logging/logging.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/utilities/transaction_db.h" +#include "test_util/sync_point.h" +#include "util/cast_util.h" +#include "util/mutexlock.h" +#include "util/string_util.h" +#include "utilities/transactions/pessimistic_transaction.h" +#include "utilities/transactions/transaction_db_mutex_impl.h" + +// This function is for testing only. If it returns true, then all entries in +// the commit cache will be evicted. Unit and/or stress tests (db_stress) +// can implement this function and customize how frequently commit cache +// eviction occurs. +// TODO: remove this function once we can configure commit cache to be very +// small so that eviction occurs very frequently. 
This requires the commit +// cache entry to be able to encode prepare and commit sequence numbers so that +// the commit sequence number does not have to be within a certain range of +// the prepare sequence number. +extern "C" bool rocksdb_write_prepared_TEST_ShouldClearCommitCache(void) + __attribute__((__weak__)); + +namespace ROCKSDB_NAMESPACE { + +Status WritePreparedTxnDB::Initialize( + const std::vector<size_t>& compaction_enabled_cf_indices, + const std::vector<ColumnFamilyHandle*>& handles) { + auto dbimpl = static_cast_with_check<DBImpl>(GetRootDB()); + assert(dbimpl != nullptr); + auto rtxns = dbimpl->recovered_transactions(); + std::map<SequenceNumber, SequenceNumber> ordered_seq_cnt; + for (auto rtxn : rtxns) { + // There should be only one batch for WritePrepared policy. + assert(rtxn.second->batches_.size() == 1); + const auto& seq = rtxn.second->batches_.begin()->first; + const auto& batch_info = rtxn.second->batches_.begin()->second; + auto cnt = batch_info.batch_cnt_ ? batch_info.batch_cnt_ : 1; + ordered_seq_cnt[seq] = cnt; + } + // AddPrepared must be called in order + for (auto seq_cnt : ordered_seq_cnt) { + auto seq = seq_cnt.first; + auto cnt = seq_cnt.second; + for (size_t i = 0; i < cnt; i++) { + AddPrepared(seq + i); + } + } + SequenceNumber prev_max = max_evicted_seq_; + SequenceNumber last_seq = db_impl_->GetLatestSequenceNumber(); + AdvanceMaxEvictedSeq(prev_max, last_seq); + // Create a gap between max and the next snapshot. This simplifies the logic + // in IsInSnapshot by not having to consider the special case of max == + // snapshot after recovery. This is tested in IsInSnapshotEmptyMapTest. + if (last_seq) { + db_impl_->versions_->SetLastAllocatedSequence(last_seq + 1); + db_impl_->versions_->SetLastSequence(last_seq + 1); + db_impl_->versions_->SetLastPublishedSequence(last_seq + 1); + } + + db_impl_->SetSnapshotChecker(new WritePreparedSnapshotChecker(this)); + // A callback to commit a single sub-batch + class CommitSubBatchPreReleaseCallback : public PreReleaseCallback { + public: + explicit CommitSubBatchPreReleaseCallback(WritePreparedTxnDB* db) + : db_(db) {} + Status Callback(SequenceNumber commit_seq, + bool is_mem_disabled __attribute__((__unused__)), uint64_t, + size_t /*index*/, size_t /*total*/) override { + assert(!is_mem_disabled); + db_->AddCommitted(commit_seq, commit_seq); + return Status::OK(); + } + + private: + WritePreparedTxnDB* db_; + }; + db_impl_->SetRecoverableStatePreReleaseCallback( + new CommitSubBatchPreReleaseCallback(this)); + + auto s = PessimisticTransactionDB::Initialize(compaction_enabled_cf_indices, + handles); + return s; +} + +Status WritePreparedTxnDB::VerifyCFOptions( + const ColumnFamilyOptions& cf_options) { + Status s = PessimisticTransactionDB::VerifyCFOptions(cf_options); + if (!s.ok()) { + return s; + } + if (!cf_options.memtable_factory->CanHandleDuplicatedKey()) { + return Status::InvalidArgument( + "memtable_factory->CanHandleDuplicatedKey() cannot be false with " + "WritePrepared transactions"); + } + return Status::OK(); +} + +Transaction* WritePreparedTxnDB::BeginTransaction( + const WriteOptions& write_options, const TransactionOptions& txn_options, + Transaction* old_txn) { + if (old_txn != nullptr) { + ReinitializeTransaction(old_txn, write_options, txn_options); + return old_txn; + } else { + return new WritePreparedTxn(this, write_options, txn_options); + } +} + +Status WritePreparedTxnDB::Write(const WriteOptions& opts, + WriteBatch* updates) { + if (txn_db_options_.skip_concurrency_control) { + // Skip
locking the rows + const size_t UNKNOWN_BATCH_CNT = 0; + WritePreparedTxn* NO_TXN = nullptr; + return WriteInternal(opts, updates, UNKNOWN_BATCH_CNT, NO_TXN); + } else { + return PessimisticTransactionDB::WriteWithConcurrencyControl(opts, updates); + } +} + +Status WritePreparedTxnDB::Write( + const WriteOptions& opts, + const TransactionDBWriteOptimizations& optimizations, WriteBatch* updates) { + if (optimizations.skip_concurrency_control) { + // Skip locking the rows + const size_t UNKNOWN_BATCH_CNT = 0; + const size_t ONE_BATCH_CNT = 1; + const size_t batch_cnt = optimizations.skip_duplicate_key_check + ? ONE_BATCH_CNT + : UNKNOWN_BATCH_CNT; + WritePreparedTxn* NO_TXN = nullptr; + return WriteInternal(opts, updates, batch_cnt, NO_TXN); + } else { + // TODO(myabandeh): Make use of skip_duplicate_key_check hint + // Fall back to unoptimized version + return PessimisticTransactionDB::WriteWithConcurrencyControl(opts, updates); + } +} + +Status WritePreparedTxnDB::WriteInternal(const WriteOptions& write_options_orig, + WriteBatch* batch, size_t batch_cnt, + WritePreparedTxn* txn) { + ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log, + "CommitBatchInternal"); + if (batch->Count() == 0) { + // Otherwise our 1 seq per batch logic will break since there is no seq + // increased for this batch. + return Status::OK(); + } + + if (write_options_orig.protection_bytes_per_key > 0) { + auto s = WriteBatchInternal::UpdateProtectionInfo( + batch, write_options_orig.protection_bytes_per_key); + if (!s.ok()) { + return s; + } + } + + if (batch_cnt == 0) { // not provided, then compute it + // TODO(myabandeh): add an option to allow user skipping this cost + SubBatchCounter counter(*GetCFComparatorMap()); + auto s = batch->Iterate(&counter); + if (!s.ok()) { + return s; + } + batch_cnt = counter.BatchCount(); + WPRecordTick(TXN_DUPLICATE_KEY_OVERHEAD); + ROCKS_LOG_DETAILS(info_log_, "Duplicate key overhead: %" PRIu64 " batches", + static_cast<uint64_t>(batch_cnt)); + } + assert(batch_cnt); + + bool do_one_write = !db_impl_->immutable_db_options().two_write_queues; + WriteOptions write_options(write_options_orig); + // In the absence of Prepare markers, use Noop as a batch separator + auto s = WriteBatchInternal::InsertNoop(batch); + assert(s.ok()); + const bool DISABLE_MEMTABLE = true; + const uint64_t no_log_ref = 0; + uint64_t seq_used = kMaxSequenceNumber; + const size_t ZERO_PREPARES = 0; + const bool kSeperatePrepareCommitBatches = true; + // Since this is not 2pc, there is no need for AddPrepared but having it in + // the PreReleaseCallback enables an optimization. Refer to + // SmallestUnCommittedSeq for more details. 
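+ // A sketch of the two flows with hypothetical sequence numbers: with + // two_write_queues off, a batch of 3 sub-batches written at seqs 10..12 is + // marked committed by update_commit_map within the same write. With + // two_write_queues on, the 1st write only registers 10..12 as prepared via + // add_prepared_callback, and the 2nd, memtable-less write below publishes + // the commit at, say, seq 13.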
+ AddPreparedCallback add_prepared_callback( + this, db_impl_, batch_cnt, + db_impl_->immutable_db_options().two_write_queues, + !kSeperatePrepareCommitBatches); + WritePreparedCommitEntryPreReleaseCallback update_commit_map( + this, db_impl_, kMaxSequenceNumber, ZERO_PREPARES, batch_cnt); + PreReleaseCallback* pre_release_callback; + if (do_one_write) { + pre_release_callback = &update_commit_map; + } else { + pre_release_callback = &add_prepared_callback; + } + s = db_impl_->WriteImpl(write_options, batch, nullptr, nullptr, no_log_ref, + !DISABLE_MEMTABLE, &seq_used, batch_cnt, + pre_release_callback); + assert(!s.ok() || seq_used != kMaxSequenceNumber); + uint64_t prepare_seq = seq_used; + if (txn != nullptr) { + txn->SetId(prepare_seq); + } + if (!s.ok()) { + return s; + } + if (do_one_write) { + return s; + } // else do the 2nd write for commit + ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log, + "CommitBatchInternal 2nd write prepare_seq: %" PRIu64, + prepare_seq); + // Commit the batch by writing an empty batch to the 2nd queue that will + // release the commit sequence number to readers. + const size_t ZERO_COMMITS = 0; + WritePreparedCommitEntryPreReleaseCallback update_commit_map_with_prepare( + this, db_impl_, prepare_seq, batch_cnt, ZERO_COMMITS); + WriteBatch empty_batch; + write_options.disableWAL = true; + write_options.sync = false; + const size_t ONE_BATCH = 1; // Just to inc the seq + s = db_impl_->WriteImpl(write_options, &empty_batch, nullptr, nullptr, + no_log_ref, DISABLE_MEMTABLE, &seq_used, ONE_BATCH, + &update_commit_map_with_prepare); + assert(!s.ok() || seq_used != kMaxSequenceNumber); + // Note: RemovePrepared is called from within PreReleaseCallback + return s; +} + +Status WritePreparedTxnDB::Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value) { + SequenceNumber min_uncommitted, snap_seq; + const SnapshotBackup backed_by_snapshot = + AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); + WritePreparedTxnReadCallback callback(this, snap_seq, min_uncommitted, + backed_by_snapshot); + bool* dont_care = nullptr; + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = column_family; + get_impl_options.value = value; + get_impl_options.value_found = dont_care; + get_impl_options.callback = &callback; + auto res = db_impl_->GetImpl(options, key, get_impl_options); + if (LIKELY(callback.valid() && ValidateSnapshot(callback.max_visible_seq(), + backed_by_snapshot))) { + return res; + } else { + res.PermitUncheckedError(); + WPRecordTick(TXN_GET_TRY_AGAIN); + return Status::TryAgain(); + } +} + +void WritePreparedTxnDB::UpdateCFComparatorMap( + const std::vector<ColumnFamilyHandle*>& handles) { + auto cf_map = new std::map<uint32_t, const Comparator*>(); + auto handle_map = new std::map<uint32_t, ColumnFamilyHandle*>(); + for (auto h : handles) { + auto id = h->GetID(); + const Comparator* comparator = h->GetComparator(); + (*cf_map)[id] = comparator; + if (id != 0) { + (*handle_map)[id] = h; + } else { + // The pointer to the default cf handle in the handles will be deleted. + // Use the pointer maintained by the db instead. 
+ (*handle_map)[id] = DefaultColumnFamily(); + } + } + cf_map_.reset(cf_map); + handle_map_.reset(handle_map); +} + +void WritePreparedTxnDB::UpdateCFComparatorMap(ColumnFamilyHandle* h) { + auto old_cf_map_ptr = cf_map_.get(); + assert(old_cf_map_ptr); + auto cf_map = new std::map<uint32_t, const Comparator*>(*old_cf_map_ptr); + auto old_handle_map_ptr = handle_map_.get(); + assert(old_handle_map_ptr); + auto handle_map = + new std::map<uint32_t, ColumnFamilyHandle*>(*old_handle_map_ptr); + auto id = h->GetID(); + const Comparator* comparator = h->GetComparator(); + (*cf_map)[id] = comparator; + (*handle_map)[id] = h; + cf_map_.reset(cf_map); + handle_map_.reset(handle_map); +} + +std::vector<Status> WritePreparedTxnDB::MultiGet( + const ReadOptions& options, + const std::vector<ColumnFamilyHandle*>& column_family, + const std::vector<Slice>& keys, std::vector<std::string>* values) { + assert(values); + size_t num_keys = keys.size(); + values->resize(num_keys); + + std::vector<Status> stat_list(num_keys); + for (size_t i = 0; i < num_keys; ++i) { + stat_list[i] = this->Get(options, column_family[i], keys[i], &(*values)[i]); + } + return stat_list; +} + +// Struct to hold ownership of snapshot and read callback for iterator cleanup. +struct WritePreparedTxnDB::IteratorState { + IteratorState(WritePreparedTxnDB* txn_db, SequenceNumber sequence, + std::shared_ptr<ManagedSnapshot> s, + SequenceNumber min_uncommitted) + : callback(txn_db, sequence, min_uncommitted, kBackedByDBSnapshot), + snapshot(s) {} + + WritePreparedTxnReadCallback callback; + std::shared_ptr<ManagedSnapshot> snapshot; +}; + +namespace { +static void CleanupWritePreparedTxnDBIterator(void* arg1, void* /*arg2*/) { + delete reinterpret_cast<WritePreparedTxnDB::IteratorState*>(arg1); +} +} // anonymous namespace + +Iterator* WritePreparedTxnDB::NewIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family) { + constexpr bool expose_blob_index = false; + constexpr bool allow_refresh = false; + std::shared_ptr<ManagedSnapshot> own_snapshot = nullptr; + SequenceNumber snapshot_seq = kMaxSequenceNumber; + SequenceNumber min_uncommitted = 0; + if (options.snapshot != nullptr) { + snapshot_seq = options.snapshot->GetSequenceNumber(); + min_uncommitted = + static_cast_with_check<const SnapshotImpl>(options.snapshot) + ->min_uncommitted_; + } else { + auto* snapshot = GetSnapshot(); + // We take a snapshot to make sure that the related data in the commit map + // are not deleted. 
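+ // Usage sketch (illustrative, error handling omitted): a caller that pins + // its own snapshot takes the branch above instead, e.g. + // ReadOptions ro; + // ro.snapshot = txn_db->GetSnapshot(); + // Iterator* it = txn_db->NewIterator(ro, cf_handle); + // ...iterate... + // delete it; + // txn_db->ReleaseSnapshot(ro.snapshot); + // where txn_db and cf_handle are caller-provided; otherwise the iterator + // owns a snapshot through own_snapshot and releases it during cleanup.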
+ snapshot_seq = snapshot->GetSequenceNumber(); + min_uncommitted = + static_cast_with_check<const SnapshotImpl>(snapshot)->min_uncommitted_; + own_snapshot = std::make_shared<ManagedSnapshot>(db_impl_, snapshot); + } + assert(snapshot_seq != kMaxSequenceNumber); + auto* cfd = + static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd(); + auto* state = + new IteratorState(this, snapshot_seq, own_snapshot, min_uncommitted); + auto* db_iter = + db_impl_->NewIteratorImpl(options, cfd, snapshot_seq, &state->callback, + expose_blob_index, allow_refresh); + db_iter->RegisterCleanup(CleanupWritePreparedTxnDBIterator, state, nullptr); + return db_iter; +} + +Status WritePreparedTxnDB::NewIterators( + const ReadOptions& options, + const std::vector<ColumnFamilyHandle*>& column_families, + std::vector<Iterator*>* iterators) { + constexpr bool expose_blob_index = false; + constexpr bool allow_refresh = false; + std::shared_ptr<ManagedSnapshot> own_snapshot = nullptr; + SequenceNumber snapshot_seq = kMaxSequenceNumber; + SequenceNumber min_uncommitted = 0; + if (options.snapshot != nullptr) { + snapshot_seq = options.snapshot->GetSequenceNumber(); + min_uncommitted = + static_cast_with_check<const SnapshotImpl>(options.snapshot) + ->min_uncommitted_; + } else { + auto* snapshot = GetSnapshot(); + // We take a snapshot to make sure that the related data in the commit map + // are not deleted. + snapshot_seq = snapshot->GetSequenceNumber(); + own_snapshot = std::make_shared<ManagedSnapshot>(db_impl_, snapshot); + min_uncommitted = + static_cast_with_check<const SnapshotImpl>(snapshot)->min_uncommitted_; + } + iterators->clear(); + iterators->reserve(column_families.size()); + for (auto* column_family : column_families) { + auto* cfd = + static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd(); + auto* state = + new IteratorState(this, snapshot_seq, own_snapshot, min_uncommitted); + auto* db_iter = + db_impl_->NewIteratorImpl(options, cfd, snapshot_seq, &state->callback, + expose_blob_index, allow_refresh); + db_iter->RegisterCleanup(CleanupWritePreparedTxnDBIterator, state, nullptr); + iterators->push_back(db_iter); + } + return Status::OK(); +} + +void WritePreparedTxnDB::Init(const TransactionDBOptions& txn_db_opts) { + // Advance max_evicted_seq_ no more than 100 times before the cache wraps + // around. + INC_STEP_FOR_MAX_EVICTED = + std::max(COMMIT_CACHE_SIZE / 100, static_cast<size_t>(1)); + snapshot_cache_ = std::unique_ptr<std::atomic<SequenceNumber>[]>( + new std::atomic<SequenceNumber>[SNAPSHOT_CACHE_SIZE] {}); + commit_cache_ = std::unique_ptr<std::atomic<CommitEntry64b>[]>( + new std::atomic<CommitEntry64b>[COMMIT_CACHE_SIZE] {}); + dummy_max_snapshot_.number_ = kMaxSequenceNumber; + rollback_deletion_type_callback_ = + txn_db_opts.rollback_deletion_type_callback; +} + +void WritePreparedTxnDB::CheckPreparedAgainstMax(SequenceNumber new_max, + bool locked) { + // When max_evicted_seq_ advances, move older entries from prepared_txns_ + // to delayed_prepared_. This guarantees that if a seq is lower than max, + // then it is not in prepared_txns_, and saves an expensive, synchronized + // lookup from a shared set. delayed_prepared_ is expected to be empty in + // normal cases. + ROCKS_LOG_DETAILS( + info_log_, + "CheckPreparedAgainstMax prepared_txns_.empty() %d top: %" PRIu64, + prepared_txns_.empty(), + prepared_txns_.empty() ?
0 : prepared_txns_.top()); + const SequenceNumber prepared_top = prepared_txns_.top(); + const bool empty = prepared_top == kMaxSequenceNumber; + // Preliminary check to avoid the synchronization cost + if (!empty && prepared_top <= new_max) { + if (locked) { + // Needed to avoid double locking in pop(). + prepared_txns_.push_pop_mutex()->Unlock(); + } + WriteLock wl(&prepared_mutex_); + // Need to fetch fresh values of ::top after mutex is acquired + while (!prepared_txns_.empty() && prepared_txns_.top() <= new_max) { + auto to_be_popped = prepared_txns_.top(); + delayed_prepared_.insert(to_be_popped); + ROCKS_LOG_WARN(info_log_, + "prepared_mutex_ overhead %" PRIu64 " (prep=%" PRIu64 + " new_max=%" PRIu64 ")", + static_cast<uint64_t>(delayed_prepared_.size()), + to_be_popped, new_max); + delayed_prepared_empty_.store(false, std::memory_order_release); + // Update prepared_txns_ after updating delayed_prepared_empty_, otherwise + // there will be a point in time that the entry is neither in + // prepared_txns_ nor in delayed_prepared_, which will not be checked if + // delayed_prepared_empty_ is false. + prepared_txns_.pop(); + } + if (locked) { + prepared_txns_.push_pop_mutex()->Lock(); + } + } +} + +void WritePreparedTxnDB::AddPrepared(uint64_t seq, bool locked) { + ROCKS_LOG_DETAILS(info_log_, "Txn %" PRIu64 " Preparing with max %" PRIu64, + seq, max_evicted_seq_.load()); + TEST_SYNC_POINT("AddPrepared::begin:pause"); + TEST_SYNC_POINT("AddPrepared::begin:resume"); + if (!locked) { + prepared_txns_.push_pop_mutex()->Lock(); + } + prepared_txns_.push_pop_mutex()->AssertHeld(); + prepared_txns_.push(seq); + auto new_max = future_max_evicted_seq_.load(); + if (UNLIKELY(seq <= new_max)) { + // This should not happen in the normal case + ROCKS_LOG_ERROR( + info_log_, + "Added prepare_seq is not larger than max_evicted_seq_: %" PRIu64 + " <= %" PRIu64, + seq, new_max); + CheckPreparedAgainstMax(new_max, true /*locked*/); + } + if (!locked) { + prepared_txns_.push_pop_mutex()->Unlock(); + } + TEST_SYNC_POINT("AddPrepared::end"); +} + +void WritePreparedTxnDB::AddCommitted(uint64_t prepare_seq, uint64_t commit_seq, + uint8_t loop_cnt) { + ROCKS_LOG_DETAILS(info_log_, "Txn %" PRIu64 " Committing with %" PRIu64, + prepare_seq, commit_seq); + TEST_SYNC_POINT("WritePreparedTxnDB::AddCommitted:start"); + TEST_SYNC_POINT("WritePreparedTxnDB::AddCommitted:start:pause"); + auto indexed_seq = prepare_seq % COMMIT_CACHE_SIZE; + CommitEntry64b evicted_64b; + CommitEntry evicted; + bool to_be_evicted = GetCommitEntry(indexed_seq, &evicted_64b, &evicted); + if (LIKELY(to_be_evicted)) { + assert(evicted.prep_seq != prepare_seq); + auto prev_max = max_evicted_seq_.load(std::memory_order_acquire); + ROCKS_LOG_DETAILS(info_log_, + "Evicting %" PRIu64 ",%" PRIu64 " with max %" PRIu64, + evicted.prep_seq, evicted.commit_seq, prev_max); + if (prev_max < evicted.commit_seq) { + auto last = db_impl_->GetLastPublishedSequence(); // could be 0 + SequenceNumber max_evicted_seq; + if (LIKELY(evicted.commit_seq < last)) { + assert(last > 0); + // Inc max in larger steps to avoid frequent updates + max_evicted_seq = + std::min(evicted.commit_seq + INC_STEP_FOR_MAX_EVICTED, last - 1); + } else { + // legit when a commit entry in a write batch overwrites the previous one + max_evicted_seq = evicted.commit_seq; + } +#ifdef OS_LINUX + if (rocksdb_write_prepared_TEST_ShouldClearCommitCache && + rocksdb_write_prepared_TEST_ShouldClearCommitCache()) { + max_evicted_seq = last; + } +#endif // OS_LINUX + ROCKS_LOG_DETAILS(info_log_, +
"%lu Evicting %" PRIu64 ",%" PRIu64 " with max %" PRIu64 + " => %lu", + prepare_seq, evicted.prep_seq, evicted.commit_seq, + prev_max, max_evicted_seq); + AdvanceMaxEvictedSeq(prev_max, max_evicted_seq); + } + if (UNLIKELY(!delayed_prepared_empty_.load(std::memory_order_acquire))) { + WriteLock wl(&prepared_mutex_); + auto dp_iter = delayed_prepared_.find(evicted.prep_seq); + if (dp_iter != delayed_prepared_.end()) { + // This is a rare case that txn is committed but prepared_txns_ is not + // cleaned up yet. Refer to delayed_prepared_commits_ definition for + // why it should be kept updated. + delayed_prepared_commits_[evicted.prep_seq] = evicted.commit_seq; + ROCKS_LOG_DEBUG(info_log_, + "delayed_prepared_commits_[%" PRIu64 "]=%" PRIu64, + evicted.prep_seq, evicted.commit_seq); + } + } + // After each eviction from commit cache, check if the commit entry should + // be kept around because it overlaps with a live snapshot. + CheckAgainstSnapshots(evicted); + } + bool succ = + ExchangeCommitEntry(indexed_seq, evicted_64b, {prepare_seq, commit_seq}); + if (UNLIKELY(!succ)) { + ROCKS_LOG_ERROR(info_log_, + "ExchangeCommitEntry failed on [%" PRIu64 "] %" PRIu64 + ",%" PRIu64 " retrying...", + indexed_seq, prepare_seq, commit_seq); + // A very rare event, in which the commit entry is updated before we do. + // Here we apply a very simple solution of retrying. + if (loop_cnt > 100) { + throw std::runtime_error("Infinite loop in AddCommitted!"); + } + AddCommitted(prepare_seq, commit_seq, ++loop_cnt); + return; + } + TEST_SYNC_POINT("WritePreparedTxnDB::AddCommitted:end"); + TEST_SYNC_POINT("WritePreparedTxnDB::AddCommitted:end:pause"); +} + +void WritePreparedTxnDB::RemovePrepared(const uint64_t prepare_seq, + const size_t batch_cnt) { + TEST_SYNC_POINT_CALLBACK( + "RemovePrepared:Start", + const_cast<void*>(reinterpret_cast<const void*>(&prepare_seq))); + TEST_SYNC_POINT("WritePreparedTxnDB::RemovePrepared:pause"); + TEST_SYNC_POINT("WritePreparedTxnDB::RemovePrepared:resume"); + ROCKS_LOG_DETAILS(info_log_, + "RemovePrepared %" PRIu64 " cnt: %" ROCKSDB_PRIszt, + prepare_seq, batch_cnt); + WriteLock wl(&prepared_mutex_); + for (size_t i = 0; i < batch_cnt; i++) { + prepared_txns_.erase(prepare_seq + i); + bool was_empty = delayed_prepared_.empty(); + if (!was_empty) { + delayed_prepared_.erase(prepare_seq + i); + auto it = delayed_prepared_commits_.find(prepare_seq + i); + if (it != delayed_prepared_commits_.end()) { + ROCKS_LOG_DETAILS(info_log_, "delayed_prepared_commits_.erase %" PRIu64, + prepare_seq + i); + delayed_prepared_commits_.erase(it); + } + bool is_empty = delayed_prepared_.empty(); + if (was_empty != is_empty) { + delayed_prepared_empty_.store(is_empty, std::memory_order_release); + } + } + } +} + +bool WritePreparedTxnDB::GetCommitEntry(const uint64_t indexed_seq, + CommitEntry64b* entry_64b, + CommitEntry* entry) const { + *entry_64b = commit_cache_[static_cast<size_t>(indexed_seq)].load( + std::memory_order_acquire); + bool valid = entry_64b->Parse(indexed_seq, entry, FORMAT); + return valid; +} + +bool WritePreparedTxnDB::AddCommitEntry(const uint64_t indexed_seq, + const CommitEntry& new_entry, + CommitEntry* evicted_entry) { + CommitEntry64b new_entry_64b(new_entry, FORMAT); + CommitEntry64b evicted_entry_64b = + commit_cache_[static_cast<size_t>(indexed_seq)].exchange( + new_entry_64b, std::memory_order_acq_rel); + bool valid = evicted_entry_64b.Parse(indexed_seq, evicted_entry, FORMAT); + return valid; +} + +bool WritePreparedTxnDB::ExchangeCommitEntry(const uint64_t 
indexed_seq, + CommitEntry64b& expected_entry_64b, + const CommitEntry& new_entry) { + auto& atomic_entry = commit_cache_[static_cast<size_t>(indexed_seq)]; + CommitEntry64b new_entry_64b(new_entry, FORMAT); + bool succ = atomic_entry.compare_exchange_strong( + expected_entry_64b, new_entry_64b, std::memory_order_acq_rel, + std::memory_order_acquire); + return succ; +} + +void WritePreparedTxnDB::AdvanceMaxEvictedSeq(const SequenceNumber& prev_max, + const SequenceNumber& new_max) { + ROCKS_LOG_DETAILS(info_log_, + "AdvanceMaxEvictedSeq overhead %" PRIu64 " => %" PRIu64, + prev_max, new_max); + // Declare the intention before getting snapshot from the DB. This helps a + // concurrent GetSnapshot to wait to catch up with future_max_evicted_seq_ if + // it has not already. Otherwise a new snapshot could be taken while we are + // asking the DB for the snapshots smaller than the future max. + auto updated_future_max = prev_max; + while (updated_future_max < new_max && + !future_max_evicted_seq_.compare_exchange_weak( + updated_future_max, new_max, std::memory_order_acq_rel, + std::memory_order_relaxed)) { + }; + + CheckPreparedAgainstMax(new_max, false /*locked*/); + + // With each change to max_evicted_seq_ fetch the live snapshots behind it. + // We use max as the version of snapshots to identify how fresh the snapshot + // list is. This works because the snapshots are between 0 and + // max, so the larger the max, the more complete they are. + SequenceNumber new_snapshots_version = new_max; + std::vector<SequenceNumber> snapshots; + bool update_snapshots = false; + if (new_snapshots_version > snapshots_version_) { + // This is to avoid updating snapshots_ if it was already updated + // with a more recent version by a concurrent thread + update_snapshots = true; + // We only care about snapshots lower than max + snapshots = GetSnapshotListFromDB(new_max); + } + if (update_snapshots) { + UpdateSnapshots(snapshots, new_snapshots_version); + if (!snapshots.empty()) { + WriteLock wl(&old_commit_map_mutex_); + for (auto snap : snapshots) { + // This allows IsInSnapshot to tell apart the reads from invalid + // snapshots from the reads from committed values in valid snapshots. + old_commit_map_[snap]; + } + old_commit_map_empty_.store(false, std::memory_order_release); + } + } + auto updated_prev_max = prev_max; + TEST_SYNC_POINT("AdvanceMaxEvictedSeq::update_max:pause"); + TEST_SYNC_POINT("AdvanceMaxEvictedSeq::update_max:resume"); + while (updated_prev_max < new_max && + !max_evicted_seq_.compare_exchange_weak(updated_prev_max, new_max, + std::memory_order_acq_rel, + std::memory_order_relaxed)) { + }; +} + +const Snapshot* WritePreparedTxnDB::GetSnapshot() { + const bool kForWWConflictCheck = true; + return GetSnapshotInternal(!kForWWConflictCheck); +} + +SnapshotImpl* WritePreparedTxnDB::GetSnapshotInternal( + bool for_ww_conflict_check) { + // Note: for this optimization, setting the last sequence number and obtaining + // the smallest uncommitted seq should be done atomically. However to avoid + // the mutex overhead, we call SmallestUnCommittedSeq BEFORE taking the + // snapshot. Since we always update the list of unprepared seqs (via + // AddPrepared) AFTER the last sequence is updated, this guarantees that the + // smallest uncommitted seq that we pair with the snapshot is smaller than or + // equal to the value that would be obtained otherwise atomically. That is ok + // since this optimization works as long as min_uncommitted is less than or + // equal to the smallest uncommitted seq when the snapshot was taken.
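+ // Worked example (hypothetical seqs): txns are prepared at 85 and 95 and + // the last sequence is 100. SmallestUnCommittedSeq() returns 85; the + // snapshot is then taken at, say, 100. Even if 85 commits in between, 85 is + // still <= the smallest uncommitted seq at snapshot time, so the pairing + // <snapshot=100, min_uncommitted=85> remains a valid, conservative bound.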
+ auto min_uncommitted = WritePreparedTxnDB::SmallestUnCommittedSeq(); + SnapshotImpl* snap_impl = db_impl_->GetSnapshotImpl(for_ww_conflict_check); + TEST_SYNC_POINT("WritePreparedTxnDB::GetSnapshotInternal:first"); + assert(snap_impl); + SequenceNumber snap_seq = snap_impl->GetSequenceNumber(); + // Note: Check against future_max_evicted_seq_ (in contrast with + // max_evicted_seq_) in case there is a concurrent AdvanceMaxEvictedSeq. + if (UNLIKELY(snap_seq != 0 && snap_seq <= future_max_evicted_seq_)) { + // There is a very rare case in which the commit entry evicts another commit + // entry that is not published yet, thus advancing max evicted seq beyond the + // last published seq. This case is not likely in real-world setup so we + // handle it with a few retries. + size_t retry = 0; + SequenceNumber max; + while ((max = future_max_evicted_seq_.load()) != 0 && + snap_impl->GetSequenceNumber() <= max && retry < 100) { + ROCKS_LOG_WARN(info_log_, + "GetSnapshot snap: %" PRIu64 " max: %" PRIu64 + " retry %" ROCKSDB_PRIszt, + snap_impl->GetSequenceNumber(), max, retry); + ReleaseSnapshot(snap_impl); + // Wait for last visible seq to catch up with max, and also go beyond it + // by one. + AdvanceSeqByOne(); + snap_impl = db_impl_->GetSnapshotImpl(for_ww_conflict_check); + assert(snap_impl); + retry++; + } + assert(snap_impl->GetSequenceNumber() > max); + if (snap_impl->GetSequenceNumber() <= max) { + throw std::runtime_error( + "Snapshot seq " + std::to_string(snap_impl->GetSequenceNumber()) + + " after " + std::to_string(retry) + + " retries is still less than future_max_evicted_seq_" + + std::to_string(max)); + } + } + EnhanceSnapshot(snap_impl, min_uncommitted); + ROCKS_LOG_DETAILS( + db_impl_->immutable_db_options().info_log, + "GetSnapshot %" PRIu64 " ww:%" PRIi32 " min_uncommitted: %" PRIu64, + snap_impl->GetSequenceNumber(), for_ww_conflict_check, min_uncommitted); + TEST_SYNC_POINT("WritePreparedTxnDB::GetSnapshotInternal:end"); + return snap_impl; +} + +void WritePreparedTxnDB::AdvanceSeqByOne() { + // Inserting an empty value will i) let the max evicted entry be + // published, i.e., max == last_published, ii) increase the last published to + // be one beyond max, i.e., max < last_published. + WriteOptions woptions; + TransactionOptions txn_options; + Transaction* txn0 = BeginTransaction(woptions, txn_options, nullptr); + std::hash<std::thread::id> hasher; + char name[64]; + snprintf(name, 64, "txn%" ROCKSDB_PRIszt, hasher(std::this_thread::get_id())); + assert(strlen(name) < 64 - 1); + Status s = txn0->SetName(name); + assert(s.ok()); + if (s.ok()) { + // Without prepare it would simply skip the commit + s = txn0->Prepare(); + } + assert(s.ok()); + if (s.ok()) { + s = txn0->Commit(); + } + assert(s.ok()); + delete txn0; +} + +const std::vector<SequenceNumber> WritePreparedTxnDB::GetSnapshotListFromDB( + SequenceNumber max) { + ROCKS_LOG_DETAILS(info_log_, "GetSnapshotListFromDB with max %" PRIu64, max); + InstrumentedMutexLock dblock(db_impl_->mutex()); + db_impl_->mutex()->AssertHeld(); + return db_impl_->snapshots().GetAll(nullptr, max); +} + +void WritePreparedTxnDB::ReleaseSnapshotInternal( + const SequenceNumber snap_seq) { + // TODO(myabandeh): relaxed ordering should be enough since the + // synchronization is already done by snapshots_mutex_ under which this + // function is called. + if (snap_seq <= max_evicted_seq_.load(std::memory_order_acquire)) { + // Then this is a rare case in which the transaction did not finish before + // max advances. It is expected for a few read-only backup snapshots.
For such + // snapshots we might have kept around a couple of entries in the + // old_commit_map_. Check and do garbage collection if that is the case. + bool need_gc = false; + { + WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD); + ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead for %" PRIu64, + snap_seq); + ReadLock rl(&old_commit_map_mutex_); + auto prep_set_entry = old_commit_map_.find(snap_seq); + need_gc = prep_set_entry != old_commit_map_.end(); + } + if (need_gc) { + WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD); + ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead for %" PRIu64, + snap_seq); + WriteLock wl(&old_commit_map_mutex_); + old_commit_map_.erase(snap_seq); + old_commit_map_empty_.store(old_commit_map_.empty(), + std::memory_order_release); + } + } +} + +void WritePreparedTxnDB::CleanupReleasedSnapshots( + const std::vector<SequenceNumber>& new_snapshots, + const std::vector<SequenceNumber>& old_snapshots) { + auto newi = new_snapshots.begin(); + auto oldi = old_snapshots.begin(); + for (; newi != new_snapshots.end() && oldi != old_snapshots.end();) { + assert(*newi >= *oldi); // cannot have new snapshots with lower seq + if (*newi == *oldi) { // still not released + auto value = *newi; + while (newi != new_snapshots.end() && *newi == value) { + newi++; + } + while (oldi != old_snapshots.end() && *oldi == value) { + oldi++; + } + } else { + assert(*newi > *oldi); // *oldi is released + ReleaseSnapshotInternal(*oldi); + oldi++; + } + } + // Everything remaining in old_snapshots is released and must be cleaned up + for (; oldi != old_snapshots.end(); oldi++) { + ReleaseSnapshotInternal(*oldi); + } +} + +void WritePreparedTxnDB::UpdateSnapshots( + const std::vector<SequenceNumber>& snapshots, + const SequenceNumber& version) { + ROCKS_LOG_DETAILS(info_log_, "UpdateSnapshots with version %" PRIu64, + version); + TEST_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:p:start"); + TEST_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:s:start"); +#ifndef NDEBUG + size_t sync_i = 0; +#endif + ROCKS_LOG_DETAILS(info_log_, "snapshots_mutex_ overhead"); + WriteLock wl(&snapshots_mutex_); + snapshots_version_ = version; + // We update the list concurrently with the readers. + // Both new and old lists are sorted and the new list is a subset of the + // previous list plus some new items. Thus if a snapshot repeats in + // both new and old lists, it will appear higher in the new list. So if + // we simply insert the new snapshots in order, an overwritten item that + // is still valid in the new list is either written to the same place in + // the array or written to a higher place before it gets + // overwritten by another item. This guarantees that a reader that reads the + // list bottom-up will eventually see a snapshot that repeats in the + // update, either before it gets overwritten by the writer or + // afterwards.
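+ // Worked example (hypothetical seqs): old list {10, 20, 30}, new list + // {20, 30, 40}. The writer stores 20 at index 0, then 30 at index 1, then + // 40 at index 2; each surviving snapshot lands at a position no larger + // than its old one before the old position is overwritten, so a reader + // scanning against the writer's direction sees 20 and 30 at either their + // old or their new slot.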
+ size_t i = 0; + auto it = snapshots.begin(); + for (; it != snapshots.end() && i < SNAPSHOT_CACHE_SIZE; ++it, ++i) { + snapshot_cache_[i].store(*it, std::memory_order_release); + TEST_IDX_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:p:", ++sync_i); + TEST_IDX_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:s:", sync_i); + } +#ifndef NDEBUG + // Release the remaining sync points since they are useless given that the + // reader would also use the lock to access snapshots + for (++sync_i; sync_i <= 10; ++sync_i) { + TEST_IDX_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:p:", sync_i); + TEST_IDX_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:s:", sync_i); + } +#endif + snapshots_.clear(); + for (; it != snapshots.end(); ++it) { + // Insert them into a vector that is less efficient to access + // concurrently + snapshots_.push_back(*it); + } + // Update the size at the end. Otherwise a parallel reader might read + // items that are not set yet. + snapshots_total_.store(snapshots.size(), std::memory_order_release); + + // Note: this must be done after the snapshots data structures are updated + // with the new list of snapshots. + CleanupReleasedSnapshots(snapshots, snapshots_all_); + snapshots_all_ = snapshots; + + TEST_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:p:end"); + TEST_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:s:end"); +} + +void WritePreparedTxnDB::CheckAgainstSnapshots(const CommitEntry& evicted) { + TEST_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:p:start"); + TEST_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:s:start"); +#ifndef NDEBUG + size_t sync_i = 0; +#endif + // First check the snapshot cache that is efficient for concurrent access + auto cnt = snapshots_total_.load(std::memory_order_acquire); + // The list might get updated concurrently as we are reading from it. The + // reader should be able to read all the snapshots that are still valid + // after the update. Since the surviving snapshots are written to a higher + // place before they get overwritten, a reader that reads bottom-up will + // eventually see them. + const bool next_is_larger = true; + // We will set this to true if the border line snapshot suggests that.
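+ // Illustration (hypothetical sizes): with SNAPSHOT_CACHE_SIZE == 128 and + // 200 live snapshots, indices 0..127 are served from snapshot_cache_ and + // the remaining 72 from snapshots_. If the border-line snapshot at index + // 127 is still below evicted.commit_seq, larger snapshots in snapshots_ + // may overlap the evicted entry too, so the slower list is searched as + // well.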
+ bool search_larger_list = false; + size_t ip1 = std::min(cnt, SNAPSHOT_CACHE_SIZE); + for (; 0 < ip1; ip1--) { + SequenceNumber snapshot_seq = + snapshot_cache_[ip1 - 1].load(std::memory_order_acquire); + TEST_IDX_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:p:", + ++sync_i); + TEST_IDX_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:s:", sync_i); + if (ip1 == SNAPSHOT_CACHE_SIZE) { // border line snapshot + // snapshot_seq < commit_seq => larger_snapshot_seq <= commit_seq + // then later also continue the search to larger snapshots + search_larger_list = snapshot_seq < evicted.commit_seq; + } + if (!MaybeUpdateOldCommitMap(evicted.prep_seq, evicted.commit_seq, + snapshot_seq, !next_is_larger)) { + break; + } + } +#ifndef NDEBUG + // Release the remaining sync points before acquiring the lock + for (++sync_i; sync_i <= 10; ++sync_i) { + TEST_IDX_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:p:", sync_i); + TEST_IDX_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:s:", sync_i); + } +#endif + TEST_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:p:end"); + TEST_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:s:end"); + if (UNLIKELY(SNAPSHOT_CACHE_SIZE < cnt && search_larger_list)) { + // Then access the less efficient list of snapshots_ + WPRecordTick(TXN_SNAPSHOT_MUTEX_OVERHEAD); + ROCKS_LOG_WARN(info_log_, + "snapshots_mutex_ overhead for <%" PRIu64 ",%" PRIu64 + "> with %" ROCKSDB_PRIszt " snapshots", + evicted.prep_seq, evicted.commit_seq, cnt); + ReadLock rl(&snapshots_mutex_); + // Items could have moved from the snapshots_ to snapshot_cache_ before + // acquiring the lock. To make sure that we do not miss a valid snapshot, + // read snapshot_cache_ again while holding the lock. + for (size_t i = 0; i < SNAPSHOT_CACHE_SIZE; i++) { + SequenceNumber snapshot_seq = + snapshot_cache_[i].load(std::memory_order_acquire); + if (!MaybeUpdateOldCommitMap(evicted.prep_seq, evicted.commit_seq, + snapshot_seq, next_is_larger)) { + break; + } + } + for (auto snapshot_seq_2 : snapshots_) { + if (!MaybeUpdateOldCommitMap(evicted.prep_seq, evicted.commit_seq, + snapshot_seq_2, next_is_larger)) { + break; + } + } + } +} + +bool WritePreparedTxnDB::MaybeUpdateOldCommitMap( + const uint64_t& prep_seq, const uint64_t& commit_seq, + const uint64_t& snapshot_seq, const bool next_is_larger = true) { + // If we do not store an entry in old_commit_map_ we assume it is committed in + // all snapshots. If commit_seq <= snapshot_seq, it is considered already in + // the snapshot so we need not keep the entry around for this snapshot. + if (commit_seq <= snapshot_seq) { + // continue the search if the next snapshot could be smaller than commit_seq + return !next_is_larger; + } + // then snapshot_seq < commit_seq + if (prep_seq <= snapshot_seq) { // overlapping range + WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD); + ROCKS_LOG_WARN(info_log_, + "old_commit_map_mutex_ overhead for %" PRIu64 + " commit entry: <%" PRIu64 ",%" PRIu64 ">", + snapshot_seq, prep_seq, commit_seq); + WriteLock wl(&old_commit_map_mutex_); + old_commit_map_empty_.store(false, std::memory_order_release); + auto& vec = old_commit_map_[snapshot_seq]; + vec.insert(std::upper_bound(vec.begin(), vec.end(), prep_seq), prep_seq); + // We need to store it once for each overlapping snapshot. We return true + // to continue the search in case there are more overlapping snapshots.
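+ // Worked example (hypothetical seqs): for commit entry <prep=10, commit=50> + // and a snapshot at 30, the range overlaps (10 <= 30 < 50): a reader at 30 + // must not see the value, so 10 is recorded under old_commit_map_[30]. A + // snapshot at 60 needs no entry because 50 <= 60 already implies the + // commit is visible to it.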
+ return true; + } + // continue the search if the next snapshot could be larger than prep_seq + return next_is_larger; +} + +WritePreparedTxnDB::~WritePreparedTxnDB() { + // At this point there could be a running compaction/flush holding a + // SnapshotChecker, which holds a pointer back to WritePreparedTxnDB. + // Make sure those jobs finish before destructing WritePreparedTxnDB. + if (!db_impl_->shutting_down_) { + db_impl_->CancelAllBackgroundWork(true /*wait*/); + } +} + +void SubBatchCounter::InitWithComp(const uint32_t cf) { + auto cmp = comparators_[cf]; + keys_[cf] = CFKeys(SetComparator(cmp)); +} + +void SubBatchCounter::AddKey(const uint32_t cf, const Slice& key) { + CFKeys& cf_keys = keys_[cf]; + if (cf_keys.size() == 0) { // just inserted + InitWithComp(cf); + } + auto it = cf_keys.insert(key); + if (it.second == false) { // second is false if an element already existed. + batches_++; + keys_.clear(); + InitWithComp(cf); + keys_[cf].insert(key); + } +} + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/write_prepared_txn_db.h b/src/rocksdb/utilities/transactions/write_prepared_txn_db.h new file mode 100644 index 000000000..25a382473 --- /dev/null +++ b/src/rocksdb/utilities/transactions/write_prepared_txn_db.h @@ -0,0 +1,1125 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include <cinttypes> +#include <mutex> +#include <queue> +#include <set> +#include <string> +#include <unordered_map> +#include <vector> + +#include "db/db_iter.h" +#include "db/pre_release_callback.h" +#include "db/read_callback.h" +#include "db/snapshot_checker.h" +#include "logging/logging.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/utilities/transaction_db.h" +#include "util/cast_util.h" +#include "util/set_comparator.h" +#include "util/string_util.h" +#include "utilities/transactions/pessimistic_transaction.h" +#include "utilities/transactions/pessimistic_transaction_db.h" +#include "utilities/transactions/write_prepared_txn.h" + +namespace ROCKSDB_NAMESPACE { +enum SnapshotBackup : bool { kUnbackedByDBSnapshot, kBackedByDBSnapshot }; + +// A PessimisticTransactionDB that writes data to the DB after the prepare +// phase of 2PC. In this way some data in the DB might not be committed. The DB +// provides mechanisms to tell such data apart from committed data.
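+// A usage sketch (illustrative; such a DB is normally obtained through +// TransactionDB::Open rather than constructed directly): +// TransactionDBOptions txn_db_options; +// txn_db_options.write_policy = TxnDBWritePolicy::WRITE_PREPARED; +// TransactionDB* db = nullptr; +// Status s = TransactionDB::Open(options, txn_db_options, path, &db); +// with options and path supplied by the caller.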
+class WritePreparedTxnDB : public PessimisticTransactionDB { + public: + explicit WritePreparedTxnDB(DB* db, + const TransactionDBOptions& txn_db_options) + : PessimisticTransactionDB(db, txn_db_options), + SNAPSHOT_CACHE_BITS(txn_db_options.wp_snapshot_cache_bits), + SNAPSHOT_CACHE_SIZE(static_cast<size_t>(1ull << SNAPSHOT_CACHE_BITS)), + COMMIT_CACHE_BITS(txn_db_options.wp_commit_cache_bits), + COMMIT_CACHE_SIZE(static_cast<size_t>(1ull << COMMIT_CACHE_BITS)), + FORMAT(COMMIT_CACHE_BITS) { + Init(txn_db_options); + } + + explicit WritePreparedTxnDB(StackableDB* db, + const TransactionDBOptions& txn_db_options) + : PessimisticTransactionDB(db, txn_db_options), + SNAPSHOT_CACHE_BITS(txn_db_options.wp_snapshot_cache_bits), + SNAPSHOT_CACHE_SIZE(static_cast<size_t>(1ull << SNAPSHOT_CACHE_BITS)), + COMMIT_CACHE_BITS(txn_db_options.wp_commit_cache_bits), + COMMIT_CACHE_SIZE(static_cast<size_t>(1ull << COMMIT_CACHE_BITS)), + FORMAT(COMMIT_CACHE_BITS) { + Init(txn_db_options); + } + + virtual ~WritePreparedTxnDB(); + + virtual Status Initialize( + const std::vector<size_t>& compaction_enabled_cf_indices, + const std::vector<ColumnFamilyHandle*>& handles) override; + + Transaction* BeginTransaction(const WriteOptions& write_options, + const TransactionOptions& txn_options, + Transaction* old_txn) override; + + using TransactionDB::Write; + Status Write(const WriteOptions& opts, WriteBatch* updates) override; + + // Optimized version of ::Write that receives more optimization requests such + // as skip_concurrency_control. + using PessimisticTransactionDB::Write; + Status Write(const WriteOptions& opts, const TransactionDBWriteOptimizations&, + WriteBatch* updates) override; + + // Write the batch to the underlying DB and mark it as committed. Can be + // used both directly from the TxnDB and through a transaction. + Status WriteInternal(const WriteOptions& write_options, WriteBatch* batch, + size_t batch_cnt, WritePreparedTxn* txn); + + using DB::Get; + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) override; + + using DB::MultiGet; + virtual std::vector<Status> MultiGet( + const ReadOptions& options, + const std::vector<ColumnFamilyHandle*>& column_family, + const std::vector<Slice>& keys, + std::vector<std::string>* values) override; + + using DB::NewIterator; + virtual Iterator* NewIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family) override; + + using DB::NewIterators; + virtual Status NewIterators( + const ReadOptions& options, + const std::vector<ColumnFamilyHandle*>& column_families, + std::vector<Iterator*>* iterators) override; + + // Check whether the transaction that wrote the value with sequence number seq + // is visible to the snapshot with sequence number snapshot_seq. + // Returns true if commit_seq <= snapshot_seq. + // If the snapshot_seq is already released and snapshot_seq <= max, sets + // *snap_released to true and returns true as well. + inline bool IsInSnapshot(uint64_t prep_seq, uint64_t snapshot_seq, + uint64_t min_uncommitted = kMinUnCommittedSeq, + bool* snap_released = nullptr) const { + ROCKS_LOG_DETAILS(info_log_, + "IsInSnapshot %" PRIu64 " in %" PRIu64 + " min_uncommitted %" PRIu64, + prep_seq, snapshot_seq, min_uncommitted); + assert(min_uncommitted >= kMinUnCommittedSeq); + // Caller is responsible for initializing snap_released.
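+ // Examples (hypothetical seqs): given a commit entry <prep=10, commit=15>, + // IsInSnapshot(10, 20) is true (15 <= 20), IsInSnapshot(10, 12) is false + // (committed at 15, after the snapshot), and IsInSnapshot(10, 5) is false + // immediately since 5 < 10.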
+ assert(snap_released == nullptr || *snap_released == false); + // Here we try to infer the return value without looking into prepare list. + // This helps avoid synchronization over a shared map. + // TODO(myabandeh): optimize this. This sequence of checks must be correct + // but not necessarily efficient + if (prep_seq == 0) { + // Compaction will output keys to bottom-level with sequence number 0 if + // it is visible to the earliest snapshot. + ROCKS_LOG_DETAILS( + info_log_, "IsInSnapshot %" PRIu64 " in %" PRIu64 " returns %" PRId32, + prep_seq, snapshot_seq, 1); + return true; + } + if (snapshot_seq < prep_seq) { + // snapshot_seq < prep_seq <= commit_seq => snapshot_seq < commit_seq + ROCKS_LOG_DETAILS( + info_log_, "IsInSnapshot %" PRIu64 " in %" PRIu64 " returns %" PRId32, + prep_seq, snapshot_seq, 0); + return false; + } + if (prep_seq < min_uncommitted) { + ROCKS_LOG_DETAILS(info_log_, + "IsInSnapshot %" PRIu64 " in %" PRIu64 + " returns %" PRId32 + " because of min_uncommitted %" PRIu64, + prep_seq, snapshot_seq, 1, min_uncommitted); + return true; + } + // Commit of delayed prepared has two non-atomic steps: add to commit cache, + // remove from delayed prepared. Our reads from these two are also + // non-atomic. Thus, by looking into the commit cache first, we might find + // the prep_seq neither in the commit cache nor in delayed_prepared_. To fix + // that i) we check if there was any delayed prepared BEFORE looking into + // commit cache, ii) if there was, we complete the search steps to be these: + // i) commit cache, ii) delayed prepared, iii) commit cache again. In this + // way if the first query to commit cache missed the commit, the 2nd will + // catch it. + bool was_empty; + SequenceNumber max_evicted_seq_lb, max_evicted_seq_ub; + CommitEntry64b dont_care; + auto indexed_seq = prep_seq % COMMIT_CACHE_SIZE; + size_t repeats = 0; + do { + repeats++; + assert(repeats < 100); + if (UNLIKELY(repeats >= 100)) { + throw std::runtime_error( + "The read was interrupted 100 times by updates to max_evicted_seq_. " + "This is unexpected in all setups"); + } + max_evicted_seq_lb = max_evicted_seq_.load(std::memory_order_acquire); + TEST_SYNC_POINT( + "WritePreparedTxnDB::IsInSnapshot:max_evicted_seq_:pause"); + TEST_SYNC_POINT( + "WritePreparedTxnDB::IsInSnapshot:max_evicted_seq_:resume"); + was_empty = delayed_prepared_empty_.load(std::memory_order_acquire); + TEST_SYNC_POINT( + "WritePreparedTxnDB::IsInSnapshot:delayed_prepared_empty_:pause"); + TEST_SYNC_POINT( + "WritePreparedTxnDB::IsInSnapshot:delayed_prepared_empty_:resume"); + CommitEntry cached; + bool exist = GetCommitEntry(indexed_seq, &dont_care, &cached); + TEST_SYNC_POINT("WritePreparedTxnDB::IsInSnapshot:GetCommitEntry:pause"); + TEST_SYNC_POINT("WritePreparedTxnDB::IsInSnapshot:GetCommitEntry:resume"); + if (exist && prep_seq == cached.prep_seq) { + // It is committed and also not evicted from commit cache + ROCKS_LOG_DETAILS( + info_log_, + "IsInSnapshot %" PRIu64 " in %" PRIu64 " returns %" PRId32, + prep_seq, snapshot_seq, cached.commit_seq <= snapshot_seq); + return cached.commit_seq <= snapshot_seq; + } + // else it could be committed but not inserted in the map which could + // happen after recovery, or it could be committed and evicted by another + // commit, or never committed.
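+ // For intuition (hypothetical sizes): with COMMIT_CACHE_BITS == 23 the + // cache has 2^23 slots, so prep_seq 5 and prep_seq 5 + 2^23 share a slot. + // A miss here therefore only proves the seq was not committed recently; + // the max_evicted_seq_ checks below tell an evicted commit apart from a + // never-committed one.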
+
+      // At this point we don't know if it was committed or it is still
+      // prepared
+      max_evicted_seq_ub = max_evicted_seq_.load(std::memory_order_acquire);
+      if (UNLIKELY(max_evicted_seq_lb != max_evicted_seq_ub)) {
+        continue;
+      }
+      // Note: max_evicted_seq_ at the time of GetCommitEntry was <=
+      // max_evicted_seq_ub
+      if (max_evicted_seq_ub < prep_seq) {
+        // Not evicted from cache and also not present, so must be still
+        // prepared
+        ROCKS_LOG_DETAILS(info_log_,
+                          "IsInSnapshot %" PRIu64 " in %" PRIu64
+                          " returns %" PRId32,
+                          prep_seq, snapshot_seq, 0);
+        return false;
+      }
+      TEST_SYNC_POINT("WritePreparedTxnDB::IsInSnapshot:prepared_mutex_:pause");
+      TEST_SYNC_POINT(
+          "WritePreparedTxnDB::IsInSnapshot:prepared_mutex_:resume");
+      if (!was_empty) {
+        // We should not normally reach here
+        WPRecordTick(TXN_PREPARE_MUTEX_OVERHEAD);
+        ReadLock rl(&prepared_mutex_);
+        ROCKS_LOG_WARN(
+            info_log_, "prepared_mutex_ overhead %" PRIu64 " for %" PRIu64,
+            static_cast<uint64_t>(delayed_prepared_.size()), prep_seq);
+        if (delayed_prepared_.find(prep_seq) != delayed_prepared_.end()) {
+          // This is the order: 1) delayed_prepared_commits_ update, 2) publish,
+          // 3) delayed_prepared_ clean up. So check if it is the case of a late
+          // cleanup.
+          auto it = delayed_prepared_commits_.find(prep_seq);
+          if (it == delayed_prepared_commits_.end()) {
+            // Then it is not committed yet
+            ROCKS_LOG_DETAILS(info_log_,
+                              "IsInSnapshot %" PRIu64 " in %" PRIu64
+                              " returns %" PRId32,
+                              prep_seq, snapshot_seq, 0);
+            return false;
+          } else {
+            ROCKS_LOG_DETAILS(info_log_,
+                              "IsInSnapshot %" PRIu64 " in %" PRIu64
+                              " commit: %" PRIu64 " returns %" PRId32,
+                              prep_seq, snapshot_seq, it->second,
+                              snapshot_seq <= it->second);
+            return it->second <= snapshot_seq;
+          }
+        } else {
+          // 2nd query to commit cache. Refer to the was_empty comment above.
+          exist = GetCommitEntry(indexed_seq, &dont_care, &cached);
+          if (exist && prep_seq == cached.prep_seq) {
+            ROCKS_LOG_DETAILS(
+                info_log_,
+                "IsInSnapshot %" PRIu64 " in %" PRIu64 " returns %" PRId32,
+                prep_seq, snapshot_seq, cached.commit_seq <= snapshot_seq);
+            return cached.commit_seq <= snapshot_seq;
+          }
+          max_evicted_seq_ub = max_evicted_seq_.load(std::memory_order_acquire);
+        }
+      }
+    } while (UNLIKELY(max_evicted_seq_lb != max_evicted_seq_ub));
+    // When advancing max_evicted_seq_, we move older entries from
+    // prepared_txns_ to delayed_prepared_. We also move evicted entries from
+    // the commit cache to old_commit_map_ if they overlap with any snapshot.
+    // Since prep_seq <= max_evicted_seq_, we have three cases: i) in
+    // delayed_prepared_, ii) in old_commit_map_, iii) committed with no
+    // conflict with any snapshot. Case (i) delayed_prepared_ is checked above.
+    if (max_evicted_seq_ub < snapshot_seq) {  // then (ii) cannot be the case
+      // only (iii) is the case: committed
+      // commit_seq <= max_evicted_seq_ < snapshot_seq => commit_seq <
+      // snapshot_seq
+      ROCKS_LOG_DETAILS(
+          info_log_, "IsInSnapshot %" PRIu64 " in %" PRIu64 " returns %" PRId32,
+          prep_seq, snapshot_seq, 1);
+      return true;
+    }
+    // else (ii) might be the case: check the commit data saved for this
+    // snapshot. If there was no overlapping commit entry, then it is committed
+    // with a commit_seq lower than any live snapshot, including snapshot_seq.
+    if (old_commit_map_empty_.load(std::memory_order_acquire)) {
+      ROCKS_LOG_DETAILS(info_log_,
+                        "IsInSnapshot %" PRIu64 " in %" PRIu64
+                        " returns %" PRId32 " released=1",
+                        prep_seq, snapshot_seq, 0);
+      assert(snap_released);
+      // This snapshot is not valid anymore. We cannot tell if prep_seq is
+      // committed before or after the snapshot. Return true but also set
+      // snap_released to true.
+      *snap_released = true;
+      return true;
+    }
+    {
+      // We should not normally reach here unless snapshot_seq is old. This is
+      // a rare case and it is ok to pay the cost of mutex ReadLock for such
+      // old, reading transactions.
+      WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD);
+      ReadLock rl(&old_commit_map_mutex_);
+      auto prep_set_entry = old_commit_map_.find(snapshot_seq);
+      bool found = prep_set_entry != old_commit_map_.end();
+      if (found) {
+        auto& vec = prep_set_entry->second;
+        found = std::binary_search(vec.begin(), vec.end(), prep_seq);
+      } else {
+        // coming from compaction
+        ROCKS_LOG_DETAILS(info_log_,
+                          "IsInSnapshot %" PRIu64 " in %" PRIu64
+                          " returns %" PRId32 " released=1",
+                          prep_seq, snapshot_seq, 0);
+        // This snapshot is not valid anymore. We cannot tell if prep_seq is
+        // committed before or after the snapshot. Return true but also set
+        // snap_released to true.
+        assert(snap_released);
+        *snap_released = true;
+        return true;
+      }
+
+      if (!found) {
+        ROCKS_LOG_DETAILS(info_log_,
+                          "IsInSnapshot %" PRIu64 " in %" PRIu64
+                          " returns %" PRId32,
+                          prep_seq, snapshot_seq, 1);
+        return true;
+      }
+    }
+    // (ii) is the case: it is committed, but after the snapshot_seq
+    ROCKS_LOG_DETAILS(
+        info_log_, "IsInSnapshot %" PRIu64 " in %" PRIu64 " returns %" PRId32,
+        prep_seq, snapshot_seq, 0);
+    return false;
+  }
+
+  // Add the transaction with prepare sequence seq to the prepared list.
+  // Note: must be called serially with increasing seq on each call.
+  // locked is true if prepared_mutex_ is already locked.
+  void AddPrepared(uint64_t seq, bool locked = false);
+  // Check if any of the prepared txns are less than new max_evicted_seq_. Must
+  // be called with prepared_mutex_ write locked.
+  void CheckPreparedAgainstMax(SequenceNumber new_max, bool locked);
+  // Remove the transaction with prepare sequence seq from the prepared list
+  void RemovePrepared(const uint64_t seq, const size_t batch_cnt = 1);
+  // Add the transaction with prepare sequence prepare_seq and commit sequence
+  // commit_seq to the commit map. loop_cnt is to detect infinite loops.
+  // Note: must be called serially.
+  void AddCommitted(uint64_t prepare_seq, uint64_t commit_seq,
+                    uint8_t loop_cnt = 0);
+
+  struct CommitEntry {
+    uint64_t prep_seq;
+    uint64_t commit_seq;
+    CommitEntry() : prep_seq(0), commit_seq(0) {}
+    CommitEntry(uint64_t ps, uint64_t cs) : prep_seq(ps), commit_seq(cs) {}
+    bool operator==(const CommitEntry& rhs) const {
+      return prep_seq == rhs.prep_seq && commit_seq == rhs.commit_seq;
+    }
+  };
+
+  struct CommitEntry64bFormat {
+    explicit CommitEntry64bFormat(size_t index_bits)
+        : INDEX_BITS(index_bits),
+          PREP_BITS(static_cast<size_t>(64 - PAD_BITS - INDEX_BITS)),
+          COMMIT_BITS(static_cast<size_t>(64 - PREP_BITS)),
+          COMMIT_FILTER(static_cast<uint64_t>((1ull << COMMIT_BITS) - 1)),
+          DELTA_UPPERBOUND(static_cast<uint64_t>((1ull << COMMIT_BITS))) {}
+    // Number of higher bits of a sequence number that are not used. They are
+    // used to encode the value type, ...
+    const size_t PAD_BITS = static_cast<size_t>(8);
+    // Number of lower bits from the prepare seq that can be skipped, as they
+    // are implied by the index of the entry in the array
+    const size_t INDEX_BITS;
+    // Number of bits we use to encode the prepare seq
+    const size_t PREP_BITS;
+    // Number of bits we use to encode the commit seq.
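+    // For example, assuming the default wp_commit_cache_bits of 23 and the
+    // fixed PAD_BITS of 8 (numbers for illustration only):
+    // PREP_BITS = 64 - 8 - 23 = 33, COMMIT_BITS = 64 - 33 = 31, and hence
+    // DELTA_UPPERBOUND = 2^31.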
+    const size_t COMMIT_BITS;
+    // Filter to encode/decode commit seq
+    const uint64_t COMMIT_FILTER;
+    // The value of commit_seq - prepare_seq + 1 must be less than this bound
+    const uint64_t DELTA_UPPERBOUND;
+  };
+
+  // Prepare Seq (64 bits) = PAD ... PAD PREP PREP ... PREP INDEX INDEX ... INDEX
+  // Delta Seq   (64 bits) = 0 0 0 0 0 0 0 0 0 0 0 0 DELTA DELTA ... DELTA DELTA
+  // Encoded Value         = PREP PREP .... PREP PREP DELTA DELTA ... DELTA DELTA
+  // PAD: the first bits of a seq that are reserved for tagging and hence
+  // ignored
+  // PREP/INDEX: the used bits in a prepare seq number
+  // INDEX: the bits that do not have to be encoded (will be provided
+  // externally)
+  // DELTA: commit seq - prepare seq + 1
+  // Number of DELTA bits should be equal to number of index bits + PADs
+  struct CommitEntry64b {
+    constexpr CommitEntry64b() noexcept : rep_(0) {}
+
+    CommitEntry64b(const CommitEntry& entry, const CommitEntry64bFormat& format)
+        : CommitEntry64b(entry.prep_seq, entry.commit_seq, format) {}
+
+    CommitEntry64b(const uint64_t ps, const uint64_t cs,
+                   const CommitEntry64bFormat& format) {
+      assert(ps < static_cast<uint64_t>(
+                      (1ull << (format.PREP_BITS + format.INDEX_BITS))));
+      assert(ps <= cs);
+      uint64_t delta = cs - ps + 1;  // make initialized delta always >= 1
+      // zero is reserved for uninitialized entries
+      assert(0 < delta);
+      assert(delta < format.DELTA_UPPERBOUND);
+      if (delta >= format.DELTA_UPPERBOUND) {
+        throw std::runtime_error(
+            "commit_seq >> prepare_seq. The allowed distance is " +
+            std::to_string(format.DELTA_UPPERBOUND) + " commit_seq is " +
+            std::to_string(cs) + " prepare_seq is " + std::to_string(ps));
+      }
+      rep_ = (ps << format.PAD_BITS) & ~format.COMMIT_FILTER;
+      rep_ = rep_ | delta;
+    }
+
+    // Return false if the entry is empty
+    bool Parse(const uint64_t indexed_seq, CommitEntry* entry,
+               const CommitEntry64bFormat& format) {
+      uint64_t delta = rep_ & format.COMMIT_FILTER;
+      // zero is reserved for uninitialized entries
+      assert(delta < static_cast<uint64_t>((1ull << format.COMMIT_BITS)));
+      if (delta == 0) {
+        return false;  // an initialized entry would have a non-zero delta
+      }
+
+      assert(indexed_seq < static_cast<uint64_t>((1ull << format.INDEX_BITS)));
+      uint64_t prep_up = rep_ & ~format.COMMIT_FILTER;
+      prep_up >>= format.PAD_BITS;
+      const uint64_t& prep_low = indexed_seq;
+      entry->prep_seq = prep_up | prep_low;
+
+      entry->commit_seq = entry->prep_seq + delta - 1;
+      return true;
+    }
+
+   private:
+    uint64_t rep_;
+  };
+
+  // Struct to hold ownership of snapshot and read callback for cleanup.
+  struct IteratorState;
+
+  std::shared_ptr<std::map<uint32_t, const Comparator*>> GetCFComparatorMap() {
+    return cf_map_;
+  }
+  std::shared_ptr<std::map<uint32_t, ColumnFamilyHandle*>> GetCFHandleMap() {
+    return handle_map_;
+  }
+  void UpdateCFComparatorMap(
+      const std::vector<ColumnFamilyHandle*>& handles) override;
+  void UpdateCFComparatorMap(ColumnFamilyHandle* handle) override;
+
+  virtual const Snapshot* GetSnapshot() override;
+  SnapshotImpl* GetSnapshotInternal(bool for_ww_conflict_check);
+
+ protected:
+  virtual Status VerifyCFOptions(
+      const ColumnFamilyOptions& cf_options) override;
+  // Assign the min and max sequence numbers for reading from the db. A seq >
+  // max is not valid, and a seq < min is valid, and a min <= seq < max requires
+  // further checking. Normally max is defined by the snapshot and min is
+  // defined by the minimum uncommitted seq.
+  inline SnapshotBackup AssignMinMaxSeqs(const Snapshot* snapshot,
+                                         SequenceNumber* min,
+                                         SequenceNumber* max);
+  // Validate whether a snapshot sequence number is still valid based on the
+  // latest db status. backed_by_snapshot specifies whether the number is
+  // backed by an actual snapshot object. order specifies the memory order with
+  // which we load the atomic variables: relaxed is enough for the default
+  // since we only care about the last value seen by the same thread.
+  inline bool ValidateSnapshot(
+      const SequenceNumber snap_seq, const SnapshotBackup backed_by_snapshot,
+      std::memory_order order = std::memory_order_relaxed);
+  // Get a dummy snapshot that refers to kMaxSequenceNumber
+  Snapshot* GetMaxSnapshot() { return &dummy_max_snapshot_; }
+
+  bool ShouldRollbackWithSingleDelete(ColumnFamilyHandle* column_family,
+                                      const Slice& key) {
+    return rollback_deletion_type_callback_
+               ? rollback_deletion_type_callback_(this, column_family, key)
+               : false;
+  }
+
+  std::function<bool(TransactionDB*, ColumnFamilyHandle*, const Slice&)>
+      rollback_deletion_type_callback_;
+
+ private:
+  friend class AddPreparedCallback;
+  friend class PreparedHeap_BasicsTest_Test;
+  friend class PreparedHeap_Concurrent_Test;
+  friend class PreparedHeap_EmptyAtTheEnd_Test;
+  friend class SnapshotConcurrentAccessTest_SnapshotConcurrentAccess_Test;
+  friend class WritePreparedCommitEntryPreReleaseCallback;
+  friend class WritePreparedTransactionTestBase;
+  friend class WritePreparedTxn;
+  friend class WritePreparedTxnDBMock;
+  friend class WritePreparedTransactionTest_AddPreparedBeforeMax_Test;
+  friend class WritePreparedTransactionTest_AdvanceMaxEvictedSeqBasic_Test;
+  friend class
+      WritePreparedTransactionTest_AdvanceMaxEvictedSeqWithDuplicates_Test;
+  friend class WritePreparedTransactionTest_AdvanceSeqByOne_Test;
+  friend class WritePreparedTransactionTest_BasicRecovery_Test;
+  friend class WritePreparedTransactionTest_CheckAgainstSnapshots_Test;
+  friend class WritePreparedTransactionTest_CleanupSnapshotEqualToMax_Test;
+  friend class WritePreparedTransactionTest_ConflictDetectionAfterRecovery_Test;
+  friend class WritePreparedTransactionTest_CommitMap_Test;
+  friend class WritePreparedTransactionTest_DoubleSnapshot_Test;
+  friend class WritePreparedTransactionTest_IsInSnapshotEmptyMap_Test;
+  friend class WritePreparedTransactionTest_IsInSnapshotReleased_Test;
+  friend class WritePreparedTransactionTest_IsInSnapshot_Test;
+  friend class WritePreparedTransactionTest_NewSnapshotLargerThanMax_Test;
+  friend class WritePreparedTransactionTest_MaxCatchupWithNewSnapshot_Test;
+  friend class WritePreparedTransactionTest_MaxCatchupWithUnbackedSnapshot_Test;
+  friend class
+      WritePreparedTransactionTest_NonAtomicCommitOfDelayedPrepared_Test;
+  friend class
+      WritePreparedTransactionTest_NonAtomicUpdateOfDelayedPrepared_Test;
+  friend class WritePreparedTransactionTest_NonAtomicUpdateOfMaxEvictedSeq_Test;
+  friend class WritePreparedTransactionTest_OldCommitMapGC_Test;
+  friend class WritePreparedTransactionTest_Rollback_Test;
+  friend class WritePreparedTransactionTest_SmallestUnCommittedSeq_Test;
+  friend class WriteUnpreparedTxn;
+  friend class WriteUnpreparedTxnDB;
+  friend class WriteUnpreparedTransactionTest_RecoveryTest_Test;
+  friend class MultiOpsTxnsStressTest;
+
+  void Init(const TransactionDBOptions& txn_db_opts);
+
+  void WPRecordTick(uint32_t ticker_type) const {
+    RecordTick(db_impl_->immutable_db_options_.statistics.get(), ticker_type);
+  }
+
+  // A heap with amortized O(1) complexity for erase. It uses one extra heap
+  // to keep track of erased entries that are not yet on top of the main heap.
+  class PreparedHeap {
+    // The mutex is required for push and pop from PreparedHeap. ::erase will
+    // use external synchronization via prepared_mutex_.
+    port::Mutex push_pop_mutex_;
+    std::deque<uint64_t> heap_;
+    std::priority_queue<uint64_t, std::vector<uint64_t>, std::greater<uint64_t>>
+        erased_heap_;
+    std::atomic<uint64_t> heap_top_ = {kMaxSequenceNumber};
+    // True when testing crash recovery
+    bool TEST_CRASH_ = false;
+    friend class WritePreparedTxnDB;
+
+   public:
+    ~PreparedHeap() {
+      if (!TEST_CRASH_) {
+        assert(heap_.empty());
+        assert(erased_heap_.empty());
+      }
+    }
+    port::Mutex* push_pop_mutex() { return &push_pop_mutex_; }
+
+    inline bool empty() { return top() == kMaxSequenceNumber; }
+    // Returns kMaxSequenceNumber if empty() and the smallest otherwise.
+    inline uint64_t top() { return heap_top_.load(std::memory_order_acquire); }
+    inline void push(uint64_t v) {
+      push_pop_mutex_.AssertHeld();
+      if (heap_.empty()) {
+        heap_top_.store(v, std::memory_order_release);
+      } else {
+        assert(heap_top_.load() < v);
+      }
+      heap_.push_back(v);
+    }
+    void pop(bool locked = false) {
+      if (!locked) {
+        push_pop_mutex()->Lock();
+      }
+      push_pop_mutex_.AssertHeld();
+      heap_.pop_front();
+      while (!heap_.empty() && !erased_heap_.empty() &&
+             // heap_.front() > erased_heap_.top() could happen if we have
+             // erased a non-existent entry. Ideally the user should not do
+             // that but we should be resilient against it.
+             heap_.front() >= erased_heap_.top()) {
+        if (heap_.front() == erased_heap_.top()) {
+          heap_.pop_front();
+        }
+        uint64_t erased __attribute__((__unused__));
+        erased = erased_heap_.top();
+        erased_heap_.pop();
+        // No duplicate prepare sequence numbers
+        assert(erased_heap_.empty() || erased_heap_.top() != erased);
+      }
+      while (heap_.empty() && !erased_heap_.empty()) {
+        erased_heap_.pop();
+      }
+      heap_top_.store(!heap_.empty() ? heap_.front() : kMaxSequenceNumber,
+                      std::memory_order_release);
+      if (!locked) {
+        push_pop_mutex()->Unlock();
+      }
+    }
+    // Concurrent calls need external synchronization. It is safe to call it
+    // concurrently with push and pop though.
+    void erase(uint64_t seq) {
+      if (!empty()) {
+        auto top_seq = top();
+        if (seq < top_seq) {
+          // Already popped, ignore it.
+        } else if (top_seq == seq) {
+          pop();
+#ifndef NDEBUG
+          MutexLock ml(push_pop_mutex());
+          assert(heap_.empty() || heap_.front() != seq);
+#endif
+        } else {  // top() > seq
+          // It is down in the heap; remember to pop it later
+          erased_heap_.push(seq);
+        }
+      }
+    }
+  };
+
+  void TEST_Crash() override { prepared_txns_.TEST_CRASH_ = true; }
+
+  // Get the commit entry with index indexed_seq from the commit table. It
+  // returns true if such an entry exists.
+  bool GetCommitEntry(const uint64_t indexed_seq, CommitEntry64b* entry_64b,
+                      CommitEntry* entry) const;
+
+  // Rewrite the entry with the index indexed_seq in the commit table with the
+  // commit entry <prep_seq, commit_seq>. If the rewrite results in an
+  // eviction, sets the evicted_entry and returns true.
+  bool AddCommitEntry(const uint64_t indexed_seq, const CommitEntry& new_entry,
+                      CommitEntry* evicted_entry);
+
+  // Rewrite the entry with the index indexed_seq in the commit table with the
+  // commit entry new_entry only if the existing entry matches the
+  // expected_entry. Returns false otherwise.
+  bool ExchangeCommitEntry(const uint64_t indexed_seq,
+                           CommitEntry64b& expected_entry,
+                           const CommitEntry& new_entry);
+
+  // Increase max_evicted_seq_ from the previous value prev_max to the new
+  // value. This also involves taking care of prepared txns that are not
+  // committed before new_max, as well as updating the list of live snapshots
+  // at the time of updating the max. Thread-safety: this function can be
+  // called concurrently. Concurrent invocations of this function are
+  // equivalent to a serial invocation in which the last invocation is the one
+  // with the largest new_max value.
+  void AdvanceMaxEvictedSeq(const SequenceNumber& prev_max,
+                            const SequenceNumber& new_max);
+
+  inline SequenceNumber SmallestUnCommittedSeq() {
+    // Note: We have two lists to look into, but for performance reasons they
+    // are not read atomically. Since CheckPreparedAgainstMax copies the entry
+    // to delayed_prepared_ before removing it from prepared_txns_, to ensure
+    // that a prepared entry will not be missed, we look into them in the
+    // opposite order: first read prepared_txns_ and then delayed_prepared_.
+
+    // This must be called before calling ::top. This is because a concurrent
+    // thread would call ::RemovePrepared before updating
+    // GetLatestSequenceNumber(). Reading them in the opposite order here
+    // guarantees that the ::top we read would be no higher than the ::top we
+    // would have read had we updated/read them atomically.
+    auto next_prepare = db_impl_->GetLatestSequenceNumber() + 1;
+    auto min_prepare = prepared_txns_.top();
+    // Since we always update the prepare heap from the main write queue via
+    // PreReleaseCallback, prepared_txns_.top() indicates the smallest
+    // prepared data in 2pc transactions. For non-2pc transactions that are
+    // written in two steps, we also update prepared_txns_ at the first step
+    // (via the same mechanism) so that their uncommitted data is reflected in
+    // SmallestUnCommittedSeq.
+    if (!delayed_prepared_empty_.load()) {
+      ReadLock rl(&prepared_mutex_);
+      if (!delayed_prepared_.empty()) {
+        return *delayed_prepared_.begin();
+      }
+    }
+    bool empty = min_prepare == kMaxSequenceNumber;
+    if (empty) {
+      // Since GetLatestSequenceNumber is updated
+      // after prepared_txns_ are, the value of GetLatestSequenceNumber would
+      // reflect any uncommitted data that is not added to prepared_txns_ yet.
+      // Otherwise, if there is no concurrent txn, this value simply reflects
+      // the latest value in the memtable.
+      return next_prepare;
+    } else {
+      return std::min(min_prepare, next_prepare);
+    }
+  }
+
+  // Enhance the snapshot object by recording in it the smallest uncommitted
+  // seq
+  inline void EnhanceSnapshot(SnapshotImpl* snapshot,
+                              SequenceNumber min_uncommitted) {
+    assert(snapshot);
+    assert(min_uncommitted <= snapshot->number_ + 1);
+    snapshot->min_uncommitted_ = min_uncommitted;
+  }
+
+  virtual const std::vector<SequenceNumber> GetSnapshotListFromDB(
+      SequenceNumber max);
+
+  // Will be called by the public ReleaseSnapshot method. Does the maintenance
+  // internal to WritePreparedTxnDB
+  void ReleaseSnapshotInternal(const SequenceNumber snap_seq);
+
+  // Update the list of snapshots corresponding to the soon-to-be-updated
+  // max_evicted_seq_. Thread-safety: this function can be called concurrently.
+  // Concurrent invocations of this function are equivalent to a serial
+  // invocation in which the last invocation is the one with the largest
+  // version value.
+  void UpdateSnapshots(const std::vector<SequenceNumber>& snapshots,
+                       const SequenceNumber& version);
+  // Check the new list of snapshots against the old one to see if any of
+  // the snapshots are released, and do the cleanup for the released snapshots.
+  void CleanupReleasedSnapshots(
+      const std::vector<SequenceNumber>& new_snapshots,
+      const std::vector<SequenceNumber>& old_snapshots);
+
+  // Check an evicted entry against live snapshots to see if it should be kept
+  // around or whether it can be safely discarded (and hence assumed committed
+  // for all snapshots). Thread-safety: this function can be called
+  // concurrently. If it is called concurrently with multiple UpdateSnapshots,
+  // the result is the same as checking the intersection of the snapshot list
+  // before updates with the snapshot list of all the concurrent updates.
+  void CheckAgainstSnapshots(const CommitEntry& evicted);
+
+  // Add a new entry to old_commit_map_ if prep_seq <= snapshot_seq <
+  // commit_seq. Return false if checking the next snapshot(s) is not needed.
+  // This is the case if none of the next snapshots could satisfy the
+  // condition. next_is_larger: the next snapshot will be a larger value
+  bool MaybeUpdateOldCommitMap(const uint64_t& prep_seq,
+                               const uint64_t& commit_seq,
+                               const uint64_t& snapshot_seq,
+                               const bool next_is_larger);
+
+  // A trick to increase the last visible sequence number by one and also wait
+  // for the in-flight commits to be visible.
+  void AdvanceSeqByOne();
+
+  // The list of live snapshots at the last time that max_evicted_seq_
+  // advanced. The list is stored in two data structures: in snapshot_cache_,
+  // which is efficient for concurrent reads, and in snapshots_ if the data
+  // does not fit into snapshot_cache_. The total number of snapshots in the
+  // two lists:
+  std::atomic<size_t> snapshots_total_ = {};
+  // The list is sorted in ascending order. Thread-safety for writes is
+  // provided with snapshots_mutex_, and concurrent reads are safe due to
+  // std::atomic for each entry. On the x86_64 architecture such reads compile
+  // to simple read instructions.
+  const size_t SNAPSHOT_CACHE_BITS;
+  const size_t SNAPSHOT_CACHE_SIZE;
+  std::unique_ptr<std::atomic<SequenceNumber>[]> snapshot_cache_;
+  // 2nd list for storing snapshots. The list is sorted in ascending order.
+  // Thread-safety is provided with snapshots_mutex_.
+  std::vector<SequenceNumber> snapshots_;
+  // The list of all snapshots: snapshots_ + snapshot_cache_. This list,
+  // although redundant, simplifies the CleanupReleasedSnapshots
+  // implementation. Thread-safety is provided with snapshots_mutex_.
+  std::vector<SequenceNumber> snapshots_all_;
+  // The version of the latest list of snapshots. This can be used to avoid
+  // rewriting a list that is concurrently updated with a more recent version.
+  SequenceNumber snapshots_version_ = 0;
+
+  // A heap of prepared transactions. Thread-safety is provided with
+  // prepared_mutex_.
+  PreparedHeap prepared_txns_;
+  const size_t COMMIT_CACHE_BITS;
+  const size_t COMMIT_CACHE_SIZE;
+  const CommitEntry64bFormat FORMAT;
+  // commit_cache_ must be initialized to zero to tell apart an empty index
+  // from a filled one. Thread-safety is provided with commit_cache_mutex_.
+  std::unique_ptr<std::atomic<CommitEntry64b>[]> commit_cache_;
+  // The largest evicted *commit* sequence number from the commit_cache_. If a
+  // seq is smaller than max_evicted_seq_, it might or might not be present in
+  // the commit_cache_.
+  // So the commit_cache_ must be checked first, before consulting
+  // max_evicted_seq_.
+  std::atomic<uint64_t> max_evicted_seq_ = {};
+  // Order: 1) update future_max_evicted_seq_ = new_max, 2)
+  // GetSnapshotListFromDB(new_max), 3) max_evicted_seq_ = new_max. Since
+  // GetSnapshotInternal guarantees that the snapshot seq is larger than
+  // future_max_evicted_seq_, this guarantees that any snapshot that is not
+  // larger than max has already been looked at via a
+  // GetSnapshotListFromDB(new_max).
+  std::atomic<uint64_t> future_max_evicted_seq_ = {};
+  // Advance max_evicted_seq_ by this value each time it needs an update. The
+  // larger the value, the less frequently we would advance it. We do not want
+  // it to be too large either, as that would cause stalls by doing too much
+  // maintenance work under the lock.
+  size_t INC_STEP_FOR_MAX_EVICTED = 1;
+  // A map from old snapshots (expected to be used by a few read-only txns) to
+  // the prepared sequence numbers of the evicted entries from commit_cache_
+  // that overlap with such a snapshot. These are the prepared sequence numbers
+  // that the snapshot, to which they are mapped, cannot assume to be committed
+  // just because they are no longer in the commit_cache_. The vector must be
+  // sorted after each update.
+  // Thread-safety is provided with old_commit_map_mutex_.
+  std::map<SequenceNumber, std::vector<SequenceNumber>> old_commit_map_;
+  // A set of long-running prepared transactions that are not finished by the
+  // time max_evicted_seq_ advances their sequence number. This is expected to
+  // be empty normally. Thread-safety is provided with prepared_mutex_.
+  std::set<uint64_t> delayed_prepared_;
+  // Commit of a delayed prepared: 1) update the commit cache, 2) update
+  // delayed_prepared_commits_, 3) publish the seq, 4) clean up
+  // delayed_prepared_. delayed_prepared_commits_ will help us tell apart the
+  // unprepared txns from the ones that are committed but not cleaned up yet.
+  std::unordered_map<SequenceNumber, SequenceNumber> delayed_prepared_commits_;
+  // Updated when delayed_prepared_.empty() changes. Expected to be true
+  // normally.
+  std::atomic<bool> delayed_prepared_empty_ = {true};
+  // Updated when old_commit_map_.empty() changes. Expected to be true
+  // normally.
+  std::atomic<bool> old_commit_map_empty_ = {true};
+  mutable port::RWMutex prepared_mutex_;
+  mutable port::RWMutex old_commit_map_mutex_;
+  mutable port::RWMutex commit_cache_mutex_;
+  mutable port::RWMutex snapshots_mutex_;
+  // A cache of the cf comparators
+  // Thread safety: since it is a const, it is safe to read it concurrently
+  std::shared_ptr<std::map<uint32_t, const Comparator*>> cf_map_;
+  // A cache of the cf handles
+  // Thread safety: since the handle is a read-only object, it is safe to read
+  // it concurrently
+  std::shared_ptr<std::map<uint32_t, ColumnFamilyHandle*>> handle_map_;
+  // A dummy snapshot object that refers to kMaxSequenceNumber
+  SnapshotImpl dummy_max_snapshot_;
+};
+
+class WritePreparedTxnReadCallback : public ReadCallback {
+ public:
+  WritePreparedTxnReadCallback(WritePreparedTxnDB* db, SequenceNumber snapshot)
+      : ReadCallback(snapshot),
+        db_(db),
+        backed_by_snapshot_(kBackedByDBSnapshot) {}
+  WritePreparedTxnReadCallback(WritePreparedTxnDB* db, SequenceNumber snapshot,
+                               SequenceNumber min_uncommitted,
+                               SnapshotBackup backed_by_snapshot)
+      : ReadCallback(snapshot, min_uncommitted),
+        db_(db),
+        backed_by_snapshot_(backed_by_snapshot) {
+    (void)backed_by_snapshot_;  // to silence unused private field warning
+  }
+
+  virtual ~WritePreparedTxnReadCallback() {
+    // If it is not backed by a snapshot, the caller must check validity
+    assert(valid_checked_ || backed_by_snapshot_ == kBackedByDBSnapshot);
+  }
+
+  // Will be called to see if the seq number is visible; if not, it moves on
+  // to the next seq number.
+  inline virtual bool IsVisibleFullCheck(SequenceNumber seq) override {
+    auto snapshot = max_visible_seq_;
+    bool snap_released = false;
+    auto ret =
+        db_->IsInSnapshot(seq, snapshot, min_uncommitted_, &snap_released);
+    assert(!snap_released || backed_by_snapshot_ == kUnbackedByDBSnapshot);
+    snap_released_ |= snap_released;
+    return ret;
+  }
+
+  inline bool valid() {
+    valid_checked_ = true;
+    return snap_released_ == false;
+  }
+
+  // TODO(myabandeh): override Refresh when Iterator::Refresh is supported
+ private:
+  WritePreparedTxnDB* db_;
+  // Whether max_visible_seq_ is backed by a snapshot
+  const SnapshotBackup backed_by_snapshot_;
+  bool snap_released_ = false;
+  // Safety check to ensure that the caller has checked invalid statuses
+  bool valid_checked_ = false;
+};
+
+class AddPreparedCallback : public PreReleaseCallback {
+ public:
+  AddPreparedCallback(WritePreparedTxnDB* db, DBImpl* db_impl,
+                      size_t sub_batch_cnt, bool two_write_queues,
+                      bool first_prepare_batch)
+      : db_(db),
+        db_impl_(db_impl),
+        sub_batch_cnt_(sub_batch_cnt),
+        two_write_queues_(two_write_queues),
+        first_prepare_batch_(first_prepare_batch) {
+    (void)two_write_queues_;  // to silence unused private field warning
+  }
+  virtual Status Callback(SequenceNumber prepare_seq,
+                          bool is_mem_disabled __attribute__((__unused__)),
+                          uint64_t log_number, size_t index,
+                          size_t total) override {
+    assert(index < total);
+    // To reduce the cost of lock acquisition competing with the concurrent
+    // prepare requests, lock on the first callback and unlock on the last.
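+    // For example, in a write group of three callbacks (total == 3), only
+    // index 0 takes the mutex and only index 2 (== total - 1) releases it, so
+    // the mutex is acquired once per group rather than once per callback.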
+    const bool do_lock = !two_write_queues_ || index == 0;
+    const bool do_unlock = !two_write_queues_ || index + 1 == total;
+    // Always Prepare from the main queue
+    assert(!two_write_queues_ || !is_mem_disabled);  // implies the 1st queue
+    TEST_SYNC_POINT("AddPreparedCallback::AddPrepared::begin:pause");
+    TEST_SYNC_POINT("AddPreparedCallback::AddPrepared::begin:resume");
+    if (do_lock) {
+      db_->prepared_txns_.push_pop_mutex()->Lock();
+    }
+    const bool kLocked = true;
+    for (size_t i = 0; i < sub_batch_cnt_; i++) {
+      db_->AddPrepared(prepare_seq + i, kLocked);
+    }
+    if (do_unlock) {
+      db_->prepared_txns_.push_pop_mutex()->Unlock();
+    }
+    TEST_SYNC_POINT("AddPreparedCallback::AddPrepared::end");
+    if (first_prepare_batch_) {
+      assert(log_number != 0);
+      db_impl_->logs_with_prep_tracker()->MarkLogAsContainingPrepSection(
+          log_number);
+    }
+    return Status::OK();
+  }
+
+ private:
+  WritePreparedTxnDB* db_;
+  DBImpl* db_impl_;
+  size_t sub_batch_cnt_;
+  bool two_write_queues_;
+  // It is 2PC and this is the first prepare batch. Always the case in 2PC
+  // unless it is WriteUnPrepared.
+  bool first_prepare_batch_;
+};
+
+class WritePreparedCommitEntryPreReleaseCallback : public PreReleaseCallback {
+ public:
+  // includes_data indicates that the commit also writes a non-empty
+  // CommitTimeWriteBatch to memtable, which needs to be committed separately.
+  WritePreparedCommitEntryPreReleaseCallback(
+      WritePreparedTxnDB* db, DBImpl* db_impl, SequenceNumber prep_seq,
+      size_t prep_batch_cnt, size_t data_batch_cnt = 0,
+      SequenceNumber aux_seq = kMaxSequenceNumber, size_t aux_batch_cnt = 0)
+      : db_(db),
+        db_impl_(db_impl),
+        prep_seq_(prep_seq),
+        prep_batch_cnt_(prep_batch_cnt),
+        data_batch_cnt_(data_batch_cnt),
+        includes_data_(data_batch_cnt_ > 0),
+        aux_seq_(aux_seq),
+        aux_batch_cnt_(aux_batch_cnt),
+        includes_aux_batch_(aux_batch_cnt > 0) {
+    assert((prep_batch_cnt_ > 0) != (prep_seq == kMaxSequenceNumber));  // xor
+    assert(prep_batch_cnt_ > 0 || data_batch_cnt_ > 0);
+    assert((aux_batch_cnt_ > 0) != (aux_seq == kMaxSequenceNumber));  // xor
+  }
+
+  virtual Status Callback(SequenceNumber commit_seq,
+                          bool is_mem_disabled __attribute__((__unused__)),
+                          uint64_t, size_t /*index*/,
+                          size_t /*total*/) override {
+    // Always commit from the 2nd queue
+    assert(!db_impl_->immutable_db_options().two_write_queues ||
+           is_mem_disabled);
+    assert(includes_data_ || prep_seq_ != kMaxSequenceNumber);
+    // The data batch is what accompanies the commit marker and affects the
+    // last seq in the commit batch.
+    const uint64_t last_commit_seq = LIKELY(data_batch_cnt_ <= 1)
+                                         ? commit_seq
+                                         : commit_seq + data_batch_cnt_ - 1;
+    if (prep_seq_ != kMaxSequenceNumber) {
+      for (size_t i = 0; i < prep_batch_cnt_; i++) {
+        db_->AddCommitted(prep_seq_ + i, last_commit_seq);
+      }
+    }  // else there was no prepare phase
+    if (includes_aux_batch_) {
+      for (size_t i = 0; i < aux_batch_cnt_; i++) {
+        db_->AddCommitted(aux_seq_ + i, last_commit_seq);
+      }
+    }
+    if (includes_data_) {
+      assert(data_batch_cnt_);
+      // Commit the data that is accompanied with the commit request
+      for (size_t i = 0; i < data_batch_cnt_; i++) {
+        // For the commit seq of each batch use the commit seq of the last
+        // batch. This would make debugging easier by having all the batches
+        // have the same sequence number.
+        db_->AddCommitted(commit_seq + i, last_commit_seq);
+      }
+    }
+    if (db_impl_->immutable_db_options().two_write_queues) {
+      assert(is_mem_disabled);  // implies the 2nd queue
+      // Publish the sequence number. We can do that here assuming the callback
+      // is invoked only from one write queue, which would guarantee that the
+      // published sequence numbers will be in order, i.e., once a seq is
+      // published, all the seqs prior to that are also publishable.
+      db_impl_->SetLastPublishedSequence(last_commit_seq);
+      // Note RemovePrepared should be called after publishing the seq.
+      // Otherwise the SmallestUnCommittedSeq optimization breaks.
+      if (prep_seq_ != kMaxSequenceNumber) {
+        db_->RemovePrepared(prep_seq_, prep_batch_cnt_);
+      }  // else there was no prepare phase
+      if (includes_aux_batch_) {
+        db_->RemovePrepared(aux_seq_, aux_batch_cnt_);
+      }
+    }
+    // else the SequenceNumber that is updated as part of the write already
+    // does the publishing
+    return Status::OK();
+  }
+
+ private:
+  WritePreparedTxnDB* db_;
+  DBImpl* db_impl_;
+  // kMaxSequenceNumber if there was no prepare phase
+  SequenceNumber prep_seq_;
+  size_t prep_batch_cnt_;
+  size_t data_batch_cnt_;
+  // Data here is the batch that is written with the commit marker, either
+  // because it is a commit without prepare or because the commit has a
+  // CommitTimeWriteBatch.
+  bool includes_data_;
+  // The auxiliary batch (if there is any) is a batch that is written before,
+  // but gets the same commit seq as the prepare batch or data batch. This is
+  // used in two write queues, where the CommitTimeWriteBatch becomes the aux
+  // batch and we do a separate write to actually commit everything.
+  SequenceNumber aux_seq_;
+  size_t aux_batch_cnt_;
+  bool includes_aux_batch_;
+};
+
+// For two_write_queues, commit both the aborted batch and the cleanup batch
+// and then publish the seq
+class WritePreparedRollbackPreReleaseCallback : public PreReleaseCallback {
+ public:
+  WritePreparedRollbackPreReleaseCallback(WritePreparedTxnDB* db,
+                                          DBImpl* db_impl,
+                                          SequenceNumber prep_seq,
+                                          SequenceNumber rollback_seq,
+                                          size_t prep_batch_cnt)
+      : db_(db),
+        db_impl_(db_impl),
+        prep_seq_(prep_seq),
+        rollback_seq_(rollback_seq),
+        prep_batch_cnt_(prep_batch_cnt) {
+    assert(prep_seq != kMaxSequenceNumber);
+    assert(rollback_seq != kMaxSequenceNumber);
+    assert(prep_batch_cnt_ > 0);
+  }
+
+  Status Callback(SequenceNumber commit_seq, bool is_mem_disabled, uint64_t,
+                  size_t /*index*/, size_t /*total*/) override {
+    // Always commit from the 2nd queue
+    assert(is_mem_disabled);  // implies the 2nd queue
+    assert(db_impl_->immutable_db_options().two_write_queues);
+#ifdef NDEBUG
+    (void)is_mem_disabled;
+#endif
+    const uint64_t last_commit_seq = commit_seq;
+    db_->AddCommitted(rollback_seq_, last_commit_seq);
+    for (size_t i = 0; i < prep_batch_cnt_; i++) {
+      db_->AddCommitted(prep_seq_ + i, last_commit_seq);
+    }
+    db_impl_->SetLastPublishedSequence(last_commit_seq);
+    return Status::OK();
+  }
+
+ private:
+  WritePreparedTxnDB* db_;
+  DBImpl* db_impl_;
+  SequenceNumber prep_seq_;
+  SequenceNumber rollback_seq_;
+  size_t prep_batch_cnt_;
+};
+
+// Count the number of sub-batches inside a batch. A sub-batch does not have
+// duplicate keys.
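+// For example (hypothetical batch): Put(a), Put(b), Put(a) counts as two
+// sub-batches, since the second write to "a" starts a new sub-batch.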
+struct SubBatchCounter : public WriteBatch::Handler {
+  explicit SubBatchCounter(std::map<uint32_t, const Comparator*>& comparators)
+      : comparators_(comparators), batches_(1) {}
+  std::map<uint32_t, const Comparator*>& comparators_;
+  using CFKeys = std::set<Slice, SetComparator>;
+  std::map<uint32_t, CFKeys> keys_;
+  size_t batches_;
+  size_t BatchCount() { return batches_; }
+  void AddKey(const uint32_t cf, const Slice& key);
+  void InitWithComp(const uint32_t cf);
+  Status MarkNoop(bool) override { return Status::OK(); }
+  Status MarkEndPrepare(const Slice&) override { return Status::OK(); }
+  Status MarkCommit(const Slice&) override { return Status::OK(); }
+  Status PutCF(uint32_t cf, const Slice& key, const Slice&) override {
+    AddKey(cf, key);
+    return Status::OK();
+  }
+  Status DeleteCF(uint32_t cf, const Slice& key) override {
+    AddKey(cf, key);
+    return Status::OK();
+  }
+  Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+    AddKey(cf, key);
+    return Status::OK();
+  }
+  Status MergeCF(uint32_t cf, const Slice& key, const Slice&) override {
+    AddKey(cf, key);
+    return Status::OK();
+  }
+  Status MarkBeginPrepare(bool) override { return Status::OK(); }
+  Status MarkRollback(const Slice&) override { return Status::OK(); }
+  Handler::OptionState WriteAfterCommit() const override {
+    return Handler::OptionState::kDisabled;
+  }
+};
+
+SnapshotBackup WritePreparedTxnDB::AssignMinMaxSeqs(const Snapshot* snapshot,
+                                                    SequenceNumber* min,
+                                                    SequenceNumber* max) {
+  if (snapshot != nullptr) {
+    *min =
+        static_cast_with_check<const SnapshotImpl>(snapshot)->min_uncommitted_;
+    *max = static_cast_with_check<const SnapshotImpl>(snapshot)->number_;
+    // A duplicate of the check in EnhanceSnapshot().
+    assert(*min <= *max + 1);
+    return kBackedByDBSnapshot;
+  } else {
+    *min = SmallestUnCommittedSeq();
+    *max = 0;  // to be assigned later after sv is referenced.
+    return kUnbackedByDBSnapshot;
+  }
+}
+
+bool WritePreparedTxnDB::ValidateSnapshot(
+    const SequenceNumber snap_seq, const SnapshotBackup backed_by_snapshot,
+    std::memory_order order) {
+  if (backed_by_snapshot == kBackedByDBSnapshot) {
+    return true;
+  } else {
+    SequenceNumber max = max_evicted_seq_.load(order);
+    // Validate that max has not advanced past the snapshot seq that is not
+    // backed by a real snapshot. This is a very rare case that should not
+    // happen in real workloads.
+    if (UNLIKELY(snap_seq <= max && snap_seq != 0)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc b/src/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc
new file mode 100644
index 000000000..6c8c62e0e
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc
@@ -0,0 +1,790 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/transaction_test.h"
+#include "utilities/transactions/write_unprepared_txn.h"
+#include "utilities/transactions/write_unprepared_txn_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteUnpreparedTransactionTestBase : public TransactionTestBase {
+ public:
+  WriteUnpreparedTransactionTestBase(bool use_stackable_db,
+                                     bool two_write_queue,
+                                     TxnDBWritePolicy write_policy)
+      : TransactionTestBase(use_stackable_db, two_write_queue, write_policy,
+                            kOrderedWrite) {}
+};
+
+class WriteUnpreparedTransactionTest
+    : public WriteUnpreparedTransactionTestBase,
+      virtual public ::testing::WithParamInterface<
+          std::tuple<bool, bool, TxnDBWritePolicy>> {
+ public:
+  WriteUnpreparedTransactionTest()
+      : WriteUnpreparedTransactionTestBase(std::get<0>(GetParam()),
+                                           std::get<1>(GetParam()),
+                                           std::get<2>(GetParam())) {}
+};
+
+INSTANTIATE_TEST_CASE_P(
+    WriteUnpreparedTransactionTest, WriteUnpreparedTransactionTest,
+    ::testing::Values(std::make_tuple(false, false, WRITE_UNPREPARED),
+                      std::make_tuple(false, true, WRITE_UNPREPARED)));
+
+enum StressAction { NO_SNAPSHOT, RO_SNAPSHOT, REFRESH_SNAPSHOT };
+class WriteUnpreparedStressTest : public WriteUnpreparedTransactionTestBase,
+                                  virtual public ::testing::WithParamInterface<
+                                      std::tuple<bool, StressAction>> {
+ public:
+  WriteUnpreparedStressTest()
+      : WriteUnpreparedTransactionTestBase(false, std::get<0>(GetParam()),
+                                           WRITE_UNPREPARED),
+        action_(std::get<1>(GetParam())) {}
+  StressAction action_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+    WriteUnpreparedStressTest, WriteUnpreparedStressTest,
+    ::testing::Values(std::make_tuple(false, NO_SNAPSHOT),
+                      std::make_tuple(false, RO_SNAPSHOT),
+                      std::make_tuple(false, REFRESH_SNAPSHOT),
+                      std::make_tuple(true, NO_SNAPSHOT),
+                      std::make_tuple(true, RO_SNAPSHOT),
+                      std::make_tuple(true, REFRESH_SNAPSHOT)));
+
+TEST_P(WriteUnpreparedTransactionTest, ReadYourOwnWrite) {
+  // The following test checks whether reading your own writes works for a
+  // write unprepared transaction when there are uncommitted values written
+  // into the DB.
+  auto verify_state = [](Iterator* iter, const std::string& key,
+                         const std::string& value) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_OK(iter->status());
+    ASSERT_EQ(key, iter->key().ToString());
+    ASSERT_EQ(value, iter->value().ToString());
+  };
+
+  // Test always reseeking vs never reseeking.
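+  // (Roughly speaking, max_sequential_skip_in_iterations == 0 forces the
+  // iterator to reseek on every sequential skip, while
+  // std::numeric_limits<int>::max() effectively disables reseeking.)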
+  for (uint64_t max_skip : {0, std::numeric_limits<int>::max()}) {
+    options.max_sequential_skip_in_iterations = max_skip;
+    options.disable_auto_compactions = true;
+    ASSERT_OK(ReOpen());
+
+    TransactionOptions txn_options;
+    WriteOptions woptions;
+    ReadOptions roptions;
+
+    ASSERT_OK(db->Put(woptions, "a", ""));
+    ASSERT_OK(db->Put(woptions, "b", ""));
+
+    Transaction* txn = db->BeginTransaction(woptions, txn_options);
+    WriteUnpreparedTxn* wup_txn = dynamic_cast<WriteUnpreparedTxn*>(txn);
+    txn->SetSnapshot();
+
+    for (int i = 0; i < 5; i++) {
+      std::string stored_value = "v" + std::to_string(i);
+      ASSERT_OK(txn->Put("a", stored_value));
+      ASSERT_OK(txn->Put("b", stored_value));
+      ASSERT_OK(wup_txn->FlushWriteBatchToDB(false));
+
+      // Test Get()
+      std::string value;
+      ASSERT_OK(txn->Get(roptions, "a", &value));
+      ASSERT_EQ(value, stored_value);
+      ASSERT_OK(txn->Get(roptions, "b", &value));
+      ASSERT_EQ(value, stored_value);
+
+      // Test Next()
+      auto iter = txn->GetIterator(roptions);
+      iter->Seek("a");
+      verify_state(iter, "a", stored_value);
+
+      iter->Next();
+      verify_state(iter, "b", stored_value);
+
+      iter->SeekToFirst();
+      verify_state(iter, "a", stored_value);
+
+      iter->Next();
+      verify_state(iter, "b", stored_value);
+
+      delete iter;
+
+      // Test Prev()
+      iter = txn->GetIterator(roptions);
+      iter->SeekForPrev("b");
+      verify_state(iter, "b", stored_value);
+
+      iter->Prev();
+      verify_state(iter, "a", stored_value);
+
+      iter->SeekToLast();
+      verify_state(iter, "b", stored_value);
+
+      iter->Prev();
+      verify_state(iter, "a", stored_value);
+
+      delete iter;
+    }
+
+    delete txn;
+  }
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+TEST_P(WriteUnpreparedStressTest, ReadYourOwnWriteStress) {
+  // This is a stress test where different threads are writing random keys,
+  // and then, before committing or aborting the transaction, it validates
+  // that it can read the keys it wrote, and that the keys it did not write
+  // respect the snapshot. To avoid row lock contention (and simply stressing
+  // the locking system), each thread is mostly only writing to its own set of
+  // keys.
+  const uint32_t kNumIter = 1000;
+  const uint32_t kNumThreads = 10;
+  const uint32_t kNumKeys = 5;
+
+  // Test with
+  // 1. no snapshots set
+  // 2. snapshot set on ReadOptions
+  // 3. snapshot set, and refreshing after every write.
+  StressAction a = action_;
+  WriteOptions write_options;
+  txn_db_options.transaction_lock_timeout = -1;
+  options.disable_auto_compactions = true;
+  ASSERT_OK(ReOpen());
+
+  std::vector<std::string> keys;
+  for (uint32_t k = 0; k < kNumKeys * kNumThreads; k++) {
+    keys.push_back("k" + std::to_string(k));
+  }
+  RandomShuffle(keys.begin(), keys.end());
+
+  // This counter will act as a "sequence number" to help us validate
+  // visibility logic with snapshots. If we had direct access to the seqno of
+  // snapshots and key/values, then we would directly compare those instead.
+  std::atomic<int64_t> counter(0);
+
+  std::function<void(uint32_t)> stress_thread = [&](int id) {
+    size_t tid = std::hash<std::thread::id>()(std::this_thread::get_id());
+    Random64 rnd(static_cast<uint32_t>(tid));
+
+    Transaction* txn;
+    TransactionOptions txn_options;
+    // batch_size of 1 causes writes to the DB for every marker.
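+    // (WriteUnprepared checks write_batch_flush_threshold before appending to
+    // the write batch, so with a threshold of 1 every write after the first
+    // triggers a flush of the batch to the DB; see the RecoveryTest comments
+    // later in this file.)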
+    txn_options.write_batch_flush_threshold = 1;
+    ReadOptions read_options;
+
+    for (uint32_t i = 0; i < kNumIter; i++) {
+      std::set<std::string> owned_keys(keys.begin() + id * kNumKeys,
+                                       keys.begin() + (id + 1) * kNumKeys);
+      // Add unowned keys to make the workload more interesting, but this
+      // increases row lock contention, so just do it sometimes.
+      if (rnd.OneIn(2)) {
+        owned_keys.insert(keys[rnd.Uniform(kNumKeys * kNumThreads)]);
+      }
+
+      txn = db->BeginTransaction(write_options, txn_options);
+      ASSERT_OK(txn->SetName(std::to_string(id)));
+      txn->SetSnapshot();
+      if (a >= RO_SNAPSHOT) {
+        read_options.snapshot = txn->GetSnapshot();
+        ASSERT_TRUE(read_options.snapshot != nullptr);
+      }
+
+      uint64_t buf[2];
+      buf[0] = id;
+
+      // When scanning through the database, make sure that all unprepared
+      // keys have value >= snapshot and all other keys have value < snapshot.
+      int64_t snapshot_num = counter.fetch_add(1);
+
+      Status s;
+      for (const auto& key : owned_keys) {
+        buf[1] = counter.fetch_add(1);
+        s = txn->Put(key, Slice((const char*)buf, sizeof(buf)));
+        if (!s.ok()) {
+          break;
+        }
+        if (a == REFRESH_SNAPSHOT) {
+          txn->SetSnapshot();
+          read_options.snapshot = txn->GetSnapshot();
+          snapshot_num = counter.fetch_add(1);
+        }
+      }
+
+      // Failure is possible due to snapshot validation. In this case,
+      // rollback and move onto the next iteration.
+      if (!s.ok()) {
+        ASSERT_TRUE(s.IsBusy());
+        ASSERT_OK(txn->Rollback());
+        delete txn;
+        continue;
+      }
+
+      auto verify_key = [&owned_keys, &a, &id, &snapshot_num](
+                            const std::string& key, const std::string& value) {
+        if (owned_keys.count(key) > 0) {
+          ASSERT_EQ(value.size(), 16);
+
+          // Since this key is part of owned_keys, it must have been written
+          // unprepared by the transaction identified by 'id'
+          ASSERT_EQ(((int64_t*)value.c_str())[0], id);
+          if (a == REFRESH_SNAPSHOT) {
+            // If refresh snapshot is true, then the snapshot is refreshed
+            // after every Put(), meaning that the current snapshot in
+            // snapshot_num must be greater than the "seqno" of any keys
+            // written by the current transaction.
+            ASSERT_LT(((int64_t*)value.c_str())[1], snapshot_num);
+          } else {
+            // If refresh snapshot is not on, then the snapshot was taken at
+            // the beginning of the transaction, meaning all writes must come
+            // after snapshot_num
+            ASSERT_GT(((int64_t*)value.c_str())[1], snapshot_num);
+          }
+        } else if (a >= RO_SNAPSHOT) {
+          // If this is not an unprepared key, just assert that the key
+          // "seqno" is smaller than the snapshot seqno.
+          ASSERT_EQ(value.size(), 16);
+          ASSERT_LT(((int64_t*)value.c_str())[1], snapshot_num);
+        }
+      };
+
+      // Validate Get()/Next()/Prev(). Do only one of them to save time, and
+      // reduce lock contention.
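+      // (rnd.Uniform(3) picks one of the three validation modes at random for
+      // each iteration.)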
+      switch (rnd.Uniform(3)) {
+        case 0:  // Validate Get()
+        {
+          for (const auto& key : keys) {
+            std::string value;
+            s = txn->Get(read_options, Slice(key), &value);
+            if (!s.ok()) {
+              ASSERT_TRUE(s.IsNotFound());
+              ASSERT_EQ(owned_keys.count(key), 0);
+            } else {
+              verify_key(key, value);
+            }
+          }
+          break;
+        }
+        case 1:  // Validate Next()
+        {
+          Iterator* iter = txn->GetIterator(read_options);
+          ASSERT_OK(iter->status());
+          for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+            verify_key(iter->key().ToString(), iter->value().ToString());
+          }
+          ASSERT_OK(iter->status());
+          delete iter;
+          break;
+        }
+        case 2:  // Validate Prev()
+        {
+          Iterator* iter = txn->GetIterator(read_options);
+          ASSERT_OK(iter->status());
+          for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+            verify_key(iter->key().ToString(), iter->value().ToString());
+          }
+          ASSERT_OK(iter->status());
+          delete iter;
+          break;
+        }
+        default:
+          FAIL();
+      }
+
+      if (rnd.OneIn(2)) {
+        ASSERT_OK(txn->Commit());
+      } else {
+        ASSERT_OK(txn->Rollback());
+      }
+      delete txn;
+    }
+  };
+
+  std::vector<port::Thread> threads;
+  for (uint32_t i = 0; i < kNumThreads; i++) {
+    threads.emplace_back(stress_thread, i);
+  }
+
+  for (auto& t : threads) {
+    t.join();
+  }
+}
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+// This tests how write unprepared behaves during recovery when the DB crashes
+// after a transaction has either been unprepared or prepared, and tests
+// whether the changes are correctly applied for prepared transactions when we
+// decide to rollback/commit.
+TEST_P(WriteUnpreparedTransactionTest, RecoveryTest) {
+  WriteOptions write_options;
+  write_options.disableWAL = false;
+  TransactionOptions txn_options;
+  std::vector<Transaction*> prepared_trans;
+  WriteUnpreparedTxnDB* wup_db;
+  options.disable_auto_compactions = true;
+
+  enum Action { UNPREPARED, ROLLBACK, COMMIT };
+
+  // batch_size of 1 causes writes to the DB for every marker.
+  for (size_t batch_size : {1, 1000000}) {
+    txn_options.write_batch_flush_threshold = batch_size;
+    for (bool empty : {true, false}) {
+      for (Action a : {UNPREPARED, ROLLBACK, COMMIT}) {
+        for (int num_batches = 1; num_batches < 10; num_batches++) {
+          // Reset database.
+          prepared_trans.clear();
+          ASSERT_OK(ReOpen());
+          wup_db = dynamic_cast<WriteUnpreparedTxnDB*>(db);
+          if (!empty) {
+            for (int i = 0; i < num_batches; i++) {
+              ASSERT_OK(db->Put(WriteOptions(), "k" + std::to_string(i),
+                                "before value" + std::to_string(i)));
+            }
+          }
+
+          // Write num_batches unprepared batches.
+          Transaction* txn = db->BeginTransaction(write_options, txn_options);
+          WriteUnpreparedTxn* wup_txn = dynamic_cast<WriteUnpreparedTxn*>(txn);
+          ASSERT_OK(txn->SetName("xid"));
+          for (int i = 0; i < num_batches; i++) {
+            ASSERT_OK(
+                txn->Put("k" + std::to_string(i), "value" + std::to_string(i)));
+            if (txn_options.write_batch_flush_threshold == 1) {
+              // WriteUnprepared will check write_batch_flush_threshold and
+              // possibly flush before appending to the write batch. No flush
+              // will happen at the first write because the batch is still
+              // empty, so after k puts, there should be k-1 flushed batches.
+              ASSERT_EQ(wup_txn->GetUnpreparedSequenceNumbers().size(), i);
+            } else {
+              ASSERT_EQ(wup_txn->GetUnpreparedSequenceNumbers().size(), 0);
+            }
+          }
+          if (a == UNPREPARED) {
+            // This is done to prevent the destructor from rolling back the
+            // transaction for us, since we want to pretend we crashed and
+            // test that recovery does the rollback.
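+            // (Clearing unprep_seqs_ makes the destructor believe there are
+            // no outstanding unprepared batches left to roll back.)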
+            wup_txn->unprep_seqs_.clear();
+          } else {
+            ASSERT_OK(txn->Prepare());
+          }
+          delete txn;
+
+          // Crash and run recovery code paths.
+          ASSERT_OK(wup_db->db_impl_->FlushWAL(true));
+          wup_db->TEST_Crash();
+          ASSERT_OK(ReOpenNoDelete());
+          assert(db != nullptr);
+
+          db->GetAllPreparedTransactions(&prepared_trans);
+          ASSERT_EQ(prepared_trans.size(), a == UNPREPARED ? 0 : 1);
+          if (a == ROLLBACK) {
+            ASSERT_OK(prepared_trans[0]->Rollback());
+            delete prepared_trans[0];
+          } else if (a == COMMIT) {
+            ASSERT_OK(prepared_trans[0]->Commit());
+            delete prepared_trans[0];
+          }
+
+          Iterator* iter = db->NewIterator(ReadOptions());
+          ASSERT_OK(iter->status());
+          iter->SeekToFirst();
+          // Check that the DB has the expected values: the committed values
+          // after a commit, and the "before" values otherwise.
+          if (!empty || a == COMMIT) {
+            for (int i = 0; i < num_batches; i++) {
+              ASSERT_TRUE(iter->Valid());
+              ASSERT_EQ(iter->key().ToString(), "k" + std::to_string(i));
+              if (a == COMMIT) {
+                ASSERT_EQ(iter->value().ToString(),
+                          "value" + std::to_string(i));
+              } else {
+                ASSERT_EQ(iter->value().ToString(),
+                          "before value" + std::to_string(i));
+              }
+              iter->Next();
+            }
+          }
+          ASSERT_FALSE(iter->Valid());
+          ASSERT_OK(iter->status());
+          delete iter;
+        }
+      }
+    }
+  }
+}
+
+// Basic test to see that the unprepared batch gets written to the DB when the
+// batch size is exceeded. It also does some basic checks to see if
+// commit/rollback works as expected for write unprepared.
+TEST_P(WriteUnpreparedTransactionTest, UnpreparedBatch) {
+  WriteOptions write_options;
+  TransactionOptions txn_options;
+  const int kNumKeys = 10;
+
+  // batch_size of 1 causes writes to the DB for every marker.
+  for (size_t batch_size : {1, 1000000}) {
+    txn_options.write_batch_flush_threshold = batch_size;
+    for (bool prepare : {false, true}) {
+      for (bool commit : {false, true}) {
+        ASSERT_OK(ReOpen());
+        Transaction* txn = db->BeginTransaction(write_options, txn_options);
+        WriteUnpreparedTxn* wup_txn = dynamic_cast<WriteUnpreparedTxn*>(txn);
+        ASSERT_OK(txn->SetName("xid"));
+
+        for (int i = 0; i < kNumKeys; i++) {
+          ASSERT_OK(txn->Put("k" + std::to_string(i), "v" + std::to_string(i)));
+          if (txn_options.write_batch_flush_threshold == 1) {
+            // WriteUnprepared will check write_batch_flush_threshold and
+            // possibly flush before appending to the write batch. No flush
+            // will happen at the first write because the batch is still empty,
+            // so after k puts, there should be k-1 flushed batches.
+            ASSERT_EQ(wup_txn->GetUnpreparedSequenceNumbers().size(), i);
+          } else {
+            ASSERT_EQ(wup_txn->GetUnpreparedSequenceNumbers().size(), 0);
+          }
+        }
+
+        if (prepare) {
+          ASSERT_OK(txn->Prepare());
+        }
+
+        Iterator* iter = db->NewIterator(ReadOptions());
+        ASSERT_OK(iter->status());
+        iter->SeekToFirst();
+        // Unprepared batches should not be visible to other readers.
+        ASSERT_FALSE(iter->Valid());
+        ASSERT_OK(iter->status());
+        delete iter;
+
+        if (commit) {
+          ASSERT_OK(txn->Commit());
+        } else {
+          ASSERT_OK(txn->Rollback());
+        }
+        delete txn;
+
+        iter = db->NewIterator(ReadOptions());
+        ASSERT_OK(iter->status());
+        iter->SeekToFirst();
+
+        for (int i = 0; i < (commit ? kNumKeys : 0); i++) {
+          ASSERT_TRUE(iter->Valid());
+          ASSERT_EQ(iter->key().ToString(), "k" + std::to_string(i));
+          ASSERT_EQ(iter->value().ToString(), "v" + std::to_string(i));
+          iter->Next();
+        }
+        ASSERT_FALSE(iter->Valid());
+        ASSERT_OK(iter->status());
+        delete iter;
+      }
+    }
+  }
+}
+
+// Test whether logs containing unprepared/prepared batches are kept, even
+// after the memtable finishes flushing, and whether they are removed when the
+// transaction commits/aborts.
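+// (The expectation is that TEST_FindMinLogContainingOutstandingPrep() keeps
+// reporting the oldest log that still contains an outstanding prep section
+// until the owning transaction finishes.)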
+//
+// TODO(lth): Merge with TransactionTest/TwoPhaseLogRollingTest tests.
+TEST_P(WriteUnpreparedTransactionTest, MarkLogWithPrepSection) {
+  WriteOptions write_options;
+  TransactionOptions txn_options;
+  // batch_size of 1 causes writes to the DB for every marker.
+  txn_options.write_batch_flush_threshold = 1;
+  const int kNumKeys = 10;
+
+  WriteOptions wopts;
+  wopts.sync = true;
+
+  for (bool prepare : {false, true}) {
+    for (bool commit : {false, true}) {
+      ASSERT_OK(ReOpen());
+      auto wup_db = dynamic_cast<WriteUnpreparedTxnDB*>(db);
+      auto db_impl = wup_db->db_impl_;
+
+      Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+      ASSERT_OK(txn1->SetName("xid1"));
+
+      Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+      ASSERT_OK(txn2->SetName("xid2"));
+
+      // Spread this transaction across multiple log files.
+      for (int i = 0; i < kNumKeys; i++) {
+        ASSERT_OK(txn1->Put("k1" + std::to_string(i), "v" + std::to_string(i)));
+        if (i >= kNumKeys / 2) {
+          ASSERT_OK(
+              txn2->Put("k2" + std::to_string(i), "v" + std::to_string(i)));
+        }
+
+        if (i > 0) {
+          ASSERT_OK(db_impl->TEST_SwitchWAL());
+        }
+      }
+
+      ASSERT_GT(txn1->GetLogNumber(), 0);
+      ASSERT_GT(txn2->GetLogNumber(), 0);
+
+      ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(),
+                txn1->GetLogNumber());
+      ASSERT_GT(db_impl->TEST_LogfileNumber(), txn1->GetLogNumber());
+
+      if (prepare) {
+        ASSERT_OK(txn1->Prepare());
+        ASSERT_OK(txn2->Prepare());
+      }
+
+      ASSERT_GE(db_impl->TEST_LogfileNumber(), txn1->GetLogNumber());
+      ASSERT_GE(db_impl->TEST_LogfileNumber(), txn2->GetLogNumber());
+
+      ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(),
+                txn1->GetLogNumber());
+      if (commit) {
+        ASSERT_OK(txn1->Commit());
+      } else {
+        ASSERT_OK(txn1->Rollback());
+      }
+
+      ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(),
+                txn2->GetLogNumber());
+
+      if (commit) {
+        ASSERT_OK(txn2->Commit());
+      } else {
+        ASSERT_OK(txn2->Rollback());
+      }
+
+      ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
+
+      delete txn1;
+      delete txn2;
+    }
+  }
+}
+
+TEST_P(WriteUnpreparedTransactionTest, NoSnapshotWrite) {
+  WriteOptions woptions;
+  TransactionOptions txn_options;
+  txn_options.write_batch_flush_threshold = 1;
+
+  Transaction* txn = db->BeginTransaction(woptions, txn_options);
+
+  // Do some writes with no snapshot
+  ASSERT_OK(txn->Put("a", "a"));
+  ASSERT_OK(txn->Put("b", "b"));
+  ASSERT_OK(txn->Put("c", "c"));
+
+  // Test that it is still possible to create iterators after writes with no
+  // snapshot, if the iterator snapshot is fresh enough.
+  ReadOptions roptions;
+  auto iter = txn->GetIterator(roptions);
+  ASSERT_OK(iter->status());
+  int keys = 0;
+  for (iter->SeekToLast(); iter->Valid(); iter->Prev(), keys++) {
+    ASSERT_OK(iter->status());
+    ASSERT_EQ(iter->key().ToString(), iter->value().ToString());
+  }
+  ASSERT_EQ(keys, 3);
+  ASSERT_OK(iter->status());
+
+  delete iter;
+  delete txn;
+}
+
+// Test whether writing to a transaction while iterating is supported.
+TEST_P(WriteUnpreparedTransactionTest, IterateAndWrite) {
+  WriteOptions woptions;
+  TransactionOptions txn_options;
+  txn_options.write_batch_flush_threshold = 1;
+
+  enum Action { DO_DELETE, DO_UPDATE };
+
+  for (Action a : {DO_DELETE, DO_UPDATE}) {
+    for (int i = 0; i < 100; i++) {
+      ASSERT_OK(db->Put(woptions, std::to_string(i), std::to_string(i)));
+    }
+
+    Transaction* txn = db->BeginTransaction(woptions, txn_options);
+    // write_batch_ now contains 1 key.
+ ASSERT_OK(txn->Put("9", "a")); + + ReadOptions roptions; + auto iter = txn->GetIterator(roptions); + ASSERT_OK(iter->status()); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + if (iter->key() == "9") { + ASSERT_EQ(iter->value().ToString(), "a"); + } else { + ASSERT_EQ(iter->key().ToString(), iter->value().ToString()); + } + + if (a == DO_DELETE) { + ASSERT_OK(txn->Delete(iter->key())); + } else { + ASSERT_OK(txn->Put(iter->key(), "b")); + } + } + ASSERT_OK(iter->status()); + + delete iter; + ASSERT_OK(txn->Commit()); + + iter = db->NewIterator(roptions); + ASSERT_OK(iter->status()); + if (a == DO_DELETE) { + // Check that db is empty. + iter->SeekToFirst(); + ASSERT_FALSE(iter->Valid()); + } else { + int keys = 0; + // Check that all values are updated to b. + for (iter->SeekToFirst(); iter->Valid(); iter->Next(), keys++) { + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->value().ToString(), "b"); + } + ASSERT_EQ(keys, 100); + } + ASSERT_OK(iter->status()); + + delete iter; + delete txn; + } +} + +// Test that using an iterator after transaction clear is not supported +TEST_P(WriteUnpreparedTransactionTest, IterateAfterClear) { + WriteOptions woptions; + TransactionOptions txn_options; + txn_options.write_batch_flush_threshold = 1; + + enum Action { kCommit, kRollback }; + + for (Action a : {kCommit, kRollback}) { + for (int i = 0; i < 100; i++) { + ASSERT_OK(db->Put(woptions, std::to_string(i), std::to_string(i))); + } + + Transaction* txn = db->BeginTransaction(woptions, txn_options); + ASSERT_OK(txn->Put("9", "a")); + + ReadOptions roptions; + auto iter1 = txn->GetIterator(roptions); + auto iter2 = txn->GetIterator(roptions); + iter1->SeekToFirst(); + iter2->Seek("9"); + + // Check that iterators are valid before transaction finishes. + ASSERT_TRUE(iter1->Valid()); + ASSERT_TRUE(iter2->Valid()); + ASSERT_OK(iter1->status()); + ASSERT_OK(iter2->status()); + + if (a == kCommit) { + ASSERT_OK(txn->Commit()); + } else { + ASSERT_OK(txn->Rollback()); + } + + // Check that iterators are invalidated after transaction finishes. 
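+    // (The invalidation is performed by WriteUnpreparedTxn::Clear(), which
+    // marks each active iterator with Status::InvalidArgument once the
+    // transaction commits or rolls back.)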
+ ASSERT_FALSE(iter1->Valid()); + ASSERT_FALSE(iter2->Valid()); + ASSERT_TRUE(iter1->status().IsInvalidArgument()); + ASSERT_TRUE(iter2->status().IsInvalidArgument()); + + delete iter1; + delete iter2; + delete txn; + } +} + +TEST_P(WriteUnpreparedTransactionTest, SavePoint) { + WriteOptions woptions; + TransactionOptions txn_options; + txn_options.write_batch_flush_threshold = 1; + + Transaction* txn = db->BeginTransaction(woptions, txn_options); + txn->SetSavePoint(); + ASSERT_OK(txn->Put("a", "a")); + ASSERT_OK(txn->Put("b", "b")); + ASSERT_OK(txn->Commit()); + + ReadOptions roptions; + std::string value; + ASSERT_OK(txn->Get(roptions, "a", &value)); + ASSERT_EQ(value, "a"); + ASSERT_OK(txn->Get(roptions, "b", &value)); + ASSERT_EQ(value, "b"); + delete txn; +} + +TEST_P(WriteUnpreparedTransactionTest, UntrackedKeys) { + WriteOptions woptions; + TransactionOptions txn_options; + txn_options.write_batch_flush_threshold = 1; + + Transaction* txn = db->BeginTransaction(woptions, txn_options); + auto wb = txn->GetWriteBatch()->GetWriteBatch(); + ASSERT_OK(txn->Put("a", "a")); + ASSERT_OK(wb->Put("a_untrack", "a_untrack")); + txn->SetSavePoint(); + ASSERT_OK(txn->Put("b", "b")); + ASSERT_OK(txn->Put("b_untrack", "b_untrack")); + + ReadOptions roptions; + std::string value; + ASSERT_OK(txn->Get(roptions, "a", &value)); + ASSERT_EQ(value, "a"); + ASSERT_OK(txn->Get(roptions, "a_untrack", &value)); + ASSERT_EQ(value, "a_untrack"); + ASSERT_OK(txn->Get(roptions, "b", &value)); + ASSERT_EQ(value, "b"); + ASSERT_OK(txn->Get(roptions, "b_untrack", &value)); + ASSERT_EQ(value, "b_untrack"); + + // b and b_untrack should be rolled back. + ASSERT_OK(txn->RollbackToSavePoint()); + ASSERT_OK(txn->Get(roptions, "a", &value)); + ASSERT_EQ(value, "a"); + ASSERT_OK(txn->Get(roptions, "a_untrack", &value)); + ASSERT_EQ(value, "a_untrack"); + auto s = txn->Get(roptions, "b", &value); + ASSERT_TRUE(s.IsNotFound()); + s = txn->Get(roptions, "b_untrack", &value); + ASSERT_TRUE(s.IsNotFound()); + + // Everything should be rolled back. + ASSERT_OK(txn->Rollback()); + s = txn->Get(roptions, "a", &value); + ASSERT_TRUE(s.IsNotFound()); + s = txn->Get(roptions, "a_untrack", &value); + ASSERT_TRUE(s.IsNotFound()); + s = txn->Get(roptions, "b", &value); + ASSERT_TRUE(s.IsNotFound()); + s = txn->Get(roptions, "b_untrack", &value); + ASSERT_TRUE(s.IsNotFound()); + + delete txn; +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include <stdio.h> + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "SKIPPED as Transactions are not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/write_unprepared_txn.cc b/src/rocksdb/utilities/transactions/write_unprepared_txn.cc new file mode 100644 index 000000000..6e04d3344 --- /dev/null +++ b/src/rocksdb/utilities/transactions/write_unprepared_txn.cc @@ -0,0 +1,1053 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
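+//
+// This file implements WriteUnpreparedTxn, the transaction type used by
+// WriteUnpreparedTxnDB. A minimal usage sketch (hypothetical names and
+// threshold value), assuming a TransactionDB opened with
+// write_policy = WRITE_UNPREPARED:
+//
+//   TransactionOptions txn_options;
+//   txn_options.write_batch_flush_threshold = 1 << 20;  // bytes
+//   Transaction* txn = txn_db->BeginTransaction(WriteOptions(), txn_options);
+//   txn->SetName("xid1");  // required before the first unprepared flush
+//   txn->Put("k", "v");    // may flush an unprepared batch to the DB
+//   txn->Prepare();        // optional 2PC prepare
+//   txn->Commit();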
+ +#ifndef ROCKSDB_LITE + +#include "utilities/transactions/write_unprepared_txn.h" + +#include "db/db_impl/db_impl.h" +#include "util/cast_util.h" +#include "utilities/transactions/write_unprepared_txn_db.h" +#include "utilities/write_batch_with_index/write_batch_with_index_internal.h" + +namespace ROCKSDB_NAMESPACE { + +bool WriteUnpreparedTxnReadCallback::IsVisibleFullCheck(SequenceNumber seq) { + // Since unprep_seqs maps prep_seq => prepare_batch_cnt, to check if seq is + // in unprep_seqs, we have to check if seq is equal to prep_seq or any of + // the prepare_batch_cnt seq nums after it. + // + // TODO(lth): Can be optimized with std::lower_bound if unprep_seqs is + // large. + for (const auto& it : unprep_seqs_) { + if (it.first <= seq && seq < it.first + it.second) { + return true; + } + } + + bool snap_released = false; + auto ret = + db_->IsInSnapshot(seq, wup_snapshot_, min_uncommitted_, &snap_released); + assert(!snap_released || backed_by_snapshot_ == kUnbackedByDBSnapshot); + snap_released_ |= snap_released; + return ret; +} + +WriteUnpreparedTxn::WriteUnpreparedTxn(WriteUnpreparedTxnDB* txn_db, + const WriteOptions& write_options, + const TransactionOptions& txn_options) + : WritePreparedTxn(txn_db, write_options, txn_options), + wupt_db_(txn_db), + last_log_number_(0), + recovered_txn_(false), + largest_validated_seq_(0) { + if (txn_options.write_batch_flush_threshold < 0) { + write_batch_flush_threshold_ = + txn_db_impl_->GetTxnDBOptions().default_write_batch_flush_threshold; + } else { + write_batch_flush_threshold_ = txn_options.write_batch_flush_threshold; + } +} + +WriteUnpreparedTxn::~WriteUnpreparedTxn() { + if (!unprep_seqs_.empty()) { + assert(log_number_ > 0); + assert(GetId() > 0); + assert(!name_.empty()); + + // We should rollback regardless of GetState, but some unit tests that + // test crash recovery run the destructor assuming that rollback does not + // happen, so that rollback during recovery can be exercised. + if (GetState() == STARTED || GetState() == LOCKS_STOLEN) { + auto s = RollbackInternal(); + assert(s.ok()); + if (!s.ok()) { + ROCKS_LOG_FATAL( + wupt_db_->info_log_, + "Rollback of WriteUnprepared transaction failed in destructor: %s", + s.ToString().c_str()); + } + dbimpl_->logs_with_prep_tracker()->MarkLogAsHavingPrepSectionFlushed( + log_number_); + } + } + + // Clear the tracked locks so that ~PessimisticTransaction does not + // try to unlock keys for recovered transactions. 
+ if (recovered_txn_) { + tracked_locks_->Clear(); + } +} + +void WriteUnpreparedTxn::Initialize(const TransactionOptions& txn_options) { + PessimisticTransaction::Initialize(txn_options); + if (txn_options.write_batch_flush_threshold < 0) { + write_batch_flush_threshold_ = + txn_db_impl_->GetTxnDBOptions().default_write_batch_flush_threshold; + } else { + write_batch_flush_threshold_ = txn_options.write_batch_flush_threshold; + } + + unprep_seqs_.clear(); + flushed_save_points_.reset(nullptr); + unflushed_save_points_.reset(nullptr); + recovered_txn_ = false; + largest_validated_seq_ = 0; + assert(active_iterators_.empty()); + active_iterators_.clear(); + untracked_keys_.clear(); +} + +Status WriteUnpreparedTxn::HandleWrite(std::function<Status()> do_write) { + Status s; + if (active_iterators_.empty()) { + s = MaybeFlushWriteBatchToDB(); + if (!s.ok()) { + return s; + } + } + s = do_write(); + if (s.ok()) { + if (snapshot_) { + largest_validated_seq_ = + std::max(largest_validated_seq_, snapshot_->GetSequenceNumber()); + } else { + // TODO(lth): We should use the same number as tracked_at_seq in TryLock, + // because what is actually being tracked is the sequence number at which + // this key was locked at. + largest_validated_seq_ = db_impl_->GetLastPublishedSequence(); + } + } + return s; +} + +Status WriteUnpreparedTxn::Put(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value, + const bool assume_tracked) { + return HandleWrite([&]() { + return TransactionBaseImpl::Put(column_family, key, value, assume_tracked); + }); +} + +Status WriteUnpreparedTxn::Put(ColumnFamilyHandle* column_family, + const SliceParts& key, const SliceParts& value, + const bool assume_tracked) { + return HandleWrite([&]() { + return TransactionBaseImpl::Put(column_family, key, value, assume_tracked); + }); +} + +Status WriteUnpreparedTxn::Merge(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value, + const bool assume_tracked) { + return HandleWrite([&]() { + return TransactionBaseImpl::Merge(column_family, key, value, + assume_tracked); + }); +} + +Status WriteUnpreparedTxn::Delete(ColumnFamilyHandle* column_family, + const Slice& key, const bool assume_tracked) { + return HandleWrite([&]() { + return TransactionBaseImpl::Delete(column_family, key, assume_tracked); + }); +} + +Status WriteUnpreparedTxn::Delete(ColumnFamilyHandle* column_family, + const SliceParts& key, + const bool assume_tracked) { + return HandleWrite([&]() { + return TransactionBaseImpl::Delete(column_family, key, assume_tracked); + }); +} + +Status WriteUnpreparedTxn::SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key, + const bool assume_tracked) { + return HandleWrite([&]() { + return TransactionBaseImpl::SingleDelete(column_family, key, + assume_tracked); + }); +} + +Status WriteUnpreparedTxn::SingleDelete(ColumnFamilyHandle* column_family, + const SliceParts& key, + const bool assume_tracked) { + return HandleWrite([&]() { + return TransactionBaseImpl::SingleDelete(column_family, key, + assume_tracked); + }); +} + +// WriteUnpreparedTxn::RebuildFromWriteBatch is only called on recovery. For +// WriteUnprepared, the write batches have already been written into the +// database during WAL replay, so all we have to do is just to "retrack" the key +// so that rollbacks are possible. +// +// Calling TryLock instead of TrackKey is also possible, but as an optimization, +// recovered transactions do not hold locks on their keys. 
This follows the +// implementation in PessimisticTransactionDB::Initialize where we set +// skip_concurrency_control to true. +Status WriteUnpreparedTxn::RebuildFromWriteBatch(WriteBatch* wb) { + struct TrackKeyHandler : public WriteBatch::Handler { + WriteUnpreparedTxn* txn_; + bool rollback_merge_operands_; + + TrackKeyHandler(WriteUnpreparedTxn* txn, bool rollback_merge_operands) + : txn_(txn), rollback_merge_operands_(rollback_merge_operands) {} + + Status PutCF(uint32_t cf, const Slice& key, const Slice&) override { + txn_->TrackKey(cf, key.ToString(), kMaxSequenceNumber, + false /* read_only */, true /* exclusive */); + return Status::OK(); + } + + Status DeleteCF(uint32_t cf, const Slice& key) override { + txn_->TrackKey(cf, key.ToString(), kMaxSequenceNumber, + false /* read_only */, true /* exclusive */); + return Status::OK(); + } + + Status SingleDeleteCF(uint32_t cf, const Slice& key) override { + txn_->TrackKey(cf, key.ToString(), kMaxSequenceNumber, + false /* read_only */, true /* exclusive */); + return Status::OK(); + } + + Status MergeCF(uint32_t cf, const Slice& key, const Slice&) override { + if (rollback_merge_operands_) { + txn_->TrackKey(cf, key.ToString(), kMaxSequenceNumber, + false /* read_only */, true /* exclusive */); + } + return Status::OK(); + } + + // Recovered batches do not contain 2PC markers. + Status MarkBeginPrepare(bool) override { return Status::InvalidArgument(); } + + Status MarkEndPrepare(const Slice&) override { + return Status::InvalidArgument(); + } + + Status MarkNoop(bool) override { return Status::InvalidArgument(); } + + Status MarkCommit(const Slice&) override { + return Status::InvalidArgument(); + } + + Status MarkRollback(const Slice&) override { + return Status::InvalidArgument(); + } + }; + + TrackKeyHandler handler(this, + wupt_db_->txn_db_options_.rollback_merge_operands); + return wb->Iterate(&handler); +} + +Status WriteUnpreparedTxn::MaybeFlushWriteBatchToDB() { + const bool kPrepared = true; + Status s; + if (write_batch_flush_threshold_ > 0 && + write_batch_.GetWriteBatch()->Count() > 0 && + write_batch_.GetDataSize() > + static_cast<size_t>(write_batch_flush_threshold_)) { + assert(GetState() != PREPARED); + s = FlushWriteBatchToDB(!kPrepared); + } + return s; +} + +Status WriteUnpreparedTxn::FlushWriteBatchToDB(bool prepared) { + // If the current write batch contains savepoints, then some special handling + // is required so that RollbackToSavepoint can work. + // + // RollbackToSavepoint is not supported after Prepare() is called, so only do + // this for unprepared batches. + if (!prepared && unflushed_save_points_ != nullptr && + !unflushed_save_points_->empty()) { + return FlushWriteBatchWithSavePointToDB(); + } + + return FlushWriteBatchToDBInternal(prepared); +} + +Status WriteUnpreparedTxn::FlushWriteBatchToDBInternal(bool prepared) { + if (name_.empty()) { + assert(!prepared); +#ifndef NDEBUG + static std::atomic_ullong autogen_id{0}; + // To avoid changing all tests to call SetName, just autogenerate one. 
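+    // Note that this fallback is compiled only under #ifndef NDEBUG and only
+    // fires when txn_db_options_.autogenerate_name is set; otherwise writing
+    // without an explicit SetName() fails with InvalidArgument below.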
+    if (wupt_db_->txn_db_options_.autogenerate_name) {
+      auto s = SetName(std::string("autoxid") +
+                       std::to_string(autogen_id.fetch_add(1)));
+      assert(s.ok());
+    } else
+#endif
+    {
+      return Status::InvalidArgument("Cannot write to DB without SetName.");
+    }
+  }
+
+  struct UntrackedKeyHandler : public WriteBatch::Handler {
+    WriteUnpreparedTxn* txn_;
+    bool rollback_merge_operands_;
+
+    UntrackedKeyHandler(WriteUnpreparedTxn* txn, bool rollback_merge_operands)
+        : txn_(txn), rollback_merge_operands_(rollback_merge_operands) {}
+
+    Status AddUntrackedKey(uint32_t cf, const Slice& key) {
+      auto str = key.ToString();
+      PointLockStatus lock_status =
+          txn_->tracked_locks_->GetPointLockStatus(cf, str);
+      if (!lock_status.locked) {
+        txn_->untracked_keys_[cf].push_back(str);
+      }
+      return Status::OK();
+    }
+
+    Status PutCF(uint32_t cf, const Slice& key, const Slice&) override {
+      return AddUntrackedKey(cf, key);
+    }
+
+    Status DeleteCF(uint32_t cf, const Slice& key) override {
+      return AddUntrackedKey(cf, key);
+    }
+
+    Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+      return AddUntrackedKey(cf, key);
+    }
+
+    Status MergeCF(uint32_t cf, const Slice& key, const Slice&) override {
+      if (rollback_merge_operands_) {
+        return AddUntrackedKey(cf, key);
+      }
+      return Status::OK();
+    }
+
+    // The only expected 2PC marker is the initial Noop marker.
+    Status MarkNoop(bool empty_batch) override {
+      return empty_batch ? Status::OK() : Status::InvalidArgument();
+    }
+
+    Status MarkBeginPrepare(bool) override { return Status::InvalidArgument(); }
+
+    Status MarkEndPrepare(const Slice&) override {
+      return Status::InvalidArgument();
+    }
+
+    Status MarkCommit(const Slice&) override {
+      return Status::InvalidArgument();
+    }
+
+    Status MarkRollback(const Slice&) override {
+      return Status::InvalidArgument();
+    }
+  };
+
+  UntrackedKeyHandler handler(
+      this, wupt_db_->txn_db_options_.rollback_merge_operands);
+  auto s = GetWriteBatch()->GetWriteBatch()->Iterate(&handler);
+  assert(s.ok());
+
+  // TODO(lth): Reduce duplicate code with WritePrepared prepare logic.
+  WriteOptions write_options = write_options_;
+  write_options.disableWAL = false;
+  const bool WRITE_AFTER_COMMIT = true;
+  const bool first_prepare_batch = log_number_ == 0;
+  // MarkEndPrepare will change the Noop marker to the appropriate marker.
+  s = WriteBatchInternal::MarkEndPrepare(GetWriteBatch()->GetWriteBatch(),
+                                         name_, !WRITE_AFTER_COMMIT, !prepared);
+  assert(s.ok());
+  // For each duplicate key we account for a new sub-batch.
+  prepare_batch_cnt_ = GetWriteBatch()->SubBatchCnt();
+  // AddPrepared is better called in the pre-release callback; otherwise
+  // there is a non-zero chance of max advancing past prepare_seq, with
+  // readers then assuming the data is committed.
+  // Also having it in the PreReleaseCallback allows in-order addition of
+  // prepared entries to PreparedHeap and hence enables an optimization. Refer
+  // to SmallestUnCommittedSeq for more details.
+  AddPreparedCallback add_prepared_callback(
+      wpt_db_, db_impl_, prepare_batch_cnt_,
+      db_impl_->immutable_db_options().two_write_queues, first_prepare_batch);
+  const bool DISABLE_MEMTABLE = true;
+  uint64_t seq_used = kMaxSequenceNumber;
+  // log_number_ should refer to the oldest log containing uncommitted data
+  // from the current transaction. This means that if log_number_ is set,
+  // WriteImpl should not overwrite that value, so set log_used to nullptr if
+  // log_number_ is already set.
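+  // Note: the call below always captures the log used by this write in
+  // last_log_number_; log_number_ itself is only initialized from it on the
+  // first unprepared batch, which preserves the oldest-log invariant above.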
+ s = db_impl_->WriteImpl(write_options, GetWriteBatch()->GetWriteBatch(), + /*callback*/ nullptr, &last_log_number_, + /*log ref*/ 0, !DISABLE_MEMTABLE, &seq_used, + prepare_batch_cnt_, &add_prepared_callback); + if (log_number_ == 0) { + log_number_ = last_log_number_; + } + assert(!s.ok() || seq_used != kMaxSequenceNumber); + auto prepare_seq = seq_used; + + // Only call SetId if it hasn't been set yet. + if (GetId() == 0) { + SetId(prepare_seq); + } + // unprep_seqs_ will also contain prepared seqnos since they are treated in + // the same way in the prepare/commit callbacks. See the comment on the + // definition of unprep_seqs_. + unprep_seqs_[prepare_seq] = prepare_batch_cnt_; + + // Reset transaction state. + if (!prepared) { + prepare_batch_cnt_ = 0; + const bool kClear = true; + TransactionBaseImpl::InitWriteBatch(kClear); + } + + return s; +} + +Status WriteUnpreparedTxn::FlushWriteBatchWithSavePointToDB() { + assert(unflushed_save_points_ != nullptr && + unflushed_save_points_->size() > 0); + assert(save_points_ != nullptr && save_points_->size() > 0); + assert(save_points_->size() >= unflushed_save_points_->size()); + + // Handler class for creating an unprepared batch from a savepoint. + struct SavePointBatchHandler : public WriteBatch::Handler { + WriteBatchWithIndex* wb_; + const std::map<uint32_t, ColumnFamilyHandle*>& handles_; + + SavePointBatchHandler( + WriteBatchWithIndex* wb, + const std::map<uint32_t, ColumnFamilyHandle*>& handles) + : wb_(wb), handles_(handles) {} + + Status PutCF(uint32_t cf, const Slice& key, const Slice& value) override { + return wb_->Put(handles_.at(cf), key, value); + } + + Status DeleteCF(uint32_t cf, const Slice& key) override { + return wb_->Delete(handles_.at(cf), key); + } + + Status SingleDeleteCF(uint32_t cf, const Slice& key) override { + return wb_->SingleDelete(handles_.at(cf), key); + } + + Status MergeCF(uint32_t cf, const Slice& key, const Slice& value) override { + return wb_->Merge(handles_.at(cf), key, value); + } + + // The only expected 2PC marker is the initial Noop marker. + Status MarkNoop(bool empty_batch) override { + return empty_batch ? Status::OK() : Status::InvalidArgument(); + } + + Status MarkBeginPrepare(bool) override { return Status::InvalidArgument(); } + + Status MarkEndPrepare(const Slice&) override { + return Status::InvalidArgument(); + } + + Status MarkCommit(const Slice&) override { + return Status::InvalidArgument(); + } + + Status MarkRollback(const Slice&) override { + return Status::InvalidArgument(); + } + }; + + // The comparator of the default cf is passed in, similar to the + // initialization of TransactionBaseImpl::write_batch_. This comparator is + // only used if the write batch encounters an invalid cf id, and falls back to + // this comparator. + WriteBatchWithIndex wb(wpt_db_->DefaultColumnFamily()->GetComparator(), 0, + true, 0, write_options_.protection_bytes_per_key); + // Swap with write_batch_ so that wb contains the complete write batch. The + // actual write batch that will be flushed to DB will be built in + // write_batch_, and will be read by FlushWriteBatchToDBInternal. 
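+  // After the swap, the loop below replays wb in savepoint-delimited slices,
+  // flushing each slice to the DB as its own unprepared batch and recording a
+  // flushed_save_points_ entry (with a snapshot) at each savepoint boundary.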
+  std::swap(wb, write_batch_);
+  TransactionBaseImpl::InitWriteBatch();
+
+  size_t prev_boundary = WriteBatchInternal::kHeader;
+  const bool kPrepared = true;
+  for (size_t i = 0; i < unflushed_save_points_->size() + 1; i++) {
+    bool trailing_batch = i == unflushed_save_points_->size();
+    SavePointBatchHandler sp_handler(&write_batch_,
+                                     *wupt_db_->GetCFHandleMap().get());
+    size_t curr_boundary = trailing_batch ? wb.GetWriteBatch()->GetDataSize()
+                                          : (*unflushed_save_points_)[i];
+
+    // Construct the partial write batch up to the savepoint.
+    //
+    // Theoretically, a memcpy between the write batches should be sufficient
+    // since the rewriting into the batch should produce the exact same byte
+    // representation. Rebuilding the WriteBatchWithIndex index is still
+    // necessary though, and that would imply doing two passes over the batch.
+    Status s = WriteBatchInternal::Iterate(wb.GetWriteBatch(), &sp_handler,
+                                           prev_boundary, curr_boundary);
+    if (!s.ok()) {
+      return s;
+    }
+
+    if (write_batch_.GetWriteBatch()->Count() > 0) {
+      // Flush the write batch.
+      s = FlushWriteBatchToDBInternal(!kPrepared);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+
+    if (!trailing_batch) {
+      if (flushed_save_points_ == nullptr) {
+        flushed_save_points_.reset(
+            new autovector<WriteUnpreparedTxn::SavePoint>());
+      }
+      flushed_save_points_->emplace_back(
+          unprep_seqs_, new ManagedSnapshot(db_impl_, wupt_db_->GetSnapshot()));
+    }
+
+    prev_boundary = curr_boundary;
+    const bool kClear = true;
+    TransactionBaseImpl::InitWriteBatch(kClear);
+  }
+
+  unflushed_save_points_->clear();
+  return Status::OK();
+}
+
+Status WriteUnpreparedTxn::PrepareInternal() {
+  const bool kPrepared = true;
+  return FlushWriteBatchToDB(kPrepared);
+}
+
+Status WriteUnpreparedTxn::CommitWithoutPrepareInternal() {
+  if (unprep_seqs_.empty()) {
+    assert(log_number_ == 0);
+    assert(GetId() == 0);
+    return WritePreparedTxn::CommitWithoutPrepareInternal();
+  }
+
+  // TODO(lth): We should optimize commit without prepare to not perform
+  // a prepare under the hood.
+  auto s = PrepareInternal();
+  if (!s.ok()) {
+    return s;
+  }
+  return CommitInternal();
+}
+
+Status WriteUnpreparedTxn::CommitInternal() {
+  // TODO(lth): Reduce duplicate code with WritePrepared commit logic.
+
+  // We take the commit-time batch and append the Commit marker. The Memtable
+  // will ignore the Commit marker in non-recovery mode.
+  WriteBatch* working_batch = GetCommitTimeWriteBatch();
+  const bool empty = working_batch->Count() == 0;
+  auto s = WriteBatchInternal::MarkCommit(working_batch, name_);
+  assert(s.ok());
+
+  const bool for_recovery = use_only_the_last_commit_time_batch_for_recovery_;
+  if (!empty) {
+    // When not writing to memtable, we can still cache the latest write batch.
+    // The cached batch will be written to memtable in WriteRecoverableState
+    // during FlushMemTable.
+    if (for_recovery) {
+      WriteBatchInternal::SetAsLatestPersistentState(working_batch);
+    } else {
+      return Status::InvalidArgument(
+          "Commit-time-batch can only be used if "
+          "use_only_the_last_commit_time_batch_for_recovery is true");
+    }
+  }
+
+  const bool includes_data = !empty && !for_recovery;
+  size_t commit_batch_cnt = 0;
+  if (UNLIKELY(includes_data)) {
+    ROCKS_LOG_WARN(db_impl_->immutable_db_options().info_log,
+                   "Duplicate key overhead");
+    SubBatchCounter counter(*wpt_db_->GetCFComparatorMap());
+    s = working_batch->Iterate(&counter);
+    assert(s.ok());
+    commit_batch_cnt = counter.BatchCount();
+  }
+  const bool disable_memtable = !includes_data;
+  const bool do_one_write =
+      !db_impl_->immutable_db_options().two_write_queues || disable_memtable;
+
+  WriteUnpreparedCommitEntryPreReleaseCallback update_commit_map(
+      wpt_db_, db_impl_, unprep_seqs_, commit_batch_cnt);
+  const bool kFirstPrepareBatch = true;
+  AddPreparedCallback add_prepared_callback(
+      wpt_db_, db_impl_, commit_batch_cnt,
+      db_impl_->immutable_db_options().two_write_queues, !kFirstPrepareBatch);
+  PreReleaseCallback* pre_release_callback;
+  if (do_one_write) {
+    pre_release_callback = &update_commit_map;
+  } else {
+    pre_release_callback = &add_prepared_callback;
+  }
+  uint64_t seq_used = kMaxSequenceNumber;
+  // Since the prepared batch is directly written to memtable, there is
+  // already a connection between the memtable and its WAL, so there is no
+  // need to redundantly reference the log that contains the prepared data.
+  const uint64_t zero_log_number = 0ull;
+  size_t batch_cnt = UNLIKELY(commit_batch_cnt) ? commit_batch_cnt : 1;
+  s = db_impl_->WriteImpl(write_options_, working_batch, nullptr, nullptr,
+                          zero_log_number, disable_memtable, &seq_used,
+                          batch_cnt, pre_release_callback);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  const SequenceNumber commit_batch_seq = seq_used;
+  if (LIKELY(do_one_write || !s.ok())) {
+    if (LIKELY(s.ok())) {
+      // Note RemovePrepared should be called after the WriteImpl that
+      // published the seq. Otherwise the SmallestUnCommittedSeq optimization
+      // breaks.
+      for (const auto& seq : unprep_seqs_) {
+        wpt_db_->RemovePrepared(seq.first, seq.second);
+      }
+    }
+    if (UNLIKELY(!do_one_write)) {
+      wpt_db_->RemovePrepared(commit_batch_seq, commit_batch_cnt);
+    }
+    unprep_seqs_.clear();
+    flushed_save_points_.reset(nullptr);
+    unflushed_save_points_.reset(nullptr);
+    return s;
+  }  // else do the 2nd write to publish seq
+
+  // Populate unprep_seqs_ with commit_batch_seq, since we treat data in the
+  // commit write batch as just another "unprepared" batch. This will also
+  // update the unprep_seqs_ in the update_commit_map callback.
+  unprep_seqs_[commit_batch_seq] = commit_batch_cnt;
+  WriteUnpreparedCommitEntryPreReleaseCallback
+      update_commit_map_with_commit_batch(wpt_db_, db_impl_, unprep_seqs_, 0);
+
+  // Note: the 2nd write comes with a performance penalty. So if we have too
+  // many commits accompanied by a CommitTimeWriteBatch and yet cannot enable
+  // the use_only_the_last_commit_time_batch_for_recovery_ optimization,
+  // two_write_queues should be disabled to avoid the many additional writes
+  // here.
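+  //
+  // The 2nd write below is an empty, memtable-disabled batch; it exists only
+  // to obtain a sequence number on the commit queue so that the pre-release
+  // callback can publish the commit entries for all unprepared batches.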
+
+  // Update commit map only from the 2nd queue
+  WriteBatch empty_batch;
+  s = empty_batch.PutLogData(Slice());
+  assert(s.ok());
+  // In the absence of Prepare markers, use Noop as a batch separator
+  s = WriteBatchInternal::InsertNoop(&empty_batch);
+  assert(s.ok());
+  const bool DISABLE_MEMTABLE = true;
+  const size_t ONE_BATCH = 1;
+  const uint64_t NO_REF_LOG = 0;
+  s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr,
+                          NO_REF_LOG, DISABLE_MEMTABLE, &seq_used, ONE_BATCH,
+                          &update_commit_map_with_commit_batch);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  // Note RemovePrepared should be called after the WriteImpl that published
+  // the seq. Otherwise the SmallestUnCommittedSeq optimization breaks.
+  for (const auto& seq : unprep_seqs_) {
+    wpt_db_->RemovePrepared(seq.first, seq.second);
+  }
+  unprep_seqs_.clear();
+  flushed_save_points_.reset(nullptr);
+  unflushed_save_points_.reset(nullptr);
+  return s;
+}
+
+Status WriteUnpreparedTxn::WriteRollbackKeys(
+    const LockTracker& lock_tracker, WriteBatchWithIndex* rollback_batch,
+    ReadCallback* callback, const ReadOptions& roptions) {
+  // This assertion can be removed when range lock is supported.
+  assert(lock_tracker.IsPointLockSupported());
+  const auto& cf_map = *wupt_db_->GetCFHandleMap();
+  auto WriteRollbackKey = [&](const std::string& key, uint32_t cfid) {
+    const auto& cf_handle = cf_map.at(cfid);
+    PinnableSlice pinnable_val;
+    bool not_used;
+    DBImpl::GetImplOptions get_impl_options;
+    get_impl_options.column_family = cf_handle;
+    get_impl_options.value = &pinnable_val;
+    get_impl_options.value_found = &not_used;
+    get_impl_options.callback = callback;
+    auto s = db_impl_->GetImpl(roptions, key, get_impl_options);
+
+    if (s.ok()) {
+      s = rollback_batch->Put(cf_handle, key, pinnable_val);
+      assert(s.ok());
+    } else if (s.IsNotFound()) {
+      if (wupt_db_->ShouldRollbackWithSingleDelete(cf_handle, key)) {
+        s = rollback_batch->SingleDelete(cf_handle, key);
+      } else {
+        s = rollback_batch->Delete(cf_handle, key);
+      }
+      assert(s.ok());
+    } else {
+      return s;
+    }
+
+    return Status::OK();
+  };
+
+  std::unique_ptr<LockTracker::ColumnFamilyIterator> cf_it(
+      lock_tracker.GetColumnFamilyIterator());
+  assert(cf_it != nullptr);
+  while (cf_it->HasNext()) {
+    ColumnFamilyId cf = cf_it->Next();
+    std::unique_ptr<LockTracker::KeyIterator> key_it(
+        lock_tracker.GetKeyIterator(cf));
+    assert(key_it != nullptr);
+    while (key_it->HasNext()) {
+      const std::string& key = key_it->Next();
+      auto s = WriteRollbackKey(key, cf);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  }
+
+  for (const auto& cfkey : untracked_keys_) {
+    const auto cfid = cfkey.first;
+    const auto& keys = cfkey.second;
+    for (const auto& key : keys) {
+      auto s = WriteRollbackKey(key, cfid);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+Status WriteUnpreparedTxn::RollbackInternal() {
+  // TODO(lth): Reduce duplicate code with WritePrepared rollback logic.
+  WriteBatchWithIndex rollback_batch(
+      wpt_db_->DefaultColumnFamily()->GetComparator(), 0, true, 0,
+      write_options_.protection_bytes_per_key);
+  assert(GetId() != kMaxSequenceNumber);
+  assert(GetId() > 0);
+  Status s;
+  auto read_at_seq = kMaxSequenceNumber;
+  ReadOptions roptions;
+  // To prevent the callback's seq from being overridden inside DBImpl::Get
+  roptions.snapshot = wpt_db_->GetMaxSnapshot();
+  // Note that we do not use WriteUnpreparedTxnReadCallback because we do not
+  // need to read our own writes when reading prior versions of the key for
+  // rollback.
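+  // With read_at_seq = kMaxSequenceNumber and the max snapshot set above, the
+  // callback sees the latest committed version of each key, which is exactly
+  // the value that the rollback batch needs to reinstate.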
+  WritePreparedTxnReadCallback callback(wpt_db_, read_at_seq);
+  // TODO(lth): We write the rollback batch all in a single batch here, but
+  // this should be subdivided into multiple batches as well. In phase 2, when
+  // key sets are read from WAL, this will happen naturally.
+  s = WriteRollbackKeys(*tracked_locks_, &rollback_batch, &callback, roptions);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // The Rollback marker will be used as a batch separator
+  s = WriteBatchInternal::MarkRollback(rollback_batch.GetWriteBatch(), name_);
+  assert(s.ok());
+  bool do_one_write = !db_impl_->immutable_db_options().two_write_queues;
+  const bool DISABLE_MEMTABLE = true;
+  const uint64_t NO_REF_LOG = 0;
+  uint64_t seq_used = kMaxSequenceNumber;
+  // Rollback batch may contain duplicate keys, because tracked_keys_ is not
+  // comparator aware.
+  auto rollback_batch_cnt = rollback_batch.SubBatchCnt();
+  // We commit the rolled back prepared batches. Although this is
+  // counter-intuitive, i) it is safe to do so, since the prepared batches are
+  // already canceled out by the rollback batch, and ii) adding the commit
+  // entry to the CommitCache lets us benefit from the existing mechanism in
+  // the CommitCache that keeps around an entry that was evicted due to max
+  // advance yet overlaps with a live snapshot, so that the live snapshot
+  // properly skips the entry even if its prepare seq is lower than
+  // max_evicted_seq_.
+  //
+  // TODO(lth): RollbackInternal is conceptually very similar to
+  // CommitInternal, with the rollback batch simply taking on the role of
+  // CommitTimeWriteBatch. We should be able to merge the two code paths.
+  WriteUnpreparedCommitEntryPreReleaseCallback update_commit_map(
+      wpt_db_, db_impl_, unprep_seqs_, rollback_batch_cnt);
+  // Note: the rollback batch does not need AddPrepared since it is written to
+  // DB in one shot. min_uncommitted still works since it requires capturing
+  // data that is written to DB but not yet committed, while the rollback
+  // batch commits with PreReleaseCallback.
+  s = db_impl_->WriteImpl(write_options_, rollback_batch.GetWriteBatch(),
+                          nullptr, nullptr, NO_REF_LOG, !DISABLE_MEMTABLE,
+                          &seq_used, rollback_batch_cnt,
+                          do_one_write ? &update_commit_map : nullptr);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  if (!s.ok()) {
+    return s;
+  }
+  if (do_one_write) {
+    for (const auto& seq : unprep_seqs_) {
+      wpt_db_->RemovePrepared(seq.first, seq.second);
+    }
+    unprep_seqs_.clear();
+    flushed_save_points_.reset(nullptr);
+    unflushed_save_points_.reset(nullptr);
+    return s;
+  }  // else do the 2nd write for commit
+
+  uint64_t& prepare_seq = seq_used;
+  // Populate unprep_seqs_ with rollback_batch_cnt, since we treat data in the
+  // rollback write batch as just another "unprepared" batch. This will also
+  // update the unprep_seqs_ in the update_commit_map callback.
+ unprep_seqs_[prepare_seq] = rollback_batch_cnt; + WriteUnpreparedCommitEntryPreReleaseCallback + update_commit_map_with_rollback_batch(wpt_db_, db_impl_, unprep_seqs_, 0); + + ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log, + "RollbackInternal 2nd write prepare_seq: %" PRIu64, + prepare_seq); + WriteBatch empty_batch; + const size_t ONE_BATCH = 1; + s = empty_batch.PutLogData(Slice()); + assert(s.ok()); + // In the absence of Prepare markers, use Noop as a batch separator + s = WriteBatchInternal::InsertNoop(&empty_batch); + assert(s.ok()); + s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr, + NO_REF_LOG, DISABLE_MEMTABLE, &seq_used, ONE_BATCH, + &update_commit_map_with_rollback_batch); + assert(!s.ok() || seq_used != kMaxSequenceNumber); + // Mark the txn as rolled back + if (s.ok()) { + for (const auto& seq : unprep_seqs_) { + wpt_db_->RemovePrepared(seq.first, seq.second); + } + } + + unprep_seqs_.clear(); + flushed_save_points_.reset(nullptr); + unflushed_save_points_.reset(nullptr); + return s; +} + +void WriteUnpreparedTxn::Clear() { + if (!recovered_txn_) { + txn_db_impl_->UnLock(this, *tracked_locks_); + } + unprep_seqs_.clear(); + flushed_save_points_.reset(nullptr); + unflushed_save_points_.reset(nullptr); + recovered_txn_ = false; + largest_validated_seq_ = 0; + for (auto& it : active_iterators_) { + auto bdit = static_cast<BaseDeltaIterator*>(it); + bdit->Invalidate(Status::InvalidArgument( + "Cannot use iterator after transaction has finished")); + } + active_iterators_.clear(); + untracked_keys_.clear(); + TransactionBaseImpl::Clear(); +} + +void WriteUnpreparedTxn::SetSavePoint() { + assert((unflushed_save_points_ ? unflushed_save_points_->size() : 0) + + (flushed_save_points_ ? flushed_save_points_->size() : 0) == + (save_points_ ? save_points_->size() : 0)); + PessimisticTransaction::SetSavePoint(); + if (unflushed_save_points_ == nullptr) { + unflushed_save_points_.reset(new autovector<size_t>()); + } + unflushed_save_points_->push_back(write_batch_.GetDataSize()); +} + +Status WriteUnpreparedTxn::RollbackToSavePoint() { + assert((unflushed_save_points_ ? unflushed_save_points_->size() : 0) + + (flushed_save_points_ ? flushed_save_points_->size() : 0) == + (save_points_ ? 
save_points_->size() : 0));
+  if (unflushed_save_points_ != nullptr && unflushed_save_points_->size() > 0) {
+    Status s = PessimisticTransaction::RollbackToSavePoint();
+    assert(!s.IsNotFound());
+    unflushed_save_points_->pop_back();
+    return s;
+  }
+
+  if (flushed_save_points_ != nullptr && !flushed_save_points_->empty()) {
+    return RollbackToSavePointInternal();
+  }
+
+  return Status::NotFound();
+}
+
+Status WriteUnpreparedTxn::RollbackToSavePointInternal() {
+  Status s;
+
+  const bool kClear = true;
+  TransactionBaseImpl::InitWriteBatch(kClear);
+
+  assert(flushed_save_points_->size() > 0);
+  WriteUnpreparedTxn::SavePoint& top = flushed_save_points_->back();
+
+  assert(save_points_ != nullptr && save_points_->size() > 0);
+  const LockTracker& tracked_keys = *save_points_->top().new_locks_;
+
+  ReadOptions roptions;
+  roptions.snapshot = top.snapshot_->snapshot();
+  SequenceNumber min_uncommitted =
+      static_cast_with_check<const SnapshotImpl>(roptions.snapshot)
+          ->min_uncommitted_;
+  SequenceNumber snap_seq = roptions.snapshot->GetSequenceNumber();
+  WriteUnpreparedTxnReadCallback callback(wupt_db_, snap_seq, min_uncommitted,
+                                          top.unprep_seqs_,
+                                          kBackedByDBSnapshot);
+  s = WriteRollbackKeys(tracked_keys, &write_batch_, &callback, roptions);
+  if (!s.ok()) {
+    return s;
+  }
+
+  const bool kPrepared = true;
+  s = FlushWriteBatchToDBInternal(!kPrepared);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // PessimisticTransaction::RollbackToSavePoint will also call
+  // RollbackToSavePoint on write_batch_. However, write_batch_ is empty and
+  // has no savepoints because this savepoint has already been flushed. Work
+  // around this by setting a fake savepoint.
+  write_batch_.SetSavePoint();
+  s = PessimisticTransaction::RollbackToSavePoint();
+  assert(s.ok());
+  if (!s.ok()) {
+    return s;
+  }
+
+  flushed_save_points_->pop_back();
+  return s;
+}
+
+Status WriteUnpreparedTxn::PopSavePoint() {
+  assert((unflushed_save_points_ ? unflushed_save_points_->size() : 0) +
+             (flushed_save_points_ ? flushed_save_points_->size() : 0) ==
+         (save_points_ ? save_points_->size() : 0));
+  if (unflushed_save_points_ != nullptr && unflushed_save_points_->size() > 0) {
+    Status s = PessimisticTransaction::PopSavePoint();
+    assert(!s.IsNotFound());
+    unflushed_save_points_->pop_back();
+    return s;
+  }
+
+  if (flushed_save_points_ != nullptr && !flushed_save_points_->empty()) {
+    // PessimisticTransaction::PopSavePoint will also call PopSavePoint on
+    // write_batch_. However, write_batch_ is empty and has no savepoints
+    // because this savepoint has already been flushed. Work around this by
+    // setting a fake savepoint.
+    write_batch_.SetSavePoint();
+    Status s = PessimisticTransaction::PopSavePoint();
+    assert(!s.IsNotFound());
+    flushed_save_points_->pop_back();
+    return s;
+  }
+
+  return Status::NotFound();
+}
+
+void WriteUnpreparedTxn::MultiGet(const ReadOptions& options,
+                                  ColumnFamilyHandle* column_family,
+                                  const size_t num_keys, const Slice* keys,
+                                  PinnableSlice* values, Status* statuses,
+                                  const bool sorted_input) {
+  SequenceNumber min_uncommitted, snap_seq;
+  const SnapshotBackup backed_by_snapshot =
+      wupt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq);
+  WriteUnpreparedTxnReadCallback callback(wupt_db_, snap_seq, min_uncommitted,
+                                          unprep_seqs_, backed_by_snapshot);
+  write_batch_.MultiGetFromBatchAndDB(db_, options, column_family, num_keys,
+                                      keys, values, statuses, sorted_input,
+                                      &callback);
+  if (UNLIKELY(!callback.valid() ||
+               !wupt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) {
+    wupt_db_->WPRecordTick(TXN_GET_TRY_AGAIN);
+    for (size_t i = 0; i < num_keys; i++) {
+      statuses[i] = Status::TryAgain();
+    }
+  }
+}
+
+Status WriteUnpreparedTxn::Get(const ReadOptions& options,
+                               ColumnFamilyHandle* column_family,
+                               const Slice& key, PinnableSlice* value) {
+  SequenceNumber min_uncommitted, snap_seq;
+  const SnapshotBackup backed_by_snapshot =
+      wupt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq);
+  WriteUnpreparedTxnReadCallback callback(wupt_db_, snap_seq, min_uncommitted,
+                                          unprep_seqs_, backed_by_snapshot);
+  auto res = write_batch_.GetFromBatchAndDB(db_, options, column_family, key,
+                                            value, &callback);
+  if (LIKELY(callback.valid() &&
+             wupt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) {
+    return res;
+  } else {
+    res.PermitUncheckedError();
+    wupt_db_->WPRecordTick(TXN_GET_TRY_AGAIN);
+    return Status::TryAgain();
+  }
+}
+
+namespace {
+static void CleanupWriteUnpreparedWBWIIterator(void* arg1, void* arg2) {
+  auto txn = reinterpret_cast<WriteUnpreparedTxn*>(arg1);
+  auto iter = reinterpret_cast<Iterator*>(arg2);
+  txn->RemoveActiveIterator(iter);
+}
+}  // anonymous namespace
+
+Iterator* WriteUnpreparedTxn::GetIterator(const ReadOptions& options) {
+  return GetIterator(options, wupt_db_->DefaultColumnFamily());
+}
+
+Iterator* WriteUnpreparedTxn::GetIterator(const ReadOptions& options,
+                                          ColumnFamilyHandle* column_family) {
+  // Make sure to get the iterator from WriteUnpreparedTxnDB, not the root db.
+  Iterator* db_iter = wupt_db_->NewIterator(options, column_family, this);
+  assert(db_iter);
+
+  auto iter = write_batch_.NewIteratorWithBase(column_family, db_iter);
+  active_iterators_.push_back(iter);
+  iter->RegisterCleanup(CleanupWriteUnpreparedWBWIIterator, this, iter);
+  return iter;
+}
+
+Status WriteUnpreparedTxn::ValidateSnapshot(ColumnFamilyHandle* column_family,
+                                            const Slice& key,
+                                            SequenceNumber* tracked_at_seq) {
+  // TODO(lth): Reduce duplicate code with WritePrepared ValidateSnapshot logic.
+  assert(snapshot_);
+
+  SequenceNumber min_uncommitted =
+      static_cast_with_check<const SnapshotImpl>(snapshot_.get())
+          ->min_uncommitted_;
+  SequenceNumber snap_seq = snapshot_->GetSequenceNumber();
+  // tracked_at_seq is either max or the last snapshot with which this key was
+  // tracked, so there is no need to apply IsInSnapshot to this comparison
+  // here, as tracked_at_seq is not a prepare seq.
+  if (*tracked_at_seq <= snap_seq) {
+    // If the key has been previously validated at a sequence number earlier
+    // than the current snapshot's sequence number, we already know it has not
+    // been modified.
+    return Status::OK();
+  }
+
+  *tracked_at_seq = snap_seq;
+
+  ColumnFamilyHandle* cfh =
+      column_family ? column_family : db_impl_->DefaultColumnFamily();
+
+  WriteUnpreparedTxnReadCallback snap_checker(
+      wupt_db_, snap_seq, min_uncommitted, unprep_seqs_, kBackedByDBSnapshot);
+  // TODO(yanqin): Support user-defined timestamp.
+  return TransactionUtil::CheckKeyForConflicts(
+      db_impl_, cfh, key.ToString(), snap_seq, /*ts=*/nullptr,
+      false /* cache_only */, &snap_checker, min_uncommitted);
+}
+
+const std::map<SequenceNumber, size_t>&
+WriteUnpreparedTxn::GetUnpreparedSequenceNumbers() {
+  return unprep_seqs_;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_unprepared_txn.h b/src/rocksdb/utilities/transactions/write_unprepared_txn.h
new file mode 100644
index 000000000..5a3227f4e
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/write_unprepared_txn.h
@@ -0,0 +1,341 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <set>
+
+#include "utilities/transactions/write_prepared_txn.h"
+#include "utilities/transactions/write_unprepared_txn_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteUnpreparedTxnDB;
+class WriteUnpreparedTxn;
+
+// WriteUnprepared transactions need to be able to read their own uncommitted
+// writes, and supporting this requires some careful consideration. Because
+// writes in the current transaction may be flushed to DB already, we cannot
+// rely on the contents of WriteBatchWithIndex to determine whether a key
+// should be visible or not, so we have to remember to check the DB for any
+// uncommitted keys that should be visible to us. First, we will need to
+// change the seek-to-snapshot logic to seek to
+// max_visible_seq = max(snap_seq, max_unprep_seq). Any key greater than
+// max_visible_seq should not be visible because it cannot be unprepared by
+// the current transaction and it is not in its snapshot.
+//
+// When we seek to max_visible_seq, one of these cases will happen:
+// 1. We hit an unprepared key from the current transaction.
+// 2. We hit an unprepared key from another transaction.
+// 3. We hit a committed key with snap_seq < seq < max_unprep_seq.
+// 4. We hit a committed key with seq <= snap_seq.
+//
+// IsVisibleFullCheck handles all cases correctly.
+//
+// Other notes:
+// Note that max_visible_seq is only calculated once at iterator construction
+// time, meaning if the same transaction is adding more unprep seqs through
+// writes during iteration, these newer writes may not be visible. This is not
+// a problem for MySQL though, because it avoids modifying the index as it is
+// scanning through it to avoid the Halloween Problem. Instead, it scans the
+// index once up front, and modifies based on a temporary copy.
+//
+// In DBIter, there is a "reseek" optimization if the iterator skips over too
+// many keys. However, this assumes that the reseek seeks exactly to the
+// required key. In write unprepared, even after seeking directly to
+// max_visible_seq, some iteration may be required before hitting a visible
+// key, and special precautions must be taken to avoid performing another
+// reseek, leading to an infinite loop.
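+//
+// A small worked example of the visibility rule above: with snap_seq = 10 and
+// unprep_seqs = {{12, 3}} (an unprepared batch spanning seqs 12..14),
+// max_visible_seq = max(10, 14) = 14. A version at seq 13 is visible (case 1,
+// our own unprepared write), a version at seq 11 committed by another
+// transaction is not (case 3), and a version at seq 9 is visible (case 4).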
+// +class WriteUnpreparedTxnReadCallback : public ReadCallback { + public: + WriteUnpreparedTxnReadCallback( + WritePreparedTxnDB* db, SequenceNumber snapshot, + SequenceNumber min_uncommitted, + const std::map<SequenceNumber, size_t>& unprep_seqs, + SnapshotBackup backed_by_snapshot) + // Pass our last uncommitted seq as the snapshot to the parent class to + // ensure that the parent will not prematurely filter out own writes. We + // will do the exact comparison against snapshots in IsVisibleFullCheck + // override. + : ReadCallback(CalcMaxVisibleSeq(unprep_seqs, snapshot), min_uncommitted), + db_(db), + unprep_seqs_(unprep_seqs), + wup_snapshot_(snapshot), + backed_by_snapshot_(backed_by_snapshot) { + (void)backed_by_snapshot_; // to silence unused private field warning + } + + virtual ~WriteUnpreparedTxnReadCallback() { + // If it is not backed by snapshot, the caller must check validity + assert(valid_checked_ || backed_by_snapshot_ == kBackedByDBSnapshot); + } + + virtual bool IsVisibleFullCheck(SequenceNumber seq) override; + + inline bool valid() { + valid_checked_ = true; + return snap_released_ == false; + } + + void Refresh(SequenceNumber seq) override { + max_visible_seq_ = std::max(max_visible_seq_, seq); + wup_snapshot_ = seq; + } + + static SequenceNumber CalcMaxVisibleSeq( + const std::map<SequenceNumber, size_t>& unprep_seqs, + SequenceNumber snapshot_seq) { + SequenceNumber max_unprepared = 0; + if (unprep_seqs.size()) { + max_unprepared = + unprep_seqs.rbegin()->first + unprep_seqs.rbegin()->second - 1; + } + return std::max(max_unprepared, snapshot_seq); + } + + private: + WritePreparedTxnDB* db_; + const std::map<SequenceNumber, size_t>& unprep_seqs_; + SequenceNumber wup_snapshot_; + // Whether max_visible_seq_ is backed by a snapshot + const SnapshotBackup backed_by_snapshot_; + bool snap_released_ = false; + // Safety check to ensure that the caller has checked invalid statuses + bool valid_checked_ = false; +}; + +class WriteUnpreparedTxn : public WritePreparedTxn { + public: + WriteUnpreparedTxn(WriteUnpreparedTxnDB* db, + const WriteOptions& write_options, + const TransactionOptions& txn_options); + + virtual ~WriteUnpreparedTxn(); + + using TransactionBaseImpl::Put; + virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, + const bool assume_tracked = false) override; + virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value, + const bool assume_tracked = false) override; + + using TransactionBaseImpl::Merge; + virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, + const bool assume_tracked = false) override; + + using TransactionBaseImpl::Delete; + virtual Status Delete(ColumnFamilyHandle* column_family, const Slice& key, + const bool assume_tracked = false) override; + virtual Status Delete(ColumnFamilyHandle* column_family, + const SliceParts& key, + const bool assume_tracked = false) override; + + using TransactionBaseImpl::SingleDelete; + virtual Status SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key, + const bool assume_tracked = false) override; + virtual Status SingleDelete(ColumnFamilyHandle* column_family, + const SliceParts& key, + const bool assume_tracked = false) override; + + // In WriteUnprepared, untracked writes will break snapshot validation logic. + // Snapshot validation will only check the largest sequence number of a key to + // see if it was committed or not. 
However, an untracked unprepared write will
+  // hide smaller committed sequence numbers.
+  //
+  // TODO(lth): Investigate whether it is worth having snapshot validation
+  // validate all values larger than snap_seq. Otherwise, we should return
+  // Status::NotSupported for untracked writes.
+
+  virtual Status RebuildFromWriteBatch(WriteBatch*) override;
+
+  virtual uint64_t GetLastLogNumber() const override {
+    return last_log_number_;
+  }
+
+  void RemoveActiveIterator(Iterator* iter) {
+    active_iterators_.erase(
+        std::remove(active_iterators_.begin(), active_iterators_.end(), iter),
+        active_iterators_.end());
+  }
+
+ protected:
+  void Initialize(const TransactionOptions& txn_options) override;
+
+  Status PrepareInternal() override;
+
+  Status CommitWithoutPrepareInternal() override;
+  Status CommitInternal() override;
+
+  Status RollbackInternal() override;
+
+  void Clear() override;
+
+  void SetSavePoint() override;
+  Status RollbackToSavePoint() override;
+  Status PopSavePoint() override;
+
+  // Get and GetIterator need to be overridden so that a ReadCallback that
+  // handles read-your-own-write is used.
+  using Transaction::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     PinnableSlice* value) override;
+
+  using Transaction::MultiGet;
+  virtual void MultiGet(const ReadOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        const size_t num_keys, const Slice* keys,
+                        PinnableSlice* values, Status* statuses,
+                        const bool sorted_input = false) override;
+
+  using Transaction::GetIterator;
+  virtual Iterator* GetIterator(const ReadOptions& options) override;
+  virtual Iterator* GetIterator(const ReadOptions& options,
+                                ColumnFamilyHandle* column_family) override;
+
+  virtual Status ValidateSnapshot(ColumnFamilyHandle* column_family,
+                                  const Slice& key,
+                                  SequenceNumber* tracked_at_seq) override;
+
+ private:
+  friend class WriteUnpreparedTransactionTest_ReadYourOwnWrite_Test;
+  friend class WriteUnpreparedTransactionTest_RecoveryTest_Test;
+  friend class WriteUnpreparedTransactionTest_UnpreparedBatch_Test;
+  friend class WriteUnpreparedTxnDB;
+
+  const std::map<SequenceNumber, size_t>& GetUnpreparedSequenceNumbers();
+  Status WriteRollbackKeys(const LockTracker& tracked_keys,
+                           WriteBatchWithIndex* rollback_batch,
+                           ReadCallback* callback, const ReadOptions& roptions);
+
+  Status MaybeFlushWriteBatchToDB();
+  Status FlushWriteBatchToDB(bool prepared);
+  Status FlushWriteBatchToDBInternal(bool prepared);
+  Status FlushWriteBatchWithSavePointToDB();
+  Status RollbackToSavePointInternal();
+  Status HandleWrite(std::function<Status()> do_write);
+
+  // For write unprepared, we check on every write batch append to see if
+  // write_batch_flush_threshold_ has been exceeded, and then call
+  // FlushWriteBatchToDB if so. This logic is encapsulated in
+  // MaybeFlushWriteBatchToDB.
+  int64_t write_batch_flush_threshold_;
+  WriteUnpreparedTxnDB* wupt_db_;
+
+  // Ordered list of unprep_seq sequence numbers that we have already written
+  // to DB.
+  //
+  // This maps unprep_seq => prepare_batch_cnt for each unprepared batch
+  // written by this transaction.
+  //
+  // Note that this contains both prepared and unprepared batches, since they
+  // are treated similarly in prepare heap/commit map, so it simplifies the
+  // commit callbacks.
+  std::map<SequenceNumber, size_t> unprep_seqs_;
+
+  uint64_t last_log_number_;
+
+  // Recovered transactions have tracked_keys_ populated, but are not actually
+  // locked for efficiency reasons. For recovered transactions, skip unlocking
+  // keys when the transaction ends.
+  bool recovered_txn_;
+
+  // Track the largest sequence number at which we performed snapshot
+  // validation. If snapshot validation was skipped because no snapshot was
+  // set, then this is set to GetLastPublishedSequence. This value is useful
+  // because it means that for keys that have unprepared seqnos, we can
+  // guarantee that no committed keys by other transactions can exist between
+  // largest_validated_seq_ and max_unprep_seq. See
+  // WriteUnpreparedTxnDB::NewIterator for an explanation for why this is
+  // necessary for iterator Prev().
+  //
+  // Currently this value only increases during the lifetime of a transaction,
+  // but in some cases, we should be able to restore the previously largest
+  // value when calling RollbackToSavepoint.
+  SequenceNumber largest_validated_seq_;
+
+  struct SavePoint {
+    // Record of unprep_seqs_ at this savepoint. The set of unprep_seq is
+    // used during RollbackToSavepoint to determine visibility when restoring
+    // old values.
+    //
+    // TODO(lth): Since all unprep_seqs_ sets further down the stack must be
+    // subsets, this can potentially be deduplicated by just storing set
+    // difference. Investigate if this is worth it.
+    std::map<SequenceNumber, size_t> unprep_seqs_;
+
+    // This snapshot will be used to read keys at this savepoint if we call
+    // RollbackToSavePoint.
+    std::unique_ptr<ManagedSnapshot> snapshot_;
+
+    SavePoint(const std::map<SequenceNumber, size_t>& seqs,
+              ManagedSnapshot* snapshot)
+        : unprep_seqs_(seqs), snapshot_(snapshot){};
+  };
+
+  // We have 3 data structures holding savepoint information:
+  // 1. TransactionBaseImpl::save_points_
+  // 2. WriteUnpreparedTxn::flushed_save_points_
+  // 3. WriteUnpreparedTxn::unflushed_save_points_
+  //
+  // TransactionBaseImpl::save_points_ holds information about all write
+  // batches, including the current in-memory write_batch_, or unprepared
+  // batches that have been written out. Its responsibility is just to track
+  // which keys have been modified in every savepoint.
+  //
+  // WriteUnpreparedTxn::flushed_save_points_ holds information about
+  // savepoints set on unprepared batches that have already been flushed. It
+  // holds the snapshot and unprep_seqs at that savepoint, so that the rollback
+  // process can determine which keys were visible at that point in time.
+  //
+  // WriteUnpreparedTxn::unflushed_save_points_ holds information about
+  // savepoints on the current in-memory write_batch_. It simply records the
+  // size of the write batch at every savepoint.
+  //
+  // TODO(lth): Remove the redundancy between save_point_boundaries_ and
+  // write_batch_.save_points_.
+  //
+  // Based on this information, here are some invariants:
+  // size(unflushed_save_points_) = size(write_batch_.save_points_)
+  // size(flushed_save_points_) + size(unflushed_save_points_)
+  //   = size(save_points_)
+  //
+  std::unique_ptr<autovector<WriteUnpreparedTxn::SavePoint>>
+      flushed_save_points_;
+  std::unique_ptr<autovector<size_t>> unflushed_save_points_;
+
+  // It is currently unsafe to flush a write batch if there are active
+  // iterators created from this transaction. This is because we use
+  // WriteBatchWithIndex to do merging reads from the DB and the write batch.
+  // If we flush the write batch, it is possible that the delta iterator on
+  // the iterator will point to invalid memory.
+  std::vector<Iterator*> active_iterators_;
+
+  // Untracked keys that we have to rollback.
+  //
+  // TODO(lth): Currently we do not record untracked keys per-savepoint. This
+  // means that when rolling back to savepoints, we have to check all keys in
+  // the current transaction for rollback. Note that this is merely
+  // inefficient, but still correct, because we take a snapshot at every
+  // savepoint, and we will use that snapshot to construct the rollback batch.
+  // The rollback batch will then contain a reissue of the same marker.
+  //
+  // A more optimal solution would be to only check keys changed since the
+  // last savepoint. Also, it may make sense to merge this into tracked_keys_
+  // and differentiate between tracked but not locked keys to avoid having two
+  // very similar data structures.
+  using KeySet = std::unordered_map<uint32_t, std::vector<std::string>>;
+  KeySet untracked_keys_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_unprepared_txn_db.cc b/src/rocksdb/utilities/transactions/write_unprepared_txn_db.cc
new file mode 100644
index 000000000..2ed2d5c59
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/write_unprepared_txn_db.cc
@@ -0,0 +1,473 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/write_unprepared_txn_db.h"
+
+#include "db/arena_wrapped_db_iter.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Instead of reconstructing a Transaction object and calling rollback on it,
+// we can be more efficient with RollbackRecoveredTransaction by skipping
+// unnecessary steps (e.g. updating CommitMap, reconstructing the keyset)
+Status WriteUnpreparedTxnDB::RollbackRecoveredTransaction(
+    const DBImpl::RecoveredTransaction* rtxn) {
+  // TODO(lth): Reduce duplicate code with WritePrepared rollback logic.
+  assert(rtxn->unprepared_);
+  auto cf_map_shared_ptr = WritePreparedTxnDB::GetCFHandleMap();
+  auto cf_comp_map_shared_ptr = WritePreparedTxnDB::GetCFComparatorMap();
+  // In theory we could write with disableWAL = true during recovery, and
+  // assume that if we crash again during recovery, we can just replay from
+  // the very beginning. Unfortunately, the XIDs from the application may not
+  // necessarily be unique across restarts, potentially leading to situations
+  // like this:
+  //
+  // BEGIN_PREPARE(unprepared) Put(a) END_PREPARE(xid = 1)
+  // -- crash and recover with Put(a) rolled back as it was not prepared
+  // BEGIN_PREPARE(prepared) Put(b) END_PREPARE(xid = 1)
+  // COMMIT(xid = 1)
+  // -- crash and recover with both a, b
+  //
+  // We could just write the rollback marker, but then we would have to extend
+  // MemTableInserter during recovery to actually do writes into the DB
+  // instead of just dropping the in-memory write batch.
+  //
+  WriteOptions w_options;
+
+  class InvalidSnapshotReadCallback : public ReadCallback {
+   public:
+    InvalidSnapshotReadCallback(SequenceNumber snapshot)
+        : ReadCallback(snapshot) {}
+
+    inline bool IsVisibleFullCheck(SequenceNumber) override {
+      // The seq provided as the snapshot is the seq right before we locked
+      // and wrote to it, so whatever is there, it is committed.
+      return true;
+    }
+
+    // Ignore the refresh request since we are confident that our snapshot
+    // seq is not going to be affected by concurrent compactions (not enabled
+    // yet).
+    void Refresh(SequenceNumber) override {}
+  };
+
+  // Iterate starting from the largest sequence number.
+  for (auto it = rtxn->batches_.rbegin(); it != rtxn->batches_.rend(); ++it) {
+    auto last_visible_txn = it->first - 1;
+    const auto& batch = it->second.batch_;
+    WriteBatch rollback_batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+                              w_options.protection_bytes_per_key,
+                              0 /* default_cf_ts_sz */);
+
+    struct RollbackWriteBatchBuilder : public WriteBatch::Handler {
+      DBImpl* db_;
+      ReadOptions roptions;
+      InvalidSnapshotReadCallback callback;
+      WriteBatch* rollback_batch_;
+      std::map<uint32_t, const Comparator*>& comparators_;
+      std::map<uint32_t, ColumnFamilyHandle*>& handles_;
+      using CFKeys = std::set<Slice, SetComparator>;
+      std::map<uint32_t, CFKeys> keys_;
+      bool rollback_merge_operands_;
+      RollbackWriteBatchBuilder(
+          DBImpl* db, SequenceNumber snap_seq, WriteBatch* dst_batch,
+          std::map<uint32_t, const Comparator*>& comparators,
+          std::map<uint32_t, ColumnFamilyHandle*>& handles,
+          bool rollback_merge_operands)
+          : db_(db),
+            callback(snap_seq),
+            // disable min_uncommitted optimization
+            rollback_batch_(dst_batch),
+            comparators_(comparators),
+            handles_(handles),
+            rollback_merge_operands_(rollback_merge_operands) {}
+
+      Status Rollback(uint32_t cf, const Slice& key) {
+        Status s;
+        CFKeys& cf_keys = keys_[cf];
+        if (cf_keys.size() == 0) {  // just inserted
+          auto cmp = comparators_[cf];
+          keys_[cf] = CFKeys(SetComparator(cmp));
+        }
+        auto res = cf_keys.insert(key);
+        if (res.second == false) {  // second is false if the element already
+                                    // existed.
+          return s;
+        }
+
+        PinnableSlice pinnable_val;
+        bool not_used;
+        auto cf_handle = handles_[cf];
+        DBImpl::GetImplOptions get_impl_options;
+        get_impl_options.column_family = cf_handle;
+        get_impl_options.value = &pinnable_val;
+        get_impl_options.value_found = &not_used;
+        get_impl_options.callback = &callback;
+        s = db_->GetImpl(roptions, key, get_impl_options);
+        assert(s.ok() || s.IsNotFound());
+        if (s.ok()) {
+          s = rollback_batch_->Put(cf_handle, key, pinnable_val);
+          assert(s.ok());
+        } else if (s.IsNotFound()) {
+          // There has been no readable value before txn. By adding a delete
+          // we make sure that there will be none afterwards either.
+          s = rollback_batch_->Delete(cf_handle, key);
+          assert(s.ok());
+        } else {
+          // Unexpected status. Return it to the user.
+        }
+        return s;
+      }
+
+      Status PutCF(uint32_t cf, const Slice& key,
+                   const Slice& /*val*/) override {
+        return Rollback(cf, key);
+      }
+
+      Status DeleteCF(uint32_t cf, const Slice& key) override {
+        return Rollback(cf, key);
+      }
+
+      Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+        return Rollback(cf, key);
+      }
+
+      Status MergeCF(uint32_t cf, const Slice& key,
+                     const Slice& /*val*/) override {
+        if (rollback_merge_operands_) {
+          return Rollback(cf, key);
+        } else {
+          return Status::OK();
+        }
+      }
+
+      // Recovered batches do not contain 2PC markers.
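+      // A batch that reaches this handler should consist solely of
+      // Put/Delete/SingleDelete/Merge records; the 2PC framing was already
+      // consumed when the WAL was replayed into the recovered transaction,
+      // so any marker encountered below indicates a malformed batch.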
+      Status MarkNoop(bool) override { return Status::InvalidArgument(); }
+      Status MarkBeginPrepare(bool) override {
+        return Status::InvalidArgument();
+      }
+      Status MarkEndPrepare(const Slice&) override {
+        return Status::InvalidArgument();
+      }
+      Status MarkCommit(const Slice&) override {
+        return Status::InvalidArgument();
+      }
+      Status MarkRollback(const Slice&) override {
+        return Status::InvalidArgument();
+      }
+    } rollback_handler(db_impl_, last_visible_txn, &rollback_batch,
+                       *cf_comp_map_shared_ptr.get(), *cf_map_shared_ptr.get(),
+                       txn_db_options_.rollback_merge_operands);
+
+    auto s = batch->Iterate(&rollback_handler);
+    if (!s.ok()) {
+      return s;
+    }
+
+    // The Rollback marker will be used as a batch separator
+    s = WriteBatchInternal::MarkRollback(&rollback_batch, rtxn->name_);
+    if (!s.ok()) {
+      return s;
+    }
+
+    const uint64_t kNoLogRef = 0;
+    const bool kDisableMemtable = true;
+    const size_t kOneBatch = 1;
+    uint64_t seq_used = kMaxSequenceNumber;
+    s = db_impl_->WriteImpl(w_options, &rollback_batch, nullptr, nullptr,
+                            kNoLogRef, !kDisableMemtable, &seq_used,
+                            kOneBatch);
+    if (!s.ok()) {
+      return s;
+    }
+
+    // If two_write_queues, we must manually release the sequence number to
+    // readers.
+    if (db_impl_->immutable_db_options().two_write_queues) {
+      db_impl_->SetLastPublishedSequence(seq_used);
+    }
+  }
+
+  return Status::OK();
+}
+
+Status WriteUnpreparedTxnDB::Initialize(
+    const std::vector<size_t>& compaction_enabled_cf_indices,
+    const std::vector<ColumnFamilyHandle*>& handles) {
+  // TODO(lth): Reduce code duplication in this function.
+  auto dbimpl = static_cast_with_check<DBImpl>(GetRootDB());
+  assert(dbimpl != nullptr);
+
+  db_impl_->SetSnapshotChecker(new WritePreparedSnapshotChecker(this));
+  // A callback to commit a single sub-batch
+  class CommitSubBatchPreReleaseCallback : public PreReleaseCallback {
+   public:
+    explicit CommitSubBatchPreReleaseCallback(WritePreparedTxnDB* db)
+        : db_(db) {}
+    Status Callback(SequenceNumber commit_seq,
+                    bool is_mem_disabled __attribute__((__unused__)), uint64_t,
+                    size_t /*index*/, size_t /*total*/) override {
+      assert(!is_mem_disabled);
+      db_->AddCommitted(commit_seq, commit_seq);
+      return Status::OK();
+    }
+
+   private:
+    WritePreparedTxnDB* db_;
+  };
+  db_impl_->SetRecoverableStatePreReleaseCallback(
+      new CommitSubBatchPreReleaseCallback(this));
+
+  // PessimisticTransactionDB::Initialize
+  for (auto cf_ptr : handles) {
+    AddColumnFamily(cf_ptr);
+  }
+  // Verify cf options
+  for (auto handle : handles) {
+    ColumnFamilyDescriptor cfd;
+    Status s = handle->GetDescriptor(&cfd);
+    if (!s.ok()) {
+      return s;
+    }
+    s = VerifyCFOptions(cfd.options);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // Re-enable compaction for the column families that initially had
+  // compaction enabled.
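+  // (compaction_enabled_cf_indices holds positions into `handles`, recorded
+  // when compaction was disabled during DB open; the loop below simply
+  // translates those indices back into column family handles so compaction
+  // can be re-enabled at the very end of recovery.)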
+  std::vector<ColumnFamilyHandle*> compaction_enabled_cf_handles;
+  compaction_enabled_cf_handles.reserve(compaction_enabled_cf_indices.size());
+  for (auto index : compaction_enabled_cf_indices) {
+    compaction_enabled_cf_handles.push_back(handles[index]);
+  }
+
+  // Create 'real' transactions from recovered shell transactions.
+  auto rtxns = dbimpl->recovered_transactions();
+  std::map<SequenceNumber, SequenceNumber> ordered_seq_cnt;
+  for (auto rtxn : rtxns) {
+    auto recovered_trx = rtxn.second;
+    assert(recovered_trx);
+    assert(recovered_trx->batches_.size() >= 1);
+    assert(recovered_trx->name_.length());
+
+    // We can only roll back transactions after AdvanceMaxEvictedSeq is
+    // called, but AddPrepared must occur before AdvanceMaxEvictedSeq, which
+    // is why two iterations are required.
+    if (recovered_trx->unprepared_) {
+      continue;
+    }
+
+    WriteOptions w_options;
+    w_options.sync = true;
+    TransactionOptions t_options;
+
+    auto first_log_number =
+        recovered_trx->batches_.begin()->second.log_number_;
+    auto first_seq = recovered_trx->batches_.begin()->first;
+    auto last_prepare_batch_cnt =
+        recovered_trx->batches_.begin()->second.batch_cnt_;
+
+    Transaction* real_trx = BeginTransaction(w_options, t_options, nullptr);
+    assert(real_trx);
+    auto wupt = static_cast_with_check<WriteUnpreparedTxn>(real_trx);
+    wupt->recovered_txn_ = true;
+
+    real_trx->SetLogNumber(first_log_number);
+    real_trx->SetId(first_seq);
+    Status s = real_trx->SetName(recovered_trx->name_);
+    if (!s.ok()) {
+      return s;
+    }
+    wupt->prepare_batch_cnt_ = last_prepare_batch_cnt;
+
+    for (auto batch : recovered_trx->batches_) {
+      const auto& seq = batch.first;
+      const auto& batch_info = batch.second;
+      auto cnt = batch_info.batch_cnt_ ? batch_info.batch_cnt_ : 1;
+      assert(batch_info.log_number_);
+
+      ordered_seq_cnt[seq] = cnt;
+      assert(wupt->unprep_seqs_.count(seq) == 0);
+      wupt->unprep_seqs_[seq] = cnt;
+
+      s = wupt->RebuildFromWriteBatch(batch_info.batch_);
+      assert(s.ok());
+      if (!s.ok()) {
+        return s;
+      }
+    }
+
+    const bool kClear = true;
+    wupt->InitWriteBatch(kClear);
+
+    real_trx->SetState(Transaction::PREPARED);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  // AddPrepared must be called in order.
+  for (auto seq_cnt : ordered_seq_cnt) {
+    auto seq = seq_cnt.first;
+    auto cnt = seq_cnt.second;
+    for (size_t i = 0; i < cnt; i++) {
+      AddPrepared(seq + i);
+    }
+  }
+
+  SequenceNumber prev_max = max_evicted_seq_;
+  SequenceNumber last_seq = db_impl_->GetLatestSequenceNumber();
+  AdvanceMaxEvictedSeq(prev_max, last_seq);
+  // Create a gap between max and the next snapshot. This simplifies the
+  // logic in IsInSnapshot by not having to consider the special case of
+  // max == snapshot after recovery. This is tested in
+  // IsInSnapshotEmptyMapTest.
+  if (last_seq) {
+    db_impl_->versions_->SetLastAllocatedSequence(last_seq + 1);
+    db_impl_->versions_->SetLastSequence(last_seq + 1);
+    db_impl_->versions_->SetLastPublishedSequence(last_seq + 1);
+  }
+
+  Status s;
+  // Roll back unprepared transactions.
+  for (auto rtxn : rtxns) {
+    auto recovered_trx = rtxn.second;
+    if (recovered_trx->unprepared_) {
+      s = RollbackRecoveredTransaction(recovered_trx);
+      if (!s.ok()) {
+        return s;
+      }
+      continue;
+    }
+  }
+
+  if (s.ok()) {
+    dbimpl->DeleteAllRecoveredTransactions();
+
+    // Compaction should start only after max_evicted_seq_ is set AND
+    // recovered transactions are either added to PrepareHeap or rolled back.
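+    // (If compaction were enabled any earlier, the snapshot checker could
+    // consult state that does not yet reflect the recovered transactions,
+    // and compaction could make incorrect visibility decisions about their
+    // yet-to-be-rolled-back or yet-to-be-committed writes.)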
+    s = EnableAutoCompaction(compaction_enabled_cf_handles);
+  }
+
+  return s;
+}
+
+Transaction* WriteUnpreparedTxnDB::BeginTransaction(
+    const WriteOptions& write_options, const TransactionOptions& txn_options,
+    Transaction* old_txn) {
+  if (old_txn != nullptr) {
+    ReinitializeTransaction(old_txn, write_options, txn_options);
+    return old_txn;
+  } else {
+    return new WriteUnpreparedTxn(this, write_options, txn_options);
+  }
+}
+
+// Struct to hold ownership of snapshot and read callback for iterator
+// cleanup.
+struct WriteUnpreparedTxnDB::IteratorState {
+  IteratorState(WritePreparedTxnDB* txn_db, SequenceNumber sequence,
+                std::shared_ptr<ManagedSnapshot> s,
+                SequenceNumber min_uncommitted, WriteUnpreparedTxn* txn)
+      : callback(txn_db, sequence, min_uncommitted, txn->unprep_seqs_,
+                 kBackedByDBSnapshot),
+        snapshot(s) {}
+  SequenceNumber MaxVisibleSeq() { return callback.max_visible_seq(); }
+
+  WriteUnpreparedTxnReadCallback callback;
+  std::shared_ptr<ManagedSnapshot> snapshot;
+};
+
+namespace {
+static void CleanupWriteUnpreparedTxnDBIterator(void* arg1, void* /*arg2*/) {
+  delete reinterpret_cast<WriteUnpreparedTxnDB::IteratorState*>(arg1);
+}
+}  // anonymous namespace
+
+Iterator* WriteUnpreparedTxnDB::NewIterator(const ReadOptions& options,
+                                            ColumnFamilyHandle* column_family,
+                                            WriteUnpreparedTxn* txn) {
+  // TODO(lth): Refactor so that this logic is shared with WritePrepared.
+  constexpr bool expose_blob_index = false;
+  constexpr bool allow_refresh = false;
+  std::shared_ptr<ManagedSnapshot> own_snapshot = nullptr;
+  SequenceNumber snapshot_seq = kMaxSequenceNumber;
+  SequenceNumber min_uncommitted = 0;
+
+  // Currently, the Prev() iterator logic does not work well without snapshot
+  // validation. The logic simply iterates through the values of a key in
+  // ascending seqno order, stopping at the first non-visible value and
+  // returning the last visible value.
+  //
+  // For example, if the snapshot sequence is 3, and we have the following
+  // keys:
+  // foo: v1 1
+  // foo: v2 2
+  // foo: v3 3
+  // foo: v4 4
+  // foo: v5 5
+  //
+  // Then 1, 2, 3 will be visible, but 4 will be non-visible, so we return v3,
+  // which is the last visible value.
+  //
+  // For unprepared transactions, if we have snap_seq = 3, but the current
+  // transaction has unprep_seq 5, then stopping at the first non-visible
+  // value would be incorrect, as we should return v5, not v3. The problem is
+  // that committed values may exist at snapshot_seq < commit_seq < unprep_seq.
+  //
+  // Snapshot validation can prevent this problem by ensuring that no
+  // committed values exist at snapshot_seq < commit_seq, and thus any value
+  // with a sequence number greater than snapshot_seq must be an unprepared
+  // value. For example, if the transaction had a snapshot at 3, then snapshot
+  // validation would be performed during the Put(v5) call. It would find v4,
+  // and the Put would fail with a snapshot validation failure.
+  //
+  // TODO(lth): Improve the Prev() logic to continue iterating until
+  // max_visible_seq, and then return the last visible value, so that this
+  // restriction can be lifted.
+  const Snapshot* snapshot = nullptr;
+  if (options.snapshot == nullptr) {
+    snapshot = GetSnapshot();
+    own_snapshot = std::make_shared<ManagedSnapshot>(db_impl_, snapshot);
+  } else {
+    snapshot = options.snapshot;
+  }
+
+  snapshot_seq = snapshot->GetSequenceNumber();
+  assert(snapshot_seq != kMaxSequenceNumber);
+  // Iteration is safe as long as largest_validated_seq <= snapshot_seq.
+  // We are guaranteed that for keys that were modified by this transaction
+  // (and thus might have unprepared values), no committed values exist at
+  // largest_validated_seq < commit_seq (or the contrapositive: any committed
+  // value must exist at commit_seq <= largest_validated_seq). This implies
+  // that commit_seq <= largest_validated_seq <= snapshot_seq, or
+  // commit_seq <= snapshot_seq. As explained above, the problem with Prev()
+  // only happens when snapshot_seq < commit_seq.
+  //
+  // For keys that were not modified by this transaction,
+  // largest_validated_seq_ is meaningless, and Prev() should just work with
+  // the existing visibility logic.
+  if (txn->largest_validated_seq_ > snapshot->GetSequenceNumber() &&
+      !txn->unprep_seqs_.empty()) {
+    ROCKS_LOG_ERROR(info_log_,
+                    "WriteUnprepared iterator creation failed since the "
+                    "transaction has performed unvalidated writes");
+    return nullptr;
+  }
+  min_uncommitted =
+      static_cast_with_check<const SnapshotImpl>(snapshot)->min_uncommitted_;
+
+  auto* cfd =
+      static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+  auto* state =
+      new IteratorState(this, snapshot_seq, own_snapshot, min_uncommitted,
+                        txn);
+  auto* db_iter = db_impl_->NewIteratorImpl(
+      options, cfd, state->MaxVisibleSeq(), &state->callback,
+      expose_blob_index, allow_refresh);
+  db_iter->RegisterCleanup(CleanupWriteUnpreparedTxnDBIterator, state,
+                           nullptr);
+  return db_iter;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_unprepared_txn_db.h b/src/rocksdb/utilities/transactions/write_unprepared_txn_db.h
new file mode 100644
index 000000000..c40e96d49
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/write_unprepared_txn_db.h
@@ -0,0 +1,108 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/write_prepared_txn_db.h"
+#include "utilities/transactions/write_unprepared_txn.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteUnpreparedTxn;
+
+class WriteUnpreparedTxnDB : public WritePreparedTxnDB {
+ public:
+  using WritePreparedTxnDB::WritePreparedTxnDB;
+
+  Status Initialize(const std::vector<size_t>& compaction_enabled_cf_indices,
+                    const std::vector<ColumnFamilyHandle*>& handles) override;
+
+  Transaction* BeginTransaction(const WriteOptions& write_options,
+                                const TransactionOptions& txn_options,
+                                Transaction* old_txn) override;
+
+  // Struct to hold ownership of snapshot and read callback for cleanup.
+  struct IteratorState;
+
+  using WritePreparedTxnDB::NewIterator;
+  Iterator* NewIterator(const ReadOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        WriteUnpreparedTxn* txn);
+
+ private:
+  Status RollbackRecoveredTransaction(
+      const DBImpl::RecoveredTransaction* rtxn);
+};
+
+class WriteUnpreparedCommitEntryPreReleaseCallback : public PreReleaseCallback {
+  // TODO(lth): Reduce code duplication with
+  // WritePreparedCommitEntryPreReleaseCallback.
+ public:
+  // includes_data indicates that the commit also writes a non-empty
+  // CommitTimeWriteBatch to the memtable, which needs to be committed
+  // separately.
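+  //
+  // (Hypothetical example: committing a transaction whose unprepared batches
+  // were written at seqs {10: 1 sub-batch, 20: 2 sub-batches}, together with
+  // a one-sub-batch CommitTimeWriteBatch, constructs this callback with
+  // data_batch_cnt = 1. Callback() below then marks seqs 10, 20, 21 and the
+  // commit batch itself as committed at the same commit sequence number.)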
+  WriteUnpreparedCommitEntryPreReleaseCallback(
+      WritePreparedTxnDB* db, DBImpl* db_impl,
+      const std::map<SequenceNumber, size_t>& unprep_seqs,
+      size_t data_batch_cnt = 0, bool publish_seq = true)
+      : db_(db),
+        db_impl_(db_impl),
+        unprep_seqs_(unprep_seqs),
+        data_batch_cnt_(data_batch_cnt),
+        includes_data_(data_batch_cnt_ > 0),
+        publish_seq_(publish_seq) {
+    assert(unprep_seqs.size() > 0);
+  }
+
+  virtual Status Callback(SequenceNumber commit_seq,
+                          bool is_mem_disabled __attribute__((__unused__)),
+                          uint64_t, size_t /*index*/,
+                          size_t /*total*/) override {
+    const uint64_t last_commit_seq = LIKELY(data_batch_cnt_ <= 1)
+                                         ? commit_seq
+                                         : commit_seq + data_batch_cnt_ - 1;
+    // Recall that unprep_seqs maps (un)prepared_seq => prepare_batch_cnt.
+    for (const auto& s : unprep_seqs_) {
+      for (size_t i = 0; i < s.second; i++) {
+        db_->AddCommitted(s.first + i, last_commit_seq);
+      }
+    }
+
+    if (includes_data_) {
+      assert(data_batch_cnt_);
+      // Commit the data that is accompanied with the commit request.
+      for (size_t i = 0; i < data_batch_cnt_; i++) {
+        // For the commit seq of each batch use the commit seq of the last
+        // batch. This makes debugging easier by having all the batches share
+        // the same sequence number.
+        db_->AddCommitted(commit_seq + i, last_commit_seq);
+      }
+    }
+    if (db_impl_->immutable_db_options().two_write_queues && publish_seq_) {
+      assert(is_mem_disabled);  // implies the 2nd queue
+      // Publish the sequence number. We can do that here assuming the
+      // callback is invoked only from one write queue, which would guarantee
+      // that the published sequence numbers are in order, i.e., once a seq
+      // is published, all seqs prior to it are also publishable.
+      db_impl_->SetLastPublishedSequence(last_commit_seq);
+    }
+    // Otherwise the sequence number that is updated as part of the write
+    // already does the publishing.
+    return Status::OK();
+  }
+
+ private:
+  WritePreparedTxnDB* db_;
+  DBImpl* db_impl_;
+  const std::map<SequenceNumber, size_t>& unprep_seqs_;
+  size_t data_batch_cnt_;
+  // Either because it is a commit without prepare or because it has a
+  // CommitTimeWriteBatch.
+  bool includes_data_;
+  // Whether the callback should also publish the commit seq number.
+  bool publish_seq_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
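To make the commit-map bookkeeping above concrete, here is a minimal standalone sketch (separate from the diff; the AddCommitted stand-in and all sequence numbers are made up for illustration) that mirrors how Callback() maps every unprepared sub-batch sequence, plus any CommitTimeWriteBatch sub-batches, onto a single last_commit_seq:

#include <cstdint>
#include <cstdio>
#include <map>

using SequenceNumber = uint64_t;

// Hypothetical stand-in for WritePreparedTxnDB::AddCommitted: records that
// everything written at prepare_seq became visible at commit_seq.
static void AddCommitted(SequenceNumber prepare_seq,
                         SequenceNumber commit_seq) {
  std::printf("prepare_seq %llu -> commit_seq %llu\n",
              (unsigned long long)prepare_seq,
              (unsigned long long)commit_seq);
}

int main() {
  // Two unprepared batches: one sub-batch at seq 10, two sub-batches at 20.
  std::map<SequenceNumber, size_t> unprep_seqs = {{10, 1}, {20, 2}};
  const size_t data_batch_cnt = 1;  // non-empty CommitTimeWriteBatch
  const SequenceNumber commit_seq = 30;

  // Mirrors the last_commit_seq computation in Callback().
  const SequenceNumber last_commit_seq =
      data_batch_cnt <= 1 ? commit_seq : commit_seq + data_batch_cnt - 1;

  // Unprepared sub-batches at seqs 10, 20 and 21 all commit at seq 30.
  for (const auto& s : unprep_seqs) {
    for (size_t i = 0; i < s.second; i++) {
      AddCommitted(s.first + i, last_commit_seq);
    }
  }
  // The CommitTimeWriteBatch sub-batch, written at the commit seq itself.
  for (size_t i = 0; i < data_batch_cnt; i++) {
    AddCommitted(commit_seq + i, last_commit_seq);
  }
  return 0;
}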