author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-27 18:24:20 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-27 18:24:20 +0000
commit     483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch)
tree       e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/rocksdb/utilities/persistent_cache/hash_table.h
parent     Initial commit. (diff)
Adding upstream version 14.2.21.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/rocksdb/utilities/persistent_cache/hash_table.h')
-rw-r--r--  src/rocksdb/utilities/persistent_cache/hash_table.h  238
1 file changed, 238 insertions(+), 0 deletions(-)
diff --git a/src/rocksdb/utilities/persistent_cache/hash_table.h b/src/rocksdb/utilities/persistent_cache/hash_table.h
new file mode 100644
index 00000000..36d8327e
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/hash_table.h
@@ -0,0 +1,238 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <assert.h>
+#include <list>
+#include <memory>
+#include <vector>
+
+#ifdef OS_LINUX
+#include <sys/mman.h>
+#endif
+
+#include "include/rocksdb/env.h"
+#include "util/mutexlock.h"
+
+namespace rocksdb {
+
+// HashTable<T, Hash, Equal>
+//
+// Traditional hash table implementations with synchronization built on top
+// do not perform well in multi-core scenarios. This is an implementation
+// designed for multi-core scenarios with high lock contention.
+//
+//                        |<-------- alpha ------------->|
+//              Buckets            Collision list
+//         ---- +----+    +---+---+--- ...... ---+---+---+
+//        /     |    |--->|   |   |              |   |   |
+//       /      +----+    +---+---+--- ...... ---+---+---+
+//      /       |    |
+// Locks/       +----+
+// +--+/        .    .
+// |  |         .    .
+// +--+         .    .
+// |  |         .    .
+// +--+         .    .
+// |  |         .    .
+// +--+         .    .
+//     \        +----+
+//      \       |    |
+//       \      +----+
+//        \     |    |
+//         \----+----+
+//
+// The lock contention is spread over an array of locks. This helps improve
+// concurrent access. The spine is designed for a certain capacity and load
+// factor. When the capacity planning is done correctly we can expect
+// O(load_factor = 1) insert, access and remove time.
+//
+// A micro-benchmark on a debug build gives a rate of about 0.5 million
+// inserts, erases and lookups per second in parallel (about 1.5 million
+// ops/sec in total). With 4K blocks, the hash table can support a virtual
+// throughput of 6 GB/s.
+//
+// T     Object type (contains both key and value)
+// Hash  Function that returns a hash from type T
+// Equal Returns whether two objects are equal
+//       (an explicit equality functor is needed for pointer types)
+//
+template <class T, class Hash, class Equal>
+class HashTable {
+ public:
+  explicit HashTable(const size_t capacity = 1024 * 1024,
+                     const float load_factor = 2.0,
+                     const uint32_t nlocks = 256)
+      : nbuckets_(
+            static_cast<uint32_t>(load_factor ? capacity / load_factor : 0)),
+        nlocks_(nlocks) {
+    // pre-conditions
+    assert(capacity);
+    assert(load_factor);
+    assert(nbuckets_);
+    assert(nlocks_);
+
+    buckets_.reset(new Bucket[nbuckets_]);
+#ifdef OS_LINUX
+    mlock(buckets_.get(), nbuckets_ * sizeof(Bucket));
+#endif
+
+    // initialize locks
+    locks_.reset(new port::RWMutex[nlocks_]);
+#ifdef OS_LINUX
+    mlock(locks_.get(), nlocks_ * sizeof(port::RWMutex));
+#endif
+
+    // post-conditions
+    assert(buckets_);
+    assert(locks_);
+  }
+
+  virtual ~HashTable() { AssertEmptyBuckets(); }
+
+  //
+  // Insert a given record into the hash table
+  //
+  bool Insert(const T& t) {
+    const uint64_t h = Hash()(t);
+    const uint32_t bucket_idx = h % nbuckets_;
+    const uint32_t lock_idx = bucket_idx % nlocks_;
+
+    WriteLock _(&locks_[lock_idx]);
+    auto& bucket = buckets_[bucket_idx];
+    return Insert(&bucket, t);
+  }
+
+  // Lookup hash table
+  //
+  // Please note that on a successful lookup the read lock remains held and is
+  // returned via ret_lock. This is because the caller owns the data, and
+  // should hold the read lock for as long as they operate on the data.
+  bool Find(const T& t, T* ret, port::RWMutex** ret_lock) {
+    const uint64_t h = Hash()(t);
+    const uint32_t bucket_idx = h % nbuckets_;
+    const uint32_t lock_idx = bucket_idx % nlocks_;
+
+    port::RWMutex& lock = locks_[lock_idx];
+    lock.ReadLock();
+
+    auto& bucket = buckets_[bucket_idx];
+    if (Find(&bucket, t, ret)) {
+      *ret_lock = &lock;
+      return true;
+    }
+
+    lock.ReadUnlock();
+    return false;
+  }
+
+  //
+  // Erase a given key from the hash table
+  //
+  bool Erase(const T& t, T* ret) {
+    const uint64_t h = Hash()(t);
+    const uint32_t bucket_idx = h % nbuckets_;
+    const uint32_t lock_idx = bucket_idx % nlocks_;
+
+    WriteLock _(&locks_[lock_idx]);
+
+    auto& bucket = buckets_[bucket_idx];
+    return Erase(&bucket, t, ret);
+  }
+
+  // Fetch the mutex associated with a key
+  // This call is used to hold the lock on a given piece of data for an
+  // extended period of time.
+  port::RWMutex* GetMutex(const T& t) {
+    const uint64_t h = Hash()(t);
+    const uint32_t bucket_idx = h % nbuckets_;
+    const uint32_t lock_idx = bucket_idx % nlocks_;
+
+    return &locks_[lock_idx];
+  }
+
+  void Clear(void (*fn)(T)) {
+    for (uint32_t i = 0; i < nbuckets_; ++i) {
+      const uint32_t lock_idx = i % nlocks_;
+      WriteLock _(&locks_[lock_idx]);
+      for (auto& t : buckets_[i].list_) {
+        (*fn)(t);
+      }
+      buckets_[i].list_.clear();
+    }
+  }
+
+ protected:
+  // Models a bucket of keys that hash to the same bucket number
+  struct Bucket {
+    std::list<T> list_;
+  };
+
+  // Substitute for std::find with a custom comparator
+  typename std::list<T>::iterator Find(std::list<T>* list, const T& t) {
+    for (auto it = list->begin(); it != list->end(); ++it) {
+      if (Equal()(*it, t)) {
+        return it;
+      }
+    }
+    return list->end();
+  }
+
+  bool Insert(Bucket* bucket, const T& t) {
+    // Check if the key already exists
+    auto it = Find(&bucket->list_, t);
+    if (it != bucket->list_.end()) {
+      return false;
+    }
+
+    // insert into the bucket
+    bucket->list_.push_back(t);
+    return true;
+  }
+
+  bool Find(Bucket* bucket, const T& t, T* ret) {
+    auto it = Find(&bucket->list_, t);
+    if (it != bucket->list_.end()) {
+      if (ret) {
+        *ret = *it;
+      }
+      return true;
+    }
+    return false;
+  }
+
+  bool Erase(Bucket* bucket, const T& t, T* ret) {
+    auto it = Find(&bucket->list_, t);
+    if (it != bucket->list_.end()) {
+      if (ret) {
+        *ret = *it;
+      }
+
+      bucket->list_.erase(it);
+      return true;
+    }
+    return false;
+  }
+
+  // Assert that all buckets are empty
+  void AssertEmptyBuckets() {
+#ifndef NDEBUG
+    for (size_t i = 0; i < nbuckets_; ++i) {
+      WriteLock _(&locks_[i % nlocks_]);
+      assert(buckets_[i].list_.empty());
+    }
+#endif
+  }
+
+  const uint32_t nbuckets_;                 // No. of buckets in the spine
+  std::unique_ptr<Bucket[]> buckets_;       // Spine of the hash buckets
+  const uint32_t nlocks_;                   // No. of locks
+  std::unique_ptr<port::RWMutex[]> locks_;  // Granular locks
+};
+
+}  // namespace rocksdb
+
+#endif
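
The lock-handoff in Find is the easiest part of this API to misuse, so a minimal caller-side sketch follows (not part of the commit). The CacheRecord, RecordHash, RecordEqual, and Example names are hypothetical, and the include path assumes the RocksDB source tree is on the compiler's include path; the header itself only requires that Hash()(t) return an integer hash and Equal()(a, b) compare two T values.

    #include <cstdint>
    #include <functional>
    #include <string>

    #include "utilities/persistent_cache/hash_table.h"  // path is an assumption

    // Hypothetical record type; HashTable stores T by value, so T carries
    // both the key and the value.
    struct CacheRecord {
      std::string key;
      void* block = nullptr;
    };

    // Hypothetical hash functor; any function returning a 64-bit hash works.
    struct RecordHash {
      uint64_t operator()(const CacheRecord& r) const {
        return std::hash<std::string>()(r.key);
      }
    };

    // Hypothetical equality functor; compares records by key only.
    struct RecordEqual {
      bool operator()(const CacheRecord& a, const CacheRecord& b) const {
        return a.key == b.key;
      }
    };

    void Example() {
      rocksdb::HashTable<CacheRecord, RecordHash, RecordEqual> table;

      // Insert takes the stripe's write lock internally.
      table.Insert(CacheRecord{"block-42", nullptr});

      // On success, Find returns with the stripe's read lock still held and
      // hands it back via the out-parameter; the caller must release it.
      CacheRecord found;
      rocksdb::port::RWMutex* lock = nullptr;
      if (table.Find(CacheRecord{"block-42", nullptr}, &found, &lock)) {
        // ... read `found` under the read lock ...
        lock->ReadUnlock();
      }

      // Erase takes the same stripe's write lock, so the read lock from Find
      // must be released before calling it. Emptying the table also keeps the
      // destructor's AssertEmptyBuckets() happy in debug builds.
      CacheRecord erased;
      table.Erase(CacheRecord{"block-42", nullptr}, &erased);
    }

The point the sketch exercises is the lock striping: Insert, Find and Erase all map the key's hash to a bucket and then to one of nlocks_ reader-writer locks, so operations on unrelated keys rarely contend on the same mutex.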