diff options
Diffstat (limited to 'src/kv/rocksdb_cache/BinnedLRUCache.h')
-rw-r--r-- | src/kv/rocksdb_cache/BinnedLRUCache.h | 335 |
1 files changed, 335 insertions, 0 deletions
diff --git a/src/kv/rocksdb_cache/BinnedLRUCache.h b/src/kv/rocksdb_cache/BinnedLRUCache.h new file mode 100644 index 00000000..96023ce2 --- /dev/null +++ b/src/kv/rocksdb_cache/BinnedLRUCache.h @@ -0,0 +1,335 @@ +// Copyright (c) 2018-Present Red Hat Inc. All rights reserved. +// +// Copyright (c) 2011-2018, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 and Apache 2.0 License +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef ROCKSDB_BINNED_LRU_CACHE +#define ROCKSDB_BINNED_LRU_CACHE + +#include <string> +#include <mutex> + +#include "ShardedCache.h" +#include "common/autovector.h" +#include "common/dout.h" +#include "include/ceph_assert.h" +#include "common/ceph_context.h" + +namespace rocksdb_cache { + +// LRU cache implementation + +// An entry is a variable length heap-allocated structure. +// Entries are referenced by cache and/or by any external entity. +// The cache keeps all its entries in table. Some elements +// are also stored on LRU list. +// +// BinnedLRUHandle can be in these states: +// 1. Referenced externally AND in hash table. +// In that case the entry is *not* in the LRU. (refs > 1 && in_cache == true) +// 2. Not referenced externally and in hash table. In that case the entry is +// in the LRU and can be freed. (refs == 1 && in_cache == true) +// 3. Referenced externally and not in hash table. In that case the entry is +// in not on LRU and not in table. (refs >= 1 && in_cache == false) +// +// All newly created BinnedLRUHandles are in state 1. If you call +// BinnedLRUCacheShard::Release +// on entry in state 1, it will go into state 2. To move from state 1 to +// state 3, either call BinnedLRUCacheShard::Erase or BinnedLRUCacheShard::Insert with the +// same key. +// To move from state 2 to state 1, use BinnedLRUCacheShard::Lookup. +// Before destruction, make sure that no handles are in state 1. This means +// that any successful BinnedLRUCacheShard::Lookup/BinnedLRUCacheShard::Insert have a +// matching +// RUCache::Release (to move into state 2) or BinnedLRUCacheShard::Erase (for state 3) + +std::shared_ptr<rocksdb::Cache> NewBinnedLRUCache( + CephContext *c, + size_t capacity, + int num_shard_bits = -1, + bool strict_capacity_limit = false, + double high_pri_pool_ratio = 0.0); + +struct BinnedLRUHandle { + void* value; + void (*deleter)(const rocksdb::Slice&, void* value); + BinnedLRUHandle* next_hash; + BinnedLRUHandle* next; + BinnedLRUHandle* prev; + size_t charge; // TODO(opt): Only allow uint32_t? + size_t key_length; + uint32_t refs; // a number of refs to this entry + // cache itself is counted as 1 + + // Include the following flags: + // in_cache: whether this entry is referenced by the hash table. + // is_high_pri: whether this entry is high priority entry. + // in_high_pri_pool: whether this entry is in high-pri pool. + char flags; + + uint32_t hash; // Hash of key(); used for fast sharding and comparisons + + char* key_data = nullptr; // Beginning of key + + rocksdb::Slice key() const { + // For cheaper lookups, we allow a temporary Handle object + // to store a pointer to a key in "value". + if (next == this) { + return *(reinterpret_cast<rocksdb::Slice*>(value)); + } else { + return rocksdb::Slice(key_data, key_length); + } + } + + bool InCache() { return flags & 1; } + bool IsHighPri() { return flags & 2; } + bool InHighPriPool() { return flags & 4; } + bool HasHit() { return flags & 8; } + + void SetInCache(bool in_cache) { + if (in_cache) { + flags |= 1; + } else { + flags &= ~1; + } + } + + void SetPriority(rocksdb::Cache::Priority priority) { + if (priority == rocksdb::Cache::Priority::HIGH) { + flags |= 2; + } else { + flags &= ~2; + } + } + + void SetInHighPriPool(bool in_high_pri_pool) { + if (in_high_pri_pool) { + flags |= 4; + } else { + flags &= ~4; + } + } + + void SetHit() { flags |= 8; } + + void Free() { + ceph_assert((refs == 1 && InCache()) || (refs == 0 && !InCache())); + if (deleter) { + (*deleter)(key(), value); + } + delete[] key_data; + delete this; + } +}; + +// We provide our own simple hash table since it removes a whole bunch +// of porting hacks and is also faster than some of the built-in hash +// table implementations in some of the compiler/runtime combinations +// we have tested. E.g., readrandom speeds up by ~5% over the g++ +// 4.4.3's builtin hashtable. +class BinnedLRUHandleTable { + public: + BinnedLRUHandleTable(); + ~BinnedLRUHandleTable(); + + BinnedLRUHandle* Lookup(const rocksdb::Slice& key, uint32_t hash); + BinnedLRUHandle* Insert(BinnedLRUHandle* h); + BinnedLRUHandle* Remove(const rocksdb::Slice& key, uint32_t hash); + + template <typename T> + void ApplyToAllCacheEntries(T func) { + for (uint32_t i = 0; i < length_; i++) { + BinnedLRUHandle* h = list_[i]; + while (h != nullptr) { + auto n = h->next_hash; + ceph_assert(h->InCache()); + func(h); + h = n; + } + } + } + + private: + // Return a pointer to slot that points to a cache entry that + // matches key/hash. If there is no such cache entry, return a + // pointer to the trailing slot in the corresponding linked list. + BinnedLRUHandle** FindPointer(const rocksdb::Slice& key, uint32_t hash); + + void Resize(); + + // The table consists of an array of buckets where each bucket is + // a linked list of cache entries that hash into the bucket. + BinnedLRUHandle** list_; + uint32_t length_; + uint32_t elems_; +}; + +// A single shard of sharded cache. +class alignas(CACHE_LINE_SIZE) BinnedLRUCacheShard : public CacheShard { + public: + BinnedLRUCacheShard(size_t capacity, bool strict_capacity_limit, + double high_pri_pool_ratio); + virtual ~BinnedLRUCacheShard(); + + // Separate from constructor so caller can easily make an array of BinnedLRUCache + // if current usage is more than new capacity, the function will attempt to + // free the needed space + virtual void SetCapacity(size_t capacity) override; + + // Set the flag to reject insertion if cache if full. + virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override; + + // Set percentage of capacity reserved for high-pri cache entries. + void SetHighPriPoolRatio(double high_pri_pool_ratio); + + // Like Cache methods, but with an extra "hash" parameter. + virtual rocksdb::Status Insert(const rocksdb::Slice& key, uint32_t hash, void* value, + size_t charge, + void (*deleter)(const rocksdb::Slice& key, void* value), + rocksdb::Cache::Handle** handle, + rocksdb::Cache::Priority priority) override; + virtual rocksdb::Cache::Handle* Lookup(const rocksdb::Slice& key, uint32_t hash) override; + virtual bool Ref(rocksdb::Cache::Handle* handle) override; + virtual bool Release(rocksdb::Cache::Handle* handle, + bool force_erase = false) override; + virtual void Erase(const rocksdb::Slice& key, uint32_t hash) override; + + // Although in some platforms the update of size_t is atomic, to make sure + // GetUsage() and GetPinnedUsage() work correctly under any platform, we'll + // protect them with mutex_. + + virtual size_t GetUsage() const override; + virtual size_t GetPinnedUsage() const override; + + virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t), + bool thread_safe) override; + + virtual void EraseUnRefEntries() override; + + virtual std::string GetPrintableOptions() const override; + + void TEST_GetLRUList(BinnedLRUHandle** lru, BinnedLRUHandle** lru_low_pri); + + // Retrieves number of elements in LRU, for unit test purpose only + // not threadsafe + size_t TEST_GetLRUSize(); + + // Retrieves high pri pool ratio + double GetHighPriPoolRatio() const; + + // Retrieves high pri pool usage + size_t GetHighPriPoolUsage() const; + + private: + void LRU_Remove(BinnedLRUHandle* e); + void LRU_Insert(BinnedLRUHandle* e); + + // Overflow the last entry in high-pri pool to low-pri pool until size of + // high-pri pool is no larger than the size specify by high_pri_pool_pct. + void MaintainPoolSize(); + + // Just reduce the reference count by 1. + // Return true if last reference + bool Unref(BinnedLRUHandle* e); + + // Free some space following strict LRU policy until enough space + // to hold (usage_ + charge) is freed or the lru list is empty + // This function is not thread safe - it needs to be executed while + // holding the mutex_ + void EvictFromLRU(size_t charge, ceph::autovector<BinnedLRUHandle*>* deleted); + + // Initialized before use. + size_t capacity_; + + // Memory size for entries in high-pri pool. + size_t high_pri_pool_usage_; + + // Whether to reject insertion if cache reaches its full capacity. + bool strict_capacity_limit_; + + // Ratio of capacity reserved for high priority cache entries. + double high_pri_pool_ratio_; + + // High-pri pool size, equals to capacity * high_pri_pool_ratio. + // Remember the value to avoid recomputing each time. + double high_pri_pool_capacity_; + + // Dummy head of LRU list. + // lru.prev is newest entry, lru.next is oldest entry. + // LRU contains items which can be evicted, ie reference only by cache + BinnedLRUHandle lru_; + + // Pointer to head of low-pri pool in LRU list. + BinnedLRUHandle* lru_low_pri_; + + // ------------^^^^^^^^^^^^^----------- + // Not frequently modified data members + // ------------------------------------ + // + // We separate data members that are updated frequently from the ones that + // are not frequently updated so that they don't share the same cache line + // which will lead into false cache sharing + // + // ------------------------------------ + // Frequently modified data members + // ------------vvvvvvvvvvvvv----------- + BinnedLRUHandleTable table_; + + // Memory size for entries residing in the cache + size_t usage_; + + // Memory size for entries residing only in the LRU list + size_t lru_usage_; + + // mutex_ protects the following state. + // We don't count mutex_ as the cache's internal state so semantically we + // don't mind mutex_ invoking the non-const actions. + mutable std::mutex mutex_; +}; + +class BinnedLRUCache : public ShardedCache { + public: + BinnedLRUCache(CephContext *c, size_t capacity, int num_shard_bits, + bool strict_capacity_limit, double high_pri_pool_ratio); + virtual ~BinnedLRUCache(); + virtual const char* Name() const override { return "BinnedLRUCache"; } + virtual CacheShard* GetShard(int shard) override; + virtual const CacheShard* GetShard(int shard) const override; + virtual void* Value(Handle* handle) override; + virtual size_t GetCharge(Handle* handle) const override; + virtual uint32_t GetHash(Handle* handle) const override; + virtual void DisownData() override; + + // Retrieves number of elements in LRU, for unit test purpose only + size_t TEST_GetLRUSize(); + // Sets the high pri pool ratio + void SetHighPriPoolRatio(double high_pri_pool_ratio); + // Retrieves high pri pool ratio + double GetHighPriPoolRatio() const; + // Retrieves high pri pool usage + size_t GetHighPriPoolUsage() const; + + // PriorityCache + virtual int64_t request_cache_bytes( + PriorityCache::Priority pri, uint64_t total_cache) const; + virtual int64_t commit_cache_size(uint64_t total_cache); + virtual int64_t get_committed_size() const { + return GetCapacity(); + } + virtual std::string get_cache_name() const { + return "RocksDB Binned LRU Cache"; + } + + private: + CephContext *cct; + BinnedLRUCacheShard* shards_; + int num_shards_ = 0; +}; + +} // namespace rocksdb_cache + +#endif // ROCKSDB_BINNED_LRU_CACHE |