// Copyright (c) 2018-Present Red Hat Inc. All rights reserved. // // Copyright (c) 2011-2018, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 and Apache 2.0 License // // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. #ifndef ROCKSDB_BINNED_LRU_CACHE #define ROCKSDB_BINNED_LRU_CACHE #include #include #include "ShardedCache.h" #include "common/autovector.h" #include "common/dout.h" #include "include/ceph_assert.h" #include "common/ceph_context.h" namespace rocksdb_cache { // LRU cache implementation // An entry is a variable length heap-allocated structure. // Entries are referenced by cache and/or by any external entity. // The cache keeps all its entries in table. Some elements // are also stored on LRU list. // // BinnedLRUHandle can be in these states: // 1. Referenced externally AND in hash table. // In that case the entry is *not* in the LRU. (refs > 1 && in_cache == true) // 2. Not referenced externally and in hash table. In that case the entry is // in the LRU and can be freed. (refs == 1 && in_cache == true) // 3. Referenced externally and not in hash table. In that case the entry is // in not on LRU and not in table. (refs >= 1 && in_cache == false) // // All newly created BinnedLRUHandles are in state 1. If you call // BinnedLRUCacheShard::Release // on entry in state 1, it will go into state 2. To move from state 1 to // state 3, either call BinnedLRUCacheShard::Erase or BinnedLRUCacheShard::Insert with the // same key. // To move from state 2 to state 1, use BinnedLRUCacheShard::Lookup. // Before destruction, make sure that no handles are in state 1. This means // that any successful BinnedLRUCacheShard::Lookup/BinnedLRUCacheShard::Insert have a // matching // RUCache::Release (to move into state 2) or BinnedLRUCacheShard::Erase (for state 3) std::shared_ptr NewBinnedLRUCache( CephContext *c, size_t capacity, int num_shard_bits = -1, bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.0); struct BinnedLRUHandle { void* value; void (*deleter)(const rocksdb::Slice&, void* value); BinnedLRUHandle* next_hash; BinnedLRUHandle* next; BinnedLRUHandle* prev; size_t charge; // TODO(opt): Only allow uint32_t? size_t key_length; uint32_t refs; // a number of refs to this entry // cache itself is counted as 1 // Include the following flags: // in_cache: whether this entry is referenced by the hash table. // is_high_pri: whether this entry is high priority entry. // in_high_pri_pool: whether this entry is in high-pri pool. char flags; uint32_t hash; // Hash of key(); used for fast sharding and comparisons char* key_data = nullptr; // Beginning of key rocksdb::Slice key() const { // For cheaper lookups, we allow a temporary Handle object // to store a pointer to a key in "value". if (next == this) { return *(reinterpret_cast(value)); } else { return rocksdb::Slice(key_data, key_length); } } bool InCache() { return flags & 1; } bool IsHighPri() { return flags & 2; } bool InHighPriPool() { return flags & 4; } bool HasHit() { return flags & 8; } void SetInCache(bool in_cache) { if (in_cache) { flags |= 1; } else { flags &= ~1; } } void SetPriority(rocksdb::Cache::Priority priority) { if (priority == rocksdb::Cache::Priority::HIGH) { flags |= 2; } else { flags &= ~2; } } void SetInHighPriPool(bool in_high_pri_pool) { if (in_high_pri_pool) { flags |= 4; } else { flags &= ~4; } } void SetHit() { flags |= 8; } void Free() { ceph_assert((refs == 1 && InCache()) || (refs == 0 && !InCache())); if (deleter) { (*deleter)(key(), value); } delete[] key_data; delete this; } }; // We provide our own simple hash table since it removes a whole bunch // of porting hacks and is also faster than some of the built-in hash // table implementations in some of the compiler/runtime combinations // we have tested. E.g., readrandom speeds up by ~5% over the g++ // 4.4.3's builtin hashtable. class BinnedLRUHandleTable { public: BinnedLRUHandleTable(); ~BinnedLRUHandleTable(); BinnedLRUHandle* Lookup(const rocksdb::Slice& key, uint32_t hash); BinnedLRUHandle* Insert(BinnedLRUHandle* h); BinnedLRUHandle* Remove(const rocksdb::Slice& key, uint32_t hash); template void ApplyToAllCacheEntries(T func) { for (uint32_t i = 0; i < length_; i++) { BinnedLRUHandle* h = list_[i]; while (h != nullptr) { auto n = h->next_hash; ceph_assert(h->InCache()); func(h); h = n; } } } private: // Return a pointer to slot that points to a cache entry that // matches key/hash. If there is no such cache entry, return a // pointer to the trailing slot in the corresponding linked list. BinnedLRUHandle** FindPointer(const rocksdb::Slice& key, uint32_t hash); void Resize(); // The table consists of an array of buckets where each bucket is // a linked list of cache entries that hash into the bucket. BinnedLRUHandle** list_; uint32_t length_; uint32_t elems_; }; // A single shard of sharded cache. class alignas(CACHE_LINE_SIZE) BinnedLRUCacheShard : public CacheShard { public: BinnedLRUCacheShard(size_t capacity, bool strict_capacity_limit, double high_pri_pool_ratio); virtual ~BinnedLRUCacheShard(); // Separate from constructor so caller can easily make an array of BinnedLRUCache // if current usage is more than new capacity, the function will attempt to // free the needed space virtual void SetCapacity(size_t capacity) override; // Set the flag to reject insertion if cache if full. virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override; // Set percentage of capacity reserved for high-pri cache entries. void SetHighPriPoolRatio(double high_pri_pool_ratio); // Like Cache methods, but with an extra "hash" parameter. virtual rocksdb::Status Insert(const rocksdb::Slice& key, uint32_t hash, void* value, size_t charge, void (*deleter)(const rocksdb::Slice& key, void* value), rocksdb::Cache::Handle** handle, rocksdb::Cache::Priority priority) override; virtual rocksdb::Cache::Handle* Lookup(const rocksdb::Slice& key, uint32_t hash) override; virtual bool Ref(rocksdb::Cache::Handle* handle) override; virtual bool Release(rocksdb::Cache::Handle* handle, bool force_erase = false) override; virtual void Erase(const rocksdb::Slice& key, uint32_t hash) override; // Although in some platforms the update of size_t is atomic, to make sure // GetUsage() and GetPinnedUsage() work correctly under any platform, we'll // protect them with mutex_. virtual size_t GetUsage() const override; virtual size_t GetPinnedUsage() const override; virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t), bool thread_safe) override; virtual void EraseUnRefEntries() override; virtual std::string GetPrintableOptions() const override; void TEST_GetLRUList(BinnedLRUHandle** lru, BinnedLRUHandle** lru_low_pri); // Retrieves number of elements in LRU, for unit test purpose only // not threadsafe size_t TEST_GetLRUSize(); // Retrieves high pri pool ratio double GetHighPriPoolRatio() const; // Retrieves high pri pool usage size_t GetHighPriPoolUsage() const; private: void LRU_Remove(BinnedLRUHandle* e); void LRU_Insert(BinnedLRUHandle* e); // Overflow the last entry in high-pri pool to low-pri pool until size of // high-pri pool is no larger than the size specify by high_pri_pool_pct. void MaintainPoolSize(); // Just reduce the reference count by 1. // Return true if last reference bool Unref(BinnedLRUHandle* e); // Free some space following strict LRU policy until enough space // to hold (usage_ + charge) is freed or the lru list is empty // This function is not thread safe - it needs to be executed while // holding the mutex_ void EvictFromLRU(size_t charge, ceph::autovector* deleted); // Initialized before use. size_t capacity_; // Memory size for entries in high-pri pool. size_t high_pri_pool_usage_; // Whether to reject insertion if cache reaches its full capacity. bool strict_capacity_limit_; // Ratio of capacity reserved for high priority cache entries. double high_pri_pool_ratio_; // High-pri pool size, equals to capacity * high_pri_pool_ratio. // Remember the value to avoid recomputing each time. double high_pri_pool_capacity_; // Dummy head of LRU list. // lru.prev is newest entry, lru.next is oldest entry. // LRU contains items which can be evicted, ie reference only by cache BinnedLRUHandle lru_; // Pointer to head of low-pri pool in LRU list. BinnedLRUHandle* lru_low_pri_; // ------------^^^^^^^^^^^^^----------- // Not frequently modified data members // ------------------------------------ // // We separate data members that are updated frequently from the ones that // are not frequently updated so that they don't share the same cache line // which will lead into false cache sharing // // ------------------------------------ // Frequently modified data members // ------------vvvvvvvvvvvvv----------- BinnedLRUHandleTable table_; // Memory size for entries residing in the cache size_t usage_; // Memory size for entries residing only in the LRU list size_t lru_usage_; // mutex_ protects the following state. // We don't count mutex_ as the cache's internal state so semantically we // don't mind mutex_ invoking the non-const actions. mutable std::mutex mutex_; }; class BinnedLRUCache : public ShardedCache { public: BinnedLRUCache(CephContext *c, size_t capacity, int num_shard_bits, bool strict_capacity_limit, double high_pri_pool_ratio); virtual ~BinnedLRUCache(); virtual const char* Name() const override { return "BinnedLRUCache"; } virtual CacheShard* GetShard(int shard) override; virtual const CacheShard* GetShard(int shard) const override; virtual void* Value(Handle* handle) override; virtual size_t GetCharge(Handle* handle) const override; virtual uint32_t GetHash(Handle* handle) const override; virtual void DisownData() override; // Retrieves number of elements in LRU, for unit test purpose only size_t TEST_GetLRUSize(); // Sets the high pri pool ratio void SetHighPriPoolRatio(double high_pri_pool_ratio); // Retrieves high pri pool ratio double GetHighPriPoolRatio() const; // Retrieves high pri pool usage size_t GetHighPriPoolUsage() const; // PriorityCache virtual int64_t request_cache_bytes( PriorityCache::Priority pri, uint64_t total_cache) const; virtual int64_t commit_cache_size(uint64_t total_cache); virtual int64_t get_committed_size() const { return GetCapacity(); } virtual std::string get_cache_name() const { return "RocksDB Binned LRU Cache"; } private: CephContext *cct; BinnedLRUCacheShard* shards_; int num_shards_ = 0; }; } // namespace rocksdb_cache #endif // ROCKSDB_BINNED_LRU_CACHE