Diffstat (limited to 'src/rocksdb/cache/clock_cache.cc')
-rw-r--r-- | src/rocksdb/cache/clock_cache.cc | 1404 |
1 files changed, 1404 insertions, 0 deletions
diff --git a/src/rocksdb/cache/clock_cache.cc b/src/rocksdb/cache/clock_cache.cc
new file mode 100644
index 000000000..6c9f18c2f
--- /dev/null
+++ b/src/rocksdb/cache/clock_cache.cc
@@ -0,0 +1,1404 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "cache/clock_cache.h"
+
+#include <cassert>
+#include <functional>
+#include <numeric>
+
+#include "cache/cache_key.h"
+#include "logging/logging.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/statistics.h"
+#include "port/lang.h"
+#include "util/hash.h"
+#include "util/math.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace clock_cache {
+
+namespace {
+inline uint64_t GetRefcount(uint64_t meta) {
+  return ((meta >> ClockHandle::kAcquireCounterShift) -
+          (meta >> ClockHandle::kReleaseCounterShift)) &
+         ClockHandle::kCounterMask;
+}
+
+inline uint64_t GetInitialCountdown(Cache::Priority priority) {
+  // Set initial clock data from priority
+  // TODO: configuration parameters for priority handling and clock cycle
+  // count?
+  switch (priority) {
+    case Cache::Priority::HIGH:
+      return ClockHandle::kHighCountdown;
+    default:
+      assert(false);
+      FALLTHROUGH_INTENDED;
+    case Cache::Priority::LOW:
+      return ClockHandle::kLowCountdown;
+    case Cache::Priority::BOTTOM:
+      return ClockHandle::kBottomCountdown;
+  }
+}
+
+inline void FreeDataMarkEmpty(ClockHandle& h) {
+  // NOTE: in theory there's more room for parallelism if we copy the handle
+  // data and delay actions like this until after marking the entry as empty,
+  // but performance tests only show a regression by copying the few words
+  // of data.
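+  // Free the data first; the release store below then publishes the slot as
+  // empty to concurrent readers.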
+ h.FreeData(); + +#ifndef NDEBUG + // Mark slot as empty, with assertion + uint64_t meta = h.meta.exchange(0, std::memory_order_release); + assert(meta >> ClockHandle::kStateShift == ClockHandle::kStateConstruction); +#else + // Mark slot as empty + h.meta.store(0, std::memory_order_release); +#endif +} + +inline bool ClockUpdate(ClockHandle& h) { + uint64_t meta = h.meta.load(std::memory_order_relaxed); + + uint64_t acquire_count = + (meta >> ClockHandle::kAcquireCounterShift) & ClockHandle::kCounterMask; + uint64_t release_count = + (meta >> ClockHandle::kReleaseCounterShift) & ClockHandle::kCounterMask; + // fprintf(stderr, "ClockUpdate @ %p: %lu %lu %u\n", &h, acquire_count, + // release_count, (unsigned)(meta >> ClockHandle::kStateShift)); + if (acquire_count != release_count) { + // Only clock update entries with no outstanding refs + return false; + } + if (!((meta >> ClockHandle::kStateShift) & ClockHandle::kStateShareableBit)) { + // Only clock update Shareable entries + return false; + } + if ((meta >> ClockHandle::kStateShift == ClockHandle::kStateVisible) && + acquire_count > 0) { + // Decrement clock + uint64_t new_count = + std::min(acquire_count - 1, uint64_t{ClockHandle::kMaxCountdown} - 1); + // Compare-exchange in the decremented clock info, but + // not aggressively + uint64_t new_meta = + (uint64_t{ClockHandle::kStateVisible} << ClockHandle::kStateShift) | + (new_count << ClockHandle::kReleaseCounterShift) | + (new_count << ClockHandle::kAcquireCounterShift); + h.meta.compare_exchange_strong(meta, new_meta, std::memory_order_relaxed); + return false; + } + // Otherwise, remove entry (either unreferenced invisible or + // unreferenced and expired visible). + if (h.meta.compare_exchange_strong( + meta, + uint64_t{ClockHandle::kStateConstruction} << ClockHandle::kStateShift, + std::memory_order_acquire)) { + // Took ownership. + return true; + } else { + // Compare-exchange failing probably + // indicates the entry was used, so skip it in that case. + return false; + } +} + +} // namespace + +void ClockHandleBasicData::FreeData() const { + if (deleter) { + UniqueId64x2 unhashed; + (*deleter)( + ClockCacheShard<HyperClockTable>::ReverseHash(hashed_key, &unhashed), + value); + } +} + +HyperClockTable::HyperClockTable( + size_t capacity, bool /*strict_capacity_limit*/, + CacheMetadataChargePolicy metadata_charge_policy, const Opts& opts) + : length_bits_(CalcHashBits(capacity, opts.estimated_value_size, + metadata_charge_policy)), + length_bits_mask_((size_t{1} << length_bits_) - 1), + occupancy_limit_(static_cast<size_t>((uint64_t{1} << length_bits_) * + kStrictLoadFactor)), + array_(new HandleImpl[size_t{1} << length_bits_]) { + if (metadata_charge_policy == + CacheMetadataChargePolicy::kFullChargeCacheMetadata) { + usage_ += size_t{GetTableSize()} * sizeof(HandleImpl); + } + + static_assert(sizeof(HandleImpl) == 64U, + "Expecting size / alignment with common cache line size"); +} + +HyperClockTable::~HyperClockTable() { + // Assumes there are no references or active operations on any slot/element + // in the table. 
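+// Here we only need to free data and check invariants; slots are not
+// individually marked empty because the whole array is being destroyed.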
+ for (size_t i = 0; i < GetTableSize(); i++) { + HandleImpl& h = array_[i]; + switch (h.meta >> ClockHandle::kStateShift) { + case ClockHandle::kStateEmpty: + // noop + break; + case ClockHandle::kStateInvisible: // rare but possible + case ClockHandle::kStateVisible: + assert(GetRefcount(h.meta) == 0); + h.FreeData(); +#ifndef NDEBUG + Rollback(h.hashed_key, &h); + ReclaimEntryUsage(h.GetTotalCharge()); +#endif + break; + // otherwise + default: + assert(false); + break; + } + } + +#ifndef NDEBUG + for (size_t i = 0; i < GetTableSize(); i++) { + assert(array_[i].displacements.load() == 0); + } +#endif + + assert(usage_.load() == 0 || + usage_.load() == size_t{GetTableSize()} * sizeof(HandleImpl)); + assert(occupancy_ == 0); +} + +// If an entry doesn't receive clock updates but is repeatedly referenced & +// released, the acquire and release counters could overflow without some +// intervention. This is that intervention, which should be inexpensive +// because it only incurs a simple, very predictable check. (Applying a bit +// mask in addition to an increment to every Release likely would be +// relatively expensive, because it's an extra atomic update.) +// +// We do have to assume that we never have many millions of simultaneous +// references to a cache handle, because we cannot represent so many +// references with the difference in counters, masked to the number of +// counter bits. Similarly, we assume there aren't millions of threads +// holding transient references (which might be "undone" rather than +// released by the way). +// +// Consider these possible states for each counter: +// low: less than kMaxCountdown +// medium: kMaxCountdown to half way to overflow + kMaxCountdown +// high: half way to overflow + kMaxCountdown, or greater +// +// And these possible states for the combination of counters: +// acquire / release +// ------- ------- +// low low - Normal / common, with caveats (see below) +// medium low - Can happen while holding some refs +// high low - Violates assumptions (too many refs) +// low medium - Violates assumptions (refs underflow, etc.) +// medium medium - Normal (very read heavy cache) +// high medium - Can happen while holding some refs +// low high - This function is supposed to prevent +// medium high - Violates assumptions (refs underflow, etc.) +// high high - Needs CorrectNearOverflow +// +// Basically, this function detects (high, high) state (inferred from +// release alone being high) and bumps it back down to (medium, medium) +// state with the same refcount and the same logical countdown counter +// (everything > kMaxCountdown is logically the same). Note that bumping +// down to (low, low) would modify the countdown counter, so is "reserved" +// in a sense. +// +// If near-overflow correction is triggered here, there's no guarantee +// that another thread hasn't freed the entry and replaced it with another. +// Therefore, it must be the case that the correction does not affect +// entries unless they are very old (many millions of acquire-release cycles). +// (Our bit manipulation is indeed idempotent and only affects entries in +// exceptional cases.) We assume a pre-empted thread will not stall that long. +// If it did, the state could be corrupted in the (unlikely) case that the top +// bit of the acquire counter is set but not the release counter, and thus +// we only clear the top bit of the acquire counter on resumption. 
It would +// then appear that there are too many refs and the entry would be permanently +// pinned (which is not terrible for an exceptionally rare occurrence), unless +// it is referenced enough (at least kMaxCountdown more times) for the release +// counter to reach "high" state again and bumped back to "medium." (This +// motivates only checking for release counter in high state, not both in high +// state.) +inline void CorrectNearOverflow(uint64_t old_meta, + std::atomic<uint64_t>& meta) { + // We clear both top-most counter bits at the same time. + constexpr uint64_t kCounterTopBit = uint64_t{1} + << (ClockHandle::kCounterNumBits - 1); + constexpr uint64_t kClearBits = + (kCounterTopBit << ClockHandle::kAcquireCounterShift) | + (kCounterTopBit << ClockHandle::kReleaseCounterShift); + // A simple check that allows us to initiate clearing the top bits for + // a large portion of the "high" state space on release counter. + constexpr uint64_t kCheckBits = + (kCounterTopBit | (ClockHandle::kMaxCountdown + 1)) + << ClockHandle::kReleaseCounterShift; + + if (UNLIKELY(old_meta & kCheckBits)) { + meta.fetch_and(~kClearBits, std::memory_order_relaxed); + } +} + +inline Status HyperClockTable::ChargeUsageMaybeEvictStrict( + size_t total_charge, size_t capacity, bool need_evict_for_occupancy) { + if (total_charge > capacity) { + return Status::MemoryLimit( + "Cache entry too large for a single cache shard: " + + std::to_string(total_charge) + " > " + std::to_string(capacity)); + } + // Grab any available capacity, and free up any more required. + size_t old_usage = usage_.load(std::memory_order_relaxed); + size_t new_usage; + if (LIKELY(old_usage != capacity)) { + do { + new_usage = std::min(capacity, old_usage + total_charge); + } while (!usage_.compare_exchange_weak(old_usage, new_usage, + std::memory_order_relaxed)); + } else { + new_usage = old_usage; + } + // How much do we need to evict then? + size_t need_evict_charge = old_usage + total_charge - new_usage; + size_t request_evict_charge = need_evict_charge; + if (UNLIKELY(need_evict_for_occupancy) && request_evict_charge == 0) { + // Require at least 1 eviction. + request_evict_charge = 1; + } + if (request_evict_charge > 0) { + size_t evicted_charge = 0; + size_t evicted_count = 0; + Evict(request_evict_charge, &evicted_charge, &evicted_count); + occupancy_.fetch_sub(evicted_count, std::memory_order_release); + if (LIKELY(evicted_charge > need_evict_charge)) { + assert(evicted_count > 0); + // Evicted more than enough + usage_.fetch_sub(evicted_charge - need_evict_charge, + std::memory_order_relaxed); + } else if (evicted_charge < need_evict_charge || + (UNLIKELY(need_evict_for_occupancy) && evicted_count == 0)) { + // Roll back to old usage minus evicted + usage_.fetch_sub(evicted_charge + (new_usage - old_usage), + std::memory_order_relaxed); + if (evicted_charge < need_evict_charge) { + return Status::MemoryLimit( + "Insert failed because unable to evict entries to stay within " + "capacity limit."); + } else { + return Status::MemoryLimit( + "Insert failed because unable to evict entries to stay within " + "table occupancy limit."); + } + } + // If we needed to evict something and we are proceeding, we must have + // evicted something. 
+    assert(evicted_count > 0);
+  }
+  return Status::OK();
+}
+
+inline bool HyperClockTable::ChargeUsageMaybeEvictNonStrict(
+    size_t total_charge, size_t capacity, bool need_evict_for_occupancy) {
+  // For simplicity, we consider that either the cache can accept the insert
+  // with no evictions, or we must evict enough to make (at least) enough
+  // space. It could lead to unnecessary failures or excessive evictions in
+  // some extreme cases, but allows a fast, simple protocol. If we allow a
+  // race to get us over capacity, then we might never get back to capacity
+  // limit if the sizes of entries allow each insertion to evict the minimum
+  // charge. Thus, we should evict some extra if it's not a significant
+  // portion of the shard capacity. This can have the side benefit of
+  // involving fewer threads in eviction.
+  size_t old_usage = usage_.load(std::memory_order_relaxed);
+  size_t need_evict_charge;
+  // NOTE: if total_charge > old_usage, there isn't yet enough to evict
+  // `total_charge` amount. Even if we only try to evict `old_usage` amount,
+  // there's likely something referenced and we would eat CPU looking for
+  // enough to evict.
+  if (old_usage + total_charge <= capacity || total_charge > old_usage) {
+    // Good enough for me (might run over with a race)
+    need_evict_charge = 0;
+  } else {
+    // Try to evict enough space, and maybe some extra
+    need_evict_charge = total_charge;
+    if (old_usage > capacity) {
+      // Not too much to avoid thundering herd while avoiding strict
+      // synchronization, such as the compare_exchange used with strict
+      // capacity limit.
+      need_evict_charge += std::min(capacity / 1024, total_charge) + 1;
+    }
+  }
+  if (UNLIKELY(need_evict_for_occupancy) && need_evict_charge == 0) {
+    // Special case: require at least 1 eviction if we only have to
+    // deal with occupancy
+    need_evict_charge = 1;
+  }
+  size_t evicted_charge = 0;
+  size_t evicted_count = 0;
+  if (need_evict_charge > 0) {
+    Evict(need_evict_charge, &evicted_charge, &evicted_count);
+    // Deal with potential occupancy deficit
+    if (UNLIKELY(need_evict_for_occupancy) && evicted_count == 0) {
+      assert(evicted_charge == 0);
+      // Can't meet occupancy requirement
+      return false;
+    } else {
+      // Update occupancy for evictions
+      occupancy_.fetch_sub(evicted_count, std::memory_order_release);
+    }
+  }
+  // Track new usage even if we weren't able to evict enough
+  usage_.fetch_add(total_charge - evicted_charge, std::memory_order_relaxed);
+  // No underflow
+  assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2);
+  // Success
+  return true;
+}
+
+inline HyperClockTable::HandleImpl* HyperClockTable::DetachedInsert(
+    const ClockHandleBasicData& proto) {
+  // Heap allocated separate from table
+  HandleImpl* h = new HandleImpl();
+  ClockHandleBasicData* h_alias = h;
+  *h_alias = proto;
+  h->SetDetached();
+  // Single reference (detached entries only created if returning a refed
+  // Handle back to user)
+  uint64_t meta = uint64_t{ClockHandle::kStateInvisible}
+                  << ClockHandle::kStateShift;
+  meta |= uint64_t{1} << ClockHandle::kAcquireCounterShift;
+  h->meta.store(meta, std::memory_order_release);
+  // Keep track of how much of usage is detached
+  detached_usage_.fetch_add(proto.GetTotalCharge(), std::memory_order_relaxed);
+  return h;
+}
+
+Status HyperClockTable::Insert(const ClockHandleBasicData& proto,
+                               HandleImpl** handle, Cache::Priority priority,
+                               size_t capacity, bool strict_capacity_limit) {
+  // Do we have the available occupancy? Optimistically assume we do
+  // and deal with it if we don't.
+  size_t old_occupancy = occupancy_.fetch_add(1, std::memory_order_acquire);
+  auto revert_occupancy_fn = [&]() {
+    occupancy_.fetch_sub(1, std::memory_order_relaxed);
+  };
+  // Whether we over-committed and need an eviction to make up for it
+  bool need_evict_for_occupancy = old_occupancy >= occupancy_limit_;
+
+  // Usage/capacity handling is somewhat different depending on
+  // strict_capacity_limit, but mostly pessimistic.
+  bool use_detached_insert = false;
+  const size_t total_charge = proto.GetTotalCharge();
+  if (strict_capacity_limit) {
+    Status s = ChargeUsageMaybeEvictStrict(total_charge, capacity,
+                                           need_evict_for_occupancy);
+    if (!s.ok()) {
+      revert_occupancy_fn();
+      return s;
+    }
+  } else {
+    // Case strict_capacity_limit == false
+    bool success = ChargeUsageMaybeEvictNonStrict(total_charge, capacity,
+                                                  need_evict_for_occupancy);
+    if (!success) {
+      revert_occupancy_fn();
+      if (handle == nullptr) {
+        // Don't insert the entry but still return ok, as if the entry
+        // were inserted into the cache and immediately evicted.
+        proto.FreeData();
+        return Status::OK();
+      } else {
+        // Need to track usage of fallback detached insert
+        usage_.fetch_add(total_charge, std::memory_order_relaxed);
+        use_detached_insert = true;
+      }
+    }
+  }
+  auto revert_usage_fn = [&]() {
+    usage_.fetch_sub(total_charge, std::memory_order_relaxed);
+    // No underflow
+    assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2);
+  };
+
+  if (!use_detached_insert) {
+    // Attempt a table insert, but abort if we find an existing entry for the
+    // key. If we were to overwrite old entries, we would either
+    // * Have to gain ownership over an existing entry to overwrite it, which
+    // would only work if there are no outstanding (read) references and would
+    // create a small gap in availability of the entry (old or new) to lookups.
+    // * Have to insert into a suboptimal location (more probes) so that the
+    // old entry can be kept around as well.
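+    // Instead, if we find a matching entry already in the table, its clock
+    // state is boosted and the new entry is either dropped (no handle
+    // needed) or inserted detached (see below).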
+
+    uint64_t initial_countdown = GetInitialCountdown(priority);
+    assert(initial_countdown > 0);
+
+    size_t probe = 0;
+    HandleImpl* e = FindSlot(
+        proto.hashed_key,
+        [&](HandleImpl* h) {
+          // Optimistically transition the slot from "empty" to
+          // "under construction" (no effect on other states)
+          uint64_t old_meta =
+              h->meta.fetch_or(uint64_t{ClockHandle::kStateOccupiedBit}
+                                   << ClockHandle::kStateShift,
+                               std::memory_order_acq_rel);
+          uint64_t old_state = old_meta >> ClockHandle::kStateShift;
+
+          if (old_state == ClockHandle::kStateEmpty) {
+            // We've started inserting into an available slot, and taken
+            // ownership. Save data fields
+            ClockHandleBasicData* h_alias = h;
+            *h_alias = proto;
+
+            // Transition from "under construction" state to "visible" state
+            uint64_t new_meta = uint64_t{ClockHandle::kStateVisible}
+                                << ClockHandle::kStateShift;
+
+            // Maybe with an outstanding reference
+            new_meta |= initial_countdown << ClockHandle::kAcquireCounterShift;
+            new_meta |= (initial_countdown - (handle != nullptr))
+                        << ClockHandle::kReleaseCounterShift;
+
+#ifndef NDEBUG
+            // Save the state transition, with assertion
+            old_meta = h->meta.exchange(new_meta, std::memory_order_release);
+            assert(old_meta >> ClockHandle::kStateShift ==
+                   ClockHandle::kStateConstruction);
+#else
+            // Save the state transition
+            h->meta.store(new_meta, std::memory_order_release);
+#endif
+            return true;
+          } else if (old_state != ClockHandle::kStateVisible) {
+            // Slot not usable / touchable now
+            return false;
+          }
+          // Existing, visible entry, which might be a match.
+          // But first, we need to acquire a ref to read it. In fact, we take
+          // a number of refs equal to the initial countdown, so that we boost
+          // the clock state if this is a match.
+          old_meta = h->meta.fetch_add(
+              ClockHandle::kAcquireIncrement * initial_countdown,
+              std::memory_order_acq_rel);
+          // Like Lookup
+          if ((old_meta >> ClockHandle::kStateShift) ==
+              ClockHandle::kStateVisible) {
+            // Acquired a read reference
+            if (h->hashed_key == proto.hashed_key) {
+              // Match. Release in a way that boosts the clock state
+              old_meta = h->meta.fetch_add(
+                  ClockHandle::kReleaseIncrement * initial_countdown,
+                  std::memory_order_acq_rel);
+              // Correct for possible (but rare) overflow
+              CorrectNearOverflow(old_meta, h->meta);
+              // Insert detached instead (only if return handle needed)
+              use_detached_insert = true;
+              return true;
+            } else {
+              // Mismatch. Pretend we never took the reference
+              old_meta = h->meta.fetch_sub(
+                  ClockHandle::kAcquireIncrement * initial_countdown,
+                  std::memory_order_acq_rel);
+            }
+          } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) ==
+                              ClockHandle::kStateInvisible)) {
+            // Pretend we never took the reference
+            // WART: there's a tiny chance we release last ref to invisible
+            // entry here. If that happens, we let eviction take care of it.
+            old_meta = h->meta.fetch_sub(
+                ClockHandle::kAcquireIncrement * initial_countdown,
+                std::memory_order_acq_rel);
+          } else {
+            // For other states, incrementing the acquire counter has no effect
+            // so we don't need to undo it.
+            // Slot not usable / touchable now.
+          }
+          (void)old_meta;
+          return false;
+        },
+        [&](HandleImpl* /*h*/) { return false; },
+        [&](HandleImpl* h) {
+          h->displacements.fetch_add(1, std::memory_order_relaxed);
+        },
+        probe);
+    if (e == nullptr) {
+      // The occupancy check, and the fact that FindSlot above never aborts,
+      // should generally prevent this, except it's theoretically possible for
+      // other threads to evict and replace entries in the right order to hit
+      // every slot when it is populated. Assuming random hashing, the chance
+      // of that should be no higher than pow(kStrictLoadFactor, n) for n
+      // slots. That should be infeasible for roughly n >= 256, so if this
+      // assertion fails, that suggests something is going wrong.
+      assert(GetTableSize() < 256);
+      use_detached_insert = true;
+    }
+    if (!use_detached_insert) {
+      // Successfully inserted
+      if (handle) {
+        *handle = e;
+      }
+      return Status::OK();
+    }
+    // Roll back table insertion
+    Rollback(proto.hashed_key, e);
+    revert_occupancy_fn();
+    // Maybe fall back on detached insert
+    if (handle == nullptr) {
+      revert_usage_fn();
+      // As if unrefed entry immediately evicted
+      proto.FreeData();
+      return Status::OK();
+    }
+  }
+
+  // Run detached insert
+  assert(use_detached_insert);
+
+  *handle = DetachedInsert(proto);
+
+  // The OkOverwritten status is used to count "redundant" insertions into
+  // block cache. This implementation doesn't strictly check for redundant
+  // insertions, but we instead are probably interested in how many insertions
+  // didn't go into the table (instead "detached"), which could be a redundant
+  // Insert or some other reason (use_detached_insert reasons above).
+  return Status::OkOverwritten();
+}
+
+HyperClockTable::HandleImpl* HyperClockTable::Lookup(
+    const UniqueId64x2& hashed_key) {
+  size_t probe = 0;
+  HandleImpl* e = FindSlot(
+      hashed_key,
+      [&](HandleImpl* h) {
+        // Mostly branch-free version (similar performance)
+        /*
+        uint64_t old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement,
+                                              std::memory_order_acquire);
+        bool Shareable = (old_meta >> (ClockHandle::kStateShift + 1)) & 1U;
+        bool visible = (old_meta >> ClockHandle::kStateShift) & 1U;
+        bool match = (h->key == key) & visible;
+        h->meta.fetch_sub(static_cast<uint64_t>(Shareable & !match) <<
+            ClockHandle::kAcquireCounterShift, std::memory_order_release);
+        return match;
+        */
+        // Optimistic lookup should pay off when the table is relatively
+        // sparse.
+        constexpr bool kOptimisticLookup = true;
+        uint64_t old_meta;
+        if (!kOptimisticLookup) {
+          old_meta = h->meta.load(std::memory_order_acquire);
+          if ((old_meta >> ClockHandle::kStateShift) !=
+              ClockHandle::kStateVisible) {
+            return false;
+          }
+        }
+        // (Optimistically) increment acquire counter
+        old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement,
+                                     std::memory_order_acquire);
+        // Check if it's an entry visible to lookups
+        if ((old_meta >> ClockHandle::kStateShift) ==
+            ClockHandle::kStateVisible) {
+          // Acquired a read reference
+          if (h->hashed_key == hashed_key) {
+            // Match
+            return true;
+          } else {
+            // Mismatch. Pretend we never took the reference
+            old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
+                                         std::memory_order_release);
+          }
+        } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) ==
+                            ClockHandle::kStateInvisible)) {
+          // Pretend we never took the reference
+          // WART: there's a tiny chance we release last ref to invisible
+          // entry here. If that happens, we let eviction take care of it.
+          old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
+                                       std::memory_order_release);
+        } else {
+          // For other states, incrementing the acquire counter has no effect
+          // so we don't need to undo it. Furthermore, we cannot safely undo
+          // it because we did not acquire a read reference to lock the
+          // entry in a Shareable state.
+        }
+        (void)old_meta;
+        return false;
+      },
+      [&](HandleImpl* h) {
+        return h->displacements.load(std::memory_order_relaxed) == 0;
+      },
+      [&](HandleImpl* /*h*/) {}, probe);
+
+  return e;
+}
+
+bool HyperClockTable::Release(HandleImpl* h, bool useful,
+                              bool erase_if_last_ref) {
+  // In contrast with LRUCache's Release, this function won't delete the handle
+  // when the cache is above capacity and the reference is the last one. Space
+  // is only freed up by Evict (called by Insert when space is needed)
+  // and Erase. We do this to avoid an extra atomic read of the variable usage_.
+
+  uint64_t old_meta;
+  if (useful) {
+    // Increment release counter to indicate was used
+    old_meta = h->meta.fetch_add(ClockHandle::kReleaseIncrement,
+                                 std::memory_order_release);
+  } else {
+    // Decrement acquire counter to pretend it never happened
+    old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
+                                 std::memory_order_release);
+  }
+
+  assert((old_meta >> ClockHandle::kStateShift) &
+         ClockHandle::kStateShareableBit);
+  // No underflow
+  assert(((old_meta >> ClockHandle::kAcquireCounterShift) &
+          ClockHandle::kCounterMask) !=
+         ((old_meta >> ClockHandle::kReleaseCounterShift) &
+          ClockHandle::kCounterMask));
+
+  if (erase_if_last_ref || UNLIKELY(old_meta >> ClockHandle::kStateShift ==
+                                    ClockHandle::kStateInvisible)) {
+    // Update for last fetch_add op
+    if (useful) {
+      old_meta += ClockHandle::kReleaseIncrement;
+    } else {
+      old_meta -= ClockHandle::kAcquireIncrement;
+    }
+    // Take ownership if no refs
+    do {
+      if (GetRefcount(old_meta) != 0) {
+        // Not last ref at some point in time during this Release call
+        // Correct for possible (but rare) overflow
+        CorrectNearOverflow(old_meta, h->meta);
+        return false;
+      }
+      if ((old_meta & (uint64_t{ClockHandle::kStateShareableBit}
+                       << ClockHandle::kStateShift)) == 0) {
+        // Someone else took ownership
+        return false;
+      }
+      // Note that there's a small chance that we release, another thread
+      // replaces this entry with another, reaches zero refs, and then we end
+      // up erasing that other entry. That's an acceptable risk / imprecision.
+ } while (!h->meta.compare_exchange_weak( + old_meta, + uint64_t{ClockHandle::kStateConstruction} << ClockHandle::kStateShift, + std::memory_order_acquire)); + // Took ownership + size_t total_charge = h->GetTotalCharge(); + if (UNLIKELY(h->IsDetached())) { + h->FreeData(); + // Delete detached handle + delete h; + detached_usage_.fetch_sub(total_charge, std::memory_order_relaxed); + usage_.fetch_sub(total_charge, std::memory_order_relaxed); + } else { + Rollback(h->hashed_key, h); + FreeDataMarkEmpty(*h); + ReclaimEntryUsage(total_charge); + } + return true; + } else { + // Correct for possible (but rare) overflow + CorrectNearOverflow(old_meta, h->meta); + return false; + } +} + +void HyperClockTable::Ref(HandleImpl& h) { + // Increment acquire counter + uint64_t old_meta = h.meta.fetch_add(ClockHandle::kAcquireIncrement, + std::memory_order_acquire); + + assert((old_meta >> ClockHandle::kStateShift) & + ClockHandle::kStateShareableBit); + // Must have already had a reference + assert(GetRefcount(old_meta) > 0); + (void)old_meta; +} + +void HyperClockTable::TEST_RefN(HandleImpl& h, size_t n) { + // Increment acquire counter + uint64_t old_meta = h.meta.fetch_add(n * ClockHandle::kAcquireIncrement, + std::memory_order_acquire); + + assert((old_meta >> ClockHandle::kStateShift) & + ClockHandle::kStateShareableBit); + (void)old_meta; +} + +void HyperClockTable::TEST_ReleaseN(HandleImpl* h, size_t n) { + if (n > 0) { + // Split into n - 1 and 1 steps. + uint64_t old_meta = h->meta.fetch_add( + (n - 1) * ClockHandle::kReleaseIncrement, std::memory_order_acquire); + assert((old_meta >> ClockHandle::kStateShift) & + ClockHandle::kStateShareableBit); + (void)old_meta; + + Release(h, /*useful*/ true, /*erase_if_last_ref*/ false); + } +} + +void HyperClockTable::Erase(const UniqueId64x2& hashed_key) { + size_t probe = 0; + (void)FindSlot( + hashed_key, + [&](HandleImpl* h) { + // Could be multiple entries in rare cases. Erase them all. + // Optimistically increment acquire counter + uint64_t old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement, + std::memory_order_acquire); + // Check if it's an entry visible to lookups + if ((old_meta >> ClockHandle::kStateShift) == + ClockHandle::kStateVisible) { + // Acquired a read reference + if (h->hashed_key == hashed_key) { + // Match. Set invisible. + old_meta = + h->meta.fetch_and(~(uint64_t{ClockHandle::kStateVisibleBit} + << ClockHandle::kStateShift), + std::memory_order_acq_rel); + // Apply update to local copy + old_meta &= ~(uint64_t{ClockHandle::kStateVisibleBit} + << ClockHandle::kStateShift); + for (;;) { + uint64_t refcount = GetRefcount(old_meta); + assert(refcount > 0); + if (refcount > 1) { + // Not last ref at some point in time during this Erase call + // Pretend we never took the reference + h->meta.fetch_sub(ClockHandle::kAcquireIncrement, + std::memory_order_release); + break; + } else if (h->meta.compare_exchange_weak( + old_meta, + uint64_t{ClockHandle::kStateConstruction} + << ClockHandle::kStateShift, + std::memory_order_acq_rel)) { + // Took ownership + assert(hashed_key == h->hashed_key); + size_t total_charge = h->GetTotalCharge(); + FreeDataMarkEmpty(*h); + ReclaimEntryUsage(total_charge); + // We already have a copy of hashed_key in this case, so OK to + // delay Rollback until after releasing the entry + Rollback(hashed_key, h); + break; + } + } + } else { + // Mismatch. 
Pretend we never took the reference
+            h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
+                              std::memory_order_release);
+          }
+        } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) ==
+                            ClockHandle::kStateInvisible)) {
+          // Pretend we never took the reference
+          // WART: there's a tiny chance we release last ref to invisible
+          // entry here. If that happens, we let eviction take care of it.
+          h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
+                            std::memory_order_release);
+        } else {
+          // For other states, incrementing the acquire counter has no effect
+          // so we don't need to undo it.
+        }
+        return false;
+      },
+      [&](HandleImpl* h) {
+        return h->displacements.load(std::memory_order_relaxed) == 0;
+      },
+      [&](HandleImpl* /*h*/) {}, probe);
+}
+
+void HyperClockTable::ConstApplyToEntriesRange(
+    std::function<void(const HandleImpl&)> func, size_t index_begin,
+    size_t index_end, bool apply_if_will_be_deleted) const {
+  uint64_t check_state_mask = ClockHandle::kStateShareableBit;
+  if (!apply_if_will_be_deleted) {
+    check_state_mask |= ClockHandle::kStateVisibleBit;
+  }
+
+  for (size_t i = index_begin; i < index_end; i++) {
+    HandleImpl& h = array_[i];
+
+    // Note: to avoid using compare_exchange, we have to be extra careful.
+    uint64_t old_meta = h.meta.load(std::memory_order_relaxed);
+    // Check if it's an entry visible to lookups
+    if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
+      // Increment acquire counter. Note: it's possible that the entry has
+      // completely changed since we loaded old_meta, but incrementing acquire
+      // count is always safe. (Similar to optimistic Lookup here.)
+      old_meta = h.meta.fetch_add(ClockHandle::kAcquireIncrement,
+                                  std::memory_order_acquire);
+      // Check whether we actually acquired a reference.
+      if ((old_meta >> ClockHandle::kStateShift) &
+          ClockHandle::kStateShareableBit) {
+        // Apply func if appropriate
+        if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
+          func(h);
+        }
+        // Pretend we never took the reference
+        h.meta.fetch_sub(ClockHandle::kAcquireIncrement,
+                         std::memory_order_release);
+        // No net change, so don't need to check for overflow
+      } else {
+        // For other states, incrementing the acquire counter has no effect
+        // so we don't need to undo it. Furthermore, we cannot safely undo
+        // it because we did not acquire a read reference to lock the
+        // entry in a Shareable state.
+      }
+    }
+  }
+}
+
+void HyperClockTable::EraseUnRefEntries() {
+  for (size_t i = 0; i <= this->length_bits_mask_; i++) {
+    HandleImpl& h = array_[i];
+
+    uint64_t old_meta = h.meta.load(std::memory_order_relaxed);
+    if (old_meta & (uint64_t{ClockHandle::kStateShareableBit}
+                    << ClockHandle::kStateShift) &&
+        GetRefcount(old_meta) == 0 &&
+        h.meta.compare_exchange_strong(old_meta,
+                                       uint64_t{ClockHandle::kStateConstruction}
+                                           << ClockHandle::kStateShift,
+                                       std::memory_order_acquire)) {
+      // Took ownership
+      size_t total_charge = h.GetTotalCharge();
+      Rollback(h.hashed_key, &h);
+      FreeDataMarkEmpty(h);
+      ReclaimEntryUsage(total_charge);
+    }
+  }
+}
+
+inline HyperClockTable::HandleImpl* HyperClockTable::FindSlot(
+    const UniqueId64x2& hashed_key, std::function<bool(HandleImpl*)> match_fn,
+    std::function<bool(HandleImpl*)> abort_fn,
+    std::function<void(HandleImpl*)> update_fn, size_t& probe) {
+  // NOTE: upper 32 bits of hashed_key[0] are used for sharding
+  //
+  // We use double-hashing probing. Every probe in the sequence is a
+  // pseudorandom integer, computed as a linear function of two random hashes,
+  // which we call base and increment. Specifically, the i-th probe is
+  // base + i * increment modulo the table size.
+  size_t base = static_cast<size_t>(hashed_key[1]);
+  // We use an odd increment, which is relatively prime to the power-of-two
+  // table size. This implies that we cycle back to the first probe only
+  // after probing every slot exactly once.
+  // TODO: we could also reconsider linear probing, though locality benefits
+  // are limited because each slot is a full cache line
+  size_t increment = static_cast<size_t>(hashed_key[0]) | 1U;
+  size_t current = ModTableSize(base + probe * increment);
+  while (probe <= length_bits_mask_) {
+    HandleImpl* h = &array_[current];
+    if (match_fn(h)) {
+      probe++;
+      return h;
+    }
+    if (abort_fn(h)) {
+      return nullptr;
+    }
+    probe++;
+    update_fn(h);
+    current = ModTableSize(current + increment);
+  }
+  // We looped back.
+  return nullptr;
+}
+
+inline void HyperClockTable::Rollback(const UniqueId64x2& hashed_key,
+                                      const HandleImpl* h) {
+  size_t current = ModTableSize(hashed_key[1]);
+  size_t increment = static_cast<size_t>(hashed_key[0]) | 1U;
+  while (&array_[current] != h) {
+    array_[current].displacements.fetch_sub(1, std::memory_order_relaxed);
+    current = ModTableSize(current + increment);
+  }
+}
+
+inline void HyperClockTable::ReclaimEntryUsage(size_t total_charge) {
+  auto old_occupancy = occupancy_.fetch_sub(1U, std::memory_order_release);
+  (void)old_occupancy;
+  // No underflow
+  assert(old_occupancy > 0);
+  auto old_usage = usage_.fetch_sub(total_charge, std::memory_order_relaxed);
+  (void)old_usage;
+  // No underflow
+  assert(old_usage >= total_charge);
+}
+
+inline void HyperClockTable::Evict(size_t requested_charge,
+                                   size_t* freed_charge, size_t* freed_count) {
+  // precondition
+  assert(requested_charge > 0);
+
+  // TODO: make a tuning parameter?
+  constexpr size_t step_size = 4;
+
+  // First, (concurrently) increment the clock pointer
+  uint64_t old_clock_pointer =
+      clock_pointer_.fetch_add(step_size, std::memory_order_relaxed);
+
+  // Cap the eviction effort at this thread (along with those operating in
+  // parallel) circling through the whole structure kMaxCountdown times.
+  // In other words, this eviction run must find something/anything that is
+  // unreferenced at start of and during the eviction run that isn't reclaimed
+  // by a concurrent eviction run.
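+  // That is, this thread gives up once the shared clock pointer has advanced
+  // kMaxCountdown times the table size beyond where this run started.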
+ uint64_t max_clock_pointer = + old_clock_pointer + (ClockHandle::kMaxCountdown << length_bits_); + + for (;;) { + for (size_t i = 0; i < step_size; i++) { + HandleImpl& h = array_[ModTableSize(Lower32of64(old_clock_pointer + i))]; + bool evicting = ClockUpdate(h); + if (evicting) { + Rollback(h.hashed_key, &h); + *freed_charge += h.GetTotalCharge(); + *freed_count += 1; + FreeDataMarkEmpty(h); + } + } + + // Loop exit condition + if (*freed_charge >= requested_charge) { + return; + } + if (old_clock_pointer >= max_clock_pointer) { + return; + } + + // Advance clock pointer (concurrently) + old_clock_pointer = + clock_pointer_.fetch_add(step_size, std::memory_order_relaxed); + } +} + +template <class Table> +ClockCacheShard<Table>::ClockCacheShard( + size_t capacity, bool strict_capacity_limit, + CacheMetadataChargePolicy metadata_charge_policy, + const typename Table::Opts& opts) + : CacheShardBase(metadata_charge_policy), + table_(capacity, strict_capacity_limit, metadata_charge_policy, opts), + capacity_(capacity), + strict_capacity_limit_(strict_capacity_limit) { + // Initial charge metadata should not exceed capacity + assert(table_.GetUsage() <= capacity_ || capacity_ < sizeof(HandleImpl)); +} + +template <class Table> +void ClockCacheShard<Table>::EraseUnRefEntries() { + table_.EraseUnRefEntries(); +} + +template <class Table> +void ClockCacheShard<Table>::ApplyToSomeEntries( + const std::function<void(const Slice& key, void* value, size_t charge, + DeleterFn deleter)>& callback, + size_t average_entries_per_lock, size_t* state) { + // The state is essentially going to be the starting hash, which works + // nicely even if we resize between calls because we use upper-most + // hash bits for table indexes. + size_t length_bits = table_.GetLengthBits(); + size_t length = table_.GetTableSize(); + + assert(average_entries_per_lock > 0); + // Assuming we are called with same average_entries_per_lock repeatedly, + // this simplifies some logic (index_end will not overflow). + assert(average_entries_per_lock < length || *state == 0); + + size_t index_begin = *state >> (sizeof(size_t) * 8u - length_bits); + size_t index_end = index_begin + average_entries_per_lock; + if (index_end >= length) { + // Going to end. 
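+    // (SIZE_MAX serves as the sentinel state indicating iteration is
+    // complete.)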
+ index_end = length; + *state = SIZE_MAX; + } else { + *state = index_end << (sizeof(size_t) * 8u - length_bits); + } + + table_.ConstApplyToEntriesRange( + [callback](const HandleImpl& h) { + UniqueId64x2 unhashed; + callback(ReverseHash(h.hashed_key, &unhashed), h.value, + h.GetTotalCharge(), h.deleter); + }, + index_begin, index_end, false); +} + +int HyperClockTable::CalcHashBits( + size_t capacity, size_t estimated_value_size, + CacheMetadataChargePolicy metadata_charge_policy) { + double average_slot_charge = estimated_value_size * kLoadFactor; + if (metadata_charge_policy == kFullChargeCacheMetadata) { + average_slot_charge += sizeof(HandleImpl); + } + assert(average_slot_charge > 0.0); + uint64_t num_slots = + static_cast<uint64_t>(capacity / average_slot_charge + 0.999999); + + int hash_bits = FloorLog2((num_slots << 1) - 1); + if (metadata_charge_policy == kFullChargeCacheMetadata) { + // For very small estimated value sizes, it's possible to overshoot + while (hash_bits > 0 && + uint64_t{sizeof(HandleImpl)} << hash_bits > capacity) { + hash_bits--; + } + } + return hash_bits; +} + +template <class Table> +void ClockCacheShard<Table>::SetCapacity(size_t capacity) { + capacity_.store(capacity, std::memory_order_relaxed); + // next Insert will take care of any necessary evictions +} + +template <class Table> +void ClockCacheShard<Table>::SetStrictCapacityLimit( + bool strict_capacity_limit) { + strict_capacity_limit_.store(strict_capacity_limit, + std::memory_order_relaxed); + // next Insert will take care of any necessary evictions +} + +template <class Table> +Status ClockCacheShard<Table>::Insert(const Slice& key, + const UniqueId64x2& hashed_key, + void* value, size_t charge, + Cache::DeleterFn deleter, + HandleImpl** handle, + Cache::Priority priority) { + if (UNLIKELY(key.size() != kCacheKeySize)) { + return Status::NotSupported("ClockCache only supports key size " + + std::to_string(kCacheKeySize) + "B"); + } + ClockHandleBasicData proto; + proto.hashed_key = hashed_key; + proto.value = value; + proto.deleter = deleter; + proto.total_charge = charge; + Status s = table_.Insert( + proto, handle, priority, capacity_.load(std::memory_order_relaxed), + strict_capacity_limit_.load(std::memory_order_relaxed)); + return s; +} + +template <class Table> +typename ClockCacheShard<Table>::HandleImpl* ClockCacheShard<Table>::Lookup( + const Slice& key, const UniqueId64x2& hashed_key) { + if (UNLIKELY(key.size() != kCacheKeySize)) { + return nullptr; + } + return table_.Lookup(hashed_key); +} + +template <class Table> +bool ClockCacheShard<Table>::Ref(HandleImpl* h) { + if (h == nullptr) { + return false; + } + table_.Ref(*h); + return true; +} + +template <class Table> +bool ClockCacheShard<Table>::Release(HandleImpl* handle, bool useful, + bool erase_if_last_ref) { + if (handle == nullptr) { + return false; + } + return table_.Release(handle, useful, erase_if_last_ref); +} + +template <class Table> +void ClockCacheShard<Table>::TEST_RefN(HandleImpl* h, size_t n) { + table_.TEST_RefN(*h, n); +} + +template <class Table> +void ClockCacheShard<Table>::TEST_ReleaseN(HandleImpl* h, size_t n) { + table_.TEST_ReleaseN(h, n); +} + +template <class Table> +bool ClockCacheShard<Table>::Release(HandleImpl* handle, + bool erase_if_last_ref) { + return Release(handle, /*useful=*/true, erase_if_last_ref); +} + +template <class Table> +void ClockCacheShard<Table>::Erase(const Slice& key, + const UniqueId64x2& hashed_key) { + if (UNLIKELY(key.size() != kCacheKeySize)) { + return; + } + 
table_.Erase(hashed_key); +} + +template <class Table> +size_t ClockCacheShard<Table>::GetUsage() const { + return table_.GetUsage(); +} + +template <class Table> +size_t ClockCacheShard<Table>::GetDetachedUsage() const { + return table_.GetDetachedUsage(); +} + +template <class Table> +size_t ClockCacheShard<Table>::GetCapacity() const { + return capacity_; +} + +template <class Table> +size_t ClockCacheShard<Table>::GetPinnedUsage() const { + // Computes the pinned usage by scanning the whole hash table. This + // is slow, but avoids keeping an exact counter on the clock usage, + // i.e., the number of not externally referenced elements. + // Why avoid this counter? Because Lookup removes elements from the clock + // list, so it would need to update the pinned usage every time, + // which creates additional synchronization costs. + size_t table_pinned_usage = 0; + const bool charge_metadata = + metadata_charge_policy_ == kFullChargeCacheMetadata; + table_.ConstApplyToEntriesRange( + [&table_pinned_usage, charge_metadata](const HandleImpl& h) { + uint64_t meta = h.meta.load(std::memory_order_relaxed); + uint64_t refcount = GetRefcount(meta); + // Holding one ref for ConstApplyToEntriesRange + assert(refcount > 0); + if (refcount > 1) { + table_pinned_usage += h.GetTotalCharge(); + if (charge_metadata) { + table_pinned_usage += sizeof(HandleImpl); + } + } + }, + 0, table_.GetTableSize(), true); + + return table_pinned_usage + table_.GetDetachedUsage(); +} + +template <class Table> +size_t ClockCacheShard<Table>::GetOccupancyCount() const { + return table_.GetOccupancy(); +} + +template <class Table> +size_t ClockCacheShard<Table>::GetOccupancyLimit() const { + return table_.GetOccupancyLimit(); +} + +template <class Table> +size_t ClockCacheShard<Table>::GetTableAddressCount() const { + return table_.GetTableSize(); +} + +// Explicit instantiation +template class ClockCacheShard<HyperClockTable>; + +HyperClockCache::HyperClockCache( + size_t capacity, size_t estimated_value_size, int num_shard_bits, + bool strict_capacity_limit, + CacheMetadataChargePolicy metadata_charge_policy, + std::shared_ptr<MemoryAllocator> memory_allocator) + : ShardedCache(capacity, num_shard_bits, strict_capacity_limit, + std::move(memory_allocator)) { + assert(estimated_value_size > 0 || + metadata_charge_policy != kDontChargeCacheMetadata); + // TODO: should not need to go through two levels of pointer indirection to + // get to table entries + size_t per_shard = GetPerShardCapacity(); + InitShards([=](Shard* cs) { + HyperClockTable::Opts opts; + opts.estimated_value_size = estimated_value_size; + new (cs) + Shard(per_shard, strict_capacity_limit, metadata_charge_policy, opts); + }); +} + +void* HyperClockCache::Value(Handle* handle) { + return reinterpret_cast<const HandleImpl*>(handle)->value; +} + +size_t HyperClockCache::GetCharge(Handle* handle) const { + return reinterpret_cast<const HandleImpl*>(handle)->GetTotalCharge(); +} + +Cache::DeleterFn HyperClockCache::GetDeleter(Handle* handle) const { + auto h = reinterpret_cast<const HandleImpl*>(handle); + return h->deleter; +} + +namespace { + +// For each cache shard, estimate what the table load factor would be if +// cache filled to capacity with average entries. This is considered +// indicative of a potential problem if the shard is essentially operating +// "at limit", which we define as high actual usage (>80% of capacity) +// or actual occupancy very close to limit (>95% of limit). 
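+// For example, a shard at 90% usage qualifies, while one at 50% usage and
+// 70% occupancy is skipped as not "at limit".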
+// Also, for each shard compute the recommended estimated_entry_charge,
+// and keep the minimum one for use as overall recommendation.
+void AddShardEvaluation(const HyperClockCache::Shard& shard,
+                        std::vector<double>& predicted_load_factors,
+                        size_t& min_recommendation) {
+  size_t usage = shard.GetUsage() - shard.GetDetachedUsage();
+  size_t capacity = shard.GetCapacity();
+  double usage_ratio = 1.0 * usage / capacity;
+
+  size_t occupancy = shard.GetOccupancyCount();
+  size_t occ_limit = shard.GetOccupancyLimit();
+  double occ_ratio = 1.0 * occupancy / occ_limit;
+  if (usage == 0 || occupancy == 0 ||
+      (usage_ratio < 0.8 && occ_ratio < 0.95)) {
+    // Skip as described above
+    return;
+  }
+
+  // If filled to capacity, what would the occupancy ratio be?
+  double ratio = occ_ratio / usage_ratio;
+  // Given the max load factor, what would that load factor be?
+  double lf = ratio * kStrictLoadFactor;
+  predicted_load_factors.push_back(lf);
+
+  // Update min_recommendation also
+  size_t recommendation = usage / occupancy;
+  min_recommendation = std::min(min_recommendation, recommendation);
+}
+
+}  // namespace
+
+void HyperClockCache::ReportProblems(
+    const std::shared_ptr<Logger>& info_log) const {
+  uint32_t shard_count = GetNumShards();
+  std::vector<double> predicted_load_factors;
+  size_t min_recommendation = SIZE_MAX;
+  const_cast<HyperClockCache*>(this)->ForEachShard(
+      [&](HyperClockCache::Shard* shard) {
+        AddShardEvaluation(*shard, predicted_load_factors, min_recommendation);
+      });
+
+  if (predicted_load_factors.empty()) {
+    // None operating "at limit" -> nothing to report
+    return;
+  }
+  std::sort(predicted_load_factors.begin(), predicted_load_factors.end());
+
+  // First, if the average load factor is within spec, we aren't going to
+  // complain about a few shards being out of spec.
+  // NOTE: this is only the average among cache shards operating "at limit,"
+  // which should be representative of what we care about. It is normal, even
+  // desirable, for a cache to operate "at limit" so this should not create
+  // selection bias. See AddShardEvaluation().
+  // TODO: Consider detecting cases where decreasing the number of shards
+  // would be good, e.g. serious imbalance among shards.
+ 
+ double average_load_factor = + std::accumulate(predicted_load_factors.begin(), + predicted_load_factors.end(), 0.0) / + shard_count; + + constexpr double kLowSpecLoadFactor = kLoadFactor / 2; + constexpr double kMidSpecLoadFactor = kLoadFactor / 1.414; + if (average_load_factor > kLoadFactor) { + // Out of spec => Consider reporting load factor too high + // Estimate effective overall capacity loss due to enforcing occupancy limit + double lost_portion = 0.0; + int over_count = 0; + for (double lf : predicted_load_factors) { + if (lf > kStrictLoadFactor) { + ++over_count; + lost_portion += (lf - kStrictLoadFactor) / lf / shard_count; + } + } + // >= 20% loss -> error + // >= 10% loss -> consistent warning + // >= 1% loss -> intermittent warning + InfoLogLevel level = InfoLogLevel::INFO_LEVEL; + bool report = true; + if (lost_portion > 0.2) { + level = InfoLogLevel::ERROR_LEVEL; + } else if (lost_portion > 0.1) { + level = InfoLogLevel::WARN_LEVEL; + } else if (lost_portion > 0.01) { + int report_percent = static_cast<int>(lost_portion * 100.0); + if (Random::GetTLSInstance()->PercentTrue(report_percent)) { + level = InfoLogLevel::WARN_LEVEL; + } + } else { + // don't report + report = false; + } + if (report) { + ROCKS_LOG_AT_LEVEL( + info_log, level, + "HyperClockCache@%p unable to use estimated %.1f%% capacity because " + "of " + "full occupancy in %d/%u cache shards (estimated_entry_charge too " + "high). Recommend estimated_entry_charge=%zu", + this, lost_portion * 100.0, over_count, (unsigned)shard_count, + min_recommendation); + } + } else if (average_load_factor < kLowSpecLoadFactor) { + // Out of spec => Consider reporting load factor too low + // But cautiously because low is not as big of a problem. + + // Only report if highest occupancy shard is also below + // spec and only if average is substantially out of spec + if (predicted_load_factors.back() < kLowSpecLoadFactor && + average_load_factor < kLowSpecLoadFactor / 1.414) { + InfoLogLevel level = InfoLogLevel::INFO_LEVEL; + if (average_load_factor < kLowSpecLoadFactor / 2) { + level = InfoLogLevel::WARN_LEVEL; + } + ROCKS_LOG_AT_LEVEL( + info_log, level, + "HyperClockCache@%p table has low occupancy at full capacity. Higher " + "estimated_entry_charge (about %.1fx) would likely improve " + "performance. Recommend estimated_entry_charge=%zu", + this, kMidSpecLoadFactor / average_load_factor, min_recommendation); + } + } +} + +} // namespace clock_cache + +// DEPRECATED (see public API) +std::shared_ptr<Cache> NewClockCache( + size_t capacity, int num_shard_bits, bool strict_capacity_limit, + CacheMetadataChargePolicy metadata_charge_policy) { + return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit, + /* high_pri_pool_ratio */ 0.5, nullptr, + kDefaultToAdaptiveMutex, metadata_charge_policy, + /* low_pri_pool_ratio */ 0.0); +} + +std::shared_ptr<Cache> HyperClockCacheOptions::MakeSharedCache() const { + auto my_num_shard_bits = num_shard_bits; + if (my_num_shard_bits >= 20) { + return nullptr; // The cache cannot be sharded into too many fine pieces. + } + if (my_num_shard_bits < 0) { + // Use larger shard size to reduce risk of large entries clustering + // or skewing individual shards. 
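+    // For example, with this 32 MiB minimum, a 1 GiB cache is split into at
+    // most 32 shards.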
+    constexpr size_t min_shard_size = 32U * 1024U * 1024U;
+    my_num_shard_bits = GetDefaultCacheShardBits(capacity, min_shard_size);
+  }
+  return std::make_shared<clock_cache::HyperClockCache>(
+      capacity, estimated_entry_charge, my_num_shard_bits,
+      strict_capacity_limit, metadata_charge_policy, memory_allocator);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
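For reference, a minimal sketch of constructing this cache through the public options API (NewClockCache above is deprecated and now returns an LRUCache); the sizes here are illustrative only, not recommendations:

    #include "rocksdb/cache.h"

    rocksdb::HyperClockCacheOptions opts(
        /*_capacity=*/size_t{1} << 30,           // 1 GiB cache (illustrative)
        /*_estimated_entry_charge=*/8 * 1024);   // ~8 KiB entries (illustrative)
    // num_shard_bits is left at its default of -1, so MakeSharedCache() picks
    // the shard count using the 32 MiB minimum shard size above.
    std::shared_ptr<rocksdb::Cache> cache = opts.MakeSharedCache();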