Diffstat (limited to '')
-rw-r--r--  src/rocksdb/cache/clock_cache.cc  1404
1 file changed, 1404 insertions, 0 deletions
diff --git a/src/rocksdb/cache/clock_cache.cc b/src/rocksdb/cache/clock_cache.cc
new file mode 100644
index 000000000..6c9f18c2f
--- /dev/null
+++ b/src/rocksdb/cache/clock_cache.cc
@@ -0,0 +1,1404 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "cache/clock_cache.h"
+
+#include <cassert>
+#include <functional>
+#include <numeric>
+
+#include "cache/cache_key.h"
+#include "logging/logging.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/statistics.h"
+#include "port/lang.h"
+#include "util/hash.h"
+#include "util/math.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace clock_cache {
+
+namespace {
+inline uint64_t GetRefcount(uint64_t meta) {
+ return ((meta >> ClockHandle::kAcquireCounterShift) -
+ (meta >> ClockHandle::kReleaseCounterShift)) &
+ ClockHandle::kCounterMask;
+}
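+// For GetRefcount above, an illustrative reading (field layout assumed; see
+// ClockHandle in clock_cache.h for the authoritative shifts and masks): if
+// meta encodes an acquire count of 5 and a release count of 3, the result is
+// (5 - 3) & kCounterMask == 2. Because the difference is reduced modulo
+// 2^kCounterNumBits, the refcount stays correct even after the individual
+// counters wrap around, as long as the true refcount fits in the counter.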
+
+inline uint64_t GetInitialCountdown(Cache::Priority priority) {
+ // Set initial clock data from priority
+ // TODO: configuration parameters for priority handling and clock cycle
+ // count?
+ switch (priority) {
+ case Cache::Priority::HIGH:
+ return ClockHandle::kHighCountdown;
+ default:
+ assert(false);
+ FALLTHROUGH_INTENDED;
+ case Cache::Priority::LOW:
+ return ClockHandle::kLowCountdown;
+ case Cache::Priority::BOTTOM:
+ return ClockHandle::kBottomCountdown;
+ }
+}
+
+inline void FreeDataMarkEmpty(ClockHandle& h) {
+ // NOTE: in theory there's more room for parallelism if we copy the handle
+ // data and delay actions like this until after marking the entry as empty,
+ // but performance tests showed only a regression from copying the few
+ // words of data.
+ h.FreeData();
+
+#ifndef NDEBUG
+ // Mark slot as empty, with assertion
+ uint64_t meta = h.meta.exchange(0, std::memory_order_release);
+ assert(meta >> ClockHandle::kStateShift == ClockHandle::kStateConstruction);
+#else
+ // Mark slot as empty
+ h.meta.store(0, std::memory_order_release);
+#endif
+}
+
+inline bool ClockUpdate(ClockHandle& h) {
+ uint64_t meta = h.meta.load(std::memory_order_relaxed);
+
+ uint64_t acquire_count =
+ (meta >> ClockHandle::kAcquireCounterShift) & ClockHandle::kCounterMask;
+ uint64_t release_count =
+ (meta >> ClockHandle::kReleaseCounterShift) & ClockHandle::kCounterMask;
+ // fprintf(stderr, "ClockUpdate @ %p: %lu %lu %u\n", &h, acquire_count,
+ // release_count, (unsigned)(meta >> ClockHandle::kStateShift));
+ if (acquire_count != release_count) {
+ // Only clock update entries with no outstanding refs
+ return false;
+ }
+ if (!((meta >> ClockHandle::kStateShift) & ClockHandle::kStateShareableBit)) {
+ // Only clock update Shareable entries
+ return false;
+ }
+ if ((meta >> ClockHandle::kStateShift == ClockHandle::kStateVisible) &&
+ acquire_count > 0) {
+ // Decrement clock
+ uint64_t new_count =
+ std::min(acquire_count - 1, uint64_t{ClockHandle::kMaxCountdown} - 1);
+ // Compare-exchange in the decremented clock info, but
+ // not aggressively
+ uint64_t new_meta =
+ (uint64_t{ClockHandle::kStateVisible} << ClockHandle::kStateShift) |
+ (new_count << ClockHandle::kReleaseCounterShift) |
+ (new_count << ClockHandle::kAcquireCounterShift);
+ h.meta.compare_exchange_strong(meta, new_meta, std::memory_order_relaxed);
+ return false;
+ }
+ // Otherwise, remove entry (either unreferenced invisible or
+ // unreferenced and expired visible).
+ if (h.meta.compare_exchange_strong(
+ meta,
+ uint64_t{ClockHandle::kStateConstruction} << ClockHandle::kStateShift,
+ std::memory_order_acquire)) {
+ // Took ownership.
+ return true;
+ } else {
+ // Compare-exchange failing probably
+ // indicates the entry was used, so skip it in that case.
+ return false;
+ }
+}
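+// Illustrative walk-through of ClockUpdate above (constants assumed, e.g.
+// kMaxCountdown == 3): an unreferenced Visible entry holding countdown 3 is
+// stepped 3 -> 2 -> 1 -> 0 by successive passes of the clock hand, and a
+// later pass that still finds it unreferenced with zero countdown takes
+// ownership and evicts it. Any acquire + useful release in between bumps the
+// counters again, which is the usual CLOCK "second chance" behavior.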
+
+} // namespace
+
+void ClockHandleBasicData::FreeData() const {
+ if (deleter) {
+ UniqueId64x2 unhashed;
+ (*deleter)(
+ ClockCacheShard<HyperClockTable>::ReverseHash(hashed_key, &unhashed),
+ value);
+ }
+}
+
+HyperClockTable::HyperClockTable(
+ size_t capacity, bool /*strict_capacity_limit*/,
+ CacheMetadataChargePolicy metadata_charge_policy, const Opts& opts)
+ : length_bits_(CalcHashBits(capacity, opts.estimated_value_size,
+ metadata_charge_policy)),
+ length_bits_mask_((size_t{1} << length_bits_) - 1),
+ occupancy_limit_(static_cast<size_t>((uint64_t{1} << length_bits_) *
+ kStrictLoadFactor)),
+ array_(new HandleImpl[size_t{1} << length_bits_]) {
+ if (metadata_charge_policy ==
+ CacheMetadataChargePolicy::kFullChargeCacheMetadata) {
+ usage_ += size_t{GetTableSize()} * sizeof(HandleImpl);
+ }
+
+ static_assert(sizeof(HandleImpl) == 64U,
+ "Expecting size / alignment with common cache line size");
+}
+
+HyperClockTable::~HyperClockTable() {
+ // Assumes there are no references or active operations on any slot/element
+ // in the table.
+ for (size_t i = 0; i < GetTableSize(); i++) {
+ HandleImpl& h = array_[i];
+ switch (h.meta >> ClockHandle::kStateShift) {
+ case ClockHandle::kStateEmpty:
+ // noop
+ break;
+ case ClockHandle::kStateInvisible: // rare but possible
+ case ClockHandle::kStateVisible:
+ assert(GetRefcount(h.meta) == 0);
+ h.FreeData();
+#ifndef NDEBUG
+ Rollback(h.hashed_key, &h);
+ ReclaimEntryUsage(h.GetTotalCharge());
+#endif
+ break;
+ // otherwise
+ default:
+ assert(false);
+ break;
+ }
+ }
+
+#ifndef NDEBUG
+ for (size_t i = 0; i < GetTableSize(); i++) {
+ assert(array_[i].displacements.load() == 0);
+ }
+#endif
+
+ assert(usage_.load() == 0 ||
+ usage_.load() == size_t{GetTableSize()} * sizeof(HandleImpl));
+ assert(occupancy_ == 0);
+}
+
+// If an entry doesn't receive clock updates but is repeatedly referenced &
+// released, the acquire and release counters could overflow without some
+// intervention. This is that intervention, which should be inexpensive
+// because it only incurs a simple, very predictable check. (Applying a bit
+// mask in addition to an increment to every Release likely would be
+// relatively expensive, because it's an extra atomic update.)
+//
+// We do have to assume that we never have many millions of simultaneous
+// references to a cache handle, because we cannot represent so many
+// references with the difference in counters, masked to the number of
+// counter bits. Similarly, we assume there aren't millions of threads
+// holding transient references (which might be "undone" rather than
+// released by the way).
+//
+// Consider these possible states for each counter:
+// low: less than kMaxCountdown
+ // medium: kMaxCountdown to halfway to overflow + kMaxCountdown
+ // high: halfway to overflow + kMaxCountdown, or greater
+//
+// And these possible states for the combination of counters:
+// acquire / release
+// ------- -------
+// low low - Normal / common, with caveats (see below)
+// medium low - Can happen while holding some refs
+// high low - Violates assumptions (too many refs)
+// low medium - Violates assumptions (refs underflow, etc.)
+// medium medium - Normal (very read heavy cache)
+// high medium - Can happen while holding some refs
+// low high - This function is supposed to prevent
+// medium high - Violates assumptions (refs underflow, etc.)
+// high high - Needs CorrectNearOverflow
+//
+// Basically, this function detects (high, high) state (inferred from
+// release alone being high) and bumps it back down to (medium, medium)
+// state with the same refcount and the same logical countdown counter
+// (everything > kMaxCountdown is logically the same). Note that bumping
+// down to (low, low) would modify the countdown counter, so is "reserved"
+// in a sense.
+//
+// If near-overflow correction is triggered here, there's no guarantee
+// that another thread hasn't freed the entry and replaced it with another.
+// Therefore, it must be the case that the correction does not affect
+// entries unless they are very old (many millions of acquire-release cycles).
+// (Our bit manipulation is indeed idempotent and only affects entries in
+// exceptional cases.) We assume a pre-empted thread will not stall that long.
+// If it did, the state could be corrupted in the (unlikely) case that the top
+// bit of the acquire counter is set but not the release counter, and thus
+// we only clear the top bit of the acquire counter on resumption. It would
+// then appear that there are too many refs and the entry would be permanently
+// pinned (which is not terrible for an exceptionally rare occurrence), unless
+// it is referenced enough (at least kMaxCountdown more times) for the release
+// counter to reach "high" state again and bumped back to "medium." (This
+// motivates only checking for release counter in high state, not both in high
+// state.)
+inline void CorrectNearOverflow(uint64_t old_meta,
+ std::atomic<uint64_t>& meta) {
+ // We clear both top-most counter bits at the same time.
+ constexpr uint64_t kCounterTopBit = uint64_t{1}
+ << (ClockHandle::kCounterNumBits - 1);
+ constexpr uint64_t kClearBits =
+ (kCounterTopBit << ClockHandle::kAcquireCounterShift) |
+ (kCounterTopBit << ClockHandle::kReleaseCounterShift);
+ // A simple check that allows us to initiate clearing the top bits for
+ // a large portion of the "high" state space on release counter.
+ constexpr uint64_t kCheckBits =
+ (kCounterTopBit | (ClockHandle::kMaxCountdown + 1))
+ << ClockHandle::kReleaseCounterShift;
+
+ if (UNLIKELY(old_meta & kCheckBits)) {
+ meta.fetch_and(~kClearBits, std::memory_order_relaxed);
+ }
+}
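+// Rough numeric sketch of the correction (bit width assumed for
+// illustration): with kCounterNumBits == 30, a counter is "high" once it is
+// around 2^29. When the check fires and both counters have their top bit
+// set, the fetch_and subtracts 2^29 from each, moving (high, high) back to
+// (medium, medium) while preserving the refcount (the difference) and the
+// logical countdown (anything >= kMaxCountdown is equivalent). Since it only
+// ever clears bits, re-applying the correction is harmless.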
+
+inline Status HyperClockTable::ChargeUsageMaybeEvictStrict(
+ size_t total_charge, size_t capacity, bool need_evict_for_occupancy) {
+ if (total_charge > capacity) {
+ return Status::MemoryLimit(
+ "Cache entry too large for a single cache shard: " +
+ std::to_string(total_charge) + " > " + std::to_string(capacity));
+ }
+ // Grab any available capacity, and free up any more required.
+ size_t old_usage = usage_.load(std::memory_order_relaxed);
+ size_t new_usage;
+ if (LIKELY(old_usage != capacity)) {
+ do {
+ new_usage = std::min(capacity, old_usage + total_charge);
+ } while (!usage_.compare_exchange_weak(old_usage, new_usage,
+ std::memory_order_relaxed));
+ } else {
+ new_usage = old_usage;
+ }
+ // How much do we need to evict then?
+ size_t need_evict_charge = old_usage + total_charge - new_usage;
+ size_t request_evict_charge = need_evict_charge;
+ if (UNLIKELY(need_evict_for_occupancy) && request_evict_charge == 0) {
+ // Require at least 1 eviction.
+ request_evict_charge = 1;
+ }
+ if (request_evict_charge > 0) {
+ size_t evicted_charge = 0;
+ size_t evicted_count = 0;
+ Evict(request_evict_charge, &evicted_charge, &evicted_count);
+ occupancy_.fetch_sub(evicted_count, std::memory_order_release);
+ if (LIKELY(evicted_charge > need_evict_charge)) {
+ assert(evicted_count > 0);
+ // Evicted more than enough
+ usage_.fetch_sub(evicted_charge - need_evict_charge,
+ std::memory_order_relaxed);
+ } else if (evicted_charge < need_evict_charge ||
+ (UNLIKELY(need_evict_for_occupancy) && evicted_count == 0)) {
+ // Roll back to old usage minus evicted
+ usage_.fetch_sub(evicted_charge + (new_usage - old_usage),
+ std::memory_order_relaxed);
+ if (evicted_charge < need_evict_charge) {
+ return Status::MemoryLimit(
+ "Insert failed because unable to evict entries to stay within "
+ "capacity limit.");
+ } else {
+ return Status::MemoryLimit(
+ "Insert failed because unable to evict entries to stay within "
+ "table occupancy limit.");
+ }
+ }
+ // If we needed to evict something and we are proceeding, we must have
+ // evicted something.
+ assert(evicted_count > 0);
+ }
+ return Status::OK();
+}
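+// Worked example for the strict path above (numbers arbitrary): with
+// capacity == 32, old_usage == 30 and an incoming total_charge of 5, the
+// compare-exchange loop settles on new_usage == min(32, 35) == 32, so 2
+// units of headroom were grabbed and need_evict_charge == 30 + 5 - 32 == 3
+// must still be freed by Evict before the insert is admitted.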
+
+inline bool HyperClockTable::ChargeUsageMaybeEvictNonStrict(
+ size_t total_charge, size_t capacity, bool need_evict_for_occupancy) {
+ // For simplicity, we consider that either the cache can accept the insert
+ // with no evictions, or we must evict enough to make (at least) enough
+ // space. It could lead to unnecessary failures or excessive evictions in
+ // some extreme cases, but allows a fast, simple protocol. If we allow a
+ // race to get us over capacity, then we might never get back to capacity
+ // limit if the sizes of entries allow each insertion to evict the minimum
+ // charge. Thus, we should evict some extra if it's not a significant
+ // portion of the shard capacity. This can have the side benefit of
+ // involving fewer threads in eviction.
+ size_t old_usage = usage_.load(std::memory_order_relaxed);
+ size_t need_evict_charge;
+ // NOTE: if total_charge > old_usage, there isn't yet enough to evict
+ // `total_charge` amount. Even if we only try to evict `old_usage` amount,
+ // there's likely something referenced and we would eat CPU looking for
+ // enough to evict.
+ if (old_usage + total_charge <= capacity || total_charge > old_usage) {
+ // Good enough for me (might run over with a race)
+ need_evict_charge = 0;
+ } else {
+ // Try to evict enough space, and maybe some extra
+ need_evict_charge = total_charge;
+ if (old_usage > capacity) {
+ // Not too much to avoid thundering herd while avoiding strict
+ // synchronization, such as the compare_exchange used with strict
+ // capacity limit.
+ need_evict_charge += std::min(capacity / 1024, total_charge) + 1;
+ }
+ }
+ if (UNLIKELY(need_evict_for_occupancy) && need_evict_charge == 0) {
+ // Special case: require at least 1 eviction if we only have to
+ // deal with occupancy
+ need_evict_charge = 1;
+ }
+ size_t evicted_charge = 0;
+ size_t evicted_count = 0;
+ if (need_evict_charge > 0) {
+ Evict(need_evict_charge, &evicted_charge, &evicted_count);
+ // Deal with potential occupancy deficit
+ if (UNLIKELY(need_evict_for_occupancy) && evicted_count == 0) {
+ assert(evicted_charge == 0);
+ // Can't meet occupancy requirement
+ return false;
+ } else {
+ // Update occupancy for evictions
+ occupancy_.fetch_sub(evicted_count, std::memory_order_release);
+ }
+ }
+ // Track new usage even if we weren't able to evict enough
+ usage_.fetch_add(total_charge - evicted_charge, std::memory_order_relaxed);
+ // No underflow
+ assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2);
+ // Success
+ return true;
+}
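+// Worked example for the non-strict path above (numbers arbitrary): with
+// capacity == 1 << 20, usage already at capacity + 100 and an incoming
+// total_charge of 500, we request 500 + min(1024, 500) + 1 == 1001 units
+// from Evict, i.e. roughly double the entry's charge, so a shard that has
+// drifted over capacity tends to work its way back under it as inserts
+// continue.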
+
+inline HyperClockTable::HandleImpl* HyperClockTable::DetachedInsert(
+ const ClockHandleBasicData& proto) {
+ // Heap allocated separate from table
+ HandleImpl* h = new HandleImpl();
+ ClockHandleBasicData* h_alias = h;
+ *h_alias = proto;
+ h->SetDetached();
+ // Single reference (detached entries only created if returning a refed
+ // Handle back to user)
+ uint64_t meta = uint64_t{ClockHandle::kStateInvisible}
+ << ClockHandle::kStateShift;
+ meta |= uint64_t{1} << ClockHandle::kAcquireCounterShift;
+ h->meta.store(meta, std::memory_order_release);
+ // Keep track of how much of usage is detached
+ detached_usage_.fetch_add(proto.GetTotalCharge(), std::memory_order_relaxed);
+ return h;
+}
+
+Status HyperClockTable::Insert(const ClockHandleBasicData& proto,
+ HandleImpl** handle, Cache::Priority priority,
+ size_t capacity, bool strict_capacity_limit) {
+ // Do we have the available occupancy? Optimistically assume we do
+ // and deal with it if we don't.
+ size_t old_occupancy = occupancy_.fetch_add(1, std::memory_order_acquire);
+ auto revert_occupancy_fn = [&]() {
+ occupancy_.fetch_sub(1, std::memory_order_relaxed);
+ };
+ // Whether we over-committed and need an eviction to make up for it
+ bool need_evict_for_occupancy = old_occupancy >= occupancy_limit_;
+
+ // Usage/capacity handling is somewhat different depending on
+ // strict_capacity_limit, but mostly pessimistic.
+ bool use_detached_insert = false;
+ const size_t total_charge = proto.GetTotalCharge();
+ if (strict_capacity_limit) {
+ Status s = ChargeUsageMaybeEvictStrict(total_charge, capacity,
+ need_evict_for_occupancy);
+ if (!s.ok()) {
+ revert_occupancy_fn();
+ return s;
+ }
+ } else {
+ // Case strict_capacity_limit == false
+ bool success = ChargeUsageMaybeEvictNonStrict(total_charge, capacity,
+ need_evict_for_occupancy);
+ if (!success) {
+ revert_occupancy_fn();
+ if (handle == nullptr) {
+ // Don't insert the entry but still return ok, as if the entry were
+ // inserted into the cache and evicted immediately.
+ proto.FreeData();
+ return Status::OK();
+ } else {
+ // Need to track usage of fallback detached insert
+ usage_.fetch_add(total_charge, std::memory_order_relaxed);
+ use_detached_insert = true;
+ }
+ }
+ }
+ auto revert_usage_fn = [&]() {
+ usage_.fetch_sub(total_charge, std::memory_order_relaxed);
+ // No underflow
+ assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2);
+ };
+
+ if (!use_detached_insert) {
+ // Attempt a table insert, but abort if we find an existing entry for the
+ // key. If we were to overwrite old entries, we would either
+ // * Have to gain ownership over an existing entry to overwrite it, which
+ // would only work if there are no outstanding (read) references and would
+ // create a small gap in availability of the entry (old or new) to lookups.
+ // * Have to insert into a suboptimal location (more probes) so that the
+ // old entry can be kept around as well.
+
+ uint64_t initial_countdown = GetInitialCountdown(priority);
+ assert(initial_countdown > 0);
+
+ size_t probe = 0;
+ HandleImpl* e = FindSlot(
+ proto.hashed_key,
+ [&](HandleImpl* h) {
+ // Optimistically transition the slot from "empty" to
+ // "under construction" (no effect on other states)
+ uint64_t old_meta =
+ h->meta.fetch_or(uint64_t{ClockHandle::kStateOccupiedBit}
+ << ClockHandle::kStateShift,
+ std::memory_order_acq_rel);
+ uint64_t old_state = old_meta >> ClockHandle::kStateShift;
+
+ if (old_state == ClockHandle::kStateEmpty) {
+ // We've started inserting into an available slot, and taken
+ // ownership. Save the data fields.
+ ClockHandleBasicData* h_alias = h;
+ *h_alias = proto;
+
+ // Transition from "under construction" state to "visible" state
+ uint64_t new_meta = uint64_t{ClockHandle::kStateVisible}
+ << ClockHandle::kStateShift;
+
+ // Maybe with an outstanding reference
+ new_meta |= initial_countdown << ClockHandle::kAcquireCounterShift;
+ new_meta |= (initial_countdown - (handle != nullptr))
+ << ClockHandle::kReleaseCounterShift;
+
+#ifndef NDEBUG
+ // Save the state transition, with assertion
+ old_meta = h->meta.exchange(new_meta, std::memory_order_release);
+ assert(old_meta >> ClockHandle::kStateShift ==
+ ClockHandle::kStateConstruction);
+#else
+ // Save the state transition
+ h->meta.store(new_meta, std::memory_order_release);
+#endif
+ return true;
+ } else if (old_state != ClockHandle::kStateVisible) {
+ // Slot not usable / touchable now
+ return false;
+ }
+ // Existing, visible entry, which might be a match.
+ // But first, we need to acquire a ref to read it. In fact, we acquire
+ // initial_countdown refs at once, so that we boost the clock state if
+ // this is a match.
+ old_meta = h->meta.fetch_add(
+ ClockHandle::kAcquireIncrement * initial_countdown,
+ std::memory_order_acq_rel);
+ // Like Lookup
+ if ((old_meta >> ClockHandle::kStateShift) ==
+ ClockHandle::kStateVisible) {
+ // Acquired a read reference
+ if (h->hashed_key == proto.hashed_key) {
+ // Match. Release in a way that boosts the clock state
+ old_meta = h->meta.fetch_add(
+ ClockHandle::kReleaseIncrement * initial_countdown,
+ std::memory_order_acq_rel);
+ // Correct for possible (but rare) overflow
+ CorrectNearOverflow(old_meta, h->meta);
+ // Insert detached instead (only if return handle needed)
+ use_detached_insert = true;
+ return true;
+ } else {
+ // Mismatch. Pretend we never took the reference
+ old_meta = h->meta.fetch_sub(
+ ClockHandle::kAcquireIncrement * initial_countdown,
+ std::memory_order_acq_rel);
+ }
+ } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) ==
+ ClockHandle::kStateInvisible)) {
+ // Pretend we never took the reference
+ // WART: there's a tiny chance we release last ref to invisible
+ // entry here. If that happens, we let eviction take care of it.
+ old_meta = h->meta.fetch_sub(
+ ClockHandle::kAcquireIncrement * initial_countdown,
+ std::memory_order_acq_rel);
+ } else {
+ // For other states, incrementing the acquire counter has no effect
+ // so we don't need to undo it.
+ // Slot not usable / touchable now.
+ }
+ (void)old_meta;
+ return false;
+ },
+ [&](HandleImpl* /*h*/) { return false; },
+ [&](HandleImpl* h) {
+ h->displacements.fetch_add(1, std::memory_order_relaxed);
+ },
+ probe);
+ if (e == nullptr) {
+ // The occupancy check and the non-aborting FindSlot above should
+ // generally prevent this, but it's theoretically possible for other
+ // threads to evict and replace entries in just the right order to hit
+ // every slot while it is populated. Assuming random hashing, the chance of that
+ // should be no higher than pow(kStrictLoadFactor, n) for n slots.
+ // That should be infeasible for roughly n >= 256, so if this assertion
+ // fails, that suggests something is going wrong.
+ assert(GetTableSize() < 256);
+ use_detached_insert = true;
+ }
+ if (!use_detached_insert) {
+ // Successfully inserted
+ if (handle) {
+ *handle = e;
+ }
+ return Status::OK();
+ }
+ // Roll back table insertion
+ Rollback(proto.hashed_key, e);
+ revert_occupancy_fn();
+ // Maybe fall back on detached insert
+ if (handle == nullptr) {
+ revert_usage_fn();
+ // As if the unrefed entry were immediately evicted
+ proto.FreeData();
+ return Status::OK();
+ }
+ }
+
+ // Run detached insert
+ assert(use_detached_insert);
+
+ *handle = DetachedInsert(proto);
+
+ // The OkOverwritten status is used to count "redundant" insertions into
+ // block cache. This implementation doesn't strictly check for redundant
+ // insertions, but we are probably more interested in how many insertions
+ // didn't go into the table (were instead "detached"), whether due to a
+ // redundant insert or one of the other use_detached_insert reasons above.
+ return Status::OkOverwritten();
+}
+
+HyperClockTable::HandleImpl* HyperClockTable::Lookup(
+ const UniqueId64x2& hashed_key) {
+ size_t probe = 0;
+ HandleImpl* e = FindSlot(
+ hashed_key,
+ [&](HandleImpl* h) {
+ // Mostly branch-free version (similar performance)
+ /*
+ uint64_t old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement,
+ std::memory_order_acquire);
+ bool Shareable = (old_meta >> (ClockHandle::kStateShift + 1)) & 1U;
+ bool visible = (old_meta >> ClockHandle::kStateShift) & 1U;
+ bool match = (h->key == key) & visible;
+ h->meta.fetch_sub(static_cast<uint64_t>(Shareable & !match) <<
+ ClockHandle::kAcquireCounterShift, std::memory_order_release); return
+ match;
+ */
+ // Optimistic lookup should pay off when the table is relatively
+ // sparse.
+ constexpr bool kOptimisticLookup = true;
+ uint64_t old_meta;
+ if (!kOptimisticLookup) {
+ old_meta = h->meta.load(std::memory_order_acquire);
+ if ((old_meta >> ClockHandle::kStateShift) !=
+ ClockHandle::kStateVisible) {
+ return false;
+ }
+ }
+ // (Optimistically) increment acquire counter
+ old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement,
+ std::memory_order_acquire);
+ // Check if it's an entry visible to lookups
+ if ((old_meta >> ClockHandle::kStateShift) ==
+ ClockHandle::kStateVisible) {
+ // Acquired a read reference
+ if (h->hashed_key == hashed_key) {
+ // Match
+ return true;
+ } else {
+ // Mismatch. Pretend we never took the reference
+ old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
+ std::memory_order_release);
+ }
+ } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) ==
+ ClockHandle::kStateInvisible)) {
+ // Pretend we never took the reference
+ // WART: there's a tiny chance we release last ref to invisible
+ // entry here. If that happens, we let eviction take care of it.
+ old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
+ std::memory_order_release);
+ } else {
+ // For other states, incrementing the acquire counter has no effect
+ // so we don't need to undo it. Furthermore, we cannot safely undo
+ // it because we did not acquire a read reference to lock the
+ // entry in a Shareable state.
+ }
+ (void)old_meta;
+ return false;
+ },
+ [&](HandleImpl* h) {
+ return h->displacements.load(std::memory_order_relaxed) == 0;
+ },
+ [&](HandleImpl* /*h*/) {}, probe);
+
+ return e;
+}
+
+bool HyperClockTable::Release(HandleImpl* h, bool useful,
+ bool erase_if_last_ref) {
+ // In contrast with LRUCache's Release, this function won't delete the handle
+ // when the cache is above capacity and the reference is the last one. Space
+ // is only freed up by EvictFromClock (called by Insert when space is needed)
+ // and Erase. We do this to avoid an extra atomic read of the variable usage_.
+
+ uint64_t old_meta;
+ if (useful) {
+ // Increment release counter to indicate was used
+ old_meta = h->meta.fetch_add(ClockHandle::kReleaseIncrement,
+ std::memory_order_release);
+ } else {
+ // Decrement acquire counter to pretend it never happened
+ old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
+ std::memory_order_release);
+ }
+
+ assert((old_meta >> ClockHandle::kStateShift) &
+ ClockHandle::kStateShareableBit);
+ // No underflow
+ assert(((old_meta >> ClockHandle::kAcquireCounterShift) &
+ ClockHandle::kCounterMask) !=
+ ((old_meta >> ClockHandle::kReleaseCounterShift) &
+ ClockHandle::kCounterMask));
+
+ if (erase_if_last_ref || UNLIKELY(old_meta >> ClockHandle::kStateShift ==
+ ClockHandle::kStateInvisible)) {
+ // Update for last fetch_add op
+ if (useful) {
+ old_meta += ClockHandle::kReleaseIncrement;
+ } else {
+ old_meta -= ClockHandle::kAcquireIncrement;
+ }
+ // Take ownership if no refs
+ do {
+ if (GetRefcount(old_meta) != 0) {
+ // Not last ref at some point in time during this Release call
+ // Correct for possible (but rare) overflow
+ CorrectNearOverflow(old_meta, h->meta);
+ return false;
+ }
+ if ((old_meta & (uint64_t{ClockHandle::kStateShareableBit}
+ << ClockHandle::kStateShift)) == 0) {
+ // Someone else took ownership
+ return false;
+ }
+ // Note that there's a small chance that we release, another thread
+ // replaces this entry with another, reaches zero refs, and then we end
+ // up erasing that other entry. That's an acceptable risk / imprecision.
+ } while (!h->meta.compare_exchange_weak(
+ old_meta,
+ uint64_t{ClockHandle::kStateConstruction} << ClockHandle::kStateShift,
+ std::memory_order_acquire));
+ // Took ownership
+ size_t total_charge = h->GetTotalCharge();
+ if (UNLIKELY(h->IsDetached())) {
+ h->FreeData();
+ // Delete detached handle
+ delete h;
+ detached_usage_.fetch_sub(total_charge, std::memory_order_relaxed);
+ usage_.fetch_sub(total_charge, std::memory_order_relaxed);
+ } else {
+ Rollback(h->hashed_key, h);
+ FreeDataMarkEmpty(*h);
+ ReclaimEntryUsage(total_charge);
+ }
+ return true;
+ } else {
+ // Correct for possible (but rare) overflow
+ CorrectNearOverflow(old_meta, h->meta);
+ return false;
+ }
+}
+
+void HyperClockTable::Ref(HandleImpl& h) {
+ // Increment acquire counter
+ uint64_t old_meta = h.meta.fetch_add(ClockHandle::kAcquireIncrement,
+ std::memory_order_acquire);
+
+ assert((old_meta >> ClockHandle::kStateShift) &
+ ClockHandle::kStateShareableBit);
+ // Must have already had a reference
+ assert(GetRefcount(old_meta) > 0);
+ (void)old_meta;
+}
+
+void HyperClockTable::TEST_RefN(HandleImpl& h, size_t n) {
+ // Increment acquire counter
+ uint64_t old_meta = h.meta.fetch_add(n * ClockHandle::kAcquireIncrement,
+ std::memory_order_acquire);
+
+ assert((old_meta >> ClockHandle::kStateShift) &
+ ClockHandle::kStateShareableBit);
+ (void)old_meta;
+}
+
+void HyperClockTable::TEST_ReleaseN(HandleImpl* h, size_t n) {
+ if (n > 0) {
+ // Split into n - 1 and 1 steps.
+ uint64_t old_meta = h->meta.fetch_add(
+ (n - 1) * ClockHandle::kReleaseIncrement, std::memory_order_acquire);
+ assert((old_meta >> ClockHandle::kStateShift) &
+ ClockHandle::kStateShareableBit);
+ (void)old_meta;
+
+ Release(h, /*useful*/ true, /*erase_if_last_ref*/ false);
+ }
+}
+
+void HyperClockTable::Erase(const UniqueId64x2& hashed_key) {
+ size_t probe = 0;
+ (void)FindSlot(
+ hashed_key,
+ [&](HandleImpl* h) {
+ // Could be multiple entries in rare cases. Erase them all.
+ // Optimistically increment acquire counter
+ uint64_t old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement,
+ std::memory_order_acquire);
+ // Check if it's an entry visible to lookups
+ if ((old_meta >> ClockHandle::kStateShift) ==
+ ClockHandle::kStateVisible) {
+ // Acquired a read reference
+ if (h->hashed_key == hashed_key) {
+ // Match. Set invisible.
+ old_meta =
+ h->meta.fetch_and(~(uint64_t{ClockHandle::kStateVisibleBit}
+ << ClockHandle::kStateShift),
+ std::memory_order_acq_rel);
+ // Apply update to local copy
+ old_meta &= ~(uint64_t{ClockHandle::kStateVisibleBit}
+ << ClockHandle::kStateShift);
+ for (;;) {
+ uint64_t refcount = GetRefcount(old_meta);
+ assert(refcount > 0);
+ if (refcount > 1) {
+ // Not last ref at some point in time during this Erase call
+ // Pretend we never took the reference
+ h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
+ std::memory_order_release);
+ break;
+ } else if (h->meta.compare_exchange_weak(
+ old_meta,
+ uint64_t{ClockHandle::kStateConstruction}
+ << ClockHandle::kStateShift,
+ std::memory_order_acq_rel)) {
+ // Took ownership
+ assert(hashed_key == h->hashed_key);
+ size_t total_charge = h->GetTotalCharge();
+ FreeDataMarkEmpty(*h);
+ ReclaimEntryUsage(total_charge);
+ // We already have a copy of hashed_key in this case, so OK to
+ // delay Rollback until after releasing the entry
+ Rollback(hashed_key, h);
+ break;
+ }
+ }
+ } else {
+ // Mismatch. Pretend we never took the reference
+ h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
+ std::memory_order_release);
+ }
+ } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) ==
+ ClockHandle::kStateInvisible)) {
+ // Pretend we never took the reference
+ // WART: there's a tiny chance we release last ref to invisible
+ // entry here. If that happens, we let eviction take care of it.
+ h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
+ std::memory_order_release);
+ } else {
+ // For other states, incrementing the acquire counter has no effect
+ // so we don't need to undo it.
+ }
+ return false;
+ },
+ [&](HandleImpl* h) {
+ return h->displacements.load(std::memory_order_relaxed) == 0;
+ },
+ [&](HandleImpl* /*h*/) {}, probe);
+}
+
+void HyperClockTable::ConstApplyToEntriesRange(
+ std::function<void(const HandleImpl&)> func, size_t index_begin,
+ size_t index_end, bool apply_if_will_be_deleted) const {
+ uint64_t check_state_mask = ClockHandle::kStateShareableBit;
+ if (!apply_if_will_be_deleted) {
+ check_state_mask |= ClockHandle::kStateVisibleBit;
+ }
+
+ for (size_t i = index_begin; i < index_end; i++) {
+ HandleImpl& h = array_[i];
+
+ // Note: to avoid using compare_exchange, we have to be extra careful.
+ uint64_t old_meta = h.meta.load(std::memory_order_relaxed);
+ // Check if it's an entry visible to lookups
+ if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
+ // Increment acquire counter. Note: it's possible that the entry has
+ // completely changed since we loaded old_meta, but incrementing acquire
+ // count is always safe. (Similar to optimistic Lookup here.)
+ old_meta = h.meta.fetch_add(ClockHandle::kAcquireIncrement,
+ std::memory_order_acquire);
+ // Check whether we actually acquired a reference.
+ if ((old_meta >> ClockHandle::kStateShift) &
+ ClockHandle::kStateShareableBit) {
+ // Apply func if appropriate
+ if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
+ func(h);
+ }
+ // Pretend we never took the reference
+ h.meta.fetch_sub(ClockHandle::kAcquireIncrement,
+ std::memory_order_release);
+ // No net change, so don't need to check for overflow
+ } else {
+ // For other states, incrementing the acquire counter has no effect
+ // so we don't need to undo it. Furthermore, we cannot safely undo
+ // it because we did not acquire a read reference to lock the
+ // entry in a Shareable state.
+ }
+ }
+ }
+}
+
+void HyperClockTable::EraseUnRefEntries() {
+ for (size_t i = 0; i <= this->length_bits_mask_; i++) {
+ HandleImpl& h = array_[i];
+
+ uint64_t old_meta = h.meta.load(std::memory_order_relaxed);
+ if (old_meta & (uint64_t{ClockHandle::kStateShareableBit}
+ << ClockHandle::kStateShift) &&
+ GetRefcount(old_meta) == 0 &&
+ h.meta.compare_exchange_strong(old_meta,
+ uint64_t{ClockHandle::kStateConstruction}
+ << ClockHandle::kStateShift,
+ std::memory_order_acquire)) {
+ // Took ownership
+ size_t total_charge = h.GetTotalCharge();
+ Rollback(h.hashed_key, &h);
+ FreeDataMarkEmpty(h);
+ ReclaimEntryUsage(total_charge);
+ }
+ }
+}
+
+inline HyperClockTable::HandleImpl* HyperClockTable::FindSlot(
+ const UniqueId64x2& hashed_key, std::function<bool(HandleImpl*)> match_fn,
+ std::function<bool(HandleImpl*)> abort_fn,
+ std::function<void(HandleImpl*)> update_fn, size_t& probe) {
+ // NOTE: upper 32 bits of hashed_key[0] is used for sharding
+ //
+ // We use double-hashing probing. Every probe in the sequence is a
+ // pseudorandom integer, computed as a linear function of two random hashes,
+ // which we call base and increment. Specifically, the i-th probe is base + i
+ // * increment modulo the table size.
+ size_t base = static_cast<size_t>(hashed_key[1]);
+ // We use an odd increment, which is relatively prime with the power-of-two
+ // table size. This implies that we cycle back to the first probe only
+ // after probing every slot exactly once.
+ // TODO: we could also reconsider linear probing, though locality benefits
+ // are limited because each slot is a full cache line
+ size_t increment = static_cast<size_t>(hashed_key[0]) | 1U;
+ size_t current = ModTableSize(base + probe * increment);
+ while (probe <= length_bits_mask_) {
+ HandleImpl* h = &array_[current];
+ if (match_fn(h)) {
+ probe++;
+ return h;
+ }
+ if (abort_fn(h)) {
+ return nullptr;
+ }
+ probe++;
+ update_fn(h);
+ current = ModTableSize(current + increment);
+ }
+ // We looped back.
+ return nullptr;
+}
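+// Illustrative probe sequence for FindSlot above (tiny table for clarity):
+// with 8 slots (length_bits_ == 3), base == 12 and increment == 5 (always
+// odd), the probes visit slots 4, 1, 6, 3, 0, 5, 2, 7 -- every slot exactly
+// once before wrapping, because an odd increment is relatively prime with
+// the power-of-two table size.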
+
+inline void HyperClockTable::Rollback(const UniqueId64x2& hashed_key,
+ const HandleImpl* h) {
+ size_t current = ModTableSize(hashed_key[1]);
+ size_t increment = static_cast<size_t>(hashed_key[0]) | 1U;
+ while (&array_[current] != h) {
+ array_[current].displacements.fetch_sub(1, std::memory_order_relaxed);
+ current = ModTableSize(current + increment);
+ }
+}
+
+inline void HyperClockTable::ReclaimEntryUsage(size_t total_charge) {
+ auto old_occupancy = occupancy_.fetch_sub(1U, std::memory_order_release);
+ (void)old_occupancy;
+ // No underflow
+ assert(old_occupancy > 0);
+ auto old_usage = usage_.fetch_sub(total_charge, std::memory_order_relaxed);
+ (void)old_usage;
+ // No underflow
+ assert(old_usage >= total_charge);
+}
+
+inline void HyperClockTable::Evict(size_t requested_charge,
+ size_t* freed_charge, size_t* freed_count) {
+ // precondition
+ assert(requested_charge > 0);
+
+ // TODO: make a tuning parameter?
+ constexpr size_t step_size = 4;
+
+ // First (concurrent) increment clock pointer
+ uint64_t old_clock_pointer =
+ clock_pointer_.fetch_add(step_size, std::memory_order_relaxed);
+
+ // Cap the eviction effort at this thread (along with those operating in
+ // parallel) circling through the whole structure kMaxCountdown times.
+ // In other words, this eviction run must find something/anything that is
+ // unreferenced at start of and during the eviction run that isn't reclaimed
+ // by a concurrent eviction run.
+ uint64_t max_clock_pointer =
+ old_clock_pointer + (ClockHandle::kMaxCountdown << length_bits_);
+
+ for (;;) {
+ for (size_t i = 0; i < step_size; i++) {
+ HandleImpl& h = array_[ModTableSize(Lower32of64(old_clock_pointer + i))];
+ bool evicting = ClockUpdate(h);
+ if (evicting) {
+ Rollback(h.hashed_key, &h);
+ *freed_charge += h.GetTotalCharge();
+ *freed_count += 1;
+ FreeDataMarkEmpty(h);
+ }
+ }
+
+ // Loop exit condition
+ if (*freed_charge >= requested_charge) {
+ return;
+ }
+ if (old_clock_pointer >= max_clock_pointer) {
+ return;
+ }
+
+ // Advance clock pointer (concurrently)
+ old_clock_pointer =
+ clock_pointer_.fetch_add(step_size, std::memory_order_relaxed);
+ }
+}
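+// For intuition (constants assumed): with kMaxCountdown == 3 and an
+// 8192-slot table (length_bits_ == 13), max_clock_pointer lets the clock
+// hand advance at most 3 * 8192 slots past where this run started, i.e. at
+// most three full sweeps shared with concurrent evictors, before giving up.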
+
+template <class Table>
+ClockCacheShard<Table>::ClockCacheShard(
+ size_t capacity, bool strict_capacity_limit,
+ CacheMetadataChargePolicy metadata_charge_policy,
+ const typename Table::Opts& opts)
+ : CacheShardBase(metadata_charge_policy),
+ table_(capacity, strict_capacity_limit, metadata_charge_policy, opts),
+ capacity_(capacity),
+ strict_capacity_limit_(strict_capacity_limit) {
+ // Initial charge metadata should not exceed capacity
+ assert(table_.GetUsage() <= capacity_ || capacity_ < sizeof(HandleImpl));
+}
+
+template <class Table>
+void ClockCacheShard<Table>::EraseUnRefEntries() {
+ table_.EraseUnRefEntries();
+}
+
+template <class Table>
+void ClockCacheShard<Table>::ApplyToSomeEntries(
+ const std::function<void(const Slice& key, void* value, size_t charge,
+ DeleterFn deleter)>& callback,
+ size_t average_entries_per_lock, size_t* state) {
+ // The state is essentially going to be the starting hash, which works
+ // nicely even if we resize between calls because we use upper-most
+ // hash bits for table indexes.
+ size_t length_bits = table_.GetLengthBits();
+ size_t length = table_.GetTableSize();
+
+ assert(average_entries_per_lock > 0);
+ // Assuming we are called with same average_entries_per_lock repeatedly,
+ // this simplifies some logic (index_end will not overflow).
+ assert(average_entries_per_lock < length || *state == 0);
+
+ size_t index_begin = *state >> (sizeof(size_t) * 8u - length_bits);
+ size_t index_end = index_begin + average_entries_per_lock;
+ if (index_end >= length) {
+ // Going to end.
+ index_end = length;
+ *state = SIZE_MAX;
+ } else {
+ *state = index_end << (sizeof(size_t) * 8u - length_bits);
+ }
+
+ table_.ConstApplyToEntriesRange(
+ [callback](const HandleImpl& h) {
+ UniqueId64x2 unhashed;
+ callback(ReverseHash(h.hashed_key, &unhashed), h.value,
+ h.GetTotalCharge(), h.deleter);
+ },
+ index_begin, index_end, false);
+}
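+// Illustrative iteration (sizes invented): with a 16-slot table
+// (length_bits == 4), a 64-bit size_t and average_entries_per_lock == 4,
+// the first call visits slots [0, 4) and stores 4 << 60 in *state; the next
+// call decodes index_begin == 4, visits [4, 8), and so on until *state is
+// set to SIZE_MAX at the end of the table.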
+
+int HyperClockTable::CalcHashBits(
+ size_t capacity, size_t estimated_value_size,
+ CacheMetadataChargePolicy metadata_charge_policy) {
+ double average_slot_charge = estimated_value_size * kLoadFactor;
+ if (metadata_charge_policy == kFullChargeCacheMetadata) {
+ average_slot_charge += sizeof(HandleImpl);
+ }
+ assert(average_slot_charge > 0.0);
+ uint64_t num_slots =
+ static_cast<uint64_t>(capacity / average_slot_charge + 0.999999);
+
+ int hash_bits = FloorLog2((num_slots << 1) - 1);
+ if (metadata_charge_policy == kFullChargeCacheMetadata) {
+ // For very small estimated value sizes, it's possible to overshoot
+ while (hash_bits > 0 &&
+ uint64_t{sizeof(HandleImpl)} << hash_bits > capacity) {
+ hash_bits--;
+ }
+ }
+ return hash_bits;
+}
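+// Rough sizing example (kLoadFactor value assumed): for a 32 MiB shard with
+// estimated_value_size == 8 KiB and a target load factor around 0.7, the
+// average slot charge is ~5.7 KiB, so ~5800 slots are wanted; hash_bits then
+// rounds up to 13, giving an 8192-slot table that sits near the target load
+// factor when the shard is filled to capacity.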
+
+template <class Table>
+void ClockCacheShard<Table>::SetCapacity(size_t capacity) {
+ capacity_.store(capacity, std::memory_order_relaxed);
+ // next Insert will take care of any necessary evictions
+}
+
+template <class Table>
+void ClockCacheShard<Table>::SetStrictCapacityLimit(
+ bool strict_capacity_limit) {
+ strict_capacity_limit_.store(strict_capacity_limit,
+ std::memory_order_relaxed);
+ // next Insert will take care of any necessary evictions
+}
+
+template <class Table>
+Status ClockCacheShard<Table>::Insert(const Slice& key,
+ const UniqueId64x2& hashed_key,
+ void* value, size_t charge,
+ Cache::DeleterFn deleter,
+ HandleImpl** handle,
+ Cache::Priority priority) {
+ if (UNLIKELY(key.size() != kCacheKeySize)) {
+ return Status::NotSupported("ClockCache only supports key size " +
+ std::to_string(kCacheKeySize) + "B");
+ }
+ ClockHandleBasicData proto;
+ proto.hashed_key = hashed_key;
+ proto.value = value;
+ proto.deleter = deleter;
+ proto.total_charge = charge;
+ Status s = table_.Insert(
+ proto, handle, priority, capacity_.load(std::memory_order_relaxed),
+ strict_capacity_limit_.load(std::memory_order_relaxed));
+ return s;
+}
+
+template <class Table>
+typename ClockCacheShard<Table>::HandleImpl* ClockCacheShard<Table>::Lookup(
+ const Slice& key, const UniqueId64x2& hashed_key) {
+ if (UNLIKELY(key.size() != kCacheKeySize)) {
+ return nullptr;
+ }
+ return table_.Lookup(hashed_key);
+}
+
+template <class Table>
+bool ClockCacheShard<Table>::Ref(HandleImpl* h) {
+ if (h == nullptr) {
+ return false;
+ }
+ table_.Ref(*h);
+ return true;
+}
+
+template <class Table>
+bool ClockCacheShard<Table>::Release(HandleImpl* handle, bool useful,
+ bool erase_if_last_ref) {
+ if (handle == nullptr) {
+ return false;
+ }
+ return table_.Release(handle, useful, erase_if_last_ref);
+}
+
+template <class Table>
+void ClockCacheShard<Table>::TEST_RefN(HandleImpl* h, size_t n) {
+ table_.TEST_RefN(*h, n);
+}
+
+template <class Table>
+void ClockCacheShard<Table>::TEST_ReleaseN(HandleImpl* h, size_t n) {
+ table_.TEST_ReleaseN(h, n);
+}
+
+template <class Table>
+bool ClockCacheShard<Table>::Release(HandleImpl* handle,
+ bool erase_if_last_ref) {
+ return Release(handle, /*useful=*/true, erase_if_last_ref);
+}
+
+template <class Table>
+void ClockCacheShard<Table>::Erase(const Slice& key,
+ const UniqueId64x2& hashed_key) {
+ if (UNLIKELY(key.size() != kCacheKeySize)) {
+ return;
+ }
+ table_.Erase(hashed_key);
+}
+
+template <class Table>
+size_t ClockCacheShard<Table>::GetUsage() const {
+ return table_.GetUsage();
+}
+
+template <class Table>
+size_t ClockCacheShard<Table>::GetDetachedUsage() const {
+ return table_.GetDetachedUsage();
+}
+
+template <class Table>
+size_t ClockCacheShard<Table>::GetCapacity() const {
+ return capacity_;
+}
+
+template <class Table>
+size_t ClockCacheShard<Table>::GetPinnedUsage() const {
+ // Computes the pinned usage by scanning the whole hash table. This
+ // is slow, but avoids keeping an exact counter on the clock usage,
+ // i.e., the number of not externally referenced elements.
+ // Why avoid this counter? Because Lookup removes elements from the clock
+ // list, so it would need to update the pinned usage every time,
+ // which creates additional synchronization costs.
+ size_t table_pinned_usage = 0;
+ const bool charge_metadata =
+ metadata_charge_policy_ == kFullChargeCacheMetadata;
+ table_.ConstApplyToEntriesRange(
+ [&table_pinned_usage, charge_metadata](const HandleImpl& h) {
+ uint64_t meta = h.meta.load(std::memory_order_relaxed);
+ uint64_t refcount = GetRefcount(meta);
+ // Holding one ref for ConstApplyToEntriesRange
+ assert(refcount > 0);
+ if (refcount > 1) {
+ table_pinned_usage += h.GetTotalCharge();
+ if (charge_metadata) {
+ table_pinned_usage += sizeof(HandleImpl);
+ }
+ }
+ },
+ 0, table_.GetTableSize(), true);
+
+ return table_pinned_usage + table_.GetDetachedUsage();
+}
+
+template <class Table>
+size_t ClockCacheShard<Table>::GetOccupancyCount() const {
+ return table_.GetOccupancy();
+}
+
+template <class Table>
+size_t ClockCacheShard<Table>::GetOccupancyLimit() const {
+ return table_.GetOccupancyLimit();
+}
+
+template <class Table>
+size_t ClockCacheShard<Table>::GetTableAddressCount() const {
+ return table_.GetTableSize();
+}
+
+// Explicit instantiation
+template class ClockCacheShard<HyperClockTable>;
+
+HyperClockCache::HyperClockCache(
+ size_t capacity, size_t estimated_value_size, int num_shard_bits,
+ bool strict_capacity_limit,
+ CacheMetadataChargePolicy metadata_charge_policy,
+ std::shared_ptr<MemoryAllocator> memory_allocator)
+ : ShardedCache(capacity, num_shard_bits, strict_capacity_limit,
+ std::move(memory_allocator)) {
+ assert(estimated_value_size > 0 ||
+ metadata_charge_policy != kDontChargeCacheMetadata);
+ // TODO: should not need to go through two levels of pointer indirection to
+ // get to table entries
+ size_t per_shard = GetPerShardCapacity();
+ InitShards([=](Shard* cs) {
+ HyperClockTable::Opts opts;
+ opts.estimated_value_size = estimated_value_size;
+ new (cs)
+ Shard(per_shard, strict_capacity_limit, metadata_charge_policy, opts);
+ });
+}
+
+void* HyperClockCache::Value(Handle* handle) {
+ return reinterpret_cast<const HandleImpl*>(handle)->value;
+}
+
+size_t HyperClockCache::GetCharge(Handle* handle) const {
+ return reinterpret_cast<const HandleImpl*>(handle)->GetTotalCharge();
+}
+
+Cache::DeleterFn HyperClockCache::GetDeleter(Handle* handle) const {
+ auto h = reinterpret_cast<const HandleImpl*>(handle);
+ return h->deleter;
+}
+
+namespace {
+
+// For each cache shard, estimate what the table load factor would be if
+// cache filled to capacity with average entries. This is considered
+// indicative of a potential problem if the shard is essentially operating
+// "at limit", which we define as high actual usage (>80% of capacity)
+// or actual occupancy very close to limit (>95% of limit).
+// Also, for each shard compute the recommended estimated_entry_charge,
+// and keep the minimum one for use as overall recommendation.
+void AddShardEvaluation(const HyperClockCache::Shard& shard,
+ std::vector<double>& predicted_load_factors,
+ size_t& min_recommendation) {
+ size_t usage = shard.GetUsage() - shard.GetDetachedUsage();
+ size_t capacity = shard.GetCapacity();
+ double usage_ratio = 1.0 * usage / capacity;
+
+ size_t occupancy = shard.GetOccupancyCount();
+ size_t occ_limit = shard.GetOccupancyLimit();
+ double occ_ratio = 1.0 * occupancy / occ_limit;
+ if (usage == 0 || occupancy == 0 || (usage_ratio < 0.8 && occ_ratio < 0.95)) {
+ // Skip as described above
+ return;
+ }
+
+ // If filled to capacity, what would the occupancy ratio be?
+ double ratio = occ_ratio / usage_ratio;
+ // Given the max load factor, what would that load factor be?
+ double lf = ratio * kStrictLoadFactor;
+ predicted_load_factors.push_back(lf);
+
+ // Update min_recommendation also
+ size_t recommendation = usage / occupancy;
+ min_recommendation = std::min(min_recommendation, recommendation);
+}
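+// Worked example (shard numbers invented): a shard at 95% of its capacity
+// but only 20% of its occupancy limit gives ratio == 0.2 / 0.95 ~= 0.21, so
+// filled to capacity it would run near 0.21 * kStrictLoadFactor -- likely
+// below the low end of spec, suggesting entries are much larger than
+// estimated. The recommendation usage / occupancy is simply the observed
+// average charge per entry in that shard.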
+
+} // namespace
+
+void HyperClockCache::ReportProblems(
+ const std::shared_ptr<Logger>& info_log) const {
+ uint32_t shard_count = GetNumShards();
+ std::vector<double> predicted_load_factors;
+ size_t min_recommendation = SIZE_MAX;
+ const_cast<HyperClockCache*>(this)->ForEachShard(
+ [&](HyperClockCache::Shard* shard) {
+ AddShardEvaluation(*shard, predicted_load_factors, min_recommendation);
+ });
+
+ if (predicted_load_factors.empty()) {
+ // None operating "at limit" -> nothing to report
+ return;
+ }
+ std::sort(predicted_load_factors.begin(), predicted_load_factors.end());
+
+ // First, if the average load factor is within spec, we aren't going to
+ // complain about a few shards being out of spec.
+ // NOTE: this is only the average among cache shards operating "at limit,"
+ // which should be representative of what we care about. It is normal, even
+ // desirable, for a cache to operate "at limit" so this should not create
+ // selection bias. See AddShardEvaluation().
+ // TODO: Consider detecting cases where decreasing the number of shards
+ // would be good, e.g. serious imbalance among shards.
+ double average_load_factor =
+ std::accumulate(predicted_load_factors.begin(),
+ predicted_load_factors.end(), 0.0) /
+ shard_count;
+
+ constexpr double kLowSpecLoadFactor = kLoadFactor / 2;
+ constexpr double kMidSpecLoadFactor = kLoadFactor / 1.414;
+ if (average_load_factor > kLoadFactor) {
+ // Out of spec => Consider reporting load factor too high
+ // Estimate effective overall capacity loss due to enforcing occupancy limit
+ double lost_portion = 0.0;
+ int over_count = 0;
+ for (double lf : predicted_load_factors) {
+ if (lf > kStrictLoadFactor) {
+ ++over_count;
+ lost_portion += (lf - kStrictLoadFactor) / lf / shard_count;
+ }
+ }
+ // >= 20% loss -> error
+ // >= 10% loss -> consistent warning
+ // >= 1% loss -> intermittent warning
+ InfoLogLevel level = InfoLogLevel::INFO_LEVEL;
+ bool report = true;
+ if (lost_portion > 0.2) {
+ level = InfoLogLevel::ERROR_LEVEL;
+ } else if (lost_portion > 0.1) {
+ level = InfoLogLevel::WARN_LEVEL;
+ } else if (lost_portion > 0.01) {
+ int report_percent = static_cast<int>(lost_portion * 100.0);
+ if (Random::GetTLSInstance()->PercentTrue(report_percent)) {
+ level = InfoLogLevel::WARN_LEVEL;
+ }
+ } else {
+ // don't report
+ report = false;
+ }
+ if (report) {
+ ROCKS_LOG_AT_LEVEL(
+ info_log, level,
+ "HyperClockCache@%p unable to use estimated %.1f%% capacity because "
+ "of "
+ "full occupancy in %d/%u cache shards (estimated_entry_charge too "
+ "high). Recommend estimated_entry_charge=%zu",
+ this, lost_portion * 100.0, over_count, (unsigned)shard_count,
+ min_recommendation);
+ }
+ } else if (average_load_factor < kLowSpecLoadFactor) {
+ // Out of spec => Consider reporting load factor too low
+ // But cautiously because low is not as big of a problem.
+
+ // Only report if highest occupancy shard is also below
+ // spec and only if average is substantially out of spec
+ if (predicted_load_factors.back() < kLowSpecLoadFactor &&
+ average_load_factor < kLowSpecLoadFactor / 1.414) {
+ InfoLogLevel level = InfoLogLevel::INFO_LEVEL;
+ if (average_load_factor < kLowSpecLoadFactor / 2) {
+ level = InfoLogLevel::WARN_LEVEL;
+ }
+ ROCKS_LOG_AT_LEVEL(
+ info_log, level,
+ "HyperClockCache@%p table has low occupancy at full capacity. Higher "
+ "estimated_entry_charge (about %.1fx) would likely improve "
+ "performance. Recommend estimated_entry_charge=%zu",
+ this, kMidSpecLoadFactor / average_load_factor, min_recommendation);
+ }
+ }
+}
+
+} // namespace clock_cache
+
+// DEPRECATED (see public API)
+std::shared_ptr<Cache> NewClockCache(
+ size_t capacity, int num_shard_bits, bool strict_capacity_limit,
+ CacheMetadataChargePolicy metadata_charge_policy) {
+ return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit,
+ /* high_pri_pool_ratio */ 0.5, nullptr,
+ kDefaultToAdaptiveMutex, metadata_charge_policy,
+ /* low_pri_pool_ratio */ 0.0);
+}
+
+std::shared_ptr<Cache> HyperClockCacheOptions::MakeSharedCache() const {
+ auto my_num_shard_bits = num_shard_bits;
+ if (my_num_shard_bits >= 20) {
+ return nullptr; // The cache cannot be sharded into too many fine pieces.
+ }
+ if (my_num_shard_bits < 0) {
+ // Use larger shard size to reduce risk of large entries clustering
+ // or skewing individual shards.
+ constexpr size_t min_shard_size = 32U * 1024U * 1024U;
+ my_num_shard_bits = GetDefaultCacheShardBits(capacity, min_shard_size);
+ }
+ return std::make_shared<clock_cache::HyperClockCache>(
+ capacity, estimated_entry_charge, my_num_shard_bits,
+ strict_capacity_limit, metadata_charge_policy, memory_allocator);
+}
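+// Typical usage sketch (values arbitrary; relies on the public
+// HyperClockCacheOptions constructor taking capacity and
+// estimated_entry_charge):
+//   HyperClockCacheOptions opts(64 << 20 /*capacity*/,
+//                               8 << 10 /*estimated_entry_charge*/);
+//   std::shared_ptr<Cache> cache = opts.MakeSharedCache();
+//   // e.g. assign to BlockBasedTableOptions::block_cache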
+
+} // namespace ROCKSDB_NAMESPACE