From 19fcec84d8d7d21e796c7624e521b60d28ee21ed Mon Sep 17 00:00:00 2001
From: Daniel Baumann <daniel.baumann@progress-linux.org>
Date: Sun, 7 Apr 2024 20:45:59 +0200
Subject: Adding upstream version 16.2.11+ds.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
---
 src/rocksdb/cache/cache_bench.cc    | 281 +++++++++++++
 src/rocksdb/cache/cache_test.cc     | 773 ++++++++++++++++++++++++++++++++++++
 src/rocksdb/cache/clock_cache.cc    | 761 +++++++++++++++++++++++++++++++++++
 src/rocksdb/cache/clock_cache.h     |  16 +
 src/rocksdb/cache/lru_cache.cc      | 574 ++++++++++++++++++++++++++
 src/rocksdb/cache/lru_cache.h       | 339 ++++++++++++++++
 src/rocksdb/cache/lru_cache_test.cc | 198 +++++++++
 src/rocksdb/cache/sharded_cache.cc  | 162 ++++++++
 src/rocksdb/cache/sharded_cache.h   | 111 ++++++
 9 files changed, 3215 insertions(+)
 create mode 100644 src/rocksdb/cache/cache_bench.cc
 create mode 100644 src/rocksdb/cache/cache_test.cc
 create mode 100644 src/rocksdb/cache/clock_cache.cc
 create mode 100644 src/rocksdb/cache/clock_cache.h
 create mode 100644 src/rocksdb/cache/lru_cache.cc
 create mode 100644 src/rocksdb/cache/lru_cache.h
 create mode 100644 src/rocksdb/cache/lru_cache_test.cc
 create mode 100644 src/rocksdb/cache/sharded_cache.cc
 create mode 100644 src/rocksdb/cache/sharded_cache.h

(limited to 'src/rocksdb/cache')
diff --git a/src/rocksdb/cache/cache_bench.cc b/src/rocksdb/cache/cache_bench.cc
new file mode 100644
index 000000000..6ff36a32d
--- /dev/null
+++ b/src/rocksdb/cache/cache_bench.cc
@@ -0,0 +1,281 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <cinttypes>
+
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "util/gflags_compat.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+static const uint32_t KB = 1024;
+
+DEFINE_int32(threads, 16, "Number of concurrent threads to run.");
+DEFINE_int64(cache_size, 8 * KB * KB,
+             "Number of bytes to use as a cache of uncompressed data.");
+DEFINE_int32(num_shard_bits, 4, "shard_bits.");
+
+DEFINE_int64(max_key, 1 * KB * KB * KB, "Max number of key to place in cache");
+DEFINE_uint64(ops_per_thread, 1200000, "Number of operations per thread.");
+
+DEFINE_bool(populate_cache, false, "Populate cache before operations");
+DEFINE_int32(insert_percent, 40,
+             "Ratio of insert to total workload (expressed as a percentage)");
+DEFINE_int32(lookup_percent, 50,
+             "Ratio of lookup to total workload (expressed as a percentage)");
+DEFINE_int32(erase_percent, 10,
+             "Ratio of erase to total workload (expressed as a percentage)");
+
+DEFINE_bool(use_clock_cache, false, "");
+
+namespace ROCKSDB_NAMESPACE {
+
+class CacheBench;
+namespace {
+void deleter(const Slice& /*key*/, void* value) {
+    delete reinterpret_cast<char *>(value);
+}
+
+// State shared by all concurrent executions of the same benchmark.
+class SharedState {
+ public:
+  explicit SharedState(CacheBench* cache_bench)
+      : cv_(&mu_),
+        num_threads_(FLAGS_threads),
+        num_initialized_(0),
+        start_(false),
+        num_done_(0),
+        cache_bench_(cache_bench) {
+  }
+
+  ~SharedState() {}
+
+  port::Mutex* GetMutex() {
+    return &mu_;
+  }
+
+  port::CondVar* GetCondVar() {
+    return &cv_;
+  }
+
+  CacheBench* GetCacheBench() const {
+    return cache_bench_;
+  }
+
+  void IncInitialized() {
+    num_initialized_++;
+  }
+
+  void IncDone() {
+    num_done_++;
+  }
+
+  bool AllInitialized() const {
+    return num_initialized_ >= num_threads_;
+  }
+
+  bool AllDone() const {
+    return num_done_ >= num_threads_;
+  }
+
+  void SetStart() {
+    start_ = true;
+  }
+
+  bool Started() const {
+    return start_;
+  }
+
+ private:
+  port::Mutex mu_;
+  port::CondVar cv_;
+
+  const uint64_t num_threads_;
+  uint64_t num_initialized_;
+  bool start_;
+  uint64_t num_done_;
+
+  CacheBench* cache_bench_;
+};
+
+// Per-thread state for concurrent executions of the same benchmark.
+struct ThreadState {
+  uint32_t tid;
+  Random rnd;
+  SharedState* shared;
+
+  ThreadState(uint32_t index, SharedState* _shared)
+      : tid(index), rnd(1000 + index), shared(_shared) {}
+};
+}  // namespace
+
+class CacheBench {
+ public:
+  CacheBench() : num_threads_(FLAGS_threads) {
+    if (FLAGS_use_clock_cache) {
+      cache_ = NewClockCache(FLAGS_cache_size, FLAGS_num_shard_bits);
+      if (!cache_) {
+        fprintf(stderr, "Clock cache not supported.\n");
+        exit(1);
+      }
+    } else {
+      cache_ = NewLRUCache(FLAGS_cache_size, FLAGS_num_shard_bits);
+    }
+  }
+
+  ~CacheBench() {}
+
+  void PopulateCache() {
+    Random rnd(1);
+    for (int64_t i = 0; i < FLAGS_cache_size; i++) {
+      uint64_t rand_key = rnd.Next() % FLAGS_max_key;
+      // Cast uint64* to be char*, data would be copied to cache
+      Slice key(reinterpret_cast<char*>(&rand_key), 8);
+      // do insert
+      cache_->Insert(key, new char[10], 1, &deleter);
+    }
+  }
+
+  bool Run() {
+    ROCKSDB_NAMESPACE::Env* env = ROCKSDB_NAMESPACE::Env::Default();
+
+    PrintEnv();
+    SharedState shared(this);
+    std::vector<ThreadState*> threads(num_threads_);
+    for (uint32_t i = 0; i < num_threads_; i++) {
+      threads[i] = new ThreadState(i, &shared);
+      env->StartThread(ThreadBody, threads[i]);
+    }
+    {
+      MutexLock l(shared.GetMutex());
+      while (!shared.AllInitialized()) {
+        shared.GetCondVar()->Wait();
+      }
+      // Record start time
+      uint64_t start_time = env->NowMicros();
+
+      // Start all threads
+      shared.SetStart();
+      shared.GetCondVar()->SignalAll();
+
+      // Wait threads to complete
+      while (!shared.AllDone()) {
+        shared.GetCondVar()->Wait();
+      }
+
+      // Record end time
+      uint64_t end_time = env->NowMicros();
+      double elapsed = static_cast<double>(end_time - start_time) * 1e-6;
+      uint32_t qps = static_cast<uint32_t>(
+          static_cast<double>(FLAGS_threads * FLAGS_ops_per_thread) / elapsed);
+      fprintf(stdout, "Complete in %.3f s; QPS = %u\n", elapsed, qps);
+    }
+    return true;
+  }
+
+ private:
+  std::shared_ptr<Cache> cache_;
+  uint32_t num_threads_;
+
+  static void ThreadBody(void* v) {
+    ThreadState* thread = reinterpret_cast<ThreadState*>(v);
+    SharedState* shared = thread->shared;
+
+    {
+      MutexLock l(shared->GetMutex());
+      shared->IncInitialized();
+      if (shared->AllInitialized()) {
+        shared->GetCondVar()->SignalAll();
+      }
+      while (!shared->Started()) {
+        shared->GetCondVar()->Wait();
+      }
+    }
+    thread->shared->GetCacheBench()->OperateCache(thread);
+
+    {
+      MutexLock l(shared->GetMutex());
+      shared->IncDone();
+      if (shared->AllDone()) {
+        shared->GetCondVar()->SignalAll();
+      }
+    }
+  }
+
+  void OperateCache(ThreadState* thread) {
+    for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) {
+      uint64_t rand_key = thread->rnd.Next() % FLAGS_max_key;
+      // Cast uint64* to be char*, data would be copied to cache
+      Slice key(reinterpret_cast<char*>(&rand_key), 8);
+      int32_t prob_op = thread->rnd.Uniform(100);
+      if (prob_op >= 0 && prob_op < FLAGS_insert_percent) {
+        // do insert
+        cache_->Insert(key, new char[10], 1, &deleter);
+      } else if (prob_op -= FLAGS_insert_percent &&
+                 prob_op < FLAGS_lookup_percent) {
+        // do lookup
+        auto handle = cache_->Lookup(key);
+        if (handle) {
+          cache_->Release(handle);
+        }
+      } else if (prob_op -= FLAGS_lookup_percent &&
+                 prob_op < FLAGS_erase_percent) {
+        // do erase
+        cache_->Erase(key);
+      }
+    }
+  }
+
+  void PrintEnv() const {
+    printf("RocksDB version     : %d.%d\n", kMajorVersion, kMinorVersion);
+    printf("Number of threads   : %d\n", FLAGS_threads);
+    printf("Ops per thread      : %" PRIu64 "\n", FLAGS_ops_per_thread);
+    printf("Cache size          : %" PRIu64 "\n", FLAGS_cache_size);
+    printf("Num shard bits      : %d\n", FLAGS_num_shard_bits);
+    printf("Max key             : %" PRIu64 "\n", FLAGS_max_key);
+    printf("Populate cache      : %d\n", FLAGS_populate_cache);
+    printf("Insert percentage   : %d%%\n", FLAGS_insert_percent);
+    printf("Lookup percentage   : %d%%\n", FLAGS_lookup_percent);
+    printf("Erase percentage    : %d%%\n", FLAGS_erase_percent);
+    printf("----------------------------\n");
+  }
+};
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ParseCommandLineFlags(&argc, &argv, true);
+
+  if (FLAGS_threads <= 0) {
+    fprintf(stderr, "threads number <= 0\n");
+    exit(1);
+  }
+
+  ROCKSDB_NAMESPACE::CacheBench bench;
+  if (FLAGS_populate_cache) {
+    bench.PopulateCache();
+  }
+  if (bench.Run()) {
+    return 0;
+  } else {
+    return 1;
+  }
+}
+
+#endif  // GFLAGS
diff --git a/src/rocksdb/cache/cache_test.cc b/src/rocksdb/cache/cache_test.cc
new file mode 100644
index 000000000..ceafefe6f
--- /dev/null
+++ b/src/rocksdb/cache/cache_test.cc
@@ -0,0 +1,773 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/cache.h"
+
+#include <forward_list>
+#include <functional>
+#include <iostream>
+#include <string>
+#include <vector>
+#include "cache/clock_cache.h"
+#include "cache/lru_cache.h"
+#include "test_util/testharness.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Conversions between numeric keys/values and the types expected by Cache.
+static std::string EncodeKey(int k) {
+  std::string result;
+  PutFixed32(&result, k);
+  return result;
+}
+static int DecodeKey(const Slice& k) {
+  assert(k.size() == 4);
+  return DecodeFixed32(k.data());
+}
+static void* EncodeValue(uintptr_t v) { return reinterpret_cast<void*>(v); }
+static int DecodeValue(void* v) {
+  return static_cast<int>(reinterpret_cast<uintptr_t>(v));
+}
+
+const std::string kLRU = "lru";
+const std::string kClock = "clock";
+
+void dumbDeleter(const Slice& /*key*/, void* /*value*/) {}
+
+void eraseDeleter(const Slice& /*key*/, void* value) {
+  Cache* cache = reinterpret_cast<Cache*>(value);
+  cache->Erase("foo");
+}
+
+class CacheTest : public testing::TestWithParam<std::string> {
+ public:
+  static CacheTest* current_;
+
+  static void Deleter(const Slice& key, void* v) {
+    current_->deleted_keys_.push_back(DecodeKey(key));
+    current_->deleted_values_.push_back(DecodeValue(v));
+  }
+
+  static const int kCacheSize = 1000;
+  static const int kNumShardBits = 4;
+
+  static const int kCacheSize2 = 100;
+  static const int kNumShardBits2 = 2;
+
+  std::vector<int> deleted_keys_;
+  std::vector<int> deleted_values_;
+  std::shared_ptr<Cache> cache_;
+  std::shared_ptr<Cache> cache2_;
+
+  CacheTest()
+      : cache_(NewCache(kCacheSize, kNumShardBits, false)),
+        cache2_(NewCache(kCacheSize2, kNumShardBits2, false)) {
+    current_ = this;
+  }
+
+  ~CacheTest() override {}
+
+  std::shared_ptr<Cache> NewCache(size_t capacity) {
+    auto type = GetParam();
+    if (type == kLRU) {
+      return NewLRUCache(capacity);
+    }
+    if (type == kClock) {
+      return NewClockCache(capacity);
+    }
+    return nullptr;
+  }
+
+  std::shared_ptr<Cache> NewCache(
+      size_t capacity, int num_shard_bits, bool strict_capacity_limit,
+      CacheMetadataChargePolicy charge_policy = kDontChargeCacheMetadata) {
+    auto type = GetParam();
+    if (type == kLRU) {
+      LRUCacheOptions co;
+      co.capacity = capacity;
+      co.num_shard_bits = num_shard_bits;
+      co.strict_capacity_limit = strict_capacity_limit;
+      co.high_pri_pool_ratio = 0;
+      co.metadata_charge_policy = charge_policy;
+      return NewLRUCache(co);
+    }
+    if (type == kClock) {
+      return NewClockCache(capacity, num_shard_bits, strict_capacity_limit,
+                           charge_policy);
+    }
+    return nullptr;
+  }
+
+  int Lookup(std::shared_ptr<Cache> cache, int key) {
+    Cache::Handle* handle = cache->Lookup(EncodeKey(key));
+    const int r = (handle == nullptr) ? -1 : DecodeValue(cache->Value(handle));
+    if (handle != nullptr) {
+      cache->Release(handle);
+    }
+    return r;
+  }
+
+  void Insert(std::shared_ptr<Cache> cache, int key, int value,
+              int charge = 1) {
+    cache->Insert(EncodeKey(key), EncodeValue(value), charge,
+                  &CacheTest::Deleter);
+  }
+
+  void Erase(std::shared_ptr<Cache> cache, int key) {
+    cache->Erase(EncodeKey(key));
+  }
+
+  int Lookup(int key) {
+    return Lookup(cache_, key);
+  }
+
+  void Insert(int key, int value, int charge = 1) {
+    Insert(cache_, key, value, charge);
+  }
+
+  void Erase(int key) {
+    Erase(cache_, key);
+  }
+
+  int Lookup2(int key) {
+    return Lookup(cache2_, key);
+  }
+
+  void Insert2(int key, int value, int charge = 1) {
+    Insert(cache2_, key, value, charge);
+  }
+
+  void Erase2(int key) {
+    Erase(cache2_, key);
+  }
+};
+CacheTest* CacheTest::current_;
+
+class LRUCacheTest : public CacheTest {};
+
+TEST_P(CacheTest, UsageTest) {
+  // cache is std::shared_ptr and will be automatically cleaned up.
+  const uint64_t kCapacity = 100000;
+  auto cache = NewCache(kCapacity, 8, false, kDontChargeCacheMetadata);
+  auto precise_cache = NewCache(kCapacity, 0, false, kFullChargeCacheMetadata);
+  ASSERT_EQ(0, cache->GetUsage());
+  ASSERT_EQ(0, precise_cache->GetUsage());
+
+  size_t usage = 0;
+  char value[10] = "abcdef";
+  // make sure everything will be cached
+  for (int i = 1; i < 100; ++i) {
+    std::string key(i, 'a');
+    auto kv_size = key.size() + 5;
+    cache->Insert(key, reinterpret_cast<void*>(value), kv_size, dumbDeleter);
+    precise_cache->Insert(key, reinterpret_cast<void*>(value), kv_size,
+                          dumbDeleter);
+    usage += kv_size;
+    ASSERT_EQ(usage, cache->GetUsage());
+    ASSERT_LT(usage, precise_cache->GetUsage());
+  }
+
+  cache->EraseUnRefEntries();
+  precise_cache->EraseUnRefEntries();
+  ASSERT_EQ(0, cache->GetUsage());
+  ASSERT_EQ(0, precise_cache->GetUsage());
+
+  // make sure the cache will be overloaded
+  for (uint64_t i = 1; i < kCapacity; ++i) {
+    auto key = ToString(i);
+    cache->Insert(key, reinterpret_cast<void*>(value), key.size() + 5,
+                  dumbDeleter);
+    precise_cache->Insert(key, reinterpret_cast<void*>(value), key.size() + 5,
+                          dumbDeleter);
+  }
+
+  // the usage should be close to the capacity
+  ASSERT_GT(kCapacity, cache->GetUsage());
+  ASSERT_GT(kCapacity, precise_cache->GetUsage());
+  ASSERT_LT(kCapacity * 0.95, cache->GetUsage());
+  ASSERT_LT(kCapacity * 0.95, precise_cache->GetUsage());
+}
+
+TEST_P(CacheTest, PinnedUsageTest) {
+  // cache is std::shared_ptr and will be automatically cleaned up.
+  const uint64_t kCapacity = 200000;
+  auto cache = NewCache(kCapacity, 8, false, kDontChargeCacheMetadata);
+  auto precise_cache = NewCache(kCapacity, 8, false, kFullChargeCacheMetadata);
+
+  size_t pinned_usage = 0;
+  char value[10] = "abcdef";
+
+  std::forward_list<Cache::Handle*> unreleased_handles;
+  std::forward_list<Cache::Handle*> unreleased_handles_in_precise_cache;
+
+  // Add entries. Unpin some of them after insertion. Then, pin some of them
+  // again. Check GetPinnedUsage().
+  for (int i = 1; i < 100; ++i) {
+    std::string key(i, 'a');
+    auto kv_size = key.size() + 5;
+    Cache::Handle* handle;
+    Cache::Handle* handle_in_precise_cache;
+    cache->Insert(key, reinterpret_cast<void*>(value), kv_size, dumbDeleter,
+                  &handle);
+    assert(handle);
+    precise_cache->Insert(key, reinterpret_cast<void*>(value), kv_size,
+                          dumbDeleter, &handle_in_precise_cache);
+    assert(handle_in_precise_cache);
+    pinned_usage += kv_size;
+    ASSERT_EQ(pinned_usage, cache->GetPinnedUsage());
+    ASSERT_LT(pinned_usage, precise_cache->GetPinnedUsage());
+    if (i % 2 == 0) {
+      cache->Release(handle);
+      precise_cache->Release(handle_in_precise_cache);
+      pinned_usage -= kv_size;
+      ASSERT_EQ(pinned_usage, cache->GetPinnedUsage());
+      ASSERT_LT(pinned_usage, precise_cache->GetPinnedUsage());
+    } else {
+      unreleased_handles.push_front(handle);
+      unreleased_handles_in_precise_cache.push_front(handle_in_precise_cache);
+    }
+    if (i % 3 == 0) {
+      unreleased_handles.push_front(cache->Lookup(key));
+      auto x = precise_cache->Lookup(key);
+      assert(x);
+      unreleased_handles_in_precise_cache.push_front(x);
+      // If i % 2 == 0, then the entry was unpinned before Lookup, so pinned
+      // usage increased
+      if (i % 2 == 0) {
+        pinned_usage += kv_size;
+      }
+      ASSERT_EQ(pinned_usage, cache->GetPinnedUsage());
+      ASSERT_LT(pinned_usage, precise_cache->GetPinnedUsage());
+    }
+  }
+  auto precise_cache_pinned_usage = precise_cache->GetPinnedUsage();
+  ASSERT_LT(pinned_usage, precise_cache_pinned_usage);
+
+  // check that overloading the cache does not change the pinned usage
+  for (uint64_t i = 1; i < 2 * kCapacity; ++i) {
+    auto key = ToString(i);
+    cache->Insert(key, reinterpret_cast<void*>(value), key.size() + 5,
+                  dumbDeleter);
+    precise_cache->Insert(key, reinterpret_cast<void*>(value), key.size() + 5,
+                          dumbDeleter);
+  }
+  ASSERT_EQ(pinned_usage, cache->GetPinnedUsage());
+  ASSERT_EQ(precise_cache_pinned_usage, precise_cache->GetPinnedUsage());
+
+  cache->EraseUnRefEntries();
+  precise_cache->EraseUnRefEntries();
+  ASSERT_EQ(pinned_usage, cache->GetPinnedUsage());
+  ASSERT_EQ(precise_cache_pinned_usage, precise_cache->GetPinnedUsage());
+
+  // release handles for pinned entries to prevent memory leaks
+  for (auto handle : unreleased_handles) {
+    cache->Release(handle);
+  }
+  for (auto handle : unreleased_handles_in_precise_cache) {
+    precise_cache->Release(handle);
+  }
+  ASSERT_EQ(0, cache->GetPinnedUsage());
+  ASSERT_EQ(0, precise_cache->GetPinnedUsage());
+  cache->EraseUnRefEntries();
+  precise_cache->EraseUnRefEntries();
+  ASSERT_EQ(0, cache->GetUsage());
+  ASSERT_EQ(0, precise_cache->GetUsage());
+}
+
+TEST_P(CacheTest, HitAndMiss) {
+  ASSERT_EQ(-1, Lookup(100));
+
+  Insert(100, 101);
+  ASSERT_EQ(101, Lookup(100));
+  ASSERT_EQ(-1,  Lookup(200));
+  ASSERT_EQ(-1,  Lookup(300));
+
+  Insert(200, 201);
+  ASSERT_EQ(101, Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(-1,  Lookup(300));
+
+  Insert(100, 102);
+  ASSERT_EQ(102, Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(-1,  Lookup(300));
+
+  ASSERT_EQ(1U, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_[0]);
+  ASSERT_EQ(101, deleted_values_[0]);
+}
+
+TEST_P(CacheTest, InsertSameKey) {
+  Insert(1, 1);
+  Insert(1, 2);
+  ASSERT_EQ(2, Lookup(1));
+}
+
+TEST_P(CacheTest, Erase) {
+  Erase(200);
+  ASSERT_EQ(0U, deleted_keys_.size());
+
+  Insert(100, 101);
+  Insert(200, 201);
+  Erase(100);
+  ASSERT_EQ(-1,  Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(1U, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_[0]);
+  ASSERT_EQ(101, deleted_values_[0]);
+
+  Erase(100);
+  ASSERT_EQ(-1,  Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(1U, deleted_keys_.size());
+}
+
+TEST_P(CacheTest, EntriesArePinned) {
+  Insert(100, 101);
+  Cache::Handle* h1 = cache_->Lookup(EncodeKey(100));
+  ASSERT_EQ(101, DecodeValue(cache_->Value(h1)));
+  ASSERT_EQ(1U, cache_->GetUsage());
+
+  Insert(100, 102);
+  Cache::Handle* h2 = cache_->Lookup(EncodeKey(100));
+  ASSERT_EQ(102, DecodeValue(cache_->Value(h2)));
+  ASSERT_EQ(0U, deleted_keys_.size());
+  ASSERT_EQ(2U, cache_->GetUsage());
+
+  cache_->Release(h1);
+  ASSERT_EQ(1U, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_[0]);
+  ASSERT_EQ(101, deleted_values_[0]);
+  ASSERT_EQ(1U, cache_->GetUsage());
+
+  Erase(100);
+  ASSERT_EQ(-1, Lookup(100));
+  ASSERT_EQ(1U, deleted_keys_.size());
+  ASSERT_EQ(1U, cache_->GetUsage());
+
+  cache_->Release(h2);
+  ASSERT_EQ(2U, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_[1]);
+  ASSERT_EQ(102, deleted_values_[1]);
+  ASSERT_EQ(0U, cache_->GetUsage());
+}
+
+TEST_P(CacheTest, EvictionPolicy) {
+  Insert(100, 101);
+  Insert(200, 201);
+
+  // Frequently used entry must be kept around
+  for (int i = 0; i < kCacheSize * 2; i++) {
+    Insert(1000+i, 2000+i);
+    ASSERT_EQ(101, Lookup(100));
+  }
+  ASSERT_EQ(101, Lookup(100));
+  ASSERT_EQ(-1, Lookup(200));
+}
+
+TEST_P(CacheTest, ExternalRefPinsEntries) {
+  Insert(100, 101);
+  Cache::Handle* h = cache_->Lookup(EncodeKey(100));
+  ASSERT_TRUE(cache_->Ref(h));
+  ASSERT_EQ(101, DecodeValue(cache_->Value(h)));
+  ASSERT_EQ(1U, cache_->GetUsage());
+
+  for (int i = 0; i < 3; ++i) {
+    if (i > 0) {
+      // First release (i == 1) corresponds to Ref(), second release (i == 2)
+      // corresponds to Lookup(). Then, since all external refs are released,
+      // the below insertions should push out the cache entry.
+      cache_->Release(h);
+    }
+    // double cache size because the usage bit in block cache prevents 100 from
+    // being evicted in the first kCacheSize iterations
+    for (int j = 0; j < 2 * kCacheSize + 100; j++) {
+      Insert(1000 + j, 2000 + j);
+    }
+    if (i < 2) {
+      ASSERT_EQ(101, Lookup(100));
+    }
+  }
+  ASSERT_EQ(-1, Lookup(100));
+}
+
+TEST_P(CacheTest, EvictionPolicyRef) {
+  Insert(100, 101);
+  Insert(101, 102);
+  Insert(102, 103);
+  Insert(103, 104);
+  Insert(200, 101);
+  Insert(201, 102);
+  Insert(202, 103);
+  Insert(203, 104);
+  Cache::Handle* h201 = cache_->Lookup(EncodeKey(200));
+  Cache::Handle* h202 = cache_->Lookup(EncodeKey(201));
+  Cache::Handle* h203 = cache_->Lookup(EncodeKey(202));
+  Cache::Handle* h204 = cache_->Lookup(EncodeKey(203));
+  Insert(300, 101);
+  Insert(301, 102);
+  Insert(302, 103);
+  Insert(303, 104);
+
+  // Insert entries much more than Cache capacity
+  for (int i = 0; i < kCacheSize * 2; i++) {
+    Insert(1000 + i, 2000 + i);
+  }
+
+  // Check whether the entries inserted in the beginning
+  // are evicted. Ones without extra ref are evicted and
+  // those with are not.
+  ASSERT_EQ(-1, Lookup(100));
+  ASSERT_EQ(-1, Lookup(101));
+  ASSERT_EQ(-1, Lookup(102));
+  ASSERT_EQ(-1, Lookup(103));
+
+  ASSERT_EQ(-1, Lookup(300));
+  ASSERT_EQ(-1, Lookup(301));
+  ASSERT_EQ(-1, Lookup(302));
+  ASSERT_EQ(-1, Lookup(303));
+
+  ASSERT_EQ(101, Lookup(200));
+  ASSERT_EQ(102, Lookup(201));
+  ASSERT_EQ(103, Lookup(202));
+  ASSERT_EQ(104, Lookup(203));
+
+  // Cleaning up all the handles
+  cache_->Release(h201);
+  cache_->Release(h202);
+  cache_->Release(h203);
+  cache_->Release(h204);
+}
+
+TEST_P(CacheTest, EvictEmptyCache) {
+  // Insert item large than capacity to trigger eviction on empty cache.
+  auto cache = NewCache(1, 0, false);
+  ASSERT_OK(cache->Insert("foo", nullptr, 10, dumbDeleter));
+}
+
+TEST_P(CacheTest, EraseFromDeleter) {
+  // Have deleter which will erase item from cache, which will re-enter
+  // the cache at that point.
+  std::shared_ptr<Cache> cache = NewCache(10, 0, false);
+  ASSERT_OK(cache->Insert("foo", nullptr, 1, dumbDeleter));
+  ASSERT_OK(cache->Insert("bar", cache.get(), 1, eraseDeleter));
+  cache->Erase("bar");
+  ASSERT_EQ(nullptr, cache->Lookup("foo"));
+  ASSERT_EQ(nullptr, cache->Lookup("bar"));
+}
+
+TEST_P(CacheTest, ErasedHandleState) {
+  // insert a key and get two handles
+  Insert(100, 1000);
+  Cache::Handle* h1 = cache_->Lookup(EncodeKey(100));
+  Cache::Handle* h2 = cache_->Lookup(EncodeKey(100));
+  ASSERT_EQ(h1, h2);
+  ASSERT_EQ(DecodeValue(cache_->Value(h1)), 1000);
+  ASSERT_EQ(DecodeValue(cache_->Value(h2)), 1000);
+
+  // delete the key from the cache
+  Erase(100);
+  // can no longer find in the cache
+  ASSERT_EQ(-1, Lookup(100));
+
+  // release one handle
+  cache_->Release(h1);
+  // still can't find in cache
+  ASSERT_EQ(-1, Lookup(100));
+
+  cache_->Release(h2);
+}
+
+TEST_P(CacheTest, HeavyEntries) {
+  // Add a bunch of light and heavy entries and then count the combined
+  // size of items still in the cache, which must be approximately the
+  // same as the total capacity.
+  const int kLight = 1;
+  const int kHeavy = 10;
+  int added = 0;
+  int index = 0;
+  while (added < 2*kCacheSize) {
+    const int weight = (index & 1) ? kLight : kHeavy;
+    Insert(index, 1000+index, weight);
+    added += weight;
+    index++;
+  }
+
+  int cached_weight = 0;
+  for (int i = 0; i < index; i++) {
+    const int weight = (i & 1 ? kLight : kHeavy);
+    int r = Lookup(i);
+    if (r >= 0) {
+      cached_weight += weight;
+      ASSERT_EQ(1000+i, r);
+    }
+  }
+  ASSERT_LE(cached_weight, kCacheSize + kCacheSize/10);
+}
+
+TEST_P(CacheTest, NewId) {
+  uint64_t a = cache_->NewId();
+  uint64_t b = cache_->NewId();
+  ASSERT_NE(a, b);
+}
+
+
+class Value {
+ public:
+  explicit Value(size_t v) : v_(v) { }
+
+  size_t v_;
+};
+
+namespace {
+void deleter(const Slice& /*key*/, void* value) {
+  delete static_cast<Value *>(value);
+}
+}  // namespace
+
+TEST_P(CacheTest, ReleaseAndErase) {
+  std::shared_ptr<Cache> cache = NewCache(5, 0, false);
+  Cache::Handle* handle;
+  Status s = cache->Insert(EncodeKey(100), EncodeValue(100), 1,
+                           &CacheTest::Deleter, &handle);
+  ASSERT_TRUE(s.ok());
+  ASSERT_EQ(5U, cache->GetCapacity());
+  ASSERT_EQ(1U, cache->GetUsage());
+  ASSERT_EQ(0U, deleted_keys_.size());
+  auto erased = cache->Release(handle, true);
+  ASSERT_TRUE(erased);
+  // This tests that deleter has been called
+  ASSERT_EQ(1U, deleted_keys_.size());
+}
+
+TEST_P(CacheTest, ReleaseWithoutErase) {
+  std::shared_ptr<Cache> cache = NewCache(5, 0, false);
+  Cache::Handle* handle;
+  Status s = cache->Insert(EncodeKey(100), EncodeValue(100), 1,
+                           &CacheTest::Deleter, &handle);
+  ASSERT_TRUE(s.ok());
+  ASSERT_EQ(5U, cache->GetCapacity());
+  ASSERT_EQ(1U, cache->GetUsage());
+  ASSERT_EQ(0U, deleted_keys_.size());
+  auto erased = cache->Release(handle);
+  ASSERT_FALSE(erased);
+  // This tests that deleter is not called. When cache has free capacity it is
+  // not expected to immediately erase the released items.
+  ASSERT_EQ(0U, deleted_keys_.size());
+}
+
+TEST_P(CacheTest, SetCapacity) {
+  // test1: increase capacity
+  // lets create a cache with capacity 5,
+  // then, insert 5 elements, then increase capacity
+  // to 10, returned capacity should be 10, usage=5
+  std::shared_ptr<Cache> cache = NewCache(5, 0, false);
+  std::vector<Cache::Handle*> handles(10);
+  // Insert 5 entries, but not releasing.
+  for (size_t i = 0; i < 5; i++) {
+    std::string key = ToString(i+1);
+    Status s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
+    ASSERT_TRUE(s.ok());
+  }
+  ASSERT_EQ(5U, cache->GetCapacity());
+  ASSERT_EQ(5U, cache->GetUsage());
+  cache->SetCapacity(10);
+  ASSERT_EQ(10U, cache->GetCapacity());
+  ASSERT_EQ(5U, cache->GetUsage());
+
+  // test2: decrease capacity
+  // insert 5 more elements to cache, then release 5,
+  // then decrease capacity to 7, final capacity should be 7
+  // and usage should be 7
+  for (size_t i = 5; i < 10; i++) {
+    std::string key = ToString(i+1);
+    Status s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
+    ASSERT_TRUE(s.ok());
+  }
+  ASSERT_EQ(10U, cache->GetCapacity());
+  ASSERT_EQ(10U, cache->GetUsage());
+  for (size_t i = 0; i < 5; i++) {
+    cache->Release(handles[i]);
+  }
+  ASSERT_EQ(10U, cache->GetCapacity());
+  ASSERT_EQ(10U, cache->GetUsage());
+  cache->SetCapacity(7);
+  ASSERT_EQ(7, cache->GetCapacity());
+  ASSERT_EQ(7, cache->GetUsage());
+
+  // release remaining 5 to keep valgrind happy
+  for (size_t i = 5; i < 10; i++) {
+    cache->Release(handles[i]);
+  }
+}
+
+TEST_P(LRUCacheTest, SetStrictCapacityLimit) {
+  // test1: set the flag to false. Insert more keys than capacity. See if they
+  // all go through.
+  std::shared_ptr<Cache> cache = NewCache(5, 0, false);
+  std::vector<Cache::Handle*> handles(10);
+  Status s;
+  for (size_t i = 0; i < 10; i++) {
+    std::string key = ToString(i + 1);
+    s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
+    ASSERT_OK(s);
+    ASSERT_NE(nullptr, handles[i]);
+  }
+  ASSERT_EQ(10, cache->GetUsage());
+
+  // test2: set the flag to true. Insert and check if it fails.
+  std::string extra_key = "extra";
+  Value* extra_value = new Value(0);
+  cache->SetStrictCapacityLimit(true);
+  Cache::Handle* handle;
+  s = cache->Insert(extra_key, extra_value, 1, &deleter, &handle);
+  ASSERT_TRUE(s.IsIncomplete());
+  ASSERT_EQ(nullptr, handle);
+  ASSERT_EQ(10, cache->GetUsage());
+
+  for (size_t i = 0; i < 10; i++) {
+    cache->Release(handles[i]);
+  }
+
+  // test3: init with flag being true.
+  std::shared_ptr<Cache> cache2 = NewCache(5, 0, true);
+  for (size_t i = 0; i < 5; i++) {
+    std::string key = ToString(i + 1);
+    s = cache2->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
+    ASSERT_OK(s);
+    ASSERT_NE(nullptr, handles[i]);
+  }
+  s = cache2->Insert(extra_key, extra_value, 1, &deleter, &handle);
+  ASSERT_TRUE(s.IsIncomplete());
+  ASSERT_EQ(nullptr, handle);
+  // test insert without handle
+  s = cache2->Insert(extra_key, extra_value, 1, &deleter);
+  // AS if the key have been inserted into cache but get evicted immediately.
+  ASSERT_OK(s);
+  ASSERT_EQ(5, cache2->GetUsage());
+  ASSERT_EQ(nullptr, cache2->Lookup(extra_key));
+
+  for (size_t i = 0; i < 5; i++) {
+    cache2->Release(handles[i]);
+  }
+}
+
+TEST_P(CacheTest, OverCapacity) {
+  size_t n = 10;
+
+  // a LRUCache with n entries and one shard only
+  std::shared_ptr<Cache> cache = NewCache(n, 0, false);
+
+  std::vector<Cache::Handle*> handles(n+1);
+
+  // Insert n+1 entries, but not releasing.
+  for (size_t i = 0; i < n + 1; i++) {
+    std::string key = ToString(i+1);
+    Status s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
+    ASSERT_TRUE(s.ok());
+  }
+
+  // Guess what's in the cache now?
+  for (size_t i = 0; i < n + 1; i++) {
+    std::string key = ToString(i+1);
+    auto h = cache->Lookup(key);
+    ASSERT_TRUE(h != nullptr);
+    if (h) cache->Release(h);
+  }
+
+  // the cache is over capacity since nothing could be evicted
+  ASSERT_EQ(n + 1U, cache->GetUsage());
+  for (size_t i = 0; i < n + 1; i++) {
+    cache->Release(handles[i]);
+  }
+  // Make sure eviction is triggered.
+  cache->SetCapacity(n);
+
+  // cache is under capacity now since elements were released
+  ASSERT_EQ(n, cache->GetUsage());
+
+  // element 0 is evicted and the rest is there
+  // This is consistent with the LRU policy since the element 0
+  // was released first
+  for (size_t i = 0; i < n + 1; i++) {
+    std::string key = ToString(i+1);
+    auto h = cache->Lookup(key);
+    if (h) {
+      ASSERT_NE(i, 0U);
+      cache->Release(h);
+    } else {
+      ASSERT_EQ(i, 0U);
+    }
+  }
+}
+
+namespace {
+std::vector<std::pair<int, int>> callback_state;
+void callback(void* entry, size_t charge) {
+  callback_state.push_back({DecodeValue(entry), static_cast<int>(charge)});
+}
+};
+
+TEST_P(CacheTest, ApplyToAllCacheEntiresTest) {
+  std::vector<std::pair<int, int>> inserted;
+  callback_state.clear();
+
+  for (int i = 0; i < 10; ++i) {
+    Insert(i, i * 2, i + 1);
+    inserted.push_back({i * 2, i + 1});
+  }
+  cache_->ApplyToAllCacheEntries(callback, true);
+
+  std::sort(inserted.begin(), inserted.end());
+  std::sort(callback_state.begin(), callback_state.end());
+  ASSERT_TRUE(inserted == callback_state);
+}
+
+TEST_P(CacheTest, DefaultShardBits) {
+  // test1: set the flag to false. Insert more keys than capacity. See if they
+  // all go through.
+  std::shared_ptr<Cache> cache = NewCache(16 * 1024L * 1024L);
+  ShardedCache* sc = dynamic_cast<ShardedCache*>(cache.get());
+  ASSERT_EQ(5, sc->GetNumShardBits());
+
+  cache = NewLRUCache(511 * 1024L, -1, true);
+  sc = dynamic_cast<ShardedCache*>(cache.get());
+  ASSERT_EQ(0, sc->GetNumShardBits());
+
+  cache = NewLRUCache(1024L * 1024L * 1024L, -1, true);
+  sc = dynamic_cast<ShardedCache*>(cache.get());
+  ASSERT_EQ(6, sc->GetNumShardBits());
+}
+
+TEST_P(CacheTest, GetCharge) {
+  Insert(1, 2);
+  Cache::Handle* h1 = cache_->Lookup(EncodeKey(1));
+  ASSERT_EQ(2, DecodeValue(cache_->Value(h1)));
+  ASSERT_EQ(1, cache_->GetCharge(h1));
+  cache_->Release(h1);
+}
+
+#ifdef SUPPORT_CLOCK_CACHE
+std::shared_ptr<Cache> (*new_clock_cache_func)(
+    size_t, int, bool, CacheMetadataChargePolicy) = NewClockCache;
+INSTANTIATE_TEST_CASE_P(CacheTestInstance, CacheTest,
+                        testing::Values(kLRU, kClock));
+#else
+INSTANTIATE_TEST_CASE_P(CacheTestInstance, CacheTest, testing::Values(kLRU));
+#endif  // SUPPORT_CLOCK_CACHE
+INSTANTIATE_TEST_CASE_P(CacheTestInstance, LRUCacheTest, testing::Values(kLRU));
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/cache/clock_cache.cc b/src/rocksdb/cache/clock_cache.cc
new file mode 100644
index 000000000..797a44fd9
--- /dev/null
+++ b/src/rocksdb/cache/clock_cache.cc
@@ -0,0 +1,761 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "cache/clock_cache.h"
+
+#ifndef SUPPORT_CLOCK_CACHE
+
+namespace ROCKSDB_NAMESPACE {
+
+std::shared_ptr<Cache> NewClockCache(
+    size_t /*capacity*/, int /*num_shard_bits*/, bool /*strict_capacity_limit*/,
+    CacheMetadataChargePolicy /*metadata_charge_policy*/) {
+  // Clock cache not supported.
+  return nullptr;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#else
+
+#include <assert.h>
+#include <atomic>
+#include <deque>
+
+// "tbb/concurrent_hash_map.h" requires RTTI if exception is enabled.
+// Disable it so users can chooose to disable RTTI.
+#ifndef ROCKSDB_USE_RTTI
+#define TBB_USE_EXCEPTIONS 0
+#endif
+#include "tbb/concurrent_hash_map.h"
+
+#include "cache/sharded_cache.h"
+#include "port/malloc.h"
+#include "port/port.h"
+#include "util/autovector.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// An implementation of the Cache interface based on CLOCK algorithm, with
+// better concurrent performance than LRUCache. The idea of CLOCK algorithm
+// is to maintain all cache entries in a circular list, and an iterator
+// (the "head") pointing to the last examined entry. Eviction starts from the
+// current head. Each entry is given a second chance before eviction, if it
+// has been access since last examine. In contrast to LRU, no modification
+// to the internal data-structure (except for flipping the usage bit) needs
+// to be done upon lookup. This gives us oppertunity to implement a cache
+// with better concurrency.
+//
+// Each cache entry is represented by a cache handle, and all the handles
+// are arranged in a circular list, as describe above. Upon erase of an entry,
+// we never remove the handle. Instead, the handle is put into a recycle bin
+// to be re-use. This is to avoid memory dealocation, which is hard to deal
+// with in concurrent environment.
+//
+// The cache also maintains a concurrent hash map for lookup. Any concurrent
+// hash map implementation should do the work. We currently use
+// tbb::concurrent_hash_map because it supports concurrent erase.
+//
+// Each cache handle has the following flags and counters, which are squeeze
+// in an atomic interger, to make sure the handle always be in a consistent
+// state:
+//
+//   * In-cache bit: whether the entry is reference by the cache itself. If
+//     an entry is in cache, its key would also be available in the hash map.
+//   * Usage bit: whether the entry has been access by user since last
+//     examine for eviction. Can be reset by eviction.
+//   * Reference count: reference count by user.
+//
+// An entry can be reference only when it's in cache. An entry can be evicted
+// only when it is in cache, has no usage since last examine, and reference
+// count is zero.
+//
+// The follow figure shows a possible layout of the cache. Boxes represents
+// cache handles and numbers in each box being in-cache bit, usage bit and
+// reference count respectively.
+//
+//    hash map:
+//      +-------+--------+
+//      |  key  | handle |
+//      +-------+--------+
+//      | "foo" |    5   |-------------------------------------+
+//      +-------+--------+                                     |
+//      | "bar" |    2   |--+                                  |
+//      +-------+--------+  |                                  |
+//                          |                                  |
+//                     head |                                  |
+//                       |  |                                  |
+//    circular list:     |  |                                  |
+//         +-------+   +-------+   +-------+   +-------+   +-------+   +-------
+//         |(0,0,0)|---|(1,1,0)|---|(0,0,0)|---|(0,1,3)|---|(1,0,0)|---|  ...
+//         +-------+   +-------+   +-------+   +-------+   +-------+   +-------
+//             |                       |
+//             +-------+   +-----------+
+//                     |   |
+//                   +---+---+
+//    recycle bin:   | 1 | 3 |
+//                   +---+---+
+//
+// Suppose we try to insert "baz" into the cache at this point and the cache is
+// full. The cache will first look for entries to evict, starting from where
+// head points to (the second entry). It resets usage bit of the second entry,
+// skips the third and fourth entry since they are not in cache, and finally
+// evict the fifth entry ("foo"). It looks at recycle bin for available handle,
+// grabs handle 3, and insert the key into the handle. The following figure
+// shows the resulting layout.
+//
+//    hash map:
+//      +-------+--------+
+//      |  key  | handle |
+//      +-------+--------+
+//      | "baz" |    3   |-------------+
+//      +-------+--------+             |
+//      | "bar" |    2   |--+          |
+//      +-------+--------+  |          |
+//                          |          |
+//                          |          |                                 head
+//                          |          |                                   |
+//    circular list:        |          |                                   |
+//         +-------+   +-------+   +-------+   +-------+   +-------+   +-------
+//         |(0,0,0)|---|(1,0,0)|---|(1,0,0)|---|(0,1,3)|---|(0,0,0)|---|  ...
+//         +-------+   +-------+   +-------+   +-------+   +-------+   +-------
+//             |                                               |
+//             +-------+   +-----------------------------------+
+//                     |   |
+//                   +---+---+
+//    recycle bin:   | 1 | 5 |
+//                   +---+---+
+//
+// A global mutex guards the circular list, the head, and the recycle bin.
+// We additionally require that modifying the hash map needs to hold the mutex.
+// As such, Modifying the cache (such as Insert() and Erase()) require to
+// hold the mutex. Lookup() only access the hash map and the flags associated
+// with each handle, and don't require explicit locking. Release() has to
+// acquire the mutex only when it releases the last reference to the entry and
+// the entry has been erased from cache explicitly. A future improvement could
+// be to remove the mutex completely.
+//
+// Benchmark:
+// We run readrandom db_bench on a test DB of size 13GB, with size of each
+// level:
+//
+//    Level    Files   Size(MB)
+//    -------------------------
+//      L0        1       0.01
+//      L1       18      17.32
+//      L2      230     182.94
+//      L3     1186    1833.63
+//      L4     4602    8140.30
+//
+// We test with both 32 and 16 read threads, with 2GB cache size (the whole DB
+// doesn't fits in) and 64GB cache size (the whole DB can fit in cache), and
+// whether to put index and filter blocks in block cache. The benchmark runs
+// with
+// with RocksDB 4.10. We got the following result:
+//
+// Threads Cache     Cache               ClockCache               LRUCache
+//         Size  Index/Filter Throughput(MB/s)   Hit Throughput(MB/s)    Hit
+//     32   2GB       yes               466.7  85.9%           433.7   86.5%
+//     32   2GB       no                529.9  72.7%           532.7   73.9%
+//     32  64GB       yes               649.9  99.9%           507.9   99.9%
+//     32  64GB       no                740.4  99.9%           662.8   99.9%
+//     16   2GB       yes               278.4  85.9%           283.4   86.5%
+//     16   2GB       no                318.6  72.7%           335.8   73.9%
+//     16  64GB       yes               391.9  99.9%           353.3   99.9%
+//     16  64GB       no                433.8  99.8%           419.4   99.8%
+
+// Cache entry meta data.
+struct CacheHandle {
+  Slice key;
+  uint32_t hash;
+  void* value;
+  size_t charge;
+  void (*deleter)(const Slice&, void* value);
+
+  // Flags and counters associated with the cache handle:
+  //   lowest bit: n-cache bit
+  //   second lowest bit: usage bit
+  //   the rest bits: reference count
+  // The handle is unused when flags equals to 0. The thread decreases the count
+  // to 0 is responsible to put the handle back to recycle_ and cleanup memory.
+  std::atomic<uint32_t> flags;
+
+  CacheHandle() = default;
+
+  CacheHandle(const CacheHandle& a) { *this = a; }
+
+  CacheHandle(const Slice& k, void* v,
+              void (*del)(const Slice& key, void* value))
+      : key(k), value(v), deleter(del) {}
+
+  CacheHandle& operator=(const CacheHandle& a) {
+    // Only copy members needed for deletion.
+    key = a.key;
+    value = a.value;
+    deleter = a.deleter;
+    return *this;
+  }
+
+  inline static size_t CalcTotalCharge(
+      Slice key, size_t charge,
+      CacheMetadataChargePolicy metadata_charge_policy) {
+    size_t meta_charge = 0;
+    if (metadata_charge_policy == kFullChargeCacheMetadata) {
+      meta_charge += sizeof(CacheHandle);
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+      meta_charge +=
+          malloc_usable_size(static_cast<void*>(const_cast<char*>(key.data())));
+#else
+      meta_charge += key.size();
+#endif
+    }
+    return charge + meta_charge;
+  }
+
+  inline size_t CalcTotalCharge(
+      CacheMetadataChargePolicy metadata_charge_policy) {
+    return CalcTotalCharge(key, charge, metadata_charge_policy);
+  }
+};
+
+// Key of hash map. We store hash value with the key for convenience.
+struct CacheKey {
+  Slice key;
+  uint32_t hash_value;
+
+  CacheKey() = default;
+
+  CacheKey(const Slice& k, uint32_t h) {
+    key = k;
+    hash_value = h;
+  }
+
+  static bool equal(const CacheKey& a, const CacheKey& b) {
+    return a.hash_value == b.hash_value && a.key == b.key;
+  }
+
+  static size_t hash(const CacheKey& a) {
+    return static_cast<size_t>(a.hash_value);
+  }
+};
+
+struct CleanupContext {
+  // List of values to be deleted, along with the key and deleter.
+  autovector<CacheHandle> to_delete_value;
+
+  // List of keys to be deleted.
+  autovector<const char*> to_delete_key;
+};
+
+// A cache shard which maintains its own CLOCK cache.
+class ClockCacheShard final : public CacheShard {
+ public:
+  // Hash map type.
+  typedef tbb::concurrent_hash_map<CacheKey, CacheHandle*, CacheKey> HashTable;
+
+  ClockCacheShard();
+  ~ClockCacheShard() override;
+
+  // Interfaces
+  void SetCapacity(size_t capacity) override;
+  void SetStrictCapacityLimit(bool strict_capacity_limit) override;
+  Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge,
+                void (*deleter)(const Slice& key, void* value),
+                Cache::Handle** handle, Cache::Priority priority) override;
+  Cache::Handle* Lookup(const Slice& key, uint32_t hash) override;
+  // If the entry in in cache, increase reference count and return true.
+  // Return false otherwise.
+  //
+  // Not necessary to hold mutex_ before being called.
+  bool Ref(Cache::Handle* handle) override;
+  bool Release(Cache::Handle* handle, bool force_erase = false) override;
+  void Erase(const Slice& key, uint32_t hash) override;
+  bool EraseAndConfirm(const Slice& key, uint32_t hash,
+                       CleanupContext* context);
+  size_t GetUsage() const override;
+  size_t GetPinnedUsage() const override;
+  void EraseUnRefEntries() override;
+  void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+                              bool thread_safe) override;
+
+ private:
+  static const uint32_t kInCacheBit = 1;
+  static const uint32_t kUsageBit = 2;
+  static const uint32_t kRefsOffset = 2;
+  static const uint32_t kOneRef = 1 << kRefsOffset;
+
+  // Helper functions to extract cache handle flags and counters.
+  static bool InCache(uint32_t flags) { return flags & kInCacheBit; }
+  static bool HasUsage(uint32_t flags) { return flags & kUsageBit; }
+  static uint32_t CountRefs(uint32_t flags) { return flags >> kRefsOffset; }
+
+  // Decrease reference count of the entry. If this decreases the count to 0,
+  // recycle the entry. If set_usage is true, also set the usage bit.
+  //
+  // returns true if a value is erased.
+  //
+  // Not necessary to hold mutex_ before being called.
+  bool Unref(CacheHandle* handle, bool set_usage, CleanupContext* context);
+
+  // Unset in-cache bit of the entry. Recycle the handle if necessary.
+  //
+  // returns true if a value is erased.
+  //
+  // Has to hold mutex_ before being called.
+  bool UnsetInCache(CacheHandle* handle, CleanupContext* context);
+
+  // Put the handle back to recycle_ list, and put the value associated with
+  // it into to-be-deleted list. It doesn't cleanup the key as it might be
+  // reused by another handle.
+  //
+  // Has to hold mutex_ before being called.
+  void RecycleHandle(CacheHandle* handle, CleanupContext* context);
+
+  // Delete keys and values in to-be-deleted list. Call the method without
+  // holding mutex, as destructors can be expensive.
+  void Cleanup(const CleanupContext& context);
+
+  // Examine the handle for eviction. If the handle is in cache, usage bit is
+  // not set, and referece count is 0, evict it from cache. Otherwise unset
+  // the usage bit.
+  //
+  // Has to hold mutex_ before being called.
+  bool TryEvict(CacheHandle* value, CleanupContext* context);
+
+  // Scan through the circular list, evict entries until we get enough capacity
+  // for new cache entry of specific size. Return true if success, false
+  // otherwise.
+  //
+  // Has to hold mutex_ before being called.
+  bool EvictFromCache(size_t charge, CleanupContext* context);
+
+  CacheHandle* Insert(const Slice& key, uint32_t hash, void* value,
+                      size_t change,
+                      void (*deleter)(const Slice& key, void* value),
+                      bool hold_reference, CleanupContext* context);
+
+  // Guards list_, head_, and recycle_. In addition, updating table_ also has
+  // to hold the mutex, to avoid the cache being in inconsistent state.
+  mutable port::Mutex mutex_;
+
+  // The circular list of cache handles. Initially the list is empty. Once a
+  // handle is needed by insertion, and no more handles are available in
+  // recycle bin, one more handle is appended to the end.
+  //
+  // We use std::deque for the circular list because we want to make sure
+  // pointers to handles are valid through out the life-cycle of the cache
+  // (in contrast to std::vector), and be able to grow the list (in contrast
+  // to statically allocated arrays).
+  std::deque<CacheHandle> list_;
+
+  // Pointer to the next handle in the circular list to be examine for
+  // eviction.
+  size_t head_;
+
+  // Recycle bin of cache handles.
+  autovector<CacheHandle*> recycle_;
+
+  // Maximum cache size.
+  std::atomic<size_t> capacity_;
+
+  // Current total size of the cache.
+  std::atomic<size_t> usage_;
+
+  // Total un-released cache size.
+  std::atomic<size_t> pinned_usage_;
+
+  // Whether allow insert into cache if cache is full.
+  std::atomic<bool> strict_capacity_limit_;
+
+  // Hash table (tbb::concurrent_hash_map) for lookup.
+  HashTable table_;
+};
+
+ClockCacheShard::ClockCacheShard()
+    : head_(0), usage_(0), pinned_usage_(0), strict_capacity_limit_(false) {}
+
+ClockCacheShard::~ClockCacheShard() {
+  for (auto& handle : list_) {
+    uint32_t flags = handle.flags.load(std::memory_order_relaxed);
+    if (InCache(flags) || CountRefs(flags) > 0) {
+      if (handle.deleter != nullptr) {
+        (*handle.deleter)(handle.key, handle.value);
+      }
+      delete[] handle.key.data();
+    }
+  }
+}
+
+size_t ClockCacheShard::GetUsage() const {
+  return usage_.load(std::memory_order_relaxed);
+}
+
+size_t ClockCacheShard::GetPinnedUsage() const {
+  return pinned_usage_.load(std::memory_order_relaxed);
+}
+
+void ClockCacheShard::ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+                                             bool thread_safe) {
+  if (thread_safe) {
+    mutex_.Lock();
+  }
+  for (auto& handle : list_) {
+    // Use relaxed semantics instead of acquire semantics since we are either
+    // holding mutex, or don't have thread safe requirement.
+    uint32_t flags = handle.flags.load(std::memory_order_relaxed);
+    if (InCache(flags)) {
+      callback(handle.value, handle.charge);
+    }
+  }
+  if (thread_safe) {
+    mutex_.Unlock();
+  }
+}
+
+void ClockCacheShard::RecycleHandle(CacheHandle* handle,
+                                    CleanupContext* context) {
+  mutex_.AssertHeld();
+  assert(!InCache(handle->flags) && CountRefs(handle->flags) == 0);
+  context->to_delete_key.push_back(handle->key.data());
+  context->to_delete_value.emplace_back(*handle);
+  size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_);
+  handle->key.clear();
+  handle->value = nullptr;
+  handle->deleter = nullptr;
+  recycle_.push_back(handle);
+  usage_.fetch_sub(total_charge, std::memory_order_relaxed);
+}
+
+void ClockCacheShard::Cleanup(const CleanupContext& context) {
+  for (const CacheHandle& handle : context.to_delete_value) {
+    if (handle.deleter) {
+      (*handle.deleter)(handle.key, handle.value);
+    }
+  }
+  for (const char* key : context.to_delete_key) {
+    delete[] key;
+  }
+}
+
+bool ClockCacheShard::Ref(Cache::Handle* h) {
+  auto handle = reinterpret_cast<CacheHandle*>(h);
+  // CAS loop to increase reference count.
+  uint32_t flags = handle->flags.load(std::memory_order_relaxed);
+  while (InCache(flags)) {
+    // Use acquire semantics on success, as further operations on the cache
+    // entry has to be order after reference count is increased.
+    if (handle->flags.compare_exchange_weak(flags, flags + kOneRef,
+                                            std::memory_order_acquire,
+                                            std::memory_order_relaxed)) {
+      if (CountRefs(flags) == 0) {
+        // No reference count before the operation.
+        size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_);
+        pinned_usage_.fetch_add(total_charge, std::memory_order_relaxed);
+      }
+      return true;
+    }
+  }
+  return false;
+}
+
+bool ClockCacheShard::Unref(CacheHandle* handle, bool set_usage,
+                            CleanupContext* context) {
+  if (set_usage) {
+    handle->flags.fetch_or(kUsageBit, std::memory_order_relaxed);
+  }
+  // Use acquire-release semantics as previous operations on the cache entry
+  // has to be order before reference count is decreased, and potential cleanup
+  // of the entry has to be order after.
+  uint32_t flags = handle->flags.fetch_sub(kOneRef, std::memory_order_acq_rel);
+  assert(CountRefs(flags) > 0);
+  if (CountRefs(flags) == 1) {
+    // this is the last reference.
+    size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_);
+    pinned_usage_.fetch_sub(total_charge, std::memory_order_relaxed);
+    // Cleanup if it is the last reference.
+    if (!InCache(flags)) {
+      MutexLock l(&mutex_);
+      RecycleHandle(handle, context);
+    }
+  }
+  return context->to_delete_value.size();
+}
+
+bool ClockCacheShard::UnsetInCache(CacheHandle* handle,
+                                   CleanupContext* context) {
+  mutex_.AssertHeld();
+  // Use acquire-release semantics as previous operations on the cache entry
+  // has to be order before reference count is decreased, and potential cleanup
+  // of the entry has to be order after.
+  uint32_t flags =
+      handle->flags.fetch_and(~kInCacheBit, std::memory_order_acq_rel);
+  // Cleanup if it is the last reference.
+  if (InCache(flags) && CountRefs(flags) == 0) {
+    RecycleHandle(handle, context);
+  }
+  return context->to_delete_value.size();
+}
+
+bool ClockCacheShard::TryEvict(CacheHandle* handle, CleanupContext* context) {
+  mutex_.AssertHeld();
+  uint32_t flags = kInCacheBit;
+  if (handle->flags.compare_exchange_strong(flags, 0, std::memory_order_acquire,
+                                            std::memory_order_relaxed)) {
+    bool erased __attribute__((__unused__)) =
+        table_.erase(CacheKey(handle->key, handle->hash));
+    assert(erased);
+    RecycleHandle(handle, context);
+    return true;
+  }
+  handle->flags.fetch_and(~kUsageBit, std::memory_order_relaxed);
+  return false;
+}
+
+bool ClockCacheShard::EvictFromCache(size_t charge, CleanupContext* context) {
+  size_t usage = usage_.load(std::memory_order_relaxed);
+  size_t capacity = capacity_.load(std::memory_order_relaxed);
+  if (usage == 0) {
+    return charge <= capacity;
+  }
+  size_t new_head = head_;
+  bool second_iteration = false;
+  while (usage + charge > capacity) {
+    assert(new_head < list_.size());
+    if (TryEvict(&list_[new_head], context)) {
+      usage = usage_.load(std::memory_order_relaxed);
+    }
+    new_head = (new_head + 1 >= list_.size()) ? 0 : new_head + 1;
+    if (new_head == head_) {
+      if (second_iteration) {
+        return false;
+      } else {
+        second_iteration = true;
+      }
+    }
+  }
+  head_ = new_head;
+  return true;
+}
+
+void ClockCacheShard::SetCapacity(size_t capacity) {
+  CleanupContext context;
+  {
+    MutexLock l(&mutex_);
+    capacity_.store(capacity, std::memory_order_relaxed);
+    EvictFromCache(0, &context);
+  }
+  Cleanup(context);
+}
+
+void ClockCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) {
+  strict_capacity_limit_.store(strict_capacity_limit,
+                               std::memory_order_relaxed);
+}
+
+CacheHandle* ClockCacheShard::Insert(
+    const Slice& key, uint32_t hash, void* value, size_t charge,
+    void (*deleter)(const Slice& key, void* value), bool hold_reference,
+    CleanupContext* context) {
+  size_t total_charge =
+      CacheHandle::CalcTotalCharge(key, charge, metadata_charge_policy_);
+  MutexLock l(&mutex_);
+  bool success = EvictFromCache(total_charge, context);
+  bool strict = strict_capacity_limit_.load(std::memory_order_relaxed);
+  if (!success && (strict || !hold_reference)) {
+    context->to_delete_key.push_back(key.data());
+    if (!hold_reference) {
+      context->to_delete_value.emplace_back(key, value, deleter);
+    }
+    return nullptr;
+  }
+  // Grab available handle from recycle bin. If recycle bin is empty, create
+  // and append new handle to end of circular list.
+  CacheHandle* handle = nullptr;
+  if (!recycle_.empty()) {
+    handle = recycle_.back();
+    recycle_.pop_back();
+  } else {
+    list_.emplace_back();
+    handle = &list_.back();
+  }
+  // Fill handle.
+  handle->key = key;
+  handle->hash = hash;
+  handle->value = value;
+  handle->charge = charge;
+  handle->deleter = deleter;
+  uint32_t flags = hold_reference ? kInCacheBit + kOneRef : kInCacheBit;
+  handle->flags.store(flags, std::memory_order_relaxed);
+  HashTable::accessor accessor;
+  if (table_.find(accessor, CacheKey(key, hash))) {
+    CacheHandle* existing_handle = accessor->second;
+    table_.erase(accessor);
+    UnsetInCache(existing_handle, context);
+  }
+  table_.insert(HashTable::value_type(CacheKey(key, hash), handle));
+  if (hold_reference) {
+    pinned_usage_.fetch_add(total_charge, std::memory_order_relaxed);
+  }
+  usage_.fetch_add(total_charge, std::memory_order_relaxed);
+  return handle;
+}
+
+Status ClockCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
+                               size_t charge,
+                               void (*deleter)(const Slice& key, void* value),
+                               Cache::Handle** out_handle,
+                               Cache::Priority /*priority*/) {
+  CleanupContext context;
+  HashTable::accessor accessor;
+  char* key_data = new char[key.size()];
+  memcpy(key_data, key.data(), key.size());
+  Slice key_copy(key_data, key.size());
+  CacheHandle* handle = Insert(key_copy, hash, value, charge, deleter,
+                               out_handle != nullptr, &context);
+  Status s;
+  if (out_handle != nullptr) {
+    if (handle == nullptr) {
+      s = Status::Incomplete("Insert failed due to LRU cache being full.");
+    } else {
+      *out_handle = reinterpret_cast<Cache::Handle*>(handle);
+    }
+  }
+  Cleanup(context);
+  return s;
+}
+
+Cache::Handle* ClockCacheShard::Lookup(const Slice& key, uint32_t hash) {
+  HashTable::const_accessor accessor;
+  if (!table_.find(accessor, CacheKey(key, hash))) {
+    return nullptr;
+  }
+  CacheHandle* handle = accessor->second;
+  accessor.release();
+  // Ref() could fail if another thread sneak in and evict/erase the cache
+  // entry before we are able to hold reference.
+  if (!Ref(reinterpret_cast<Cache::Handle*>(handle))) {
+    return nullptr;
+  }
+  // Double check the key since the handle may now representing another key
+  // if other threads sneak in, evict/erase the entry and re-used the handle
+  // for another cache entry.
+  if (hash != handle->hash || key != handle->key) {
+    CleanupContext context;
+    Unref(handle, false, &context);
+    // It is possible Unref() delete the entry, so we need to cleanup.
+    Cleanup(context);
+    return nullptr;
+  }
+  return reinterpret_cast<Cache::Handle*>(handle);
+}
+
+bool ClockCacheShard::Release(Cache::Handle* h, bool force_erase) {
+  CleanupContext context;
+  CacheHandle* handle = reinterpret_cast<CacheHandle*>(h);
+  bool erased = Unref(handle, true, &context);
+  if (force_erase && !erased) {
+    erased = EraseAndConfirm(handle->key, handle->hash, &context);
+  }
+  Cleanup(context);
+  return erased;
+}
+
+void ClockCacheShard::Erase(const Slice& key, uint32_t hash) {
+  CleanupContext context;
+  EraseAndConfirm(key, hash, &context);
+  Cleanup(context);
+}
+
+bool ClockCacheShard::EraseAndConfirm(const Slice& key, uint32_t hash,
+                                      CleanupContext* context) {
+  MutexLock l(&mutex_);
+  HashTable::accessor accessor;
+  bool erased = false;
+  if (table_.find(accessor, CacheKey(key, hash))) {
+    CacheHandle* handle = accessor->second;
+    table_.erase(accessor);
+    erased = UnsetInCache(handle, context);
+  }
+  return erased;
+}
+
+void ClockCacheShard::EraseUnRefEntries() {
+  CleanupContext context;
+  {
+    MutexLock l(&mutex_);
+    table_.clear();
+    for (auto& handle : list_) {
+      UnsetInCache(&handle, &context);
+    }
+  }
+  Cleanup(context);
+}
+
+class ClockCache final : public ShardedCache {
+ public:
+  ClockCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit,
+             CacheMetadataChargePolicy metadata_charge_policy)
+      : ShardedCache(capacity, num_shard_bits, strict_capacity_limit) {
+    int num_shards = 1 << num_shard_bits;
+    shards_ = new ClockCacheShard[num_shards];
+    for (int i = 0; i < num_shards; i++) {
+      shards_[i].set_metadata_charge_policy(metadata_charge_policy);
+    }
+    SetCapacity(capacity);
+    SetStrictCapacityLimit(strict_capacity_limit);
+  }
+
+  ~ClockCache() override { delete[] shards_; }
+
+  const char* Name() const override { return "ClockCache"; }
+
+  CacheShard* GetShard(int shard) override {
+    return reinterpret_cast<CacheShard*>(&shards_[shard]);
+  }
+
+  const CacheShard* GetShard(int shard) const override {
+    return reinterpret_cast<CacheShard*>(&shards_[shard]);
+  }
+
+  void* Value(Handle* handle) override {
+    return reinterpret_cast<const CacheHandle*>(handle)->value;
+  }
+
+  size_t GetCharge(Handle* handle) const override {
+    return reinterpret_cast<const CacheHandle*>(handle)->charge;
+  }
+
+  uint32_t GetHash(Handle* handle) const override {
+    return reinterpret_cast<const CacheHandle*>(handle)->hash;
+  }
+
+  void DisownData() override { shards_ = nullptr; }
+
+ private:
+  ClockCacheShard* shards_;
+};
+
+}  // end anonymous namespace
+
+std::shared_ptr<Cache> NewClockCache(
+    size_t capacity, int num_shard_bits, bool strict_capacity_limit,
+    CacheMetadataChargePolicy metadata_charge_policy) {
+  if (num_shard_bits < 0) {
+    num_shard_bits = GetDefaultCacheShardBits(capacity);
+  }
+  return std::make_shared<ClockCache>(
+      capacity, num_shard_bits, strict_capacity_limit, metadata_charge_policy);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // SUPPORT_CLOCK_CACHE
diff --git a/src/rocksdb/cache/clock_cache.h b/src/rocksdb/cache/clock_cache.h
new file mode 100644
index 000000000..1614c0ed4
--- /dev/null
+++ b/src/rocksdb/cache/clock_cache.h
@@ -0,0 +1,16 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "rocksdb/cache.h"
+
+#if defined(TBB) && !defined(ROCKSDB_LITE)
+#define SUPPORT_CLOCK_CACHE
+#endif
diff --git a/src/rocksdb/cache/lru_cache.cc b/src/rocksdb/cache/lru_cache.cc
new file mode 100644
index 000000000..987417806
--- /dev/null
+++ b/src/rocksdb/cache/lru_cache.cc
@@ -0,0 +1,574 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "cache/lru_cache.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string>
+
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+LRUHandleTable::LRUHandleTable() : list_(nullptr), length_(0), elems_(0) {
+  Resize();
+}
+
+LRUHandleTable::~LRUHandleTable() {
+  ApplyToAllCacheEntries([](LRUHandle* h) {
+    if (!h->HasRefs()) {
+      h->Free();
+    }
+  });
+  delete[] list_;
+}
+
+LRUHandle* LRUHandleTable::Lookup(const Slice& key, uint32_t hash) {
+  return *FindPointer(key, hash);
+}
+
+LRUHandle* LRUHandleTable::Insert(LRUHandle* h) {
+  LRUHandle** ptr = FindPointer(h->key(), h->hash);
+  LRUHandle* old = *ptr;
+  h->next_hash = (old == nullptr ? nullptr : old->next_hash);
+  *ptr = h;
+  if (old == nullptr) {
+    ++elems_;
+    if (elems_ > length_) {
+      // Since each cache entry is fairly large, we aim for a small
+      // average linked list length (<= 1).
+      Resize();
+    }
+  }
+  return old;
+}
+
+LRUHandle* LRUHandleTable::Remove(const Slice& key, uint32_t hash) {
+  LRUHandle** ptr = FindPointer(key, hash);
+  LRUHandle* result = *ptr;
+  if (result != nullptr) {
+    *ptr = result->next_hash;
+    --elems_;
+  }
+  return result;
+}
+
+LRUHandle** LRUHandleTable::FindPointer(const Slice& key, uint32_t hash) {
+  LRUHandle** ptr = &list_[hash & (length_ - 1)];
+  while (*ptr != nullptr && ((*ptr)->hash != hash || key != (*ptr)->key())) {
+    ptr = &(*ptr)->next_hash;
+  }
+  return ptr;
+}
+
+void LRUHandleTable::Resize() {
+  uint32_t new_length = 16;
+  while (new_length < elems_ * 1.5) {
+    new_length *= 2;
+  }
+  LRUHandle** new_list = new LRUHandle*[new_length];
+  memset(new_list, 0, sizeof(new_list[0]) * new_length);
+  uint32_t count = 0;
+  for (uint32_t i = 0; i < length_; i++) {
+    LRUHandle* h = list_[i];
+    while (h != nullptr) {
+      LRUHandle* next = h->next_hash;
+      uint32_t hash = h->hash;
+      LRUHandle** ptr = &new_list[hash & (new_length - 1)];
+      h->next_hash = *ptr;
+      *ptr = h;
+      h = next;
+      count++;
+    }
+  }
+  assert(elems_ == count);
+  delete[] list_;
+  list_ = new_list;
+  length_ = new_length;
+}
+
+LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit,
+                             double high_pri_pool_ratio,
+                             bool use_adaptive_mutex,
+                             CacheMetadataChargePolicy metadata_charge_policy)
+    : capacity_(0),
+      high_pri_pool_usage_(0),
+      strict_capacity_limit_(strict_capacity_limit),
+      high_pri_pool_ratio_(high_pri_pool_ratio),
+      high_pri_pool_capacity_(0),
+      usage_(0),
+      lru_usage_(0),
+      mutex_(use_adaptive_mutex) {
+  set_metadata_charge_policy(metadata_charge_policy);
+  // Make empty circular linked list
+  lru_.next = &lru_;
+  lru_.prev = &lru_;
+  lru_low_pri_ = &lru_;
+  SetCapacity(capacity);
+}
+
+void LRUCacheShard::EraseUnRefEntries() {
+  autovector<LRUHandle*> last_reference_list;
+  {
+    MutexLock l(&mutex_);
+    while (lru_.next != &lru_) {
+      LRUHandle* old = lru_.next;
+      // LRU list contains only elements which can be evicted
+      assert(old->InCache() && !old->HasRefs());
+      LRU_Remove(old);
+      table_.Remove(old->key(), old->hash);
+      old->SetInCache(false);
+      size_t total_charge = old->CalcTotalCharge(metadata_charge_policy_);
+      assert(usage_ >= total_charge);
+      usage_ -= total_charge;
+      last_reference_list.push_back(old);
+    }
+  }
+
+  for (auto entry : last_reference_list) {
+    entry->Free();
+  }
+}
+
+void LRUCacheShard::ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+                                           bool thread_safe) {
+  const auto applyCallback = [&]() {
+    table_.ApplyToAllCacheEntries(
+        [callback](LRUHandle* h) { callback(h->value, h->charge); });
+  };
+
+  if (thread_safe) {
+    MutexLock l(&mutex_);
+    applyCallback();
+  } else {
+    applyCallback();
+  }
+}
+
+void LRUCacheShard::TEST_GetLRUList(LRUHandle** lru, LRUHandle** lru_low_pri) {
+  MutexLock l(&mutex_);
+  *lru = &lru_;
+  *lru_low_pri = lru_low_pri_;
+}
+
+size_t LRUCacheShard::TEST_GetLRUSize() {
+  MutexLock l(&mutex_);
+  LRUHandle* lru_handle = lru_.next;
+  size_t lru_size = 0;
+  while (lru_handle != &lru_) {
+    lru_size++;
+    lru_handle = lru_handle->next;
+  }
+  return lru_size;
+}
+
+double LRUCacheShard::GetHighPriPoolRatio() {
+  MutexLock l(&mutex_);
+  return high_pri_pool_ratio_;
+}
+
+void LRUCacheShard::LRU_Remove(LRUHandle* e) {
+  assert(e->next != nullptr);
+  assert(e->prev != nullptr);
+  if (lru_low_pri_ == e) {
+    lru_low_pri_ = e->prev;
+  }
+  e->next->prev = e->prev;
+  e->prev->next = e->next;
+  e->prev = e->next = nullptr;
+  size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_);
+  assert(lru_usage_ >= total_charge);
+  lru_usage_ -= total_charge;
+  if (e->InHighPriPool()) {
+    assert(high_pri_pool_usage_ >= total_charge);
+    high_pri_pool_usage_ -= total_charge;
+  }
+}
+
+void LRUCacheShard::LRU_Insert(LRUHandle* e) {
+  assert(e->next == nullptr);
+  assert(e->prev == nullptr);
+  size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_);
+  if (high_pri_pool_ratio_ > 0 && (e->IsHighPri() || e->HasHit())) {
+    // Inset "e" to head of LRU list.
+    e->next = &lru_;
+    e->prev = lru_.prev;
+    e->prev->next = e;
+    e->next->prev = e;
+    e->SetInHighPriPool(true);
+    high_pri_pool_usage_ += total_charge;
+    MaintainPoolSize();
+  } else {
+    // Insert "e" to the head of low-pri pool. Note that when
+    // high_pri_pool_ratio is 0, head of low-pri pool is also head of LRU list.
+    e->next = lru_low_pri_->next;
+    e->prev = lru_low_pri_;
+    e->prev->next = e;
+    e->next->prev = e;
+    e->SetInHighPriPool(false);
+    lru_low_pri_ = e;
+  }
+  lru_usage_ += total_charge;
+}
+
+void LRUCacheShard::MaintainPoolSize() {
+  while (high_pri_pool_usage_ > high_pri_pool_capacity_) {
+    // Overflow last entry in high-pri pool to low-pri pool.
+    lru_low_pri_ = lru_low_pri_->next;
+    assert(lru_low_pri_ != &lru_);
+    lru_low_pri_->SetInHighPriPool(false);
+    size_t total_charge =
+        lru_low_pri_->CalcTotalCharge(metadata_charge_policy_);
+    assert(high_pri_pool_usage_ >= total_charge);
+    high_pri_pool_usage_ -= total_charge;
+  }
+}
+
+void LRUCacheShard::EvictFromLRU(size_t charge,
+                                 autovector<LRUHandle*>* deleted) {
+  while ((usage_ + charge) > capacity_ && lru_.next != &lru_) {
+    LRUHandle* old = lru_.next;
+    // LRU list contains only elements which can be evicted
+    assert(old->InCache() && !old->HasRefs());
+    LRU_Remove(old);
+    table_.Remove(old->key(), old->hash);
+    old->SetInCache(false);
+    size_t old_total_charge = old->CalcTotalCharge(metadata_charge_policy_);
+    assert(usage_ >= old_total_charge);
+    usage_ -= old_total_charge;
+    deleted->push_back(old);
+  }
+}
+
+void LRUCacheShard::SetCapacity(size_t capacity) {
+  autovector<LRUHandle*> last_reference_list;
+  {
+    MutexLock l(&mutex_);
+    capacity_ = capacity;
+    high_pri_pool_capacity_ = capacity_ * high_pri_pool_ratio_;
+    EvictFromLRU(0, &last_reference_list);
+  }
+
+  // Free the entries outside of mutex for performance reasons
+  for (auto entry : last_reference_list) {
+    entry->Free();
+  }
+}
+
+void LRUCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) {
+  MutexLock l(&mutex_);
+  strict_capacity_limit_ = strict_capacity_limit;
+}
+
+Cache::Handle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash) {
+  MutexLock l(&mutex_);
+  LRUHandle* e = table_.Lookup(key, hash);
+  if (e != nullptr) {
+    assert(e->InCache());
+    if (!e->HasRefs()) {
+      // The entry is in LRU since it's in hash and has no external references
+      LRU_Remove(e);
+    }
+    e->Ref();
+    e->SetHit();
+  }
+  return reinterpret_cast<Cache::Handle*>(e);
+}
+
+bool LRUCacheShard::Ref(Cache::Handle* h) {
+  LRUHandle* e = reinterpret_cast<LRUHandle*>(h);
+  MutexLock l(&mutex_);
+  // To create another reference - entry must be already externally referenced
+  assert(e->HasRefs());
+  e->Ref();
+  return true;
+}
+
+void LRUCacheShard::SetHighPriorityPoolRatio(double high_pri_pool_ratio) {
+  MutexLock l(&mutex_);
+  high_pri_pool_ratio_ = high_pri_pool_ratio;
+  high_pri_pool_capacity_ = capacity_ * high_pri_pool_ratio_;
+  MaintainPoolSize();
+}
+
+bool LRUCacheShard::Release(Cache::Handle* handle, bool force_erase) {
+  if (handle == nullptr) {
+    return false;
+  }
+  LRUHandle* e = reinterpret_cast<LRUHandle*>(handle);
+  bool last_reference = false;
+  {
+    MutexLock l(&mutex_);
+    last_reference = e->Unref();
+    if (last_reference && e->InCache()) {
+      // The item is still in cache, and nobody else holds a reference to it
+      if (usage_ > capacity_ || force_erase) {
+        // The LRU list must be empty since the cache is full
+        assert(lru_.next == &lru_ || force_erase);
+        // Take this opportunity and remove the item
+        table_.Remove(e->key(), e->hash);
+        e->SetInCache(false);
+      } else {
+        // Put the item back on the LRU list, and don't free it
+        LRU_Insert(e);
+        last_reference = false;
+      }
+    }
+    if (last_reference) {
+      size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_);
+      assert(usage_ >= total_charge);
+      usage_ -= total_charge;
+    }
+  }
+
+  // Free the entry here outside of mutex for performance reasons
+  if (last_reference) {
+    e->Free();
+  }
+  return last_reference;
+}
+
+Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
+                             size_t charge,
+                             void (*deleter)(const Slice& key, void* value),
+                             Cache::Handle** handle, Cache::Priority priority) {
+  // Allocate the memory here outside of the mutex
+  // If the cache is full, we'll have to release it
+  // It shouldn't happen very often though.
+  LRUHandle* e = reinterpret_cast<LRUHandle*>(
+      new char[sizeof(LRUHandle) - 1 + key.size()]);
+  Status s = Status::OK();
+  autovector<LRUHandle*> last_reference_list;
+
+  e->value = value;
+  e->deleter = deleter;
+  e->charge = charge;
+  e->key_length = key.size();
+  e->flags = 0;
+  e->hash = hash;
+  e->refs = 0;
+  e->next = e->prev = nullptr;
+  e->SetInCache(true);
+  e->SetPriority(priority);
+  memcpy(e->key_data, key.data(), key.size());
+  size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_);
+
+  {
+    MutexLock l(&mutex_);
+
+    // Free the space following strict LRU policy until enough space
+    // is freed or the lru list is empty
+    EvictFromLRU(total_charge, &last_reference_list);
+
+    if ((usage_ + total_charge) > capacity_ &&
+        (strict_capacity_limit_ || handle == nullptr)) {
+      if (handle == nullptr) {
+        // Don't insert the entry but still return ok, as if the entry inserted
+        // into cache and get evicted immediately.
+        e->SetInCache(false);
+        last_reference_list.push_back(e);
+      } else {
+        delete[] reinterpret_cast<char*>(e);
+        *handle = nullptr;
+        s = Status::Incomplete("Insert failed due to LRU cache being full.");
+      }
+    } else {
+      // Insert into the cache. Note that the cache might get larger than its
+      // capacity if not enough space was freed up.
+      LRUHandle* old = table_.Insert(e);
+      usage_ += total_charge;
+      if (old != nullptr) {
+        assert(old->InCache());
+        old->SetInCache(false);
+        if (!old->HasRefs()) {
+          // old is on LRU because it's in cache and its reference count is 0
+          LRU_Remove(old);
+          size_t old_total_charge =
+              old->CalcTotalCharge(metadata_charge_policy_);
+          assert(usage_ >= old_total_charge);
+          usage_ -= old_total_charge;
+          last_reference_list.push_back(old);
+        }
+      }
+      if (handle == nullptr) {
+        LRU_Insert(e);
+      } else {
+        e->Ref();
+        *handle = reinterpret_cast<Cache::Handle*>(e);
+      }
+    }
+  }
+
+  // Free the entries here outside of mutex for performance reasons
+  for (auto entry : last_reference_list) {
+    entry->Free();
+  }
+
+  return s;
+}
+
+void LRUCacheShard::Erase(const Slice& key, uint32_t hash) {
+  LRUHandle* e;
+  bool last_reference = false;
+  {
+    MutexLock l(&mutex_);
+    e = table_.Remove(key, hash);
+    if (e != nullptr) {
+      assert(e->InCache());
+      e->SetInCache(false);
+      if (!e->HasRefs()) {
+        // The entry is in LRU since it's in hash and has no external references
+        LRU_Remove(e);
+        size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_);
+        assert(usage_ >= total_charge);
+        usage_ -= total_charge;
+        last_reference = true;
+      }
+    }
+  }
+
+  // Free the entry here outside of mutex for performance reasons
+  // last_reference will only be true if e != nullptr
+  if (last_reference) {
+    e->Free();
+  }
+}
+
+size_t LRUCacheShard::GetUsage() const {
+  MutexLock l(&mutex_);
+  return usage_;
+}
+
+size_t LRUCacheShard::GetPinnedUsage() const {
+  MutexLock l(&mutex_);
+  assert(usage_ >= lru_usage_);
+  return usage_ - lru_usage_;
+}
+
+std::string LRUCacheShard::GetPrintableOptions() const {
+  const int kBufferSize = 200;
+  char buffer[kBufferSize];
+  {
+    MutexLock l(&mutex_);
+    snprintf(buffer, kBufferSize, "    high_pri_pool_ratio: %.3lf\n",
+             high_pri_pool_ratio_);
+  }
+  return std::string(buffer);
+}
+
+LRUCache::LRUCache(size_t capacity, int num_shard_bits,
+                   bool strict_capacity_limit, double high_pri_pool_ratio,
+                   std::shared_ptr<MemoryAllocator> allocator,
+                   bool use_adaptive_mutex,
+                   CacheMetadataChargePolicy metadata_charge_policy)
+    : ShardedCache(capacity, num_shard_bits, strict_capacity_limit,
+                   std::move(allocator)) {
+  num_shards_ = 1 << num_shard_bits;
+  shards_ = reinterpret_cast<LRUCacheShard*>(
+      port::cacheline_aligned_alloc(sizeof(LRUCacheShard) * num_shards_));
+  size_t per_shard = (capacity + (num_shards_ - 1)) / num_shards_;
+  for (int i = 0; i < num_shards_; i++) {
+    new (&shards_[i])
+        LRUCacheShard(per_shard, strict_capacity_limit, high_pri_pool_ratio,
+                      use_adaptive_mutex, metadata_charge_policy);
+  }
+}
+
+LRUCache::~LRUCache() {
+  if (shards_ != nullptr) {
+    assert(num_shards_ > 0);
+    for (int i = 0; i < num_shards_; i++) {
+      shards_[i].~LRUCacheShard();
+    }
+    port::cacheline_aligned_free(shards_);
+  }
+}
+
+CacheShard* LRUCache::GetShard(int shard) {
+  return reinterpret_cast<CacheShard*>(&shards_[shard]);
+}
+
+const CacheShard* LRUCache::GetShard(int shard) const {
+  return reinterpret_cast<CacheShard*>(&shards_[shard]);
+}
+
+void* LRUCache::Value(Handle* handle) {
+  return reinterpret_cast<const LRUHandle*>(handle)->value;
+}
+
+size_t LRUCache::GetCharge(Handle* handle) const {
+  return reinterpret_cast<const LRUHandle*>(handle)->charge;
+}
+
+uint32_t LRUCache::GetHash(Handle* handle) const {
+  return reinterpret_cast<const LRUHandle*>(handle)->hash;
+}
+
+void LRUCache::DisownData() {
+// Do not drop data if compile with ASAN to suppress leak warning.
+#if defined(__clang__)
+#if !defined(__has_feature) || !__has_feature(address_sanitizer)
+  shards_ = nullptr;
+  num_shards_ = 0;
+#endif
+#else  // __clang__
+#ifndef __SANITIZE_ADDRESS__
+  shards_ = nullptr;
+  num_shards_ = 0;
+#endif  // !__SANITIZE_ADDRESS__
+#endif  // __clang__
+}
+
+size_t LRUCache::TEST_GetLRUSize() {
+  size_t lru_size_of_all_shards = 0;
+  for (int i = 0; i < num_shards_; i++) {
+    lru_size_of_all_shards += shards_[i].TEST_GetLRUSize();
+  }
+  return lru_size_of_all_shards;
+}
+
+double LRUCache::GetHighPriPoolRatio() {
+  double result = 0.0;
+  if (num_shards_ > 0) {
+    result = shards_[0].GetHighPriPoolRatio();
+  }
+  return result;
+}
+
+std::shared_ptr<Cache> NewLRUCache(const LRUCacheOptions& cache_opts) {
+  return NewLRUCache(cache_opts.capacity, cache_opts.num_shard_bits,
+                     cache_opts.strict_capacity_limit,
+                     cache_opts.high_pri_pool_ratio,
+                     cache_opts.memory_allocator, cache_opts.use_adaptive_mutex,
+                     cache_opts.metadata_charge_policy);
+}
+
+std::shared_ptr<Cache> NewLRUCache(
+    size_t capacity, int num_shard_bits, bool strict_capacity_limit,
+    double high_pri_pool_ratio,
+    std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex,
+    CacheMetadataChargePolicy metadata_charge_policy) {
+  if (num_shard_bits >= 20) {
+    return nullptr;  // the cache cannot be sharded into too many fine pieces
+  }
+  if (high_pri_pool_ratio < 0.0 || high_pri_pool_ratio > 1.0) {
+    // invalid high_pri_pool_ratio
+    return nullptr;
+  }
+  if (num_shard_bits < 0) {
+    num_shard_bits = GetDefaultCacheShardBits(capacity);
+  }
+  return std::make_shared<LRUCache>(
+      capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio,
+      std::move(memory_allocator), use_adaptive_mutex, metadata_charge_policy);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/lru_cache.h b/src/rocksdb/cache/lru_cache.h
new file mode 100644
index 000000000..827e0bece
--- /dev/null
+++ b/src/rocksdb/cache/lru_cache.h
@@ -0,0 +1,339 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <string>
+
+#include "cache/sharded_cache.h"
+
+#include "port/malloc.h"
+#include "port/port.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// LRU cache implementation. This class is not thread-safe.
+
+// An entry is a variable length heap-allocated structure.
+// Entries are referenced by cache and/or by any external entity.
+// The cache keeps all its entries in a hash table. Some elements
+// are also stored on LRU list.
+//
+// LRUHandle can be in these states:
+// 1. Referenced externally AND in hash table.
+//    In that case the entry is *not* in the LRU list
+//    (refs >= 1 && in_cache == true)
+// 2. Not referenced externally AND in hash table.
+//    In that case the entry is in the LRU list and can be freed.
+//    (refs == 0 && in_cache == true)
+// 3. Referenced externally AND not in hash table.
+//    In that case the entry is not in the LRU list and not in hash table.
+//    The entry can be freed when refs becomes 0.
+//    (refs >= 1 && in_cache == false)
+//
+// All newly created LRUHandles are in state 1. If you call
+// LRUCacheShard::Release on entry in state 1, it will go into state 2.
+// To move from state 1 to state 3, either call LRUCacheShard::Erase or
+// LRUCacheShard::Insert with the same key (but possibly different value).
+// To move from state 2 to state 1, use LRUCacheShard::Lookup.
+// Before destruction, make sure that no handles are in state 1. This means
+// that any successful LRUCacheShard::Lookup/LRUCacheShard::Insert have a
+// matching LRUCache::Release (to move into state 2) or LRUCacheShard::Erase
+// (to move into state 3).
+
+struct LRUHandle {
+  void* value;
+  void (*deleter)(const Slice&, void* value);
+  LRUHandle* next_hash;
+  LRUHandle* next;
+  LRUHandle* prev;
+  size_t charge;  // TODO(opt): Only allow uint32_t?
+  size_t key_length;
+  // The hash of key(). Used for fast sharding and comparisons.
+  uint32_t hash;
+  // The number of external refs to this entry. The cache itself is not counted.
+  uint32_t refs;
+
+  enum Flags : uint8_t {
+    // Whether this entry is referenced by the hash table.
+    IN_CACHE = (1 << 0),
+    // Whether this entry is high priority entry.
+    IS_HIGH_PRI = (1 << 1),
+    // Whether this entry is in high-pri pool.
+    IN_HIGH_PRI_POOL = (1 << 2),
+    // Wwhether this entry has had any lookups (hits).
+    HAS_HIT = (1 << 3),
+  };
+
+  uint8_t flags;
+
+  // Beginning of the key (MUST BE THE LAST FIELD IN THIS STRUCT!)
+  char key_data[1];
+
+  Slice key() const { return Slice(key_data, key_length); }
+
+  // Increase the reference count by 1.
+  void Ref() { refs++; }
+
+  // Just reduce the reference count by 1. Return true if it was last reference.
+  bool Unref() {
+    assert(refs > 0);
+    refs--;
+    return refs == 0;
+  }
+
+  // Return true if there are external refs, false otherwise.
+  bool HasRefs() const { return refs > 0; }
+
+  bool InCache() const { return flags & IN_CACHE; }
+  bool IsHighPri() const { return flags & IS_HIGH_PRI; }
+  bool InHighPriPool() const { return flags & IN_HIGH_PRI_POOL; }
+  bool HasHit() const { return flags & HAS_HIT; }
+
+  void SetInCache(bool in_cache) {
+    if (in_cache) {
+      flags |= IN_CACHE;
+    } else {
+      flags &= ~IN_CACHE;
+    }
+  }
+
+  void SetPriority(Cache::Priority priority) {
+    if (priority == Cache::Priority::HIGH) {
+      flags |= IS_HIGH_PRI;
+    } else {
+      flags &= ~IS_HIGH_PRI;
+    }
+  }
+
+  void SetInHighPriPool(bool in_high_pri_pool) {
+    if (in_high_pri_pool) {
+      flags |= IN_HIGH_PRI_POOL;
+    } else {
+      flags &= ~IN_HIGH_PRI_POOL;
+    }
+  }
+
+  void SetHit() { flags |= HAS_HIT; }
+
+  void Free() {
+    assert(refs == 0);
+    if (deleter) {
+      (*deleter)(key(), value);
+    }
+    delete[] reinterpret_cast<char*>(this);
+  }
+
+  // Caclculate the memory usage by metadata
+  inline size_t CalcTotalCharge(
+      CacheMetadataChargePolicy metadata_charge_policy) {
+    size_t meta_charge = 0;
+    if (metadata_charge_policy == kFullChargeCacheMetadata) {
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+      meta_charge += malloc_usable_size(static_cast<void*>(this));
+#else
+      // This is the size that is used when a new handle is created
+      meta_charge += sizeof(LRUHandle) - 1 + key_length;
+#endif
+    }
+    return charge + meta_charge;
+  }
+};
+
+// We provide our own simple hash table since it removes a whole bunch
+// of porting hacks and is also faster than some of the built-in hash
+// table implementations in some of the compiler/runtime combinations
+// we have tested.  E.g., readrandom speeds up by ~5% over the g++
+// 4.4.3's builtin hashtable.
+class LRUHandleTable {
+ public:
+  LRUHandleTable();
+  ~LRUHandleTable();
+
+  LRUHandle* Lookup(const Slice& key, uint32_t hash);
+  LRUHandle* Insert(LRUHandle* h);
+  LRUHandle* Remove(const Slice& key, uint32_t hash);
+
+  template <typename T>
+  void ApplyToAllCacheEntries(T func) {
+    for (uint32_t i = 0; i < length_; i++) {
+      LRUHandle* h = list_[i];
+      while (h != nullptr) {
+        auto n = h->next_hash;
+        assert(h->InCache());
+        func(h);
+        h = n;
+      }
+    }
+  }
+
+ private:
+  // Return a pointer to slot that points to a cache entry that
+  // matches key/hash.  If there is no such cache entry, return a
+  // pointer to the trailing slot in the corresponding linked list.
+  LRUHandle** FindPointer(const Slice& key, uint32_t hash);
+
+  void Resize();
+
+  // The table consists of an array of buckets where each bucket is
+  // a linked list of cache entries that hash into the bucket.
+  LRUHandle** list_;
+  uint32_t length_;
+  uint32_t elems_;
+};
+
+// A single shard of sharded cache.
+class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard {
+ public:
+  LRUCacheShard(size_t capacity, bool strict_capacity_limit,
+                double high_pri_pool_ratio, bool use_adaptive_mutex,
+                CacheMetadataChargePolicy metadata_charge_policy);
+  virtual ~LRUCacheShard() override = default;
+
+  // Separate from constructor so caller can easily make an array of LRUCache
+  // if current usage is more than new capacity, the function will attempt to
+  // free the needed space
+  virtual void SetCapacity(size_t capacity) override;
+
+  // Set the flag to reject insertion if cache if full.
+  virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override;
+
+  // Set percentage of capacity reserved for high-pri cache entries.
+  void SetHighPriorityPoolRatio(double high_pri_pool_ratio);
+
+  // Like Cache methods, but with an extra "hash" parameter.
+  virtual Status Insert(const Slice& key, uint32_t hash, void* value,
+                        size_t charge,
+                        void (*deleter)(const Slice& key, void* value),
+                        Cache::Handle** handle,
+                        Cache::Priority priority) override;
+  virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash) override;
+  virtual bool Ref(Cache::Handle* handle) override;
+  virtual bool Release(Cache::Handle* handle,
+                       bool force_erase = false) override;
+  virtual void Erase(const Slice& key, uint32_t hash) override;
+
+  // Although in some platforms the update of size_t is atomic, to make sure
+  // GetUsage() and GetPinnedUsage() work correctly under any platform, we'll
+  // protect them with mutex_.
+
+  virtual size_t GetUsage() const override;
+  virtual size_t GetPinnedUsage() const override;
+
+  virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+                                      bool thread_safe) override;
+
+  virtual void EraseUnRefEntries() override;
+
+  virtual std::string GetPrintableOptions() const override;
+
+  void TEST_GetLRUList(LRUHandle** lru, LRUHandle** lru_low_pri);
+
+  //  Retrieves number of elements in LRU, for unit test purpose only
+  //  not threadsafe
+  size_t TEST_GetLRUSize();
+
+  //  Retrives high pri pool ratio
+  double GetHighPriPoolRatio();
+
+ private:
+  void LRU_Remove(LRUHandle* e);
+  void LRU_Insert(LRUHandle* e);
+
+  // Overflow the last entry in high-pri pool to low-pri pool until size of
+  // high-pri pool is no larger than the size specify by high_pri_pool_pct.
+  void MaintainPoolSize();
+
+  // Free some space following strict LRU policy until enough space
+  // to hold (usage_ + charge) is freed or the lru list is empty
+  // This function is not thread safe - it needs to be executed while
+  // holding the mutex_
+  void EvictFromLRU(size_t charge, autovector<LRUHandle*>* deleted);
+
+  // Initialized before use.
+  size_t capacity_;
+
+  // Memory size for entries in high-pri pool.
+  size_t high_pri_pool_usage_;
+
+  // Whether to reject insertion if cache reaches its full capacity.
+  bool strict_capacity_limit_;
+
+  // Ratio of capacity reserved for high priority cache entries.
+  double high_pri_pool_ratio_;
+
+  // High-pri pool size, equals to capacity * high_pri_pool_ratio.
+  // Remember the value to avoid recomputing each time.
+  double high_pri_pool_capacity_;
+
+  // Dummy head of LRU list.
+  // lru.prev is newest entry, lru.next is oldest entry.
+  // LRU contains items which can be evicted, ie reference only by cache
+  LRUHandle lru_;
+
+  // Pointer to head of low-pri pool in LRU list.
+  LRUHandle* lru_low_pri_;
+
+  // ------------^^^^^^^^^^^^^-----------
+  // Not frequently modified data members
+  // ------------------------------------
+  //
+  // We separate data members that are updated frequently from the ones that
+  // are not frequently updated so that they don't share the same cache line
+  // which will lead into false cache sharing
+  //
+  // ------------------------------------
+  // Frequently modified data members
+  // ------------vvvvvvvvvvvvv-----------
+  LRUHandleTable table_;
+
+  // Memory size for entries residing in the cache
+  size_t usage_;
+
+  // Memory size for entries residing only in the LRU list
+  size_t lru_usage_;
+
+  // mutex_ protects the following state.
+  // We don't count mutex_ as the cache's internal state so semantically we
+  // don't mind mutex_ invoking the non-const actions.
+  mutable port::Mutex mutex_;
+};
+
+class LRUCache
+#ifdef NDEBUG
+    final
+#endif
+    : public ShardedCache {
+ public:
+  LRUCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit,
+           double high_pri_pool_ratio,
+           std::shared_ptr<MemoryAllocator> memory_allocator = nullptr,
+           bool use_adaptive_mutex = kDefaultToAdaptiveMutex,
+           CacheMetadataChargePolicy metadata_charge_policy =
+               kDontChargeCacheMetadata);
+  virtual ~LRUCache();
+  virtual const char* Name() const override { return "LRUCache"; }
+  virtual CacheShard* GetShard(int shard) override;
+  virtual const CacheShard* GetShard(int shard) const override;
+  virtual void* Value(Handle* handle) override;
+  virtual size_t GetCharge(Handle* handle) const override;
+  virtual uint32_t GetHash(Handle* handle) const override;
+  virtual void DisownData() override;
+
+  //  Retrieves number of elements in LRU, for unit test purpose only
+  size_t TEST_GetLRUSize();
+  //  Retrives high pri pool ratio
+  double GetHighPriPoolRatio();
+
+ private:
+  LRUCacheShard* shards_ = nullptr;
+  int num_shards_ = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/lru_cache_test.cc b/src/rocksdb/cache/lru_cache_test.cc
new file mode 100644
index 000000000..f777711f1
--- /dev/null
+++ b/src/rocksdb/cache/lru_cache_test.cc
@@ -0,0 +1,198 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "cache/lru_cache.h"
+
+#include <string>
+#include <vector>
+#include "port/port.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class LRUCacheTest : public testing::Test {
+ public:
+  LRUCacheTest() {}
+  ~LRUCacheTest() override { DeleteCache(); }
+
+  void DeleteCache() {
+    if (cache_ != nullptr) {
+      cache_->~LRUCacheShard();
+      port::cacheline_aligned_free(cache_);
+      cache_ = nullptr;
+    }
+  }
+
+  void NewCache(size_t capacity, double high_pri_pool_ratio = 0.0,
+                bool use_adaptive_mutex = kDefaultToAdaptiveMutex) {
+    DeleteCache();
+    cache_ = reinterpret_cast<LRUCacheShard*>(
+        port::cacheline_aligned_alloc(sizeof(LRUCacheShard)));
+    new (cache_) LRUCacheShard(capacity, false /*strict_capcity_limit*/,
+                               high_pri_pool_ratio, use_adaptive_mutex,
+                               kDontChargeCacheMetadata);
+  }
+
+  void Insert(const std::string& key,
+              Cache::Priority priority = Cache::Priority::LOW) {
+    cache_->Insert(key, 0 /*hash*/, nullptr /*value*/, 1 /*charge*/,
+                   nullptr /*deleter*/, nullptr /*handle*/, priority);
+  }
+
+  void Insert(char key, Cache::Priority priority = Cache::Priority::LOW) {
+    Insert(std::string(1, key), priority);
+  }
+
+  bool Lookup(const std::string& key) {
+    auto handle = cache_->Lookup(key, 0 /*hash*/);
+    if (handle) {
+      cache_->Release(handle);
+      return true;
+    }
+    return false;
+  }
+
+  bool Lookup(char key) { return Lookup(std::string(1, key)); }
+
+  void Erase(const std::string& key) { cache_->Erase(key, 0 /*hash*/); }
+
+  void ValidateLRUList(std::vector<std::string> keys,
+                       size_t num_high_pri_pool_keys = 0) {
+    LRUHandle* lru;
+    LRUHandle* lru_low_pri;
+    cache_->TEST_GetLRUList(&lru, &lru_low_pri);
+    LRUHandle* iter = lru;
+    bool in_high_pri_pool = false;
+    size_t high_pri_pool_keys = 0;
+    if (iter == lru_low_pri) {
+      in_high_pri_pool = true;
+    }
+    for (const auto& key : keys) {
+      iter = iter->next;
+      ASSERT_NE(lru, iter);
+      ASSERT_EQ(key, iter->key().ToString());
+      ASSERT_EQ(in_high_pri_pool, iter->InHighPriPool());
+      if (in_high_pri_pool) {
+        high_pri_pool_keys++;
+      }
+      if (iter == lru_low_pri) {
+        ASSERT_FALSE(in_high_pri_pool);
+        in_high_pri_pool = true;
+      }
+    }
+    ASSERT_EQ(lru, iter->next);
+    ASSERT_TRUE(in_high_pri_pool);
+    ASSERT_EQ(num_high_pri_pool_keys, high_pri_pool_keys);
+  }
+
+ private:
+  LRUCacheShard* cache_ = nullptr;
+};
+
+TEST_F(LRUCacheTest, BasicLRU) {
+  NewCache(5);
+  for (char ch = 'a'; ch <= 'e'; ch++) {
+    Insert(ch);
+  }
+  ValidateLRUList({"a", "b", "c", "d", "e"});
+  for (char ch = 'x'; ch <= 'z'; ch++) {
+    Insert(ch);
+  }
+  ValidateLRUList({"d", "e", "x", "y", "z"});
+  ASSERT_FALSE(Lookup("b"));
+  ValidateLRUList({"d", "e", "x", "y", "z"});
+  ASSERT_TRUE(Lookup("e"));
+  ValidateLRUList({"d", "x", "y", "z", "e"});
+  ASSERT_TRUE(Lookup("z"));
+  ValidateLRUList({"d", "x", "y", "e", "z"});
+  Erase("x");
+  ValidateLRUList({"d", "y", "e", "z"});
+  ASSERT_TRUE(Lookup("d"));
+  ValidateLRUList({"y", "e", "z", "d"});
+  Insert("u");
+  ValidateLRUList({"y", "e", "z", "d", "u"});
+  Insert("v");
+  ValidateLRUList({"e", "z", "d", "u", "v"});
+}
+
+TEST_F(LRUCacheTest, MidpointInsertion) {
+  // Allocate 2 cache entries to high-pri pool.
+  NewCache(5, 0.45);
+
+  Insert("a", Cache::Priority::LOW);
+  Insert("b", Cache::Priority::LOW);
+  Insert("c", Cache::Priority::LOW);
+  Insert("x", Cache::Priority::HIGH);
+  Insert("y", Cache::Priority::HIGH);
+  ValidateLRUList({"a", "b", "c", "x", "y"}, 2);
+
+  // Low-pri entries inserted to the tail of low-pri list (the midpoint).
+  // After lookup, it will move to the tail of the full list.
+  Insert("d", Cache::Priority::LOW);
+  ValidateLRUList({"b", "c", "d", "x", "y"}, 2);
+  ASSERT_TRUE(Lookup("d"));
+  ValidateLRUList({"b", "c", "x", "y", "d"}, 2);
+
+  // High-pri entries will be inserted to the tail of full list.
+  Insert("z", Cache::Priority::HIGH);
+  ValidateLRUList({"c", "x", "y", "d", "z"}, 2);
+}
+
+TEST_F(LRUCacheTest, EntriesWithPriority) {
+  // Allocate 2 cache entries to high-pri pool.
+  NewCache(5, 0.45);
+
+  Insert("a", Cache::Priority::LOW);
+  Insert("b", Cache::Priority::LOW);
+  Insert("c", Cache::Priority::LOW);
+  ValidateLRUList({"a", "b", "c"}, 0);
+
+  // Low-pri entries can take high-pri pool capacity if available
+  Insert("u", Cache::Priority::LOW);
+  Insert("v", Cache::Priority::LOW);
+  ValidateLRUList({"a", "b", "c", "u", "v"}, 0);
+
+  Insert("X", Cache::Priority::HIGH);
+  Insert("Y", Cache::Priority::HIGH);
+  ValidateLRUList({"c", "u", "v", "X", "Y"}, 2);
+
+  // High-pri entries can overflow to low-pri pool.
+  Insert("Z", Cache::Priority::HIGH);
+  ValidateLRUList({"u", "v", "X", "Y", "Z"}, 2);
+
+  // Low-pri entries will be inserted to head of low-pri pool.
+  Insert("a", Cache::Priority::LOW);
+  ValidateLRUList({"v", "X", "a", "Y", "Z"}, 2);
+
+  // Low-pri entries will be inserted to head of high-pri pool after lookup.
+  ASSERT_TRUE(Lookup("v"));
+  ValidateLRUList({"X", "a", "Y", "Z", "v"}, 2);
+
+  // High-pri entries will be inserted to the head of the list after lookup.
+  ASSERT_TRUE(Lookup("X"));
+  ValidateLRUList({"a", "Y", "Z", "v", "X"}, 2);
+  ASSERT_TRUE(Lookup("Z"));
+  ValidateLRUList({"a", "Y", "v", "X", "Z"}, 2);
+
+  Erase("Y");
+  ValidateLRUList({"a", "v", "X", "Z"}, 2);
+  Erase("X");
+  ValidateLRUList({"a", "v", "Z"}, 1);
+  Insert("d", Cache::Priority::LOW);
+  Insert("e", Cache::Priority::LOW);
+  ValidateLRUList({"a", "v", "d", "e", "Z"}, 1);
+  Insert("f", Cache::Priority::LOW);
+  Insert("g", Cache::Priority::LOW);
+  ValidateLRUList({"d", "e", "f", "g", "Z"}, 1);
+  ASSERT_TRUE(Lookup("d"));
+  ValidateLRUList({"e", "f", "g", "Z", "d"}, 2);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/cache/sharded_cache.cc b/src/rocksdb/cache/sharded_cache.cc
new file mode 100644
index 000000000..6c915df8c
--- /dev/null
+++ b/src/rocksdb/cache/sharded_cache.cc
@@ -0,0 +1,162 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "cache/sharded_cache.h"
+
+#include <string>
+
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+ShardedCache::ShardedCache(size_t capacity, int num_shard_bits,
+                           bool strict_capacity_limit,
+                           std::shared_ptr<MemoryAllocator> allocator)
+    : Cache(std::move(allocator)),
+      num_shard_bits_(num_shard_bits),
+      capacity_(capacity),
+      strict_capacity_limit_(strict_capacity_limit),
+      last_id_(1) {}
+
+void ShardedCache::SetCapacity(size_t capacity) {
+  int num_shards = 1 << num_shard_bits_;
+  const size_t per_shard = (capacity + (num_shards - 1)) / num_shards;
+  MutexLock l(&capacity_mutex_);
+  for (int s = 0; s < num_shards; s++) {
+    GetShard(s)->SetCapacity(per_shard);
+  }
+  capacity_ = capacity;
+}
+
+void ShardedCache::SetStrictCapacityLimit(bool strict_capacity_limit) {
+  int num_shards = 1 << num_shard_bits_;
+  MutexLock l(&capacity_mutex_);
+  for (int s = 0; s < num_shards; s++) {
+    GetShard(s)->SetStrictCapacityLimit(strict_capacity_limit);
+  }
+  strict_capacity_limit_ = strict_capacity_limit;
+}
+
+Status ShardedCache::Insert(const Slice& key, void* value, size_t charge,
+                            void (*deleter)(const Slice& key, void* value),
+                            Handle** handle, Priority priority) {
+  uint32_t hash = HashSlice(key);
+  return GetShard(Shard(hash))
+      ->Insert(key, hash, value, charge, deleter, handle, priority);
+}
+
+Cache::Handle* ShardedCache::Lookup(const Slice& key, Statistics* /*stats*/) {
+  uint32_t hash = HashSlice(key);
+  return GetShard(Shard(hash))->Lookup(key, hash);
+}
+
+bool ShardedCache::Ref(Handle* handle) {
+  uint32_t hash = GetHash(handle);
+  return GetShard(Shard(hash))->Ref(handle);
+}
+
+bool ShardedCache::Release(Handle* handle, bool force_erase) {
+  uint32_t hash = GetHash(handle);
+  return GetShard(Shard(hash))->Release(handle, force_erase);
+}
+
+void ShardedCache::Erase(const Slice& key) {
+  uint32_t hash = HashSlice(key);
+  GetShard(Shard(hash))->Erase(key, hash);
+}
+
+uint64_t ShardedCache::NewId() {
+  return last_id_.fetch_add(1, std::memory_order_relaxed);
+}
+
+size_t ShardedCache::GetCapacity() const {
+  MutexLock l(&capacity_mutex_);
+  return capacity_;
+}
+
+bool ShardedCache::HasStrictCapacityLimit() const {
+  MutexLock l(&capacity_mutex_);
+  return strict_capacity_limit_;
+}
+
+size_t ShardedCache::GetUsage() const {
+  // We will not lock the cache when getting the usage from shards.
+  int num_shards = 1 << num_shard_bits_;
+  size_t usage = 0;
+  for (int s = 0; s < num_shards; s++) {
+    usage += GetShard(s)->GetUsage();
+  }
+  return usage;
+}
+
+size_t ShardedCache::GetUsage(Handle* handle) const {
+  return GetCharge(handle);
+}
+
+size_t ShardedCache::GetPinnedUsage() const {
+  // We will not lock the cache when getting the usage from shards.
+  int num_shards = 1 << num_shard_bits_;
+  size_t usage = 0;
+  for (int s = 0; s < num_shards; s++) {
+    usage += GetShard(s)->GetPinnedUsage();
+  }
+  return usage;
+}
+
+void ShardedCache::ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+                                          bool thread_safe) {
+  int num_shards = 1 << num_shard_bits_;
+  for (int s = 0; s < num_shards; s++) {
+    GetShard(s)->ApplyToAllCacheEntries(callback, thread_safe);
+  }
+}
+
+void ShardedCache::EraseUnRefEntries() {
+  int num_shards = 1 << num_shard_bits_;
+  for (int s = 0; s < num_shards; s++) {
+    GetShard(s)->EraseUnRefEntries();
+  }
+}
+
+std::string ShardedCache::GetPrintableOptions() const {
+  std::string ret;
+  ret.reserve(20000);
+  const int kBufferSize = 200;
+  char buffer[kBufferSize];
+  {
+    MutexLock l(&capacity_mutex_);
+    snprintf(buffer, kBufferSize, "    capacity : %" ROCKSDB_PRIszt "\n",
+             capacity_);
+    ret.append(buffer);
+    snprintf(buffer, kBufferSize, "    num_shard_bits : %d\n", num_shard_bits_);
+    ret.append(buffer);
+    snprintf(buffer, kBufferSize, "    strict_capacity_limit : %d\n",
+             strict_capacity_limit_);
+    ret.append(buffer);
+  }
+  snprintf(buffer, kBufferSize, "    memory_allocator : %s\n",
+           memory_allocator() ? memory_allocator()->Name() : "None");
+  ret.append(buffer);
+  ret.append(GetShard(0)->GetPrintableOptions());
+  return ret;
+}
+int GetDefaultCacheShardBits(size_t capacity) {
+  int num_shard_bits = 0;
+  size_t min_shard_size = 512L * 1024L;  // Every shard is at least 512KB.
+  size_t num_shards = capacity / min_shard_size;
+  while (num_shards >>= 1) {
+    if (++num_shard_bits >= 6) {
+      // No more than 6.
+      return num_shard_bits;
+    }
+  }
+  return num_shard_bits;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/sharded_cache.h b/src/rocksdb/cache/sharded_cache.h
new file mode 100644
index 000000000..ce9e459dc
--- /dev/null
+++ b/src/rocksdb/cache/sharded_cache.h
@@ -0,0 +1,111 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <atomic>
+#include <string>
+
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Single cache shard interface.
+class CacheShard {
+ public:
+  CacheShard() = default;
+  virtual ~CacheShard() = default;
+
+  virtual Status Insert(const Slice& key, uint32_t hash, void* value,
+                        size_t charge,
+                        void (*deleter)(const Slice& key, void* value),
+                        Cache::Handle** handle, Cache::Priority priority) = 0;
+  virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash) = 0;
+  virtual bool Ref(Cache::Handle* handle) = 0;
+  virtual bool Release(Cache::Handle* handle, bool force_erase = false) = 0;
+  virtual void Erase(const Slice& key, uint32_t hash) = 0;
+  virtual void SetCapacity(size_t capacity) = 0;
+  virtual void SetStrictCapacityLimit(bool strict_capacity_limit) = 0;
+  virtual size_t GetUsage() const = 0;
+  virtual size_t GetPinnedUsage() const = 0;
+  virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+                                      bool thread_safe) = 0;
+  virtual void EraseUnRefEntries() = 0;
+  virtual std::string GetPrintableOptions() const { return ""; }
+  void set_metadata_charge_policy(
+      CacheMetadataChargePolicy metadata_charge_policy) {
+    metadata_charge_policy_ = metadata_charge_policy;
+  }
+
+ protected:
+  CacheMetadataChargePolicy metadata_charge_policy_ = kDontChargeCacheMetadata;
+};
+
+// Generic cache interface which shards cache by hash of keys. 2^num_shard_bits
+// shards will be created, with capacity split evenly to each of the shards.
+// Keys are sharded by the highest num_shard_bits bits of hash value.
+class ShardedCache : public Cache {
+ public:
+  ShardedCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit,
+               std::shared_ptr<MemoryAllocator> memory_allocator = nullptr);
+  virtual ~ShardedCache() = default;
+  virtual const char* Name() const override = 0;
+  virtual CacheShard* GetShard(int shard) = 0;
+  virtual const CacheShard* GetShard(int shard) const = 0;
+  virtual void* Value(Handle* handle) override = 0;
+  virtual size_t GetCharge(Handle* handle) const override = 0;
+
+  virtual uint32_t GetHash(Handle* handle) const = 0;
+  virtual void DisownData() override = 0;
+
+  virtual void SetCapacity(size_t capacity) override;
+  virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override;
+
+  virtual Status Insert(const Slice& key, void* value, size_t charge,
+                        void (*deleter)(const Slice& key, void* value),
+                        Handle** handle, Priority priority) override;
+  virtual Handle* Lookup(const Slice& key, Statistics* stats) override;
+  virtual bool Ref(Handle* handle) override;
+  virtual bool Release(Handle* handle, bool force_erase = false) override;
+  virtual void Erase(const Slice& key) override;
+  virtual uint64_t NewId() override;
+  virtual size_t GetCapacity() const override;
+  virtual bool HasStrictCapacityLimit() const override;
+  virtual size_t GetUsage() const override;
+  virtual size_t GetUsage(Handle* handle) const override;
+  virtual size_t GetPinnedUsage() const override;
+  virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+                                      bool thread_safe) override;
+  virtual void EraseUnRefEntries() override;
+  virtual std::string GetPrintableOptions() const override;
+
+  int GetNumShardBits() const { return num_shard_bits_; }
+
+ private:
+  static inline uint32_t HashSlice(const Slice& s) {
+    return static_cast<uint32_t>(GetSliceNPHash64(s));
+  }
+
+  uint32_t Shard(uint32_t hash) {
+    // Note, hash >> 32 yields hash in gcc, not the zero we expect!
+    return (num_shard_bits_ > 0) ? (hash >> (32 - num_shard_bits_)) : 0;
+  }
+
+  int num_shard_bits_;
+  mutable port::Mutex capacity_mutex_;
+  size_t capacity_;
+  bool strict_capacity_limit_;
+  std::atomic<uint64_t> last_id_;
+};
+
+extern int GetDefaultCacheShardBits(size_t capacity);
+
+}  // namespace ROCKSDB_NAMESPACE
-- 
cgit v1.2.3