summaryrefslogtreecommitdiffstats
path: root/src/rocksdb/cache
diff options
context:
space:
mode:
Diffstat (limited to 'src/rocksdb/cache')
-rw-r--r--src/rocksdb/cache/cache.cc129
-rw-r--r--src/rocksdb/cache/cache_bench.cc20
-rw-r--r--src/rocksdb/cache/cache_bench_tool.cc973
-rw-r--r--src/rocksdb/cache/cache_entry_roles.cc134
-rw-r--r--src/rocksdb/cache/cache_entry_roles.h103
-rw-r--r--src/rocksdb/cache/cache_entry_stats.h183
-rw-r--r--src/rocksdb/cache/cache_helpers.h145
-rw-r--r--src/rocksdb/cache/cache_key.cc364
-rw-r--r--src/rocksdb/cache/cache_key.h143
-rw-r--r--src/rocksdb/cache/cache_reservation_manager.cc185
-rw-r--r--src/rocksdb/cache/cache_reservation_manager.h316
-rw-r--r--src/rocksdb/cache/cache_reservation_manager_test.cc469
-rw-r--r--src/rocksdb/cache/cache_test.cc1037
-rw-r--r--src/rocksdb/cache/charged_cache.cc117
-rw-r--r--src/rocksdb/cache/charged_cache.h121
-rw-r--r--src/rocksdb/cache/clock_cache.cc1404
-rw-r--r--src/rocksdb/cache/clock_cache.h701
-rw-r--r--src/rocksdb/cache/compressed_secondary_cache.cc325
-rw-r--r--src/rocksdb/cache/compressed_secondary_cache.h139
-rw-r--r--src/rocksdb/cache/compressed_secondary_cache_test.cc1005
-rw-r--r--src/rocksdb/cache/lru_cache.cc921
-rw-r--r--src/rocksdb/cache/lru_cache.h546
-rw-r--r--src/rocksdb/cache/lru_cache_test.cc2624
-rw-r--r--src/rocksdb/cache/secondary_cache.cc32
-rw-r--r--src/rocksdb/cache/sharded_cache.cc100
-rw-r--r--src/rocksdb/cache/sharded_cache.h322
26 files changed, 12558 insertions, 0 deletions
diff --git a/src/rocksdb/cache/cache.cc b/src/rocksdb/cache/cache.cc
new file mode 100644
index 000000000..7d23fb757
--- /dev/null
+++ b/src/rocksdb/cache/cache.cc
@@ -0,0 +1,129 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/cache.h"
+
+#include "cache/lru_cache.h"
+#include "rocksdb/secondary_cache.h"
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+#ifndef ROCKSDB_LITE
+// Option-string schema for LRUCacheOptions: maps each accepted option name to
+// the offset of the matching LRUCacheOptions field and the type used to parse
+// its value. Cache::CreateFromString hands this table to
+// OptionTypeInfo::ParseStruct. Every entry uses normal verification and is
+// flagged mutable.
+static std::unordered_map<std::string, OptionTypeInfo>
+ lru_cache_options_type_info = {
+ {"capacity",
+ {offsetof(struct LRUCacheOptions, capacity), OptionType::kSizeT,
+ OptionVerificationType::kNormal, OptionTypeFlags::kMutable}},
+ {"num_shard_bits",
+ {offsetof(struct LRUCacheOptions, num_shard_bits), OptionType::kInt,
+ OptionVerificationType::kNormal, OptionTypeFlags::kMutable}},
+ {"strict_capacity_limit",
+ {offsetof(struct LRUCacheOptions, strict_capacity_limit),
+ OptionType::kBoolean, OptionVerificationType::kNormal,
+ OptionTypeFlags::kMutable}},
+ {"high_pri_pool_ratio",
+ {offsetof(struct LRUCacheOptions, high_pri_pool_ratio),
+ OptionType::kDouble, OptionVerificationType::kNormal,
+ OptionTypeFlags::kMutable}},
+ {"low_pri_pool_ratio",
+ {offsetof(struct LRUCacheOptions, low_pri_pool_ratio),
+ OptionType::kDouble, OptionVerificationType::kNormal,
+ OptionTypeFlags::kMutable}},
+};
+
+// Option-string schema for CompressedSecondaryCacheOptions: maps each accepted
+// option name to the offset of the matching field and the type used to parse
+// its value. SecondaryCache::CreateFromString hands this table to
+// OptionTypeInfo::ParseStruct for "compressed_secondary_cache://" URIs.
+static std::unordered_map<std::string, OptionTypeInfo>
+ comp_sec_cache_options_type_info = {
+ {"capacity",
+ {offsetof(struct CompressedSecondaryCacheOptions, capacity),
+ OptionType::kSizeT, OptionVerificationType::kNormal,
+ OptionTypeFlags::kMutable}},
+ {"num_shard_bits",
+ {offsetof(struct CompressedSecondaryCacheOptions, num_shard_bits),
+ OptionType::kInt, OptionVerificationType::kNormal,
+ OptionTypeFlags::kMutable}},
+ {"compression_type",
+ {offsetof(struct CompressedSecondaryCacheOptions, compression_type),
+ OptionType::kCompressionType, OptionVerificationType::kNormal,
+ OptionTypeFlags::kMutable}},
+ {"compress_format_version",
+ {offsetof(struct CompressedSecondaryCacheOptions,
+ compress_format_version),
+ OptionType::kUInt32T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kMutable}},
+ {"enable_custom_split_merge",
+ {offsetof(struct CompressedSecondaryCacheOptions,
+ enable_custom_split_merge),
+ OptionType::kBoolean, OptionVerificationType::kNormal,
+ OptionTypeFlags::kMutable}},
+};
+#endif // ROCKSDB_LITE
+
+// Creates a SecondaryCache from `value`.
+//
+// A value beginning with "compressed_secondary_cache://" is handled here: the
+// remainder is parsed as CompressedSecondaryCacheOptions (not supported in
+// LITE builds). Any other value is forwarded to LoadSharedObject.
+// `*result` is only replaced when the returned status is OK.
+Status SecondaryCache::CreateFromString(
+    const ConfigOptions& config_options, const std::string& value,
+    std::shared_ptr<SecondaryCache>* result) {
+  static const char kUriPrefix[] = "compressed_secondary_cache://";
+  const size_t kUriPrefixLen = sizeof(kUriPrefix) - 1;
+
+  // Not one of ours: let the generic object loader resolve it.
+  if (value.compare(0, kUriPrefixLen, kUriPrefix) != 0) {
+    return LoadSharedObject<SecondaryCache>(config_options, value, nullptr,
+                                            result);
+  }
+
+  // Everything after the scheme is the option string.
+  std::string args = value.substr(kUriPrefixLen);
+  Status status;
+  std::shared_ptr<SecondaryCache> created;
+
+#ifndef ROCKSDB_LITE
+  CompressedSecondaryCacheOptions parsed_opts;
+  status = OptionTypeInfo::ParseStruct(config_options, "",
+                                       &comp_sec_cache_options_type_info, "",
+                                       args, &parsed_opts);
+  if (status.ok()) {
+    created = NewCompressedSecondaryCache(parsed_opts);
+  }
+
+#else
+  (void)config_options;
+  status = Status::NotSupported(
+      "Cannot load compressed secondary cache in LITE mode ", args);
+#endif  //! ROCKSDB_LITE
+
+  if (!status.ok()) {
+    return status;
+  }
+  result->swap(created);
+  return status;
+}
+
+// Creates a Cache from `value`.
+//
+// A value without any '=' is taken to be a bare capacity and yields an LRU
+// cache of that size. Otherwise the value is parsed as LRUCacheOptions (not
+// supported in LITE builds). `*result` is only replaced when the returned
+// status is OK.
+Status Cache::CreateFromString(const ConfigOptions& config_options,
+                               const std::string& value,
+                               std::shared_ptr<Cache>* result) {
+  Status status;
+  std::shared_ptr<Cache> created;
+  const bool is_bare_capacity = (value.find('=') == std::string::npos);
+
+  if (is_bare_capacity) {
+    created = NewLRUCache(ParseSizeT(value));
+  } else {
+#ifndef ROCKSDB_LITE
+    LRUCacheOptions parsed_opts;
+    status = OptionTypeInfo::ParseStruct(config_options, "",
+                                         &lru_cache_options_type_info, "",
+                                         value, &parsed_opts);
+    if (status.ok()) {
+      created = NewLRUCache(parsed_opts);
+    }
+#else
+    (void)config_options;
+    status = Status::NotSupported("Cannot load cache in LITE mode ", value);
+#endif  //! ROCKSDB_LITE
+  }
+
+  if (!status.ok()) {
+    return status;
+  }
+  result->swap(created);
+  return status;
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/cache_bench.cc b/src/rocksdb/cache/cache_bench.cc
new file mode 100644
index 000000000..f836939a3
--- /dev/null
+++ b/src/rocksdb/cache/cache_bench.cc
@@ -0,0 +1,20 @@
+// Copyright (c) 2013-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#ifdef GFLAGS
+#include "rocksdb/cache_bench_tool.h"
+// With gflags available, hand the command line straight to the real tool.
+int main(int argc, char** argv) {
+  return ROCKSDB_NAMESPACE::cache_bench_tool(argc, argv);
+}
+#else
+#include <cstdio>
+// Without gflags the tool cannot parse its options: report that and fail.
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#endif  // GFLAGS
diff --git a/src/rocksdb/cache/cache_bench_tool.cc b/src/rocksdb/cache/cache_bench_tool.cc
new file mode 100644
index 000000000..73360f414
--- /dev/null
+++ b/src/rocksdb/cache/cache_bench_tool.cc
@@ -0,0 +1,973 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "cache_key.h"
+#ifdef GFLAGS
+#include <cinttypes>
+#include <cstddef>
+#include <cstdio>
+#include <limits>
+#include <memory>
+#include <set>
+#include <sstream>
+
+#include "db/db_impl/db_impl.h"
+#include "monitoring/histogram.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/secondary_cache.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/table_properties.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/cachable_entry.h"
+#include "util/coding.h"
+#include "util/distributed_mutex.h"
+#include "util/gflags_compat.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+// Binary size units used for flag defaults. GiB is declared 64-bit, but the
+// shift producing it is evaluated on the 32-bit MiB value (2^30 still fits in
+// 32 bits) and only then widened.
+static constexpr uint32_t KiB = uint32_t{1} << 10;
+static constexpr uint32_t MiB = KiB << 10;
+static constexpr uint64_t GiB = MiB << 10;
+
+DEFINE_uint32(threads, 16, "Number of concurrent threads to run.");
+DEFINE_uint64(cache_size, 1 * GiB,
+ "Number of bytes to use as a cache of uncompressed data.");
+DEFINE_uint32(num_shard_bits, 6, "shard_bits.");
+
+DEFINE_double(resident_ratio, 0.25,
+ "Ratio of keys fitting in cache to keyspace.");
+DEFINE_uint64(ops_per_thread, 2000000U, "Number of operations per thread.");
+DEFINE_uint32(value_bytes, 8 * KiB, "Size of each value added.");
+
+DEFINE_uint32(skew, 5, "Degree of skew in key selection");
+DEFINE_bool(populate_cache, true, "Populate cache before operations");
+
+DEFINE_uint32(lookup_insert_percent, 87,
+ "Ratio of lookup (+ insert on not found) to total workload "
+ "(expressed as a percentage)");
+DEFINE_uint32(insert_percent, 2,
+ "Ratio of insert to total workload (expressed as a percentage)");
+DEFINE_uint32(lookup_percent, 10,
+ "Ratio of lookup to total workload (expressed as a percentage)");
+DEFINE_uint32(erase_percent, 1,
+ "Ratio of erase to total workload (expressed as a percentage)");
+DEFINE_bool(gather_stats, false,
+ "Whether to periodically simulate gathering block cache stats, "
+ "using one more thread.");
+DEFINE_uint32(
+ gather_stats_sleep_ms, 1000,
+ "How many milliseconds to sleep between each gathering of stats.");
+
+DEFINE_uint32(gather_stats_entries_per_lock, 256,
+ "For Cache::ApplyToAllEntries");
+DEFINE_bool(skewed, false, "If true, skew the key access distribution");
+
+DEFINE_bool(lean, false,
+ "If true, no additional computation is performed besides cache "
+ "operations.");
+
+#ifndef ROCKSDB_LITE
+DEFINE_string(secondary_cache_uri, "",
+ "Full URI for creating a custom secondary cache object");
+static class std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary_cache;
+#endif // ROCKSDB_LITE
+
+DEFINE_string(cache_type, "lru_cache", "Type of block cache.");
+
+// ## BEGIN stress_cache_key sub-tool options ##
+// See class StressCacheKey below.
+DEFINE_bool(stress_cache_key, false,
+ "If true, run cache key stress test instead");
+DEFINE_uint32(
+ sck_files_per_day, 2500000,
+ "(-stress_cache_key) Simulated files generated per simulated day");
+// NOTE: Giving each run a specified lifetime, rather than e.g. "until
+// first collision" ensures equal skew from start-up, when collisions are
+// less likely.
+DEFINE_uint32(sck_days_per_run, 90,
+ "(-stress_cache_key) Number of days to simulate in each run");
+// NOTE: The number of observed collisions directly affects the relative
+// accuracy of the predicted probabilities. 15 observations should be well
+// within factor-of-2 accuracy.
+DEFINE_uint32(
+ sck_min_collision, 15,
+ "(-stress_cache_key) Keep running until this many collisions seen");
+// sck_file_size_mb can be thought of as average file size. The simulation is
+// not precise enough to care about the distribution of file sizes; other
+// simulations (https://github.com/pdillinger/unique_id/tree/main/monte_carlo)
+// indicate the distribution only makes a small difference (e.g. < 2x factor)
+DEFINE_uint32(
+ sck_file_size_mb, 32,
+ "(-stress_cache_key) Simulated file size in MiB, for accounting purposes");
+DEFINE_uint32(sck_reopen_nfiles, 100,
+ "(-stress_cache_key) Simulate DB re-open average every n files");
+DEFINE_uint32(sck_newdb_nreopen, 1000,
+ "(-stress_cache_key) Simulate new DB average every n re-opens");
+DEFINE_uint32(sck_restarts_per_day, 24,
+ "(-stress_cache_key) Average simulated process restarts per day "
+ "(across DBs)");
+DEFINE_uint32(
+ sck_db_count, 100,
+ "(-stress_cache_key) Parallel DBs in simulation sharing a block cache");
+DEFINE_uint32(
+ sck_table_bits, 20,
+ "(-stress_cache_key) Log2 number of tracked (live) files (across DBs)");
+// sck_keep_bits being well below full 128 bits amplifies the collision
+// probability so that the true probability can be estimated through observed
+// collisions. (More explanation below.)
+DEFINE_uint32(
+ sck_keep_bits, 50,
+ "(-stress_cache_key) Number of bits to keep from each cache key (<= 64)");
+// sck_randomize is used to validate whether cache key is performing "better
+// than random." Even with this setting, file offsets are not randomized.
+DEFINE_bool(sck_randomize, false,
+ "(-stress_cache_key) Randomize (hash) cache key");
+// See https://github.com/facebook/rocksdb/pull/9058
+DEFINE_bool(sck_footer_unique_id, false,
+ "(-stress_cache_key) Simulate using proposed footer unique id");
+// ## END stress_cache_key sub-tool options ##
+
+namespace ROCKSDB_NAMESPACE {
+
+class CacheBench;
+namespace {
+// State shared by all concurrent executions of the same benchmark.
+// Coordination state shared by all threads of one benchmark run: a mutex and
+// condition variable, counters of threads that have checked in / finished,
+// and the start flag. The counters are plain integers; the code in this file
+// reads and updates them with GetMutex() held.
+class SharedState {
+ public:
+  explicit SharedState(CacheBench* cache_bench)
+      : cv_(&mu_), cache_bench_(cache_bench) {}
+
+  ~SharedState() = default;
+
+  // Synchronization primitives and the owning benchmark.
+  port::Mutex* GetMutex() { return &mu_; }
+  port::CondVar* GetCondVar() { return &cv_; }
+  CacheBench* GetCacheBench() const { return cache_bench_; }
+
+  // Start-up rendezvous.
+  void IncInitialized() { ++num_initialized_; }
+  bool AllInitialized() const { return !(num_initialized_ < FLAGS_threads); }
+  void SetStart() { start_ = true; }
+  bool Started() const { return start_; }
+
+  // Completion rendezvous.
+  void IncDone() { ++num_done_; }
+  bool AllDone() const { return !(num_done_ < FLAGS_threads); }
+
+ private:
+  port::Mutex mu_;
+  port::CondVar cv_;  // bound to mu_, so it must be declared after it
+
+  uint64_t num_initialized_ = 0;
+  bool start_ = false;
+  uint64_t num_done_ = 0;
+
+  CacheBench* cache_bench_;
+};
+
+// Per-thread state for concurrent executions of the same benchmark.
+// State owned by a single benchmark thread.
+struct ThreadState {
+  // The random generator is seeded from the thread index (1000 + index), so
+  // each thread gets a different but reproducible sequence.
+  ThreadState(uint32_t index, SharedState* shared_state)
+      : tid(index), rnd(1000 + index), shared(shared_state) {}
+
+  uint32_t tid;                   // index of this thread
+  Random64 rnd;                   // per-thread random source
+  SharedState* shared;            // run-wide coordination state
+  HistogramImpl latency_ns_hist;  // per-operation latencies, nanoseconds
+  uint64_t duration_us = 0;       // time spent in OperateCache, microseconds
+};
+
+// Generates pseudo-random cache keys of kCacheKeySize bytes.
+//
+// key_data holds three copies of the 64-bit key number, each preceded by a
+// fixed marker byte. The returned Slice starts `key % 8` bytes into that
+// buffer, so keys vary in alignment. The Slice points into key_data and is
+// overwritten by the next call.
+struct KeyGen {
+  char key_data[27];
+
+  // Picks a key number below or around `max_key` and returns its encoded form.
+  // When FLAGS_skewed is unset, the key is the minimum of 1 + FLAGS_skew
+  // uniform draws mapped into [0, max_key); otherwise it comes from
+  // rnd.Skewed(max_log), folded back by max_key when it exceeds it.
+  Slice GetRand(Random64& rnd, uint64_t max_key, int max_log) {
+    uint64_t key = 0;
+    if (!FLAGS_skewed) {
+      uint64_t raw = rnd.Next();
+      // Skew according to setting
+      for (uint32_t i = 0; i < FLAGS_skew; ++i) {
+        raw = std::min(raw, rnd.Next());
+      }
+      key = FastRange64(raw, max_key);
+    } else {
+      key = rnd.Skewed(max_log);
+      if (key > max_key) {
+        key -= max_key;
+      }
+    }
+    // Variable size and alignment
+    size_t off = key % 8;
+    key_data[0] = char{42};
+    EncodeFixed64(key_data + 1, key);
+    key_data[9] = char{11};
+    EncodeFixed64(key_data + 10, key);
+    key_data[18] = char{4};
+    EncodeFixed64(key_data + 19, key);
+    // The slice starts up to 7 bytes into the buffer, so the bound has to
+    // include that offset, not just the key size.
+    assert(off + kCacheKeySize <= sizeof(key_data));
+    return Slice(&key_data[off], kCacheKeySize);
+  }
+};
+
+// Allocates a value of FLAGS_value_bytes bytes with new[] and fills it with
+// pseudo-random data (which also burns some CPU time, as a real workload
+// would). The caller owns the returned buffer; the deleters in this file
+// release it with delete[].
+char* createValue(Random64& rnd) {
+  char* rv = new char[FLAGS_value_bytes];
+  // Fill with some filler data, and take some CPU time
+  uint32_t i = 0;
+  for (; FLAGS_value_bytes - i >= 8; i += 8) {
+    EncodeFixed64(rv + i, rnd.Next());
+  }
+  // A size that is not a multiple of 8 leaves a short tail. Encode into a
+  // scratch word and copy only the bytes that fit, instead of writing a full
+  // 8 bytes past the end of the allocation.
+  if (i < FLAGS_value_bytes) {
+    char tail[8];
+    EncodeFixed64(tail, rnd.Next());
+    memcpy(rv + i, tail, FLAGS_value_bytes - i);
+  }
+  return rv;
+}
+
+// Callbacks for secondary cache
+
+// Size callback: every value in this benchmark is FLAGS_value_bytes long, so
+// the object itself is not inspected.
+size_t SizeFn(void* /*obj*/) { return FLAGS_value_bytes; }
+
+// Save callback: copies `size` bytes of the value `obj`, starting `offset`
+// bytes into it, to `out`. Always returns OK. The caller is responsible for
+// keeping offset + size within the value.
+Status SaveToFn(void* obj, size_t offset, size_t size, void* out) {
+  memcpy(out, static_cast<const char*>(obj) + offset, size);
+  return Status::OK();
+}
+
+// Different deleters to simulate using deleter to gather
+// stats on the code origin and kind of cache entries.
+// The three bodies are intentionally identical (each frees a char array made
+// by createValue); only their distinct function addresses matter, e.g. for
+// the "Unique deleters" count reported by StatsBody.
+void deleter1(const Slice& /*key*/, void* value) {
+ delete[] static_cast<char*>(value);
+}
+void deleter2(const Slice& /*key*/, void* value) {
+ delete[] static_cast<char*>(value);
+}
+void deleter3(const Slice& /*key*/, void* value) {
+ delete[] static_cast<char*>(value);
+}
+
+// One helper per deleter, all sharing the same size/save callbacks. In this
+// file helper1 is used by PopulateCache, helper2 by the lookup(+insert)
+// operations and helper3 by the plain insert operation.
+Cache::CacheItemHelper helper1(SizeFn, SaveToFn, deleter1);
+Cache::CacheItemHelper helper2(SizeFn, SaveToFn, deleter2);
+Cache::CacheItemHelper helper3(SizeFn, SaveToFn, deleter3);
+} // namespace
+
+class CacheBench {
+ static constexpr uint64_t kHundredthUint64 =
+ std::numeric_limits<uint64_t>::max() / 100U;
+
+ public:
+  // Builds the benchmark from the command-line flags.
+  //
+  // * max_key_ is the key-space size: cache_size / resident_ratio /
+  //   value_bytes.
+  // * The *_threshold_ members are cumulative cut points in the space of a
+  //   random uint64_t, one per operation kind; they must total 100%.
+  // * max_log_ (skewed mode only) is ceil(log2(max_key_)).
+  // * cache_ is created according to FLAGS_cache_type.
+  //
+  // Exits the process if the percentages do not add up, the cache type is
+  // unsupported, or the requested secondary cache cannot be created.
+  CacheBench()
+      : max_key_(static_cast<uint64_t>(FLAGS_cache_size / FLAGS_resident_ratio /
+                                       FLAGS_value_bytes)),
+        lookup_insert_threshold_(kHundredthUint64 *
+                                 FLAGS_lookup_insert_percent),
+        insert_threshold_(lookup_insert_threshold_ +
+                          kHundredthUint64 * FLAGS_insert_percent),
+        lookup_threshold_(insert_threshold_ +
+                          kHundredthUint64 * FLAGS_lookup_percent),
+        erase_threshold_(lookup_threshold_ +
+                         kHundredthUint64 * FLAGS_erase_percent),
+        skewed_(FLAGS_skewed) {
+    if (erase_threshold_ != 100U * kHundredthUint64) {
+      fprintf(stderr, "Percentages must add to 100.\n");
+      exit(1);
+    }
+
+    max_log_ = 0;
+    if (skewed_) {
+      // Count the halvings to get floor(log2(max_key_)) ...
+      uint64_t remaining = max_key_;
+      while (remaining >>= 1) max_log_++;
+      // ... then round up when max_key_ is not a power of two. The check must
+      // use max_key_ itself: `remaining` is always 0 once the loop has ended.
+      if (max_key_ > (static_cast<uint64_t>(1) << max_log_)) max_log_++;
+    }
+
+    if (FLAGS_cache_type == "clock_cache") {
+      fprintf(stderr, "Old clock cache implementation has been removed.\n");
+      exit(1);
+    } else if (FLAGS_cache_type == "hyper_clock_cache") {
+      cache_ = HyperClockCacheOptions(FLAGS_cache_size, FLAGS_value_bytes,
+                                      FLAGS_num_shard_bits)
+                   .MakeSharedCache();
+    } else if (FLAGS_cache_type == "lru_cache") {
+      LRUCacheOptions opts(FLAGS_cache_size, FLAGS_num_shard_bits,
+                           false /* strict_capacity_limit */,
+                           0.5 /* high_pri_pool_ratio */);
+#ifndef ROCKSDB_LITE
+      if (!FLAGS_secondary_cache_uri.empty()) {
+        Status s = SecondaryCache::CreateFromString(
+            ConfigOptions(), FLAGS_secondary_cache_uri, &secondary_cache);
+        if (secondary_cache == nullptr) {
+          fprintf(
+              stderr,
+              "No secondary cache registered matching string: %s status=%s\n",
+              FLAGS_secondary_cache_uri.c_str(), s.ToString().c_str());
+          exit(1);
+        }
+        opts.secondary_cache = secondary_cache;
+      }
+#endif  // ROCKSDB_LITE
+
+      cache_ = NewLRUCache(opts);
+    } else {
+      // Newline-terminated like the other diagnostics above.
+      fprintf(stderr, "Cache type not supported.\n");
+      exit(1);
+    }
+  }
+
+  ~CacheBench() {}
+
+  // Pre-fills the cache by inserting random entries (via helper1) until the
+  // inserted charge reaches twice FLAGS_cache_size.
+  void PopulateCache() {
+    Random64 rnd(1);
+    KeyGen keygen;
+    const uint64_t target_bytes = 2 * FLAGS_cache_size;
+    uint64_t inserted_bytes = 0;
+    while (inserted_bytes < target_bytes) {
+      // NOTE(review): both arguments draw from `rnd`; the call is kept as a
+      // single expression so their relative evaluation order is unchanged.
+      Status s = cache_->Insert(keygen.GetRand(rnd, max_key_, max_log_),
+                                createValue(rnd), &helper1, FLAGS_value_bytes);
+      assert(s.ok());
+      inserted_bytes += FLAGS_value_bytes;
+    }
+  }
+
+ // Runs the benchmark and prints the results to stdout.
+ //
+ // Starts FLAGS_threads detached worker threads (ThreadBody) plus one joined
+ // stats thread (StatsBody, which returns immediately unless
+ // FLAGS_gather_stats is set), releases the workers together once all have
+ // checked in, waits for all of them to finish, then reports throughput and
+ // latency. Always returns true.
+ bool Run() {
+ const auto clock = SystemClock::Default().get();
+
+ PrintEnv();
+ SharedState shared(this);
+ std::vector<std::unique_ptr<ThreadState> > threads(FLAGS_threads);
+ for (uint32_t i = 0; i < FLAGS_threads; i++) {
+ threads[i].reset(new ThreadState(i, &shared));
+ // Workers are detached; completion is tracked through `shared` instead of
+ // join().
+ std::thread(ThreadBody, threads[i].get()).detach();
+ }
+
+ HistogramImpl stats_hist;
+ std::string stats_report;
+ std::thread stats_thread(StatsBody, &shared, &stats_hist, &stats_report);
+
+ uint64_t start_time;
+ {
+ MutexLock l(shared.GetMutex());
+ // Wait until every worker has checked in, so they all start together.
+ while (!shared.AllInitialized()) {
+ shared.GetCondVar()->Wait();
+ }
+ // Record start time
+ start_time = clock->NowMicros();
+
+ // Start all threads
+ shared.SetStart();
+ shared.GetCondVar()->SignalAll();
+
+ // Wait threads to complete
+ while (!shared.AllDone()) {
+ shared.GetCondVar()->Wait();
+ }
+ }
+
+ // Stats gathering is considered background work. This time measurement
+ // is for foreground work, and not really ideal for that. See below.
+ uint64_t end_time = clock->NowMicros();
+ stats_thread.join();
+
+ // Wall clock time - includes idle time if threads
+ // finish at different times (not ideal).
+ double elapsed_secs = static_cast<double>(end_time - start_time) * 1e-6;
+ uint32_t ops_per_sec = static_cast<uint32_t>(
+ 1.0 * FLAGS_threads * FLAGS_ops_per_thread / elapsed_secs);
+ printf("Complete in %.3f s; Rough parallel ops/sec = %u\n", elapsed_secs,
+ ops_per_sec);
+
+ // Total time in each thread (more accurate throughput measure)
+ // elapsed_secs is reused here as the sum of the per-thread durations.
+ elapsed_secs = 0;
+ for (uint32_t i = 0; i < FLAGS_threads; i++) {
+ elapsed_secs += threads[i]->duration_us * 1e-6;
+ }
+ ops_per_sec = static_cast<uint32_t>(1.0 * FLAGS_threads *
+ FLAGS_ops_per_thread / elapsed_secs);
+ printf("Thread ops/sec = %u\n", ops_per_sec);
+
+ printf("\nOperation latency (ns):\n");
+ // Merge the per-thread latency histograms into one for reporting.
+ HistogramImpl combined;
+ for (uint32_t i = 0; i < FLAGS_threads; i++) {
+ combined.Merge(threads[i]->latency_ns_hist);
+ }
+ printf("%s", combined.ToString().c_str());
+
+ if (FLAGS_gather_stats) {
+ printf("\nGather stats latency (us):\n");
+ printf("%s", stats_hist.ToString().c_str());
+ }
+
+ // stats_report stays empty when stats gathering is disabled.
+ printf("\n%s", stats_report.c_str());
+
+ return true;
+ }
+
+ private:
+ std::shared_ptr<Cache> cache_;
+ const uint64_t max_key_;
+ // Cumulative thresholds in the space of a random uint64_t
+ const uint64_t lookup_insert_threshold_;
+ const uint64_t insert_threshold_;
+ const uint64_t lookup_threshold_;
+ const uint64_t erase_threshold_;
+ const bool skewed_;
+ int max_log_;
+
+  // A benchmark version of gathering stats on an active block cache by
+  // iterating over it. The primary purpose is to measure the impact of
+  // gathering stats with ApplyToAllEntries on throughput- and
+  // latency-sensitive Cache users. Performance of stats gathering is
+  // also reported. The last set of gathered stats is also reported, for
+  // manual sanity checking for logical errors or other unexpected
+  // behavior of cache_bench or the underlying Cache.
+  //
+  // Does nothing unless FLAGS_gather_stats is set. Otherwise loops: sleep up
+  // to FLAGS_gather_stats_sleep_ms (waking early once all workers are done),
+  // then walk the cache and record how long the walk took, in microseconds,
+  // in `stats_hist`. When all workers are done, a summary of the most recent
+  // walk is written to `stats_report` and the function returns.
+  static void StatsBody(SharedState* shared, HistogramImpl* stats_hist,
+                        std::string* stats_report) {
+    if (!FLAGS_gather_stats) {
+      return;
+    }
+    const auto clock = SystemClock::Default().get();
+    uint64_t total_key_size = 0;
+    uint64_t total_charge = 0;
+    uint64_t total_entry_count = 0;
+    uint64_t table_occupancy = 0;
+    uint64_t table_size = 0;
+    std::set<Cache::DeleterFn> deleters;
+    StopWatchNano timer(clock);
+
+    for (;;) {
+      uint64_t time;
+      time = clock->NowMicros();
+      uint64_t deadline = time + uint64_t{FLAGS_gather_stats_sleep_ms} * 1000;
+
+      {
+        MutexLock l(shared->GetMutex());
+        for (;;) {
+          if (shared->AllDone()) {
+            // The run can finish before the first walk (or the cache can be
+            // empty), leaving the denominators at zero. Report 0 in that case
+            // rather than dividing by zero and converting the non-finite
+            // result to an integer.
+            const double occupancy_pct =
+                table_size == 0 ? 0.0 : 100.0 * table_occupancy / table_size;
+            const double avg_key_size =
+                total_entry_count == 0
+                    ? 0.0
+                    : 1.0 * total_key_size / total_entry_count;
+            const uint64_t avg_charge =
+                total_entry_count == 0
+                    ? 0
+                    : static_cast<uint64_t>(1.0 * total_charge /
+                                            total_entry_count);
+            std::ostringstream ostr;
+            ostr << "Most recent cache entry stats:\n"
+                 << "Number of entries: " << total_entry_count << "\n"
+                 << "Table occupancy: " << table_occupancy << " / "
+                 << table_size << " = " << occupancy_pct << "%\n"
+                 << "Total charge: " << BytesToHumanString(total_charge) << "\n"
+                 << "Average key size: " << avg_key_size << "\n"
+                 << "Average charge: " << BytesToHumanString(avg_charge)
+                 << "\n"
+                 << "Unique deleters: " << deleters.size() << "\n";
+            *stats_report = ostr.str();
+            return;
+          }
+          if (clock->NowMicros() >= deadline) {
+            break;
+          }
+          // Sleep for the remaining time; the condvar wakes us early when the
+          // last worker signals completion.
+          uint64_t diff = deadline - std::min(clock->NowMicros(), deadline);
+          shared->GetCondVar()->TimedWait(diff + 1);
+        }
+      }
+
+      // Now gather stats, outside of mutex
+      total_key_size = 0;
+      total_charge = 0;
+      total_entry_count = 0;
+      deleters.clear();
+      auto fn = [&](const Slice& key, void* /*value*/, size_t charge,
+                    Cache::DeleterFn deleter) {
+        total_key_size += key.size();
+        total_charge += charge;
+        ++total_entry_count;
+        // Something slightly more expensive as in (future) stats by category
+        deleters.insert(deleter);
+      };
+      timer.Start();
+      Cache::ApplyToAllEntriesOptions opts;
+      opts.average_entries_per_lock = FLAGS_gather_stats_entries_per_lock;
+      shared->GetCacheBench()->cache_->ApplyToAllEntries(fn, opts);
+      table_occupancy = shared->GetCacheBench()->cache_->GetOccupancyCount();
+      table_size = shared->GetCacheBench()->cache_->GetTableAddressCount();
+      stats_hist->Add(timer.ElapsedNanos() / 1000);
+    }
+  }
+
+  // Entry point of each worker thread: check in, wait for the start signal,
+  // run the workload, then report completion.
+  static void ThreadBody(ThreadState* thread) {
+    SharedState* const state = thread->shared;
+
+    {
+      MutexLock guard(state->GetMutex());
+      state->IncInitialized();
+      // The last thread to check in wakes whoever waits for AllInitialized().
+      if (state->AllInitialized()) {
+        state->GetCondVar()->SignalAll();
+      }
+      // Block until the start flag is raised.
+      while (!state->Started()) {
+        state->GetCondVar()->Wait();
+      }
+    }
+
+    // The workload itself runs without the shared mutex held.
+    state->GetCacheBench()->OperateCache(thread);
+
+    {
+      MutexLock guard(state->GetMutex());
+      state->IncDone();
+      // The last thread to finish wakes whoever waits for AllDone().
+      if (state->AllDone()) {
+        state->GetCondVar()->SignalAll();
+      }
+    }
+  }
+
+ // Workload of one worker thread: performs FLAGS_ops_per_thread operations,
+ // each chosen by comparing a random 64-bit value against the cumulative
+ // thresholds (lookup-or-insert, insert, lookup, erase). The latency of each
+ // operation is added to thread->latency_ns_hist and the total time spent is
+ // stored in thread->duration_us.
+ void OperateCache(ThreadState* thread) {
+ // To use looked-up values
+ uint64_t result = 0;
+ // To hold handles for a non-trivial amount of time
+ // (a handle obtained by one operation is kept until the next lookup or
+ // insert operation releases it; erase operations leave it alone)
+ Cache::Handle* handle = nullptr;
+ KeyGen gen;
+ const auto clock = SystemClock::Default().get();
+ uint64_t start_time = clock->NowMicros();
+ StopWatchNano timer(clock);
+
+ for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) {
+ Slice key = gen.GetRand(thread->rnd, max_key_, max_log_);
+ uint64_t random_op = thread->rnd.Next();
+ // Passed to Lookup below: builds a cache object by copying `size` bytes
+ // from `buf` into a new char array and charging `size`.
+ Cache::CreateCallback create_cb = [](const void* buf, size_t size,
+ void** out_obj,
+ size_t* charge) -> Status {
+ *out_obj = reinterpret_cast<void*>(new char[size]);
+ memcpy(*out_obj, buf, size);
+ *charge = size;
+ return Status::OK();
+ };
+
+ // Key generation and callback setup above are not part of the measured
+ // latency.
+ timer.Start();
+
+ if (random_op < lookup_insert_threshold_) {
+ if (handle) {
+ cache_->Release(handle);
+ handle = nullptr;
+ }
+ // do lookup
+ // NOTE(review): the trailing `true` is presumably the `wait` argument of
+ // Cache::Lookup -- confirm against rocksdb/cache.h.
+ handle = cache_->Lookup(key, &helper2, create_cb, Cache::Priority::LOW,
+ true);
+ if (handle) {
+ if (!FLAGS_lean) {
+ // do something with the data
+ result += NPHash64(static_cast<char*>(cache_->Value(handle)),
+ FLAGS_value_bytes);
+ }
+ } else {
+ // do insert
+ Status s = cache_->Insert(key, createValue(thread->rnd), &helper2,
+ FLAGS_value_bytes, &handle);
+ assert(s.ok());
+ }
+ } else if (random_op < insert_threshold_) {
+ if (handle) {
+ cache_->Release(handle);
+ handle = nullptr;
+ }
+ // do insert
+ Status s = cache_->Insert(key, createValue(thread->rnd), &helper3,
+ FLAGS_value_bytes, &handle);
+ assert(s.ok());
+ } else if (random_op < lookup_threshold_) {
+ if (handle) {
+ cache_->Release(handle);
+ handle = nullptr;
+ }
+ // do lookup
+ handle = cache_->Lookup(key, &helper2, create_cb, Cache::Priority::LOW,
+ true);
+ if (handle) {
+ if (!FLAGS_lean) {
+ // do something with the data
+ result += NPHash64(static_cast<char*>(cache_->Value(handle)),
+ FLAGS_value_bytes);
+ }
+ }
+ } else if (random_op < erase_threshold_) {
+ // do erase
+ cache_->Erase(key);
+ } else {
+ // Should be extremely unlikely (noop)
+ // (only values at or above 100 * kHundredthUint64 land here)
+ assert(random_op >= kHundredthUint64 * 100U);
+ }
+ thread->latency_ns_hist.Add(timer.ElapsedNanos());
+ }
+ // Drop the handle still held from the last operation, if any.
+ if (handle) {
+ cache_->Release(handle);
+ handle = nullptr;
+ }
+ // Ensure computations on `result` are not optimized away.
+ if (result == 1) {
+ printf("You are extremely unlucky(2). Try again.\n");
+ exit(1);
+ }
+ thread->duration_us = clock->NowMicros() - start_time;
+ }
+
+  // Prints build warnings (unoptimized build, assertions enabled) followed by
+  // the benchmark configuration, one setting per line, to stdout.
+  void PrintEnv() const {
+    // Describe the stats-gathering setting up front; it is printed last.
+    std::ostringstream gather_desc;
+    if (!FLAGS_gather_stats) {
+      gather_desc << "disabled";
+    } else {
+      gather_desc << "enabled (" << FLAGS_gather_stats_sleep_ms << "ms, "
+                  << FLAGS_gather_stats_entries_per_lock << "/lock)";
+    }
+    const int populate_flag = FLAGS_populate_cache ? 1 : 0;
+
+#if defined(__GNUC__) && !defined(__OPTIMIZE__)
+    printf(
+        "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n");
+#endif
+#ifndef NDEBUG
+    printf("WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
+#endif
+    printf("RocksDB version : %d.%d\n", kMajorVersion, kMinorVersion);
+    printf("DMutex impl name : %s\n", DMutex::kName());
+    printf("Number of threads : %u\n", FLAGS_threads);
+    printf("Ops per thread : %" PRIu64 "\n", FLAGS_ops_per_thread);
+    printf("Cache size : %s\n",
+           BytesToHumanString(FLAGS_cache_size).c_str());
+    printf("Num shard bits : %u\n", FLAGS_num_shard_bits);
+    printf("Max key : %" PRIu64 "\n", max_key_);
+    printf("Resident ratio : %g\n", FLAGS_resident_ratio);
+    printf("Skew degree : %u\n", FLAGS_skew);
+    printf("Populate cache : %d\n", populate_flag);
+    printf("Lookup+Insert pct : %u%%\n", FLAGS_lookup_insert_percent);
+    printf("Insert percentage : %u%%\n", FLAGS_insert_percent);
+    printf("Lookup percentage : %u%%\n", FLAGS_lookup_percent);
+    printf("Erase percentage : %u%%\n", FLAGS_erase_percent);
+    printf("Gather stats : %s\n", gather_desc.str().c_str());
+    printf("----------------------------\n");
+  }
+};
+
+// cache_bench -stress_cache_key is an independent embedded tool for
+// estimating the probability of CacheKey collisions through simulation.
+// At a high level, it simulates generating SST files over many months,
+// keeping them in the DB and/or cache for some lifetime while staying
+// under resource caps, and checking for any cache key collisions that
+// arise among the set of live files. For efficient simulation, we make
+// some simplifying "pessimistic" assumptions (that only increase the
+// chance of the simulation reporting a collision relative to the chance
+// of collision in practice):
+// * Every generated file has a cache entry for every byte offset in the
+// file (contiguous range of cache keys)
+// * All of every file is cached for its entire lifetime. (Here "lifetime"
+// is technically the union of DB and Cache lifetime, though we only
+// model a generous DB lifetime, where space usage is always maximized.
+// In an effective Cache, lifetime in cache can only substantially exceed
+// lifetime in DB if there is little cache activity; cache activity is
+// required to hit cache key collisions.)
+//
+// It would be possible to track an exact set of cache key ranges for the
+// set of live files, but we would have no hope of observing collisions
+// (overlap in live files) in our simulation. We need to employ some way
+// of amplifying collision probability that allows us to predict the real
+// collision probability by extrapolation from observed collisions. Our
+// basic approach is to reduce each cache key range down to some smaller
+// number of bits, limited to bits that are shared over the whole
+// range. Now we can observe collisions using a set of smaller stripped-down
+// (reduced) cache keys. Let's do some case analysis to understand why this
+// works:
+// * No collision in reduced key - because the reduction is a pure function
+// this implies no collision in the full keys
+// * Collision detected between two reduced keys - either
+// * The reduction has dropped some structured uniqueness info (from one of
+// session counter or file number; file offsets are never materialized here).
+// This can only artificially inflate the observed and extrapolated collision
+// probabilities. We only have to worry about this in designing the reduction.
+// * The reduction has preserved all the structured uniqueness in the cache
+// key, which means either
+// * REJECTED: We have a uniqueness bug in generating cache keys, where
+// structured uniqueness info should have been different but isn't. In such a
+// case, increasing by 1 the number of bits kept after reduction would not
+// reduce observed probabilities by half. (In our observations, the
+// probabilities are reduced approximately by half.)
+// * ACCEPTED: The lost unstructured uniqueness in the key determines the
+// probability that an observed collision would imply an overlap in ranges.
+// In short, dropping n bits from key would increase collision probability by
+// 2**n, assuming those n bits have full entropy in unstructured uniqueness.
+//
+// But we also have to account for the key ranges based on file size. If file
+// sizes are roughly 2**b offsets, using XOR in 128-bit cache keys for
+// "ranges", we know from other simulations (see
+// https://github.com/pdillinger/unique_id/) that that's roughly equivalent to
+// (less than 2x higher collision probability) using a cache key of size
+// 128 - b bits for the whole file. (This is the only place we make an
+// "optimistic" assumption, which is more than offset by the real
+// implementation stripping off 2 lower bits from block byte offsets for cache
+// keys. The simulation assumes byte offsets, which is net pessimistic.)
+//
+// So to accept the extrapolation as valid, we need to be confident that all
+// "lost" bits, excluding those covered by file offset, are full entropy.
+// Recall that we have assumed (verifiably, safely) that other structured data
+// (file number and session counter) are kept, not lost. Based on the
+// implementation comments for OffsetableCacheKey, the only potential hole here
+// is that we only have ~103 bits of entropy in "all new" session IDs, and in
+// extreme cases, there might be only 1 DB ID. However, because the upper ~39
+// bits of session ID are hashed, the combination of file number and file
+// offset only has to add to 25 bits (or more) to ensure full entropy in
+// unstructured uniqueness lost in the reduction. Typical file size of 32MB
+// suffices (at least for simulation purposes where we assume each file offset
+// occupies a cache key).
+//
+// Example results in comments on OffsetableCacheKey.
+class StressCacheKey {
+ public:
+ void Run() {
+ if (FLAGS_sck_footer_unique_id) {
+ // Proposed footer unique IDs are DB- and session-independent (but
+ // process-dependent), which is most easily simulated here by assuming 1 DB
+ // and (later below) no session resets without process reset.
+ FLAGS_sck_db_count = 1;
+ }
+ // Clamp now so the estimates below use the key width RunOnce() simulates.
+ FLAGS_sck_keep_bits = std::min<uint32_t>(FLAGS_sck_keep_bits, 64);
+
+ // Describe the simulated workload
+ uint64_t mb_per_day =
+ uint64_t{FLAGS_sck_files_per_day} * FLAGS_sck_file_size_mb;
+ printf("Total cache or DBs size: %gTiB Writing %g MiB/s or %gTiB/day\n",
+ FLAGS_sck_file_size_mb / 1024.0 / 1024.0 *
+ std::pow(2.0, FLAGS_sck_table_bits),
+ mb_per_day / 86400.0, mb_per_day / 1024.0 / 1024.0);
+ // For extrapolating probability of any collision from observed collisions
+ multiplier_ = std::pow(2.0, 128 - FLAGS_sck_keep_bits) /
+ (FLAGS_sck_file_size_mb * 1024.0 * 1024.0);
+ printf(
+ "Multiply by %g to correct for simulation losses (but still assume "
+ "whole file cached)\n",
+ multiplier_);
+ // Keep the divisor and the resulting restart period non-zero.
+ const uint32_t restarts = std::max<uint32_t>(FLAGS_sck_restarts_per_day, 1);
+ restart_nfiles_ = std::max<uint32_t>(FLAGS_sck_files_per_day / restarts, 1);
+ double without_ejection =
+ std::pow(2.0, FLAGS_sck_keep_bits / 2.0) / FLAGS_sck_files_per_day;
+ // Rough (usually terribly rough) lower bound for -sck_randomize: sqrt of the
+ // key space. If observation is worse than this, something has gone wrong.
+ printf(
+ "Without ejection, expect random collision after %g days (%g "
+ "corrected)\n",
+ without_ejection, without_ejection * multiplier_);
+ double with_full_table =
+ std::pow(2.0, FLAGS_sck_keep_bits - FLAGS_sck_table_bits) /
+ FLAGS_sck_files_per_day;
+ // Alternate lower bound for -sck_randomize, usually pretty accurate. Our
+ // cache keys should usually perform "better than random" but never worse.
+ // (If observation is substantially worse, then something has gone wrong.)
+ printf(
+ "With ejection and full table, expect random collision after %g "
+ "days (%g corrected)\n",
+ with_full_table, with_full_table * multiplier_);
+ collisions_ = 0;
+
+ // Run until sufficient number of observed collisions.
+ for (int i = 1; collisions_ < FLAGS_sck_min_collision; i++) {
+ RunOnce();
+ if (collisions_ == 0) {
+ printf(
+ "No collisions after %d x %u days "
+ " \n",
+ i, FLAGS_sck_days_per_run);
+ } else {
+ double est = 1.0 * i * FLAGS_sck_days_per_run / collisions_;
+ printf("%" PRIu64
+ " collisions after %d x %u days, est %g days between (%g "
+ "corrected) \n",
+ collisions_, i, FLAGS_sck_days_per_run, est, est * multiplier_);
+ }
+ }
+ }
+
+ void RunOnce() {
+ // Re-initialize simulated state
+ const size_t db_count = std::max(size_t{FLAGS_sck_db_count}, size_t{1});
+ dbs_.reset(new TableProperties[db_count]{});
+ const size_t table_mask = (size_t{1} << FLAGS_sck_table_bits) - 1;
+ table_.reset(new uint64_t[table_mask + 1]{});
+ if (FLAGS_sck_keep_bits > 64) {
+ FLAGS_sck_keep_bits = 64;
+ }
+
+ // Details of which bits are dropped in reduction
+ uint32_t shift_away = 64 - FLAGS_sck_keep_bits;
+ // Shift away fewer potential file number bits (b) than potential
+ // session counter bits (a).
+ uint32_t shift_away_b = shift_away / 3;
+ uint32_t shift_away_a = shift_away - shift_away_b;
+
+ process_count_ = 0;
+ session_count_ = 0;
+ newdb_count_ = 0;
+ ResetProcess(/*newdbs*/ true);
+
+ Random64 r{std::random_device{}()};
+
+ uint64_t max_file_count =
+ uint64_t{FLAGS_sck_files_per_day} * FLAGS_sck_days_per_run;
+ uint32_t report_count = 0;
+ uint32_t collisions_this_run = 0;
+ size_t db_i = 0;
+
+ for (uint64_t file_count = 1; file_count <= max_file_count;
+ ++file_count, ++db_i) {
+ // Round-robin through DBs (this is faster than %)
+ if (db_i >= db_count) {
+ db_i = 0;
+ }
+ // Any other periodic actions before simulating next file
+ if (!FLAGS_sck_footer_unique_id && r.OneIn(FLAGS_sck_reopen_nfiles)) {
+ ResetSession(db_i, /*newdb*/ r.OneIn(FLAGS_sck_newdb_nreopen));
+ } else if (r.OneIn(restart_nfiles_)) {
+ ResetProcess(/*newdbs*/ false);
+ }
+ // Simulate next file
+ OffsetableCacheKey ock;
+ dbs_[db_i].orig_file_number += 1;
+ // skip some file numbers for other file kinds, except in footer unique
+ // ID, orig_file_number here tracks process-wide generated SST file
+ // count.
+ if (!FLAGS_sck_footer_unique_id) {
+ dbs_[db_i].orig_file_number += (r.Next() & 3);
+ }
+ bool is_stable;
+ BlockBasedTable::SetupBaseCacheKey(&dbs_[db_i], /* ignored */ "",
+ /* ignored */ 42, &ock, &is_stable);
+ assert(is_stable);
+ // Get a representative cache key, which later we analytically generalize
+ // to a range.
+ CacheKey ck = ock.WithOffset(0);
+ uint64_t reduced_key;
+ if (FLAGS_sck_randomize) {
+ reduced_key = GetSliceHash64(ck.AsSlice()) >> shift_away;
+ } else if (FLAGS_sck_footer_unique_id) {
+ // Special case: keep only file number, not session counter
+ reduced_key = DecodeFixed64(ck.AsSlice().data()) >> shift_away;
+ } else {
+ // Try to keep file number and session counter (shift away other bits)
+ uint32_t a = DecodeFixed32(ck.AsSlice().data()) << shift_away_a;
+ uint32_t b = DecodeFixed32(ck.AsSlice().data() + 4) >> shift_away_b;
+ reduced_key = (uint64_t{a} << 32) + b;
+ }
+ if (reduced_key == 0) {
+ // Unlikely, but we need to exclude tracking this value because we
+ // use it to mean "empty" in table. This case is OK as long as we
+ // don't hit it often.
+ printf("Hit Zero! \n");
+ file_count--;
+ continue;
+ }
+ uint64_t h =
+ NPHash64(reinterpret_cast<char*>(&reduced_key), sizeof(reduced_key));
+ // Skew expected lifetimes, for high (super-Poisson) variance in actual
+ // lifetimes.
+ size_t pos =
+ std::min(Lower32of64(h) & table_mask, Upper32of64(h) & table_mask);
+ if (table_[pos] == reduced_key) {
+ collisions_this_run++;
+ // Our goal is to predict probability of no collisions, not expected
+ // number of collisions. To make the distinction, we have to get rid
+ // of observing correlated collisions, which this takes care of:
+ ResetProcess(/*newdbs*/ false);
+ } else {
+ // Replace (end of lifetime for file that was in this slot)
+ table_[pos] = reduced_key;
+ }
+
+ if (++report_count == FLAGS_sck_files_per_day) {
+ report_count = 0;
+ // Estimate fill % from evenly spaced samples; stride must be >= 1 or
+ // the scan would never advance on tables smaller than 1000 slots.
+ const size_t incr = std::max(table_mask / 1000, size_t{1});
+ const size_t sample_total = table_mask / incr + 1;
+ size_t sampled_count = 0;
+ for (size_t i = 0; i <= table_mask; i += incr) {
+ sampled_count += (table_[i] != 0) ? 1 : 0;
+ }
+ // Report
+ printf(
+ "%" PRIu64 " days, %" PRIu64 " proc, %" PRIu64 " sess, %" PRIu64
+ " newdb, %u coll, occ %g%%, ejected %g%% \r",
+ file_count / FLAGS_sck_files_per_day, process_count_,
+ session_count_, newdb_count_ - FLAGS_sck_db_count,
+ collisions_this_run, 100.0 * sampled_count / sample_total,
+ 100.0 * (1.0 - 1.0 * sampled_count / sample_total * table_mask / file_count));
+ fflush(stdout);
+ }
+ }
+ collisions_ += collisions_this_run;
+ }
+
+ void ResetSession(size_t i, bool newdb) {  // new session id for dbs_[i]
+ dbs_[i].db_session_id = DBImpl::GenerateDbSessionId(nullptr);
+ if (newdb) {  // additionally give dbs_[i] a new DB identity
+ ++newdb_count_;
+ if (FLAGS_sck_footer_unique_id) {
+ // Simulate how footer id would behave
+ dbs_[i].db_id = "none";
+ } else {
+ // db_id might be ignored, depending on the implementation details
+ dbs_[i].db_id = std::to_string(newdb_count_);
+ dbs_[i].orig_file_number = 0;  // file numbering restarts with the new DB
+ }
+ }
+ session_count_++;
+ }
+
+ void ResetProcess(bool newdbs) {
+ process_count_++;
+ DBImpl::TEST_ResetDbSessionIdGen();
+ // Same clamp as RunOnce(): dbs_ always holds at least one entry.
+ const size_t db_count = std::max(size_t{FLAGS_sck_db_count}, size_t{1});
+ for (size_t i = 0; i < db_count; ++i) ResetSession(i, newdbs);
+ if (FLAGS_sck_footer_unique_id) {
+ // For footer unique ID, this tracks process-wide generated SST file
+ // count.
+ dbs_[0].orig_file_number = 0;
+ }
+ }
+
+ private:
+ // Use db_session_id and orig_file_number from TableProperties
+ std::unique_ptr<TableProperties[]> dbs_;
+ std::unique_ptr<uint64_t[]> table_;
+ uint64_t process_count_ = 0;
+ uint64_t session_count_ = 0;
+ uint64_t newdb_count_ = 0;
+ uint64_t collisions_ = 0;
+ uint32_t restart_nfiles_ = 0;
+ double multiplier_ = 0.0;
+};
+
+int cache_bench_tool(int argc, char** argv) {
+ ParseCommandLineFlags(&argc, &argv, true);
+
+ if (FLAGS_stress_cache_key) {
+ // Alternate tool
+ StressCacheKey().Run();
+ return 0;
+ }
+
+ if (FLAGS_threads <= 0) {
+ fprintf(stderr, "threads number <= 0\n");
+ exit(1);
+ }
+
+ ROCKSDB_NAMESPACE::CacheBench bench;
+ if (FLAGS_populate_cache) {
+ bench.PopulateCache();
+ printf("Population complete\n");
+ printf("----------------------------\n");
+ }
+ if (bench.Run()) {
+ return 0;
+ } else {
+ return 1;
+ }
+} // cache_bench_tool
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // GFLAGS
diff --git a/src/rocksdb/cache/cache_entry_roles.cc b/src/rocksdb/cache/cache_entry_roles.cc
new file mode 100644
index 000000000..b27349554
--- /dev/null
+++ b/src/rocksdb/cache/cache_entry_roles.cc
@@ -0,0 +1,134 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "cache/cache_entry_roles.h"
+
+#include <mutex>
+
+#include "port/lang.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+std::array<std::string, kNumCacheEntryRoles> kCacheEntryRoleToCamelString{{
+ "DataBlock",
+ "FilterBlock",
+ "FilterMetaBlock",
+ "DeprecatedFilterBlock",
+ "IndexBlock",
+ "OtherBlock",
+ "WriteBuffer",
+ "CompressionDictionaryBuildingBuffer",
+ "FilterConstruction",
+ "BlockBasedTableReader",
+ "FileMetadata",
+ "BlobValue",
+ "BlobCache",
+ "Misc",
+}};
+
+std::array<std::string, kNumCacheEntryRoles> kCacheEntryRoleToHyphenString{{
+ "data-block",
+ "filter-block",
+ "filter-meta-block",
+ "deprecated-filter-block",
+ "index-block",
+ "other-block",
+ "write-buffer",
+ "compression-dictionary-building-buffer",
+ "filter-construction",
+ "block-based-table-reader",
+ "file-metadata",
+ "blob-value",
+ "blob-cache",
+ "misc",
+}};
+
+const std::string& GetCacheEntryRoleName(CacheEntryRole role) {
+ return kCacheEntryRoleToHyphenString[static_cast<size_t>(role)];
+}
+
+const std::string& BlockCacheEntryStatsMapKeys::CacheId() {
+ static const std::string kCacheId = "id";
+ return kCacheId;
+}
+
+const std::string& BlockCacheEntryStatsMapKeys::CacheCapacityBytes() {
+ static const std::string kCacheCapacityBytes = "capacity";
+ return kCacheCapacityBytes;
+}
+
+const std::string&
+BlockCacheEntryStatsMapKeys::LastCollectionDurationSeconds() {
+ static const std::string kLastCollectionDurationSeconds =
+ "secs_for_last_collection";
+ return kLastCollectionDurationSeconds;
+}
+
+const std::string& BlockCacheEntryStatsMapKeys::LastCollectionAgeSeconds() {
+ static const std::string kLastCollectionAgeSeconds =
+ "secs_since_last_collection";
+ return kLastCollectionAgeSeconds;
+}
+
+namespace {
+
+std::string GetPrefixedCacheEntryRoleName(const std::string& prefix,
+ CacheEntryRole role) {
+ const std::string& role_name = GetCacheEntryRoleName(role);
+ std::string prefixed_role_name;
+ prefixed_role_name.reserve(prefix.size() + role_name.size());
+ prefixed_role_name.append(prefix);
+ prefixed_role_name.append(role_name);
+ return prefixed_role_name;
+}
+
+} // namespace
+
+std::string BlockCacheEntryStatsMapKeys::EntryCount(CacheEntryRole role) {
+ const static std::string kPrefix = "count.";
+ return GetPrefixedCacheEntryRoleName(kPrefix, role);
+}
+
+std::string BlockCacheEntryStatsMapKeys::UsedBytes(CacheEntryRole role) {
+ const static std::string kPrefix = "bytes.";
+ return GetPrefixedCacheEntryRoleName(kPrefix, role);
+}
+
+std::string BlockCacheEntryStatsMapKeys::UsedPercent(CacheEntryRole role) {
+ const static std::string kPrefix = "percent.";
+ return GetPrefixedCacheEntryRoleName(kPrefix, role);
+}
+
+namespace {
+
+struct Registry {
+ std::mutex mutex;
+ UnorderedMap<Cache::DeleterFn, CacheEntryRole> role_map;
+ void Register(Cache::DeleterFn fn, CacheEntryRole role) {
+ std::lock_guard<std::mutex> lock(mutex);
+ role_map[fn] = role;
+ }
+ UnorderedMap<Cache::DeleterFn, CacheEntryRole> Copy() {
+ std::lock_guard<std::mutex> lock(mutex);
+ return role_map;
+ }
+};
+
+Registry& GetRegistry() {
+ STATIC_AVOID_DESTRUCTION(Registry, registry);
+ return registry;
+}
+
+} // namespace
+
+void RegisterCacheDeleterRole(Cache::DeleterFn fn, CacheEntryRole role) {
+ GetRegistry().Register(fn, role);
+}
+
+UnorderedMap<Cache::DeleterFn, CacheEntryRole> CopyCacheDeleterRoleMap() {
+ return GetRegistry().Copy();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/cache_entry_roles.h b/src/rocksdb/cache/cache_entry_roles.h
new file mode 100644
index 000000000..5a49fdfd4
--- /dev/null
+++ b/src/rocksdb/cache/cache_entry_roles.h
@@ -0,0 +1,103 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <array>
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+
+#include "rocksdb/cache.h"
+#include "util/hash_containers.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+extern std::array<std::string, kNumCacheEntryRoles>
+ kCacheEntryRoleToCamelString;
+extern std::array<std::string, kNumCacheEntryRoles>
+ kCacheEntryRoleToHyphenString;
+
+// To associate cache entries with their role, we use a hack on the
+// existing Cache interface. Because the deleter of an entry can authenticate
+// the code origin of an entry, we can elaborate the choice of deleter to
+// also encode role information, without inferring false role information
+// from entries not choosing to encode a role.
+//
+// The rest of this file is for handling mappings between deleters and
+// roles.
+
+// To infer a role from a deleter, the deleter must be registered. This
+// can be done "manually" with this function. This function is thread-safe,
+// and the registration mappings go into private but static storage. (Note
+// that DeleterFn is a function pointer, not std::function. Registrations
+// should not be too many.)
+void RegisterCacheDeleterRole(Cache::DeleterFn fn, CacheEntryRole role);
+
+// Gets a copy of the registered deleter -> role mappings. This is the only
+// function for reading the mappings made with RegisterCacheDeleterRole.
+// Why only this interface for reading?
+// * This function has to be thread safe, which could incur substantial
+// overhead. We should not pay this overhead for every deleter look-up.
+// * This is suitable for preparing for batch operations, like with
+// CacheEntryStatsCollector.
+// * The number of mappings should be sufficiently small (dozens).
+UnorderedMap<Cache::DeleterFn, CacheEntryRole> CopyCacheDeleterRoleMap();
+
+// ************************************************************** //
+// An automatic registration infrastructure. This enables code
+// to simply ask for a deleter associated with a particular type
+// and role, and registration is automatic. In a sense, this is
+// a small dependency injection infrastructure, because linking
+// in new deleter instantiations is essentially sufficient for
+// making stats collection (using CopyCacheDeleterRoleMap) aware
+// of them.
+
+namespace cache_entry_roles_detail {
+
+template <typename T, CacheEntryRole R>
+struct RegisteredDeleter {
+ RegisteredDeleter() { RegisterCacheDeleterRole(Delete, R); }
+
+ // These have global linkage to help ensure compiler optimizations do not
+ // break uniqueness for each <T,R>
+ static void Delete(const Slice& /* key */, void* value) {
+ // Supports T == Something[], unlike delete operator
+ std::default_delete<T>()(
+ static_cast<typename std::remove_extent<T>::type*>(value));
+ }
+};
+
+template <CacheEntryRole R>
+struct RegisteredNoopDeleter {
+ RegisteredNoopDeleter() { RegisterCacheDeleterRole(Delete, R); }
+
+ static void Delete(const Slice& /* key */, void* /* value */) {
+ // Here was `assert(value == nullptr);` but we can also put pointers
+ // to static data in Cache, for testing at least.
+ }
+};
+
+} // namespace cache_entry_roles_detail
+
+// Get an automatically registered deleter for value type T and role R.
+// Based on C++ semantics, registration is invoked exactly once in a
+// thread-safe way on first call to this function, for each <T, R>.
+template <typename T, CacheEntryRole R>
+Cache::DeleterFn GetCacheEntryDeleterForRole() {
+ static cache_entry_roles_detail::RegisteredDeleter<T, R> reg;
+ return reg.Delete;
+}
+
+// Get an automatically registered no-op deleter associated with role R. The
+// value is never freed (typically nullptr or static data). This is used for
+// Cache "reservation" entries such as for WriteBufferManager.
+template <CacheEntryRole R>
+Cache::DeleterFn GetNoopDeleterForRole() {
+ static cache_entry_roles_detail::RegisteredNoopDeleter<R> reg;
+ return reg.Delete;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/cache_entry_stats.h b/src/rocksdb/cache/cache_entry_stats.h
new file mode 100644
index 000000000..63b12735b
--- /dev/null
+++ b/src/rocksdb/cache/cache_entry_stats.h
@@ -0,0 +1,183 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <array>
+#include <cstdint>
+#include <memory>
+#include <mutex>
+
+#include "cache/cache_helpers.h"
+#include "cache/cache_key.h"
+#include "port/lang.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/status.h"
+#include "rocksdb/system_clock.h"
+#include "test_util/sync_point.h"
+#include "util/coding_lean.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A generic helper object for gathering stats about cache entries by
+// iterating over them with ApplyToAllEntries. This class essentially
+// solves the problem of slowing down a Cache with too many stats
+// collectors that could be sharing stat results, such as from multiple
+// column families or multiple DBs sharing a Cache. We employ a few
+// mitigations:
+// * Only one collector for a particular kind of Stats is alive
+// for each Cache. This is guaranteed using the Cache itself to hold
+// the collector.
+// * A mutex ensures only one thread is gathering stats for this
+// collector.
+// * The most recent gathered stats are saved and simply copied to
+// satisfy requests within a time window (default: 3 minutes) of
+// completion of the most recent stat gathering.
+//
+// Template parameter Stats must be copyable and default-constructible,
+// as well as...
+// concept Stats {
+// // Notification before applying callback to all entries
+// void BeginCollection(Cache*, SystemClock*, uint64_t start_time_micros);
+// // Get the callback to apply to all entries. `callback`
+// // type must be compatible with Cache::ApplyToAllEntries
+// callback GetEntryCallback();
+// // Notification after applying callback to all entries
+// void EndCollection(Cache*, SystemClock*, uint64_t end_time_micros);
+// // Notification that a collection was skipped because of
+// // sufficiently recent saved results.
+// void SkippedCollection();
+// }
+template <class Stats>
+class CacheEntryStatsCollector {
+ public:
+ // Gather and save stats if saved stats are too old. (Use GetStats() to
+ // read saved stats.)
+ //
+ // Maximum allowed age for a "hit" on saved results is determined by the
+ // two interval parameters. Both set to 0 forces a re-scan. For example
+ // with min_interval_seconds=300 and min_interval_factor=100, if the last
+ // scan took 10s, we would only rescan ("miss") if the age in seconds of
+ // the saved results is > max(300, 100*10).
+ // Justification: scans can vary wildly in duration, e.g. from 0.02 sec
+ // to as much as 20 seconds, so we want to be able to cap the absolute
+ // and relative frequency of scans.
+ void CollectStats(int min_interval_seconds, int min_interval_factor) {
+ // Waits for any pending reader or writer (collector)
+ std::lock_guard<std::mutex> lock(working_mutex_);
+
+ uint64_t max_age_micros =
+ static_cast<uint64_t>(std::max(min_interval_seconds, 0)) * 1000000U;
+
+ if (last_end_time_micros_ > last_start_time_micros_ &&
+ min_interval_factor > 0) {
+ max_age_micros = std::max(
+ max_age_micros, min_interval_factor * (last_end_time_micros_ -
+ last_start_time_micros_));
+ }
+
+ uint64_t start_time_micros = clock_->NowMicros();
+ if ((start_time_micros - last_end_time_micros_) > max_age_micros) {
+ last_start_time_micros_ = start_time_micros;
+ working_stats_.BeginCollection(cache_, clock_, start_time_micros);
+
+ cache_->ApplyToAllEntries(working_stats_.GetEntryCallback(), {});
+ TEST_SYNC_POINT_CALLBACK(
+ "CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries", nullptr);
+
+ uint64_t end_time_micros = clock_->NowMicros();
+ last_end_time_micros_ = end_time_micros;
+ working_stats_.EndCollection(cache_, clock_, end_time_micros);
+ } else {
+ working_stats_.SkippedCollection();
+ }
+
+ // Save so that we don't need to wait for an outstanding collection in
+ // order to make a copy of the last saved stats
+ std::lock_guard<std::mutex> lock2(saved_mutex_);
+ saved_stats_ = working_stats_;
+ }
+
+ // Gets saved stats, regardless of age
+ void GetStats(Stats *stats) {
+ std::lock_guard<std::mutex> lock(saved_mutex_);
+ *stats = saved_stats_;
+ }
+
+ Cache *GetCache() const { return cache_; }
+
+ // Gets or creates a shared instance of CacheEntryStatsCollector in the
+ // cache itself, and saves into `ptr`. This shared_ptr will hold the
+ // entry in cache until all refs are destroyed.
+ static Status GetShared(Cache *cache, SystemClock *clock,
+ std::shared_ptr<CacheEntryStatsCollector> *ptr) {
+ const Slice &cache_key = GetCacheKey();
+
+ Cache::Handle *h = cache->Lookup(cache_key);
+ if (h == nullptr) {
+ // Not yet in cache, but Cache doesn't provide a built-in way to
+ // avoid racing insert. So we double-check under a shared mutex,
+ // inspired by TableCache.
+ STATIC_AVOID_DESTRUCTION(std::mutex, static_mutex);
+ std::lock_guard<std::mutex> lock(static_mutex);
+
+ h = cache->Lookup(cache_key);
+ if (h == nullptr) {
+ auto new_ptr = new CacheEntryStatsCollector(cache, clock);
+ // TODO: non-zero charge causes some tests that count block cache
+ // usage to go flaky. Fix the problem somehow so we can use an
+ // accurate charge.
+ size_t charge = 0;
+ Status s = cache->Insert(cache_key, new_ptr, charge, Deleter, &h,
+ Cache::Priority::HIGH);
+ if (!s.ok()) {
+ assert(h == nullptr);
+ delete new_ptr;
+ return s;
+ }
+ }
+ }
+ // If we reach here, shared entry is in cache with handle `h`.
+ assert(cache->GetDeleter(h) == Deleter);
+
+ // Build an aliasing shared_ptr that keeps `ptr` in cache while there
+ // are references.
+ *ptr = MakeSharedCacheHandleGuard<CacheEntryStatsCollector>(cache, h);
+ return Status::OK();
+ }
+
+ private:
+ explicit CacheEntryStatsCollector(Cache *cache, SystemClock *clock)
+ : saved_stats_(),
+ working_stats_(),
+ last_start_time_micros_(0),
+ last_end_time_micros_(/*pessimistic*/ 10000000),
+ cache_(cache),
+ clock_(clock) {}
+
+ static void Deleter(const Slice &, void *value) {
+ delete static_cast<CacheEntryStatsCollector *>(value);
+ }
+
+ static const Slice &GetCacheKey() {
+ // For each template instantiation
+ static CacheKey ckey = CacheKey::CreateUniqueForProcessLifetime();
+ static Slice ckey_slice = ckey.AsSlice();
+ return ckey_slice;
+ }
+
+ std::mutex saved_mutex_;
+ Stats saved_stats_;
+
+ std::mutex working_mutex_;
+ Stats working_stats_;
+ uint64_t last_start_time_micros_;
+ uint64_t last_end_time_micros_;
+
+ Cache *const cache_;
+ SystemClock *const clock_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/cache_helpers.h b/src/rocksdb/cache/cache_helpers.h
new file mode 100644
index 000000000..7ea2365b8
--- /dev/null
+++ b/src/rocksdb/cache/cache_helpers.h
@@ -0,0 +1,145 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Returns the cached value given a cache handle.
+template <typename T>
+T* GetFromCacheHandle(Cache* cache, Cache::Handle* handle) {
+ assert(cache);
+ assert(handle);
+
+ return static_cast<T*>(cache->Value(handle));
+}
+
+// Simple generic deleter for Cache (to be used with Cache::Insert).
+template <typename T>
+void DeleteCacheEntry(const Slice& /* key */, void* value) {
+ delete static_cast<T*>(value);
+}
+
+// Turns a T* into a Slice so it can be used as a key with Cache.
+template <typename T>
+Slice GetSlice(const T* t) {
+ return Slice(reinterpret_cast<const char*>(t), sizeof(T));
+}
+
+// Generic resource management object for cache handles that releases the handle
+// when destroyed. Has unique ownership of the handle, so copying it is not
+// allowed, while moving it transfers ownership.
+template <typename T>
+class CacheHandleGuard {
+ public:
+ CacheHandleGuard() = default;
+
+ CacheHandleGuard(Cache* cache, Cache::Handle* handle)
+ : cache_(cache),
+ handle_(handle),
+ value_(GetFromCacheHandle<T>(cache, handle)) {
+ assert(cache_ && handle_ && value_);
+ }
+
+ CacheHandleGuard(const CacheHandleGuard&) = delete;
+ CacheHandleGuard& operator=(const CacheHandleGuard&) = delete;
+
+ CacheHandleGuard(CacheHandleGuard&& rhs) noexcept
+ : cache_(rhs.cache_), handle_(rhs.handle_), value_(rhs.value_) {
+ assert((!cache_ && !handle_ && !value_) || (cache_ && handle_ && value_));
+
+ rhs.ResetFields();
+ }
+
+ CacheHandleGuard& operator=(CacheHandleGuard&& rhs) noexcept {
+ if (this == &rhs) {
+ return *this;
+ }
+
+ ReleaseHandle();
+
+ cache_ = rhs.cache_;
+ handle_ = rhs.handle_;
+ value_ = rhs.value_;
+
+ assert((!cache_ && !handle_ && !value_) || (cache_ && handle_ && value_));
+
+ rhs.ResetFields();
+
+ return *this;
+ }
+
+ ~CacheHandleGuard() { ReleaseHandle(); }
+
+ bool IsEmpty() const { return !handle_; }
+
+ Cache* GetCache() const { return cache_; }
+ Cache::Handle* GetCacheHandle() const { return handle_; }
+ T* GetValue() const { return value_; }
+
+ void TransferTo(Cleanable* cleanable) {  // guard is left empty afterwards
+ if (cleanable) {
+ if (handle_ != nullptr) {
+ assert(cache_);
+ cleanable->RegisterCleanup(&ReleaseCacheHandle, cache_, handle_);
+ }
+ }
+ ResetFields();  // NOTE(review): null cleanable drops handle_ unreleased
+ }
+
+ void Reset() {
+ ReleaseHandle();
+ ResetFields();
+ }
+
+ private:
+ void ReleaseHandle() {
+ if (IsEmpty()) {
+ return;
+ }
+
+ assert(cache_);
+ cache_->Release(handle_);
+ }
+
+ void ResetFields() {
+ cache_ = nullptr;
+ handle_ = nullptr;
+ value_ = nullptr;
+ }
+
+ static void ReleaseCacheHandle(void* arg1, void* arg2) {
+ Cache* const cache = static_cast<Cache*>(arg1);
+ assert(cache);
+
+ Cache::Handle* const cache_handle = static_cast<Cache::Handle*>(arg2);
+ assert(cache_handle);
+
+ cache->Release(cache_handle);
+ }
+
+ private:
+ Cache* cache_ = nullptr;
+ Cache::Handle* handle_ = nullptr;
+ T* value_ = nullptr;
+};
+
+// Build an aliasing shared_ptr that keeps `handle` in cache while there
+// are references, but the pointer is to the value for that cache entry,
+// which must be of type T. This is copyable, unlike CacheHandleGuard, but
+// does not provide access to caching details.
+template <typename T>
+std::shared_ptr<T> MakeSharedCacheHandleGuard(Cache* cache,
+ Cache::Handle* handle) {
+ auto wrapper = std::make_shared<CacheHandleGuard<T>>(cache, handle);
+ return std::shared_ptr<T>(wrapper, static_cast<T*>(cache->Value(handle)));
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/cache_key.cc b/src/rocksdb/cache/cache_key.cc
new file mode 100644
index 000000000..a79328972
--- /dev/null
+++ b/src/rocksdb/cache/cache_key.cc
@@ -0,0 +1,364 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "cache/cache_key.h"
+
+#include <algorithm>
+#include <atomic>
+
+#include "rocksdb/cache.h"
+#include "table/unique_id_impl.h"
+#include "util/hash.h"
+#include "util/math.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Value space plan for CacheKey:
+//
+// file_num_etc64_ | offset_etc64_ | Only generated by
+// ---------------+---------------+------------------------------------------
+// 0 | 0 | Reserved for "empty" CacheKey()
+// 0 | > 0, < 1<<63 | CreateUniqueForCacheLifetime
+// 0 | >= 1<<63 | CreateUniqueForProcessLifetime
+// > 0 | any | OffsetableCacheKey.WithOffset
+
+CacheKey CacheKey::CreateUniqueForCacheLifetime(Cache *cache) {
+  // Shift ids up by one: the all-zero key is reserved for "unset".
+  const uint64_t unique_id = 1 + cache->NewId();
+  // The top bit belongs to CreateUniqueForProcessLifetime; stay below it.
+  assert(0U == (unique_id >> 63));
+  return CacheKey(/*file_num_etc64=*/0, /*offset_etc64=*/unique_id);
+}
+
+CacheKey CacheKey::CreateUniqueForProcessLifetime() {
+  // Cache::NewId is assumed to count up from zero, so this process-wide
+  // counter walks down from UINT64_MAX to stay clear of
+  // CreateUniqueForCacheLifetime. Should it ever become a point of
+  // contention, the space could be sub-divided using CoreLocalArray.
+  static std::atomic<uint64_t> next_id{UINT64_MAX};
+  const uint64_t unique_id = next_id.fetch_sub(1, std::memory_order_relaxed);
+  // Top bit set means no overlap with CreateUniqueForCacheLifetime.
+  assert(1U == (unique_id >> 63));
+  return CacheKey(/*file_num_etc64=*/0, /*offset_etc64=*/unique_id);
+}
+
+// How we generate CacheKeys and base OffsetableCacheKey, assuming that
+// db_session_ids are generated from a base_session_id and
+// session_id_counter (by SemiStructuredUniqueIdGen+EncodeSessionId
+// in DBImpl::GenerateDbSessionId):
+//
+// Conceptual inputs:
+// db_id (unstructured, from GenerateRawUniqueId or equiv)
+// * could be shared between cloned DBs but rare
+// * could be constant, if session id suffices
+// base_session_id (unstructured, from GenerateRawUniqueId)
+// session_id_counter (structured)
+// * usually much smaller than 2**24
+// orig_file_number (structured)
+// * usually smaller than 2**24
+// offset_in_file (structured, might skip lots of values)
+// * usually smaller than 2**32
+//
+// Overall approach (see https://github.com/pdillinger/unique_id for
+// background):
+//
+// First, we have three "structured" values, up to 64 bits each, that we
+// need to fit, without losses, into 128 bits. In practice, the values will
+// be small enough that they should fit. For example, applications generating
+// large SST files (large offsets) will naturally produce fewer files (small
+// file numbers). But we don't know ahead of time what bounds the values will
+// have.
+//
+// Second, we have unstructured inputs that enable distinct RocksDB processes
+// to pick a random point in space, likely very different from others. Xoring
+// the structured with the unstructured gives us a cache key that is
+// structurally distinct between related keys (e.g. same file or same RocksDB
+// process) and distinct with high probability between unrelated keys.
+//
+// The problem of packing three structured values into the space for two is
+// complicated by the fact that we want to derive cache keys from SST unique
+// IDs, which have already combined structured and unstructured inputs in a
+// practically inseparable way. And we want a base cache key that works
+// with an offset of any size. So basically, we need to encode these three
+// structured values, each up to 64 bits, into 128 bits without knowing any
+// of their sizes. The DownwardInvolution() function gives us a mechanism to
+// accomplish this. (See its properties in math.h.) Specifically, for inputs
+// a, b, and c:
+// lower64 = DownwardInvolution(a) ^ ReverseBits(b);
+// upper64 = c ^ ReverseBits(a);
+// The 128-bit output is unique assuming there exist some i, j, and k
+// where a < 2**i, b < 2**j, c < 2**k, i <= 64, j <= 64, k <= 64, and
+// i + j + k <= 128. In other words, as long as there exist some bounds
+// that would allow us to pack the bits of a, b, and c into the output
+// if we know the bound, we can generate unique outputs without knowing
+// those bounds. To validate this claim, the inversion function (given
+// the bounds) has been implemented in CacheKeyDecoder in
+// db_block_cache_test.cc.
+//
+// With that in mind, the outputs in terms of the conceptual inputs look
+// like this, using bitwise-xor of the constituent pieces, low bits on left:
+//
+// |------------------------- file_num_etc64 -------------------------|
+// | +++++++++ base_session_id (lower 64 bits, involution) +++++++++ |
+// |-----------------------------------------------------------------|
+// | session_id_counter (involution) ..... | |
+// |-----------------------------------------------------------------|
+// | hash of: ++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
+// | * base_session_id (upper ~39 bits) |
+// | * db_id (~122 bits entropy) |
+// |-----------------------------------------------------------------|
+// | | ..... orig_file_number (reversed) |
+// |-----------------------------------------------------------------|
+//
+//
+// |------------------------- offset_etc64 --------------------------|
+// | ++++++++++ base_session_id (lower 64 bits, reversed) ++++++++++ |
+// |-----------------------------------------------------------------|
+// | | ..... session_id_counter (reversed) |
+// |-----------------------------------------------------------------|
+// | offset_in_file ............... | |
+// |-----------------------------------------------------------------|
+//
+// Some oddities or inconveniences of this layout are due to deriving
+// the "base" cache key (without offset) from the SST unique ID (see
+// GetSstInternalUniqueId). Specifically,
+// * Lower 64 of base_session_id occurs in both output words (ok but
+// weird)
+// * The inclusion of db_id is bad for the conditions under which we
+// can guarantee uniqueness, but could be useful in some cases with
+// few small files per process, to make up for db session id only having
+// ~103 bits of entropy.
+//
+// In fact, if DB ids were not involved, we would be guaranteed unique
+// cache keys for files generated in a single process until total bits for
+// biggest session_id_counter, orig_file_number, and offset_in_file
+// reach 128 bits.
+//
+// With the DB id limitation, we only have nice guaranteed unique cache
+// keys for files generated in a single process until biggest
+// session_id_counter and offset_in_file reach combined 64 bits. This
+// is quite good in practice because we can have millions of DB Opens
+// with terabyte size SST files, or billions of DB Opens with gigabyte
+// size SST files.
+//
+// One of the considerations in the translation between existing SST unique
+// IDs and base cache keys is supporting better SST unique IDs in a future
+// format_version. If we use a process-wide file counter instead of
+// session counter and file numbers, we only need to combine two 64-bit values
+// instead of three. But we don't want to track unique ID versions in the
+// manifest, so we want to keep the same translation layer between SST unique
+// IDs and base cache keys, even with updated SST unique IDs. If the new
+// unique IDs put the file counter where the orig_file_number was, and
+// use no structured field where session_id_counter was, then our translation
+// layer works fine for two structured fields as well as three (for
+// compatibility). The small computation for the translation (one
+// DownwardInvolution(), two ReverseBits(), both ~log(64) instructions deep)
+// is negligible for computing as part of SST file reader open.
+//
+// More on how https://github.com/pdillinger/unique_id applies here:
+// Every bit of output always includes "unstructured" uniqueness bits and
+// often combines with "structured" uniqueness bits. The "unstructured" bits
+// change infrequently: only when we cannot guarantee our state tracking for
+// "structured" uniqueness hasn't been cloned. Using a static
+// SemiStructuredUniqueIdGen for db_session_ids, this means we only get an
+// "all new" session id when a new process uses RocksDB. (Between processes,
+// we don't know if a DB or other persistent storage has been cloned. We
+// assume that if VM hot cloning is used, subsequently generated SST files
+// do not interact.) Within a process, only the session_lower of the
+// db_session_id changes incrementally ("structured" uniqueness).
+//
+// This basically means that our offsets, counters and file numbers allow us
+// to do somewhat "better than random" (birthday paradox) while in the
+// degenerate case of completely new session for each tiny file, we still
+// have strong uniqueness properties from the birthday paradox, with ~103
+// bit session IDs or up to 128 bits entropy with different DB IDs sharing a
+// cache.
+//
+// More collision probability analysis:
+// Suppose a RocksDB host generates (generously) 2 GB/s (10TB data, 17 DWPD)
+// with average process/session lifetime of (pessimistically) 4 minutes.
+// In 180 days (generous allowable data lifespan), we generate 31 million GB
+// of data, or 2^55 bytes, and 2^16 "all new" session IDs.
+//
+// First, suppose this is in a single DB (lifetime 180 days):
+// 128 bits cache key size
+// - 55 <- ideal size for byte offsets + file numbers
+// - 2 <- bits for offsets and file numbers not exactly powers of two
+// + 2 <- bits saved not using byte offsets in BlockBasedTable::GetCacheKey
+// ----
+// 73 <- bits remaining for distinguishing session IDs
+// The probability of a collision in 73 bits of session ID data is less than
+// 1 in 2**(73 - (2 * 16)), or roughly 1 in a trillion. And this assumes all
+// data from the last 180 days is in cache for potential collision, and that
+// cache keys under each session id exhaustively cover the remaining 57 bits
+// while in reality they'll only cover a small fraction of it.
+//
+// Although data could be transferred between hosts, each host has its own
+// cache and we are already assuming a high rate of "all new" session ids.
+// So this doesn't really change the collision calculation. Across a fleet
+// of 1 million, each with <1 in a trillion collision possibility,
+// fleetwide collision probability is <1 in a million.
+//
+// Now suppose we have many DBs per host, say 2**10, with same host-wide write
+// rate and process/session lifetime. File numbers will be ~10 bits smaller
+// and we will have 2**10 times as many session IDs because of simultaneous
+// lifetimes. So now collision chance is less than 1 in 2**(83 - (2 * 26)),
+// or roughly 1 in a billion.
+//
+// Suppose instead we generated random or hashed cache keys for each
+// (compressed) block. For 1KB compressed block size, that is 2^45 cache keys
+// in 180 days. Collision probability is more easily estimated at roughly
+// 1 in 2**(128 - (2 * 45)) or roughly 1 in a trillion (assuming all
+// data from the last 180 days is in cache, but NOT the other assumption
+// for the 1 in a trillion estimate above).
+//
+//
+// Collision probability estimation through simulation:
+// A tool ./cache_bench -stress_cache_key broadly simulates host-wide cache
+// activity over many months, by making some pessimistic simplifying
+// assumptions. See class StressCacheKey in cache_bench_tool.cc for details.
+// Here is some sample output with
+// `./cache_bench -stress_cache_key -sck_keep_bits=43`:
+//
+// Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
+// Multiply by 1.15292e+18 to correct for simulation losses (but still
+// assume whole file cached)
+//
+// These come from default settings of 2.5M files per day of 32 MB each, and
+// `-sck_keep_bits=43` means that to represent a single file, we are only
+// keeping 43 bits of the 128-bit (base) cache key. With file size of 2**25
+// contiguous keys (pessimistic), our simulation is about 2**(128-43-25) or
+// about 1 billion billion times more prone to collision than reality.
+//
+// More default assumptions, relatively pessimistic:
+// * 100 DBs in same process (doesn't matter much)
+// * Re-open DB in same process (new session ID related to old session ID) on
+// average every 100 files generated
+// * Restart process (all new session IDs unrelated to old) 24 times per day
+//
+// After enough data, we get a result at the end (-sck_keep_bits=43):
+//
+// (keep 43 bits) 18 collisions after 2 x 90 days, est 10 days between
+// (1.15292e+19 corrected)
+//
+// If we believe the (pessimistic) simulation and the mathematical
+// extrapolation, we would need to run a billion machines all for 11 billion
+// days to expect a cache key collision. To help verify that our extrapolation
+// ("corrected") is robust, we can make our simulation more precise by
+// increasing the "keep" bits, which takes more running time to get enough
+// collision data:
+//
+// (keep 44 bits) 16 collisions after 5 x 90 days, est 28.125 days between
+// (1.6213e+19 corrected)
+// (keep 45 bits) 15 collisions after 7 x 90 days, est 42 days between
+// (1.21057e+19 corrected)
+// (keep 46 bits) 15 collisions after 17 x 90 days, est 102 days between
+// (1.46997e+19 corrected)
+// (keep 47 bits) 15 collisions after 49 x 90 days, est 294 days between
+// (2.11849e+19 corrected)
+//
+// The extrapolated prediction seems to be within noise (sampling error).
+//
+// With the `-sck_randomize` option, we can see that typical workloads like
+// above have lower collision probability than "random" cache keys (note:
+// offsets still non-randomized) by a modest amount (roughly 2-3x less
+// collision prone than random), which should make us reasonably comfortable
+// even in "degenerate" cases (e.g. repeatedly launch a process to generate
+// one file with SstFileWriter):
+//
+// (rand 43 bits) 22 collisions after 1 x 90 days, est 4.09091 days between
+// (4.7165e+18 corrected)
+//
+// We can see that with more frequent process restarts,
+// -sck_restarts_per_day=5000, which means more all-new session IDs, we get
+// closer to the "random" cache key performance:
+//
+// 15 collisions after 1 x 90 days, est 6 days between (6.91753e+18 corrected)
+//
+// And with less frequent process restarts and re-opens,
+// -sck_restarts_per_day=1 -sck_reopen_nfiles=1000, we get lower collision
+// probability:
+//
+// 18 collisions after 8 x 90 days, est 40 days between (4.61169e+19 corrected)
+//
+// Other tests have been run to validate other conditions behave as expected,
+// never behaving "worse than random" unless we start chopping off structured
+// data.
+//
+// Conclusion: Even in extreme cases, rapidly burning through "all new" IDs
+// that only arise when a new process is started, the chance of any cache key
+// collisions in a giant fleet of machines is negligible. Especially when
+// processes live for hours or days, the chance of a cache key collision is
+// likely more plausibly due to bad hardware than to bad luck in random
+// session ID data. Software defects are surely more likely to cause corruption
+// than both of those.
+//
+// TODO: Nevertheless / regardless, an efficient way to detect (and thus
+// quantify) block cache corruptions, including collisions, should be added.
+OffsetableCacheKey::OffsetableCacheKey(const std::string &db_id,
+                                       const std::string &db_session_id,
+                                       uint64_t file_number) {
+  UniqueId64x2 sst_unique_id;
+  Status status = GetSstInternalUniqueId(db_id, db_session_id, file_number,
+                                         &sst_unique_id, /*force=*/true);
+  assert(status.ok());
+  *this = FromInternalUniqueId(&sst_unique_id);
+}
+
+OffsetableCacheKey OffsetableCacheKey::FromInternalUniqueId(UniqueIdPtr id) {
+  uint64_t session_bits = id.ptr[0];
+  const uint64_t file_bits = id.ptr[1];
+
+#ifndef NDEBUG
+  const bool empty_input = (session_bits | file_bits) == 0;
+#endif
+
+  // An "empty" id given as input should yield an "empty" key as output.
+  // Otherwise file_num_etc64_ must not come out as zero, since the 0 case is
+  // used by CacheKey::CreateUnique*. DBImpl guaranteeing (in recent versions)
+  // a non-zero first word is not entirely sufficient for that, hence the
+  // substitution below.
+  // As a consequence, this function is only bijective assuming
+  // id[0] == 0 only if id[1] == 0.
+  if (session_bits == 0U) {
+    session_bits = file_bits;
+  }
+
+  // See comments above for how DownwardInvolution and ReverseBits
+  // make this function invertible under various assumptions.
+  OffsetableCacheKey key;
+  key.offset_etc64_ = ReverseBits(session_bits);
+  key.file_num_etc64_ =
+      ReverseBits(file_bits) ^ DownwardInvolution(session_bits);
+
+  // Arbitrary offsets must be allowed, so the second 64 bits of a derived
+  // cache key might be 0, which means the first 64 bits must never be 0.
+  // The transformation ensures the second word of a non-empty base key is
+  // not 0, so when the first word is 0 the two can be swapped without
+  // breaking bijectivity (assuming the condition above).
+  assert(empty_input || key.offset_etc64_ > 0);
+  if (key.file_num_etc64_ == 0) {
+    std::swap(key.file_num_etc64_, key.offset_etc64_);
+  }
+  // The first word is non-zero for every non-empty input from here on.
+  assert(empty_input || key.file_num_etc64_ > 0);
+  return key;
+}
+
+// Inverse of FromInternalUniqueId (assuming file_num_etc64 == 0 only if
+// offset_etc64 == 0)
+UniqueId64x2 OffsetableCacheKey::ToInternalUniqueId() {
+  // FromInternalUniqueId swaps the two words when the first would be 0, so a
+  // zero offset word means the stored words are in swapped order.
+  const bool swapped = (offset_etc64_ == 0);
+  const uint64_t file_word = swapped ? offset_etc64_ : file_num_etc64_;
+  const uint64_t offset_word = swapped ? file_num_etc64_ : offset_etc64_;
+  UniqueId64x2 id;
+  id[0] = ReverseBits(offset_word);
+  id[1] = ReverseBits(file_word ^ DownwardInvolution(id[0]));
+  return id;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/cache_key.h b/src/rocksdb/cache/cache_key.h
new file mode 100644
index 000000000..0b93c6bd9
--- /dev/null
+++ b/src/rocksdb/cache/cache_key.h
@@ -0,0 +1,143 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstdint>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/slice.h"
+#include "table/unique_id_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Cache;
+
+// A standard holder for fixed-size block cache keys (and for related caches).
+// They are created through one of these, each using its own range of values:
+// * CacheKey::CreateUniqueForCacheLifetime
+// * CacheKey::CreateUniqueForProcessLifetime
+// * Default ctor ("empty" cache key)
+// * OffsetableCacheKey->WithOffset
+//
+// The first two use atomic counters to guarantee uniqueness over the given
+// lifetime and the last uses a form of universally unique identifier for
+// uniqueness with very high probability (and guaranteed for files generated
+// during a single process lifetime).
+//
+// CacheKeys are currently used by calling AsSlice() to pass as a key to
+// Cache. For performance, the keys are endianness-dependent (though otherwise
+// portable). (Persistable cache entries are not intended to cross platforms.)
+class CacheKey {
+ public:
+  // Builds the "empty" cache key (both words zero), a value that no other
+  // creation path ever returns. Provided for convenience.
+  inline CacheKey() : file_num_etc64_(0), offset_etc64_(0) {}
+
+  inline bool IsEmpty() const {
+    return (file_num_etc64_ | offset_etc64_) == 0;
+  }
+
+  // View this cache key as a Slice (byte order is endianness-dependent)
+  inline Slice AsSlice() const {
+    static_assert(sizeof(*this) == 16, "Standardized on 16-byte cache key");
+    assert(!IsEmpty());
+    return Slice(reinterpret_cast<const char *>(this), sizeof(*this));
+  }
+
+  // Returns a CacheKey unique among all those associated with the given
+  // Cache instance. Relies on Cache::NewId. Useful for block cache
+  // "reservations".
+  static CacheKey CreateUniqueForCacheLifetime(Cache *cache);
+
+  // Returns a CacheKey unique among all others for the lifetime of this
+  // process. Useful for saving in a static data member so that different
+  // DB instances can agree on a cache key for shared entities, such as
+  // for CacheEntryStatsCollector.
+  static CacheKey CreateUniqueForProcessLifetime();
+
+ protected:
+  friend class OffsetableCacheKey;
+  CacheKey(uint64_t file_num_etc64, uint64_t offset_etc64)
+      : file_num_etc64_(file_num_etc64), offset_etc64_(offset_etc64) {}
+  uint64_t file_num_etc64_;
+  uint64_t offset_etc64_;
+};
+
+constexpr uint8_t kCacheKeySize = static_cast<uint8_t>(sizeof(CacheKey));
+
+// A file-specific generator of cache keys, sometimes referred to as the
+// "base" cache key for a file because all the cache keys for various offsets
+// within the file are computed using simple arithmetic. The basis for the
+// general approach is discussed here: https://github.com/pdillinger/unique_id
+// Heavily related to GetUniqueIdFromTableProperties.
+//
+// If the db_id, db_session_id, and file_number come from the file's table
+// properties, then the keys will be stable across DB::Open/Close, backup/
+// restore, import/export, etc.
+//
+// This class "is a" CacheKey only privately so that it is not misused as
+// a ready-to-use CacheKey.
+class OffsetableCacheKey : private CacheKey {
+ public:
+  // Builds an "empty" base key, provided for convenience; not for actual use.
+  inline OffsetableCacheKey() : CacheKey() {}
+
+  // Builds the base key for a file from the given identifying information.
+  // The result of this constructor is never an "empty" base key.
+  OffsetableCacheKey(const std::string &db_id, const std::string &db_session_id,
+                     uint64_t file_number);
+
+  // Derives an OffsetableCacheKey from an SST unique ID. This allows cache
+  // keys to be computed from DB manifest data before the file is read from
+  // storage--so that every part of the file can potentially go in a
+  // persistent cache.
+  //
+  // Feeding a db_id, db_session_id, and file_number to
+  // GetSstInternalUniqueId() and its result to this function yields the same
+  // base cache key as giving those inputs directly to the constructor.
+  //
+  // The transformation is bijective assuming either the id is empty or its
+  // lower 64 bits are non-zero:
+  // * Empty (all zeros) input -> empty (all zeros) output
+  // * Lower 64 input is non-zero -> lower 64 output (file_num_etc64_) is
+  //   non-zero
+  static OffsetableCacheKey FromInternalUniqueId(UniqueIdPtr id);
+
+  // Inverse of the transformation above, under the assumption that the key is
+  // either empty or has non-zero lower 64 bits (file_num_etc64_). Perhaps
+  // only useful for testing.
+  UniqueId64x2 ToInternalUniqueId();
+
+  inline bool IsEmpty() const {
+    const bool no_file_word = (file_num_etc64_ == 0);
+    assert(!no_file_word || offset_etc64_ == 0);
+    return no_file_word;
+  }
+
+  // Produces the CacheKey for an offset within the file. When a smaller
+  // unique identifier of keyable offsets is used, the offset need not be a
+  // byte offset.
+  //
+  // This class was designed to make this hot code extremely fast.
+  inline CacheKey WithOffset(uint64_t offset) const {
+    assert(!IsEmpty());
+    return CacheKey(file_num_etc64_, offset ^ offset_etc64_);
+  }
+
+  // All CacheKeys returned for a file share this "common prefix": it is
+  // specific to the file but identical for every offset within it.
+  static constexpr size_t kCommonPrefixSize = 8;
+  inline Slice CommonPrefixSlice() const {
+    static_assert(sizeof(file_num_etc64_) == kCommonPrefixSize,
+                  "8 byte common prefix expected");
+    assert(!IsEmpty());
+    assert(static_cast<const void *>(this) == &this->file_num_etc64_);
+
+    return Slice(reinterpret_cast<const char *>(this), kCommonPrefixSize);
+  }
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/cache_reservation_manager.cc b/src/rocksdb/cache/cache_reservation_manager.cc
new file mode 100644
index 000000000..53dee5d79
--- /dev/null
+++ b/src/rocksdb/cache/cache_reservation_manager.cc
@@ -0,0 +1,185 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "cache/cache_reservation_manager.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstring>
+#include <memory>
+
+#include "cache/cache_entry_roles.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "table/block_based/reader_common.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+template <CacheEntryRole R>
+CacheReservationManagerImpl<R>::CacheReservationHandle::CacheReservationHandle(
+ std::size_t incremental_memory_used,
+ std::shared_ptr<CacheReservationManagerImpl> cache_res_mgr)
+ : incremental_memory_used_(incremental_memory_used) {
+ assert(cache_res_mgr);  // the destructor dereferences the manager
+ cache_res_mgr_ = cache_res_mgr;  // handle shares ownership of the manager
+}
+
+template <CacheEntryRole R>
+CacheReservationManagerImpl<
+ R>::CacheReservationHandle::~CacheReservationHandle() {
+ Status s = cache_res_mgr_->ReleaseCacheReservation(incremental_memory_used_);
+ s.PermitUncheckedError();  // result deliberately ignored in the destructor
+}
+
+template <CacheEntryRole R>
+CacheReservationManagerImpl<R>::CacheReservationManagerImpl(
+ std::shared_ptr<Cache> cache, bool delayed_decrease)
+ : delayed_decrease_(delayed_decrease),
+ cache_allocated_size_(0),  // no dummy entries inserted yet
+ memory_used_(0) {
+ assert(cache != nullptr);  // dummy entries are inserted into this cache
+ cache_ = cache;
+}
+
+template <CacheEntryRole R>
+CacheReservationManagerImpl<R>::~CacheReservationManagerImpl() {
+  for (auto* dummy : dummy_handles_) {
+    cache_->Release(dummy, true);  // give back every dummy entry still held
+  }
+}
+
+template <CacheEntryRole R>
+Status CacheReservationManagerImpl<R>::UpdateCacheReservation(
+    std::size_t new_mem_used) {
+  memory_used_ = new_mem_used;
+  const std::size_t reserved =
+      cache_allocated_size_.load(std::memory_order_relaxed);
+  if (new_mem_used == reserved) {
+    return Status::OK();
+  }
+  if (new_mem_used > reserved) {
+    return IncreaseCacheReservation(new_mem_used);
+  }
+
+  // Memory usage is now below what is reserved in the cache.
+  // In delayed decrease mode, the cache reservation is not decreased until
+  // the memory usage is less than 3/4 of what we reserve in the cache.
+  // We do this because
+  // (1) Dummy entry insertion is expensive in block cache
+  // (2) Delaying the release of previously inserted dummy entries can save
+  // such expensive dummy entry insertion on memory increase in the near
+  // future, which is likely to happen when the memory usage is greater than
+  // or equal to 3/4 of what we reserve
+  const bool keep_reservation =
+      delayed_decrease_ && new_mem_used >= reserved / 4 * 3;
+  if (keep_reservation) {
+    return Status::OK();
+  }
+  return DecreaseCacheReservation(new_mem_used);
+}
+
+template <CacheEntryRole R>
+Status CacheReservationManagerImpl<R>::MakeCacheReservation(
+    std::size_t incremental_memory_used,
+    std::unique_ptr<CacheReservationManager::CacheReservationHandle>* handle) {
+  assert(handle);
+  const std::size_t new_total = GetTotalMemoryUsed() + incremental_memory_used;
+  Status update_status = UpdateCacheReservation(new_total);
+  handle->reset(new CacheReservationManagerImpl::CacheReservationHandle(
+      incremental_memory_used,
+      std::enable_shared_from_this<
+          CacheReservationManagerImpl<R>>::shared_from_this()));
+  return update_status;  // *handle is populated whether or not this is OK
+}
+
+template <CacheEntryRole R>
+Status CacheReservationManagerImpl<R>::ReleaseCacheReservation(
+    std::size_t incremental_memory_used) {
+  const std::size_t current_total = GetTotalMemoryUsed();
+  // Giving back more than is currently accounted for would wrap around in
+  // the unsigned subtraction below.
+  assert(current_total >= incremental_memory_used);
+  return UpdateCacheReservation(current_total - incremental_memory_used);
+}
+
+template <CacheEntryRole R>
+Status CacheReservationManagerImpl<R>::IncreaseCacheReservation(
+    std::size_t new_mem_used) {
+  Status insert_status = Status::OK();
+  // Add kSizeDummyEntry-sized dummy entries until the usage is covered.
+  while (cache_allocated_size_.load(std::memory_order_relaxed) < new_mem_used) {
+    Cache::Handle* dummy = nullptr;
+    insert_status = cache_->Insert(GetNextCacheKey(), nullptr, kSizeDummyEntry,
+                                   GetNoopDeleterForRole<R>(), &dummy);
+    if (insert_status != Status::OK()) {
+      // Stop at the first failure; entries inserted so far stay reserved.
+      return insert_status;
+    }
+    dummy_handles_.push_back(dummy);
+    cache_allocated_size_ += kSizeDummyEntry;
+  }
+  return insert_status;
+}
+
+template <CacheEntryRole R>
+Status CacheReservationManagerImpl<R>::DecreaseCacheReservation(
+    std::size_t new_mem_used) {
+  // Shrink the reservation to the smallest multiple of kSizeDummyEntry that
+  // is greater than or equal to new_mem_used. The loop condition adds on one
+  // side rather than testing new_mem_used <=
+  // cache_allocated_size_.load(std::memory_order_relaxed) - kSizeDummyEntry,
+  // which would underflow size_t when cache_allocated_size_ = 0.
+  while (cache_allocated_size_.load(std::memory_order_relaxed) >=
+         new_mem_used + kSizeDummyEntry) {
+    assert(!dummy_handles_.empty());
+    cache_->Release(dummy_handles_.back(), true);
+    dummy_handles_.pop_back();
+    cache_allocated_size_ -= kSizeDummyEntry;
+  }
+
+  // Nothing on this path produces an error status.
+  return Status::OK();
+}
+
+template <CacheEntryRole R>
+std::size_t CacheReservationManagerImpl<R>::GetTotalReservedCacheSize() {
+ return cache_allocated_size_.load(std::memory_order_relaxed);  // dummy-entry bytes
+}
+
+template <CacheEntryRole R>
+std::size_t CacheReservationManagerImpl<R>::GetTotalMemoryUsed() {
+ return memory_used_;  // last value given to UpdateCacheReservation (0 initially)
+}
+
+template <CacheEntryRole R>
+Slice CacheReservationManagerImpl<R>::GetNextCacheKey() {
+ // Calling this function will have the side-effect of changing the
+ // underlying cache_key_ that is shared among other keys generated from this
+ // function. Therefore please make sure the previous keys are saved/copied
+ // before calling this function.
+ cache_key_ = CacheKey::CreateUniqueForCacheLifetime(cache_.get());
+ return cache_key_.AsSlice();  // refers to cache_key_'s bytes, not a copy
+}
+
+template <CacheEntryRole R>
+Cache::DeleterFn CacheReservationManagerImpl<R>::TEST_GetNoopDeleterForRole() {
+ return GetNoopDeleterForRole<R>();  // same deleter used for the dummy entries
+}
+
+template class CacheReservationManagerImpl<
+ CacheEntryRole::kBlockBasedTableReader>;
+template class CacheReservationManagerImpl<
+ CacheEntryRole::kCompressionDictionaryBuildingBuffer>;
+template class CacheReservationManagerImpl<CacheEntryRole::kFilterConstruction>;
+template class CacheReservationManagerImpl<CacheEntryRole::kMisc>;
+template class CacheReservationManagerImpl<CacheEntryRole::kWriteBuffer>;
+template class CacheReservationManagerImpl<CacheEntryRole::kFileMetadata>;
+template class CacheReservationManagerImpl<CacheEntryRole::kBlobCache>;
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/cache_reservation_manager.h b/src/rocksdb/cache/cache_reservation_manager.h
new file mode 100644
index 000000000..147aaa915
--- /dev/null
+++ b/src/rocksdb/cache/cache_reservation_manager.h
@@ -0,0 +1,316 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <mutex>
+#include <vector>
+
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_key.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+// CacheReservationManager is the abstract interface through which callers
+// reserve cache space for the memory they use.
+class CacheReservationManager {
+ public:
+  // CacheReservationHandle manages the lifetime of a cache reservation made
+  // for an incremental amount of memory used (i.e, incremental_memory_used).
+  class CacheReservationHandle {
+   public:
+    virtual ~CacheReservationHandle() = default;
+  };
+
+  virtual ~CacheReservationManager() = default;
+
+  // Updates the reservation so that it reflects `new_memory_used` bytes of
+  // total memory used.
+  virtual Status UpdateCacheReservation(std::size_t new_memory_used) = 0;
+
+  // Delta-based variant of the update above.
+  //
+  // TODO(hx235): replace the usage of
+  // `UpdateCacheReservation(memory_used_delta, increase)` with
+  // `UpdateCacheReservation(new_memory_used)` so that we only have one
+  // `UpdateCacheReservation` function
+  virtual Status UpdateCacheReservation(std::size_t memory_used_delta,
+                                        bool increase) = 0;
+
+  // Reserves cache space for `incremental_memory_used` additional bytes and
+  // hands the handle managing that reservation's lifetime back via `handle`.
+  virtual Status MakeCacheReservation(
+      std::size_t incremental_memory_used,
+      std::unique_ptr<CacheReservationManager::CacheReservationHandle>*
+          handle) = 0;
+
+  // Accessors for the reserved cache size and the total memory used.
+  virtual std::size_t GetTotalReservedCacheSize() = 0;
+  virtual std::size_t GetTotalMemoryUsed() = 0;
+};
+
+// CacheReservationManagerImpl implements interface CacheReservationManager
+// for reserving cache space for the memory used by inserting/releasing dummy
+// entries in the cache.
+//
+// This class is NOT thread-safe, except that GetTotalReservedCacheSize()
+// can be called without external synchronization.
+template <CacheEntryRole R>
+class CacheReservationManagerImpl
+ : public CacheReservationManager,
+ public std::enable_shared_from_this<CacheReservationManagerImpl<R>> {
+ public:
+ // Handle type produced by MakeCacheReservation(). It records the
+ // incremental memory used it was created for and holds a shared_ptr to the
+ // manager; see MakeCacheReservation() for the release-on-destruction
+ // contract.
+ class CacheReservationHandle
+ : public CacheReservationManager::CacheReservationHandle {
+ public:
+ CacheReservationHandle(
+ std::size_t incremental_memory_used,
+ std::shared_ptr<CacheReservationManagerImpl> cache_res_mgr);
+ ~CacheReservationHandle() override;
+
+ private:
+ std::size_t incremental_memory_used_;
+ std::shared_ptr<CacheReservationManagerImpl> cache_res_mgr_;
+ };
+
+ // Construct a CacheReservationManagerImpl
+ // @param cache The cache where dummy entries are inserted and released for
+ // reserving cache space
+ // @param delayed_decrease If set true, then dummy entries won't be released
+ // immediately when memory usage decreases.
+ // Instead, they will be released when the memory
+ // usage decreases to 3/4 of what we have reserved so
+ // far. This is for saving some future dummy entry
+ // insertion when memory usage increases are likely to
+ // happen in the near future.
+ //
+ // REQUIRED: cache is not nullptr
+ explicit CacheReservationManagerImpl(std::shared_ptr<Cache> cache,
+ bool delayed_decrease = false);
+
+ // no copy constructor, copy assignment, move constructor, move assignment
+ CacheReservationManagerImpl(const CacheReservationManagerImpl &) = delete;
+ CacheReservationManagerImpl &operator=(const CacheReservationManagerImpl &) =
+ delete;
+ CacheReservationManagerImpl(CacheReservationManagerImpl &&) = delete;
+ CacheReservationManagerImpl &operator=(CacheReservationManagerImpl &&) =
+ delete;
+
+ ~CacheReservationManagerImpl() override;
+
+ // One of the two ways of reserving/releasing cache space,
+ // see MakeCacheReservation() for the other.
+ //
+ // Use ONLY one of these two ways to prevent unexpected behavior.
+ //
+ // Insert and release dummy entries in the cache to
+ // match the size of total dummy entries with the least multiple of
+ // kSizeDummyEntry greater than or equal to new_memory_used
+ //
+ // Insert dummy entries if new_memory_used > cache_allocated_size_;
+ //
+ // Release dummy entries if new_memory_used < cache_allocated_size_
+ // (and new_memory_used < cache_allocated_size_ * 3/4
+ // when delayed_decrease is set true);
+ //
+ // Keep dummy entries the same if (1) new_memory_used == cache_allocated_size_
+ // or (2) new_memory_used is in the interval of
+ // [cache_allocated_size_ * 3/4, cache_allocated_size_) when delayed_decrease
+ // is set true.
+ //
+ // @param new_memory_used The number of bytes used by new memory
+ // The most recent new_memory_used passed in will be returned
+ // in GetTotalMemoryUsed() even when the call returns a non-ok status.
+ //
+ // Since the class is NOT thread-safe, external synchronization on the
+ // order of calling UpdateCacheReservation() is needed if you want
+ // GetTotalMemoryUsed() to indeed return the latest memory used.
+ //
+ // @return On inserting dummy entries, it returns Status::OK() if all dummy
+ // entry insertions succeed.
+ // Otherwise, it returns the first non-ok status;
+ // On releasing dummy entries, it always returns Status::OK().
+ // On keeping dummy entries the same, it always returns Status::OK().
+ Status UpdateCacheReservation(std::size_t new_memory_used) override;
+
+ // The delta-based overload is not implemented by this class: it ignores its
+ // arguments and always returns Status::NotSupported().
+ Status UpdateCacheReservation(std::size_t /* memory_used_delta */,
+ bool /* increase */) override {
+ return Status::NotSupported();
+ }
+
+ // One of the two ways of reserving cache space and releasing is done through
+ // destruction of CacheReservationHandle.
+ // See UpdateCacheReservation() for the other way.
+ //
+ // Use ONLY one of these two ways to prevent unexpected behavior.
+ //
+ // Insert dummy entries in the cache for the incremental memory usage
+ // to match the size of total dummy entries with the least multiple of
+ // kSizeDummyEntry greater than or equal to the total memory used.
+ //
+ // A CacheReservationHandle is returned as an output parameter.
+ // The reserved dummy entries are automatically released on the destruction of
+ // this handle, which achieves better RAII per cache reservation.
+ //
+ // WARNING: Deallocate all the handles of the CacheReservationManager object
+ // before deallocating the object to prevent unexpected behavior.
+ //
+ // @param incremental_memory_used The number of bytes increased in memory
+ // usage.
+ //
+ // Calling GetTotalMemoryUsed() afterward will return the total memory
+ // increased by this number, even when calling MakeCacheReservation()
+ // returns non-ok status.
+ //
+ // Since the class is NOT thread-safe, external synchronization in
+ // calling MakeCacheReservation() is needed if you want
+ // GetTotalMemoryUsed() to indeed return the latest memory used.
+ //
+ // @param handle A pointer to std::unique_ptr<CacheReservationHandle> that
+ // manages the lifetime of the cache reservation represented by the
+ // handle.
+ //
+ // @return It returns Status::OK() if all dummy
+ // entry insertions succeed.
+ // Otherwise, it returns the first non-ok status;
+ //
+ // REQUIRES: handle != nullptr
+ Status MakeCacheReservation(
+ std::size_t incremental_memory_used,
+ std::unique_ptr<CacheReservationManager::CacheReservationHandle> *handle)
+ override;
+
+ // Return the size of the cache (which is a multiple of kSizeDummyEntry)
+ // successfully reserved by calling UpdateCacheReservation().
+ //
+ // When UpdateCacheReservation() returns non-ok status,
+ // calling GetTotalReservedCacheSize() after that might return a slightly
+ // smaller number than the actual reserved cache size, because
+ // the returned number will always be a multiple of kSizeDummyEntry
+ // and cache full might happen in the middle of inserting a dummy entry.
+ std::size_t GetTotalReservedCacheSize() override;
+
+ // Return the latest total memory used indicated by the most recent call of
+ // UpdateCacheReservation(std::size_t new_memory_used);
+ std::size_t GetTotalMemoryUsed() override;
+
+ // Public, compile-time accessor for the private kSizeDummyEntry constant.
+ static constexpr std::size_t GetDummyEntrySize() { return kSizeDummyEntry; }
+
+ // For testing only - it is to help ensure the NoopDeleterForRole<R>
+ // accessed from CacheReservationManagerImpl and the one accessed from the
+ // test are from the same translation units
+ static Cache::DeleterFn TEST_GetNoopDeleterForRole();
+
+ private:
+ // Size in bytes of a single dummy entry (256 KiB); reservations are made in
+ // multiples of this size.
+ static constexpr std::size_t kSizeDummyEntry = 256 * 1024;
+
+ // Regenerates cache_key_ and returns a Slice of it; see the definition for
+ // the caveat about previously returned keys.
+ Slice GetNextCacheKey();
+
+ Status ReleaseCacheReservation(std::size_t incremental_memory_used);
+ Status IncreaseCacheReservation(std::size_t new_mem_used);
+ Status DecreaseCacheReservation(std::size_t new_mem_used);
+
+ // The cache where dummy entries are inserted and released.
+ std::shared_ptr<Cache> cache_;
+ // See the constructor's delayed_decrease parameter.
+ bool delayed_decrease_;
+ // Value reported by GetTotalReservedCacheSize().
+ std::atomic<std::size_t> cache_allocated_size_;
+ // Value reported by GetTotalMemoryUsed().
+ std::size_t memory_used_;
+ // Handles of the dummy entries currently held; entries are released from the
+ // back of this vector when the reservation decreases.
+ std::vector<Cache::Handle *> dummy_handles_;
+ // Storage for the key most recently produced by GetNextCacheKey().
+ CacheKey cache_key_;
+};
+
+// ConcurrentCacheReservationManager wraps another CacheReservationManager and
+// serializes calls into it with an internal mutex (cache_res_mgr_mu_).
+//
+// Every call except GetTotalReservedCacheSize() takes the mutex before
+// touching the wrapped manager; the handles returned by MakeCacheReservation()
+// take the same mutex while releasing the wrapped handle.
+//
+// MakeCacheReservation() calls shared_from_this(), so an instance must be
+// owned by a std::shared_ptr before that method is used.
+class ConcurrentCacheReservationManager
+    : public CacheReservationManager,
+      public std::enable_shared_from_this<ConcurrentCacheReservationManager> {
+ public:
+  // Handle returned by MakeCacheReservation(). It owns the wrapped manager's
+  // handle and shares ownership of the ConcurrentCacheReservationManager. On
+  // destruction it releases the wrapped handle while holding the manager's
+  // mutex.
+  class CacheReservationHandle
+      : public CacheReservationManager::CacheReservationHandle {
+   public:
+    // REQUIRES: cache_res_mgr and cache_res_handle are both non-null.
+    CacheReservationHandle(
+        std::shared_ptr<ConcurrentCacheReservationManager> cache_res_mgr,
+        std::unique_ptr<CacheReservationManager::CacheReservationHandle>
+            cache_res_handle)
+        : cache_res_mgr_(std::move(cache_res_mgr)),
+          cache_res_handle_(std::move(cache_res_handle)) {
+      // Both parameters were moved into the members above, so the
+      // precondition is checked on the members.
+      assert(cache_res_mgr_ && cache_res_handle_);
+    }
+
+    ~CacheReservationHandle() override {
+      // Destroy the wrapped handle under the manager's mutex. The lock is
+      // dropped at the end of this body, before cache_res_mgr_ (and possibly
+      // the manager that owns the mutex) is destroyed.
+      std::lock_guard<std::mutex> lock(cache_res_mgr_->cache_res_mgr_mu_);
+      cache_res_handle_.reset();
+    }
+
+   private:
+    std::shared_ptr<ConcurrentCacheReservationManager> cache_res_mgr_;
+    std::unique_ptr<CacheReservationManager::CacheReservationHandle>
+        cache_res_handle_;
+  };
+
+  // @param cache_res_mgr The manager that all calls are forwarded to.
+  explicit ConcurrentCacheReservationManager(
+      std::shared_ptr<CacheReservationManager> cache_res_mgr)
+      : cache_res_mgr_(std::move(cache_res_mgr)) {}
+
+  // no copy constructor, copy assignment, move constructor, move assignment
+  ConcurrentCacheReservationManager(const ConcurrentCacheReservationManager &) =
+      delete;
+  ConcurrentCacheReservationManager &operator=(
+      const ConcurrentCacheReservationManager &) = delete;
+  ConcurrentCacheReservationManager(ConcurrentCacheReservationManager &&) =
+      delete;
+  ConcurrentCacheReservationManager &operator=(
+      ConcurrentCacheReservationManager &&) = delete;
+
+  ~ConcurrentCacheReservationManager() override = default;
+
+  // Forwards to the wrapped manager's UpdateCacheReservation(new_memory_used)
+  // while holding the mutex, and returns its status.
+  inline Status UpdateCacheReservation(std::size_t new_memory_used) override {
+    std::lock_guard<std::mutex> lock(cache_res_mgr_mu_);
+    return cache_res_mgr_->UpdateCacheReservation(new_memory_used);
+  }
+
+  // Adjusts the wrapped manager's total memory used by `memory_used_delta`,
+  // upwards when `increase` is true and downwards otherwise. The read of the
+  // current total and the update happen under one lock acquisition.
+  //
+  // REQUIRES: when decreasing, memory_used_delta <= current total memory used.
+  // The requirement is asserted; if it is violated in a build without
+  // assertions, the new total is clamped to 0 rather than wrapping around.
+  //
+  // @return The status of the wrapped manager's UpdateCacheReservation().
+  inline Status UpdateCacheReservation(std::size_t memory_used_delta,
+                                       bool increase) override {
+    std::lock_guard<std::mutex> lock(cache_res_mgr_mu_);
+    const std::size_t total_mem_used = cache_res_mgr_->GetTotalMemoryUsed();
+    if (increase) {
+      return cache_res_mgr_->UpdateCacheReservation(total_mem_used +
+                                                    memory_used_delta);
+    }
+    assert(total_mem_used >= memory_used_delta);
+    // std::size_t is unsigned: subtracting a larger delta would wrap to a
+    // huge value and request an enormous reservation.
+    const std::size_t new_mem_used = total_mem_used >= memory_used_delta
+                                         ? total_mem_used - memory_used_delta
+                                         : 0;
+    return cache_res_mgr_->UpdateCacheReservation(new_mem_used);
+  }
+
+  // Makes a reservation through the wrapped manager (under the mutex) and
+  // stores in *handle a handle that wraps the one the wrapped manager
+  // produced. *handle is assigned regardless of the returned status.
+  //
+  // REQUIRES: handle != nullptr
+  // REQUIRES: the wrapped manager produces a non-null handle (asserted in the
+  // handle's constructor)
+  //
+  // @return The status of the wrapped manager's MakeCacheReservation().
+  inline Status MakeCacheReservation(
+      std::size_t incremental_memory_used,
+      std::unique_ptr<CacheReservationManager::CacheReservationHandle> *handle)
+      override {
+    assert(handle != nullptr);
+    std::unique_ptr<CacheReservationManager::CacheReservationHandle>
+        wrapped_handle;
+    Status s;
+    {
+      std::lock_guard<std::mutex> lock(cache_res_mgr_mu_);
+      s = cache_res_mgr_->MakeCacheReservation(incremental_memory_used,
+                                               &wrapped_handle);
+    }
+    // The outer handle is created outside the critical section.
+    handle->reset(new ConcurrentCacheReservationManager::CacheReservationHandle(
+        shared_from_this(), std::move(wrapped_handle)));
+    return s;
+  }
+
+  // Forwarded without taking the mutex.
+  // NOTE(review): this relies on the wrapped manager's
+  // GetTotalReservedCacheSize() being callable without external
+  // synchronization, as CacheReservationManagerImpl documents - confirm for
+  // other CacheReservationManager implementations.
+  inline std::size_t GetTotalReservedCacheSize() override {
+    return cache_res_mgr_->GetTotalReservedCacheSize();
+  }
+
+  // Returns the wrapped manager's total memory used, read under the mutex.
+  inline std::size_t GetTotalMemoryUsed() override {
+    std::lock_guard<std::mutex> lock(cache_res_mgr_mu_);
+    return cache_res_mgr_->GetTotalMemoryUsed();
+  }
+
+ private:
+  // Guards every call into cache_res_mgr_ except GetTotalReservedCacheSize().
+  std::mutex cache_res_mgr_mu_;
+  // The wrapped manager that performs the actual reservation.
+  std::shared_ptr<CacheReservationManager> cache_res_mgr_;
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/cache_reservation_manager_test.cc b/src/rocksdb/cache/cache_reservation_manager_test.cc
new file mode 100644
index 000000000..2a0c318e0
--- /dev/null
+++ b/src/rocksdb/cache/cache_reservation_manager_test.cc
@@ -0,0 +1,469 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "cache/cache_reservation_manager.h"
+
+#include <cstddef>
+#include <cstring>
+#include <memory>
+
+#include "cache/cache_entry_roles.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/slice.h"
+#include "test_util/testharness.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Fixture shared by the TEST_F cases in this file: an LRU cache created with
+// kCacheCapacity and kNumShardBits, plus a kMisc CacheReservationManagerImpl
+// bound to that cache.
+class CacheReservationManagerTest : public ::testing::Test {
+ protected:
+  static constexpr std::size_t kSizeDummyEntry =
+      CacheReservationManagerImpl<CacheEntryRole::kMisc>::GetDummyEntrySize();
+  static constexpr std::size_t kCacheCapacity = 4096 * kSizeDummyEntry;
+  static constexpr int kNumShardBits = 0;  // 2^0 shard
+  static constexpr std::size_t kMetaDataChargeOverhead = 10000;
+
+  // `cache` is declared first, so it is already constructed when the
+  // constructor below builds the manager on top of it.
+  std::shared_ptr<Cache> cache = NewLRUCache(kCacheCapacity, kNumShardBits);
+  std::shared_ptr<CacheReservationManager> test_cache_rev_mng;
+
+  CacheReservationManagerTest()
+      : test_cache_rev_mng(
+            std::make_shared<
+                CacheReservationManagerImpl<CacheEntryRole::kMisc>>(cache)) {}
+};
+
+// Verifies that the dummy entry inserted by UpdateCacheReservation() can be
+// looked up in the cache under the key this test reconstructs from the next
+// unique CacheKey. The reconstruction relies on CacheKey implementation
+// details, as the inline comments note.
+TEST_F(CacheReservationManagerTest, GenerateCacheKey) {
+ std::size_t new_mem_used = 1 * kSizeDummyEntry;
+ Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+ ASSERT_EQ(s, Status::OK());
+ ASSERT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry);
+ ASSERT_LT(cache->GetPinnedUsage(),
+ 1 * kSizeDummyEntry + kMetaDataChargeOverhead);
+
+ // Next unique Cache key
+ CacheKey ckey = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+ // Get to the underlying values
+ // NOTE(review): treats the CacheKey object as an array of uint64_t; this
+ // assumes CacheKey's in-memory layout - confirm against cache_key.h.
+ uint64_t* ckey_data = reinterpret_cast<uint64_t*>(&ckey);
+ // Back it up to the one used by CRM (using CacheKey implementation details)
+ ckey_data[1]--;
+
+ // Specific key (subject to implementation details)
+ EXPECT_EQ(ckey_data[0], 0);
+ EXPECT_EQ(ckey_data[1], 2);
+
+ Cache::Handle* handle = cache->Lookup(ckey.AsSlice());
+ EXPECT_NE(handle, nullptr)
+ << "Failed to generate the cache key for the dummy entry correctly";
+ // Clean up the returned handle from Lookup() to prevent memory leak
+ cache->Release(handle);
+}
+
+// Calls UpdateCacheReservation() twice with the same value and expects the
+// second call to leave the reserved size, the recorded memory used and the
+// cache's pinned usage unchanged.
+TEST_F(CacheReservationManagerTest, KeepCacheReservationTheSame) {
+ // First call: establish a reservation of exactly one dummy entry.
+ std::size_t new_mem_used = 1 * kSizeDummyEntry;
+ Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+ ASSERT_EQ(s, Status::OK());
+ ASSERT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
+ 1 * kSizeDummyEntry);
+ ASSERT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used);
+ std::size_t initial_pinned_usage = cache->GetPinnedUsage();
+ ASSERT_GE(initial_pinned_usage, 1 * kSizeDummyEntry);
+ ASSERT_LT(initial_pinned_usage,
+ 1 * kSizeDummyEntry + kMetaDataChargeOverhead);
+
+ // Second call with the same value: nothing should change.
+ s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+ EXPECT_EQ(s, Status::OK())
+ << "Failed to keep cache reservation the same when new_mem_used equals "
+ "to current cache reservation";
+ EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
+ 1 * kSizeDummyEntry)
+ << "Failed to bookkeep correctly when new_mem_used equals to current "
+ "cache reservation";
+ EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
+ << "Failed to bookkeep the used memory correctly when new_mem_used "
+ "equals to current cache reservation";
+ EXPECT_EQ(cache->GetPinnedUsage(), initial_pinned_usage)
+ << "Failed to keep underlying dummy entries the same when new_mem_used "
+ "equals to current cache reservation";
+}
+
+// Requests a memory usage that is an exact multiple (2x) of the dummy entry
+// size and expects exactly that much to be reserved and pinned in the cache
+// (pinned usage may exceed it by less than kMetaDataChargeOverhead).
+TEST_F(CacheReservationManagerTest,
+ IncreaseCacheReservationByMultiplesOfDummyEntrySize) {
+ std::size_t new_mem_used = 2 * kSizeDummyEntry;
+ Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+ EXPECT_EQ(s, Status::OK())
+ << "Failed to increase cache reservation correctly";
+ EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
+ 2 * kSizeDummyEntry)
+ << "Failed to bookkeep cache reservation increase correctly";
+ EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
+ << "Failed to bookkeep the used memory correctly";
+ EXPECT_GE(cache->GetPinnedUsage(), 2 * kSizeDummyEntry)
+ << "Failed to increase underlying dummy entries in cache correctly";
+ EXPECT_LT(cache->GetPinnedUsage(),
+ 2 * kSizeDummyEntry + kMetaDataChargeOverhead)
+ << "Failed to increase underlying dummy entries in cache correctly";
+}
+
+// Requests 2.5 dummy entries' worth of memory and expects the reservation to
+// be rounded up to 3 dummy entries, while GetTotalMemoryUsed() still reports
+// the exact value passed in.
+TEST_F(CacheReservationManagerTest,
+ IncreaseCacheReservationNotByMultiplesOfDummyEntrySize) {
+ std::size_t new_mem_used = 2 * kSizeDummyEntry + kSizeDummyEntry / 2;
+ Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+ EXPECT_EQ(s, Status::OK())
+ << "Failed to increase cache reservation correctly";
+ EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
+ 3 * kSizeDummyEntry)
+ << "Failed to bookkeep cache reservation increase correctly";
+ EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
+ << "Failed to bookkeep the used memory correctly";
+ EXPECT_GE(cache->GetPinnedUsage(), 3 * kSizeDummyEntry)
+ << "Failed to increase underlying dummy entries in cache correctly";
+ EXPECT_LT(cache->GetPinnedUsage(),
+ 3 * kSizeDummyEntry + kMetaDataChargeOverhead)
+ << "Failed to increase underlying dummy entries in cache correctly";
+}
+
+// Exercises UpdateCacheReservation() against a small LRU cache with
+// strict_capacity_limit set, in four phases:
+//   1. request more than the cache capacity and expect Status::MemoryLimit();
+//   2. decrease to half the capacity and expect success;
+//   3. request more than the capacity again and expect Status::MemoryLimit();
+//   4. enlarge the cache and expect the same request to succeed.
+// In every phase GetTotalMemoryUsed() is expected to report the value most
+// recently passed in, even when the call failed.
+TEST(CacheReservationManagerIncreaseReservcationOnFullCacheTest,
+     IncreaseCacheReservationOnFullCache) {
+  constexpr std::size_t kSizeDummyEntry =
+      CacheReservationManagerImpl<CacheEntryRole::kMisc>::GetDummyEntrySize();
+  constexpr std::size_t kSmallCacheCapacity = 4 * kSizeDummyEntry;
+  constexpr std::size_t kBigCacheCapacity = 4096 * kSizeDummyEntry;
+  constexpr std::size_t kMetaDataChargeOverhead = 10000;
+
+  LRUCacheOptions lo;
+  lo.capacity = kSmallCacheCapacity;
+  lo.num_shard_bits = 0;  // 2^0 shard
+  lo.strict_capacity_limit = true;
+  std::shared_ptr<Cache> cache = NewLRUCache(lo);
+  std::shared_ptr<CacheReservationManager> test_cache_rev_mng =
+      std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>(
+          cache);
+
+  // Phase 1: ask for one byte more than the cache can hold.
+  std::size_t new_mem_used = kSmallCacheCapacity + 1;
+  Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+  EXPECT_EQ(s, Status::MemoryLimit())
+      << "Failed to return status to indicate failure of dummy entry insertion "
+         "during cache reservation on full cache";
+  EXPECT_GE(test_cache_rev_mng->GetTotalReservedCacheSize(),
+            1 * kSizeDummyEntry)
+      << "Failed to bookkeep correctly before cache resevation failure happens "
+         "due to full cache";
+  EXPECT_LE(test_cache_rev_mng->GetTotalReservedCacheSize(),
+            kSmallCacheCapacity)
+      << "Failed to bookkeep correctly (i.e, bookkeep only successful dummy "
+         "entry insertions) when encountering cache resevation failure due to "
+         "full cache";
+  EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
+      << "Failed to bookkeep the used memory correctly";
+  EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry)
+      << "Failed to insert underlying dummy entries correctly when "
+         "encountering cache resevation failure due to full cache";
+  EXPECT_LE(cache->GetPinnedUsage(), kSmallCacheCapacity)
+      << "Failed to insert underlying dummy entries correctly when "
+         "encountering cache resevation failure due to full cache";
+
+  // Phase 2: decreasing after a failed increase must still work.
+  new_mem_used = kSmallCacheCapacity / 2;  // 2 dummy entries
+  s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+  EXPECT_EQ(s, Status::OK())
+      << "Failed to decrease cache reservation after encountering cache "
+         "reservation failure due to full cache";
+  EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
+            2 * kSizeDummyEntry)
+      << "Failed to bookkeep cache reservation decrease correctly after "
+         "encountering cache reservation due to full cache";
+  EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
+      << "Failed to bookkeep the used memory correctly";
+  EXPECT_GE(cache->GetPinnedUsage(), 2 * kSizeDummyEntry)
+      << "Failed to release underlying dummy entries correctly on cache "
+         "reservation decrease after encountering cache resevation failure due "
+         "to full cache";
+  EXPECT_LT(cache->GetPinnedUsage(),
+            2 * kSizeDummyEntry + kMetaDataChargeOverhead)
+      << "Failed to release underlying dummy entries correctly on cache "
+         "reservation decrease after encountering cache resevation failure due "
+         "to full cache";
+
+  // Phase 3: create cache full again for subsequent tests
+  new_mem_used = kSmallCacheCapacity + 1;
+  s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+  EXPECT_EQ(s, Status::MemoryLimit())
+      << "Failed to return status to indicate failure of dummy entry insertion "
+         "during cache reservation on full cache";
+  EXPECT_GE(test_cache_rev_mng->GetTotalReservedCacheSize(),
+            1 * kSizeDummyEntry)
+      << "Failed to bookkeep correctly before cache resevation failure happens "
+         "due to full cache";
+  EXPECT_LE(test_cache_rev_mng->GetTotalReservedCacheSize(),
+            kSmallCacheCapacity)
+      << "Failed to bookkeep correctly (i.e, bookkeep only successful dummy "
+         "entry insertions) when encountering cache resevation failure due to "
+         "full cache";
+  EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
+      << "Failed to bookkeep the used memory correctly";
+  EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry)
+      << "Failed to insert underlying dummy entries correctly when "
+         "encountering cache resevation failure due to full cache";
+  EXPECT_LE(cache->GetPinnedUsage(), kSmallCacheCapacity)
+      << "Failed to insert underlying dummy entries correctly when "
+         "encountering cache resevation failure due to full cache";
+
+  // Phase 4: increase cache capacity so the previously failed insertion can
+  // fully succeed
+  cache->SetCapacity(kBigCacheCapacity);
+  new_mem_used = kSmallCacheCapacity + 1;
+  s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+  EXPECT_EQ(s, Status::OK())
+      << "Failed to increase cache reservation after increasing cache capacity "
+         "and mitigating cache full error";
+  EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
+            5 * kSizeDummyEntry)
+      << "Failed to bookkeep cache reservation increase correctly after "
+         "increasing cache capacity and mitigating cache full error";
+  EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
+      << "Failed to bookkeep the used memory correctly";
+  EXPECT_GE(cache->GetPinnedUsage(), 5 * kSizeDummyEntry)
+      << "Failed to insert underlying dummy entries correctly after increasing "
+         "cache capacity and mitigating cache full error";
+  EXPECT_LT(cache->GetPinnedUsage(),
+            5 * kSizeDummyEntry + kMetaDataChargeOverhead)
+      << "Failed to insert underlying dummy entries correctly after increasing "
+         "cache capacity and mitigating cache full error";
+}
+
+// Reserves 2 dummy entries' worth of memory, then lowers the usage to exactly
+// 1 dummy entry and expects the reservation and pinned usage to shrink to
+// match.
+TEST_F(CacheReservationManagerTest,
+ DecreaseCacheReservationByMultiplesOfDummyEntrySize) {
+ // Setup: start from a reservation of two dummy entries.
+ std::size_t new_mem_used = 2 * kSizeDummyEntry;
+ Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+ ASSERT_EQ(s, Status::OK());
+ ASSERT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
+ 2 * kSizeDummyEntry);
+ ASSERT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used);
+ ASSERT_GE(cache->GetPinnedUsage(), 2 * kSizeDummyEntry);
+ ASSERT_LT(cache->GetPinnedUsage(),
+ 2 * kSizeDummyEntry + kMetaDataChargeOverhead);
+
+ // Decrease by exactly one dummy entry.
+ new_mem_used = 1 * kSizeDummyEntry;
+ s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+ EXPECT_EQ(s, Status::OK())
+ << "Failed to decrease cache reservation correctly";
+ EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
+ 1 * kSizeDummyEntry)
+ << "Failed to bookkeep cache reservation decrease correctly";
+ EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
+ << "Failed to bookkeep the used memory correctly";
+ EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry)
+ << "Failed to decrease underlying dummy entries in cache correctly";
+ EXPECT_LT(cache->GetPinnedUsage(),
+ 1 * kSizeDummyEntry + kMetaDataChargeOverhead)
+ << "Failed to decrease underlying dummy entries in cache correctly";
+}
+
+// Reserves 2 dummy entries' worth of memory, then lowers the usage to half a
+// dummy entry and expects the reservation to settle at 1 dummy entry (the
+// least multiple of the dummy entry size covering the usage).
+TEST_F(CacheReservationManagerTest,
+ DecreaseCacheReservationNotByMultiplesOfDummyEntrySize) {
+ // Setup: start from a reservation of two dummy entries.
+ std::size_t new_mem_used = 2 * kSizeDummyEntry;
+ Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+ ASSERT_EQ(s, Status::OK());
+ ASSERT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
+ 2 * kSizeDummyEntry);
+ ASSERT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used);
+ ASSERT_GE(cache->GetPinnedUsage(), 2 * kSizeDummyEntry);
+ ASSERT_LT(cache->GetPinnedUsage(),
+ 2 * kSizeDummyEntry + kMetaDataChargeOverhead);
+
+ // Decrease to a value that is not a multiple of the dummy entry size.
+ new_mem_used = kSizeDummyEntry / 2;
+ s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+ EXPECT_EQ(s, Status::OK())
+ << "Failed to decrease cache reservation correctly";
+ EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
+ 1 * kSizeDummyEntry)
+ << "Failed to bookkeep cache reservation decrease correctly";
+ EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
+ << "Failed to bookkeep the used memory correctly";
+ EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry)
+ << "Failed to decrease underlying dummy entries in cache correctly";
+ EXPECT_LT(cache->GetPinnedUsage(),
+ 1 * kSizeDummyEntry + kMetaDataChargeOverhead)
+ << "Failed to decrease underlying dummy entries in cache correctly";
+}
+
+// Exercises a manager constructed with delayed_decrease = true. Starting from
+// 8 reserved dummy entries, it expects:
+//   - usage of 6 entries (exactly 3/4 of the reservation): nothing released;
+//   - usage of 7 entries: nothing released or inserted;
+//   - usage of 6 entries minus one byte (below 3/4): reservation drops to 6.
+TEST(CacheReservationManagerWithDelayedDecreaseTest,
+ DecreaseCacheReservationWithDelayedDecrease) {
+ constexpr std::size_t kSizeDummyEntry =
+ CacheReservationManagerImpl<CacheEntryRole::kMisc>::GetDummyEntrySize();
+ constexpr std::size_t kCacheCapacity = 4096 * kSizeDummyEntry;
+ constexpr std::size_t kMetaDataChargeOverhead = 10000;
+
+ LRUCacheOptions lo;
+ lo.capacity = kCacheCapacity;
+ lo.num_shard_bits = 0;
+ std::shared_ptr<Cache> cache = NewLRUCache(lo);
+ std::shared_ptr<CacheReservationManager> test_cache_rev_mng =
+ std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>(
+ cache, true /* delayed_decrease */);
+
+ // Setup: reserve 8 dummy entries and remember the resulting pinned usage.
+ std::size_t new_mem_used = 8 * kSizeDummyEntry;
+ Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+ ASSERT_EQ(s, Status::OK());
+ ASSERT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
+ 8 * kSizeDummyEntry);
+ ASSERT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used);
+ std::size_t initial_pinned_usage = cache->GetPinnedUsage();
+ ASSERT_GE(initial_pinned_usage, 8 * kSizeDummyEntry);
+ ASSERT_LT(initial_pinned_usage,
+ 8 * kSizeDummyEntry + kMetaDataChargeOverhead);
+
+ // Usage equal to 3/4 of the reservation: the release is delayed.
+ new_mem_used = 6 * kSizeDummyEntry;
+ s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+ EXPECT_EQ(s, Status::OK()) << "Failed to delay decreasing cache reservation";
+ EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
+ 8 * kSizeDummyEntry)
+ << "Failed to bookkeep correctly when delaying cache reservation "
+ "decrease";
+ EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
+ << "Failed to bookkeep the used memory correctly";
+ EXPECT_EQ(cache->GetPinnedUsage(), initial_pinned_usage)
+ << "Failed to delay decreasing underlying dummy entries in cache";
+
+ // Usage back up to 7 entries, still within the existing reservation.
+ new_mem_used = 7 * kSizeDummyEntry;
+ s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+ EXPECT_EQ(s, Status::OK()) << "Failed to delay decreasing cache reservation";
+ EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
+ 8 * kSizeDummyEntry)
+ << "Failed to bookkeep correctly when delaying cache reservation "
+ "decrease";
+ EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
+ << "Failed to bookkeep the used memory correctly";
+ EXPECT_EQ(cache->GetPinnedUsage(), initial_pinned_usage)
+ << "Failed to delay decreasing underlying dummy entries in cache";
+
+ // One byte below 3/4 of the reservation: dummy entries are now released.
+ new_mem_used = 6 * kSizeDummyEntry - 1;
+ s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+ EXPECT_EQ(s, Status::OK())
+ << "Failed to decrease cache reservation correctly when new_mem_used < "
+ "GetTotalReservedCacheSize() * 3 / 4 on delayed decrease mode";
+ EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
+ 6 * kSizeDummyEntry)
+ << "Failed to bookkeep correctly when new_mem_used < "
+ "GetTotalReservedCacheSize() * 3 / 4 on delayed decrease mode";
+ EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
+ << "Failed to bookkeep the used memory correctly";
+ EXPECT_GE(cache->GetPinnedUsage(), 6 * kSizeDummyEntry)
+ << "Failed to decrease underlying dummy entries in cache when "
+ "new_mem_used < GetTotalReservedCacheSize() * 3 / 4 on delayed "
+ "decrease mode";
+ EXPECT_LT(cache->GetPinnedUsage(),
+ 6 * kSizeDummyEntry + kMetaDataChargeOverhead)
+ << "Failed to decrease underlying dummy entries in cache when "
+ "new_mem_used < GetTotalReservedCacheSize() * 3 / 4 on delayed "
+ "decrease mode";
+}
+
+// Creates a manager in an inner scope, reserves one dummy entry, and expects
+// the cache's pinned usage to drop back to zero once the manager goes out of
+// scope and is destroyed.
+TEST(CacheReservationManagerDestructorTest,
+ ReleaseRemainingDummyEntriesOnDestruction) {
+ constexpr std::size_t kSizeDummyEntry =
+ CacheReservationManagerImpl<CacheEntryRole::kMisc>::GetDummyEntrySize();
+ constexpr std::size_t kCacheCapacity = 4096 * kSizeDummyEntry;
+ constexpr std::size_t kMetaDataChargeOverhead = 10000;
+
+ LRUCacheOptions lo;
+ lo.capacity = kCacheCapacity;
+ lo.num_shard_bits = 0;
+ std::shared_ptr<Cache> cache = NewLRUCache(lo);
+ {
+ // The only shared_ptr to the manager lives in this scope, so leaving the
+ // scope destroys the manager while the cache itself stays alive.
+ std::shared_ptr<CacheReservationManager> test_cache_rev_mng =
+ std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>(
+ cache);
+ std::size_t new_mem_used = 1 * kSizeDummyEntry;
+ Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+ ASSERT_EQ(s, Status::OK());
+ ASSERT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry);
+ ASSERT_LT(cache->GetPinnedUsage(),
+ 1 * kSizeDummyEntry + kMetaDataChargeOverhead);
+ }
+ EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry)
+ << "Failed to release remaining underlying dummy entries in cache in "
+ "CacheReservationManager's destructor";
+}
+
+// Exercises the handle-based reservation API:
+//   - two consecutive MakeCacheReservation() calls each return a handle and
+//     grow the reservation, the recorded memory used and the pinned usage;
+//   - destroying a handle releases exactly the memory it was created for;
+//   - dropping the test's own pointer to the manager while a handle is still
+//     alive leaves the remaining reservation pinned in the cache;
+//   - destroying the last handle brings the pinned usage back to zero.
+TEST(CacheReservationHandleTest, HandleTest) {
+  constexpr std::size_t kOneGigabyte = 1024 * 1024 * 1024;
+  // Take the dummy entry size from the manager instead of repeating its
+  // value, as the other tests in this file do.
+  constexpr std::size_t kSizeDummyEntry =
+      CacheReservationManagerImpl<CacheEntryRole::kMisc>::GetDummyEntrySize();
+  constexpr std::size_t kMetaDataChargeOverhead = 10000;
+
+  LRUCacheOptions lo;
+  lo.capacity = kOneGigabyte;
+  lo.num_shard_bits = 0;
+  std::shared_ptr<Cache> cache = NewLRUCache(lo);
+
+  std::shared_ptr<CacheReservationManager> test_cache_rev_mng(
+      std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>(
+          cache));
+
+  std::size_t mem_used = 0;
+  const std::size_t incremental_mem_used_handle_1 = 1 * kSizeDummyEntry;
+  const std::size_t incremental_mem_used_handle_2 = 2 * kSizeDummyEntry;
+  std::unique_ptr<CacheReservationManager::CacheReservationHandle> handle_1,
+      handle_2;
+
+  // To test consecutive CacheReservationManager::MakeCacheReservation works
+  // correctly in terms of returning the handle as well as updating cache
+  // reservation and the latest total memory used
+  Status s = test_cache_rev_mng->MakeCacheReservation(
+      incremental_mem_used_handle_1, &handle_1);
+  mem_used = mem_used + incremental_mem_used_handle_1;
+  ASSERT_EQ(s, Status::OK());
+  EXPECT_TRUE(handle_1 != nullptr);
+  EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), mem_used);
+  EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), mem_used);
+  EXPECT_GE(cache->GetPinnedUsage(), mem_used);
+  EXPECT_LT(cache->GetPinnedUsage(), mem_used + kMetaDataChargeOverhead);
+
+  s = test_cache_rev_mng->MakeCacheReservation(incremental_mem_used_handle_2,
+                                               &handle_2);
+  mem_used = mem_used + incremental_mem_used_handle_2;
+  ASSERT_EQ(s, Status::OK());
+  EXPECT_TRUE(handle_2 != nullptr);
+  EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), mem_used);
+  EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), mem_used);
+  EXPECT_GE(cache->GetPinnedUsage(), mem_used);
+  EXPECT_LT(cache->GetPinnedUsage(), mem_used + kMetaDataChargeOverhead);
+
+  // To test
+  // CacheReservationManager::CacheReservationHandle::~CacheReservationHandle()
+  // works correctly in releasing the cache reserved for the handle
+  handle_1.reset();
+  EXPECT_TRUE(handle_1 == nullptr);
+  mem_used = mem_used - incremental_mem_used_handle_1;
+  EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), mem_used);
+  EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), mem_used);
+  EXPECT_GE(cache->GetPinnedUsage(), mem_used);
+  EXPECT_LT(cache->GetPinnedUsage(), mem_used + kMetaDataChargeOverhead);
+
+  // To test the actual CacheReservationManager object won't be deallocated
+  // as long as there remain handles pointing to it.
+  // We strongly recommend deallocating CacheReservationManager object only
+  // after all its handles are deallocated to keep things easy to reason about
+  test_cache_rev_mng.reset();
+  EXPECT_GE(cache->GetPinnedUsage(), mem_used);
+  EXPECT_LT(cache->GetPinnedUsage(), mem_used + kMetaDataChargeOverhead);
+
+  handle_2.reset();
+  // The CacheReservationManager object is now deallocated since all the
+  // handles and its original pointer are gone
+  mem_used = mem_used - incremental_mem_used_handle_2;
+  EXPECT_EQ(mem_used, 0);
+  EXPECT_EQ(cache->GetPinnedUsage(), mem_used);
+}
+} // namespace ROCKSDB_NAMESPACE
+
+// Test binary entry point: installs the stack trace handler, lets GoogleTest
+// consume its command-line flags, then runs every registered test and uses
+// the result as the process exit code.
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  const int exit_code = RUN_ALL_TESTS();
+  return exit_code;
+}
diff --git a/src/rocksdb/cache/cache_test.cc b/src/rocksdb/cache/cache_test.cc
new file mode 100644
index 000000000..212d65d96
--- /dev/null
+++ b/src/rocksdb/cache/cache_test.cc
@@ -0,0 +1,1037 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/cache.h"
+
+#include <algorithm>
+#include <forward_list>
+#include <functional>
+#include <iostream>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "cache/lru_cache.h"
+#include "port/stack_trace.h"
+#include "test_util/testharness.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+// HyperClockCache only supports 16-byte keys, so some of the tests
+// originally written for LRUCache do not work on the other caches.
+// Those tests were adapted to use 16-byte keys. We kept the original ones.
+// TODO: Remove the original tests if they ever become unused.
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// Conversions between numeric keys/values and the types expected by Cache.
+// Encodes `k` as a 16-byte key: the 4 bytes written by PutFixed32 followed by
+// 12 bytes of 'a' padding.
+std::string EncodeKey16Bytes(int k) {
+  std::string encoded;
+  PutFixed32(&encoded, k);
+  // Pad up to the 16-byte key length needed by HyperClockCache.
+  encoded.append(12, 'a');
+  return encoded;
+}
+
+// Inverse of EncodeKey16Bytes(): reads the integer back from a 16-byte key.
+int DecodeKey16Bytes(const Slice& k) {
+  assert(k.size() == 16);
+  // Only the leading 4 bytes carry the integer; the remainder is padding.
+  const char* const bytes = k.data();
+  return DecodeFixed32(bytes);
+}
+
+// Encodes `k` as a 4-byte key using PutFixed32, with no padding.
+std::string EncodeKey32Bits(int k) {
+  std::string encoded;
+  PutFixed32(&encoded, k);
+  return encoded;
+}
+
+// Inverse of EncodeKey32Bits(): reads the integer back from a 4-byte key.
+int DecodeKey32Bits(const Slice& k) {
+  assert(k.size() == 4);
+  const char* const bytes = k.data();
+  return DecodeFixed32(bytes);
+}
+
+// Stores an integer directly in the pointer-sized cache value slot; nothing
+// is allocated.
+void* EncodeValue(uintptr_t v) {
+  void* const as_pointer = reinterpret_cast<void*>(v);
+  return as_pointer;
+}
+
+// Recovers the integer stored by EncodeValue(), narrowed to int.
+int DecodeValue(void* v) {
+  const uintptr_t raw = reinterpret_cast<uintptr_t>(v);
+  return static_cast<int>(raw);
+}
+
+// Deleter that deliberately does nothing, for entries whose value needs no
+// cleanup.
+void DumbDeleter(const Slice& /*key*/, void* /*value*/) {
+  // Intentionally empty.
+}
+
+// Deleter that re-enters the cache while it runs: `value` must point to the
+// Cache itself, and the deleter erases the key "foo" from it.
+void EraseDeleter1(const Slice& /*key*/, void* value) {
+  // void* -> object pointer is a static_cast; reinterpret_cast is not needed.
+  Cache* cache = static_cast<Cache*>(value);
+  cache->Erase("foo");
+}
+
+// Deleter that re-enters the cache while it runs: `value` must point to the
+// Cache itself, and the deleter erases the 16-byte key for 1234 from it.
+void EraseDeleter2(const Slice& /*key*/, void* value) {
+  // void* -> object pointer is a static_cast; reinterpret_cast is not needed.
+  Cache* cache = static_cast<Cache*>(value);
+  cache->Erase(EncodeKey16Bytes(1234));
+}
+
+// Test parameter values; the fixture compares GetParam() against these to
+// decide which Cache implementation to build.
+const std::string kLRU = "lru";
+const std::string kHyperClock = "hyper_clock";
+
+} // anonymous namespace
+
+// Parameterized fixture shared by the cache tests. The string parameter
+// (kLRU or kHyperClock) selects the Cache implementation. The fixture owns two
+// caches and records every key/value pair handed to Deleter().
+class CacheTest : public testing::TestWithParam<std::string> {
+ public:
+  // Deleter() is a static function, so it reaches the running fixture and its
+  // parameter through these statics, which the constructor sets.
+  static CacheTest* current_;
+  static std::string type_;
+
+  // Cache deleter that logs the decoded key and value on the current fixture.
+  static void Deleter(const Slice& key, void* v) {
+    const int decoded_key = (type_ == kHyperClock) ? DecodeKey16Bytes(key)
+                                                   : DecodeKey32Bits(key);
+    current_->deleted_keys_.push_back(decoded_key);
+    current_->deleted_values_.push_back(DecodeValue(v));
+  }
+
+  static const int kCacheSize = 1000;
+  static const int kNumShardBits = 4;
+
+  static const int kCacheSize2 = 100;
+  static const int kNumShardBits2 = 2;
+
+  std::vector<int> deleted_keys_;
+  std::vector<int> deleted_values_;
+  std::shared_ptr<Cache> cache_;
+  std::shared_ptr<Cache> cache2_;
+
+  // Only consulted by the single-argument NewCache() overload.
+  size_t estimated_value_size_ = 1;
+
+  CacheTest()
+      : cache_(NewCache(kCacheSize, kNumShardBits, false)),
+        cache2_(NewCache(kCacheSize2, kNumShardBits2, false)) {
+    current_ = this;
+    type_ = GetParam();
+  }
+
+  ~CacheTest() override = default;
+
+  // Builds a cache of the parameterized type with default sharding. Returns
+  // nullptr for an unrecognized parameter.
+  std::shared_ptr<Cache> NewCache(size_t capacity) {
+    const std::string& type = GetParam();
+    if (type == kLRU) {
+      return NewLRUCache(capacity);
+    } else if (type == kHyperClock) {
+      HyperClockCacheOptions opts(
+          capacity, estimated_value_size_ /*estimated_value_size*/);
+      return opts.MakeSharedCache();
+    }
+    return nullptr;
+  }
+
+  // Builds a cache of the parameterized type with explicit sharding, strict
+  // capacity and metadata charge settings. Returns nullptr for an
+  // unrecognized parameter.
+  std::shared_ptr<Cache> NewCache(
+      size_t capacity, int num_shard_bits, bool strict_capacity_limit,
+      CacheMetadataChargePolicy charge_policy = kDontChargeCacheMetadata) {
+    const std::string& type = GetParam();
+    if (type == kLRU) {
+      LRUCacheOptions co;
+      co.capacity = capacity;
+      co.num_shard_bits = num_shard_bits;
+      co.strict_capacity_limit = strict_capacity_limit;
+      co.high_pri_pool_ratio = 0;
+      co.metadata_charge_policy = charge_policy;
+      return NewLRUCache(co);
+    } else if (type == kHyperClock) {
+      HyperClockCacheOptions opts(capacity, 1 /*estimated_value_size*/,
+                                  num_shard_bits, strict_capacity_limit,
+                                  nullptr /*allocator*/, charge_policy);
+      return opts.MakeSharedCache();
+    }
+    return nullptr;
+  }
+
+  // Key encoding for test cases that use int keys. HyperClockCache requires
+  // 16-byte keys whereas LRUCache does not, so the encoding depends on the
+  // cache type.
+  std::string EncodeKey(int k) {
+    return (GetParam() == kHyperClock) ? EncodeKey16Bytes(k)
+                                       : EncodeKey32Bits(k);
+  }
+
+  int DecodeKey(const Slice& k) {
+    return (GetParam() == kHyperClock) ? DecodeKey16Bytes(k)
+                                       : DecodeKey32Bits(k);
+  }
+
+  // Returns the decoded value stored under `key`, or -1 on a miss. The handle
+  // is released before returning.
+  int Lookup(std::shared_ptr<Cache> cache, int key) {
+    Cache::Handle* const handle = cache->Lookup(EncodeKey(key));
+    if (handle == nullptr) {
+      return -1;
+    }
+    const int found = DecodeValue(cache->Value(handle));
+    cache->Release(handle);
+    return found;
+  }
+
+  // Inserts key -> value with CacheTest::Deleter and expects success.
+  void Insert(std::shared_ptr<Cache> cache, int key, int value,
+              int charge = 1) {
+    EXPECT_OK(cache->Insert(EncodeKey(key), EncodeValue(value), charge,
+                            &CacheTest::Deleter));
+  }
+
+  void Erase(std::shared_ptr<Cache> cache, int key) {
+    cache->Erase(EncodeKey(key));
+  }
+
+  // Shorthands operating on cache_.
+  int Lookup(int key) { return Lookup(cache_, key); }
+
+  void Insert(int key, int value, int charge = 1) {
+    Insert(cache_, key, value, charge);
+  }
+
+  void Erase(int key) { Erase(cache_, key); }
+
+  // Shorthands operating on cache2_.
+  int Lookup2(int key) { return Lookup(cache2_, key); }
+
+  void Insert2(int key, int value, int charge = 1) {
+    Insert(cache2_, key, value, charge);
+  }
+
+  void Erase2(int key) { Erase(cache2_, key); }
+};
+
+// Out-of-class definitions of the fixture's static members.
+CacheTest* CacheTest::current_;
+std::string CacheTest::type_;
+
+// Fixture alias for tests that are instantiated with kLRU only.
+class LRUCacheTest : public CacheTest {};
+
+// Checks GetUsage() accounting on two caches: one that does not charge
+// metadata (`cache`) and one that fully charges it (`precise_cache`).
+TEST_P(CacheTest, UsageTest) {
+ auto type = GetParam();
+
+ // cache is std::shared_ptr and will be automatically cleaned up.
+ const size_t kCapacity = 100000;
+ auto cache = NewCache(kCapacity, 8, false, kDontChargeCacheMetadata);
+ auto precise_cache = NewCache(kCapacity, 0, false, kFullChargeCacheMetadata);
+ ASSERT_EQ(0, cache->GetUsage());
+ // Usage reported by the empty metadata-charging cache; only asserted to be
+ // zero for non-HyperClock caches.
+ size_t baseline_meta_usage = precise_cache->GetUsage();
+ if (type != kHyperClock) {
+ ASSERT_EQ(0, baseline_meta_usage);
+ }
+
+ size_t usage = 0;
+ char value[10] = "abcdef";
+ // make sure everything will be cached
+ for (int i = 1; i < 100; ++i) {
+ std::string key;
+ if (type == kLRU) {
+ // LRU accepts variable-length keys, so key size grows with i.
+ key = std::string(i, 'a');
+ } else {
+ key = EncodeKey(i);
+ }
+ auto kv_size = key.size() + 5;
+ ASSERT_OK(cache->Insert(key, reinterpret_cast<void*>(value), kv_size,
+ DumbDeleter));
+ ASSERT_OK(precise_cache->Insert(key, reinterpret_cast<void*>(value),
+ kv_size, DumbDeleter));
+ usage += kv_size;
+ ASSERT_EQ(usage, cache->GetUsage());
+ if (type == kHyperClock) {
+ ASSERT_EQ(baseline_meta_usage + usage, precise_cache->GetUsage());
+ } else {
+ ASSERT_LT(usage, precise_cache->GetUsage());
+ }
+ }
+
+ // Nothing is pinned, so erasing unreferenced entries must bring both caches
+ // back to their starting usage.
+ cache->EraseUnRefEntries();
+ precise_cache->EraseUnRefEntries();
+ ASSERT_EQ(0, cache->GetUsage());
+ ASSERT_EQ(baseline_meta_usage, precise_cache->GetUsage());
+
+ // make sure the cache will be overloaded
+ for (size_t i = 1; i < kCapacity; ++i) {
+ std::string key;
+ if (type == kLRU) {
+ key = std::to_string(i);
+ } else {
+ key = EncodeKey(static_cast<int>(1000 + i));
+ }
+ ASSERT_OK(cache->Insert(key, reinterpret_cast<void*>(value), key.size() + 5,
+ DumbDeleter));
+ ASSERT_OK(precise_cache->Insert(key, reinterpret_cast<void*>(value),
+ key.size() + 5, DumbDeleter));
+ }
+
+ // the usage should be close to the capacity
+ ASSERT_GT(kCapacity, cache->GetUsage());
+ ASSERT_GT(kCapacity, precise_cache->GetUsage());
+ ASSERT_LT(kCapacity * 0.95, cache->GetUsage());
+ if (type != kHyperClock) {
+ ASSERT_LT(kCapacity * 0.95, precise_cache->GetUsage());
+ } else {
+ // estimated value size of 1 is weird for clock cache, because
+ // almost all of the capacity will be used for metadata, and due to only
+ // using power of 2 table sizes, we might hit strict occupancy limit
+ // before hitting capacity limit.
+ ASSERT_LT(kCapacity * 0.80, precise_cache->GetUsage());
+ }
+}
+
+// TODO: This test takes longer than expected on ClockCache. This is
+// because the value size estimate at construction is too sloppy.
+// Fix this.
+// Why is it so slow? The cache is constructed with an estimate of 1, but
+// then the charge is claimed to be 21. This will cause the hash table
+// to be extremely sparse, which in turn means clock needs to scan too
+// many slots to find victims.
+// Checks GetPinnedUsage() on a cache that does not charge metadata (`cache`)
+// and one that fully charges it (`precise_cache`) while entries are pinned,
+// unpinned, re-pinned and while the caches are overloaded.
+TEST_P(CacheTest, PinnedUsageTest) {
+  auto type = GetParam();
+
+  // cache is std::shared_ptr and will be automatically cleaned up.
+  const size_t kCapacity = 200000;
+  auto cache = NewCache(kCapacity, 8, false, kDontChargeCacheMetadata);
+  auto precise_cache = NewCache(kCapacity, 8, false, kFullChargeCacheMetadata);
+  size_t baseline_meta_usage = precise_cache->GetUsage();
+  if (type != kHyperClock) {
+    ASSERT_EQ(0, baseline_meta_usage);
+  }
+
+  size_t pinned_usage = 0;
+  char value[10] = "abcdef";
+
+  std::forward_list<Cache::Handle*> unreleased_handles;
+  std::forward_list<Cache::Handle*> unreleased_handles_in_precise_cache;
+
+  // Add entries. Unpin some of them after insertion. Then, pin some of them
+  // again. Check GetPinnedUsage().
+  for (int i = 1; i < 100; ++i) {
+    std::string key;
+    if (type == kLRU) {
+      key = std::string(i, 'a');
+    } else {
+      key = EncodeKey(i);
+    }
+    auto kv_size = key.size() + 5;
+    Cache::Handle* handle = nullptr;
+    Cache::Handle* handle_in_precise_cache = nullptr;
+    ASSERT_OK(cache->Insert(key, reinterpret_cast<void*>(value), kv_size,
+                            DumbDeleter, &handle));
+    // Use gtest assertions rather than assert() so the checks also run in
+    // builds compiled with NDEBUG.
+    ASSERT_NE(nullptr, handle);
+    ASSERT_OK(precise_cache->Insert(key, reinterpret_cast<void*>(value),
+                                    kv_size, DumbDeleter,
+                                    &handle_in_precise_cache));
+    ASSERT_NE(nullptr, handle_in_precise_cache);
+    pinned_usage += kv_size;
+    ASSERT_EQ(pinned_usage, cache->GetPinnedUsage());
+    ASSERT_LT(pinned_usage, precise_cache->GetPinnedUsage());
+    if (i % 2 == 0) {
+      cache->Release(handle);
+      precise_cache->Release(handle_in_precise_cache);
+      pinned_usage -= kv_size;
+      ASSERT_EQ(pinned_usage, cache->GetPinnedUsage());
+      ASSERT_LT(pinned_usage, precise_cache->GetPinnedUsage());
+    } else {
+      unreleased_handles.push_front(handle);
+      unreleased_handles_in_precise_cache.push_front(handle_in_precise_cache);
+    }
+    if (i % 3 == 0) {
+      // A null handle here would later be passed to Release(), so check both
+      // lookups before storing them.
+      Cache::Handle* relooked = cache->Lookup(key);
+      ASSERT_NE(nullptr, relooked);
+      unreleased_handles.push_front(relooked);
+      Cache::Handle* relooked_in_precise_cache = precise_cache->Lookup(key);
+      ASSERT_NE(nullptr, relooked_in_precise_cache);
+      unreleased_handles_in_precise_cache.push_front(
+          relooked_in_precise_cache);
+      // If i % 2 == 0, then the entry was unpinned before Lookup, so pinned
+      // usage increased
+      if (i % 2 == 0) {
+        pinned_usage += kv_size;
+      }
+      ASSERT_EQ(pinned_usage, cache->GetPinnedUsage());
+      ASSERT_LT(pinned_usage, precise_cache->GetPinnedUsage());
+    }
+  }
+  auto precise_cache_pinned_usage = precise_cache->GetPinnedUsage();
+  ASSERT_LT(pinned_usage, precise_cache_pinned_usage);
+
+  // check that overloading the cache does not change the pinned usage
+  for (size_t i = 1; i < 2 * kCapacity; ++i) {
+    std::string key;
+    if (type == kLRU) {
+      key = std::to_string(i);
+    } else {
+      key = EncodeKey(static_cast<int>(1000 + i));
+    }
+    ASSERT_OK(cache->Insert(key, reinterpret_cast<void*>(value), key.size() + 5,
+                            DumbDeleter));
+    ASSERT_OK(precise_cache->Insert(key, reinterpret_cast<void*>(value),
+                                    key.size() + 5, DumbDeleter));
+  }
+  ASSERT_EQ(pinned_usage, cache->GetPinnedUsage());
+  ASSERT_EQ(precise_cache_pinned_usage, precise_cache->GetPinnedUsage());
+
+  cache->EraseUnRefEntries();
+  precise_cache->EraseUnRefEntries();
+  ASSERT_EQ(pinned_usage, cache->GetPinnedUsage());
+  ASSERT_EQ(precise_cache_pinned_usage, precise_cache->GetPinnedUsage());
+
+  // release handles for pinned entries to prevent memory leaks
+  for (auto handle : unreleased_handles) {
+    cache->Release(handle);
+  }
+  for (auto handle : unreleased_handles_in_precise_cache) {
+    precise_cache->Release(handle);
+  }
+  ASSERT_EQ(0, cache->GetPinnedUsage());
+  ASSERT_EQ(0, precise_cache->GetPinnedUsage());
+  cache->EraseUnRefEntries();
+  precise_cache->EraseUnRefEntries();
+  ASSERT_EQ(0, cache->GetUsage());
+  ASSERT_EQ(baseline_meta_usage, precise_cache->GetUsage());
+}
+
+// Basic lookups: misses on absent keys, hits on inserted keys, and the
+// implementation-specific outcome of inserting an already-present key.
+TEST_P(CacheTest, HitAndMiss) {
+  const bool is_hyper_clock = (GetParam() == kHyperClock);
+
+  ASSERT_EQ(-1, Lookup(100));
+
+  Insert(100, 101);
+  ASSERT_EQ(101, Lookup(100));
+  ASSERT_EQ(-1, Lookup(200));
+  ASSERT_EQ(-1, Lookup(300));
+
+  Insert(200, 201);
+  ASSERT_EQ(101, Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(-1, Lookup(300));
+
+  // Re-insert key 100. ClockCache usually doesn't overwrite on Insert, so it
+  // keeps 101 and deletes the new value 102; otherwise 102 replaces 101.
+  Insert(100, 102);
+  const int expected_cached_value = is_hyper_clock ? 101 : 102;
+  const int expected_deleted_value = is_hyper_clock ? 102 : 101;
+  ASSERT_EQ(expected_cached_value, Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(-1, Lookup(300));
+
+  ASSERT_EQ(1U, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_[0]);
+  ASSERT_EQ(expected_deleted_value, deleted_values_[0]);
+}
+
+// A second Insert under the same key must replace the first value. Bypassed
+// for HyperClockCache.
+TEST_P(CacheTest, InsertSameKey) {
+  const bool overwrite_guaranteed = (GetParam() != kHyperClock);
+  if (!overwrite_guaranteed) {
+    ROCKSDB_GTEST_BYPASS(
+        "ClockCache doesn't guarantee Insert overwrite same key.");
+    return;
+  }
+  Insert(1, 1);
+  Insert(1, 2);
+  const int current_value = Lookup(1);
+  ASSERT_EQ(2, current_value);
+}
+
+// Erase removes exactly the named entry and runs its deleter once; erasing an
+// absent key is a no-op.
+TEST_P(CacheTest, Erase) {
+  // Erasing from an empty cache must not invoke any deleter.
+  Erase(200);
+  ASSERT_TRUE(deleted_keys_.empty());
+
+  Insert(100, 101);
+  Insert(200, 201);
+  Erase(100);
+  ASSERT_EQ(-1, Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(1U, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_.front());
+  ASSERT_EQ(101, deleted_values_.front());
+
+  // Erasing the same key again changes nothing.
+  Erase(100);
+  ASSERT_EQ(-1, Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(1U, deleted_keys_.size());
+}
+
+// Entries with outstanding handles stay alive (and are counted in usage)
+// after being overwritten or erased; their deleter runs only when the last
+// handle is released. Bypassed for HyperClockCache.
+TEST_P(CacheTest, EntriesArePinned) {
+ if (GetParam() == kHyperClock) {
+ ROCKSDB_GTEST_BYPASS(
+ "ClockCache doesn't guarantee Insert overwrite same key.");
+ return;
+ }
+ Insert(100, 101);
+ Cache::Handle* h1 = cache_->Lookup(EncodeKey(100));
+ ASSERT_EQ(101, DecodeValue(cache_->Value(h1)));
+ ASSERT_EQ(1U, cache_->GetUsage());
+
+ // Overwrite while h1 is held: the old entry is not deleted yet, so usage
+ // counts both entries.
+ Insert(100, 102);
+ Cache::Handle* h2 = cache_->Lookup(EncodeKey(100));
+ ASSERT_EQ(102, DecodeValue(cache_->Value(h2)));
+ ASSERT_EQ(0U, deleted_keys_.size());
+ ASSERT_EQ(2U, cache_->GetUsage());
+
+ // Releasing the last handle to the overwritten entry triggers its deleter.
+ cache_->Release(h1);
+ ASSERT_EQ(1U, deleted_keys_.size());
+ ASSERT_EQ(100, deleted_keys_[0]);
+ ASSERT_EQ(101, deleted_values_[0]);
+ ASSERT_EQ(1U, cache_->GetUsage());
+
+ // Erase hides the entry from Lookup, but h2 still pins it.
+ Erase(100);
+ ASSERT_EQ(-1, Lookup(100));
+ ASSERT_EQ(1U, deleted_keys_.size());
+ ASSERT_EQ(1U, cache_->GetUsage());
+
+ cache_->Release(h2);
+ ASSERT_EQ(2U, deleted_keys_.size());
+ ASSERT_EQ(100, deleted_keys_[1]);
+ ASSERT_EQ(102, deleted_values_[1]);
+ ASSERT_EQ(0U, cache_->GetUsage());
+}
+
+// An entry that keeps being looked up survives heavy insertion churn, while
+// an untouched entry is evicted.
+TEST_P(CacheTest, EvictionPolicy) {
+  Insert(100, 101);
+  Insert(200, 201);
+
+  // Frequently used entry must be kept around
+  const int kChurn = 2 * kCacheSize;
+  for (int n = 0; n < kChurn; ++n) {
+    Insert(1000 + n, 2000 + n);
+    ASSERT_EQ(101, Lookup(100));
+  }
+
+  ASSERT_EQ(101, Lookup(100));
+  ASSERT_EQ(-1, Lookup(200));
+}
+
+// An entry holding two external references (one from Lookup, one from Ref)
+// survives insertion churn until both references have been released.
+TEST_P(CacheTest, ExternalRefPinsEntries) {
+ Insert(100, 101);
+ Cache::Handle* h = cache_->Lookup(EncodeKey(100));
+ ASSERT_TRUE(cache_->Ref(h));
+ ASSERT_EQ(101, DecodeValue(cache_->Value(h)));
+ ASSERT_EQ(1U, cache_->GetUsage());
+
+ // Round 0: two refs held. Round 1: one ref held. Round 2: no refs held.
+ for (int i = 0; i < 3; ++i) {
+ if (i > 0) {
+ // First release (i == 1) corresponds to Ref(), second release (i == 2)
+ // corresponds to Lookup(). Then, since all external refs are released,
+ // the below insertions should push out the cache entry.
+ cache_->Release(h);
+ }
+ // double cache size because the usage bit in block cache prevents 100 from
+ // being evicted in the first kCacheSize iterations
+ for (int j = 0; j < 2 * kCacheSize + 100; j++) {
+ Insert(1000 + j, 2000 + j);
+ }
+ // Clock cache is even more stateful and needs more churn to evict
+ if (GetParam() == kHyperClock) {
+ for (int j = 0; j < kCacheSize; j++) {
+ Insert(11000 + j, 11000 + j);
+ }
+ }
+ // While at least one ref is still held, the entry must remain visible.
+ if (i < 2) {
+ ASSERT_EQ(101, Lookup(100));
+ }
+ }
+ ASSERT_EQ(-1, Lookup(100));
+}
+
+// Entries with an outstanding handle survive massive churn; entries inserted
+// before and after them without a handle are evicted.
+TEST_P(CacheTest, EvictionPolicyRef) {
+  const int kGroupSize = 4;
+  // Inserts keys base..base+3 with values 101..104.
+  const auto insert_group = [&](int base) {
+    for (int i = 0; i < kGroupSize; ++i) {
+      Insert(base + i, 101 + i);
+    }
+  };
+
+  insert_group(100);
+  insert_group(200);
+
+  // Pin the 200-group by holding a handle on each of its entries.
+  std::vector<Cache::Handle*> pinned;
+  for (int i = 0; i < kGroupSize; ++i) {
+    pinned.push_back(cache_->Lookup(EncodeKey(200 + i)));
+  }
+
+  insert_group(300);
+
+  // Insert entries much more than cache capacity.
+  for (int i = 0; i < 100 * kCacheSize; i++) {
+    Insert(1000 + i, 2000 + i);
+  }
+
+  // Check whether the entries inserted in the beginning
+  // are evicted. Ones without extra ref are evicted and
+  // those with are not.
+  for (int i = 0; i < kGroupSize; ++i) {
+    ASSERT_EQ(-1, Lookup(100 + i));
+  }
+  for (int i = 0; i < kGroupSize; ++i) {
+    ASSERT_EQ(-1, Lookup(300 + i));
+  }
+  for (int i = 0; i < kGroupSize; ++i) {
+    ASSERT_EQ(101 + i, Lookup(200 + i));
+  }
+
+  // Cleaning up all the handles
+  for (Cache::Handle* handle : pinned) {
+    cache_->Release(handle);
+  }
+}
+
+// Inserting into an empty cache an item whose charge exceeds the capacity
+// must still succeed when strict capacity limit is off.
+TEST_P(CacheTest, EvictEmptyCache) {
+  // Insert an item larger than the capacity to trigger eviction on an empty
+  // cache.
+  auto cache = NewCache(1, 0, false);
+  const std::string key =
+      (GetParam() == kLRU) ? std::string("foo") : EncodeKey(1000);
+  ASSERT_OK(cache->Insert(key, nullptr, 10, DumbDeleter));
+}
+
+// A deleter that itself erases another key re-enters the cache; both keys
+// must be gone afterwards.
+TEST_P(CacheTest, EraseFromDeleter) {
+  const bool is_lru = (GetParam() == kLRU);
+
+  // Have deleter which will erase item from cache, which will re-enter
+  // the cache at that point.
+  std::shared_ptr<Cache> cache = NewCache(10, 0, false);
+  // The key erased by the deleter must match `foo` for each cache type.
+  const std::string foo = is_lru ? std::string("foo") : EncodeKey(1234);
+  const std::string bar = is_lru ? std::string("bar") : EncodeKey(5678);
+  const Cache::DeleterFn erase_deleter =
+      is_lru ? &EraseDeleter1 : &EraseDeleter2;
+
+  ASSERT_OK(cache->Insert(foo, nullptr, 1, DumbDeleter));
+  ASSERT_OK(cache->Insert(bar, cache.get(), 1, erase_deleter));
+
+  cache->Erase(bar);
+  ASSERT_EQ(nullptr, cache->Lookup(foo));
+  ASSERT_EQ(nullptr, cache->Lookup(bar));
+}
+
+// After Erase, an entry stays invisible to Lookup even while handles taken
+// before the erase are still outstanding.
+TEST_P(CacheTest, ErasedHandleState) {
+  // insert a key and get two handles
+  Insert(100, 1000);
+  const std::string key = EncodeKey(100);
+  Cache::Handle* first = cache_->Lookup(key);
+  Cache::Handle* second = cache_->Lookup(key);
+  ASSERT_EQ(first, second);
+  ASSERT_EQ(DecodeValue(cache_->Value(first)), 1000);
+  ASSERT_EQ(DecodeValue(cache_->Value(second)), 1000);
+
+  // delete the key from the cache; it can no longer be found
+  Erase(100);
+  ASSERT_EQ(-1, Lookup(100));
+
+  // releasing one handle does not bring the entry back
+  cache_->Release(first);
+  ASSERT_EQ(-1, Lookup(100));
+
+  cache_->Release(second);
+}
+
+// Add a bunch of light and heavy entries and then count the combined
+// size of items still in the cache, which must be approximately the
+// same as the total capacity.
+TEST_P(CacheTest, HeavyEntries) {
+  const int kLight = 1;
+  const int kHeavy = 10;
+  // Odd indexes are light, even indexes are heavy.
+  const auto weight_of = [&](int idx) -> int {
+    return (idx & 1) ? kLight : kHeavy;
+  };
+
+  int index = 0;
+  for (int added = 0; added < 2 * kCacheSize; ++index) {
+    const int weight = weight_of(index);
+    Insert(index, 1000 + index, weight);
+    added += weight;
+  }
+
+  int cached_weight = 0;
+  for (int i = 0; i < index; i++) {
+    const int r = Lookup(i);
+    if (r < 0) {
+      continue;  // evicted
+    }
+    cached_weight += weight_of(i);
+    ASSERT_EQ(1000 + i, r);
+  }
+  ASSERT_LE(cached_weight, kCacheSize + kCacheSize / 10);
+}
+
+// Two consecutive NewId() calls must return different ids.
+TEST_P(CacheTest, NewId) {
+  const uint64_t first_id = cache_->NewId();
+  const uint64_t second_id = cache_->NewId();
+  ASSERT_NE(first_id, second_id);
+}
+
+// Small payload type for tests that insert heap-allocated values (paired with
+// the `deleter` function, which frees them).
+class Value {
+ public:
+  explicit Value(int v) : v_{v} {}
+
+  int v_;
+};
+
+namespace {
+// Cache deleter for entries whose value is a heap-allocated Value.
+void deleter(const Slice& /*key*/, void* value) {
+  Value* const owned = static_cast<Value*>(value);
+  delete owned;
+}
+}  // namespace
+
+// Release(handle, /*erase_if_last_ref=*/true) on the only reference must
+// erase the entry and run its deleter.
+TEST_P(CacheTest, ReleaseAndErase) {
+  std::shared_ptr<Cache> cache = NewCache(5, 0, false);
+  Cache::Handle* handle = nullptr;
+  // ASSERT_OK (used elsewhere in this file) reports the status on failure.
+  ASSERT_OK(cache->Insert(EncodeKey(100), EncodeValue(100), 1,
+                          &CacheTest::Deleter, &handle));
+  ASSERT_NE(nullptr, handle);
+  ASSERT_EQ(5U, cache->GetCapacity());
+  ASSERT_EQ(1U, cache->GetUsage());
+  ASSERT_EQ(0U, deleted_keys_.size());
+  auto erased = cache->Release(handle, true);
+  ASSERT_TRUE(erased);
+  // This tests that deleter has been called
+  ASSERT_EQ(1U, deleted_keys_.size());
+}
+
+// A plain Release() on the only reference must leave the entry in the cache
+// when there is free capacity.
+TEST_P(CacheTest, ReleaseWithoutErase) {
+  std::shared_ptr<Cache> cache = NewCache(5, 0, false);
+  Cache::Handle* handle = nullptr;
+  // ASSERT_OK (used elsewhere in this file) reports the status on failure.
+  ASSERT_OK(cache->Insert(EncodeKey(100), EncodeValue(100), 1,
+                          &CacheTest::Deleter, &handle));
+  ASSERT_NE(nullptr, handle);
+  ASSERT_EQ(5U, cache->GetCapacity());
+  ASSERT_EQ(1U, cache->GetUsage());
+  ASSERT_EQ(0U, deleted_keys_.size());
+  auto erased = cache->Release(handle);
+  ASSERT_FALSE(erased);
+  // This tests that deleter is not called. When cache has free capacity it is
+  // not expected to immediately erase the released items.
+  ASSERT_EQ(0U, deleted_keys_.size());
+}
+
+// Checks SetCapacity(): growing keeps existing entries; shrinking evicts
+// unpinned entries down to the new capacity. Bypassed for HyperClockCache.
+TEST_P(CacheTest, SetCapacity) {
+ auto type = GetParam();
+ if (type == kHyperClock) {
+ ROCKSDB_GTEST_BYPASS(
+ "FastLRUCache and HyperClockCache don't support arbitrary capacity "
+ "adjustments.");
+ return;
+ }
+ // test1: increase capacity
+ // lets create a cache with capacity 5,
+ // then, insert 5 elements, then increase capacity
+ // to 10, returned capacity should be 10, usage=5
+ std::shared_ptr<Cache> cache = NewCache(5, 0, false);
+ std::vector<Cache::Handle*> handles(10);
+ // Insert 5 entries, but not releasing.
+ for (int i = 0; i < 5; i++) {
+ std::string key = EncodeKey(i + 1);
+ Status s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
+ ASSERT_TRUE(s.ok());
+ }
+ ASSERT_EQ(5U, cache->GetCapacity());
+ ASSERT_EQ(5U, cache->GetUsage());
+ cache->SetCapacity(10);
+ ASSERT_EQ(10U, cache->GetCapacity());
+ ASSERT_EQ(5U, cache->GetUsage());
+
+ // test2: decrease capacity
+ // insert 5 more elements to cache, then release 5,
+ // then decrease capacity to 7, final capacity should be 7
+ // and usage should be 7
+ for (int i = 5; i < 10; i++) {
+ std::string key = EncodeKey(i + 1);
+ Status s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
+ ASSERT_TRUE(s.ok());
+ }
+ ASSERT_EQ(10U, cache->GetCapacity());
+ ASSERT_EQ(10U, cache->GetUsage());
+ // Unpin the first five entries; they remain cached because the cache is
+ // not over capacity.
+ for (int i = 0; i < 5; i++) {
+ cache->Release(handles[i]);
+ }
+ ASSERT_EQ(10U, cache->GetCapacity());
+ ASSERT_EQ(10U, cache->GetUsage());
+ cache->SetCapacity(7);
+ ASSERT_EQ(7, cache->GetCapacity());
+ ASSERT_EQ(7, cache->GetUsage());
+
+ // release remaining 5 to keep valgrind happy
+ for (int i = 5; i < 10; i++) {
+ cache->Release(handles[i]);
+ }
+
+ // Make sure this doesn't crash or upset ASAN/valgrind
+ cache->DisownData();
+}
+
+// Checks strict_capacity_limit, both toggled at runtime and set at
+// construction: with all entries pinned, an Insert that requests a handle
+// must fail with MemoryLimit and null the handle.
+TEST_P(LRUCacheTest, SetStrictCapacityLimit) {
+ // test1: set the flag to false. Insert more keys than capacity. See if they
+ // all go through.
+ std::shared_ptr<Cache> cache = NewCache(5, 0, false);
+ std::vector<Cache::Handle*> handles(10);
+ Status s;
+ for (int i = 0; i < 10; i++) {
+ std::string key = EncodeKey(i + 1);
+ s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
+ ASSERT_OK(s);
+ ASSERT_NE(nullptr, handles[i]);
+ }
+ ASSERT_EQ(10, cache->GetUsage());
+
+ // test2: set the flag to true. Insert and check if it fails.
+ std::string extra_key = EncodeKey(100);
+ Value* extra_value = new Value(0);
+ cache->SetStrictCapacityLimit(true);
+ // Left uninitialized on purpose: the assertion below then shows that the
+ // failed Insert itself set the handle to nullptr.
+ Cache::Handle* handle;
+ s = cache->Insert(extra_key, extra_value, 1, &deleter, &handle);
+ ASSERT_TRUE(s.IsMemoryLimit());
+ ASSERT_EQ(nullptr, handle);
+ ASSERT_EQ(10, cache->GetUsage());
+
+ for (int i = 0; i < 10; i++) {
+ cache->Release(handles[i]);
+ }
+
+ // test3: init with flag being true.
+ std::shared_ptr<Cache> cache2 = NewCache(5, 0, true);
+ for (int i = 0; i < 5; i++) {
+ std::string key = EncodeKey(i + 1);
+ s = cache2->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
+ ASSERT_OK(s);
+ ASSERT_NE(nullptr, handles[i]);
+ }
+ s = cache2->Insert(extra_key, extra_value, 1, &deleter, &handle);
+ ASSERT_TRUE(s.IsMemoryLimit());
+ ASSERT_EQ(nullptr, handle);
+ // test insert without handle
+ s = cache2->Insert(extra_key, extra_value, 1, &deleter);
+ // As if the key had been inserted into the cache but got evicted
+ // immediately.
+ ASSERT_OK(s);
+ ASSERT_EQ(5, cache2->GetUsage());
+ ASSERT_EQ(nullptr, cache2->Lookup(extra_key));
+
+ for (int i = 0; i < 5; i++) {
+ cache2->Release(handles[i]);
+ }
+}
+
+// With strict capacity limit off, pinned entries may push usage above
+// capacity; once they are released, usage must come back to (or below)
+// capacity. When that happens differs per implementation.
+TEST_P(CacheTest, OverCapacity) {
+ size_t n = 10;
+
+ // a cache with capacity n and one shard only
+ std::shared_ptr<Cache> cache = NewCache(n, 0, false);
+
+ std::vector<Cache::Handle*> handles(n + 1);
+
+ // Insert n+1 entries, but not releasing.
+ for (int i = 0; i < static_cast<int>(n + 1); i++) {
+ std::string key = EncodeKey(i + 1);
+ Status s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
+ ASSERT_TRUE(s.ok());
+ }
+
+ // Guess what's in the cache now?
+ // All n+1 entries: every one of them is still pinned.
+ for (int i = 0; i < static_cast<int>(n + 1); i++) {
+ std::string key = EncodeKey(i + 1);
+ auto h = cache->Lookup(key);
+ ASSERT_TRUE(h != nullptr);
+ if (h) cache->Release(h);
+ }
+
+ // the cache is over capacity since nothing could be evicted
+ ASSERT_EQ(n + 1U, cache->GetUsage());
+ for (int i = 0; i < static_cast<int>(n + 1); i++) {
+ cache->Release(handles[i]);
+ }
+
+ if (GetParam() == kHyperClock) {
+ // Make sure eviction is triggered.
+ ASSERT_OK(cache->Insert(EncodeKey(-1), nullptr, 1, &deleter, &handles[0]));
+
+ // cache is under capacity now since elements were released
+ ASSERT_GE(n, cache->GetUsage());
+
+ // clean up
+ cache->Release(handles[0]);
+ } else {
+ // LRUCache checks for over-capacity in Release.
+
+ // cache is exactly at capacity now with minimal eviction
+ ASSERT_EQ(n, cache->GetUsage());
+
+ // element 0 is evicted and the rest is there
+ // This is consistent with the LRU policy since the element 0
+ // was released first
+ for (int i = 0; i < static_cast<int>(n + 1); i++) {
+ std::string key = EncodeKey(i + 1);
+ auto h = cache->Lookup(key);
+ if (h) {
+ ASSERT_NE(static_cast<size_t>(i), 0U);
+ cache->Release(h);
+ } else {
+ ASSERT_EQ(static_cast<size_t>(i), 0U);
+ }
+ }
+ }
+}
+
+namespace {
+// Entries seen by legacy_callback(), as (decoded value, charge) pairs.
+std::vector<std::pair<int, int>> legacy_callback_state;
+
+// Callback passed to Cache::ApplyToAllCacheEntries(); records each entry.
+void legacy_callback(void* value, size_t charge) {
+  legacy_callback_state.push_back(
+      {DecodeValue(value), static_cast<int>(charge)});
+}
+}  // namespace
+
+// The legacy ApplyToAllCacheEntries() must visit every inserted entry exactly
+// once, reporting its value and charge.
+TEST_P(CacheTest, ApplyToAllCacheEntriesTest) {
+  std::vector<std::pair<int, int>> expected;
+  legacy_callback_state.clear();
+
+  for (int key = 0; key < 10; ++key) {
+    const int value = key * 2;
+    const int charge = key + 1;
+    Insert(key, value, charge);
+    expected.emplace_back(value, charge);
+  }
+  cache_->ApplyToAllCacheEntries(legacy_callback, true);
+
+  // Visit order is unspecified, so compare the sorted sequences.
+  std::sort(expected.begin(), expected.end());
+  std::sort(legacy_callback_state.begin(), legacy_callback_state.end());
+  ASSERT_EQ(expected.size(), legacy_callback_state.size());
+  for (size_t idx = 0; idx < expected.size(); ++idx) {
+    EXPECT_EQ(expected[idx], legacy_callback_state[idx]);
+  }
+}
+
+// ApplyToAllEntries() must visit every inserted entry exactly once, reporting
+// its key, value, charge and deleter.
+TEST_P(CacheTest, ApplyToAllEntriesTest) {
+  std::vector<std::string> callback_state;
+  const auto callback = [&](const Slice& key, void* value, size_t charge,
+                            Cache::DeleterFn deleter) {
+    callback_state.push_back(std::to_string(DecodeKey(key)) + "," +
+                             std::to_string(DecodeValue(value)) + "," +
+                             std::to_string(charge));
+    // A gtest expectation (not assert()) so the deleter is also verified in
+    // NDEBUG builds and the parameter is never unused.
+    EXPECT_EQ(&CacheTest::Deleter, deleter);
+  };
+
+  std::vector<std::string> inserted;
+
+  for (int i = 0; i < 10; ++i) {
+    Insert(i, i * 2, i + 1);
+    inserted.push_back(std::to_string(i) + "," + std::to_string(i * 2) + "," +
+                       std::to_string(i + 1));
+  }
+  cache_->ApplyToAllEntries(callback, /*opts*/ {});
+
+  // Visit order is unspecified, so compare the sorted sequences.
+  std::sort(inserted.begin(), inserted.end());
+  std::sort(callback_state.begin(), callback_state.end());
+  ASSERT_EQ(inserted.size(), callback_state.size());
+  for (int i = 0; i < static_cast<int>(inserted.size()); ++i) {
+    EXPECT_EQ(inserted[i], callback_state[i]);
+  }
+}
+
+// Runs ApplyToAllEntries() on one thread while the main thread inserts more
+// entries, and checks that the pre-existing "special" entries are each
+// counted exactly once.
+TEST_P(CacheTest, ApplyToAllEntriesDuringResize) {
+ // This is a mini-stress test of ApplyToAllEntries, to ensure
+ // items in the cache that are neither added nor removed
+ // during ApplyToAllEntries are counted exactly once.
+
+ // Insert some entries that we expect to be seen exactly once
+ // during iteration.
+ constexpr int kSpecialCharge = 2;
+ constexpr int kNotSpecialCharge = 1;
+ constexpr int kSpecialCount = 100;
+ size_t expected_usage = 0;
+ for (int i = 0; i < kSpecialCount; ++i) {
+ Insert(i, i * 2, kSpecialCharge);
+ expected_usage += kSpecialCharge;
+ }
+
+ // For callback
+ // Special entries are recognized purely by their charge. The counter is
+ // written only by apply_thread and read only after join().
+ int special_count = 0;
+ const auto callback = [&](const Slice&, void*, size_t charge,
+ Cache::DeleterFn) {
+ if (charge == static_cast<size_t>(kSpecialCharge)) {
+ ++special_count;
+ }
+ };
+
+ // Start counting
+ std::thread apply_thread([&]() {
+ // Use small average_entries_per_lock to make the problem difficult
+ Cache::ApplyToAllEntriesOptions opts;
+ opts.average_entries_per_lock = 2;
+ cache_->ApplyToAllEntries(callback, opts);
+ });
+
+ // In parallel, add more entries, enough to cause resize but not enough
+ // to cause ejections. (Note: if any cache shard is over capacity, there
+ // will be ejections)
+ for (int i = kSpecialCount * 1; i < kSpecialCount * 5; ++i) {
+ Insert(i, i * 2, kNotSpecialCharge);
+ expected_usage += kNotSpecialCharge;
+ }
+
+ apply_thread.join();
+ // verify no evictions
+ ASSERT_EQ(cache_->GetUsage(), expected_usage);
+ // verify everything seen in ApplyToAllEntries
+ ASSERT_EQ(special_count, kSpecialCount);
+}
+
+// Checks the number of shard bits chosen by default for several capacities,
+// relative to each implementation's minimum shard size.
+TEST_P(CacheTest, DefaultShardBits) {
+  // Prevent excessive allocation (to save time & space)
+  estimated_value_size_ = 100000;
+  // Implementations use different minimum shard sizes
+  size_t min_shard_size =
+      (GetParam() == kHyperClock ? 32U * 1024U : 512U) * 1024U;
+
+  // Each dynamic_cast is checked before use so that an unexpected cache type
+  // fails the test instead of dereferencing a null pointer.
+  std::shared_ptr<Cache> cache = NewCache(32U * min_shard_size);
+  ShardedCacheBase* sc = dynamic_cast<ShardedCacheBase*>(cache.get());
+  ASSERT_NE(nullptr, sc);
+  ASSERT_EQ(5, sc->GetNumShardBits());
+
+  cache = NewCache(min_shard_size / 1000U * 999U);
+  sc = dynamic_cast<ShardedCacheBase*>(cache.get());
+  ASSERT_NE(nullptr, sc);
+  ASSERT_EQ(0, sc->GetNumShardBits());
+
+  cache = NewCache(3U * 1024U * 1024U * 1024U);
+  sc = dynamic_cast<ShardedCacheBase*>(cache.get());
+  ASSERT_NE(nullptr, sc);
+  // current maximum of 6
+  ASSERT_EQ(6, sc->GetNumShardBits());
+
+  // Only attempted where size_t is wider than 32 bits.
+  if constexpr (sizeof(size_t) > 4) {
+    cache = NewCache(128U * min_shard_size);
+    sc = dynamic_cast<ShardedCacheBase*>(cache.get());
+    ASSERT_NE(nullptr, sc);
+    // current maximum of 6
+    ASSERT_EQ(6, sc->GetNumShardBits());
+  }
+}
+
+// GetCharge() and GetDeleter() must report what was passed to Insert().
+TEST_P(CacheTest, GetChargeAndDeleter) {
+  Insert(1, 2);
+  Cache::Handle* handle = cache_->Lookup(EncodeKey(1));
+  const int stored_value = DecodeValue(cache_->Value(handle));
+  ASSERT_EQ(2, stored_value);
+  ASSERT_EQ(1, cache_->GetCharge(handle));
+  ASSERT_EQ(&CacheTest::Deleter, cache_->GetDeleter(handle));
+  cache_->Release(handle);
+}
+
+// CacheTest cases run against both implementations; LRUCacheTest cases run
+// against LRU only.
+INSTANTIATE_TEST_CASE_P(CacheTestInstance, CacheTest,
+ testing::Values(kLRU, kHyperClock));
+INSTANTIATE_TEST_CASE_P(CacheTestInstance, LRUCacheTest, testing::Values(kLRU));
+
+} // namespace ROCKSDB_NAMESPACE
+
+// Test binary entry point: sets up RocksDB's stack trace handler, lets
+// GoogleTest consume its command-line flags, then runs every registered test.
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  const int exit_code = RUN_ALL_TESTS();
+  return exit_code;
+}
diff --git a/src/rocksdb/cache/charged_cache.cc b/src/rocksdb/cache/charged_cache.cc
new file mode 100644
index 000000000..a9ff969b8
--- /dev/null
+++ b/src/rocksdb/cache/charged_cache.cc
@@ -0,0 +1,117 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "cache/charged_cache.h"
+
+#include "cache/cache_reservation_manager.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+ChargedCache::ChargedCache(std::shared_ptr<Cache> cache,
+ std::shared_ptr<Cache> block_cache)
+ : cache_(cache),  // wrapped cache that every call is forwarded to
+ cache_res_mgr_(std::make_shared<ConcurrentCacheReservationManager>(  // wrapper around the manager built below
+ std::make_shared<
+ CacheReservationManagerImpl<CacheEntryRole::kBlobCache>>(  // reservations use the kBlobCache role
+ block_cache))) {}  // reservation manager is constructed on `block_cache`
+
+Status ChargedCache::Insert(const Slice& key, void* value, size_t charge,
+ DeleterFn deleter, Handle** handle,
+ Priority priority) {
+ // Forward the insert to the wrapped cache. A failed insert is returned
+ // as-is and leaves the reservation untouched.
+ Status status = cache_->Insert(key, value, charge, deleter, handle, priority);
+ if (!status.ok()) {
+ return status;
+ }
+ // A successful insert may have evicted other entries (full cache), so the
+ // reservation is resynchronized with the cache's total usage instead of
+ // simply adding `charge`. The result of the update is deliberately ignored.
+ assert(cache_res_mgr_);
+ cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage())
+ .PermitUncheckedError();
+ return status;
+}
+
+Status ChargedCache::Insert(const Slice& key, void* value,
+ const CacheItemHelper* helper, size_t charge,
+ Handle** handle, Priority priority) {
+ // CacheItemHelper flavor of Insert: forward to the wrapped cache and, on
+ // failure, return without touching the reservation.
+ Status status = cache_->Insert(key, value, helper, charge, handle, priority);
+ if (!status.ok()) {
+ return status;
+ }
+ // A successful insert may have evicted other entries (full cache), so the
+ // reservation is resynchronized with the cache's total usage instead of
+ // simply adding `charge`. The result of the update is deliberately ignored.
+ assert(cache_res_mgr_);
+ cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage())
+ .PermitUncheckedError();
+ return status;
+}
+
+Cache::Handle* ChargedCache::Lookup(const Slice& key, Statistics* stats) {
+ // Plain lookup: forwarded unchanged, with no reservation update.
+ Cache::Handle* found = cache_->Lookup(key, stats);
+ return found;
+}
+
+Cache::Handle* ChargedCache::Lookup(const Slice& key,
+ const CacheItemHelper* helper,
+ const CreateCallback& create_cb,
+ Priority priority, bool wait,
+ Statistics* stats) {
+ // Forward the lookup, then unconditionally resynchronize the reservation:
+ // this flavor of Lookup may promote the KV pair from the secondary cache
+ // into the primary cache, which changes the cache's usage.
+ Cache::Handle* result =
+ cache_->Lookup(key, helper, create_cb, priority, wait, stats);
+ assert(cache_res_mgr_);
+ cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage())
+ .PermitUncheckedError();
+ return result;
+}
+
+bool ChargedCache::Release(Cache::Handle* handle, bool useful,
+ bool erase_if_last_ref) {
+ // The entry's usage is read up front because it is needed after the
+ // handle has been handed back to the wrapped cache.
+ const size_t released_usage = cache_->GetUsage(handle);
+ if (!cache_->Release(handle, useful, erase_if_last_ref)) {
+ // Entry was not erased: reservation stays as it is.
+ return false;
+ }
+ // Entry was erased: shrink the reservation by the entry's usage.
+ assert(cache_res_mgr_);
+ cache_res_mgr_
+ ->UpdateCacheReservation(released_usage, /* increase */ false)
+ .PermitUncheckedError();
+ return true;
+}
+
+bool ChargedCache::Release(Cache::Handle* handle, bool erase_if_last_ref) {
+ // The entry's usage is read up front because it is needed after the
+ // handle has been handed back to the wrapped cache.
+ const size_t released_usage = cache_->GetUsage(handle);
+ if (!cache_->Release(handle, erase_if_last_ref)) {
+ // Entry was not erased: reservation stays as it is.
+ return false;
+ }
+ // Entry was erased: shrink the reservation by the entry's usage.
+ assert(cache_res_mgr_);
+ cache_res_mgr_
+ ->UpdateCacheReservation(released_usage, /* increase */ false)
+ .PermitUncheckedError();
+ return true;
+}
+
+void ChargedCache::Erase(const Slice& key) {
+ // Erase in the wrapped cache, then resynchronize the reservation with
+ // whatever usage the cache reports afterwards.
+ cache_->Erase(key);
+ assert(cache_res_mgr_);
+ const size_t usage_after = cache_->GetUsage();
+ cache_res_mgr_->UpdateCacheReservation(usage_after).PermitUncheckedError();
+}
+
+void ChargedCache::EraseUnRefEntries() {
+ // Forward to the wrapped cache, then resynchronize the reservation with
+ // whatever usage the cache reports afterwards.
+ cache_->EraseUnRefEntries();
+ assert(cache_res_mgr_);
+ const size_t usage_after = cache_->GetUsage();
+ cache_res_mgr_->UpdateCacheReservation(usage_after).PermitUncheckedError();
+}
+
+void ChargedCache::SetCapacity(size_t capacity) {
+ // Lowering the capacity can evict entries, so after forwarding the new
+ // capacity the reservation is resynchronized with the cache's usage.
+ cache_->SetCapacity(capacity);
+ assert(cache_res_mgr_);
+ const size_t usage_after = cache_->GetUsage();
+ cache_res_mgr_->UpdateCacheReservation(usage_after).PermitUncheckedError();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/charged_cache.h b/src/rocksdb/cache/charged_cache.h
new file mode 100644
index 000000000..1739e4088
--- /dev/null
+++ b/src/rocksdb/cache/charged_cache.h
@@ -0,0 +1,121 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+
+#include "port/port.h"
+#include "rocksdb/cache.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ConcurrentCacheReservationManager;
+
+// A Cache implementation that wraps another cache, forwards every call to
+// it, and reserves space in the block cache (through a cache reservation
+// manager) so that the wrapped cache counts towards a single global memory limit.
+class ChargedCache : public Cache {
+ public:
+ ChargedCache(std::shared_ptr<Cache> cache,  // `cache`: the wrapped cache
+ std::shared_ptr<Cache> block_cache);  // `block_cache`: where reservations are made
+ ~ChargedCache() override = default;
+
+ // The operations below are defined in charged_cache.cc.
+ Status Insert(const Slice& key, void* value, size_t charge, DeleterFn deleter,
+ Handle** handle, Priority priority) override;
+ Status Insert(const Slice& key, void* value, const CacheItemHelper* helper,
+ size_t charge, Handle** handle = nullptr,
+ Priority priority = Priority::LOW) override;
+
+ Cache::Handle* Lookup(const Slice& key, Statistics* stats) override;
+ Cache::Handle* Lookup(const Slice& key, const CacheItemHelper* helper,
+ const CreateCallback& create_cb, Priority priority,
+ bool wait, Statistics* stats = nullptr) override;
+
+ bool Release(Cache::Handle* handle, bool useful,
+ bool erase_if_last_ref = false) override;
+ bool Release(Cache::Handle* handle, bool erase_if_last_ref = false) override;
+
+ void Erase(const Slice& key) override;
+ void EraseUnRefEntries() override;
+
+ static const char* kClassName() { return "ChargedCache"; }
+ const char* Name() const override { return kClassName(); }
+
+ // Everything defined inline below is a plain pass-through to cache_.
+ uint64_t NewId() override { return cache_->NewId(); }
+
+ void SetCapacity(size_t capacity) override;  // defined in charged_cache.cc
+
+ void SetStrictCapacityLimit(bool strict_capacity_limit) override {
+ cache_->SetStrictCapacityLimit(strict_capacity_limit);
+ }
+
+ bool HasStrictCapacityLimit() const override {
+ return cache_->HasStrictCapacityLimit();
+ }
+
+ void* Value(Cache::Handle* handle) override { return cache_->Value(handle); }
+
+ bool IsReady(Cache::Handle* handle) override {
+ return cache_->IsReady(handle);
+ }
+
+ void Wait(Cache::Handle* handle) override { cache_->Wait(handle); }
+
+ void WaitAll(std::vector<Handle*>& handles) override {
+ cache_->WaitAll(handles);
+ }
+
+ bool Ref(Cache::Handle* handle) override { return cache_->Ref(handle); }
+
+ size_t GetCapacity() const override { return cache_->GetCapacity(); }
+
+ size_t GetUsage() const override { return cache_->GetUsage(); }  // total usage of the wrapped cache
+
+ size_t GetUsage(Cache::Handle* handle) const override {  // usage of one entry
+ return cache_->GetUsage(handle);
+ }
+
+ size_t GetPinnedUsage() const override { return cache_->GetPinnedUsage(); }
+
+ size_t GetCharge(Cache::Handle* handle) const override {
+ return cache_->GetCharge(handle);
+ }
+
+ Cache::DeleterFn GetDeleter(Cache::Handle* handle) const override {
+ return cache_->GetDeleter(handle);
+ }
+
+ void ApplyToAllEntries(
+ const std::function<void(const Slice& key, void* value, size_t charge,
+ Cache::DeleterFn deleter)>& callback,
+ const Cache::ApplyToAllEntriesOptions& opts) override {
+ cache_->ApplyToAllEntries(callback, opts);
+ }
+
+ void ApplyToAllCacheEntries(void (*callback)(void* value, size_t charge),
+ bool thread_safe) override {
+ cache_->ApplyToAllCacheEntries(callback, thread_safe);
+ }
+
+ std::string GetPrintableOptions() const override {
+ return cache_->GetPrintableOptions();
+ }
+
+ void DisownData() override { return cache_->DisownData(); }
+
+ inline Cache* GetCache() const { return cache_.get(); }  // non-owning pointer to the wrapped cache
+
+ inline ConcurrentCacheReservationManager* TEST_GetCacheReservationManager()  // non-owning; TEST_ prefix suggests test-only use
+ const {
+ return cache_res_mgr_.get();
+ }
+
+ private:
+ std::shared_ptr<Cache> cache_;  // wrapped cache; target of all forwarded calls
+ std::shared_ptr<ConcurrentCacheReservationManager> cache_res_mgr_;  // tracks the reservation for cache_'s usage
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/clock_cache.cc b/src/rocksdb/cache/clock_cache.cc
new file mode 100644
index 000000000..6c9f18c2f
--- /dev/null
+++ b/src/rocksdb/cache/clock_cache.cc
@@ -0,0 +1,1404 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "cache/clock_cache.h"
+
+#include <cassert>
+#include <functional>
+#include <numeric>
+
+#include "cache/cache_key.h"
+#include "logging/logging.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/statistics.h"
+#include "port/lang.h"
+#include "util/hash.h"
+#include "util/math.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace clock_cache {
+
+namespace {
+inline uint64_t GetRefcount(uint64_t meta) {
+ // Outstanding references = acquire counter minus release counter. The
+ // shifted values still carry higher bits of `meta`; the final mask reduces
+ // the difference to the counter width.
+ const uint64_t acquires = meta >> ClockHandle::kAcquireCounterShift;
+ const uint64_t releases = meta >> ClockHandle::kReleaseCounterShift;
+ return (acquires - releases) & ClockHandle::kCounterMask;
+}
+
+inline uint64_t GetInitialCountdown(Cache::Priority priority) {  // maps a priority to its initial clock countdown
+ // Set initial clock data from priority
+ // TODO: configuration parameters for priority handling and clock cycle
+ // count?
+ switch (priority) {
+ case Cache::Priority::HIGH:
+ return ClockHandle::kHighCountdown;
+ default:  // unknown priority: asserts in debug builds, otherwise treated as LOW
+ assert(false);
+ FALLTHROUGH_INTENDED;
+ case Cache::Priority::LOW:
+ return ClockHandle::kLowCountdown;
+ case Cache::Priority::BOTTOM:
+ return ClockHandle::kBottomCountdown;
+ }
+}
+
+inline void FreeDataMarkEmpty(ClockHandle& h) {  // frees h's data, then resets its meta word to 0 (empty)
+ // NOTE: in theory there's more room for parallelism if we copy the handle
+ // data and delay actions like this until after marking the entry as empty,
+ // but performance tests only show a regression by copying the few words
+ // of data.
+ h.FreeData();
+
+#ifndef NDEBUG
+ // Mark slot as empty, with assertion that the slot was in the
+ // Construction state beforehand
+ uint64_t meta = h.meta.exchange(0, std::memory_order_release);
+ assert(meta >> ClockHandle::kStateShift == ClockHandle::kStateConstruction);
+#else
+ // Mark slot as empty
+ h.meta.store(0, std::memory_order_release);
+#endif
+}
+
+inline bool ClockUpdate(ClockHandle& h) {  // returns true only if this call moved `h` into the Construction state
+ uint64_t meta = h.meta.load(std::memory_order_relaxed);
+
+ uint64_t acquire_count =
+ (meta >> ClockHandle::kAcquireCounterShift) & ClockHandle::kCounterMask;
+ uint64_t release_count =
+ (meta >> ClockHandle::kReleaseCounterShift) & ClockHandle::kCounterMask;
+ // fprintf(stderr, "ClockUpdate @ %p: %lu %lu %u\n", &h, acquire_count,
+ // release_count, (unsigned)(meta >> ClockHandle::kStateShift));
+ if (acquire_count != release_count) {
+ // Only clock update entries with no outstanding refs
+ return false;
+ }
+ if (!((meta >> ClockHandle::kStateShift) & ClockHandle::kStateShareableBit)) {
+ // Only clock update Shareable entries
+ return false;
+ }
+ if ((meta >> ClockHandle::kStateShift == ClockHandle::kStateVisible) &&
+ acquire_count > 0) {
+ // Decrement clock (both counters get the same value, capped at
+ // kMaxCountdown - 1, so the entry remains unreferenced)
+ uint64_t new_count =
+ std::min(acquire_count - 1, uint64_t{ClockHandle::kMaxCountdown} - 1);
+ // Compare-exchange in the decremented clock info, but
+ // not aggressively (single attempt; a failure is simply ignored)
+ uint64_t new_meta =
+ (uint64_t{ClockHandle::kStateVisible} << ClockHandle::kStateShift) |
+ (new_count << ClockHandle::kReleaseCounterShift) |
+ (new_count << ClockHandle::kAcquireCounterShift);
+ h.meta.compare_exchange_strong(meta, new_meta, std::memory_order_relaxed);
+ return false;
+ }
+ // Otherwise, remove entry (either unreferenced invisible or
+ // unreferenced and expired visible).
+ if (h.meta.compare_exchange_strong(
+ meta,
+ uint64_t{ClockHandle::kStateConstruction} << ClockHandle::kStateShift,  // counters reset to 0
+ std::memory_order_acquire)) {
+ // Took ownership.
+ return true;
+ } else {
+ // Compare-exchange failing probably
+ // indicates the entry was used, so skip it in that case.
+ return false;
+ }
+}
+
+} // namespace
+
+void ClockHandleBasicData::FreeData() const {
+ // Nothing to do for entries without a deleter.
+ if (!deleter) {
+ return;
+ }
+ // The deleter is called with the key recovered from hashed_key via
+ // ReverseHash, plus the stored value.
+ UniqueId64x2 unhashed;
+ (*deleter)(
+ ClockCacheShard<HyperClockTable>::ReverseHash(hashed_key, &unhashed),
+ value);
+}
+
+HyperClockTable::HyperClockTable(
+ size_t capacity, bool /*strict_capacity_limit*/,  // strict flag is unused here
+ CacheMetadataChargePolicy metadata_charge_policy, const Opts& opts)
+ : length_bits_(CalcHashBits(capacity, opts.estimated_value_size,
+ metadata_charge_policy)),
+ length_bits_mask_((size_t{1} << length_bits_) - 1),  // all-ones mask of length_bits_ bits
+ occupancy_limit_(static_cast<size_t>((uint64_t{1} << length_bits_) *
+ kStrictLoadFactor)),  // slot count scaled by kStrictLoadFactor
+ array_(new HandleImpl[size_t{1} << length_bits_]) {  // 2^length_bits_ slots
+ if (metadata_charge_policy ==
+ CacheMetadataChargePolicy::kFullChargeCacheMetadata) {
+ usage_ += size_t{GetTableSize()} * sizeof(HandleImpl);  // the table itself counts towards usage
+ }
+
+ static_assert(sizeof(HandleImpl) == 64U,
+ "Expecting size / alignment with common cache line size");
+}
+
+HyperClockTable::~HyperClockTable() {
+ // Assumes there are no references or active operations on any slot/element
+ // in the table. Frees the data of every remaining entry.
+ for (size_t i = 0; i < GetTableSize(); i++) {
+ HandleImpl& h = array_[i];
+ switch (h.meta >> ClockHandle::kStateShift) {
+ case ClockHandle::kStateEmpty:
+ // noop
+ break;
+ case ClockHandle::kStateInvisible: // rare but possible
+ case ClockHandle::kStateVisible:
+ assert(GetRefcount(h.meta) == 0);
+ h.FreeData();
+#ifndef NDEBUG
+ Rollback(h.hashed_key, &h);  // debug-only bookkeeping; presumably so the assertions below can hold
+ ReclaimEntryUsage(h.GetTotalCharge());
+#endif
+ break;
+ // otherwise (any other state is unexpected at destruction)
+ default:
+ assert(false);
+ break;
+ }
+ }
+
+#ifndef NDEBUG
+ for (size_t i = 0; i < GetTableSize(); i++) {
+ assert(array_[i].displacements.load() == 0);
+ }
+#endif
+
+ assert(usage_.load() == 0 ||
+ usage_.load() == size_t{GetTableSize()} * sizeof(HandleImpl));  // second case: table metadata was charged to usage
+ assert(occupancy_ == 0);
+}
+
+// If an entry doesn't receive clock updates but is repeatedly referenced &
+// released, the acquire and release counters could overflow without some
+// intervention. This is that intervention, which should be inexpensive
+// because it only incurs a simple, very predictable check. (Applying a bit
+// mask in addition to an increment to every Release likely would be
+// relatively expensive, because it's an extra atomic update.)
+//
+// We do have to assume that we never have many millions of simultaneous
+// references to a cache handle, because we cannot represent so many
+// references with the difference in counters, masked to the number of
+// counter bits. Similarly, we assume there aren't millions of threads
+// holding transient references (which might be "undone" rather than
+// released by the way).
+//
+// Consider these possible states for each counter:
+// low: less than kMaxCountdown
+// medium: kMaxCountdown to half way to overflow + kMaxCountdown
+// high: half way to overflow + kMaxCountdown, or greater
+//
+// And these possible states for the combination of counters:
+// acquire / release
+// ------- -------
+// low low - Normal / common, with caveats (see below)
+// medium low - Can happen while holding some refs
+// high low - Violates assumptions (too many refs)
+// low medium - Violates assumptions (refs underflow, etc.)
+// medium medium - Normal (very read heavy cache)
+// high medium - Can happen while holding some refs
+// low high - This function is supposed to prevent
+// medium high - Violates assumptions (refs underflow, etc.)
+// high high - Needs CorrectNearOverflow
+//
+// Basically, this function detects (high, high) state (inferred from
+// release alone being high) and bumps it back down to (medium, medium)
+// state with the same refcount and the same logical countdown counter
+// (everything > kMaxCountdown is logically the same). Note that bumping
+// down to (low, low) would modify the countdown counter, so is "reserved"
+// in a sense.
+//
+// If near-overflow correction is triggered here, there's no guarantee
+// that another thread hasn't freed the entry and replaced it with another.
+// Therefore, it must be the case that the correction does not affect
+// entries unless they are very old (many millions of acquire-release cycles).
+// (Our bit manipulation is indeed idempotent and only affects entries in
+// exceptional cases.) We assume a pre-empted thread will not stall that long.
+// If it did, the state could be corrupted in the (unlikely) case that the top
+// bit of the acquire counter is set but not the release counter, and thus
+// we only clear the top bit of the acquire counter on resumption. It would
+// then appear that there are too many refs and the entry would be permanently
+// pinned (which is not terrible for an exceptionally rare occurrence), unless
+// it is referenced enough (at least kMaxCountdown more times) for the release
+// counter to reach "high" state again and bumped back to "medium." (This
+// motivates only checking for release counter in high state, not both in high
+// state.)
+inline void CorrectNearOverflow(uint64_t old_meta,  // `old_meta`: value observed before the caller's update
+ std::atomic<uint64_t>& meta) {  // `meta`: the live meta word to correct
+ // We clear both top-most counter bits at the same time.
+ constexpr uint64_t kCounterTopBit = uint64_t{1}
+ << (ClockHandle::kCounterNumBits - 1);
+ constexpr uint64_t kClearBits =
+ (kCounterTopBit << ClockHandle::kAcquireCounterShift) |
+ (kCounterTopBit << ClockHandle::kReleaseCounterShift);
+ // A simple check that allows us to initiate clearing the top bits for
+ // a large portion of the "high" state space on release counter.
+ // NOTE(review): kCheckBits ORs two bit patterns and the test below is
+ // non-zero when any of those bits is set in old_meta, not only the top
+ // bit - confirm this is the intended trigger.
+ constexpr uint64_t kCheckBits =
+ (kCounterTopBit | (ClockHandle::kMaxCountdown + 1))
+ << ClockHandle::kReleaseCounterShift;
+
+ if (UNLIKELY(old_meta & kCheckBits)) {
+ meta.fetch_and(~kClearBits, std::memory_order_relaxed);  // clears only the two top counter bits
+ }
+}
+
+inline Status HyperClockTable::ChargeUsageMaybeEvictStrict(
+ size_t total_charge, size_t capacity, bool need_evict_for_occupancy) {  // returns OK, or MemoryLimit if the charge cannot be accommodated
+ if (total_charge > capacity) {
+ return Status::MemoryLimit(
+ "Cache entry too large for a single cache shard: " +
+ std::to_string(total_charge) + " > " + std::to_string(capacity));
+ }
+ // Grab any available capacity, and free up any more required.
+ size_t old_usage = usage_.load(std::memory_order_relaxed);
+ size_t new_usage;
+ if (LIKELY(old_usage != capacity)) {
+ do {
+ new_usage = std::min(capacity, old_usage + total_charge);  // never reserve beyond capacity
+ } while (!usage_.compare_exchange_weak(old_usage, new_usage,
+ std::memory_order_relaxed));
+ } else {
+ new_usage = old_usage;  // already at capacity: nothing to grab
+ }
+ // How much do we need to evict then? (the part of total_charge that was
+ // not covered by the usage increase above)
+ size_t need_evict_charge = old_usage + total_charge - new_usage;
+ size_t request_evict_charge = need_evict_charge;
+ if (UNLIKELY(need_evict_for_occupancy) && request_evict_charge == 0) {
+ // Require at least 1 eviction.
+ request_evict_charge = 1;
+ }
+ if (request_evict_charge > 0) {
+ size_t evicted_charge = 0;
+ size_t evicted_count = 0;
+ Evict(request_evict_charge, &evicted_charge, &evicted_count);
+ occupancy_.fetch_sub(evicted_count, std::memory_order_release);
+ if (LIKELY(evicted_charge > need_evict_charge)) {
+ assert(evicted_count > 0);
+ // Evicted more than enough; give the surplus back to usage_
+ usage_.fetch_sub(evicted_charge - need_evict_charge,
+ std::memory_order_relaxed);
+ } else if (evicted_charge < need_evict_charge ||
+ (UNLIKELY(need_evict_for_occupancy) && evicted_count == 0)) {
+ // Roll back to old usage minus evicted
+ usage_.fetch_sub(evicted_charge + (new_usage - old_usage),
+ std::memory_order_relaxed);
+ if (evicted_charge < need_evict_charge) {
+ return Status::MemoryLimit(
+ "Insert failed because unable to evict entries to stay within "
+ "capacity limit.");
+ } else {
+ return Status::MemoryLimit(
+ "Insert failed because unable to evict entries to stay within "
+ "table occupancy limit.");
+ }
+ }
+ // If we needed to evict something and we are proceeding, we must have
+ // evicted something.
+ assert(evicted_count > 0);
+ }
+ return Status::OK();
+}
+
+inline bool HyperClockTable::ChargeUsageMaybeEvictNonStrict(
+ size_t total_charge, size_t capacity, bool need_evict_for_occupancy) {  // false only when a required occupancy eviction found nothing
+ // For simplicity, we consider that either the cache can accept the insert
+ // with no evictions, or we must evict enough to make (at least) enough
+ // space. It could lead to unnecessary failures or excessive evictions in
+ // some extreme cases, but allows a fast, simple protocol. If we allow a
+ // race to get us over capacity, then we might never get back to capacity
+ // limit if the sizes of entries allow each insertion to evict the minimum
+ // charge. Thus, we should evict some extra if it's not a significant
+ // portion of the shard capacity. This can have the side benefit of
+ // involving fewer threads in eviction.
+ size_t old_usage = usage_.load(std::memory_order_relaxed);
+ size_t need_evict_charge;
+ // NOTE: if total_charge > old_usage, there isn't yet enough to evict
+ // `total_charge` amount. Even if we only try to evict `old_usage` amount,
+ // there's likely something referenced and we would eat CPU looking for
+ // enough to evict.
+ if (old_usage + total_charge <= capacity || total_charge > old_usage) {
+ // Good enough for me (might run over with a race)
+ need_evict_charge = 0;
+ } else {
+ // Try to evict enough space, and maybe some extra
+ need_evict_charge = total_charge;
+ if (old_usage > capacity) {
+ // Not too much to avoid thundering herd while avoiding strict
+ // synchronization, such as the compare_exchange used with strict
+ // capacity limit.
+ need_evict_charge += std::min(capacity / 1024, total_charge) + 1;
+ }
+ }
+ if (UNLIKELY(need_evict_for_occupancy) && need_evict_charge == 0) {
+ // Special case: require at least 1 eviction if we only have to
+ // deal with occupancy
+ need_evict_charge = 1;
+ }
+ size_t evicted_charge = 0;
+ size_t evicted_count = 0;
+ if (need_evict_charge > 0) {
+ Evict(need_evict_charge, &evicted_charge, &evicted_count);
+ // Deal with potential occupancy deficit
+ if (UNLIKELY(need_evict_for_occupancy) && evicted_count == 0) {
+ assert(evicted_charge == 0);
+ // Can't meet occupancy requirement (usage_ has not been modified)
+ return false;
+ } else {
+ // Update occupancy for evictions
+ occupancy_.fetch_sub(evicted_count, std::memory_order_release);
+ }
+ }
+ // Track new usage even if we weren't able to evict enough
+ usage_.fetch_add(total_charge - evicted_charge, std::memory_order_relaxed);  // unsigned arithmetic: may wrap when more was evicted than charged
+ // No underflow
+ assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2);
+ // Success
+ return true;
+}
+
+inline HyperClockTable::HandleImpl* HyperClockTable::DetachedInsert(
+ const ClockHandleBasicData& proto) {  // returns a new heap-allocated handle holding one reference
+ // Heap allocated separate from table
+ HandleImpl* h = new HandleImpl();
+ ClockHandleBasicData* h_alias = h;  // view of the base part, so only the basic data is assigned
+ *h_alias = proto;
+ h->SetDetached();
+ // Single reference (detached entries only created if returning a refed
+ // Handle back to user)
+ uint64_t meta = uint64_t{ClockHandle::kStateInvisible}
+ << ClockHandle::kStateShift;
+ meta |= uint64_t{1} << ClockHandle::kAcquireCounterShift;  // acquire count 1, release count 0
+ h->meta.store(meta, std::memory_order_release);
+ // Keep track of how much of usage is detached
+ detached_usage_.fetch_add(proto.GetTotalCharge(), std::memory_order_relaxed);
+ return h;
+}
+
+Status HyperClockTable::Insert(const ClockHandleBasicData& proto,
+ HandleImpl** handle, Cache::Priority priority,
+ size_t capacity, bool strict_capacity_limit) {  // OK = in table (or dropped), OkOverwritten = detached, other = failure
+ // Do we have the available occupancy? Optimistically assume we do
+ // and deal with it if we don't.
+ size_t old_occupancy = occupancy_.fetch_add(1, std::memory_order_acquire);
+ auto revert_occupancy_fn = [&]() {
+ occupancy_.fetch_sub(1, std::memory_order_relaxed);
+ };
+ // Whether we over-committed and need an eviction to make up for it
+ bool need_evict_for_occupancy = old_occupancy >= occupancy_limit_;
+
+ // Usage/capacity handling is somewhat different depending on
+ // strict_capacity_limit, but mostly pessimistic.
+ bool use_detached_insert = false;
+ const size_t total_charge = proto.GetTotalCharge();
+ if (strict_capacity_limit) {
+ Status s = ChargeUsageMaybeEvictStrict(total_charge, capacity,
+ need_evict_for_occupancy);
+ if (!s.ok()) {
+ revert_occupancy_fn();
+ return s;
+ }
+ } else {
+ // Case strict_capacity_limit == false
+ bool success = ChargeUsageMaybeEvictNonStrict(total_charge, capacity,
+ need_evict_for_occupancy);
+ if (!success) {
+ revert_occupancy_fn();
+ if (handle == nullptr) {
+ // Don't insert the entry but still return ok, as if the entry
+ // inserted into cache and evicted immediately.
+ proto.FreeData();
+ return Status::OK();
+ } else {
+ // Need to track usage of fallback detached insert
+ usage_.fetch_add(total_charge, std::memory_order_relaxed);
+ use_detached_insert = true;
+ }
+ }
+ }
+ auto revert_usage_fn = [&]() {
+ usage_.fetch_sub(total_charge, std::memory_order_relaxed);
+ // No underflow
+ assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2);
+ };
+
+ if (!use_detached_insert) {
+ // Attempt a table insert, but abort if we find an existing entry for the
+ // key. If we were to overwrite old entries, we would either
+ // * Have to gain ownership over an existing entry to overwrite it, which
+ // would only work if there are no outstanding (read) references and would
+ // create a small gap in availability of the entry (old or new) to lookups.
+ // * Have to insert into a suboptimal location (more probes) so that the
+ // old entry can be kept around as well.
+
+ uint64_t initial_countdown = GetInitialCountdown(priority);
+ assert(initial_countdown > 0);
+
+ size_t probe = 0;
+ HandleImpl* e = FindSlot(
+ proto.hashed_key,
+ [&](HandleImpl* h) {  // first callback: returns true to stop probing at this slot
+ // Optimistically transition the slot from "empty" to
+ // "under construction" (no effect on other states)
+ uint64_t old_meta =
+ h->meta.fetch_or(uint64_t{ClockHandle::kStateOccupiedBit}
+ << ClockHandle::kStateShift,
+ std::memory_order_acq_rel);
+ uint64_t old_state = old_meta >> ClockHandle::kStateShift;
+
+ if (old_state == ClockHandle::kStateEmpty) {
+ // We've started inserting into an available slot, and taken
+ // ownership. Save data fields
+ ClockHandleBasicData* h_alias = h;
+ *h_alias = proto;
+
+ // Transition from "under construction" state to "visible" state
+ uint64_t new_meta = uint64_t{ClockHandle::kStateVisible}
+ << ClockHandle::kStateShift;
+
+ // Maybe with an outstanding reference
+ new_meta |= initial_countdown << ClockHandle::kAcquireCounterShift;
+ new_meta |= (initial_countdown - (handle != nullptr))  // one fewer release when a handle is returned
+ << ClockHandle::kReleaseCounterShift;
+
+#ifndef NDEBUG
+ // Save the state transition, with assertion
+ old_meta = h->meta.exchange(new_meta, std::memory_order_release);
+ assert(old_meta >> ClockHandle::kStateShift ==
+ ClockHandle::kStateConstruction);
+#else
+ // Save the state transition
+ h->meta.store(new_meta, std::memory_order_release);
+#endif
+ return true;
+ } else if (old_state != ClockHandle::kStateVisible) {
+ // Slot not usable / touchable now
+ return false;
+ }
+ // Existing, visible entry, which might be a match.
+ // But first, we need to acquire a ref to read it. In fact, number of
+ // refs for initial countdown, so that we boost the clock state if
+ // this is a match.
+ old_meta = h->meta.fetch_add(
+ ClockHandle::kAcquireIncrement * initial_countdown,
+ std::memory_order_acq_rel);
+ // Like Lookup
+ if ((old_meta >> ClockHandle::kStateShift) ==
+ ClockHandle::kStateVisible) {
+ // Acquired a read reference
+ if (h->hashed_key == proto.hashed_key) {
+ // Match. Release in a way that boosts the clock state
+ old_meta = h->meta.fetch_add(
+ ClockHandle::kReleaseIncrement * initial_countdown,
+ std::memory_order_acq_rel);
+ // Correct for possible (but rare) overflow
+ CorrectNearOverflow(old_meta, h->meta);
+ // Insert detached instead (only if return handle needed)
+ use_detached_insert = true;
+ return true;
+ } else {
+ // Mismatch. Pretend we never took the reference
+ old_meta = h->meta.fetch_sub(
+ ClockHandle::kAcquireIncrement * initial_countdown,
+ std::memory_order_acq_rel);
+ }
+ } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) ==
+ ClockHandle::kStateInvisible)) {
+ // Pretend we never took the reference
+ // WART: there's a tiny chance we release last ref to invisible
+ // entry here. If that happens, we let eviction take care of it.
+ old_meta = h->meta.fetch_sub(
+ ClockHandle::kAcquireIncrement * initial_countdown,
+ std::memory_order_acq_rel);
+ } else {
+ // For other states, incrementing the acquire counter has no effect
+ // so we don't need to undo it.
+ // Slot not usable / touchable now.
+ }
+ (void)old_meta;
+ return false;
+ },
+ [&](HandleImpl* /*h*/) { return false; },  // second callback: always false ("never abort", see comment below)
+ [&](HandleImpl* h) {  // third callback: bumps the slot's displacement count
+ h->displacements.fetch_add(1, std::memory_order_relaxed);
+ },
+ probe);
+ if (e == nullptr) {
+ // Occupancy check and never abort FindSlot above should generally
+ // prevent this, except it's theoretically possible for other threads
+ // to evict and replace entries in the right order to hit every slot
+ // when it is populated. Assuming random hashing, the chance of that
+ // should be no higher than pow(kStrictLoadFactor, n) for n slots.
+ // That should be infeasible for roughly n >= 256, so if this assertion
+ // fails, that suggests something is going wrong.
+ assert(GetTableSize() < 256);
+ use_detached_insert = true;
+ }
+ if (!use_detached_insert) {
+ // Successfully inserted
+ if (handle) {
+ *handle = e;
+ }
+ return Status::OK();
+ }
+ // Roll back table insertion
+ Rollback(proto.hashed_key, e);
+ revert_occupancy_fn();
+ // Maybe fall back on detached insert
+ if (handle == nullptr) {
+ revert_usage_fn();
+ // As if unrefed entry immediately evicted
+ proto.FreeData();
+ return Status::OK();
+ }
+ }
+
+ // Run detached insert (only reachable when the caller asked for a handle)
+ assert(use_detached_insert);
+
+ *handle = DetachedInsert(proto);
+
+ // The OkOverwritten status is used to count "redundant" insertions into
+ // block cache. This implementation doesn't strictly check for redundant
+ // insertions, but we instead are probably interested in how many insertions
+ // didn't go into the table (instead "detached"), which could be redundant
+ // Insert or some other reason (use_detached_insert reasons above).
+ return Status::OkOverwritten();
+}
+
+HyperClockTable::HandleImpl* HyperClockTable::Lookup(
+ const UniqueId64x2& hashed_key) {  // returns whatever FindSlot yields; a matched entry keeps its acquired ref
+ size_t probe = 0;
+ HandleImpl* e = FindSlot(
+ hashed_key,
+ [&](HandleImpl* h) {  // first callback: true when `h` is a visible entry with the same hashed key
+ // Mostly branch-free version (similar performance)
+ /*
+ uint64_t old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement,
+ std::memory_order_acquire);
+ bool Shareable = (old_meta >> (ClockHandle::kStateShift + 1)) & 1U;
+ bool visible = (old_meta >> ClockHandle::kStateShift) & 1U;
+ bool match = (h->key == key) & visible;
+ h->meta.fetch_sub(static_cast<uint64_t>(Shareable & !match) <<
+ ClockHandle::kAcquireCounterShift, std::memory_order_release); return
+ match;
+ */
+ // Optimistic lookup should pay off when the table is relatively
+ // sparse.
+ constexpr bool kOptimisticLookup = true;
+ uint64_t old_meta;
+ if (!kOptimisticLookup) {  // compiled-in alternative: check the state before touching counters
+ old_meta = h->meta.load(std::memory_order_acquire);
+ if ((old_meta >> ClockHandle::kStateShift) !=
+ ClockHandle::kStateVisible) {
+ return false;
+ }
+ }
+ // (Optimistically) increment acquire counter
+ old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement,
+ std::memory_order_acquire);
+ // Check if it's an entry visible to lookups
+ if ((old_meta >> ClockHandle::kStateShift) ==
+ ClockHandle::kStateVisible) {
+ // Acquired a read reference
+ if (h->hashed_key == hashed_key) {
+ // Match (the acquired reference is kept)
+ return true;
+ } else {
+ // Mismatch. Pretend we never took the reference
+ old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
+ std::memory_order_release);
+ }
+ } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) ==
+ ClockHandle::kStateInvisible)) {
+ // Pretend we never took the reference
+ // WART: there's a tiny chance we release last ref to invisible
+ // entry here. If that happens, we let eviction take care of it.
+ old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
+ std::memory_order_release);
+ } else {
+ // For other states, incrementing the acquire counter has no effect
+ // so we don't need to undo it. Furthermore, we cannot safely undo
+ // it because we did not acquire a read reference to lock the
+ // entry in a Shareable state.
+ }
+ (void)old_meta;
+ return false;
+ },
+ [&](HandleImpl* h) {  // second callback: true when this slot has zero displacements
+ return h->displacements.load(std::memory_order_relaxed) == 0;
+ },
+ [&](HandleImpl* /*h*/) {}, probe);  // third callback: no-op for lookups
+
+ return e;
+}
+
+// Drops one reference on h. If `useful` is true the release counter is
+// incremented; otherwise the acquire counter is decremented so the reference
+// appears never to have been taken. When erase_if_last_ref is set, or the
+// entry was observed in the invisible state, this call also tries to take
+// ownership of the entry (CAS to the construction state) once no references
+// remain, and then frees it. Returns true only if this call freed the entry.
+bool HyperClockTable::Release(HandleImpl* h, bool useful,
+ bool erase_if_last_ref) {
+ // In contrast with LRUCache's Release, this function won't delete the handle
+ // when the cache is above capacity and the reference is the last one. Space
+ // is only freed up by EvictFromClock (called by Insert when space is needed)
+ // and Erase. We do this to avoid an extra atomic read of the variable usage_.
+
+ uint64_t old_meta;
+ if (useful) {
+ // Increment release counter to indicate was used
+ old_meta = h->meta.fetch_add(ClockHandle::kReleaseIncrement,
+ std::memory_order_release);
+ } else {
+ // Decrement acquire counter to pretend it never happened
+ old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
+ std::memory_order_release);
+ }
+
+ // The caller must have been holding a reference on a shareable entry.
+ assert((old_meta >> ClockHandle::kStateShift) &
+ ClockHandle::kStateShareableBit);
+ // No underflow
+ assert(((old_meta >> ClockHandle::kAcquireCounterShift) &
+ ClockHandle::kCounterMask) !=
+ ((old_meta >> ClockHandle::kReleaseCounterShift) &
+ ClockHandle::kCounterMask));
+
+ if (erase_if_last_ref || UNLIKELY(old_meta >> ClockHandle::kStateShift ==
+ ClockHandle::kStateInvisible)) {
+ // Update for last fetch_add op
+ if (useful) {
+ old_meta += ClockHandle::kReleaseIncrement;
+ } else {
+ old_meta -= ClockHandle::kAcquireIncrement;
+ }
+ // Take ownership if no refs
+ do {
+ if (GetRefcount(old_meta) != 0) {
+ // Not last ref at some point in time during this Release call
+ // Correct for possible (but rare) overflow
+ CorrectNearOverflow(old_meta, h->meta);
+ return false;
+ }
+ if ((old_meta & (uint64_t{ClockHandle::kStateShareableBit}
+ << ClockHandle::kStateShift)) == 0) {
+ // Someone else took ownership
+ return false;
+ }
+ // Note that there's a small chance that we release, another thread
+ // replaces this entry with another, reaches zero refs, and then we end
+ // up erasing that other entry. That's an acceptable risk / imprecision.
+ } while (!h->meta.compare_exchange_weak(
+ old_meta,
+ uint64_t{ClockHandle::kStateConstruction} << ClockHandle::kStateShift,
+ std::memory_order_acquire));
+ // Took ownership
+ // Read the charge before the entry's data is freed below.
+ size_t total_charge = h->GetTotalCharge();
+ if (UNLIKELY(h->IsDetached())) {
+ h->FreeData();
+ // Delete detached handle
+ delete h;
+ detached_usage_.fetch_sub(total_charge, std::memory_order_relaxed);
+ usage_.fetch_sub(total_charge, std::memory_order_relaxed);
+ } else {
+ // Rollback reads h->hashed_key, so it runs before the slot is marked
+ // empty.
+ Rollback(h->hashed_key, h);
+ FreeDataMarkEmpty(*h);
+ ReclaimEntryUsage(total_charge);
+ }
+ return true;
+ } else {
+ // Correct for possible (but rare) overflow
+ CorrectNearOverflow(old_meta, h->meta);
+ return false;
+ }
+}
+
+// Takes one additional reference on an entry the caller already references,
+// by adding one acquire increment to its meta word.
+void HyperClockTable::Ref(HandleImpl& h) {
+  const uint64_t prev_meta = h.meta.fetch_add(ClockHandle::kAcquireIncrement,
+                                              std::memory_order_acquire);
+
+  // Sanity checks on the pre-increment value: the entry has to be shareable
+  // and the caller must already have been holding a reference.
+  assert((prev_meta >> ClockHandle::kStateShift) &
+         ClockHandle::kStateShareableBit);
+  assert(GetRefcount(prev_meta) > 0);
+  // Only the assertions consume prev_meta.
+  (void)prev_meta;
+}
+
+// Test helper: takes n references on h with a single atomic add of n acquire
+// increments.
+void HyperClockTable::TEST_RefN(HandleImpl& h, size_t n) {
+  const uint64_t acquire_delta = n * ClockHandle::kAcquireIncrement;
+  const uint64_t prev_meta =
+      h.meta.fetch_add(acquire_delta, std::memory_order_acquire);
+
+  // The entry is expected to have been shareable before the add.
+  assert((prev_meta >> ClockHandle::kStateShift) &
+         ClockHandle::kStateShareableBit);
+  // Only the assertion consumes prev_meta.
+  (void)prev_meta;
+}
+
+// Test helper: releases n references on h. The first n - 1 are applied as one
+// bulk add to the release counter; the final one goes through Release() so
+// its usual checks and follow-up handling run. Does nothing when n is zero.
+void HyperClockTable::TEST_ReleaseN(HandleImpl* h, size_t n) {
+  if (n == 0) {
+    return;
+  }
+
+  const uint64_t prev_meta = h->meta.fetch_add(
+      (n - 1) * ClockHandle::kReleaseIncrement, std::memory_order_acquire);
+  assert((prev_meta >> ClockHandle::kStateShift) &
+         ClockHandle::kStateShareableBit);
+  // Only the assertion consumes prev_meta.
+  (void)prev_meta;
+
+  Release(h, /*useful*/ true, /*erase_if_last_ref*/ false);
+}
+
+// Makes every visible entry matching hashed_key invisible to lookups, and
+// frees each such entry immediately if this call holds the only reference to
+// it. Entries that are still referenced elsewhere are left in the invisible
+// state. The match callback always returns false, so probing continues past
+// matches until the abort condition or the end of the probe sequence.
+void HyperClockTable::Erase(const UniqueId64x2& hashed_key) {
+ size_t probe = 0;
+ (void)FindSlot(
+ hashed_key,
+ [&](HandleImpl* h) {
+ // Could be multiple entries in rare cases. Erase them all.
+ // Optimistically increment acquire counter
+ uint64_t old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement,
+ std::memory_order_acquire);
+ // Check if it's an entry visible to lookups
+ if ((old_meta >> ClockHandle::kStateShift) ==
+ ClockHandle::kStateVisible) {
+ // Acquired a read reference
+ if (h->hashed_key == hashed_key) {
+ // Match. Set invisible.
+ old_meta =
+ h->meta.fetch_and(~(uint64_t{ClockHandle::kStateVisibleBit}
+ << ClockHandle::kStateShift),
+ std::memory_order_acq_rel);
+ // Apply update to local copy
+ old_meta &= ~(uint64_t{ClockHandle::kStateVisibleBit}
+ << ClockHandle::kStateShift);
+ // Loop until we either see another reference holder or win the
+ // CAS that takes ownership. A failed compare_exchange_weak
+ // refreshes old_meta, so each iteration re-evaluates the refcount.
+ for (;;) {
+ uint64_t refcount = GetRefcount(old_meta);
+ assert(refcount > 0);
+ if (refcount > 1) {
+ // Not last ref at some point in time during this Erase call
+ // Pretend we never took the reference
+ h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
+ std::memory_order_release);
+ break;
+ } else if (h->meta.compare_exchange_weak(
+ old_meta,
+ uint64_t{ClockHandle::kStateConstruction}
+ << ClockHandle::kStateShift,
+ std::memory_order_acq_rel)) {
+ // Took ownership
+ assert(hashed_key == h->hashed_key);
+ size_t total_charge = h->GetTotalCharge();
+ FreeDataMarkEmpty(*h);
+ ReclaimEntryUsage(total_charge);
+ // We already have a copy of hashed_key in this case, so OK to
+ // delay Rollback until after releasing the entry
+ Rollback(hashed_key, h);
+ break;
+ }
+ }
+ } else {
+ // Mismatch. Pretend we never took the reference
+ h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
+ std::memory_order_release);
+ }
+ } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) ==
+ ClockHandle::kStateInvisible)) {
+ // Pretend we never took the reference
+ // WART: there's a tiny chance we release last ref to invisible
+ // entry here. If that happens, we let eviction take care of it.
+ h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
+ std::memory_order_release);
+ } else {
+ // For other states, incrementing the acquire counter has no effect
+ // so we don't need to undo it.
+ }
+ return false;
+ },
+ // abort_fn: stop probing at a slot with a zero displacements count.
+ [&](HandleImpl* h) {
+ return h->displacements.load(std::memory_order_relaxed) == 0;
+ },
+ // update_fn: nothing to update while erasing.
+ [&](HandleImpl* /*h*/) {}, probe);
+}
+
+// Calls func on entries stored in slots [index_begin, index_end), holding a
+// temporary read reference on each entry for the duration of its call.
+//
+// When apply_if_will_be_deleted is true, func is applied to every shareable
+// entry. When it is false, func is applied only to entries that are also
+// visible to lookups; entries whose visible bit has been cleared (as Erase
+// does to entries that are still referenced) are skipped.
+void HyperClockTable::ConstApplyToEntriesRange(
+    std::function<void(const HandleImpl&)> func, size_t index_begin,
+    size_t index_end, bool apply_if_will_be_deleted) const {
+  // Every bit in this mask has to be set in an entry's state for func to be
+  // applied to it.
+  uint64_t required_state_bits = ClockHandle::kStateShareableBit;
+  if (!apply_if_will_be_deleted) {
+    required_state_bits |= ClockHandle::kStateVisibleBit;
+  }
+
+  for (size_t i = index_begin; i < index_end; i++) {
+    HandleImpl& h = array_[i];
+
+    // Note: to avoid using compare_exchange, we have to be extra careful.
+    uint64_t old_meta = h.meta.load(std::memory_order_relaxed);
+    // Cheap pre-check without touching the counters. All required bits must
+    // be present: an "any bit set" test would let invisible entries through
+    // on the shareable bit alone, even when the caller asked to skip them.
+    if (((old_meta >> ClockHandle::kStateShift) & required_state_bits) !=
+        required_state_bits) {
+      continue;
+    }
+    // Increment acquire counter. Note: it's possible that the entry has
+    // completely changed since we loaded old_meta, but incrementing acquire
+    // count is always safe. (Similar to optimistic Lookup here.)
+    old_meta = h.meta.fetch_add(ClockHandle::kAcquireIncrement,
+                                std::memory_order_acquire);
+    // Check whether we actually acquired a reference.
+    if ((old_meta >> ClockHandle::kStateShift) &
+        ClockHandle::kStateShareableBit) {
+      // Re-check against the state observed atomically with taking the
+      // reference, since the entry may have changed after the relaxed load.
+      if (((old_meta >> ClockHandle::kStateShift) & required_state_bits) ==
+          required_state_bits) {
+        func(h);
+      }
+      // Pretend we never took the reference
+      h.meta.fetch_sub(ClockHandle::kAcquireIncrement,
+                       std::memory_order_release);
+      // No net change, so don't need to check for overflow
+    } else {
+      // For other states, incrementing the acquire counter has no effect
+      // so we don't need to undo it. Furthermore, we cannot safely undo
+      // it because we did not acquire a read reference to lock the
+      // entry in a Shareable state.
+    }
+  }
+}
+
+// Scans every slot of the table and frees each shareable entry that has no
+// references, provided this thread wins the CAS that takes ownership of it.
+void HyperClockTable::EraseUnRefEntries() {
+ for (size_t i = 0; i <= this->length_bits_mask_; i++) {
+ HandleImpl& h = array_[i];
+
+ uint64_t old_meta = h.meta.load(std::memory_order_relaxed);
+ // The CAS succeeds only if meta still equals the value loaded above, so a
+ // concurrent reference or state change makes this slot be skipped.
+ if (old_meta & (uint64_t{ClockHandle::kStateShareableBit}
+ << ClockHandle::kStateShift) &&
+ GetRefcount(old_meta) == 0 &&
+ h.meta.compare_exchange_strong(old_meta,
+ uint64_t{ClockHandle::kStateConstruction}
+ << ClockHandle::kStateShift,
+ std::memory_order_acquire)) {
+ // Took ownership
+ size_t total_charge = h.GetTotalCharge();
+ // Rollback reads h.hashed_key, so it runs before the slot is marked
+ // empty.
+ Rollback(h.hashed_key, &h);
+ FreeDataMarkEmpty(h);
+ ReclaimEntryUsage(total_charge);
+ }
+ }
+}
+
+// Walks the probe sequence for hashed_key, starting at probe number `probe`.
+// For each slot, in order: returns the slot if match_fn accepts it; returns
+// nullptr if abort_fn says to stop; otherwise calls update_fn on the slot and
+// moves on. Returns nullptr once more than length_bits_mask_ probes have been
+// made. `probe` is advanced for every slot examined except one on which
+// abort_fn stopped the search.
+inline HyperClockTable::HandleImpl* HyperClockTable::FindSlot(
+ const UniqueId64x2& hashed_key, std::function<bool(HandleImpl*)> match_fn,
+ std::function<bool(HandleImpl*)> abort_fn,
+ std::function<void(HandleImpl*)> update_fn, size_t& probe) {
+ // NOTE: upper 32 bits of hashed_key[0] is used for sharding
+ //
+ // We use double-hashing probing. Every probe in the sequence is a
+ // pseudorandom integer, computed as a linear function of two random hashes,
+ // which we call base and increment. Specifically, the i-th probe is base + i
+ // * increment modulo the table size.
+ size_t base = static_cast<size_t>(hashed_key[1]);
+ // We use an odd increment, which is relatively prime with the power-of-two
+ // table size. This implies that we cycle back to the first probe only
+ // after probing every slot exactly once.
+ // TODO: we could also reconsider linear probing, though locality benefits
+ // are limited because each slot is a full cache line
+ size_t increment = static_cast<size_t>(hashed_key[0]) | 1U;
+ size_t current = ModTableSize(base + probe * increment);
+ while (probe <= length_bits_mask_) {
+ HandleImpl* h = &array_[current];
+ if (match_fn(h)) {
+ // Count the matching slot too, so a later call with the same `probe`
+ // resumes after it.
+ probe++;
+ return h;
+ }
+ if (abort_fn(h)) {
+ return nullptr;
+ }
+ probe++;
+ update_fn(h);
+ current = ModTableSize(current + increment);
+ }
+ // We looped back.
+ return nullptr;
+}
+
+// Retraces the probe sequence of hashed_key from its first slot up to (but
+// not including) slot h, decrementing the displacements counter of every slot
+// passed on the way.
+inline void HyperClockTable::Rollback(const UniqueId64x2& hashed_key,
+                                      const HandleImpl* h) {
+  // Same odd step as used when probing for this key.
+  const size_t step = static_cast<size_t>(hashed_key[0]) | 1U;
+  for (size_t slot = ModTableSize(hashed_key[1]); &array_[slot] != h;
+       slot = ModTableSize(slot + step)) {
+    array_[slot].displacements.fetch_sub(1, std::memory_order_relaxed);
+  }
+}
+
+// Accounts for one entry leaving the table: occupancy drops by one and usage
+// drops by total_charge. Debug builds assert that neither counter underflows.
+inline void HyperClockTable::ReclaimEntryUsage(size_t total_charge) {
+  const auto prev_occupancy =
+      occupancy_.fetch_sub(1U, std::memory_order_release);
+  assert(prev_occupancy > 0);
+  (void)prev_occupancy;
+
+  const auto prev_usage =
+      usage_.fetch_sub(total_charge, std::memory_order_relaxed);
+  assert(prev_usage >= total_charge);
+  (void)prev_usage;
+}
+
+// Advances the shared clock pointer in steps of `step_size` slots, applying
+// ClockUpdate to each slot and freeing those it selects for eviction, until
+// at least requested_charge has been freed or the effort cap is reached.
+//
+// requested_charge must be > 0. *freed_charge and *freed_count are
+// accumulated into (not reset), so callers are expected to initialize them.
+inline void HyperClockTable::Evict(size_t requested_charge,
+                                   size_t* freed_charge, size_t* freed_count) {
+  // precondition
+  assert(requested_charge > 0);
+
+  // TODO: make a tuning parameter?
+  constexpr size_t step_size = 4;
+
+  // First (concurrent) increment clock pointer
+  uint64_t old_clock_pointer =
+      clock_pointer_.fetch_add(step_size, std::memory_order_relaxed);
+
+  // Cap the eviction effort at this thread (along with those operating in
+  // parallel) circling through the whole structure kMaxCountdown times.
+  // In other words, this eviction run must find something/anything that is
+  // unreferenced at start of and during the eviction run that isn't reclaimed
+  // by a concurrent eviction run.
+  //
+  // The shift is done on a value widened to 64 bits. kMaxCountdown's declared
+  // type is not visible here; if it is narrower than 64 bits, shifting it
+  // directly would be evaluated in a narrower promoted type and could
+  // overflow for large length_bits_.
+  const uint64_t max_clock_pointer =
+      old_clock_pointer +
+      (uint64_t{ClockHandle::kMaxCountdown} << length_bits_);
+
+  for (;;) {
+    for (size_t i = 0; i < step_size; i++) {
+      HandleImpl& h = array_[ModTableSize(Lower32of64(old_clock_pointer + i))];
+      bool evicting = ClockUpdate(h);
+      if (evicting) {
+        // Rollback and GetTotalCharge read the entry, so both run before the
+        // slot is marked empty.
+        Rollback(h.hashed_key, &h);
+        *freed_charge += h.GetTotalCharge();
+        *freed_count += 1;
+        FreeDataMarkEmpty(h);
+      }
+    }
+
+    // Loop exit condition
+    if (*freed_charge >= requested_charge) {
+      return;
+    }
+    if (old_clock_pointer >= max_clock_pointer) {
+      return;
+    }
+
+    // Advance clock pointer (concurrently)
+    old_clock_pointer =
+        clock_pointer_.fetch_add(step_size, std::memory_order_relaxed);
+  }
+}
+
+// Builds one cache shard: forwards the metadata charge policy to the base
+// class, constructs the table from the given capacity / limit / policy /
+// table options, and records the capacity and strict-limit settings.
+template <class Table>
+ClockCacheShard<Table>::ClockCacheShard(
+ size_t capacity, bool strict_capacity_limit,
+ CacheMetadataChargePolicy metadata_charge_policy,
+ const typename Table::Opts& opts)
+ : CacheShardBase(metadata_charge_policy),
+ table_(capacity, strict_capacity_limit, metadata_charge_policy, opts),
+ capacity_(capacity),
+ strict_capacity_limit_(strict_capacity_limit) {
+ // Initial charge metadata should not exceed capacity
+ // (capacities smaller than a single handle are exempt from the check)
+ assert(table_.GetUsage() <= capacity_ || capacity_ < sizeof(HandleImpl));
+}
+
+// Forwards to the table's EraseUnRefEntries().
+template <class Table>
+void ClockCacheShard<Table>::EraseUnRefEntries() {
+ table_.EraseUnRefEntries();
+}
+
+// Applies callback to the entries in the next batch of about
+// average_entries_per_lock table slots, resuming from *state.
+//
+// *state is an opaque cursor: 0 starts from the first slot, and it is set to
+// SIZE_MAX once the batch reaches the end of the table. Entries that are no
+// longer visible to lookups are not requested (apply_if_will_be_deleted is
+// passed as false).
+template <class Table>
+void ClockCacheShard<Table>::ApplyToSomeEntries(
+    const std::function<void(const Slice& key, void* value, size_t charge,
+                             DeleterFn deleter)>& callback,
+    size_t average_entries_per_lock, size_t* state) {
+  // The state is essentially going to be the starting hash, which works
+  // nicely even if we resize between calls because we use upper-most
+  // hash bits for table indexes.
+  size_t length_bits = table_.GetLengthBits();
+  size_t length = table_.GetTableSize();
+
+  assert(average_entries_per_lock > 0);
+  // Assuming we are called with same average_entries_per_lock repeatedly,
+  // this simplifies some logic (index_end will not overflow).
+  assert(average_entries_per_lock < length || *state == 0);
+
+  // Number of low-order bits of the state that are not part of the index.
+  const size_t index_shift = sizeof(size_t) * 8u - length_bits;
+  // With length_bits == 0 the shift count would equal the width of size_t,
+  // which is undefined behavior; the only possible index is then 0.
+  size_t index_begin = length_bits == 0 ? 0 : *state >> index_shift;
+  size_t index_end = index_begin + average_entries_per_lock;
+  if (index_end >= length) {
+    // Going to end.
+    index_end = length;
+    *state = SIZE_MAX;
+  } else {
+    // Reaching here implies length >= 2, hence length_bits >= 1 and
+    // index_shift is smaller than the width of size_t.
+    *state = index_end << index_shift;
+  }
+
+  // The lambda is only invoked during the call below, so the callback can be
+  // captured by reference instead of copying the std::function.
+  table_.ConstApplyToEntriesRange(
+      [&callback](const HandleImpl& h) {
+        UniqueId64x2 unhashed;
+        callback(ReverseHash(h.hashed_key, &unhashed), h.value,
+                 h.GetTotalCharge(), h.deleter);
+      },
+      index_begin, index_end, false);
+}
+
+// Computes the number of hash bits (log2 of the slot count) for a table of
+// the given capacity, assuming entries average estimated_value_size bytes and
+// the table is filled to kLoadFactor. The slot count is rounded up to a power
+// of two. With kFullChargeCacheMetadata, the per-slot handle size is included
+// in the per-slot charge and the result is reduced until the handle array
+// itself fits within capacity (or reaches 0 bits).
+int HyperClockTable::CalcHashBits(
+    size_t capacity, size_t estimated_value_size,
+    CacheMetadataChargePolicy metadata_charge_policy) {
+  double average_slot_charge = estimated_value_size * kLoadFactor;
+  if (metadata_charge_policy == kFullChargeCacheMetadata) {
+    average_slot_charge += sizeof(HandleImpl);
+  }
+  assert(average_slot_charge > 0.0);
+  uint64_t num_slots =
+      static_cast<uint64_t>(capacity / average_slot_charge + 0.999999);
+  // The rounding above yields 0 when capacity is zero or tiny relative to
+  // the slot charge. Zero would make (num_slots << 1) - 1 wrap around to the
+  // maximum uint64_t and produce a 63-bit table, so use at least one slot.
+  if (num_slots == 0) {
+    num_slots = 1;
+  }
+
+  // Round up to a power of two: FloorLog2(2n - 1) == ceil(log2(n)) for n >= 1.
+  int hash_bits = FloorLog2((num_slots << 1) - 1);
+  if (metadata_charge_policy == kFullChargeCacheMetadata) {
+    // For very small estimated value sizes, it's possible to overshoot
+    while (hash_bits > 0 &&
+           uint64_t{sizeof(HandleImpl)} << hash_bits > capacity) {
+      hash_bits--;
+    }
+  }
+  return hash_bits;
+}
+
+// Stores the new capacity (relaxed atomic store). Nothing is evicted here.
+template <class Table>
+void ClockCacheShard<Table>::SetCapacity(size_t capacity) {
+ capacity_.store(capacity, std::memory_order_relaxed);
+ // next Insert will take care of any necessary evictions
+}
+
+// Stores the new strict-capacity-limit flag (relaxed atomic store). Nothing
+// is evicted here.
+template <class Table>
+void ClockCacheShard<Table>::SetStrictCapacityLimit(
+ bool strict_capacity_limit) {
+ strict_capacity_limit_.store(strict_capacity_limit,
+ std::memory_order_relaxed);
+ // next Insert will take care of any necessary evictions
+}
+
+// Inserts an entry into this shard's table. Keys whose size differs from
+// kCacheKeySize are rejected with Status::NotSupported; otherwise the entry
+// data is packaged up and handed to the table together with the shard's
+// current capacity and strict-limit settings, and the table's status is
+// returned.
+template <class Table>
+Status ClockCacheShard<Table>::Insert(const Slice& key,
+                                      const UniqueId64x2& hashed_key,
+                                      void* value, size_t charge,
+                                      Cache::DeleterFn deleter,
+                                      HandleImpl** handle,
+                                      Cache::Priority priority) {
+  const bool unsupported_key = key.size() != kCacheKeySize;
+  if (UNLIKELY(unsupported_key)) {
+    return Status::NotSupported("ClockCache only supports key size " +
+                                std::to_string(kCacheKeySize) + "B");
+  }
+
+  // Prototype of the entry data for the table to store.
+  ClockHandleBasicData entry_data;
+  entry_data.hashed_key = hashed_key;
+  entry_data.value = value;
+  entry_data.deleter = deleter;
+  entry_data.total_charge = charge;
+
+  const auto current_capacity = capacity_.load(std::memory_order_relaxed);
+  const auto strict_limit =
+      strict_capacity_limit_.load(std::memory_order_relaxed);
+  return table_.Insert(entry_data, handle, priority, current_capacity,
+                       strict_limit);
+}
+
+// Looks up hashed_key in this shard's table. Keys whose size differs from
+// kCacheKeySize yield nullptr without consulting the table.
+template <class Table>
+typename ClockCacheShard<Table>::HandleImpl* ClockCacheShard<Table>::Lookup(
+    const Slice& key, const UniqueId64x2& hashed_key) {
+  const bool unsupported_key = key.size() != kCacheKeySize;
+  return UNLIKELY(unsupported_key) ? nullptr : table_.Lookup(hashed_key);
+}
+
+// Takes an extra reference on h through the table. Returns false (and does
+// nothing) for a null handle, true otherwise.
+template <class Table>
+bool ClockCacheShard<Table>::Ref(HandleImpl* h) {
+  const bool have_handle = h != nullptr;
+  if (have_handle) {
+    table_.Ref(*h);
+  }
+  return have_handle;
+}
+
+// Releases a reference through the table and returns the table's result.
+// A null handle is ignored and reported as false.
+template <class Table>
+bool ClockCacheShard<Table>::Release(HandleImpl* handle, bool useful,
+                                     bool erase_if_last_ref) {
+  return handle != nullptr &&
+         table_.Release(handle, useful, erase_if_last_ref);
+}
+
+// Test helper: forwards to the table's TEST_RefN. h is dereferenced without a
+// null check.
+template <class Table>
+void ClockCacheShard<Table>::TEST_RefN(HandleImpl* h, size_t n) {
+ table_.TEST_RefN(*h, n);
+}
+
+// Test helper: forwards to the table's TEST_ReleaseN.
+template <class Table>
+void ClockCacheShard<Table>::TEST_ReleaseN(HandleImpl* h, size_t n) {
+ table_.TEST_ReleaseN(h, n);
+}
+
+// Two-argument overload: same as the three-argument Release with useful=true.
+template <class Table>
+bool ClockCacheShard<Table>::Release(HandleImpl* handle,
+ bool erase_if_last_ref) {
+ return Release(handle, /*useful=*/true, erase_if_last_ref);
+}
+
+// Erases hashed_key from this shard's table. Keys whose size differs from
+// kCacheKeySize are silently ignored.
+template <class Table>
+void ClockCacheShard<Table>::Erase(const Slice& key,
+                                   const UniqueId64x2& hashed_key) {
+  const bool unsupported_key = key.size() != kCacheKeySize;
+  if (!UNLIKELY(unsupported_key)) {
+    table_.Erase(hashed_key);
+  }
+}
+
+// Returns the usage reported by the table.
+template <class Table>
+size_t ClockCacheShard<Table>::GetUsage() const {
+ return table_.GetUsage();
+}
+
+// Returns the detached usage reported by the table.
+template <class Table>
+size_t ClockCacheShard<Table>::GetDetachedUsage() const {
+ return table_.GetDetachedUsage();
+}
+
+// Returns the shard's current capacity setting.
+template <class Table>
+size_t ClockCacheShard<Table>::GetCapacity() const {
+  // Explicit load with the default memory order, equivalent to the implicit
+  // atomic-to-value conversion.
+  return capacity_.load();
+}
+
+// Returns the total charge of entries that are currently referenced by
+// someone other than this scan, plus the table's detached usage. When the
+// metadata charge policy is kFullChargeCacheMetadata, each counted table
+// entry also contributes sizeof(HandleImpl).
+//
+// This walks the whole table on every call rather than maintaining a running
+// counter, trading a slow query for no extra bookkeeping on the hot path.
+template <class Table>
+size_t ClockCacheShard<Table>::GetPinnedUsage() const {
+  // Extra charge added per pinned entry for its handle, if metadata is
+  // charged.
+  const size_t handle_overhead =
+      metadata_charge_policy_ == kFullChargeCacheMetadata ? sizeof(HandleImpl)
+                                                          : 0;
+  size_t pinned_in_table = 0;
+  table_.ConstApplyToEntriesRange(
+      [&pinned_in_table, handle_overhead](const HandleImpl& h) {
+        const uint64_t refs =
+            GetRefcount(h.meta.load(std::memory_order_relaxed));
+        // ConstApplyToEntriesRange itself holds one reference while calling
+        // us, so only a count above one means someone else has it pinned.
+        assert(refs > 0);
+        if (refs <= 1) {
+          return;
+        }
+        pinned_in_table += h.GetTotalCharge();
+        pinned_in_table += handle_overhead;
+      },
+      0, table_.GetTableSize(), true);
+
+  return pinned_in_table + table_.GetDetachedUsage();
+}
+
+// Returns the occupancy reported by the table.
+template <class Table>
+size_t ClockCacheShard<Table>::GetOccupancyCount() const {
+ return table_.GetOccupancy();
+}
+
+// Returns the occupancy limit reported by the table.
+template <class Table>
+size_t ClockCacheShard<Table>::GetOccupancyLimit() const {
+ return table_.GetOccupancyLimit();
+}
+
+// Returns the table size (number of slots) reported by the table.
+template <class Table>
+size_t ClockCacheShard<Table>::GetTableAddressCount() const {
+ return table_.GetTableSize();
+}
+
+// Explicit instantiation of the shard template for HyperClockTable, so the
+// member definitions above are emitted in this translation unit.
+template class ClockCacheShard<HyperClockTable>;
+
+// Constructs the sharded cache and placement-constructs each shard with the
+// per-shard capacity, the strict-limit flag, the metadata charge policy and
+// table options carrying estimated_value_size.
+HyperClockCache::HyperClockCache(
+ size_t capacity, size_t estimated_value_size, int num_shard_bits,
+ bool strict_capacity_limit,
+ CacheMetadataChargePolicy metadata_charge_policy,
+ std::shared_ptr<MemoryAllocator> memory_allocator)
+ : ShardedCache(capacity, num_shard_bits, strict_capacity_limit,
+ std::move(memory_allocator)) {
+ // A zero estimated_value_size is only accepted when metadata is charged.
+ assert(estimated_value_size > 0 ||
+ metadata_charge_policy != kDontChargeCacheMetadata);
+ // TODO: should not need to go through two levels of pointer indirection to
+ // get to table entries
+ size_t per_shard = GetPerShardCapacity();
+ // The lambda captures its inputs by copy and constructs the shard in place
+ // at the address InitShards passes in.
+ InitShards([=](Shard* cs) {
+ HyperClockTable::Opts opts;
+ opts.estimated_value_size = estimated_value_size;
+ new (cs)
+ Shard(per_shard, strict_capacity_limit, metadata_charge_policy, opts);
+ });
+}
+
+// Returns the value pointer stored in the handle, which is reinterpreted as
+// a HandleImpl.
+void* HyperClockCache::Value(Handle* handle) {
+ return reinterpret_cast<const HandleImpl*>(handle)->value;
+}
+
+// Returns the total charge recorded in the handle, which is reinterpreted as
+// a HandleImpl.
+size_t HyperClockCache::GetCharge(Handle* handle) const {
+ return reinterpret_cast<const HandleImpl*>(handle)->GetTotalCharge();
+}
+
+// Returns the deleter function stored in the handle, which is reinterpreted
+// as a HandleImpl.
+Cache::DeleterFn HyperClockCache::GetDeleter(Handle* handle) const {
+  return reinterpret_cast<const HandleImpl*>(handle)->deleter;
+}
+
+namespace {
+
+// Evaluates a single cache shard on behalf of ReportProblems().
+//
+// A shard is only evaluated when it is operating "at limit": it holds at
+// least one entry, and either its table usage is at least 80% of capacity or
+// its occupancy is at least 95% of the occupancy limit. Other shards are
+// skipped.
+//
+// For an evaluated shard, the load factor the table would have if the shard
+// were filled to capacity with entries of its current average charge is
+// appended to predicted_load_factors, and min_recommendation is lowered to
+// the shard's average entry charge if that is smaller.
+void AddShardEvaluation(const HyperClockCache::Shard& shard,
+                        std::vector<double>& predicted_load_factors,
+                        size_t& min_recommendation) {
+  // Usage attributable to entries stored in the table itself.
+  const size_t table_usage = shard.GetUsage() - shard.GetDetachedUsage();
+  const size_t shard_capacity = shard.GetCapacity();
+  const double usage_ratio =
+      static_cast<double>(table_usage) / shard_capacity;
+
+  const size_t entry_count = shard.GetOccupancyCount();
+  const size_t entry_limit = shard.GetOccupancyLimit();
+  const double occ_ratio = static_cast<double>(entry_count) / entry_limit;
+
+  if (table_usage == 0 || entry_count == 0) {
+    // Empty shard: nothing to evaluate
+    return;
+  }
+  const bool below_limit = usage_ratio < 0.8 && occ_ratio < 0.95;
+  if (below_limit) {
+    // Not operating "at limit"
+    return;
+  }
+
+  // Occupancy ratio the shard would reach if filled to capacity, scaled by
+  // the maximum load factor to give the predicted load factor.
+  predicted_load_factors.push_back(occ_ratio / usage_ratio *
+                                   kStrictLoadFactor);
+
+  // Average charge per entry in this shard; keep the smallest seen so far.
+  const size_t average_entry_charge = table_usage / entry_count;
+  if (average_entry_charge < min_recommendation) {
+    min_recommendation = average_entry_charge;
+  }
+}
+
+}  // namespace
+
+// Logs a message to info_log when the predicted table load factor, averaged
+// over the shards evaluated by AddShardEvaluation(), is out of spec: either
+// too high (capacity is lost to the occupancy limit) or substantially too
+// low. Both messages include a recommended estimated_entry_charge. Logs
+// nothing if no shard was evaluated.
+void HyperClockCache::ReportProblems(
+ const std::shared_ptr<Logger>& info_log) const {
+ uint32_t shard_count = GetNumShards();
+ std::vector<double> predicted_load_factors;
+ size_t min_recommendation = SIZE_MAX;
+ const_cast<HyperClockCache*>(this)->ForEachShard(
+ [&](HyperClockCache::Shard* shard) {
+ AddShardEvaluation(*shard, predicted_load_factors, min_recommendation);
+ });
+
+ if (predicted_load_factors.empty()) {
+ // None operating "at limit" -> nothing to report
+ return;
+ }
+ // Sorted ascending so that back() is the highest predicted load factor.
+ std::sort(predicted_load_factors.begin(), predicted_load_factors.end());
+
+ // First, if the average load factor is within spec, we aren't going to
+ // complain about a few shards being out of spec.
+ // NOTE: this is only the average among cache shards operating "at limit,"
+ // which should be representative of what we care about. It is normal, even
+ // desirable, for a cache to operate "at limit" so this should not create
+ // selection bias. See AddShardEvaluation().
+ // TODO: Consider detecting cases where decreasing the number of shards
+ // would be good, e.g. serious imbalance among shards.
+ // NOTE(review): the sum covers only the evaluated ("at limit") shards but
+ // is divided by the total shard_count, so skipped shards pull the average
+ // down. Confirm whether predicted_load_factors.size() was intended.
+ double average_load_factor =
+ std::accumulate(predicted_load_factors.begin(),
+ predicted_load_factors.end(), 0.0) /
+ shard_count;
+
+ constexpr double kLowSpecLoadFactor = kLoadFactor / 2;
+ constexpr double kMidSpecLoadFactor = kLoadFactor / 1.414;
+ if (average_load_factor > kLoadFactor) {
+ // Out of spec => Consider reporting load factor too high
+ // Estimate effective overall capacity loss due to enforcing occupancy limit
+ double lost_portion = 0.0;
+ int over_count = 0;
+ for (double lf : predicted_load_factors) {
+ if (lf > kStrictLoadFactor) {
+ ++over_count;
+ lost_portion += (lf - kStrictLoadFactor) / lf / shard_count;
+ }
+ }
+ // >= 20% loss -> error
+ // >= 10% loss -> consistent warning
+ // >= 1% loss -> intermittent warning
+ InfoLogLevel level = InfoLogLevel::INFO_LEVEL;
+ bool report = true;
+ if (lost_portion > 0.2) {
+ level = InfoLogLevel::ERROR_LEVEL;
+ } else if (lost_portion > 0.1) {
+ level = InfoLogLevel::WARN_LEVEL;
+ } else if (lost_portion > 0.01) {
+ // Raised to a warning with probability equal to the loss percentage;
+ // otherwise the message is still logged, at info level.
+ int report_percent = static_cast<int>(lost_portion * 100.0);
+ if (Random::GetTLSInstance()->PercentTrue(report_percent)) {
+ level = InfoLogLevel::WARN_LEVEL;
+ }
+ } else {
+ // don't report
+ report = false;
+ }
+ if (report) {
+ ROCKS_LOG_AT_LEVEL(
+ info_log, level,
+ "HyperClockCache@%p unable to use estimated %.1f%% capacity because "
+ "of "
+ "full occupancy in %d/%u cache shards (estimated_entry_charge too "
+ "high). Recommend estimated_entry_charge=%zu",
+ this, lost_portion * 100.0, over_count, (unsigned)shard_count,
+ min_recommendation);
+ }
+ } else if (average_load_factor < kLowSpecLoadFactor) {
+ // Out of spec => Consider reporting load factor too low
+ // But cautiously because low is not as big of a problem.
+
+ // Only report if highest occupancy shard is also below
+ // spec and only if average is substantially out of spec
+ if (predicted_load_factors.back() < kLowSpecLoadFactor &&
+ average_load_factor < kLowSpecLoadFactor / 1.414) {
+ InfoLogLevel level = InfoLogLevel::INFO_LEVEL;
+ if (average_load_factor < kLowSpecLoadFactor / 2) {
+ level = InfoLogLevel::WARN_LEVEL;
+ }
+ ROCKS_LOG_AT_LEVEL(
+ info_log, level,
+ "HyperClockCache@%p table has low occupancy at full capacity. Higher "
+ "estimated_entry_charge (about %.1fx) would likely improve "
+ "performance. Recommend estimated_entry_charge=%zu",
+ this, kMidSpecLoadFactor / average_load_factor, min_recommendation);
+ }
+ }
+}
+
+} // namespace clock_cache
+
+// DEPRECATED (see public API)
+// Despite its name, this returns the cache built by NewLRUCache, configured
+// with a high-priority pool ratio of 0.5, no memory allocator, and a
+// low-priority pool ratio of 0.0.
+std::shared_ptr<Cache> NewClockCache(
+ size_t capacity, int num_shard_bits, bool strict_capacity_limit,
+ CacheMetadataChargePolicy metadata_charge_policy) {
+ return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit,
+ /* high_pri_pool_ratio */ 0.5, nullptr,
+ kDefaultToAdaptiveMutex, metadata_charge_policy,
+ /* low_pri_pool_ratio */ 0.0);
+}
+
+// Creates a HyperClockCache from these options. Returns nullptr when
+// num_shard_bits is 20 or more. A negative num_shard_bits selects a default
+// shard count derived from the capacity and a 32 MiB minimum shard size.
+std::shared_ptr<Cache> HyperClockCacheOptions::MakeSharedCache() const {
+  if (num_shard_bits >= 20) {
+    // The cache cannot be sharded into too many fine pieces.
+    return nullptr;
+  }
+
+  auto shard_bits = num_shard_bits;
+  if (shard_bits < 0) {
+    // A relatively large minimum shard size lowers the risk of large entries
+    // clustering in, or skewing, individual shards.
+    constexpr size_t kMinShardSize = 32U * 1024U * 1024U;
+    shard_bits = GetDefaultCacheShardBits(capacity, kMinShardSize);
+  }
+
+  return std::make_shared<clock_cache::HyperClockCache>(
+      capacity, estimated_entry_charge, shard_bits, strict_capacity_limit,
+      metadata_charge_policy, memory_allocator);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/clock_cache.h b/src/rocksdb/cache/clock_cache.h
new file mode 100644
index 000000000..ef1b0ccb7
--- /dev/null
+++ b/src/rocksdb/cache/clock_cache.h
@@ -0,0 +1,701 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <array>
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "cache/cache_key.h"
+#include "cache/sharded_cache.h"
+#include "port/lang.h"
+#include "port/malloc.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/secondary_cache.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace clock_cache {
+
+ // Forward declaration of friend class. NOTE(review): no `friend` declaration naming it appears in this header; confirm it is still needed.
+ class ClockCacheTest;
+
+// HyperClockCache is an alternative to LRUCache specifically tailored for
+// use as BlockBasedTableOptions::block_cache
+//
+// Benefits
+// --------
+// * Fully lock free (no waits or spins) for efficiency under high concurrency
+// * Optimized for hot path reads. For concurrency control, most Lookup() and
+// essentially all Release() are a single atomic add operation.
+// * Eviction on insertion is fully parallel and lock-free.
+// * Uses a generalized + aging variant of CLOCK eviction that might outperform
+// LRU in some cases. (For background, see
+// https://en.wikipedia.org/wiki/Page_replacement_algorithm)
+//
+// Costs
+// -----
+// * Hash table is not resizable (for lock-free efficiency) so capacity is not
+// dynamically changeable. Rely on an estimated average value (block) size for
+// space+time efficiency. (See estimated_entry_charge option details.)
+// * Insert usually does not (but might) overwrite a previous entry associated
+// with a cache key. This is OK for RocksDB uses of Cache.
+// * Only supports keys of exactly 16 bytes, which is what RocksDB uses for
+// block cache (not row cache or table cache).
+// * SecondaryCache is not supported.
+// * Cache priorities are less aggressively enforced. Unlike LRUCache, enough
+// transient LOW or BOTTOM priority items can evict HIGH priority entries that
+// are not referenced recently (or often) enough.
+// * If pinned entries leave little or nothing eligible for eviction,
+// performance can degrade substantially, because of clock eviction eating
+// CPU looking for evictable entries and because Release does not
+// pro-actively delete unreferenced entries when the cache is over-full.
+// Specifically, this makes this implementation more susceptible to the
+// following combination:
+// * num_shard_bits is high (e.g. 6)
+// * capacity small (e.g. some MBs)
+// * some large individual entries (e.g. non-partitioned filters)
+// where individual entries occupy a large portion of their shard capacity.
+// This should be mostly mitigated by the implementation picking a lower
+// number of cache shards than LRUCache for a given capacity (when
+// num_shard_bits is not overridden; see calls to GetDefaultCacheShardBits()).
+// * With strict_capacity_limit=false, respecting the capacity limit is not as
+// aggressive as LRUCache. The limit might be transiently exceeded by a very
+// small number of entries even when not strictly necessary, and slower to
+// recover after pinning forces limit to be substantially exceeded. (Even with
+// strict_capacity_limit=true, RocksDB will nevertheless transiently allocate
+// memory before discovering it is over the block cache capacity, so this
+// should not be a detectable regression in respecting memory limits, except
+// on exceptionally small caches.)
+// * In some cases, erased or duplicated entries might not be freed
+// immediately. They will eventually be freed by eviction from further Inserts.
+// * Internal metadata can overflow if the number of simultaneous references
+// to a cache handle reaches many millions.
+//
+// High-level eviction algorithm
+// -----------------------------
+// A score (or "countdown") is maintained for each entry, initially determined
+// by priority. The score is incremented on each Lookup, up to a max of 3,
+// though is easily returned to previous state if useful=false with Release.
+// During CLOCK-style eviction iteration, entries with score > 0 are
+// decremented if currently unreferenced and entries with score == 0 are
+// evicted if currently unreferenced. Note that scoring might not be perfect
+// because entries can be referenced transiently within the cache even when
+// there are no outside references to the entry.
+//
+// Cache sharding like LRUCache is used to reduce contention on usage+eviction
+// state, though here the performance improvement from more shards is small,
+// and (as noted above) potentially detrimental if shard capacity is too close
+// to largest entry size. Here cache sharding mostly only affects cache update
+// (Insert / Erase) performance, not read performance.
+//
+// Read efficiency (hot path)
+// --------------------------
+// Mostly to minimize the cost of accessing metadata blocks with
+// cache_index_and_filter_blocks=true, we focus on optimizing Lookup and
+// Release. In terms of concurrency, at a minimum, these operations have
+// to do reference counting (and Lookup has to compare full keys in a safe
+// way). Can we fold in all the other metadata tracking *for free* with
+// Lookup and Release doing a simple atomic fetch_add/fetch_sub? (Assume
+// for the moment that Lookup succeeds on the first probe.)
+//
+// We have a clever way of encoding an entry's reference count and countdown
+// clock so that Lookup and Release are each usually a single atomic addition.
+// In a single metadata word we have both an "acquire" count, incremented by
+// Lookup, and a "release" count, incremented by Release. If useful=false,
+// Release can instead decrement the acquire count. Thus the current ref
+// count is (acquires - releases), and the countdown clock is min(3, acquires).
+// Note that only unreferenced entries (acquires == releases) are eligible
+// for CLOCK manipulation and eviction. We tolerate use of more expensive
+// compare_exchange operations for cache writes (insertions and erasures).
+//
+// In a cache receiving many reads and little or no writes, it is possible
+// for the acquire and release counters to overflow. Assuming the *current*
+ // refcount never reaches many millions, we only have to correct for
+// overflow in both counters in Release, not in Lookup. The overflow check
+// should be only 1-2 CPU cycles per Release because it is a predictable
+// branch on a simple condition on data already in registers.
+//
+// Slot states
+// -----------
+// We encode a state indicator into the same metadata word with the
+// acquire and release counters. This allows bigger state transitions to
+// be atomic. States:
+//
+// * Empty - slot is not in use and unowned. All other metadata and data is
+// in an undefined state.
+// * Construction - slot is exclusively owned by one thread, the thread
+// successfully entering this state, for populating or freeing data.
+// * Shareable (group) - slot holds an entry with counted references for
+// pinning and reading, including
+// * Visible - slot holds an entry that can be returned by Lookup
+// * Invisible - slot holds an entry that is not visible to Lookup
+// (erased by user) but can be read by existing references, and ref count
+// changed by Ref and Release.
+//
+// A special case is "detached" entries, which are heap-allocated handles
+// not in the table. They are always Invisible and freed on zero refs.
+//
+// State transitions:
+// Empty -> Construction (in Insert): The encoding of state enables Insert to
+// perform an optimistic atomic bitwise-or to take ownership if a slot is
+// empty, or otherwise make no state change.
+//
+// Construction -> Visible (in Insert): This can be a simple assignment to the
+// metadata word because the current thread has exclusive ownership and other
+// metadata is meaningless.
+//
+// Visible -> Invisible (in Erase): This can be a bitwise-and while holding
+// a shared reference, which is safe because the change is idempotent (in case
+// of parallel Erase). By the way, we never go Invisible->Visible.
+//
+// Shareable -> Construction (in Evict part of Insert, in Erase, and in
+ // Release if Invisible): This is for starting to free/delete an
+// unreferenced entry. We have to use compare_exchange to ensure we only make
+// this transition when there are zero refs.
+//
+// Construction -> Empty (in same places): This is for completing free/delete
+// of an entry. A "release" atomic store suffices, as we have exclusive
+// ownership of the slot but have to ensure none of the data member reads are
+// re-ordered after committing the state transition.
+//
+// Insert
+// ------
+// If Insert were to guarantee replacing an existing entry for a key, there
+// would be complications for concurrency and efficiency. First, consider how
+// many probes to get to an entry. To ensure Lookup never waits and
+// availability of a key is uninterrupted, we would need to use a different
+// slot for a new entry for the same key. This means it is most likely in a
+// later probing position than the old version, which should soon be removed.
+// (Also, an entry is too big to replace atomically, even if no current refs.)
+//
+// However, overwrite capability is not really needed by RocksDB. Also, we
+// know from our "redundant" stats that overwrites are very rare for the block
+// cache, so we should not spend much to make them effective.
+//
+// So instead we Insert as soon as we find an empty slot in the probing
+// sequence without seeing an existing (visible) entry for the same key. This
+// way we only insert if we can improve the probing performance, and we don't
+// need to probe beyond our insert position, assuming we are willing to let
+// the previous entry for the same key die of old age (eventual eviction from
+// not being used). We can reach a similar state with concurrent insertions,
+// where one will pass over the other while it is "under construction."
+// This temporary duplication is acceptable for RocksDB block cache because
+// we know redundant insertion is rare.
+//
+// Another problem to solve is what to return to the caller when we find an
+// existing entry whose probing position we cannot improve on, or when the
+// table occupancy limit has been reached. If strict_capacity_limit=false,
+// we must never fail Insert, and if a Handle* is provided, we have to return
+// a usable Cache handle on success. The solution to this (typically rare)
+// problem is "detached" handles, which are usable by the caller but not
+// actually available for Lookup in the Cache. Detached handles are allocated
+// independently on the heap and specially marked so that they are freed on
+// the heap when their last reference is released.
+//
+// Usage on capacity
+// -----------------
+// Insert takes different approaches to usage tracking depending on
+// strict_capacity_limit setting. If true, we enforce a kind of strong
+// consistency where compare-exchange is used to ensure the usage number never
+// exceeds its limit, and provide threads with an authoritative signal on how
+// much "usage" they have taken ownership of. With strict_capacity_limit=false,
+// we use a kind of "eventual consistency" where all threads Inserting to the
+// same cache shard might race on reserving the same space, but the
+// over-commitment will be worked out in later insertions. It is kind of a
+// dance because we don't want threads racing each other too much on paying
+// down the over-commitment (with eviction) either.
+//
+// Eviction
+// --------
+// A key part of Insert is evicting some entries currently unreferenced to
+// make room for new entries. The high-level eviction algorithm is described
+// above, but the details are also interesting. A key part is parallelizing
+// eviction with a single CLOCK pointer. This works by each thread working on
+// eviction pre-emptively incrementing the CLOCK pointer, and then CLOCK-
+// updating or evicting the incremented-over slot(s). To reduce contention at
+// the cost of possibly evicting too much, each thread increments the clock
+// pointer by 4, so commits to updating at least 4 slots per batch. As
+// described above, a CLOCK update will decrement the "countdown" of
+// unreferenced entries, or evict unreferenced entries with zero countdown.
+// Referenced entries are not updated, because we (presumably) don't want
+// long-referenced entries to age while referenced. Note however that we
+// cannot distinguish transiently referenced entries from cache user
+// references, so some CLOCK updates might be somewhat arbitrarily skipped.
+// This is OK as long as it is rare enough that eviction order is still
+// pretty good.
+//
+// There is no synchronization on the completion of the CLOCK updates, so it
+// is theoretically possible for another thread to cycle back around and have
+// two threads racing on CLOCK updates to the same slot. Thus, we cannot rely
+// on any implied exclusivity to make the updates or eviction more efficient.
+// These updates use an opportunistic compare-exchange (no loop), where a
+// racing thread might cause the update to be skipped without retry, but in
+// such case the update is likely not needed because the most likely update
+// to an entry is that it has become referenced. (TODO: test efficiency of
+// avoiding compare-exchange loop)
+//
+// Release
+// -------
+// In the common case, Release is a simple atomic increment of the release
+// counter. There is a simple overflow check that only does another atomic
+// update in extremely rare cases, so costs almost nothing.
+//
+// If the Release specifies "not useful", we can instead decrement the
+// acquire counter, which returns to the same CLOCK state as before Lookup
+// or Ref.
+//
+// Adding a check for over-full cache on every release to zero-refs would
+// likely be somewhat expensive, increasing read contention on cache shard
+// metadata. Instead we are less aggressive about deleting entries right
+// away in those cases.
+//
+// However Release tries to immediately delete entries reaching zero refs
+// if (a) erase_if_last_ref is set by the caller, or (b) the entry is already
+// marked invisible. Both of these are checks on values already in CPU
+// registers so do not increase cross-CPU contention when not applicable.
+// When applicable, they use a compare-exchange loop to take exclusive
+// ownership of the slot for freeing the entry. These are rare cases
+// that should not usually affect performance.
+//
+// Erase
+// -----
+// Searches for an entry like Lookup but moves it to Invisible state if found.
+// This state transition is with bit operations so is idempotent and safely
+// done while only holding a shared "read" reference. Like Release, it makes
+// a best effort to immediately release an Invisible entry that reaches zero
+// refs, but there are some corner cases where it will only be freed by the
+// clock eviction process.
+
+// ----------------------------------------------------------------------- //
+
+ // The load factor p (kLoadFactor below) is a real number in (0, 1) such that at all
+ // times at most a fraction p of all slots, without counting tombstones,
+ // are occupied by elements. This means that the probability that a random
+ // probe hits an occupied slot is at most p, and thus at most 1/p probes
+ // are required on average. For example, p = 70% implies that between 1 and 2
+ // probes are needed on average (bear in mind that this reasoning doesn't
+ // consider the effects of clustering over time, which should be negligible
+ // with double hashing). NOTE(review): textbook open addressing gives about 1/(1-p) probes for a miss; confirm the 1/p figure.
+ // Because the size of the hash table is always rounded up to the next
+ // power of 2, p is really an upper bound on the actual load factor---the
+ // actual load factor is anywhere between p/2 and p. This is a bit wasteful,
+ // but bear in mind that slots only hold metadata, not actual values.
+ // Since space cost is dominated by the values (the LSM blocks),
+ // overprovisioning the table with metadata only increases the total cache space
+ // usage by a tiny fraction.
+ constexpr double kLoadFactor = 0.7; // the load factor p described above
+
+ // The user can exceed kLoadFactor if the sizes of the inserted values don't
+ // match estimated_value_size (see HyperClockTable::Opts), or in some rare cases with
+ // strict_capacity_limit == false. To avoid degenerate performance, we set a
+ // strict upper bound on the load factor.
+ constexpr double kStrictLoadFactor = 0.84; // strict upper bound on load factor
+
+ struct ClockHandleBasicData { // entry payload fields; passed to HyperClockTable::Insert as `proto`
+ void* value = nullptr; // opaque pointer to the cached value
+ Cache::DeleterFn deleter = nullptr; // see FreeData() below
+ // A lossless, reversible hash of the fixed-size (16 byte) cache key. This
+ // eliminates the need to store a hash separately.
+ UniqueId64x2 hashed_key = kNullUniqueId64x2;
+ size_t total_charge = 0; // returned as-is by GetTotalCharge()
+
+ // For a packed total_charge_and_flags word. NOTE(review): no such member is declared in this struct, and HandleImpl keeps a separate `detached` bool.
+ // "Detached" means the handle is allocated separately from hash table.
+ static constexpr uint64_t kFlagDetached = uint64_t{1} << 63;
+ // Mask covering every bit below kFlagDetached, to extract just the total charge
+ static constexpr uint64_t kTotalChargeMask = kFlagDetached - 1;
+
+ inline size_t GetTotalCharge() const { return total_charge; }
+
+ // Calls deleter (if non-null) on cache key and value
+ void FreeData() const;
+
+ // Required by concept HandleImpl. Returns the stored hashed key by reference.
+ const UniqueId64x2& GetHash() const { return hashed_key; }
+ };
+
+ struct ClockHandle : public ClockHandleBasicData {
+ // Constants for handling the atomic `meta` word, which tracks most of the
+ // state of the handle. The meta word looks like this:
+ // (low bits on the left, high bits on the right)
+ // -----------------------------------------------------------------------
+ // | acquire counter (bits 0-29) | release counter (bits 30-59) | state marker (bits 60+) |
+ // -----------------------------------------------------------------------
+
+ // For reading or updating counters in meta word. Each counter is kCounterNumBits wide.
+ static constexpr uint8_t kCounterNumBits = 30;
+ static constexpr uint64_t kCounterMask = (uint64_t{1} << kCounterNumBits) - 1;
+
+ static constexpr uint8_t kAcquireCounterShift = 0;
+ static constexpr uint64_t kAcquireIncrement = uint64_t{1}
+ << kAcquireCounterShift; // adds 1 to the acquire counter
+ static constexpr uint8_t kReleaseCounterShift = kCounterNumBits;
+ static constexpr uint64_t kReleaseIncrement = uint64_t{1}
+ << kReleaseCounterShift; // adds 1 to the release counter
+
+ // For reading or updating the state marker in meta word (sits above both counters)
+ static constexpr uint8_t kStateShift = 2U * kCounterNumBits;
+
+ // Bits contribution to state marker.
+ // Occupied means any state other than empty
+ static constexpr uint8_t kStateOccupiedBit = 0b100;
+ // Shareable means the entry is reference counted (visible or invisible)
+ // (only set if also occupied)
+ static constexpr uint8_t kStateShareableBit = 0b010;
+ // Visible is only set if also shareable
+ static constexpr uint8_t kStateVisibleBit = 0b001;
+
+ // Complete state markers (not shifted into full word)
+ static constexpr uint8_t kStateEmpty = 0b000;
+ static constexpr uint8_t kStateConstruction = kStateOccupiedBit;
+ static constexpr uint8_t kStateInvisible =
+ kStateOccupiedBit | kStateShareableBit;
+ static constexpr uint8_t kStateVisible =
+ kStateOccupiedBit | kStateShareableBit | kStateVisibleBit;
+
+ // Constants for initializing the countdown clock. (Countdown clock is only
+ // in effect with zero refs, acquire counter == release counter, and in that
+ // case the countdown clock == both of those counters.)
+ static constexpr uint8_t kHighCountdown = 3;
+ static constexpr uint8_t kLowCountdown = 2;
+ static constexpr uint8_t kBottomCountdown = 1;
+ // During clock update, treat any countdown clock value greater than this
+ // value the same as this value.
+ static constexpr uint8_t kMaxCountdown = kHighCountdown;
+ // TODO: make these countdown values tuning parameters for eviction?
+
+ // See above: packs the acquire counter, release counter and state marker.
+ std::atomic<uint64_t> meta{};
+
+ // Anticipating use for SecondaryCache support
+ void* reserved_for_future_use = nullptr;
+ }; // struct ClockHandle
+
+ class HyperClockTable {
+ public:
+ // Target size to be exactly a common cache line size (see static_assert in
+ // clock_cache.cc)
+ struct ALIGN_AS(64U) HandleImpl : public ClockHandle {
+ // The number of elements that hash to this slot or a lower one, but wind
+ // up in this slot or a higher one.
+ std::atomic<uint32_t> displacements{};
+
+ // Whether this is a "detached" handle that is independently allocated
+ // with `new` (so must be deleted with `delete`).
+ // TODO: ideally this would be packed into some other data field, such
+ // as upper bits of total_charge, but that incurs a measurable performance
+ // regression.
+ bool detached = false;
+
+ inline bool IsDetached() const { return detached; }
+
+ inline void SetDetached() { detached = true; }
+ }; // struct HandleImpl
+
+ struct Opts {
+ size_t estimated_value_size; // HyperClockCache's constructor takes a parameter of the same name
+ };
+
+ HyperClockTable(size_t capacity, bool strict_capacity_limit,
+ CacheMetadataChargePolicy metadata_charge_policy,
+ const Opts& opts);
+ ~HyperClockTable();
+
+ Status Insert(const ClockHandleBasicData& proto, HandleImpl** handle,
+ Cache::Priority priority, size_t capacity,
+ bool strict_capacity_limit);
+
+ HandleImpl* Lookup(const UniqueId64x2& hashed_key);
+
+ bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref);
+
+ void Ref(HandleImpl& handle);
+
+ void Erase(const UniqueId64x2& hashed_key);
+
+ void ConstApplyToEntriesRange(std::function<void(const HandleImpl&)> func,
+ size_t index_begin, size_t index_end,
+ bool apply_if_will_be_deleted) const;
+
+ void EraseUnRefEntries();
+
+ size_t GetTableSize() const { return size_t{1} << length_bits_; }
+
+ int GetLengthBits() const { return length_bits_; }
+
+ size_t GetOccupancy() const {
+ return occupancy_.load(std::memory_order_relaxed);
+ }
+
+ size_t GetOccupancyLimit() const { return occupancy_limit_; }
+
+ size_t GetUsage() const { return usage_.load(std::memory_order_relaxed); }
+
+ size_t GetDetachedUsage() const {
+ return detached_usage_.load(std::memory_order_relaxed);
+ }
+
+ // Acquire/release N references
+ void TEST_RefN(HandleImpl& handle, size_t n);
+ void TEST_ReleaseN(HandleImpl* handle, size_t n);
+
+ private: // functions
+ // Returns x mod 2^{length_bits_}, by masking x with length_bits_mask_.
+ inline size_t ModTableSize(uint64_t x) {
+ return static_cast<size_t>(x) & length_bits_mask_;
+ }
+
+ // Runs the clock eviction algorithm trying to reclaim at least
+ // requested_charge. Reports how much is evicted (presumably through the freed_charge and freed_count out-parameters, as the function returns void), which could be less
+ // if it appears impossible to evict the requested amount without blocking.
+ inline void Evict(size_t requested_charge, size_t* freed_charge,
+ size_t* freed_count);
+
+ // Returns the first slot in the probe sequence, starting from the given
+ // probe number, with a handle e such that match(e) is true. At every
+ // step, the function first tests whether match(e) holds. If this is false,
+ // it evaluates stop(e) to decide whether the search should be aborted,
+ // and in the affirmative returns a failure value (NOTE(review): presumably nullptr given the pointer return type; confirm in clock_cache.cc). For every handle e probed except
+ // the last one, the function runs update(e).
+ // The probe parameter is modified as follows. We say a probe to a handle
+ // e is aborting if match(e) is false and stop(e) is true. Then the final
+ // value of probe is one more than the last non-aborting probe during the
+ // call. This is so that the variable can be used to keep track of
+ // progress across consecutive calls to FindSlot.
+ inline HandleImpl* FindSlot(const UniqueId64x2& hashed_key,
+ std::function<bool(HandleImpl*)> match,
+ std::function<bool(HandleImpl*)> stop,
+ std::function<void(HandleImpl*)> update,
+ size_t& probe);
+
+ // Re-decrement all displacements in probe path starting from beginning
+ // until (not including) the given handle
+ inline void Rollback(const UniqueId64x2& hashed_key, const HandleImpl* h);
+
+ // Subtracts `total_charge` from `usage_` and 1 from `occupancy_`.
+ // Ideally this comes after releasing the entry itself so that we
+ // actually have the available occupancy/usage that is claimed.
+ // However, that means total_charge has to be saved from the handle
+ // before releasing it so that it can be provided to this function.
+ inline void ReclaimEntryUsage(size_t total_charge);
+
+ // Helper for updating `usage_` for new entry with given `total_charge`
+ // and evicting if needed under strict_capacity_limit=true rules. This
+ // means the operation might fail with Status::MemoryLimit. If
+ // `need_evict_for_occupancy`, then eviction of at least one entry is
+ // required, and the operation should fail if not possible.
+ // NOTE: Otherwise, occupancy_ is not managed in this function
+ inline Status ChargeUsageMaybeEvictStrict(size_t total_charge,
+ size_t capacity,
+ bool need_evict_for_occupancy);
+
+ // Helper for updating `usage_` for new entry with given `total_charge`
+ // and evicting if needed under strict_capacity_limit=false rules. This
+ // means that updating `usage_` always succeeds even if forced to exceed
+ // capacity. If `need_evict_for_occupancy`, then eviction of at least one
+ // entry is required, and the operation should return false if such eviction
+ // is not possible. `usage_` is not updated in that case. Otherwise, returns
+ // true, indicating success.
+ // NOTE: occupancy_ is not managed in this function
+ inline bool ChargeUsageMaybeEvictNonStrict(size_t total_charge,
+ size_t capacity,
+ bool need_evict_for_occupancy);
+
+ // Creates a "detached" handle for returning from an Insert operation that
+ // cannot be completed by actually inserting into the table.
+ // Updates `detached_usage_` but not `usage_` nor `occupancy_`.
+ inline HandleImpl* DetachedInsert(const ClockHandleBasicData& proto);
+
+ // Returns the number of bits used to hash an element in the hash
+ // table.
+ static int CalcHashBits(size_t capacity, size_t estimated_value_size,
+ CacheMetadataChargePolicy metadata_charge_policy);
+
+ private: // data
+ // Number of hash bits used for table index.
+ // The size of the table is 1 << length_bits_.
+ const int length_bits_;
+
+ // For faster computation of ModTableSize.
+ const size_t length_bits_mask_;
+
+ // Maximum number of elements the user can store in the table.
+ const size_t occupancy_limit_;
+
+ // Array of slots comprising the hash table.
+ const std::unique_ptr<HandleImpl[]> array_;
+
+ // We partition the following members into different cache lines
+ // to avoid false sharing among Lookup, Release, Erase and Insert
+ // operations in ClockCacheShard.
+
+ ALIGN_AS(CACHE_LINE_SIZE)
+ // Clock algorithm sweep pointer.
+ std::atomic<uint64_t> clock_pointer_{};
+
+ ALIGN_AS(CACHE_LINE_SIZE)
+ // Number of elements in the table.
+ std::atomic<size_t> occupancy_{};
+
+ // Memory usage by entries tracked by the cache (including detached)
+ std::atomic<size_t> usage_{};
+
+ // Part of usage by detached entries (not in table)
+ std::atomic<size_t> detached_usage_{};
+ }; // class HyperClockTable
+
+ // A single shard of sharded cache. Holds one `Table` plus the shard's capacity and strict-limit settings.
+ template <class Table>
+ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
+ public:
+ ClockCacheShard(size_t capacity, bool strict_capacity_limit,
+ CacheMetadataChargePolicy metadata_charge_policy,
+ const typename Table::Opts& opts);
+
+ // For CacheShard concept
+ using HandleImpl = typename Table::HandleImpl;
+ // Hash is lossless hash of 128-bit key
+ using HashVal = UniqueId64x2;
+ using HashCref = const HashVal&;
+ static inline uint32_t HashPieceForSharding(HashCref hash) {
+ return Upper32of64(hash[0]); // sharding uses the upper 32 bits of hash[0]
+ }
+ static inline HashVal ComputeHash(const Slice& key) {
+ assert(key.size() == kCacheKeySize); // only keys of exactly kCacheKeySize bytes are accepted
+ HashVal in;
+ HashVal out;
+ // NOTE: endian dependence
+ // TODO: use GetUnaligned?
+ std::memcpy(&in, key.data(), kCacheKeySize);
+ BijectiveHash2x64(in[1], in[0], &out[1], &out[0]);
+ return out;
+ }
+
+ // For reconstructing key from hashed_key. Requires the caller to provide
+ // backing storage for the Slice in `unhashed`; the returned Slice points into `*unhashed`.
+ static inline Slice ReverseHash(const UniqueId64x2& hashed,
+ UniqueId64x2* unhashed) {
+ BijectiveUnhash2x64(hashed[1], hashed[0], &(*unhashed)[1], &(*unhashed)[0]);
+ // NOTE: endian dependence
+ return Slice(reinterpret_cast<const char*>(unhashed), kCacheKeySize);
+ }
+
+ // Although capacity is dynamically changeable, the number of table slots is
+ // not, so growing capacity substantially could lead to hitting occupancy
+ // limit.
+ void SetCapacity(size_t capacity);
+
+ void SetStrictCapacityLimit(bool strict_capacity_limit);
+
+ Status Insert(const Slice& key, const UniqueId64x2& hashed_key, void* value,
+ size_t charge, Cache::DeleterFn deleter, HandleImpl** handle,
+ Cache::Priority priority);
+
+ HandleImpl* Lookup(const Slice& key, const UniqueId64x2& hashed_key);
+
+ bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref);
+
+ bool Release(HandleImpl* handle, bool erase_if_last_ref = false);
+
+ bool Ref(HandleImpl* handle);
+
+ void Erase(const Slice& key, const UniqueId64x2& hashed_key);
+
+ size_t GetCapacity() const;
+
+ size_t GetUsage() const;
+
+ size_t GetDetachedUsage() const;
+
+ size_t GetPinnedUsage() const;
+
+ size_t GetOccupancyCount() const;
+
+ size_t GetOccupancyLimit() const;
+
+ size_t GetTableAddressCount() const;
+
+ void ApplyToSomeEntries(
+ const std::function<void(const Slice& key, void* value, size_t charge,
+ DeleterFn deleter)>& callback,
+ size_t average_entries_per_lock, size_t* state);
+
+ void EraseUnRefEntries();
+
+ std::string GetPrintableOptions() const { return std::string{}; }
+
+ // SecondaryCache not yet supported: this overload uses only helper->del_cb and forwards to the Insert above.
+ Status Insert(const Slice& key, const UniqueId64x2& hashed_key, void* value,
+ const Cache::CacheItemHelper* helper, size_t charge,
+ HandleImpl** handle, Cache::Priority priority) {
+ return Insert(key, hashed_key, value, charge, helper->del_cb, handle,
+ priority);
+ }
+
+ HandleImpl* Lookup(const Slice& key, const UniqueId64x2& hashed_key,
+ const Cache::CacheItemHelper* /*helper*/,
+ const Cache::CreateCallback& /*create_cb*/,
+ Cache::Priority /*priority*/, bool /*wait*/,
+ Statistics* /*stats*/) {
+ return Lookup(key, hashed_key); // the extra arguments are ignored
+ }
+
+ bool IsReady(HandleImpl* /*handle*/) { return true; } // always true in this implementation
+
+ void Wait(HandleImpl* /*handle*/) {} // no-op in this implementation
+
+ // Acquire/release N references
+ void TEST_RefN(HandleImpl* handle, size_t n);
+ void TEST_ReleaseN(HandleImpl* handle, size_t n);
+
+ private: // data
+ Table table_;
+
+ // Maximum total charge of all elements stored in the table.
+ std::atomic<size_t> capacity_;
+
+ // Whether to reject insertion if cache reaches its full capacity.
+ std::atomic<bool> strict_capacity_limit_;
+ }; // class ClockCacheShard
+
+ class HyperClockCache
+ #ifdef NDEBUG
+ final // only in NDEBUG builds (presumably so debug/test builds can derive from it; confirm)
+ #endif
+ : public ShardedCache<ClockCacheShard<HyperClockTable>> {
+ public:
+ using Shard = ClockCacheShard<HyperClockTable>;
+
+ HyperClockCache(size_t capacity, size_t estimated_value_size,
+ int num_shard_bits, bool strict_capacity_limit,
+ CacheMetadataChargePolicy metadata_charge_policy,
+ std::shared_ptr<MemoryAllocator> memory_allocator); // MakeSharedCache passes estimated_entry_charge as estimated_value_size
+
+ const char* Name() const override { return "HyperClockCache"; }
+
+ void* Value(Handle* handle) override;
+
+ size_t GetCharge(Handle* handle) const override;
+
+ DeleterFn GetDeleter(Handle* handle) const override;
+
+ void ReportProblems(
+ const std::shared_ptr<Logger>& /*info_log*/) const override; // clock_cache.cc logs a low-occupancy notice to this logger
+ }; // class HyperClockCache
+
+} // namespace clock_cache
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/compressed_secondary_cache.cc b/src/rocksdb/cache/compressed_secondary_cache.cc
new file mode 100644
index 000000000..7d1bdc789
--- /dev/null
+++ b/src/rocksdb/cache/compressed_secondary_cache.cc
@@ -0,0 +1,325 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "cache/compressed_secondary_cache.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+
+#include "memory/memory_allocator.h"
+#include "monitoring/perf_context_imp.h"
+#include "util/compression.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+CompressedSecondaryCache::CompressedSecondaryCache(
+ size_t capacity, int num_shard_bits, bool strict_capacity_limit,
+ double high_pri_pool_ratio, double low_pri_pool_ratio,
+ std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex,
+ CacheMetadataChargePolicy metadata_charge_policy,
+ CompressionType compression_type, uint32_t compress_format_version,
+ bool enable_custom_split_merge)
+ : cache_options_(capacity, num_shard_bits, strict_capacity_limit,
+ high_pri_pool_ratio, low_pri_pool_ratio, memory_allocator,
+ use_adaptive_mutex, metadata_charge_policy,
+ compression_type, compress_format_version,
+ enable_custom_split_merge) {  // every argument is recorded in the options
+ cache_ =
+ NewLRUCache(capacity, num_shard_bits, strict_capacity_limit,
+ high_pri_pool_ratio, memory_allocator, use_adaptive_mutex,
+ metadata_charge_policy, low_pri_pool_ratio);  // low-pri ratio goes last
+}
+
+CompressedSecondaryCache::~CompressedSecondaryCache() { cache_.reset(); }  // drop our reference to the wrapped cache
+
+std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup(
+ const Slice& key, const Cache::CreateCallback& create_cb, bool /*wait*/,
+ bool advise_erase, bool& is_in_sec_cache) {  // nullptr on miss or failure
+ std::unique_ptr<SecondaryCacheResultHandle> handle;
+ is_in_sec_cache = false;  // becomes true only when the entry is kept below
+ Cache::Handle* lru_handle = cache_->Lookup(key);
+ if (lru_handle == nullptr) {
+ return nullptr;
+ }
+ // A null value is what the dummy entries carry; report those as a miss.
+ void* handle_value = cache_->Value(lru_handle);
+ if (handle_value == nullptr) {
+ cache_->Release(lru_handle, /*erase_if_last_ref=*/false);
+ return nullptr;
+ }
+ // Make `ptr` point at the stored bytes, merging the chunk list if needed.
+ CacheAllocationPtr* ptr{nullptr};
+ CacheAllocationPtr merged_value;
+ size_t handle_value_charge{0};
+ if (cache_options_.enable_custom_split_merge) {
+ CacheValueChunk* value_chunk_ptr =
+ reinterpret_cast<CacheValueChunk*>(handle_value);
+ merged_value = MergeChunksIntoValue(value_chunk_ptr, handle_value_charge);
+ ptr = &merged_value;
+ } else {
+ ptr = reinterpret_cast<CacheAllocationPtr*>(handle_value);
+ handle_value_charge = cache_->GetCharge(lru_handle);
+ }
+ // Rebuild the caller's object via create_cb, uncompressing first if set.
+ Status s;
+ void* value{nullptr};
+ size_t charge{0};
+ if (cache_options_.compression_type == kNoCompression) {
+ s = create_cb(ptr->get(), handle_value_charge, &value, &charge);
+ } else {
+ UncompressionContext uncompression_context(cache_options_.compression_type);
+ UncompressionInfo uncompression_info(uncompression_context,
+ UncompressionDict::GetEmptyDict(),
+ cache_options_.compression_type);
+
+ size_t uncompressed_size{0};
+ CacheAllocationPtr uncompressed = UncompressData(
+ uncompression_info, (char*)ptr->get(), handle_value_charge,
+ &uncompressed_size, cache_options_.compress_format_version,
+ cache_options_.memory_allocator.get());
+
+ if (!uncompressed) {
+ cache_->Release(lru_handle, /*erase_if_last_ref=*/true);
+ return nullptr;
+ }
+ s = create_cb(uncompressed.get(), uncompressed_size, &value, &charge);
+ }
+
+ if (!s.ok()) {
+ cache_->Release(lru_handle, /*erase_if_last_ref=*/true);
+ return nullptr;
+ }
+
+ if (advise_erase) {
+ cache_->Release(lru_handle, /*erase_if_last_ref=*/true);
+ // Insert a dummy handle.
+ cache_
+ ->Insert(key, /*value=*/nullptr, /*charge=*/0,
+ GetDeletionCallback(cache_options_.enable_custom_split_merge))
+ .PermitUncheckedError();
+ } else {
+ is_in_sec_cache = true;  // released without asking for erasure
+ cache_->Release(lru_handle, /*erase_if_last_ref=*/false);
+ }
+ handle.reset(new CompressedSecondaryCacheResultHandle(value, charge));
+ return handle;
+}
+
+Status CompressedSecondaryCache::Insert(const Slice& key, void* value,
+ const Cache::CacheItemHelper* helper) {
+ if (value == nullptr) {
+ return Status::InvalidArgument();
+ }
+ // The value is stored only if `key` is already present in cache_.
+ Cache::Handle* lru_handle = cache_->Lookup(key);
+ Cache::DeleterFn del_cb =
+ GetDeletionCallback(cache_options_.enable_custom_split_merge);
+ if (lru_handle == nullptr) {
+ PERF_COUNTER_ADD(compressed_sec_cache_insert_dummy_count, 1);
+ // Insert a dummy handle if the handle is evicted for the first time.
+ return cache_->Insert(key, /*value=*/nullptr, /*charge=*/0, del_cb);
+ } else {
+ cache_->Release(lru_handle, /*erase_if_last_ref=*/false);
+ }
+ // Serialize the object into a newly allocated buffer through the helper.
+ size_t size = (*helper->size_cb)(value);
+ CacheAllocationPtr ptr =
+ AllocateBlock(size, cache_options_.memory_allocator.get());
+
+ Status s = (*helper->saveto_cb)(value, 0, size, ptr.get());
+ if (!s.ok()) {
+ return s;
+ }
+ Slice val(ptr.get(), size);
+ // When compression is on, `val` and `size` are switched to the compressed bytes.
+ std::string compressed_val;
+ if (cache_options_.compression_type != kNoCompression) {
+ PERF_COUNTER_ADD(compressed_sec_cache_uncompressed_bytes, size);
+ CompressionOptions compression_opts;
+ CompressionContext compression_context(cache_options_.compression_type);
+ uint64_t sample_for_compression{0};
+ CompressionInfo compression_info(
+ compression_opts, compression_context, CompressionDict::GetEmptyDict(),
+ cache_options_.compression_type, sample_for_compression);
+
+ bool success =
+ CompressData(val, compression_info,
+ cache_options_.compress_format_version, &compressed_val);
+
+ if (!success) {
+ return Status::Corruption("Error compressing value.");
+ }
+
+ val = Slice(compressed_val);
+ size = compressed_val.size();
+ PERF_COUNTER_ADD(compressed_sec_cache_compressed_bytes, size);
+
+ if (!cache_options_.enable_custom_split_merge) {  // else `val` is split below
+ ptr = AllocateBlock(size, cache_options_.memory_allocator.get());
+ memcpy(ptr.get(), compressed_val.data(), size);
+ }
+ }
+ // Store either a chunk list or a heap-allocated CacheAllocationPtr.
+ PERF_COUNTER_ADD(compressed_sec_cache_insert_real_count, 1);
+ if (cache_options_.enable_custom_split_merge) {
+ size_t charge{0};
+ CacheValueChunk* value_chunks_head =
+ SplitValueIntoChunks(val, cache_options_.compression_type, charge);
+ return cache_->Insert(key, value_chunks_head, charge, del_cb);
+ } else {
+ CacheAllocationPtr* buf = new CacheAllocationPtr(std::move(ptr));
+ return cache_->Insert(key, buf, size, del_cb);
+ }
+}
+
+void CompressedSecondaryCache::Erase(const Slice& key) { cache_->Erase(key); }  // forwards to the wrapped cache
+
+Status CompressedSecondaryCache::SetCapacity(size_t capacity) {
+ MutexLock l(&capacity_mutex_);  // also taken by GetCapacity()
+ cache_options_.capacity = capacity;  // recorded copy, read by GetCapacity()
+ cache_->SetCapacity(capacity);
+ return Status::OK();  // always OK
+}
+
+Status CompressedSecondaryCache::GetCapacity(size_t& capacity) {
+ MutexLock l(&capacity_mutex_);  // also taken by SetCapacity()
+ capacity = cache_options_.capacity;  // the recorded value, not queried from cache_
+ return Status::OK();  // always OK
+}
+
+std::string CompressedSecondaryCache::GetPrintableOptions() const {
+  // Wrapped cache's options first, then the compression settings of this one.
+  std::string ret;
+  ret.reserve(20000);
+  char buffer[200];
+  ret.append(cache_->GetPrintableOptions());
+  snprintf(buffer, sizeof(buffer), " compression_type : %s\n",
+           CompressionTypeToString(cache_options_.compression_type).c_str());
+  ret.append(buffer);
+  snprintf(buffer, sizeof(buffer), " compress_format_version : %u\n",
+           static_cast<unsigned int>(cache_options_.compress_format_version));
+  ret.append(buffer);
+  return ret;
+}
+
+CompressedSecondaryCache::CacheValueChunk*
+CompressedSecondaryCache::SplitValueIntoChunks(const Slice& value,
+                                               CompressionType compression_type,
+                                               size_t& charge) {
+  // Copies `value` into a singly linked list of heap-allocated chunks and
+  // returns its head. `charge` is reset and then set to the total number of
+  // bytes allocated, i.e. the payload plus the per-chunk header overhead.
+  assert(!value.empty());
+  const char* src_ptr = value.data();
+  size_t src_size{value.size()};
+  charge = 0;
+
+  CacheValueChunk dummy_head = CacheValueChunk();
+  CacheValueChunk* current_chunk = &dummy_head;
+  while (src_size > 0) {
+    // Allocation needed to store everything that is left in one chunk.
+    const size_t predicted_chunk_size = sizeof(CacheValueChunk) - 1 + src_size;
+    auto upper =
+        std::upper_bound(malloc_bin_sizes_.begin(), malloc_bin_sizes_.end(),
+                         predicted_chunk_size);
+    // Do not split when value size is too small, too large, close to a bin
+    // size, or there is no compression.
+    const bool keep_whole =
+        upper == malloc_bin_sizes_.begin() ||
+        upper == malloc_bin_sizes_.end() ||
+        *upper - predicted_chunk_size < malloc_bin_sizes_.front() ||
+        compression_type == kNoCompression;
+    // Otherwise fill the largest bin that is not above the prediction.
+    const size_t tmp_size = keep_whole ? predicted_chunk_size : *(--upper);
+
+    CacheValueChunk* new_chunk =
+        reinterpret_cast<CacheValueChunk*>(new char[tmp_size]);
+    current_chunk->next = new_chunk;
+    current_chunk = current_chunk->next;
+    const size_t actual_chunk_size = tmp_size - sizeof(CacheValueChunk) + 1;
+    memcpy(current_chunk->data, src_ptr, actual_chunk_size);
+    current_chunk->size = actual_chunk_size;
+    src_ptr += actual_chunk_size;
+    src_size -= actual_chunk_size;
+    charge += tmp_size;
+  }
+  current_chunk->next = nullptr;
+
+  return dummy_head.next;
+}
+
+CacheAllocationPtr CompressedSecondaryCache::MergeChunksIntoValue(
+    const void* chunks_head, size_t& charge) {
+  // Concatenates the payload of every chunk in the list into one newly
+  // allocated buffer; `charge` receives the combined payload size.
+  const auto* const first =
+      reinterpret_cast<const CacheValueChunk*>(chunks_head);
+  // Pass 1: total payload size.
+  charge = 0;
+  for (const CacheValueChunk* c = first; c != nullptr; c = c->next) {
+    charge += c->size;
+  }
+
+  // Pass 2: copy each payload to its offset in the merged buffer.
+  CacheAllocationPtr ptr =
+      AllocateBlock(charge, cache_options_.memory_allocator.get());
+  size_t pos{0};
+  for (const CacheValueChunk* c = first; c != nullptr; c = c->next) {
+    memcpy(ptr.get() + pos, c->data, c->size);
+    pos += c->size;
+  }
+
+  return ptr;
+}
+
+Cache::DeleterFn CompressedSecondaryCache::GetDeletionCallback(
+    bool enable_custom_split_merge) {
+  // Returns the deleter matching how Insert() stores values. Both variants
+  // treat a null `obj` (the value of a dummy entry) as a no-op.
+  if (enable_custom_split_merge) {
+    return [](const Slice& /*key*/, void* obj) {
+      CacheValueChunk* chunk = reinterpret_cast<CacheValueChunk*>(obj);
+      while (chunk != nullptr) {
+        CacheValueChunk* next = chunk->next;  // read before freeing
+        chunk->Free();
+        chunk = next;
+      }
+    };
+  }
+  // Without split/merge the value is a heap-allocated CacheAllocationPtr.
+  return [](const Slice& /*key*/, void* obj) {
+    delete reinterpret_cast<CacheAllocationPtr*>(obj);
+  };
+}
+
+std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
+ size_t capacity, int num_shard_bits, bool strict_capacity_limit,
+ double high_pri_pool_ratio, double low_pri_pool_ratio,
+ std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex,
+ CacheMetadataChargePolicy metadata_charge_policy,
+ CompressionType compression_type, uint32_t compress_format_version,
+ bool enable_custom_split_merge) {
+ return std::make_shared<CompressedSecondaryCache>(  // same order as the ctor
+ capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio,
+ low_pri_pool_ratio, memory_allocator, use_adaptive_mutex,
+ metadata_charge_policy, compression_type, compress_format_version,
+ enable_custom_split_merge);
+}
+
+std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
+ const CompressedSecondaryCacheOptions& opts) {
+ // `opts` must not name a secondary cache of its own (assert-only check).
+ assert(opts.secondary_cache == nullptr);
+ return NewCompressedSecondaryCache(  // unpack into the argument-list overload
+ opts.capacity, opts.num_shard_bits, opts.strict_capacity_limit,
+ opts.high_pri_pool_ratio, opts.low_pri_pool_ratio, opts.memory_allocator,
+ opts.use_adaptive_mutex, opts.metadata_charge_policy,
+ opts.compression_type, opts.compress_format_version,
+ opts.enable_custom_split_merge);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/compressed_secondary_cache.h b/src/rocksdb/cache/compressed_secondary_cache.h
new file mode 100644
index 000000000..4dee38802
--- /dev/null
+++ b/src/rocksdb/cache/compressed_secondary_cache.h
@@ -0,0 +1,139 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <memory>
+
+#include "cache/lru_cache.h"
+#include "memory/memory_allocator.h"
+#include "rocksdb/secondary_cache.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "util/compression.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CompressedSecondaryCacheResultHandle : public SecondaryCacheResultHandle {
+ public:
+ CompressedSecondaryCacheResultHandle(void* value, size_t size)
+ : value_(value), size_(size) {}
+ ~CompressedSecondaryCacheResultHandle() override = default;
+ // Copying is disabled.
+ CompressedSecondaryCacheResultHandle(
+ const CompressedSecondaryCacheResultHandle&) = delete;
+ CompressedSecondaryCacheResultHandle& operator=(
+ const CompressedSecondaryCacheResultHandle&) = delete;
+ // The value is supplied at construction, so there is never anything pending.
+ bool IsReady() override { return true; }
+
+ void Wait() override {}
+
+ void* Value() override { return value_; }
+
+ size_t Size() override { return size_; }
+
+ private:
+ void* value_;  // not freed by this class (the destructor is defaulted)
+ size_t size_;  // the `size` passed to the constructor
+};
+
+// The CompressedSecondaryCache is a concrete implementation of
+// rocksdb::SecondaryCache.
+//
+// When a block is found from CompressedSecondaryCache::Lookup, we check whether
+// there is a dummy block with the same key in the primary cache.
+// 1. If the dummy block exists, we erase the block from
+// CompressedSecondaryCache and insert it into the primary cache.
+// 2. If not, we just insert a dummy block into the primary cache
+// (charging the actual size of the block) and do not erase the block from
+// CompressedSecondaryCache. A standalone handle is returned to the caller.
+//
+// When a block is evicted from the primary cache, we check whether
+// there is a dummy block with the same key in CompressedSecondaryCache.
+// 1. If the dummy block exists, the block is inserted into
+// CompressedSecondaryCache.
+// 2. If not, we just insert a dummy block (size 0) in CompressedSecondaryCache.
+//
+// Users can also cast a pointer to CompressedSecondaryCache and call methods on
+// it directly, especially custom methods that may be added
+// in the future. For example -
+// std::shared_ptr<rocksdb::SecondaryCache> cache =
+// NewCompressedSecondaryCache(opts);
+// static_cast<CompressedSecondaryCache*>(cache.get())->Erase(key);
+
+class CompressedSecondaryCache : public SecondaryCache {
+ public:
+ CompressedSecondaryCache(
+ size_t capacity, int num_shard_bits, bool strict_capacity_limit,
+ double high_pri_pool_ratio, double low_pri_pool_ratio,
+ std::shared_ptr<MemoryAllocator> memory_allocator = nullptr,
+ bool use_adaptive_mutex = kDefaultToAdaptiveMutex,
+ CacheMetadataChargePolicy metadata_charge_policy =
+ kDefaultCacheMetadataChargePolicy,
+ CompressionType compression_type = CompressionType::kLZ4Compression,
+ uint32_t compress_format_version = 2,
+ bool enable_custom_split_merge = false);
+ ~CompressedSecondaryCache() override;
+
+ const char* Name() const override { return "CompressedSecondaryCache"; }
+ // Stores `value` only if `key` is already present; otherwise adds a dummy.
+ Status Insert(const Slice& key, void* value,
+ const Cache::CacheItemHelper* helper) override;
+ // Returns nullptr on a miss, a dummy entry, or a failed uncompress/create.
+ std::unique_ptr<SecondaryCacheResultHandle> Lookup(
+ const Slice& key, const Cache::CreateCallback& create_cb, bool /*wait*/,
+ bool advise_erase, bool& is_in_sec_cache) override;
+
+ bool SupportForceErase() const override { return true; }
+
+ void Erase(const Slice& key) override;
+ // Nothing to wait for: the handles returned by Lookup are always ready.
+ void WaitAll(std::vector<SecondaryCacheResultHandle*> /*handles*/) override {}
+ // Capacity accessors; both take capacity_mutex_.
+ Status SetCapacity(size_t capacity) override;
+
+ Status GetCapacity(size_t& capacity) override;
+
+ std::string GetPrintableOptions() const override;
+
+ private:
+ friend class CompressedSecondaryCacheTest;
+ static constexpr std::array<uint16_t, 8> malloc_bin_sizes_{
+ 128, 256, 512, 1024, 2048, 4096, 8192, 16384};  // used by SplitValueIntoChunks
+
+ struct CacheValueChunk {
+ // TODO try "CacheAllocationPtr next;".
+ CacheValueChunk* next;
+ size_t size;
+ // Beginning of the chunk data (MUST BE THE LAST FIELD IN THIS STRUCT!)
+ char data[1];
+ // Chunks are allocated as raw char arrays, so they are released the same way.
+ void Free() { delete[] reinterpret_cast<char*>(this); }
+ };
+
+ // Split value into chunks to better fit into jemalloc bins. The chunks
+ // are stored in CacheValueChunk and extra charge is needed for each chunk,
+ // so the cache charge is recalculated here.
+ CacheValueChunk* SplitValueIntoChunks(const Slice& value,
+ CompressionType compression_type,
+ size_t& charge);
+
+ // After merging chunks, the extra charge for each chunk is removed, so
+ // the charge is recalculated.
+ CacheAllocationPtr MergeChunksIntoValue(const void* chunks_head,
+ size_t& charge);
+
+ // Returns the Cache::DeleterFn that matches how values are stored.
+ static Cache::DeleterFn GetDeletionCallback(bool enable_custom_split_merge);
+ std::shared_ptr<Cache> cache_;  // wrapped cache that holds the entries
+ CompressedSecondaryCacheOptions cache_options_;  // construction-time options
+ mutable port::Mutex capacity_mutex_;  // taken by SetCapacity()/GetCapacity()
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/compressed_secondary_cache_test.cc b/src/rocksdb/cache/compressed_secondary_cache_test.cc
new file mode 100644
index 000000000..574c257a7
--- /dev/null
+++ b/src/rocksdb/cache/compressed_secondary_cache_test.cc
@@ -0,0 +1,1005 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "cache/compressed_secondary_cache.h"
+
+#include <iterator>
+#include <memory>
+#include <tuple>
+
+#include "memory/jemalloc_nodump_allocator.h"
+#include "rocksdb/convenience.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CompressedSecondaryCacheTest : public testing::Test {
+ public:
+ CompressedSecondaryCacheTest() : fail_create_(false) {}
+ ~CompressedSecondaryCacheTest() override = default;
+
+ protected:
+ class TestItem {
+ public:
+ TestItem(const char* buf, size_t size) : buf_(new char[size]), size_(size) {
+ memcpy(buf_.get(), buf, size);
+ }
+ ~TestItem() = default;
+
+ char* Buf() { return buf_.get(); }
+ [[nodiscard]] size_t Size() const { return size_; }
+
+ private:
+ std::unique_ptr<char[]> buf_;
+ size_t size_;
+ };
+
+ static size_t SizeCallback(void* obj) {
+ return reinterpret_cast<TestItem*>(obj)->Size();
+ }
+
+ static Status SaveToCallback(void* from_obj, size_t from_offset,
+ size_t length, void* out) {
+ auto item = reinterpret_cast<TestItem*>(from_obj);
+ const char* buf = item->Buf();
+ EXPECT_EQ(length, item->Size());
+ EXPECT_EQ(from_offset, 0);
+ memcpy(out, buf, length);
+ return Status::OK();
+ }
+
+ static void DeletionCallback(const Slice& /*key*/, void* obj) {
+ delete reinterpret_cast<TestItem*>(obj);
+ obj = nullptr;
+ }
+
+ static Cache::CacheItemHelper helper_;
+
+ static Status SaveToCallbackFail(void* /*obj*/, size_t /*offset*/,
+ size_t /*size*/, void* /*out*/) {
+ return Status::NotSupported();
+ }
+
+ static Cache::CacheItemHelper helper_fail_;
+
+ Cache::CreateCallback test_item_creator = [&](const void* buf, size_t size,
+ void** out_obj,
+ size_t* charge) -> Status {
+ if (fail_create_) {
+ return Status::NotSupported();
+ }
+ *out_obj = reinterpret_cast<void*>(new TestItem((char*)buf, size));
+ *charge = size;
+ return Status::OK();
+ };
+
+ void SetFailCreate(bool fail) { fail_create_ = fail; }
+
+ void BasicTestHelper(std::shared_ptr<SecondaryCache> sec_cache,
+ bool sec_cache_is_compressed) {
+ get_perf_context()->Reset();
+ bool is_in_sec_cache{true};
+ // Lookup a non-existent key.
+ std::unique_ptr<SecondaryCacheResultHandle> handle0 = sec_cache->Lookup(
+ "k0", test_item_creator, true, /*advise_erase=*/true, is_in_sec_cache);
+ ASSERT_EQ(handle0, nullptr);
+
+ Random rnd(301);
+ // Insert and Lookup the item k1 for the first time.
+ std::string str1(rnd.RandomString(1000));
+ TestItem item1(str1.data(), str1.length());
+ // A dummy handle is inserted if the item is inserted for the first time.
+ ASSERT_OK(sec_cache->Insert("k1", &item1,
+ &CompressedSecondaryCacheTest::helper_));
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 1);
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
+
+ std::unique_ptr<SecondaryCacheResultHandle> handle1_1 = sec_cache->Lookup(
+ "k1", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
+ ASSERT_EQ(handle1_1, nullptr);
+
+ // Insert and Lookup the item k1 for the second time and advise erasing it.
+ ASSERT_OK(sec_cache->Insert("k1", &item1,
+ &CompressedSecondaryCacheTest::helper_));
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1);
+
+ std::unique_ptr<SecondaryCacheResultHandle> handle1_2 = sec_cache->Lookup(
+ "k1", test_item_creator, true, /*advise_erase=*/true, is_in_sec_cache);
+ ASSERT_NE(handle1_2, nullptr);
+ ASSERT_FALSE(is_in_sec_cache);
+ if (sec_cache_is_compressed) {
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
+ 1000);
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes,
+ 1007);
+ } else {
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
+ }
+
+ std::unique_ptr<TestItem> val1 =
+ std::unique_ptr<TestItem>(static_cast<TestItem*>(handle1_2->Value()));
+ ASSERT_NE(val1, nullptr);
+ ASSERT_EQ(memcmp(val1->Buf(), item1.Buf(), item1.Size()), 0);
+
+ // Lookup the item k1 again.
+ std::unique_ptr<SecondaryCacheResultHandle> handle1_3 = sec_cache->Lookup(
+ "k1", test_item_creator, true, /*advise_erase=*/true, is_in_sec_cache);
+ ASSERT_EQ(handle1_3, nullptr);
+
+ // Insert and Lookup the item k2.
+ std::string str2(rnd.RandomString(1000));
+ TestItem item2(str2.data(), str2.length());
+ ASSERT_OK(sec_cache->Insert("k2", &item2,
+ &CompressedSecondaryCacheTest::helper_));
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 2);
+ std::unique_ptr<SecondaryCacheResultHandle> handle2_1 = sec_cache->Lookup(
+ "k2", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
+ ASSERT_EQ(handle2_1, nullptr);
+
+ ASSERT_OK(sec_cache->Insert("k2", &item2,
+ &CompressedSecondaryCacheTest::helper_));
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 2);
+ if (sec_cache_is_compressed) {
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
+ 2000);
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes,
+ 2014);
+ } else {
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
+ }
+ std::unique_ptr<SecondaryCacheResultHandle> handle2_2 = sec_cache->Lookup(
+ "k2", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
+ ASSERT_NE(handle2_2, nullptr);
+ std::unique_ptr<TestItem> val2 =
+ std::unique_ptr<TestItem>(static_cast<TestItem*>(handle2_2->Value()));
+ ASSERT_NE(val2, nullptr);
+ ASSERT_EQ(memcmp(val2->Buf(), item2.Buf(), item2.Size()), 0);
+
+ std::vector<SecondaryCacheResultHandle*> handles = {handle1_2.get(),
+ handle2_2.get()};
+ sec_cache->WaitAll(handles);
+
+ sec_cache.reset();
+ }
+
+ void BasicTest(bool sec_cache_is_compressed, bool use_jemalloc) {
+ CompressedSecondaryCacheOptions opts;
+ opts.capacity = 2048;
+ opts.num_shard_bits = 0;
+
+ if (sec_cache_is_compressed) {
+ if (!LZ4_Supported()) {
+ ROCKSDB_GTEST_SKIP("This test requires LZ4 support.");
+ opts.compression_type = CompressionType::kNoCompression;
+ sec_cache_is_compressed = false;
+ }
+ } else {
+ opts.compression_type = CompressionType::kNoCompression;
+ }
+
+ if (use_jemalloc) {
+ JemallocAllocatorOptions jopts;
+ std::shared_ptr<MemoryAllocator> allocator;
+ std::string msg;
+ if (JemallocNodumpAllocator::IsSupported(&msg)) {
+ Status s = NewJemallocNodumpAllocator(jopts, &allocator);
+ if (s.ok()) {
+ opts.memory_allocator = allocator;
+ }
+ } else {
+ ROCKSDB_GTEST_BYPASS("JEMALLOC not supported");
+ }
+ }
+ std::shared_ptr<SecondaryCache> sec_cache =
+ NewCompressedSecondaryCache(opts);
+
+ BasicTestHelper(sec_cache, sec_cache_is_compressed);
+ }
+
+ void FailsTest(bool sec_cache_is_compressed) {
+ CompressedSecondaryCacheOptions secondary_cache_opts;
+ if (sec_cache_is_compressed) {
+ if (!LZ4_Supported()) {
+ ROCKSDB_GTEST_SKIP("This test requires LZ4 support.");
+ secondary_cache_opts.compression_type = CompressionType::kNoCompression;
+ }
+ } else {
+ secondary_cache_opts.compression_type = CompressionType::kNoCompression;
+ }
+
+ secondary_cache_opts.capacity = 1100;
+ secondary_cache_opts.num_shard_bits = 0;
+ std::shared_ptr<SecondaryCache> sec_cache =
+ NewCompressedSecondaryCache(secondary_cache_opts);
+
+ // Insert and Lookup the first item.
+ Random rnd(301);
+ std::string str1(rnd.RandomString(1000));
+ TestItem item1(str1.data(), str1.length());
+ // Insert a dummy handle.
+ ASSERT_OK(sec_cache->Insert("k1", &item1,
+ &CompressedSecondaryCacheTest::helper_));
+ // Insert k1.
+ ASSERT_OK(sec_cache->Insert("k1", &item1,
+ &CompressedSecondaryCacheTest::helper_));
+
+ // Insert and Lookup the second item.
+ std::string str2(rnd.RandomString(200));
+ TestItem item2(str2.data(), str2.length());
+ // Insert a dummy handle, k1 is not evicted.
+ ASSERT_OK(sec_cache->Insert("k2", &item2,
+ &CompressedSecondaryCacheTest::helper_));
+ bool is_in_sec_cache{false};
+ std::unique_ptr<SecondaryCacheResultHandle> handle1 = sec_cache->Lookup(
+ "k1", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
+ ASSERT_EQ(handle1, nullptr);
+
+ // Insert k2 and k1 is evicted.
+ ASSERT_OK(sec_cache->Insert("k2", &item2,
+ &CompressedSecondaryCacheTest::helper_));
+ std::unique_ptr<SecondaryCacheResultHandle> handle2 = sec_cache->Lookup(
+ "k2", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
+ ASSERT_NE(handle2, nullptr);
+ std::unique_ptr<TestItem> val2 =
+ std::unique_ptr<TestItem>(static_cast<TestItem*>(handle2->Value()));
+ ASSERT_NE(val2, nullptr);
+ ASSERT_EQ(memcmp(val2->Buf(), item2.Buf(), item2.Size()), 0);
+
+ // Insert k1 again and a dummy handle is inserted.
+ ASSERT_OK(sec_cache->Insert("k1", &item1,
+ &CompressedSecondaryCacheTest::helper_));
+
+ std::unique_ptr<SecondaryCacheResultHandle> handle1_1 = sec_cache->Lookup(
+ "k1", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
+ ASSERT_EQ(handle1_1, nullptr);
+
+ // Create Fails.
+ SetFailCreate(true);
+ std::unique_ptr<SecondaryCacheResultHandle> handle2_1 = sec_cache->Lookup(
+ "k2", test_item_creator, true, /*advise_erase=*/true, is_in_sec_cache);
+ ASSERT_EQ(handle2_1, nullptr);
+
+ // Save Fails.
+ std::string str3 = rnd.RandomString(10);
+ TestItem item3(str3.data(), str3.length());
+ // The Status is OK because a dummy handle is inserted.
+ ASSERT_OK(sec_cache->Insert("k3", &item3,
+ &CompressedSecondaryCacheTest::helper_fail_));
+ ASSERT_NOK(sec_cache->Insert("k3", &item3,
+ &CompressedSecondaryCacheTest::helper_fail_));
+
+ sec_cache.reset();
+ }
+
+ void BasicIntegrationTest(bool sec_cache_is_compressed,
+ bool enable_custom_split_merge) {
+ CompressedSecondaryCacheOptions secondary_cache_opts;
+
+ if (sec_cache_is_compressed) {
+ if (!LZ4_Supported()) {
+ ROCKSDB_GTEST_SKIP("This test requires LZ4 support.");
+ secondary_cache_opts.compression_type = CompressionType::kNoCompression;
+ sec_cache_is_compressed = false;
+ }
+ } else {
+ secondary_cache_opts.compression_type = CompressionType::kNoCompression;
+ }
+
+ secondary_cache_opts.capacity = 6000;
+ secondary_cache_opts.num_shard_bits = 0;
+ secondary_cache_opts.enable_custom_split_merge = enable_custom_split_merge;
+ std::shared_ptr<SecondaryCache> secondary_cache =
+ NewCompressedSecondaryCache(secondary_cache_opts);
+ LRUCacheOptions lru_cache_opts(
+ /*_capacity =*/1300, /*_num_shard_bits =*/0,
+ /*_strict_capacity_limit =*/false, /*_high_pri_pool_ratio =*/0.5,
+ /*_memory_allocator =*/nullptr, kDefaultToAdaptiveMutex,
+ kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio =*/0.0);
+ lru_cache_opts.secondary_cache = secondary_cache;
+ std::shared_ptr<Cache> cache = NewLRUCache(lru_cache_opts);
+ std::shared_ptr<Statistics> stats = CreateDBStatistics();
+
+ get_perf_context()->Reset();
+ Random rnd(301);
+ std::string str1 = rnd.RandomString(1001);
+ auto item1_1 = new TestItem(str1.data(), str1.length());
+ ASSERT_OK(cache->Insert(
+ "k1", item1_1, &CompressedSecondaryCacheTest::helper_, str1.length()));
+
+ std::string str2 = rnd.RandomString(1012);
+ auto item2_1 = new TestItem(str2.data(), str2.length());
+ // After this Insert, primary cache contains k2 and secondary cache contains
+ // k1's dummy item.
+ ASSERT_OK(cache->Insert(
+ "k2", item2_1, &CompressedSecondaryCacheTest::helper_, str2.length()));
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 1);
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
+
+ std::string str3 = rnd.RandomString(1024);
+ auto item3_1 = new TestItem(str3.data(), str3.length());
+ // After this Insert, primary cache contains k3 and secondary cache contains
+ // k1's dummy item and k2's dummy item.
+ ASSERT_OK(cache->Insert(
+ "k3", item3_1, &CompressedSecondaryCacheTest::helper_, str3.length()));
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 2);
+
+ // After this Insert, primary cache contains k1 and secondary cache contains
+ // k1's dummy item, k2's dummy item, and k3's dummy item.
+ auto item1_2 = new TestItem(str1.data(), str1.length());
+ ASSERT_OK(cache->Insert(
+ "k1", item1_2, &CompressedSecondaryCacheTest::helper_, str1.length()));
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 3);
+
+ // After this Insert, primary cache contains k2 and secondary cache contains
+ // k1's item, k2's dummy item, and k3's dummy item.
+ auto item2_2 = new TestItem(str2.data(), str2.length());
+ ASSERT_OK(cache->Insert(
+ "k2", item2_2, &CompressedSecondaryCacheTest::helper_, str2.length()));
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1);
+ if (sec_cache_is_compressed) {
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
+ str1.length());
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes,
+ 1008);
+ } else {
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
+ }
+
+ // After this Insert, primary cache contains k3 and secondary cache contains
+ // k1's item and k2's item.
+ auto item3_2 = new TestItem(str3.data(), str3.length());
+ ASSERT_OK(cache->Insert(
+ "k3", item3_2, &CompressedSecondaryCacheTest::helper_, str3.length()));
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 2);
+ if (sec_cache_is_compressed) {
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
+ str1.length() + str2.length());
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes,
+ 2027);
+ } else {
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
+ }
+
+ Cache::Handle* handle;
+ handle = cache->Lookup("k3", &CompressedSecondaryCacheTest::helper_,
+ test_item_creator, Cache::Priority::LOW, true,
+ stats.get());
+ ASSERT_NE(handle, nullptr);
+ auto val3 = static_cast<TestItem*>(cache->Value(handle));
+ ASSERT_NE(val3, nullptr);
+ ASSERT_EQ(memcmp(val3->Buf(), item3_2->Buf(), item3_2->Size()), 0);
+ cache->Release(handle);
+
+ // Lookup a non-existent key.
+ handle = cache->Lookup("k0", &CompressedSecondaryCacheTest::helper_,
+ test_item_creator, Cache::Priority::LOW, true,
+ stats.get());
+ ASSERT_EQ(handle, nullptr);
+
+ // This Lookup should just insert a dummy handle in the primary cache
+ // and the k1 is still in the secondary cache.
+ handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_,
+ test_item_creator, Cache::Priority::LOW, true,
+ stats.get());
+ ASSERT_NE(handle, nullptr);
+ ASSERT_EQ(get_perf_context()->block_cache_standalone_handle_count, 1);
+ auto val1_1 = static_cast<TestItem*>(cache->Value(handle));
+ ASSERT_NE(val1_1, nullptr);
+ ASSERT_EQ(memcmp(val1_1->Buf(), str1.data(), str1.size()), 0);
+ cache->Release(handle);
+
+ // This Lookup should erase k1 from the secondary cache and insert
+ // it into primary cache; then k3 is demoted.
+ // k2 and k3 are in secondary cache.
+ handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_,
+ test_item_creator, Cache::Priority::LOW, true,
+ stats.get());
+ ASSERT_NE(handle, nullptr);
+ ASSERT_EQ(get_perf_context()->block_cache_standalone_handle_count, 1);
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 3);
+ cache->Release(handle);
+
+ // k2 is still in secondary cache.
+ handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
+ test_item_creator, Cache::Priority::LOW, true,
+ stats.get());
+ ASSERT_NE(handle, nullptr);
+ ASSERT_EQ(get_perf_context()->block_cache_standalone_handle_count, 2);
+ cache->Release(handle);
+
+ // Testing SetCapacity().
+ ASSERT_OK(secondary_cache->SetCapacity(0));
+ handle = cache->Lookup("k3", &CompressedSecondaryCacheTest::helper_,
+ test_item_creator, Cache::Priority::LOW, true,
+ stats.get());
+ ASSERT_EQ(handle, nullptr);
+
+ ASSERT_OK(secondary_cache->SetCapacity(7000));
+ size_t capacity;
+ ASSERT_OK(secondary_cache->GetCapacity(capacity));
+ ASSERT_EQ(capacity, 7000);
+ auto item1_3 = new TestItem(str1.data(), str1.length());
+ // After this Insert, primary cache contains k1.
+ ASSERT_OK(cache->Insert(
+ "k1", item1_3, &CompressedSecondaryCacheTest::helper_, str2.length()));
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 3);
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 4);
+
+ auto item2_3 = new TestItem(str2.data(), str2.length());
+ // After this Insert, primary cache contains k2 and secondary cache contains
+ // k1's dummy item.
+ ASSERT_OK(cache->Insert(
+ "k2", item2_3, &CompressedSecondaryCacheTest::helper_, str1.length()));
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 4);
+
+ auto item1_4 = new TestItem(str1.data(), str1.length());
+ // After this Insert, primary cache contains k1 and secondary cache contains
+ // k1's dummy item and k2's dummy item.
+ ASSERT_OK(cache->Insert(
+ "k1", item1_4, &CompressedSecondaryCacheTest::helper_, str2.length()));
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 5);
+
+ auto item2_4 = new TestItem(str2.data(), str2.length());
+ // After this Insert, primary cache contains k2 and secondary cache contains
+ // k1's real item and k2's dummy item.
+ ASSERT_OK(cache->Insert(
+ "k2", item2_4, &CompressedSecondaryCacheTest::helper_, str2.length()));
+ ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 5);
+ // This Lookup should just insert a dummy handle in the primary cache
+ // and the k1 is still in the secondary cache.
+ handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_,
+ test_item_creator, Cache::Priority::LOW, true,
+ stats.get());
+
+ ASSERT_NE(handle, nullptr);
+ cache->Release(handle);
+ ASSERT_EQ(get_perf_context()->block_cache_standalone_handle_count, 3);
+
+ cache.reset();
+ secondary_cache.reset();
+ }
+
+  // Exercises failure paths of a primary LRU cache backed by a compressed
+  // secondary cache: an Insert() with a null helper must not succeed, and
+  // lookups of the never-inserted key "k2" must return nullptr.
+  void BasicIntegrationFailTest(bool sec_cache_is_compressed) {
+    CompressedSecondaryCacheOptions sec_opts;
+    sec_opts.capacity = 6000;
+    sec_opts.num_shard_bits = 0;
+    if (!sec_cache_is_compressed) {
+      sec_opts.compression_type = CompressionType::kNoCompression;
+    } else if (!LZ4_Supported()) {
+      ROCKSDB_GTEST_SKIP("This test requires LZ4 support.");
+      sec_opts.compression_type = CompressionType::kNoCompression;
+    }
+    std::shared_ptr<SecondaryCache> secondary_cache =
+        NewCompressedSecondaryCache(sec_opts);
+
+    LRUCacheOptions lru_opts(
+        /*_capacity=*/1300, /*_num_shard_bits=*/0,
+        /*_strict_capacity_limit=*/false, /*_high_pri_pool_ratio=*/0.5,
+        /*_memory_allocator=*/nullptr, kDefaultToAdaptiveMutex,
+        kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio=*/0.0);
+    lru_opts.secondary_cache = secondary_cache;
+    std::shared_ptr<Cache> cache = NewLRUCache(lru_opts);
+
+    Random rnd(301);
+    std::string payload = rnd.RandomString(1001);
+    auto owned_item =
+        std::make_unique<TestItem>(payload.data(), payload.length());
+    // Without a helper the insert is expected to be rejected.
+    ASSERT_NOK(
+        cache->Insert("k1", owned_item.get(), nullptr, payload.length()));
+    ASSERT_OK(cache->Insert("k1", owned_item.get(),
+                            &CompressedSecondaryCacheTest::helper_,
+                            payload.length()));
+    owned_item.release();  // Appease clang-analyze "potential memory leak"
+
+    // "k2" was never inserted, so neither lookup may produce a handle.
+    Cache::Handle* missing = cache->Lookup("k2", nullptr, test_item_creator,
+                                           Cache::Priority::LOW, true);
+    ASSERT_EQ(missing, nullptr);
+    missing = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
+                            test_item_creator, Cache::Priority::LOW, false);
+    ASSERT_EQ(missing, nullptr);
+
+    cache.reset();
+    secondary_cache.reset();
+  }
+
+  // Inserts two items with helper_fail_ (whose save callback is
+  // SaveToCallbackFail) and checks that k1, evicted from the primary cache,
+  // cannot be found afterwards while k2 stays available.
+  void IntegrationSaveFailTest(bool sec_cache_is_compressed) {
+    CompressedSecondaryCacheOptions sec_opts;
+    sec_opts.capacity = 6000;
+    sec_opts.num_shard_bits = 0;
+    if (!sec_cache_is_compressed) {
+      sec_opts.compression_type = CompressionType::kNoCompression;
+    } else if (!LZ4_Supported()) {
+      ROCKSDB_GTEST_SKIP("This test requires LZ4 support.");
+      sec_opts.compression_type = CompressionType::kNoCompression;
+    }
+
+    std::shared_ptr<SecondaryCache> secondary_cache =
+        NewCompressedSecondaryCache(sec_opts);
+
+    LRUCacheOptions lru_opts(
+        /*_capacity=*/1300, /*_num_shard_bits=*/0,
+        /*_strict_capacity_limit=*/false, /*_high_pri_pool_ratio=*/0.5,
+        /*_memory_allocator=*/nullptr, kDefaultToAdaptiveMutex,
+        kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio=*/0.0);
+    lru_opts.secondary_cache = secondary_cache;
+    std::shared_ptr<Cache> cache = NewLRUCache(lru_opts);
+
+    Random rnd(301);
+    std::string value1 = rnd.RandomString(1001);
+    auto first_item = new TestItem(value1.data(), value1.length());
+    ASSERT_OK(cache->Insert("k1", first_item,
+                            &CompressedSecondaryCacheTest::helper_fail_,
+                            value1.length()));
+
+    std::string value2 = rnd.RandomString(1002);
+    auto second_item = new TestItem(value2.data(), value2.length());
+    // k1 should be demoted to the secondary cache.
+    ASSERT_OK(cache->Insert("k2", second_item,
+                            &CompressedSecondaryCacheTest::helper_fail_,
+                            value2.length()));
+
+    Cache::Handle* found =
+        cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_fail_,
+                      test_item_creator, Cache::Priority::LOW, true);
+    ASSERT_NE(found, nullptr);
+    cache->Release(found);
+
+    // This lookup should fail, since k1 demotion would have failed.
+    found = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_fail_,
+                          test_item_creator, Cache::Priority::LOW, true);
+    ASSERT_EQ(found, nullptr);
+
+    // Since k1 was not promoted, k2 should still be in cache.
+    found = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_fail_,
+                          test_item_creator, Cache::Priority::LOW, true);
+    ASSERT_NE(found, nullptr);
+    cache->Release(found);
+
+    cache.reset();
+    secondary_cache.reset();
+  }
+
+  // Inserts k1 and k2 with the regular helper, then calls SetFailCreate(true)
+  // and checks that looking up the evicted k1 returns nullptr while k2 can
+  // still be looked up before and after.
+  void IntegrationCreateFailTest(bool sec_cache_is_compressed) {
+    CompressedSecondaryCacheOptions sec_opts;
+    sec_opts.capacity = 6000;
+    sec_opts.num_shard_bits = 0;
+    if (!sec_cache_is_compressed) {
+      sec_opts.compression_type = CompressionType::kNoCompression;
+    } else if (!LZ4_Supported()) {
+      ROCKSDB_GTEST_SKIP("This test requires LZ4 support.");
+      sec_opts.compression_type = CompressionType::kNoCompression;
+    }
+
+    std::shared_ptr<SecondaryCache> secondary_cache =
+        NewCompressedSecondaryCache(sec_opts);
+
+    LRUCacheOptions lru_opts(
+        /*_capacity=*/1300, /*_num_shard_bits=*/0,
+        /*_strict_capacity_limit=*/false, /*_high_pri_pool_ratio=*/0.5,
+        /*_memory_allocator=*/nullptr, kDefaultToAdaptiveMutex,
+        kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio=*/0.0);
+    lru_opts.secondary_cache = secondary_cache;
+    std::shared_ptr<Cache> cache = NewLRUCache(lru_opts);
+
+    Random rnd(301);
+    std::string value1 = rnd.RandomString(1001);
+    auto first_item = new TestItem(value1.data(), value1.length());
+    ASSERT_OK(cache->Insert("k1", first_item,
+                            &CompressedSecondaryCacheTest::helper_,
+                            value1.length()));
+
+    std::string value2 = rnd.RandomString(1002);
+    auto second_item = new TestItem(value2.data(), value2.length());
+    // k1 should be demoted to the secondary cache.
+    ASSERT_OK(cache->Insert("k2", second_item,
+                            &CompressedSecondaryCacheTest::helper_,
+                            value2.length()));
+
+    // From here on, item creation is told to fail.
+    SetFailCreate(true);
+    Cache::Handle* found =
+        cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
+                      test_item_creator, Cache::Priority::LOW, true);
+    ASSERT_NE(found, nullptr);
+    cache->Release(found);
+
+    // This lookup should fail, since k1 creation would have failed
+    found = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_,
+                          test_item_creator, Cache::Priority::LOW, true);
+    ASSERT_EQ(found, nullptr);
+
+    // Since k1 didn't get promoted, k2 should still be in cache
+    found = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
+                          test_item_creator, Cache::Priority::LOW, true);
+    ASSERT_NE(found, nullptr);
+    cache->Release(found);
+
+    cache.reset();
+    secondary_cache.reset();
+  }
+
+  // Fills a 1300-byte primary LRU cache (backed by a 6000-byte compressed
+  // secondary cache) by alternately inserting k1 and k2, then checks that
+  // both keys can still be looked up and that looking up k1 does not push k2
+  // out of reach.
+  void IntegrationFullCapacityTest(bool sec_cache_is_compressed) {
+    CompressedSecondaryCacheOptions secondary_cache_opts;
+
+    if (sec_cache_is_compressed) {
+      if (!LZ4_Supported()) {
+        ROCKSDB_GTEST_SKIP("This test requires LZ4 support.");
+        secondary_cache_opts.compression_type =
+            CompressionType::kNoCompression;
+      }
+    } else {
+      secondary_cache_opts.compression_type = CompressionType::kNoCompression;
+    }
+
+    secondary_cache_opts.capacity = 6000;
+    secondary_cache_opts.num_shard_bits = 0;
+
+    std::shared_ptr<SecondaryCache> secondary_cache =
+        NewCompressedSecondaryCache(secondary_cache_opts);
+
+    LRUCacheOptions opts(
+        /*_capacity=*/1300, /*_num_shard_bits=*/0,
+        /*_strict_capacity_limit=*/false, /*_high_pri_pool_ratio=*/0.5,
+        /*_memory_allocator=*/nullptr, kDefaultToAdaptiveMutex,
+        kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio=*/0.0);
+    opts.secondary_cache = secondary_cache;
+    std::shared_ptr<Cache> cache = NewLRUCache(opts);
+
+    Random rnd(301);
+    std::string str1 = rnd.RandomString(1001);
+    auto item1_1 = new TestItem(str1.data(), str1.length());
+    ASSERT_OK(cache->Insert(
+        "k1", item1_1, &CompressedSecondaryCacheTest::helper_, str1.length()));
+
+    std::string str2 = rnd.RandomString(1002);
+    auto item2 = new TestItem(str2.data(), str2.length());
+    // After this Insert, primary cache contains k2 and secondary cache contains
+    // k1's dummy item.
+    ASSERT_OK(cache->Insert("k2", item2, &CompressedSecondaryCacheTest::helper_,
+                            str2.length()));
+
+    // After this Insert, primary cache contains k1 and secondary cache contains
+    // k1's dummy item and k2's dummy item.
+    auto item1_2 = new TestItem(str1.data(), str1.length());
+    ASSERT_OK(cache->Insert(
+        "k1", item1_2, &CompressedSecondaryCacheTest::helper_, str1.length()));
+
+    auto item2_2 = new TestItem(str2.data(), str2.length());
+    // After this Insert, primary cache contains k2 and secondary cache contains
+    // k1's item and k2's dummy item.
+    ASSERT_OK(cache->Insert(
+        "k2", item2_2, &CompressedSecondaryCacheTest::helper_, str2.length()));
+
+    Cache::Handle* handle2;
+    handle2 = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
+                            test_item_creator, Cache::Priority::LOW, true);
+    ASSERT_NE(handle2, nullptr);
+    cache->Release(handle2);
+
+    // k1 is not promoted into the primary cache because the cache is at
+    // capacity, but the lookup should still succeed.
+    // A k1's dummy item is inserted into primary cache.
+    // NOTE(review): the original comment attributed this to
+    // strict_capacity_limit being true, yet opts above sets it to false --
+    // confirm which is intended.
+    Cache::Handle* handle1;
+    handle1 = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_,
+                            test_item_creator, Cache::Priority::LOW, true);
+    ASSERT_NE(handle1, nullptr);
+    cache->Release(handle1);
+
+    // Since k1 didn't get inserted, k2 should still be in cache
+    handle2 = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
+                            test_item_creator, Cache::Priority::LOW, true);
+    ASSERT_NE(handle2, nullptr);
+    cache->Release(handle2);
+
+    cache.reset();
+    secondary_cache.reset();
+  }
+
+  // Splits an 8500-byte value with SplitValueIntoChunks() and checks the
+  // reported charge and the size of each of the three resulting chunks.
+  // When the jemalloc no-dump allocator is unavailable the test only logs a
+  // bypass message and carries on with a null allocator.
+  void SplitValueIntoChunksTest() {
+    JemallocAllocatorOptions jopts;
+    std::shared_ptr<MemoryAllocator> allocator;
+    std::string msg;
+    const bool jemalloc_ready =
+        JemallocNodumpAllocator::IsSupported(&msg) &&
+        NewJemallocNodumpAllocator(jopts, &allocator).ok();
+    if (!jemalloc_ready) {
+      ROCKSDB_GTEST_BYPASS("JEMALLOC not supported");
+    }
+
+    using CacheValueChunk = CompressedSecondaryCache::CacheValueChunk;
+    auto sec_cache = std::make_unique<CompressedSecondaryCache>(
+        1000, 0, true, 0.5, 0.0, allocator);
+    Random rnd(301);
+    // 8500 = 8169 + 233 + 98, so there should be 3 chunks after split.
+    size_t str_size{8500};
+    std::string str = rnd.RandomString(static_cast<int>(str_size));
+    size_t charge{0};
+    CacheValueChunk* chunks_head =
+        sec_cache->SplitValueIntoChunks(str, kLZ4Compression, charge);
+    ASSERT_EQ(charge, str_size + 3 * (sizeof(CacheValueChunk) - 1));
+
+    // Walk the chain and verify each chunk's payload size in order.
+    CacheValueChunk* chunk = chunks_head;
+    ASSERT_EQ(chunk->size, 8192 - sizeof(CacheValueChunk) + 1);
+    chunk = chunk->next;
+    ASSERT_EQ(chunk->size, 256 - sizeof(CacheValueChunk) + 1);
+    chunk = chunk->next;
+    ASSERT_EQ(chunk->size, 98);
+
+    sec_cache->GetDeletionCallback(true)("dummy", chunks_head);
+  }
+
+  // Hand-builds a chain of three CacheValueChunk nodes (2048, 256 and 31
+  // payload bytes), merges them with MergeChunksIntoValue() and checks that
+  // the merged buffer has the combined size and the concatenated contents.
+  void MergeChunksIntoValueTest() {
+    using CacheValueChunk = CompressedSecondaryCache::CacheValueChunk;
+    Random rnd(301);
+    size_t size1{2048};
+    std::string str1 = rnd.RandomString(static_cast<int>(size1));
+    // Each chunk is allocated as a raw char array sized for the header plus
+    // its payload; the "- 1" matches the sizeof(CacheValueChunk) - 1 header
+    // accounting used throughout these tests.
+    CacheValueChunk* current_chunk = reinterpret_cast<CacheValueChunk*>(
+        new char[sizeof(CacheValueChunk) - 1 + size1]);
+    CacheValueChunk* chunks_head = current_chunk;
+    memcpy(current_chunk->data, str1.data(), size1);
+    current_chunk->size = size1;
+
+    size_t size2{256};
+    std::string str2 = rnd.RandomString(static_cast<int>(size2));
+    current_chunk->next = reinterpret_cast<CacheValueChunk*>(
+        new char[sizeof(CacheValueChunk) - 1 + size2]);
+    current_chunk = current_chunk->next;
+    memcpy(current_chunk->data, str2.data(), size2);
+    current_chunk->size = size2;
+
+    size_t size3{31};
+    std::string str3 = rnd.RandomString(static_cast<int>(size3));
+    current_chunk->next = reinterpret_cast<CacheValueChunk*>(
+        new char[sizeof(CacheValueChunk) - 1 + size3]);
+    current_chunk = current_chunk->next;
+    memcpy(current_chunk->data, str3.data(), size3);
+    current_chunk->size = size3;
+    current_chunk->next = nullptr;
+
+    std::string str = str1 + str2 + str3;
+
+    std::unique_ptr<CompressedSecondaryCache> sec_cache =
+        std::make_unique<CompressedSecondaryCache>(1000, 0, true, 0.5, 0.0);
+    size_t charge{0};
+    CacheAllocationPtr value =
+        sec_cache->MergeChunksIntoValue(chunks_head, charge);
+    ASSERT_EQ(charge, size1 + size2 + size3);
+    std::string value_str{value.get(), charge};
+    // Compare as std::string so that the full length is checked; strcmp
+    // would stop at the first NUL byte and ignore the sizes.
+    ASSERT_EQ(value_str, str);
+
+    while (chunks_head != nullptr) {
+      CacheValueChunk* tmp_chunk = chunks_head;
+      chunks_head = chunks_head->next;
+      tmp_chunk->Free();
+    }
+  }
+
+  // Round trip: splits an 8500-byte value into chunks, merges the chunks
+  // back, and checks that the merged value equals the original string.
+  // When the jemalloc no-dump allocator is unavailable the test only logs a
+  // bypass message and carries on with a null allocator.
+  void SplictValueAndMergeChunksTest() {
+    JemallocAllocatorOptions jopts;
+    std::shared_ptr<MemoryAllocator> allocator;
+    std::string msg;
+    if (JemallocNodumpAllocator::IsSupported(&msg)) {
+      Status s = NewJemallocNodumpAllocator(jopts, &allocator);
+      if (!s.ok()) {
+        ROCKSDB_GTEST_BYPASS("JEMALLOC not supported");
+      }
+    } else {
+      ROCKSDB_GTEST_BYPASS("JEMALLOC not supported");
+    }
+
+    using CacheValueChunk = CompressedSecondaryCache::CacheValueChunk;
+    std::unique_ptr<CompressedSecondaryCache> sec_cache =
+        std::make_unique<CompressedSecondaryCache>(1000, 0, true, 0.5, 0.0,
+                                                   allocator);
+    Random rnd(301);
+    // 8500 = 8169 + 233 + 98, so there should be 3 chunks after split.
+    size_t str_size{8500};
+    std::string str = rnd.RandomString(static_cast<int>(str_size));
+    size_t charge{0};
+    CacheValueChunk* chunks_head =
+        sec_cache->SplitValueIntoChunks(str, kLZ4Compression, charge);
+    ASSERT_EQ(charge, str_size + 3 * (sizeof(CacheValueChunk) - 1));
+
+    // MergeChunksIntoValue overwrites charge with the merged payload size.
+    CacheAllocationPtr value =
+        sec_cache->MergeChunksIntoValue(chunks_head, charge);
+    ASSERT_EQ(charge, str_size);
+    std::string value_str{value.get(), charge};
+    // Compare as std::string so that the full length is checked; strcmp
+    // would stop at the first NUL byte and ignore the sizes.
+    ASSERT_EQ(value_str, str);
+
+    sec_cache->GetDeletionCallback(true)("dummy", chunks_head);
+  }
+
+ private:
+ bool fail_create_;
+};
+
+// Item helper built from the fixture's SizeCallback, SaveToCallback and
+// DeletionCallback; passed to Cache::Insert()/Lookup() by the tests above.
+Cache::CacheItemHelper CompressedSecondaryCacheTest::helper_(
+ CompressedSecondaryCacheTest::SizeCallback,
+ CompressedSecondaryCacheTest::SaveToCallback,
+ CompressedSecondaryCacheTest::DeletionCallback);
+
+// Item helper that uses SaveToCallbackFail as its save callback; used by
+// IntegrationSaveFailTest.
+Cache::CacheItemHelper CompressedSecondaryCacheTest::helper_fail_(
+ CompressedSecondaryCacheTest::SizeCallback,
+ CompressedSecondaryCacheTest::SaveToCallbackFail,
+ CompressedSecondaryCacheTest::DeletionCallback);
+
+// Fixture parameterized on a (compress, use jemalloc) pair of booleans, read
+// from the gtest parameter tuple at construction time.
+class CompressedSecCacheTestWithCompressAndAllocatorParam
+    : public CompressedSecondaryCacheTest,
+      public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  CompressedSecCacheTestWithCompressAndAllocatorParam()
+      : sec_cache_is_compressed_(std::get<0>(GetParam())),
+        use_jemalloc_(std::get<1>(GetParam())) {}
+
+  // First tuple element.
+  bool sec_cache_is_compressed_;
+  // Second tuple element.
+  bool use_jemalloc_;
+};
+
+// Runs the fixture's BasicTest() with the current parameter pair.
+// NOTE(review): the test name "BasicTes" looks like a typo for "BasicTest";
+// it is left unchanged here because renaming it would alter gtest filters.
+TEST_P(CompressedSecCacheTestWithCompressAndAllocatorParam, BasicTes) {
+ BasicTest(sec_cache_is_compressed_, use_jemalloc_);
+}
+
+// Instantiates the test for all four combinations of the two booleans.
+INSTANTIATE_TEST_CASE_P(CompressedSecCacheTests,
+ CompressedSecCacheTestWithCompressAndAllocatorParam,
+ ::testing::Combine(testing::Bool(), testing::Bool()));
+
+// Fixture parameterized on a single boolean that selects whether the
+// secondary cache under test is compressed.
+class CompressedSecondaryCacheTestWithCompressionParam
+    : public CompressedSecondaryCacheTest,
+      public ::testing::WithParamInterface<bool> {
+ public:
+  CompressedSecondaryCacheTestWithCompressionParam()
+      : sec_cache_is_compressed_(GetParam()) {}
+
+  // Mutable: some tests clear it when LZ4 is unavailable.
+  bool sec_cache_is_compressed_;
+};
+
+#ifndef ROCKSDB_LITE
+
+// Builds the secondary cache from a URI string via
+// SecondaryCache::CreateFromString() and runs BasicTestHelper() on it.
+TEST_P(CompressedSecondaryCacheTestWithCompressionParam, BasicTestFromString) {
+  std::shared_ptr<SecondaryCache> sec_cache{nullptr};
+  // Start from the uncompressed configuration; switch to LZ4 only when it is
+  // both requested and available.
+  std::string sec_cache_uri =
+      "compressed_secondary_cache://"
+      "capacity=2048;num_shard_bits=0;compression_type=kNoCompression";
+  if (sec_cache_is_compressed_) {
+    if (LZ4_Supported()) {
+      sec_cache_uri =
+          "compressed_secondary_cache://"
+          "capacity=2048;num_shard_bits=0;compression_type=kLZ4Compression;"
+          "compress_format_version=2";
+    } else {
+      ROCKSDB_GTEST_SKIP("This test requires LZ4 support.");
+      sec_cache_is_compressed_ = false;
+    }
+  }
+  Status s = SecondaryCache::CreateFromString(ConfigOptions(), sec_cache_uri,
+                                              &sec_cache);
+  EXPECT_OK(s);
+  BasicTestHelper(sec_cache, sec_cache_is_compressed_);
+}
+
+// Same as BasicTestFromString, but every URI also sets
+// enable_custom_split_merge=true.
+TEST_P(CompressedSecondaryCacheTestWithCompressionParam,
+       BasicTestFromStringWithSplit) {
+  std::shared_ptr<SecondaryCache> sec_cache{nullptr};
+  // Start from the uncompressed configuration; switch to LZ4 only when it is
+  // both requested and available.
+  std::string sec_cache_uri =
+      "compressed_secondary_cache://"
+      "capacity=2048;num_shard_bits=0;compression_type=kNoCompression;"
+      "enable_custom_split_merge=true";
+  if (sec_cache_is_compressed_) {
+    if (LZ4_Supported()) {
+      sec_cache_uri =
+          "compressed_secondary_cache://"
+          "capacity=2048;num_shard_bits=0;compression_type=kLZ4Compression;"
+          "compress_format_version=2;enable_custom_split_merge=true";
+    } else {
+      ROCKSDB_GTEST_SKIP("This test requires LZ4 support.");
+      sec_cache_is_compressed_ = false;
+    }
+  }
+  Status s = SecondaryCache::CreateFromString(ConfigOptions(), sec_cache_uri,
+                                              &sec_cache);
+  EXPECT_OK(s);
+  BasicTestHelper(sec_cache, sec_cache_is_compressed_);
+}
+
+#endif // ROCKSDB_LITE
+
+// Each TEST_P below forwards to the fixture method of the same name, passing
+// the boolean test parameter.
+TEST_P(CompressedSecondaryCacheTestWithCompressionParam, FailsTest) {
+ FailsTest(sec_cache_is_compressed_);
+}
+
+TEST_P(CompressedSecondaryCacheTestWithCompressionParam,
+ BasicIntegrationFailTest) {
+ BasicIntegrationFailTest(sec_cache_is_compressed_);
+}
+
+TEST_P(CompressedSecondaryCacheTestWithCompressionParam,
+ IntegrationSaveFailTest) {
+ IntegrationSaveFailTest(sec_cache_is_compressed_);
+}
+
+TEST_P(CompressedSecondaryCacheTestWithCompressionParam,
+ IntegrationCreateFailTest) {
+ IntegrationCreateFailTest(sec_cache_is_compressed_);
+}
+
+TEST_P(CompressedSecondaryCacheTestWithCompressionParam,
+ IntegrationFullCapacityTest) {
+ IntegrationFullCapacityTest(sec_cache_is_compressed_);
+}
+
+// Instantiates every test of this fixture once with false and once with true.
+INSTANTIATE_TEST_CASE_P(CompressedSecCacheTests,
+ CompressedSecondaryCacheTestWithCompressionParam,
+ testing::Bool());
+
+// Fixture parameterized on a (compress, enable custom split/merge) pair of
+// booleans, read from the gtest parameter tuple at construction time.
+class CompressedSecCacheTestWithCompressAndSplitParam
+    : public CompressedSecondaryCacheTest,
+      public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  CompressedSecCacheTestWithCompressAndSplitParam()
+      : sec_cache_is_compressed_(std::get<0>(GetParam())),
+        enable_custom_split_merge_(std::get<1>(GetParam())) {}
+
+  // First tuple element.
+  bool sec_cache_is_compressed_;
+  // Second tuple element.
+  bool enable_custom_split_merge_;
+};
+
+// Runs the fixture's BasicIntegrationTest() with the current parameter pair.
+TEST_P(CompressedSecCacheTestWithCompressAndSplitParam, BasicIntegrationTest) {
+ BasicIntegrationTest(sec_cache_is_compressed_, enable_custom_split_merge_);
+}
+
+// Instantiates the test for all four combinations of the two booleans.
+INSTANTIATE_TEST_CASE_P(CompressedSecCacheTests,
+ CompressedSecCacheTestWithCompressAndSplitParam,
+ ::testing::Combine(testing::Bool(), testing::Bool()));
+
+// Non-parameterized tests; each forwards to the fixture method of the same
+// name.
+TEST_F(CompressedSecondaryCacheTest, SplitValueIntoChunksTest) {
+ SplitValueIntoChunksTest();
+}
+
+TEST_F(CompressedSecondaryCacheTest, MergeChunksIntoValueTest) {
+ MergeChunksIntoValueTest();
+}
+
+TEST_F(CompressedSecondaryCacheTest, SplictValueAndMergeChunksTest) {
+ SplictValueAndMergeChunksTest();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+// Test binary entry point: installs the stack trace handler, initializes
+// GoogleTest from the command line and returns the result of running all
+// registered tests.
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  const int exit_code = RUN_ALL_TESTS();
+  return exit_code;
+}
diff --git a/src/rocksdb/cache/lru_cache.cc b/src/rocksdb/cache/lru_cache.cc
new file mode 100644
index 000000000..c8e4d29ba
--- /dev/null
+++ b/src/rocksdb/cache/lru_cache.cc
@@ -0,0 +1,921 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "cache/lru_cache.h"
+
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/statistics.h"
+#include "port/lang.h"
+#include "util/distributed_mutex.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace lru_cache {
+
+// A distinct pointer value for marking "dummy" cache entries. It points at a
+// string literal, so the address is unique to this definition; the const_cast
+// only adapts the type to void* (the pointee is a literal, so nothing should
+// write through it).
+void* const kDummyValueMarker = const_cast<char*>("kDummyValueMarker");
+
+// Creates an empty table with 2^4 = 16 zero-initialized bucket heads.
+// max_upper_hash_bits caps how many length bits Resize() may grow to.
+LRUHandleTable::LRUHandleTable(int max_upper_hash_bits)
+ : length_bits_(/* historical starting size*/ 4),
+ list_(new LRUHandle* [size_t{1} << length_bits_] {}),
+ elems_(0),
+ max_length_bits_(max_upper_hash_bits) {}
+
+// Frees every entry still in the table that has no outstanding references;
+// entries that still have references are left untouched here.
+LRUHandleTable::~LRUHandleTable() {
+  const size_t table_length = size_t{1} << length_bits_;
+  auto free_if_unreferenced = [](LRUHandle* entry) {
+    if (entry->HasRefs()) {
+      return;
+    }
+    entry->Free();
+  };
+  ApplyToEntriesRange(free_if_unreferenced, 0, table_length);
+}
+
+// Returns the entry matching key/hash, or nullptr when there is none.
+LRUHandle* LRUHandleTable::Lookup(const Slice& key, uint32_t hash) {
+  LRUHandle** slot = FindPointer(key, hash);
+  return *slot;
+}
+
+// Links h into the table. If an entry with the same key and hash already
+// exists, h takes its place in the chain and the old entry is returned;
+// otherwise nullptr is returned and the table may grow.
+LRUHandle* LRUHandleTable::Insert(LRUHandle* h) {
+  LRUHandle** slot = FindPointer(h->key(), h->hash);
+  LRUHandle* const replaced = *slot;
+  if (replaced != nullptr) {
+    // Splice h in where the old entry was; element count is unchanged.
+    h->next_hash = replaced->next_hash;
+    *slot = h;
+    return replaced;
+  }
+
+  h->next_hash = nullptr;
+  *slot = h;
+  ++elems_;
+  if ((elems_ >> length_bits_) > 0) {  // elems_ >= length
+    // Since each cache entry is fairly large, we aim for a small
+    // average linked list length (<= 1).
+    Resize();
+  }
+  return nullptr;
+}
+
+// Unlinks and returns the entry matching key/hash; returns nullptr (and
+// changes nothing) when no such entry exists.
+LRUHandle* LRUHandleTable::Remove(const Slice& key, uint32_t hash) {
+  LRUHandle** slot = FindPointer(key, hash);
+  LRUHandle* const removed = *slot;
+  if (removed == nullptr) {
+    return nullptr;
+  }
+  *slot = removed->next_hash;
+  --elems_;
+  return removed;
+}
+
+// Returns a pointer to the slot that points at the entry matching key/hash.
+// If there is no such entry, the returned slot is the trailing null link of
+// the bucket's chain. The bucket is selected by the upper length_bits_ bits
+// of the hash.
+LRUHandle** LRUHandleTable::FindPointer(const Slice& key, uint32_t hash) {
+  LRUHandle** slot = &list_[hash >> (32 - length_bits_)];
+  for (; *slot != nullptr; slot = &(*slot)->next_hash) {
+    LRUHandle* candidate = *slot;
+    if (candidate->hash == hash && !(key != candidate->key())) {
+      break;
+    }
+  }
+  return slot;
+}
+
+// Doubles the number of buckets and re-links every entry into the new
+// bucket array, unless the table has already reached its size limits.
+void LRUHandleTable::Resize() {
+  // Due to reaching limit of hash information, if we made the table bigger,
+  // we would allocate more addresses but only the same number would be used.
+  const bool at_hash_limit = length_bits_ >= max_length_bits_;
+  // Avoid undefined behavior shifting uint32_t by 32.
+  const bool at_shift_limit = length_bits_ >= 31;
+  if (at_hash_limit || at_shift_limit) {
+    return;
+  }
+
+  const uint32_t old_length = uint32_t{1} << length_bits_;
+  const int new_length_bits = length_bits_ + 1;
+  std::unique_ptr<LRUHandle*[]> new_list{
+      new LRUHandle* [size_t{1} << new_length_bits] {}};
+
+  uint32_t moved = 0;
+  for (uint32_t bucket = 0; bucket < old_length; ++bucket) {
+    LRUHandle* next = nullptr;
+    for (LRUHandle* h = list_[bucket]; h != nullptr; h = next) {
+      // Remember the successor before h is pushed onto its new chain.
+      next = h->next_hash;
+      LRUHandle** dest = &new_list[h->hash >> (32 - new_length_bits)];
+      h->next_hash = *dest;
+      *dest = h;
+      ++moved;
+    }
+  }
+  assert(elems_ == moved);
+  list_ = std::move(new_list);
+  length_bits_ = new_length_bits;
+}
+
+// Builds an empty shard. All usage counters and pool capacities start at 0;
+// capacity_ is also initialized to 0 and then set through SetCapacity(),
+// which derives the high-pri/low-pri pool capacities from the ratios.
+LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit,
+ double high_pri_pool_ratio,
+ double low_pri_pool_ratio, bool use_adaptive_mutex,
+ CacheMetadataChargePolicy metadata_charge_policy,
+ int max_upper_hash_bits,
+ SecondaryCache* secondary_cache)
+ : CacheShardBase(metadata_charge_policy),
+ capacity_(0),
+ high_pri_pool_usage_(0),
+ low_pri_pool_usage_(0),
+ strict_capacity_limit_(strict_capacity_limit),
+ high_pri_pool_ratio_(high_pri_pool_ratio),
+ high_pri_pool_capacity_(0),
+ low_pri_pool_ratio_(low_pri_pool_ratio),
+ low_pri_pool_capacity_(0),
+ table_(max_upper_hash_bits),
+ usage_(0),
+ lru_usage_(0),
+ mutex_(use_adaptive_mutex),
+ secondary_cache_(secondary_cache) {
+ // Make empty circular linked list.
+ lru_.next = &lru_;
+ lru_.prev = &lru_;
+ // With no entries, both pool boundary pointers refer to the list head.
+ lru_low_pri_ = &lru_;
+ lru_bottom_pri_ = &lru_;
+ SetCapacity(capacity);
+}
+
+// Removes every entry currently on the LRU list from the shard (list, hash
+// table and usage accounting) while holding the mutex, then frees the
+// removed entries after the mutex has been released.
+void LRUCacheShard::EraseUnRefEntries() {
+  autovector<LRUHandle*> to_free;
+  {
+    DMutexLock l(mutex_);
+    for (LRUHandle* victim = lru_.next; victim != &lru_;
+         victim = lru_.next) {
+      // LRU list contains only elements which can be evicted.
+      assert(victim->InCache() && !victim->HasRefs());
+      LRU_Remove(victim);
+      table_.Remove(victim->key(), victim->hash);
+      victim->SetInCache(false);
+      assert(usage_ >= victim->total_charge);
+      usage_ -= victim->total_charge;
+      to_free.push_back(victim);
+    }
+  }
+
+  for (LRUHandle* entry : to_free) {
+    entry->Free();
+  }
+}
+
+// Invokes callback on the entries of a slice of the hash table, under the
+// shard mutex, and advances *state so a later call continues after it.
+//   callback: receives key, value, charge (per metadata_charge_policy_) and
+//     the entry's deleter.
+//   average_entries_per_lock: number of table buckets covered by this call;
+//     must be > 0.
+//   state: in/out cursor. 0 starts from the first bucket; it is set to
+//     SIZE_MAX once the end of the table has been reached.
+void LRUCacheShard::ApplyToSomeEntries(
+ const std::function<void(const Slice& key, void* value, size_t charge,
+ DeleterFn deleter)>& callback,
+ size_t average_entries_per_lock, size_t* state) {
+ // The state is essentially going to be the starting hash, which works
+ // nicely even if we resize between calls because we use upper-most
+ // hash bits for table indexes.
+ DMutexLock l(mutex_);
+ int length_bits = table_.GetLengthBits();
+ size_t length = size_t{1} << length_bits;
+
+ assert(average_entries_per_lock > 0);
+ // Assuming we are called with same average_entries_per_lock repeatedly,
+ // this simplifies some logic (index_end will not overflow).
+ assert(average_entries_per_lock < length || *state == 0);
+
+ // The top length_bits bits of *state are the bucket index to resume from.
+ size_t index_begin = *state >> (sizeof(size_t) * 8u - length_bits);
+ size_t index_end = index_begin + average_entries_per_lock;
+ if (index_end >= length) {
+ // Going to end
+ index_end = length;
+ *state = SIZE_MAX;
+ } else {
+ // Store the next bucket index back into the top bits of *state.
+ *state = index_end << (sizeof(size_t) * 8u - length_bits);
+ }
+
+ table_.ApplyToEntriesRange(
+ [callback,
+ metadata_charge_policy = metadata_charge_policy_](LRUHandle* h) {
+ // Secondary-cache-compatible entries keep their deleter in the helper;
+ // other entries store it directly.
+ DeleterFn deleter = h->IsSecondaryCacheCompatible()
+ ? h->info_.helper->del_cb
+ : h->info_.deleter;
+ callback(h->key(), h->value, h->GetCharge(metadata_charge_policy),
+ deleter);
+ },
+ index_begin, index_end);
+}
+
+// Test-only accessor: under the mutex, writes the address of the LRU list
+// head and the current low-pri and bottom-pri boundary pointers to the
+// three out-parameters.
+void LRUCacheShard::TEST_GetLRUList(LRUHandle** lru, LRUHandle** lru_low_pri,
+ LRUHandle** lru_bottom_pri) {
+ DMutexLock l(mutex_);
+ *lru = &lru_;
+ *lru_low_pri = lru_low_pri_;
+ *lru_bottom_pri = lru_bottom_pri_;
+}
+
+// Test-only helper: counts the entries on the LRU list while holding the
+// mutex.
+size_t LRUCacheShard::TEST_GetLRUSize() {
+  DMutexLock l(mutex_);
+  size_t count = 0;
+  for (LRUHandle* node = lru_.next; node != &lru_; node = node->next) {
+    ++count;
+  }
+  return count;
+}
+
+// Returns the configured high-priority pool ratio, read under the mutex.
+double LRUCacheShard::GetHighPriPoolRatio() {
+  DMutexLock l(mutex_);
+  const double ratio = high_pri_pool_ratio_;
+  return ratio;
+}
+
+// Returns the configured low-priority pool ratio, read under the mutex.
+double LRUCacheShard::GetLowPriPoolRatio() {
+  DMutexLock l(mutex_);
+  const double ratio = low_pri_pool_ratio_;
+  return ratio;
+}
+
+// Unlinks e from the LRU list and subtracts its charge from lru_usage_ and
+// from the usage of whichever priority pool it was in. If e was a pool
+// boundary, the boundary moves to e's predecessor.
+void LRUCacheShard::LRU_Remove(LRUHandle* e) {
+  assert(e->next != nullptr);
+  assert(e->prev != nullptr);
+  LRUHandle* const before = e->prev;
+  LRUHandle* const after = e->next;
+
+  if (lru_low_pri_ == e) {
+    lru_low_pri_ = before;
+  }
+  if (lru_bottom_pri_ == e) {
+    lru_bottom_pri_ = before;
+  }
+
+  after->prev = before;
+  before->next = after;
+  e->prev = nullptr;
+  e->next = nullptr;
+
+  assert(lru_usage_ >= e->total_charge);
+  lru_usage_ -= e->total_charge;
+
+  // An entry is in at most one of the two tracked pools.
+  assert(!e->InHighPriPool() || !e->InLowPriPool());
+  if (e->InHighPriPool()) {
+    assert(high_pri_pool_usage_ >= e->total_charge);
+    high_pri_pool_usage_ -= e->total_charge;
+    return;
+  }
+  if (e->InLowPriPool()) {
+    assert(low_pri_pool_usage_ >= e->total_charge);
+    low_pri_pool_usage_ -= e->total_charge;
+  }
+}
+
+// Links e (which must not currently be on the list) into the LRU list at the
+// head of one of three pools and adds its charge to lru_usage_:
+//  - high-pri pool, if that pool is enabled and e is high-pri or has a hit;
+//  - low-pri pool, if that pool is enabled and e is high-pri, low-pri or has
+//    a hit;
+//  - bottom-pri pool otherwise.
+void LRUCacheShard::LRU_Insert(LRUHandle* e) {
+ assert(e->next == nullptr);
+ assert(e->prev == nullptr);
+ if (high_pri_pool_ratio_ > 0 && (e->IsHighPri() || e->HasHit())) {
+ // Insert "e" to head of LRU list.
+ e->next = &lru_;
+ e->prev = lru_.prev;
+ e->prev->next = e;
+ e->next->prev = e;
+ e->SetInHighPriPool(true);
+ e->SetInLowPriPool(false);
+ high_pri_pool_usage_ += e->total_charge;
+ MaintainPoolSize();
+ } else if (low_pri_pool_ratio_ > 0 &&
+ (e->IsHighPri() || e->IsLowPri() || e->HasHit())) {
+ // Insert "e" to the head of low-pri pool.
+ e->next = lru_low_pri_->next;
+ e->prev = lru_low_pri_;
+ e->prev->next = e;
+ e->next->prev = e;
+ e->SetInHighPriPool(false);
+ e->SetInLowPriPool(true);
+ low_pri_pool_usage_ += e->total_charge;
+ MaintainPoolSize();
+ // e is now the newest low-pri entry, so it becomes the pool boundary.
+ lru_low_pri_ = e;
+ } else {
+ // Insert "e" to the head of bottom-pri pool.
+ e->next = lru_bottom_pri_->next;
+ e->prev = lru_bottom_pri_;
+ e->prev->next = e;
+ e->next->prev = e;
+ e->SetInHighPriPool(false);
+ e->SetInLowPriPool(false);
+ // if the low-pri pool is empty, lru_low_pri_ also needs to be updated.
+ if (lru_bottom_pri_ == lru_low_pri_) {
+ lru_low_pri_ = e;
+ }
+ lru_bottom_pri_ = e;
+ }
+ lru_usage_ += e->total_charge;
+}
+
+// Brings the high-pri and low-pri pools back within their capacities by
+// advancing the pool boundary pointers: the entry just past a boundary is
+// re-flagged as belonging to the next lower pool and its charge is moved
+// between the pool usage counters. No list links are changed.
+void LRUCacheShard::MaintainPoolSize() {
+ while (high_pri_pool_usage_ > high_pri_pool_capacity_) {
+ // Overflow last entry in high-pri pool to low-pri pool.
+ lru_low_pri_ = lru_low_pri_->next;
+ assert(lru_low_pri_ != &lru_);
+ lru_low_pri_->SetInHighPriPool(false);
+ lru_low_pri_->SetInLowPriPool(true);
+ assert(high_pri_pool_usage_ >= lru_low_pri_->total_charge);
+ high_pri_pool_usage_ -= lru_low_pri_->total_charge;
+ low_pri_pool_usage_ += lru_low_pri_->total_charge;
+ }
+
+ while (low_pri_pool_usage_ > low_pri_pool_capacity_) {
+ // Overflow last entry in low-pri pool to bottom-pri pool.
+ lru_bottom_pri_ = lru_bottom_pri_->next;
+ assert(lru_bottom_pri_ != &lru_);
+ lru_bottom_pri_->SetInHighPriPool(false);
+ lru_bottom_pri_->SetInLowPriPool(false);
+ assert(low_pri_pool_usage_ >= lru_bottom_pri_->total_charge);
+ // Bottom-pri usage is not tracked separately, so only subtract here.
+ low_pri_pool_usage_ -= lru_bottom_pri_->total_charge;
+ }
+}
+
+// Evicts entries, starting at lru_.next, until `usage_ + charge` fits within
+// capacity_ or the LRU list is empty. Each evicted handle is unlinked from the
+// LRU list and the hash table and appended to `deleted`; nothing is freed
+// here. Must be called with mutex_ held.
+void LRUCacheShard::EvictFromLRU(size_t charge,
+                                 autovector<LRUHandle*>* deleted) {
+  while ((usage_ + charge) > capacity_) {
+    LRUHandle* const victim = lru_.next;
+    if (victim == &lru_) {
+      break;  // LRU list is empty; nothing more can be evicted.
+    }
+    // Only evictable (in-cache, unreferenced) entries live on the LRU list.
+    assert(victim->InCache() && !victim->HasRefs());
+    LRU_Remove(victim);
+    table_.Remove(victim->key(), victim->hash);
+    victim->SetInCache(false);
+    assert(usage_ >= victim->total_charge);
+    usage_ -= victim->total_charge;
+    deleted->push_back(victim);
+  }
+}
+
+// Offers each evicted handle to the secondary cache (when one is configured,
+// the handle is secondary-cache compatible and is not already flagged as
+// being in the secondary cache) and then frees the handle. Errors from the
+// secondary cache insert are deliberately ignored.
+void LRUCacheShard::TryInsertIntoSecondaryCache(
+    autovector<LRUHandle*> evicted_handles) {
+  for (LRUHandle* victim : evicted_handles) {
+    const bool demote = secondary_cache_ &&
+                        victim->IsSecondaryCacheCompatible() &&
+                        !victim->IsInSecondaryCache();
+    if (demote) {
+      secondary_cache_
+          ->Insert(victim->key(), victim->value, victim->info_.helper)
+          .PermitUncheckedError();
+    }
+    // Callers invoke this without holding mutex_, so freeing happens outside
+    // of the lock.
+    victim->Free();
+  }
+}
+
+// Updates the shard capacity, recomputes both pool capacities from their
+// ratios and evicts LRU entries until usage fits (or the LRU list is empty).
+// Evicted entries are demoted/freed after the mutex is released.
+void LRUCacheShard::SetCapacity(size_t capacity) {
+  autovector<LRUHandle*> evicted;
+  {
+    DMutexLock guard(mutex_);
+    capacity_ = capacity;
+    high_pri_pool_capacity_ = capacity_ * high_pri_pool_ratio_;
+    low_pri_pool_capacity_ = capacity_ * low_pri_pool_ratio_;
+    EvictFromLRU(/*charge=*/0, &evicted);
+  }
+
+  TryInsertIntoSecondaryCache(evicted);
+}
+
+// Stores the strict-capacity flag under mutex_. The flag is consulted by
+// InsertItem() and Promote() when an entry does not fit.
+void LRUCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) {
+  DMutexLock guard(mutex_);
+  strict_capacity_limit_ = strict_capacity_limit;
+}
+
+// Inserts the pre-built handle `e` into the shard.
+//
+// Space is first reclaimed from the LRU list. If the entry still does not fit
+// and either the capacity limit is strict or the caller did not ask for a
+// handle, the entry is not inserted:
+//   * handle == nullptr: OK is returned and `e` is freed with the other
+//     evicted entries, as if it had been inserted and evicted immediately.
+//   * handle != nullptr: MemoryLimit is returned; when free_handle_on_fail is
+//     set the handle memory is released with free() and *handle is nulled.
+// Otherwise the entry goes into the hash table (displacing any entry with the
+// same key, in which case OkOverwritten is returned) and is either placed on
+// the LRU list (no handle requested) or returned referenced via *handle.
+Status LRUCacheShard::InsertItem(LRUHandle* e, LRUHandle** handle,
+                                 bool free_handle_on_fail) {
+  Status s = Status::OK();
+  autovector<LRUHandle*> to_release;
+
+  {
+    DMutexLock guard(mutex_);
+
+    // Strict LRU eviction until there is room or nothing is left to evict.
+    EvictFromLRU(e->total_charge, &to_release);
+
+    const bool over_capacity = (usage_ + e->total_charge) > capacity_;
+    if (over_capacity && (strict_capacity_limit_ || handle == nullptr)) {
+      e->SetInCache(false);
+      if (handle != nullptr) {
+        if (free_handle_on_fail) {
+          free(e);
+          *handle = nullptr;
+        }
+        s = Status::MemoryLimit("Insert failed due to LRU cache being full.");
+      } else {
+        // Report success; the entry is released together with the evictees.
+        to_release.push_back(e);
+      }
+    } else {
+      // The cache may end up above capacity here if eviction could not free
+      // enough space.
+      LRUHandle* const displaced = table_.Insert(e);
+      usage_ += e->total_charge;
+      if (displaced != nullptr) {
+        s = Status::OkOverwritten();
+        assert(displaced->InCache());
+        displaced->SetInCache(false);
+        if (!displaced->HasRefs()) {
+          // In cache with no references means it is on the LRU list.
+          LRU_Remove(displaced);
+          assert(usage_ >= displaced->total_charge);
+          usage_ -= displaced->total_charge;
+          to_release.push_back(displaced);
+        }
+      }
+      if (handle != nullptr) {
+        // Take a reference for the caller unless one is already held.
+        if (!e->HasRefs()) {
+          e->Ref();
+        }
+        *handle = e;
+      } else {
+        LRU_Insert(e);
+      }
+    }
+  }
+
+  TryInsertIntoSecondaryCache(to_release);
+
+  return s;
+}
+
+// Completes a secondary cache lookup for the pending handle `e`.
+//
+// Takes the value and size from e->sec_handle (which must be ready), deletes
+// the secondary handle and clears the pending flag. Then:
+//   * No value: the lookup failed; `e` is left with value == nullptr and the
+//     caller is responsible for detecting this and releasing `e`.
+//   * Standalone handle: the value is charged against the shard (after LRU
+//     eviction) and a zero-charge dummy entry is inserted into the primary
+//     cache under the same key. If the charge does not fit and the capacity
+//     limit is strict, the value is released and `e` is put into the same
+//     "lookup failed" state as above.
+//   * Otherwise: `e` is inserted into the hash table via InsertItem().
+//
+// `e` is passed by value, so this function must never free the handle itself:
+// the caller still holds the pointer and will release it.
+void LRUCacheShard::Promote(LRUHandle* e) {
+  SecondaryCacheResultHandle* secondary_handle = e->sec_handle;
+
+  assert(secondary_handle->IsReady());
+  // e is not thread-shared here; OK to modify "immutable" fields as well as
+  // "mutable" (normally requiring mutex)
+  e->SetIsPending(false);
+  e->value = secondary_handle->Value();
+  assert(e->total_charge == 0);
+  size_t value_size = secondary_handle->Size();
+  delete secondary_handle;
+
+  if (e->value) {
+    e->CalcTotalCharge(value_size, metadata_charge_policy_);
+    Status s;
+    if (e->IsStandalone()) {
+      assert(secondary_cache_ && secondary_cache_->SupportForceErase());
+
+      // Insert a dummy handle and return a standalone handle to caller.
+      // Charge the standalone handle.
+      autovector<LRUHandle*> last_reference_list;
+      bool reject_standalone_handle{false};
+      {
+        DMutexLock l(mutex_);
+
+        // Free the space following strict LRU policy until enough space
+        // is freed or the lru list is empty.
+        EvictFromLRU(e->total_charge, &last_reference_list);
+
+        if ((usage_ + e->total_charge) > capacity_ && strict_capacity_limit_) {
+          reject_standalone_handle = true;
+        } else {
+          usage_ += e->total_charge;
+        }
+      }
+
+      TryInsertIntoSecondaryCache(last_reference_list);
+      if (reject_standalone_handle) {
+        // The value cannot be charged to the cache. Previously the handle was
+        // freed here and then dereferenced below (and by the caller, which
+        // only has a copy of the pointer). Instead, release just the value
+        // and leave the handle looking like a failed lookup: no value and no
+        // charge, so that releasing it later does not touch usage_.
+        (*e->info_.helper->del_cb)(e->key(), e->value);
+        e->value = nullptr;
+        e->total_charge = 0;
+      } else {
+        PERF_COUNTER_ADD(block_cache_standalone_handle_count, 1);
+      }
+
+      // Insert a dummy handle into the primary cache. This dummy handle is
+      // not IsSecondaryCacheCompatible().
+      // FIXME? This should not overwrite an existing non-dummy entry in the
+      // rare case that one exists
+      Cache::Priority priority =
+          e->IsHighPri() ? Cache::Priority::HIGH : Cache::Priority::LOW;
+      s = Insert(e->key(), e->hash, kDummyValueMarker, /*charge=*/0,
+                 /*deleter=*/nullptr, /*helper=*/nullptr, /*handle=*/nullptr,
+                 priority);
+    } else {
+      e->SetInCache(true);
+      LRUHandle* handle = e;
+      // This InsertItem() could fail if the cache is over capacity and
+      // strict_capacity_limit_ is true. In such a case, we don't want
+      // InsertItem() to free the handle, since the item is already in memory
+      // and the caller will most likely just read it from disk if we erase it
+      // here.
+      s = InsertItem(e, &handle, /*free_handle_on_fail=*/false);
+      if (s.ok()) {
+        PERF_COUNTER_ADD(block_cache_real_handle_count, 1);
+      }
+    }
+
+    if (!s.ok()) {
+      // Item is in memory, but not accounted against the cache capacity.
+      // When the handle is released, the item should get deleted.
+      assert(!e->InCache());
+    }
+  } else {
+    // Secondary cache lookup failed. The caller will take care of detecting
+    // this and eventually releasing e.
+    assert(!e->value);
+    assert(!e->InCache());
+  }
+}
+
+// Looks up `key` in the primary hash table and, on a miss (or when only a
+// dummy entry is found), optionally in the secondary cache.
+//
+// A primary hit is removed from the LRU list if it was unreferenced, gets a
+// new reference and is marked as hit. The secondary cache is consulted only
+// when one is configured and `helper` provides a saveto_cb. With `wait` set
+// the secondary result is promoted immediately and nullptr is returned if it
+// carries no value; without `wait` a pending handle is returned and the
+// caller must check it for success after waiting.
+//
+// When a block is first found in a secondary cache that supports force
+// erase, the caller gets a standalone handle and only a dummy entry is put in
+// the primary cache; the block stays in the secondary cache. Only when the
+// dummy entry is hit again is the block erased from the secondary cache and
+// inserted into the primary cache.
+LRUHandle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash,
+                                 const Cache::CacheItemHelper* helper,
+                                 const Cache::CreateCallback& create_cb,
+                                 Cache::Priority priority, bool wait,
+                                 Statistics* stats) {
+  LRUHandle* e = nullptr;
+  bool hit_dummy{false};
+  {
+    DMutexLock guard(mutex_);
+    e = table_.Lookup(key, hash);
+    if (e != nullptr) {
+      assert(e->InCache());
+      if (e->value != kDummyValueMarker) {
+        if (!e->HasRefs()) {
+          // In the hash table without external references: it is on the LRU
+          // list and must leave it while referenced.
+          LRU_Remove(e);
+        }
+        e->Ref();
+        e->SetHit();
+      } else {
+        // A dummy entry means the value was handed out from the secondary
+        // cache before and may still live there. Treat it as a miss so the
+        // value gets erased from the secondary cache and the dummy entry is
+        // overwritten.
+        hit_dummy = true;
+        e = nullptr;
+      }
+    }
+  }
+
+  const bool try_secondary =
+      !e && secondary_cache_ && helper && helper->saveto_cb;
+  if (!try_secondary) {
+    return e;
+  }
+
+  // Objects coming from the secondary cache need callbacks to create and
+  // delete the primary cache object. Only accounting-only dummy entries may
+  // lack a deleter, and those are never demoted to the secondary cache.
+  assert(create_cb && helper->del_cb);
+  bool is_in_sec_cache{false};
+  std::unique_ptr<SecondaryCacheResultHandle> sec_result =
+      secondary_cache_->Lookup(key, create_cb, wait, hit_dummy,
+                               is_in_sec_cache);
+  if (sec_result == nullptr) {
+    // The caller will most likely overwrite the dummy entry with an Insert
+    // after this failed Lookup.
+    assert(e == nullptr);
+    return e;
+  }
+
+  // Allocate and initialize the handle outside of the mutex.
+  e = static_cast<LRUHandle*>(malloc(sizeof(LRUHandle) - 1 + key.size()));
+
+  e->m_flags = 0;
+  e->im_flags = 0;
+  e->SetSecondaryCacheCompatible(true);
+  e->info_.helper = helper;
+  e->key_length = key.size();
+  e->hash = hash;
+  e->refs = 0;
+  e->next = e->prev = nullptr;
+  e->SetPriority(priority);
+  memcpy(e->key_data, key.data(), key.size());
+  e->value = nullptr;
+  e->sec_handle = sec_result.release();
+  e->total_charge = 0;
+  e->Ref();
+  e->SetIsInSecondaryCache(is_in_sec_cache);
+  e->SetIsStandalone(secondary_cache_->SupportForceErase() && !hit_dummy);
+
+  if (!wait) {
+    // Always hand back a handle; the caller releases it after checking for
+    // success or failure.
+    e->SetIsPending(true);
+    // Slightly inaccurate if the lookup eventually fails, but that is very
+    // unlikely.
+    PERF_COUNTER_ADD(secondary_cache_hit_count, 1);
+    RecordTick(stats, SECONDARY_CACHE_HITS);
+    return e;
+  }
+
+  // Promote() receives the pointer by value, so `e` is still non-null here.
+  Promote(e);
+  if (!e->value) {
+    // The secondary cache returned a handle, but the lookup failed.
+    e->Unref();
+    e->Free();
+    e = nullptr;
+  } else {
+    PERF_COUNTER_ADD(secondary_cache_hit_count, 1);
+    RecordTick(stats, SECONDARY_CACHE_HITS);
+  }
+  return e;
+}
+
+// Adds one more external reference to `e` under mutex_ and returns true.
+// The handle must already be externally referenced and must not be pending.
+bool LRUCacheShard::Ref(LRUHandle* e) {
+  DMutexLock guard(mutex_);
+  // Only an already-referenced entry may gain another reference.
+  assert(e->HasRefs());
+  // Pending handles are private to their creator and are never shared.
+  assert(!e->IsPending());
+  e->Ref();
+  return true;
+}
+
+// Stores the new high-pri pool ratio, recomputes the high-pri pool capacity
+// from it and rebalances the pools, all under mutex_.
+void LRUCacheShard::SetHighPriorityPoolRatio(double high_pri_pool_ratio) {
+  DMutexLock guard(mutex_);
+  high_pri_pool_ratio_ = high_pri_pool_ratio;
+  high_pri_pool_capacity_ = capacity_ * high_pri_pool_ratio_;
+  MaintainPoolSize();
+}
+
+// Stores the new low-pri pool ratio, recomputes the low-pri pool capacity
+// from it and rebalances the pools, all under mutex_.
+void LRUCacheShard::SetLowPriorityPoolRatio(double low_pri_pool_ratio) {
+  DMutexLock guard(mutex_);
+  low_pri_pool_ratio_ = low_pri_pool_ratio;
+  low_pri_pool_capacity_ = capacity_ * low_pri_pool_ratio_;
+  MaintainPoolSize();
+}
+
+// Drops one external reference to `e`. Returns true if the entry was freed.
+//
+// When the last reference goes away and the entry is still in the cache, it
+// is either put back on the LRU list (and kept), or, if the shard is over
+// capacity or erase_if_last_ref is set, removed from the hash table and
+// freed. An entry that is no longer in the cache is freed on its last
+// release. Passing nullptr is a no-op that returns false.
+bool LRUCacheShard::Release(LRUHandle* e, bool /*useful*/,
+                            bool erase_if_last_ref) {
+  if (e == nullptr) {
+    return false;
+  }
+  // Pending handles must go through Wait/WaitAll first; otherwise the
+  // secondary cache handle would leak.
+  assert(!e->IsPending());
+  bool must_free = false;
+  {
+    DMutexLock guard(mutex_);
+    must_free = e->Unref();
+    if (must_free && e->InCache()) {
+      // Still cached and nobody else references it.
+      if (usage_ <= capacity_ && !erase_if_last_ref) {
+        // Keep it: back onto the LRU list, nothing to free.
+        LRU_Insert(e);
+        must_free = false;
+      } else {
+        // When over capacity the LRU list must already be empty.
+        assert(lru_.next == &lru_ || erase_if_last_ref);
+        // Use the opportunity to drop the entry from the cache.
+        table_.Remove(e->key(), e->hash);
+        e->SetInCache(false);
+      }
+    }
+    // A freed entry no longer counts towards the cache usage.
+    if (must_free) {
+      assert(usage_ >= e->total_charge);
+      usage_ -= e->total_charge;
+    }
+  }
+
+  // The actual free happens outside of the mutex for performance reasons.
+  if (must_free) {
+    e->Free();
+  }
+  return must_free;
+}
+
+// Builds a new handle for (key, value) and passes it to InsertItem().
+//
+// Exactly one of `deleter` and `helper` is expected to be used: a non-null
+// `helper` marks the entry as secondary-cache compatible and requires a
+// non-null `value`. The handle memory is allocated here, before any lock is
+// taken; InsertItem() releases it again if the insert is rejected.
+Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
+                             size_t charge,
+                             void (*deleter)(const Slice& key, void* value),
+                             const Cache::CacheItemHelper* helper,
+                             LRUHandle** handle, Cache::Priority priority) {
+  // The trailing key bytes are stored inline after the fixed-size part.
+  LRUHandle* e =
+      static_cast<LRUHandle*>(malloc(sizeof(LRUHandle) - 1 + key.size()));
+
+  // Flags.
+  e->m_flags = 0;
+  e->im_flags = 0;
+  e->SetInCache(true);
+  e->SetPriority(priority);
+
+  // Value and how to dispose of it.
+  e->value = value;
+  if (helper == nullptr) {
+    e->info_.deleter = deleter;
+  } else {
+    // Only one of the two cleanup mechanisms may be supplied.
+    assert(deleter == nullptr);
+    // A null value is reserved to signal failure for secondary-cache
+    // compatible entries.
+    assert(value != nullptr);
+    e->SetSecondaryCacheCompatible(true);
+    e->info_.helper = helper;
+  }
+
+  // Identity, links and accounting.
+  e->hash = hash;
+  e->refs = 0;
+  e->next = e->prev = nullptr;
+  e->key_length = key.size();
+  memcpy(e->key_data, key.data(), key.size());
+  e->CalcTotalCharge(charge, metadata_charge_policy_);
+
+  return InsertItem(e, handle, /* free_handle_on_fail */ true);
+}
+
+// Removes the entry for `key` from the hash table, if present. An entry
+// without external references is also taken off the LRU list, uncharged and
+// freed; a referenced entry is only marked as no longer in the cache.
+void LRUCacheShard::Erase(const Slice& key, uint32_t hash) {
+  LRUHandle* removed = nullptr;
+  bool free_now = false;
+  {
+    DMutexLock guard(mutex_);
+    removed = table_.Remove(key, hash);
+    if (removed != nullptr) {
+      assert(removed->InCache());
+      removed->SetInCache(false);
+      free_now = !removed->HasRefs();
+      if (free_now) {
+        // It was in the hash table without external references, so it is on
+        // the LRU list.
+        LRU_Remove(removed);
+        assert(usage_ >= removed->total_charge);
+        usage_ -= removed->total_charge;
+      }
+    }
+  }
+
+  // free_now implies removed != nullptr. Freeing is done outside of the
+  // mutex for performance reasons.
+  if (free_now) {
+    removed->Free();
+  }
+}
+
+// Returns true unless `e` is a pending handle whose secondary cache lookup
+// has not completed yet.
+bool LRUCacheShard::IsReady(LRUHandle* e) {
+  if (!e->IsPending()) {
+    return true;
+  }
+  assert(secondary_cache_);
+  assert(e->sec_handle);
+  return e->sec_handle->IsReady();
+}
+
+// Returns the total charge currently accounted to this shard (read under
+// mutex_).
+size_t LRUCacheShard::GetUsage() const {
+  DMutexLock guard(mutex_);
+  return usage_;
+}
+
+// Returns the charge of entries that are not on the LRU list, i.e. total
+// usage minus LRU usage (read under mutex_).
+size_t LRUCacheShard::GetPinnedUsage() const {
+  DMutexLock guard(mutex_);
+  assert(usage_ >= lru_usage_);
+  const size_t pinned = usage_ - lru_usage_;
+  return pinned;
+}
+
+// Returns the number of entries in the hash table (read under mutex_).
+size_t LRUCacheShard::GetOccupancyCount() const {
+  DMutexLock guard(mutex_);
+  return table_.GetOccupancyCount();
+}
+
+// Returns the number of buckets in the hash table, i.e. 2^length_bits (read
+// under mutex_).
+size_t LRUCacheShard::GetTableAddressCount() const {
+  DMutexLock guard(mutex_);
+  const int length_bits = table_.GetLengthBits();
+  return size_t{1} << length_bits;
+}
+
+// Appends the shard's high-pri and low-pri pool ratios to `str`, one per
+// line. The ratios are read under mutex_ and formatted into a fixed-size
+// stack buffer.
+void LRUCacheShard::AppendPrintableOptions(std::string& str) const {
+  const int kBufferSize = 200;
+  char buffer[kBufferSize];
+  {
+    DMutexLock guard(mutex_);
+    snprintf(buffer, kBufferSize, " high_pri_pool_ratio: %.3lf\n",
+             high_pri_pool_ratio_);
+    // Continue right after what the first call actually stored.
+    const size_t used = strlen(buffer);
+    snprintf(buffer + used, kBufferSize - used,
+             " low_pri_pool_ratio: %.3lf\n", low_pri_pool_ratio_);
+  }
+  str.append(buffer);
+}
+
+// Constructs the sharded cache and placement-constructs every shard with the
+// per-shard capacity, the shared settings and a raw pointer to the secondary
+// cache. The cache object keeps the owning shared_ptr in secondary_cache_.
+LRUCache::LRUCache(size_t capacity, int num_shard_bits,
+                   bool strict_capacity_limit, double high_pri_pool_ratio,
+                   double low_pri_pool_ratio,
+                   std::shared_ptr<MemoryAllocator> allocator,
+                   bool use_adaptive_mutex,
+                   CacheMetadataChargePolicy metadata_charge_policy,
+                   std::shared_ptr<SecondaryCache> _secondary_cache)
+    : ShardedCache(capacity, num_shard_bits, strict_capacity_limit,
+                   std::move(allocator)),
+      secondary_cache_(std::move(_secondary_cache)) {
+  const size_t shard_capacity = GetPerShardCapacity();
+  SecondaryCache* const raw_secondary_cache = secondary_cache_.get();
+  // Hash bits not consumed by shard selection are available to the table.
+  const int max_upper_hash_bits = 32 - num_shard_bits;
+  InitShards([=](LRUCacheShard* cs) {
+    new (cs) LRUCacheShard(shard_capacity, strict_capacity_limit,
+                           high_pri_pool_ratio, low_pri_pool_ratio,
+                           use_adaptive_mutex, metadata_charge_policy,
+                           max_upper_hash_bits, raw_secondary_cache);
+  });
+}
+
+// Returns the value stored in `handle`. A pending handle must not carry a
+// value yet, and dummy entries must never be exposed to callers.
+void* LRUCache::Value(Handle* handle) {
+  const LRUHandle* entry = reinterpret_cast<const LRUHandle*>(handle);
+  assert(!entry->IsPending() || entry->value == nullptr);
+  assert(entry->value != kDummyValueMarker);
+  return entry->value;
+}
+
+// Returns the charge of `handle` without its metadata charge, computed with
+// the metadata charge policy of shard 0.
+size_t LRUCache::GetCharge(Handle* handle) const {
+  const LRUHandle* entry = reinterpret_cast<const LRUHandle*>(handle);
+  return entry->GetCharge(GetShard(0).metadata_charge_policy_);
+}
+
+// Returns the function used to destroy the value of `handle`: the helper's
+// del_cb for secondary-cache compatible entries, the plain deleter otherwise.
+Cache::DeleterFn LRUCache::GetDeleter(Handle* handle) const {
+  const LRUHandle* entry = reinterpret_cast<const LRUHandle*>(handle);
+  if (!entry->IsSecondaryCacheCompatible()) {
+    return entry->info_.deleter;
+  }
+  return entry->info_.helper->del_cb;
+}
+
+// Test helper: sums TEST_GetLRUSize() over all shards.
+size_t LRUCache::TEST_GetLRUSize() {
+  return SumOverShards(
+      [](LRUCacheShard& shard) { return shard.TEST_GetLRUSize(); });
+}
+
+// Returns the high-pri pool ratio as reported by shard 0.
+double LRUCache::GetHighPriPoolRatio() {
+  auto& first_shard = GetShard(0);
+  return first_shard.GetHighPriPoolRatio();
+}
+
+// Waits for all pending handles in `handles` and promotes each of them in
+// its shard. Null and non-pending handles are skipped. Does nothing when no
+// secondary cache is configured.
+void LRUCache::WaitAll(std::vector<Handle*>& handles) {
+  if (!secondary_cache_) {
+    return;
+  }
+
+  // Maps a generic handle to its LRUHandle if it is non-null and still
+  // pending, and to nullptr otherwise.
+  const auto as_pending = [](Handle* handle) -> LRUHandle* {
+    if (!handle) {
+      return nullptr;
+    }
+    LRUHandle* lru_handle = reinterpret_cast<LRUHandle*>(handle);
+    return lru_handle->IsPending() ? lru_handle : nullptr;
+  };
+
+  std::vector<SecondaryCacheResultHandle*> sec_handles;
+  sec_handles.reserve(handles.size());
+  for (Handle* handle : handles) {
+    if (LRUHandle* pending = as_pending(handle)) {
+      sec_handles.emplace_back(pending->sec_handle);
+    }
+  }
+
+  secondary_cache_->WaitAll(sec_handles);
+
+  // The pending state is re-evaluated per handle here, so a handle is only
+  // promoted while it is still pending.
+  for (Handle* handle : handles) {
+    if (LRUHandle* pending = as_pending(handle)) {
+      GetShard(pending->hash).Promote(pending);
+    }
+  }
+}
+
+// Appends the options of the base sharded cache to `str`, followed by the
+// secondary cache's printable options when a secondary cache is configured.
+void LRUCache::AppendPrintableOptions(std::string& str) const {
+  ShardedCache::AppendPrintableOptions(str);  // options from shard
+  if (!secondary_cache_) {
+    return;
+  }
+  str.append(" secondary_cache:\n");
+  str.append(secondary_cache_->GetPrintableOptions());
+}
+
+} // namespace lru_cache
+
+// Creates an LRUCache after validating the arguments. Returns nullptr when
+// num_shard_bits is 20 or more, when either pool ratio is outside [0, 1], or
+// when the two ratios add up to more than 1. A negative num_shard_bits selects
+// the default shard count for the given capacity.
+std::shared_ptr<Cache> NewLRUCache(
+    size_t capacity, int num_shard_bits, bool strict_capacity_limit,
+    double high_pri_pool_ratio,
+    std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex,
+    CacheMetadataChargePolicy metadata_charge_policy,
+    const std::shared_ptr<SecondaryCache>& secondary_cache,
+    double low_pri_pool_ratio) {
+  const auto out_of_unit_range = [](double ratio) {
+    return ratio < 0.0 || ratio > 1.0;
+  };
+
+  // The cache cannot be sharded into too many fine pieces.
+  const bool too_many_shards = num_shard_bits >= 20;
+  // Each ratio must be valid on its own and they must not exceed 1 together.
+  const bool bad_ratios = out_of_unit_range(high_pri_pool_ratio) ||
+                          out_of_unit_range(low_pri_pool_ratio) ||
+                          low_pri_pool_ratio + high_pri_pool_ratio > 1.0;
+  if (too_many_shards || bad_ratios) {
+    return nullptr;
+  }
+
+  if (num_shard_bits < 0) {
+    num_shard_bits = GetDefaultCacheShardBits(capacity);
+  }
+  return std::make_shared<LRUCache>(
+      capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio,
+      low_pri_pool_ratio, std::move(memory_allocator), use_adaptive_mutex,
+      metadata_charge_policy, secondary_cache);
+}
+
+// Convenience overload: unpacks `cache_opts` and forwards to the full
+// NewLRUCache() overload.
+std::shared_ptr<Cache> NewLRUCache(const LRUCacheOptions& cache_opts) {
+  const LRUCacheOptions& o = cache_opts;
+  return NewLRUCache(o.capacity, o.num_shard_bits, o.strict_capacity_limit,
+                     o.high_pri_pool_ratio, o.memory_allocator,
+                     o.use_adaptive_mutex, o.metadata_charge_policy,
+                     o.secondary_cache, o.low_pri_pool_ratio);
+}
+
+// Overload without a secondary cache: forwards to the full NewLRUCache()
+// overload with an empty secondary cache pointer.
+std::shared_ptr<Cache> NewLRUCache(
+    size_t capacity, int num_shard_bits, bool strict_capacity_limit,
+    double high_pri_pool_ratio,
+    std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex,
+    CacheMetadataChargePolicy metadata_charge_policy,
+    double low_pri_pool_ratio) {
+  const std::shared_ptr<SecondaryCache> no_secondary_cache;
+  return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit,
+                     high_pri_pool_ratio, memory_allocator, use_adaptive_mutex,
+                     metadata_charge_policy, no_secondary_cache,
+                     low_pri_pool_ratio);
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/lru_cache.h b/src/rocksdb/cache/lru_cache.h
new file mode 100644
index 000000000..99b2f2b20
--- /dev/null
+++ b/src/rocksdb/cache/lru_cache.h
@@ -0,0 +1,546 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "cache/sharded_cache.h"
+#include "port/lang.h"
+#include "port/malloc.h"
+#include "port/port.h"
+#include "rocksdb/secondary_cache.h"
+#include "util/autovector.h"
+#include "util/distributed_mutex.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace lru_cache {
+
+// LRU cache implementation. This class is not thread-safe.
+
+// An entry is a variable length heap-allocated structure.
+// Entries are referenced by cache and/or by any external entity.
+// The cache keeps all its entries in a hash table. Some elements
+// are also stored on LRU list.
+//
+// LRUHandle can be in these states:
+// 1. Referenced externally AND in hash table.
+// In that case the entry is *not* in the LRU list
+// (refs >= 1 && in_cache == true)
+// 2. Not referenced externally AND in hash table.
+// In that case the entry is in the LRU list and can be freed.
+// (refs == 0 && in_cache == true)
+// 3. Referenced externally AND not in hash table.
+// In that case the entry is not in the LRU list and not in hash table.
+// The entry must be freed if refs becomes 0 in this state.
+// (refs >= 1 && in_cache == false)
+// If you call LRUCacheShard::Release enough times on an entry in state 1, it
+// will go into state 2. To move from state 1 to state 3, either call
+// LRUCacheShard::Erase or LRUCacheShard::Insert with the same key (but
+// possibly different value). To move from state 2 to state 1, use
+// LRUCacheShard::Lookup.
+// While refs > 0, public properties like value and deleter must not change.
+
+// A single cache entry: fixed-size bookkeeping followed by the key bytes,
+// allocated as one variable-length block.
+struct LRUHandle {
+  // The cached object.
+  void* value;
+  // How to destroy `value`: a plain deleter, or a helper for entries that are
+  // secondary-cache compatible. Which member is active is determined by
+  // IsSecondaryCacheCompatible().
+  union Info {
+    Info() {}
+    ~Info() {}
+    Cache::DeleterFn deleter;
+    const Cache::CacheItemHelper* helper;
+  } info_;
+  // A handle only enters the LRUHandleTable once its secondary cache lookup
+  // has completed, so the hash chain link and the secondary handle can share
+  // storage.
+  union {
+    LRUHandle* next_hash;
+    SecondaryCacheResultHandle* sec_handle;
+  };
+  // Links of the LRU list.
+  LRUHandle* next;
+  LRUHandle* prev;
+  size_t total_charge;  // TODO(opt): Only allow uint32_t?
+  size_t key_length;
+  // Hash of key(); used for fast sharding and comparisons.
+  uint32_t hash;
+  // Number of external references. The cache itself does not count.
+  uint32_t refs;
+
+  // Mutable flags, guarded by the shard mutex. The m_/M_ and im_/IM_ prefixes
+  // are meant to make it obvious when a flag is tested against the wrong
+  // field.
+  uint8_t m_flags;
+  enum MFlags : uint8_t {
+    // Referenced by the hash table.
+    M_IN_CACHE = (1 << 0),
+    // Has been looked up (hit) at least once.
+    M_HAS_HIT = (1 << 1),
+    // Currently in the high-pri pool.
+    M_IN_HIGH_PRI_POOL = (1 << 2),
+    // Currently in the low-pri pool.
+    M_IN_LOW_PRI_POOL = (1 << 3),
+  };
+
+  // "Immutable" flags: only written in a single-threaded context and
+  // afterwards readable without the mutex.
+  uint8_t im_flags;
+  enum ImFlags : uint8_t {
+    // Entry was inserted with high priority.
+    IM_IS_HIGH_PRI = (1 << 0),
+    // Entry was inserted with low priority.
+    IM_IS_LOW_PRI = (1 << 1),
+    // Entry may be inserted into the secondary cache.
+    IM_IS_SECONDARY_CACHE_COMPATIBLE = (1 << 2),
+    // Handle is still being read from a lower tier.
+    IM_IS_PENDING = (1 << 3),
+    // Handle is still present in a lower tier.
+    IM_IS_IN_SECONDARY_CACHE = (1 << 4),
+    // Result handle that should not be inserted into the cache.
+    IM_IS_STANDALONE = (1 << 5),
+  };
+
+  // Beginning of the key (MUST BE THE LAST FIELD IN THIS STRUCT!)
+  char key_data[1];
+
+  // Sets (on == true) or clears (on == false) the bits of `mask` in `*flags`.
+  static void AssignFlag(uint8_t* flags, uint8_t mask, bool on) {
+    if (on) {
+      *flags |= mask;
+    } else {
+      *flags &= static_cast<uint8_t>(~mask);
+    }
+  }
+
+  Slice key() const { return Slice(key_data, key_length); }
+
+  // For HandleImpl concept
+  uint32_t GetHash() const { return hash; }
+
+  // Adds one external reference.
+  void Ref() { refs++; }
+
+  // Drops one external reference; returns true if it was the last one.
+  bool Unref() {
+    assert(refs > 0);
+    return --refs == 0;
+  }
+
+  // True while at least one external reference exists.
+  bool HasRefs() const { return refs > 0; }
+
+  bool InCache() const { return (m_flags & M_IN_CACHE) != 0; }
+  bool IsHighPri() const { return (im_flags & IM_IS_HIGH_PRI) != 0; }
+  bool InHighPriPool() const { return (m_flags & M_IN_HIGH_PRI_POOL) != 0; }
+  bool IsLowPri() const { return (im_flags & IM_IS_LOW_PRI) != 0; }
+  bool InLowPriPool() const { return (m_flags & M_IN_LOW_PRI_POOL) != 0; }
+  bool HasHit() const { return (m_flags & M_HAS_HIT) != 0; }
+  bool IsSecondaryCacheCompatible() const {
+    return (im_flags & IM_IS_SECONDARY_CACHE_COMPATIBLE) != 0;
+  }
+  bool IsPending() const { return (im_flags & IM_IS_PENDING) != 0; }
+  bool IsInSecondaryCache() const {
+    return (im_flags & IM_IS_IN_SECONDARY_CACHE) != 0;
+  }
+  bool IsStandalone() const { return (im_flags & IM_IS_STANDALONE) != 0; }
+
+  void SetInCache(bool in_cache) {
+    AssignFlag(&m_flags, M_IN_CACHE, in_cache);
+  }
+
+  // HIGH sets only the high-pri bit, LOW only the low-pri bit; any other
+  // priority clears both.
+  void SetPriority(Cache::Priority priority) {
+    AssignFlag(&im_flags, IM_IS_HIGH_PRI, priority == Cache::Priority::HIGH);
+    AssignFlag(&im_flags, IM_IS_LOW_PRI, priority == Cache::Priority::LOW);
+  }
+
+  void SetInHighPriPool(bool in_high_pri_pool) {
+    AssignFlag(&m_flags, M_IN_HIGH_PRI_POOL, in_high_pri_pool);
+  }
+
+  void SetInLowPriPool(bool in_low_pri_pool) {
+    AssignFlag(&m_flags, M_IN_LOW_PRI_POOL, in_low_pri_pool);
+  }
+
+  void SetHit() { AssignFlag(&m_flags, M_HAS_HIT, true); }
+
+  void SetSecondaryCacheCompatible(bool compat) {
+    AssignFlag(&im_flags, IM_IS_SECONDARY_CACHE_COMPATIBLE, compat);
+  }
+
+  void SetIsPending(bool pending) {
+    AssignFlag(&im_flags, IM_IS_PENDING, pending);
+  }
+
+  void SetIsInSecondaryCache(bool is_in_secondary_cache) {
+    AssignFlag(&im_flags, IM_IS_IN_SECONDARY_CACHE, is_in_secondary_cache);
+  }
+
+  void SetIsStandalone(bool is_standalone) {
+    AssignFlag(&im_flags, IM_IS_STANDALONE, is_standalone);
+  }
+
+  // Destroys the value (if any) and releases the handle memory. Must only be
+  // called once no references remain. A still-pending handle first waits for
+  // its secondary cache lookup and takes over the resulting value so that it
+  // can be destroyed as well.
+  void Free() {
+    assert(refs == 0);
+
+    if (IsSecondaryCacheCompatible()) {
+      if (IsPending()) {
+        assert(sec_handle != nullptr);
+        SecondaryCacheResultHandle* pending_result = sec_handle;
+        pending_result->Wait();
+        value = pending_result->Value();
+        delete pending_result;
+      }
+      if (value) {
+        (*info_.helper->del_cb)(key(), value);
+      }
+    } else if (info_.deleter) {
+      (*info_.deleter)(key(), value);
+    }
+
+    free(this);
+  }
+
+  // Returns the metadata charge of this handle: zero unless the policy is
+  // kFullChargeCacheMetadata, in which case it is the size of the handle
+  // allocation.
+  inline size_t CalcuMetaCharge(
+      CacheMetadataChargePolicy metadata_charge_policy) const {
+    if (metadata_charge_policy == kFullChargeCacheMetadata) {
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+      return malloc_usable_size(
+          const_cast<void*>(static_cast<const void*>(this)));
+#else
+      // Same size as requested when a new handle is allocated.
+      return sizeof(LRUHandle) - 1 + key_length;
+#endif
+    }
+    return 0;
+  }
+
+  // Sets total_charge to `charge` plus the metadata charge.
+  inline void CalcTotalCharge(
+      size_t charge, CacheMetadataChargePolicy metadata_charge_policy) {
+    total_charge = charge + CalcuMetaCharge(metadata_charge_policy);
+  }
+
+  // Returns total_charge without the metadata charge.
+  inline size_t GetCharge(
+      CacheMetadataChargePolicy metadata_charge_policy) const {
+    const size_t meta_charge = CalcuMetaCharge(metadata_charge_policy);
+    assert(total_charge >= meta_charge);
+    return total_charge - meta_charge;
+  }
+};
+
+// We provide our own simple hash table since it removes a whole bunch
+// of porting hacks and is also faster than some of the built-in hash
+// table implementations in some of the compiler/runtime combinations
+// we have tested. E.g., readrandom speeds up by ~5% over the g++
+// 4.4.3's builtin hashtable.
+// Chained hash table of LRUHandle pointers, indexed by the upper bits of the
+// entry hash.
+class LRUHandleTable {
+ public:
+  explicit LRUHandleTable(int max_upper_hash_bits);
+  ~LRUHandleTable();
+
+  LRUHandle* Lookup(const Slice& key, uint32_t hash);
+  LRUHandle* Insert(LRUHandle* h);
+  LRUHandle* Remove(const Slice& key, uint32_t hash);
+
+  // Calls func(h) for every entry in buckets [index_begin, index_end). The
+  // chain successor is read before func runs, so func may invalidate the
+  // entry it is given.
+  template <typename T>
+  void ApplyToEntriesRange(T func, size_t index_begin, size_t index_end) {
+    for (size_t bucket = index_begin; bucket < index_end; bucket++) {
+      for (LRUHandle* h = list_[bucket]; h != nullptr;) {
+        LRUHandle* const successor = h->next_hash;
+        assert(h->InCache());
+        func(h);
+        h = successor;
+      }
+    }
+  }
+
+  int GetLengthBits() const { return length_bits_; }
+
+  size_t GetOccupancyCount() const { return elems_; }
+
+ private:
+  // Returns a pointer to the slot holding the entry that matches key/hash,
+  // or, if there is none, a pointer to the trailing slot of the bucket's
+  // chain.
+  LRUHandle** FindPointer(const Slice& key, uint32_t hash);
+
+  void Resize();
+
+  // Number of (upper) hash bits used as table index; the lower bits are used
+  // for sharding. The table has 1 << length_bits_ buckets.
+  int length_bits_;
+
+  // Bucket array; each bucket is a singly linked chain of entries that hash
+  // into it.
+  std::unique_ptr<LRUHandle*[]> list_;
+
+  // Number of entries currently stored.
+  uint32_t elems_;
+
+  // Upper bound for length_bits_, set from max_upper_hash_bits.
+  const int max_length_bits_;
+};
+
+// A single shard of sharded cache.
+class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
+ public:
+ LRUCacheShard(size_t capacity, bool strict_capacity_limit,
+ double high_pri_pool_ratio, double low_pri_pool_ratio,
+ bool use_adaptive_mutex,
+ CacheMetadataChargePolicy metadata_charge_policy,
+ int max_upper_hash_bits, SecondaryCache* secondary_cache);
+
+ public: // Type definitions expected as parameter to ShardedCache
+ using HandleImpl = LRUHandle;
+ using HashVal = uint32_t;
+ using HashCref = uint32_t;
+
+ public: // Function definitions expected as parameter to ShardedCache
+ static inline HashVal ComputeHash(const Slice& key) {
+ return Lower32of64(GetSliceNPHash64(key));
+ }
+
+ // Separate from constructor so caller can easily make an array of LRUCache
+ // if current usage is more than new capacity, the function will attempt to
+ // free the needed space.
+ void SetCapacity(size_t capacity);
+
+ // Set the flag to reject insertion if cache if full.
+ void SetStrictCapacityLimit(bool strict_capacity_limit);
+
+ // Set percentage of capacity reserved for high-pri cache entries.
+ void SetHighPriorityPoolRatio(double high_pri_pool_ratio);
+
+ // Set percentage of capacity reserved for low-pri cache entries.
+ void SetLowPriorityPoolRatio(double low_pri_pool_ratio);
+
+ // Like Cache methods, but with an extra "hash" parameter.
+ inline Status Insert(const Slice& key, uint32_t hash, void* value,
+ size_t charge, Cache::DeleterFn deleter,
+ LRUHandle** handle, Cache::Priority priority) {
+ return Insert(key, hash, value, charge, deleter, nullptr, handle, priority);
+ }
+ inline Status Insert(const Slice& key, uint32_t hash, void* value,
+ const Cache::CacheItemHelper* helper, size_t charge,
+ LRUHandle** handle, Cache::Priority priority) {
+ assert(helper);
+ return Insert(key, hash, value, charge, nullptr, helper, handle, priority);
+ }
+ // If helper_cb is null, the values of the following arguments don't matter.
+ LRUHandle* Lookup(const Slice& key, uint32_t hash,
+ const Cache::CacheItemHelper* helper,
+ const Cache::CreateCallback& create_cb,
+ Cache::Priority priority, bool wait, Statistics* stats);
+ inline LRUHandle* Lookup(const Slice& key, uint32_t hash) {
+ return Lookup(key, hash, nullptr, nullptr, Cache::Priority::LOW, true,
+ nullptr);
+ }
+ bool Release(LRUHandle* handle, bool useful, bool erase_if_last_ref);
+ bool IsReady(LRUHandle* /*handle*/);
+ void Wait(LRUHandle* /*handle*/) {}
+ bool Ref(LRUHandle* handle);
+ void Erase(const Slice& key, uint32_t hash);
+
+ // Although in some platforms the update of size_t is atomic, to make sure
+ // GetUsage() and GetPinnedUsage() work correctly under any platform, we'll
+ // protect them with mutex_.
+
+ size_t GetUsage() const;
+ size_t GetPinnedUsage() const;
+ size_t GetOccupancyCount() const;
+ size_t GetTableAddressCount() const;
+
+ void ApplyToSomeEntries(
+ const std::function<void(const Slice& key, void* value, size_t charge,
+ DeleterFn deleter)>& callback,
+ size_t average_entries_per_lock, size_t* state);
+
+ void EraseUnRefEntries();
+
+ public: // other function definitions
+ void TEST_GetLRUList(LRUHandle** lru, LRUHandle** lru_low_pri,
+ LRUHandle** lru_bottom_pri);
+
+ // Retrieves number of elements in LRU, for unit test purpose only.
+ // Not threadsafe.
+ size_t TEST_GetLRUSize();
+
+ // Retrieves high pri pool ratio
+ double GetHighPriPoolRatio();
+
+ // Retrieves low pri pool ratio
+ double GetLowPriPoolRatio();
+
+ void AppendPrintableOptions(std::string& /*str*/) const;
+
+ private:
+ friend class LRUCache;
+ // Insert an item into the hash table and, if handle is null, insert into
+ // the LRU list. Older items are evicted as necessary. If the cache is full
+ // and free_handle_on_fail is true, the item is deleted and handle is set to
+ // nullptr.
+ Status InsertItem(LRUHandle* item, LRUHandle** handle,
+ bool free_handle_on_fail);
+ Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge,
+ DeleterFn deleter, const Cache::CacheItemHelper* helper,
+ LRUHandle** handle, Cache::Priority priority);
+ // Promote an item looked up from the secondary cache to the LRU cache.
+ // The item may be still in the secondary cache.
+ // It is only inserted into the hash table and not the LRU list, and only
+ // if the cache is not at full capacity, as is the case during Insert. The
+ // caller should hold a reference on the LRUHandle. When the caller releases
+ // the last reference, the item is added to the LRU list.
+ // The item is promoted to the high pri or low pri pool as specified by the
+ // caller in Lookup.
+ void Promote(LRUHandle* e);
+ void LRU_Remove(LRUHandle* e);
+ void LRU_Insert(LRUHandle* e);
+
+ // Overflow the last entry in high-pri pool to low-pri pool until size of
+ // high-pri pool is no larger than the size specified by high_pri_pool_ratio_.
+ void MaintainPoolSize();
+
+ // Free some space following strict LRU policy until enough space
+ // to hold (usage_ + charge) is freed or the lru list is empty
+ // This function is not thread safe - it needs to be executed while
+ // holding the mutex_.
+ void EvictFromLRU(size_t charge, autovector<LRUHandle*>* deleted);
+
+ // Try to insert the evicted handles into the secondary cache.
+ void TryInsertIntoSecondaryCache(autovector<LRUHandle*> evicted_handles);
+
+ // Initialized before use.
+ size_t capacity_;
+
+ // Memory size for entries in high-pri pool.
+ size_t high_pri_pool_usage_;
+
+ // Memory size for entries in low-pri pool.
+ size_t low_pri_pool_usage_;
+
+ // Whether to reject insertion if cache reaches its full capacity.
+ bool strict_capacity_limit_;
+
+ // Ratio of capacity reserved for high priority cache entries.
+ double high_pri_pool_ratio_;
+
+ // High-pri pool size, equals to capacity * high_pri_pool_ratio.
+ // Remember the value to avoid recomputing each time.
+ double high_pri_pool_capacity_;
+
+ // Ratio of capacity reserved for low priority cache entries.
+ double low_pri_pool_ratio_;
+
+ // Low-pri pool size, equals to capacity * low_pri_pool_ratio.
+ // Remember the value to avoid recomputing each time.
+ double low_pri_pool_capacity_;
+
+ // Dummy head of LRU list.
+ // lru.prev is newest entry, lru.next is oldest entry.
+ // LRU contains items which can be evicted, i.e. referenced only by the cache.
+ LRUHandle lru_;
+
+ // Pointer to head of low-pri pool in LRU list.
+ LRUHandle* lru_low_pri_;
+
+ // Pointer to head of bottom-pri pool in LRU list.
+ LRUHandle* lru_bottom_pri_;
+
+ // ------------^^^^^^^^^^^^^-----------
+ // Not frequently modified data members
+ // ------------------------------------
+ //
+ // We separate data members that are updated frequently from the ones that
+ // are not frequently updated so that they don't share the same cache line,
+ // which would lead to false sharing.
+ //
+ // ------------------------------------
+ // Frequently modified data members
+ // ------------vvvvvvvvvvvvv-----------
+ LRUHandleTable table_;
+
+ // Memory size for entries residing in the cache.
+ size_t usage_;
+
+ // Memory size for entries residing only in the LRU list.
+ size_t lru_usage_;
+
+ // mutex_ protects the following state.
+ // We don't count mutex_ as the cache's internal state so semantically we
+ // don't mind mutex_ invoking the non-const actions.
+ mutable DMutex mutex_;
+
+ // Owned by LRUCache
+ SecondaryCache* secondary_cache_;
+};
+
+class LRUCache  // A ShardedCache whose shards are LRUCacheShard instances.
+#ifdef NDEBUG
+ final  // only marked final when NDEBUG is defined
+#endif
+ : public ShardedCache<LRUCacheShard> {
+ public:
+ LRUCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit,
+ double high_pri_pool_ratio, double low_pri_pool_ratio,
+ std::shared_ptr<MemoryAllocator> memory_allocator = nullptr,
+ bool use_adaptive_mutex = kDefaultToAdaptiveMutex,
+ CacheMetadataChargePolicy metadata_charge_policy =
+ kDontChargeCacheMetadata,
+ std::shared_ptr<SecondaryCache> secondary_cache = nullptr);
+ const char* Name() const override { return "LRUCache"; }  // fixed identifying name
+ void* Value(Handle* handle) override;
+ size_t GetCharge(Handle* handle) const override;
+ DeleterFn GetDeleter(Handle* handle) const override;
+ void WaitAll(std::vector<Handle*>& handles) override;
+
+ // Returns the number of elements in the LRU list; for unit tests only.
+ size_t TEST_GetLRUSize();
+ // Returns the high-pri pool ratio.
+ double GetHighPriPoolRatio();
+
+ void AppendPrintableOptions(std::string& str) const override;
+
+ private:
+ std::shared_ptr<SecondaryCache> secondary_cache_;  // shared reference to the SecondaryCache
+};
+
+} // namespace lru_cache
+
+using LRUCache = lru_cache::LRUCache;  // expose the lru_cache:: types under their unqualified names
+using LRUHandle = lru_cache::LRUHandle;
+using LRUCacheShard = lru_cache::LRUCacheShard;
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/lru_cache_test.cc b/src/rocksdb/cache/lru_cache_test.cc
new file mode 100644
index 000000000..7904a196d
--- /dev/null
+++ b/src/rocksdb/cache/lru_cache_test.cc
@@ -0,0 +1,2624 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "cache/lru_cache.h"
+
+#include <string>
+#include <vector>
+
+#include "cache/cache_key.h"
+#include "cache/clock_cache.h"
+#include "db/db_test_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/sst_file_manager.h"
+#include "rocksdb/utilities/cache_dump_load.h"
+#include "test_util/testharness.h"
+#include "util/coding.h"
+#include "util/random.h"
+#include "utilities/cache_dump_load_impl.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class LRUCacheTest : public testing::Test {  // Fixture that drives a single LRUCacheShard directly.
+ public:
+ LRUCacheTest() {}
+ ~LRUCacheTest() override { DeleteCache(); }
+
+ void DeleteCache() {  // Destroys and frees the current shard, if any.
+ if (cache_ != nullptr) {
+ cache_->~LRUCacheShard();  // explicit destructor call: NewCache() built the shard with placement new
+ port::cacheline_aligned_free(cache_);  // pairs with cacheline_aligned_alloc() in NewCache()
+ cache_ = nullptr;
+ }
+ }
+
+ void NewCache(size_t capacity, double high_pri_pool_ratio = 0.0,
+ double low_pri_pool_ratio = 1.0,
+ bool use_adaptive_mutex = kDefaultToAdaptiveMutex) {
+ DeleteCache();  // replace any shard left over from an earlier call
+ cache_ = reinterpret_cast<LRUCacheShard*>(
+ port::cacheline_aligned_alloc(sizeof(LRUCacheShard)));
+ new (cache_) LRUCacheShard(capacity, /*strict_capacity_limit=*/false,  // constructed in place in the aligned buffer
+ high_pri_pool_ratio, low_pri_pool_ratio,
+ use_adaptive_mutex, kDontChargeCacheMetadata,
+ /*max_upper_hash_bits=*/24,
+ /*secondary_cache=*/nullptr);
+ }
+
+ void Insert(const std::string& key,  // inserts a null value with charge 1 and hash 0; no handle is kept
+ Cache::Priority priority = Cache::Priority::LOW) {
+ EXPECT_OK(cache_->Insert(key, 0 /*hash*/, nullptr /*value*/, 1 /*charge*/,
+ nullptr /*deleter*/, nullptr /*handle*/,
+ priority));
+ }
+
+ void Insert(char key, Cache::Priority priority = Cache::Priority::LOW) {
+ Insert(std::string(1, key), priority);  // single-character key
+ }
+
+ bool Lookup(const std::string& key) {  // returns true iff the key was found
+ auto handle = cache_->Lookup(key, 0 /*hash*/);
+ if (handle) {
+ cache_->Release(handle, true /*useful*/, false /*erase*/);  // give the reference back right away
+ return true;
+ }
+ return false;
+ }
+
+ bool Lookup(char key) { return Lookup(std::string(1, key)); }
+
+ void Erase(const std::string& key) { cache_->Erase(key, 0 /*hash*/); }
+
+ void ValidateLRUList(std::vector<std::string> keys,  // expected keys, in list order starting at lru->next
+ size_t num_high_pri_pool_keys = 0,
+ size_t num_low_pri_pool_keys = 0,
+ size_t num_bottom_pri_pool_keys = 0) {
+ LRUHandle* lru;
+ LRUHandle* lru_low_pri;
+ LRUHandle* lru_bottom_pri;
+ cache_->TEST_GetLRUList(&lru, &lru_low_pri, &lru_bottom_pri);
+
+ LRUHandle* iter = lru;  // the walk starts at `lru` and must arrive back at it after the last key
+
+ bool in_low_pri_pool = false;  // both flags false means the walk is in the bottom-pri pool
+ bool in_high_pri_pool = false;
+
+ size_t high_pri_pool_keys = 0;  // per-pool tallies, compared with the expected counts at the end
+ size_t low_pri_pool_keys = 0;
+ size_t bottom_pri_pool_keys = 0;
+
+ if (iter == lru_bottom_pri) {  // bottom-pri marker is `lru` itself: entries start in the low-pri pool
+ in_low_pri_pool = true;
+ in_high_pri_pool = false;
+ }
+ if (iter == lru_low_pri) {  // low-pri marker is `lru` itself: entries start in the high-pri pool
+ in_low_pri_pool = false;
+ in_high_pri_pool = true;
+ }
+
+ for (const auto& key : keys) {
+ iter = iter->next;
+ ASSERT_NE(lru, iter);  // the list must not end before all expected keys are seen
+ ASSERT_EQ(key, iter->key().ToString());
+ ASSERT_EQ(in_high_pri_pool, iter->InHighPriPool());  // the handle's pool flags must match the walk position
+ ASSERT_EQ(in_low_pri_pool, iter->InLowPriPool());
+ if (in_high_pri_pool) {
+ ASSERT_FALSE(iter->InLowPriPool());
+ high_pri_pool_keys++;
+ } else if (in_low_pri_pool) {
+ ASSERT_FALSE(iter->InHighPriPool());
+ low_pri_pool_keys++;
+ } else {
+ bottom_pri_pool_keys++;
+ }
+ if (iter == lru_bottom_pri) {  // reached the bottom-pri marker: following entries are low-pri
+ ASSERT_FALSE(in_low_pri_pool);
+ ASSERT_FALSE(in_high_pri_pool);
+ in_low_pri_pool = true;
+ in_high_pri_pool = false;
+ }
+ if (iter == lru_low_pri) {  // reached the low-pri marker: following entries are high-pri
+ ASSERT_TRUE(in_low_pri_pool);
+ ASSERT_FALSE(in_high_pri_pool);
+ in_low_pri_pool = false;
+ in_high_pri_pool = true;
+ }
+ }
+ ASSERT_EQ(lru, iter->next);  // no entries beyond the expected keys
+ ASSERT_FALSE(in_low_pri_pool);  // both markers must have been passed by the end of the walk
+ ASSERT_TRUE(in_high_pri_pool);
+ ASSERT_EQ(num_high_pri_pool_keys, high_pri_pool_keys);
+ ASSERT_EQ(num_low_pri_pool_keys, low_pri_pool_keys);
+ ASSERT_EQ(num_bottom_pri_pool_keys, bottom_pri_pool_keys);
+ }
+
+ private:
+ LRUCacheShard* cache_ = nullptr;  // shard under test; allocated in NewCache(), released in DeleteCache()
+};
+
+TEST_F(LRUCacheTest, BasicLRU) {  // Eviction order and hit/erase handling with a single (low-pri) pool.
+ NewCache(5);  // fixture defaults: high-pri ratio 0.0, low-pri ratio 1.0
+ for (char ch = 'a'; ch <= 'e'; ch++) {
+ Insert(ch);
+ }
+ ValidateLRUList({"a", "b", "c", "d", "e"}, 0, 5);  // expected order as walked by ValidateLRUList; all 5 low-pri
+ for (char ch = 'x'; ch <= 'z'; ch++) {
+ Insert(ch);
+ }
+ ValidateLRUList({"d", "e", "x", "y", "z"}, 0, 5);  // a, b and c are gone
+ ASSERT_FALSE(Lookup("b"));  // a miss leaves the list unchanged
+ ValidateLRUList({"d", "e", "x", "y", "z"}, 0, 5);
+ ASSERT_TRUE(Lookup("e"));  // a hit moves the entry to the end of the list
+ ValidateLRUList({"d", "x", "y", "z", "e"}, 0, 5);
+ ASSERT_TRUE(Lookup("z"));
+ ValidateLRUList({"d", "x", "y", "e", "z"}, 0, 5);
+ Erase("x");  // removal from the middle of the list
+ ValidateLRUList({"d", "y", "e", "z"}, 0, 4);
+ ASSERT_TRUE(Lookup("d"));
+ ValidateLRUList({"y", "e", "z", "d"}, 0, 4);
+ Insert("u");  // fills the freed slot without evicting anything
+ ValidateLRUList({"y", "e", "z", "d", "u"}, 0, 5);
+ Insert("v");  // cache is full again: "y" at the front is dropped
+ ValidateLRUList({"e", "z", "d", "u", "v"}, 0, 5);
+}
+
+TEST_F(LRUCacheTest, LowPriorityMidpointInsertion) {
+ // Allocate 2 cache entries to high-pri pool and 3 to low-pri pool.
+ NewCache(5, /* high_pri_pool_ratio */ 0.40, /* low_pri_pool_ratio */ 0.60);  // 5 * 0.40 = 2, 5 * 0.60 = 3
+
+ Insert("a", Cache::Priority::LOW);
+ Insert("b", Cache::Priority::LOW);
+ Insert("c", Cache::Priority::LOW);
+ Insert("x", Cache::Priority::HIGH);
+ Insert("y", Cache::Priority::HIGH);
+ ValidateLRUList({"a", "b", "c", "x", "y"}, 2, 3);  // low-pri a, b, c followed by high-pri x, y
+
+ // Low-pri entries are inserted at the tail of the low-pri list (the midpoint).
+ // After a lookup, the entry moves to the tail of the full list.
+ Insert("d", Cache::Priority::LOW);
+ ValidateLRUList({"b", "c", "d", "x", "y"}, 2, 3);  // "a" is dropped; "d" sits ahead of x and y
+ ASSERT_TRUE(Lookup("d"));
+ ValidateLRUList({"b", "c", "x", "y", "d"}, 2, 3);  // pool sizes stay 2 and 3, so "x" is now counted as low-pri
+
+ // High-pri entries will be inserted to the tail of full list.
+ Insert("z", Cache::Priority::HIGH);
+ ValidateLRUList({"c", "x", "y", "d", "z"}, 2, 3);
+}
+
+TEST_F(LRUCacheTest, BottomPriorityMidpointInsertion) {
+ // Allocate 2 cache entries to high-pri pool and 2 to low-pri pool.
+ NewCache(6, /* high_pri_pool_ratio */ 0.35, /* low_pri_pool_ratio */ 0.35);  // 6 * 0.35 = 2.1 per pool
+
+ Insert("a", Cache::Priority::BOTTOM);
+ Insert("b", Cache::Priority::BOTTOM);
+ Insert("i", Cache::Priority::LOW);
+ Insert("j", Cache::Priority::LOW);
+ Insert("x", Cache::Priority::HIGH);
+ Insert("y", Cache::Priority::HIGH);
+ ValidateLRUList({"a", "b", "i", "j", "x", "y"}, 2, 2, 2);  // expected counts: high, low, bottom
+
+ // Low-pri entries will be inserted to the tail of low-pri list (the
+ // midpoint). After lookup, 'k' will move to the tail of the full list, and
+ // 'x' will spill over to the low-pri pool.
+ Insert("k", Cache::Priority::LOW);
+ ValidateLRUList({"b", "i", "j", "k", "x", "y"}, 2, 2, 2);
+ ASSERT_TRUE(Lookup("k"));
+ ValidateLRUList({"b", "i", "j", "x", "y", "k"}, 2, 2, 2);
+
+ // High-pri entries will be inserted to the tail of full list. Although y was
+ // inserted with high priority, it got spilled over to the low-pri pool. As
+ // a result, j also got spilled over to the bottom-pri pool.
+ Insert("z", Cache::Priority::HIGH);
+ ValidateLRUList({"i", "j", "x", "y", "k", "z"}, 2, 2, 2);
+ Erase("x");  // x and y are the two low-pri entries at this point
+ ValidateLRUList({"i", "j", "y", "k", "z"}, 2, 1, 2);
+ Erase("y");
+ ValidateLRUList({"i", "j", "k", "z"}, 2, 0, 2);  // the low-pri pool is now empty
+
+ // Bottom-pri entries will be inserted to the tail of bottom-pri list.
+ Insert("c", Cache::Priority::BOTTOM);
+ ValidateLRUList({"i", "j", "c", "k", "z"}, 2, 0, 3);
+ Insert("d", Cache::Priority::BOTTOM);
+ ValidateLRUList({"i", "j", "c", "d", "k", "z"}, 2, 0, 4);
+ Insert("e", Cache::Priority::BOTTOM);
+ ValidateLRUList({"j", "c", "d", "e", "k", "z"}, 2, 0, 4);  // the cache was full, so "i" is dropped
+
+ // Low-pri entries will be inserted to the tail of low-pri list (the
+ // midpoint).
+ Insert("l", Cache::Priority::LOW);
+ ValidateLRUList({"c", "d", "e", "l", "k", "z"}, 2, 1, 3);
+ Insert("m", Cache::Priority::LOW);
+ ValidateLRUList({"d", "e", "l", "m", "k", "z"}, 2, 2, 2);
+
+ Erase("k");  // k and z are the two high-pri entries at this point
+ ValidateLRUList({"d", "e", "l", "m", "z"}, 1, 2, 2);
+ Erase("z");
+ ValidateLRUList({"d", "e", "l", "m"}, 0, 2, 2);  // the high-pri pool is now empty
+
+ // Bottom-pri entries will be inserted to the tail of bottom-pri list.
+ Insert("f", Cache::Priority::BOTTOM);
+ ValidateLRUList({"d", "e", "f", "l", "m"}, 0, 2, 3);
+ Insert("g", Cache::Priority::BOTTOM);
+ ValidateLRUList({"d", "e", "f", "g", "l", "m"}, 0, 2, 4);
+
+ // High-pri entries will be inserted to the tail of full list.
+ Insert("o", Cache::Priority::HIGH);
+ ValidateLRUList({"e", "f", "g", "l", "m", "o"}, 1, 2, 3);
+ Insert("p", Cache::Priority::HIGH);
+ ValidateLRUList({"f", "g", "l", "m", "o", "p"}, 2, 2, 2);
+}
+
+TEST_F(LRUCacheTest, EntriesWithPriority) {
+ // Allocate 2 cache entries to high-pri pool and 2 to low-pri pool.
+ NewCache(6, /* high_pri_pool_ratio */ 0.35, /* low_pri_pool_ratio */ 0.35);  // 6 * 0.35 = 2.1 per pool
+
+ Insert("a", Cache::Priority::LOW);
+ Insert("b", Cache::Priority::LOW);
+ ValidateLRUList({"a", "b"}, 0, 2, 0);  // expected counts: high, low, bottom
+ // Low-pri entries can overflow to bottom-pri pool.
+ Insert("c", Cache::Priority::LOW);
+ ValidateLRUList({"a", "b", "c"}, 0, 2, 1);
+
+ // Bottom-pri entries can take high-pri pool capacity if available
+ Insert("t", Cache::Priority::LOW);
+ Insert("u", Cache::Priority::LOW);
+ ValidateLRUList({"a", "b", "c", "t", "u"}, 0, 2, 3);
+ Insert("v", Cache::Priority::LOW);
+ ValidateLRUList({"a", "b", "c", "t", "u", "v"}, 0, 2, 4);
+ Insert("w", Cache::Priority::LOW);
+ ValidateLRUList({"b", "c", "t", "u", "v", "w"}, 0, 2, 4);  // the cache was full, so "a" is dropped
+
+ Insert("X", Cache::Priority::HIGH);
+ Insert("Y", Cache::Priority::HIGH);
+ ValidateLRUList({"t", "u", "v", "w", "X", "Y"}, 2, 2, 2);
+
+ // After inserting 'Z', the high-pri entry 'X' got spilled over to the low-pri
+ // pool. The low-pri entry 'v' got spilled over to the bottom-pri pool.
+ Insert("Z", Cache::Priority::HIGH);
+ ValidateLRUList({"u", "v", "w", "X", "Y", "Z"}, 2, 2, 2);
+
+ // Low-pri entries will be inserted to head of low-pri pool.
+ Insert("a", Cache::Priority::LOW);
+ ValidateLRUList({"v", "w", "X", "a", "Y", "Z"}, 2, 2, 2);
+
+ // After lookup, the high-pri entry 'Y' got spilled over to the low-pri pool.
+ // The low-pri entry 'X' got spilled over to the bottom-pri pool.
+ ASSERT_TRUE(Lookup("v"));
+ ValidateLRUList({"w", "X", "a", "Y", "Z", "v"}, 2, 2, 2);
+
+ // After lookup, the high-pri entry 'Z' got spilled over to the low-pri pool.
+ // The low-pri entry 'a' got spilled over to the bottom-pri pool.
+ ASSERT_TRUE(Lookup("X"));
+ ValidateLRUList({"w", "a", "Y", "Z", "v", "X"}, 2, 2, 2);
+
+ // After lookup, the low pri entry 'Z' got promoted back to high-pri pool. The
+ // high-pri entry 'v' got spilled over to the low-pri pool.
+ ASSERT_TRUE(Lookup("Z"));
+ ValidateLRUList({"w", "a", "Y", "v", "X", "Z"}, 2, 2, 2);
+
+ Erase("Y");  // "Y" is a low-pri entry at this point
+ ValidateLRUList({"w", "a", "v", "X", "Z"}, 2, 1, 2);
+ Erase("X");  // "X" is a high-pri entry at this point
+ ValidateLRUList({"w", "a", "v", "Z"}, 1, 1, 2);
+
+ Insert("d", Cache::Priority::LOW);
+ Insert("e", Cache::Priority::LOW);
+ ValidateLRUList({"w", "a", "v", "d", "e", "Z"}, 1, 2, 3);
+
+ Insert("f", Cache::Priority::LOW);
+ Insert("g", Cache::Priority::LOW);
+ ValidateLRUList({"v", "d", "e", "f", "g", "Z"}, 1, 2, 3);
+ ASSERT_TRUE(Lookup("d"));  // "d" moves to the end of the list and is counted as high-pri below
+ ValidateLRUList({"v", "e", "f", "g", "Z", "d"}, 2, 2, 2);
+
+ // Erase some entries.
+ Erase("e");
+ Erase("f");
+ Erase("Z");
+ ValidateLRUList({"v", "g", "d"}, 1, 1, 1);
+
+ // Bottom-pri entries can take low- and high-pri pool capacity if available
+ Insert("o", Cache::Priority::BOTTOM);
+ ValidateLRUList({"v", "o", "g", "d"}, 1, 1, 2);
+ Insert("p", Cache::Priority::BOTTOM);
+ ValidateLRUList({"v", "o", "p", "g", "d"}, 1, 1, 3);
+ Insert("q", Cache::Priority::BOTTOM);
+ ValidateLRUList({"v", "o", "p", "q", "g", "d"}, 1, 1, 4);
+
+ // High-pri entries can overflow to low-pri pool, and bottom-pri entries will
+ // be evicted.
+ Insert("x", Cache::Priority::HIGH);
+ ValidateLRUList({"o", "p", "q", "g", "d", "x"}, 2, 1, 3);
+ Insert("y", Cache::Priority::HIGH);
+ ValidateLRUList({"p", "q", "g", "d", "x", "y"}, 2, 2, 2);
+ Insert("z", Cache::Priority::HIGH);
+ ValidateLRUList({"q", "g", "d", "x", "y", "z"}, 2, 2, 2);
+
+ // 'g' is bottom-pri before this lookup; it will be inserted to head of
+ // high-pri pool after lookup.
+ ASSERT_TRUE(Lookup("g"));
+ ValidateLRUList({"q", "d", "x", "y", "z", "g"}, 2, 2, 2);
+
+ // High-pri entries will be inserted to head of high-pri pool after lookup.
+ ASSERT_TRUE(Lookup("z"));
+ ValidateLRUList({"q", "d", "x", "y", "g", "z"}, 2, 2, 2);
+
+ // Bottom-pri entries will be inserted to head of high-pri pool after lookup.
+ ASSERT_TRUE(Lookup("d"));
+ ValidateLRUList({"q", "x", "y", "g", "z", "d"}, 2, 2, 2);
+
+ // Bottom-pri entries will be inserted to the tail of bottom-pri list.
+ Insert("m", Cache::Priority::BOTTOM);
+ ValidateLRUList({"x", "m", "y", "g", "z", "d"}, 2, 2, 2);
+
+ // Bottom-pri entries will be inserted to head of high-pri pool after lookup.
+ ASSERT_TRUE(Lookup("m"));
+ ValidateLRUList({"x", "y", "g", "z", "d", "m"}, 2, 2, 2);
+}
+
+namespace clock_cache {
+
+class ClockCacheTest : public testing::Test {  // Fixture that drives a single HyperClockCache shard directly.
+ public:
+ using Shard = HyperClockCache::Shard;
+ using Table = HyperClockTable;
+ using HandleImpl = Shard::HandleImpl;
+
+ ClockCacheTest() {}
+ ~ClockCacheTest() override { DeleteShard(); }
+
+ void DeleteShard() {  // Destroys and frees the current shard, if any.
+ if (shard_ != nullptr) {
+ shard_->~ClockCacheShard();  // explicit destructor call: NewShard() built the shard with placement new
+ port::cacheline_aligned_free(shard_);  // pairs with cacheline_aligned_alloc() in NewShard()
+ shard_ = nullptr;
+ }
+ }
+
+ void NewShard(size_t capacity, bool strict_capacity_limit = true) {
+ DeleteShard();  // replace any shard left over from an earlier call
+ shard_ =
+ reinterpret_cast<Shard*>(port::cacheline_aligned_alloc(sizeof(Shard)));
+
+ Table::Opts opts;
+ opts.estimated_value_size = 1;  // the fixture's Insert helpers use charge 1
+ new (shard_)
+ Shard(capacity, strict_capacity_limit, kDontChargeCacheMetadata, opts);
+ }
+
+ Status Insert(const UniqueId64x2& hashed_key,  // key bytes are derived from hashed_key via TestKey()
+ Cache::Priority priority = Cache::Priority::LOW) {
+ return shard_->Insert(TestKey(hashed_key), hashed_key, nullptr /*value*/,
+ 1 /*charge*/, nullptr /*deleter*/, nullptr /*handle*/,
+ priority);
+ }
+
+ Status Insert(char key, Cache::Priority priority = Cache::Priority::LOW) {
+ return Insert(TestHashedKey(key), priority);
+ }
+
+ Status InsertWithLen(char key, size_t len) {  // inserts a key made of `len` copies of `key`
+ std::string skey(len, key);
+ return shard_->Insert(skey, TestHashedKey(key), nullptr /*value*/,
+ 1 /*charge*/, nullptr /*deleter*/, nullptr /*handle*/,
+ Cache::Priority::LOW);
+ }
+
+ bool Lookup(const Slice& key, const UniqueId64x2& hashed_key,  // returns true iff the key was found
+ bool useful = true) {
+ auto handle = shard_->Lookup(key, hashed_key);
+ if (handle) {
+ shard_->Release(handle, useful, /*erase_if_last_ref=*/false);  // give the reference back right away
+ return true;
+ }
+ return false;
+ }
+
+ bool Lookup(const UniqueId64x2& hashed_key, bool useful = true) {
+ return Lookup(TestKey(hashed_key), hashed_key, useful);
+ }
+
+ bool Lookup(char key, bool useful = true) {
+ return Lookup(TestHashedKey(key), useful);
+ }
+
+ void Erase(char key) {
+ UniqueId64x2 hashed_key = TestHashedKey(key);
+ shard_->Erase(TestKey(hashed_key), hashed_key);
+ }
+
+ static inline Slice TestKey(const UniqueId64x2& hashed_key) {
+ return Slice(reinterpret_cast<const char*>(&hashed_key), 16U);  // the key is the 16 raw bytes of hashed_key
+ }
+
+ static inline UniqueId64x2 TestHashedKey(char key) {
+ // For testing hash near-collision behavior, put the variance in
+ // hashed_key in bits that are unlikely to be used as hash bits.
+ return {(static_cast<uint64_t>(key) << 56) + 1234U, 5678U};  // only the top byte of the first word varies with `key`
+ }
+
+ Shard* shard_ = nullptr;  // shard under test; allocated in NewShard(), released in DeleteShard()
+};
+
+TEST_F(ClockCacheTest, Misc) {
+ NewShard(3);
+
+ // Key size checks: only 16-byte keys are expected to be accepted.
+ EXPECT_OK(InsertWithLen('a', 16));
+ EXPECT_NOK(InsertWithLen('b', 15));
+ EXPECT_OK(InsertWithLen('b', 16));
+ EXPECT_NOK(InsertWithLen('c', 17));
+ EXPECT_NOK(InsertWithLen('d', 1000));
+ EXPECT_NOK(InsertWithLen('e', 11));
+ EXPECT_NOK(InsertWithLen('f', 0));
+
+ // Some of this is motivated by code coverage
+ std::string wrong_size_key(15, 'x');
+ EXPECT_FALSE(Lookup(wrong_size_key, TestHashedKey('x')));  // a 15-byte key is never found
+ EXPECT_FALSE(shard_->Ref(nullptr));  // null handles are tolerated and reported as false
+ EXPECT_FALSE(shard_->Release(nullptr));
+ shard_->Erase(wrong_size_key, TestHashedKey('x')); // no-op
+}
+
+TEST_F(ClockCacheTest, Limits) {
+ constexpr size_t kCapacity = 3;
+ NewShard(kCapacity, false /*strict_capacity_limit*/);
+ for (bool strict_capacity_limit : {false, true, false}) {  // off -> on -> off on the same shard
+ SCOPED_TRACE("strict_capacity_limit = " +
+ std::to_string(strict_capacity_limit));
+
+ // Also tests switching between strict limit and not
+ shard_->SetStrictCapacityLimit(strict_capacity_limit);
+
+ UniqueId64x2 hkey = TestHashedKey('x');
+
+ // Single entry charge beyond capacity
+ {
+ Status s = shard_->Insert(TestKey(hkey), hkey, nullptr /*value*/,
+ 5 /*charge*/, nullptr /*deleter*/,
+ nullptr /*handle*/, Cache::Priority::LOW);
+ if (strict_capacity_limit) {
+ EXPECT_TRUE(s.IsMemoryLimit());  // charge 5 exceeds kCapacity 3
+ } else {
+ EXPECT_OK(s);
+ }
+ }
+
+ // Single entry fills capacity
+ {
+ HandleImpl* h;
+ ASSERT_OK(shard_->Insert(TestKey(hkey), hkey, nullptr /*value*/,
+ 3 /*charge*/, nullptr /*deleter*/, &h,
+ Cache::Priority::LOW));
+ // Try to insert more
+ Status s = Insert('a');  // `h` is still held here
+ if (strict_capacity_limit) {
+ EXPECT_TRUE(s.IsMemoryLimit());
+ } else {
+ EXPECT_OK(s);
+ }
+ // Release entry filling capacity.
+ // Cover useful = false case.
+ shard_->Release(h, false /*useful*/, false /*erase_if_last_ref*/);
+ }
+
+ // Insert more than table size can handle to exceed occupancy limit.
+ // (Cleverly using mostly zero-charge entries, but some non-zero to
+ // verify usage tracking on detached entries.)
+ {
+ size_t n = shard_->GetTableAddressCount() + 1;
+ std::unique_ptr<HandleImpl* []> ha { new HandleImpl* [n] {} };
+ Status s;
+ for (size_t i = 0; i < n && s.ok(); ++i) {  // stops at the first failed insert
+ hkey[1] = i;  // make each inserted key distinct
+ s = shard_->Insert(TestKey(hkey), hkey, nullptr /*value*/,
+ (i + kCapacity < n) ? 0 : 1 /*charge*/,  // only the last kCapacity indices carry a charge
+ nullptr /*deleter*/, &ha[i], Cache::Priority::LOW);
+ if (i == 0) {
+ EXPECT_OK(s);
+ }
+ }
+ if (strict_capacity_limit) {
+ EXPECT_TRUE(s.IsMemoryLimit());
+ } else {
+ EXPECT_OK(s);
+ }
+ // Same result if not keeping a reference
+ s = Insert('a');
+ if (strict_capacity_limit) {
+ EXPECT_TRUE(s.IsMemoryLimit());
+ } else {
+ EXPECT_OK(s);
+ }
+
+ // Regardless, we didn't allow table to actually get full
+ EXPECT_LT(shard_->GetOccupancyCount(), shard_->GetTableAddressCount());
+
+ // Release handles
+ for (size_t i = 0; i < n; ++i) {
+ if (ha[i]) {  // slots never filled by the insert loop are still null
+ shard_->Release(ha[i]);
+ }
+ }
+ }
+ }
+}
+
+TEST_F(ClockCacheTest, ClockEvictionTest) {
+ for (bool strict_capacity_limit : {false, true}) {  // the same expectations must hold in both modes
+ SCOPED_TRACE("strict_capacity_limit = " +
+ std::to_string(strict_capacity_limit));
+
+ NewShard(6, strict_capacity_limit);  // each fixture Insert uses charge 1
+ EXPECT_OK(Insert('a', Cache::Priority::BOTTOM));
+ EXPECT_OK(Insert('b', Cache::Priority::LOW));
+ EXPECT_OK(Insert('c', Cache::Priority::HIGH));
+ EXPECT_OK(Insert('d', Cache::Priority::BOTTOM));
+ EXPECT_OK(Insert('e', Cache::Priority::LOW));
+ EXPECT_OK(Insert('f', Cache::Priority::HIGH));
+
+ EXPECT_TRUE(Lookup('a', /*use*/ false));  // all six inserts are present
+ EXPECT_TRUE(Lookup('b', /*use*/ false));
+ EXPECT_TRUE(Lookup('c', /*use*/ false));
+ EXPECT_TRUE(Lookup('d', /*use*/ false));
+ EXPECT_TRUE(Lookup('e', /*use*/ false));
+ EXPECT_TRUE(Lookup('f', /*use*/ false));
+
+ // Ensure bottom are evicted first, even if new entries are low
+ EXPECT_OK(Insert('g', Cache::Priority::LOW));
+ EXPECT_OK(Insert('h', Cache::Priority::LOW));
+
+ EXPECT_FALSE(Lookup('a', /*use*/ false));  // the BOTTOM entries a and d are gone
+ EXPECT_TRUE(Lookup('b', /*use*/ false));
+ EXPECT_TRUE(Lookup('c', /*use*/ false));
+ EXPECT_FALSE(Lookup('d', /*use*/ false));
+ EXPECT_TRUE(Lookup('e', /*use*/ false));
+ EXPECT_TRUE(Lookup('f', /*use*/ false));
+ // Mark g & h useful
+ EXPECT_TRUE(Lookup('g', /*use*/ true));
+ EXPECT_TRUE(Lookup('h', /*use*/ true));
+
+ // Then old LOW entries
+ EXPECT_OK(Insert('i', Cache::Priority::LOW));
+ EXPECT_OK(Insert('j', Cache::Priority::LOW));
+
+ EXPECT_FALSE(Lookup('b', /*use*/ false));  // the old LOW entries b and e are gone
+ EXPECT_TRUE(Lookup('c', /*use*/ false));
+ EXPECT_FALSE(Lookup('e', /*use*/ false));
+ EXPECT_TRUE(Lookup('f', /*use*/ false));
+ // Mark g & h useful once again
+ EXPECT_TRUE(Lookup('g', /*use*/ true));
+ EXPECT_TRUE(Lookup('h', /*use*/ true));
+ EXPECT_TRUE(Lookup('i', /*use*/ false));
+ EXPECT_TRUE(Lookup('j', /*use*/ false));
+
+ // Then old HIGH entries
+ EXPECT_OK(Insert('k', Cache::Priority::LOW));
+ EXPECT_OK(Insert('l', Cache::Priority::LOW));
+
+ EXPECT_FALSE(Lookup('c', /*use*/ false));  // the old HIGH entries c and f are gone
+ EXPECT_FALSE(Lookup('f', /*use*/ false));
+ EXPECT_TRUE(Lookup('g', /*use*/ false));
+ EXPECT_TRUE(Lookup('h', /*use*/ false));
+ EXPECT_TRUE(Lookup('i', /*use*/ false));
+ EXPECT_TRUE(Lookup('j', /*use*/ false));
+ EXPECT_TRUE(Lookup('k', /*use*/ false));
+ EXPECT_TRUE(Lookup('l', /*use*/ false));
+
+ // Then the (roughly) least recently useful
+ EXPECT_OK(Insert('m', Cache::Priority::HIGH));
+ EXPECT_OK(Insert('n', Cache::Priority::HIGH));
+
+ EXPECT_TRUE(Lookup('g', /*use*/ false));  // g and h were marked useful above and survive
+ EXPECT_TRUE(Lookup('h', /*use*/ false));
+ EXPECT_FALSE(Lookup('i', /*use*/ false));
+ EXPECT_FALSE(Lookup('j', /*use*/ false));
+ EXPECT_TRUE(Lookup('k', /*use*/ false));
+ EXPECT_TRUE(Lookup('l', /*use*/ false));
+
+ // Now try changing capacity down
+ shard_->SetCapacity(4);
+ // Insert to ensure evictions happen
+ EXPECT_OK(Insert('o', Cache::Priority::LOW));
+ EXPECT_OK(Insert('p', Cache::Priority::LOW));
+
+ EXPECT_FALSE(Lookup('g', /*use*/ false));  // only m, n, o and p are expected to remain
+ EXPECT_FALSE(Lookup('h', /*use*/ false));
+ EXPECT_FALSE(Lookup('k', /*use*/ false));
+ EXPECT_FALSE(Lookup('l', /*use*/ false));
+ EXPECT_TRUE(Lookup('m', /*use*/ false));
+ EXPECT_TRUE(Lookup('n', /*use*/ false));
+ EXPECT_TRUE(Lookup('o', /*use*/ false));
+ EXPECT_TRUE(Lookup('p', /*use*/ false));
+
+ // Now try changing capacity up
+ EXPECT_TRUE(Lookup('m', /*use*/ true));  // mark m and n useful before growing
+ EXPECT_TRUE(Lookup('n', /*use*/ true));
+ shard_->SetCapacity(6);
+ EXPECT_OK(Insert('q', Cache::Priority::HIGH));
+ EXPECT_OK(Insert('r', Cache::Priority::HIGH));
+ EXPECT_OK(Insert('s', Cache::Priority::HIGH));
+ EXPECT_OK(Insert('t', Cache::Priority::HIGH));
+
+ EXPECT_FALSE(Lookup('o', /*use*/ false));  // o and p are gone; m, n and the four new entries remain
+ EXPECT_FALSE(Lookup('p', /*use*/ false));
+ EXPECT_TRUE(Lookup('m', /*use*/ false));
+ EXPECT_TRUE(Lookup('n', /*use*/ false));
+ EXPECT_TRUE(Lookup('q', /*use*/ false));
+ EXPECT_TRUE(Lookup('r', /*use*/ false));
+ EXPECT_TRUE(Lookup('s', /*use*/ false));
+ EXPECT_TRUE(Lookup('t', /*use*/ false));
+ }
+}
+
+void IncrementIntDeleter(const Slice& /*key*/, void* value) {  // test deleter: counts its own invocations
+ *reinterpret_cast<int*>(value) += 1;  // `value` must point to an int; the key is ignored
+}
+
+// Testing calls to CorrectNearOverflow in Release
+TEST_F(ClockCacheTest, ClockCounterOverflowTest) {
+ NewShard(6, /*strict_capacity_limit*/ false);
+ HandleImpl* h;
+ int deleted = 0;  // bumped by IncrementIntDeleter each time it runs on this entry
+ UniqueId64x2 hkey = TestHashedKey('x');
+ ASSERT_OK(shard_->Insert(TestKey(hkey), hkey, &deleted, 1,
+ IncrementIntDeleter, &h, Cache::Priority::HIGH));
+
+ // Some large number outstanding
+ shard_->TEST_RefN(h, 123456789);
+ // Simulate many lookup/ref + release, plenty to overflow counters
+ for (int i = 0; i < 10000; ++i) {
+ shard_->TEST_RefN(h, 1234567);  // every batch of refs is matched by an equal batch of releases
+ shard_->TEST_ReleaseN(h, 1234567);
+ }
+ // Mark it invisible (to reach a different CorrectNearOverflow() in Release)
+ shard_->Erase(TestKey(hkey), hkey);
+ // Simulate many more lookup/ref + release (one-by-one would be too
+ // expensive for unit test)
+ for (int i = 0; i < 10000; ++i) {
+ shard_->TEST_RefN(h, 1234567);
+ shard_->TEST_ReleaseN(h, 1234567);
+ }
+ // Free all but last 1
+ shard_->TEST_ReleaseN(h, 123456789);  // undoes the large TEST_RefN above; the ref from Insert remains
+ // Still alive
+ ASSERT_EQ(deleted, 0);
+ // Free last ref, which will finalize erasure
+ shard_->Release(h);
+ // Deleted
+ ASSERT_EQ(deleted, 1);  // the deleter ran exactly once
+}
+
+// This test is mostly to exercise some corner case logic, by forcing two
+// keys to have the same hash, and more
+TEST_F(ClockCacheTest, CollidingInsertEraseTest) {
+ NewShard(6, /*strict_capacity_limit*/ false);
+ int deleted = 0;  // shared counter: every entry below uses IncrementIntDeleter on it
+ UniqueId64x2 hkey1 = TestHashedKey('x');  // the three hashed keys differ only in the top byte of the first word
+ Slice key1 = TestKey(hkey1);
+ UniqueId64x2 hkey2 = TestHashedKey('y');
+ Slice key2 = TestKey(hkey2);
+ UniqueId64x2 hkey3 = TestHashedKey('z');
+ Slice key3 = TestKey(hkey3);
+ HandleImpl* h1;
+ ASSERT_OK(shard_->Insert(key1, hkey1, &deleted, 1, IncrementIntDeleter, &h1,
+ Cache::Priority::HIGH));
+ HandleImpl* h2;
+ ASSERT_OK(shard_->Insert(key2, hkey2, &deleted, 1, IncrementIntDeleter, &h2,
+ Cache::Priority::HIGH));
+ HandleImpl* h3;
+ ASSERT_OK(shard_->Insert(key3, hkey3, &deleted, 1, IncrementIntDeleter, &h3,
+ Cache::Priority::HIGH));
+
+ // Can repeatedly lookup+release despite the hash collision
+ HandleImpl* tmp_h;
+ for (bool erase_if_last_ref : {true, false}) { // but not last ref
+ tmp_h = shard_->Lookup(key1, hkey1);
+ ASSERT_EQ(h1, tmp_h);
+ ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref));  // h1 is still held, so nothing is erased
+
+ tmp_h = shard_->Lookup(key2, hkey2);
+ ASSERT_EQ(h2, tmp_h);
+ ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref));
+
+ tmp_h = shard_->Lookup(key3, hkey3);
+ ASSERT_EQ(h3, tmp_h);
+ ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref));
+ }
+
+ // Make h1 invisible
+ shard_->Erase(key1, hkey1);
+ // Redundant erase
+ shard_->Erase(key1, hkey1);
+
+ // All still alive
+ ASSERT_EQ(deleted, 0);
+
+ // Invisible to Lookup
+ tmp_h = shard_->Lookup(key1, hkey1);
+ ASSERT_EQ(nullptr, tmp_h);
+
+ // Can still find h2, h3
+ for (bool erase_if_last_ref : {true, false}) { // but not last ref
+ tmp_h = shard_->Lookup(key2, hkey2);
+ ASSERT_EQ(h2, tmp_h);
+ ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref));
+
+ tmp_h = shard_->Lookup(key3, hkey3);
+ ASSERT_EQ(h3, tmp_h);
+ ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref));
+ }
+
+ // Also Insert with invisible entry there
+ ASSERT_OK(shard_->Insert(key1, hkey1, &deleted, 1, IncrementIntDeleter,
+ nullptr, Cache::Priority::HIGH));
+ tmp_h = shard_->Lookup(key1, hkey1);
+ // Found but distinct handle
+ ASSERT_NE(nullptr, tmp_h);
+ ASSERT_NE(h1, tmp_h);
+ ASSERT_TRUE(shard_->Release(tmp_h, /*erase_if_last_ref*/ true));
+
+ // tmp_h deleted
+ ASSERT_EQ(deleted--, 1);  // the post-decrement also resets the counter to 0 for the next check
+
+ // Release last ref on h1 (already invisible)
+ ASSERT_TRUE(shard_->Release(h1, /*erase_if_last_ref*/ false));
+
+ // h1 deleted
+ ASSERT_EQ(deleted--, 1);
+ h1 = nullptr;  // h1 is not used again after this point
+
+ // Can still find h2, h3
+ for (bool erase_if_last_ref : {true, false}) { // but not last ref
+ tmp_h = shard_->Lookup(key2, hkey2);
+ ASSERT_EQ(h2, tmp_h);
+ ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref));
+
+ tmp_h = shard_->Lookup(key3, hkey3);
+ ASSERT_EQ(h3, tmp_h);
+ ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref));
+ }
+
+ // Release last ref on h2
+ ASSERT_FALSE(shard_->Release(h2, /*erase_if_last_ref*/ false));
+
+ // h2 still not deleted (unreferenced in cache)
+ ASSERT_EQ(deleted, 0);
+
+ // Can still find it
+ tmp_h = shard_->Lookup(key2, hkey2);
+ ASSERT_EQ(h2, tmp_h);  // this lookup takes a fresh reference on the same handle
+
+ // Release last ref on h2, with erase
+ ASSERT_TRUE(shard_->Release(h2, /*erase_if_last_ref*/ true));
+
+ // h2 deleted
+ ASSERT_EQ(deleted--, 1);
+ tmp_h = shard_->Lookup(key2, hkey2);
+ ASSERT_EQ(nullptr, tmp_h);
+
+ // Can still find h3
+ for (bool erase_if_last_ref : {true, false}) { // but not last ref
+ tmp_h = shard_->Lookup(key3, hkey3);
+ ASSERT_EQ(h3, tmp_h);
+ ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref));
+ }
+
+ // Release last ref on h3, without erase
+ ASSERT_FALSE(shard_->Release(h3, /*erase_if_last_ref*/ false));
+
+ // h3 still not deleted (unreferenced in cache)
+ ASSERT_EQ(deleted, 0);
+
+ // Explicit erase
+ shard_->Erase(key3, hkey3);
+
+ // h3 deleted
+ ASSERT_EQ(deleted--, 1);
+ tmp_h = shard_->Lookup(key3, hkey3);
+ ASSERT_EQ(nullptr, tmp_h);
+}
+
+// This uses the public API to effectively test CalcHashBits etc.
+// For a grid of estimated value sizes and entry counts, a cache is built from
+// HyperClockCacheOptions and GetTableAddressCount() is expected to lie between
+// est_count / kLoadFactor and twice that value.
+TEST_F(ClockCacheTest, TableSizesTest) {
+ for (size_t est_val_size : {1U, 5U, 123U, 2345U, 345678U}) {
+ SCOPED_TRACE("est_val_size = " + std::to_string(est_val_size));
+ for (double est_count : {1.1, 2.2, 511.9, 512.1, 2345.0}) {
+ SCOPED_TRACE("est_count = " + std::to_string(est_count));
+ // Capacity sized to hold est_count values of est_val_size bytes each
+ size_t capacity = static_cast<size_t>(est_val_size * est_count);
+ // kDontChargeCacheMetadata
+ auto cache = HyperClockCacheOptions(
+ capacity, est_val_size, /*num shard_bits*/ -1,
+ /*strict_capacity_limit*/ false,
+ /*memory_allocator*/ nullptr, kDontChargeCacheMetadata)
+ .MakeSharedCache();
+ // Table sizes are currently only powers of two, so the address count may
+ // exceed the ideal est_count / kLoadFactor by up to a factor of two
+ EXPECT_GE(cache->GetTableAddressCount(), est_count / kLoadFactor);
+ EXPECT_LE(cache->GetTableAddressCount(), est_count / kLoadFactor * 2.0);
+ // Nothing has been inserted and metadata is not charged
+ EXPECT_EQ(cache->GetUsage(), 0);
+
+ // kFullChargeCacheMetadata
+ // Because table sizes are currently only powers of two, sizes get
+ // really weird when metadata is a huge portion of capacity. For example,
+ // doubling the table size could cut by 90% the space available to
+ // values. Therefore, we omit those weird cases for now.
+ if (est_val_size >= 512) {
+ cache = HyperClockCacheOptions(
+ capacity, est_val_size, /*num shard_bits*/ -1,
+ /*strict_capacity_limit*/ false,
+ /*memory_allocator*/ nullptr, kFullChargeCacheMetadata)
+ .MakeSharedCache();
+ // Number of values that fit in the capacity left over after the usage
+ // the empty cache already reports (presumably the metadata charge)
+ double est_count_after_meta =
+ (capacity - cache->GetUsage()) * 1.0 / est_val_size;
+ EXPECT_GE(cache->GetTableAddressCount(),
+ est_count_after_meta / kLoadFactor);
+ EXPECT_LE(cache->GetTableAddressCount(),
+ est_count_after_meta / kLoadFactor * 2.0);
+ }
+ }
+ }
+}
+
+} // namespace clock_cache
+
+// SecondaryCache test double backed by an in-memory LRU cache. It counts
+// inserts and lookups, can be told to fail all inserts (InjectFailure), and
+// can be given a per-key ResultMap that controls how Lookup behaves.
+class TestSecondaryCache : public SecondaryCache {
+ public:
+ // Specifies what action to take on a lookup for a particular key
+ enum ResultType {
+ SUCCESS,
+ // Fail lookup immediately
+ FAIL,
+ // Defer the result. It becomes ready after WaitAll is called (Wait()
+ // itself is a no-op in this class)
+ DEFER,
+ // Defer the result and eventually return failure
+ DEFER_AND_FAIL
+ };
+
+ using ResultMap = std::unordered_map<std::string, ResultType>;
+
+ // Creates the backing LRU cache with the given capacity and without
+ // charging cache metadata.
+ explicit TestSecondaryCache(size_t capacity)
+ : num_inserts_(0), num_lookups_(0), inject_failure_(false) {
+ cache_ =
+ NewLRUCache(capacity, 0, false, 0.5 /* high_pri_pool_ratio */, nullptr,
+ kDefaultToAdaptiveMutex, kDontChargeCacheMetadata);
+ }
+ ~TestSecondaryCache() override { cache_.reset(); }
+
+ const char* Name() const override { return "TestSecondaryCache"; }
+
+ // While set, every Insert returns Corruption without touching the cache.
+ void InjectFailure() { inject_failure_ = true; }
+
+ void ResetInjectFailure() { inject_failure_ = false; }
+
+ // Serializes `value` through the helper callbacks into a buffer laid out as
+ // an 8-byte fixed-width size followed by the payload, and stores that buffer
+ // in the backing cache with a charge equal to the payload size. Returns the
+ // save callback's status if it fails, otherwise the backing cache's status.
+ Status Insert(const Slice& key, void* value,
+ const Cache::CacheItemHelper* helper) override {
+ if (inject_failure_) {
+ return Status::Corruption("Insertion Data Corrupted");
+ }
+ CheckCacheKeyCommonPrefix(key);
+ size_t size;
+ char* buf;
+ Status s;
+
+ // Counted before the save callback runs, so a failed save still counts
+ num_inserts_++;
+ size = (*helper->size_cb)(value);
+ buf = new char[size + sizeof(uint64_t)];
+ EncodeFixed64(buf, size);
+ s = (*helper->saveto_cb)(value, 0, size, buf + sizeof(uint64_t));
+ if (!s.ok()) {
+ delete[] buf;
+ return s;
+ }
+ // The deleter frees the whole buffer (size prefix plus payload)
+ return cache_->Insert(key, buf, size,
+ [](const Slice& /*key*/, void* val) -> void {
+ delete[] static_cast<char*>(val);
+ });
+ }
+
+ // Looks up `key` in the backing cache. Returns a null handle if the result
+ // map marks the key FAIL, if the key is absent, or if create_cb fails;
+ // is_in_sec_cache is set to true only when a handle is returned. For
+ // DEFER_AND_FAIL, create_cb is skipped so the returned handle carries a null
+ // value.
+ std::unique_ptr<SecondaryCacheResultHandle> Lookup(
+ const Slice& key, const Cache::CreateCallback& create_cb, bool /*wait*/,
+ bool /*advise_erase*/, bool& is_in_sec_cache) override {
+ std::string key_str = key.ToString();
+ TEST_SYNC_POINT_CALLBACK("TestSecondaryCache::Lookup", &key_str);
+
+ std::unique_ptr<SecondaryCacheResultHandle> secondary_handle;
+ is_in_sec_cache = false;
+ // Keys without an entry in the result map behave as SUCCESS
+ ResultType type = ResultType::SUCCESS;
+ auto iter = result_map_.find(key.ToString());
+ if (iter != result_map_.end()) {
+ type = iter->second;
+ }
+ // Immediate failures return before the lookup counter is incremented
+ if (type == ResultType::FAIL) {
+ return secondary_handle;
+ }
+
+ Cache::Handle* handle = cache_->Lookup(key);
+ num_lookups_++;
+ if (handle) {
+ void* value = nullptr;
+ size_t charge = 0;
+ Status s;
+ if (type != ResultType::DEFER_AND_FAIL) {
+ // Decode the size prefix written by Insert, then skip past it
+ char* ptr = (char*)cache_->Value(handle);
+ size_t size = DecodeFixed64(ptr);
+ ptr += sizeof(uint64_t);
+ s = create_cb(ptr, size, &value, &charge);
+ }
+ if (s.ok()) {
+ // The result handle takes over the backing-cache reference and
+ // releases it in its destructor
+ secondary_handle.reset(new TestSecondaryCacheResultHandle(
+ cache_.get(), handle, value, charge, type));
+ is_in_sec_cache = true;
+ } else {
+ cache_->Release(handle);
+ }
+ }
+ return secondary_handle;
+ }
+
+ bool SupportForceErase() const override { return false; }
+
+ // Erase is a no-op: entries stay in the backing cache.
+ void Erase(const Slice& /*key*/) override {}
+
+ // Marks every given handle as ready. The handles are assumed to have been
+ // produced by this class's Lookup (they are downcast without a check).
+ void WaitAll(std::vector<SecondaryCacheResultHandle*> handles) override {
+ for (SecondaryCacheResultHandle* handle : handles) {
+ TestSecondaryCacheResultHandle* sec_handle =
+ static_cast<TestSecondaryCacheResultHandle*>(handle);
+ sec_handle->SetReady();
+ }
+ }
+
+ std::string GetPrintableOptions() const override { return ""; }
+
+ // Replaces the per-key lookup behavior table.
+ void SetResultMap(ResultMap&& map) { result_map_ = std::move(map); }
+
+ // Number of Insert calls that got past failure injection.
+ uint32_t num_inserts() { return num_inserts_; }
+
+ // Number of Lookup calls that reached the backing cache.
+ uint32_t num_lookups() { return num_lookups_; }
+
+ // Remembers the first kCommonPrefixSize bytes of the first key seen and
+ // expects every later key to start with the same bytes.
+ void CheckCacheKeyCommonPrefix(const Slice& key) {
+ Slice current_prefix(key.data(), OffsetableCacheKey::kCommonPrefixSize);
+ if (ckey_prefix_.empty()) {
+ ckey_prefix_ = current_prefix.ToString();
+ } else {
+ EXPECT_EQ(ckey_prefix_, current_prefix.ToString());
+ }
+ }
+
+ private:
+ // Result handle that holds a reference on the backing-cache entry for its
+ // whole lifetime and exposes the value created by Lookup.
+ class TestSecondaryCacheResultHandle : public SecondaryCacheResultHandle {
+ public:
+ // The handle starts out ready only for ResultType::SUCCESS; every other
+ // type stays not-ready until SetReady() is called.
+ TestSecondaryCacheResultHandle(Cache* cache, Cache::Handle* handle,
+ void* value, size_t size, ResultType type)
+ : cache_(cache),
+ handle_(handle),
+ value_(value),
+ size_(size),
+ is_ready_(true) {
+ if (type != ResultType::SUCCESS) {
+ is_ready_ = false;
+ }
+ }
+
+ ~TestSecondaryCacheResultHandle() override { cache_->Release(handle_); }
+
+ bool IsReady() override { return is_ready_; }
+
+ // No-op: readiness only changes through SetReady().
+ void Wait() override {}
+
+ // Must only be called once the handle is ready.
+ void* Value() override {
+ assert(is_ready_);
+ return value_;
+ }
+
+ // Reports 0 when there is no value (e.g. a deferred failure).
+ size_t Size() override { return Value() ? size_ : 0; }
+
+ void SetReady() { is_ready_ = true; }
+
+ private:
+ Cache* cache_;
+ Cache::Handle* handle_;
+ void* value_;
+ size_t size_;
+ bool is_ready_;
+ };
+
+ std::shared_ptr<Cache> cache_;
+ uint32_t num_inserts_;
+ uint32_t num_lookups_;
+ bool inject_failure_;
+ std::string ckey_prefix_;
+ ResultMap result_map_;
+};
+
+// DB-level fixture for secondary cache tests. It wraps the base Env's file
+// system in a FaultInjectionTestFS and exposes it through fault_env_, so tests
+// can open the DB on a file system whose behavior they can alter.
+class DBSecondaryCacheTest : public DBTestBase {
+ public:
+ DBSecondaryCacheTest()
+ : DBTestBase("db_secondary_cache_test", /*env_do_fsync=*/true) {
+ fault_fs_.reset(new FaultInjectionTestFS(env_->GetFileSystem()));
+ fault_env_.reset(new CompositeEnvWrapper(env_, fault_fs_));
+ }
+
+ std::shared_ptr<FaultInjectionTestFS> fault_fs_;
+ std::unique_ptr<Env> fault_env_;
+};
+
+// Fixture for exercising an LRU cache that is backed by a secondary cache.
+// It provides a heap-buffer value type (TestItem), the callbacks needed to
+// build CacheItemHelper instances for it, and a create callback whose failure
+// can be toggled with SetFailCreate().
+class LRUCacheSecondaryCacheTest : public LRUCacheTest {
+ public:
+  LRUCacheSecondaryCacheTest() : fail_create_(false) {}
+  ~LRUCacheSecondaryCacheTest() override = default;
+
+ protected:
+  // Value type stored in the cache under test: an owned copy of a byte range.
+  class TestItem {
+   public:
+    // Copies `size` bytes starting at `buf` into a buffer owned by the item.
+    TestItem(const char* buf, size_t size) : buf_(new char[size]), size_(size) {
+      memcpy(buf_.get(), buf, size);
+    }
+
+    char* Buf() const { return buf_.get(); }
+    size_t Size() const { return size_; }
+    std::string ToString() const { return std::string(Buf(), Size()); }
+
+   private:
+    std::unique_ptr<char[]> buf_;
+    size_t size_;
+  };
+
+  // Size callback: number of bytes needed to save the TestItem in `obj`.
+  static size_t SizeCallback(void* obj) {
+    return static_cast<TestItem*>(obj)->Size();
+  }
+
+  // Save callback: copies the item's bytes into `out`. The test expects to be
+  // asked for the whole item in one call (offset 0, full length).
+  static Status SaveToCallback(void* from_obj, size_t from_offset,
+                               size_t length, void* out) {
+    auto* item = static_cast<TestItem*>(from_obj);
+    EXPECT_EQ(length, item->Size());
+    EXPECT_EQ(from_offset, 0U);
+    memcpy(out, item->Buf(), length);
+    return Status::OK();
+  }
+
+  // Deleter callback: destroys the TestItem in `obj`.
+  static void DeletionCallback(const Slice& /*key*/, void* obj) {
+    delete static_cast<TestItem*>(obj);
+  }
+
+  static Cache::CacheItemHelper helper_;
+
+  // Save callback that always fails, used to simulate a failed save.
+  static Status SaveToCallbackFail(void* /*obj*/, size_t /*offset*/,
+                                   size_t /*size*/, void* /*out*/) {
+    return Status::NotSupported();
+  }
+
+  static Cache::CacheItemHelper helper_fail_;
+
+  // Create callback: rebuilds a TestItem from `size` saved bytes at `buf` and
+  // reports `size` as its charge. Returns NotSupported, leaving the outputs
+  // untouched, while SetFailCreate(true) is in effect.
+  Cache::CreateCallback test_item_creator =
+      [this](const void* buf, size_t size, void** out_obj,
+             size_t* charge) -> Status {
+    if (fail_create_) {
+      return Status::NotSupported();
+    }
+    *out_obj = new TestItem(static_cast<const char*>(buf), size);
+    *charge = size;
+    return Status::OK();
+  };
+
+  void SetFailCreate(bool fail) { fail_create_ = fail; }
+
+ private:
+  bool fail_create_;
+};
+
+// Helper whose save callback copies the TestItem bytes successfully.
+Cache::CacheItemHelper LRUCacheSecondaryCacheTest::helper_(
+ LRUCacheSecondaryCacheTest::SizeCallback,
+ LRUCacheSecondaryCacheTest::SaveToCallback,
+ LRUCacheSecondaryCacheTest::DeletionCallback);
+
+// Same as helper_, except that its save callback always returns NotSupported.
+Cache::CacheItemHelper LRUCacheSecondaryCacheTest::helper_fail_(
+ LRUCacheSecondaryCacheTest::SizeCallback,
+ LRUCacheSecondaryCacheTest::SaveToCallbackFail,
+ LRUCacheSecondaryCacheTest::DeletionCallback);
+
+// Exercises demotion to and promotion from the secondary cache. The primary
+// cache capacity (1024) is too small to hold two of the ~1020-byte items at
+// once, so inserting or promoting one item pushes the other out.
+TEST_F(LRUCacheSecondaryCacheTest, BasicTest) {
+ LRUCacheOptions opts(1024 /* capacity */, 0 /* num_shard_bits */,
+ false /* strict_capacity_limit */,
+ 0.5 /* high_pri_pool_ratio */,
+ nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+ kDontChargeCacheMetadata);
+ std::shared_ptr<TestSecondaryCache> secondary_cache =
+ std::make_shared<TestSecondaryCache>(4096);
+ opts.secondary_cache = secondary_cache;
+ std::shared_ptr<Cache> cache = NewLRUCache(opts);
+ std::shared_ptr<Statistics> stats = CreateDBStatistics();
+ CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+ CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+ CacheKey k3 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+
+ Random rnd(301);
+ // Start with warming k3
+ std::string str3 = rnd.RandomString(1021);
+ ASSERT_OK(secondary_cache->InsertSaved(k3.AsSlice(), str3));
+
+ std::string str1 = rnd.RandomString(1020);
+ TestItem* item1 = new TestItem(str1.data(), str1.length());
+ ASSERT_OK(cache->Insert(k1.AsSlice(), item1,
+ &LRUCacheSecondaryCacheTest::helper_, str1.length()));
+ std::string str2 = rnd.RandomString(1021);
+ TestItem* item2 = new TestItem(str2.data(), str2.length());
+ // k1 should be demoted to NVM
+ ASSERT_OK(cache->Insert(k2.AsSlice(), item2,
+ &LRUCacheSecondaryCacheTest::helper_, str2.length()));
+
+ // Reset the perf context so the hit count checked below starts from zero
+ get_perf_context()->Reset();
+ Cache::Handle* handle;
+ // k2 was inserted last, so it is expected to still be in the primary cache
+ handle =
+ cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
+ test_item_creator, Cache::Priority::LOW, true, stats.get());
+ ASSERT_NE(handle, nullptr);
+ ASSERT_EQ(static_cast<TestItem*>(cache->Value(handle))->Size(), str2.size());
+ cache->Release(handle);
+
+ // This lookup should promote k1 and demote k2
+ handle =
+ cache->Lookup(k1.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
+ test_item_creator, Cache::Priority::LOW, true, stats.get());
+ ASSERT_NE(handle, nullptr);
+ ASSERT_EQ(static_cast<TestItem*>(cache->Value(handle))->Size(), str1.size());
+ cache->Release(handle);
+
+ // This lookup should promote k3 and demote k1
+ handle =
+ cache->Lookup(k3.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
+ test_item_creator, Cache::Priority::LOW, true, stats.get());
+ ASSERT_NE(handle, nullptr);
+ ASSERT_EQ(static_cast<TestItem*>(cache->Value(handle))->Size(), str3.size());
+ cache->Release(handle);
+
+ // Every lookup that reached the secondary cache must have been a hit, both
+ // in the statistics ticker and in the perf context
+ ASSERT_EQ(secondary_cache->num_inserts(), 3u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 2u);
+ ASSERT_EQ(stats->getTickerCount(SECONDARY_CACHE_HITS),
+ secondary_cache->num_lookups());
+ PerfContext perf_ctx = *get_perf_context();
+ ASSERT_EQ(perf_ctx.secondary_cache_hit_count, secondary_cache->num_lookups());
+
+ cache.reset();
+ secondary_cache.reset();
+}
+
+// Checks the failure paths of the helper-based Insert/Lookup API: Insert with
+// a null helper is rejected with InvalidArgument, and lookups of a key that
+// was never inserted return nullptr.
+TEST_F(LRUCacheSecondaryCacheTest, BasicFailTest) {
+ LRUCacheOptions opts(1024 /* capacity */, 0 /* num_shard_bits */,
+ false /* strict_capacity_limit */,
+ 0.5 /* high_pri_pool_ratio */,
+ nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+ kDontChargeCacheMetadata);
+ std::shared_ptr<TestSecondaryCache> secondary_cache =
+ std::make_shared<TestSecondaryCache>(2048);
+ opts.secondary_cache = secondary_cache;
+ std::shared_ptr<Cache> cache = NewLRUCache(opts);
+ CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+ CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+
+ Random rnd(301);
+ std::string str1 = rnd.RandomString(1020);
+ auto item1 = std::make_unique<TestItem>(str1.data(), str1.length());
+ // A null helper must be rejected; item1 stays owned by the unique_ptr
+ ASSERT_TRUE(cache->Insert(k1.AsSlice(), item1.get(), nullptr, str1.length())
+ .IsInvalidArgument());
+ ASSERT_OK(cache->Insert(k1.AsSlice(), item1.get(),
+ &LRUCacheSecondaryCacheTest::helper_, str1.length()));
+ // The cache now owns the item (it is freed through helper_'s deleter)
+ item1.release(); // Appease clang-analyze "potential memory leak"
+
+ Cache::Handle* handle;
+ // k2 was never inserted: lookup with a null helper finds nothing
+ handle = cache->Lookup(k2.AsSlice(), nullptr, test_item_creator,
+ Cache::Priority::LOW, true);
+ ASSERT_EQ(handle, nullptr);
+ // Same key with a valid helper and the last argument (presumably the wait
+ // flag) set to false: still nothing
+ handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
+ test_item_creator, Cache::Priority::LOW, false);
+ ASSERT_EQ(handle, nullptr);
+
+ cache.reset();
+ secondary_cache.reset();
+}
+
+// Inserts items with helper_fail_, whose save callback always returns
+// NotSupported, so the attempt to demote k1 to the secondary cache fails and
+// k1 can no longer be found in either tier.
+TEST_F(LRUCacheSecondaryCacheTest, SaveFailTest) {
+ LRUCacheOptions opts(1024 /* capacity */, 0 /* num_shard_bits */,
+ false /* strict_capacity_limit */,
+ 0.5 /* high_pri_pool_ratio */,
+ nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+ kDontChargeCacheMetadata);
+ std::shared_ptr<TestSecondaryCache> secondary_cache =
+ std::make_shared<TestSecondaryCache>(2048);
+ opts.secondary_cache = secondary_cache;
+ std::shared_ptr<Cache> cache = NewLRUCache(opts);
+ CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+ CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+
+ Random rnd(301);
+ std::string str1 = rnd.RandomString(1020);
+ TestItem* item1 = new TestItem(str1.data(), str1.length());
+ ASSERT_OK(cache->Insert(k1.AsSlice(), item1,
+ &LRUCacheSecondaryCacheTest::helper_fail_,
+ str1.length()));
+ std::string str2 = rnd.RandomString(1020);
+ TestItem* item2 = new TestItem(str2.data(), str2.length());
+ // k1 should be demoted to NVM
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+ ASSERT_OK(cache->Insert(k2.AsSlice(), item2,
+ &LRUCacheSecondaryCacheTest::helper_fail_,
+ str2.length()));
+ // The demotion attempt is counted even though the save callback fails,
+ // because TestSecondaryCache::Insert counts before calling it
+ ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+
+ Cache::Handle* handle;
+ handle =
+ cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_fail_,
+ test_item_creator, Cache::Priority::LOW, true);
+ ASSERT_NE(handle, nullptr);
+ cache->Release(handle);
+ // This lookup should fail, since k1 demotion would have failed
+ handle =
+ cache->Lookup(k1.AsSlice(), &LRUCacheSecondaryCacheTest::helper_fail_,
+ test_item_creator, Cache::Priority::LOW, true);
+ ASSERT_EQ(handle, nullptr);
+ // Since k1 didn't get promoted, k2 should still be in cache
+ handle =
+ cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_fail_,
+ test_item_creator, Cache::Priority::LOW, true);
+ ASSERT_NE(handle, nullptr);
+ cache->Release(handle);
+ // No further demotion attempts; only the k1 lookup reached the secondary
+ // cache
+ ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 1u);
+
+ cache.reset();
+ secondary_cache.reset();
+}
+
+// Demotes k1 to the secondary cache and then makes the create callback fail
+// (SetFailCreate(true)), so k1 cannot be rebuilt from its saved bytes and the
+// lookup for it returns nullptr.
+TEST_F(LRUCacheSecondaryCacheTest, CreateFailTest) {
+ LRUCacheOptions opts(1024 /* capacity */, 0 /* num_shard_bits */,
+ false /* strict_capacity_limit */,
+ 0.5 /* high_pri_pool_ratio */,
+ nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+ kDontChargeCacheMetadata);
+ std::shared_ptr<TestSecondaryCache> secondary_cache =
+ std::make_shared<TestSecondaryCache>(2048);
+ opts.secondary_cache = secondary_cache;
+ std::shared_ptr<Cache> cache = NewLRUCache(opts);
+ CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+ CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+
+ Random rnd(301);
+ std::string str1 = rnd.RandomString(1020);
+ TestItem* item1 = new TestItem(str1.data(), str1.length());
+ ASSERT_OK(cache->Insert(k1.AsSlice(), item1,
+ &LRUCacheSecondaryCacheTest::helper_, str1.length()));
+ std::string str2 = rnd.RandomString(1020);
+ TestItem* item2 = new TestItem(str2.data(), str2.length());
+ // k1 should be demoted to NVM
+ ASSERT_OK(cache->Insert(k2.AsSlice(), item2,
+ &LRUCacheSecondaryCacheTest::helper_, str2.length()));
+
+ Cache::Handle* handle;
+ // From here on test_item_creator returns NotSupported
+ SetFailCreate(true);
+ handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
+ test_item_creator, Cache::Priority::LOW, true);
+ ASSERT_NE(handle, nullptr);
+ cache->Release(handle);
+ // This lookup should fail, since k1 creation would have failed
+ handle = cache->Lookup(k1.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
+ test_item_creator, Cache::Priority::LOW, true);
+ ASSERT_EQ(handle, nullptr);
+ // Since k1 didn't get promoted, k2 should still be in cache
+ handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
+ test_item_creator, Cache::Priority::LOW, true);
+ ASSERT_NE(handle, nullptr);
+ cache->Release(handle);
+ // One demotion (k1) and one secondary cache lookup (the failed k1 lookup)
+ ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 1u);
+
+ cache.reset();
+ secondary_cache.reset();
+}
+
+// Runs with strict_capacity_limit enabled. While a handle on k2 is held, a
+// lookup of k1 (previously demoted to the secondary cache) must still return
+// a handle even though k1 cannot be inserted into the full primary cache.
+TEST_F(LRUCacheSecondaryCacheTest, FullCapacityTest) {
+ LRUCacheOptions opts(1024 /* capacity */, 0 /* num_shard_bits */,
+ true /* strict_capacity_limit */,
+ 0.5 /* high_pri_pool_ratio */,
+ nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+ kDontChargeCacheMetadata);
+ std::shared_ptr<TestSecondaryCache> secondary_cache =
+ std::make_shared<TestSecondaryCache>(2048);
+ opts.secondary_cache = secondary_cache;
+ std::shared_ptr<Cache> cache = NewLRUCache(opts);
+ CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+ CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+
+ Random rnd(301);
+ std::string str1 = rnd.RandomString(1020);
+ TestItem* item1 = new TestItem(str1.data(), str1.length());
+ ASSERT_OK(cache->Insert(k1.AsSlice(), item1,
+ &LRUCacheSecondaryCacheTest::helper_, str1.length()));
+ std::string str2 = rnd.RandomString(1020);
+ TestItem* item2 = new TestItem(str2.data(), str2.length());
+ // k1 should be demoted to NVM
+ ASSERT_OK(cache->Insert(k2.AsSlice(), item2,
+ &LRUCacheSecondaryCacheTest::helper_, str2.length()));
+
+ Cache::Handle* handle;
+ // Keep this handle on k2 open across the k1 lookup below
+ handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
+ test_item_creator, Cache::Priority::LOW, true);
+ ASSERT_NE(handle, nullptr);
+ // k1 promotion should fail due to the block cache being at capacity,
+ // but the lookup should still succeed
+ Cache::Handle* handle2;
+ handle2 = cache->Lookup(k1.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
+ test_item_creator, Cache::Priority::LOW, true);
+ ASSERT_NE(handle2, nullptr);
+ // Since k1 didn't get inserted, k2 should still be in cache
+ cache->Release(handle);
+ cache->Release(handle2);
+ handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
+ test_item_creator, Cache::Priority::LOW, true);
+ ASSERT_NE(handle, nullptr);
+ cache->Release(handle);
+ // One demotion (k1) and one secondary cache lookup (k1)
+ ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 1u);
+
+ cache.reset();
+ secondary_cache.reset();
+}
+
+// In this test, the block cache size is set to 4096. After inserting 6
+// KV-pairs and flushing, there are 5 blocks in the SST file: 2 data blocks and
+// 3 meta blocks. block_1 size is 4096 and block_2 size is 2056. The total size
+// of the meta blocks is about 900 to 1000. Therefore, in any situation,
+// if we try to insert block_1 to the block cache, it will always fail. Only
+// block_2 will be successfully inserted into the block cache.
+TEST_F(DBSecondaryCacheTest, TestSecondaryCacheCorrectness1) {
+ LRUCacheOptions opts(4 * 1024 /* capacity */, 0 /* num_shard_bits */,
+ false /* strict_capacity_limit */,
+ 0.5 /* high_pri_pool_ratio */,
+ nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+ kDontChargeCacheMetadata);
+ std::shared_ptr<TestSecondaryCache> secondary_cache(
+ new TestSecondaryCache(2048 * 1024));
+ opts.secondary_cache = secondary_cache;
+ std::shared_ptr<Cache> cache = NewLRUCache(opts);
+ BlockBasedTableOptions table_options;
+ table_options.block_cache = cache;
+ table_options.block_size = 4 * 1024;
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.env = fault_env_.get();
+ fault_fs_->SetFailGetUniqueId(true);
+
+ // Enable paranoid file checks so that, after flush, the file is read back
+ // and all the blocks are accessed.
+ options.paranoid_file_checks = true;
+ DestroyAndReopen(options);
+ Random rnd(301);
+ const int N = 6;
+ for (int i = 0; i < N; i++) {
+ std::string p_v = rnd.RandomString(1007);
+ ASSERT_OK(Put(Key(i), p_v));
+ }
+
+ ASSERT_OK(Flush());
+ // After Flush is successful, RocksDB will do the paranoid check for the new
+ // SST file. Meta blocks are always cached in the block cache and they
+ // will not be evicted. When block_2 is a cache miss and is read out, it is
+ // inserted to the block cache. Note that block_1 is never successfully
+ // inserted to the block cache. There are 2 lookups in the secondary cache,
+ // for block_1 and block_2.
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 2u);
+
+ Compact("a", "z");
+ // Compaction will create the iterator to scan the whole file. So all the
+ // blocks are needed. Meta blocks are always cached. When block_1 is read
+ // out, block_2 is evicted from block cache and inserted to secondary
+ // cache.
+ ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 3u);
+
+ std::string v = Get(Key(0));
+ ASSERT_EQ(1007, v.size());
+ // The first data block is not in the cache; similarly, this triggers the
+ // block cache lookup and secondary cache lookup for block_1. But block_1
+ // will not be inserted successfully due to its size. Currently, the cache
+ // only has the meta blocks.
+ ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 4u);
+
+ v = Get(Key(5));
+ ASSERT_EQ(1007, v.size());
+ // The second data block is not in the cache; similarly, this triggers the
+ // block cache lookup and secondary cache lookup for block_2, and block_2 is
+ // found in the secondary cache. Now the block cache has block_2.
+ ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 5u);
+
+ v = Get(Key(5));
+ ASSERT_EQ(1007, v.size());
+ // block_2 is in the block cache. There is a block cache hit. No need to
+ // lookup or insert the secondary cache.
+ ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 5u);
+
+ v = Get(Key(0));
+ ASSERT_EQ(1007, v.size());
+ // Lookup the first data block, not in the block cache, so lookup the
+ // secondary cache. Also not in the secondary cache. After Get, block_1
+ // will still not be cached.
+ ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 6u);
+
+ v = Get(Key(0));
+ ASSERT_EQ(1007, v.size());
+ // Lookup the first data block, not in the block cache, so lookup the
+ // secondary cache. Also not in the secondary cache. After Get, block_1
+ // will still not be cached.
+ ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 7u);
+
+ Destroy(options);
+}
+
+// In this test, the block cache size is set to 6100. After inserting 6
+// KV-pairs and flushing, there are 5 blocks in the SST file: 2 data blocks and
+// 3 meta blocks. block_1 size is 4096 and block_2 size is 2056. The total size
+// of the meta blocks is about 900 to 1000. Therefore, we can successfully
+// insert and cache block_1 in the block cache (this is the difference
+// from TestSecondaryCacheCorrectness1)
+TEST_F(DBSecondaryCacheTest, TestSecondaryCacheCorrectness2) {
+ LRUCacheOptions opts(6100 /* capacity */, 0 /* num_shard_bits */,
+ false /* strict_capacity_limit */,
+ 0.5 /* high_pri_pool_ratio */,
+ nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+ kDontChargeCacheMetadata);
+ std::shared_ptr<TestSecondaryCache> secondary_cache(
+ new TestSecondaryCache(2048 * 1024));
+ opts.secondary_cache = secondary_cache;
+ std::shared_ptr<Cache> cache = NewLRUCache(opts);
+ BlockBasedTableOptions table_options;
+ table_options.block_cache = cache;
+ table_options.block_size = 4 * 1024;
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.paranoid_file_checks = true;
+ options.env = fault_env_.get();
+ fault_fs_->SetFailGetUniqueId(true);
+ DestroyAndReopen(options);
+ Random rnd(301);
+ const int N = 6;
+ for (int i = 0; i < N; i++) {
+ std::string p_v = rnd.RandomString(1007);
+ ASSERT_OK(Put(Key(i), p_v));
+ }
+
+ ASSERT_OK(Flush());
+ // After Flush is successful, RocksDB will do the paranoid check for the new
+ // SST file. Meta blocks are always cached in the block cache and they
+ // will not be evicted. When block_2 is a cache miss and is read out, it is
+ // inserted to the block cache. Therefore, block_1 is evicted from block
+ // cache and successfully inserted to the secondary cache. There are 2
+ // lookups in the secondary cache, for block_1 and block_2.
+ ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 2u);
+
+ Compact("a", "z");
+ // Compaction will create the iterator to scan the whole file. So all the
+ // blocks are needed. After Flush, only block_2 is cached in block cache
+ // and block_1 is in the secondary cache. So when block_1 is read, it is
+ // read out from secondary cache and inserted to block cache. At the same
+ // time, block_2 is inserted to secondary cache. Now, secondary cache has
+ // both block_1 and block_2. After compaction, block_1 is in the cache.
+ ASSERT_EQ(secondary_cache->num_inserts(), 2u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 3u);
+
+ std::string v = Get(Key(0));
+ ASSERT_EQ(1007, v.size());
+ // This Get needs to access block_1. Since block_1 is cached in block cache,
+ // there is no secondary cache lookup.
+ ASSERT_EQ(secondary_cache->num_inserts(), 2u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 3u);
+
+ v = Get(Key(5));
+ ASSERT_EQ(1007, v.size());
+ // This Get needs to access block_2 which is not in the block cache. So
+ // it will lookup the secondary cache for block_2 and cache it in the
+ // block_cache.
+ ASSERT_EQ(secondary_cache->num_inserts(), 2u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 4u);
+
+ v = Get(Key(5));
+ ASSERT_EQ(1007, v.size());
+ // This Get needs to access block_2 which is already in the block cache.
+ // No need to lookup secondary cache.
+ ASSERT_EQ(secondary_cache->num_inserts(), 2u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 4u);
+
+ v = Get(Key(0));
+ ASSERT_EQ(1007, v.size());
+ // This Get needs to access block_1. Since block_1 is not in block cache,
+ // there is one secondary cache lookup. Then, block_1 is cached in the
+ // block cache.
+ ASSERT_EQ(secondary_cache->num_inserts(), 2u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 5u);
+
+ v = Get(Key(0));
+ ASSERT_EQ(1007, v.size());
+ // This Get needs to access block_1. Since block_1 is cached in block cache,
+ // there is no secondary cache lookup.
+ ASSERT_EQ(secondary_cache->num_inserts(), 2u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 5u);
+
+ Destroy(options);
+}
+
+// The block cache size is set to 1024*1024. After inserting 6 KV-pairs
+// and flushing, there are 5 blocks in the SST file: 2 data blocks and 3 meta
+// blocks. block_1 size is 4096 and block_2 size is 2056. The total size
+// of the meta blocks is about 900 to 1000. Therefore, we can successfully
+// cache all the blocks in the block cache and there is no secondary cache
+// insertion. 2 lookups are needed for the blocks.
+TEST_F(DBSecondaryCacheTest, NoSecondaryCacheInsertion) {
+ LRUCacheOptions opts(1024 * 1024 /* capacity */, 0 /* num_shard_bits */,
+ false /* strict_capacity_limit */,
+ 0.5 /* high_pri_pool_ratio */,
+ nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+ kDontChargeCacheMetadata);
+ std::shared_ptr<TestSecondaryCache> secondary_cache(
+ new TestSecondaryCache(2048 * 1024));
+ opts.secondary_cache = secondary_cache;
+ std::shared_ptr<Cache> cache = NewLRUCache(opts);
+ BlockBasedTableOptions table_options;
+ table_options.block_cache = cache;
+ table_options.block_size = 4 * 1024;
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.paranoid_file_checks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.env = fault_env_.get();
+ fault_fs_->SetFailGetUniqueId(true);
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+ const int N = 6;
+ for (int i = 0; i < N; i++) {
+ std::string p_v = rnd.RandomString(1000);
+ ASSERT_OK(Put(Key(i), p_v));
+ }
+
+ ASSERT_OK(Flush());
+ // After Flush is successful, RocksDB will do the paranoid check for the new
+ // SST file. Meta blocks are always cached in the block cache and they
+ // will not be evicted. Now, the block cache is large enough that it caches
+ // both block_1 and block_2. The first time block_1 and block_2 are read,
+ // there are cache misses. So 2 secondary cache lookups are needed for
+ // the 2 blocks.
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 2u);
+
+ Compact("a", "z");
+ // Compaction will iterate the whole SST file. Since all the data blocks
+ // are in the block cache, there is no need to lookup the secondary cache.
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 2u);
+
+ std::string v = Get(Key(0));
+ ASSERT_EQ(1000, v.size());
+ // Since the block cache is large enough, all the blocks are cached. We
+ // do not need to lookup the secondary cache.
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 2u);
+
+ Destroy(options);
+}
+
+// Smoke test: writes 256 keys, then performs 1000 pseudo-random Gets through
+// an 8 KiB block cache and only checks that the secondary cache saw at least
+// one insert and one lookup.
+TEST_F(DBSecondaryCacheTest, SecondaryCacheIntensiveTesting) {
+ LRUCacheOptions opts(8 * 1024 /* capacity */, 0 /* num_shard_bits */,
+ false /* strict_capacity_limit */,
+ 0.5 /* high_pri_pool_ratio */,
+ nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+ kDontChargeCacheMetadata);
+ std::shared_ptr<TestSecondaryCache> secondary_cache(
+ new TestSecondaryCache(2048 * 1024));
+ opts.secondary_cache = secondary_cache;
+ std::shared_ptr<Cache> cache = NewLRUCache(opts);
+ BlockBasedTableOptions table_options;
+ table_options.block_cache = cache;
+ table_options.block_size = 4 * 1024;
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.env = fault_env_.get();
+ fault_fs_->SetFailGetUniqueId(true);
+ DestroyAndReopen(options);
+ Random rnd(301);
+ const int N = 256;
+ for (int i = 0; i < N; i++) {
+ std::string p_v = rnd.RandomString(1000);
+ ASSERT_OK(Put(Key(i), p_v));
+ }
+ ASSERT_OK(Flush());
+ Compact("a", "z");
+
+ // Fixed seed keeps the access pattern reproducible; the returned values
+ // are not checked
+ Random r_index(47);
+ std::string v;
+ for (int i = 0; i < 1000; i++) {
+ uint32_t key_i = r_index.Next() % N;
+ v = Get(Key(key_i));
+ }
+
+ // We have over 200 data blocks, so there will be multiple insertions
+ // and lookups. NOTE(review): 256 values of ~1000 bytes in 4 KiB blocks
+ // suggests fewer data blocks than 200 - confirm the figure.
+ ASSERT_GE(secondary_cache->num_inserts(), 1u);
+ ASSERT_GE(secondary_cache->num_lookups(), 1u);
+
+ Destroy(options);
+}
+
+// In this test, the block cache size is set to 4096. After inserting 6
+// KV-pairs and flushing, there are 5 blocks in this SST file: 2 data blocks
+// and 3 meta blocks. block_1 size is 4096 and block_2 size is 2056. The total
+// size of the meta blocks is about 900 to 1000. Therefore, in any situation,
+// an attempt to insert block_1 into the block cache will always fail. Only
+// block_2 will be successfully inserted into the block cache.
+TEST_F(DBSecondaryCacheTest, SecondaryCacheFailureTest) {
+ LRUCacheOptions opts(4 * 1024 /* capacity */, 0 /* num_shard_bits */,
+ false /* strict_capacity_limit */,
+ 0.5 /* high_pri_pool_ratio */,
+ nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+ kDontChargeCacheMetadata);
+ std::shared_ptr<TestSecondaryCache> secondary_cache(
+ new TestSecondaryCache(2048 * 1024));
+ opts.secondary_cache = secondary_cache;
+ std::shared_ptr<Cache> cache = NewLRUCache(opts);
+ BlockBasedTableOptions table_options;
+ table_options.block_cache = cache;
+ table_options.block_size = 4 * 1024;
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.paranoid_file_checks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.env = fault_env_.get();
+ fault_fs_->SetFailGetUniqueId(true);
+ DestroyAndReopen(options);
+ Random rnd(301);
+ const int N = 6;
+ for (int i = 0; i < N; i++) {
+ std::string p_v = rnd.RandomString(1007);
+ ASSERT_OK(Put(Key(i), p_v));
+ }
+
+ ASSERT_OK(Flush());
+ // After Flush is successful, RocksDB will do the paranoid check for the new
+ // SST file. Meta blocks are always cached in the block cache and they
+ // will not be evicted. When block_2 misses the cache and is read out, it is
+ // inserted into the block cache. Note that block_1 is never successfully
+ // inserted into the block cache. There are 2 lookups in the secondary cache,
+ // one for block_1 and one for block_2.
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 2u);
+
+ // Make secondary cache insertions fail. The LRU cache does not check the
+ // status returned by the secondary insertion, so the DB is not affected.
+ secondary_cache->InjectFailure();
+ Compact("a", "z");
+ // Compaction will create the iterator to scan the whole file, so all the
+ // blocks are needed. Meta blocks are always cached. When block_1 is read
+ // out, block_2 is evicted from the block cache; num_inserts stays 0,
+ // presumably because the injected failure rejects the insertion.
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 3u);
+
+ std::string v = Get(Key(0));
+ ASSERT_EQ(1007, v.size());
+ // The first data block is not in the cache, so this triggers the block
+ // cache Lookup and the secondary cache lookup for block_1. But block_1 will
+ // not be inserted successfully due to its size. Currently, the cache only
+ // has the meta blocks.
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 4u);
+
+ v = Get(Key(5));
+ ASSERT_EQ(1007, v.size());
+ // The second data block is not in the block cache, so this triggers the
+ // secondary cache lookup for block_2. NOTE(review): no insertion has
+ // succeeded so far, so verify whether it hits. Now block cache has block_2.
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 5u);
+
+ v = Get(Key(5));
+ ASSERT_EQ(1007, v.size());
+ // block_2 is in the block cache. There is a block cache hit. No need to
+ // look up or insert into the secondary cache.
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 5u);
+
+ v = Get(Key(0));
+ ASSERT_EQ(1007, v.size());
+ // Look up the first data block. It is not in the block cache, so look up
+ // the secondary cache. It is not in the secondary cache either. After Get,
+ // block_1 is still not cached.
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 6u);
+
+ v = Get(Key(0));
+ ASSERT_EQ(1007, v.size());
+ // Look up the first data block. It is not in the block cache, so look up
+ // the secondary cache. It is not in the secondary cache either. After Get,
+ // block_1 is still not cached.
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 7u);
+ secondary_cache->ResetInjectFailure();
+
+ Destroy(options);
+}
+
+TEST_F(DBSecondaryCacheTest, TestSecondaryWithCompressedCache) {  // reads succeed with a compressed block cache backed by a secondary cache
+ if (!Snappy_Supported()) {
+ ROCKSDB_GTEST_SKIP("Compressed cache test requires snappy support");
+ return;
+ }
+ LRUCacheOptions opts(2000 /* capacity */, 0 /* num_shard_bits */,
+ false /* strict_capacity_limit */,
+ 0.5 /* high_pri_pool_ratio */,
+ nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+ kDontChargeCacheMetadata);
+ std::shared_ptr<TestSecondaryCache> secondary_cache(
+ new TestSecondaryCache(2048 * 1024));
+ opts.secondary_cache = secondary_cache;
+ std::shared_ptr<Cache> cache = NewLRUCache(opts);
+ BlockBasedTableOptions table_options;
+ table_options.block_cache_compressed = cache;  // the LRU cache is used as the compressed block cache
+ table_options.no_block_cache = true;  // no uncompressed block cache is configured
+ table_options.block_size = 1234;
+ Options options = GetDefaultOptions();
+ options.compression = kSnappyCompression;
+ options.create_if_missing = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+ Random rnd(301);
+ const int N = 6;
+ for (int i = 0; i < N; i++) {
+ // Partly compressible: 507 random bytes followed by 500 spaces.
+ std::string p_v = rnd.RandomString(507) + std::string(500, ' ');
+ ASSERT_OK(Put(Key(i), p_v));
+ }
+ ASSERT_OK(Flush());
+ for (int i = 0; i < 2 * N; i++) {  // read every key twice
+ std::string v = Get(Key(i % N));
+ ASSERT_EQ(1007, v.size());  // only the value length (507 + 500) is checked
+ }
+}
+
+TEST_F(LRUCacheSecondaryCacheTest, BasicWaitAllTest) {  // WaitAll() over immediate, deferred and failing lookups
+ LRUCacheOptions opts(1024 /* capacity */, 2 /* num_shard_bits */,
+ false /* strict_capacity_limit */,
+ 0.5 /* high_pri_pool_ratio */,
+ nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+ kDontChargeCacheMetadata);
+ std::shared_ptr<TestSecondaryCache> secondary_cache =
+ std::make_shared<TestSecondaryCache>(32 * 1024);
+ opts.secondary_cache = secondary_cache;
+ std::shared_ptr<Cache> cache = NewLRUCache(opts);
+ const int num_keys = 32;
+ OffsetableCacheKey ock{"foo", "bar", 1};
+
+ Random rnd(301);
+ std::vector<std::string> values;  // values[i] is the payload inserted under ock.WithOffset(i)
+ for (int i = 0; i < num_keys; ++i) {
+ std::string str = rnd.RandomString(1020);
+ values.emplace_back(str);
+ TestItem* item = new TestItem(str.data(), str.length());
+ ASSERT_OK(cache->Insert(ock.WithOffset(i).AsSlice(), item,
+ &LRUCacheSecondaryCacheTest::helper_,
+ str.length()));
+ }
+ // Force all entries to be evicted to the secondary cache
+ cache->SetCapacity(0);
+ ASSERT_EQ(secondary_cache->num_inserts(), 32u);
+ cache->SetCapacity(32 * 1024);
+
+ secondary_cache->SetResultMap(
+ {{ock.WithOffset(3).AsSlice().ToString(),
+ TestSecondaryCache::ResultType::DEFER},
+ {ock.WithOffset(4).AsSlice().ToString(),
+ TestSecondaryCache::ResultType::DEFER_AND_FAIL},
+ {ock.WithOffset(5).AsSlice().ToString(),
+ TestSecondaryCache::ResultType::FAIL}});
+ std::vector<Cache::Handle*> results;
+ for (int i = 0; i < 6; ++i) {  // look up keys 0..5 with wait == false
+ results.emplace_back(cache->Lookup(
+ ock.WithOffset(i).AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
+ test_item_creator, Cache::Priority::LOW, false));
+ }
+ cache->WaitAll(results);
+ for (int i = 0; i < 6; ++i) {
+ if (i == 4) {  // DEFER_AND_FAIL: a handle is returned but it carries no value
+ ASSERT_EQ(cache->Value(results[i]), nullptr);
+ } else if (i == 5) {  // FAIL: no handle is returned, so there is nothing to release
+ ASSERT_EQ(results[i], nullptr);
+ continue;
+ } else {  // keys 0..3 (including the DEFER key 3) must return the stored payload
+ TestItem* item = static_cast<TestItem*>(cache->Value(results[i]));
+ ASSERT_EQ(item->ToString(), values[i]);
+ }
+ cache->Release(results[i]);
+ }
+
+ cache.reset();
+ secondary_cache.reset();
+}
+
+// In this test, we have one KV pair per data block. We indirectly determine
+// the cache key associated with each data block (and thus each KV) by using
+// a sync point callback in TestSecondaryCache::Lookup. We then control the
+// lookup result by setting the ResultMap.
+TEST_F(DBSecondaryCacheTest, TestSecondaryCacheMultiGet) {
+ LRUCacheOptions opts(1 << 20 /* capacity */, 0 /* num_shard_bits */,
+ false /* strict_capacity_limit */,
+ 0.5 /* high_pri_pool_ratio */,
+ nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+ kDontChargeCacheMetadata);
+ std::shared_ptr<TestSecondaryCache> secondary_cache(
+ new TestSecondaryCache(2048 * 1024));
+ opts.secondary_cache = secondary_cache;
+ std::shared_ptr<Cache> cache = NewLRUCache(opts);
+ BlockBasedTableOptions table_options;
+ table_options.block_cache = cache;
+ table_options.block_size = 4 * 1024;
+ table_options.cache_index_and_filter_blocks = false;
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.paranoid_file_checks = true;
+ DestroyAndReopen(options);
+ Random rnd(301);
+ const int N = 8;
+ std::vector<std::string> keys;  // despite the name, holds the expected VALUE for Key(i)
+ for (int i = 0; i < N; i++) {
+ std::string p_v = rnd.RandomString(4000);
+ keys.emplace_back(p_v);
+ ASSERT_OK(Put(Key(i), p_v));
+ }
+
+ ASSERT_OK(Flush());
+ // After Flush is successful, RocksDB does the paranoid check for the new
+ // SST file. This will try to look up all data blocks in the secondary
+ // cache.
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 8u);
+
+ cache->SetCapacity(0);  // zero capacity; the assertion below expects 8 secondary inserts
+ ASSERT_EQ(secondary_cache->num_inserts(), 8u);
+ cache->SetCapacity(1 << 20);
+
+ std::vector<std::string> cache_keys;  // cache keys recorded by the sync point callback, in Get order
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "TestSecondaryCache::Lookup", [&cache_keys](void* key) -> void {
+ cache_keys.emplace_back(*(static_cast<std::string*>(key)));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ for (int i = 0; i < N; ++i) {
+ std::string v = Get(Key(i));
+ ASSERT_EQ(4000, v.size());
+ ASSERT_EQ(v, keys[i]);
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(secondary_cache->num_lookups(), 16u);
+ cache->SetCapacity(0);
+ cache->SetCapacity(1 << 20);
+
+ ASSERT_EQ(Get(Key(2)), keys[2]);
+ ASSERT_EQ(Get(Key(7)), keys[7]);
+ secondary_cache->SetResultMap(
+ {{cache_keys[3], TestSecondaryCache::ResultType::DEFER},
+ {cache_keys[4], TestSecondaryCache::ResultType::DEFER_AND_FAIL},
+ {cache_keys[5], TestSecondaryCache::ResultType::FAIL}});
+
+ std::vector<std::string> mget_keys(
+ {Key(0), Key(1), Key(2), Key(3), Key(4), Key(5), Key(6), Key(7)});
+ std::vector<PinnableSlice> values(mget_keys.size());
+ std::vector<Status> s(keys.size());  // NOTE(review): sized from `keys`; both vectors have 8 entries here
+ std::vector<Slice> key_slices;
+ for (const std::string& key : mget_keys) {
+ key_slices.emplace_back(key);
+ }
+ uint32_t num_lookups = secondary_cache->num_lookups();
+ dbfull()->MultiGet(ReadOptions(), dbfull()->DefaultColumnFamily(),
+ key_slices.size(), key_slices.data(), values.data(),
+ s.data(), false);
+ ASSERT_EQ(secondary_cache->num_lookups(), num_lookups + 5);
+ for (int i = 0; i < N; ++i) {  // every key must be returned correctly despite the DEFER/FAIL results
+ ASSERT_OK(s[i]);
+ ASSERT_EQ(values[i].ToString(), keys[i]);
+ values[i].Reset();
+ }
+ Destroy(options);
+}
+
+class LRUCacheWithStat : public LRUCache {  // LRUCache that counts calls to Insert() and Lookup()
+ public:
+ LRUCacheWithStat(
+ size_t _capacity, int _num_shard_bits, bool _strict_capacity_limit,
+ double _high_pri_pool_ratio, double _low_pri_pool_ratio,
+ std::shared_ptr<MemoryAllocator> _memory_allocator = nullptr,
+ bool _use_adaptive_mutex = kDefaultToAdaptiveMutex,
+ CacheMetadataChargePolicy _metadata_charge_policy =
+ kDontChargeCacheMetadata,
+ const std::shared_ptr<SecondaryCache>& _secondary_cache = nullptr)
+ : LRUCache(_capacity, _num_shard_bits, _strict_capacity_limit,
+ _high_pri_pool_ratio, _low_pri_pool_ratio, _memory_allocator,
+ _use_adaptive_mutex, _metadata_charge_policy,
+ _secondary_cache) {  // all arguments are forwarded unchanged to LRUCache
+ insert_count_ = 0;
+ lookup_count_ = 0;
+ }
+ ~LRUCacheWithStat() {}
+
+ Status Insert(const Slice& key, void* value, size_t charge, DeleterFn deleter,
+ Handle** handle, Priority priority) override {  // counts the call, then delegates
+ insert_count_++;
+ return LRUCache::Insert(key, value, charge, deleter, handle, priority);
+ }
+ Status Insert(const Slice& key, void* value, const CacheItemHelper* helper,
+ size_t charge, Handle** handle = nullptr,
+ Priority priority = Priority::LOW) override {  // helper-based overload; same counter
+ insert_count_++;
+ return LRUCache::Insert(key, value, helper, charge, handle, priority);
+ }
+ Handle* Lookup(const Slice& key, Statistics* stats) override {  // counts the call, then delegates
+ lookup_count_++;
+ return LRUCache::Lookup(key, stats);
+ }
+ Handle* Lookup(const Slice& key, const CacheItemHelper* helper,
+ const CreateCallback& create_cb, Priority priority, bool wait,
+ Statistics* stats = nullptr) override {  // helper-based overload; same counter
+ lookup_count_++;
+ return LRUCache::Lookup(key, helper, create_cb, priority, wait, stats);
+ }
+
+ uint32_t GetInsertCount() { return insert_count_; }  // Insert() calls since construction or ResetCount()
+ uint32_t GetLookupcount() { return lookup_count_; }  // Lookup() calls since construction or ResetCount()
+ void ResetCount() {  // sets both counters back to zero
+ insert_count_ = 0;
+ lookup_count_ = 0;
+ }
+
+ private:
+ uint32_t insert_count_;  // NOTE(review): plain integer, not atomic; confirm callers never race
+ uint32_t lookup_count_;  // NOTE(review): plain integer, not atomic; confirm callers never race
+};
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBSecondaryCacheTest, LRUCacheDumpLoadBasic) {  // dump a warm block cache to a file, reload it into a secondary cache
+ LRUCacheOptions cache_opts(1024 * 1024 /* capacity */, 0 /* num_shard_bits */,
+ false /* strict_capacity_limit */,
+ 0.5 /* high_pri_pool_ratio */,
+ nullptr /* memory_allocator */,
+ kDefaultToAdaptiveMutex, kDontChargeCacheMetadata);
+ LRUCacheWithStat* tmp_cache = new LRUCacheWithStat(
+ cache_opts.capacity, cache_opts.num_shard_bits,
+ cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio,
+ cache_opts.low_pri_pool_ratio, cache_opts.memory_allocator,
+ cache_opts.use_adaptive_mutex, cache_opts.metadata_charge_policy,
+ cache_opts.secondary_cache);
+ std::shared_ptr<Cache> cache(tmp_cache);  // the shared_ptr owns tmp_cache; the raw pointer is kept to read counters
+ BlockBasedTableOptions table_options;
+ table_options.block_cache = cache;
+ table_options.block_size = 4 * 1024;
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.env = fault_env_.get();
+ DestroyAndReopen(options);
+ fault_fs_->SetFailGetUniqueId(true);
+
+ Random rnd(301);
+ const int N = 256;
+ std::vector<std::string> value;
+ char buf[1000];
+ memset(buf, 'a', 1000);
+ value.resize(N);
+ for (int i = 0; i < N; i++) {
+ // std::string p_v = rnd.RandomString(1000);
+ std::string p_v(buf, 1000);  // every value is the same 1000 'a' characters
+ value[i] = p_v;
+ ASSERT_OK(Put(Key(i), p_v));
+ }
+ ASSERT_OK(Flush());
+ Compact("a", "z");
+
+ // Read all the key-value pairs, so that all the blocks should be in the
+ // cache.
+ uint32_t start_insert = tmp_cache->GetInsertCount();
+ uint32_t start_lookup = tmp_cache->GetLookupcount();
+ std::string v;
+ for (int i = 0; i < N; i++) {
+ v = Get(Key(i));
+ ASSERT_EQ(v, value[i]);
+ }
+ uint32_t dump_insert = tmp_cache->GetInsertCount() - start_insert;
+ uint32_t dump_lookup = tmp_cache->GetLookupcount() - start_lookup;
+ ASSERT_EQ(63,
+ static_cast<int>(dump_insert)); // the insert in the block cache
+ ASSERT_EQ(256,
+ static_cast<int>(dump_lookup)); // the lookup in the block cache
+ // We have enough blocks in the block cache
+
+ CacheDumpOptions cd_options;
+ cd_options.clock = fault_env_->GetSystemClock().get();
+ std::string dump_path = db_->GetName() + "/cache_dump";
+ std::unique_ptr<CacheDumpWriter> dump_writer;
+ Status s = NewToFileCacheDumpWriter(fault_fs_, FileOptions(), dump_path,
+ &dump_writer);
+ ASSERT_OK(s);
+ std::unique_ptr<CacheDumper> cache_dumper;
+ s = NewDefaultCacheDumper(cd_options, cache, std::move(dump_writer),
+ &cache_dumper);
+ ASSERT_OK(s);
+ std::vector<DB*> db_list;
+ db_list.push_back(db_);
+ s = cache_dumper->SetDumpFilter(db_list);
+ ASSERT_OK(s);
+ s = cache_dumper->DumpCacheEntriesToWriter();
+ ASSERT_OK(s);
+ cache_dumper.reset();
+
+ // Create a new, empty block cache with a secondary cache attached. Before
+ // doing any Get, load the dump into the secondary cache.
+ std::shared_ptr<TestSecondaryCache> secondary_cache =
+ std::make_shared<TestSecondaryCache>(2048 * 1024);
+ cache_opts.secondary_cache = secondary_cache;
+ tmp_cache = new LRUCacheWithStat(
+ cache_opts.capacity, cache_opts.num_shard_bits,
+ cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio,
+ cache_opts.low_pri_pool_ratio, cache_opts.memory_allocator,
+ cache_opts.use_adaptive_mutex, cache_opts.metadata_charge_policy,
+ cache_opts.secondary_cache);
+ std::shared_ptr<Cache> cache_new(tmp_cache);
+ table_options.block_cache = cache_new;
+ table_options.block_size = 4 * 1024;
+ options.create_if_missing = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.env = fault_env_.get();
+
+ // Start loading the dumped data; the loader writes into the secondary cache.
+ start_insert = secondary_cache->num_inserts();
+ start_lookup = secondary_cache->num_lookups();
+ std::unique_ptr<CacheDumpReader> dump_reader;
+ s = NewFromFileCacheDumpReader(fault_fs_, FileOptions(), dump_path,
+ &dump_reader);
+ ASSERT_OK(s);
+ std::unique_ptr<CacheDumpedLoader> cache_loader;
+ s = NewDefaultCacheDumpedLoader(cd_options, table_options, secondary_cache,
+ std::move(dump_reader), &cache_loader);
+ ASSERT_OK(s);
+ s = cache_loader->RestoreCacheEntriesToSecondaryCache();
+ ASSERT_OK(s);
+ uint32_t load_insert = secondary_cache->num_inserts() - start_insert;
+ uint32_t load_lookup = secondary_cache->num_lookups() - start_lookup;
+ // Check the number of entries inserted into the secondary cache by the load.
+ ASSERT_EQ(64, static_cast<int>(load_insert));
+ ASSERT_EQ(0, static_cast<int>(load_lookup));
+ ASSERT_OK(s);
+
+ Reopen(options);  // reopen the DB so that it uses the new block cache
+
+ // After load, we do the Get again
+ start_insert = secondary_cache->num_inserts();
+ start_lookup = secondary_cache->num_lookups();
+ uint32_t cache_insert = tmp_cache->GetInsertCount();
+ uint32_t cache_lookup = tmp_cache->GetLookupcount();
+ for (int i = 0; i < N; i++) {
+ v = Get(Key(i));
+ ASSERT_EQ(v, value[i]);
+ }
+ uint32_t final_insert = secondary_cache->num_inserts() - start_insert;
+ uint32_t final_lookup = secondary_cache->num_lookups() - start_lookup;
+ // no insert to secondary cache
+ ASSERT_EQ(0, static_cast<int>(final_insert));
+ // lookup the secondary to get all blocks
+ ASSERT_EQ(64, static_cast<int>(final_lookup));
+ uint32_t block_insert = tmp_cache->GetInsertCount() - cache_insert;
+ uint32_t block_lookup = tmp_cache->GetLookupcount() - cache_lookup;
+ // Check the new block cache insert and lookup, should be no insert since all
+ // blocks are from the secondary cache.
+ ASSERT_EQ(0, static_cast<int>(block_insert));
+ ASSERT_EQ(256, static_cast<int>(block_lookup));
+
+ fault_fs_->SetFailGetUniqueId(false);
+ Destroy(options);
+}
+
+TEST_F(DBSecondaryCacheTest, LRUCacheDumpLoadWithFilter) {  // two DBs share one cache; only db1 is in the dump filter
+ LRUCacheOptions cache_opts(1024 * 1024 /* capacity */, 0 /* num_shard_bits */,
+ false /* strict_capacity_limit */,
+ 0.5 /* high_pri_pool_ratio */,
+ nullptr /* memory_allocator */,
+ kDefaultToAdaptiveMutex, kDontChargeCacheMetadata);
+ LRUCacheWithStat* tmp_cache = new LRUCacheWithStat(
+ cache_opts.capacity, cache_opts.num_shard_bits,
+ cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio,
+ cache_opts.low_pri_pool_ratio, cache_opts.memory_allocator,
+ cache_opts.use_adaptive_mutex, cache_opts.metadata_charge_policy,
+ cache_opts.secondary_cache);
+ std::shared_ptr<Cache> cache(tmp_cache);  // the shared_ptr owns tmp_cache; the raw pointer is kept to read counters
+ BlockBasedTableOptions table_options;
+ table_options.block_cache = cache;
+ table_options.block_size = 4 * 1024;
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.env = fault_env_.get();
+ std::string dbname1 = test::PerThreadDBPath("db_1");
+ ASSERT_OK(DestroyDB(dbname1, options));
+ DB* db1 = nullptr;
+ ASSERT_OK(DB::Open(options, dbname1, &db1));
+ std::string dbname2 = test::PerThreadDBPath("db_2");
+ ASSERT_OK(DestroyDB(dbname2, options));
+ DB* db2 = nullptr;
+ ASSERT_OK(DB::Open(options, dbname2, &db2));
+ fault_fs_->SetFailGetUniqueId(true);
+
+ // Write the KVs to DB1
+ Random rnd(301);
+ const int N = 256;
+ std::vector<std::string> value1;
+ WriteOptions wo;
+ char buf[1000];
+ memset(buf, 'a', 1000);
+ value1.resize(N);
+ for (int i = 0; i < N; i++) {
+ std::string p_v(buf, 1000);  // every db1 value is 1000 'a' characters
+ value1[i] = p_v;
+ ASSERT_OK(db1->Put(wo, Key(i), p_v));
+ }
+ ASSERT_OK(db1->Flush(FlushOptions()));
+ Slice bg("a");
+ Slice ed("b");
+ ASSERT_OK(db1->CompactRange(CompactRangeOptions(), &bg, &ed));
+
+ // Write the KVs to DB2
+ std::vector<std::string> value2;
+ memset(buf, 'b', 1000);
+ value2.resize(N);
+ for (int i = 0; i < N; i++) {
+ std::string p_v(buf, 1000);  // every db2 value is 1000 'b' characters
+ value2[i] = p_v;
+ ASSERT_OK(db2->Put(wo, Key(i), p_v));
+ }
+ ASSERT_OK(db2->Flush(FlushOptions()));
+ ASSERT_OK(db2->CompactRange(CompactRangeOptions(), &bg, &ed));
+
+ // Read all the key-value pairs of both DBs, so that all the blocks should be
+ // in the cache.
+ uint32_t start_insert = tmp_cache->GetInsertCount();
+ uint32_t start_lookup = tmp_cache->GetLookupcount();
+ ReadOptions ro;
+ std::string v;
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(db1->Get(ro, Key(i), &v));
+ ASSERT_EQ(v, value1[i]);
+ }
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(db2->Get(ro, Key(i), &v));
+ ASSERT_EQ(v, value2[i]);
+ }
+ uint32_t dump_insert = tmp_cache->GetInsertCount() - start_insert;
+ uint32_t dump_lookup = tmp_cache->GetLookupcount() - start_lookup;
+ ASSERT_EQ(128,
+ static_cast<int>(dump_insert)); // the insert in the block cache
+ ASSERT_EQ(512,
+ static_cast<int>(dump_lookup)); // the lookup in the block cache
+ // We have enough blocks in the block cache
+
+ CacheDumpOptions cd_options;
+ cd_options.clock = fault_env_->GetSystemClock().get();
+ std::string dump_path = db1->GetName() + "/cache_dump";
+ std::unique_ptr<CacheDumpWriter> dump_writer;
+ Status s = NewToFileCacheDumpWriter(fault_fs_, FileOptions(), dump_path,
+ &dump_writer);
+ ASSERT_OK(s);
+ std::unique_ptr<CacheDumper> cache_dumper;
+ s = NewDefaultCacheDumper(cd_options, cache, std::move(dump_writer),
+ &cache_dumper);
+ ASSERT_OK(s);
+ std::vector<DB*> db_list;
+ db_list.push_back(db1);  // only db1 is passed to the dump filter
+ s = cache_dumper->SetDumpFilter(db_list);
+ ASSERT_OK(s);
+ s = cache_dumper->DumpCacheEntriesToWriter();
+ ASSERT_OK(s);
+ cache_dumper.reset();
+
+ // Create a new, empty block cache with a secondary cache attached. Before
+ // doing any Get, load the dump into the secondary cache.
+ std::shared_ptr<TestSecondaryCache> secondary_cache =
+ std::make_shared<TestSecondaryCache>(2048 * 1024);
+ cache_opts.secondary_cache = secondary_cache;
+ tmp_cache = new LRUCacheWithStat(
+ cache_opts.capacity, cache_opts.num_shard_bits,
+ cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio,
+ cache_opts.low_pri_pool_ratio, cache_opts.memory_allocator,
+ cache_opts.use_adaptive_mutex, cache_opts.metadata_charge_policy,
+ cache_opts.secondary_cache);
+ std::shared_ptr<Cache> cache_new(tmp_cache);
+ table_options.block_cache = cache_new;
+ table_options.block_size = 4 * 1024;
+ options.create_if_missing = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.env = fault_env_.get();
+
+ // Start the cache loading process
+ start_insert = secondary_cache->num_inserts();
+ start_lookup = secondary_cache->num_lookups();
+ std::unique_ptr<CacheDumpReader> dump_reader;
+ s = NewFromFileCacheDumpReader(fault_fs_, FileOptions(), dump_path,
+ &dump_reader);
+ ASSERT_OK(s);
+ std::unique_ptr<CacheDumpedLoader> cache_loader;
+ s = NewDefaultCacheDumpedLoader(cd_options, table_options, secondary_cache,
+ std::move(dump_reader), &cache_loader);
+ ASSERT_OK(s);
+ s = cache_loader->RestoreCacheEntriesToSecondaryCache();
+ ASSERT_OK(s);
+ uint32_t load_insert = secondary_cache->num_inserts() - start_insert;
+ uint32_t load_lookup = secondary_cache->num_lookups() - start_lookup;
+ // Check the number of entries inserted into the secondary cache by the load.
+ ASSERT_EQ(64, static_cast<int>(load_insert));
+ ASSERT_EQ(0, static_cast<int>(load_lookup));
+ ASSERT_OK(s);
+
+ ASSERT_OK(db1->Close());
+ delete db1;
+ ASSERT_OK(DB::Open(options, dbname1, &db1));  // reopen db1 with the new block cache
+
+ // After load, we do the Get again. To validate the cache, we do not allow any
+ // I/O, so we set the file system to inactive.
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ fault_fs_->SetFilesystemActive(false, error_msg);
+ start_insert = secondary_cache->num_inserts();
+ start_lookup = secondary_cache->num_lookups();
+ uint32_t cache_insert = tmp_cache->GetInsertCount();
+ uint32_t cache_lookup = tmp_cache->GetLookupcount();
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(db1->Get(ro, Key(i), &v));
+ ASSERT_EQ(v, value1[i]);
+ }
+ uint32_t final_insert = secondary_cache->num_inserts() - start_insert;
+ uint32_t final_lookup = secondary_cache->num_lookups() - start_lookup;
+ // no insert to secondary cache
+ ASSERT_EQ(0, static_cast<int>(final_insert));
+ // lookup the secondary to get all blocks
+ ASSERT_EQ(64, static_cast<int>(final_lookup));
+ uint32_t block_insert = tmp_cache->GetInsertCount() - cache_insert;
+ uint32_t block_lookup = tmp_cache->GetLookupcount() - cache_lookup;
+ // Check the new block cache insert and lookup, should be no insert since all
+ // blocks are from the secondary cache.
+ ASSERT_EQ(0, static_cast<int>(block_insert));
+ ASSERT_EQ(256, static_cast<int>(block_lookup));
+ fault_fs_->SetFailGetUniqueId(false);
+ fault_fs_->SetFilesystemActive(true);
+ delete db1;
+ delete db2;
+ ASSERT_OK(DestroyDB(dbname1, options));
+ ASSERT_OK(DestroyDB(dbname2, options));
+}
+
+// Test the option not to use the secondary cache in a certain DB.
+TEST_F(DBSecondaryCacheTest, TestSecondaryCacheOptionBasic) {
+ LRUCacheOptions opts(4 * 1024 /* capacity */, 0 /* num_shard_bits */,
+ false /* strict_capacity_limit */,
+ 0.5 /* high_pri_pool_ratio */,
+ nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+ kDontChargeCacheMetadata);
+ std::shared_ptr<TestSecondaryCache> secondary_cache(
+ new TestSecondaryCache(2048 * 1024));
+ opts.secondary_cache = secondary_cache;
+ std::shared_ptr<Cache> cache = NewLRUCache(opts);
+ BlockBasedTableOptions table_options;
+ table_options.block_cache = cache;
+ table_options.block_size = 4 * 1024;
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.env = fault_env_.get();
+ fault_fs_->SetFailGetUniqueId(true);
+ options.lowest_used_cache_tier = CacheTier::kVolatileTier;  // the option under test
+
+ // Enable the paranoid file check, so that after a flush the file is read
+ // back and all of its blocks are accessed.
+ options.paranoid_file_checks = true;
+ DestroyAndReopen(options);
+ Random rnd(301);
+ const int N = 6;
+ for (int i = 0; i < N; i++) {
+ std::string p_v = rnd.RandomString(1007);
+ ASSERT_OK(Put(Key(i), p_v));
+ }
+
+ ASSERT_OK(Flush());
+
+ for (int i = 0; i < N; i++) {  // second batch: keys 70..75, flushed into a second SST file
+ std::string p_v = rnd.RandomString(1007);
+ ASSERT_OK(Put(Key(i + 70), p_v));
+ }
+
+ ASSERT_OK(Flush());
+
+ // Flush will trigger the paranoid check and read blocks. But only block cache
+ // will be read. No operations for secondary cache.
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 0u);
+
+ Compact("a", "z");
+
+ // Compaction will also insert and evict blocks in the block cache, but
+ // there are still no operations for the secondary cache.
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 0u);
+
+ std::string v = Get(Key(0));
+ ASSERT_EQ(1007, v.size());
+
+ // Check the data in first block. Cache miss, directly read from SST file.
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 0u);
+
+ v = Get(Key(5));
+ ASSERT_EQ(1007, v.size());
+
+ // Check the second block.
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 0u);
+
+ v = Get(Key(5));
+ ASSERT_EQ(1007, v.size());
+
+ // block cache hit
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 0u);
+
+ v = Get(Key(70));
+ ASSERT_EQ(1007, v.size());
+
+ // Check the first block in the second SST file. Cache miss and trigger SST
+ // file read. No operations for secondary cache.
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 0u);
+
+ v = Get(Key(75));
+ ASSERT_EQ(1007, v.size());
+
+ // Check the second block in the second SST file. Cache miss and trigger SST
+ // file read. No operations for secondary cache.
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 0u);
+
+ Destroy(options);
+}
+
+// We disable the secondary cache in DBOptions at first. Then we reopen the DB
+// with new options, which set the lowest_used_cache_tier to
+// kNonVolatileBlockTier, so that the secondary cache will be used.
+TEST_F(DBSecondaryCacheTest, TestSecondaryCacheOptionChange) {
+ LRUCacheOptions opts(4 * 1024 /* capacity */, 0 /* num_shard_bits */,
+ false /* strict_capacity_limit */,
+ 0.5 /* high_pri_pool_ratio */,
+ nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+ kDontChargeCacheMetadata);
+ std::shared_ptr<TestSecondaryCache> secondary_cache(
+ new TestSecondaryCache(2048 * 1024));
+ opts.secondary_cache = secondary_cache;
+ std::shared_ptr<Cache> cache = NewLRUCache(opts);
+ BlockBasedTableOptions table_options;
+ table_options.block_cache = cache;
+ table_options.block_size = 4 * 1024;
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.env = fault_env_.get();
+ fault_fs_->SetFailGetUniqueId(true);
+ options.lowest_used_cache_tier = CacheTier::kVolatileTier;  // secondary cache disabled at first
+
+ // Enable the paranoid file check, so that after a flush the file is read
+ // back and all of its blocks are accessed.
+ options.paranoid_file_checks = true;
+ DestroyAndReopen(options);
+ Random rnd(301);
+ const int N = 6;
+ for (int i = 0; i < N; i++) {
+ std::string p_v = rnd.RandomString(1007);
+ ASSERT_OK(Put(Key(i), p_v));
+ }
+
+ ASSERT_OK(Flush());
+
+ for (int i = 0; i < N; i++) {  // second batch: keys 70..75, flushed into a second SST file
+ std::string p_v = rnd.RandomString(1007);
+ ASSERT_OK(Put(Key(i + 70), p_v));
+ }
+
+ ASSERT_OK(Flush());
+
+ // Flush will trigger the paranoid check and read blocks. But only block cache
+ // will be read.
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 0u);
+
+ Compact("a", "z");
+
+ // Compaction will also insert and evict blocks in the block cache, but
+ // there are still no operations for the secondary cache.
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 0u);
+
+ std::string v = Get(Key(0));
+ ASSERT_EQ(1007, v.size());
+
+ // Check the data in first block. Cache miss, directly read from SST file.
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 0u);
+
+ v = Get(Key(5));
+ ASSERT_EQ(1007, v.size());
+
+ // Check the second block.
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 0u);
+
+ v = Get(Key(5));
+ ASSERT_EQ(1007, v.size());
+
+ // block cache hit
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 0u);
+
+ // Change the option to enable secondary cache after we Reopen the DB
+ options.lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier;
+ Reopen(options);
+
+ v = Get(Key(70));
+ ASSERT_EQ(1007, v.size());
+
+ // Enable the secondary cache, trigger lookup of the first block in second SST
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 1u);
+
+ v = Get(Key(75));
+ ASSERT_EQ(1007, v.size());
+
+ // trigger lookup of the second block in second SST
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 2u);
+ Destroy(options);
+}
+
+// Two DB test. We create 2 DBs sharing the same block cache and secondary
+// cache. We disable the secondary cache option for DB2.
+TEST_F(DBSecondaryCacheTest, TestSecondaryCacheOptionTwoDB) {
+ LRUCacheOptions opts(4 * 1024 /* capacity */, 0 /* num_shard_bits */,
+ false /* strict_capacity_limit */,
+ 0.5 /* high_pri_pool_ratio */,
+ nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+ kDontChargeCacheMetadata);
+ std::shared_ptr<TestSecondaryCache> secondary_cache(
+ new TestSecondaryCache(2048 * 1024));
+ opts.secondary_cache = secondary_cache;
+ std::shared_ptr<Cache> cache = NewLRUCache(opts);
+ BlockBasedTableOptions table_options;
+ table_options.block_cache = cache;
+ table_options.block_size = 4 * 1024;
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.env = fault_env_.get();
+ options.paranoid_file_checks = true;
+ std::string dbname1 = test::PerThreadDBPath("db_t_1");
+ ASSERT_OK(DestroyDB(dbname1, options));
+ DB* db1 = nullptr;
+ ASSERT_OK(DB::Open(options, dbname1, &db1));
+ std::string dbname2 = test::PerThreadDBPath("db_t_2");
+ ASSERT_OK(DestroyDB(dbname2, options));
+ DB* db2 = nullptr;
+ Options options2 = options;  // same table factory, so db2 shares the block cache with db1
+ options2.lowest_used_cache_tier = CacheTier::kVolatileTier;  // db2 does not use the secondary cache
+ ASSERT_OK(DB::Open(options2, dbname2, &db2));
+ fault_fs_->SetFailGetUniqueId(true);
+
+ WriteOptions wo;
+ Random rnd(301);
+ const int N = 6;
+ for (int i = 0; i < N; i++) {
+ std::string p_v = rnd.RandomString(1007);
+ ASSERT_OK(db1->Put(wo, Key(i), p_v));
+ }
+
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 0u);
+ ASSERT_OK(db1->Flush(FlushOptions()));
+
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 2u);
+
+ for (int i = 0; i < N; i++) {
+ std::string p_v = rnd.RandomString(1007);
+ ASSERT_OK(db2->Put(wo, Key(i), p_v));
+ }
+
+ // No change in the secondary cache, since it is disabled in DB2
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 2u);
+ ASSERT_OK(db2->Flush(FlushOptions()));
+ ASSERT_EQ(secondary_cache->num_inserts(), 1u);  // NOTE(review): presumably an evicted db1 block; confirm
+ ASSERT_EQ(secondary_cache->num_lookups(), 2u);
+
+ Slice bg("a");
+ Slice ed("b");
+ ASSERT_OK(db1->CompactRange(CompactRangeOptions(), &bg, &ed));
+ ASSERT_OK(db2->CompactRange(CompactRangeOptions(), &bg, &ed));
+
+ ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 2u);
+
+ ReadOptions ro;
+ std::string v;
+ ASSERT_OK(db1->Get(ro, Key(0), &v));
+ ASSERT_EQ(1007, v.size());
+
+ // DB 1 looks up block 1, which misses in the block cache and triggers a
+ // secondary cache lookup.
+ ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 3u);
+
+ ASSERT_OK(db1->Get(ro, Key(5), &v));
+ ASSERT_EQ(1007, v.size());
+
+ // DB 1 looks up the second block, which misses in the block cache and
+ // triggers a secondary cache lookup.
+ ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 4u);
+
+ ASSERT_OK(db2->Get(ro, Key(0), &v));
+ ASSERT_EQ(1007, v.size());
+
+ // The secondary cache is not enabled for db2, so there is no lookup in the
+ // secondary cache.
+ ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 4u);
+
+ ASSERT_OK(db2->Get(ro, Key(5), &v));
+ ASSERT_EQ(1007, v.size());
+
+ // The secondary cache is not enabled for db2, so there is no lookup in the
+ // secondary cache.
+ ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 4u);
+
+ fault_fs_->SetFailGetUniqueId(false);
+ fault_fs_->SetFilesystemActive(true);
+ delete db1;
+ delete db2;
+ ASSERT_OK(DestroyDB(dbname1, options));
+ ASSERT_OK(DestroyDB(dbname2, options));
+}
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {  // entry point of the test binary
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);  // GoogleTest parses and removes the flags it recognizes
+ return RUN_ALL_TESTS();  // the result of the test run becomes the process exit code
+}
diff --git a/src/rocksdb/cache/secondary_cache.cc b/src/rocksdb/cache/secondary_cache.cc
new file mode 100644
index 000000000..84352db71
--- /dev/null
+++ b/src/rocksdb/cache/secondary_cache.cc
@@ -0,0 +1,32 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/secondary_cache.h"
+
+#include "cache/cache_entry_roles.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// Size callback used by InsertSaved(): `obj` is treated as a Slice* and the
+// slice's size() is returned.
+size_t SliceSize(void* obj) {
+  const Slice* slice = static_cast<Slice*>(obj);
+  return slice->size();
+}
+
+// Save callback used by InsertSaved(): `from_obj` is treated as a Slice* and
+// `length` bytes starting at byte `from_offset` of its data are copied into
+// `out`. No bounds checking is done here; the result is always OK.
+Status SliceSaveTo(void* from_obj, size_t from_offset, size_t length,
+                   void* out) {
+  const Slice* src = static_cast<Slice*>(from_obj);
+  const char* first_byte = src->data() + from_offset;
+  std::memcpy(out, first_byte, length);
+  return Status::OK();
+}
+
+} // namespace
+
+// Inserts the bytes in `saved` under `key` by handing Insert() a pointer to
+// the slice together with a helper whose callbacks (SliceSize / SliceSaveTo)
+// report the slice's size and copy its bytes.
+Status SecondaryCache::InsertSaved(const Slice& key, const Slice& saved) {
+  // One helper instance is shared by all calls; its deleter comes from
+  // GetNoopDeleterForRole<CacheEntryRole::kMisc>().
+  static Cache::CacheItemHelper helper{
+      &SliceSize, &SliceSaveTo, GetNoopDeleterForRole<CacheEntryRole::kMisc>()};
+  // NOTE: depends on Insert() being synchronous, not keeping pointer `&saved`
+  // The helper callbacks above only read through this pointer.
+  Slice* saved_ptr = const_cast<Slice*>(&saved);
+  return Insert(key, saved_ptr, &helper);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/sharded_cache.cc b/src/rocksdb/cache/sharded_cache.cc
new file mode 100644
index 000000000..9ebca3ba8
--- /dev/null
+++ b/src/rocksdb/cache/sharded_cache.cc
@@ -0,0 +1,100 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "cache/sharded_cache.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+
+#include "util/hash.h"
+#include "util/math.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Stores the initial configuration shared by all shards.
+//  - capacity / strict_capacity_limit: initial values of the dynamically
+//    configurable parameters (read elsewhere under config_mutex_).
+//  - num_shard_bits: the shard mask is (1 << num_shard_bits) - 1 computed in
+//    uint32_t, so the value must be in [0, 31] for the shift to be defined.
+//  - allocator: forwarded to the Cache base class.
+// IDs handed out by NewId() start at 1.
+ShardedCacheBase::ShardedCacheBase(size_t capacity, int num_shard_bits,
+ bool strict_capacity_limit,
+ std::shared_ptr<MemoryAllocator> allocator)
+ : Cache(std::move(allocator)),
+ last_id_(1),
+ shard_mask_((uint32_t{1} << num_shard_bits) - 1),
+ strict_capacity_limit_(strict_capacity_limit),
+ capacity_(capacity) {}
+
+// Divides `capacity` by the number of shards, rounding up, so that the
+// per-shard capacities together cover at least `capacity`.
+size_t ShardedCacheBase::ComputePerShardCapacity(size_t capacity) const {
+  const uint32_t shard_count = GetNumShards();
+  // Ceiling division: bias the numerator by (divisor - 1) before dividing.
+  const size_t biased = capacity + (shard_count - 1);
+  return biased / shard_count;
+}
+
+// Per-shard share of the currently configured total capacity.
+size_t ShardedCacheBase::GetPerShardCapacity() const {
+  const size_t total_capacity = GetCapacity();
+  return ComputePerShardCapacity(total_capacity);
+}
+
+// Returns the current value of the ID counter and advances it by one.
+// Relaxed ordering is used: only the atomicity of the increment matters here.
+uint64_t ShardedCacheBase::NewId() {
+  constexpr uint64_t kIncrement = 1;
+  return last_id_.fetch_add(kIncrement, std::memory_order_relaxed);
+}
+
+// Reads the configured total capacity while holding config_mutex_.
+size_t ShardedCacheBase::GetCapacity() const {
+  MutexLock guard(&config_mutex_);
+  const size_t current = capacity_;
+  return current;
+}
+
+// Reads the strict-capacity-limit flag while holding config_mutex_.
+bool ShardedCacheBase::HasStrictCapacityLimit() const {
+  MutexLock guard(&config_mutex_);
+  const bool strict = strict_capacity_limit_;
+  return strict;
+}
+
+// Usage attributed to a single entry: simply the charge recorded for
+// `handle`, as reported by GetCharge().
+size_t ShardedCacheBase::GetUsage(Handle* handle) const {
+  const size_t entry_charge = GetCharge(handle);
+  return entry_charge;
+}
+
+// Builds a multi-line, human-readable description of the cache configuration:
+// capacity, number of shard bits, strict-capacity flag and memory allocator
+// name, followed by whatever AppendPrintableOptions() contributes.
+std::string ShardedCacheBase::GetPrintableOptions() const {
+  constexpr int kLineSize = 200;
+  char line[kLineSize];
+  std::string options;
+  options.reserve(20000);
+  {
+    // capacity_ and strict_capacity_limit_ are read under config_mutex_.
+    MutexLock guard(&config_mutex_);
+    snprintf(line, kLineSize, " capacity : %" ROCKSDB_PRIszt "\n", capacity_);
+    options.append(line);
+    snprintf(line, kLineSize, " num_shard_bits : %d\n", GetNumShardBits());
+    options.append(line);
+    snprintf(line, kLineSize, " strict_capacity_limit : %d\n",
+             strict_capacity_limit_);
+    options.append(line);
+  }
+  // The allocator line and the subclass-specific options are produced without
+  // holding the config mutex.
+  const char* allocator_name =
+      memory_allocator() ? memory_allocator()->Name() : "None";
+  snprintf(line, kLineSize, " memory_allocator : %s\n", allocator_name);
+  options.append(line);
+  AppendPrintableOptions(options);
+  return options;
+}
+
+// Picks a default number of shard bits for a cache of `capacity` bytes:
+// floor(log2(capacity / min_shard_size)), clamped to the range [0, 6].
+// `min_shard_size` must be non-zero (it is used as a divisor).
+int GetDefaultCacheShardBits(size_t capacity, size_t min_shard_size) {
+  constexpr int kMaxShardBits = 6;
+  int bits = 0;
+  // Each halving that still leaves more than one shard is worth one bit.
+  for (size_t shards = capacity / min_shard_size;
+       shards > 1 && bits < kMaxShardBits; shards >>= 1) {
+    ++bits;
+  }
+  return bits;
+}
+
+// The shard mask is (1 << bits) - 1, so the number of shard bits equals the
+// number of one-bits in the mask.
+int ShardedCacheBase::GetNumShardBits() const {
+  const int shard_bits = BitsSetToOne(shard_mask_);
+  return shard_bits;
+}
+
+// Number of shards: one more than the shard mask.
+uint32_t ShardedCacheBase::GetNumShards() const {
+  const uint32_t shard_count = shard_mask_ + 1;
+  return shard_count;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/sharded_cache.h b/src/rocksdb/cache/sharded_cache.h
new file mode 100644
index 000000000..e3271cc7b
--- /dev/null
+++ b/src/rocksdb/cache/sharded_cache.h
@@ -0,0 +1,322 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <atomic>
+#include <cstdint>
+#include <string>
+
+#include "port/lang.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Optional base class for classes implementing the CacheShard concept
+// It stores the metadata charge policy and supplies default hash types and
+// hashing/sharding helpers. The operations a shard must implement itself are
+// listed (as documentation only) in the comment block further down.
+class CacheShardBase {
+ public:
+ // Remembers the metadata charge policy for use by the derived shard.
+ explicit CacheShardBase(CacheMetadataChargePolicy metadata_charge_policy)
+ : metadata_charge_policy_(metadata_charge_policy) {}
+
+ using DeleterFn = Cache::DeleterFn;
+
+ // Expected by concept CacheShard (TODO with C++20 support)
+ // Some Defaults
+ // Default: a shard contributes nothing to the printable options.
+ std::string GetPrintableOptions() const { return ""; }
+ // Default hash representation: a 64-bit value, passed by value.
+ using HashVal = uint64_t;
+ using HashCref = uint64_t;
+ // Default key hash: GetSliceNPHash64 over the key.
+ static inline HashVal ComputeHash(const Slice& key) {
+ return GetSliceNPHash64(key);
+ }
+ // Default 32-bit piece of the hash used to select a shard:
+ // Lower32of64(hash).
+ static inline uint32_t HashPieceForSharding(HashCref hash) {
+ return Lower32of64(hash);
+ }
+ // Default: nothing is appended.
+ void AppendPrintableOptions(std::string& /*str*/) const {}
+
+ // Must be provided for concept CacheShard (TODO with C++20 support)
+ /*
+ struct HandleImpl { // for concept HandleImpl
+ HashVal hash;
+ HashCref GetHash() const;
+ ...
+ };
+ Status Insert(const Slice& key, HashCref hash, void* value, size_t charge,
+ DeleterFn deleter, HandleImpl** handle,
+ Cache::Priority priority) = 0;
+ Status Insert(const Slice& key, HashCref hash, void* value,
+ const Cache::CacheItemHelper* helper, size_t charge,
+ HandleImpl** handle, Cache::Priority priority) = 0;
+ HandleImpl* Lookup(const Slice& key, HashCref hash) = 0;
+ HandleImpl* Lookup(const Slice& key, HashCref hash,
+ const Cache::CacheItemHelper* helper,
+ const Cache::CreateCallback& create_cb,
+ Cache::Priority priority, bool wait,
+ Statistics* stats) = 0;
+ bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref) = 0;
+ bool IsReady(HandleImpl* handle) = 0;
+ void Wait(HandleImpl* handle) = 0;
+ bool Ref(HandleImpl* handle) = 0;
+ void Erase(const Slice& key, HashCref hash) = 0;
+ void SetCapacity(size_t capacity) = 0;
+ void SetStrictCapacityLimit(bool strict_capacity_limit) = 0;
+ size_t GetUsage() const = 0;
+ size_t GetPinnedUsage() const = 0;
+ size_t GetOccupancyCount() const = 0;
+ size_t GetTableAddressCount() const = 0;
+ // Handles iterating over roughly `average_entries_per_lock` entries, using
+ // `state` to somehow record where it last ended up. Caller initially uses
+ // *state == 0 and implementation sets *state = SIZE_MAX to indicate
+ // completion.
+ void ApplyToSomeEntries(
+ const std::function<void(const Slice& key, void* value, size_t charge,
+ DeleterFn deleter)>& callback,
+ size_t average_entries_per_lock, size_t* state) = 0;
+ void EraseUnRefEntries() = 0;
+ */
+
+ protected:
+ // Policy passed to the constructor; immutable for the shard's lifetime.
+ const CacheMetadataChargePolicy metadata_charge_policy_;
+};
+
+// Portions of ShardedCache that do not depend on the template parameter
+// Holds the shard mask, the ID counter and the dynamically configurable
+// capacity / strict-capacity-limit settings together with their mutex.
+class ShardedCacheBase : public Cache {
+ public:
+ // The shard mask is derived from `num_shard_bits`; `memory_allocator` is
+ // forwarded to the Cache base class.
+ ShardedCacheBase(size_t capacity, int num_shard_bits,
+ bool strict_capacity_limit,
+ std::shared_ptr<MemoryAllocator> memory_allocator);
+ virtual ~ShardedCacheBase() = default;
+
+ // Number of bits of the sharding hash piece used to select a shard.
+ int GetNumShardBits() const;
+ // Total number of shards (shard mask + 1).
+ uint32_t GetNumShards() const;
+
+ // Returns the next value of an atomic counter that starts at 1.
+ uint64_t NewId() override;
+
+ // Both getters read their value while holding config_mutex_.
+ bool HasStrictCapacityLimit() const override;
+ size_t GetCapacity() const override;
+
+ using Cache::GetUsage;
+ // Usage of a single entry, i.e. the charge recorded for `handle`.
+ size_t GetUsage(Handle* handle) const override;
+ // Describes capacity, shard bits, strict limit and memory allocator, then
+ // appends the output of AppendPrintableOptions().
+ std::string GetPrintableOptions() const override;
+
+ protected: // fns
+ // Hook for derived classes to append implementation-specific options.
+ virtual void AppendPrintableOptions(std::string& str) const = 0;
+ // Per-shard share of the current capacity (rounded up).
+ size_t GetPerShardCapacity() const;
+ // Per-shard share of an arbitrary `capacity` (rounded up).
+ size_t ComputePerShardCapacity(size_t capacity) const;
+
+ protected: // data
+ std::atomic<uint64_t> last_id_; // For NewId
+ // (1 << num_shard_bits) - 1; fixed at construction.
+ const uint32_t shard_mask_;
+
+ // Dynamic configuration parameters, guarded by config_mutex_
+ bool strict_capacity_limit_;
+ size_t capacity_;
+ mutable port::Mutex config_mutex_;
+};
+
+// Generic cache interface that shards cache by hash of keys. 2^num_shard_bits
+// shards will be created, with capacity split evenly to each of the shards.
+// Keys are typically sharded by the lowest num_shard_bits bits of hash value
+// so that the upper bits of the hash value can keep a stable ordering of
+// table entries even as the table grows (using more upper hash bits).
+// See CacheShardBase above for what is expected of the CacheShard parameter.
+//
+// The constructor only allocates raw storage for the shards; the derived
+// class constructor must construct them in place through InitShards().
+template <class CacheShard>
+class ShardedCache : public ShardedCacheBase {
+ public:
+  using HashVal = typename CacheShard::HashVal;
+  using HashCref = typename CacheShard::HashCref;
+  using HandleImpl = typename CacheShard::HandleImpl;
+
+  // Allocates storage for GetNumShards() shards via
+  // port::cacheline_aligned_alloc. No CacheShard is constructed here, which is
+  // why destroy_shards_in_dtor_ starts out false.
+  ShardedCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit,
+               std::shared_ptr<MemoryAllocator> allocator)
+      : ShardedCacheBase(capacity, num_shard_bits, strict_capacity_limit,
+                         allocator),
+        shards_(reinterpret_cast<CacheShard*>(port::cacheline_aligned_alloc(
+            sizeof(CacheShard) * GetNumShards()))),
+        destroy_shards_in_dtor_(false) {}
+
+  // Destroys the shards only if InitShards() constructed them and
+  // DisownData() did not opt out; the raw storage is always released.
+  virtual ~ShardedCache() {
+    if (destroy_shards_in_dtor_) {
+      ForEachShard([](CacheShard* cs) { cs->~CacheShard(); });
+    }
+    port::cacheline_aligned_free(shards_);
+  }
+
+  // Selects the shard responsible for `hash`: the shard's sharding piece of
+  // the hash, masked with shard_mask_, is the shard index.
+  CacheShard& GetShard(HashCref hash) {
+    return shards_[CacheShard::HashPieceForSharding(hash) & shard_mask_];
+  }
+
+  const CacheShard& GetShard(HashCref hash) const {
+    return shards_[CacheShard::HashPieceForSharding(hash) & shard_mask_];
+  }
+
+  // Records the new total capacity and pushes the (rounded-up) per-shard
+  // share to every shard, all while holding config_mutex_.
+  void SetCapacity(size_t capacity) override {
+    MutexLock l(&config_mutex_);
+    capacity_ = capacity;
+    auto per_shard = ComputePerShardCapacity(capacity);
+    ForEachShard([=](CacheShard* cs) { cs->SetCapacity(per_shard); });
+  }
+
+  // Records the new flag and forwards it to every shard under config_mutex_.
+  void SetStrictCapacityLimit(bool s_c_l) override {
+    MutexLock l(&config_mutex_);
+    strict_capacity_limit_ = s_c_l;
+    ForEachShard(
+        [s_c_l](CacheShard* cs) { cs->SetStrictCapacityLimit(s_c_l); });
+  }
+
+  // Hashes `key` and forwards the insertion to the owning shard. The public
+  // Handle** is reinterpreted as the shard's HandleImpl**.
+  Status Insert(const Slice& key, void* value, size_t charge, DeleterFn deleter,
+                Handle** handle, Priority priority) override {
+    HashVal hash = CacheShard::ComputeHash(key);
+    auto h_out = reinterpret_cast<HandleImpl**>(handle);
+    return GetShard(hash).Insert(key, hash, value, charge, deleter, h_out,
+                                 priority);
+  }
+  // Helper-based insertion. Returns InvalidArgument when `helper` is null;
+  // otherwise forwards to the owning shard.
+  Status Insert(const Slice& key, void* value, const CacheItemHelper* helper,
+                size_t charge, Handle** handle = nullptr,
+                Priority priority = Priority::LOW) override {
+    if (!helper) {
+      return Status::InvalidArgument();
+    }
+    HashVal hash = CacheShard::ComputeHash(key);
+    auto h_out = reinterpret_cast<HandleImpl**>(handle);
+    return GetShard(hash).Insert(key, hash, value, helper, charge, h_out,
+                                 priority);
+  }
+
+  // Hashes `key` and looks it up in the owning shard. The statistics
+  // argument is unused by this overload.
+  Handle* Lookup(const Slice& key, Statistics* /*stats*/) override {
+    HashVal hash = CacheShard::ComputeHash(key);
+    HandleImpl* result = GetShard(hash).Lookup(key, hash);
+    return reinterpret_cast<Handle*>(result);
+  }
+  // Helper-based lookup; all arguments are forwarded to the owning shard.
+  Handle* Lookup(const Slice& key, const CacheItemHelper* helper,
+                 const CreateCallback& create_cb, Priority priority, bool wait,
+                 Statistics* stats = nullptr) override {
+    HashVal hash = CacheShard::ComputeHash(key);
+    HandleImpl* result = GetShard(hash).Lookup(key, hash, helper, create_cb,
+                                               priority, wait, stats);
+    return reinterpret_cast<Handle*>(result);
+  }
+
+  // Hashes `key` and asks the owning shard to erase it.
+  void Erase(const Slice& key) override {
+    HashVal hash = CacheShard::ComputeHash(key);
+    GetShard(hash).Erase(key, hash);
+  }
+
+  // The handle-based operations below locate the owning shard from the hash
+  // stored in the handle (HandleImpl::GetHash()) and forward to it.
+  bool Release(Handle* handle, bool useful,
+               bool erase_if_last_ref = false) override {
+    auto h = reinterpret_cast<HandleImpl*>(handle);
+    return GetShard(h->GetHash()).Release(h, useful, erase_if_last_ref);
+  }
+  bool IsReady(Handle* handle) override {
+    auto h = reinterpret_cast<HandleImpl*>(handle);
+    return GetShard(h->GetHash()).IsReady(h);
+  }
+  void Wait(Handle* handle) override {
+    auto h = reinterpret_cast<HandleImpl*>(handle);
+    GetShard(h->GetHash()).Wait(h);
+  }
+  bool Ref(Handle* handle) override {
+    auto h = reinterpret_cast<HandleImpl*>(handle);
+    return GetShard(h->GetHash()).Ref(h);
+  }
+  // Convenience overload: releases with useful == true.
+  bool Release(Handle* handle, bool erase_if_last_ref = false) override {
+    return Release(handle, true /*useful*/, erase_if_last_ref);
+  }
+  using ShardedCacheBase::GetUsage;
+  // The aggregate getters below sum the corresponding per-shard value over
+  // all shards.
+  size_t GetUsage() const override {
+    return SumOverShards2(&CacheShard::GetUsage);
+  }
+  size_t GetPinnedUsage() const override {
+    return SumOverShards2(&CacheShard::GetPinnedUsage);
+  }
+  size_t GetOccupancyCount() const override {
+    // Must sum the shards' occupancy counts, not their pinned usage.
+    return SumOverShards2(&CacheShard::GetOccupancyCount);
+  }
+  size_t GetTableAddressCount() const override {
+    return SumOverShards2(&CacheShard::GetTableAddressCount);
+  }
+  // Invokes `callback` for the entries of every shard. Each shard is visited
+  // in batches of roughly opts.average_entries_per_lock entries; a per-shard
+  // state value starts at 0 and is set to SIZE_MAX by the shard once it has
+  // been fully traversed.
+  void ApplyToAllEntries(
+      const std::function<void(const Slice& key, void* value, size_t charge,
+                               DeleterFn deleter)>& callback,
+      const ApplyToAllEntriesOptions& opts) override {
+    uint32_t num_shards = GetNumShards();
+    // Iterate over part of each shard, rotating between shards, to
+    // minimize impact on latency of concurrent operations.
+    std::unique_ptr<size_t[]> states(new size_t[num_shards]{});
+
+    // Honor the requested batch size but never pass 0 to the shards: a lower
+    // bound of 1 guarantees each call is asked to make progress.
+    size_t aepl = opts.average_entries_per_lock;
+    aepl = std::max(aepl, size_t{1});
+
+    bool remaining_work;
+    do {
+      remaining_work = false;
+      for (uint32_t i = 0; i < num_shards; i++) {
+        if (states[i] != SIZE_MAX) {
+          shards_[i].ApplyToSomeEntries(callback, aepl, &states[i]);
+          remaining_work |= states[i] != SIZE_MAX;
+        }
+      }
+    } while (remaining_work);
+  }
+
+  // Asks every shard to erase its unreferenced entries.
+  virtual void EraseUnRefEntries() override {
+    ForEachShard([](CacheShard* cs) { cs->EraseUnRefEntries(); });
+  }
+
+  // Makes the destructor skip shard destruction, unless heap allocations
+  // must be freed (kMustFreeHeapAllocations).
+  void DisownData() override {
+    // Leak data only if that won't generate an ASAN/valgrind warning.
+    if (!kMustFreeHeapAllocations) {
+      destroy_shards_in_dtor_ = false;
+    }
+  }
+
+ protected:
+  // Calls `fn` once per shard, in index order.
+  inline void ForEachShard(const std::function<void(CacheShard*)>& fn) {
+    uint32_t num_shards = GetNumShards();
+    for (uint32_t i = 0; i < num_shards; i++) {
+      fn(shards_ + i);
+    }
+  }
+
+  // Returns the sum of `fn(shard)` over all shards.
+  inline size_t SumOverShards(
+      const std::function<size_t(CacheShard&)>& fn) const {
+    uint32_t num_shards = GetNumShards();
+    size_t result = 0;
+    for (uint32_t i = 0; i < num_shards; i++) {
+      result += fn(shards_[i]);
+    }
+    return result;
+  }
+
+  // Same as SumOverShards(), for a const member function of CacheShard.
+  inline size_t SumOverShards2(size_t (CacheShard::*fn)() const) const {
+    return SumOverShards([fn](CacheShard& cs) { return (cs.*fn)(); });
+  }
+
+  // Must be called exactly once by derived class constructor
+  // `placement_new` is invoked with the address of each shard's raw storage;
+  // afterwards the destructor becomes responsible for destroying the shards.
+  void InitShards(const std::function<void(CacheShard*)>& placement_new) {
+    ForEachShard(placement_new);
+    destroy_shards_in_dtor_ = true;
+  }
+
+  // Delegates to the first shard to describe the shard-level options.
+  void AppendPrintableOptions(std::string& str) const override {
+    shards_[0].AppendPrintableOptions(str);
+  }
+
+ private:
+  // Raw array of GetNumShards() shards, allocated in the constructor.
+  CacheShard* const shards_;
+  // True once InitShards() has run; cleared again by DisownData().
+  bool destroy_shards_in_dtor_;
+};
+
+// 512KB is traditional minimum shard size.
+// Returns floor(log2(capacity / min_shard_size)) clamped to [0, 6], i.e. the
+// default number of shard bits for a cache of `capacity` bytes.
+// `min_shard_size` is used as a divisor and therefore must be non-zero.
+int GetDefaultCacheShardBits(size_t capacity,
+ size_t min_shard_size = 512U * 1024U);
+
+} // namespace ROCKSDB_NAMESPACE