| author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
|---|---|---|
| committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
| commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
| tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /src/rocksdb/cache | |
| parent | Initial commit. (diff) | |
| download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz, ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip | |
Adding upstream version 18.2.2. (upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/rocksdb/cache')
26 files changed, 12558 insertions, 0 deletions
diff --git a/src/rocksdb/cache/cache.cc b/src/rocksdb/cache/cache.cc new file mode 100644 index 000000000..7d23fb757 --- /dev/null +++ b/src/rocksdb/cache/cache.cc @@ -0,0 +1,129 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/cache.h" + +#include "cache/lru_cache.h" +#include "rocksdb/secondary_cache.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/options_type.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +#ifndef ROCKSDB_LITE +static std::unordered_map<std::string, OptionTypeInfo> + lru_cache_options_type_info = { + {"capacity", + {offsetof(struct LRUCacheOptions, capacity), OptionType::kSizeT, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"num_shard_bits", + {offsetof(struct LRUCacheOptions, num_shard_bits), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"strict_capacity_limit", + {offsetof(struct LRUCacheOptions, strict_capacity_limit), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"high_pri_pool_ratio", + {offsetof(struct LRUCacheOptions, high_pri_pool_ratio), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"low_pri_pool_ratio", + {offsetof(struct LRUCacheOptions, low_pri_pool_ratio), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, +}; + +static std::unordered_map<std::string, OptionTypeInfo> + comp_sec_cache_options_type_info = { + {"capacity", + {offsetof(struct CompressedSecondaryCacheOptions, capacity), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"num_shard_bits", + {offsetof(struct CompressedSecondaryCacheOptions, num_shard_bits), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"compression_type", + {offsetof(struct CompressedSecondaryCacheOptions, compression_type), + OptionType::kCompressionType, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"compress_format_version", + {offsetof(struct CompressedSecondaryCacheOptions, + compress_format_version), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"enable_custom_split_merge", + {offsetof(struct CompressedSecondaryCacheOptions, + enable_custom_split_merge), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, +}; +#endif // ROCKSDB_LITE + +Status SecondaryCache::CreateFromString( + const ConfigOptions& config_options, const std::string& value, + std::shared_ptr<SecondaryCache>* result) { + if (value.find("compressed_secondary_cache://") == 0) { + std::string args = value; + args.erase(0, std::strlen("compressed_secondary_cache://")); + Status status; + std::shared_ptr<SecondaryCache> sec_cache; + +#ifndef ROCKSDB_LITE + CompressedSecondaryCacheOptions sec_cache_opts; + status = OptionTypeInfo::ParseStruct(config_options, "", + &comp_sec_cache_options_type_info, "", + args, &sec_cache_opts); + if (status.ok()) { + sec_cache = 
NewCompressedSecondaryCache(sec_cache_opts); + } + +#else + (void)config_options; + status = Status::NotSupported( + "Cannot load compressed secondary cache in LITE mode ", args); +#endif //! ROCKSDB_LITE + + if (status.ok()) { + result->swap(sec_cache); + } + return status; + } else { + return LoadSharedObject<SecondaryCache>(config_options, value, nullptr, + result); + } +} + +Status Cache::CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr<Cache>* result) { + Status status; + std::shared_ptr<Cache> cache; + if (value.find('=') == std::string::npos) { + cache = NewLRUCache(ParseSizeT(value)); + } else { +#ifndef ROCKSDB_LITE + LRUCacheOptions cache_opts; + status = OptionTypeInfo::ParseStruct(config_options, "", + &lru_cache_options_type_info, "", + value, &cache_opts); + if (status.ok()) { + cache = NewLRUCache(cache_opts); + } +#else + (void)config_options; + status = Status::NotSupported("Cannot load cache in LITE mode ", value); +#endif //! ROCKSDB_LITE + } + if (status.ok()) { + result->swap(cache); + } + return status; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/cache/cache_bench.cc b/src/rocksdb/cache/cache_bench.cc new file mode 100644 index 000000000..f836939a3 --- /dev/null +++ b/src/rocksdb/cache/cache_bench.cc @@ -0,0 +1,20 @@ +// Copyright (c) 2013-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#ifndef GFLAGS +#include <cstdio> +int main() { + fprintf(stderr, "Please install gflags to run rocksdb tools\n"); + return 1; +} +#else +#include "rocksdb/cache_bench_tool.h" +int main(int argc, char** argv) { + return ROCKSDB_NAMESPACE::cache_bench_tool(argc, argv); +} +#endif // GFLAGS diff --git a/src/rocksdb/cache/cache_bench_tool.cc b/src/rocksdb/cache/cache_bench_tool.cc new file mode 100644 index 000000000..73360f414 --- /dev/null +++ b/src/rocksdb/cache/cache_bench_tool.cc @@ -0,0 +1,973 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
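The `Cache::CreateFromString` and `SecondaryCache::CreateFromString` factories added in `cache.cc` above accept either a bare size or a `name=value` option string. The following is a minimal usage sketch, assuming only the public RocksDB headers; the option strings themselves are illustrative and not taken from this diff:

```cpp
#include <cassert>
#include <memory>

#include "rocksdb/cache.h"
#include "rocksdb/convenience.h"  // ConfigOptions
#include "rocksdb/secondary_cache.h"

using namespace ROCKSDB_NAMESPACE;

int main() {
  ConfigOptions config_options;
  std::shared_ptr<Cache> cache;

  // Bare number: parsed with ParseSizeT and handed to NewLRUCache.
  Status s = Cache::CreateFromString(config_options, "1073741824", &cache);
  assert(s.ok());

  // name=value form: parsed through lru_cache_options_type_info into
  // LRUCacheOptions, then NewLRUCache(cache_opts).
  s = Cache::CreateFromString(
      config_options,
      "capacity=1073741824;num_shard_bits=6;high_pri_pool_ratio=0.5", &cache);
  assert(s.ok());

  // Secondary cache: the "compressed_secondary_cache://" prefix selects the
  // built-in compressed secondary cache (non-LITE builds only).
  std::shared_ptr<SecondaryCache> sec_cache;
  s = SecondaryCache::CreateFromString(
      config_options, "compressed_secondary_cache://capacity=268435456",
      &sec_cache);
  assert(s.ok());
  return 0;
}
```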
+ +#include "cache_key.h" +#ifdef GFLAGS +#include <cinttypes> +#include <cstddef> +#include <cstdio> +#include <limits> +#include <memory> +#include <set> +#include <sstream> + +#include "db/db_impl/db_impl.h" +#include "monitoring/histogram.h" +#include "port/port.h" +#include "rocksdb/cache.h" +#include "rocksdb/convenience.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/secondary_cache.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/table_properties.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/cachable_entry.h" +#include "util/coding.h" +#include "util/distributed_mutex.h" +#include "util/gflags_compat.h" +#include "util/hash.h" +#include "util/mutexlock.h" +#include "util/random.h" +#include "util/stop_watch.h" +#include "util/string_util.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; + +static constexpr uint32_t KiB = uint32_t{1} << 10; +static constexpr uint32_t MiB = KiB << 10; +static constexpr uint64_t GiB = MiB << 10; + +DEFINE_uint32(threads, 16, "Number of concurrent threads to run."); +DEFINE_uint64(cache_size, 1 * GiB, + "Number of bytes to use as a cache of uncompressed data."); +DEFINE_uint32(num_shard_bits, 6, "shard_bits."); + +DEFINE_double(resident_ratio, 0.25, + "Ratio of keys fitting in cache to keyspace."); +DEFINE_uint64(ops_per_thread, 2000000U, "Number of operations per thread."); +DEFINE_uint32(value_bytes, 8 * KiB, "Size of each value added."); + +DEFINE_uint32(skew, 5, "Degree of skew in key selection"); +DEFINE_bool(populate_cache, true, "Populate cache before operations"); + +DEFINE_uint32(lookup_insert_percent, 87, + "Ratio of lookup (+ insert on not found) to total workload " + "(expressed as a percentage)"); +DEFINE_uint32(insert_percent, 2, + "Ratio of insert to total workload (expressed as a percentage)"); +DEFINE_uint32(lookup_percent, 10, + "Ratio of lookup to total workload (expressed as a percentage)"); +DEFINE_uint32(erase_percent, 1, + "Ratio of erase to total workload (expressed as a percentage)"); +DEFINE_bool(gather_stats, false, + "Whether to periodically simulate gathering block cache stats, " + "using one more thread."); +DEFINE_uint32( + gather_stats_sleep_ms, 1000, + "How many milliseconds to sleep between each gathering of stats."); + +DEFINE_uint32(gather_stats_entries_per_lock, 256, + "For Cache::ApplyToAllEntries"); +DEFINE_bool(skewed, false, "If true, skew the key access distribution"); + +DEFINE_bool(lean, false, + "If true, no additional computation is performed besides cache " + "operations."); + +#ifndef ROCKSDB_LITE +DEFINE_string(secondary_cache_uri, "", + "Full URI for creating a custom secondary cache object"); +static class std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary_cache; +#endif // ROCKSDB_LITE + +DEFINE_string(cache_type, "lru_cache", "Type of block cache."); + +// ## BEGIN stress_cache_key sub-tool options ## +// See class StressCacheKey below. +DEFINE_bool(stress_cache_key, false, + "If true, run cache key stress test instead"); +DEFINE_uint32( + sck_files_per_day, 2500000, + "(-stress_cache_key) Simulated files generated per simulated day"); +// NOTE: Giving each run a specified lifetime, rather than e.g. "until +// first collision" ensures equal skew from start-up, when collisions are +// less likely. +DEFINE_uint32(sck_days_per_run, 90, + "(-stress_cache_key) Number of days to simulate in each run"); +// NOTE: The number of observed collisions directly affects the relative +// accuracy of the predicted probabilities. 
15 observations should be well +// within factor-of-2 accuracy. +DEFINE_uint32( + sck_min_collision, 15, + "(-stress_cache_key) Keep running until this many collisions seen"); +// sck_file_size_mb can be thought of as average file size. The simulation is +// not precise enough to care about the distribution of file sizes; other +// simulations (https://github.com/pdillinger/unique_id/tree/main/monte_carlo) +// indicate the distribution only makes a small difference (e.g. < 2x factor) +DEFINE_uint32( + sck_file_size_mb, 32, + "(-stress_cache_key) Simulated file size in MiB, for accounting purposes"); +DEFINE_uint32(sck_reopen_nfiles, 100, + "(-stress_cache_key) Simulate DB re-open average every n files"); +DEFINE_uint32(sck_newdb_nreopen, 1000, + "(-stress_cache_key) Simulate new DB average every n re-opens"); +DEFINE_uint32(sck_restarts_per_day, 24, + "(-stress_cache_key) Average simulated process restarts per day " + "(across DBs)"); +DEFINE_uint32( + sck_db_count, 100, + "(-stress_cache_key) Parallel DBs in simulation sharing a block cache"); +DEFINE_uint32( + sck_table_bits, 20, + "(-stress_cache_key) Log2 number of tracked (live) files (across DBs)"); +// sck_keep_bits being well below full 128 bits amplifies the collision +// probability so that the true probability can be estimated through observed +// collisions. (More explanation below.) +DEFINE_uint32( + sck_keep_bits, 50, + "(-stress_cache_key) Number of bits to keep from each cache key (<= 64)"); +// sck_randomize is used to validate whether cache key is performing "better +// than random." Even with this setting, file offsets are not randomized. +DEFINE_bool(sck_randomize, false, + "(-stress_cache_key) Randomize (hash) cache key"); +// See https://github.com/facebook/rocksdb/pull/9058 +DEFINE_bool(sck_footer_unique_id, false, + "(-stress_cache_key) Simulate using proposed footer unique id"); +// ## END stress_cache_key sub-tool options ## + +namespace ROCKSDB_NAMESPACE { + +class CacheBench; +namespace { +// State shared by all concurrent executions of the same benchmark. +class SharedState { + public: + explicit SharedState(CacheBench* cache_bench) + : cv_(&mu_), + num_initialized_(0), + start_(false), + num_done_(0), + cache_bench_(cache_bench) {} + + ~SharedState() {} + + port::Mutex* GetMutex() { return &mu_; } + + port::CondVar* GetCondVar() { return &cv_; } + + CacheBench* GetCacheBench() const { return cache_bench_; } + + void IncInitialized() { num_initialized_++; } + + void IncDone() { num_done_++; } + + bool AllInitialized() const { return num_initialized_ >= FLAGS_threads; } + + bool AllDone() const { return num_done_ >= FLAGS_threads; } + + void SetStart() { start_ = true; } + + bool Started() const { return start_; } + + private: + port::Mutex mu_; + port::CondVar cv_; + + uint64_t num_initialized_; + bool start_; + uint64_t num_done_; + + CacheBench* cache_bench_; +}; + +// Per-thread state for concurrent executions of the same benchmark. 
+struct ThreadState { + uint32_t tid; + Random64 rnd; + SharedState* shared; + HistogramImpl latency_ns_hist; + uint64_t duration_us = 0; + + ThreadState(uint32_t index, SharedState* _shared) + : tid(index), rnd(1000 + index), shared(_shared) {} +}; + +struct KeyGen { + char key_data[27]; + + Slice GetRand(Random64& rnd, uint64_t max_key, int max_log) { + uint64_t key = 0; + if (!FLAGS_skewed) { + uint64_t raw = rnd.Next(); + // Skew according to setting + for (uint32_t i = 0; i < FLAGS_skew; ++i) { + raw = std::min(raw, rnd.Next()); + } + key = FastRange64(raw, max_key); + } else { + key = rnd.Skewed(max_log); + if (key > max_key) { + key -= max_key; + } + } + // Variable size and alignment + size_t off = key % 8; + key_data[0] = char{42}; + EncodeFixed64(key_data + 1, key); + key_data[9] = char{11}; + EncodeFixed64(key_data + 10, key); + key_data[18] = char{4}; + EncodeFixed64(key_data + 19, key); + assert(27 >= kCacheKeySize); + return Slice(&key_data[off], kCacheKeySize); + } +}; + +char* createValue(Random64& rnd) { + char* rv = new char[FLAGS_value_bytes]; + // Fill with some filler data, and take some CPU time + for (uint32_t i = 0; i < FLAGS_value_bytes; i += 8) { + EncodeFixed64(rv + i, rnd.Next()); + } + return rv; +} + +// Callbacks for secondary cache +size_t SizeFn(void* /*obj*/) { return FLAGS_value_bytes; } + +Status SaveToFn(void* obj, size_t /*offset*/, size_t size, void* out) { + memcpy(out, obj, size); + return Status::OK(); +} + +// Different deleters to simulate using deleter to gather +// stats on the code origin and kind of cache entries. +void deleter1(const Slice& /*key*/, void* value) { + delete[] static_cast<char*>(value); +} +void deleter2(const Slice& /*key*/, void* value) { + delete[] static_cast<char*>(value); +} +void deleter3(const Slice& /*key*/, void* value) { + delete[] static_cast<char*>(value); +} + +Cache::CacheItemHelper helper1(SizeFn, SaveToFn, deleter1); +Cache::CacheItemHelper helper2(SizeFn, SaveToFn, deleter2); +Cache::CacheItemHelper helper3(SizeFn, SaveToFn, deleter3); +} // namespace + +class CacheBench { + static constexpr uint64_t kHundredthUint64 = + std::numeric_limits<uint64_t>::max() / 100U; + + public: + CacheBench() + : max_key_(static_cast<uint64_t>(FLAGS_cache_size / FLAGS_resident_ratio / + FLAGS_value_bytes)), + lookup_insert_threshold_(kHundredthUint64 * + FLAGS_lookup_insert_percent), + insert_threshold_(lookup_insert_threshold_ + + kHundredthUint64 * FLAGS_insert_percent), + lookup_threshold_(insert_threshold_ + + kHundredthUint64 * FLAGS_lookup_percent), + erase_threshold_(lookup_threshold_ + + kHundredthUint64 * FLAGS_erase_percent), + skewed_(FLAGS_skewed) { + if (erase_threshold_ != 100U * kHundredthUint64) { + fprintf(stderr, "Percentages must add to 100.\n"); + exit(1); + } + + max_log_ = 0; + if (skewed_) { + uint64_t max_key = max_key_; + while (max_key >>= 1) max_log_++; + if (max_key > (static_cast<uint64_t>(1) << max_log_)) max_log_++; + } + + if (FLAGS_cache_type == "clock_cache") { + fprintf(stderr, "Old clock cache implementation has been removed.\n"); + exit(1); + } else if (FLAGS_cache_type == "hyper_clock_cache") { + cache_ = HyperClockCacheOptions(FLAGS_cache_size, FLAGS_value_bytes, + FLAGS_num_shard_bits) + .MakeSharedCache(); + } else if (FLAGS_cache_type == "lru_cache") { + LRUCacheOptions opts(FLAGS_cache_size, FLAGS_num_shard_bits, + false /* strict_capacity_limit */, + 0.5 /* high_pri_pool_ratio */); +#ifndef ROCKSDB_LITE + if (!FLAGS_secondary_cache_uri.empty()) { + Status s = 
SecondaryCache::CreateFromString( + ConfigOptions(), FLAGS_secondary_cache_uri, &secondary_cache); + if (secondary_cache == nullptr) { + fprintf( + stderr, + "No secondary cache registered matching string: %s status=%s\n", + FLAGS_secondary_cache_uri.c_str(), s.ToString().c_str()); + exit(1); + } + opts.secondary_cache = secondary_cache; + } +#endif // ROCKSDB_LITE + + cache_ = NewLRUCache(opts); + } else { + fprintf(stderr, "Cache type not supported."); + exit(1); + } + } + + ~CacheBench() {} + + void PopulateCache() { + Random64 rnd(1); + KeyGen keygen; + for (uint64_t i = 0; i < 2 * FLAGS_cache_size; i += FLAGS_value_bytes) { + Status s = cache_->Insert(keygen.GetRand(rnd, max_key_, max_log_), + createValue(rnd), &helper1, FLAGS_value_bytes); + assert(s.ok()); + } + } + + bool Run() { + const auto clock = SystemClock::Default().get(); + + PrintEnv(); + SharedState shared(this); + std::vector<std::unique_ptr<ThreadState> > threads(FLAGS_threads); + for (uint32_t i = 0; i < FLAGS_threads; i++) { + threads[i].reset(new ThreadState(i, &shared)); + std::thread(ThreadBody, threads[i].get()).detach(); + } + + HistogramImpl stats_hist; + std::string stats_report; + std::thread stats_thread(StatsBody, &shared, &stats_hist, &stats_report); + + uint64_t start_time; + { + MutexLock l(shared.GetMutex()); + while (!shared.AllInitialized()) { + shared.GetCondVar()->Wait(); + } + // Record start time + start_time = clock->NowMicros(); + + // Start all threads + shared.SetStart(); + shared.GetCondVar()->SignalAll(); + + // Wait threads to complete + while (!shared.AllDone()) { + shared.GetCondVar()->Wait(); + } + } + + // Stats gathering is considered background work. This time measurement + // is for foreground work, and not really ideal for that. See below. + uint64_t end_time = clock->NowMicros(); + stats_thread.join(); + + // Wall clock time - includes idle time if threads + // finish at different times (not ideal). + double elapsed_secs = static_cast<double>(end_time - start_time) * 1e-6; + uint32_t ops_per_sec = static_cast<uint32_t>( + 1.0 * FLAGS_threads * FLAGS_ops_per_thread / elapsed_secs); + printf("Complete in %.3f s; Rough parallel ops/sec = %u\n", elapsed_secs, + ops_per_sec); + + // Total time in each thread (more accurate throughput measure) + elapsed_secs = 0; + for (uint32_t i = 0; i < FLAGS_threads; i++) { + elapsed_secs += threads[i]->duration_us * 1e-6; + } + ops_per_sec = static_cast<uint32_t>(1.0 * FLAGS_threads * + FLAGS_ops_per_thread / elapsed_secs); + printf("Thread ops/sec = %u\n", ops_per_sec); + + printf("\nOperation latency (ns):\n"); + HistogramImpl combined; + for (uint32_t i = 0; i < FLAGS_threads; i++) { + combined.Merge(threads[i]->latency_ns_hist); + } + printf("%s", combined.ToString().c_str()); + + if (FLAGS_gather_stats) { + printf("\nGather stats latency (us):\n"); + printf("%s", stats_hist.ToString().c_str()); + } + + printf("\n%s", stats_report.c_str()); + + return true; + } + + private: + std::shared_ptr<Cache> cache_; + const uint64_t max_key_; + // Cumulative thresholds in the space of a random uint64_t + const uint64_t lookup_insert_threshold_; + const uint64_t insert_threshold_; + const uint64_t lookup_threshold_; + const uint64_t erase_threshold_; + const bool skewed_; + int max_log_; + + // A benchmark version of gathering stats on an active block cache by + // iterating over it. The primary purpose is to measure the impact of + // gathering stats with ApplyToAllEntries on throughput- and + // latency-sensitive Cache users. 
Performance of stats gathering is + // also reported. The last set of gathered stats is also reported, for + // manual sanity checking for logical errors or other unexpected + // behavior of cache_bench or the underlying Cache. + static void StatsBody(SharedState* shared, HistogramImpl* stats_hist, + std::string* stats_report) { + if (!FLAGS_gather_stats) { + return; + } + const auto clock = SystemClock::Default().get(); + uint64_t total_key_size = 0; + uint64_t total_charge = 0; + uint64_t total_entry_count = 0; + uint64_t table_occupancy = 0; + uint64_t table_size = 0; + std::set<Cache::DeleterFn> deleters; + StopWatchNano timer(clock); + + for (;;) { + uint64_t time; + time = clock->NowMicros(); + uint64_t deadline = time + uint64_t{FLAGS_gather_stats_sleep_ms} * 1000; + + { + MutexLock l(shared->GetMutex()); + for (;;) { + if (shared->AllDone()) { + std::ostringstream ostr; + ostr << "Most recent cache entry stats:\n" + << "Number of entries: " << total_entry_count << "\n" + << "Table occupancy: " << table_occupancy << " / " + << table_size << " = " + << (100.0 * table_occupancy / table_size) << "%\n" + << "Total charge: " << BytesToHumanString(total_charge) << "\n" + << "Average key size: " + << (1.0 * total_key_size / total_entry_count) << "\n" + << "Average charge: " + << BytesToHumanString(static_cast<uint64_t>( + 1.0 * total_charge / total_entry_count)) + << "\n" + << "Unique deleters: " << deleters.size() << "\n"; + *stats_report = ostr.str(); + return; + } + if (clock->NowMicros() >= deadline) { + break; + } + uint64_t diff = deadline - std::min(clock->NowMicros(), deadline); + shared->GetCondVar()->TimedWait(diff + 1); + } + } + + // Now gather stats, outside of mutex + total_key_size = 0; + total_charge = 0; + total_entry_count = 0; + deleters.clear(); + auto fn = [&](const Slice& key, void* /*value*/, size_t charge, + Cache::DeleterFn deleter) { + total_key_size += key.size(); + total_charge += charge; + ++total_entry_count; + // Something slightly more expensive as in (future) stats by category + deleters.insert(deleter); + }; + timer.Start(); + Cache::ApplyToAllEntriesOptions opts; + opts.average_entries_per_lock = FLAGS_gather_stats_entries_per_lock; + shared->GetCacheBench()->cache_->ApplyToAllEntries(fn, opts); + table_occupancy = shared->GetCacheBench()->cache_->GetOccupancyCount(); + table_size = shared->GetCacheBench()->cache_->GetTableAddressCount(); + stats_hist->Add(timer.ElapsedNanos() / 1000); + } + } + + static void ThreadBody(ThreadState* thread) { + SharedState* shared = thread->shared; + + { + MutexLock l(shared->GetMutex()); + shared->IncInitialized(); + if (shared->AllInitialized()) { + shared->GetCondVar()->SignalAll(); + } + while (!shared->Started()) { + shared->GetCondVar()->Wait(); + } + } + thread->shared->GetCacheBench()->OperateCache(thread); + + { + MutexLock l(shared->GetMutex()); + shared->IncDone(); + if (shared->AllDone()) { + shared->GetCondVar()->SignalAll(); + } + } + } + + void OperateCache(ThreadState* thread) { + // To use looked-up values + uint64_t result = 0; + // To hold handles for a non-trivial amount of time + Cache::Handle* handle = nullptr; + KeyGen gen; + const auto clock = SystemClock::Default().get(); + uint64_t start_time = clock->NowMicros(); + StopWatchNano timer(clock); + + for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) { + Slice key = gen.GetRand(thread->rnd, max_key_, max_log_); + uint64_t random_op = thread->rnd.Next(); + Cache::CreateCallback create_cb = [](const void* buf, size_t size, + void** out_obj, + size_t* 
charge) -> Status { + *out_obj = reinterpret_cast<void*>(new char[size]); + memcpy(*out_obj, buf, size); + *charge = size; + return Status::OK(); + }; + + timer.Start(); + + if (random_op < lookup_insert_threshold_) { + if (handle) { + cache_->Release(handle); + handle = nullptr; + } + // do lookup + handle = cache_->Lookup(key, &helper2, create_cb, Cache::Priority::LOW, + true); + if (handle) { + if (!FLAGS_lean) { + // do something with the data + result += NPHash64(static_cast<char*>(cache_->Value(handle)), + FLAGS_value_bytes); + } + } else { + // do insert + Status s = cache_->Insert(key, createValue(thread->rnd), &helper2, + FLAGS_value_bytes, &handle); + assert(s.ok()); + } + } else if (random_op < insert_threshold_) { + if (handle) { + cache_->Release(handle); + handle = nullptr; + } + // do insert + Status s = cache_->Insert(key, createValue(thread->rnd), &helper3, + FLAGS_value_bytes, &handle); + assert(s.ok()); + } else if (random_op < lookup_threshold_) { + if (handle) { + cache_->Release(handle); + handle = nullptr; + } + // do lookup + handle = cache_->Lookup(key, &helper2, create_cb, Cache::Priority::LOW, + true); + if (handle) { + if (!FLAGS_lean) { + // do something with the data + result += NPHash64(static_cast<char*>(cache_->Value(handle)), + FLAGS_value_bytes); + } + } + } else if (random_op < erase_threshold_) { + // do erase + cache_->Erase(key); + } else { + // Should be extremely unlikely (noop) + assert(random_op >= kHundredthUint64 * 100U); + } + thread->latency_ns_hist.Add(timer.ElapsedNanos()); + } + if (handle) { + cache_->Release(handle); + handle = nullptr; + } + // Ensure computations on `result` are not optimized away. + if (result == 1) { + printf("You are extremely unlucky(2). Try again.\n"); + exit(1); + } + thread->duration_us = clock->NowMicros() - start_time; + } + + void PrintEnv() const { +#if defined(__GNUC__) && !defined(__OPTIMIZE__) + printf( + "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n"); +#endif +#ifndef NDEBUG + printf("WARNING: Assertions are enabled; benchmarks unnecessarily slow\n"); +#endif + printf("RocksDB version : %d.%d\n", kMajorVersion, kMinorVersion); + printf("DMutex impl name : %s\n", DMutex::kName()); + printf("Number of threads : %u\n", FLAGS_threads); + printf("Ops per thread : %" PRIu64 "\n", FLAGS_ops_per_thread); + printf("Cache size : %s\n", + BytesToHumanString(FLAGS_cache_size).c_str()); + printf("Num shard bits : %u\n", FLAGS_num_shard_bits); + printf("Max key : %" PRIu64 "\n", max_key_); + printf("Resident ratio : %g\n", FLAGS_resident_ratio); + printf("Skew degree : %u\n", FLAGS_skew); + printf("Populate cache : %d\n", int{FLAGS_populate_cache}); + printf("Lookup+Insert pct : %u%%\n", FLAGS_lookup_insert_percent); + printf("Insert percentage : %u%%\n", FLAGS_insert_percent); + printf("Lookup percentage : %u%%\n", FLAGS_lookup_percent); + printf("Erase percentage : %u%%\n", FLAGS_erase_percent); + std::ostringstream stats; + if (FLAGS_gather_stats) { + stats << "enabled (" << FLAGS_gather_stats_sleep_ms << "ms, " + << FLAGS_gather_stats_entries_per_lock << "/lock)"; + } else { + stats << "disabled"; + } + printf("Gather stats : %s\n", stats.str().c_str()); + printf("----------------------------\n"); + } +}; + +// cache_bench -stress_cache_key is an independent embedded tool for +// estimating the probability of CacheKey collisions through simulation. 
+// At a high level, it simulates generating SST files over many months, +// keeping them in the DB and/or cache for some lifetime while staying +// under resource caps, and checking for any cache key collisions that +// arise among the set of live files. For efficient simulation, we make +// some simplifying "pessimistic" assumptions (that only increase the +// chance of the simulation reporting a collision relative to the chance +// of collision in practice): +// * Every generated file has a cache entry for every byte offset in the +// file (contiguous range of cache keys) +// * All of every file is cached for its entire lifetime. (Here "lifetime" +// is technically the union of DB and Cache lifetime, though we only +// model a generous DB lifetime, where space usage is always maximized. +// In a effective Cache, lifetime in cache can only substantially exceed +// lifetime in DB if there is little cache activity; cache activity is +// required to hit cache key collisions.) +// +// It would be possible to track an exact set of cache key ranges for the +// set of live files, but we would have no hope of observing collisions +// (overlap in live files) in our simulation. We need to employ some way +// of amplifying collision probability that allows us to predict the real +// collision probability by extrapolation from observed collisions. Our +// basic approach is to reduce each cache key range down to some smaller +// number of bits, and limiting to bits that are shared over the whole +// range. Now we can observe collisions using a set of smaller stripped-down +// (reduced) cache keys. Let's do some case analysis to understand why this +// works: +// * No collision in reduced key - because the reduction is a pure function +// this implies no collision in the full keys +// * Collision detected between two reduced keys - either +// * The reduction has dropped some structured uniqueness info (from one of +// session counter or file number; file offsets are never materialized here). +// This can only artificially inflate the observed and extrapolated collision +// probabilities. We only have to worry about this in designing the reduction. +// * The reduction has preserved all the structured uniqueness in the cache +// key, which means either +// * REJECTED: We have a uniqueness bug in generating cache keys, where +// structured uniqueness info should have been different but isn't. In such a +// case, increasing by 1 the number of bits kept after reduction would not +// reduce observed probabilities by half. (In our observations, the +// probabilities are reduced approximately by half.) +// * ACCEPTED: The lost unstructured uniqueness in the key determines the +// probability that an observed collision would imply an overlap in ranges. +// In short, dropping n bits from key would increase collision probability by +// 2**n, assuming those n bits have full entropy in unstructured uniqueness. +// +// But we also have to account for the key ranges based on file size. If file +// sizes are roughly 2**b offsets, using XOR in 128-bit cache keys for +// "ranges", we know from other simulations (see +// https://github.com/pdillinger/unique_id/) that that's roughly equivalent to +// (less than 2x higher collision probability) using a cache key of size +// 128 - b bits for the whole file. (This is the only place we make an +// "optimistic" assumption, which is more than offset by the real +// implementation stripping off 2 lower bits from block byte offsets for cache +// keys. 
The simulation assumes byte offsets, which is net pessimistic.) +// +// So to accept the extrapolation as valid, we need to be confident that all +// "lost" bits, excluding those covered by file offset, are full entropy. +// Recall that we have assumed (verifiably, safely) that other structured data +// (file number and session counter) are kept, not lost. Based on the +// implementation comments for OffsetableCacheKey, the only potential hole here +// is that we only have ~103 bits of entropy in "all new" session IDs, and in +// extreme cases, there might be only 1 DB ID. However, because the upper ~39 +// bits of session ID are hashed, the combination of file number and file +// offset only has to add to 25 bits (or more) to ensure full entropy in +// unstructured uniqueness lost in the reduction. Typical file size of 32MB +// suffices (at least for simulation purposes where we assume each file offset +// occupies a cache key). +// +// Example results in comments on OffsetableCacheKey. +class StressCacheKey { + public: + void Run() { + if (FLAGS_sck_footer_unique_id) { + // Proposed footer unique IDs are DB-independent and session-independent + // (but process-dependent) which is most easily simulated here by + // assuming 1 DB and (later below) no session resets without process + // reset. + FLAGS_sck_db_count = 1; + } + + // Describe the simulated workload + uint64_t mb_per_day = + uint64_t{FLAGS_sck_files_per_day} * FLAGS_sck_file_size_mb; + printf("Total cache or DBs size: %gTiB Writing %g MiB/s or %gTiB/day\n", + FLAGS_sck_file_size_mb / 1024.0 / 1024.0 * + std::pow(2.0, FLAGS_sck_table_bits), + mb_per_day / 86400.0, mb_per_day / 1024.0 / 1024.0); + // For extrapolating probability of any collisions from a number of + // observed collisions + multiplier_ = std::pow(2.0, 128 - FLAGS_sck_keep_bits) / + (FLAGS_sck_file_size_mb * 1024.0 * 1024.0); + printf( + "Multiply by %g to correct for simulation losses (but still assume " + "whole file cached)\n", + multiplier_); + restart_nfiles_ = FLAGS_sck_files_per_day / FLAGS_sck_restarts_per_day; + double without_ejection = + std::pow(1.414214, FLAGS_sck_keep_bits) / FLAGS_sck_files_per_day; + // This should be a lower bound for -sck_randomize, usually a terribly + // rough lower bound. + // If observation is worse than this, then something has gone wrong. + printf( + "Without ejection, expect random collision after %g days (%g " + "corrected)\n", + without_ejection, without_ejection * multiplier_); + double with_full_table = + std::pow(2.0, FLAGS_sck_keep_bits - FLAGS_sck_table_bits) / + FLAGS_sck_files_per_day; + // This is an alternate lower bound for -sck_randomize, usually pretty + // accurate. Our cache keys should usually perform "better than random" + // but always no worse. (If observation is substantially worse than this, + // then something has gone wrong.) + printf( + "With ejection and full table, expect random collision after %g " + "days (%g corrected)\n", + with_full_table, with_full_table * multiplier_); + collisions_ = 0; + + // Run until sufficient number of observed collisions. 
+ for (int i = 1; collisions_ < FLAGS_sck_min_collision; i++) { + RunOnce(); + if (collisions_ == 0) { + printf( + "No collisions after %d x %u days " + " \n", + i, FLAGS_sck_days_per_run); + } else { + double est = 1.0 * i * FLAGS_sck_days_per_run / collisions_; + printf("%" PRIu64 + " collisions after %d x %u days, est %g days between (%g " + "corrected) \n", + collisions_, i, FLAGS_sck_days_per_run, est, est * multiplier_); + } + } + } + + void RunOnce() { + // Re-initialized simulated state + const size_t db_count = std::max(size_t{FLAGS_sck_db_count}, size_t{1}); + dbs_.reset(new TableProperties[db_count]{}); + const size_t table_mask = (size_t{1} << FLAGS_sck_table_bits) - 1; + table_.reset(new uint64_t[table_mask + 1]{}); + if (FLAGS_sck_keep_bits > 64) { + FLAGS_sck_keep_bits = 64; + } + + // Details of which bits are dropped in reduction + uint32_t shift_away = 64 - FLAGS_sck_keep_bits; + // Shift away fewer potential file number bits (b) than potential + // session counter bits (a). + uint32_t shift_away_b = shift_away / 3; + uint32_t shift_away_a = shift_away - shift_away_b; + + process_count_ = 0; + session_count_ = 0; + newdb_count_ = 0; + ResetProcess(/*newdbs*/ true); + + Random64 r{std::random_device{}()}; + + uint64_t max_file_count = + uint64_t{FLAGS_sck_files_per_day} * FLAGS_sck_days_per_run; + uint32_t report_count = 0; + uint32_t collisions_this_run = 0; + size_t db_i = 0; + + for (uint64_t file_count = 1; file_count <= max_file_count; + ++file_count, ++db_i) { + // Round-robin through DBs (this faster than %) + if (db_i >= db_count) { + db_i = 0; + } + // Any other periodic actions before simulating next file + if (!FLAGS_sck_footer_unique_id && r.OneIn(FLAGS_sck_reopen_nfiles)) { + ResetSession(db_i, /*newdb*/ r.OneIn(FLAGS_sck_newdb_nreopen)); + } else if (r.OneIn(restart_nfiles_)) { + ResetProcess(/*newdbs*/ false); + } + // Simulate next file + OffsetableCacheKey ock; + dbs_[db_i].orig_file_number += 1; + // skip some file numbers for other file kinds, except in footer unique + // ID, orig_file_number here tracks process-wide generated SST file + // count. + if (!FLAGS_sck_footer_unique_id) { + dbs_[db_i].orig_file_number += (r.Next() & 3); + } + bool is_stable; + BlockBasedTable::SetupBaseCacheKey(&dbs_[db_i], /* ignored */ "", + /* ignored */ 42, &ock, &is_stable); + assert(is_stable); + // Get a representative cache key, which later we analytically generalize + // to a range. + CacheKey ck = ock.WithOffset(0); + uint64_t reduced_key; + if (FLAGS_sck_randomize) { + reduced_key = GetSliceHash64(ck.AsSlice()) >> shift_away; + } else if (FLAGS_sck_footer_unique_id) { + // Special case: keep only file number, not session counter + reduced_key = DecodeFixed64(ck.AsSlice().data()) >> shift_away; + } else { + // Try to keep file number and session counter (shift away other bits) + uint32_t a = DecodeFixed32(ck.AsSlice().data()) << shift_away_a; + uint32_t b = DecodeFixed32(ck.AsSlice().data() + 4) >> shift_away_b; + reduced_key = (uint64_t{a} << 32) + b; + } + if (reduced_key == 0) { + // Unlikely, but we need to exclude tracking this value because we + // use it to mean "empty" in table. This case is OK as long as we + // don't hit it often. + printf("Hit Zero! \n"); + file_count--; + continue; + } + uint64_t h = + NPHash64(reinterpret_cast<char*>(&reduced_key), sizeof(reduced_key)); + // Skew expected lifetimes, for high variance (super-Poisson) variance + // in actual lifetimes. 
+ size_t pos = + std::min(Lower32of64(h) & table_mask, Upper32of64(h) & table_mask); + if (table_[pos] == reduced_key) { + collisions_this_run++; + // Our goal is to predict probability of no collisions, not expected + // number of collisions. To make the distinction, we have to get rid + // of observing correlated collisions, which this takes care of: + ResetProcess(/*newdbs*/ false); + } else { + // Replace (end of lifetime for file that was in this slot) + table_[pos] = reduced_key; + } + + if (++report_count == FLAGS_sck_files_per_day) { + report_count = 0; + // Estimate fill % + size_t incr = table_mask / 1000; + size_t sampled_count = 0; + for (size_t i = 0; i <= table_mask; i += incr) { + if (table_[i] != 0) { + sampled_count++; + } + } + // Report + printf( + "%" PRIu64 " days, %" PRIu64 " proc, %" PRIu64 " sess, %" PRIu64 + " newdb, %u coll, occ %g%%, ejected %g%% \r", + file_count / FLAGS_sck_files_per_day, process_count_, + session_count_, newdb_count_ - FLAGS_sck_db_count, + collisions_this_run, 100.0 * sampled_count / 1000.0, + 100.0 * (1.0 - sampled_count / 1000.0 * table_mask / file_count)); + fflush(stdout); + } + } + collisions_ += collisions_this_run; + } + + void ResetSession(size_t i, bool newdb) { + dbs_[i].db_session_id = DBImpl::GenerateDbSessionId(nullptr); + if (newdb) { + ++newdb_count_; + if (FLAGS_sck_footer_unique_id) { + // Simulate how footer id would behave + dbs_[i].db_id = "none"; + } else { + // db_id might be ignored, depending on the implementation details + dbs_[i].db_id = std::to_string(newdb_count_); + dbs_[i].orig_file_number = 0; + } + } + session_count_++; + } + + void ResetProcess(bool newdbs) { + process_count_++; + DBImpl::TEST_ResetDbSessionIdGen(); + for (size_t i = 0; i < FLAGS_sck_db_count; ++i) { + ResetSession(i, newdbs); + } + if (FLAGS_sck_footer_unique_id) { + // For footer unique ID, this tracks process-wide generated SST file + // count. + dbs_[0].orig_file_number = 0; + } + } + + private: + // Use db_session_id and orig_file_number from TableProperties + std::unique_ptr<TableProperties[]> dbs_; + std::unique_ptr<uint64_t[]> table_; + uint64_t process_count_ = 0; + uint64_t session_count_ = 0; + uint64_t newdb_count_ = 0; + uint64_t collisions_ = 0; + uint32_t restart_nfiles_ = 0; + double multiplier_ = 0.0; +}; + +int cache_bench_tool(int argc, char** argv) { + ParseCommandLineFlags(&argc, &argv, true); + + if (FLAGS_stress_cache_key) { + // Alternate tool + StressCacheKey().Run(); + return 0; + } + + if (FLAGS_threads <= 0) { + fprintf(stderr, "threads number <= 0\n"); + exit(1); + } + + ROCKSDB_NAMESPACE::CacheBench bench; + if (FLAGS_populate_cache) { + bench.PopulateCache(); + printf("Population complete\n"); + printf("----------------------------\n"); + } + if (bench.Run()) { + return 0; + } else { + return 1; + } +} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE + +#endif // GFLAGS diff --git a/src/rocksdb/cache/cache_entry_roles.cc b/src/rocksdb/cache/cache_entry_roles.cc new file mode 100644 index 000000000..b27349554 --- /dev/null +++ b/src/rocksdb/cache/cache_entry_roles.cc @@ -0,0 +1,134 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
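The collision-probability arithmetic in `StressCacheKey::Run()` above can be checked by hand. The small standalone sketch below reproduces the three printed estimates using only the default flag values and the formulas shown above; nothing in it is part of the upstream code:

```cpp
#include <cmath>
#include <cstdio>

int main() {
  // Default flag values from the stress_cache_key options above.
  const double keep_bits = 50;           // -sck_keep_bits
  const double table_bits = 20;          // -sck_table_bits
  const double file_size_mb = 32;        // -sck_file_size_mb
  const double files_per_day = 2500000;  // -sck_files_per_day

  // Correction for the 128 - keep_bits key bits dropped by the reduction,
  // normalized by treating every byte offset of a file as a cache key.
  const double multiplier =
      std::pow(2.0, 128 - keep_bits) / (file_size_mb * 1024.0 * 1024.0);

  // Birthday-style bound without ejection: ~sqrt(2)^keep_bits files.
  const double without_ejection =
      std::pow(1.414214, keep_bits) / files_per_day;

  // With ejection and a full table of 2^table_bits live files.
  const double with_full_table =
      std::pow(2.0, keep_bits - table_bits) / files_per_day;

  std::printf("multiplier       = %g\n", multiplier);  // ~9.0e15
  std::printf("without ejection = %g days (%g corrected)\n",
              without_ejection, without_ejection * multiplier);  // ~13.4 raw
  std::printf("with full table  = %g days (%g corrected)\n",
              with_full_table, with_full_table * multiplier);    // ~430 raw
  return 0;
}
```

Multiplying the raw estimates (roughly 13 and 430 days) by the ~9.0e15 correction factor gives on the order of 1e17 to 4e18 days between expected collisions at full key width, which is why the simulation deliberately keeps only 50 of the 128 bits: otherwise no collision could ever be observed directly.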
+ +#include "cache/cache_entry_roles.h" + +#include <mutex> + +#include "port/lang.h" + +namespace ROCKSDB_NAMESPACE { + +std::array<std::string, kNumCacheEntryRoles> kCacheEntryRoleToCamelString{{ + "DataBlock", + "FilterBlock", + "FilterMetaBlock", + "DeprecatedFilterBlock", + "IndexBlock", + "OtherBlock", + "WriteBuffer", + "CompressionDictionaryBuildingBuffer", + "FilterConstruction", + "BlockBasedTableReader", + "FileMetadata", + "BlobValue", + "BlobCache", + "Misc", +}}; + +std::array<std::string, kNumCacheEntryRoles> kCacheEntryRoleToHyphenString{{ + "data-block", + "filter-block", + "filter-meta-block", + "deprecated-filter-block", + "index-block", + "other-block", + "write-buffer", + "compression-dictionary-building-buffer", + "filter-construction", + "block-based-table-reader", + "file-metadata", + "blob-value", + "blob-cache", + "misc", +}}; + +const std::string& GetCacheEntryRoleName(CacheEntryRole role) { + return kCacheEntryRoleToHyphenString[static_cast<size_t>(role)]; +} + +const std::string& BlockCacheEntryStatsMapKeys::CacheId() { + static const std::string kCacheId = "id"; + return kCacheId; +} + +const std::string& BlockCacheEntryStatsMapKeys::CacheCapacityBytes() { + static const std::string kCacheCapacityBytes = "capacity"; + return kCacheCapacityBytes; +} + +const std::string& +BlockCacheEntryStatsMapKeys::LastCollectionDurationSeconds() { + static const std::string kLastCollectionDurationSeconds = + "secs_for_last_collection"; + return kLastCollectionDurationSeconds; +} + +const std::string& BlockCacheEntryStatsMapKeys::LastCollectionAgeSeconds() { + static const std::string kLastCollectionAgeSeconds = + "secs_since_last_collection"; + return kLastCollectionAgeSeconds; +} + +namespace { + +std::string GetPrefixedCacheEntryRoleName(const std::string& prefix, + CacheEntryRole role) { + const std::string& role_name = GetCacheEntryRoleName(role); + std::string prefixed_role_name; + prefixed_role_name.reserve(prefix.size() + role_name.size()); + prefixed_role_name.append(prefix); + prefixed_role_name.append(role_name); + return prefixed_role_name; +} + +} // namespace + +std::string BlockCacheEntryStatsMapKeys::EntryCount(CacheEntryRole role) { + const static std::string kPrefix = "count."; + return GetPrefixedCacheEntryRoleName(kPrefix, role); +} + +std::string BlockCacheEntryStatsMapKeys::UsedBytes(CacheEntryRole role) { + const static std::string kPrefix = "bytes."; + return GetPrefixedCacheEntryRoleName(kPrefix, role); +} + +std::string BlockCacheEntryStatsMapKeys::UsedPercent(CacheEntryRole role) { + const static std::string kPrefix = "percent."; + return GetPrefixedCacheEntryRoleName(kPrefix, role); +} + +namespace { + +struct Registry { + std::mutex mutex; + UnorderedMap<Cache::DeleterFn, CacheEntryRole> role_map; + void Register(Cache::DeleterFn fn, CacheEntryRole role) { + std::lock_guard<std::mutex> lock(mutex); + role_map[fn] = role; + } + UnorderedMap<Cache::DeleterFn, CacheEntryRole> Copy() { + std::lock_guard<std::mutex> lock(mutex); + return role_map; + } +}; + +Registry& GetRegistry() { + STATIC_AVOID_DESTRUCTION(Registry, registry); + return registry; +} + +} // namespace + +void RegisterCacheDeleterRole(Cache::DeleterFn fn, CacheEntryRole role) { + GetRegistry().Register(fn, role); +} + +UnorderedMap<Cache::DeleterFn, CacheEntryRole> CopyCacheDeleterRoleMap() { + return GetRegistry().Copy(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/cache/cache_entry_roles.h b/src/rocksdb/cache/cache_entry_roles.h new file mode 100644 index 
000000000..5a49fdfd4 --- /dev/null +++ b/src/rocksdb/cache/cache_entry_roles.h @@ -0,0 +1,103 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <array> +#include <cstdint> +#include <memory> +#include <type_traits> + +#include "rocksdb/cache.h" +#include "util/hash_containers.h" + +namespace ROCKSDB_NAMESPACE { + +extern std::array<std::string, kNumCacheEntryRoles> + kCacheEntryRoleToCamelString; +extern std::array<std::string, kNumCacheEntryRoles> + kCacheEntryRoleToHyphenString; + +// To associate cache entries with their role, we use a hack on the +// existing Cache interface. Because the deleter of an entry can authenticate +// the code origin of an entry, we can elaborate the choice of deleter to +// also encode role information, without inferring false role information +// from entries not choosing to encode a role. +// +// The rest of this file is for handling mappings between deleters and +// roles. + +// To infer a role from a deleter, the deleter must be registered. This +// can be done "manually" with this function. This function is thread-safe, +// and the registration mappings go into private but static storage. (Note +// that DeleterFn is a function pointer, not std::function. Registrations +// should not be too many.) +void RegisterCacheDeleterRole(Cache::DeleterFn fn, CacheEntryRole role); + +// Gets a copy of the registered deleter -> role mappings. This is the only +// function for reading the mappings made with RegisterCacheDeleterRole. +// Why only this interface for reading? +// * This function has to be thread safe, which could incur substantial +// overhead. We should not pay this overhead for every deleter look-up. +// * This is suitable for preparing for batch operations, like with +// CacheEntryStatsCollector. +// * The number of mappings should be sufficiently small (dozens). +UnorderedMap<Cache::DeleterFn, CacheEntryRole> CopyCacheDeleterRoleMap(); + +// ************************************************************** // +// An automatic registration infrastructure. This enables code +// to simply ask for a deleter associated with a particular type +// and role, and registration is automatic. In a sense, this is +// a small dependency injection infrastructure, because linking +// in new deleter instantiations is essentially sufficient for +// making stats collection (using CopyCacheDeleterRoleMap) aware +// of them. + +namespace cache_entry_roles_detail { + +template <typename T, CacheEntryRole R> +struct RegisteredDeleter { + RegisteredDeleter() { RegisterCacheDeleterRole(Delete, R); } + + // These have global linkage to help ensure compiler optimizations do not + // break uniqueness for each <T,R> + static void Delete(const Slice& /* key */, void* value) { + // Supports T == Something[], unlike delete operator + std::default_delete<T>()( + static_cast<typename std::remove_extent<T>::type*>(value)); + } +}; + +template <CacheEntryRole R> +struct RegisteredNoopDeleter { + RegisteredNoopDeleter() { RegisterCacheDeleterRole(Delete, R); } + + static void Delete(const Slice& /* key */, void* /* value */) { + // Here was `assert(value == nullptr);` but we can also put pointers + // to static data in Cache, for testing at least. 
+ } +}; + +} // namespace cache_entry_roles_detail + +// Get an automatically registered deleter for value type T and role R. +// Based on C++ semantics, registration is invoked exactly once in a +// thread-safe way on first call to this function, for each <T, R>. +template <typename T, CacheEntryRole R> +Cache::DeleterFn GetCacheEntryDeleterForRole() { + static cache_entry_roles_detail::RegisteredDeleter<T, R> reg; + return reg.Delete; +} + +// Get an automatically registered no-op deleter (value should be nullptr) +// and associated with role R. This is used for Cache "reservation" entries +// such as for WriteBufferManager. +template <CacheEntryRole R> +Cache::DeleterFn GetNoopDeleterForRole() { + static cache_entry_roles_detail::RegisteredNoopDeleter<R> reg; + return reg.Delete; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/cache/cache_entry_stats.h b/src/rocksdb/cache/cache_entry_stats.h new file mode 100644 index 000000000..63b12735b --- /dev/null +++ b/src/rocksdb/cache/cache_entry_stats.h @@ -0,0 +1,183 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <array> +#include <cstdint> +#include <memory> +#include <mutex> + +#include "cache/cache_helpers.h" +#include "cache/cache_key.h" +#include "port/lang.h" +#include "rocksdb/cache.h" +#include "rocksdb/status.h" +#include "rocksdb/system_clock.h" +#include "test_util/sync_point.h" +#include "util/coding_lean.h" + +namespace ROCKSDB_NAMESPACE { + +// A generic helper object for gathering stats about cache entries by +// iterating over them with ApplyToAllEntries. This class essentially +// solves the problem of slowing down a Cache with too many stats +// collectors that could be sharing stat results, such as from multiple +// column families or multiple DBs sharing a Cache. We employ a few +// mitigations: +// * Only one collector for a particular kind of Stats is alive +// for each Cache. This is guaranteed using the Cache itself to hold +// the collector. +// * A mutex ensures only one thread is gathering stats for this +// collector. +// * The most recent gathered stats are saved and simply copied to +// satisfy requests within a time window (default: 3 minutes) of +// completion of the most recent stat gathering. +// +// Template parameter Stats must be copyable and trivially constructable, +// as well as... +// concept Stats { +// // Notification before applying callback to all entries +// void BeginCollection(Cache*, SystemClock*, uint64_t start_time_micros); +// // Get the callback to apply to all entries. `callback` +// // type must be compatible with Cache::ApplyToAllEntries +// callback GetEntryCallback(); +// // Notification after applying callback to all entries +// void EndCollection(Cache*, SystemClock*, uint64_t end_time_micros); +// // Notification that a collection was skipped because of +// // sufficiently recent saved results. +// void SkippedCollection(); +// } +template <class Stats> +class CacheEntryStatsCollector { + public: + // Gather and save stats if saved stats are too old. (Use GetStats() to + // read saved stats.) + // + // Maximum allowed age for a "hit" on saved results is determined by the + // two interval parameters. Both set to 0 forces a re-scan. 
For example + // with min_interval_seconds=300 and min_interval_factor=100, if the last + // scan took 10s, we would only rescan ("miss") if the age in seconds of + // the saved results is > max(300, 100*10). + // Justification: scans can vary wildly in duration, e.g. from 0.02 sec + // to as much as 20 seconds, so we want to be able to cap the absolute + // and relative frequency of scans. + void CollectStats(int min_interval_seconds, int min_interval_factor) { + // Waits for any pending reader or writer (collector) + std::lock_guard<std::mutex> lock(working_mutex_); + + uint64_t max_age_micros = + static_cast<uint64_t>(std::max(min_interval_seconds, 0)) * 1000000U; + + if (last_end_time_micros_ > last_start_time_micros_ && + min_interval_factor > 0) { + max_age_micros = std::max( + max_age_micros, min_interval_factor * (last_end_time_micros_ - + last_start_time_micros_)); + } + + uint64_t start_time_micros = clock_->NowMicros(); + if ((start_time_micros - last_end_time_micros_) > max_age_micros) { + last_start_time_micros_ = start_time_micros; + working_stats_.BeginCollection(cache_, clock_, start_time_micros); + + cache_->ApplyToAllEntries(working_stats_.GetEntryCallback(), {}); + TEST_SYNC_POINT_CALLBACK( + "CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries", nullptr); + + uint64_t end_time_micros = clock_->NowMicros(); + last_end_time_micros_ = end_time_micros; + working_stats_.EndCollection(cache_, clock_, end_time_micros); + } else { + working_stats_.SkippedCollection(); + } + + // Save so that we don't need to wait for an outstanding collection in + // order to make of copy of the last saved stats + std::lock_guard<std::mutex> lock2(saved_mutex_); + saved_stats_ = working_stats_; + } + + // Gets saved stats, regardless of age + void GetStats(Stats *stats) { + std::lock_guard<std::mutex> lock(saved_mutex_); + *stats = saved_stats_; + } + + Cache *GetCache() const { return cache_; } + + // Gets or creates a shared instance of CacheEntryStatsCollector in the + // cache itself, and saves into `ptr`. This shared_ptr will hold the + // entry in cache until all refs are destroyed. + static Status GetShared(Cache *cache, SystemClock *clock, + std::shared_ptr<CacheEntryStatsCollector> *ptr) { + const Slice &cache_key = GetCacheKey(); + + Cache::Handle *h = cache->Lookup(cache_key); + if (h == nullptr) { + // Not yet in cache, but Cache doesn't provide a built-in way to + // avoid racing insert. So we double-check under a shared mutex, + // inspired by TableCache. + STATIC_AVOID_DESTRUCTION(std::mutex, static_mutex); + std::lock_guard<std::mutex> lock(static_mutex); + + h = cache->Lookup(cache_key); + if (h == nullptr) { + auto new_ptr = new CacheEntryStatsCollector(cache, clock); + // TODO: non-zero charge causes some tests that count block cache + // usage to go flaky. Fix the problem somehow so we can use an + // accurate charge. + size_t charge = 0; + Status s = cache->Insert(cache_key, new_ptr, charge, Deleter, &h, + Cache::Priority::HIGH); + if (!s.ok()) { + assert(h == nullptr); + delete new_ptr; + return s; + } + } + } + // If we reach here, shared entry is in cache with handle `h`. + assert(cache->GetDeleter(h) == Deleter); + + // Build an aliasing shared_ptr that keeps `ptr` in cache while there + // are references. 
+ *ptr = MakeSharedCacheHandleGuard<CacheEntryStatsCollector>(cache, h); + return Status::OK(); + } + + private: + explicit CacheEntryStatsCollector(Cache *cache, SystemClock *clock) + : saved_stats_(), + working_stats_(), + last_start_time_micros_(0), + last_end_time_micros_(/*pessimistic*/ 10000000), + cache_(cache), + clock_(clock) {} + + static void Deleter(const Slice &, void *value) { + delete static_cast<CacheEntryStatsCollector *>(value); + } + + static const Slice &GetCacheKey() { + // For each template instantiation + static CacheKey ckey = CacheKey::CreateUniqueForProcessLifetime(); + static Slice ckey_slice = ckey.AsSlice(); + return ckey_slice; + } + + std::mutex saved_mutex_; + Stats saved_stats_; + + std::mutex working_mutex_; + Stats working_stats_; + uint64_t last_start_time_micros_; + uint64_t last_end_time_micros_; + + Cache *const cache_; + SystemClock *const clock_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/cache/cache_helpers.h b/src/rocksdb/cache/cache_helpers.h new file mode 100644 index 000000000..7ea2365b8 --- /dev/null +++ b/src/rocksdb/cache/cache_helpers.h @@ -0,0 +1,145 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <cassert> + +#include "rocksdb/cache.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// Returns the cached value given a cache handle. +template <typename T> +T* GetFromCacheHandle(Cache* cache, Cache::Handle* handle) { + assert(cache); + assert(handle); + + return static_cast<T*>(cache->Value(handle)); +} + +// Simple generic deleter for Cache (to be used with Cache::Insert). +template <typename T> +void DeleteCacheEntry(const Slice& /* key */, void* value) { + delete static_cast<T*>(value); +} + +// Turns a T* into a Slice so it can be used as a key with Cache. +template <typename T> +Slice GetSlice(const T* t) { + return Slice(reinterpret_cast<const char*>(t), sizeof(T)); +} + +// Generic resource management object for cache handles that releases the handle +// when destroyed. Has unique ownership of the handle, so copying it is not +// allowed, while moving it transfers ownership. 
+template <typename T> +class CacheHandleGuard { + public: + CacheHandleGuard() = default; + + CacheHandleGuard(Cache* cache, Cache::Handle* handle) + : cache_(cache), + handle_(handle), + value_(GetFromCacheHandle<T>(cache, handle)) { + assert(cache_ && handle_ && value_); + } + + CacheHandleGuard(const CacheHandleGuard&) = delete; + CacheHandleGuard& operator=(const CacheHandleGuard&) = delete; + + CacheHandleGuard(CacheHandleGuard&& rhs) noexcept + : cache_(rhs.cache_), handle_(rhs.handle_), value_(rhs.value_) { + assert((!cache_ && !handle_ && !value_) || (cache_ && handle_ && value_)); + + rhs.ResetFields(); + } + + CacheHandleGuard& operator=(CacheHandleGuard&& rhs) noexcept { + if (this == &rhs) { + return *this; + } + + ReleaseHandle(); + + cache_ = rhs.cache_; + handle_ = rhs.handle_; + value_ = rhs.value_; + + assert((!cache_ && !handle_ && !value_) || (cache_ && handle_ && value_)); + + rhs.ResetFields(); + + return *this; + } + + ~CacheHandleGuard() { ReleaseHandle(); } + + bool IsEmpty() const { return !handle_; } + + Cache* GetCache() const { return cache_; } + Cache::Handle* GetCacheHandle() const { return handle_; } + T* GetValue() const { return value_; } + + void TransferTo(Cleanable* cleanable) { + if (cleanable) { + if (handle_ != nullptr) { + assert(cache_); + cleanable->RegisterCleanup(&ReleaseCacheHandle, cache_, handle_); + } + } + ResetFields(); + } + + void Reset() { + ReleaseHandle(); + ResetFields(); + } + + private: + void ReleaseHandle() { + if (IsEmpty()) { + return; + } + + assert(cache_); + cache_->Release(handle_); + } + + void ResetFields() { + cache_ = nullptr; + handle_ = nullptr; + value_ = nullptr; + } + + static void ReleaseCacheHandle(void* arg1, void* arg2) { + Cache* const cache = static_cast<Cache*>(arg1); + assert(cache); + + Cache::Handle* const cache_handle = static_cast<Cache::Handle*>(arg2); + assert(cache_handle); + + cache->Release(cache_handle); + } + + private: + Cache* cache_ = nullptr; + Cache::Handle* handle_ = nullptr; + T* value_ = nullptr; +}; + +// Build an aliasing shared_ptr that keeps `handle` in cache while there +// are references, but the pointer is to the value for that cache entry, +// which must be of type T. This is copyable, unlike CacheHandleGuard, but +// does not provide access to caching details. +template <typename T> +std::shared_ptr<T> MakeSharedCacheHandleGuard(Cache* cache, + Cache::Handle* handle) { + auto wrapper = std::make_shared<CacheHandleGuard<T>>(cache, handle); + return std::shared_ptr<T>(wrapper, static_cast<T*>(cache->Value(handle))); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/cache/cache_key.cc b/src/rocksdb/cache/cache_key.cc new file mode 100644 index 000000000..a79328972 --- /dev/null +++ b/src/rocksdb/cache/cache_key.cc @@ -0,0 +1,364 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "cache/cache_key.h" + +#include <algorithm> +#include <atomic> + +#include "rocksdb/cache.h" +#include "table/unique_id_impl.h" +#include "util/hash.h" +#include "util/math.h" + +namespace ROCKSDB_NAMESPACE { + +// Value space plan for CacheKey: +// +// file_num_etc64_ | offset_etc64_ | Only generated by +// ---------------+---------------+------------------------------------------ +// 0 | 0 | Reserved for "empty" CacheKey() +// 0 | > 0, < 1<<63 | CreateUniqueForCacheLifetime +// 0 | >= 1<<63 | CreateUniqueForProcessLifetime +// > 0 | any | OffsetableCacheKey.WithOffset + +CacheKey CacheKey::CreateUniqueForCacheLifetime(Cache *cache) { + // +1 so that we can reserve all zeros for "unset" cache key + uint64_t id = cache->NewId() + 1; + // Ensure we don't collide with CreateUniqueForProcessLifetime + assert((id >> 63) == 0U); + return CacheKey(0, id); +} + +CacheKey CacheKey::CreateUniqueForProcessLifetime() { + // To avoid colliding with CreateUniqueForCacheLifetime, assuming + // Cache::NewId counts up from zero, here we count down from UINT64_MAX. + // If this ever becomes a point of contention, we could sub-divide the + // space and use CoreLocalArray. + static std::atomic<uint64_t> counter{UINT64_MAX}; + uint64_t id = counter.fetch_sub(1, std::memory_order_relaxed); + // Ensure we don't collide with CreateUniqueForCacheLifetime + assert((id >> 63) == 1U); + return CacheKey(0, id); +} + +// How we generate CacheKeys and base OffsetableCacheKey, assuming that +// db_session_ids are generated from a base_session_id and +// session_id_counter (by SemiStructuredUniqueIdGen+EncodeSessionId +// in DBImpl::GenerateDbSessionId): +// +// Conceptual inputs: +// db_id (unstructured, from GenerateRawUniqueId or equiv) +// * could be shared between cloned DBs but rare +// * could be constant, if session id suffices +// base_session_id (unstructured, from GenerateRawUniqueId) +// session_id_counter (structured) +// * usually much smaller than 2**24 +// orig_file_number (structured) +// * usually smaller than 2**24 +// offset_in_file (structured, might skip lots of values) +// * usually smaller than 2**32 +// +// Overall approach (see https://github.com/pdillinger/unique_id for +// background): +// +// First, we have three "structured" values, up to 64 bits each, that we +// need to fit, without losses, into 128 bits. In practice, the values will +// be small enough that they should fit. For example, applications generating +// large SST files (large offsets) will naturally produce fewer files (small +// file numbers). But we don't know ahead of time what bounds the values will +// have. +// +// Second, we have unstructured inputs that enable distinct RocksDB processes +// to pick a random point in space, likely very different from others. Xoring +// the structured with the unstructured give us a cache key that is +// structurally distinct between related keys (e.g. same file or same RocksDB +// process) and distinct with high probability between unrelated keys. +// +// The problem of packing three structured values into the space for two is +// complicated by the fact that we want to derive cache keys from SST unique +// IDs, which have already combined structured and unstructured inputs in a +// practically inseparable way. And we want a base cache key that works +// with an offset of any size. So basically, we need to encode these three +// structured values, each up to 64 bits, into 128 bits without knowing any +// of their sizes. 
The DownwardInvolution() function gives us a mechanism to +// accomplish this. (See its properties in math.h.) Specifically, for inputs +// a, b, and c: +// lower64 = DownwardInvolution(a) ^ ReverseBits(b); +// upper64 = c ^ ReverseBits(a); +// The 128-bit output is unique assuming there exist some i, j, and k +// where a < 2**i, b < 2**j, c < 2**k, i <= 64, j <= 64, k <= 64, and +// i + j + k <= 128. In other words, as long as there exist some bounds +// that would allow us to pack the bits of a, b, and c into the output +// if we know the bound, we can generate unique outputs without knowing +// those bounds. To validate this claim, the inversion function (given +// the bounds) has been implemented in CacheKeyDecoder in +// db_block_cache_test.cc. +// +// With that in mind, the outputs in terms of the conceptual inputs look +// like this, using bitwise-xor of the constituent pieces, low bits on left: +// +// |------------------------- file_num_etc64 -------------------------| +// | +++++++++ base_session_id (lower 64 bits, involution) +++++++++ | +// |-----------------------------------------------------------------| +// | session_id_counter (involution) ..... | | +// |-----------------------------------------------------------------| +// | hash of: ++++++++++++++++++++++++++++++++++++++++++++++++++++++ | +// | * base_session_id (upper ~39 bits) | +// | * db_id (~122 bits entropy) | +// |-----------------------------------------------------------------| +// | | ..... orig_file_number (reversed) | +// |-----------------------------------------------------------------| +// +// +// |------------------------- offset_etc64 --------------------------| +// | ++++++++++ base_session_id (lower 64 bits, reversed) ++++++++++ | +// |-----------------------------------------------------------------| +// | | ..... session_id_counter (reversed) | +// |-----------------------------------------------------------------| +// | offset_in_file ............... | | +// |-----------------------------------------------------------------| +// +// Some oddities or inconveniences of this layout are due to deriving +// the "base" cache key (without offset) from the SST unique ID (see +// GetSstInternalUniqueId). Specifically, +// * Lower 64 of base_session_id occurs in both output words (ok but +// weird) +// * The inclusion of db_id is bad for the conditions under which we +// can guarantee uniqueness, but could be useful in some cases with +// few small files per process, to make up for db session id only having +// ~103 bits of entropy. +// +// In fact, if DB ids were not involved, we would be guaranteed unique +// cache keys for files generated in a single process until total bits for +// biggest session_id_counter, orig_file_number, and offset_in_file +// reach 128 bits. +// +// With the DB id limitation, we only have nice guaranteed unique cache +// keys for files generated in a single process until biggest +// session_id_counter and offset_in_file reach combined 64 bits. This +// is quite good in practice because we can have millions of DB Opens +// with terabyte size SST files, or billions of DB Opens with gigabyte +// size SST files. +// +// One of the considerations in the translation between existing SST unique +// IDs and base cache keys is supporting better SST unique IDs in a future +// format_version. If we use a process-wide file counter instead of +// session counter and file numbers, we only need to combine two 64-bit values +// instead of three. 
But we don't want to track unique ID versions in the +// manifest, so we want to keep the same translation layer between SST unique +// IDs and base cache keys, even with updated SST unique IDs. If the new +// unique IDs put the file counter where the orig_file_number was, and +// use no structured field where session_id_counter was, then our translation +// layer works fine for two structured fields as well as three (for +// compatibility). The small computation for the translation (one +// DownwardInvolution(), two ReverseBits(), both ~log(64) instructions deep) +// is negligible for computing as part of SST file reader open. +// +// More on how https://github.com/pdillinger/unique_id applies here: +// Every bit of output always includes "unstructured" uniqueness bits and +// often combines with "structured" uniqueness bits. The "unstructured" bits +// change infrequently: only when we cannot guarantee our state tracking for +// "structured" uniqueness hasn't been cloned. Using a static +// SemiStructuredUniqueIdGen for db_session_ids, this means we only get an +// "all new" session id when a new process uses RocksDB. (Between processes, +// we don't know if a DB or other persistent storage has been cloned. We +// assume that if VM hot cloning is used, subsequently generated SST files +// do not interact.) Within a process, only the session_lower of the +// db_session_id changes incrementally ("structured" uniqueness). +// +// This basically means that our offsets, counters and file numbers allow us +// to do somewhat "better than random" (birthday paradox) while in the +// degenerate case of completely new session for each tiny file, we still +// have strong uniqueness properties from the birthday paradox, with ~103 +// bit session IDs or up to 128 bits entropy with different DB IDs sharing a +// cache. +// +// More collision probability analysis: +// Suppose a RocksDB host generates (generously) 2 GB/s (10TB data, 17 DWPD) +// with average process/session lifetime of (pessimistically) 4 minutes. +// In 180 days (generous allowable data lifespan), we generate 31 million GB +// of data, or 2^55 bytes, and 2^16 "all new" session IDs. +// +// First, suppose this is in a single DB (lifetime 180 days): +// 128 bits cache key size +// - 55 <- ideal size for byte offsets + file numbers +// - 2 <- bits for offsets and file numbers not exactly powers of two +// + 2 <- bits saved not using byte offsets in BlockBasedTable::GetCacheKey +// ---- +// 73 <- bits remaining for distinguishing session IDs +// The probability of a collision in 73 bits of session ID data is less than +// 1 in 2**(73 - (2 * 16)), or roughly 1 in a trillion. And this assumes all +// data from the last 180 days is in cache for potential collision, and that +// cache keys under each session id exhaustively cover the remaining 57 bits +// while in reality they'll only cover a small fraction of it. +// +// Although data could be transferred between hosts, each host has its own +// cache and we are already assuming a high rate of "all new" session ids. +// So this doesn't really change the collision calculation. Across a fleet +// of 1 million, each with <1 in a trillion collision possibility, +// fleetwide collision probability is <1 in a million. +// +// Now suppose we have many DBs per host, say 2**10, with same host-wide write +// rate and process/session lifetime. File numbers will be ~10 bits smaller +// and we will have 2**10 times as many session IDs because of simultaneous +// lifetimes. 
So now collision chance is less than 1 in 2**(83 - (2 * 26)), +// or roughly 1 in a billion. +// +// Suppose instead we generated random or hashed cache keys for each +// (compressed) block. For 1KB compressed block size, that is 2^45 cache keys +// in 180 days. Collision probability is more easily estimated at roughly +// 1 in 2**(128 - (2 * 45)) or roughly 1 in a trillion (assuming all +// data from the last 180 days is in cache, but NOT the other assumption +// for the 1 in a trillion estimate above). +// +// +// Collision probability estimation through simulation: +// A tool ./cache_bench -stress_cache_key broadly simulates host-wide cache +// activity over many months, by making some pessimistic simplifying +// assumptions. See class StressCacheKey in cache_bench_tool.cc for details. +// Here is some sample output with +// `./cache_bench -stress_cache_key -sck_keep_bits=43`: +// +// Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day +// Multiply by 1.15292e+18 to correct for simulation losses (but still +// assume whole file cached) +// +// These come from default settings of 2.5M files per day of 32 MB each, and +// `-sck_keep_bits=43` means that to represent a single file, we are only +// keeping 43 bits of the 128-bit (base) cache key. With file size of 2**25 +// contiguous keys (pessimistic), our simulation is about 2\*\*(128-43-25) or +// about 1 billion billion times more prone to collision than reality. +// +// More default assumptions, relatively pessimistic: +// * 100 DBs in same process (doesn't matter much) +// * Re-open DB in same process (new session ID related to old session ID) on +// average every 100 files generated +// * Restart process (all new session IDs unrelated to old) 24 times per day +// +// After enough data, we get a result at the end (-sck_keep_bits=43): +// +// (keep 43 bits) 18 collisions after 2 x 90 days, est 10 days between +// (1.15292e+19 corrected) +// +// If we believe the (pessimistic) simulation and the mathematical +// extrapolation, we would need to run a billion machines all for 11 billion +// days to expect a cache key collision. To help verify that our extrapolation +// ("corrected") is robust, we can make our simulation more precise by +// increasing the "keep" bits, which takes more running time to get enough +// collision data: +// +// (keep 44 bits) 16 collisions after 5 x 90 days, est 28.125 days between +// (1.6213e+19 corrected) +// (keep 45 bits) 15 collisions after 7 x 90 days, est 42 days between +// (1.21057e+19 corrected) +// (keep 46 bits) 15 collisions after 17 x 90 days, est 102 days between +// (1.46997e+19 corrected) +// (keep 47 bits) 15 collisions after 49 x 90 days, est 294 days between +// (2.11849e+19 corrected) +// +// The extrapolated prediction seems to be within noise (sampling error). +// +// With the `-sck_randomize` option, we can see that typical workloads like +// above have lower collision probability than "random" cache keys (note: +// offsets still non-randomized) by a modest amount (roughly 2-3x less +// collision prone than random), which should make us reasonably comfortable +// even in "degenerate" cases (e.g. 
repeatedly launch a process to generate +// one file with SstFileWriter): +// +// (rand 43 bits) 22 collisions after 1 x 90 days, est 4.09091 days between +// (4.7165e+18 corrected) +// +// We can see that with more frequent process restarts, +// -sck_restarts_per_day=5000, which means more all-new session IDs, we get +// closer to the "random" cache key performance: +// +// 15 collisions after 1 x 90 days, est 6 days between (6.91753e+18 corrected) +// +// And with less frequent process restarts and re-opens, +// -sck_restarts_per_day=1 -sck_reopen_nfiles=1000, we get lower collision +// probability: +// +// 18 collisions after 8 x 90 days, est 40 days between (4.61169e+19 corrected) +// +// Other tests have been run to validate other conditions behave as expected, +// never behaving "worse than random" unless we start chopping off structured +// data. +// +// Conclusion: Even in extreme cases, rapidly burning through "all new" IDs +// that only arise when a new process is started, the chance of any cache key +// collisions in a giant fleet of machines is negligible. Especially when +// processes live for hours or days, the chance of a cache key collision is +// likely more plausibly due to bad hardware than to bad luck in random +// session ID data. Software defects are surely more likely to cause corruption +// than both of those. +// +// TODO: Nevertheless / regardless, an efficient way to detect (and thus +// quantify) block cache corruptions, including collisions, should be added. +OffsetableCacheKey::OffsetableCacheKey(const std::string &db_id, + const std::string &db_session_id, + uint64_t file_number) { + UniqueId64x2 internal_id; + Status s = GetSstInternalUniqueId(db_id, db_session_id, file_number, + &internal_id, /*force=*/true); + assert(s.ok()); + *this = FromInternalUniqueId(&internal_id); +} + +OffsetableCacheKey OffsetableCacheKey::FromInternalUniqueId(UniqueIdPtr id) { + uint64_t session_lower = id.ptr[0]; + uint64_t file_num_etc = id.ptr[1]; + +#ifndef NDEBUG + bool is_empty = session_lower == 0 && file_num_etc == 0; +#endif + + // Although DBImpl guarantees (in recent versions) that session_lower is not + // zero, that's not entirely sufficient to guarantee that file_num_etc64_ is + // not zero (so that the 0 case can be used by CacheKey::CreateUnique*) + // However, if we are given an "empty" id as input, then we should produce + // "empty" as output. + // As a consequence, this function is only bijective assuming + // id[0] == 0 only if id[1] == 0. + if (session_lower == 0U) { + session_lower = file_num_etc; + } + + // See comments above for how DownwardInvolution and ReverseBits + // make this function invertible under various assumptions. + OffsetableCacheKey rv; + rv.file_num_etc64_ = + DownwardInvolution(session_lower) ^ ReverseBits(file_num_etc); + rv.offset_etc64_ = ReverseBits(session_lower); + + // Because of these transformations and needing to allow arbitrary + // offset (thus, second 64 bits of cache key might be 0), we need to + // make some correction to ensure the first 64 bits is not 0. + // Fortunately, the transformation ensures the second 64 bits is not 0 + // for non-empty base key, so we can swap in the case one is 0 without + // breaking bijectivity (assuming condition above). 
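+  // (In terms of the value space plan earlier in this file: keeping
+  // file_num_etc64_ non-zero for every non-empty base key also keeps
+  // WithOffset() results disjoint from the file_num_etc64_ == 0 range
+  // reserved for the empty key and the CreateUnique* keys.)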
+ assert(is_empty || rv.offset_etc64_ > 0); + if (rv.file_num_etc64_ == 0) { + std::swap(rv.file_num_etc64_, rv.offset_etc64_); + } + assert(is_empty || rv.file_num_etc64_ > 0); + return rv; +} + +// Inverse of FromInternalUniqueId (assuming file_num_etc64 == 0 only if +// offset_etc64 == 0) +UniqueId64x2 OffsetableCacheKey::ToInternalUniqueId() { + uint64_t a = file_num_etc64_; + uint64_t b = offset_etc64_; + if (b == 0) { + std::swap(a, b); + } + UniqueId64x2 rv; + rv[0] = ReverseBits(b); + rv[1] = ReverseBits(a ^ DownwardInvolution(rv[0])); + return rv; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/cache/cache_key.h b/src/rocksdb/cache/cache_key.h new file mode 100644 index 000000000..0b93c6bd9 --- /dev/null +++ b/src/rocksdb/cache/cache_key.h @@ -0,0 +1,143 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <cstdint> + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/slice.h" +#include "table/unique_id_impl.h" + +namespace ROCKSDB_NAMESPACE { + +class Cache; + +// A standard holder for fixed-size block cache keys (and for related caches). +// They are created through one of these, each using its own range of values: +// * CacheKey::CreateUniqueForCacheLifetime +// * CacheKey::CreateUniqueForProcessLifetime +// * Default ctor ("empty" cache key) +// * OffsetableCacheKey->WithOffset +// +// The first two use atomic counters to guarantee uniqueness over the given +// lifetime and the last uses a form of universally unique identifier for +// uniqueness with very high probabilty (and guaranteed for files generated +// during a single process lifetime). +// +// CacheKeys are currently used by calling AsSlice() to pass as a key to +// Cache. For performance, the keys are endianness-dependent (though otherwise +// portable). (Persistable cache entries are not intended to cross platforms.) +class CacheKey { + public: + // For convenience, constructs an "empty" cache key that is never returned + // by other means. + inline CacheKey() : file_num_etc64_(), offset_etc64_() {} + + inline bool IsEmpty() const { + return (file_num_etc64_ == 0) & (offset_etc64_ == 0); + } + + // Use this cache key as a Slice (byte order is endianness-dependent) + inline Slice AsSlice() const { + static_assert(sizeof(*this) == 16, "Standardized on 16-byte cache key"); + assert(!IsEmpty()); + return Slice(reinterpret_cast<const char *>(this), sizeof(*this)); + } + + // Create a CacheKey that is unique among others associated with this Cache + // instance. Depends on Cache::NewId. This is useful for block cache + // "reservations". + static CacheKey CreateUniqueForCacheLifetime(Cache *cache); + + // Create a CacheKey that is unique among others for the lifetime of this + // process. This is useful for saving in a static data member so that + // different DB instances can agree on a cache key for shared entities, + // such as for CacheEntryStatsCollector. 
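+  //
+  // Illustrative sketch (the static member and its use are only an example,
+  // not part of this header):
+  //
+  //   static const CacheKey kSharedKey =
+  //       CacheKey::CreateUniqueForProcessLifetime();
+  //   ...
+  //   Cache::Handle* h = cache->Lookup(kSharedKey.AsSlice());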
+ static CacheKey CreateUniqueForProcessLifetime(); + + protected: + friend class OffsetableCacheKey; + CacheKey(uint64_t file_num_etc64, uint64_t offset_etc64) + : file_num_etc64_(file_num_etc64), offset_etc64_(offset_etc64) {} + uint64_t file_num_etc64_; + uint64_t offset_etc64_; +}; + +constexpr uint8_t kCacheKeySize = static_cast<uint8_t>(sizeof(CacheKey)); + +// A file-specific generator of cache keys, sometimes referred to as the +// "base" cache key for a file because all the cache keys for various offsets +// within the file are computed using simple arithmetic. The basis for the +// general approach is dicussed here: https://github.com/pdillinger/unique_id +// Heavily related to GetUniqueIdFromTableProperties. +// +// If the db_id, db_session_id, and file_number come from the file's table +// properties, then the keys will be stable across DB::Open/Close, backup/ +// restore, import/export, etc. +// +// This class "is a" CacheKey only privately so that it is not misused as +// a ready-to-use CacheKey. +class OffsetableCacheKey : private CacheKey { + public: + // For convenience, constructs an "empty" cache key that should not be used. + inline OffsetableCacheKey() : CacheKey() {} + + // Constructs an OffsetableCacheKey with the given information about a file. + // This constructor never generates an "empty" base key. + OffsetableCacheKey(const std::string &db_id, const std::string &db_session_id, + uint64_t file_number); + + // Creates an OffsetableCacheKey from an SST unique ID, so that cache keys + // can be derived from DB manifest data before reading the file from + // storage--so that every part of the file can potentially go in a persistent + // cache. + // + // Calling GetSstInternalUniqueId() on a db_id, db_session_id, and + // file_number and passing the result to this function produces the same + // base cache key as feeding those inputs directly to the constructor. + // + // This is a bijective transformation assuming either id is empty or + // lower 64 bits is non-zero: + // * Empty (all zeros) input -> empty (all zeros) output + // * Lower 64 input is non-zero -> lower 64 output (file_num_etc64_) is + // non-zero + static OffsetableCacheKey FromInternalUniqueId(UniqueIdPtr id); + + // This is the inverse transformation to the above, assuming either empty + // or lower 64 bits (file_num_etc64_) is non-zero. Perhaps only useful for + // testing. + UniqueId64x2 ToInternalUniqueId(); + + inline bool IsEmpty() const { + bool result = file_num_etc64_ == 0; + assert(!(offset_etc64_ > 0 && result)); + return result; + } + + // Construct a CacheKey for an offset within a file. An offset is not + // necessarily a byte offset if a smaller unique identifier of keyable + // offsets is used. + // + // This class was designed to make this hot code extremely fast. + inline CacheKey WithOffset(uint64_t offset) const { + assert(!IsEmpty()); + return CacheKey(file_num_etc64_, offset_etc64_ ^ offset); + } + + // The "common prefix" is a shared prefix for all the returned CacheKeys. + // It is specific to the file but the same for all offsets within the file. 
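+  //
+  // For example (illustrative; db_id, db_session_id and file_number stand in
+  // for real table-properties values):
+  //
+  //   OffsetableCacheKey base(db_id, db_session_id, file_number);
+  //   CacheKey k1 = base.WithOffset(0);
+  //   CacheKey k2 = base.WithOffset(4096);
+  //   // k1.AsSlice() and k2.AsSlice() agree on their first kCommonPrefixSize
+  //   // (8) bytes, which equal CommonPrefixSlice().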
+ static constexpr size_t kCommonPrefixSize = 8; + inline Slice CommonPrefixSlice() const { + static_assert(sizeof(file_num_etc64_) == kCommonPrefixSize, + "8 byte common prefix expected"); + assert(!IsEmpty()); + assert(&this->file_num_etc64_ == static_cast<const void *>(this)); + + return Slice(reinterpret_cast<const char *>(this), kCommonPrefixSize); + } +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/cache/cache_reservation_manager.cc b/src/rocksdb/cache/cache_reservation_manager.cc new file mode 100644 index 000000000..53dee5d79 --- /dev/null +++ b/src/rocksdb/cache/cache_reservation_manager.cc @@ -0,0 +1,185 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "cache/cache_reservation_manager.h" + +#include <cassert> +#include <cstddef> +#include <cstring> +#include <memory> + +#include "cache/cache_entry_roles.h" +#include "rocksdb/cache.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "table/block_based/reader_common.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +template <CacheEntryRole R> +CacheReservationManagerImpl<R>::CacheReservationHandle::CacheReservationHandle( + std::size_t incremental_memory_used, + std::shared_ptr<CacheReservationManagerImpl> cache_res_mgr) + : incremental_memory_used_(incremental_memory_used) { + assert(cache_res_mgr); + cache_res_mgr_ = cache_res_mgr; +} + +template <CacheEntryRole R> +CacheReservationManagerImpl< + R>::CacheReservationHandle::~CacheReservationHandle() { + Status s = cache_res_mgr_->ReleaseCacheReservation(incremental_memory_used_); + s.PermitUncheckedError(); +} + +template <CacheEntryRole R> +CacheReservationManagerImpl<R>::CacheReservationManagerImpl( + std::shared_ptr<Cache> cache, bool delayed_decrease) + : delayed_decrease_(delayed_decrease), + cache_allocated_size_(0), + memory_used_(0) { + assert(cache != nullptr); + cache_ = cache; +} + +template <CacheEntryRole R> +CacheReservationManagerImpl<R>::~CacheReservationManagerImpl() { + for (auto* handle : dummy_handles_) { + cache_->Release(handle, true); + } +} + +template <CacheEntryRole R> +Status CacheReservationManagerImpl<R>::UpdateCacheReservation( + std::size_t new_mem_used) { + memory_used_ = new_mem_used; + std::size_t cur_cache_allocated_size = + cache_allocated_size_.load(std::memory_order_relaxed); + if (new_mem_used == cur_cache_allocated_size) { + return Status::OK(); + } else if (new_mem_used > cur_cache_allocated_size) { + Status s = IncreaseCacheReservation(new_mem_used); + return s; + } else { + // In delayed decrease mode, we don't decrease cache reservation + // untill the memory usage is less than 3/4 of what we reserve + // in the cache. 
+ // We do this because + // (1) Dummy entry insertion is expensive in block cache + // (2) Delayed releasing previously inserted dummy entries can save such + // expensive dummy entry insertion on memory increase in the near future, + // which is likely to happen when the memory usage is greater than or equal + // to 3/4 of what we reserve + if (delayed_decrease_ && new_mem_used >= cur_cache_allocated_size / 4 * 3) { + return Status::OK(); + } else { + Status s = DecreaseCacheReservation(new_mem_used); + return s; + } + } +} + +template <CacheEntryRole R> +Status CacheReservationManagerImpl<R>::MakeCacheReservation( + std::size_t incremental_memory_used, + std::unique_ptr<CacheReservationManager::CacheReservationHandle>* handle) { + assert(handle); + Status s = + UpdateCacheReservation(GetTotalMemoryUsed() + incremental_memory_used); + (*handle).reset(new CacheReservationManagerImpl::CacheReservationHandle( + incremental_memory_used, + std::enable_shared_from_this< + CacheReservationManagerImpl<R>>::shared_from_this())); + return s; +} + +template <CacheEntryRole R> +Status CacheReservationManagerImpl<R>::ReleaseCacheReservation( + std::size_t incremental_memory_used) { + assert(GetTotalMemoryUsed() >= incremental_memory_used); + std::size_t updated_total_mem_used = + GetTotalMemoryUsed() - incremental_memory_used; + Status s = UpdateCacheReservation(updated_total_mem_used); + return s; +} + +template <CacheEntryRole R> +Status CacheReservationManagerImpl<R>::IncreaseCacheReservation( + std::size_t new_mem_used) { + Status return_status = Status::OK(); + while (new_mem_used > cache_allocated_size_.load(std::memory_order_relaxed)) { + Cache::Handle* handle = nullptr; + return_status = cache_->Insert(GetNextCacheKey(), nullptr, kSizeDummyEntry, + GetNoopDeleterForRole<R>(), &handle); + + if (return_status != Status::OK()) { + return return_status; + } + + dummy_handles_.push_back(handle); + cache_allocated_size_ += kSizeDummyEntry; + } + return return_status; +} + +template <CacheEntryRole R> +Status CacheReservationManagerImpl<R>::DecreaseCacheReservation( + std::size_t new_mem_used) { + Status return_status = Status::OK(); + + // Decrease to the smallest multiple of kSizeDummyEntry that is greater than + // or equal to new_mem_used We do addition instead of new_mem_used <= + // cache_allocated_size_.load(std::memory_order_relaxed) - kSizeDummyEntry to + // avoid underflow of size_t when cache_allocated_size_ = 0 + while (new_mem_used + kSizeDummyEntry <= + cache_allocated_size_.load(std::memory_order_relaxed)) { + assert(!dummy_handles_.empty()); + auto* handle = dummy_handles_.back(); + cache_->Release(handle, true); + dummy_handles_.pop_back(); + cache_allocated_size_ -= kSizeDummyEntry; + } + return return_status; +} + +template <CacheEntryRole R> +std::size_t CacheReservationManagerImpl<R>::GetTotalReservedCacheSize() { + return cache_allocated_size_.load(std::memory_order_relaxed); +} + +template <CacheEntryRole R> +std::size_t CacheReservationManagerImpl<R>::GetTotalMemoryUsed() { + return memory_used_; +} + +template <CacheEntryRole R> +Slice CacheReservationManagerImpl<R>::GetNextCacheKey() { + // Calling this function will have the side-effect of changing the + // underlying cache_key_ that is shared among other keys generated from this + // fucntion. Therefore please make sure the previous keys are saved/copied + // before calling this function. 
+ cache_key_ = CacheKey::CreateUniqueForCacheLifetime(cache_.get()); + return cache_key_.AsSlice(); +} + +template <CacheEntryRole R> +Cache::DeleterFn CacheReservationManagerImpl<R>::TEST_GetNoopDeleterForRole() { + return GetNoopDeleterForRole<R>(); +} + +template class CacheReservationManagerImpl< + CacheEntryRole::kBlockBasedTableReader>; +template class CacheReservationManagerImpl< + CacheEntryRole::kCompressionDictionaryBuildingBuffer>; +template class CacheReservationManagerImpl<CacheEntryRole::kFilterConstruction>; +template class CacheReservationManagerImpl<CacheEntryRole::kMisc>; +template class CacheReservationManagerImpl<CacheEntryRole::kWriteBuffer>; +template class CacheReservationManagerImpl<CacheEntryRole::kFileMetadata>; +template class CacheReservationManagerImpl<CacheEntryRole::kBlobCache>; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/cache/cache_reservation_manager.h b/src/rocksdb/cache/cache_reservation_manager.h new file mode 100644 index 000000000..147aaa915 --- /dev/null +++ b/src/rocksdb/cache/cache_reservation_manager.h @@ -0,0 +1,316 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <atomic> +#include <cstddef> +#include <cstdint> +#include <memory> +#include <mutex> +#include <vector> + +#include "cache/cache_entry_roles.h" +#include "cache/cache_key.h" +#include "rocksdb/cache.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { +// CacheReservationManager is an interface for reserving cache space for the +// memory used +class CacheReservationManager { + public: + // CacheReservationHandle is for managing the lifetime of a cache reservation + // for an incremental amount of memory used (i.e, incremental_memory_used) + class CacheReservationHandle { + public: + virtual ~CacheReservationHandle() {} + }; + virtual ~CacheReservationManager() {} + virtual Status UpdateCacheReservation(std::size_t new_memory_used) = 0; + // TODO(hx235): replace the usage of + // `UpdateCacheReservation(memory_used_delta, increase)` with + // `UpdateCacheReservation(new_memory_used)` so that we only have one + // `UpdateCacheReservation` function + virtual Status UpdateCacheReservation(std::size_t memory_used_delta, + bool increase) = 0; + virtual Status MakeCacheReservation( + std::size_t incremental_memory_used, + std::unique_ptr<CacheReservationManager::CacheReservationHandle> + *handle) = 0; + virtual std::size_t GetTotalReservedCacheSize() = 0; + virtual std::size_t GetTotalMemoryUsed() = 0; +}; + +// CacheReservationManagerImpl implements interface CacheReservationManager +// for reserving cache space for the memory used by inserting/releasing dummy +// entries in the cache. +// +// This class is NOT thread-safe, except that GetTotalReservedCacheSize() +// can be called without external synchronization. 
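+//
+// Illustrative usage sketch (the shared_ptr<Cache> named `cache` and the
+// kMisc role are assumptions for the example):
+//
+//   auto mgr = std::make_shared<
+//       CacheReservationManagerImpl<CacheEntryRole::kMisc>>(cache);
+//   // Reserve cache space for ~10 MiB of tracked memory:
+//   Status s = mgr->UpdateCacheReservation(10 << 20);
+//   // ... later, release the reservation entirely:
+//   s = mgr->UpdateCacheReservation(0);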
+template <CacheEntryRole R> +class CacheReservationManagerImpl + : public CacheReservationManager, + public std::enable_shared_from_this<CacheReservationManagerImpl<R>> { + public: + class CacheReservationHandle + : public CacheReservationManager::CacheReservationHandle { + public: + CacheReservationHandle( + std::size_t incremental_memory_used, + std::shared_ptr<CacheReservationManagerImpl> cache_res_mgr); + ~CacheReservationHandle() override; + + private: + std::size_t incremental_memory_used_; + std::shared_ptr<CacheReservationManagerImpl> cache_res_mgr_; + }; + + // Construct a CacheReservationManagerImpl + // @param cache The cache where dummy entries are inserted and released for + // reserving cache space + // @param delayed_decrease If set true, then dummy entries won't be released + // immediately when memory usage decreases. + // Instead, it will be released when the memory usage + // decreases to 3/4 of what we have reserved so far. + // This is for saving some future dummy entry + // insertion when memory usage increases are likely to + // happen in the near future. + // + // REQUIRED: cache is not nullptr + explicit CacheReservationManagerImpl(std::shared_ptr<Cache> cache, + bool delayed_decrease = false); + + // no copy constructor, copy assignment, move constructor, move assignment + CacheReservationManagerImpl(const CacheReservationManagerImpl &) = delete; + CacheReservationManagerImpl &operator=(const CacheReservationManagerImpl &) = + delete; + CacheReservationManagerImpl(CacheReservationManagerImpl &&) = delete; + CacheReservationManagerImpl &operator=(CacheReservationManagerImpl &&) = + delete; + + ~CacheReservationManagerImpl() override; + + // One of the two ways of reserving/releasing cache space, + // see MakeCacheReservation() for the other. + // + // Use ONLY one of these two ways to prevent unexpected behavior. + // + // Insert and release dummy entries in the cache to + // match the size of total dummy entries with the least multiple of + // kSizeDummyEntry greater than or equal to new_mem_used + // + // Insert dummy entries if new_memory_used > cache_allocated_size_; + // + // Release dummy entries if new_memory_used < cache_allocated_size_ + // (and new_memory_used < cache_allocated_size_ * 3/4 + // when delayed_decrease is set true); + // + // Keey dummy entries the same if (1) new_memory_used == cache_allocated_size_ + // or (2) new_memory_used is in the interval of + // [cache_allocated_size_ * 3/4, cache_allocated_size) when delayed_decrease + // is set true. + // + // @param new_memory_used The number of bytes used by new memory + // The most recent new_memoy_used passed in will be returned + // in GetTotalMemoryUsed() even when the call return non-ok status. + // + // Since the class is NOT thread-safe, external synchronization on the + // order of calling UpdateCacheReservation() is needed if you want + // GetTotalMemoryUsed() indeed returns the latest memory used. + // + // @return On inserting dummy entries, it returns Status::OK() if all dummy + // entry insertions succeed. + // Otherwise, it returns the first non-ok status; + // On releasing dummy entries, it always returns Status::OK(). + // On keeping dummy entries the same, it always returns Status::OK(). 
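+  //
+  // Worked example (illustrative, with kSizeDummyEntry = 256 KiB and
+  // delayed_decrease = false): UpdateCacheReservation(600 KiB) pins three
+  // dummy entries (768 KiB, the smallest multiple of kSizeDummyEntry
+  // covering the request); a later UpdateCacheReservation(500 KiB) drops
+  // back to two entries (512 KiB).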
+  Status UpdateCacheReservation(std::size_t new_memory_used) override;
+
+  Status UpdateCacheReservation(std::size_t /* memory_used_delta */,
+                                bool /* increase */) override {
+    return Status::NotSupported();
+  }
+
+  // One of the two ways of reserving cache space; the reservation is released
+  // through destruction of the CacheReservationHandle.
+  // See UpdateCacheReservation() for the other way.
+  //
+  // Use ONLY one of these two ways to prevent unexpected behavior.
+  //
+  // Insert dummy entries in the cache for the incremental memory usage
+  // to match the size of total dummy entries with the least multiple of
+  // kSizeDummyEntry greater than or equal to the total memory used.
+  //
+  // A CacheReservationHandle is returned as an output parameter.
+  // The reserved dummy entries are automatically released on the destruction
+  // of this handle, which achieves better RAII per cache reservation.
+  //
+  // WARNING: Deallocate all the handles of the CacheReservationManager object
+  // before deallocating the object to prevent unexpected behavior.
+  //
+  // @param incremental_memory_used The number of bytes increased in memory
+  //        usage.
+  //
+  //        Calling GetTotalMemoryUsed() afterward will return the total memory
+  //        increased by this number, even when MakeCacheReservation()
+  //        returns a non-ok status.
+  //
+  //        Since the class is NOT thread-safe, external synchronization of
+  //        calls to MakeCacheReservation() is needed if you want
+  //        GetTotalMemoryUsed() to indeed return the latest memory used.
+  //
+  // @param handle A pointer to std::unique_ptr<CacheReservationHandle> that
+  //        manages the lifetime of the cache reservation represented by the
+  //        handle.
+  //
+  // @return Status::OK() if all dummy entry insertions succeed.
+  //         Otherwise, the first non-ok status.
+  //
+  // REQUIRES: handle != nullptr
+  Status MakeCacheReservation(
+      std::size_t incremental_memory_used,
+      std::unique_ptr<CacheReservationManager::CacheReservationHandle> *handle)
+      override;
+
+  // Return the size of the cache (which is a multiple of kSizeDummyEntry)
+  // successfully reserved by calling UpdateCacheReservation().
+  //
+  // When UpdateCacheReservation() returns a non-ok status, a subsequent call
+  // to GetTotalReservedCacheSize() might return a slightly smaller number
+  // than the actual reserved cache size, because the returned number is
+  // always a multiple of kSizeDummyEntry and the cache may become full in
+  // the middle of inserting a dummy entry.
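+  //
+  // For example (illustrative): after a successful UpdateCacheReservation
+  // of 300 KiB with 256 KiB dummy entries, GetTotalMemoryUsed() reports
+  // 300 KiB while GetTotalReservedCacheSize() reports 512 KiB.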
+ std::size_t GetTotalReservedCacheSize() override; + + // Return the latest total memory used indicated by the most recent call of + // UpdateCacheReservation(std::size_t new_memory_used); + std::size_t GetTotalMemoryUsed() override; + + static constexpr std::size_t GetDummyEntrySize() { return kSizeDummyEntry; } + + // For testing only - it is to help ensure the NoopDeleterForRole<R> + // accessed from CacheReservationManagerImpl and the one accessed from the + // test are from the same translation units + static Cache::DeleterFn TEST_GetNoopDeleterForRole(); + + private: + static constexpr std::size_t kSizeDummyEntry = 256 * 1024; + + Slice GetNextCacheKey(); + + Status ReleaseCacheReservation(std::size_t incremental_memory_used); + Status IncreaseCacheReservation(std::size_t new_mem_used); + Status DecreaseCacheReservation(std::size_t new_mem_used); + + std::shared_ptr<Cache> cache_; + bool delayed_decrease_; + std::atomic<std::size_t> cache_allocated_size_; + std::size_t memory_used_; + std::vector<Cache::Handle *> dummy_handles_; + CacheKey cache_key_; +}; + +class ConcurrentCacheReservationManager + : public CacheReservationManager, + public std::enable_shared_from_this<ConcurrentCacheReservationManager> { + public: + class CacheReservationHandle + : public CacheReservationManager::CacheReservationHandle { + public: + CacheReservationHandle( + std::shared_ptr<ConcurrentCacheReservationManager> cache_res_mgr, + std::unique_ptr<CacheReservationManager::CacheReservationHandle> + cache_res_handle) { + assert(cache_res_mgr && cache_res_handle); + cache_res_mgr_ = cache_res_mgr; + cache_res_handle_ = std::move(cache_res_handle); + } + + ~CacheReservationHandle() override { + std::lock_guard<std::mutex> lock(cache_res_mgr_->cache_res_mgr_mu_); + cache_res_handle_.reset(); + } + + private: + std::shared_ptr<ConcurrentCacheReservationManager> cache_res_mgr_; + std::unique_ptr<CacheReservationManager::CacheReservationHandle> + cache_res_handle_; + }; + + explicit ConcurrentCacheReservationManager( + std::shared_ptr<CacheReservationManager> cache_res_mgr) { + cache_res_mgr_ = std::move(cache_res_mgr); + } + ConcurrentCacheReservationManager(const ConcurrentCacheReservationManager &) = + delete; + ConcurrentCacheReservationManager &operator=( + const ConcurrentCacheReservationManager &) = delete; + ConcurrentCacheReservationManager(ConcurrentCacheReservationManager &&) = + delete; + ConcurrentCacheReservationManager &operator=( + ConcurrentCacheReservationManager &&) = delete; + + ~ConcurrentCacheReservationManager() override {} + + inline Status UpdateCacheReservation(std::size_t new_memory_used) override { + std::lock_guard<std::mutex> lock(cache_res_mgr_mu_); + return cache_res_mgr_->UpdateCacheReservation(new_memory_used); + } + + inline Status UpdateCacheReservation(std::size_t memory_used_delta, + bool increase) override { + std::lock_guard<std::mutex> lock(cache_res_mgr_mu_); + std::size_t total_mem_used = cache_res_mgr_->GetTotalMemoryUsed(); + Status s; + if (!increase) { + assert(total_mem_used >= memory_used_delta); + s = cache_res_mgr_->UpdateCacheReservation(total_mem_used - + memory_used_delta); + } else { + s = cache_res_mgr_->UpdateCacheReservation(total_mem_used + + memory_used_delta); + } + return s; + } + + inline Status MakeCacheReservation( + std::size_t incremental_memory_used, + std::unique_ptr<CacheReservationManager::CacheReservationHandle> *handle) + override { + std::unique_ptr<CacheReservationManager::CacheReservationHandle> + wrapped_handle; + Status s; + { + 
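+      // Hold the manager-wide mutex only around the underlying reservation;
+      // wrapping the returned handle below does not need the lock.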
std::lock_guard<std::mutex> lock(cache_res_mgr_mu_); + s = cache_res_mgr_->MakeCacheReservation(incremental_memory_used, + &wrapped_handle); + } + (*handle).reset( + new ConcurrentCacheReservationManager::CacheReservationHandle( + std::enable_shared_from_this< + ConcurrentCacheReservationManager>::shared_from_this(), + std::move(wrapped_handle))); + return s; + } + inline std::size_t GetTotalReservedCacheSize() override { + return cache_res_mgr_->GetTotalReservedCacheSize(); + } + inline std::size_t GetTotalMemoryUsed() override { + std::lock_guard<std::mutex> lock(cache_res_mgr_mu_); + return cache_res_mgr_->GetTotalMemoryUsed(); + } + + private: + std::mutex cache_res_mgr_mu_; + std::shared_ptr<CacheReservationManager> cache_res_mgr_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/cache/cache_reservation_manager_test.cc b/src/rocksdb/cache/cache_reservation_manager_test.cc new file mode 100644 index 000000000..2a0c318e0 --- /dev/null +++ b/src/rocksdb/cache/cache_reservation_manager_test.cc @@ -0,0 +1,469 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "cache/cache_reservation_manager.h" + +#include <cstddef> +#include <cstring> +#include <memory> + +#include "cache/cache_entry_roles.h" +#include "rocksdb/cache.h" +#include "rocksdb/slice.h" +#include "test_util/testharness.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { +class CacheReservationManagerTest : public ::testing::Test { + protected: + static constexpr std::size_t kSizeDummyEntry = + CacheReservationManagerImpl<CacheEntryRole::kMisc>::GetDummyEntrySize(); + static constexpr std::size_t kCacheCapacity = 4096 * kSizeDummyEntry; + static constexpr int kNumShardBits = 0; // 2^0 shard + static constexpr std::size_t kMetaDataChargeOverhead = 10000; + + std::shared_ptr<Cache> cache = NewLRUCache(kCacheCapacity, kNumShardBits); + std::shared_ptr<CacheReservationManager> test_cache_rev_mng; + + CacheReservationManagerTest() { + test_cache_rev_mng = + std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>( + cache); + } +}; + +TEST_F(CacheReservationManagerTest, GenerateCacheKey) { + std::size_t new_mem_used = 1 * kSizeDummyEntry; + Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); + ASSERT_EQ(s, Status::OK()); + ASSERT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry); + ASSERT_LT(cache->GetPinnedUsage(), + 1 * kSizeDummyEntry + kMetaDataChargeOverhead); + + // Next unique Cache key + CacheKey ckey = CacheKey::CreateUniqueForCacheLifetime(cache.get()); + // Get to the underlying values + uint64_t* ckey_data = reinterpret_cast<uint64_t*>(&ckey); + // Back it up to the one used by CRM (using CacheKey implementation details) + ckey_data[1]--; + + // Specific key (subject to implementation details) + EXPECT_EQ(ckey_data[0], 0); + EXPECT_EQ(ckey_data[1], 2); + + Cache::Handle* handle = cache->Lookup(ckey.AsSlice()); + EXPECT_NE(handle, nullptr) + << "Failed to generate the cache key for the dummy entry correctly"; + // Clean up the returned handle from Lookup() to prevent memory leak + cache->Release(handle); +} + 
+TEST_F(CacheReservationManagerTest, KeepCacheReservationTheSame) { + std::size_t new_mem_used = 1 * kSizeDummyEntry; + Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); + ASSERT_EQ(s, Status::OK()); + ASSERT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 1 * kSizeDummyEntry); + ASSERT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used); + std::size_t initial_pinned_usage = cache->GetPinnedUsage(); + ASSERT_GE(initial_pinned_usage, 1 * kSizeDummyEntry); + ASSERT_LT(initial_pinned_usage, + 1 * kSizeDummyEntry + kMetaDataChargeOverhead); + + s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); + EXPECT_EQ(s, Status::OK()) + << "Failed to keep cache reservation the same when new_mem_used equals " + "to current cache reservation"; + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 1 * kSizeDummyEntry) + << "Failed to bookkeep correctly when new_mem_used equals to current " + "cache reservation"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly when new_mem_used " + "equals to current cache reservation"; + EXPECT_EQ(cache->GetPinnedUsage(), initial_pinned_usage) + << "Failed to keep underlying dummy entries the same when new_mem_used " + "equals to current cache reservation"; +} + +TEST_F(CacheReservationManagerTest, + IncreaseCacheReservationByMultiplesOfDummyEntrySize) { + std::size_t new_mem_used = 2 * kSizeDummyEntry; + Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); + EXPECT_EQ(s, Status::OK()) + << "Failed to increase cache reservation correctly"; + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 2 * kSizeDummyEntry) + << "Failed to bookkeep cache reservation increase correctly"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_GE(cache->GetPinnedUsage(), 2 * kSizeDummyEntry) + << "Failed to increase underlying dummy entries in cache correctly"; + EXPECT_LT(cache->GetPinnedUsage(), + 2 * kSizeDummyEntry + kMetaDataChargeOverhead) + << "Failed to increase underlying dummy entries in cache correctly"; +} + +TEST_F(CacheReservationManagerTest, + IncreaseCacheReservationNotByMultiplesOfDummyEntrySize) { + std::size_t new_mem_used = 2 * kSizeDummyEntry + kSizeDummyEntry / 2; + Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); + EXPECT_EQ(s, Status::OK()) + << "Failed to increase cache reservation correctly"; + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 3 * kSizeDummyEntry) + << "Failed to bookkeep cache reservation increase correctly"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_GE(cache->GetPinnedUsage(), 3 * kSizeDummyEntry) + << "Failed to increase underlying dummy entries in cache correctly"; + EXPECT_LT(cache->GetPinnedUsage(), + 3 * kSizeDummyEntry + kMetaDataChargeOverhead) + << "Failed to increase underlying dummy entries in cache correctly"; +} + +TEST(CacheReservationManagerIncreaseReservcationOnFullCacheTest, + IncreaseCacheReservationOnFullCache) { + ; + constexpr std::size_t kSizeDummyEntry = + CacheReservationManagerImpl<CacheEntryRole::kMisc>::GetDummyEntrySize(); + constexpr std::size_t kSmallCacheCapacity = 4 * kSizeDummyEntry; + constexpr std::size_t kBigCacheCapacity = 4096 * kSizeDummyEntry; + constexpr std::size_t kMetaDataChargeOverhead = 10000; + + LRUCacheOptions lo; + lo.capacity = kSmallCacheCapacity; + 
lo.num_shard_bits = 0; // 2^0 shard + lo.strict_capacity_limit = true; + std::shared_ptr<Cache> cache = NewLRUCache(lo); + std::shared_ptr<CacheReservationManager> test_cache_rev_mng = + std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>( + cache); + + std::size_t new_mem_used = kSmallCacheCapacity + 1; + Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); + EXPECT_EQ(s, Status::MemoryLimit()) + << "Failed to return status to indicate failure of dummy entry insertion " + "during cache reservation on full cache"; + EXPECT_GE(test_cache_rev_mng->GetTotalReservedCacheSize(), + 1 * kSizeDummyEntry) + << "Failed to bookkeep correctly before cache resevation failure happens " + "due to full cache"; + EXPECT_LE(test_cache_rev_mng->GetTotalReservedCacheSize(), + kSmallCacheCapacity) + << "Failed to bookkeep correctly (i.e, bookkeep only successful dummy " + "entry insertions) when encountering cache resevation failure due to " + "full cache"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry) + << "Failed to insert underlying dummy entries correctly when " + "encountering cache resevation failure due to full cache"; + EXPECT_LE(cache->GetPinnedUsage(), kSmallCacheCapacity) + << "Failed to insert underlying dummy entries correctly when " + "encountering cache resevation failure due to full cache"; + + new_mem_used = kSmallCacheCapacity / 2; // 2 dummy entries + s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); + EXPECT_EQ(s, Status::OK()) + << "Failed to decrease cache reservation after encountering cache " + "reservation failure due to full cache"; + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 2 * kSizeDummyEntry) + << "Failed to bookkeep cache reservation decrease correctly after " + "encountering cache reservation due to full cache"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_GE(cache->GetPinnedUsage(), 2 * kSizeDummyEntry) + << "Failed to release underlying dummy entries correctly on cache " + "reservation decrease after encountering cache resevation failure due " + "to full cache"; + EXPECT_LT(cache->GetPinnedUsage(), + 2 * kSizeDummyEntry + kMetaDataChargeOverhead) + << "Failed to release underlying dummy entries correctly on cache " + "reservation decrease after encountering cache resevation failure due " + "to full cache"; + + // Create cache full again for subsequent tests + new_mem_used = kSmallCacheCapacity + 1; + s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); + EXPECT_EQ(s, Status::MemoryLimit()) + << "Failed to return status to indicate failure of dummy entry insertion " + "during cache reservation on full cache"; + EXPECT_GE(test_cache_rev_mng->GetTotalReservedCacheSize(), + 1 * kSizeDummyEntry) + << "Failed to bookkeep correctly before cache resevation failure happens " + "due to full cache"; + EXPECT_LE(test_cache_rev_mng->GetTotalReservedCacheSize(), + kSmallCacheCapacity) + << "Failed to bookkeep correctly (i.e, bookkeep only successful dummy " + "entry insertions) when encountering cache resevation failure due to " + "full cache"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry) + << "Failed to insert underlying dummy entries correctly when " + "encountering cache 
resevation failure due to full cache"; + EXPECT_LE(cache->GetPinnedUsage(), kSmallCacheCapacity) + << "Failed to insert underlying dummy entries correctly when " + "encountering cache resevation failure due to full cache"; + + // Increase cache capacity so the previously failed insertion can fully + // succeed + cache->SetCapacity(kBigCacheCapacity); + new_mem_used = kSmallCacheCapacity + 1; + s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); + EXPECT_EQ(s, Status::OK()) + << "Failed to increase cache reservation after increasing cache capacity " + "and mitigating cache full error"; + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 5 * kSizeDummyEntry) + << "Failed to bookkeep cache reservation increase correctly after " + "increasing cache capacity and mitigating cache full error"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_GE(cache->GetPinnedUsage(), 5 * kSizeDummyEntry) + << "Failed to insert underlying dummy entries correctly after increasing " + "cache capacity and mitigating cache full error"; + EXPECT_LT(cache->GetPinnedUsage(), + 5 * kSizeDummyEntry + kMetaDataChargeOverhead) + << "Failed to insert underlying dummy entries correctly after increasing " + "cache capacity and mitigating cache full error"; +} + +TEST_F(CacheReservationManagerTest, + DecreaseCacheReservationByMultiplesOfDummyEntrySize) { + std::size_t new_mem_used = 2 * kSizeDummyEntry; + Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); + ASSERT_EQ(s, Status::OK()); + ASSERT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 2 * kSizeDummyEntry); + ASSERT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used); + ASSERT_GE(cache->GetPinnedUsage(), 2 * kSizeDummyEntry); + ASSERT_LT(cache->GetPinnedUsage(), + 2 * kSizeDummyEntry + kMetaDataChargeOverhead); + + new_mem_used = 1 * kSizeDummyEntry; + s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); + EXPECT_EQ(s, Status::OK()) + << "Failed to decrease cache reservation correctly"; + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 1 * kSizeDummyEntry) + << "Failed to bookkeep cache reservation decrease correctly"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry) + << "Failed to decrease underlying dummy entries in cache correctly"; + EXPECT_LT(cache->GetPinnedUsage(), + 1 * kSizeDummyEntry + kMetaDataChargeOverhead) + << "Failed to decrease underlying dummy entries in cache correctly"; +} + +TEST_F(CacheReservationManagerTest, + DecreaseCacheReservationNotByMultiplesOfDummyEntrySize) { + std::size_t new_mem_used = 2 * kSizeDummyEntry; + Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); + ASSERT_EQ(s, Status::OK()); + ASSERT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 2 * kSizeDummyEntry); + ASSERT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used); + ASSERT_GE(cache->GetPinnedUsage(), 2 * kSizeDummyEntry); + ASSERT_LT(cache->GetPinnedUsage(), + 2 * kSizeDummyEntry + kMetaDataChargeOverhead); + + new_mem_used = kSizeDummyEntry / 2; + s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); + EXPECT_EQ(s, Status::OK()) + << "Failed to decrease cache reservation correctly"; + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 1 * kSizeDummyEntry) + << "Failed to bookkeep cache reservation decrease correctly"; + 
EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry) + << "Failed to decrease underlying dummy entries in cache correctly"; + EXPECT_LT(cache->GetPinnedUsage(), + 1 * kSizeDummyEntry + kMetaDataChargeOverhead) + << "Failed to decrease underlying dummy entries in cache correctly"; +} + +TEST(CacheReservationManagerWithDelayedDecreaseTest, + DecreaseCacheReservationWithDelayedDecrease) { + constexpr std::size_t kSizeDummyEntry = + CacheReservationManagerImpl<CacheEntryRole::kMisc>::GetDummyEntrySize(); + constexpr std::size_t kCacheCapacity = 4096 * kSizeDummyEntry; + constexpr std::size_t kMetaDataChargeOverhead = 10000; + + LRUCacheOptions lo; + lo.capacity = kCacheCapacity; + lo.num_shard_bits = 0; + std::shared_ptr<Cache> cache = NewLRUCache(lo); + std::shared_ptr<CacheReservationManager> test_cache_rev_mng = + std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>( + cache, true /* delayed_decrease */); + + std::size_t new_mem_used = 8 * kSizeDummyEntry; + Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); + ASSERT_EQ(s, Status::OK()); + ASSERT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 8 * kSizeDummyEntry); + ASSERT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used); + std::size_t initial_pinned_usage = cache->GetPinnedUsage(); + ASSERT_GE(initial_pinned_usage, 8 * kSizeDummyEntry); + ASSERT_LT(initial_pinned_usage, + 8 * kSizeDummyEntry + kMetaDataChargeOverhead); + + new_mem_used = 6 * kSizeDummyEntry; + s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); + EXPECT_EQ(s, Status::OK()) << "Failed to delay decreasing cache reservation"; + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 8 * kSizeDummyEntry) + << "Failed to bookkeep correctly when delaying cache reservation " + "decrease"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_EQ(cache->GetPinnedUsage(), initial_pinned_usage) + << "Failed to delay decreasing underlying dummy entries in cache"; + + new_mem_used = 7 * kSizeDummyEntry; + s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); + EXPECT_EQ(s, Status::OK()) << "Failed to delay decreasing cache reservation"; + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 8 * kSizeDummyEntry) + << "Failed to bookkeep correctly when delaying cache reservation " + "decrease"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_EQ(cache->GetPinnedUsage(), initial_pinned_usage) + << "Failed to delay decreasing underlying dummy entries in cache"; + + new_mem_used = 6 * kSizeDummyEntry - 1; + s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); + EXPECT_EQ(s, Status::OK()) + << "Failed to decrease cache reservation correctly when new_mem_used < " + "GetTotalReservedCacheSize() * 3 / 4 on delayed decrease mode"; + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 6 * kSizeDummyEntry) + << "Failed to bookkeep correctly when new_mem_used < " + "GetTotalReservedCacheSize() * 3 / 4 on delayed decrease mode"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_GE(cache->GetPinnedUsage(), 6 * kSizeDummyEntry) + << "Failed to decrease underlying dummy entries in cache when " + "new_mem_used < GetTotalReservedCacheSize() * 3 / 4 on 
delayed " + "decrease mode"; + EXPECT_LT(cache->GetPinnedUsage(), + 6 * kSizeDummyEntry + kMetaDataChargeOverhead) + << "Failed to decrease underlying dummy entries in cache when " + "new_mem_used < GetTotalReservedCacheSize() * 3 / 4 on delayed " + "decrease mode"; +} + +TEST(CacheReservationManagerDestructorTest, + ReleaseRemainingDummyEntriesOnDestruction) { + constexpr std::size_t kSizeDummyEntry = + CacheReservationManagerImpl<CacheEntryRole::kMisc>::GetDummyEntrySize(); + constexpr std::size_t kCacheCapacity = 4096 * kSizeDummyEntry; + constexpr std::size_t kMetaDataChargeOverhead = 10000; + + LRUCacheOptions lo; + lo.capacity = kCacheCapacity; + lo.num_shard_bits = 0; + std::shared_ptr<Cache> cache = NewLRUCache(lo); + { + std::shared_ptr<CacheReservationManager> test_cache_rev_mng = + std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>( + cache); + std::size_t new_mem_used = 1 * kSizeDummyEntry; + Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); + ASSERT_EQ(s, Status::OK()); + ASSERT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry); + ASSERT_LT(cache->GetPinnedUsage(), + 1 * kSizeDummyEntry + kMetaDataChargeOverhead); + } + EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry) + << "Failed to release remaining underlying dummy entries in cache in " + "CacheReservationManager's destructor"; +} + +TEST(CacheReservationHandleTest, HandleTest) { + constexpr std::size_t kOneGigabyte = 1024 * 1024 * 1024; + constexpr std::size_t kSizeDummyEntry = 256 * 1024; + constexpr std::size_t kMetaDataChargeOverhead = 10000; + + LRUCacheOptions lo; + lo.capacity = kOneGigabyte; + lo.num_shard_bits = 0; + std::shared_ptr<Cache> cache = NewLRUCache(lo); + + std::shared_ptr<CacheReservationManager> test_cache_rev_mng( + std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>( + cache)); + + std::size_t mem_used = 0; + const std::size_t incremental_mem_used_handle_1 = 1 * kSizeDummyEntry; + const std::size_t incremental_mem_used_handle_2 = 2 * kSizeDummyEntry; + std::unique_ptr<CacheReservationManager::CacheReservationHandle> handle_1, + handle_2; + + // To test consecutive CacheReservationManager::MakeCacheReservation works + // correctly in terms of returning the handle as well as updating cache + // reservation and the latest total memory used + Status s = test_cache_rev_mng->MakeCacheReservation( + incremental_mem_used_handle_1, &handle_1); + mem_used = mem_used + incremental_mem_used_handle_1; + ASSERT_EQ(s, Status::OK()); + EXPECT_TRUE(handle_1 != nullptr); + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), mem_used); + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), mem_used); + EXPECT_GE(cache->GetPinnedUsage(), mem_used); + EXPECT_LT(cache->GetPinnedUsage(), mem_used + kMetaDataChargeOverhead); + + s = test_cache_rev_mng->MakeCacheReservation(incremental_mem_used_handle_2, + &handle_2); + mem_used = mem_used + incremental_mem_used_handle_2; + ASSERT_EQ(s, Status::OK()); + EXPECT_TRUE(handle_2 != nullptr); + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), mem_used); + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), mem_used); + EXPECT_GE(cache->GetPinnedUsage(), mem_used); + EXPECT_LT(cache->GetPinnedUsage(), mem_used + kMetaDataChargeOverhead); + + // To test + // CacheReservationManager::CacheReservationHandle::~CacheReservationHandle() + // works correctly in releasing the cache reserved for the handle + handle_1.reset(); + EXPECT_TRUE(handle_1 == nullptr); + mem_used = mem_used - incremental_mem_used_handle_1; 
+ EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), mem_used); + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), mem_used); + EXPECT_GE(cache->GetPinnedUsage(), mem_used); + EXPECT_LT(cache->GetPinnedUsage(), mem_used + kMetaDataChargeOverhead); + + // To test the actual CacheReservationManager object won't be deallocated + // as long as there remain handles pointing to it. + // We strongly recommend deallocating CacheReservationManager object only + // after all its handles are deallocated to keep things easy to reasonate + test_cache_rev_mng.reset(); + EXPECT_GE(cache->GetPinnedUsage(), mem_used); + EXPECT_LT(cache->GetPinnedUsage(), mem_used + kMetaDataChargeOverhead); + + handle_2.reset(); + // The CacheReservationManager object is now deallocated since all the handles + // and its original pointer is gone + mem_used = mem_used - incremental_mem_used_handle_2; + EXPECT_EQ(mem_used, 0); + EXPECT_EQ(cache->GetPinnedUsage(), mem_used); +} +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/cache/cache_test.cc b/src/rocksdb/cache/cache_test.cc new file mode 100644 index 000000000..212d65d96 --- /dev/null +++ b/src/rocksdb/cache/cache_test.cc @@ -0,0 +1,1037 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/cache.h" + +#include <forward_list> +#include <functional> +#include <iostream> +#include <string> +#include <vector> + +#include "cache/lru_cache.h" +#include "port/stack_trace.h" +#include "test_util/testharness.h" +#include "util/coding.h" +#include "util/string_util.h" + +// HyperClockCache only supports 16-byte keys, so some of the tests +// originally written for LRUCache do not work on the other caches. +// Those tests were adapted to use 16-byte keys. We kept the original ones. +// TODO: Remove the original tests if they ever become unused. + +namespace ROCKSDB_NAMESPACE { + +namespace { + +// Conversions between numeric keys/values and the types expected by Cache. +std::string EncodeKey16Bytes(int k) { + std::string result; + PutFixed32(&result, k); + result.append(std::string(12, 'a')); // Because we need a 16B output, we + // add a 12-byte padding. + return result; +} + +int DecodeKey16Bytes(const Slice& k) { + assert(k.size() == 16); + return DecodeFixed32(k.data()); // Decodes only the first 4 bytes of k. 
+} + +std::string EncodeKey32Bits(int k) { + std::string result; + PutFixed32(&result, k); + return result; +} + +int DecodeKey32Bits(const Slice& k) { + assert(k.size() == 4); + return DecodeFixed32(k.data()); +} + +void* EncodeValue(uintptr_t v) { return reinterpret_cast<void*>(v); } + +int DecodeValue(void* v) { + return static_cast<int>(reinterpret_cast<uintptr_t>(v)); +} + +void DumbDeleter(const Slice& /*key*/, void* /*value*/) {} + +void EraseDeleter1(const Slice& /*key*/, void* value) { + Cache* cache = reinterpret_cast<Cache*>(value); + cache->Erase("foo"); +} + +void EraseDeleter2(const Slice& /*key*/, void* value) { + Cache* cache = reinterpret_cast<Cache*>(value); + cache->Erase(EncodeKey16Bytes(1234)); +} + +const std::string kLRU = "lru"; +const std::string kHyperClock = "hyper_clock"; + +} // anonymous namespace + +class CacheTest : public testing::TestWithParam<std::string> { + public: + static CacheTest* current_; + static std::string type_; + + static void Deleter(const Slice& key, void* v) { + if (type_ == kHyperClock) { + current_->deleted_keys_.push_back(DecodeKey16Bytes(key)); + } else { + current_->deleted_keys_.push_back(DecodeKey32Bits(key)); + } + current_->deleted_values_.push_back(DecodeValue(v)); + } + + static const int kCacheSize = 1000; + static const int kNumShardBits = 4; + + static const int kCacheSize2 = 100; + static const int kNumShardBits2 = 2; + + std::vector<int> deleted_keys_; + std::vector<int> deleted_values_; + std::shared_ptr<Cache> cache_; + std::shared_ptr<Cache> cache2_; + + size_t estimated_value_size_ = 1; + + CacheTest() + : cache_(NewCache(kCacheSize, kNumShardBits, false)), + cache2_(NewCache(kCacheSize2, kNumShardBits2, false)) { + current_ = this; + type_ = GetParam(); + } + + ~CacheTest() override {} + + std::shared_ptr<Cache> NewCache(size_t capacity) { + auto type = GetParam(); + if (type == kLRU) { + return NewLRUCache(capacity); + } + if (type == kHyperClock) { + return HyperClockCacheOptions( + capacity, estimated_value_size_ /*estimated_value_size*/) + .MakeSharedCache(); + } + return nullptr; + } + + std::shared_ptr<Cache> NewCache( + size_t capacity, int num_shard_bits, bool strict_capacity_limit, + CacheMetadataChargePolicy charge_policy = kDontChargeCacheMetadata) { + auto type = GetParam(); + if (type == kLRU) { + LRUCacheOptions co; + co.capacity = capacity; + co.num_shard_bits = num_shard_bits; + co.strict_capacity_limit = strict_capacity_limit; + co.high_pri_pool_ratio = 0; + co.metadata_charge_policy = charge_policy; + return NewLRUCache(co); + } + if (type == kHyperClock) { + return HyperClockCacheOptions(capacity, 1 /*estimated_value_size*/, + num_shard_bits, strict_capacity_limit, + nullptr /*allocator*/, charge_policy) + .MakeSharedCache(); + } + return nullptr; + } + + // These functions encode/decode keys in tests cases that use + // int keys. + // Currently, HyperClockCache requires keys to be 16B long, whereas + // LRUCache doesn't, so the encoding depends on the cache type. + std::string EncodeKey(int k) { + auto type = GetParam(); + if (type == kHyperClock) { + return EncodeKey16Bytes(k); + } else { + return EncodeKey32Bits(k); + } + } + + int DecodeKey(const Slice& k) { + auto type = GetParam(); + if (type == kHyperClock) { + return DecodeKey16Bytes(k); + } else { + return DecodeKey32Bits(k); + } + } + + int Lookup(std::shared_ptr<Cache> cache, int key) { + Cache::Handle* handle = cache->Lookup(EncodeKey(key)); + const int r = (handle == nullptr) ? 
-1 : DecodeValue(cache->Value(handle)); + if (handle != nullptr) { + cache->Release(handle); + } + return r; + } + + void Insert(std::shared_ptr<Cache> cache, int key, int value, + int charge = 1) { + EXPECT_OK(cache->Insert(EncodeKey(key), EncodeValue(value), charge, + &CacheTest::Deleter)); + } + + void Erase(std::shared_ptr<Cache> cache, int key) { + cache->Erase(EncodeKey(key)); + } + + int Lookup(int key) { return Lookup(cache_, key); } + + void Insert(int key, int value, int charge = 1) { + Insert(cache_, key, value, charge); + } + + void Erase(int key) { Erase(cache_, key); } + + int Lookup2(int key) { return Lookup(cache2_, key); } + + void Insert2(int key, int value, int charge = 1) { + Insert(cache2_, key, value, charge); + } + + void Erase2(int key) { Erase(cache2_, key); } +}; + +CacheTest* CacheTest::current_; +std::string CacheTest::type_; + +class LRUCacheTest : public CacheTest {}; + +TEST_P(CacheTest, UsageTest) { + auto type = GetParam(); + + // cache is std::shared_ptr and will be automatically cleaned up. + const size_t kCapacity = 100000; + auto cache = NewCache(kCapacity, 8, false, kDontChargeCacheMetadata); + auto precise_cache = NewCache(kCapacity, 0, false, kFullChargeCacheMetadata); + ASSERT_EQ(0, cache->GetUsage()); + size_t baseline_meta_usage = precise_cache->GetUsage(); + if (type != kHyperClock) { + ASSERT_EQ(0, baseline_meta_usage); + } + + size_t usage = 0; + char value[10] = "abcdef"; + // make sure everything will be cached + for (int i = 1; i < 100; ++i) { + std::string key; + if (type == kLRU) { + key = std::string(i, 'a'); + } else { + key = EncodeKey(i); + } + auto kv_size = key.size() + 5; + ASSERT_OK(cache->Insert(key, reinterpret_cast<void*>(value), kv_size, + DumbDeleter)); + ASSERT_OK(precise_cache->Insert(key, reinterpret_cast<void*>(value), + kv_size, DumbDeleter)); + usage += kv_size; + ASSERT_EQ(usage, cache->GetUsage()); + if (type == kHyperClock) { + ASSERT_EQ(baseline_meta_usage + usage, precise_cache->GetUsage()); + } else { + ASSERT_LT(usage, precise_cache->GetUsage()); + } + } + + cache->EraseUnRefEntries(); + precise_cache->EraseUnRefEntries(); + ASSERT_EQ(0, cache->GetUsage()); + ASSERT_EQ(baseline_meta_usage, precise_cache->GetUsage()); + + // make sure the cache will be overloaded + for (size_t i = 1; i < kCapacity; ++i) { + std::string key; + if (type == kLRU) { + key = std::to_string(i); + } else { + key = EncodeKey(static_cast<int>(1000 + i)); + } + ASSERT_OK(cache->Insert(key, reinterpret_cast<void*>(value), key.size() + 5, + DumbDeleter)); + ASSERT_OK(precise_cache->Insert(key, reinterpret_cast<void*>(value), + key.size() + 5, DumbDeleter)); + } + + // the usage should be close to the capacity + ASSERT_GT(kCapacity, cache->GetUsage()); + ASSERT_GT(kCapacity, precise_cache->GetUsage()); + ASSERT_LT(kCapacity * 0.95, cache->GetUsage()); + if (type != kHyperClock) { + ASSERT_LT(kCapacity * 0.95, precise_cache->GetUsage()); + } else { + // estimated value size of 1 is weird for clock cache, because + // almost all of the capacity will be used for metadata, and due to only + // using power of 2 table sizes, we might hit strict occupancy limit + // before hitting capacity limit. + ASSERT_LT(kCapacity * 0.80, precise_cache->GetUsage()); + } +} + +// TODO: This test takes longer than expected on ClockCache. This is +// because the values size estimate at construction is too sloppy. +// Fix this. +// Why is it so slow? The cache is constructed with an estimate of 1, but +// then the charge is claimed to be 21. 
This will cause the hash table +// to be extremely sparse, which in turn means clock needs to scan too +// many slots to find victims. +TEST_P(CacheTest, PinnedUsageTest) { + auto type = GetParam(); + + // cache is std::shared_ptr and will be automatically cleaned up. + const size_t kCapacity = 200000; + auto cache = NewCache(kCapacity, 8, false, kDontChargeCacheMetadata); + auto precise_cache = NewCache(kCapacity, 8, false, kFullChargeCacheMetadata); + size_t baseline_meta_usage = precise_cache->GetUsage(); + if (type != kHyperClock) { + ASSERT_EQ(0, baseline_meta_usage); + } + + size_t pinned_usage = 0; + char value[10] = "abcdef"; + + std::forward_list<Cache::Handle*> unreleased_handles; + std::forward_list<Cache::Handle*> unreleased_handles_in_precise_cache; + + // Add entries. Unpin some of them after insertion. Then, pin some of them + // again. Check GetPinnedUsage(). + for (int i = 1; i < 100; ++i) { + std::string key; + if (type == kLRU) { + key = std::string(i, 'a'); + } else { + key = EncodeKey(i); + } + auto kv_size = key.size() + 5; + Cache::Handle* handle; + Cache::Handle* handle_in_precise_cache; + ASSERT_OK(cache->Insert(key, reinterpret_cast<void*>(value), kv_size, + DumbDeleter, &handle)); + assert(handle); + ASSERT_OK(precise_cache->Insert(key, reinterpret_cast<void*>(value), + kv_size, DumbDeleter, + &handle_in_precise_cache)); + assert(handle_in_precise_cache); + pinned_usage += kv_size; + ASSERT_EQ(pinned_usage, cache->GetPinnedUsage()); + ASSERT_LT(pinned_usage, precise_cache->GetPinnedUsage()); + if (i % 2 == 0) { + cache->Release(handle); + precise_cache->Release(handle_in_precise_cache); + pinned_usage -= kv_size; + ASSERT_EQ(pinned_usage, cache->GetPinnedUsage()); + ASSERT_LT(pinned_usage, precise_cache->GetPinnedUsage()); + } else { + unreleased_handles.push_front(handle); + unreleased_handles_in_precise_cache.push_front(handle_in_precise_cache); + } + if (i % 3 == 0) { + unreleased_handles.push_front(cache->Lookup(key)); + auto x = precise_cache->Lookup(key); + assert(x); + unreleased_handles_in_precise_cache.push_front(x); + // If i % 2 == 0, then the entry was unpinned before Lookup, so pinned + // usage increased + if (i % 2 == 0) { + pinned_usage += kv_size; + } + ASSERT_EQ(pinned_usage, cache->GetPinnedUsage()); + ASSERT_LT(pinned_usage, precise_cache->GetPinnedUsage()); + } + } + auto precise_cache_pinned_usage = precise_cache->GetPinnedUsage(); + ASSERT_LT(pinned_usage, precise_cache_pinned_usage); + + // check that overloading the cache does not change the pinned usage + for (size_t i = 1; i < 2 * kCapacity; ++i) { + std::string key; + if (type == kLRU) { + key = std::to_string(i); + } else { + key = EncodeKey(static_cast<int>(1000 + i)); + } + ASSERT_OK(cache->Insert(key, reinterpret_cast<void*>(value), key.size() + 5, + DumbDeleter)); + ASSERT_OK(precise_cache->Insert(key, reinterpret_cast<void*>(value), + key.size() + 5, DumbDeleter)); + } + ASSERT_EQ(pinned_usage, cache->GetPinnedUsage()); + ASSERT_EQ(precise_cache_pinned_usage, precise_cache->GetPinnedUsage()); + + cache->EraseUnRefEntries(); + precise_cache->EraseUnRefEntries(); + ASSERT_EQ(pinned_usage, cache->GetPinnedUsage()); + ASSERT_EQ(precise_cache_pinned_usage, precise_cache->GetPinnedUsage()); + + // release handles for pinned entries to prevent memory leaks + for (auto handle : unreleased_handles) { + cache->Release(handle); + } + for (auto handle : unreleased_handles_in_precise_cache) { + precise_cache->Release(handle); + } + ASSERT_EQ(0, cache->GetPinnedUsage()); + ASSERT_EQ(0, 
precise_cache->GetPinnedUsage()); + cache->EraseUnRefEntries(); + precise_cache->EraseUnRefEntries(); + ASSERT_EQ(0, cache->GetUsage()); + ASSERT_EQ(baseline_meta_usage, precise_cache->GetUsage()); +} + +TEST_P(CacheTest, HitAndMiss) { + ASSERT_EQ(-1, Lookup(100)); + + Insert(100, 101); + ASSERT_EQ(101, Lookup(100)); + ASSERT_EQ(-1, Lookup(200)); + ASSERT_EQ(-1, Lookup(300)); + + Insert(200, 201); + ASSERT_EQ(101, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(-1, Lookup(300)); + + Insert(100, 102); + if (GetParam() == kHyperClock) { + // ClockCache usually doesn't overwrite on Insert + ASSERT_EQ(101, Lookup(100)); + } else { + ASSERT_EQ(102, Lookup(100)); + } + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(-1, Lookup(300)); + + ASSERT_EQ(1U, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[0]); + if (GetParam() == kHyperClock) { + ASSERT_EQ(102, deleted_values_[0]); + } else { + ASSERT_EQ(101, deleted_values_[0]); + } +} + +TEST_P(CacheTest, InsertSameKey) { + if (GetParam() == kHyperClock) { + ROCKSDB_GTEST_BYPASS( + "ClockCache doesn't guarantee Insert overwrite same key."); + return; + } + Insert(1, 1); + Insert(1, 2); + ASSERT_EQ(2, Lookup(1)); +} + +TEST_P(CacheTest, Erase) { + Erase(200); + ASSERT_EQ(0U, deleted_keys_.size()); + + Insert(100, 101); + Insert(200, 201); + Erase(100); + ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(1U, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(101, deleted_values_[0]); + + Erase(100); + ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(1U, deleted_keys_.size()); +} + +TEST_P(CacheTest, EntriesArePinned) { + if (GetParam() == kHyperClock) { + ROCKSDB_GTEST_BYPASS( + "ClockCache doesn't guarantee Insert overwrite same key."); + return; + } + Insert(100, 101); + Cache::Handle* h1 = cache_->Lookup(EncodeKey(100)); + ASSERT_EQ(101, DecodeValue(cache_->Value(h1))); + ASSERT_EQ(1U, cache_->GetUsage()); + + Insert(100, 102); + Cache::Handle* h2 = cache_->Lookup(EncodeKey(100)); + ASSERT_EQ(102, DecodeValue(cache_->Value(h2))); + ASSERT_EQ(0U, deleted_keys_.size()); + ASSERT_EQ(2U, cache_->GetUsage()); + + cache_->Release(h1); + ASSERT_EQ(1U, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(101, deleted_values_[0]); + ASSERT_EQ(1U, cache_->GetUsage()); + + Erase(100); + ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(1U, deleted_keys_.size()); + ASSERT_EQ(1U, cache_->GetUsage()); + + cache_->Release(h2); + ASSERT_EQ(2U, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[1]); + ASSERT_EQ(102, deleted_values_[1]); + ASSERT_EQ(0U, cache_->GetUsage()); +} + +TEST_P(CacheTest, EvictionPolicy) { + Insert(100, 101); + Insert(200, 201); + // Frequently used entry must be kept around + for (int i = 0; i < 2 * kCacheSize; i++) { + Insert(1000 + i, 2000 + i); + ASSERT_EQ(101, Lookup(100)); + } + ASSERT_EQ(101, Lookup(100)); + ASSERT_EQ(-1, Lookup(200)); +} + +TEST_P(CacheTest, ExternalRefPinsEntries) { + Insert(100, 101); + Cache::Handle* h = cache_->Lookup(EncodeKey(100)); + ASSERT_TRUE(cache_->Ref(h)); + ASSERT_EQ(101, DecodeValue(cache_->Value(h))); + ASSERT_EQ(1U, cache_->GetUsage()); + + for (int i = 0; i < 3; ++i) { + if (i > 0) { + // First release (i == 1) corresponds to Ref(), second release (i == 2) + // corresponds to Lookup(). Then, since all external refs are released, + // the below insertions should push out the cache entry. 
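      // Reference bookkeeping for key 100 at this point (an inferred summary,
      // consistent with the comment above): the Insert() helper keeps no
      // handle, so the only external refs are the Lookup() and the Ref() on
      // `h`. Iteration i == 1 drops one of them, i == 2 drops the last, after
      // which the entry becomes evictable and the final Lookup(100) below
      // returns -1.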
+ cache_->Release(h); + } + // double cache size because the usage bit in block cache prevents 100 from + // being evicted in the first kCacheSize iterations + for (int j = 0; j < 2 * kCacheSize + 100; j++) { + Insert(1000 + j, 2000 + j); + } + // Clock cache is even more stateful and needs more churn to evict + if (GetParam() == kHyperClock) { + for (int j = 0; j < kCacheSize; j++) { + Insert(11000 + j, 11000 + j); + } + } + if (i < 2) { + ASSERT_EQ(101, Lookup(100)); + } + } + ASSERT_EQ(-1, Lookup(100)); +} + +TEST_P(CacheTest, EvictionPolicyRef) { + Insert(100, 101); + Insert(101, 102); + Insert(102, 103); + Insert(103, 104); + Insert(200, 101); + Insert(201, 102); + Insert(202, 103); + Insert(203, 104); + Cache::Handle* h201 = cache_->Lookup(EncodeKey(200)); + Cache::Handle* h202 = cache_->Lookup(EncodeKey(201)); + Cache::Handle* h203 = cache_->Lookup(EncodeKey(202)); + Cache::Handle* h204 = cache_->Lookup(EncodeKey(203)); + Insert(300, 101); + Insert(301, 102); + Insert(302, 103); + Insert(303, 104); + + // Insert entries much more than cache capacity. + for (int i = 0; i < 100 * kCacheSize; i++) { + Insert(1000 + i, 2000 + i); + } + + // Check whether the entries inserted in the beginning + // are evicted. Ones without extra ref are evicted and + // those with are not. + ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(-1, Lookup(101)); + ASSERT_EQ(-1, Lookup(102)); + ASSERT_EQ(-1, Lookup(103)); + + ASSERT_EQ(-1, Lookup(300)); + ASSERT_EQ(-1, Lookup(301)); + ASSERT_EQ(-1, Lookup(302)); + ASSERT_EQ(-1, Lookup(303)); + + ASSERT_EQ(101, Lookup(200)); + ASSERT_EQ(102, Lookup(201)); + ASSERT_EQ(103, Lookup(202)); + ASSERT_EQ(104, Lookup(203)); + + // Cleaning up all the handles + cache_->Release(h201); + cache_->Release(h202); + cache_->Release(h203); + cache_->Release(h204); +} + +TEST_P(CacheTest, EvictEmptyCache) { + auto type = GetParam(); + + // Insert item large than capacity to trigger eviction on empty cache. + auto cache = NewCache(1, 0, false); + if (type == kLRU) { + ASSERT_OK(cache->Insert("foo", nullptr, 10, DumbDeleter)); + } else { + ASSERT_OK(cache->Insert(EncodeKey(1000), nullptr, 10, DumbDeleter)); + } +} + +TEST_P(CacheTest, EraseFromDeleter) { + auto type = GetParam(); + + // Have deleter which will erase item from cache, which will re-enter + // the cache at that point. 
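  // The re-entrancy under test: Erase(bar) runs bar's deleter, and that
  // deleter calls back into the same cache to Erase(foo). The deleter shape
  // used for this (mirroring EraseDeleter1/EraseDeleter2 defined near the top
  // of this file) is, in outline:
  //   void EraseDeleter(const Slice& /*key*/, void* value) {
  //     reinterpret_cast<Cache*>(value)->Erase(/* key of "foo" */);
  //   }
  // i.e. the cache pointer is smuggled in as the entry's value.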
+ std::shared_ptr<Cache> cache = NewCache(10, 0, false); + std::string foo, bar; + Cache::DeleterFn erase_deleter; + if (type == kLRU) { + foo = "foo"; + bar = "bar"; + erase_deleter = EraseDeleter1; + } else { + foo = EncodeKey(1234); + bar = EncodeKey(5678); + erase_deleter = EraseDeleter2; + } + + ASSERT_OK(cache->Insert(foo, nullptr, 1, DumbDeleter)); + ASSERT_OK(cache->Insert(bar, cache.get(), 1, erase_deleter)); + + cache->Erase(bar); + ASSERT_EQ(nullptr, cache->Lookup(foo)); + ASSERT_EQ(nullptr, cache->Lookup(bar)); +} + +TEST_P(CacheTest, ErasedHandleState) { + // insert a key and get two handles + Insert(100, 1000); + Cache::Handle* h1 = cache_->Lookup(EncodeKey(100)); + Cache::Handle* h2 = cache_->Lookup(EncodeKey(100)); + ASSERT_EQ(h1, h2); + ASSERT_EQ(DecodeValue(cache_->Value(h1)), 1000); + ASSERT_EQ(DecodeValue(cache_->Value(h2)), 1000); + + // delete the key from the cache + Erase(100); + // can no longer find in the cache + ASSERT_EQ(-1, Lookup(100)); + + // release one handle + cache_->Release(h1); + // still can't find in cache + ASSERT_EQ(-1, Lookup(100)); + + cache_->Release(h2); +} + +TEST_P(CacheTest, HeavyEntries) { + // Add a bunch of light and heavy entries and then count the combined + // size of items still in the cache, which must be approximately the + // same as the total capacity. + const int kLight = 1; + const int kHeavy = 10; + int added = 0; + int index = 0; + while (added < 2 * kCacheSize) { + const int weight = (index & 1) ? kLight : kHeavy; + Insert(index, 1000 + index, weight); + added += weight; + index++; + } + + int cached_weight = 0; + for (int i = 0; i < index; i++) { + const int weight = (i & 1 ? kLight : kHeavy); + int r = Lookup(i); + if (r >= 0) { + cached_weight += weight; + ASSERT_EQ(1000 + i, r); + } + } + ASSERT_LE(cached_weight, kCacheSize + kCacheSize / 10); +} + +TEST_P(CacheTest, NewId) { + uint64_t a = cache_->NewId(); + uint64_t b = cache_->NewId(); + ASSERT_NE(a, b); +} + +class Value { + public: + explicit Value(int v) : v_(v) {} + + int v_; +}; + +namespace { +void deleter(const Slice& /*key*/, void* value) { + delete static_cast<Value*>(value); +} +} // namespace + +TEST_P(CacheTest, ReleaseAndErase) { + std::shared_ptr<Cache> cache = NewCache(5, 0, false); + Cache::Handle* handle; + Status s = cache->Insert(EncodeKey(100), EncodeValue(100), 1, + &CacheTest::Deleter, &handle); + ASSERT_TRUE(s.ok()); + ASSERT_EQ(5U, cache->GetCapacity()); + ASSERT_EQ(1U, cache->GetUsage()); + ASSERT_EQ(0U, deleted_keys_.size()); + auto erased = cache->Release(handle, true); + ASSERT_TRUE(erased); + // This tests that deleter has been called + ASSERT_EQ(1U, deleted_keys_.size()); +} + +TEST_P(CacheTest, ReleaseWithoutErase) { + std::shared_ptr<Cache> cache = NewCache(5, 0, false); + Cache::Handle* handle; + Status s = cache->Insert(EncodeKey(100), EncodeValue(100), 1, + &CacheTest::Deleter, &handle); + ASSERT_TRUE(s.ok()); + ASSERT_EQ(5U, cache->GetCapacity()); + ASSERT_EQ(1U, cache->GetUsage()); + ASSERT_EQ(0U, deleted_keys_.size()); + auto erased = cache->Release(handle); + ASSERT_FALSE(erased); + // This tests that deleter is not called. When cache has free capacity it is + // not expected to immediately erase the released items. 
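  // Contrast with ReleaseAndErase above: the only difference is the second
  // argument, Release(handle, /*erase_if_last_ref=*/true) there versus plain
  // Release(handle) here, which decides whether the deleter runs immediately
  // or only on a later eviction/erase.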
+ ASSERT_EQ(0U, deleted_keys_.size()); +} + +TEST_P(CacheTest, SetCapacity) { + auto type = GetParam(); + if (type == kHyperClock) { + ROCKSDB_GTEST_BYPASS( + "FastLRUCache and HyperClockCache don't support arbitrary capacity " + "adjustments."); + return; + } + // test1: increase capacity + // lets create a cache with capacity 5, + // then, insert 5 elements, then increase capacity + // to 10, returned capacity should be 10, usage=5 + std::shared_ptr<Cache> cache = NewCache(5, 0, false); + std::vector<Cache::Handle*> handles(10); + // Insert 5 entries, but not releasing. + for (int i = 0; i < 5; i++) { + std::string key = EncodeKey(i + 1); + Status s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]); + ASSERT_TRUE(s.ok()); + } + ASSERT_EQ(5U, cache->GetCapacity()); + ASSERT_EQ(5U, cache->GetUsage()); + cache->SetCapacity(10); + ASSERT_EQ(10U, cache->GetCapacity()); + ASSERT_EQ(5U, cache->GetUsage()); + + // test2: decrease capacity + // insert 5 more elements to cache, then release 5, + // then decrease capacity to 7, final capacity should be 7 + // and usage should be 7 + for (int i = 5; i < 10; i++) { + std::string key = EncodeKey(i + 1); + Status s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]); + ASSERT_TRUE(s.ok()); + } + ASSERT_EQ(10U, cache->GetCapacity()); + ASSERT_EQ(10U, cache->GetUsage()); + for (int i = 0; i < 5; i++) { + cache->Release(handles[i]); + } + ASSERT_EQ(10U, cache->GetCapacity()); + ASSERT_EQ(10U, cache->GetUsage()); + cache->SetCapacity(7); + ASSERT_EQ(7, cache->GetCapacity()); + ASSERT_EQ(7, cache->GetUsage()); + + // release remaining 5 to keep valgrind happy + for (int i = 5; i < 10; i++) { + cache->Release(handles[i]); + } + + // Make sure this doesn't crash or upset ASAN/valgrind + cache->DisownData(); +} + +TEST_P(LRUCacheTest, SetStrictCapacityLimit) { + // test1: set the flag to false. Insert more keys than capacity. See if they + // all go through. + std::shared_ptr<Cache> cache = NewCache(5, 0, false); + std::vector<Cache::Handle*> handles(10); + Status s; + for (int i = 0; i < 10; i++) { + std::string key = EncodeKey(i + 1); + s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]); + ASSERT_OK(s); + ASSERT_NE(nullptr, handles[i]); + } + ASSERT_EQ(10, cache->GetUsage()); + + // test2: set the flag to true. Insert and check if it fails. + std::string extra_key = EncodeKey(100); + Value* extra_value = new Value(0); + cache->SetStrictCapacityLimit(true); + Cache::Handle* handle; + s = cache->Insert(extra_key, extra_value, 1, &deleter, &handle); + ASSERT_TRUE(s.IsMemoryLimit()); + ASSERT_EQ(nullptr, handle); + ASSERT_EQ(10, cache->GetUsage()); + + for (int i = 0; i < 10; i++) { + cache->Release(handles[i]); + } + + // test3: init with flag being true. + std::shared_ptr<Cache> cache2 = NewCache(5, 0, true); + for (int i = 0; i < 5; i++) { + std::string key = EncodeKey(i + 1); + s = cache2->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]); + ASSERT_OK(s); + ASSERT_NE(nullptr, handles[i]); + } + s = cache2->Insert(extra_key, extra_value, 1, &deleter, &handle); + ASSERT_TRUE(s.IsMemoryLimit()); + ASSERT_EQ(nullptr, handle); + // test insert without handle + s = cache2->Insert(extra_key, extra_value, 1, &deleter); + // AS if the key have been inserted into cache but get evicted immediately. 
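  // That is: with strict_capacity_limit set and no handle requested, Insert()
  // reports OK but drops the entry (freeing extra_value through the supplied
  // deleter), so usage stays at 5 and the Lookup below misses.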
+ ASSERT_OK(s); + ASSERT_EQ(5, cache2->GetUsage()); + ASSERT_EQ(nullptr, cache2->Lookup(extra_key)); + + for (int i = 0; i < 5; i++) { + cache2->Release(handles[i]); + } +} + +TEST_P(CacheTest, OverCapacity) { + size_t n = 10; + + // a LRUCache with n entries and one shard only + std::shared_ptr<Cache> cache = NewCache(n, 0, false); + + std::vector<Cache::Handle*> handles(n + 1); + + // Insert n+1 entries, but not releasing. + for (int i = 0; i < static_cast<int>(n + 1); i++) { + std::string key = EncodeKey(i + 1); + Status s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]); + ASSERT_TRUE(s.ok()); + } + + // Guess what's in the cache now? + for (int i = 0; i < static_cast<int>(n + 1); i++) { + std::string key = EncodeKey(i + 1); + auto h = cache->Lookup(key); + ASSERT_TRUE(h != nullptr); + if (h) cache->Release(h); + } + + // the cache is over capacity since nothing could be evicted + ASSERT_EQ(n + 1U, cache->GetUsage()); + for (int i = 0; i < static_cast<int>(n + 1); i++) { + cache->Release(handles[i]); + } + + if (GetParam() == kHyperClock) { + // Make sure eviction is triggered. + ASSERT_OK(cache->Insert(EncodeKey(-1), nullptr, 1, &deleter, &handles[0])); + + // cache is under capacity now since elements were released + ASSERT_GE(n, cache->GetUsage()); + + // clean up + cache->Release(handles[0]); + } else { + // LRUCache checks for over-capacity in Release. + + // cache is exactly at capacity now with minimal eviction + ASSERT_EQ(n, cache->GetUsage()); + + // element 0 is evicted and the rest is there + // This is consistent with the LRU policy since the element 0 + // was released first + for (int i = 0; i < static_cast<int>(n + 1); i++) { + std::string key = EncodeKey(i + 1); + auto h = cache->Lookup(key); + if (h) { + ASSERT_NE(static_cast<size_t>(i), 0U); + cache->Release(h); + } else { + ASSERT_EQ(static_cast<size_t>(i), 0U); + } + } + } +} + +namespace { +std::vector<std::pair<int, int>> legacy_callback_state; +void legacy_callback(void* value, size_t charge) { + legacy_callback_state.push_back( + {DecodeValue(value), static_cast<int>(charge)}); +} +}; // namespace + +TEST_P(CacheTest, ApplyToAllCacheEntriesTest) { + std::vector<std::pair<int, int>> inserted; + legacy_callback_state.clear(); + + for (int i = 0; i < 10; ++i) { + Insert(i, i * 2, i + 1); + inserted.push_back({i * 2, i + 1}); + } + cache_->ApplyToAllCacheEntries(legacy_callback, true); + + std::sort(inserted.begin(), inserted.end()); + std::sort(legacy_callback_state.begin(), legacy_callback_state.end()); + ASSERT_EQ(inserted.size(), legacy_callback_state.size()); + for (int i = 0; i < static_cast<int>(inserted.size()); ++i) { + EXPECT_EQ(inserted[i], legacy_callback_state[i]); + } +} + +TEST_P(CacheTest, ApplyToAllEntriesTest) { + std::vector<std::string> callback_state; + const auto callback = [&](const Slice& key, void* value, size_t charge, + Cache::DeleterFn deleter) { + callback_state.push_back(std::to_string(DecodeKey(key)) + "," + + std::to_string(DecodeValue(value)) + "," + + std::to_string(charge)); + assert(deleter == &CacheTest::Deleter); + }; + + std::vector<std::string> inserted; + callback_state.clear(); + + for (int i = 0; i < 10; ++i) { + Insert(i, i * 2, i + 1); + inserted.push_back(std::to_string(i) + "," + std::to_string(i * 2) + "," + + std::to_string(i + 1)); + } + cache_->ApplyToAllEntries(callback, /*opts*/ {}); + + std::sort(inserted.begin(), inserted.end()); + std::sort(callback_state.begin(), callback_state.end()); + ASSERT_EQ(inserted.size(), callback_state.size()); + for 
(int i = 0; i < static_cast<int>(inserted.size()); ++i) { + EXPECT_EQ(inserted[i], callback_state[i]); + } +} + +TEST_P(CacheTest, ApplyToAllEntriesDuringResize) { + // This is a mini-stress test of ApplyToAllEntries, to ensure + // items in the cache that are neither added nor removed + // during ApplyToAllEntries are counted exactly once. + + // Insert some entries that we expect to be seen exactly once + // during iteration. + constexpr int kSpecialCharge = 2; + constexpr int kNotSpecialCharge = 1; + constexpr int kSpecialCount = 100; + size_t expected_usage = 0; + for (int i = 0; i < kSpecialCount; ++i) { + Insert(i, i * 2, kSpecialCharge); + expected_usage += kSpecialCharge; + } + + // For callback + int special_count = 0; + const auto callback = [&](const Slice&, void*, size_t charge, + Cache::DeleterFn) { + if (charge == static_cast<size_t>(kSpecialCharge)) { + ++special_count; + } + }; + + // Start counting + std::thread apply_thread([&]() { + // Use small average_entries_per_lock to make the problem difficult + Cache::ApplyToAllEntriesOptions opts; + opts.average_entries_per_lock = 2; + cache_->ApplyToAllEntries(callback, opts); + }); + + // In parallel, add more entries, enough to cause resize but not enough + // to cause ejections. (Note: if any cache shard is over capacity, there + // will be ejections) + for (int i = kSpecialCount * 1; i < kSpecialCount * 5; ++i) { + Insert(i, i * 2, kNotSpecialCharge); + expected_usage += kNotSpecialCharge; + } + + apply_thread.join(); + // verify no evictions + ASSERT_EQ(cache_->GetUsage(), expected_usage); + // verify everything seen in ApplyToAllEntries + ASSERT_EQ(special_count, kSpecialCount); +} + +TEST_P(CacheTest, DefaultShardBits) { + // Prevent excessive allocation (to save time & space) + estimated_value_size_ = 100000; + // Implementations use different minimum shard sizes + size_t min_shard_size = + (GetParam() == kHyperClock ? 32U * 1024U : 512U) * 1024U; + + std::shared_ptr<Cache> cache = NewCache(32U * min_shard_size); + ShardedCacheBase* sc = dynamic_cast<ShardedCacheBase*>(cache.get()); + ASSERT_EQ(5, sc->GetNumShardBits()); + + cache = NewCache(min_shard_size / 1000U * 999U); + sc = dynamic_cast<ShardedCacheBase*>(cache.get()); + ASSERT_EQ(0, sc->GetNumShardBits()); + + cache = NewCache(3U * 1024U * 1024U * 1024U); + sc = dynamic_cast<ShardedCacheBase*>(cache.get()); + // current maximum of 6 + ASSERT_EQ(6, sc->GetNumShardBits()); + + if constexpr (sizeof(size_t) > 4) { + cache = NewCache(128U * min_shard_size); + sc = dynamic_cast<ShardedCacheBase*>(cache.get()); + // current maximum of 6 + ASSERT_EQ(6, sc->GetNumShardBits()); + } +} + +TEST_P(CacheTest, GetChargeAndDeleter) { + Insert(1, 2); + Cache::Handle* h1 = cache_->Lookup(EncodeKey(1)); + ASSERT_EQ(2, DecodeValue(cache_->Value(h1))); + ASSERT_EQ(1, cache_->GetCharge(h1)); + ASSERT_EQ(&CacheTest::Deleter, cache_->GetDeleter(h1)); + cache_->Release(h1); +} + +INSTANTIATE_TEST_CASE_P(CacheTestInstance, CacheTest, + testing::Values(kLRU, kHyperClock)); +INSTANTIATE_TEST_CASE_P(CacheTestInstance, LRUCacheTest, testing::Values(kLRU)); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/cache/charged_cache.cc b/src/rocksdb/cache/charged_cache.cc new file mode 100644 index 000000000..a9ff969b8 --- /dev/null +++ b/src/rocksdb/cache/charged_cache.cc @@ -0,0 +1,117 @@ +// Copyright (c) Meta Platforms, Inc. 
and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "cache/charged_cache.h" + +#include "cache/cache_reservation_manager.h" + +namespace ROCKSDB_NAMESPACE { + +ChargedCache::ChargedCache(std::shared_ptr<Cache> cache, + std::shared_ptr<Cache> block_cache) + : cache_(cache), + cache_res_mgr_(std::make_shared<ConcurrentCacheReservationManager>( + std::make_shared< + CacheReservationManagerImpl<CacheEntryRole::kBlobCache>>( + block_cache))) {} + +Status ChargedCache::Insert(const Slice& key, void* value, size_t charge, + DeleterFn deleter, Handle** handle, + Priority priority) { + Status s = cache_->Insert(key, value, charge, deleter, handle, priority); + if (s.ok()) { + // Insert may cause the cache entry eviction if the cache is full. So we + // directly call the reservation manager to update the total memory used + // in the cache. + assert(cache_res_mgr_); + cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage()) + .PermitUncheckedError(); + } + return s; +} + +Status ChargedCache::Insert(const Slice& key, void* value, + const CacheItemHelper* helper, size_t charge, + Handle** handle, Priority priority) { + Status s = cache_->Insert(key, value, helper, charge, handle, priority); + if (s.ok()) { + // Insert may cause the cache entry eviction if the cache is full. So we + // directly call the reservation manager to update the total memory used + // in the cache. + assert(cache_res_mgr_); + cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage()) + .PermitUncheckedError(); + } + return s; +} + +Cache::Handle* ChargedCache::Lookup(const Slice& key, Statistics* stats) { + return cache_->Lookup(key, stats); +} + +Cache::Handle* ChargedCache::Lookup(const Slice& key, + const CacheItemHelper* helper, + const CreateCallback& create_cb, + Priority priority, bool wait, + Statistics* stats) { + auto handle = cache_->Lookup(key, helper, create_cb, priority, wait, stats); + // Lookup may promote the KV pair from the secondary cache to the primary + // cache. So we directly call the reservation manager to update the total + // memory used in the cache. 
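  // Note the bookkeeping pattern in this file: Insert/Lookup/Erase and
  // SetCapacity simply re-sync the reservation to cache_->GetUsage() after
  // the call, while Release (below) subtracts the released handle's charge
  // directly; either way the block-cache side is adjusted through
  // cache_res_mgr_.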
+ assert(cache_res_mgr_); + cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage()) + .PermitUncheckedError(); + return handle; +} + +bool ChargedCache::Release(Cache::Handle* handle, bool useful, + bool erase_if_last_ref) { + size_t memory_used_delta = cache_->GetUsage(handle); + bool erased = cache_->Release(handle, useful, erase_if_last_ref); + if (erased) { + assert(cache_res_mgr_); + cache_res_mgr_ + ->UpdateCacheReservation(memory_used_delta, /* increase */ false) + .PermitUncheckedError(); + } + return erased; +} + +bool ChargedCache::Release(Cache::Handle* handle, bool erase_if_last_ref) { + size_t memory_used_delta = cache_->GetUsage(handle); + bool erased = cache_->Release(handle, erase_if_last_ref); + if (erased) { + assert(cache_res_mgr_); + cache_res_mgr_ + ->UpdateCacheReservation(memory_used_delta, /* increase */ false) + .PermitUncheckedError(); + } + return erased; +} + +void ChargedCache::Erase(const Slice& key) { + cache_->Erase(key); + assert(cache_res_mgr_); + cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage()) + .PermitUncheckedError(); +} + +void ChargedCache::EraseUnRefEntries() { + cache_->EraseUnRefEntries(); + assert(cache_res_mgr_); + cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage()) + .PermitUncheckedError(); +} + +void ChargedCache::SetCapacity(size_t capacity) { + cache_->SetCapacity(capacity); + // SetCapacity can result in evictions when the cache capacity is decreased, + // so we would want to update the cache reservation here as well. + assert(cache_res_mgr_); + cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage()) + .PermitUncheckedError(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/cache/charged_cache.h b/src/rocksdb/cache/charged_cache.h new file mode 100644 index 000000000..1739e4088 --- /dev/null +++ b/src/rocksdb/cache/charged_cache.h @@ -0,0 +1,121 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <string> + +#include "port/port.h" +#include "rocksdb/cache.h" + +namespace ROCKSDB_NAMESPACE { + +class ConcurrentCacheReservationManager; + +// A cache interface which wraps around another cache and takes care of +// reserving space in block cache towards a single global memory limit, and +// forwards all the calls to the underlying cache. 
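// A minimal construction sketch (hypothetical sizes; assuming NewLRUCache and
// the two-argument constructor declared below):
//
//   std::shared_ptr<Cache> block_cache = NewLRUCache(1 << 30);
//   std::shared_ptr<Cache> blob_cache = NewLRUCache(256 << 20);
//   auto charged = std::make_shared<ChargedCache>(blob_cache, block_cache);
//
// Usage of `charged` (the wrapped cache, typically the blob cache given the
// CacheEntryRole::kBlobCache role used in charged_cache.cc) is then mirrored
// into `block_cache` as a reservation, so both consumers stay under the block
// cache's single capacity limit.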
+class ChargedCache : public Cache { + public: + ChargedCache(std::shared_ptr<Cache> cache, + std::shared_ptr<Cache> block_cache); + ~ChargedCache() override = default; + + Status Insert(const Slice& key, void* value, size_t charge, DeleterFn deleter, + Handle** handle, Priority priority) override; + Status Insert(const Slice& key, void* value, const CacheItemHelper* helper, + size_t charge, Handle** handle = nullptr, + Priority priority = Priority::LOW) override; + + Cache::Handle* Lookup(const Slice& key, Statistics* stats) override; + Cache::Handle* Lookup(const Slice& key, const CacheItemHelper* helper, + const CreateCallback& create_cb, Priority priority, + bool wait, Statistics* stats = nullptr) override; + + bool Release(Cache::Handle* handle, bool useful, + bool erase_if_last_ref = false) override; + bool Release(Cache::Handle* handle, bool erase_if_last_ref = false) override; + + void Erase(const Slice& key) override; + void EraseUnRefEntries() override; + + static const char* kClassName() { return "ChargedCache"; } + const char* Name() const override { return kClassName(); } + + uint64_t NewId() override { return cache_->NewId(); } + + void SetCapacity(size_t capacity) override; + + void SetStrictCapacityLimit(bool strict_capacity_limit) override { + cache_->SetStrictCapacityLimit(strict_capacity_limit); + } + + bool HasStrictCapacityLimit() const override { + return cache_->HasStrictCapacityLimit(); + } + + void* Value(Cache::Handle* handle) override { return cache_->Value(handle); } + + bool IsReady(Cache::Handle* handle) override { + return cache_->IsReady(handle); + } + + void Wait(Cache::Handle* handle) override { cache_->Wait(handle); } + + void WaitAll(std::vector<Handle*>& handles) override { + cache_->WaitAll(handles); + } + + bool Ref(Cache::Handle* handle) override { return cache_->Ref(handle); } + + size_t GetCapacity() const override { return cache_->GetCapacity(); } + + size_t GetUsage() const override { return cache_->GetUsage(); } + + size_t GetUsage(Cache::Handle* handle) const override { + return cache_->GetUsage(handle); + } + + size_t GetPinnedUsage() const override { return cache_->GetPinnedUsage(); } + + size_t GetCharge(Cache::Handle* handle) const override { + return cache_->GetCharge(handle); + } + + Cache::DeleterFn GetDeleter(Cache::Handle* handle) const override { + return cache_->GetDeleter(handle); + } + + void ApplyToAllEntries( + const std::function<void(const Slice& key, void* value, size_t charge, + Cache::DeleterFn deleter)>& callback, + const Cache::ApplyToAllEntriesOptions& opts) override { + cache_->ApplyToAllEntries(callback, opts); + } + + void ApplyToAllCacheEntries(void (*callback)(void* value, size_t charge), + bool thread_safe) override { + cache_->ApplyToAllCacheEntries(callback, thread_safe); + } + + std::string GetPrintableOptions() const override { + return cache_->GetPrintableOptions(); + } + + void DisownData() override { return cache_->DisownData(); } + + inline Cache* GetCache() const { return cache_.get(); } + + inline ConcurrentCacheReservationManager* TEST_GetCacheReservationManager() + const { + return cache_res_mgr_.get(); + } + + private: + std::shared_ptr<Cache> cache_; + std::shared_ptr<ConcurrentCacheReservationManager> cache_res_mgr_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/cache/clock_cache.cc b/src/rocksdb/cache/clock_cache.cc new file mode 100644 index 000000000..6c9f18c2f --- /dev/null +++ b/src/rocksdb/cache/clock_cache.cc @@ -0,0 +1,1404 @@ +// Copyright (c) 2011-present, Facebook, Inc. 
All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "cache/clock_cache.h" + +#include <cassert> +#include <functional> +#include <numeric> + +#include "cache/cache_key.h" +#include "logging/logging.h" +#include "monitoring/perf_context_imp.h" +#include "monitoring/statistics.h" +#include "port/lang.h" +#include "util/hash.h" +#include "util/math.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +namespace clock_cache { + +namespace { +inline uint64_t GetRefcount(uint64_t meta) { + return ((meta >> ClockHandle::kAcquireCounterShift) - + (meta >> ClockHandle::kReleaseCounterShift)) & + ClockHandle::kCounterMask; +} + +inline uint64_t GetInitialCountdown(Cache::Priority priority) { + // Set initial clock data from priority + // TODO: configuration parameters for priority handling and clock cycle + // count? + switch (priority) { + case Cache::Priority::HIGH: + return ClockHandle::kHighCountdown; + default: + assert(false); + FALLTHROUGH_INTENDED; + case Cache::Priority::LOW: + return ClockHandle::kLowCountdown; + case Cache::Priority::BOTTOM: + return ClockHandle::kBottomCountdown; + } +} + +inline void FreeDataMarkEmpty(ClockHandle& h) { + // NOTE: in theory there's more room for parallelism if we copy the handle + // data and delay actions like this until after marking the entry as empty, + // but performance tests only show a regression by copying the few words + // of data. + h.FreeData(); + +#ifndef NDEBUG + // Mark slot as empty, with assertion + uint64_t meta = h.meta.exchange(0, std::memory_order_release); + assert(meta >> ClockHandle::kStateShift == ClockHandle::kStateConstruction); +#else + // Mark slot as empty + h.meta.store(0, std::memory_order_release); +#endif +} + +inline bool ClockUpdate(ClockHandle& h) { + uint64_t meta = h.meta.load(std::memory_order_relaxed); + + uint64_t acquire_count = + (meta >> ClockHandle::kAcquireCounterShift) & ClockHandle::kCounterMask; + uint64_t release_count = + (meta >> ClockHandle::kReleaseCounterShift) & ClockHandle::kCounterMask; + // fprintf(stderr, "ClockUpdate @ %p: %lu %lu %u\n", &h, acquire_count, + // release_count, (unsigned)(meta >> ClockHandle::kStateShift)); + if (acquire_count != release_count) { + // Only clock update entries with no outstanding refs + return false; + } + if (!((meta >> ClockHandle::kStateShift) & ClockHandle::kStateShareableBit)) { + // Only clock update Shareable entries + return false; + } + if ((meta >> ClockHandle::kStateShift == ClockHandle::kStateVisible) && + acquire_count > 0) { + // Decrement clock + uint64_t new_count = + std::min(acquire_count - 1, uint64_t{ClockHandle::kMaxCountdown} - 1); + // Compare-exchange in the decremented clock info, but + // not aggressively + uint64_t new_meta = + (uint64_t{ClockHandle::kStateVisible} << ClockHandle::kStateShift) | + (new_count << ClockHandle::kReleaseCounterShift) | + (new_count << ClockHandle::kAcquireCounterShift); + h.meta.compare_exchange_strong(meta, new_meta, std::memory_order_relaxed); + return false; + } + // Otherwise, remove entry (either unreferenced invisible or + // unreferenced and expired visible). 
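  // The compare_exchange below tries to move the slot from a Shareable state
  // to kStateConstruction, i.e. to take exclusive ownership of it for removal;
  // if another thread touched the slot in the meantime the CAS fails and the
  // entry is simply skipped for this clock pass.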
+ if (h.meta.compare_exchange_strong( + meta, + uint64_t{ClockHandle::kStateConstruction} << ClockHandle::kStateShift, + std::memory_order_acquire)) { + // Took ownership. + return true; + } else { + // Compare-exchange failing probably + // indicates the entry was used, so skip it in that case. + return false; + } +} + +} // namespace + +void ClockHandleBasicData::FreeData() const { + if (deleter) { + UniqueId64x2 unhashed; + (*deleter)( + ClockCacheShard<HyperClockTable>::ReverseHash(hashed_key, &unhashed), + value); + } +} + +HyperClockTable::HyperClockTable( + size_t capacity, bool /*strict_capacity_limit*/, + CacheMetadataChargePolicy metadata_charge_policy, const Opts& opts) + : length_bits_(CalcHashBits(capacity, opts.estimated_value_size, + metadata_charge_policy)), + length_bits_mask_((size_t{1} << length_bits_) - 1), + occupancy_limit_(static_cast<size_t>((uint64_t{1} << length_bits_) * + kStrictLoadFactor)), + array_(new HandleImpl[size_t{1} << length_bits_]) { + if (metadata_charge_policy == + CacheMetadataChargePolicy::kFullChargeCacheMetadata) { + usage_ += size_t{GetTableSize()} * sizeof(HandleImpl); + } + + static_assert(sizeof(HandleImpl) == 64U, + "Expecting size / alignment with common cache line size"); +} + +HyperClockTable::~HyperClockTable() { + // Assumes there are no references or active operations on any slot/element + // in the table. + for (size_t i = 0; i < GetTableSize(); i++) { + HandleImpl& h = array_[i]; + switch (h.meta >> ClockHandle::kStateShift) { + case ClockHandle::kStateEmpty: + // noop + break; + case ClockHandle::kStateInvisible: // rare but possible + case ClockHandle::kStateVisible: + assert(GetRefcount(h.meta) == 0); + h.FreeData(); +#ifndef NDEBUG + Rollback(h.hashed_key, &h); + ReclaimEntryUsage(h.GetTotalCharge()); +#endif + break; + // otherwise + default: + assert(false); + break; + } + } + +#ifndef NDEBUG + for (size_t i = 0; i < GetTableSize(); i++) { + assert(array_[i].displacements.load() == 0); + } +#endif + + assert(usage_.load() == 0 || + usage_.load() == size_t{GetTableSize()} * sizeof(HandleImpl)); + assert(occupancy_ == 0); +} + +// If an entry doesn't receive clock updates but is repeatedly referenced & +// released, the acquire and release counters could overflow without some +// intervention. This is that intervention, which should be inexpensive +// because it only incurs a simple, very predictable check. (Applying a bit +// mask in addition to an increment to every Release likely would be +// relatively expensive, because it's an extra atomic update.) +// +// We do have to assume that we never have many millions of simultaneous +// references to a cache handle, because we cannot represent so many +// references with the difference in counters, masked to the number of +// counter bits. Similarly, we assume there aren't millions of threads +// holding transient references (which might be "undone" rather than +// released by the way). +// +// Consider these possible states for each counter: +// low: less than kMaxCountdown +// medium: kMaxCountdown to half way to overflow + kMaxCountdown +// high: half way to overflow + kMaxCountdown, or greater +// +// And these possible states for the combination of counters: +// acquire / release +// ------- ------- +// low low - Normal / common, with caveats (see below) +// medium low - Can happen while holding some refs +// high low - Violates assumptions (too many refs) +// low medium - Violates assumptions (refs underflow, etc.) 
+// medium medium - Normal (very read heavy cache) +// high medium - Can happen while holding some refs +// low high - This function is supposed to prevent +// medium high - Violates assumptions (refs underflow, etc.) +// high high - Needs CorrectNearOverflow +// +// Basically, this function detects (high, high) state (inferred from +// release alone being high) and bumps it back down to (medium, medium) +// state with the same refcount and the same logical countdown counter +// (everything > kMaxCountdown is logically the same). Note that bumping +// down to (low, low) would modify the countdown counter, so is "reserved" +// in a sense. +// +// If near-overflow correction is triggered here, there's no guarantee +// that another thread hasn't freed the entry and replaced it with another. +// Therefore, it must be the case that the correction does not affect +// entries unless they are very old (many millions of acquire-release cycles). +// (Our bit manipulation is indeed idempotent and only affects entries in +// exceptional cases.) We assume a pre-empted thread will not stall that long. +// If it did, the state could be corrupted in the (unlikely) case that the top +// bit of the acquire counter is set but not the release counter, and thus +// we only clear the top bit of the acquire counter on resumption. It would +// then appear that there are too many refs and the entry would be permanently +// pinned (which is not terrible for an exceptionally rare occurrence), unless +// it is referenced enough (at least kMaxCountdown more times) for the release +// counter to reach "high" state again and bumped back to "medium." (This +// motivates only checking for release counter in high state, not both in high +// state.) +inline void CorrectNearOverflow(uint64_t old_meta, + std::atomic<uint64_t>& meta) { + // We clear both top-most counter bits at the same time. + constexpr uint64_t kCounterTopBit = uint64_t{1} + << (ClockHandle::kCounterNumBits - 1); + constexpr uint64_t kClearBits = + (kCounterTopBit << ClockHandle::kAcquireCounterShift) | + (kCounterTopBit << ClockHandle::kReleaseCounterShift); + // A simple check that allows us to initiate clearing the top bits for + // a large portion of the "high" state space on release counter. + constexpr uint64_t kCheckBits = + (kCounterTopBit | (ClockHandle::kMaxCountdown + 1)) + << ClockHandle::kReleaseCounterShift; + + if (UNLIKELY(old_meta & kCheckBits)) { + meta.fetch_and(~kClearBits, std::memory_order_relaxed); + } +} + +inline Status HyperClockTable::ChargeUsageMaybeEvictStrict( + size_t total_charge, size_t capacity, bool need_evict_for_occupancy) { + if (total_charge > capacity) { + return Status::MemoryLimit( + "Cache entry too large for a single cache shard: " + + std::to_string(total_charge) + " > " + std::to_string(capacity)); + } + // Grab any available capacity, and free up any more required. + size_t old_usage = usage_.load(std::memory_order_relaxed); + size_t new_usage; + if (LIKELY(old_usage != capacity)) { + do { + new_usage = std::min(capacity, old_usage + total_charge); + } while (!usage_.compare_exchange_weak(old_usage, new_usage, + std::memory_order_relaxed)); + } else { + new_usage = old_usage; + } + // How much do we need to evict then? + size_t need_evict_charge = old_usage + total_charge - new_usage; + size_t request_evict_charge = need_evict_charge; + if (UNLIKELY(need_evict_for_occupancy) && request_evict_charge == 0) { + // Require at least 1 eviction. 
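    // (Usage alone may already fit, but the shard's occupancy count is at or
    // over its limit, so at least one slot still has to be freed before this
    // insert can proceed.)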
+ request_evict_charge = 1; + } + if (request_evict_charge > 0) { + size_t evicted_charge = 0; + size_t evicted_count = 0; + Evict(request_evict_charge, &evicted_charge, &evicted_count); + occupancy_.fetch_sub(evicted_count, std::memory_order_release); + if (LIKELY(evicted_charge > need_evict_charge)) { + assert(evicted_count > 0); + // Evicted more than enough + usage_.fetch_sub(evicted_charge - need_evict_charge, + std::memory_order_relaxed); + } else if (evicted_charge < need_evict_charge || + (UNLIKELY(need_evict_for_occupancy) && evicted_count == 0)) { + // Roll back to old usage minus evicted + usage_.fetch_sub(evicted_charge + (new_usage - old_usage), + std::memory_order_relaxed); + if (evicted_charge < need_evict_charge) { + return Status::MemoryLimit( + "Insert failed because unable to evict entries to stay within " + "capacity limit."); + } else { + return Status::MemoryLimit( + "Insert failed because unable to evict entries to stay within " + "table occupancy limit."); + } + } + // If we needed to evict something and we are proceeding, we must have + // evicted something. + assert(evicted_count > 0); + } + return Status::OK(); +} + +inline bool HyperClockTable::ChargeUsageMaybeEvictNonStrict( + size_t total_charge, size_t capacity, bool need_evict_for_occupancy) { + // For simplicity, we consider that either the cache can accept the insert + // with no evictions, or we must evict enough to make (at least) enough + // space. It could lead to unnecessary failures or excessive evictions in + // some extreme cases, but allows a fast, simple protocol. If we allow a + // race to get us over capacity, then we might never get back to capacity + // limit if the sizes of entries allow each insertion to evict the minimum + // charge. Thus, we should evict some extra if it's not a signifcant + // portion of the shard capacity. This can have the side benefit of + // involving fewer threads in eviction. + size_t old_usage = usage_.load(std::memory_order_relaxed); + size_t need_evict_charge; + // NOTE: if total_charge > old_usage, there isn't yet enough to evict + // `total_charge` amount. Even if we only try to evict `old_usage` amount, + // there's likely something referenced and we would eat CPU looking for + // enough to evict. + if (old_usage + total_charge <= capacity || total_charge > old_usage) { + // Good enough for me (might run over with a race) + need_evict_charge = 0; + } else { + // Try to evict enough space, and maybe some extra + need_evict_charge = total_charge; + if (old_usage > capacity) { + // Not too much to avoid thundering herd while avoiding strict + // synchronization, such as the compare_exchange used with strict + // capacity limit. 
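+ // For concreteness (hypothetical numbers): the adjustment below adds at
+ // most capacity / 1024 (about 0.1% of the shard) plus 1 of extra eviction,
+ // and never more than the new entry's own charge plus 1. For a 64 MiB
+ // shard that is at most roughly 64 KiB of over-commitment paid down by
+ // this thread.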
+ need_evict_charge += std::min(capacity / 1024, total_charge) + 1; + } + } + if (UNLIKELY(need_evict_for_occupancy) && need_evict_charge == 0) { + // Special case: require at least 1 eviction if we only have to + // deal with occupancy + need_evict_charge = 1; + } + size_t evicted_charge = 0; + size_t evicted_count = 0; + if (need_evict_charge > 0) { + Evict(need_evict_charge, &evicted_charge, &evicted_count); + // Deal with potential occupancy deficit + if (UNLIKELY(need_evict_for_occupancy) && evicted_count == 0) { + assert(evicted_charge == 0); + // Can't meet occupancy requirement + return false; + } else { + // Update occupancy for evictions + occupancy_.fetch_sub(evicted_count, std::memory_order_release); + } + } + // Track new usage even if we weren't able to evict enough + usage_.fetch_add(total_charge - evicted_charge, std::memory_order_relaxed); + // No underflow + assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2); + // Success + return true; +} + +inline HyperClockTable::HandleImpl* HyperClockTable::DetachedInsert( + const ClockHandleBasicData& proto) { + // Heap allocated separate from table + HandleImpl* h = new HandleImpl(); + ClockHandleBasicData* h_alias = h; + *h_alias = proto; + h->SetDetached(); + // Single reference (detached entries only created if returning a refed + // Handle back to user) + uint64_t meta = uint64_t{ClockHandle::kStateInvisible} + << ClockHandle::kStateShift; + meta |= uint64_t{1} << ClockHandle::kAcquireCounterShift; + h->meta.store(meta, std::memory_order_release); + // Keep track of how much of usage is detached + detached_usage_.fetch_add(proto.GetTotalCharge(), std::memory_order_relaxed); + return h; +} + +Status HyperClockTable::Insert(const ClockHandleBasicData& proto, + HandleImpl** handle, Cache::Priority priority, + size_t capacity, bool strict_capacity_limit) { + // Do we have the available occupancy? Optimistically assume we do + // and deal with it if we don't. + size_t old_occupancy = occupancy_.fetch_add(1, std::memory_order_acquire); + auto revert_occupancy_fn = [&]() { + occupancy_.fetch_sub(1, std::memory_order_relaxed); + }; + // Whether we over-committed and need an eviction to make up for it + bool need_evict_for_occupancy = old_occupancy >= occupancy_limit_; + + // Usage/capacity handling is somewhat different depending on + // strict_capacity_limit, but mostly pessimistic. + bool use_detached_insert = false; + const size_t total_charge = proto.GetTotalCharge(); + if (strict_capacity_limit) { + Status s = ChargeUsageMaybeEvictStrict(total_charge, capacity, + need_evict_for_occupancy); + if (!s.ok()) { + revert_occupancy_fn(); + return s; + } + } else { + // Case strict_capacity_limit == false + bool success = ChargeUsageMaybeEvictNonStrict(total_charge, capacity, + need_evict_for_occupancy); + if (!success) { + revert_occupancy_fn(); + if (handle == nullptr) { + // Don't insert the entry but still return ok, as if the entry + // inserted into cache and evicted immediately. + proto.FreeData(); + return Status::OK(); + } else { + // Need to track usage of fallback detached insert + usage_.fetch_add(total_charge, std::memory_order_relaxed); + use_detached_insert = true; + } + } + } + auto revert_usage_fn = [&]() { + usage_.fetch_sub(total_charge, std::memory_order_relaxed); + // No underflow + assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2); + }; + + if (!use_detached_insert) { + // Attempt a table insert, but abort if we find an existing entry for the + // key. 
If we were to overwrite old entries, we would either + // * Have to gain ownership over an existing entry to overwrite it, which + // would only work if there are no outstanding (read) references and would + // create a small gap in availability of the entry (old or new) to lookups. + // * Have to insert into a suboptimal location (more probes) so that the + // old entry can be kept around as well. + + uint64_t initial_countdown = GetInitialCountdown(priority); + assert(initial_countdown > 0); + + size_t probe = 0; + HandleImpl* e = FindSlot( + proto.hashed_key, + [&](HandleImpl* h) { + // Optimistically transition the slot from "empty" to + // "under construction" (no effect on other states) + uint64_t old_meta = + h->meta.fetch_or(uint64_t{ClockHandle::kStateOccupiedBit} + << ClockHandle::kStateShift, + std::memory_order_acq_rel); + uint64_t old_state = old_meta >> ClockHandle::kStateShift; + + if (old_state == ClockHandle::kStateEmpty) { + // We've started inserting into an available slot, and taken + // ownership Save data fields + ClockHandleBasicData* h_alias = h; + *h_alias = proto; + + // Transition from "under construction" state to "visible" state + uint64_t new_meta = uint64_t{ClockHandle::kStateVisible} + << ClockHandle::kStateShift; + + // Maybe with an outstanding reference + new_meta |= initial_countdown << ClockHandle::kAcquireCounterShift; + new_meta |= (initial_countdown - (handle != nullptr)) + << ClockHandle::kReleaseCounterShift; + +#ifndef NDEBUG + // Save the state transition, with assertion + old_meta = h->meta.exchange(new_meta, std::memory_order_release); + assert(old_meta >> ClockHandle::kStateShift == + ClockHandle::kStateConstruction); +#else + // Save the state transition + h->meta.store(new_meta, std::memory_order_release); +#endif + return true; + } else if (old_state != ClockHandle::kStateVisible) { + // Slot not usable / touchable now + return false; + } + // Existing, visible entry, which might be a match. + // But first, we need to acquire a ref to read it. In fact, number of + // refs for initial countdown, so that we boost the clock state if + // this is a match. + old_meta = h->meta.fetch_add( + ClockHandle::kAcquireIncrement * initial_countdown, + std::memory_order_acq_rel); + // Like Lookup + if ((old_meta >> ClockHandle::kStateShift) == + ClockHandle::kStateVisible) { + // Acquired a read reference + if (h->hashed_key == proto.hashed_key) { + // Match. Release in a way that boosts the clock state + old_meta = h->meta.fetch_add( + ClockHandle::kReleaseIncrement * initial_countdown, + std::memory_order_acq_rel); + // Correct for possible (but rare) overflow + CorrectNearOverflow(old_meta, h->meta); + // Insert detached instead (only if return handle needed) + use_detached_insert = true; + return true; + } else { + // Mismatch. Pretend we never took the reference + old_meta = h->meta.fetch_sub( + ClockHandle::kAcquireIncrement * initial_countdown, + std::memory_order_acq_rel); + } + } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) == + ClockHandle::kStateInvisible)) { + // Pretend we never took the reference + // WART: there's a tiny chance we release last ref to invisible + // entry here. If that happens, we let eviction take care of it. + old_meta = h->meta.fetch_sub( + ClockHandle::kAcquireIncrement * initial_countdown, + std::memory_order_acq_rel); + } else { + // For other states, incrementing the acquire counter has no effect + // so we don't need to undo it. + // Slot not usable / touchable now. 
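+ // (Illustrative aside, simplified and not the exact helper used here:
+ // for the Shareable states handled above, the ref count used throughout
+ // this file is effectively the masked difference of the two counters,
+ // roughly
+ //   ((meta >> kAcquireCounterShift) - (meta >> kReleaseCounterShift))
+ //       & kCounterMask,
+ // which is why a single fetch_add of kAcquireIncrement * initial_countdown
+ // can take initial_countdown references at once, and releasing them as
+ // "useful" on a key match boosts the entry's clock countdown.)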
+ } + (void)old_meta; + return false; + }, + [&](HandleImpl* /*h*/) { return false; }, + [&](HandleImpl* h) { + h->displacements.fetch_add(1, std::memory_order_relaxed); + }, + probe); + if (e == nullptr) { + // Occupancy check and never abort FindSlot above should generally + // prevent this, except it's theoretically possible for other threads + // to evict and replace entries in the right order to hit every slot + // when it is populated. Assuming random hashing, the chance of that + // should be no higher than pow(kStrictLoadFactor, n) for n slots. + // That should be infeasible for roughly n >= 256, so if this assertion + // fails, that suggests something is going wrong. + assert(GetTableSize() < 256); + use_detached_insert = true; + } + if (!use_detached_insert) { + // Successfully inserted + if (handle) { + *handle = e; + } + return Status::OK(); + } + // Roll back table insertion + Rollback(proto.hashed_key, e); + revert_occupancy_fn(); + // Maybe fall back on detached insert + if (handle == nullptr) { + revert_usage_fn(); + // As if unrefed entry immdiately evicted + proto.FreeData(); + return Status::OK(); + } + } + + // Run detached insert + assert(use_detached_insert); + + *handle = DetachedInsert(proto); + + // The OkOverwritten status is used to count "redundant" insertions into + // block cache. This implementation doesn't strictly check for redundant + // insertions, but we instead are probably interested in how many insertions + // didn't go into the table (instead "detached"), which could be redundant + // Insert or some other reason (use_detached_insert reasons above). + return Status::OkOverwritten(); +} + +HyperClockTable::HandleImpl* HyperClockTable::Lookup( + const UniqueId64x2& hashed_key) { + size_t probe = 0; + HandleImpl* e = FindSlot( + hashed_key, + [&](HandleImpl* h) { + // Mostly branch-free version (similar performance) + /* + uint64_t old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement, + std::memory_order_acquire); + bool Shareable = (old_meta >> (ClockHandle::kStateShift + 1)) & 1U; + bool visible = (old_meta >> ClockHandle::kStateShift) & 1U; + bool match = (h->key == key) & visible; + h->meta.fetch_sub(static_cast<uint64_t>(Shareable & !match) << + ClockHandle::kAcquireCounterShift, std::memory_order_release); return + match; + */ + // Optimistic lookup should pay off when the table is relatively + // sparse. + constexpr bool kOptimisticLookup = true; + uint64_t old_meta; + if (!kOptimisticLookup) { + old_meta = h->meta.load(std::memory_order_acquire); + if ((old_meta >> ClockHandle::kStateShift) != + ClockHandle::kStateVisible) { + return false; + } + } + // (Optimistically) increment acquire counter + old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement, + std::memory_order_acquire); + // Check if it's an entry visible to lookups + if ((old_meta >> ClockHandle::kStateShift) == + ClockHandle::kStateVisible) { + // Acquired a read reference + if (h->hashed_key == hashed_key) { + // Match + return true; + } else { + // Mismatch. Pretend we never took the reference + old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement, + std::memory_order_release); + } + } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) == + ClockHandle::kStateInvisible)) { + // Pretend we never took the reference + // WART: there's a tiny chance we release last ref to invisible + // entry here. If that happens, we let eviction take care of it. 
+ old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement, + std::memory_order_release); + } else { + // For other states, incrementing the acquire counter has no effect + // so we don't need to undo it. Furthermore, we cannot safely undo + // it because we did not acquire a read reference to lock the + // entry in a Shareable state. + } + (void)old_meta; + return false; + }, + [&](HandleImpl* h) { + return h->displacements.load(std::memory_order_relaxed) == 0; + }, + [&](HandleImpl* /*h*/) {}, probe); + + return e; +} + +bool HyperClockTable::Release(HandleImpl* h, bool useful, + bool erase_if_last_ref) { + // In contrast with LRUCache's Release, this function won't delete the handle + // when the cache is above capacity and the reference is the last one. Space + // is only freed up by EvictFromClock (called by Insert when space is needed) + // and Erase. We do this to avoid an extra atomic read of the variable usage_. + + uint64_t old_meta; + if (useful) { + // Increment release counter to indicate was used + old_meta = h->meta.fetch_add(ClockHandle::kReleaseIncrement, + std::memory_order_release); + } else { + // Decrement acquire counter to pretend it never happened + old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement, + std::memory_order_release); + } + + assert((old_meta >> ClockHandle::kStateShift) & + ClockHandle::kStateShareableBit); + // No underflow + assert(((old_meta >> ClockHandle::kAcquireCounterShift) & + ClockHandle::kCounterMask) != + ((old_meta >> ClockHandle::kReleaseCounterShift) & + ClockHandle::kCounterMask)); + + if (erase_if_last_ref || UNLIKELY(old_meta >> ClockHandle::kStateShift == + ClockHandle::kStateInvisible)) { + // Update for last fetch_add op + if (useful) { + old_meta += ClockHandle::kReleaseIncrement; + } else { + old_meta -= ClockHandle::kAcquireIncrement; + } + // Take ownership if no refs + do { + if (GetRefcount(old_meta) != 0) { + // Not last ref at some point in time during this Release call + // Correct for possible (but rare) overflow + CorrectNearOverflow(old_meta, h->meta); + return false; + } + if ((old_meta & (uint64_t{ClockHandle::kStateShareableBit} + << ClockHandle::kStateShift)) == 0) { + // Someone else took ownership + return false; + } + // Note that there's a small chance that we release, another thread + // replaces this entry with another, reaches zero refs, and then we end + // up erasing that other entry. That's an acceptable risk / imprecision. 
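+ // Worked example of the counter bookkeeping (hypothetical numbers,
+ // assuming an initial countdown of 3, e.g. HIGH priority, with no handle
+ // retained): the entry starts with acquire == release == 3, i.e. zero
+ // refs. A Lookup raises acquire to 4 (one ref); Release(useful=true)
+ // raises release to 4 (zero refs again), while Release(useful=false)
+ // would instead put acquire back to 3. The exchange below only takes
+ // ownership for freeing while the two counts are equal, i.e. refs == 0.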
+ } while (!h->meta.compare_exchange_weak( + old_meta, + uint64_t{ClockHandle::kStateConstruction} << ClockHandle::kStateShift, + std::memory_order_acquire)); + // Took ownership + size_t total_charge = h->GetTotalCharge(); + if (UNLIKELY(h->IsDetached())) { + h->FreeData(); + // Delete detached handle + delete h; + detached_usage_.fetch_sub(total_charge, std::memory_order_relaxed); + usage_.fetch_sub(total_charge, std::memory_order_relaxed); + } else { + Rollback(h->hashed_key, h); + FreeDataMarkEmpty(*h); + ReclaimEntryUsage(total_charge); + } + return true; + } else { + // Correct for possible (but rare) overflow + CorrectNearOverflow(old_meta, h->meta); + return false; + } +} + +void HyperClockTable::Ref(HandleImpl& h) { + // Increment acquire counter + uint64_t old_meta = h.meta.fetch_add(ClockHandle::kAcquireIncrement, + std::memory_order_acquire); + + assert((old_meta >> ClockHandle::kStateShift) & + ClockHandle::kStateShareableBit); + // Must have already had a reference + assert(GetRefcount(old_meta) > 0); + (void)old_meta; +} + +void HyperClockTable::TEST_RefN(HandleImpl& h, size_t n) { + // Increment acquire counter + uint64_t old_meta = h.meta.fetch_add(n * ClockHandle::kAcquireIncrement, + std::memory_order_acquire); + + assert((old_meta >> ClockHandle::kStateShift) & + ClockHandle::kStateShareableBit); + (void)old_meta; +} + +void HyperClockTable::TEST_ReleaseN(HandleImpl* h, size_t n) { + if (n > 0) { + // Split into n - 1 and 1 steps. + uint64_t old_meta = h->meta.fetch_add( + (n - 1) * ClockHandle::kReleaseIncrement, std::memory_order_acquire); + assert((old_meta >> ClockHandle::kStateShift) & + ClockHandle::kStateShareableBit); + (void)old_meta; + + Release(h, /*useful*/ true, /*erase_if_last_ref*/ false); + } +} + +void HyperClockTable::Erase(const UniqueId64x2& hashed_key) { + size_t probe = 0; + (void)FindSlot( + hashed_key, + [&](HandleImpl* h) { + // Could be multiple entries in rare cases. Erase them all. + // Optimistically increment acquire counter + uint64_t old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement, + std::memory_order_acquire); + // Check if it's an entry visible to lookups + if ((old_meta >> ClockHandle::kStateShift) == + ClockHandle::kStateVisible) { + // Acquired a read reference + if (h->hashed_key == hashed_key) { + // Match. Set invisible. + old_meta = + h->meta.fetch_and(~(uint64_t{ClockHandle::kStateVisibleBit} + << ClockHandle::kStateShift), + std::memory_order_acq_rel); + // Apply update to local copy + old_meta &= ~(uint64_t{ClockHandle::kStateVisibleBit} + << ClockHandle::kStateShift); + for (;;) { + uint64_t refcount = GetRefcount(old_meta); + assert(refcount > 0); + if (refcount > 1) { + // Not last ref at some point in time during this Erase call + // Pretend we never took the reference + h->meta.fetch_sub(ClockHandle::kAcquireIncrement, + std::memory_order_release); + break; + } else if (h->meta.compare_exchange_weak( + old_meta, + uint64_t{ClockHandle::kStateConstruction} + << ClockHandle::kStateShift, + std::memory_order_acq_rel)) { + // Took ownership + assert(hashed_key == h->hashed_key); + size_t total_charge = h->GetTotalCharge(); + FreeDataMarkEmpty(*h); + ReclaimEntryUsage(total_charge); + // We already have a copy of hashed_key in this case, so OK to + // delay Rollback until after releasing the entry + Rollback(hashed_key, h); + break; + } + } + } else { + // Mismatch. 
Pretend we never took the reference + h->meta.fetch_sub(ClockHandle::kAcquireIncrement, + std::memory_order_release); + } + } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) == + ClockHandle::kStateInvisible)) { + // Pretend we never took the reference + // WART: there's a tiny chance we release last ref to invisible + // entry here. If that happens, we let eviction take care of it. + h->meta.fetch_sub(ClockHandle::kAcquireIncrement, + std::memory_order_release); + } else { + // For other states, incrementing the acquire counter has no effect + // so we don't need to undo it. + } + return false; + }, + [&](HandleImpl* h) { + return h->displacements.load(std::memory_order_relaxed) == 0; + }, + [&](HandleImpl* /*h*/) {}, probe); +} + +void HyperClockTable::ConstApplyToEntriesRange( + std::function<void(const HandleImpl&)> func, size_t index_begin, + size_t index_end, bool apply_if_will_be_deleted) const { + uint64_t check_state_mask = ClockHandle::kStateShareableBit; + if (!apply_if_will_be_deleted) { + check_state_mask |= ClockHandle::kStateVisibleBit; + } + + for (size_t i = index_begin; i < index_end; i++) { + HandleImpl& h = array_[i]; + + // Note: to avoid using compare_exchange, we have to be extra careful. + uint64_t old_meta = h.meta.load(std::memory_order_relaxed); + // Check if it's an entry visible to lookups + if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) { + // Increment acquire counter. Note: it's possible that the entry has + // completely changed since we loaded old_meta, but incrementing acquire + // count is always safe. (Similar to optimistic Lookup here.) + old_meta = h.meta.fetch_add(ClockHandle::kAcquireIncrement, + std::memory_order_acquire); + // Check whether we actually acquired a reference. + if ((old_meta >> ClockHandle::kStateShift) & + ClockHandle::kStateShareableBit) { + // Apply func if appropriate + if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) { + func(h); + } + // Pretend we never took the reference + h.meta.fetch_sub(ClockHandle::kAcquireIncrement, + std::memory_order_release); + // No net change, so don't need to check for overflow + } else { + // For other states, incrementing the acquire counter has no effect + // so we don't need to undo it. Furthermore, we cannot safely undo + // it because we did not acquire a read reference to lock the + // entry in a Shareable state. + } + } + } +} + +void HyperClockTable::EraseUnRefEntries() { + for (size_t i = 0; i <= this->length_bits_mask_; i++) { + HandleImpl& h = array_[i]; + + uint64_t old_meta = h.meta.load(std::memory_order_relaxed); + if (old_meta & (uint64_t{ClockHandle::kStateShareableBit} + << ClockHandle::kStateShift) && + GetRefcount(old_meta) == 0 && + h.meta.compare_exchange_strong(old_meta, + uint64_t{ClockHandle::kStateConstruction} + << ClockHandle::kStateShift, + std::memory_order_acquire)) { + // Took ownership + size_t total_charge = h.GetTotalCharge(); + Rollback(h.hashed_key, &h); + FreeDataMarkEmpty(h); + ReclaimEntryUsage(total_charge); + } + } +} + +inline HyperClockTable::HandleImpl* HyperClockTable::FindSlot( + const UniqueId64x2& hashed_key, std::function<bool(HandleImpl*)> match_fn, + std::function<bool(HandleImpl*)> abort_fn, + std::function<void(HandleImpl*)> update_fn, size_t& probe) { + // NOTE: upper 32 bits of hashed_key[0] is used for sharding + // + // We use double-hashing probing. Every probe in the sequence is a + // pseudorandom integer, computed as a linear function of two random hashes, + // which we call base and increment. 
Specifically, the i-th probe is base + i + // * increment modulo the table size. + size_t base = static_cast<size_t>(hashed_key[1]); + // We use an odd increment, which is relatively prime with the power-of-two + // table size. This implies that we cycle back to the first probe only + // after probing every slot exactly once. + // TODO: we could also reconsider linear probing, though locality benefits + // are limited because each slot is a full cache line + size_t increment = static_cast<size_t>(hashed_key[0]) | 1U; + size_t current = ModTableSize(base + probe * increment); + while (probe <= length_bits_mask_) { + HandleImpl* h = &array_[current]; + if (match_fn(h)) { + probe++; + return h; + } + if (abort_fn(h)) { + return nullptr; + } + probe++; + update_fn(h); + current = ModTableSize(current + increment); + } + // We looped back. + return nullptr; +} + +inline void HyperClockTable::Rollback(const UniqueId64x2& hashed_key, + const HandleImpl* h) { + size_t current = ModTableSize(hashed_key[1]); + size_t increment = static_cast<size_t>(hashed_key[0]) | 1U; + while (&array_[current] != h) { + array_[current].displacements.fetch_sub(1, std::memory_order_relaxed); + current = ModTableSize(current + increment); + } +} + +inline void HyperClockTable::ReclaimEntryUsage(size_t total_charge) { + auto old_occupancy = occupancy_.fetch_sub(1U, std::memory_order_release); + (void)old_occupancy; + // No underflow + assert(old_occupancy > 0); + auto old_usage = usage_.fetch_sub(total_charge, std::memory_order_relaxed); + (void)old_usage; + // No underflow + assert(old_usage >= total_charge); +} + +inline void HyperClockTable::Evict(size_t requested_charge, + size_t* freed_charge, size_t* freed_count) { + // precondition + assert(requested_charge > 0); + + // TODO: make a tuning parameter? + constexpr size_t step_size = 4; + + // First (concurrent) increment clock pointer + uint64_t old_clock_pointer = + clock_pointer_.fetch_add(step_size, std::memory_order_relaxed); + + // Cap the eviction effort at this thread (along with those operating in + // parallel) circling through the whole structure kMaxCountdown times. + // In other words, this eviction run must find something/anything that is + // unreferenced at start of and during the eviction run that isn't reclaimed + // by a concurrent eviction run. 
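+ // For concreteness (hypothetical numbers): kMaxCountdown << length_bits_
+ // equals kMaxCountdown * table_size, so with a 65536 slot table
+ // (length_bits_ == 16) and kMaxCountdown == 3 the bound below lets the
+ // shared clock pointer advance at most 196608 slots past this thread's
+ // starting point before the eviction run gives up.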
+ uint64_t max_clock_pointer = + old_clock_pointer + (ClockHandle::kMaxCountdown << length_bits_); + + for (;;) { + for (size_t i = 0; i < step_size; i++) { + HandleImpl& h = array_[ModTableSize(Lower32of64(old_clock_pointer + i))]; + bool evicting = ClockUpdate(h); + if (evicting) { + Rollback(h.hashed_key, &h); + *freed_charge += h.GetTotalCharge(); + *freed_count += 1; + FreeDataMarkEmpty(h); + } + } + + // Loop exit condition + if (*freed_charge >= requested_charge) { + return; + } + if (old_clock_pointer >= max_clock_pointer) { + return; + } + + // Advance clock pointer (concurrently) + old_clock_pointer = + clock_pointer_.fetch_add(step_size, std::memory_order_relaxed); + } +} + +template <class Table> +ClockCacheShard<Table>::ClockCacheShard( + size_t capacity, bool strict_capacity_limit, + CacheMetadataChargePolicy metadata_charge_policy, + const typename Table::Opts& opts) + : CacheShardBase(metadata_charge_policy), + table_(capacity, strict_capacity_limit, metadata_charge_policy, opts), + capacity_(capacity), + strict_capacity_limit_(strict_capacity_limit) { + // Initial charge metadata should not exceed capacity + assert(table_.GetUsage() <= capacity_ || capacity_ < sizeof(HandleImpl)); +} + +template <class Table> +void ClockCacheShard<Table>::EraseUnRefEntries() { + table_.EraseUnRefEntries(); +} + +template <class Table> +void ClockCacheShard<Table>::ApplyToSomeEntries( + const std::function<void(const Slice& key, void* value, size_t charge, + DeleterFn deleter)>& callback, + size_t average_entries_per_lock, size_t* state) { + // The state is essentially going to be the starting hash, which works + // nicely even if we resize between calls because we use upper-most + // hash bits for table indexes. + size_t length_bits = table_.GetLengthBits(); + size_t length = table_.GetTableSize(); + + assert(average_entries_per_lock > 0); + // Assuming we are called with same average_entries_per_lock repeatedly, + // this simplifies some logic (index_end will not overflow). + assert(average_entries_per_lock < length || *state == 0); + + size_t index_begin = *state >> (sizeof(size_t) * 8u - length_bits); + size_t index_end = index_begin + average_entries_per_lock; + if (index_end >= length) { + // Going to end. 
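+ // Worked example (hypothetical numbers): *state packs the next start index
+ // into the uppermost length_bits bits of a size_t. On a 64-bit size_t with
+ // length_bits == 10, resuming at index 37 means *state == (37 << 54), and
+ // the shift above recovers 37 as index_begin. SIZE_MAX below marks the
+ // scan as finished.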
+ index_end = length; + *state = SIZE_MAX; + } else { + *state = index_end << (sizeof(size_t) * 8u - length_bits); + } + + table_.ConstApplyToEntriesRange( + [callback](const HandleImpl& h) { + UniqueId64x2 unhashed; + callback(ReverseHash(h.hashed_key, &unhashed), h.value, + h.GetTotalCharge(), h.deleter); + }, + index_begin, index_end, false); +} + +int HyperClockTable::CalcHashBits( + size_t capacity, size_t estimated_value_size, + CacheMetadataChargePolicy metadata_charge_policy) { + double average_slot_charge = estimated_value_size * kLoadFactor; + if (metadata_charge_policy == kFullChargeCacheMetadata) { + average_slot_charge += sizeof(HandleImpl); + } + assert(average_slot_charge > 0.0); + uint64_t num_slots = + static_cast<uint64_t>(capacity / average_slot_charge + 0.999999); + + int hash_bits = FloorLog2((num_slots << 1) - 1); + if (metadata_charge_policy == kFullChargeCacheMetadata) { + // For very small estimated value sizes, it's possible to overshoot + while (hash_bits > 0 && + uint64_t{sizeof(HandleImpl)} << hash_bits > capacity) { + hash_bits--; + } + } + return hash_bits; +} + +template <class Table> +void ClockCacheShard<Table>::SetCapacity(size_t capacity) { + capacity_.store(capacity, std::memory_order_relaxed); + // next Insert will take care of any necessary evictions +} + +template <class Table> +void ClockCacheShard<Table>::SetStrictCapacityLimit( + bool strict_capacity_limit) { + strict_capacity_limit_.store(strict_capacity_limit, + std::memory_order_relaxed); + // next Insert will take care of any necessary evictions +} + +template <class Table> +Status ClockCacheShard<Table>::Insert(const Slice& key, + const UniqueId64x2& hashed_key, + void* value, size_t charge, + Cache::DeleterFn deleter, + HandleImpl** handle, + Cache::Priority priority) { + if (UNLIKELY(key.size() != kCacheKeySize)) { + return Status::NotSupported("ClockCache only supports key size " + + std::to_string(kCacheKeySize) + "B"); + } + ClockHandleBasicData proto; + proto.hashed_key = hashed_key; + proto.value = value; + proto.deleter = deleter; + proto.total_charge = charge; + Status s = table_.Insert( + proto, handle, priority, capacity_.load(std::memory_order_relaxed), + strict_capacity_limit_.load(std::memory_order_relaxed)); + return s; +} + +template <class Table> +typename ClockCacheShard<Table>::HandleImpl* ClockCacheShard<Table>::Lookup( + const Slice& key, const UniqueId64x2& hashed_key) { + if (UNLIKELY(key.size() != kCacheKeySize)) { + return nullptr; + } + return table_.Lookup(hashed_key); +} + +template <class Table> +bool ClockCacheShard<Table>::Ref(HandleImpl* h) { + if (h == nullptr) { + return false; + } + table_.Ref(*h); + return true; +} + +template <class Table> +bool ClockCacheShard<Table>::Release(HandleImpl* handle, bool useful, + bool erase_if_last_ref) { + if (handle == nullptr) { + return false; + } + return table_.Release(handle, useful, erase_if_last_ref); +} + +template <class Table> +void ClockCacheShard<Table>::TEST_RefN(HandleImpl* h, size_t n) { + table_.TEST_RefN(*h, n); +} + +template <class Table> +void ClockCacheShard<Table>::TEST_ReleaseN(HandleImpl* h, size_t n) { + table_.TEST_ReleaseN(h, n); +} + +template <class Table> +bool ClockCacheShard<Table>::Release(HandleImpl* handle, + bool erase_if_last_ref) { + return Release(handle, /*useful=*/true, erase_if_last_ref); +} + +template <class Table> +void ClockCacheShard<Table>::Erase(const Slice& key, + const UniqueId64x2& hashed_key) { + if (UNLIKELY(key.size() != kCacheKeySize)) { + return; + } + 
table_.Erase(hashed_key); +} + +template <class Table> +size_t ClockCacheShard<Table>::GetUsage() const { + return table_.GetUsage(); +} + +template <class Table> +size_t ClockCacheShard<Table>::GetDetachedUsage() const { + return table_.GetDetachedUsage(); +} + +template <class Table> +size_t ClockCacheShard<Table>::GetCapacity() const { + return capacity_; +} + +template <class Table> +size_t ClockCacheShard<Table>::GetPinnedUsage() const { + // Computes the pinned usage by scanning the whole hash table. This + // is slow, but avoids keeping an exact counter on the clock usage, + // i.e., the number of not externally referenced elements. + // Why avoid this counter? Because Lookup removes elements from the clock + // list, so it would need to update the pinned usage every time, + // which creates additional synchronization costs. + size_t table_pinned_usage = 0; + const bool charge_metadata = + metadata_charge_policy_ == kFullChargeCacheMetadata; + table_.ConstApplyToEntriesRange( + [&table_pinned_usage, charge_metadata](const HandleImpl& h) { + uint64_t meta = h.meta.load(std::memory_order_relaxed); + uint64_t refcount = GetRefcount(meta); + // Holding one ref for ConstApplyToEntriesRange + assert(refcount > 0); + if (refcount > 1) { + table_pinned_usage += h.GetTotalCharge(); + if (charge_metadata) { + table_pinned_usage += sizeof(HandleImpl); + } + } + }, + 0, table_.GetTableSize(), true); + + return table_pinned_usage + table_.GetDetachedUsage(); +} + +template <class Table> +size_t ClockCacheShard<Table>::GetOccupancyCount() const { + return table_.GetOccupancy(); +} + +template <class Table> +size_t ClockCacheShard<Table>::GetOccupancyLimit() const { + return table_.GetOccupancyLimit(); +} + +template <class Table> +size_t ClockCacheShard<Table>::GetTableAddressCount() const { + return table_.GetTableSize(); +} + +// Explicit instantiation +template class ClockCacheShard<HyperClockTable>; + +HyperClockCache::HyperClockCache( + size_t capacity, size_t estimated_value_size, int num_shard_bits, + bool strict_capacity_limit, + CacheMetadataChargePolicy metadata_charge_policy, + std::shared_ptr<MemoryAllocator> memory_allocator) + : ShardedCache(capacity, num_shard_bits, strict_capacity_limit, + std::move(memory_allocator)) { + assert(estimated_value_size > 0 || + metadata_charge_policy != kDontChargeCacheMetadata); + // TODO: should not need to go through two levels of pointer indirection to + // get to table entries + size_t per_shard = GetPerShardCapacity(); + InitShards([=](Shard* cs) { + HyperClockTable::Opts opts; + opts.estimated_value_size = estimated_value_size; + new (cs) + Shard(per_shard, strict_capacity_limit, metadata_charge_policy, opts); + }); +} + +void* HyperClockCache::Value(Handle* handle) { + return reinterpret_cast<const HandleImpl*>(handle)->value; +} + +size_t HyperClockCache::GetCharge(Handle* handle) const { + return reinterpret_cast<const HandleImpl*>(handle)->GetTotalCharge(); +} + +Cache::DeleterFn HyperClockCache::GetDeleter(Handle* handle) const { + auto h = reinterpret_cast<const HandleImpl*>(handle); + return h->deleter; +} + +namespace { + +// For each cache shard, estimate what the table load factor would be if +// cache filled to capacity with average entries. This is considered +// indicative of a potential problem if the shard is essentially operating +// "at limit", which we define as high actual usage (>80% of capacity) +// or actual occupancy very close to limit (>95% of limit). 
+// Also, for each shard compute the recommended estimated_entry_charge,
+// and keep the minimum one for use as overall recommendation.
+void AddShardEvaluation(const HyperClockCache::Shard& shard,
+ std::vector<double>& predicted_load_factors,
+ size_t& min_recommendation) {
+ size_t usage = shard.GetUsage() - shard.GetDetachedUsage();
+ size_t capacity = shard.GetCapacity();
+ double usage_ratio = 1.0 * usage / capacity;
+
+ size_t occupancy = shard.GetOccupancyCount();
+ size_t occ_limit = shard.GetOccupancyLimit();
+ double occ_ratio = 1.0 * occupancy / occ_limit;
+ if (usage == 0 || occupancy == 0 || (usage_ratio < 0.8 && occ_ratio < 0.95)) {
+ // Skip as described above
+ return;
+ }
+
+ // If filled to capacity, what would the occupancy ratio be?
+ double ratio = occ_ratio / usage_ratio;
+ // Given max load factor, what would that load factor be?
+ double lf = ratio * kStrictLoadFactor;
+ predicted_load_factors.push_back(lf);
+
+ // Update min_recommendation also
+ size_t recommendation = usage / occupancy;
+ min_recommendation = std::min(min_recommendation, recommendation);
+}
+
+} // namespace
+
+void HyperClockCache::ReportProblems(
+ const std::shared_ptr<Logger>& info_log) const {
+ uint32_t shard_count = GetNumShards();
+ std::vector<double> predicted_load_factors;
+ size_t min_recommendation = SIZE_MAX;
+ const_cast<HyperClockCache*>(this)->ForEachShard(
+ [&](HyperClockCache::Shard* shard) {
+ AddShardEvaluation(*shard, predicted_load_factors, min_recommendation);
+ });
+
+ if (predicted_load_factors.empty()) {
+ // None operating "at limit" -> nothing to report
+ return;
+ }
+ std::sort(predicted_load_factors.begin(), predicted_load_factors.end());
+
+ // First, if the average load factor is within spec, we aren't going to
+ // complain about a few shards being out of spec.
+ // NOTE: this is only the average among cache shards operating "at limit,"
+ // which should be representative of what we care about. It is normal, even
+ // desirable, for a cache to operate "at limit" so this should not create
+ // selection bias. See AddShardEvaluation().
+ // TODO: Consider detecting cases where decreasing the number of shards
+ // would be good, e.g. serious imbalance among shards. 
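+ // Numeric sketch (hypothetical values): a shard at 90% usage and 60% of
+ // its occupancy limit contributes a predicted load factor of
+ // (0.6 / 0.9) * kStrictLoadFactor = 0.667 * 0.84, about 0.56, from
+ // AddShardEvaluation() above, comfortably under kLoadFactor (0.7), and a
+ // recommendation equal to usage / occupancy, i.e. that shard's observed
+ // average entry charge.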
+ double average_load_factor = + std::accumulate(predicted_load_factors.begin(), + predicted_load_factors.end(), 0.0) / + shard_count; + + constexpr double kLowSpecLoadFactor = kLoadFactor / 2; + constexpr double kMidSpecLoadFactor = kLoadFactor / 1.414; + if (average_load_factor > kLoadFactor) { + // Out of spec => Consider reporting load factor too high + // Estimate effective overall capacity loss due to enforcing occupancy limit + double lost_portion = 0.0; + int over_count = 0; + for (double lf : predicted_load_factors) { + if (lf > kStrictLoadFactor) { + ++over_count; + lost_portion += (lf - kStrictLoadFactor) / lf / shard_count; + } + } + // >= 20% loss -> error + // >= 10% loss -> consistent warning + // >= 1% loss -> intermittent warning + InfoLogLevel level = InfoLogLevel::INFO_LEVEL; + bool report = true; + if (lost_portion > 0.2) { + level = InfoLogLevel::ERROR_LEVEL; + } else if (lost_portion > 0.1) { + level = InfoLogLevel::WARN_LEVEL; + } else if (lost_portion > 0.01) { + int report_percent = static_cast<int>(lost_portion * 100.0); + if (Random::GetTLSInstance()->PercentTrue(report_percent)) { + level = InfoLogLevel::WARN_LEVEL; + } + } else { + // don't report + report = false; + } + if (report) { + ROCKS_LOG_AT_LEVEL( + info_log, level, + "HyperClockCache@%p unable to use estimated %.1f%% capacity because " + "of " + "full occupancy in %d/%u cache shards (estimated_entry_charge too " + "high). Recommend estimated_entry_charge=%zu", + this, lost_portion * 100.0, over_count, (unsigned)shard_count, + min_recommendation); + } + } else if (average_load_factor < kLowSpecLoadFactor) { + // Out of spec => Consider reporting load factor too low + // But cautiously because low is not as big of a problem. + + // Only report if highest occupancy shard is also below + // spec and only if average is substantially out of spec + if (predicted_load_factors.back() < kLowSpecLoadFactor && + average_load_factor < kLowSpecLoadFactor / 1.414) { + InfoLogLevel level = InfoLogLevel::INFO_LEVEL; + if (average_load_factor < kLowSpecLoadFactor / 2) { + level = InfoLogLevel::WARN_LEVEL; + } + ROCKS_LOG_AT_LEVEL( + info_log, level, + "HyperClockCache@%p table has low occupancy at full capacity. Higher " + "estimated_entry_charge (about %.1fx) would likely improve " + "performance. Recommend estimated_entry_charge=%zu", + this, kMidSpecLoadFactor / average_load_factor, min_recommendation); + } + } +} + +} // namespace clock_cache + +// DEPRECATED (see public API) +std::shared_ptr<Cache> NewClockCache( + size_t capacity, int num_shard_bits, bool strict_capacity_limit, + CacheMetadataChargePolicy metadata_charge_policy) { + return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit, + /* high_pri_pool_ratio */ 0.5, nullptr, + kDefaultToAdaptiveMutex, metadata_charge_policy, + /* low_pri_pool_ratio */ 0.0); +} + +std::shared_ptr<Cache> HyperClockCacheOptions::MakeSharedCache() const { + auto my_num_shard_bits = num_shard_bits; + if (my_num_shard_bits >= 20) { + return nullptr; // The cache cannot be sharded into too many fine pieces. + } + if (my_num_shard_bits < 0) { + // Use larger shard size to reduce risk of large entries clustering + // or skewing individual shards. 
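+ // For concreteness (hypothetical numbers): with the 32 MiB minimum shard
+ // size below, a 1 GiB cache would be split into at most
+ // 1 GiB / 32 MiB = 32 shards, i.e. num_shard_bits around 5, subject to
+ // whatever additional bounds GetDefaultCacheShardBits() applies. A rough
+ // usage sketch of this path (illustrative, not from this file):
+ //   HyperClockCacheOptions opts(1 << 30 /* capacity */,
+ //                               8 * 1024 /* estimated_entry_charge */);
+ //   std::shared_ptr<Cache> cache = opts.MakeSharedCache();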
+ constexpr size_t min_shard_size = 32U * 1024U * 1024U; + my_num_shard_bits = GetDefaultCacheShardBits(capacity, min_shard_size); + } + return std::make_shared<clock_cache::HyperClockCache>( + capacity, estimated_entry_charge, my_num_shard_bits, + strict_capacity_limit, metadata_charge_policy, memory_allocator); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/cache/clock_cache.h b/src/rocksdb/cache/clock_cache.h new file mode 100644 index 000000000..ef1b0ccb7 --- /dev/null +++ b/src/rocksdb/cache/clock_cache.h @@ -0,0 +1,701 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <array> +#include <atomic> +#include <cstddef> +#include <cstdint> +#include <memory> +#include <string> + +#include "cache/cache_key.h" +#include "cache/sharded_cache.h" +#include "port/lang.h" +#include "port/malloc.h" +#include "port/port.h" +#include "rocksdb/cache.h" +#include "rocksdb/secondary_cache.h" +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { + +namespace clock_cache { + +// Forward declaration of friend class. +class ClockCacheTest; + +// HyperClockCache is an alternative to LRUCache specifically tailored for +// use as BlockBasedTableOptions::block_cache +// +// Benefits +// -------- +// * Fully lock free (no waits or spins) for efficiency under high concurrency +// * Optimized for hot path reads. For concurrency control, most Lookup() and +// essentially all Release() are a single atomic add operation. +// * Eviction on insertion is fully parallel and lock-free. +// * Uses a generalized + aging variant of CLOCK eviction that might outperform +// LRU in some cases. (For background, see +// https://en.wikipedia.org/wiki/Page_replacement_algorithm) +// +// Costs +// ----- +// * Hash table is not resizable (for lock-free efficiency) so capacity is not +// dynamically changeable. Rely on an estimated average value (block) size for +// space+time efficiency. (See estimated_entry_charge option details.) +// * Insert usually does not (but might) overwrite a previous entry associated +// with a cache key. This is OK for RocksDB uses of Cache. +// * Only supports keys of exactly 16 bytes, which is what RocksDB uses for +// block cache (not row cache or table cache). +// * SecondaryCache is not supported. +// * Cache priorities are less aggressively enforced. Unlike LRUCache, enough +// transient LOW or BOTTOM priority items can evict HIGH priority entries that +// are not referenced recently (or often) enough. +// * If pinned entries leave little or nothing eligible for eviction, +// performance can degrade substantially, because of clock eviction eating +// CPU looking for evictable entries and because Release does not +// pro-actively delete unreferenced entries when the cache is over-full. +// Specifically, this makes this implementation more susceptible to the +// following combination: +// * num_shard_bits is high (e.g. 6) +// * capacity small (e.g. some MBs) +// * some large individual entries (e.g. non-partitioned filters) +// where individual entries occupy a large portion of their shard capacity. 
+// This should be mostly mitigated by the implementation picking a lower +// number of cache shards than LRUCache for a given capacity (when +// num_shard_bits is not overridden; see calls to GetDefaultCacheShardBits()). +// * With strict_capacity_limit=false, respecting the capacity limit is not as +// aggressive as LRUCache. The limit might be transiently exceeded by a very +// small number of entries even when not strictly necessary, and slower to +// recover after pinning forces limit to be substantially exceeded. (Even with +// strict_capacity_limit=true, RocksDB will nevertheless transiently allocate +// memory before discovering it is over the block cache capacity, so this +// should not be a detectable regression in respecting memory limits, except +// on exceptionally small caches.) +// * In some cases, erased or duplicated entries might not be freed +// immediately. They will eventually be freed by eviction from further Inserts. +// * Internal metadata can overflow if the number of simultaneous references +// to a cache handle reaches many millions. +// +// High-level eviction algorithm +// ----------------------------- +// A score (or "countdown") is maintained for each entry, initially determined +// by priority. The score is incremented on each Lookup, up to a max of 3, +// though is easily returned to previous state if useful=false with Release. +// During CLOCK-style eviction iteration, entries with score > 0 are +// decremented if currently unreferenced and entries with score == 0 are +// evicted if currently unreferenced. Note that scoring might not be perfect +// because entries can be referenced transiently within the cache even when +// there are no outside references to the entry. +// +// Cache sharding like LRUCache is used to reduce contention on usage+eviction +// state, though here the performance improvement from more shards is small, +// and (as noted above) potentially detrimental if shard capacity is too close +// to largest entry size. Here cache sharding mostly only affects cache update +// (Insert / Erase) performance, not read performance. +// +// Read efficiency (hot path) +// -------------------------- +// Mostly to minimize the cost of accessing metadata blocks with +// cache_index_and_filter_blocks=true, we focus on optimizing Lookup and +// Release. In terms of concurrency, at a minimum, these operations have +// to do reference counting (and Lookup has to compare full keys in a safe +// way). Can we fold in all the other metadata tracking *for free* with +// Lookup and Release doing a simple atomic fetch_add/fetch_sub? (Assume +// for the moment that Lookup succeeds on the first probe.) +// +// We have a clever way of encoding an entry's reference count and countdown +// clock so that Lookup and Release are each usually a single atomic addition. +// In a single metadata word we have both an "acquire" count, incremented by +// Lookup, and a "release" count, incremented by Release. If useful=false, +// Release can instead decrement the acquire count. Thus the current ref +// count is (acquires - releases), and the countdown clock is min(3, acquires). +// Note that only unreferenced entries (acquires == releases) are eligible +// for CLOCK manipulation and eviction. We tolerate use of more expensive +// compare_exchange operations for cache writes (insertions and erasures). +// +// In a cache receiving many reads and little or no writes, it is possible +// for the acquire and release counters to overflow. 
Assuming the *current* +// refcount never reaches to many millions, we only have to correct for +// overflow in both counters in Release, not in Lookup. The overflow check +// should be only 1-2 CPU cycles per Release because it is a predictable +// branch on a simple condition on data already in registers. +// +// Slot states +// ----------- +// We encode a state indicator into the same metadata word with the +// acquire and release counters. This allows bigger state transitions to +// be atomic. States: +// +// * Empty - slot is not in use and unowned. All other metadata and data is +// in an undefined state. +// * Construction - slot is exclusively owned by one thread, the thread +// successfully entering this state, for populating or freeing data. +// * Shareable (group) - slot holds an entry with counted references for +// pinning and reading, including +// * Visible - slot holds an entry that can be returned by Lookup +// * Invisible - slot holds an entry that is not visible to Lookup +// (erased by user) but can be read by existing references, and ref count +// changed by Ref and Release. +// +// A special case is "detached" entries, which are heap-allocated handles +// not in the table. They are always Invisible and freed on zero refs. +// +// State transitions: +// Empty -> Construction (in Insert): The encoding of state enables Insert to +// perform an optimistic atomic bitwise-or to take ownership if a slot is +// empty, or otherwise make no state change. +// +// Construction -> Visible (in Insert): This can be a simple assignment to the +// metadata word because the current thread has exclusive ownership and other +// metadata is meaningless. +// +// Visible -> Invisible (in Erase): This can be a bitwise-and while holding +// a shared reference, which is safe because the change is idempotent (in case +// of parallel Erase). By the way, we never go Invisible->Visible. +// +// Shareable -> Construction (in Evict part of Insert, in Erase, and in +// Release if Invisible): This is for starting to freeing/deleting an +// unreferenced entry. We have to use compare_exchange to ensure we only make +// this transition when there are zero refs. +// +// Construction -> Empty (in same places): This is for completing free/delete +// of an entry. A "release" atomic store suffices, as we have exclusive +// ownership of the slot but have to ensure none of the data member reads are +// re-ordered after committing the state transition. +// +// Insert +// ------ +// If Insert were to guarantee replacing an existing entry for a key, there +// would be complications for concurrency and efficiency. First, consider how +// many probes to get to an entry. To ensure Lookup never waits and +// availability of a key is uninterrupted, we would need to use a different +// slot for a new entry for the same key. This means it is most likely in a +// later probing position than the old version, which should soon be removed. +// (Also, an entry is too big to replace atomically, even if no current refs.) +// +// However, overwrite capability is not really needed by RocksDB. Also, we +// know from our "redundant" stats that overwrites are very rare for the block +// cache, so we should not spend much to make them effective. +// +// So instead we Insert as soon as we find an empty slot in the probing +// sequence without seeing an existing (visible) entry for the same key. 
This +// way we only insert if we can improve the probing performance, and we don't +// need to probe beyond our insert position, assuming we are willing to let +// the previous entry for the same key die of old age (eventual eviction from +// not being used). We can reach a similar state with concurrent insertions, +// where one will pass over the other while it is "under construction." +// This temporary duplication is acceptable for RocksDB block cache because +// we know redundant insertion is rare. +// +// Another problem to solve is what to return to the caller when we find an +// existing entry whose probing position we cannot improve on, or when the +// table occupancy limit has been reached. If strict_capacity_limit=false, +// we must never fail Insert, and if a Handle* is provided, we have to return +// a usable Cache handle on success. The solution to this (typically rare) +// problem is "detached" handles, which are usable by the caller but not +// actually available for Lookup in the Cache. Detached handles are allocated +// independently on the heap and specially marked so that they are freed on +// the heap when their last reference is released. +// +// Usage on capacity +// ----------------- +// Insert takes different approaches to usage tracking depending on +// strict_capacity_limit setting. If true, we enforce a kind of strong +// consistency where compare-exchange is used to ensure the usage number never +// exceeds its limit, and provide threads with an authoritative signal on how +// much "usage" they have taken ownership of. With strict_capacity_limit=false, +// we use a kind of "eventual consistency" where all threads Inserting to the +// same cache shard might race on reserving the same space, but the +// over-commitment will be worked out in later insertions. It is kind of a +// dance because we don't want threads racing each other too much on paying +// down the over-commitment (with eviction) either. +// +// Eviction +// -------- +// A key part of Insert is evicting some entries currently unreferenced to +// make room for new entries. The high-level eviction algorithm is described +// above, but the details are also interesting. A key part is parallelizing +// eviction with a single CLOCK pointer. This works by each thread working on +// eviction pre-emptively incrementing the CLOCK pointer, and then CLOCK- +// updating or evicting the incremented-over slot(s). To reduce contention at +// the cost of possibly evicting too much, each thread increments the clock +// pointer by 4, so commits to updating at least 4 slots per batch. As +// described above, a CLOCK update will decrement the "countdown" of +// unreferenced entries, or evict unreferenced entries with zero countdown. +// Referenced entries are not updated, because we (presumably) don't want +// long-referenced entries to age while referenced. Note however that we +// cannot distinguish transiently referenced entries from cache user +// references, so some CLOCK updates might be somewhat arbitrarily skipped. +// This is OK as long as it is rare enough that eviction order is still +// pretty good. +// +// There is no synchronization on the completion of the CLOCK updates, so it +// is theoretically possible for another thread to cycle back around and have +// two threads racing on CLOCK updates to the same slot. Thus, we cannot rely +// on any implied exclusivity to make the updates or eviction more efficient. 
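+// (A rough, simplified sketch of one such racy update attempt, not the
+// actual ClockUpdate code: with zero refs the countdown equals both
+// counters, so a single decrement can be tried as
+//   uint64_t expected = h.meta.load(std::memory_order_relaxed);
+//   uint64_t desired = expected - ClockHandle::kAcquireIncrement -
+//                      ClockHandle::kReleaseIncrement;
+//   h.meta.compare_exchange_strong(expected, desired);
+// and a failed exchange is simply ignored.)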
+// These updates use an opportunistic compare-exchange (no loop), where a +// racing thread might cause the update to be skipped without retry, but in +// such case the update is likely not needed because the most likely update +// to an entry is that it has become referenced. (TODO: test efficiency of +// avoiding compare-exchange loop) +// +// Release +// ------- +// In the common case, Release is a simple atomic increment of the release +// counter. There is a simple overflow check that only does another atomic +// update in extremely rare cases, so costs almost nothing. +// +// If the Release specifies "not useful", we can instead decrement the +// acquire counter, which returns to the same CLOCK state as before Lookup +// or Ref. +// +// Adding a check for over-full cache on every release to zero-refs would +// likely be somewhat expensive, increasing read contention on cache shard +// metadata. Instead we are less aggressive about deleting entries right +// away in those cases. +// +// However Release tries to immediately delete entries reaching zero refs +// if (a) erase_if_last_ref is set by the caller, or (b) the entry is already +// marked invisible. Both of these are checks on values already in CPU +// registers so do not increase cross-CPU contention when not applicable. +// When applicable, they use a compare-exchange loop to take exclusive +// ownership of the slot for freeing the entry. These are rare cases +// that should not usually affect performance. +// +// Erase +// ----- +// Searches for an entry like Lookup but moves it to Invisible state if found. +// This state transition is with bit operations so is idempotent and safely +// done while only holding a shared "read" reference. Like Release, it makes +// a best effort to immediately release an Invisible entry that reaches zero +// refs, but there are some corner cases where it will only be freed by the +// clock eviction process. + +// ----------------------------------------------------------------------- // + +// The load factor p is a real number in (0, 1) such that at all +// times at most a fraction p of all slots, without counting tombstones, +// are occupied by elements. This means that the probability that a random +// probe hits an occupied slot is at most p, and thus at most 1/p probes +// are required on average. For example, p = 70% implies that between 1 and 2 +// probes are needed on average (bear in mind that this reasoning doesn't +// consider the effects of clustering over time, which should be negligible +// with double hashing). +// Because the size of the hash table is always rounded up to the next +// power of 2, p is really an upper bound on the actual load factor---the +// actual load factor is anywhere between p/2 and p. This is a bit wasteful, +// but bear in mind that slots only hold metadata, not actual values. +// Since space cost is dominated by the values (the LSM blocks), +// overprovisioning the table with metadata only increases the total cache space +// usage by a tiny fraction. +constexpr double kLoadFactor = 0.7; + +// The user can exceed kLoadFactor if the sizes of the inserted values don't +// match estimated_value_size, or in some rare cases with +// strict_capacity_limit == false. To avoid degenerate performance, we set a +// strict upper bound on the load factor. +constexpr double kStrictLoadFactor = 0.84; + +struct ClockHandleBasicData { + void* value = nullptr; + Cache::DeleterFn deleter = nullptr; + // A lossless, reversible hash of the fixed-size (16 byte) cache key. 
This + // eliminates the need to store a hash separately. + UniqueId64x2 hashed_key = kNullUniqueId64x2; + size_t total_charge = 0; + + // For total_charge_and_flags + // "Detached" means the handle is allocated separately from hash table. + static constexpr uint64_t kFlagDetached = uint64_t{1} << 63; + // Extract just the total charge + static constexpr uint64_t kTotalChargeMask = kFlagDetached - 1; + + inline size_t GetTotalCharge() const { return total_charge; } + + // Calls deleter (if non-null) on cache key and value + void FreeData() const; + + // Required by concept HandleImpl + const UniqueId64x2& GetHash() const { return hashed_key; } +}; + +struct ClockHandle : public ClockHandleBasicData { + // Constants for handling the atomic `meta` word, which tracks most of the + // state of the handle. The meta word looks like this: + // low bits high bits + // ----------------------------------------------------------------------- + // | acquire counter | release counter | state marker | + // ----------------------------------------------------------------------- + + // For reading or updating counters in meta word. + static constexpr uint8_t kCounterNumBits = 30; + static constexpr uint64_t kCounterMask = (uint64_t{1} << kCounterNumBits) - 1; + + static constexpr uint8_t kAcquireCounterShift = 0; + static constexpr uint64_t kAcquireIncrement = uint64_t{1} + << kAcquireCounterShift; + static constexpr uint8_t kReleaseCounterShift = kCounterNumBits; + static constexpr uint64_t kReleaseIncrement = uint64_t{1} + << kReleaseCounterShift; + + // For reading or updating the state marker in meta word + static constexpr uint8_t kStateShift = 2U * kCounterNumBits; + + // Bits contribution to state marker. + // Occupied means any state other than empty + static constexpr uint8_t kStateOccupiedBit = 0b100; + // Shareable means the entry is reference counted (visible or invisible) + // (only set if also occupied) + static constexpr uint8_t kStateShareableBit = 0b010; + // Visible is only set if also shareable + static constexpr uint8_t kStateVisibleBit = 0b001; + + // Complete state markers (not shifted into full word) + static constexpr uint8_t kStateEmpty = 0b000; + static constexpr uint8_t kStateConstruction = kStateOccupiedBit; + static constexpr uint8_t kStateInvisible = + kStateOccupiedBit | kStateShareableBit; + static constexpr uint8_t kStateVisible = + kStateOccupiedBit | kStateShareableBit | kStateVisibleBit; + + // Constants for initializing the countdown clock. (Countdown clock is only + // in effect with zero refs, acquire counter == release counter, and in that + // case the countdown clock == both of those counters.) + static constexpr uint8_t kHighCountdown = 3; + static constexpr uint8_t kLowCountdown = 2; + static constexpr uint8_t kBottomCountdown = 1; + // During clock update, treat any countdown clock value greater than this + // value the same as this value. + static constexpr uint8_t kMaxCountdown = kHighCountdown; + // TODO: make these coundown values tuning parameters for eviction? + + // See above + std::atomic<uint64_t> meta{}; + + // Anticipating use for SecondaryCache support + void* reserved_for_future_use = nullptr; +}; // struct ClockHandle + +class HyperClockTable { + public: + // Target size to be exactly a common cache line size (see static_assert in + // clock_cache.cc) + struct ALIGN_AS(64U) HandleImpl : public ClockHandle { + // The number of elements that hash to this slot or a lower one, but wind + // up in this slot or a higher one. 
+ std::atomic<uint32_t> displacements{}; + + // Whether this is a "deteched" handle that is independently allocated + // with `new` (so must be deleted with `delete`). + // TODO: ideally this would be packed into some other data field, such + // as upper bits of total_charge, but that incurs a measurable performance + // regression. + bool detached = false; + + inline bool IsDetached() const { return detached; } + + inline void SetDetached() { detached = true; } + }; // struct HandleImpl + + struct Opts { + size_t estimated_value_size; + }; + + HyperClockTable(size_t capacity, bool strict_capacity_limit, + CacheMetadataChargePolicy metadata_charge_policy, + const Opts& opts); + ~HyperClockTable(); + + Status Insert(const ClockHandleBasicData& proto, HandleImpl** handle, + Cache::Priority priority, size_t capacity, + bool strict_capacity_limit); + + HandleImpl* Lookup(const UniqueId64x2& hashed_key); + + bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref); + + void Ref(HandleImpl& handle); + + void Erase(const UniqueId64x2& hashed_key); + + void ConstApplyToEntriesRange(std::function<void(const HandleImpl&)> func, + size_t index_begin, size_t index_end, + bool apply_if_will_be_deleted) const; + + void EraseUnRefEntries(); + + size_t GetTableSize() const { return size_t{1} << length_bits_; } + + int GetLengthBits() const { return length_bits_; } + + size_t GetOccupancy() const { + return occupancy_.load(std::memory_order_relaxed); + } + + size_t GetOccupancyLimit() const { return occupancy_limit_; } + + size_t GetUsage() const { return usage_.load(std::memory_order_relaxed); } + + size_t GetDetachedUsage() const { + return detached_usage_.load(std::memory_order_relaxed); + } + + // Acquire/release N references + void TEST_RefN(HandleImpl& handle, size_t n); + void TEST_ReleaseN(HandleImpl* handle, size_t n); + + private: // functions + // Returns x mod 2^{length_bits_}. + inline size_t ModTableSize(uint64_t x) { + return static_cast<size_t>(x) & length_bits_mask_; + } + + // Runs the clock eviction algorithm trying to reclaim at least + // requested_charge. Returns how much is evicted, which could be less + // if it appears impossible to evict the requested amount without blocking. + inline void Evict(size_t requested_charge, size_t* freed_charge, + size_t* freed_count); + + // Returns the first slot in the probe sequence, starting from the given + // probe number, with a handle e such that match(e) is true. At every + // step, the function first tests whether match(e) holds. If this is false, + // it evaluates abort(e) to decide whether the search should be aborted, + // and in the affirmative returns -1. For every handle e probed except + // the last one, the function runs update(e). + // The probe parameter is modified as follows. We say a probe to a handle + // e is aborting if match(e) is false and abort(e) is true. Then the final + // value of probe is one more than the last non-aborting probe during the + // call. This is so that that the variable can be used to keep track of + // progress across consecutive calls to FindSlot. 
+ inline HandleImpl* FindSlot(const UniqueId64x2& hashed_key, + std::function<bool(HandleImpl*)> match, + std::function<bool(HandleImpl*)> stop, + std::function<void(HandleImpl*)> update, + size_t& probe); + + // Re-decrement all displacements in probe path starting from beginning + // until (not including) the given handle + inline void Rollback(const UniqueId64x2& hashed_key, const HandleImpl* h); + + // Subtracts `total_charge` from `usage_` and 1 from `occupancy_`. + // Ideally this comes after releasing the entry itself so that we + // actually have the available occupancy/usage that is claimed. + // However, that means total_charge has to be saved from the handle + // before releasing it so that it can be provided to this function. + inline void ReclaimEntryUsage(size_t total_charge); + + // Helper for updating `usage_` for new entry with given `total_charge` + // and evicting if needed under strict_capacity_limit=true rules. This + // means the operation might fail with Status::MemoryLimit. If + // `need_evict_for_occupancy`, then eviction of at least one entry is + // required, and the operation should fail if not possible. + // NOTE: Otherwise, occupancy_ is not managed in this function + inline Status ChargeUsageMaybeEvictStrict(size_t total_charge, + size_t capacity, + bool need_evict_for_occupancy); + + // Helper for updating `usage_` for new entry with given `total_charge` + // and evicting if needed under strict_capacity_limit=false rules. This + // means that updating `usage_` always succeeds even if forced to exceed + // capacity. If `need_evict_for_occupancy`, then eviction of at least one + // entry is required, and the operation should return false if such eviction + // is not possible. `usage_` is not updated in that case. Otherwise, returns + // true, indicating success. + // NOTE: occupancy_ is not managed in this function + inline bool ChargeUsageMaybeEvictNonStrict(size_t total_charge, + size_t capacity, + bool need_evict_for_occupancy); + + // Creates a "detached" handle for returning from an Insert operation that + // cannot be completed by actually inserting into the table. + // Updates `detached_usage_` but not `usage_` nor `occupancy_`. + inline HandleImpl* DetachedInsert(const ClockHandleBasicData& proto); + + // Returns the number of bits used to hash an element in the hash + // table. + static int CalcHashBits(size_t capacity, size_t estimated_value_size, + CacheMetadataChargePolicy metadata_charge_policy); + + private: // data + // Number of hash bits used for table index. + // The size of the table is 1 << length_bits_. + const int length_bits_; + + // For faster computation of ModTableSize. + const size_t length_bits_mask_; + + // Maximum number of elements the user can store in the table. + const size_t occupancy_limit_; + + // Array of slots comprising the hash table. + const std::unique_ptr<HandleImpl[]> array_; + + // We partition the following members into different cache lines + // to avoid false sharing among Lookup, Release, Erase and Insert + // operations in ClockCacheShard. + + ALIGN_AS(CACHE_LINE_SIZE) + // Clock algorithm sweep pointer. + std::atomic<uint64_t> clock_pointer_{}; + + ALIGN_AS(CACHE_LINE_SIZE) + // Number of elements in the table. 
+ std::atomic<size_t> occupancy_{}; + + // Memory usage by entries tracked by the cache (including detached) + std::atomic<size_t> usage_{}; + + // Part of usage by detached entries (not in table) + std::atomic<size_t> detached_usage_{}; +}; // class HyperClockTable + +// A single shard of sharded cache. +template <class Table> +class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase { + public: + ClockCacheShard(size_t capacity, bool strict_capacity_limit, + CacheMetadataChargePolicy metadata_charge_policy, + const typename Table::Opts& opts); + + // For CacheShard concept + using HandleImpl = typename Table::HandleImpl; + // Hash is lossless hash of 128-bit key + using HashVal = UniqueId64x2; + using HashCref = const HashVal&; + static inline uint32_t HashPieceForSharding(HashCref hash) { + return Upper32of64(hash[0]); + } + static inline HashVal ComputeHash(const Slice& key) { + assert(key.size() == kCacheKeySize); + HashVal in; + HashVal out; + // NOTE: endian dependence + // TODO: use GetUnaligned? + std::memcpy(&in, key.data(), kCacheKeySize); + BijectiveHash2x64(in[1], in[0], &out[1], &out[0]); + return out; + } + + // For reconstructing key from hashed_key. Requires the caller to provide + // backing storage for the Slice in `unhashed` + static inline Slice ReverseHash(const UniqueId64x2& hashed, + UniqueId64x2* unhashed) { + BijectiveUnhash2x64(hashed[1], hashed[0], &(*unhashed)[1], &(*unhashed)[0]); + // NOTE: endian dependence + return Slice(reinterpret_cast<const char*>(unhashed), kCacheKeySize); + } + + // Although capacity is dynamically changeable, the number of table slots is + // not, so growing capacity substantially could lead to hitting occupancy + // limit. + void SetCapacity(size_t capacity); + + void SetStrictCapacityLimit(bool strict_capacity_limit); + + Status Insert(const Slice& key, const UniqueId64x2& hashed_key, void* value, + size_t charge, Cache::DeleterFn deleter, HandleImpl** handle, + Cache::Priority priority); + + HandleImpl* Lookup(const Slice& key, const UniqueId64x2& hashed_key); + + bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref); + + bool Release(HandleImpl* handle, bool erase_if_last_ref = false); + + bool Ref(HandleImpl* handle); + + void Erase(const Slice& key, const UniqueId64x2& hashed_key); + + size_t GetCapacity() const; + + size_t GetUsage() const; + + size_t GetDetachedUsage() const; + + size_t GetPinnedUsage() const; + + size_t GetOccupancyCount() const; + + size_t GetOccupancyLimit() const; + + size_t GetTableAddressCount() const; + + void ApplyToSomeEntries( + const std::function<void(const Slice& key, void* value, size_t charge, + DeleterFn deleter)>& callback, + size_t average_entries_per_lock, size_t* state); + + void EraseUnRefEntries(); + + std::string GetPrintableOptions() const { return std::string{}; } + + // SecondaryCache not yet supported + Status Insert(const Slice& key, const UniqueId64x2& hashed_key, void* value, + const Cache::CacheItemHelper* helper, size_t charge, + HandleImpl** handle, Cache::Priority priority) { + return Insert(key, hashed_key, value, charge, helper->del_cb, handle, + priority); + } + + HandleImpl* Lookup(const Slice& key, const UniqueId64x2& hashed_key, + const Cache::CacheItemHelper* /*helper*/, + const Cache::CreateCallback& /*create_cb*/, + Cache::Priority /*priority*/, bool /*wait*/, + Statistics* /*stats*/) { + return Lookup(key, hashed_key); + } + + bool IsReady(HandleImpl* /*handle*/) { return true; } + + void Wait(HandleImpl* /*handle*/) {} + + // 
Acquire/release N references + void TEST_RefN(HandleImpl* handle, size_t n); + void TEST_ReleaseN(HandleImpl* handle, size_t n); + + private: // data + Table table_; + + // Maximum total charge of all elements stored in the table. + std::atomic<size_t> capacity_; + + // Whether to reject insertion if cache reaches its full capacity. + std::atomic<bool> strict_capacity_limit_; +}; // class ClockCacheShard + +class HyperClockCache +#ifdef NDEBUG + final +#endif + : public ShardedCache<ClockCacheShard<HyperClockTable>> { + public: + using Shard = ClockCacheShard<HyperClockTable>; + + HyperClockCache(size_t capacity, size_t estimated_value_size, + int num_shard_bits, bool strict_capacity_limit, + CacheMetadataChargePolicy metadata_charge_policy, + std::shared_ptr<MemoryAllocator> memory_allocator); + + const char* Name() const override { return "HyperClockCache"; } + + void* Value(Handle* handle) override; + + size_t GetCharge(Handle* handle) const override; + + DeleterFn GetDeleter(Handle* handle) const override; + + void ReportProblems( + const std::shared_ptr<Logger>& /*info_log*/) const override; +}; // class HyperClockCache + +} // namespace clock_cache + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/cache/compressed_secondary_cache.cc b/src/rocksdb/cache/compressed_secondary_cache.cc new file mode 100644 index 000000000..7d1bdc789 --- /dev/null +++ b/src/rocksdb/cache/compressed_secondary_cache.cc @@ -0,0 +1,325 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "cache/compressed_secondary_cache.h" + +#include <algorithm> +#include <cstdint> +#include <memory> + +#include "memory/memory_allocator.h" +#include "monitoring/perf_context_imp.h" +#include "util/compression.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +CompressedSecondaryCache::CompressedSecondaryCache( + size_t capacity, int num_shard_bits, bool strict_capacity_limit, + double high_pri_pool_ratio, double low_pri_pool_ratio, + std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex, + CacheMetadataChargePolicy metadata_charge_policy, + CompressionType compression_type, uint32_t compress_format_version, + bool enable_custom_split_merge) + : cache_options_(capacity, num_shard_bits, strict_capacity_limit, + high_pri_pool_ratio, low_pri_pool_ratio, memory_allocator, + use_adaptive_mutex, metadata_charge_policy, + compression_type, compress_format_version, + enable_custom_split_merge) { + cache_ = + NewLRUCache(capacity, num_shard_bits, strict_capacity_limit, + high_pri_pool_ratio, memory_allocator, use_adaptive_mutex, + metadata_charge_policy, low_pri_pool_ratio); +} + +CompressedSecondaryCache::~CompressedSecondaryCache() { cache_.reset(); } + +std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup( + const Slice& key, const Cache::CreateCallback& create_cb, bool /*wait*/, + bool advise_erase, bool& is_in_sec_cache) { + std::unique_ptr<SecondaryCacheResultHandle> handle; + is_in_sec_cache = false; + Cache::Handle* lru_handle = cache_->Lookup(key); + if (lru_handle == nullptr) { + return nullptr; + } + + void* handle_value = cache_->Value(lru_handle); + if (handle_value == nullptr) { + cache_->Release(lru_handle, /*erase_if_last_ref=*/false); + return nullptr; + } + + CacheAllocationPtr* ptr{nullptr}; + CacheAllocationPtr 
merged_value; + size_t handle_value_charge{0}; + if (cache_options_.enable_custom_split_merge) { + CacheValueChunk* value_chunk_ptr = + reinterpret_cast<CacheValueChunk*>(handle_value); + merged_value = MergeChunksIntoValue(value_chunk_ptr, handle_value_charge); + ptr = &merged_value; + } else { + ptr = reinterpret_cast<CacheAllocationPtr*>(handle_value); + handle_value_charge = cache_->GetCharge(lru_handle); + } + + Status s; + void* value{nullptr}; + size_t charge{0}; + if (cache_options_.compression_type == kNoCompression) { + s = create_cb(ptr->get(), handle_value_charge, &value, &charge); + } else { + UncompressionContext uncompression_context(cache_options_.compression_type); + UncompressionInfo uncompression_info(uncompression_context, + UncompressionDict::GetEmptyDict(), + cache_options_.compression_type); + + size_t uncompressed_size{0}; + CacheAllocationPtr uncompressed = UncompressData( + uncompression_info, (char*)ptr->get(), handle_value_charge, + &uncompressed_size, cache_options_.compress_format_version, + cache_options_.memory_allocator.get()); + + if (!uncompressed) { + cache_->Release(lru_handle, /*erase_if_last_ref=*/true); + return nullptr; + } + s = create_cb(uncompressed.get(), uncompressed_size, &value, &charge); + } + + if (!s.ok()) { + cache_->Release(lru_handle, /*erase_if_last_ref=*/true); + return nullptr; + } + + if (advise_erase) { + cache_->Release(lru_handle, /*erase_if_last_ref=*/true); + // Insert a dummy handle. + cache_ + ->Insert(key, /*value=*/nullptr, /*charge=*/0, + GetDeletionCallback(cache_options_.enable_custom_split_merge)) + .PermitUncheckedError(); + } else { + is_in_sec_cache = true; + cache_->Release(lru_handle, /*erase_if_last_ref=*/false); + } + handle.reset(new CompressedSecondaryCacheResultHandle(value, charge)); + return handle; +} + +Status CompressedSecondaryCache::Insert(const Slice& key, void* value, + const Cache::CacheItemHelper* helper) { + if (value == nullptr) { + return Status::InvalidArgument(); + } + + Cache::Handle* lru_handle = cache_->Lookup(key); + Cache::DeleterFn del_cb = + GetDeletionCallback(cache_options_.enable_custom_split_merge); + if (lru_handle == nullptr) { + PERF_COUNTER_ADD(compressed_sec_cache_insert_dummy_count, 1); + // Insert a dummy handle if the handle is evicted for the first time. 
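  // The zero-charge dummy entry acts as a "seen before" marker: only when the
  // same key is inserted again while its dummy is still resident (the else
  // branch and the code below) is the value actually serialized, optionally
  // compressed, and charged against the cache capacity.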
+ return cache_->Insert(key, /*value=*/nullptr, /*charge=*/0, del_cb); + } else { + cache_->Release(lru_handle, /*erase_if_last_ref=*/false); + } + + size_t size = (*helper->size_cb)(value); + CacheAllocationPtr ptr = + AllocateBlock(size, cache_options_.memory_allocator.get()); + + Status s = (*helper->saveto_cb)(value, 0, size, ptr.get()); + if (!s.ok()) { + return s; + } + Slice val(ptr.get(), size); + + std::string compressed_val; + if (cache_options_.compression_type != kNoCompression) { + PERF_COUNTER_ADD(compressed_sec_cache_uncompressed_bytes, size); + CompressionOptions compression_opts; + CompressionContext compression_context(cache_options_.compression_type); + uint64_t sample_for_compression{0}; + CompressionInfo compression_info( + compression_opts, compression_context, CompressionDict::GetEmptyDict(), + cache_options_.compression_type, sample_for_compression); + + bool success = + CompressData(val, compression_info, + cache_options_.compress_format_version, &compressed_val); + + if (!success) { + return Status::Corruption("Error compressing value."); + } + + val = Slice(compressed_val); + size = compressed_val.size(); + PERF_COUNTER_ADD(compressed_sec_cache_compressed_bytes, size); + + if (!cache_options_.enable_custom_split_merge) { + ptr = AllocateBlock(size, cache_options_.memory_allocator.get()); + memcpy(ptr.get(), compressed_val.data(), size); + } + } + + PERF_COUNTER_ADD(compressed_sec_cache_insert_real_count, 1); + if (cache_options_.enable_custom_split_merge) { + size_t charge{0}; + CacheValueChunk* value_chunks_head = + SplitValueIntoChunks(val, cache_options_.compression_type, charge); + return cache_->Insert(key, value_chunks_head, charge, del_cb); + } else { + CacheAllocationPtr* buf = new CacheAllocationPtr(std::move(ptr)); + return cache_->Insert(key, buf, size, del_cb); + } +} + +void CompressedSecondaryCache::Erase(const Slice& key) { cache_->Erase(key); } + +Status CompressedSecondaryCache::SetCapacity(size_t capacity) { + MutexLock l(&capacity_mutex_); + cache_options_.capacity = capacity; + cache_->SetCapacity(capacity); + return Status::OK(); +} + +Status CompressedSecondaryCache::GetCapacity(size_t& capacity) { + MutexLock l(&capacity_mutex_); + capacity = cache_options_.capacity; + return Status::OK(); +} + +std::string CompressedSecondaryCache::GetPrintableOptions() const { + std::string ret; + ret.reserve(20000); + const int kBufferSize{200}; + char buffer[kBufferSize]; + ret.append(cache_->GetPrintableOptions()); + snprintf(buffer, kBufferSize, " compression_type : %s\n", + CompressionTypeToString(cache_options_.compression_type).c_str()); + ret.append(buffer); + snprintf(buffer, kBufferSize, " compress_format_version : %d\n", + cache_options_.compress_format_version); + ret.append(buffer); + return ret; +} + +CompressedSecondaryCache::CacheValueChunk* +CompressedSecondaryCache::SplitValueIntoChunks(const Slice& value, + CompressionType compression_type, + size_t& charge) { + assert(!value.empty()); + const char* src_ptr = value.data(); + size_t src_size{value.size()}; + + CacheValueChunk dummy_head = CacheValueChunk(); + CacheValueChunk* current_chunk = &dummy_head; + // Do not split when value size is large or there is no compression. 
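  // Worked example of the splitting loop below, assuming
  // sizeof(CacheValueChunk) == 24 as the unit test implies: an 8500-byte
  // value predicts 8523 bytes -> bin 8192 (payload 8169), then 354 -> bin 256
  // (payload 233), then 121 is below the smallest bin and is kept whole
  // (payload 98). Total charge = 8192 + 256 + 121 = 8500 + 3 * 23.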
+ size_t predicted_chunk_size{0}; + size_t actual_chunk_size{0}; + size_t tmp_size{0}; + while (src_size > 0) { + predicted_chunk_size = sizeof(CacheValueChunk) - 1 + src_size; + auto upper = + std::upper_bound(malloc_bin_sizes_.begin(), malloc_bin_sizes_.end(), + predicted_chunk_size); + // Do not split when value size is too small, too large, close to a bin + // size, or there is no compression. + if (upper == malloc_bin_sizes_.begin() || + upper == malloc_bin_sizes_.end() || + *upper - predicted_chunk_size < malloc_bin_sizes_.front() || + compression_type == kNoCompression) { + tmp_size = predicted_chunk_size; + } else { + tmp_size = *(--upper); + } + + CacheValueChunk* new_chunk = + reinterpret_cast<CacheValueChunk*>(new char[tmp_size]); + current_chunk->next = new_chunk; + current_chunk = current_chunk->next; + actual_chunk_size = tmp_size - sizeof(CacheValueChunk) + 1; + memcpy(current_chunk->data, src_ptr, actual_chunk_size); + current_chunk->size = actual_chunk_size; + src_ptr += actual_chunk_size; + src_size -= actual_chunk_size; + charge += tmp_size; + } + current_chunk->next = nullptr; + + return dummy_head.next; +} + +CacheAllocationPtr CompressedSecondaryCache::MergeChunksIntoValue( + const void* chunks_head, size_t& charge) { + const CacheValueChunk* head = + reinterpret_cast<const CacheValueChunk*>(chunks_head); + const CacheValueChunk* current_chunk = head; + charge = 0; + while (current_chunk != nullptr) { + charge += current_chunk->size; + current_chunk = current_chunk->next; + } + + CacheAllocationPtr ptr = + AllocateBlock(charge, cache_options_.memory_allocator.get()); + current_chunk = head; + size_t pos{0}; + while (current_chunk != nullptr) { + memcpy(ptr.get() + pos, current_chunk->data, current_chunk->size); + pos += current_chunk->size; + current_chunk = current_chunk->next; + } + + return ptr; +} + +Cache::DeleterFn CompressedSecondaryCache::GetDeletionCallback( + bool enable_custom_split_merge) { + if (enable_custom_split_merge) { + return [](const Slice& /*key*/, void* obj) { + CacheValueChunk* chunks_head = reinterpret_cast<CacheValueChunk*>(obj); + while (chunks_head != nullptr) { + CacheValueChunk* tmp_chunk = chunks_head; + chunks_head = chunks_head->next; + tmp_chunk->Free(); + obj = nullptr; + }; + }; + } else { + return [](const Slice& /*key*/, void* obj) { + delete reinterpret_cast<CacheAllocationPtr*>(obj); + obj = nullptr; + }; + } +} + +std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache( + size_t capacity, int num_shard_bits, bool strict_capacity_limit, + double high_pri_pool_ratio, double low_pri_pool_ratio, + std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex, + CacheMetadataChargePolicy metadata_charge_policy, + CompressionType compression_type, uint32_t compress_format_version, + bool enable_custom_split_merge) { + return std::make_shared<CompressedSecondaryCache>( + capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio, + low_pri_pool_ratio, memory_allocator, use_adaptive_mutex, + metadata_charge_policy, compression_type, compress_format_version, + enable_custom_split_merge); +} + +std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache( + const CompressedSecondaryCacheOptions& opts) { + // The secondary_cache is disabled for this LRUCache instance. 
+ assert(opts.secondary_cache == nullptr); + return NewCompressedSecondaryCache( + opts.capacity, opts.num_shard_bits, opts.strict_capacity_limit, + opts.high_pri_pool_ratio, opts.low_pri_pool_ratio, opts.memory_allocator, + opts.use_adaptive_mutex, opts.metadata_charge_policy, + opts.compression_type, opts.compress_format_version, + opts.enable_custom_split_merge); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/cache/compressed_secondary_cache.h b/src/rocksdb/cache/compressed_secondary_cache.h new file mode 100644 index 000000000..4dee38802 --- /dev/null +++ b/src/rocksdb/cache/compressed_secondary_cache.h @@ -0,0 +1,139 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <array> +#include <cstddef> +#include <memory> + +#include "cache/lru_cache.h" +#include "memory/memory_allocator.h" +#include "rocksdb/secondary_cache.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "util/compression.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +class CompressedSecondaryCacheResultHandle : public SecondaryCacheResultHandle { + public: + CompressedSecondaryCacheResultHandle(void* value, size_t size) + : value_(value), size_(size) {} + ~CompressedSecondaryCacheResultHandle() override = default; + + CompressedSecondaryCacheResultHandle( + const CompressedSecondaryCacheResultHandle&) = delete; + CompressedSecondaryCacheResultHandle& operator=( + const CompressedSecondaryCacheResultHandle&) = delete; + + bool IsReady() override { return true; } + + void Wait() override {} + + void* Value() override { return value_; } + + size_t Size() override { return size_; } + + private: + void* value_; + size_t size_; +}; + +// The CompressedSecondaryCache is a concrete implementation of +// rocksdb::SecondaryCache. +// +// When a block is found from CompressedSecondaryCache::Lookup, we check whether +// there is a dummy block with the same key in the primary cache. +// 1. If the dummy block exits, we erase the block from +// CompressedSecondaryCache and insert it into the primary cache. +// 2. If not, we just insert a dummy block into the primary cache +// (charging the actual size of the block) and don not erase the block from +// CompressedSecondaryCache. A standalone handle is returned to the caller. +// +// When a block is evicted from the primary cache, we check whether +// there is a dummy block with the same key in CompressedSecondaryCache. +// 1. If the dummy block exits, the block is inserted into +// CompressedSecondaryCache. +// 2. If not, we just insert a dummy block (size 0) in CompressedSecondaryCache. +// +// Users can also cast a pointer to CompressedSecondaryCache and call methods on +// it directly, especially custom methods that may be added +// in the future. 
For example - +// std::unique_ptr<rocksdb::SecondaryCache> cache = +// NewCompressedSecondaryCache(opts); +// static_cast<CompressedSecondaryCache*>(cache.get())->Erase(key); + +class CompressedSecondaryCache : public SecondaryCache { + public: + CompressedSecondaryCache( + size_t capacity, int num_shard_bits, bool strict_capacity_limit, + double high_pri_pool_ratio, double low_pri_pool_ratio, + std::shared_ptr<MemoryAllocator> memory_allocator = nullptr, + bool use_adaptive_mutex = kDefaultToAdaptiveMutex, + CacheMetadataChargePolicy metadata_charge_policy = + kDefaultCacheMetadataChargePolicy, + CompressionType compression_type = CompressionType::kLZ4Compression, + uint32_t compress_format_version = 2, + bool enable_custom_split_merge = false); + ~CompressedSecondaryCache() override; + + const char* Name() const override { return "CompressedSecondaryCache"; } + + Status Insert(const Slice& key, void* value, + const Cache::CacheItemHelper* helper) override; + + std::unique_ptr<SecondaryCacheResultHandle> Lookup( + const Slice& key, const Cache::CreateCallback& create_cb, bool /*wait*/, + bool advise_erase, bool& is_in_sec_cache) override; + + bool SupportForceErase() const override { return true; } + + void Erase(const Slice& key) override; + + void WaitAll(std::vector<SecondaryCacheResultHandle*> /*handles*/) override {} + + Status SetCapacity(size_t capacity) override; + + Status GetCapacity(size_t& capacity) override; + + std::string GetPrintableOptions() const override; + + private: + friend class CompressedSecondaryCacheTest; + static constexpr std::array<uint16_t, 8> malloc_bin_sizes_{ + 128, 256, 512, 1024, 2048, 4096, 8192, 16384}; + + struct CacheValueChunk { + // TODO try "CacheAllocationPtr next;". + CacheValueChunk* next; + size_t size; + // Beginning of the chunk data (MUST BE THE LAST FIELD IN THIS STRUCT!) + char data[1]; + + void Free() { delete[] reinterpret_cast<char*>(this); } + }; + + // Split value into chunks to better fit into jemalloc bins. The chunks + // are stored in CacheValueChunk and extra charge is needed for each chunk, + // so the cache charge is recalculated here. + CacheValueChunk* SplitValueIntoChunks(const Slice& value, + CompressionType compression_type, + size_t& charge); + + // After merging chunks, the extra charge for each chunk is removed, so + // the charge is recalculated. + CacheAllocationPtr MergeChunksIntoValue(const void* chunks_head, + size_t& charge); + + // An implementation of Cache::DeleterFn. + static Cache::DeleterFn GetDeletionCallback(bool enable_custom_split_merge); + std::shared_ptr<Cache> cache_; + CompressedSecondaryCacheOptions cache_options_; + mutable port::Mutex capacity_mutex_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/cache/compressed_secondary_cache_test.cc b/src/rocksdb/cache/compressed_secondary_cache_test.cc new file mode 100644 index 000000000..574c257a7 --- /dev/null +++ b/src/rocksdb/cache/compressed_secondary_cache_test.cc @@ -0,0 +1,1005 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
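The header above shows how CompressedSecondaryCache is constructed and how it exposes the standard SecondaryCache interface. As a rough editorial illustration (not part of this patch), the snippet below sketches how such a cache is typically wired in behind a primary LRU block cache, using only the factories declared in these files; the capacities, shard counts, and compression type are placeholder values, and the header paths follow this source tree.

#include <memory>

#include "rocksdb/cache.h"
#include "rocksdb/secondary_cache.h"

// Build a primary LRU block cache with a compressed secondary tier behind it.
std::shared_ptr<ROCKSDB_NAMESPACE::Cache> MakeTieredBlockCache() {
  using namespace ROCKSDB_NAMESPACE;

  CompressedSecondaryCacheOptions sec_opts;
  sec_opts.capacity = 64 << 20;  // 64 MiB of compressed, evicted blocks
  sec_opts.num_shard_bits = 0;
  sec_opts.compression_type = kLZ4Compression;
  std::shared_ptr<SecondaryCache> sec_cache =
      NewCompressedSecondaryCache(sec_opts);

  LRUCacheOptions lru_opts;
  lru_opts.capacity = 32 << 20;  // 32 MiB of uncompressed blocks
  lru_opts.num_shard_bits = 0;
  // Blocks evicted from the primary cache are offered to the compressed tier,
  // and primary-cache misses consult it before going to storage.
  lru_opts.secondary_cache = sec_cache;
  return NewLRUCache(lru_opts);
}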
+ +#include "cache/compressed_secondary_cache.h" + +#include <iterator> +#include <memory> +#include <tuple> + +#include "memory/jemalloc_nodump_allocator.h" +#include "rocksdb/convenience.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" + +namespace ROCKSDB_NAMESPACE { + +class CompressedSecondaryCacheTest : public testing::Test { + public: + CompressedSecondaryCacheTest() : fail_create_(false) {} + ~CompressedSecondaryCacheTest() override = default; + + protected: + class TestItem { + public: + TestItem(const char* buf, size_t size) : buf_(new char[size]), size_(size) { + memcpy(buf_.get(), buf, size); + } + ~TestItem() = default; + + char* Buf() { return buf_.get(); } + [[nodiscard]] size_t Size() const { return size_; } + + private: + std::unique_ptr<char[]> buf_; + size_t size_; + }; + + static size_t SizeCallback(void* obj) { + return reinterpret_cast<TestItem*>(obj)->Size(); + } + + static Status SaveToCallback(void* from_obj, size_t from_offset, + size_t length, void* out) { + auto item = reinterpret_cast<TestItem*>(from_obj); + const char* buf = item->Buf(); + EXPECT_EQ(length, item->Size()); + EXPECT_EQ(from_offset, 0); + memcpy(out, buf, length); + return Status::OK(); + } + + static void DeletionCallback(const Slice& /*key*/, void* obj) { + delete reinterpret_cast<TestItem*>(obj); + obj = nullptr; + } + + static Cache::CacheItemHelper helper_; + + static Status SaveToCallbackFail(void* /*obj*/, size_t /*offset*/, + size_t /*size*/, void* /*out*/) { + return Status::NotSupported(); + } + + static Cache::CacheItemHelper helper_fail_; + + Cache::CreateCallback test_item_creator = [&](const void* buf, size_t size, + void** out_obj, + size_t* charge) -> Status { + if (fail_create_) { + return Status::NotSupported(); + } + *out_obj = reinterpret_cast<void*>(new TestItem((char*)buf, size)); + *charge = size; + return Status::OK(); + }; + + void SetFailCreate(bool fail) { fail_create_ = fail; } + + void BasicTestHelper(std::shared_ptr<SecondaryCache> sec_cache, + bool sec_cache_is_compressed) { + get_perf_context()->Reset(); + bool is_in_sec_cache{true}; + // Lookup an non-existent key. + std::unique_ptr<SecondaryCacheResultHandle> handle0 = sec_cache->Lookup( + "k0", test_item_creator, true, /*advise_erase=*/true, is_in_sec_cache); + ASSERT_EQ(handle0, nullptr); + + Random rnd(301); + // Insert and Lookup the item k1 for the first time. + std::string str1(rnd.RandomString(1000)); + TestItem item1(str1.data(), str1.length()); + // A dummy handle is inserted if the item is inserted for the first time. + ASSERT_OK(sec_cache->Insert("k1", &item1, + &CompressedSecondaryCacheTest::helper_)); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 1); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0); + + std::unique_ptr<SecondaryCacheResultHandle> handle1_1 = sec_cache->Lookup( + "k1", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache); + ASSERT_EQ(handle1_1, nullptr); + + // Insert and Lookup the item k1 for the second time and advise erasing it. 
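    // Because k1's dummy from the first Insert is still resident, this second
    // Insert stores the real (possibly compressed) item. The Lookup below with
    // advise_erase=true returns the value and replaces the stored entry with a
    // dummy again, which is why the third Lookup further down misses.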
+ ASSERT_OK(sec_cache->Insert("k1", &item1, + &CompressedSecondaryCacheTest::helper_)); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1); + + std::unique_ptr<SecondaryCacheResultHandle> handle1_2 = sec_cache->Lookup( + "k1", test_item_creator, true, /*advise_erase=*/true, is_in_sec_cache); + ASSERT_NE(handle1_2, nullptr); + ASSERT_FALSE(is_in_sec_cache); + if (sec_cache_is_compressed) { + ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, + 1000); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, + 1007); + } else { + ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0); + } + + std::unique_ptr<TestItem> val1 = + std::unique_ptr<TestItem>(static_cast<TestItem*>(handle1_2->Value())); + ASSERT_NE(val1, nullptr); + ASSERT_EQ(memcmp(val1->Buf(), item1.Buf(), item1.Size()), 0); + + // Lookup the item k1 again. + std::unique_ptr<SecondaryCacheResultHandle> handle1_3 = sec_cache->Lookup( + "k1", test_item_creator, true, /*advise_erase=*/true, is_in_sec_cache); + ASSERT_EQ(handle1_3, nullptr); + + // Insert and Lookup the item k2. + std::string str2(rnd.RandomString(1000)); + TestItem item2(str2.data(), str2.length()); + ASSERT_OK(sec_cache->Insert("k2", &item2, + &CompressedSecondaryCacheTest::helper_)); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 2); + std::unique_ptr<SecondaryCacheResultHandle> handle2_1 = sec_cache->Lookup( + "k2", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache); + ASSERT_EQ(handle2_1, nullptr); + + ASSERT_OK(sec_cache->Insert("k2", &item2, + &CompressedSecondaryCacheTest::helper_)); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 2); + if (sec_cache_is_compressed) { + ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, + 2000); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, + 2014); + } else { + ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0); + } + std::unique_ptr<SecondaryCacheResultHandle> handle2_2 = sec_cache->Lookup( + "k2", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache); + ASSERT_NE(handle2_2, nullptr); + std::unique_ptr<TestItem> val2 = + std::unique_ptr<TestItem>(static_cast<TestItem*>(handle2_2->Value())); + ASSERT_NE(val2, nullptr); + ASSERT_EQ(memcmp(val2->Buf(), item2.Buf(), item2.Size()), 0); + + std::vector<SecondaryCacheResultHandle*> handles = {handle1_2.get(), + handle2_2.get()}; + sec_cache->WaitAll(handles); + + sec_cache.reset(); + } + + void BasicTest(bool sec_cache_is_compressed, bool use_jemalloc) { + CompressedSecondaryCacheOptions opts; + opts.capacity = 2048; + opts.num_shard_bits = 0; + + if (sec_cache_is_compressed) { + if (!LZ4_Supported()) { + ROCKSDB_GTEST_SKIP("This test requires LZ4 support."); + opts.compression_type = CompressionType::kNoCompression; + sec_cache_is_compressed = false; + } + } else { + opts.compression_type = CompressionType::kNoCompression; + } + + if (use_jemalloc) { + JemallocAllocatorOptions jopts; + std::shared_ptr<MemoryAllocator> allocator; + std::string msg; + if (JemallocNodumpAllocator::IsSupported(&msg)) { + Status s = NewJemallocNodumpAllocator(jopts, &allocator); + if (s.ok()) { + opts.memory_allocator = allocator; + } + } else { + ROCKSDB_GTEST_BYPASS("JEMALLOC not supported"); + } + } + std::shared_ptr<SecondaryCache> 
sec_cache = + NewCompressedSecondaryCache(opts); + + BasicTestHelper(sec_cache, sec_cache_is_compressed); + } + + void FailsTest(bool sec_cache_is_compressed) { + CompressedSecondaryCacheOptions secondary_cache_opts; + if (sec_cache_is_compressed) { + if (!LZ4_Supported()) { + ROCKSDB_GTEST_SKIP("This test requires LZ4 support."); + secondary_cache_opts.compression_type = CompressionType::kNoCompression; + } + } else { + secondary_cache_opts.compression_type = CompressionType::kNoCompression; + } + + secondary_cache_opts.capacity = 1100; + secondary_cache_opts.num_shard_bits = 0; + std::shared_ptr<SecondaryCache> sec_cache = + NewCompressedSecondaryCache(secondary_cache_opts); + + // Insert and Lookup the first item. + Random rnd(301); + std::string str1(rnd.RandomString(1000)); + TestItem item1(str1.data(), str1.length()); + // Insert a dummy handle. + ASSERT_OK(sec_cache->Insert("k1", &item1, + &CompressedSecondaryCacheTest::helper_)); + // Insert k1. + ASSERT_OK(sec_cache->Insert("k1", &item1, + &CompressedSecondaryCacheTest::helper_)); + + // Insert and Lookup the second item. + std::string str2(rnd.RandomString(200)); + TestItem item2(str2.data(), str2.length()); + // Insert a dummy handle, k1 is not evicted. + ASSERT_OK(sec_cache->Insert("k2", &item2, + &CompressedSecondaryCacheTest::helper_)); + bool is_in_sec_cache{false}; + std::unique_ptr<SecondaryCacheResultHandle> handle1 = sec_cache->Lookup( + "k1", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache); + ASSERT_EQ(handle1, nullptr); + + // Insert k2 and k1 is evicted. + ASSERT_OK(sec_cache->Insert("k2", &item2, + &CompressedSecondaryCacheTest::helper_)); + std::unique_ptr<SecondaryCacheResultHandle> handle2 = sec_cache->Lookup( + "k2", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache); + ASSERT_NE(handle2, nullptr); + std::unique_ptr<TestItem> val2 = + std::unique_ptr<TestItem>(static_cast<TestItem*>(handle2->Value())); + ASSERT_NE(val2, nullptr); + ASSERT_EQ(memcmp(val2->Buf(), item2.Buf(), item2.Size()), 0); + + // Insert k1 again and a dummy handle is inserted. + ASSERT_OK(sec_cache->Insert("k1", &item1, + &CompressedSecondaryCacheTest::helper_)); + + std::unique_ptr<SecondaryCacheResultHandle> handle1_1 = sec_cache->Lookup( + "k1", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache); + ASSERT_EQ(handle1_1, nullptr); + + // Create Fails. + SetFailCreate(true); + std::unique_ptr<SecondaryCacheResultHandle> handle2_1 = sec_cache->Lookup( + "k2", test_item_creator, true, /*advise_erase=*/true, is_in_sec_cache); + ASSERT_EQ(handle2_1, nullptr); + + // Save Fails. + std::string str3 = rnd.RandomString(10); + TestItem item3(str3.data(), str3.length()); + // The Status is OK because a dummy handle is inserted. 
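    // The first Insert only stores the zero-charge dummy marker, so the
    // failing save callback is never invoked; the second Insert attempts the
    // real store, calls SaveToCallbackFail, and returns NotSupported.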
+ ASSERT_OK(sec_cache->Insert("k3", &item3, + &CompressedSecondaryCacheTest::helper_fail_)); + ASSERT_NOK(sec_cache->Insert("k3", &item3, + &CompressedSecondaryCacheTest::helper_fail_)); + + sec_cache.reset(); + } + + void BasicIntegrationTest(bool sec_cache_is_compressed, + bool enable_custom_split_merge) { + CompressedSecondaryCacheOptions secondary_cache_opts; + + if (sec_cache_is_compressed) { + if (!LZ4_Supported()) { + ROCKSDB_GTEST_SKIP("This test requires LZ4 support."); + secondary_cache_opts.compression_type = CompressionType::kNoCompression; + sec_cache_is_compressed = false; + } + } else { + secondary_cache_opts.compression_type = CompressionType::kNoCompression; + } + + secondary_cache_opts.capacity = 6000; + secondary_cache_opts.num_shard_bits = 0; + secondary_cache_opts.enable_custom_split_merge = enable_custom_split_merge; + std::shared_ptr<SecondaryCache> secondary_cache = + NewCompressedSecondaryCache(secondary_cache_opts); + LRUCacheOptions lru_cache_opts( + /*_capacity =*/1300, /*_num_shard_bits =*/0, + /*_strict_capacity_limit =*/false, /*_high_pri_pool_ratio =*/0.5, + /*_memory_allocator =*/nullptr, kDefaultToAdaptiveMutex, + kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio =*/0.0); + lru_cache_opts.secondary_cache = secondary_cache; + std::shared_ptr<Cache> cache = NewLRUCache(lru_cache_opts); + std::shared_ptr<Statistics> stats = CreateDBStatistics(); + + get_perf_context()->Reset(); + Random rnd(301); + std::string str1 = rnd.RandomString(1001); + auto item1_1 = new TestItem(str1.data(), str1.length()); + ASSERT_OK(cache->Insert( + "k1", item1_1, &CompressedSecondaryCacheTest::helper_, str1.length())); + + std::string str2 = rnd.RandomString(1012); + auto item2_1 = new TestItem(str2.data(), str2.length()); + // After this Insert, primary cache contains k2 and secondary cache contains + // k1's dummy item. + ASSERT_OK(cache->Insert( + "k2", item2_1, &CompressedSecondaryCacheTest::helper_, str2.length())); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 1); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0); + + std::string str3 = rnd.RandomString(1024); + auto item3_1 = new TestItem(str3.data(), str3.length()); + // After this Insert, primary cache contains k3 and secondary cache contains + // k1's dummy item and k2's dummy item. + ASSERT_OK(cache->Insert( + "k3", item3_1, &CompressedSecondaryCacheTest::helper_, str3.length())); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 2); + + // After this Insert, primary cache contains k1 and secondary cache contains + // k1's dummy item, k2's dummy item, and k3's dummy item. + auto item1_2 = new TestItem(str1.data(), str1.length()); + ASSERT_OK(cache->Insert( + "k1", item1_2, &CompressedSecondaryCacheTest::helper_, str1.length())); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 3); + + // After this Insert, primary cache contains k2 and secondary cache contains + // k1's item, k2's dummy item, and k3's dummy item. 
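    // This is the first point where a real compressed entry (k1's) lands in
    // the secondary cache: k1's dummy was still resident when this Insert
    // evicted k1 from the primary cache, as the perf counters below confirm.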
+ auto item2_2 = new TestItem(str2.data(), str2.length()); + ASSERT_OK(cache->Insert( + "k2", item2_2, &CompressedSecondaryCacheTest::helper_, str2.length())); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1); + if (sec_cache_is_compressed) { + ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, + str1.length()); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, + 1008); + } else { + ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0); + } + + // After this Insert, primary cache contains k3 and secondary cache contains + // k1's item and k2's item. + auto item3_2 = new TestItem(str3.data(), str3.length()); + ASSERT_OK(cache->Insert( + "k3", item3_2, &CompressedSecondaryCacheTest::helper_, str3.length())); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 2); + if (sec_cache_is_compressed) { + ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, + str1.length() + str2.length()); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, + 2027); + } else { + ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0); + } + + Cache::Handle* handle; + handle = cache->Lookup("k3", &CompressedSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true, + stats.get()); + ASSERT_NE(handle, nullptr); + auto val3 = static_cast<TestItem*>(cache->Value(handle)); + ASSERT_NE(val3, nullptr); + ASSERT_EQ(memcmp(val3->Buf(), item3_2->Buf(), item3_2->Size()), 0); + cache->Release(handle); + + // Lookup an non-existent key. + handle = cache->Lookup("k0", &CompressedSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true, + stats.get()); + ASSERT_EQ(handle, nullptr); + + // This Lookup should just insert a dummy handle in the primary cache + // and the k1 is still in the secondary cache. + handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true, + stats.get()); + ASSERT_NE(handle, nullptr); + ASSERT_EQ(get_perf_context()->block_cache_standalone_handle_count, 1); + auto val1_1 = static_cast<TestItem*>(cache->Value(handle)); + ASSERT_NE(val1_1, nullptr); + ASSERT_EQ(memcmp(val1_1->Buf(), str1.data(), str1.size()), 0); + cache->Release(handle); + + // This Lookup should erase k1 from the secondary cache and insert + // it into primary cache; then k3 is demoted. + // k2 and k3 are in secondary cache. + handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true, + stats.get()); + ASSERT_NE(handle, nullptr); + ASSERT_EQ(get_perf_context()->block_cache_standalone_handle_count, 1); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 3); + cache->Release(handle); + + // k2 is still in secondary cache. + handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true, + stats.get()); + ASSERT_NE(handle, nullptr); + ASSERT_EQ(get_perf_context()->block_cache_standalone_handle_count, 2); + cache->Release(handle); + + // Testing SetCapacity(). 
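    // Setting the capacity to 0 evicts everything from the secondary cache's
    // internal LRU, so the k3 lookup below misses; the capacity is then
    // restored to 7000 and read back via GetCapacity().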
+ ASSERT_OK(secondary_cache->SetCapacity(0)); + handle = cache->Lookup("k3", &CompressedSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true, + stats.get()); + ASSERT_EQ(handle, nullptr); + + ASSERT_OK(secondary_cache->SetCapacity(7000)); + size_t capacity; + ASSERT_OK(secondary_cache->GetCapacity(capacity)); + ASSERT_EQ(capacity, 7000); + auto item1_3 = new TestItem(str1.data(), str1.length()); + // After this Insert, primary cache contains k1. + ASSERT_OK(cache->Insert( + "k1", item1_3, &CompressedSecondaryCacheTest::helper_, str2.length())); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 3); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 4); + + auto item2_3 = new TestItem(str2.data(), str2.length()); + // After this Insert, primary cache contains k2 and secondary cache contains + // k1's dummy item. + ASSERT_OK(cache->Insert( + "k2", item2_3, &CompressedSecondaryCacheTest::helper_, str1.length())); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 4); + + auto item1_4 = new TestItem(str1.data(), str1.length()); + // After this Insert, primary cache contains k1 and secondary cache contains + // k1's dummy item and k2's dummy item. + ASSERT_OK(cache->Insert( + "k1", item1_4, &CompressedSecondaryCacheTest::helper_, str2.length())); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 5); + + auto item2_4 = new TestItem(str2.data(), str2.length()); + // After this Insert, primary cache contains k2 and secondary cache contains + // k1's real item and k2's dummy item. + ASSERT_OK(cache->Insert( + "k2", item2_4, &CompressedSecondaryCacheTest::helper_, str2.length())); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 5); + // This Lookup should just insert a dummy handle in the primary cache + // and the k1 is still in the secondary cache. 
+ handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true, + stats.get()); + + ASSERT_NE(handle, nullptr); + cache->Release(handle); + ASSERT_EQ(get_perf_context()->block_cache_standalone_handle_count, 3); + + cache.reset(); + secondary_cache.reset(); + } + + void BasicIntegrationFailTest(bool sec_cache_is_compressed) { + CompressedSecondaryCacheOptions secondary_cache_opts; + + if (sec_cache_is_compressed) { + if (!LZ4_Supported()) { + ROCKSDB_GTEST_SKIP("This test requires LZ4 support."); + secondary_cache_opts.compression_type = CompressionType::kNoCompression; + } + } else { + secondary_cache_opts.compression_type = CompressionType::kNoCompression; + } + + secondary_cache_opts.capacity = 6000; + secondary_cache_opts.num_shard_bits = 0; + std::shared_ptr<SecondaryCache> secondary_cache = + NewCompressedSecondaryCache(secondary_cache_opts); + + LRUCacheOptions opts( + /*_capacity=*/1300, /*_num_shard_bits=*/0, + /*_strict_capacity_limit=*/false, /*_high_pri_pool_ratio=*/0.5, + /*_memory_allocator=*/nullptr, kDefaultToAdaptiveMutex, + kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio=*/0.0); + opts.secondary_cache = secondary_cache; + std::shared_ptr<Cache> cache = NewLRUCache(opts); + + Random rnd(301); + std::string str1 = rnd.RandomString(1001); + auto item1 = std::make_unique<TestItem>(str1.data(), str1.length()); + ASSERT_NOK(cache->Insert("k1", item1.get(), nullptr, str1.length())); + ASSERT_OK(cache->Insert("k1", item1.get(), + &CompressedSecondaryCacheTest::helper_, + str1.length())); + item1.release(); // Appease clang-analyze "potential memory leak" + + Cache::Handle* handle; + handle = cache->Lookup("k2", nullptr, test_item_creator, + Cache::Priority::LOW, true); + ASSERT_EQ(handle, nullptr); + handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, false); + ASSERT_EQ(handle, nullptr); + + cache.reset(); + secondary_cache.reset(); + } + + void IntegrationSaveFailTest(bool sec_cache_is_compressed) { + CompressedSecondaryCacheOptions secondary_cache_opts; + + if (sec_cache_is_compressed) { + if (!LZ4_Supported()) { + ROCKSDB_GTEST_SKIP("This test requires LZ4 support."); + secondary_cache_opts.compression_type = CompressionType::kNoCompression; + } + } else { + secondary_cache_opts.compression_type = CompressionType::kNoCompression; + } + + secondary_cache_opts.capacity = 6000; + secondary_cache_opts.num_shard_bits = 0; + + std::shared_ptr<SecondaryCache> secondary_cache = + NewCompressedSecondaryCache(secondary_cache_opts); + + LRUCacheOptions opts( + /*_capacity=*/1300, /*_num_shard_bits=*/0, + /*_strict_capacity_limit=*/false, /*_high_pri_pool_ratio=*/0.5, + /*_memory_allocator=*/nullptr, kDefaultToAdaptiveMutex, + kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio=*/0.0); + opts.secondary_cache = secondary_cache; + std::shared_ptr<Cache> cache = NewLRUCache(opts); + + Random rnd(301); + std::string str1 = rnd.RandomString(1001); + auto item1 = new TestItem(str1.data(), str1.length()); + ASSERT_OK(cache->Insert("k1", item1, + &CompressedSecondaryCacheTest::helper_fail_, + str1.length())); + + std::string str2 = rnd.RandomString(1002); + auto item2 = new TestItem(str2.data(), str2.length()); + // k1 should be demoted to the secondary cache. 
+ ASSERT_OK(cache->Insert("k2", item2, + &CompressedSecondaryCacheTest::helper_fail_, + str2.length())); + + Cache::Handle* handle; + handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_fail_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + // This lookup should fail, since k1 demotion would have failed. + handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_fail_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_EQ(handle, nullptr); + // Since k1 was not promoted, k2 should still be in cache. + handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_fail_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + + cache.reset(); + secondary_cache.reset(); + } + + void IntegrationCreateFailTest(bool sec_cache_is_compressed) { + CompressedSecondaryCacheOptions secondary_cache_opts; + + if (sec_cache_is_compressed) { + if (!LZ4_Supported()) { + ROCKSDB_GTEST_SKIP("This test requires LZ4 support."); + secondary_cache_opts.compression_type = CompressionType::kNoCompression; + } + } else { + secondary_cache_opts.compression_type = CompressionType::kNoCompression; + } + + secondary_cache_opts.capacity = 6000; + secondary_cache_opts.num_shard_bits = 0; + + std::shared_ptr<SecondaryCache> secondary_cache = + NewCompressedSecondaryCache(secondary_cache_opts); + + LRUCacheOptions opts( + /*_capacity=*/1300, /*_num_shard_bits=*/0, + /*_strict_capacity_limit=*/false, /*_high_pri_pool_ratio=*/0.5, + /*_memory_allocator=*/nullptr, kDefaultToAdaptiveMutex, + kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio=*/0.0); + opts.secondary_cache = secondary_cache; + std::shared_ptr<Cache> cache = NewLRUCache(opts); + + Random rnd(301); + std::string str1 = rnd.RandomString(1001); + auto item1 = new TestItem(str1.data(), str1.length()); + ASSERT_OK(cache->Insert("k1", item1, &CompressedSecondaryCacheTest::helper_, + str1.length())); + + std::string str2 = rnd.RandomString(1002); + auto item2 = new TestItem(str2.data(), str2.length()); + // k1 should be demoted to the secondary cache. 
+ ASSERT_OK(cache->Insert("k2", item2, &CompressedSecondaryCacheTest::helper_, + str2.length())); + + Cache::Handle* handle; + SetFailCreate(true); + handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + // This lookup should fail, since k1 creation would have failed + handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_EQ(handle, nullptr); + // Since k1 didn't get promoted, k2 should still be in cache + handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + + cache.reset(); + secondary_cache.reset(); + } + + void IntegrationFullCapacityTest(bool sec_cache_is_compressed) { + CompressedSecondaryCacheOptions secondary_cache_opts; + + if (sec_cache_is_compressed) { + if (!LZ4_Supported()) { + ROCKSDB_GTEST_SKIP("This test requires LZ4 support."); + secondary_cache_opts.compression_type = CompressionType::kNoCompression; + } + } else { + secondary_cache_opts.compression_type = CompressionType::kNoCompression; + } + + secondary_cache_opts.capacity = 6000; + secondary_cache_opts.num_shard_bits = 0; + + std::shared_ptr<SecondaryCache> secondary_cache = + NewCompressedSecondaryCache(secondary_cache_opts); + + LRUCacheOptions opts( + /*_capacity=*/1300, /*_num_shard_bits=*/0, + /*_strict_capacity_limit=*/false, /*_high_pri_pool_ratio=*/0.5, + /*_memory_allocator=*/nullptr, kDefaultToAdaptiveMutex, + kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio=*/0.0); + opts.secondary_cache = secondary_cache; + std::shared_ptr<Cache> cache = NewLRUCache(opts); + + Random rnd(301); + std::string str1 = rnd.RandomString(1001); + auto item1_1 = new TestItem(str1.data(), str1.length()); + ASSERT_OK(cache->Insert( + "k1", item1_1, &CompressedSecondaryCacheTest::helper_, str1.length())); + + std::string str2 = rnd.RandomString(1002); + std::string str2_clone{str2}; + auto item2 = new TestItem(str2.data(), str2.length()); + // After this Insert, primary cache contains k2 and secondary cache contains + // k1's dummy item. + ASSERT_OK(cache->Insert("k2", item2, &CompressedSecondaryCacheTest::helper_, + str2.length())); + + // After this Insert, primary cache contains k1 and secondary cache contains + // k1's dummy item and k2's dummy item. + auto item1_2 = new TestItem(str1.data(), str1.length()); + ASSERT_OK(cache->Insert( + "k1", item1_2, &CompressedSecondaryCacheTest::helper_, str1.length())); + + auto item2_2 = new TestItem(str2.data(), str2.length()); + // After this Insert, primary cache contains k2 and secondary cache contains + // k1's item and k2's dummy item. + ASSERT_OK(cache->Insert( + "k2", item2_2, &CompressedSecondaryCacheTest::helper_, str2.length())); + + Cache::Handle* handle2; + handle2 = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle2, nullptr); + cache->Release(handle2); + + // k1 promotion should fail because cache is at capacity and + // strict_capacity_limit is true, but the lookup should still succeed. + // A k1's dummy item is inserted into primary cache. 
+ Cache::Handle* handle1; + handle1 = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle1, nullptr); + cache->Release(handle1); + + // Since k1 didn't get inserted, k2 should still be in cache + handle2 = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle2, nullptr); + cache->Release(handle2); + + cache.reset(); + secondary_cache.reset(); + } + + void SplitValueIntoChunksTest() { + JemallocAllocatorOptions jopts; + std::shared_ptr<MemoryAllocator> allocator; + std::string msg; + if (JemallocNodumpAllocator::IsSupported(&msg)) { + Status s = NewJemallocNodumpAllocator(jopts, &allocator); + if (!s.ok()) { + ROCKSDB_GTEST_BYPASS("JEMALLOC not supported"); + } + } else { + ROCKSDB_GTEST_BYPASS("JEMALLOC not supported"); + } + + using CacheValueChunk = CompressedSecondaryCache::CacheValueChunk; + std::unique_ptr<CompressedSecondaryCache> sec_cache = + std::make_unique<CompressedSecondaryCache>(1000, 0, true, 0.5, 0.0, + allocator); + Random rnd(301); + // 8500 = 8169 + 233 + 98, so there should be 3 chunks after split. + size_t str_size{8500}; + std::string str = rnd.RandomString(static_cast<int>(str_size)); + size_t charge{0}; + CacheValueChunk* chunks_head = + sec_cache->SplitValueIntoChunks(str, kLZ4Compression, charge); + ASSERT_EQ(charge, str_size + 3 * (sizeof(CacheValueChunk) - 1)); + + CacheValueChunk* current_chunk = chunks_head; + ASSERT_EQ(current_chunk->size, 8192 - sizeof(CacheValueChunk) + 1); + current_chunk = current_chunk->next; + ASSERT_EQ(current_chunk->size, 256 - sizeof(CacheValueChunk) + 1); + current_chunk = current_chunk->next; + ASSERT_EQ(current_chunk->size, 98); + + sec_cache->GetDeletionCallback(true)("dummy", chunks_head); + } + + void MergeChunksIntoValueTest() { + using CacheValueChunk = CompressedSecondaryCache::CacheValueChunk; + Random rnd(301); + size_t size1{2048}; + std::string str1 = rnd.RandomString(static_cast<int>(size1)); + CacheValueChunk* current_chunk = reinterpret_cast<CacheValueChunk*>( + new char[sizeof(CacheValueChunk) - 1 + size1]); + CacheValueChunk* chunks_head = current_chunk; + memcpy(current_chunk->data, str1.data(), size1); + current_chunk->size = size1; + + size_t size2{256}; + std::string str2 = rnd.RandomString(static_cast<int>(size2)); + current_chunk->next = reinterpret_cast<CacheValueChunk*>( + new char[sizeof(CacheValueChunk) - 1 + size2]); + current_chunk = current_chunk->next; + memcpy(current_chunk->data, str2.data(), size2); + current_chunk->size = size2; + + size_t size3{31}; + std::string str3 = rnd.RandomString(static_cast<int>(size3)); + current_chunk->next = reinterpret_cast<CacheValueChunk*>( + new char[sizeof(CacheValueChunk) - 1 + size3]); + current_chunk = current_chunk->next; + memcpy(current_chunk->data, str3.data(), size3); + current_chunk->size = size3; + current_chunk->next = nullptr; + + std::string str = str1 + str2 + str3; + + std::unique_ptr<CompressedSecondaryCache> sec_cache = + std::make_unique<CompressedSecondaryCache>(1000, 0, true, 0.5, 0.0); + size_t charge{0}; + CacheAllocationPtr value = + sec_cache->MergeChunksIntoValue(chunks_head, charge); + ASSERT_EQ(charge, size1 + size2 + size3); + std::string value_str{value.get(), charge}; + ASSERT_EQ(strcmp(value_str.data(), str.data()), 0); + + while (chunks_head != nullptr) { + CacheValueChunk* tmp_chunk = chunks_head; + chunks_head = chunks_head->next; + tmp_chunk->Free(); + } + } + + void 
SplictValueAndMergeChunksTest() { + JemallocAllocatorOptions jopts; + std::shared_ptr<MemoryAllocator> allocator; + std::string msg; + if (JemallocNodumpAllocator::IsSupported(&msg)) { + Status s = NewJemallocNodumpAllocator(jopts, &allocator); + if (!s.ok()) { + ROCKSDB_GTEST_BYPASS("JEMALLOC not supported"); + } + } else { + ROCKSDB_GTEST_BYPASS("JEMALLOC not supported"); + } + + using CacheValueChunk = CompressedSecondaryCache::CacheValueChunk; + std::unique_ptr<CompressedSecondaryCache> sec_cache = + std::make_unique<CompressedSecondaryCache>(1000, 0, true, 0.5, 0.0, + allocator); + Random rnd(301); + // 8500 = 8169 + 233 + 98, so there should be 3 chunks after split. + size_t str_size{8500}; + std::string str = rnd.RandomString(static_cast<int>(str_size)); + size_t charge{0}; + CacheValueChunk* chunks_head = + sec_cache->SplitValueIntoChunks(str, kLZ4Compression, charge); + ASSERT_EQ(charge, str_size + 3 * (sizeof(CacheValueChunk) - 1)); + + CacheAllocationPtr value = + sec_cache->MergeChunksIntoValue(chunks_head, charge); + ASSERT_EQ(charge, str_size); + std::string value_str{value.get(), charge}; + ASSERT_EQ(strcmp(value_str.data(), str.data()), 0); + + sec_cache->GetDeletionCallback(true)("dummy", chunks_head); + } + + private: + bool fail_create_; +}; + +Cache::CacheItemHelper CompressedSecondaryCacheTest::helper_( + CompressedSecondaryCacheTest::SizeCallback, + CompressedSecondaryCacheTest::SaveToCallback, + CompressedSecondaryCacheTest::DeletionCallback); + +Cache::CacheItemHelper CompressedSecondaryCacheTest::helper_fail_( + CompressedSecondaryCacheTest::SizeCallback, + CompressedSecondaryCacheTest::SaveToCallbackFail, + CompressedSecondaryCacheTest::DeletionCallback); + +class CompressedSecCacheTestWithCompressAndAllocatorParam + : public CompressedSecondaryCacheTest, + public ::testing::WithParamInterface<std::tuple<bool, bool>> { + public: + CompressedSecCacheTestWithCompressAndAllocatorParam() { + sec_cache_is_compressed_ = std::get<0>(GetParam()); + use_jemalloc_ = std::get<1>(GetParam()); + } + bool sec_cache_is_compressed_; + bool use_jemalloc_; +}; + +TEST_P(CompressedSecCacheTestWithCompressAndAllocatorParam, BasicTes) { + BasicTest(sec_cache_is_compressed_, use_jemalloc_); +} + +INSTANTIATE_TEST_CASE_P(CompressedSecCacheTests, + CompressedSecCacheTestWithCompressAndAllocatorParam, + ::testing::Combine(testing::Bool(), testing::Bool())); + +class CompressedSecondaryCacheTestWithCompressionParam + : public CompressedSecondaryCacheTest, + public ::testing::WithParamInterface<bool> { + public: + CompressedSecondaryCacheTestWithCompressionParam() { + sec_cache_is_compressed_ = GetParam(); + } + bool sec_cache_is_compressed_; +}; + +#ifndef ROCKSDB_LITE + +TEST_P(CompressedSecondaryCacheTestWithCompressionParam, BasicTestFromString) { + std::shared_ptr<SecondaryCache> sec_cache{nullptr}; + std::string sec_cache_uri; + if (sec_cache_is_compressed_) { + if (LZ4_Supported()) { + sec_cache_uri = + "compressed_secondary_cache://" + "capacity=2048;num_shard_bits=0;compression_type=kLZ4Compression;" + "compress_format_version=2"; + } else { + ROCKSDB_GTEST_SKIP("This test requires LZ4 support."); + sec_cache_uri = + "compressed_secondary_cache://" + "capacity=2048;num_shard_bits=0;compression_type=kNoCompression"; + sec_cache_is_compressed_ = false; + } + Status s = SecondaryCache::CreateFromString(ConfigOptions(), sec_cache_uri, + &sec_cache); + EXPECT_OK(s); + } else { + sec_cache_uri = + "compressed_secondary_cache://" + 
"capacity=2048;num_shard_bits=0;compression_type=kNoCompression"; + Status s = SecondaryCache::CreateFromString(ConfigOptions(), sec_cache_uri, + &sec_cache); + EXPECT_OK(s); + } + BasicTestHelper(sec_cache, sec_cache_is_compressed_); +} + +TEST_P(CompressedSecondaryCacheTestWithCompressionParam, + BasicTestFromStringWithSplit) { + std::shared_ptr<SecondaryCache> sec_cache{nullptr}; + std::string sec_cache_uri; + if (sec_cache_is_compressed_) { + if (LZ4_Supported()) { + sec_cache_uri = + "compressed_secondary_cache://" + "capacity=2048;num_shard_bits=0;compression_type=kLZ4Compression;" + "compress_format_version=2;enable_custom_split_merge=true"; + } else { + ROCKSDB_GTEST_SKIP("This test requires LZ4 support."); + sec_cache_uri = + "compressed_secondary_cache://" + "capacity=2048;num_shard_bits=0;compression_type=kNoCompression;" + "enable_custom_split_merge=true"; + sec_cache_is_compressed_ = false; + } + Status s = SecondaryCache::CreateFromString(ConfigOptions(), sec_cache_uri, + &sec_cache); + EXPECT_OK(s); + } else { + sec_cache_uri = + "compressed_secondary_cache://" + "capacity=2048;num_shard_bits=0;compression_type=kNoCompression;" + "enable_custom_split_merge=true"; + Status s = SecondaryCache::CreateFromString(ConfigOptions(), sec_cache_uri, + &sec_cache); + EXPECT_OK(s); + } + BasicTestHelper(sec_cache, sec_cache_is_compressed_); +} + +#endif // ROCKSDB_LITE + +TEST_P(CompressedSecondaryCacheTestWithCompressionParam, FailsTest) { + FailsTest(sec_cache_is_compressed_); +} + +TEST_P(CompressedSecondaryCacheTestWithCompressionParam, + BasicIntegrationFailTest) { + BasicIntegrationFailTest(sec_cache_is_compressed_); +} + +TEST_P(CompressedSecondaryCacheTestWithCompressionParam, + IntegrationSaveFailTest) { + IntegrationSaveFailTest(sec_cache_is_compressed_); +} + +TEST_P(CompressedSecondaryCacheTestWithCompressionParam, + IntegrationCreateFailTest) { + IntegrationCreateFailTest(sec_cache_is_compressed_); +} + +TEST_P(CompressedSecondaryCacheTestWithCompressionParam, + IntegrationFullCapacityTest) { + IntegrationFullCapacityTest(sec_cache_is_compressed_); +} + +INSTANTIATE_TEST_CASE_P(CompressedSecCacheTests, + CompressedSecondaryCacheTestWithCompressionParam, + testing::Bool()); + +class CompressedSecCacheTestWithCompressAndSplitParam + : public CompressedSecondaryCacheTest, + public ::testing::WithParamInterface<std::tuple<bool, bool>> { + public: + CompressedSecCacheTestWithCompressAndSplitParam() { + sec_cache_is_compressed_ = std::get<0>(GetParam()); + enable_custom_split_merge_ = std::get<1>(GetParam()); + } + bool sec_cache_is_compressed_; + bool enable_custom_split_merge_; +}; + +TEST_P(CompressedSecCacheTestWithCompressAndSplitParam, BasicIntegrationTest) { + BasicIntegrationTest(sec_cache_is_compressed_, enable_custom_split_merge_); +} + +INSTANTIATE_TEST_CASE_P(CompressedSecCacheTests, + CompressedSecCacheTestWithCompressAndSplitParam, + ::testing::Combine(testing::Bool(), testing::Bool())); + +TEST_F(CompressedSecondaryCacheTest, SplitValueIntoChunksTest) { + SplitValueIntoChunksTest(); +} + +TEST_F(CompressedSecondaryCacheTest, MergeChunksIntoValueTest) { + MergeChunksIntoValueTest(); +} + +TEST_F(CompressedSecondaryCacheTest, SplictValueAndMergeChunksTest) { + SplictValueAndMergeChunksTest(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/cache/lru_cache.cc b/src/rocksdb/cache/lru_cache.cc 
new file mode 100644 index 000000000..c8e4d29ba --- /dev/null +++ b/src/rocksdb/cache/lru_cache.cc @@ -0,0 +1,921 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "cache/lru_cache.h" + +#include <cassert> +#include <cstdint> +#include <cstdio> +#include <cstdlib> + +#include "monitoring/perf_context_imp.h" +#include "monitoring/statistics.h" +#include "port/lang.h" +#include "util/distributed_mutex.h" + +namespace ROCKSDB_NAMESPACE { +namespace lru_cache { + +// A distinct pointer value for marking "dummy" cache entries +void* const kDummyValueMarker = const_cast<char*>("kDummyValueMarker"); + +LRUHandleTable::LRUHandleTable(int max_upper_hash_bits) + : length_bits_(/* historical starting size*/ 4), + list_(new LRUHandle* [size_t{1} << length_bits_] {}), + elems_(0), + max_length_bits_(max_upper_hash_bits) {} + +LRUHandleTable::~LRUHandleTable() { + ApplyToEntriesRange( + [](LRUHandle* h) { + if (!h->HasRefs()) { + h->Free(); + } + }, + 0, size_t{1} << length_bits_); +} + +LRUHandle* LRUHandleTable::Lookup(const Slice& key, uint32_t hash) { + return *FindPointer(key, hash); +} + +LRUHandle* LRUHandleTable::Insert(LRUHandle* h) { + LRUHandle** ptr = FindPointer(h->key(), h->hash); + LRUHandle* old = *ptr; + h->next_hash = (old == nullptr ? nullptr : old->next_hash); + *ptr = h; + if (old == nullptr) { + ++elems_; + if ((elems_ >> length_bits_) > 0) { // elems_ >= length + // Since each cache entry is fairly large, we aim for a small + // average linked list length (<= 1). + Resize(); + } + } + return old; +} + +LRUHandle* LRUHandleTable::Remove(const Slice& key, uint32_t hash) { + LRUHandle** ptr = FindPointer(key, hash); + LRUHandle* result = *ptr; + if (result != nullptr) { + *ptr = result->next_hash; + --elems_; + } + return result; +} + +LRUHandle** LRUHandleTable::FindPointer(const Slice& key, uint32_t hash) { + LRUHandle** ptr = &list_[hash >> (32 - length_bits_)]; + while (*ptr != nullptr && ((*ptr)->hash != hash || key != (*ptr)->key())) { + ptr = &(*ptr)->next_hash; + } + return ptr; +} + +void LRUHandleTable::Resize() { + if (length_bits_ >= max_length_bits_) { + // Due to reaching limit of hash information, if we made the table bigger, + // we would allocate more addresses but only the same number would be used. + return; + } + if (length_bits_ >= 31) { + // Avoid undefined behavior shifting uint32_t by 32. 
+ return; + } + + uint32_t old_length = uint32_t{1} << length_bits_; + int new_length_bits = length_bits_ + 1; + std::unique_ptr<LRUHandle* []> new_list { + new LRUHandle* [size_t{1} << new_length_bits] {} + }; + uint32_t count = 0; + for (uint32_t i = 0; i < old_length; i++) { + LRUHandle* h = list_[i]; + while (h != nullptr) { + LRUHandle* next = h->next_hash; + uint32_t hash = h->hash; + LRUHandle** ptr = &new_list[hash >> (32 - new_length_bits)]; + h->next_hash = *ptr; + *ptr = h; + h = next; + count++; + } + } + assert(elems_ == count); + list_ = std::move(new_list); + length_bits_ = new_length_bits; +} + +LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit, + double high_pri_pool_ratio, + double low_pri_pool_ratio, bool use_adaptive_mutex, + CacheMetadataChargePolicy metadata_charge_policy, + int max_upper_hash_bits, + SecondaryCache* secondary_cache) + : CacheShardBase(metadata_charge_policy), + capacity_(0), + high_pri_pool_usage_(0), + low_pri_pool_usage_(0), + strict_capacity_limit_(strict_capacity_limit), + high_pri_pool_ratio_(high_pri_pool_ratio), + high_pri_pool_capacity_(0), + low_pri_pool_ratio_(low_pri_pool_ratio), + low_pri_pool_capacity_(0), + table_(max_upper_hash_bits), + usage_(0), + lru_usage_(0), + mutex_(use_adaptive_mutex), + secondary_cache_(secondary_cache) { + // Make empty circular linked list. + lru_.next = &lru_; + lru_.prev = &lru_; + lru_low_pri_ = &lru_; + lru_bottom_pri_ = &lru_; + SetCapacity(capacity); +} + +void LRUCacheShard::EraseUnRefEntries() { + autovector<LRUHandle*> last_reference_list; + { + DMutexLock l(mutex_); + while (lru_.next != &lru_) { + LRUHandle* old = lru_.next; + // LRU list contains only elements which can be evicted. + assert(old->InCache() && !old->HasRefs()); + LRU_Remove(old); + table_.Remove(old->key(), old->hash); + old->SetInCache(false); + assert(usage_ >= old->total_charge); + usage_ -= old->total_charge; + last_reference_list.push_back(old); + } + } + + for (auto entry : last_reference_list) { + entry->Free(); + } +} + +void LRUCacheShard::ApplyToSomeEntries( + const std::function<void(const Slice& key, void* value, size_t charge, + DeleterFn deleter)>& callback, + size_t average_entries_per_lock, size_t* state) { + // The state is essentially going to be the starting hash, which works + // nicely even if we resize between calls because we use upper-most + // hash bits for table indexes. + DMutexLock l(mutex_); + int length_bits = table_.GetLengthBits(); + size_t length = size_t{1} << length_bits; + + assert(average_entries_per_lock > 0); + // Assuming we are called with same average_entries_per_lock repeatedly, + // this simplifies some logic (index_end will not overflow). + assert(average_entries_per_lock < length || *state == 0); + + size_t index_begin = *state >> (sizeof(size_t) * 8u - length_bits); + size_t index_end = index_begin + average_entries_per_lock; + if (index_end >= length) { + // Going to end + index_end = length; + *state = SIZE_MAX; + } else { + *state = index_end << (sizeof(size_t) * 8u - length_bits); + } + + table_.ApplyToEntriesRange( + [callback, + metadata_charge_policy = metadata_charge_policy_](LRUHandle* h) { + DeleterFn deleter = h->IsSecondaryCacheCompatible() + ? 
h->info_.helper->del_cb + : h->info_.deleter; + callback(h->key(), h->value, h->GetCharge(metadata_charge_policy), + deleter); + }, + index_begin, index_end); +} + +void LRUCacheShard::TEST_GetLRUList(LRUHandle** lru, LRUHandle** lru_low_pri, + LRUHandle** lru_bottom_pri) { + DMutexLock l(mutex_); + *lru = &lru_; + *lru_low_pri = lru_low_pri_; + *lru_bottom_pri = lru_bottom_pri_; +} + +size_t LRUCacheShard::TEST_GetLRUSize() { + DMutexLock l(mutex_); + LRUHandle* lru_handle = lru_.next; + size_t lru_size = 0; + while (lru_handle != &lru_) { + lru_size++; + lru_handle = lru_handle->next; + } + return lru_size; +} + +double LRUCacheShard::GetHighPriPoolRatio() { + DMutexLock l(mutex_); + return high_pri_pool_ratio_; +} + +double LRUCacheShard::GetLowPriPoolRatio() { + DMutexLock l(mutex_); + return low_pri_pool_ratio_; +} + +void LRUCacheShard::LRU_Remove(LRUHandle* e) { + assert(e->next != nullptr); + assert(e->prev != nullptr); + if (lru_low_pri_ == e) { + lru_low_pri_ = e->prev; + } + if (lru_bottom_pri_ == e) { + lru_bottom_pri_ = e->prev; + } + e->next->prev = e->prev; + e->prev->next = e->next; + e->prev = e->next = nullptr; + assert(lru_usage_ >= e->total_charge); + lru_usage_ -= e->total_charge; + assert(!e->InHighPriPool() || !e->InLowPriPool()); + if (e->InHighPriPool()) { + assert(high_pri_pool_usage_ >= e->total_charge); + high_pri_pool_usage_ -= e->total_charge; + } else if (e->InLowPriPool()) { + assert(low_pri_pool_usage_ >= e->total_charge); + low_pri_pool_usage_ -= e->total_charge; + } +} + +void LRUCacheShard::LRU_Insert(LRUHandle* e) { + assert(e->next == nullptr); + assert(e->prev == nullptr); + if (high_pri_pool_ratio_ > 0 && (e->IsHighPri() || e->HasHit())) { + // Inset "e" to head of LRU list. + e->next = &lru_; + e->prev = lru_.prev; + e->prev->next = e; + e->next->prev = e; + e->SetInHighPriPool(true); + e->SetInLowPriPool(false); + high_pri_pool_usage_ += e->total_charge; + MaintainPoolSize(); + } else if (low_pri_pool_ratio_ > 0 && + (e->IsHighPri() || e->IsLowPri() || e->HasHit())) { + // Insert "e" to the head of low-pri pool. + e->next = lru_low_pri_->next; + e->prev = lru_low_pri_; + e->prev->next = e; + e->next->prev = e; + e->SetInHighPriPool(false); + e->SetInLowPriPool(true); + low_pri_pool_usage_ += e->total_charge; + MaintainPoolSize(); + lru_low_pri_ = e; + } else { + // Insert "e" to the head of bottom-pri pool. + e->next = lru_bottom_pri_->next; + e->prev = lru_bottom_pri_; + e->prev->next = e; + e->next->prev = e; + e->SetInHighPriPool(false); + e->SetInLowPriPool(false); + // if the low-pri pool is empty, lru_low_pri_ also needs to be updated. + if (lru_bottom_pri_ == lru_low_pri_) { + lru_low_pri_ = e; + } + lru_bottom_pri_ = e; + } + lru_usage_ += e->total_charge; +} + +void LRUCacheShard::MaintainPoolSize() { + while (high_pri_pool_usage_ > high_pri_pool_capacity_) { + // Overflow last entry in high-pri pool to low-pri pool. + lru_low_pri_ = lru_low_pri_->next; + assert(lru_low_pri_ != &lru_); + lru_low_pri_->SetInHighPriPool(false); + lru_low_pri_->SetInLowPriPool(true); + assert(high_pri_pool_usage_ >= lru_low_pri_->total_charge); + high_pri_pool_usage_ -= lru_low_pri_->total_charge; + low_pri_pool_usage_ += lru_low_pri_->total_charge; + } + + while (low_pri_pool_usage_ > low_pri_pool_capacity_) { + // Overflow last entry in low-pri pool to bottom-pri pool. 
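+ // (lru_bottom_pri_ points at the newest bottom-pri entry; advancing it one
+ // step toward the newer end takes the oldest low-pri entry and re-labels it
+ // as bottom-pri, shrinking low_pri_pool_usage_.)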
+ lru_bottom_pri_ = lru_bottom_pri_->next; + assert(lru_bottom_pri_ != &lru_); + lru_bottom_pri_->SetInHighPriPool(false); + lru_bottom_pri_->SetInLowPriPool(false); + assert(low_pri_pool_usage_ >= lru_bottom_pri_->total_charge); + low_pri_pool_usage_ -= lru_bottom_pri_->total_charge; + } +} + +void LRUCacheShard::EvictFromLRU(size_t charge, + autovector<LRUHandle*>* deleted) { + while ((usage_ + charge) > capacity_ && lru_.next != &lru_) { + LRUHandle* old = lru_.next; + // LRU list contains only elements which can be evicted. + assert(old->InCache() && !old->HasRefs()); + LRU_Remove(old); + table_.Remove(old->key(), old->hash); + old->SetInCache(false); + assert(usage_ >= old->total_charge); + usage_ -= old->total_charge; + deleted->push_back(old); + } +} + +void LRUCacheShard::TryInsertIntoSecondaryCache( + autovector<LRUHandle*> evicted_handles) { + for (auto entry : evicted_handles) { + if (secondary_cache_ && entry->IsSecondaryCacheCompatible() && + !entry->IsInSecondaryCache()) { + secondary_cache_->Insert(entry->key(), entry->value, entry->info_.helper) + .PermitUncheckedError(); + } + // Free the entries here outside of mutex for performance reasons. + entry->Free(); + } +} + +void LRUCacheShard::SetCapacity(size_t capacity) { + autovector<LRUHandle*> last_reference_list; + { + DMutexLock l(mutex_); + capacity_ = capacity; + high_pri_pool_capacity_ = capacity_ * high_pri_pool_ratio_; + low_pri_pool_capacity_ = capacity_ * low_pri_pool_ratio_; + EvictFromLRU(0, &last_reference_list); + } + + TryInsertIntoSecondaryCache(last_reference_list); +} + +void LRUCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) { + DMutexLock l(mutex_); + strict_capacity_limit_ = strict_capacity_limit; +} + +Status LRUCacheShard::InsertItem(LRUHandle* e, LRUHandle** handle, + bool free_handle_on_fail) { + Status s = Status::OK(); + autovector<LRUHandle*> last_reference_list; + + { + DMutexLock l(mutex_); + + // Free the space following strict LRU policy until enough space + // is freed or the lru list is empty. + EvictFromLRU(e->total_charge, &last_reference_list); + + if ((usage_ + e->total_charge) > capacity_ && + (strict_capacity_limit_ || handle == nullptr)) { + e->SetInCache(false); + if (handle == nullptr) { + // Don't insert the entry but still return ok, as if the entry inserted + // into cache and get evicted immediately. + last_reference_list.push_back(e); + } else { + if (free_handle_on_fail) { + free(e); + *handle = nullptr; + } + s = Status::MemoryLimit("Insert failed due to LRU cache being full."); + } + } else { + // Insert into the cache. Note that the cache might get larger than its + // capacity if not enough space was freed up. + LRUHandle* old = table_.Insert(e); + usage_ += e->total_charge; + if (old != nullptr) { + s = Status::OkOverwritten(); + assert(old->InCache()); + old->SetInCache(false); + if (!old->HasRefs()) { + // old is on LRU because it's in cache and its reference count is 0. + LRU_Remove(old); + assert(usage_ >= old->total_charge); + usage_ -= old->total_charge; + last_reference_list.push_back(old); + } + } + if (handle == nullptr) { + LRU_Insert(e); + } else { + // If caller already holds a ref, no need to take one here. 
+ if (!e->HasRefs()) { + e->Ref(); + } + *handle = e; + } + } + } + + TryInsertIntoSecondaryCache(last_reference_list); + + return s; +} + +void LRUCacheShard::Promote(LRUHandle* e) { + SecondaryCacheResultHandle* secondary_handle = e->sec_handle; + + assert(secondary_handle->IsReady()); + // e is not thread-shared here; OK to modify "immutable" fields as well as + // "mutable" (normally requiring mutex) + e->SetIsPending(false); + e->value = secondary_handle->Value(); + assert(e->total_charge == 0); + size_t value_size = secondary_handle->Size(); + delete secondary_handle; + + if (e->value) { + e->CalcTotalCharge(value_size, metadata_charge_policy_); + Status s; + if (e->IsStandalone()) { + assert(secondary_cache_ && secondary_cache_->SupportForceErase()); + + // Insert a dummy handle and return a standalone handle to caller. + // Charge the standalone handle. + autovector<LRUHandle*> last_reference_list; + bool free_standalone_handle{false}; + { + DMutexLock l(mutex_); + + // Free the space following strict LRU policy until enough space + // is freed or the lru list is empty. + EvictFromLRU(e->total_charge, &last_reference_list); + + if ((usage_ + e->total_charge) > capacity_ && strict_capacity_limit_) { + free_standalone_handle = true; + } else { + usage_ += e->total_charge; + } + } + + TryInsertIntoSecondaryCache(last_reference_list); + if (free_standalone_handle) { + e->Unref(); + e->Free(); + e = nullptr; + } else { + PERF_COUNTER_ADD(block_cache_standalone_handle_count, 1); + } + + // Insert a dummy handle into the primary cache. This dummy handle is + // not IsSecondaryCacheCompatible(). + // FIXME? This should not overwrite an existing non-dummy entry in the + // rare case that one exists + Cache::Priority priority = + e->IsHighPri() ? Cache::Priority::HIGH : Cache::Priority::LOW; + s = Insert(e->key(), e->hash, kDummyValueMarker, /*charge=*/0, + /*deleter=*/nullptr, /*helper=*/nullptr, /*handle=*/nullptr, + priority); + } else { + e->SetInCache(true); + LRUHandle* handle = e; + // This InsertItem() could fail if the cache is over capacity and + // strict_capacity_limit_ is true. In such a case, we don't want + // InsertItem() to free the handle, since the item is already in memory + // and the caller will most likely just read it from disk if we erase it + // here. + s = InsertItem(e, &handle, /*free_handle_on_fail=*/false); + if (s.ok()) { + PERF_COUNTER_ADD(block_cache_real_handle_count, 1); + } + } + + if (!s.ok()) { + // Item is in memory, but not accounted against the cache capacity. + // When the handle is released, the item should get deleted. + assert(!e->InCache()); + } + } else { + // Secondary cache lookup failed. The caller will take care of detecting + // this and eventually releasing e. + assert(!e->value); + assert(!e->InCache()); + } +} + +LRUHandle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash, + const Cache::CacheItemHelper* helper, + const Cache::CreateCallback& create_cb, + Cache::Priority priority, bool wait, + Statistics* stats) { + LRUHandle* e = nullptr; + bool found_dummy_entry{false}; + { + DMutexLock l(mutex_); + e = table_.Lookup(key, hash); + if (e != nullptr) { + assert(e->InCache()); + if (e->value == kDummyValueMarker) { + // For a dummy handle, if it was retrieved from secondary cache, + // it may still exist in secondary cache. + // If the handle exists in secondary cache, the value should be + // erased from sec cache and be inserted into primary cache. 
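+ // (found_dummy_entry marks this as a repeat hit; it is forwarded to the
+ // secondary cache Lookup below so the block can be erased there and
+ // promoted into this cache, as the larger comment further down explains.)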
+ found_dummy_entry = true; + // Let the dummy entry be overwritten + e = nullptr; + } else { + if (!e->HasRefs()) { + // The entry is in LRU since it's in hash and has no external + // references. + LRU_Remove(e); + } + e->Ref(); + e->SetHit(); + } + } + } + + // If handle table lookup failed or the handle is a dummy one, allocate + // a handle outside the mutex if we re going to lookup in the secondary cache. + // + // When a block is firstly Lookup from CompressedSecondaryCache, we just + // insert a dummy block into the primary cache (charging the actual size of + // the block) and don't erase the block from CompressedSecondaryCache. A + // standalone handle is returned to the caller. Only if the block is hit + // again, we erase it from CompressedSecondaryCache and add it into the + // primary cache. + if (!e && secondary_cache_ && helper && helper->saveto_cb) { + // For objects from the secondary cache, we expect the caller to provide + // a way to create/delete the primary cache object. The only case where + // a deleter would not be required is for dummy entries inserted for + // accounting purposes, which we won't demote to the secondary cache + // anyway. + assert(create_cb && helper->del_cb); + bool is_in_sec_cache{false}; + std::unique_ptr<SecondaryCacheResultHandle> secondary_handle = + secondary_cache_->Lookup(key, create_cb, wait, found_dummy_entry, + is_in_sec_cache); + if (secondary_handle != nullptr) { + e = static_cast<LRUHandle*>(malloc(sizeof(LRUHandle) - 1 + key.size())); + + e->m_flags = 0; + e->im_flags = 0; + e->SetSecondaryCacheCompatible(true); + e->info_.helper = helper; + e->key_length = key.size(); + e->hash = hash; + e->refs = 0; + e->next = e->prev = nullptr; + e->SetPriority(priority); + memcpy(e->key_data, key.data(), key.size()); + e->value = nullptr; + e->sec_handle = secondary_handle.release(); + e->total_charge = 0; + e->Ref(); + e->SetIsInSecondaryCache(is_in_sec_cache); + e->SetIsStandalone(secondary_cache_->SupportForceErase() && + !found_dummy_entry); + + if (wait) { + Promote(e); + if (e) { + if (!e->value) { + // The secondary cache returned a handle, but the lookup failed. + e->Unref(); + e->Free(); + e = nullptr; + } else { + PERF_COUNTER_ADD(secondary_cache_hit_count, 1); + RecordTick(stats, SECONDARY_CACHE_HITS); + } + } + } else { + // If wait is false, we always return a handle and let the caller + // release the handle after checking for success or failure. + e->SetIsPending(true); + // This may be slightly inaccurate, if the lookup eventually fails. + // But the probability is very low. + PERF_COUNTER_ADD(secondary_cache_hit_count, 1); + RecordTick(stats, SECONDARY_CACHE_HITS); + } + } else { + // Caller will most likely overwrite the dummy entry with an Insert + // after this Lookup fails + assert(e == nullptr); + } + } + return e; +} + +bool LRUCacheShard::Ref(LRUHandle* e) { + DMutexLock l(mutex_); + // To create another reference - entry must be already externally referenced. 
+ assert(e->HasRefs()); + // Pending handles are not for sharing + assert(!e->IsPending()); + e->Ref(); + return true; +} + +void LRUCacheShard::SetHighPriorityPoolRatio(double high_pri_pool_ratio) { + DMutexLock l(mutex_); + high_pri_pool_ratio_ = high_pri_pool_ratio; + high_pri_pool_capacity_ = capacity_ * high_pri_pool_ratio_; + MaintainPoolSize(); +} + +void LRUCacheShard::SetLowPriorityPoolRatio(double low_pri_pool_ratio) { + DMutexLock l(mutex_); + low_pri_pool_ratio_ = low_pri_pool_ratio; + low_pri_pool_capacity_ = capacity_ * low_pri_pool_ratio_; + MaintainPoolSize(); +} + +bool LRUCacheShard::Release(LRUHandle* e, bool /*useful*/, + bool erase_if_last_ref) { + if (e == nullptr) { + return false; + } + bool last_reference = false; + // Must Wait or WaitAll first on pending handles. Otherwise, would leak + // a secondary cache handle. + assert(!e->IsPending()); + { + DMutexLock l(mutex_); + last_reference = e->Unref(); + if (last_reference && e->InCache()) { + // The item is still in cache, and nobody else holds a reference to it. + if (usage_ > capacity_ || erase_if_last_ref) { + // The LRU list must be empty since the cache is full. + assert(lru_.next == &lru_ || erase_if_last_ref); + // Take this opportunity and remove the item. + table_.Remove(e->key(), e->hash); + e->SetInCache(false); + } else { + // Put the item back on the LRU list, and don't free it. + LRU_Insert(e); + last_reference = false; + } + } + // If it was the last reference, then decrement the cache usage. + if (last_reference) { + assert(usage_ >= e->total_charge); + usage_ -= e->total_charge; + } + } + + // Free the entry here outside of mutex for performance reasons. + if (last_reference) { + e->Free(); + } + return last_reference; +} + +Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value, + size_t charge, + void (*deleter)(const Slice& key, void* value), + const Cache::CacheItemHelper* helper, + LRUHandle** handle, Cache::Priority priority) { + // Allocate the memory here outside of the mutex. + // If the cache is full, we'll have to release it. + // It shouldn't happen very often though. + LRUHandle* e = + static_cast<LRUHandle*>(malloc(sizeof(LRUHandle) - 1 + key.size())); + + e->value = value; + e->m_flags = 0; + e->im_flags = 0; + if (helper) { + // Use only one of the two parameters + assert(deleter == nullptr); + // value == nullptr is reserved for indicating failure for when secondary + // cache compatible + assert(value != nullptr); + e->SetSecondaryCacheCompatible(true); + e->info_.helper = helper; + } else { + e->info_.deleter = deleter; + } + e->key_length = key.size(); + e->hash = hash; + e->refs = 0; + e->next = e->prev = nullptr; + e->SetInCache(true); + e->SetPriority(priority); + memcpy(e->key_data, key.data(), key.size()); + e->CalcTotalCharge(charge, metadata_charge_policy_); + + return InsertItem(e, handle, /* free_handle_on_fail */ true); +} + +void LRUCacheShard::Erase(const Slice& key, uint32_t hash) { + LRUHandle* e; + bool last_reference = false; + { + DMutexLock l(mutex_); + e = table_.Remove(key, hash); + if (e != nullptr) { + assert(e->InCache()); + e->SetInCache(false); + if (!e->HasRefs()) { + // The entry is in LRU since it's in hash and has no external references + LRU_Remove(e); + assert(usage_ >= e->total_charge); + usage_ -= e->total_charge; + last_reference = true; + } + } + } + + // Free the entry here outside of mutex for performance reasons. + // last_reference will only be true if e != nullptr. 
+ if (last_reference) { + e->Free(); + } +} + +bool LRUCacheShard::IsReady(LRUHandle* e) { + bool ready = true; + if (e->IsPending()) { + assert(secondary_cache_); + assert(e->sec_handle); + ready = e->sec_handle->IsReady(); + } + return ready; +} + +size_t LRUCacheShard::GetUsage() const { + DMutexLock l(mutex_); + return usage_; +} + +size_t LRUCacheShard::GetPinnedUsage() const { + DMutexLock l(mutex_); + assert(usage_ >= lru_usage_); + return usage_ - lru_usage_; +} + +size_t LRUCacheShard::GetOccupancyCount() const { + DMutexLock l(mutex_); + return table_.GetOccupancyCount(); +} + +size_t LRUCacheShard::GetTableAddressCount() const { + DMutexLock l(mutex_); + return size_t{1} << table_.GetLengthBits(); +} + +void LRUCacheShard::AppendPrintableOptions(std::string& str) const { + const int kBufferSize = 200; + char buffer[kBufferSize]; + { + DMutexLock l(mutex_); + snprintf(buffer, kBufferSize, " high_pri_pool_ratio: %.3lf\n", + high_pri_pool_ratio_); + snprintf(buffer + strlen(buffer), kBufferSize - strlen(buffer), + " low_pri_pool_ratio: %.3lf\n", low_pri_pool_ratio_); + } + str.append(buffer); +} + +LRUCache::LRUCache(size_t capacity, int num_shard_bits, + bool strict_capacity_limit, double high_pri_pool_ratio, + double low_pri_pool_ratio, + std::shared_ptr<MemoryAllocator> allocator, + bool use_adaptive_mutex, + CacheMetadataChargePolicy metadata_charge_policy, + std::shared_ptr<SecondaryCache> _secondary_cache) + : ShardedCache(capacity, num_shard_bits, strict_capacity_limit, + std::move(allocator)), + secondary_cache_(std::move(_secondary_cache)) { + size_t per_shard = GetPerShardCapacity(); + SecondaryCache* secondary_cache = secondary_cache_.get(); + InitShards([=](LRUCacheShard* cs) { + new (cs) LRUCacheShard( + per_shard, strict_capacity_limit, high_pri_pool_ratio, + low_pri_pool_ratio, use_adaptive_mutex, metadata_charge_policy, + /* max_upper_hash_bits */ 32 - num_shard_bits, secondary_cache); + }); +} + +void* LRUCache::Value(Handle* handle) { + auto h = reinterpret_cast<const LRUHandle*>(handle); + assert(!h->IsPending() || h->value == nullptr); + assert(h->value != kDummyValueMarker); + return h->value; +} + +size_t LRUCache::GetCharge(Handle* handle) const { + return reinterpret_cast<const LRUHandle*>(handle)->GetCharge( + GetShard(0).metadata_charge_policy_); +} + +Cache::DeleterFn LRUCache::GetDeleter(Handle* handle) const { + auto h = reinterpret_cast<const LRUHandle*>(handle); + if (h->IsSecondaryCacheCompatible()) { + return h->info_.helper->del_cb; + } else { + return h->info_.deleter; + } +} + +size_t LRUCache::TEST_GetLRUSize() { + return SumOverShards([](LRUCacheShard& cs) { return cs.TEST_GetLRUSize(); }); +} + +double LRUCache::GetHighPriPoolRatio() { + return GetShard(0).GetHighPriPoolRatio(); +} + +void LRUCache::WaitAll(std::vector<Handle*>& handles) { + if (secondary_cache_) { + std::vector<SecondaryCacheResultHandle*> sec_handles; + sec_handles.reserve(handles.size()); + for (Handle* handle : handles) { + if (!handle) { + continue; + } + LRUHandle* lru_handle = reinterpret_cast<LRUHandle*>(handle); + if (!lru_handle->IsPending()) { + continue; + } + sec_handles.emplace_back(lru_handle->sec_handle); + } + secondary_cache_->WaitAll(sec_handles); + for (Handle* handle : handles) { + if (!handle) { + continue; + } + LRUHandle* lru_handle = reinterpret_cast<LRUHandle*>(handle); + if (!lru_handle->IsPending()) { + continue; + } + GetShard(lru_handle->hash).Promote(lru_handle); + } + } +} + +void LRUCache::AppendPrintableOptions(std::string& str) const { + 
ShardedCache::AppendPrintableOptions(str); // options from shard + if (secondary_cache_) { + str.append(" secondary_cache:\n"); + str.append(secondary_cache_->GetPrintableOptions()); + } +} + +} // namespace lru_cache + +std::shared_ptr<Cache> NewLRUCache( + size_t capacity, int num_shard_bits, bool strict_capacity_limit, + double high_pri_pool_ratio, + std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex, + CacheMetadataChargePolicy metadata_charge_policy, + const std::shared_ptr<SecondaryCache>& secondary_cache, + double low_pri_pool_ratio) { + if (num_shard_bits >= 20) { + return nullptr; // The cache cannot be sharded into too many fine pieces. + } + if (high_pri_pool_ratio < 0.0 || high_pri_pool_ratio > 1.0) { + // Invalid high_pri_pool_ratio + return nullptr; + } + if (low_pri_pool_ratio < 0.0 || low_pri_pool_ratio > 1.0) { + // Invalid low_pri_pool_ratio + return nullptr; + } + if (low_pri_pool_ratio + high_pri_pool_ratio > 1.0) { + // Invalid high_pri_pool_ratio and low_pri_pool_ratio combination + return nullptr; + } + if (num_shard_bits < 0) { + num_shard_bits = GetDefaultCacheShardBits(capacity); + } + return std::make_shared<LRUCache>( + capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio, + low_pri_pool_ratio, std::move(memory_allocator), use_adaptive_mutex, + metadata_charge_policy, secondary_cache); +} + +std::shared_ptr<Cache> NewLRUCache(const LRUCacheOptions& cache_opts) { + return NewLRUCache(cache_opts.capacity, cache_opts.num_shard_bits, + cache_opts.strict_capacity_limit, + cache_opts.high_pri_pool_ratio, + cache_opts.memory_allocator, cache_opts.use_adaptive_mutex, + cache_opts.metadata_charge_policy, + cache_opts.secondary_cache, cache_opts.low_pri_pool_ratio); +} + +std::shared_ptr<Cache> NewLRUCache( + size_t capacity, int num_shard_bits, bool strict_capacity_limit, + double high_pri_pool_ratio, + std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex, + CacheMetadataChargePolicy metadata_charge_policy, + double low_pri_pool_ratio) { + return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit, + high_pri_pool_ratio, memory_allocator, use_adaptive_mutex, + metadata_charge_policy, nullptr, low_pri_pool_ratio); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/cache/lru_cache.h b/src/rocksdb/cache/lru_cache.h new file mode 100644 index 000000000..99b2f2b20 --- /dev/null +++ b/src/rocksdb/cache/lru_cache.h @@ -0,0 +1,546 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include <memory> +#include <string> + +#include "cache/sharded_cache.h" +#include "port/lang.h" +#include "port/malloc.h" +#include "port/port.h" +#include "rocksdb/secondary_cache.h" +#include "util/autovector.h" +#include "util/distributed_mutex.h" + +namespace ROCKSDB_NAMESPACE { +namespace lru_cache { + +// LRU cache implementation. This class is not thread-safe. + +// An entry is a variable length heap-allocated structure. +// Entries are referenced by cache and/or by any external entity. +// The cache keeps all its entries in a hash table. 
Some elements +// are also stored on LRU list. +// +// LRUHandle can be in these states: +// 1. Referenced externally AND in hash table. +// In that case the entry is *not* in the LRU list +// (refs >= 1 && in_cache == true) +// 2. Not referenced externally AND in hash table. +// In that case the entry is in the LRU list and can be freed. +// (refs == 0 && in_cache == true) +// 3. Referenced externally AND not in hash table. +// In that case the entry is not in the LRU list and not in hash table. +// The entry must be freed if refs becomes 0 in this state. +// (refs >= 1 && in_cache == false) +// If you call LRUCacheShard::Release enough times on an entry in state 1, it +// will go into state 2. To move from state 1 to state 3, either call +// LRUCacheShard::Erase or LRUCacheShard::Insert with the same key (but +// possibly different value). To move from state 2 to state 1, use +// LRUCacheShard::Lookup. +// While refs > 0, public properties like value and deleter must not change. + +struct LRUHandle { + void* value; + union Info { + Info() {} + ~Info() {} + Cache::DeleterFn deleter; + const Cache::CacheItemHelper* helper; + } info_; + // An entry is not added to the LRUHandleTable until the secondary cache + // lookup is complete, so its safe to have this union. + union { + LRUHandle* next_hash; + SecondaryCacheResultHandle* sec_handle; + }; + LRUHandle* next; + LRUHandle* prev; + size_t total_charge; // TODO(opt): Only allow uint32_t? + size_t key_length; + // The hash of key(). Used for fast sharding and comparisons. + uint32_t hash; + // The number of external refs to this entry. The cache itself is not counted. + uint32_t refs; + + // Mutable flags - access controlled by mutex + // The m_ and M_ prefixes (and im_ and IM_ later) are to hopefully avoid + // checking an M_ flag on im_flags or an IM_ flag on m_flags. + uint8_t m_flags; + enum MFlags : uint8_t { + // Whether this entry is referenced by the hash table. + M_IN_CACHE = (1 << 0), + // Whether this entry has had any lookups (hits). + M_HAS_HIT = (1 << 1), + // Whether this entry is in high-pri pool. + M_IN_HIGH_PRI_POOL = (1 << 2), + // Whether this entry is in low-pri pool. + M_IN_LOW_PRI_POOL = (1 << 3), + }; + + // "Immutable" flags - only set in single-threaded context and then + // can be accessed without mutex + uint8_t im_flags; + enum ImFlags : uint8_t { + // Whether this entry is high priority entry. + IM_IS_HIGH_PRI = (1 << 0), + // Whether this entry is low priority entry. + IM_IS_LOW_PRI = (1 << 1), + // Can this be inserted into the secondary cache. + IM_IS_SECONDARY_CACHE_COMPATIBLE = (1 << 2), + // Is the handle still being read from a lower tier. + IM_IS_PENDING = (1 << 3), + // Whether this handle is still in a lower tier + IM_IS_IN_SECONDARY_CACHE = (1 << 4), + // Marks result handles that should not be inserted into cache + IM_IS_STANDALONE = (1 << 5), + }; + + // Beginning of the key (MUST BE THE LAST FIELD IN THIS STRUCT!) + char key_data[1]; + + Slice key() const { return Slice(key_data, key_length); } + + // For HandleImpl concept + uint32_t GetHash() const { return hash; } + + // Increase the reference count by 1. + void Ref() { refs++; } + + // Just reduce the reference count by 1. Return true if it was last reference. + bool Unref() { + assert(refs > 0); + refs--; + return refs == 0; + } + + // Return true if there are external refs, false otherwise. 
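+ // Note: refs counts only external references; the cache's own reference is
+ // tracked separately via the M_IN_CACHE flag, which is why an entry with
+ // refs == 0 can still sit in the hash table and on the LRU list.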
+ bool HasRefs() const { return refs > 0; } + + bool InCache() const { return m_flags & M_IN_CACHE; } + bool IsHighPri() const { return im_flags & IM_IS_HIGH_PRI; } + bool InHighPriPool() const { return m_flags & M_IN_HIGH_PRI_POOL; } + bool IsLowPri() const { return im_flags & IM_IS_LOW_PRI; } + bool InLowPriPool() const { return m_flags & M_IN_LOW_PRI_POOL; } + bool HasHit() const { return m_flags & M_HAS_HIT; } + bool IsSecondaryCacheCompatible() const { + return im_flags & IM_IS_SECONDARY_CACHE_COMPATIBLE; + } + bool IsPending() const { return im_flags & IM_IS_PENDING; } + bool IsInSecondaryCache() const { + return im_flags & IM_IS_IN_SECONDARY_CACHE; + } + bool IsStandalone() const { return im_flags & IM_IS_STANDALONE; } + + void SetInCache(bool in_cache) { + if (in_cache) { + m_flags |= M_IN_CACHE; + } else { + m_flags &= ~M_IN_CACHE; + } + } + + void SetPriority(Cache::Priority priority) { + if (priority == Cache::Priority::HIGH) { + im_flags |= IM_IS_HIGH_PRI; + im_flags &= ~IM_IS_LOW_PRI; + } else if (priority == Cache::Priority::LOW) { + im_flags &= ~IM_IS_HIGH_PRI; + im_flags |= IM_IS_LOW_PRI; + } else { + im_flags &= ~IM_IS_HIGH_PRI; + im_flags &= ~IM_IS_LOW_PRI; + } + } + + void SetInHighPriPool(bool in_high_pri_pool) { + if (in_high_pri_pool) { + m_flags |= M_IN_HIGH_PRI_POOL; + } else { + m_flags &= ~M_IN_HIGH_PRI_POOL; + } + } + + void SetInLowPriPool(bool in_low_pri_pool) { + if (in_low_pri_pool) { + m_flags |= M_IN_LOW_PRI_POOL; + } else { + m_flags &= ~M_IN_LOW_PRI_POOL; + } + } + + void SetHit() { m_flags |= M_HAS_HIT; } + + void SetSecondaryCacheCompatible(bool compat) { + if (compat) { + im_flags |= IM_IS_SECONDARY_CACHE_COMPATIBLE; + } else { + im_flags &= ~IM_IS_SECONDARY_CACHE_COMPATIBLE; + } + } + + void SetIsPending(bool pending) { + if (pending) { + im_flags |= IM_IS_PENDING; + } else { + im_flags &= ~IM_IS_PENDING; + } + } + + void SetIsInSecondaryCache(bool is_in_secondary_cache) { + if (is_in_secondary_cache) { + im_flags |= IM_IS_IN_SECONDARY_CACHE; + } else { + im_flags &= ~IM_IS_IN_SECONDARY_CACHE; + } + } + + void SetIsStandalone(bool is_standalone) { + if (is_standalone) { + im_flags |= IM_IS_STANDALONE; + } else { + im_flags &= ~IM_IS_STANDALONE; + } + } + + void Free() { + assert(refs == 0); + + if (!IsSecondaryCacheCompatible() && info_.deleter) { + (*info_.deleter)(key(), value); + } else if (IsSecondaryCacheCompatible()) { + if (IsPending()) { + assert(sec_handle != nullptr); + SecondaryCacheResultHandle* tmp_sec_handle = sec_handle; + tmp_sec_handle->Wait(); + value = tmp_sec_handle->Value(); + delete tmp_sec_handle; + } + if (value) { + (*info_.helper->del_cb)(key(), value); + } + } + + free(this); + } + + inline size_t CalcuMetaCharge( + CacheMetadataChargePolicy metadata_charge_policy) const { + if (metadata_charge_policy != kFullChargeCacheMetadata) { + return 0; + } else { +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + return malloc_usable_size( + const_cast<void*>(static_cast<const void*>(this))); +#else + // This is the size that is used when a new handle is created. + return sizeof(LRUHandle) - 1 + key_length; +#endif + } + } + + // Calculate the memory usage by metadata. 
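+ // total_charge = caller-supplied charge + metadata charge from
+ // CalcuMetaCharge() above; this is what the entry counts against the
+ // cache capacity.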
+ inline void CalcTotalCharge( + size_t charge, CacheMetadataChargePolicy metadata_charge_policy) { + total_charge = charge + CalcuMetaCharge(metadata_charge_policy); + } + + inline size_t GetCharge( + CacheMetadataChargePolicy metadata_charge_policy) const { + size_t meta_charge = CalcuMetaCharge(metadata_charge_policy); + assert(total_charge >= meta_charge); + return total_charge - meta_charge; + } +}; + +// We provide our own simple hash table since it removes a whole bunch +// of porting hacks and is also faster than some of the built-in hash +// table implementations in some of the compiler/runtime combinations +// we have tested. E.g., readrandom speeds up by ~5% over the g++ +// 4.4.3's builtin hashtable. +class LRUHandleTable { + public: + explicit LRUHandleTable(int max_upper_hash_bits); + ~LRUHandleTable(); + + LRUHandle* Lookup(const Slice& key, uint32_t hash); + LRUHandle* Insert(LRUHandle* h); + LRUHandle* Remove(const Slice& key, uint32_t hash); + + template <typename T> + void ApplyToEntriesRange(T func, size_t index_begin, size_t index_end) { + for (size_t i = index_begin; i < index_end; i++) { + LRUHandle* h = list_[i]; + while (h != nullptr) { + auto n = h->next_hash; + assert(h->InCache()); + func(h); + h = n; + } + } + } + + int GetLengthBits() const { return length_bits_; } + + size_t GetOccupancyCount() const { return elems_; } + + private: + // Return a pointer to slot that points to a cache entry that + // matches key/hash. If there is no such cache entry, return a + // pointer to the trailing slot in the corresponding linked list. + LRUHandle** FindPointer(const Slice& key, uint32_t hash); + + void Resize(); + + // Number of hash bits (upper because lower bits used for sharding) + // used for table index. Length == 1 << length_bits_ + int length_bits_; + + // The table consists of an array of buckets where each bucket is + // a linked list of cache entries that hash into the bucket. + std::unique_ptr<LRUHandle*[]> list_; + + // Number of elements currently in the table. + uint32_t elems_; + + // Set from max_upper_hash_bits (see constructor). + const int max_length_bits_; +}; + +// A single shard of sharded cache. +class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase { + public: + LRUCacheShard(size_t capacity, bool strict_capacity_limit, + double high_pri_pool_ratio, double low_pri_pool_ratio, + bool use_adaptive_mutex, + CacheMetadataChargePolicy metadata_charge_policy, + int max_upper_hash_bits, SecondaryCache* secondary_cache); + + public: // Type definitions expected as parameter to ShardedCache + using HandleImpl = LRUHandle; + using HashVal = uint32_t; + using HashCref = uint32_t; + + public: // Function definitions expected as parameter to ShardedCache + static inline HashVal ComputeHash(const Slice& key) { + return Lower32of64(GetSliceNPHash64(key)); + } + + // Separate from constructor so caller can easily make an array of LRUCache + // if current usage is more than new capacity, the function will attempt to + // free the needed space. + void SetCapacity(size_t capacity); + + // Set the flag to reject insertion if cache if full. + void SetStrictCapacityLimit(bool strict_capacity_limit); + + // Set percentage of capacity reserved for high-pri cache entries. + void SetHighPriorityPoolRatio(double high_pri_pool_ratio); + + // Set percentage of capacity reserved for low-pri cache entries. + void SetLowPriorityPoolRatio(double low_pri_pool_ratio); + + // Like Cache methods, but with an extra "hash" parameter. 
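+ // The 32-bit hash is computed once per key by the ShardedCache layer using
+ // ComputeHash() above; the lower bits pick the shard and the upper bits
+ // index this shard's LRUHandleTable. A caller normally goes through the
+ // public Cache API instead, roughly like the following illustrative sketch
+ // (placeholder key/value_ptr/charge/deleter, using the deleter-based
+ // Cache::Insert overload):
+ //   std::shared_ptr<Cache> c = NewLRUCache(64 << 20);
+ //   Status s = c->Insert(key, value_ptr, charge, deleter);
+ //   Cache::Handle* h = c->Lookup(key);
+ //   if (h != nullptr) {
+ //     void* v = c->Value(h);
+ //     c->Release(h);
+ //   }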
+ inline Status Insert(const Slice& key, uint32_t hash, void* value, + size_t charge, Cache::DeleterFn deleter, + LRUHandle** handle, Cache::Priority priority) { + return Insert(key, hash, value, charge, deleter, nullptr, handle, priority); + } + inline Status Insert(const Slice& key, uint32_t hash, void* value, + const Cache::CacheItemHelper* helper, size_t charge, + LRUHandle** handle, Cache::Priority priority) { + assert(helper); + return Insert(key, hash, value, charge, nullptr, helper, handle, priority); + } + // If helper_cb is null, the values of the following arguments don't matter. + LRUHandle* Lookup(const Slice& key, uint32_t hash, + const Cache::CacheItemHelper* helper, + const Cache::CreateCallback& create_cb, + Cache::Priority priority, bool wait, Statistics* stats); + inline LRUHandle* Lookup(const Slice& key, uint32_t hash) { + return Lookup(key, hash, nullptr, nullptr, Cache::Priority::LOW, true, + nullptr); + } + bool Release(LRUHandle* handle, bool useful, bool erase_if_last_ref); + bool IsReady(LRUHandle* /*handle*/); + void Wait(LRUHandle* /*handle*/) {} + bool Ref(LRUHandle* handle); + void Erase(const Slice& key, uint32_t hash); + + // Although in some platforms the update of size_t is atomic, to make sure + // GetUsage() and GetPinnedUsage() work correctly under any platform, we'll + // protect them with mutex_. + + size_t GetUsage() const; + size_t GetPinnedUsage() const; + size_t GetOccupancyCount() const; + size_t GetTableAddressCount() const; + + void ApplyToSomeEntries( + const std::function<void(const Slice& key, void* value, size_t charge, + DeleterFn deleter)>& callback, + size_t average_entries_per_lock, size_t* state); + + void EraseUnRefEntries(); + + public: // other function definitions + void TEST_GetLRUList(LRUHandle** lru, LRUHandle** lru_low_pri, + LRUHandle** lru_bottom_pri); + + // Retrieves number of elements in LRU, for unit test purpose only. + // Not threadsafe. + size_t TEST_GetLRUSize(); + + // Retrieves high pri pool ratio + double GetHighPriPoolRatio(); + + // Retrieves low pri pool ratio + double GetLowPriPoolRatio(); + + void AppendPrintableOptions(std::string& /*str*/) const; + + private: + friend class LRUCache; + // Insert an item into the hash table and, if handle is null, insert into + // the LRU list. Older items are evicted as necessary. If the cache is full + // and free_handle_on_fail is true, the item is deleted and handle is set to + // nullptr. + Status InsertItem(LRUHandle* item, LRUHandle** handle, + bool free_handle_on_fail); + Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge, + DeleterFn deleter, const Cache::CacheItemHelper* helper, + LRUHandle** handle, Cache::Priority priority); + // Promote an item looked up from the secondary cache to the LRU cache. + // The item may be still in the secondary cache. + // It is only inserted into the hash table and not the LRU list, and only + // if the cache is not at full capacity, as is the case during Insert. The + // caller should hold a reference on the LRUHandle. When the caller releases + // the last reference, the item is added to the LRU list. + // The item is promoted to the high pri or low pri pool as specified by the + // caller in Lookup. + void Promote(LRUHandle* e); + void LRU_Remove(LRUHandle* e); + void LRU_Insert(LRUHandle* e); + + // Overflow the last entry in high-pri pool to low-pri pool until size of + // high-pri pool is no larger than the size specify by high_pri_pool_pct. 
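+ // (and, likewise, overflow the last entry in the low-pri pool to the
+ // bottom-pri pool while low-pri usage exceeds low_pri_pool_capacity_).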
+ void MaintainPoolSize(); + + // Free some space following strict LRU policy until enough space + // to hold (usage_ + charge) is freed or the lru list is empty + // This function is not thread safe - it needs to be executed while + // holding the mutex_. + void EvictFromLRU(size_t charge, autovector<LRUHandle*>* deleted); + + // Try to insert the evicted handles into the secondary cache. + void TryInsertIntoSecondaryCache(autovector<LRUHandle*> evicted_handles); + + // Initialized before use. + size_t capacity_; + + // Memory size for entries in high-pri pool. + size_t high_pri_pool_usage_; + + // Memory size for entries in low-pri pool. + size_t low_pri_pool_usage_; + + // Whether to reject insertion if cache reaches its full capacity. + bool strict_capacity_limit_; + + // Ratio of capacity reserved for high priority cache entries. + double high_pri_pool_ratio_; + + // High-pri pool size, equals to capacity * high_pri_pool_ratio. + // Remember the value to avoid recomputing each time. + double high_pri_pool_capacity_; + + // Ratio of capacity reserved for low priority cache entries. + double low_pri_pool_ratio_; + + // Low-pri pool size, equals to capacity * low_pri_pool_ratio. + // Remember the value to avoid recomputing each time. + double low_pri_pool_capacity_; + + // Dummy head of LRU list. + // lru.prev is newest entry, lru.next is oldest entry. + // LRU contains items which can be evicted, ie reference only by cache + LRUHandle lru_; + + // Pointer to head of low-pri pool in LRU list. + LRUHandle* lru_low_pri_; + + // Pointer to head of bottom-pri pool in LRU list. + LRUHandle* lru_bottom_pri_; + + // ------------^^^^^^^^^^^^^----------- + // Not frequently modified data members + // ------------------------------------ + // + // We separate data members that are updated frequently from the ones that + // are not frequently updated so that they don't share the same cache line + // which will lead into false cache sharing + // + // ------------------------------------ + // Frequently modified data members + // ------------vvvvvvvvvvvvv----------- + LRUHandleTable table_; + + // Memory size for entries residing in the cache. + size_t usage_; + + // Memory size for entries residing only in the LRU list. + size_t lru_usage_; + + // mutex_ protects the following state. + // We don't count mutex_ as the cache's internal state so semantically we + // don't mind mutex_ invoking the non-const actions. + mutable DMutex mutex_; + + // Owned by LRUCache + SecondaryCache* secondary_cache_; +}; + +class LRUCache +#ifdef NDEBUG + final +#endif + : public ShardedCache<LRUCacheShard> { + public: + LRUCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, + double high_pri_pool_ratio, double low_pri_pool_ratio, + std::shared_ptr<MemoryAllocator> memory_allocator = nullptr, + bool use_adaptive_mutex = kDefaultToAdaptiveMutex, + CacheMetadataChargePolicy metadata_charge_policy = + kDontChargeCacheMetadata, + std::shared_ptr<SecondaryCache> secondary_cache = nullptr); + const char* Name() const override { return "LRUCache"; } + void* Value(Handle* handle) override; + size_t GetCharge(Handle* handle) const override; + DeleterFn GetDeleter(Handle* handle) const override; + void WaitAll(std::vector<Handle*>& handles) override; + + // Retrieves number of elements in LRU, for unit test purpose only. + size_t TEST_GetLRUSize(); + // Retrieves high pri pool ratio. 
+ double GetHighPriPoolRatio(); + + void AppendPrintableOptions(std::string& str) const override; + + private: + std::shared_ptr<SecondaryCache> secondary_cache_; +}; + +} // namespace lru_cache + +using LRUCache = lru_cache::LRUCache; +using LRUHandle = lru_cache::LRUHandle; +using LRUCacheShard = lru_cache::LRUCacheShard; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/cache/lru_cache_test.cc b/src/rocksdb/cache/lru_cache_test.cc new file mode 100644 index 000000000..7904a196d --- /dev/null +++ b/src/rocksdb/cache/lru_cache_test.cc @@ -0,0 +1,2624 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "cache/lru_cache.h" + +#include <string> +#include <vector> + +#include "cache/cache_key.h" +#include "cache/clock_cache.h" +#include "db/db_test_util.h" +#include "file/sst_file_manager_impl.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/cache.h" +#include "rocksdb/io_status.h" +#include "rocksdb/sst_file_manager.h" +#include "rocksdb/utilities/cache_dump_load.h" +#include "test_util/testharness.h" +#include "util/coding.h" +#include "util/random.h" +#include "utilities/cache_dump_load_impl.h" +#include "utilities/fault_injection_fs.h" + +namespace ROCKSDB_NAMESPACE { + +class LRUCacheTest : public testing::Test { + public: + LRUCacheTest() {} + ~LRUCacheTest() override { DeleteCache(); } + + void DeleteCache() { + if (cache_ != nullptr) { + cache_->~LRUCacheShard(); + port::cacheline_aligned_free(cache_); + cache_ = nullptr; + } + } + + void NewCache(size_t capacity, double high_pri_pool_ratio = 0.0, + double low_pri_pool_ratio = 1.0, + bool use_adaptive_mutex = kDefaultToAdaptiveMutex) { + DeleteCache(); + cache_ = reinterpret_cast<LRUCacheShard*>( + port::cacheline_aligned_alloc(sizeof(LRUCacheShard))); + new (cache_) LRUCacheShard(capacity, /*strict_capacity_limit=*/false, + high_pri_pool_ratio, low_pri_pool_ratio, + use_adaptive_mutex, kDontChargeCacheMetadata, + /*max_upper_hash_bits=*/24, + /*secondary_cache=*/nullptr); + } + + void Insert(const std::string& key, + Cache::Priority priority = Cache::Priority::LOW) { + EXPECT_OK(cache_->Insert(key, 0 /*hash*/, nullptr /*value*/, 1 /*charge*/, + nullptr /*deleter*/, nullptr /*handle*/, + priority)); + } + + void Insert(char key, Cache::Priority priority = Cache::Priority::LOW) { + Insert(std::string(1, key), priority); + } + + bool Lookup(const std::string& key) { + auto handle = cache_->Lookup(key, 0 /*hash*/); + if (handle) { + cache_->Release(handle, true /*useful*/, false /*erase*/); + return true; + } + return false; + } + + bool Lookup(char key) { return Lookup(std::string(1, key)); } + + void Erase(const std::string& key) { cache_->Erase(key, 0 /*hash*/); } + + void ValidateLRUList(std::vector<std::string> keys, + size_t num_high_pri_pool_keys = 0, + size_t num_low_pri_pool_keys = 0, + size_t num_bottom_pri_pool_keys = 0) { + LRUHandle* lru; + LRUHandle* lru_low_pri; + LRUHandle* lru_bottom_pri; + cache_->TEST_GetLRUList(&lru, &lru_low_pri, &lru_bottom_pri); + + LRUHandle* iter = lru; + + bool in_low_pri_pool = false; + bool in_high_pri_pool = false; + + size_t high_pri_pool_keys = 0; + size_t low_pri_pool_keys = 0; + size_t bottom_pri_pool_keys = 0; + + if (iter == lru_bottom_pri) { + in_low_pri_pool = true; + in_high_pri_pool = false; + } + if (iter == lru_low_pri) { 
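      // lru_low_pri_ still points at the dummy head, i.e. the bottom-pri and
      // low-pri pools are both empty, so iteration starts directly in the
      // high-pri region.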
+ in_low_pri_pool = false; + in_high_pri_pool = true; + } + + for (const auto& key : keys) { + iter = iter->next; + ASSERT_NE(lru, iter); + ASSERT_EQ(key, iter->key().ToString()); + ASSERT_EQ(in_high_pri_pool, iter->InHighPriPool()); + ASSERT_EQ(in_low_pri_pool, iter->InLowPriPool()); + if (in_high_pri_pool) { + ASSERT_FALSE(iter->InLowPriPool()); + high_pri_pool_keys++; + } else if (in_low_pri_pool) { + ASSERT_FALSE(iter->InHighPriPool()); + low_pri_pool_keys++; + } else { + bottom_pri_pool_keys++; + } + if (iter == lru_bottom_pri) { + ASSERT_FALSE(in_low_pri_pool); + ASSERT_FALSE(in_high_pri_pool); + in_low_pri_pool = true; + in_high_pri_pool = false; + } + if (iter == lru_low_pri) { + ASSERT_TRUE(in_low_pri_pool); + ASSERT_FALSE(in_high_pri_pool); + in_low_pri_pool = false; + in_high_pri_pool = true; + } + } + ASSERT_EQ(lru, iter->next); + ASSERT_FALSE(in_low_pri_pool); + ASSERT_TRUE(in_high_pri_pool); + ASSERT_EQ(num_high_pri_pool_keys, high_pri_pool_keys); + ASSERT_EQ(num_low_pri_pool_keys, low_pri_pool_keys); + ASSERT_EQ(num_bottom_pri_pool_keys, bottom_pri_pool_keys); + } + + private: + LRUCacheShard* cache_ = nullptr; +}; + +TEST_F(LRUCacheTest, BasicLRU) { + NewCache(5); + for (char ch = 'a'; ch <= 'e'; ch++) { + Insert(ch); + } + ValidateLRUList({"a", "b", "c", "d", "e"}, 0, 5); + for (char ch = 'x'; ch <= 'z'; ch++) { + Insert(ch); + } + ValidateLRUList({"d", "e", "x", "y", "z"}, 0, 5); + ASSERT_FALSE(Lookup("b")); + ValidateLRUList({"d", "e", "x", "y", "z"}, 0, 5); + ASSERT_TRUE(Lookup("e")); + ValidateLRUList({"d", "x", "y", "z", "e"}, 0, 5); + ASSERT_TRUE(Lookup("z")); + ValidateLRUList({"d", "x", "y", "e", "z"}, 0, 5); + Erase("x"); + ValidateLRUList({"d", "y", "e", "z"}, 0, 4); + ASSERT_TRUE(Lookup("d")); + ValidateLRUList({"y", "e", "z", "d"}, 0, 4); + Insert("u"); + ValidateLRUList({"y", "e", "z", "d", "u"}, 0, 5); + Insert("v"); + ValidateLRUList({"e", "z", "d", "u", "v"}, 0, 5); +} + +TEST_F(LRUCacheTest, LowPriorityMidpointInsertion) { + // Allocate 2 cache entries to high-pri pool and 3 to low-pri pool. + NewCache(5, /* high_pri_pool_ratio */ 0.40, /* low_pri_pool_ratio */ 0.60); + + Insert("a", Cache::Priority::LOW); + Insert("b", Cache::Priority::LOW); + Insert("c", Cache::Priority::LOW); + Insert("x", Cache::Priority::HIGH); + Insert("y", Cache::Priority::HIGH); + ValidateLRUList({"a", "b", "c", "x", "y"}, 2, 3); + + // Low-pri entries inserted to the tail of low-pri list (the midpoint). + // After lookup, it will move to the tail of the full list. + Insert("d", Cache::Priority::LOW); + ValidateLRUList({"b", "c", "d", "x", "y"}, 2, 3); + ASSERT_TRUE(Lookup("d")); + ValidateLRUList({"b", "c", "x", "y", "d"}, 2, 3); + + // High-pri entries will be inserted to the tail of full list. + Insert("z", Cache::Priority::HIGH); + ValidateLRUList({"c", "x", "y", "d", "z"}, 2, 3); +} + +TEST_F(LRUCacheTest, BottomPriorityMidpointInsertion) { + // Allocate 2 cache entries to high-pri pool and 2 to low-pri pool. + NewCache(6, /* high_pri_pool_ratio */ 0.35, /* low_pri_pool_ratio */ 0.35); + + Insert("a", Cache::Priority::BOTTOM); + Insert("b", Cache::Priority::BOTTOM); + Insert("i", Cache::Priority::LOW); + Insert("j", Cache::Priority::LOW); + Insert("x", Cache::Priority::HIGH); + Insert("y", Cache::Priority::HIGH); + ValidateLRUList({"a", "b", "i", "j", "x", "y"}, 2, 2, 2); + + // Low-pri entries will be inserted to the tail of low-pri list (the + // midpoint). After lookup, 'k' will move to the tail of the full list, and + // 'x' will spill over to the low-pri pool. 
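  // Illustration (derived from the assertions below; oldest -> newest, with
  // '|' separating the bottom / low / high pools):
  //   before:             a b | i j | x y
  //   Insert("k", LOW):   b i | j k | x y   ('a' evicted)
  //   Lookup("k"):        b i | j x | y k   ('k' to MRU end, 'x' spills down)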
+ Insert("k", Cache::Priority::LOW); + ValidateLRUList({"b", "i", "j", "k", "x", "y"}, 2, 2, 2); + ASSERT_TRUE(Lookup("k")); + ValidateLRUList({"b", "i", "j", "x", "y", "k"}, 2, 2, 2); + + // High-pri entries will be inserted to the tail of full list. Although y was + // inserted with high priority, it got spilled over to the low-pri pool. As + // a result, j also got spilled over to the bottom-pri pool. + Insert("z", Cache::Priority::HIGH); + ValidateLRUList({"i", "j", "x", "y", "k", "z"}, 2, 2, 2); + Erase("x"); + ValidateLRUList({"i", "j", "y", "k", "z"}, 2, 1, 2); + Erase("y"); + ValidateLRUList({"i", "j", "k", "z"}, 2, 0, 2); + + // Bottom-pri entries will be inserted to the tail of bottom-pri list. + Insert("c", Cache::Priority::BOTTOM); + ValidateLRUList({"i", "j", "c", "k", "z"}, 2, 0, 3); + Insert("d", Cache::Priority::BOTTOM); + ValidateLRUList({"i", "j", "c", "d", "k", "z"}, 2, 0, 4); + Insert("e", Cache::Priority::BOTTOM); + ValidateLRUList({"j", "c", "d", "e", "k", "z"}, 2, 0, 4); + + // Low-pri entries will be inserted to the tail of low-pri list (the + // midpoint). + Insert("l", Cache::Priority::LOW); + ValidateLRUList({"c", "d", "e", "l", "k", "z"}, 2, 1, 3); + Insert("m", Cache::Priority::LOW); + ValidateLRUList({"d", "e", "l", "m", "k", "z"}, 2, 2, 2); + + Erase("k"); + ValidateLRUList({"d", "e", "l", "m", "z"}, 1, 2, 2); + Erase("z"); + ValidateLRUList({"d", "e", "l", "m"}, 0, 2, 2); + + // Bottom-pri entries will be inserted to the tail of bottom-pri list. + Insert("f", Cache::Priority::BOTTOM); + ValidateLRUList({"d", "e", "f", "l", "m"}, 0, 2, 3); + Insert("g", Cache::Priority::BOTTOM); + ValidateLRUList({"d", "e", "f", "g", "l", "m"}, 0, 2, 4); + + // High-pri entries will be inserted to the tail of full list. + Insert("o", Cache::Priority::HIGH); + ValidateLRUList({"e", "f", "g", "l", "m", "o"}, 1, 2, 3); + Insert("p", Cache::Priority::HIGH); + ValidateLRUList({"f", "g", "l", "m", "o", "p"}, 2, 2, 2); +} + +TEST_F(LRUCacheTest, EntriesWithPriority) { + // Allocate 2 cache entries to high-pri pool and 2 to low-pri pool. + NewCache(6, /* high_pri_pool_ratio */ 0.35, /* low_pri_pool_ratio */ 0.35); + + Insert("a", Cache::Priority::LOW); + Insert("b", Cache::Priority::LOW); + ValidateLRUList({"a", "b"}, 0, 2, 0); + // Low-pri entries can overflow to bottom-pri pool. + Insert("c", Cache::Priority::LOW); + ValidateLRUList({"a", "b", "c"}, 0, 2, 1); + + // Bottom-pri entries can take high-pri pool capacity if available + Insert("t", Cache::Priority::LOW); + Insert("u", Cache::Priority::LOW); + ValidateLRUList({"a", "b", "c", "t", "u"}, 0, 2, 3); + Insert("v", Cache::Priority::LOW); + ValidateLRUList({"a", "b", "c", "t", "u", "v"}, 0, 2, 4); + Insert("w", Cache::Priority::LOW); + ValidateLRUList({"b", "c", "t", "u", "v", "w"}, 0, 2, 4); + + Insert("X", Cache::Priority::HIGH); + Insert("Y", Cache::Priority::HIGH); + ValidateLRUList({"t", "u", "v", "w", "X", "Y"}, 2, 2, 2); + + // After lookup, the high-pri entry 'X' got spilled over to the low-pri pool. + // The low-pri entry 'v' got spilled over to the bottom-pri pool. + Insert("Z", Cache::Priority::HIGH); + ValidateLRUList({"u", "v", "w", "X", "Y", "Z"}, 2, 2, 2); + + // Low-pri entries will be inserted to head of low-pri pool. + Insert("a", Cache::Priority::LOW); + ValidateLRUList({"v", "w", "X", "a", "Y", "Z"}, 2, 2, 2); + + // After lookup, the high-pri entry 'Y' got spilled over to the low-pri pool. + // The low-pri entry 'X' got spilled over to the bottom-pri pool. 
+ ASSERT_TRUE(Lookup("v")); + ValidateLRUList({"w", "X", "a", "Y", "Z", "v"}, 2, 2, 2); + + // After lookup, the high-pri entry 'Z' got spilled over to the low-pri pool. + // The low-pri entry 'a' got spilled over to the bottom-pri pool. + ASSERT_TRUE(Lookup("X")); + ValidateLRUList({"w", "a", "Y", "Z", "v", "X"}, 2, 2, 2); + + // After lookup, the low pri entry 'Z' got promoted back to high-pri pool. The + // high-pri entry 'v' got spilled over to the low-pri pool. + ASSERT_TRUE(Lookup("Z")); + ValidateLRUList({"w", "a", "Y", "v", "X", "Z"}, 2, 2, 2); + + Erase("Y"); + ValidateLRUList({"w", "a", "v", "X", "Z"}, 2, 1, 2); + Erase("X"); + ValidateLRUList({"w", "a", "v", "Z"}, 1, 1, 2); + + Insert("d", Cache::Priority::LOW); + Insert("e", Cache::Priority::LOW); + ValidateLRUList({"w", "a", "v", "d", "e", "Z"}, 1, 2, 3); + + Insert("f", Cache::Priority::LOW); + Insert("g", Cache::Priority::LOW); + ValidateLRUList({"v", "d", "e", "f", "g", "Z"}, 1, 2, 3); + ASSERT_TRUE(Lookup("d")); + ValidateLRUList({"v", "e", "f", "g", "Z", "d"}, 2, 2, 2); + + // Erase some entries. + Erase("e"); + Erase("f"); + Erase("Z"); + ValidateLRUList({"v", "g", "d"}, 1, 1, 1); + + // Bottom-pri entries can take low- and high-pri pool capacity if available + Insert("o", Cache::Priority::BOTTOM); + ValidateLRUList({"v", "o", "g", "d"}, 1, 1, 2); + Insert("p", Cache::Priority::BOTTOM); + ValidateLRUList({"v", "o", "p", "g", "d"}, 1, 1, 3); + Insert("q", Cache::Priority::BOTTOM); + ValidateLRUList({"v", "o", "p", "q", "g", "d"}, 1, 1, 4); + + // High-pri entries can overflow to low-pri pool, and bottom-pri entries will + // be evicted. + Insert("x", Cache::Priority::HIGH); + ValidateLRUList({"o", "p", "q", "g", "d", "x"}, 2, 1, 3); + Insert("y", Cache::Priority::HIGH); + ValidateLRUList({"p", "q", "g", "d", "x", "y"}, 2, 2, 2); + Insert("z", Cache::Priority::HIGH); + ValidateLRUList({"q", "g", "d", "x", "y", "z"}, 2, 2, 2); + + // 'g' is bottom-pri before this lookup, it will be inserted to head of + // high-pri pool after lookup. + ASSERT_TRUE(Lookup("g")); + ValidateLRUList({"q", "d", "x", "y", "z", "g"}, 2, 2, 2); + + // High-pri entries will be inserted to head of high-pri pool after lookup. + ASSERT_TRUE(Lookup("z")); + ValidateLRUList({"q", "d", "x", "y", "g", "z"}, 2, 2, 2); + + // Bottom-pri entries will be inserted to head of high-pri pool after lookup. + ASSERT_TRUE(Lookup("d")); + ValidateLRUList({"q", "x", "y", "g", "z", "d"}, 2, 2, 2); + + // Bottom-pri entries will be inserted to the tail of bottom-pri list. + Insert("m", Cache::Priority::BOTTOM); + ValidateLRUList({"x", "m", "y", "g", "z", "d"}, 2, 2, 2); + + // Bottom-pri entries will be inserted to head of high-pri pool after lookup. 
+ ASSERT_TRUE(Lookup("m")); + ValidateLRUList({"x", "y", "g", "z", "d", "m"}, 2, 2, 2); +} + +namespace clock_cache { + +class ClockCacheTest : public testing::Test { + public: + using Shard = HyperClockCache::Shard; + using Table = HyperClockTable; + using HandleImpl = Shard::HandleImpl; + + ClockCacheTest() {} + ~ClockCacheTest() override { DeleteShard(); } + + void DeleteShard() { + if (shard_ != nullptr) { + shard_->~ClockCacheShard(); + port::cacheline_aligned_free(shard_); + shard_ = nullptr; + } + } + + void NewShard(size_t capacity, bool strict_capacity_limit = true) { + DeleteShard(); + shard_ = + reinterpret_cast<Shard*>(port::cacheline_aligned_alloc(sizeof(Shard))); + + Table::Opts opts; + opts.estimated_value_size = 1; + new (shard_) + Shard(capacity, strict_capacity_limit, kDontChargeCacheMetadata, opts); + } + + Status Insert(const UniqueId64x2& hashed_key, + Cache::Priority priority = Cache::Priority::LOW) { + return shard_->Insert(TestKey(hashed_key), hashed_key, nullptr /*value*/, + 1 /*charge*/, nullptr /*deleter*/, nullptr /*handle*/, + priority); + } + + Status Insert(char key, Cache::Priority priority = Cache::Priority::LOW) { + return Insert(TestHashedKey(key), priority); + } + + Status InsertWithLen(char key, size_t len) { + std::string skey(len, key); + return shard_->Insert(skey, TestHashedKey(key), nullptr /*value*/, + 1 /*charge*/, nullptr /*deleter*/, nullptr /*handle*/, + Cache::Priority::LOW); + } + + bool Lookup(const Slice& key, const UniqueId64x2& hashed_key, + bool useful = true) { + auto handle = shard_->Lookup(key, hashed_key); + if (handle) { + shard_->Release(handle, useful, /*erase_if_last_ref=*/false); + return true; + } + return false; + } + + bool Lookup(const UniqueId64x2& hashed_key, bool useful = true) { + return Lookup(TestKey(hashed_key), hashed_key, useful); + } + + bool Lookup(char key, bool useful = true) { + return Lookup(TestHashedKey(key), useful); + } + + void Erase(char key) { + UniqueId64x2 hashed_key = TestHashedKey(key); + shard_->Erase(TestKey(hashed_key), hashed_key); + } + + static inline Slice TestKey(const UniqueId64x2& hashed_key) { + return Slice(reinterpret_cast<const char*>(&hashed_key), 16U); + } + + static inline UniqueId64x2 TestHashedKey(char key) { + // For testing hash near-collision behavior, put the variance in + // hashed_key in bits that are unlikely to be used as hash bits. 
+ return {(static_cast<uint64_t>(key) << 56) + 1234U, 5678U}; + } + + Shard* shard_ = nullptr; +}; + +TEST_F(ClockCacheTest, Misc) { + NewShard(3); + + // Key size stuff + EXPECT_OK(InsertWithLen('a', 16)); + EXPECT_NOK(InsertWithLen('b', 15)); + EXPECT_OK(InsertWithLen('b', 16)); + EXPECT_NOK(InsertWithLen('c', 17)); + EXPECT_NOK(InsertWithLen('d', 1000)); + EXPECT_NOK(InsertWithLen('e', 11)); + EXPECT_NOK(InsertWithLen('f', 0)); + + // Some of this is motivated by code coverage + std::string wrong_size_key(15, 'x'); + EXPECT_FALSE(Lookup(wrong_size_key, TestHashedKey('x'))); + EXPECT_FALSE(shard_->Ref(nullptr)); + EXPECT_FALSE(shard_->Release(nullptr)); + shard_->Erase(wrong_size_key, TestHashedKey('x')); // no-op +} + +TEST_F(ClockCacheTest, Limits) { + constexpr size_t kCapacity = 3; + NewShard(kCapacity, false /*strict_capacity_limit*/); + for (bool strict_capacity_limit : {false, true, false}) { + SCOPED_TRACE("strict_capacity_limit = " + + std::to_string(strict_capacity_limit)); + + // Also tests switching between strict limit and not + shard_->SetStrictCapacityLimit(strict_capacity_limit); + + UniqueId64x2 hkey = TestHashedKey('x'); + + // Single entry charge beyond capacity + { + Status s = shard_->Insert(TestKey(hkey), hkey, nullptr /*value*/, + 5 /*charge*/, nullptr /*deleter*/, + nullptr /*handle*/, Cache::Priority::LOW); + if (strict_capacity_limit) { + EXPECT_TRUE(s.IsMemoryLimit()); + } else { + EXPECT_OK(s); + } + } + + // Single entry fills capacity + { + HandleImpl* h; + ASSERT_OK(shard_->Insert(TestKey(hkey), hkey, nullptr /*value*/, + 3 /*charge*/, nullptr /*deleter*/, &h, + Cache::Priority::LOW)); + // Try to insert more + Status s = Insert('a'); + if (strict_capacity_limit) { + EXPECT_TRUE(s.IsMemoryLimit()); + } else { + EXPECT_OK(s); + } + // Release entry filling capacity. + // Cover useful = false case. + shard_->Release(h, false /*useful*/, false /*erase_if_last_ref*/); + } + + // Insert more than table size can handle to exceed occupancy limit. + // (Cleverly using mostly zero-charge entries, but some non-zero to + // verify usage tracking on detached entries.) + { + size_t n = shard_->GetTableAddressCount() + 1; + std::unique_ptr<HandleImpl* []> ha { new HandleImpl* [n] {} }; + Status s; + for (size_t i = 0; i < n && s.ok(); ++i) { + hkey[1] = i; + s = shard_->Insert(TestKey(hkey), hkey, nullptr /*value*/, + (i + kCapacity < n) ? 
0 : 1 /*charge*/, + nullptr /*deleter*/, &ha[i], Cache::Priority::LOW); + if (i == 0) { + EXPECT_OK(s); + } + } + if (strict_capacity_limit) { + EXPECT_TRUE(s.IsMemoryLimit()); + } else { + EXPECT_OK(s); + } + // Same result if not keeping a reference + s = Insert('a'); + if (strict_capacity_limit) { + EXPECT_TRUE(s.IsMemoryLimit()); + } else { + EXPECT_OK(s); + } + + // Regardless, we didn't allow table to actually get full + EXPECT_LT(shard_->GetOccupancyCount(), shard_->GetTableAddressCount()); + + // Release handles + for (size_t i = 0; i < n; ++i) { + if (ha[i]) { + shard_->Release(ha[i]); + } + } + } + } +} + +TEST_F(ClockCacheTest, ClockEvictionTest) { + for (bool strict_capacity_limit : {false, true}) { + SCOPED_TRACE("strict_capacity_limit = " + + std::to_string(strict_capacity_limit)); + + NewShard(6, strict_capacity_limit); + EXPECT_OK(Insert('a', Cache::Priority::BOTTOM)); + EXPECT_OK(Insert('b', Cache::Priority::LOW)); + EXPECT_OK(Insert('c', Cache::Priority::HIGH)); + EXPECT_OK(Insert('d', Cache::Priority::BOTTOM)); + EXPECT_OK(Insert('e', Cache::Priority::LOW)); + EXPECT_OK(Insert('f', Cache::Priority::HIGH)); + + EXPECT_TRUE(Lookup('a', /*use*/ false)); + EXPECT_TRUE(Lookup('b', /*use*/ false)); + EXPECT_TRUE(Lookup('c', /*use*/ false)); + EXPECT_TRUE(Lookup('d', /*use*/ false)); + EXPECT_TRUE(Lookup('e', /*use*/ false)); + EXPECT_TRUE(Lookup('f', /*use*/ false)); + + // Ensure bottom are evicted first, even if new entries are low + EXPECT_OK(Insert('g', Cache::Priority::LOW)); + EXPECT_OK(Insert('h', Cache::Priority::LOW)); + + EXPECT_FALSE(Lookup('a', /*use*/ false)); + EXPECT_TRUE(Lookup('b', /*use*/ false)); + EXPECT_TRUE(Lookup('c', /*use*/ false)); + EXPECT_FALSE(Lookup('d', /*use*/ false)); + EXPECT_TRUE(Lookup('e', /*use*/ false)); + EXPECT_TRUE(Lookup('f', /*use*/ false)); + // Mark g & h useful + EXPECT_TRUE(Lookup('g', /*use*/ true)); + EXPECT_TRUE(Lookup('h', /*use*/ true)); + + // Then old LOW entries + EXPECT_OK(Insert('i', Cache::Priority::LOW)); + EXPECT_OK(Insert('j', Cache::Priority::LOW)); + + EXPECT_FALSE(Lookup('b', /*use*/ false)); + EXPECT_TRUE(Lookup('c', /*use*/ false)); + EXPECT_FALSE(Lookup('e', /*use*/ false)); + EXPECT_TRUE(Lookup('f', /*use*/ false)); + // Mark g & h useful once again + EXPECT_TRUE(Lookup('g', /*use*/ true)); + EXPECT_TRUE(Lookup('h', /*use*/ true)); + EXPECT_TRUE(Lookup('i', /*use*/ false)); + EXPECT_TRUE(Lookup('j', /*use*/ false)); + + // Then old HIGH entries + EXPECT_OK(Insert('k', Cache::Priority::LOW)); + EXPECT_OK(Insert('l', Cache::Priority::LOW)); + + EXPECT_FALSE(Lookup('c', /*use*/ false)); + EXPECT_FALSE(Lookup('f', /*use*/ false)); + EXPECT_TRUE(Lookup('g', /*use*/ false)); + EXPECT_TRUE(Lookup('h', /*use*/ false)); + EXPECT_TRUE(Lookup('i', /*use*/ false)); + EXPECT_TRUE(Lookup('j', /*use*/ false)); + EXPECT_TRUE(Lookup('k', /*use*/ false)); + EXPECT_TRUE(Lookup('l', /*use*/ false)); + + // Then the (roughly) least recently useful + EXPECT_OK(Insert('m', Cache::Priority::HIGH)); + EXPECT_OK(Insert('n', Cache::Priority::HIGH)); + + EXPECT_TRUE(Lookup('g', /*use*/ false)); + EXPECT_TRUE(Lookup('h', /*use*/ false)); + EXPECT_FALSE(Lookup('i', /*use*/ false)); + EXPECT_FALSE(Lookup('j', /*use*/ false)); + EXPECT_TRUE(Lookup('k', /*use*/ false)); + EXPECT_TRUE(Lookup('l', /*use*/ false)); + + // Now try changing capacity down + shard_->SetCapacity(4); + // Insert to ensure evictions happen + EXPECT_OK(Insert('o', Cache::Priority::LOW)); + EXPECT_OK(Insert('p', Cache::Priority::LOW)); + + EXPECT_FALSE(Lookup('g', 
/*use*/ false)); + EXPECT_FALSE(Lookup('h', /*use*/ false)); + EXPECT_FALSE(Lookup('k', /*use*/ false)); + EXPECT_FALSE(Lookup('l', /*use*/ false)); + EXPECT_TRUE(Lookup('m', /*use*/ false)); + EXPECT_TRUE(Lookup('n', /*use*/ false)); + EXPECT_TRUE(Lookup('o', /*use*/ false)); + EXPECT_TRUE(Lookup('p', /*use*/ false)); + + // Now try changing capacity up + EXPECT_TRUE(Lookup('m', /*use*/ true)); + EXPECT_TRUE(Lookup('n', /*use*/ true)); + shard_->SetCapacity(6); + EXPECT_OK(Insert('q', Cache::Priority::HIGH)); + EXPECT_OK(Insert('r', Cache::Priority::HIGH)); + EXPECT_OK(Insert('s', Cache::Priority::HIGH)); + EXPECT_OK(Insert('t', Cache::Priority::HIGH)); + + EXPECT_FALSE(Lookup('o', /*use*/ false)); + EXPECT_FALSE(Lookup('p', /*use*/ false)); + EXPECT_TRUE(Lookup('m', /*use*/ false)); + EXPECT_TRUE(Lookup('n', /*use*/ false)); + EXPECT_TRUE(Lookup('q', /*use*/ false)); + EXPECT_TRUE(Lookup('r', /*use*/ false)); + EXPECT_TRUE(Lookup('s', /*use*/ false)); + EXPECT_TRUE(Lookup('t', /*use*/ false)); + } +} + +void IncrementIntDeleter(const Slice& /*key*/, void* value) { + *reinterpret_cast<int*>(value) += 1; +} + +// Testing calls to CorrectNearOverflow in Release +TEST_F(ClockCacheTest, ClockCounterOverflowTest) { + NewShard(6, /*strict_capacity_limit*/ false); + HandleImpl* h; + int deleted = 0; + UniqueId64x2 hkey = TestHashedKey('x'); + ASSERT_OK(shard_->Insert(TestKey(hkey), hkey, &deleted, 1, + IncrementIntDeleter, &h, Cache::Priority::HIGH)); + + // Some large number outstanding + shard_->TEST_RefN(h, 123456789); + // Simulate many lookup/ref + release, plenty to overflow counters + for (int i = 0; i < 10000; ++i) { + shard_->TEST_RefN(h, 1234567); + shard_->TEST_ReleaseN(h, 1234567); + } + // Mark it invisible (to reach a different CorrectNearOverflow() in Release) + shard_->Erase(TestKey(hkey), hkey); + // Simulate many more lookup/ref + release (one-by-one would be too + // expensive for unit test) + for (int i = 0; i < 10000; ++i) { + shard_->TEST_RefN(h, 1234567); + shard_->TEST_ReleaseN(h, 1234567); + } + // Free all but last 1 + shard_->TEST_ReleaseN(h, 123456789); + // Still alive + ASSERT_EQ(deleted, 0); + // Free last ref, which will finalize erasure + shard_->Release(h); + // Deleted + ASSERT_EQ(deleted, 1); +} + +// This test is mostly to exercise some corner case logic, by forcing two +// keys to have the same hash, and more +TEST_F(ClockCacheTest, CollidingInsertEraseTest) { + NewShard(6, /*strict_capacity_limit*/ false); + int deleted = 0; + UniqueId64x2 hkey1 = TestHashedKey('x'); + Slice key1 = TestKey(hkey1); + UniqueId64x2 hkey2 = TestHashedKey('y'); + Slice key2 = TestKey(hkey2); + UniqueId64x2 hkey3 = TestHashedKey('z'); + Slice key3 = TestKey(hkey3); + HandleImpl* h1; + ASSERT_OK(shard_->Insert(key1, hkey1, &deleted, 1, IncrementIntDeleter, &h1, + Cache::Priority::HIGH)); + HandleImpl* h2; + ASSERT_OK(shard_->Insert(key2, hkey2, &deleted, 1, IncrementIntDeleter, &h2, + Cache::Priority::HIGH)); + HandleImpl* h3; + ASSERT_OK(shard_->Insert(key3, hkey3, &deleted, 1, IncrementIntDeleter, &h3, + Cache::Priority::HIGH)); + + // Can repeatedly lookup+release despite the hash collision + HandleImpl* tmp_h; + for (bool erase_if_last_ref : {true, false}) { // but not last ref + tmp_h = shard_->Lookup(key1, hkey1); + ASSERT_EQ(h1, tmp_h); + ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + + tmp_h = shard_->Lookup(key2, hkey2); + ASSERT_EQ(h2, tmp_h); + ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + + tmp_h = shard_->Lookup(key3, hkey3); + ASSERT_EQ(h3, 
tmp_h); + ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + } + + // Make h1 invisible + shard_->Erase(key1, hkey1); + // Redundant erase + shard_->Erase(key1, hkey1); + + // All still alive + ASSERT_EQ(deleted, 0); + + // Invisible to Lookup + tmp_h = shard_->Lookup(key1, hkey1); + ASSERT_EQ(nullptr, tmp_h); + + // Can still find h2, h3 + for (bool erase_if_last_ref : {true, false}) { // but not last ref + tmp_h = shard_->Lookup(key2, hkey2); + ASSERT_EQ(h2, tmp_h); + ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + + tmp_h = shard_->Lookup(key3, hkey3); + ASSERT_EQ(h3, tmp_h); + ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + } + + // Also Insert with invisible entry there + ASSERT_OK(shard_->Insert(key1, hkey1, &deleted, 1, IncrementIntDeleter, + nullptr, Cache::Priority::HIGH)); + tmp_h = shard_->Lookup(key1, hkey1); + // Found but distinct handle + ASSERT_NE(nullptr, tmp_h); + ASSERT_NE(h1, tmp_h); + ASSERT_TRUE(shard_->Release(tmp_h, /*erase_if_last_ref*/ true)); + + // tmp_h deleted + ASSERT_EQ(deleted--, 1); + + // Release last ref on h1 (already invisible) + ASSERT_TRUE(shard_->Release(h1, /*erase_if_last_ref*/ false)); + + // h1 deleted + ASSERT_EQ(deleted--, 1); + h1 = nullptr; + + // Can still find h2, h3 + for (bool erase_if_last_ref : {true, false}) { // but not last ref + tmp_h = shard_->Lookup(key2, hkey2); + ASSERT_EQ(h2, tmp_h); + ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + + tmp_h = shard_->Lookup(key3, hkey3); + ASSERT_EQ(h3, tmp_h); + ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + } + + // Release last ref on h2 + ASSERT_FALSE(shard_->Release(h2, /*erase_if_last_ref*/ false)); + + // h2 still not deleted (unreferenced in cache) + ASSERT_EQ(deleted, 0); + + // Can still find it + tmp_h = shard_->Lookup(key2, hkey2); + ASSERT_EQ(h2, tmp_h); + + // Release last ref on h2, with erase + ASSERT_TRUE(shard_->Release(h2, /*erase_if_last_ref*/ true)); + + // h2 deleted + ASSERT_EQ(deleted--, 1); + tmp_h = shard_->Lookup(key2, hkey2); + ASSERT_EQ(nullptr, tmp_h); + + // Can still find h3 + for (bool erase_if_last_ref : {true, false}) { // but not last ref + tmp_h = shard_->Lookup(key3, hkey3); + ASSERT_EQ(h3, tmp_h); + ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + } + + // Release last ref on h3, without erase + ASSERT_FALSE(shard_->Release(h3, /*erase_if_last_ref*/ false)); + + // h3 still not deleted (unreferenced in cache) + ASSERT_EQ(deleted, 0); + + // Explicit erase + shard_->Erase(key3, hkey3); + + // h3 deleted + ASSERT_EQ(deleted--, 1); + tmp_h = shard_->Lookup(key3, hkey3); + ASSERT_EQ(nullptr, tmp_h); +} + +// This uses the public API to effectively test CalcHashBits etc. 
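// (Sketch of the expectation below: with power-of-two table sizes, the chosen
// address count for an estimated entry count N should land in
// [N / kLoadFactor, 2 * N / kLoadFactor]. For example, assuming a hypothetical
// kLoadFactor of 0.7 and est_count = 511.9, the minimum is roughly 731 slots,
// so a 1024-slot table satisfies both the GE and LE checks.)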
+TEST_F(ClockCacheTest, TableSizesTest) { + for (size_t est_val_size : {1U, 5U, 123U, 2345U, 345678U}) { + SCOPED_TRACE("est_val_size = " + std::to_string(est_val_size)); + for (double est_count : {1.1, 2.2, 511.9, 512.1, 2345.0}) { + SCOPED_TRACE("est_count = " + std::to_string(est_count)); + size_t capacity = static_cast<size_t>(est_val_size * est_count); + // kDontChargeCacheMetadata + auto cache = HyperClockCacheOptions( + capacity, est_val_size, /*num shard_bits*/ -1, + /*strict_capacity_limit*/ false, + /*memory_allocator*/ nullptr, kDontChargeCacheMetadata) + .MakeSharedCache(); + // Table sizes are currently only powers of two + EXPECT_GE(cache->GetTableAddressCount(), est_count / kLoadFactor); + EXPECT_LE(cache->GetTableAddressCount(), est_count / kLoadFactor * 2.0); + EXPECT_EQ(cache->GetUsage(), 0); + + // kFullChargeMetaData + // Because table sizes are currently only powers of two, sizes get + // really weird when metadata is a huge portion of capacity. For example, + // doubling the table size could cut by 90% the space available to + // values. Therefore, we omit those weird cases for now. + if (est_val_size >= 512) { + cache = HyperClockCacheOptions( + capacity, est_val_size, /*num shard_bits*/ -1, + /*strict_capacity_limit*/ false, + /*memory_allocator*/ nullptr, kFullChargeCacheMetadata) + .MakeSharedCache(); + double est_count_after_meta = + (capacity - cache->GetUsage()) * 1.0 / est_val_size; + EXPECT_GE(cache->GetTableAddressCount(), + est_count_after_meta / kLoadFactor); + EXPECT_LE(cache->GetTableAddressCount(), + est_count_after_meta / kLoadFactor * 2.0); + } + } + } +} + +} // namespace clock_cache + +class TestSecondaryCache : public SecondaryCache { + public: + // Specifies what action to take on a lookup for a particular key + enum ResultType { + SUCCESS, + // Fail lookup immediately + FAIL, + // Defer the result. 
It will returned after Wait/WaitAll is called + DEFER, + // Defer the result and eventually return failure + DEFER_AND_FAIL + }; + + using ResultMap = std::unordered_map<std::string, ResultType>; + + explicit TestSecondaryCache(size_t capacity) + : num_inserts_(0), num_lookups_(0), inject_failure_(false) { + cache_ = + NewLRUCache(capacity, 0, false, 0.5 /* high_pri_pool_ratio */, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + } + ~TestSecondaryCache() override { cache_.reset(); } + + const char* Name() const override { return "TestSecondaryCache"; } + + void InjectFailure() { inject_failure_ = true; } + + void ResetInjectFailure() { inject_failure_ = false; } + + Status Insert(const Slice& key, void* value, + const Cache::CacheItemHelper* helper) override { + if (inject_failure_) { + return Status::Corruption("Insertion Data Corrupted"); + } + CheckCacheKeyCommonPrefix(key); + size_t size; + char* buf; + Status s; + + num_inserts_++; + size = (*helper->size_cb)(value); + buf = new char[size + sizeof(uint64_t)]; + EncodeFixed64(buf, size); + s = (*helper->saveto_cb)(value, 0, size, buf + sizeof(uint64_t)); + if (!s.ok()) { + delete[] buf; + return s; + } + return cache_->Insert(key, buf, size, + [](const Slice& /*key*/, void* val) -> void { + delete[] static_cast<char*>(val); + }); + } + + std::unique_ptr<SecondaryCacheResultHandle> Lookup( + const Slice& key, const Cache::CreateCallback& create_cb, bool /*wait*/, + bool /*advise_erase*/, bool& is_in_sec_cache) override { + std::string key_str = key.ToString(); + TEST_SYNC_POINT_CALLBACK("TestSecondaryCache::Lookup", &key_str); + + std::unique_ptr<SecondaryCacheResultHandle> secondary_handle; + is_in_sec_cache = false; + ResultType type = ResultType::SUCCESS; + auto iter = result_map_.find(key.ToString()); + if (iter != result_map_.end()) { + type = iter->second; + } + if (type == ResultType::FAIL) { + return secondary_handle; + } + + Cache::Handle* handle = cache_->Lookup(key); + num_lookups_++; + if (handle) { + void* value = nullptr; + size_t charge = 0; + Status s; + if (type != ResultType::DEFER_AND_FAIL) { + char* ptr = (char*)cache_->Value(handle); + size_t size = DecodeFixed64(ptr); + ptr += sizeof(uint64_t); + s = create_cb(ptr, size, &value, &charge); + } + if (s.ok()) { + secondary_handle.reset(new TestSecondaryCacheResultHandle( + cache_.get(), handle, value, charge, type)); + is_in_sec_cache = true; + } else { + cache_->Release(handle); + } + } + return secondary_handle; + } + + bool SupportForceErase() const override { return false; } + + void Erase(const Slice& /*key*/) override {} + + void WaitAll(std::vector<SecondaryCacheResultHandle*> handles) override { + for (SecondaryCacheResultHandle* handle : handles) { + TestSecondaryCacheResultHandle* sec_handle = + static_cast<TestSecondaryCacheResultHandle*>(handle); + sec_handle->SetReady(); + } + } + + std::string GetPrintableOptions() const override { return ""; } + + void SetResultMap(ResultMap&& map) { result_map_ = std::move(map); } + + uint32_t num_inserts() { return num_inserts_; } + + uint32_t num_lookups() { return num_lookups_; } + + void CheckCacheKeyCommonPrefix(const Slice& key) { + Slice current_prefix(key.data(), OffsetableCacheKey::kCommonPrefixSize); + if (ckey_prefix_.empty()) { + ckey_prefix_ = current_prefix.ToString(); + } else { + EXPECT_EQ(ckey_prefix_, current_prefix.ToString()); + } + } + + private: + class TestSecondaryCacheResultHandle : public SecondaryCacheResultHandle { + public: + TestSecondaryCacheResultHandle(Cache* cache, 
Cache::Handle* handle, + void* value, size_t size, ResultType type) + : cache_(cache), + handle_(handle), + value_(value), + size_(size), + is_ready_(true) { + if (type != ResultType::SUCCESS) { + is_ready_ = false; + } + } + + ~TestSecondaryCacheResultHandle() override { cache_->Release(handle_); } + + bool IsReady() override { return is_ready_; } + + void Wait() override {} + + void* Value() override { + assert(is_ready_); + return value_; + } + + size_t Size() override { return Value() ? size_ : 0; } + + void SetReady() { is_ready_ = true; } + + private: + Cache* cache_; + Cache::Handle* handle_; + void* value_; + size_t size_; + bool is_ready_; + }; + + std::shared_ptr<Cache> cache_; + uint32_t num_inserts_; + uint32_t num_lookups_; + bool inject_failure_; + std::string ckey_prefix_; + ResultMap result_map_; +}; + +class DBSecondaryCacheTest : public DBTestBase { + public: + DBSecondaryCacheTest() + : DBTestBase("db_secondary_cache_test", /*env_do_fsync=*/true) { + fault_fs_.reset(new FaultInjectionTestFS(env_->GetFileSystem())); + fault_env_.reset(new CompositeEnvWrapper(env_, fault_fs_)); + } + + std::shared_ptr<FaultInjectionTestFS> fault_fs_; + std::unique_ptr<Env> fault_env_; +}; + +class LRUCacheSecondaryCacheTest : public LRUCacheTest { + public: + LRUCacheSecondaryCacheTest() : fail_create_(false) {} + ~LRUCacheSecondaryCacheTest() {} + + protected: + class TestItem { + public: + TestItem(const char* buf, size_t size) : buf_(new char[size]), size_(size) { + memcpy(buf_.get(), buf, size); + } + ~TestItem() {} + + char* Buf() { return buf_.get(); } + size_t Size() { return size_; } + std::string ToString() { return std::string(Buf(), Size()); } + + private: + std::unique_ptr<char[]> buf_; + size_t size_; + }; + + static size_t SizeCallback(void* obj) { + return reinterpret_cast<TestItem*>(obj)->Size(); + } + + static Status SaveToCallback(void* from_obj, size_t from_offset, + size_t length, void* out) { + TestItem* item = reinterpret_cast<TestItem*>(from_obj); + char* buf = item->Buf(); + EXPECT_EQ(length, item->Size()); + EXPECT_EQ(from_offset, 0); + memcpy(out, buf, length); + return Status::OK(); + } + + static void DeletionCallback(const Slice& /*key*/, void* obj) { + delete reinterpret_cast<TestItem*>(obj); + } + + static Cache::CacheItemHelper helper_; + + static Status SaveToCallbackFail(void* /*obj*/, size_t /*offset*/, + size_t /*size*/, void* /*out*/) { + return Status::NotSupported(); + } + + static Cache::CacheItemHelper helper_fail_; + + Cache::CreateCallback test_item_creator = [&](const void* buf, size_t size, + void** out_obj, + size_t* charge) -> Status { + if (fail_create_) { + return Status::NotSupported(); + } + *out_obj = reinterpret_cast<void*>(new TestItem((char*)buf, size)); + *charge = size; + return Status::OK(); + }; + + void SetFailCreate(bool fail) { fail_create_ = fail; } + + private: + bool fail_create_; +}; + +Cache::CacheItemHelper LRUCacheSecondaryCacheTest::helper_( + LRUCacheSecondaryCacheTest::SizeCallback, + LRUCacheSecondaryCacheTest::SaveToCallback, + LRUCacheSecondaryCacheTest::DeletionCallback); + +Cache::CacheItemHelper LRUCacheSecondaryCacheTest::helper_fail_( + LRUCacheSecondaryCacheTest::SizeCallback, + LRUCacheSecondaryCacheTest::SaveToCallbackFail, + LRUCacheSecondaryCacheTest::DeletionCallback); + +TEST_F(LRUCacheSecondaryCacheTest, BasicTest) { + LRUCacheOptions opts(1024 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, + 0.5 /* high_pri_pool_ratio */, + nullptr /* memory_allocator */, 
kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr<TestSecondaryCache> secondary_cache = + std::make_shared<TestSecondaryCache>(4096); + opts.secondary_cache = secondary_cache; + std::shared_ptr<Cache> cache = NewLRUCache(opts); + std::shared_ptr<Statistics> stats = CreateDBStatistics(); + CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); + CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); + CacheKey k3 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); + + Random rnd(301); + // Start with warming k3 + std::string str3 = rnd.RandomString(1021); + ASSERT_OK(secondary_cache->InsertSaved(k3.AsSlice(), str3)); + + std::string str1 = rnd.RandomString(1020); + TestItem* item1 = new TestItem(str1.data(), str1.length()); + ASSERT_OK(cache->Insert(k1.AsSlice(), item1, + &LRUCacheSecondaryCacheTest::helper_, str1.length())); + std::string str2 = rnd.RandomString(1021); + TestItem* item2 = new TestItem(str2.data(), str2.length()); + // k1 should be demoted to NVM + ASSERT_OK(cache->Insert(k2.AsSlice(), item2, + &LRUCacheSecondaryCacheTest::helper_, str2.length())); + + get_perf_context()->Reset(); + Cache::Handle* handle; + handle = + cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true, stats.get()); + ASSERT_NE(handle, nullptr); + ASSERT_EQ(static_cast<TestItem*>(cache->Value(handle))->Size(), str2.size()); + cache->Release(handle); + + // This lookup should promote k1 and demote k2 + handle = + cache->Lookup(k1.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true, stats.get()); + ASSERT_NE(handle, nullptr); + ASSERT_EQ(static_cast<TestItem*>(cache->Value(handle))->Size(), str1.size()); + cache->Release(handle); + + // This lookup should promote k3 and demote k1 + handle = + cache->Lookup(k3.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true, stats.get()); + ASSERT_NE(handle, nullptr); + ASSERT_EQ(static_cast<TestItem*>(cache->Value(handle))->Size(), str3.size()); + cache->Release(handle); + + ASSERT_EQ(secondary_cache->num_inserts(), 3u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + ASSERT_EQ(stats->getTickerCount(SECONDARY_CACHE_HITS), + secondary_cache->num_lookups()); + PerfContext perf_ctx = *get_perf_context(); + ASSERT_EQ(perf_ctx.secondary_cache_hit_count, secondary_cache->num_lookups()); + + cache.reset(); + secondary_cache.reset(); +} + +TEST_F(LRUCacheSecondaryCacheTest, BasicFailTest) { + LRUCacheOptions opts(1024 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, + 0.5 /* high_pri_pool_ratio */, + nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr<TestSecondaryCache> secondary_cache = + std::make_shared<TestSecondaryCache>(2048); + opts.secondary_cache = secondary_cache; + std::shared_ptr<Cache> cache = NewLRUCache(opts); + CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); + CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); + + Random rnd(301); + std::string str1 = rnd.RandomString(1020); + auto item1 = std::make_unique<TestItem>(str1.data(), str1.length()); + ASSERT_TRUE(cache->Insert(k1.AsSlice(), item1.get(), nullptr, str1.length()) + .IsInvalidArgument()); + ASSERT_OK(cache->Insert(k1.AsSlice(), item1.get(), + &LRUCacheSecondaryCacheTest::helper_, str1.length())); + item1.release(); // Appease clang-analyze "potential memory leak" + + Cache::Handle* handle; + 
handle = cache->Lookup(k2.AsSlice(), nullptr, test_item_creator, + Cache::Priority::LOW, true); + ASSERT_EQ(handle, nullptr); + handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, false); + ASSERT_EQ(handle, nullptr); + + cache.reset(); + secondary_cache.reset(); +} + +TEST_F(LRUCacheSecondaryCacheTest, SaveFailTest) { + LRUCacheOptions opts(1024 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, + 0.5 /* high_pri_pool_ratio */, + nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr<TestSecondaryCache> secondary_cache = + std::make_shared<TestSecondaryCache>(2048); + opts.secondary_cache = secondary_cache; + std::shared_ptr<Cache> cache = NewLRUCache(opts); + CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); + CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); + + Random rnd(301); + std::string str1 = rnd.RandomString(1020); + TestItem* item1 = new TestItem(str1.data(), str1.length()); + ASSERT_OK(cache->Insert(k1.AsSlice(), item1, + &LRUCacheSecondaryCacheTest::helper_fail_, + str1.length())); + std::string str2 = rnd.RandomString(1020); + TestItem* item2 = new TestItem(str2.data(), str2.length()); + // k1 should be demoted to NVM + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_OK(cache->Insert(k2.AsSlice(), item2, + &LRUCacheSecondaryCacheTest::helper_fail_, + str2.length())); + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + + Cache::Handle* handle; + handle = + cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_fail_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + // This lookup should fail, since k1 demotion would have failed + handle = + cache->Lookup(k1.AsSlice(), &LRUCacheSecondaryCacheTest::helper_fail_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_EQ(handle, nullptr); + // Since k1 didn't get promoted, k2 should still be in cache + handle = + cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_fail_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 1u); + + cache.reset(); + secondary_cache.reset(); +} + +TEST_F(LRUCacheSecondaryCacheTest, CreateFailTest) { + LRUCacheOptions opts(1024 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, + 0.5 /* high_pri_pool_ratio */, + nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr<TestSecondaryCache> secondary_cache = + std::make_shared<TestSecondaryCache>(2048); + opts.secondary_cache = secondary_cache; + std::shared_ptr<Cache> cache = NewLRUCache(opts); + CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); + CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); + + Random rnd(301); + std::string str1 = rnd.RandomString(1020); + TestItem* item1 = new TestItem(str1.data(), str1.length()); + ASSERT_OK(cache->Insert(k1.AsSlice(), item1, + &LRUCacheSecondaryCacheTest::helper_, str1.length())); + std::string str2 = rnd.RandomString(1020); + TestItem* item2 = new TestItem(str2.data(), str2.length()); + // k1 should be demoted to NVM + ASSERT_OK(cache->Insert(k2.AsSlice(), item2, + &LRUCacheSecondaryCacheTest::helper_, str2.length())); + + Cache::Handle* handle; + SetFailCreate(true); + handle = 
cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + // This lookup should fail, since k1 creation would have failed + handle = cache->Lookup(k1.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_EQ(handle, nullptr); + // Since k1 didn't get promoted, k2 should still be in cache + handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 1u); + + cache.reset(); + secondary_cache.reset(); +} + +TEST_F(LRUCacheSecondaryCacheTest, FullCapacityTest) { + LRUCacheOptions opts(1024 /* capacity */, 0 /* num_shard_bits */, + true /* strict_capacity_limit */, + 0.5 /* high_pri_pool_ratio */, + nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr<TestSecondaryCache> secondary_cache = + std::make_shared<TestSecondaryCache>(2048); + opts.secondary_cache = secondary_cache; + std::shared_ptr<Cache> cache = NewLRUCache(opts); + CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); + CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); + + Random rnd(301); + std::string str1 = rnd.RandomString(1020); + TestItem* item1 = new TestItem(str1.data(), str1.length()); + ASSERT_OK(cache->Insert(k1.AsSlice(), item1, + &LRUCacheSecondaryCacheTest::helper_, str1.length())); + std::string str2 = rnd.RandomString(1020); + TestItem* item2 = new TestItem(str2.data(), str2.length()); + // k1 should be demoted to NVM + ASSERT_OK(cache->Insert(k2.AsSlice(), item2, + &LRUCacheSecondaryCacheTest::helper_, str2.length())); + + Cache::Handle* handle; + handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + // k1 promotion should fail due to the block cache being at capacity, + // but the lookup should still succeed + Cache::Handle* handle2; + handle2 = cache->Lookup(k1.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle2, nullptr); + // Since k1 didn't get inserted, k2 should still be in cache + cache->Release(handle); + cache->Release(handle2); + handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 1u); + + cache.reset(); + secondary_cache.reset(); +} + +// In this test, the block cache size is set to 4096, after insert 6 KV-pairs +// and flush, there are 5 blocks in this SST file, 2 data blocks and 3 meta +// blocks. block_1 size is 4096 and block_2 size is 2056. The total size +// of the meta blocks are about 900 to 1000. Therefore, in any situation, +// if we try to insert block_1 to the block cache, it will always fails. Only +// block_2 will be successfully inserted into the block cache. 
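// (Rough budget implied by the numbers above: capacity is 4096 bytes and the
// pinned meta blocks take roughly 900 to 1000 of them, so block_1 at 4096
// bytes can never fit, while block_2 at 2056 bytes plus the meta blocks stays
// under the 4096-byte capacity.)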
+TEST_F(DBSecondaryCacheTest, TestSecondaryCacheCorrectness1) { + LRUCacheOptions opts(4 * 1024 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, + 0.5 /* high_pri_pool_ratio */, + nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr<TestSecondaryCache> secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr<Cache> cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + fault_fs_->SetFailGetUniqueId(true); + + // Set the file paranoid check, so after flush, the file will be read + // all the blocks will be accessed. + options.paranoid_file_checks = true; + DestroyAndReopen(options); + Random rnd(301); + const int N = 6; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + // After Flush is successful, RocksDB will do the paranoid check for the new + // SST file. Meta blocks are always cached in the block cache and they + // will not be evicted. When block_2 is cache miss and read out, it is + // inserted to the block cache. Note that, block_1 is never successfully + // inserted to the block cache. Here are 2 lookups in the secondary cache + // for block_1 and block_2 + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + Compact("a", "z"); + // Compaction will create the iterator to scan the whole file. So all the + // blocks are needed. Meta blocks are always cached. When block_1 is read + // out, block_2 is evicted from block cache and inserted to secondary + // cache. + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 3u); + + std::string v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // The first data block is not in the cache, similarly, trigger the block + // cache Lookup and secondary cache lookup for block_1. But block_1 will not + // be inserted successfully due to the size. Currently, cache only has + // the meta blocks. + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 4u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + // The second data block is not in the cache, similarly, trigger the block + // cache Lookup and secondary cache lookup for block_2 and block_2 is found + // in the secondary cache. Now block cache has block_2 + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 5u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + // block_2 is in the block cache. There is a block cache hit. No need to + // lookup or insert the secondary cache. + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 5u); + + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // Lookup the first data block, not in the block cache, so lookup the + // secondary cache. Also not in the secondary cache. After Get, still + // block_1 is will not be cached. + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 6u); + + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // Lookup the first data block, not in the block cache, so lookup the + // secondary cache. 
Also not in the secondary cache. After Get, still + // block_1 is will not be cached. + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 7u); + + Destroy(options); +} + +// In this test, the block cache size is set to 6100, after insert 6 KV-pairs +// and flush, there are 5 blocks in this SST file, 2 data blocks and 3 meta +// blocks. block_1 size is 4096 and block_2 size is 2056. The total size +// of the meta blocks are about 900 to 1000. Therefore, we can successfully +// insert and cache block_1 in the block cache (this is the different place +// from TestSecondaryCacheCorrectness1) +TEST_F(DBSecondaryCacheTest, TestSecondaryCacheCorrectness2) { + LRUCacheOptions opts(6100 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, + 0.5 /* high_pri_pool_ratio */, + nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr<TestSecondaryCache> secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr<Cache> cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.paranoid_file_checks = true; + options.env = fault_env_.get(); + fault_fs_->SetFailGetUniqueId(true); + DestroyAndReopen(options); + Random rnd(301); + const int N = 6; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + // After Flush is successful, RocksDB will do the paranoid check for the new + // SST file. Meta blocks are always cached in the block cache and they + // will not be evicted. When block_2 is cache miss and read out, it is + // inserted to the block cache. Thefore, block_1 is evicted from block + // cache and successfully inserted to the secondary cache. Here are 2 + // lookups in the secondary cache for block_1 and block_2. + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + Compact("a", "z"); + // Compaction will create the iterator to scan the whole file. So all the + // blocks are needed. After Flush, only block_2 is cached in block cache + // and block_1 is in the secondary cache. So when read block_1, it is + // read out from secondary cache and inserted to block cache. At the same + // time, block_2 is inserted to secondary cache. Now, secondary cache has + // both block_1 and block_2. After compaction, block_1 is in the cache. + ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 3u); + + std::string v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // This Get needs to access block_1, since block_1 is cached in block cache + // there is no secondary cache lookup. + ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 3u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + // This Get needs to access block_2 which is not in the block cache. So + // it will lookup the secondary cache for block_2 and cache it in the + // block_cache. + ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 4u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + // This Get needs to access block_2 which is already in the block cache. + // No need to lookup secondary cache. 
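  // (Note: block_1 and block_2 together need 4096 + 2056 = 6152 bytes plus the
  // meta blocks, which exceeds the 6100-byte capacity, so caching block_2
  // above displaced block_1; the next Get(Key(0)) goes back to the secondary
  // cache for it.)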
+ ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 4u); + + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // This Get needs to access block_1, since block_1 is not in block cache + // there is one econdary cache lookup. Then, block_1 is cached in the + // block cache. + ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 5u); + + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // This Get needs to access block_1, since block_1 is cached in block cache + // there is no secondary cache lookup. + ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 5u); + + Destroy(options); +} + +// The block cache size is set to 1024*1024, after insert 6 KV-pairs +// and flush, there are 5 blocks in this SST file, 2 data blocks and 3 meta +// blocks. block_1 size is 4096 and block_2 size is 2056. The total size +// of the meta blocks are about 900 to 1000. Therefore, we can successfully +// cache all the blocks in the block cache and there is not secondary cache +// insertion. 2 lookup is needed for the blocks. +TEST_F(DBSecondaryCacheTest, NoSecondaryCacheInsertion) { + LRUCacheOptions opts(1024 * 1024 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, + 0.5 /* high_pri_pool_ratio */, + nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr<TestSecondaryCache> secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr<Cache> cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.paranoid_file_checks = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + fault_fs_->SetFailGetUniqueId(true); + + DestroyAndReopen(options); + Random rnd(301); + const int N = 6; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1000); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + // After Flush is successful, RocksDB will do the paranoid check for the new + // SST file. Meta blocks are always cached in the block cache and they + // will not be evicted. Now, block cache is large enough, it cache + // both block_1 and block_2. When first time read block_1 and block_2 + // there are cache misses. So 2 secondary cache lookups are needed for + // the 2 blocks + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + Compact("a", "z"); + // Compaction will iterate the whole SST file. Since all the data blocks + // are in the block cache. No need to lookup the secondary cache. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + std::string v = Get(Key(0)); + ASSERT_EQ(1000, v.size()); + // Since the block cache is large enough, all the blocks are cached. we + // do not need to lookup the seondary cache. 
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 2u);
+
+  Destroy(options);
+}
+
+TEST_F(DBSecondaryCacheTest, SecondaryCacheIntensiveTesting) {
+  LRUCacheOptions opts(8 * 1024 /* capacity */, 0 /* num_shard_bits */,
+                       false /* strict_capacity_limit */,
+                       0.5 /* high_pri_pool_ratio */,
+                       nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+                       kDontChargeCacheMetadata);
+  std::shared_ptr<TestSecondaryCache> secondary_cache(
+      new TestSecondaryCache(2048 * 1024));
+  opts.secondary_cache = secondary_cache;
+  std::shared_ptr<Cache> cache = NewLRUCache(opts);
+  BlockBasedTableOptions table_options;
+  table_options.block_cache = cache;
+  table_options.block_size = 4 * 1024;
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.env = fault_env_.get();
+  fault_fs_->SetFailGetUniqueId(true);
+  DestroyAndReopen(options);
+  Random rnd(301);
+  const int N = 256;
+  for (int i = 0; i < N; i++) {
+    std::string p_v = rnd.RandomString(1000);
+    ASSERT_OK(Put(Key(i), p_v));
+  }
+  ASSERT_OK(Flush());
+  Compact("a", "z");
+
+  Random r_index(47);
+  std::string v;
+  for (int i = 0; i < 1000; i++) {
+    uint32_t key_i = r_index.Next() % N;
+    v = Get(Key(key_i));
+  }
+
+  // We have over 200 data blocks, so there will be multiple insertions
+  // and lookups.
+  ASSERT_GE(secondary_cache->num_inserts(), 1u);
+  ASSERT_GE(secondary_cache->num_lookups(), 1u);
+
+  Destroy(options);
+}
+
+// In this test, the block cache size is set to 4096. After inserting 6
+// KV-pairs and flushing, there are 5 blocks in this SST file, 2 data blocks
+// and 3 meta blocks. block_1 size is 4096 and block_2 size is 2056. The total
+// size of the meta blocks is about 900 to 1000. Therefore, in any situation,
+// if we try to insert block_1 into the block cache, it will always fail. Only
+// block_2 will be successfully inserted into the block cache.
+TEST_F(DBSecondaryCacheTest, SecondaryCacheFailureTest) {
+  LRUCacheOptions opts(4 * 1024 /* capacity */, 0 /* num_shard_bits */,
+                       false /* strict_capacity_limit */,
+                       0.5 /* high_pri_pool_ratio */,
+                       nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+                       kDontChargeCacheMetadata);
+  std::shared_ptr<TestSecondaryCache> secondary_cache(
+      new TestSecondaryCache(2048 * 1024));
+  opts.secondary_cache = secondary_cache;
+  std::shared_ptr<Cache> cache = NewLRUCache(opts);
+  BlockBasedTableOptions table_options;
+  table_options.block_cache = cache;
+  table_options.block_size = 4 * 1024;
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.paranoid_file_checks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.env = fault_env_.get();
+  fault_fs_->SetFailGetUniqueId(true);
+  DestroyAndReopen(options);
+  Random rnd(301);
+  const int N = 6;
+  for (int i = 0; i < N; i++) {
+    std::string p_v = rnd.RandomString(1007);
+    ASSERT_OK(Put(Key(i), p_v));
+  }
+
+  ASSERT_OK(Flush());
+  // After Flush is successful, RocksDB will do the paranoid check for the new
+  // SST file. Meta blocks are always cached in the block cache and they
+  // will not be evicted. When block_2 misses the cache and is read out, it is
+  // inserted into the block cache. Note that block_1 is never successfully
+  // inserted into the block cache.
There are 2 lookups in the secondary cache
+  // for block_1 and block_2.
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 2u);
+
+  // Fail the insertion. In the LRU cache, the status returned by the
+  // secondary cache insertion is not checked, so the DB will not be affected.
+  secondary_cache->InjectFailure();
+  Compact("a", "z");
+  // Compaction will create the iterator to scan the whole file. So all the
+  // blocks are needed. Meta blocks are always cached. When block_1 is read
+  // out, block_2 is evicted from the block cache and inserted into the
+  // secondary cache.
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 3u);
+
+  std::string v = Get(Key(0));
+  ASSERT_EQ(1007, v.size());
+  // The first data block is not in the cache. This triggers a block cache
+  // Lookup and a secondary cache lookup for block_1. But block_1 will not
+  // be inserted successfully due to its size. Currently, the cache only has
+  // the meta blocks.
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 4u);
+
+  v = Get(Key(5));
+  ASSERT_EQ(1007, v.size());
+  // The second data block is not in the cache. Similarly, this triggers a
+  // block cache Lookup and a secondary cache lookup for block_2, and block_2
+  // is found in the secondary cache. Now the block cache has block_2.
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 5u);
+
+  v = Get(Key(5));
+  ASSERT_EQ(1007, v.size());
+  // block_2 is in the block cache. There is a block cache hit. No need to
+  // look up or insert into the secondary cache.
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 5u);
+
+  v = Get(Key(0));
+  ASSERT_EQ(1007, v.size());
+  // Look up the first data block. It is not in the block cache, so look up
+  // the secondary cache. It is not in the secondary cache either. After Get,
+  // block_1 will still not be cached.
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 6u);
+
+  v = Get(Key(0));
+  ASSERT_EQ(1007, v.size());
+  // Look up the first data block. It is not in the block cache, so look up
+  // the secondary cache. It is not in the secondary cache either. After Get,
+  // block_1 will still not be cached.
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 7u); + secondary_cache->ResetInjectFailure(); + + Destroy(options); +} + +TEST_F(DBSecondaryCacheTest, TestSecondaryWithCompressedCache) { + if (!Snappy_Supported()) { + ROCKSDB_GTEST_SKIP("Compressed cache test requires snappy support"); + return; + } + LRUCacheOptions opts(2000 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, + 0.5 /* high_pri_pool_ratio */, + nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr<TestSecondaryCache> secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr<Cache> cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache_compressed = cache; + table_options.no_block_cache = true; + table_options.block_size = 1234; + Options options = GetDefaultOptions(); + options.compression = kSnappyCompression; + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + Random rnd(301); + const int N = 6; + for (int i = 0; i < N; i++) { + // Partly compressible + std::string p_v = rnd.RandomString(507) + std::string(500, ' '); + ASSERT_OK(Put(Key(i), p_v)); + } + ASSERT_OK(Flush()); + for (int i = 0; i < 2 * N; i++) { + std::string v = Get(Key(i % N)); + ASSERT_EQ(1007, v.size()); + } +} + +TEST_F(LRUCacheSecondaryCacheTest, BasicWaitAllTest) { + LRUCacheOptions opts(1024 /* capacity */, 2 /* num_shard_bits */, + false /* strict_capacity_limit */, + 0.5 /* high_pri_pool_ratio */, + nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr<TestSecondaryCache> secondary_cache = + std::make_shared<TestSecondaryCache>(32 * 1024); + opts.secondary_cache = secondary_cache; + std::shared_ptr<Cache> cache = NewLRUCache(opts); + const int num_keys = 32; + OffsetableCacheKey ock{"foo", "bar", 1}; + + Random rnd(301); + std::vector<std::string> values; + for (int i = 0; i < num_keys; ++i) { + std::string str = rnd.RandomString(1020); + values.emplace_back(str); + TestItem* item = new TestItem(str.data(), str.length()); + ASSERT_OK(cache->Insert(ock.WithOffset(i).AsSlice(), item, + &LRUCacheSecondaryCacheTest::helper_, + str.length())); + } + // Force all entries to be evicted to the secondary cache + cache->SetCapacity(0); + ASSERT_EQ(secondary_cache->num_inserts(), 32u); + cache->SetCapacity(32 * 1024); + + secondary_cache->SetResultMap( + {{ock.WithOffset(3).AsSlice().ToString(), + TestSecondaryCache::ResultType::DEFER}, + {ock.WithOffset(4).AsSlice().ToString(), + TestSecondaryCache::ResultType::DEFER_AND_FAIL}, + {ock.WithOffset(5).AsSlice().ToString(), + TestSecondaryCache::ResultType::FAIL}}); + std::vector<Cache::Handle*> results; + for (int i = 0; i < 6; ++i) { + results.emplace_back(cache->Lookup( + ock.WithOffset(i).AsSlice(), &LRUCacheSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, false)); + } + cache->WaitAll(results); + for (int i = 0; i < 6; ++i) { + if (i == 4) { + ASSERT_EQ(cache->Value(results[i]), nullptr); + } else if (i == 5) { + ASSERT_EQ(results[i], nullptr); + continue; + } else { + TestItem* item = static_cast<TestItem*>(cache->Value(results[i])); + ASSERT_EQ(item->ToString(), values[i]); + } + cache->Release(results[i]); + } + + cache.reset(); + secondary_cache.reset(); +} + +// In this test, we have one KV pair per data block. 
We indirectly determine +// the cache key associated with each data block (and thus each KV) by using +// a sync point callback in TestSecondaryCache::Lookup. We then control the +// lookup result by setting the ResultMap. +TEST_F(DBSecondaryCacheTest, TestSecondaryCacheMultiGet) { + LRUCacheOptions opts(1 << 20 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, + 0.5 /* high_pri_pool_ratio */, + nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr<TestSecondaryCache> secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr<Cache> cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + table_options.cache_index_and_filter_blocks = false; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.paranoid_file_checks = true; + DestroyAndReopen(options); + Random rnd(301); + const int N = 8; + std::vector<std::string> keys; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(4000); + keys.emplace_back(p_v); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + // After Flush is successful, RocksDB does the paranoid check for the new + // SST file. This will try to lookup all data blocks in the secondary + // cache. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 8u); + + cache->SetCapacity(0); + ASSERT_EQ(secondary_cache->num_inserts(), 8u); + cache->SetCapacity(1 << 20); + + std::vector<std::string> cache_keys; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "TestSecondaryCache::Lookup", [&cache_keys](void* key) -> void { + cache_keys.emplace_back(*(static_cast<std::string*>(key))); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + for (int i = 0; i < N; ++i) { + std::string v = Get(Key(i)); + ASSERT_EQ(4000, v.size()); + ASSERT_EQ(v, keys[i]); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ASSERT_EQ(secondary_cache->num_lookups(), 16u); + cache->SetCapacity(0); + cache->SetCapacity(1 << 20); + + ASSERT_EQ(Get(Key(2)), keys[2]); + ASSERT_EQ(Get(Key(7)), keys[7]); + secondary_cache->SetResultMap( + {{cache_keys[3], TestSecondaryCache::ResultType::DEFER}, + {cache_keys[4], TestSecondaryCache::ResultType::DEFER_AND_FAIL}, + {cache_keys[5], TestSecondaryCache::ResultType::FAIL}}); + + std::vector<std::string> mget_keys( + {Key(0), Key(1), Key(2), Key(3), Key(4), Key(5), Key(6), Key(7)}); + std::vector<PinnableSlice> values(mget_keys.size()); + std::vector<Status> s(keys.size()); + std::vector<Slice> key_slices; + for (const std::string& key : mget_keys) { + key_slices.emplace_back(key); + } + uint32_t num_lookups = secondary_cache->num_lookups(); + dbfull()->MultiGet(ReadOptions(), dbfull()->DefaultColumnFamily(), + key_slices.size(), key_slices.data(), values.data(), + s.data(), false); + ASSERT_EQ(secondary_cache->num_lookups(), num_lookups + 5); + for (int i = 0; i < N; ++i) { + ASSERT_OK(s[i]); + ASSERT_EQ(values[i].ToString(), keys[i]); + values[i].Reset(); + } + Destroy(options); +} + +class LRUCacheWithStat : public LRUCache { + public: + LRUCacheWithStat( + size_t _capacity, int _num_shard_bits, bool _strict_capacity_limit, + double _high_pri_pool_ratio, double _low_pri_pool_ratio, + std::shared_ptr<MemoryAllocator> _memory_allocator 
= nullptr, + bool _use_adaptive_mutex = kDefaultToAdaptiveMutex, + CacheMetadataChargePolicy _metadata_charge_policy = + kDontChargeCacheMetadata, + const std::shared_ptr<SecondaryCache>& _secondary_cache = nullptr) + : LRUCache(_capacity, _num_shard_bits, _strict_capacity_limit, + _high_pri_pool_ratio, _low_pri_pool_ratio, _memory_allocator, + _use_adaptive_mutex, _metadata_charge_policy, + _secondary_cache) { + insert_count_ = 0; + lookup_count_ = 0; + } + ~LRUCacheWithStat() {} + + Status Insert(const Slice& key, void* value, size_t charge, DeleterFn deleter, + Handle** handle, Priority priority) override { + insert_count_++; + return LRUCache::Insert(key, value, charge, deleter, handle, priority); + } + Status Insert(const Slice& key, void* value, const CacheItemHelper* helper, + size_t charge, Handle** handle = nullptr, + Priority priority = Priority::LOW) override { + insert_count_++; + return LRUCache::Insert(key, value, helper, charge, handle, priority); + } + Handle* Lookup(const Slice& key, Statistics* stats) override { + lookup_count_++; + return LRUCache::Lookup(key, stats); + } + Handle* Lookup(const Slice& key, const CacheItemHelper* helper, + const CreateCallback& create_cb, Priority priority, bool wait, + Statistics* stats = nullptr) override { + lookup_count_++; + return LRUCache::Lookup(key, helper, create_cb, priority, wait, stats); + } + + uint32_t GetInsertCount() { return insert_count_; } + uint32_t GetLookupcount() { return lookup_count_; } + void ResetCount() { + insert_count_ = 0; + lookup_count_ = 0; + } + + private: + uint32_t insert_count_; + uint32_t lookup_count_; +}; + +#ifndef ROCKSDB_LITE + +TEST_F(DBSecondaryCacheTest, LRUCacheDumpLoadBasic) { + LRUCacheOptions cache_opts(1024 * 1024 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, + 0.5 /* high_pri_pool_ratio */, + nullptr /* memory_allocator */, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + LRUCacheWithStat* tmp_cache = new LRUCacheWithStat( + cache_opts.capacity, cache_opts.num_shard_bits, + cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio, + cache_opts.low_pri_pool_ratio, cache_opts.memory_allocator, + cache_opts.use_adaptive_mutex, cache_opts.metadata_charge_policy, + cache_opts.secondary_cache); + std::shared_ptr<Cache> cache(tmp_cache); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + DestroyAndReopen(options); + fault_fs_->SetFailGetUniqueId(true); + + Random rnd(301); + const int N = 256; + std::vector<std::string> value; + char buf[1000]; + memset(buf, 'a', 1000); + value.resize(N); + for (int i = 0; i < N; i++) { + // std::string p_v = rnd.RandomString(1000); + std::string p_v(buf, 1000); + value[i] = p_v; + ASSERT_OK(Put(Key(i), p_v)); + } + ASSERT_OK(Flush()); + Compact("a", "z"); + + // do th eread for all the key value pairs, so all the blocks should be in + // cache + uint32_t start_insert = tmp_cache->GetInsertCount(); + uint32_t start_lookup = tmp_cache->GetLookupcount(); + std::string v; + for (int i = 0; i < N; i++) { + v = Get(Key(i)); + ASSERT_EQ(v, value[i]); + } + uint32_t dump_insert = tmp_cache->GetInsertCount() - start_insert; + uint32_t dump_lookup = tmp_cache->GetLookupcount() - start_lookup; + ASSERT_EQ(63, + static_cast<int>(dump_insert)); // the insert in the block cache + 
ASSERT_EQ(256, + static_cast<int>(dump_lookup)); // the lookup in the block cache + // We have enough blocks in the block cache + + CacheDumpOptions cd_options; + cd_options.clock = fault_env_->GetSystemClock().get(); + std::string dump_path = db_->GetName() + "/cache_dump"; + std::unique_ptr<CacheDumpWriter> dump_writer; + Status s = NewToFileCacheDumpWriter(fault_fs_, FileOptions(), dump_path, + &dump_writer); + ASSERT_OK(s); + std::unique_ptr<CacheDumper> cache_dumper; + s = NewDefaultCacheDumper(cd_options, cache, std::move(dump_writer), + &cache_dumper); + ASSERT_OK(s); + std::vector<DB*> db_list; + db_list.push_back(db_); + s = cache_dumper->SetDumpFilter(db_list); + ASSERT_OK(s); + s = cache_dumper->DumpCacheEntriesToWriter(); + ASSERT_OK(s); + cache_dumper.reset(); + + // we have a new cache it is empty, then, before we do the Get, we do the + // dumpload + std::shared_ptr<TestSecondaryCache> secondary_cache = + std::make_shared<TestSecondaryCache>(2048 * 1024); + cache_opts.secondary_cache = secondary_cache; + tmp_cache = new LRUCacheWithStat( + cache_opts.capacity, cache_opts.num_shard_bits, + cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio, + cache_opts.low_pri_pool_ratio, cache_opts.memory_allocator, + cache_opts.use_adaptive_mutex, cache_opts.metadata_charge_policy, + cache_opts.secondary_cache); + std::shared_ptr<Cache> cache_new(tmp_cache); + table_options.block_cache = cache_new; + table_options.block_size = 4 * 1024; + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + + // start to load the data to new block cache + start_insert = secondary_cache->num_inserts(); + start_lookup = secondary_cache->num_lookups(); + std::unique_ptr<CacheDumpReader> dump_reader; + s = NewFromFileCacheDumpReader(fault_fs_, FileOptions(), dump_path, + &dump_reader); + ASSERT_OK(s); + std::unique_ptr<CacheDumpedLoader> cache_loader; + s = NewDefaultCacheDumpedLoader(cd_options, table_options, secondary_cache, + std::move(dump_reader), &cache_loader); + ASSERT_OK(s); + s = cache_loader->RestoreCacheEntriesToSecondaryCache(); + ASSERT_OK(s); + uint32_t load_insert = secondary_cache->num_inserts() - start_insert; + uint32_t load_lookup = secondary_cache->num_lookups() - start_lookup; + // check the number we inserted + ASSERT_EQ(64, static_cast<int>(load_insert)); + ASSERT_EQ(0, static_cast<int>(load_lookup)); + ASSERT_OK(s); + + Reopen(options); + + // After load, we do the Get again + start_insert = secondary_cache->num_inserts(); + start_lookup = secondary_cache->num_lookups(); + uint32_t cache_insert = tmp_cache->GetInsertCount(); + uint32_t cache_lookup = tmp_cache->GetLookupcount(); + for (int i = 0; i < N; i++) { + v = Get(Key(i)); + ASSERT_EQ(v, value[i]); + } + uint32_t final_insert = secondary_cache->num_inserts() - start_insert; + uint32_t final_lookup = secondary_cache->num_lookups() - start_lookup; + // no insert to secondary cache + ASSERT_EQ(0, static_cast<int>(final_insert)); + // lookup the secondary to get all blocks + ASSERT_EQ(64, static_cast<int>(final_lookup)); + uint32_t block_insert = tmp_cache->GetInsertCount() - cache_insert; + uint32_t block_lookup = tmp_cache->GetLookupcount() - cache_lookup; + // Check the new block cache insert and lookup, should be no insert since all + // blocks are from the secondary cache. 
+ ASSERT_EQ(0, static_cast<int>(block_insert)); + ASSERT_EQ(256, static_cast<int>(block_lookup)); + + fault_fs_->SetFailGetUniqueId(false); + Destroy(options); +} + +TEST_F(DBSecondaryCacheTest, LRUCacheDumpLoadWithFilter) { + LRUCacheOptions cache_opts(1024 * 1024 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, + 0.5 /* high_pri_pool_ratio */, + nullptr /* memory_allocator */, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + LRUCacheWithStat* tmp_cache = new LRUCacheWithStat( + cache_opts.capacity, cache_opts.num_shard_bits, + cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio, + cache_opts.low_pri_pool_ratio, cache_opts.memory_allocator, + cache_opts.use_adaptive_mutex, cache_opts.metadata_charge_policy, + cache_opts.secondary_cache); + std::shared_ptr<Cache> cache(tmp_cache); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + std::string dbname1 = test::PerThreadDBPath("db_1"); + ASSERT_OK(DestroyDB(dbname1, options)); + DB* db1 = nullptr; + ASSERT_OK(DB::Open(options, dbname1, &db1)); + std::string dbname2 = test::PerThreadDBPath("db_2"); + ASSERT_OK(DestroyDB(dbname2, options)); + DB* db2 = nullptr; + ASSERT_OK(DB::Open(options, dbname2, &db2)); + fault_fs_->SetFailGetUniqueId(true); + + // write the KVs to db1 + Random rnd(301); + const int N = 256; + std::vector<std::string> value1; + WriteOptions wo; + char buf[1000]; + memset(buf, 'a', 1000); + value1.resize(N); + for (int i = 0; i < N; i++) { + std::string p_v(buf, 1000); + value1[i] = p_v; + ASSERT_OK(db1->Put(wo, Key(i), p_v)); + } + ASSERT_OK(db1->Flush(FlushOptions())); + Slice bg("a"); + Slice ed("b"); + ASSERT_OK(db1->CompactRange(CompactRangeOptions(), &bg, &ed)); + + // Write the KVs to DB2 + std::vector<std::string> value2; + memset(buf, 'b', 1000); + value2.resize(N); + for (int i = 0; i < N; i++) { + std::string p_v(buf, 1000); + value2[i] = p_v; + ASSERT_OK(db2->Put(wo, Key(i), p_v)); + } + ASSERT_OK(db2->Flush(FlushOptions())); + ASSERT_OK(db2->CompactRange(CompactRangeOptions(), &bg, &ed)); + + // do th eread for all the key value pairs, so all the blocks should be in + // cache + uint32_t start_insert = tmp_cache->GetInsertCount(); + uint32_t start_lookup = tmp_cache->GetLookupcount(); + ReadOptions ro; + std::string v; + for (int i = 0; i < N; i++) { + ASSERT_OK(db1->Get(ro, Key(i), &v)); + ASSERT_EQ(v, value1[i]); + } + for (int i = 0; i < N; i++) { + ASSERT_OK(db2->Get(ro, Key(i), &v)); + ASSERT_EQ(v, value2[i]); + } + uint32_t dump_insert = tmp_cache->GetInsertCount() - start_insert; + uint32_t dump_lookup = tmp_cache->GetLookupcount() - start_lookup; + ASSERT_EQ(128, + static_cast<int>(dump_insert)); // the insert in the block cache + ASSERT_EQ(512, + static_cast<int>(dump_lookup)); // the lookup in the block cache + // We have enough blocks in the block cache + + CacheDumpOptions cd_options; + cd_options.clock = fault_env_->GetSystemClock().get(); + std::string dump_path = db1->GetName() + "/cache_dump"; + std::unique_ptr<CacheDumpWriter> dump_writer; + Status s = NewToFileCacheDumpWriter(fault_fs_, FileOptions(), dump_path, + &dump_writer); + ASSERT_OK(s); + std::unique_ptr<CacheDumper> cache_dumper; + s = NewDefaultCacheDumper(cd_options, cache, std::move(dump_writer), + &cache_dumper); + ASSERT_OK(s); + 
std::vector<DB*> db_list; + db_list.push_back(db1); + s = cache_dumper->SetDumpFilter(db_list); + ASSERT_OK(s); + s = cache_dumper->DumpCacheEntriesToWriter(); + ASSERT_OK(s); + cache_dumper.reset(); + + // we have a new cache it is empty, then, before we do the Get, we do the + // dumpload + std::shared_ptr<TestSecondaryCache> secondary_cache = + std::make_shared<TestSecondaryCache>(2048 * 1024); + cache_opts.secondary_cache = secondary_cache; + tmp_cache = new LRUCacheWithStat( + cache_opts.capacity, cache_opts.num_shard_bits, + cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio, + cache_opts.low_pri_pool_ratio, cache_opts.memory_allocator, + cache_opts.use_adaptive_mutex, cache_opts.metadata_charge_policy, + cache_opts.secondary_cache); + std::shared_ptr<Cache> cache_new(tmp_cache); + table_options.block_cache = cache_new; + table_options.block_size = 4 * 1024; + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + + // Start the cache loading process + start_insert = secondary_cache->num_inserts(); + start_lookup = secondary_cache->num_lookups(); + std::unique_ptr<CacheDumpReader> dump_reader; + s = NewFromFileCacheDumpReader(fault_fs_, FileOptions(), dump_path, + &dump_reader); + ASSERT_OK(s); + std::unique_ptr<CacheDumpedLoader> cache_loader; + s = NewDefaultCacheDumpedLoader(cd_options, table_options, secondary_cache, + std::move(dump_reader), &cache_loader); + ASSERT_OK(s); + s = cache_loader->RestoreCacheEntriesToSecondaryCache(); + ASSERT_OK(s); + uint32_t load_insert = secondary_cache->num_inserts() - start_insert; + uint32_t load_lookup = secondary_cache->num_lookups() - start_lookup; + // check the number we inserted + ASSERT_EQ(64, static_cast<int>(load_insert)); + ASSERT_EQ(0, static_cast<int>(load_lookup)); + ASSERT_OK(s); + + ASSERT_OK(db1->Close()); + delete db1; + ASSERT_OK(DB::Open(options, dbname1, &db1)); + + // After load, we do the Get again. To validate the cache, we do not allow any + // I/O, so we set the file system to false. + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + fault_fs_->SetFilesystemActive(false, error_msg); + start_insert = secondary_cache->num_inserts(); + start_lookup = secondary_cache->num_lookups(); + uint32_t cache_insert = tmp_cache->GetInsertCount(); + uint32_t cache_lookup = tmp_cache->GetLookupcount(); + for (int i = 0; i < N; i++) { + ASSERT_OK(db1->Get(ro, Key(i), &v)); + ASSERT_EQ(v, value1[i]); + } + uint32_t final_insert = secondary_cache->num_inserts() - start_insert; + uint32_t final_lookup = secondary_cache->num_lookups() - start_lookup; + // no insert to secondary cache + ASSERT_EQ(0, static_cast<int>(final_insert)); + // lookup the secondary to get all blocks + ASSERT_EQ(64, static_cast<int>(final_lookup)); + uint32_t block_insert = tmp_cache->GetInsertCount() - cache_insert; + uint32_t block_lookup = tmp_cache->GetLookupcount() - cache_lookup; + // Check the new block cache insert and lookup, should be no insert since all + // blocks are from the secondary cache. + ASSERT_EQ(0, static_cast<int>(block_insert)); + ASSERT_EQ(256, static_cast<int>(block_lookup)); + fault_fs_->SetFailGetUniqueId(false); + fault_fs_->SetFilesystemActive(true); + delete db1; + delete db2; + ASSERT_OK(DestroyDB(dbname1, options)); + ASSERT_OK(DestroyDB(dbname2, options)); +} + +// Test the option not to use the secondary cache in a certain DB. 
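The per-DB opt-out exercised by the tests below comes from Options::lowest_used_cache_tier rather than from the cache object itself. Below is a minimal editorial sketch of that wiring, not part of the test file; it assumes NewCompressedSecondaryCache() (RocksDB's built-in compressed secondary cache) as the SecondaryCache implementation, whereas the tests use their own TestSecondaryCache.

#include <memory>

#include "rocksdb/cache.h"
#include "rocksdb/options.h"
#include "rocksdb/secondary_cache.h"
#include "rocksdb/table.h"

using namespace ROCKSDB_NAMESPACE;

// Sketch: one shared block cache backed by a secondary cache, and two Options
// objects with different cache-tier policies.
void ConfigureTieredCaches(Options* uses_secondary, Options* volatile_only) {
  LRUCacheOptions cache_opts(1 << 20 /* capacity */, 0 /* num_shard_bits */,
                             false /* strict_capacity_limit */,
                             0.5 /* high_pri_pool_ratio */);
  // Assumption: NewCompressedSecondaryCache() is available in this version;
  // any SecondaryCache implementation can be plugged in here.
  cache_opts.secondary_cache =
      NewCompressedSecondaryCache(CompressedSecondaryCacheOptions());

  BlockBasedTableOptions table_options;
  table_options.block_cache = NewLRUCache(cache_opts);

  // This DB keeps the default lowest_used_cache_tier (the non-volatile block
  // tier), so evicted blocks may spill into the shared secondary cache and be
  // looked up there on a block cache miss.
  uses_secondary->table_factory.reset(
      NewBlockBasedTableFactory(table_options));

  // This DB shares the same block cache but restricts itself to the volatile
  // tier, so the secondary cache is never consulted for it.
  *volatile_only = *uses_secondary;
  volatile_only->lowest_used_cache_tier = CacheTier::kVolatileTier;
}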
+TEST_F(DBSecondaryCacheTest, TestSecondaryCacheOptionBasic) { + LRUCacheOptions opts(4 * 1024 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, + 0.5 /* high_pri_pool_ratio */, + nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr<TestSecondaryCache> secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr<Cache> cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + fault_fs_->SetFailGetUniqueId(true); + options.lowest_used_cache_tier = CacheTier::kVolatileTier; + + // Set the file paranoid check, so after flush, the file will be read + // all the blocks will be accessed. + options.paranoid_file_checks = true; + DestroyAndReopen(options); + Random rnd(301); + const int N = 6; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(Put(Key(i + 70), p_v)); + } + + ASSERT_OK(Flush()); + + // Flush will trigger the paranoid check and read blocks. But only block cache + // will be read. No operations for secondary cache. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + Compact("a", "z"); + + // Compaction will also insert and evict blocks, no operations to the block + // cache. No operations for secondary cache. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + std::string v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + + // Check the data in first block. Cache miss, direclty read from SST file. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + + // Check the second block. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + + // block cache hit + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + v = Get(Key(70)); + ASSERT_EQ(1007, v.size()); + + // Check the first block in the second SST file. Cache miss and trigger SST + // file read. No operations for secondary cache. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + v = Get(Key(75)); + ASSERT_EQ(1007, v.size()); + + // Check the second block in the second SST file. Cache miss and trigger SST + // file read. No operations for secondary cache. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + Destroy(options); +} + +// We disable the secondary cache in DBOptions at first. Close and reopen the DB +// with new options, which set the lowest_used_cache_tier to +// kNonVolatileBlockTier. So secondary cache will be used. 
+TEST_F(DBSecondaryCacheTest, TestSecondaryCacheOptionChange) { + LRUCacheOptions opts(4 * 1024 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, + 0.5 /* high_pri_pool_ratio */, + nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr<TestSecondaryCache> secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr<Cache> cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + fault_fs_->SetFailGetUniqueId(true); + options.lowest_used_cache_tier = CacheTier::kVolatileTier; + + // Set the file paranoid check, so after flush, the file will be read + // all the blocks will be accessed. + options.paranoid_file_checks = true; + DestroyAndReopen(options); + Random rnd(301); + const int N = 6; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(Put(Key(i + 70), p_v)); + } + + ASSERT_OK(Flush()); + + // Flush will trigger the paranoid check and read blocks. But only block cache + // will be read. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + Compact("a", "z"); + + // Compaction will also insert and evict blocks, no operations to the block + // cache. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + std::string v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + + // Check the data in first block. Cache miss, direclty read from SST file. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + + // Check the second block. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + + // block cache hit + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + // Change the option to enable secondary cache after we Reopen the DB + options.lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier; + Reopen(options); + + v = Get(Key(70)); + ASSERT_EQ(1007, v.size()); + + // Enable the secondary cache, trigger lookup of the first block in second SST + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 1u); + + v = Get(Key(75)); + ASSERT_EQ(1007, v.size()); + + // trigger lookup of the second block in second SST + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + Destroy(options); +} + +// Two DB test. We create 2 DBs sharing the same block cache and secondary +// cache. We diable the secondary cache option for DB2. 
+TEST_F(DBSecondaryCacheTest, TestSecondaryCacheOptionTwoDB) { + LRUCacheOptions opts(4 * 1024 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, + 0.5 /* high_pri_pool_ratio */, + nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr<TestSecondaryCache> secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr<Cache> cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + options.paranoid_file_checks = true; + std::string dbname1 = test::PerThreadDBPath("db_t_1"); + ASSERT_OK(DestroyDB(dbname1, options)); + DB* db1 = nullptr; + ASSERT_OK(DB::Open(options, dbname1, &db1)); + std::string dbname2 = test::PerThreadDBPath("db_t_2"); + ASSERT_OK(DestroyDB(dbname2, options)); + DB* db2 = nullptr; + Options options2 = options; + options2.lowest_used_cache_tier = CacheTier::kVolatileTier; + ASSERT_OK(DB::Open(options2, dbname2, &db2)); + fault_fs_->SetFailGetUniqueId(true); + + WriteOptions wo; + Random rnd(301); + const int N = 6; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(db1->Put(wo, Key(i), p_v)); + } + + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + ASSERT_OK(db1->Flush(FlushOptions())); + + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(db2->Put(wo, Key(i), p_v)); + } + + // No change in the secondary cache, since it is disabled in DB2 + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + ASSERT_OK(db2->Flush(FlushOptions())); + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + Slice bg("a"); + Slice ed("b"); + ASSERT_OK(db1->CompactRange(CompactRangeOptions(), &bg, &ed)); + ASSERT_OK(db2->CompactRange(CompactRangeOptions(), &bg, &ed)); + + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + ReadOptions ro; + std::string v; + ASSERT_OK(db1->Get(ro, Key(0), &v)); + ASSERT_EQ(1007, v.size()); + + // DB 1 has lookup block 1 and it is miss in block cache, trigger secondary + // cache lookup + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 3u); + + ASSERT_OK(db1->Get(ro, Key(5), &v)); + ASSERT_EQ(1007, v.size()); + + // DB 1 lookup the second block and it is miss in block cache, trigger + // secondary cache lookup + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 4u); + + ASSERT_OK(db2->Get(ro, Key(0), &v)); + ASSERT_EQ(1007, v.size()); + + // For db2, it is not enabled with secondary cache, so no search in the + // secondary cache + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 4u); + + ASSERT_OK(db2->Get(ro, Key(5), &v)); + ASSERT_EQ(1007, v.size()); + + // For db2, it is not enabled with secondary cache, so no search in the + // secondary cache + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 4u); + + fault_fs_->SetFailGetUniqueId(false); + 
fault_fs_->SetFilesystemActive(true); + delete db1; + delete db2; + ASSERT_OK(DestroyDB(dbname1, options)); + ASSERT_OK(DestroyDB(dbname2, options)); +} + +#endif // ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/cache/secondary_cache.cc b/src/rocksdb/cache/secondary_cache.cc new file mode 100644 index 000000000..84352db71 --- /dev/null +++ b/src/rocksdb/cache/secondary_cache.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/secondary_cache.h" + +#include "cache/cache_entry_roles.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +size_t SliceSize(void* obj) { return static_cast<Slice*>(obj)->size(); } + +Status SliceSaveTo(void* from_obj, size_t from_offset, size_t length, + void* out) { + const Slice& slice = *static_cast<Slice*>(from_obj); + std::memcpy(out, slice.data() + from_offset, length); + return Status::OK(); +} + +} // namespace + +Status SecondaryCache::InsertSaved(const Slice& key, const Slice& saved) { + static Cache::CacheItemHelper helper{ + &SliceSize, &SliceSaveTo, GetNoopDeleterForRole<CacheEntryRole::kMisc>()}; + // NOTE: depends on Insert() being synchronous, not keeping pointer `&saved` + return Insert(key, const_cast<Slice*>(&saved), &helper); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/cache/sharded_cache.cc b/src/rocksdb/cache/sharded_cache.cc new file mode 100644 index 000000000..9ebca3ba8 --- /dev/null +++ b/src/rocksdb/cache/sharded_cache.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "cache/sharded_cache.h" + +#include <algorithm> +#include <cstdint> +#include <memory> + +#include "util/hash.h" +#include "util/math.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +ShardedCacheBase::ShardedCacheBase(size_t capacity, int num_shard_bits, + bool strict_capacity_limit, + std::shared_ptr<MemoryAllocator> allocator) + : Cache(std::move(allocator)), + last_id_(1), + shard_mask_((uint32_t{1} << num_shard_bits) - 1), + strict_capacity_limit_(strict_capacity_limit), + capacity_(capacity) {} + +size_t ShardedCacheBase::ComputePerShardCapacity(size_t capacity) const { + uint32_t num_shards = GetNumShards(); + return (capacity + (num_shards - 1)) / num_shards; +} + +size_t ShardedCacheBase::GetPerShardCapacity() const { + return ComputePerShardCapacity(GetCapacity()); +} + +uint64_t ShardedCacheBase::NewId() { + return last_id_.fetch_add(1, std::memory_order_relaxed); +} + +size_t ShardedCacheBase::GetCapacity() const { + MutexLock l(&config_mutex_); + return capacity_; +} + +bool ShardedCacheBase::HasStrictCapacityLimit() const { + MutexLock l(&config_mutex_); + return strict_capacity_limit_; +} + +size_t ShardedCacheBase::GetUsage(Handle* handle) const { + return GetCharge(handle); +} + +std::string ShardedCacheBase::GetPrintableOptions() const { + std::string ret; + ret.reserve(20000); + const int kBufferSize = 200; + char buffer[kBufferSize]; + { + MutexLock l(&config_mutex_); + snprintf(buffer, kBufferSize, " capacity : %" ROCKSDB_PRIszt "\n", + capacity_); + ret.append(buffer); + snprintf(buffer, kBufferSize, " num_shard_bits : %d\n", + GetNumShardBits()); + ret.append(buffer); + snprintf(buffer, kBufferSize, " strict_capacity_limit : %d\n", + strict_capacity_limit_); + ret.append(buffer); + } + snprintf(buffer, kBufferSize, " memory_allocator : %s\n", + memory_allocator() ? memory_allocator()->Name() : "None"); + ret.append(buffer); + AppendPrintableOptions(ret); + return ret; +} + +int GetDefaultCacheShardBits(size_t capacity, size_t min_shard_size) { + int num_shard_bits = 0; + size_t num_shards = capacity / min_shard_size; + while (num_shards >>= 1) { + if (++num_shard_bits >= 6) { + // No more than 6. + return num_shard_bits; + } + } + return num_shard_bits; +} + +int ShardedCacheBase::GetNumShardBits() const { + return BitsSetToOne(shard_mask_); +} + +uint32_t ShardedCacheBase::GetNumShards() const { return shard_mask_ + 1; } + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/cache/sharded_cache.h b/src/rocksdb/cache/sharded_cache.h new file mode 100644 index 000000000..e3271cc7b --- /dev/null +++ b/src/rocksdb/cache/sharded_cache.h @@ -0,0 +1,322 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once + +#include <atomic> +#include <cstdint> +#include <string> + +#include "port/lang.h" +#include "port/port.h" +#include "rocksdb/cache.h" +#include "util/hash.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +// Optional base class for classes implementing the CacheShard concept +class CacheShardBase { + public: + explicit CacheShardBase(CacheMetadataChargePolicy metadata_charge_policy) + : metadata_charge_policy_(metadata_charge_policy) {} + + using DeleterFn = Cache::DeleterFn; + + // Expected by concept CacheShard (TODO with C++20 support) + // Some Defaults + std::string GetPrintableOptions() const { return ""; } + using HashVal = uint64_t; + using HashCref = uint64_t; + static inline HashVal ComputeHash(const Slice& key) { + return GetSliceNPHash64(key); + } + static inline uint32_t HashPieceForSharding(HashCref hash) { + return Lower32of64(hash); + } + void AppendPrintableOptions(std::string& /*str*/) const {} + + // Must be provided for concept CacheShard (TODO with C++20 support) + /* + struct HandleImpl { // for concept HandleImpl + HashVal hash; + HashCref GetHash() const; + ... + }; + Status Insert(const Slice& key, HashCref hash, void* value, size_t charge, + DeleterFn deleter, HandleImpl** handle, + Cache::Priority priority) = 0; + Status Insert(const Slice& key, HashCref hash, void* value, + const Cache::CacheItemHelper* helper, size_t charge, + HandleImpl** handle, Cache::Priority priority) = 0; + HandleImpl* Lookup(const Slice& key, HashCref hash) = 0; + HandleImpl* Lookup(const Slice& key, HashCref hash, + const Cache::CacheItemHelper* helper, + const Cache::CreateCallback& create_cb, + Cache::Priority priority, bool wait, + Statistics* stats) = 0; + bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref) = 0; + bool IsReady(HandleImpl* handle) = 0; + void Wait(HandleImpl* handle) = 0; + bool Ref(HandleImpl* handle) = 0; + void Erase(const Slice& key, HashCref hash) = 0; + void SetCapacity(size_t capacity) = 0; + void SetStrictCapacityLimit(bool strict_capacity_limit) = 0; + size_t GetUsage() const = 0; + size_t GetPinnedUsage() const = 0; + size_t GetOccupancyCount() const = 0; + size_t GetTableAddressCount() const = 0; + // Handles iterating over roughly `average_entries_per_lock` entries, using + // `state` to somehow record where it last ended up. Caller initially uses + // *state == 0 and implementation sets *state = SIZE_MAX to indicate + // completion. 
+ void ApplyToSomeEntries( + const std::function<void(const Slice& key, void* value, size_t charge, + DeleterFn deleter)>& callback, + size_t average_entries_per_lock, size_t* state) = 0; + void EraseUnRefEntries() = 0; + */ + + protected: + const CacheMetadataChargePolicy metadata_charge_policy_; +}; + +// Portions of ShardedCache that do not depend on the template parameter +class ShardedCacheBase : public Cache { + public: + ShardedCacheBase(size_t capacity, int num_shard_bits, + bool strict_capacity_limit, + std::shared_ptr<MemoryAllocator> memory_allocator); + virtual ~ShardedCacheBase() = default; + + int GetNumShardBits() const; + uint32_t GetNumShards() const; + + uint64_t NewId() override; + + bool HasStrictCapacityLimit() const override; + size_t GetCapacity() const override; + + using Cache::GetUsage; + size_t GetUsage(Handle* handle) const override; + std::string GetPrintableOptions() const override; + + protected: // fns + virtual void AppendPrintableOptions(std::string& str) const = 0; + size_t GetPerShardCapacity() const; + size_t ComputePerShardCapacity(size_t capacity) const; + + protected: // data + std::atomic<uint64_t> last_id_; // For NewId + const uint32_t shard_mask_; + + // Dynamic configuration parameters, guarded by config_mutex_ + bool strict_capacity_limit_; + size_t capacity_; + mutable port::Mutex config_mutex_; +}; + +// Generic cache interface that shards cache by hash of keys. 2^num_shard_bits +// shards will be created, with capacity split evenly to each of the shards. +// Keys are typically sharded by the lowest num_shard_bits bits of hash value +// so that the upper bits of the hash value can keep a stable ordering of +// table entries even as the table grows (using more upper hash bits). +// See CacheShardBase above for what is expected of the CacheShard parameter. 
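To make the sharding scheme described in the comment above concrete, here is a small standalone sketch (an editorial illustration with hypothetical names, not code from this header): the shard index comes from the low 32 bits of the 64-bit key hash masked by shard_mask_, leaving the upper hash bits for ordering inside the shard's own table.

#include <cstdint>

// Hypothetical illustration of ShardedCache's shard selection; mirrors
// shard_mask_ from ShardedCacheBase and the Lower32of64-based
// HashPieceForSharding from CacheShardBase.
struct ShardSelectorSketch {
  uint32_t shard_mask;

  explicit ShardSelectorSketch(int num_shard_bits)
      : shard_mask((uint32_t{1} << num_shard_bits) - 1) {}

  // The low 32 bits pick the shard; the remaining upper bits stay available
  // to keep a stable ordering of entries within the selected shard's table.
  uint32_t ShardIndex(uint64_t key_hash) const {
    return static_cast<uint32_t>(key_hash) & shard_mask;
  }
};

// e.g. with num_shard_bits == 4 (16 shards), a hash whose low byte is 0x25
// lands in shard 0x25 & 0xF == 0x5.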
+template <class CacheShard> +class ShardedCache : public ShardedCacheBase { + public: + using HashVal = typename CacheShard::HashVal; + using HashCref = typename CacheShard::HashCref; + using HandleImpl = typename CacheShard::HandleImpl; + + ShardedCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, + std::shared_ptr<MemoryAllocator> allocator) + : ShardedCacheBase(capacity, num_shard_bits, strict_capacity_limit, + allocator), + shards_(reinterpret_cast<CacheShard*>(port::cacheline_aligned_alloc( + sizeof(CacheShard) * GetNumShards()))), + destroy_shards_in_dtor_(false) {} + + virtual ~ShardedCache() { + if (destroy_shards_in_dtor_) { + ForEachShard([](CacheShard* cs) { cs->~CacheShard(); }); + } + port::cacheline_aligned_free(shards_); + } + + CacheShard& GetShard(HashCref hash) { + return shards_[CacheShard::HashPieceForSharding(hash) & shard_mask_]; + } + + const CacheShard& GetShard(HashCref hash) const { + return shards_[CacheShard::HashPieceForSharding(hash) & shard_mask_]; + } + + void SetCapacity(size_t capacity) override { + MutexLock l(&config_mutex_); + capacity_ = capacity; + auto per_shard = ComputePerShardCapacity(capacity); + ForEachShard([=](CacheShard* cs) { cs->SetCapacity(per_shard); }); + } + + void SetStrictCapacityLimit(bool s_c_l) override { + MutexLock l(&config_mutex_); + strict_capacity_limit_ = s_c_l; + ForEachShard( + [s_c_l](CacheShard* cs) { cs->SetStrictCapacityLimit(s_c_l); }); + } + + Status Insert(const Slice& key, void* value, size_t charge, DeleterFn deleter, + Handle** handle, Priority priority) override { + HashVal hash = CacheShard::ComputeHash(key); + auto h_out = reinterpret_cast<HandleImpl**>(handle); + return GetShard(hash).Insert(key, hash, value, charge, deleter, h_out, + priority); + } + Status Insert(const Slice& key, void* value, const CacheItemHelper* helper, + size_t charge, Handle** handle = nullptr, + Priority priority = Priority::LOW) override { + if (!helper) { + return Status::InvalidArgument(); + } + HashVal hash = CacheShard::ComputeHash(key); + auto h_out = reinterpret_cast<HandleImpl**>(handle); + return GetShard(hash).Insert(key, hash, value, helper, charge, h_out, + priority); + } + + Handle* Lookup(const Slice& key, Statistics* /*stats*/) override { + HashVal hash = CacheShard::ComputeHash(key); + HandleImpl* result = GetShard(hash).Lookup(key, hash); + return reinterpret_cast<Handle*>(result); + } + Handle* Lookup(const Slice& key, const CacheItemHelper* helper, + const CreateCallback& create_cb, Priority priority, bool wait, + Statistics* stats = nullptr) override { + HashVal hash = CacheShard::ComputeHash(key); + HandleImpl* result = GetShard(hash).Lookup(key, hash, helper, create_cb, + priority, wait, stats); + return reinterpret_cast<Handle*>(result); + } + + void Erase(const Slice& key) override { + HashVal hash = CacheShard::ComputeHash(key); + GetShard(hash).Erase(key, hash); + } + + bool Release(Handle* handle, bool useful, + bool erase_if_last_ref = false) override { + auto h = reinterpret_cast<HandleImpl*>(handle); + return GetShard(h->GetHash()).Release(h, useful, erase_if_last_ref); + } + bool IsReady(Handle* handle) override { + auto h = reinterpret_cast<HandleImpl*>(handle); + return GetShard(h->GetHash()).IsReady(h); + } + void Wait(Handle* handle) override { + auto h = reinterpret_cast<HandleImpl*>(handle); + GetShard(h->GetHash()).Wait(h); + } + bool Ref(Handle* handle) override { + auto h = reinterpret_cast<HandleImpl*>(handle); + return GetShard(h->GetHash()).Ref(h); + } + bool 
Release(Handle* handle, bool erase_if_last_ref = false) override { + return Release(handle, true /*useful*/, erase_if_last_ref); + } + using ShardedCacheBase::GetUsage; + size_t GetUsage() const override { + return SumOverShards2(&CacheShard::GetUsage); + } + size_t GetPinnedUsage() const override { + return SumOverShards2(&CacheShard::GetPinnedUsage); + } + size_t GetOccupancyCount() const override { + return SumOverShards2(&CacheShard::GetPinnedUsage); + } + size_t GetTableAddressCount() const override { + return SumOverShards2(&CacheShard::GetTableAddressCount); + } + void ApplyToAllEntries( + const std::function<void(const Slice& key, void* value, size_t charge, + DeleterFn deleter)>& callback, + const ApplyToAllEntriesOptions& opts) override { + uint32_t num_shards = GetNumShards(); + // Iterate over part of each shard, rotating between shards, to + // minimize impact on latency of concurrent operations. + std::unique_ptr<size_t[]> states(new size_t[num_shards]{}); + + size_t aepl = opts.average_entries_per_lock; + aepl = std::min(aepl, size_t{1}); + + bool remaining_work; + do { + remaining_work = false; + for (uint32_t i = 0; i < num_shards; i++) { + if (states[i] != SIZE_MAX) { + shards_[i].ApplyToSomeEntries(callback, aepl, &states[i]); + remaining_work |= states[i] != SIZE_MAX; + } + } + } while (remaining_work); + } + + virtual void EraseUnRefEntries() override { + ForEachShard([](CacheShard* cs) { cs->EraseUnRefEntries(); }); + } + + void DisownData() override { + // Leak data only if that won't generate an ASAN/valgrind warning. + if (!kMustFreeHeapAllocations) { + destroy_shards_in_dtor_ = false; + } + } + + protected: + inline void ForEachShard(const std::function<void(CacheShard*)>& fn) { + uint32_t num_shards = GetNumShards(); + for (uint32_t i = 0; i < num_shards; i++) { + fn(shards_ + i); + } + } + + inline size_t SumOverShards( + const std::function<size_t(CacheShard&)>& fn) const { + uint32_t num_shards = GetNumShards(); + size_t result = 0; + for (uint32_t i = 0; i < num_shards; i++) { + result += fn(shards_[i]); + } + return result; + } + + inline size_t SumOverShards2(size_t (CacheShard::*fn)() const) const { + return SumOverShards([fn](CacheShard& cs) { return (cs.*fn)(); }); + } + + // Must be called exactly once by derived class constructor + void InitShards(const std::function<void(CacheShard*)>& placement_new) { + ForEachShard(placement_new); + destroy_shards_in_dtor_ = true; + } + + void AppendPrintableOptions(std::string& str) const override { + shards_[0].AppendPrintableOptions(str); + } + + private: + CacheShard* const shards_; + bool destroy_shards_in_dtor_; +}; + +// 512KB is traditional minimum shard size. +int GetDefaultCacheShardBits(size_t capacity, + size_t min_shard_size = 512U * 1024U); + +} // namespace ROCKSDB_NAMESPACE |
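For reference, a small usage sketch of GetDefaultCacheShardBits as implemented in sharded_cache.cc above: the shard count is derived from the capacity and the 512 KiB minimum shard size, capped at 6 bits (64 shards). The capacity values are illustrative choices, and the example assumes it is compiled inside the RocksDB source tree so the internal header is visible.

#include <cassert>

#include "cache/sharded_cache.h"

using ROCKSDB_NAMESPACE::GetDefaultCacheShardBits;

int main() {
  // 1 MiB / 512 KiB = 2 shards -> 1 shard bit.
  assert(GetDefaultCacheShardBits(1024 * 1024) == 1);
  // 8 MiB / 512 KiB = 16 shards -> 4 shard bits.
  assert(GetDefaultCacheShardBits(8 * 1024 * 1024) == 4);
  // Very large capacities are capped at 6 shard bits (64 shards).
  assert(GetDefaultCacheShardBits(1024 * 1024 * 1024) == 6);
  // Capacities below the minimum shard size get a single, unsharded shard.
  assert(GetDefaultCacheShardBits(100 * 1024) == 0);
  return 0;
}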