path: root/src/evict.c
author    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 17:31:02 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 17:31:02 +0000
commit    bb12c1fd00eb51118749bbbc69c5596835fcbd3b (patch)
tree      88038a98bd31c1b765f3390767a2ec12e37c79ec /src/evict.c
parent    Initial commit. (diff)
Adding upstream version 5:7.0.15.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/evict.c')
-rw-r--r--  src/evict.c  771
1 file changed, 771 insertions, 0 deletions
diff --git a/src/evict.c b/src/evict.c
new file mode 100644
index 0000000..a9e24a7
--- /dev/null
+++ b/src/evict.c
@@ -0,0 +1,771 @@
+/* Maxmemory directive handling (LRU eviction and other policies).
+ *
+ * ----------------------------------------------------------------------------
+ *
+ * Copyright (c) 2009-2016, Salvatore Sanfilippo <antirez at gmail dot com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "server.h"
+#include "bio.h"
+#include "atomicvar.h"
+#include "script.h"
+#include <math.h>
+
+/* ----------------------------------------------------------------------------
+ * Data structures
+ * --------------------------------------------------------------------------*/
+
+/* To improve the quality of the LRU approximation we take a set of keys
+ * that are good candidates for eviction across performEvictions() calls.
+ *
+ * Entries inside the eviction pool are kept ordered by idle time, putting
+ * greater idle times to the right (ascending order).
+ *
+ * When an LFU policy is used instead, an inverse frequency indication is
+ * used in place of the idle time, so that we still evict by larger value
+ * (a larger inverse frequency means evicting keys with the least frequent
+ * accesses).
+ *
+ * Empty entries have the key pointer set to NULL. */
+#define EVPOOL_SIZE 16
+#define EVPOOL_CACHED_SDS_SIZE 255
+struct evictionPoolEntry {
+ unsigned long long idle; /* Object idle time (inverse frequency for LFU) */
+ sds key; /* Key name. */
+ sds cached; /* Cached SDS object for key name. */
+ int dbid; /* Key DB number. */
+};
+
+static struct evictionPoolEntry *EvictionPoolLRU;
+
+/* ----------------------------------------------------------------------------
+ * Implementation of eviction, aging and LRU
+ * --------------------------------------------------------------------------*/
+
+/* Return the LRU clock, based on the clock resolution. This is a time
+ * in a reduced-bits format that can be used to set and check the
+ * object->lru field of redisObject structures. */
+unsigned int getLRUClock(void) {
+ return (mstime()/LRU_CLOCK_RESOLUTION) & LRU_CLOCK_MAX;
+}
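+
+/* Worked example (illustrative, assuming LRU_CLOCK_RESOLUTION == 1000 ms and
+ * the 24 bits of per-object clock space described in the LFU section below):
+ * the clock ticks once per second and wraps around every (1<<24) seconds,
+ * which is roughly 194 days. */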
+
+/* This function is used to obtain the current LRU clock.
+ * If the server cron period (1000/server.hz milliseconds) is less than or
+ * equal to the LRU clock resolution (as it should be in production servers)
+ * we return the precomputed value, otherwise we need to resort to a system
+ * call. */
+unsigned int LRU_CLOCK(void) {
+ unsigned int lruclock;
+ if (1000/server.hz <= LRU_CLOCK_RESOLUTION) {
+ atomicGet(server.lruclock,lruclock);
+ } else {
+ lruclock = getLRUClock();
+ }
+ return lruclock;
+}
+
+/* Given an object, return the minimum number of milliseconds elapsed since
+ * the object was last requested, using an approximated LRU algorithm. */
+unsigned long long estimateObjectIdleTime(robj *o) {
+ unsigned long long lruclock = LRU_CLOCK();
+ if (lruclock >= o->lru) {
+ return (lruclock - o->lru) * LRU_CLOCK_RESOLUTION;
+ } else {
+ return (lruclock + (LRU_CLOCK_MAX - o->lru)) *
+ LRU_CLOCK_RESOLUTION;
+ }
+}
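+
+/* Worked example (illustrative, assuming LRU_CLOCK_RESOLUTION == 1000 ms):
+ * if the clock wrapped, e.g. lruclock == 5 and o->lru == LRU_CLOCK_MAX - 2,
+ * the idle time is estimated as (5 + 2) * 1000 == 7000 ms, i.e. the clock
+ * is assumed to have wrapped around exactly once. */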
+
+/* LRU approximation algorithm
+ *
+ * Redis uses an approximation of the LRU algorithm that runs in constant
+ * memory. Every time there is a key to evict, we sample N keys (with
+ * N very small, usually around 5) to populate a pool of M best eviction
+ * candidates (the pool size M is defined by EVPOOL_SIZE).
+ *
+ * The N sampled keys are added to the pool of good eviction candidates
+ * (the ones with an old access time) if they are better than one of the
+ * keys currently in the pool.
+ *
+ * After the pool is populated, the best key we have in the pool is evicted.
+ * However note that we don't remove keys from the pool when they are deleted,
+ * so the pool may contain keys that no longer exist.
+ *
+ * When we try to evict a key and none of the entries in the pool exist
+ * anymore, we populate it again. This time we'll be sure that the pool has
+ * at least one key that can be evicted, if there is at least one key that
+ * can be evicted in the whole database. */
+
+/* Create a new eviction pool. */
+void evictionPoolAlloc(void) {
+ struct evictionPoolEntry *ep;
+ int j;
+
+ ep = zmalloc(sizeof(*ep)*EVPOOL_SIZE);
+ for (j = 0; j < EVPOOL_SIZE; j++) {
+ ep[j].idle = 0;
+ ep[j].key = NULL;
+ ep[j].cached = sdsnewlen(NULL,EVPOOL_CACHED_SDS_SIZE);
+ ep[j].dbid = 0;
+ }
+ EvictionPoolLRU = ep;
+}
+
+/* This is a helper function for performEvictions(), it is used in order
+ * to populate the evictionPool with a few entries every time we want to
+ * evict a key. Keys with an idle time greater than one of the current
+ * keys are added. Keys are always added if there are free entries.
+ *
+ * We insert keys in place in ascending order, so keys with a smaller
+ * idle time are on the left, and keys with a higher idle time on the
+ * right. */
+
+void evictionPoolPopulate(int dbid, dict *sampledict, dict *keydict, struct evictionPoolEntry *pool) {
+ int j, k, count;
+ dictEntry *samples[server.maxmemory_samples];
+
+ count = dictGetSomeKeys(sampledict,samples,server.maxmemory_samples);
+ for (j = 0; j < count; j++) {
+ unsigned long long idle;
+ sds key;
+ robj *o;
+ dictEntry *de;
+
+ de = samples[j];
+ key = dictGetKey(de);
+
+ /* If the dictionary we are sampling from is not the main
+ * dictionary (but the expires one) we need to look up the key
+ * again in the key dictionary to obtain the value object. */
+ if (server.maxmemory_policy != MAXMEMORY_VOLATILE_TTL) {
+ if (sampledict != keydict) de = dictFind(keydict, key);
+ o = dictGetVal(de);
+ }
+
+ /* Calculate the idle time according to the policy. This is called
+ * idle just because the code initially handled LRU, but it is in fact
+ * just a score where a higher score means a better eviction candidate. */
+ if (server.maxmemory_policy & MAXMEMORY_FLAG_LRU) {
+ idle = estimateObjectIdleTime(o);
+ } else if (server.maxmemory_policy & MAXMEMORY_FLAG_LFU) {
+ /* When we use an LRU policy, we sort the keys by idle time
+ * so that we evict keys starting from those with a greater
+ * idle time. However when the policy is an LFU one, we have a
+ * frequency estimation, and we want to evict keys with lower
+ * frequency first. So inside the pool we put objects using the
+ * inverted frequency, subtracting the actual frequency from the
+ * maximum frequency of 255. */
+ idle = 255-LFUDecrAndReturn(o);
+ } else if (server.maxmemory_policy == MAXMEMORY_VOLATILE_TTL) {
+ /* In this case the sooner the expire the better. */
+ idle = ULLONG_MAX - (long)dictGetVal(de);
+ } else {
+ serverPanic("Unknown eviction policy in evictionPoolPopulate()");
+ }
+
+ /* Insert the element inside the pool.
+ * First, find the first empty bucket, or the first populated
+ * bucket with an idle time greater than or equal to ours. */
+ k = 0;
+ while (k < EVPOOL_SIZE &&
+ pool[k].key &&
+ pool[k].idle < idle) k++;
+ if (k == 0 && pool[EVPOOL_SIZE-1].key != NULL) {
+ /* Can't insert if the element is < the worst element we have
+ * and there are no empty buckets. */
+ continue;
+ } else if (k < EVPOOL_SIZE && pool[k].key == NULL) {
+ /* Inserting into empty position. No setup needed before insert. */
+ } else {
+ /* Inserting in the middle. Now k points to the first element
+ * greater than the element to insert. */
+ if (pool[EVPOOL_SIZE-1].key == NULL) {
+ /* Free space on the right? Insert at k shifting
+ * all the elements from k to end to the right. */
+
+ /* Save SDS before overwriting. */
+ sds cached = pool[EVPOOL_SIZE-1].cached;
+ memmove(pool+k+1,pool+k,
+ sizeof(pool[0])*(EVPOOL_SIZE-k-1));
+ pool[k].cached = cached;
+ } else {
+ /* No free space on right? Insert at k-1 */
+ k--;
+ /* Shift all elements on the left of k (included) to the
+ * left, so we discard the element with smaller idle time. */
+ sds cached = pool[0].cached; /* Save SDS before overwriting. */
+ if (pool[0].key != pool[0].cached) sdsfree(pool[0].key);
+ memmove(pool,pool+1,sizeof(pool[0])*k);
+ pool[k].cached = cached;
+ }
+ }
+
+ /* Try to reuse the cached SDS string allocated in the pool entry,
+ * because allocating and deallocating this object is costly
+ * (according to the profiler, not my fantasy. Remember:
+ * premature optimization bla bla bla...). */
+ int klen = sdslen(key);
+ if (klen > EVPOOL_CACHED_SDS_SIZE) {
+ pool[k].key = sdsdup(key);
+ } else {
+ memcpy(pool[k].cached,key,klen+1);
+ sdssetlen(pool[k].cached,klen);
+ pool[k].key = pool[k].cached;
+ }
+ pool[k].idle = idle;
+ pool[k].dbid = dbid;
+ }
+}
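+
+/* Worked example of the insertion above (illustrative): with a pool holding
+ * idle times {3, 10, 50, <empty>, ...}, a sampled key with idle == 20 stops
+ * at k == 2 and is inserted there after shifting {50} one slot to the right,
+ * giving {3, 10, 20, 50, ...}. If instead the pool were full and the sampled
+ * key had idle == 1 (worse than pool[0]), it would be discarded. */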
+
+/* ----------------------------------------------------------------------------
+ * LFU (Least Frequently Used) implementation.
+ *
+ * We have 24 total bits of space in each object in order to implement
+ * an LFU (Least Frequently Used) eviction policy, since we re-use the
+ * LRU field for this purpose.
+ *
+ * We split the 24 bits into two fields:
+ *
+ * 16 bits 8 bits
+ * +----------------+--------+
+ * + Last decr time | LOG_C |
+ * +----------------+--------+
+ *
+ * LOG_C is a logarithmic counter that provides an indication of the access
+ * frequency. However this field must also be decremented, otherwise a key
+ * that was frequently accessed in the past would remain ranked like that
+ * forever, while we want the algorithm to adapt to access pattern changes.
+ *
+ * So the remaining 16 bits are used in order to store the "decrement time",
+ * a reduced-precision Unix time (we take 16 bits of the time converted
+ * to minutes, since we don't care about wrapping around) at which the LOG_C
+ * counter is halved if it has a high value, or just decremented if it
+ * has a low value.
+ *
+ * New keys don't start at zero, in order to have the ability to collect
+ * some accesses before being trashed away, so they start at LFU_INIT_VAL.
+ * The logarithmic increment performed on LOG_C takes care of LFU_INIT_VAL
+ * when incrementing the key, so that keys starting at LFU_INIT_VAL
+ * (or having a smaller value) have a very high chance of being incremented
+ * on access.
+ *
+ * During decrement, the value of the logarithmic counter is halved if
+ * its current value is greater than two times the LFU_INIT_VAL, otherwise
+ * it is just decremented by one.
+ * --------------------------------------------------------------------------*/
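+
+/* Worked example (illustrative, assuming LFU_INIT_VAL == 5): the two fields
+ * are packed as (ldt << 8) | counter, so an object with o->lru == 0x012C05
+ * has a last decrement time of 0x012C == 300 minutes and a LOG_C counter of
+ * 0x05 == 5, i.e. a key still at its initial value. */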
+
+/* Return the current time in minutes, just taking the least significant
+ * 16 bits. The returned time is suitable to be stored as LDT (last decrement
+ * time) for the LFU implementation. */
+unsigned long LFUGetTimeInMinutes(void) {
+ return (server.unixtime/60) & 65535;
+}
+
+/* Given an object last access time, compute the minimum number of minutes
+ * that elapsed since the last access. Handle overflow (ldt greater than
+ * the current 16-bit minutes time) by considering the time as having
+ * wrapped around exactly once. */
+unsigned long LFUTimeElapsed(unsigned long ldt) {
+ unsigned long now = LFUGetTimeInMinutes();
+ if (now >= ldt) return now-ldt;
+ return 65535-ldt+now;
+}
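+
+/* Worked example (illustrative): with ldt == 65530 and now == 2 the clock is
+ * considered to have wrapped once, so the elapsed time is
+ * 65535 - 65530 + 2 == 7 minutes. */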
+
+/* Logarithmically increment a counter. The greater the current counter value,
+ * the less likely it is that it gets really incremented. Saturate it at 255. */
+uint8_t LFULogIncr(uint8_t counter) {
+ if (counter == 255) return 255;
+ double r = (double)rand()/RAND_MAX;
+ double baseval = counter - LFU_INIT_VAL;
+ if (baseval < 0) baseval = 0;
+ double p = 1.0/(baseval*server.lfu_log_factor+1);
+ if (r < p) counter++;
+ return counter;
+}
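+
+/* Worked example (illustrative, assuming LFU_INIT_VAL == 5 and the default
+ * lfu_log_factor of 10): for counter <= LFU_INIT_VAL, baseval <= 0 so
+ * p == 1.0 and the counter is always incremented; at counter == 105,
+ * baseval == 100 and p == 1/(100*10+1), so on average about 1000 accesses
+ * are needed for a single increment. */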
+
+/* If the object decrement time is reached, decrement the LFU counter but
+ * do not update the LFU fields of the object: we update the access time
+ * and counter in an explicit way when the object is really accessed.
+ * The counter is lowered by one for every server.lfu_decay_time minutes
+ * elapsed since the last decrement (when lfu_decay_time is 0 no decrement
+ * is performed). Return the object frequency counter.
+ *
+ * This function is used in order to scan the dataset for the best object
+ * to evict: as we check the candidates, we incrementally decrement the
+ * counter of the scanned objects if needed. */
+unsigned long LFUDecrAndReturn(robj *o) {
+ unsigned long ldt = o->lru >> 8;
+ unsigned long counter = o->lru & 255;
+ unsigned long num_periods = server.lfu_decay_time ? LFUTimeElapsed(ldt) / server.lfu_decay_time : 0;
+ if (num_periods)
+ counter = (num_periods > counter) ? 0 : counter - num_periods;
+ return counter;
+}
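+
+/* Worked example (illustrative): with the default lfu_decay_time of 1 the
+ * counter is lowered by one for every elapsed minute, so 3 idle minutes turn
+ * a counter of 10 into 7; with lfu_decay_time == 10 the same 3 minutes yield
+ * num_periods == 0 and no decrement at all. */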
+
+/* We don't want to count AOF buffers and slaves output buffers as
+ * used memory: eviction should be driven mostly by the data size.
+ * Counting those buffers can cause a feedback loop: pushing DELs into
+ * them makes them bigger, so we would need to evict more keys, which
+ * generates yet more DELs, possibly causing a massive eviction loop
+ * that ends only when every key has been evicted.
+ *
+ * This function returns the sum of the AOF and replication buffer sizes. */
+size_t freeMemoryGetNotCountedMemory(void) {
+ size_t overhead = 0;
+
+ /* Since all replicas and the replication backlog share the global
+ * replication buffer, we consider only the part exceeding the backlog size
+ * as the extra, separate memory consumption of the replicas.
+ *
+ * Note that although the backlog is also grown incrementally at first
+ * (pushing DELs consumes memory), it'll eventually stop growing and
+ * remain constant in size, so even if its creation causes some
+ * eviction, it's capped, and also here to stay (no resonance effect).
+ *
+ * Note that, because we trim the backlog incrementally in the background,
+ * the backlog size may exceed our setting if slow replicas that reference
+ * vast replication buffer blocks disconnect. To avoid a massive eviction
+ * loop, we don't count the lazily freed replication backlog as used
+ * memory even if there are no replicas, i.e. we still regard this memory
+ * as the replicas'. */
+ if ((long long)server.repl_buffer_mem > server.repl_backlog_size) {
+ /* We use a list structure to manage the replication buffer blocks, so
+ * the backlog also occupies some extra memory. We can't know the exact
+ * number of blocks, so we approximate the size from the per-block size. */
+ size_t extra_approx_size =
+ (server.repl_backlog_size/PROTO_REPLY_CHUNK_BYTES + 1) *
+ (sizeof(replBufBlock)+sizeof(listNode));
+ size_t counted_mem = server.repl_backlog_size + extra_approx_size;
+ if (server.repl_buffer_mem > counted_mem) {
+ overhead += (server.repl_buffer_mem - counted_mem);
+ }
+ }
+
+ if (server.aof_state != AOF_OFF) {
+ overhead += sdsAllocSize(server.aof_buf);
+ }
+ return overhead;
+}
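+
+/* Worked example (illustrative, assuming PROTO_REPLY_CHUNK_BYTES == 16KiB):
+ * with repl_backlog_size == 1MiB the backlog spans about 65 blocks, so
+ * counted_mem is 1MiB plus ~65 * (sizeof(replBufBlock) + sizeof(listNode))
+ * bytes of list overhead; only the part of repl_buffer_mem above that is
+ * reported as overhead. */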
+
+/* Get the memory status from the point of view of the maxmemory directive:
+ * if the memory used is under the maxmemory setting then C_OK is returned.
+ * Otherwise, if we are over the memory limit, the function returns
+ * C_ERR.
+ *
+ * The function may return additional info via reference, only if the
+ * pointers to the respective arguments are not NULL. Certain fields are
+ * populated only when C_ERR is returned:
+ *
+ * 'total' total amount of bytes used.
+ * (Populated both for C_ERR and C_OK)
+ *
+ * 'logical' the amount of memory used minus the slaves/AOF buffers.
+ * (Populated when C_ERR is returned)
+ *
+ * 'tofree' the amount of memory that should be released
+ * in order to return back into the memory limits.
+ * (Populated when C_ERR is returned)
+ *
+ * 'level' this usually ranges from 0 to 1, and reports the fraction of
+ * maxmemory currently used. May be > 1 if we are over the memory
+ * limit.
+ * (Populated both for C_ERR and C_OK)
+ */
+int getMaxmemoryState(size_t *total, size_t *logical, size_t *tofree, float *level) {
+ size_t mem_reported, mem_used, mem_tofree;
+
+ /* Check if we are over the memory usage limit. If we are not, no need
+ * to subtract the slaves output buffers. We can just return ASAP. */
+ mem_reported = zmalloc_used_memory();
+ if (total) *total = mem_reported;
+
+ /* We may return ASAP if there is no need to compute the level. */
+ if (!server.maxmemory) {
+ if (level) *level = 0;
+ return C_OK;
+ }
+ if (mem_reported <= server.maxmemory && !level) return C_OK;
+
+ /* Remove the size of slaves output buffers and AOF buffer from the
+ * count of used memory. */
+ mem_used = mem_reported;
+ size_t overhead = freeMemoryGetNotCountedMemory();
+ mem_used = (mem_used > overhead) ? mem_used-overhead : 0;
+
+ /* Compute the ratio of memory usage. */
+ if (level) *level = (float)mem_used / (float)server.maxmemory;
+
+ if (mem_reported <= server.maxmemory) return C_OK;
+
+ /* Check if we are still over the memory limit. */
+ if (mem_used <= server.maxmemory) return C_OK;
+
+ /* Compute how much memory we need to free. */
+ mem_tofree = mem_used - server.maxmemory;
+
+ if (logical) *logical = mem_used;
+ if (tofree) *tofree = mem_tofree;
+
+ return C_ERR;
+}
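+
+/* Worked example (illustrative): with maxmemory == 100MB, 110MB reported by
+ * the allocator and 4MB of replication/AOF buffers, mem_used == 106MB, so
+ * the function returns C_ERR with *logical == 106MB, *tofree == 6MB and
+ * *level == 1.06. */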
+
+/* Return 1 if used memory is more than maxmemory after allocating 'moremem'
+ * more bytes, 0 if not. Redis may reject user requests or evict some keys if
+ * used memory exceeds maxmemory, especially when we allocate huge amounts of
+ * memory at once. */
+int overMaxmemoryAfterAlloc(size_t moremem) {
+ if (!server.maxmemory) return 0; /* No limit. */
+
+ /* Check quickly. */
+ size_t mem_used = zmalloc_used_memory();
+ if (mem_used + moremem <= server.maxmemory) return 0;
+
+ size_t overhead = freeMemoryGetNotCountedMemory();
+ mem_used = (mem_used > overhead) ? mem_used - overhead : 0;
+ return mem_used + moremem > server.maxmemory;
+}
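+
+/* Worked example (illustrative): with maxmemory == 100MB, 95MB in use, 8MB
+ * of buffer overhead and moremem == 10MB, the quick check fails
+ * (105MB > 100MB) but the precise one passes: 95 - 8 + 10 == 97MB <= 100MB,
+ * so the function returns 0. */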
+
+/* The evictionTimeProc is started when "maxmemory" has been breached and
+ * could not immediately be resolved. This will spin the event loop with short
+ * eviction cycles until the "maxmemory" condition has resolved or there are no
+ * more evictable items. */
+static int isEvictionProcRunning = 0;
+static int evictionTimeProc(
+ struct aeEventLoop *eventLoop, long long id, void *clientData) {
+ UNUSED(eventLoop);
+ UNUSED(id);
+ UNUSED(clientData);
+
+ if (performEvictions() == EVICT_RUNNING) return 0; /* keep evicting */
+
+ /* For EVICT_OK - things are good, no need to keep evicting.
+ * For EVICT_FAIL - there is nothing left to evict. */
+ isEvictionProcRunning = 0;
+ return AE_NOMORE;
+}
+
+void startEvictionTimeProc(void) {
+ if (!isEvictionProcRunning) {
+ isEvictionProcRunning = 1;
+ aeCreateTimeEvent(server.el, 0,
+ evictionTimeProc, NULL, NULL);
+ }
+}
+
+/* Check if it's safe to perform evictions.
+ * Returns 1 if evictions can be performed.
+ * Returns 0 if eviction processing should be skipped. */
+static int isSafeToPerformEvictions(void) {
+ /* - There must be no script in a timeout condition.
+ * - We must not be loading data right now. */
+ if (isInsideYieldingLongCommand() || server.loading) return 0;
+
+ /* By default replicas should ignore maxmemory
+ * and just be exact copies of their masters. */
+ if (server.masterhost && server.repl_slave_ignore_maxmemory) return 0;
+
+ /* When clients are paused the dataset should be static not just from the
+ * POV of clients not being able to write, but also from the POV of
+ * expires and evictions of keys not being performed. */
+ if (checkClientPauseTimeoutAndReturnIfPaused()) return 0;
+
+ return 1;
+}
+
+/* Algorithm for converting tenacity (0-100) to a time limit. */
+static unsigned long evictionTimeLimitUs(void) {
+ serverAssert(server.maxmemory_eviction_tenacity >= 0);
+ serverAssert(server.maxmemory_eviction_tenacity <= 100);
+
+ if (server.maxmemory_eviction_tenacity <= 10) {
+ /* A linear progression from 0..500us */
+ return 50uL * server.maxmemory_eviction_tenacity;
+ }
+
+ if (server.maxmemory_eviction_tenacity < 100) {
+ /* A 15% geometric progression, resulting in a limit of ~2 min at tenacity==99 */
+ return (unsigned long)(500.0 * pow(1.15, server.maxmemory_eviction_tenacity - 10.0));
+ }
+
+ return ULONG_MAX; /* No limit to eviction time */
+}
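+
+/* Illustrative values of the resulting limit (computed from the formulas
+ * above): tenacity 0 -> 0us, 10 -> 500us, 50 -> 500us * 1.15^40 ~= 134ms,
+ * 99 -> 500us * 1.15^89 ~= 126s (the ~2 minutes mentioned above), and
+ * 100 -> no time limit at all. */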
+
+/* Check that memory usage is within the current "maxmemory" limit. If over
+ * "maxmemory", attempt to free memory by evicting data (if it's safe to do so).
+ *
+ * It's possible for Redis to suddenly be significantly over the "maxmemory"
+ * setting. This can happen if there is a large allocation (like a hash table
+ * resize) or even if the "maxmemory" setting is manually adjusted. Because of
+ * this, it's important to evict for a managed period of time - otherwise Redis
+ * would become unresponsive while evicting.
+ *
+ * The goal of this function is to improve the memory situation - not to
+ * immediately resolve it. In the case that some items have been evicted but
+ * the "maxmemory" limit has not been achieved, an aeTimeProc will be started
+ * which will continue to evict items until memory limits are achieved or
+ * nothing more is evictable.
+ *
+ * This should be called before execution of commands. If EVICT_FAIL is
+ * returned, commands which will result in increased memory usage should be
+ * rejected.
+ *
+ * Returns:
+ * EVICT_OK - memory is OK or it's not possible to perform evictions now
+ * EVICT_RUNNING - memory is over the limit, but eviction is still processing
+ * EVICT_FAIL - memory is over the limit, and there's nothing to evict
+ * */
+int performEvictions(void) {
+ /* Note, we don't goto update_metrics here because this check skips eviction
+ * as if it wasn't triggered. It's a fake EVICT_OK. */
+ if (!isSafeToPerformEvictions()) return EVICT_OK;
+
+ int keys_freed = 0;
+ size_t mem_reported, mem_tofree;
+ long long mem_freed; /* May be negative */
+ mstime_t latency, eviction_latency;
+ long long delta;
+ int slaves = listLength(server.slaves);
+ int result = EVICT_FAIL;
+
+ if (getMaxmemoryState(&mem_reported,NULL,&mem_tofree,NULL) == C_OK) {
+ result = EVICT_OK;
+ goto update_metrics;
+ }
+
+ if (server.maxmemory_policy == MAXMEMORY_NO_EVICTION) {
+ result = EVICT_FAIL; /* We need to free memory, but policy forbids. */
+ goto update_metrics;
+ }
+
+ unsigned long eviction_time_limit_us = evictionTimeLimitUs();
+
+ mem_freed = 0;
+
+ latencyStartMonitor(latency);
+
+ monotime evictionTimer;
+ elapsedStart(&evictionTimer);
+
+ /* Unlike active-expire and blocked clients, we can reach this point from
+ * 'CONFIG SET maxmemory', so we have to back up and restore
+ * server.core_propagates. */
+ int prev_core_propagates = server.core_propagates;
+ serverAssert(server.also_propagate.numops == 0);
+ server.core_propagates = 1;
+ server.propagate_no_multi = 1;
+
+ while (mem_freed < (long long)mem_tofree) {
+ int j, k, i;
+ static unsigned int next_db = 0;
+ sds bestkey = NULL;
+ int bestdbid;
+ redisDb *db;
+ dict *dict;
+ dictEntry *de;
+
+ if (server.maxmemory_policy & (MAXMEMORY_FLAG_LRU|MAXMEMORY_FLAG_LFU) ||
+ server.maxmemory_policy == MAXMEMORY_VOLATILE_TTL)
+ {
+ struct evictionPoolEntry *pool = EvictionPoolLRU;
+
+ while (bestkey == NULL) {
+ unsigned long total_keys = 0, keys;
+
+ /* We don't want to make local-db choices when expiring keys,
+ * so to start we populate the eviction pool sampling keys from
+ * every DB. */
+ for (i = 0; i < server.dbnum; i++) {
+ db = server.db+i;
+ dict = (server.maxmemory_policy & MAXMEMORY_FLAG_ALLKEYS) ?
+ db->dict : db->expires;
+ if ((keys = dictSize(dict)) != 0) {
+ evictionPoolPopulate(i, dict, db->dict, pool);
+ total_keys += keys;
+ }
+ }
+ if (!total_keys) break; /* No keys to evict. */
+
+ /* Go backward from best to worst element to evict. */
+ for (k = EVPOOL_SIZE-1; k >= 0; k--) {
+ if (pool[k].key == NULL) continue;
+ bestdbid = pool[k].dbid;
+
+ if (server.maxmemory_policy & MAXMEMORY_FLAG_ALLKEYS) {
+ de = dictFind(server.db[bestdbid].dict,
+ pool[k].key);
+ } else {
+ de = dictFind(server.db[bestdbid].expires,
+ pool[k].key);
+ }
+
+ /* Remove the entry from the pool. */
+ if (pool[k].key != pool[k].cached)
+ sdsfree(pool[k].key);
+ pool[k].key = NULL;
+ pool[k].idle = 0;
+
+ /* If the key exists, it is our pick. Otherwise it is
+ * a ghost and we need to try the next element. */
+ if (de) {
+ bestkey = dictGetKey(de);
+ break;
+ } else {
+ /* Ghost... Iterate again. */
+ }
+ }
+ }
+ }
+
+ /* volatile-random and allkeys-random policy */
+ else if (server.maxmemory_policy == MAXMEMORY_ALLKEYS_RANDOM ||
+ server.maxmemory_policy == MAXMEMORY_VOLATILE_RANDOM)
+ {
+ /* When evicting a random key, we try to evict a key for
+ * each DB, so we use the static 'next_db' variable to
+ * incrementally visit all DBs. */
+ for (i = 0; i < server.dbnum; i++) {
+ j = (++next_db) % server.dbnum;
+ db = server.db+j;
+ dict = (server.maxmemory_policy == MAXMEMORY_ALLKEYS_RANDOM) ?
+ db->dict : db->expires;
+ if (dictSize(dict) != 0) {
+ de = dictGetRandomKey(dict);
+ bestkey = dictGetKey(de);
+ bestdbid = j;
+ break;
+ }
+ }
+ }
+
+ /* Finally remove the selected key. */
+ if (bestkey) {
+ db = server.db+bestdbid;
+ robj *keyobj = createStringObject(bestkey,sdslen(bestkey));
+ /* We compute the amount of memory freed by db*Delete() alone.
+ * It is possible that the memory needed to propagate the DEL
+ * in the AOF and the replication link is actually greater than
+ * the memory we are freeing by removing the key, but we can't
+ * account for that, otherwise we would never exit the loop.
+ *
+ * Same for CSC invalidation messages generated by signalModifiedKey.
+ *
+ * AOF and output buffer memory will be freed eventually so
+ * we only care about memory used by the key space. */
+ delta = (long long) zmalloc_used_memory();
+ latencyStartMonitor(eviction_latency);
+ if (server.lazyfree_lazy_eviction)
+ dbAsyncDelete(db,keyobj);
+ else
+ dbSyncDelete(db,keyobj);
+ latencyEndMonitor(eviction_latency);
+ latencyAddSampleIfNeeded("eviction-del",eviction_latency);
+ delta -= (long long) zmalloc_used_memory();
+ mem_freed += delta;
+ server.stat_evictedkeys++;
+ signalModifiedKey(NULL,db,keyobj);
+ notifyKeyspaceEvent(NOTIFY_EVICTED, "evicted",
+ keyobj, db->id);
+ propagateDeletion(db,keyobj,server.lazyfree_lazy_eviction);
+ decrRefCount(keyobj);
+ keys_freed++;
+
+ if (keys_freed % 16 == 0) {
+ /* When the memory to free starts to be big enough, we may
+ * start spending so much time here that it is impossible to
+ * deliver data to the replicas fast enough, so we force the
+ * transmission here inside the loop. */
+ if (slaves) flushSlavesOutputBuffers();
+
+ /* Normally our stop condition is the ability to release
+ * a fixed, pre-computed amount of memory. However when we
+ * are deleting objects in another thread, it's better to
+ * check, from time to time, if we already reached our target
+ * memory, since the "mem_freed" amount is computed only
+ * across the dbAsyncDelete() call, while the thread can
+ * release the memory all the time. */
+ if (server.lazyfree_lazy_eviction) {
+ if (getMaxmemoryState(NULL,NULL,NULL,NULL) == C_OK) {
+ break;
+ }
+ }
+
+ /* After some time, exit the loop early - even if the memory limit
+ * hasn't been reached. If we suddenly need to free a lot of
+ * memory, we don't want to spend too much time here. */
+ if (elapsedUs(evictionTimer) > eviction_time_limit_us) {
+ // We still need to free memory - start eviction timer proc
+ startEvictionTimeProc();
+ break;
+ }
+ }
+ } else {
+ goto cant_free; /* nothing to free... */
+ }
+ }
+ /* At this point, the memory is OK, or we have reached the time limit. */
+ result = (isEvictionProcRunning) ? EVICT_RUNNING : EVICT_OK;
+
+cant_free:
+ if (result == EVICT_FAIL) {
+ /* At this point, we have run out of evictable items. It's possible
+ * that some items are being freed in the lazyfree thread. Perform a
+ * short wait here if such jobs exist, but don't wait long. */
+ mstime_t lazyfree_latency;
+ latencyStartMonitor(lazyfree_latency);
+ while (bioPendingJobsOfType(BIO_LAZY_FREE) &&
+ elapsedUs(evictionTimer) < eviction_time_limit_us) {
+ if (getMaxmemoryState(NULL,NULL,NULL,NULL) == C_OK) {
+ result = EVICT_OK;
+ break;
+ }
+ usleep(eviction_time_limit_us < 1000 ? eviction_time_limit_us : 1000);
+ }
+ latencyEndMonitor(lazyfree_latency);
+ latencyAddSampleIfNeeded("eviction-lazyfree",lazyfree_latency);
+ }
+
+ serverAssert(server.core_propagates); /* This function should not be re-entrant */
+
+ /* Propagate all DELs */
+ propagatePendingCommands();
+
+ server.core_propagates = prev_core_propagates;
+ server.propagate_no_multi = 0;
+
+ latencyEndMonitor(latency);
+ latencyAddSampleIfNeeded("eviction-cycle",latency);
+
+update_metrics:
+ if (result == EVICT_RUNNING || result == EVICT_FAIL) {
+ if (server.stat_last_eviction_exceeded_time == 0)
+ elapsedStart(&server.stat_last_eviction_exceeded_time);
+ } else if (result == EVICT_OK) {
+ if (server.stat_last_eviction_exceeded_time != 0) {
+ server.stat_total_eviction_exceeded_time += elapsedUs(server.stat_last_eviction_exceeded_time);
+ server.stat_last_eviction_exceeded_time = 0;
+ }
+ }
+ return result;
+}