diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 17:31:02 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 17:31:02 +0000 |
commit | bb12c1fd00eb51118749bbbc69c5596835fcbd3b (patch) | |
tree | 88038a98bd31c1b765f3390767a2ec12e37c79ec /src/evict.c | |
parent | Initial commit. (diff) | |
download | redis-bb12c1fd00eb51118749bbbc69c5596835fcbd3b.tar.xz redis-bb12c1fd00eb51118749bbbc69c5596835fcbd3b.zip |
Adding upstream version 5:7.0.15.upstream/5%7.0.15upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/evict.c')
-rw-r--r-- | src/evict.c | 771 |
1 files changed, 771 insertions, 0 deletions
diff --git a/src/evict.c b/src/evict.c new file mode 100644 index 0000000..a9e24a7 --- /dev/null +++ b/src/evict.c @@ -0,0 +1,771 @@ +/* Maxmemory directive handling (LRU eviction and other policies). + * + * ---------------------------------------------------------------------------- + * + * Copyright (c) 2009-2016, Salvatore Sanfilippo <antirez at gmail dot com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "server.h" +#include "bio.h" +#include "atomicvar.h" +#include "script.h" +#include <math.h> + +/* ---------------------------------------------------------------------------- + * Data structures + * --------------------------------------------------------------------------*/ + +/* To improve the quality of the LRU approximation we take a set of keys + * that are good candidate for eviction across performEvictions() calls. + * + * Entries inside the eviction pool are taken ordered by idle time, putting + * greater idle times to the right (ascending order). + * + * When an LFU policy is used instead, a reverse frequency indication is used + * instead of the idle time, so that we still evict by larger value (larger + * inverse frequency means to evict keys with the least frequent accesses). + * + * Empty entries have the key pointer set to NULL. */ +#define EVPOOL_SIZE 16 +#define EVPOOL_CACHED_SDS_SIZE 255 +struct evictionPoolEntry { + unsigned long long idle; /* Object idle time (inverse frequency for LFU) */ + sds key; /* Key name. */ + sds cached; /* Cached SDS object for key name. */ + int dbid; /* Key DB number. */ +}; + +static struct evictionPoolEntry *EvictionPoolLRU; + +/* ---------------------------------------------------------------------------- + * Implementation of eviction, aging and LRU + * --------------------------------------------------------------------------*/ + +/* Return the LRU clock, based on the clock resolution. This is a time + * in a reduced-bits format that can be used to set and check the + * object->lru field of redisObject structures. */ +unsigned int getLRUClock(void) { + return (mstime()/LRU_CLOCK_RESOLUTION) & LRU_CLOCK_MAX; +} + +/* This function is used to obtain the current LRU clock. + * If the current resolution is lower than the frequency we refresh the + * LRU clock (as it should be in production servers) we return the + * precomputed value, otherwise we need to resort to a system call. */ +unsigned int LRU_CLOCK(void) { + unsigned int lruclock; + if (1000/server.hz <= LRU_CLOCK_RESOLUTION) { + atomicGet(server.lruclock,lruclock); + } else { + lruclock = getLRUClock(); + } + return lruclock; +} + +/* Given an object returns the min number of milliseconds the object was never + * requested, using an approximated LRU algorithm. */ +unsigned long long estimateObjectIdleTime(robj *o) { + unsigned long long lruclock = LRU_CLOCK(); + if (lruclock >= o->lru) { + return (lruclock - o->lru) * LRU_CLOCK_RESOLUTION; + } else { + return (lruclock + (LRU_CLOCK_MAX - o->lru)) * + LRU_CLOCK_RESOLUTION; + } +} + +/* LRU approximation algorithm + * + * Redis uses an approximation of the LRU algorithm that runs in constant + * memory. Every time there is a key to expire, we sample N keys (with + * N very small, usually in around 5) to populate a pool of best keys to + * evict of M keys (the pool size is defined by EVPOOL_SIZE). + * + * The N keys sampled are added in the pool of good keys to expire (the one + * with an old access time) if they are better than one of the current keys + * in the pool. + * + * After the pool is populated, the best key we have in the pool is expired. + * However note that we don't remove keys from the pool when they are deleted + * so the pool may contain keys that no longer exist. + * + * When we try to evict a key, and all the entries in the pool don't exist + * we populate it again. This time we'll be sure that the pool has at least + * one key that can be evicted, if there is at least one key that can be + * evicted in the whole database. */ + +/* Create a new eviction pool. */ +void evictionPoolAlloc(void) { + struct evictionPoolEntry *ep; + int j; + + ep = zmalloc(sizeof(*ep)*EVPOOL_SIZE); + for (j = 0; j < EVPOOL_SIZE; j++) { + ep[j].idle = 0; + ep[j].key = NULL; + ep[j].cached = sdsnewlen(NULL,EVPOOL_CACHED_SDS_SIZE); + ep[j].dbid = 0; + } + EvictionPoolLRU = ep; +} + +/* This is a helper function for performEvictions(), it is used in order + * to populate the evictionPool with a few entries every time we want to + * expire a key. Keys with idle time bigger than one of the current + * keys are added. Keys are always added if there are free entries. + * + * We insert keys on place in ascending order, so keys with the smaller + * idle time are on the left, and keys with the higher idle time on the + * right. */ + +void evictionPoolPopulate(int dbid, dict *sampledict, dict *keydict, struct evictionPoolEntry *pool) { + int j, k, count; + dictEntry *samples[server.maxmemory_samples]; + + count = dictGetSomeKeys(sampledict,samples,server.maxmemory_samples); + for (j = 0; j < count; j++) { + unsigned long long idle; + sds key; + robj *o; + dictEntry *de; + + de = samples[j]; + key = dictGetKey(de); + + /* If the dictionary we are sampling from is not the main + * dictionary (but the expires one) we need to lookup the key + * again in the key dictionary to obtain the value object. */ + if (server.maxmemory_policy != MAXMEMORY_VOLATILE_TTL) { + if (sampledict != keydict) de = dictFind(keydict, key); + o = dictGetVal(de); + } + + /* Calculate the idle time according to the policy. This is called + * idle just because the code initially handled LRU, but is in fact + * just a score where an higher score means better candidate. */ + if (server.maxmemory_policy & MAXMEMORY_FLAG_LRU) { + idle = estimateObjectIdleTime(o); + } else if (server.maxmemory_policy & MAXMEMORY_FLAG_LFU) { + /* When we use an LRU policy, we sort the keys by idle time + * so that we expire keys starting from greater idle time. + * However when the policy is an LFU one, we have a frequency + * estimation, and we want to evict keys with lower frequency + * first. So inside the pool we put objects using the inverted + * frequency subtracting the actual frequency to the maximum + * frequency of 255. */ + idle = 255-LFUDecrAndReturn(o); + } else if (server.maxmemory_policy == MAXMEMORY_VOLATILE_TTL) { + /* In this case the sooner the expire the better. */ + idle = ULLONG_MAX - (long)dictGetVal(de); + } else { + serverPanic("Unknown eviction policy in evictionPoolPopulate()"); + } + + /* Insert the element inside the pool. + * First, find the first empty bucket or the first populated + * bucket that has an idle time smaller than our idle time. */ + k = 0; + while (k < EVPOOL_SIZE && + pool[k].key && + pool[k].idle < idle) k++; + if (k == 0 && pool[EVPOOL_SIZE-1].key != NULL) { + /* Can't insert if the element is < the worst element we have + * and there are no empty buckets. */ + continue; + } else if (k < EVPOOL_SIZE && pool[k].key == NULL) { + /* Inserting into empty position. No setup needed before insert. */ + } else { + /* Inserting in the middle. Now k points to the first element + * greater than the element to insert. */ + if (pool[EVPOOL_SIZE-1].key == NULL) { + /* Free space on the right? Insert at k shifting + * all the elements from k to end to the right. */ + + /* Save SDS before overwriting. */ + sds cached = pool[EVPOOL_SIZE-1].cached; + memmove(pool+k+1,pool+k, + sizeof(pool[0])*(EVPOOL_SIZE-k-1)); + pool[k].cached = cached; + } else { + /* No free space on right? Insert at k-1 */ + k--; + /* Shift all elements on the left of k (included) to the + * left, so we discard the element with smaller idle time. */ + sds cached = pool[0].cached; /* Save SDS before overwriting. */ + if (pool[0].key != pool[0].cached) sdsfree(pool[0].key); + memmove(pool,pool+1,sizeof(pool[0])*k); + pool[k].cached = cached; + } + } + + /* Try to reuse the cached SDS string allocated in the pool entry, + * because allocating and deallocating this object is costly + * (according to the profiler, not my fantasy. Remember: + * premature optimization bla bla bla. */ + int klen = sdslen(key); + if (klen > EVPOOL_CACHED_SDS_SIZE) { + pool[k].key = sdsdup(key); + } else { + memcpy(pool[k].cached,key,klen+1); + sdssetlen(pool[k].cached,klen); + pool[k].key = pool[k].cached; + } + pool[k].idle = idle; + pool[k].dbid = dbid; + } +} + +/* ---------------------------------------------------------------------------- + * LFU (Least Frequently Used) implementation. + + * We have 24 total bits of space in each object in order to implement + * an LFU (Least Frequently Used) eviction policy, since we re-use the + * LRU field for this purpose. + * + * We split the 24 bits into two fields: + * + * 16 bits 8 bits + * +----------------+--------+ + * + Last decr time | LOG_C | + * +----------------+--------+ + * + * LOG_C is a logarithmic counter that provides an indication of the access + * frequency. However this field must also be decremented otherwise what used + * to be a frequently accessed key in the past, will remain ranked like that + * forever, while we want the algorithm to adapt to access pattern changes. + * + * So the remaining 16 bits are used in order to store the "decrement time", + * a reduced-precision Unix time (we take 16 bits of the time converted + * in minutes since we don't care about wrapping around) where the LOG_C + * counter is halved if it has an high value, or just decremented if it + * has a low value. + * + * New keys don't start at zero, in order to have the ability to collect + * some accesses before being trashed away, so they start at LFU_INIT_VAL. + * The logarithmic increment performed on LOG_C takes care of LFU_INIT_VAL + * when incrementing the key, so that keys starting at LFU_INIT_VAL + * (or having a smaller value) have a very high chance of being incremented + * on access. + * + * During decrement, the value of the logarithmic counter is halved if + * its current value is greater than two times the LFU_INIT_VAL, otherwise + * it is just decremented by one. + * --------------------------------------------------------------------------*/ + +/* Return the current time in minutes, just taking the least significant + * 16 bits. The returned time is suitable to be stored as LDT (last decrement + * time) for the LFU implementation. */ +unsigned long LFUGetTimeInMinutes(void) { + return (server.unixtime/60) & 65535; +} + +/* Given an object last access time, compute the minimum number of minutes + * that elapsed since the last access. Handle overflow (ldt greater than + * the current 16 bits minutes time) considering the time as wrapping + * exactly once. */ +unsigned long LFUTimeElapsed(unsigned long ldt) { + unsigned long now = LFUGetTimeInMinutes(); + if (now >= ldt) return now-ldt; + return 65535-ldt+now; +} + +/* Logarithmically increment a counter. The greater is the current counter value + * the less likely is that it gets really incremented. Saturate it at 255. */ +uint8_t LFULogIncr(uint8_t counter) { + if (counter == 255) return 255; + double r = (double)rand()/RAND_MAX; + double baseval = counter - LFU_INIT_VAL; + if (baseval < 0) baseval = 0; + double p = 1.0/(baseval*server.lfu_log_factor+1); + if (r < p) counter++; + return counter; +} + +/* If the object decrement time is reached decrement the LFU counter but + * do not update LFU fields of the object, we update the access time + * and counter in an explicit way when the object is really accessed. + * And we will times halve the counter according to the times of + * elapsed time than server.lfu_decay_time. + * Return the object frequency counter. + * + * This function is used in order to scan the dataset for the best object + * to fit: as we check for the candidate, we incrementally decrement the + * counter of the scanned objects if needed. */ +unsigned long LFUDecrAndReturn(robj *o) { + unsigned long ldt = o->lru >> 8; + unsigned long counter = o->lru & 255; + unsigned long num_periods = server.lfu_decay_time ? LFUTimeElapsed(ldt) / server.lfu_decay_time : 0; + if (num_periods) + counter = (num_periods > counter) ? 0 : counter - num_periods; + return counter; +} + +/* We don't want to count AOF buffers and slaves output buffers as + * used memory: the eviction should use mostly data size, because + * it can cause feedback-loop when we push DELs into them, putting + * more and more DELs will make them bigger, if we count them, we + * need to evict more keys, and then generate more DELs, maybe cause + * massive eviction loop, even all keys are evicted. + * + * This function returns the sum of AOF and replication buffer. */ +size_t freeMemoryGetNotCountedMemory(void) { + size_t overhead = 0; + + /* Since all replicas and replication backlog share global replication + * buffer, we think only the part of exceeding backlog size is the extra + * separate consumption of replicas. + * + * Note that although the backlog is also initially incrementally grown + * (pushing DELs consumes memory), it'll eventually stop growing and + * remain constant in size, so even if its creation will cause some + * eviction, it's capped, and also here to stay (no resonance effect) + * + * Note that, because we trim backlog incrementally in the background, + * backlog size may exceeds our setting if slow replicas that reference + * vast replication buffer blocks disconnect. To avoid massive eviction + * loop, we don't count the delayed freed replication backlog into used + * memory even if there are no replicas, i.e. we still regard this memory + * as replicas'. */ + if ((long long)server.repl_buffer_mem > server.repl_backlog_size) { + /* We use list structure to manage replication buffer blocks, so backlog + * also occupies some extra memory, we can't know exact blocks numbers, + * we only get approximate size according to per block size. */ + size_t extra_approx_size = + (server.repl_backlog_size/PROTO_REPLY_CHUNK_BYTES + 1) * + (sizeof(replBufBlock)+sizeof(listNode)); + size_t counted_mem = server.repl_backlog_size + extra_approx_size; + if (server.repl_buffer_mem > counted_mem) { + overhead += (server.repl_buffer_mem - counted_mem); + } + } + + if (server.aof_state != AOF_OFF) { + overhead += sdsAllocSize(server.aof_buf); + } + return overhead; +} + +/* Get the memory status from the point of view of the maxmemory directive: + * if the memory used is under the maxmemory setting then C_OK is returned. + * Otherwise, if we are over the memory limit, the function returns + * C_ERR. + * + * The function may return additional info via reference, only if the + * pointers to the respective arguments is not NULL. Certain fields are + * populated only when C_ERR is returned: + * + * 'total' total amount of bytes used. + * (Populated both for C_ERR and C_OK) + * + * 'logical' the amount of memory used minus the slaves/AOF buffers. + * (Populated when C_ERR is returned) + * + * 'tofree' the amount of memory that should be released + * in order to return back into the memory limits. + * (Populated when C_ERR is returned) + * + * 'level' this usually ranges from 0 to 1, and reports the amount of + * memory currently used. May be > 1 if we are over the memory + * limit. + * (Populated both for C_ERR and C_OK) + */ +int getMaxmemoryState(size_t *total, size_t *logical, size_t *tofree, float *level) { + size_t mem_reported, mem_used, mem_tofree; + + /* Check if we are over the memory usage limit. If we are not, no need + * to subtract the slaves output buffers. We can just return ASAP. */ + mem_reported = zmalloc_used_memory(); + if (total) *total = mem_reported; + + /* We may return ASAP if there is no need to compute the level. */ + if (!server.maxmemory) { + if (level) *level = 0; + return C_OK; + } + if (mem_reported <= server.maxmemory && !level) return C_OK; + + /* Remove the size of slaves output buffers and AOF buffer from the + * count of used memory. */ + mem_used = mem_reported; + size_t overhead = freeMemoryGetNotCountedMemory(); + mem_used = (mem_used > overhead) ? mem_used-overhead : 0; + + /* Compute the ratio of memory usage. */ + if (level) *level = (float)mem_used / (float)server.maxmemory; + + if (mem_reported <= server.maxmemory) return C_OK; + + /* Check if we are still over the memory limit. */ + if (mem_used <= server.maxmemory) return C_OK; + + /* Compute how much memory we need to free. */ + mem_tofree = mem_used - server.maxmemory; + + if (logical) *logical = mem_used; + if (tofree) *tofree = mem_tofree; + + return C_ERR; +} + +/* Return 1 if used memory is more than maxmemory after allocating more memory, + * return 0 if not. Redis may reject user's requests or evict some keys if used + * memory exceeds maxmemory, especially, when we allocate huge memory at once. */ +int overMaxmemoryAfterAlloc(size_t moremem) { + if (!server.maxmemory) return 0; /* No limit. */ + + /* Check quickly. */ + size_t mem_used = zmalloc_used_memory(); + if (mem_used + moremem <= server.maxmemory) return 0; + + size_t overhead = freeMemoryGetNotCountedMemory(); + mem_used = (mem_used > overhead) ? mem_used - overhead : 0; + return mem_used + moremem > server.maxmemory; +} + +/* The evictionTimeProc is started when "maxmemory" has been breached and + * could not immediately be resolved. This will spin the event loop with short + * eviction cycles until the "maxmemory" condition has resolved or there are no + * more evictable items. */ +static int isEvictionProcRunning = 0; +static int evictionTimeProc( + struct aeEventLoop *eventLoop, long long id, void *clientData) { + UNUSED(eventLoop); + UNUSED(id); + UNUSED(clientData); + + if (performEvictions() == EVICT_RUNNING) return 0; /* keep evicting */ + + /* For EVICT_OK - things are good, no need to keep evicting. + * For EVICT_FAIL - there is nothing left to evict. */ + isEvictionProcRunning = 0; + return AE_NOMORE; +} + +void startEvictionTimeProc(void) { + if (!isEvictionProcRunning) { + isEvictionProcRunning = 1; + aeCreateTimeEvent(server.el, 0, + evictionTimeProc, NULL, NULL); + } +} + +/* Check if it's safe to perform evictions. + * Returns 1 if evictions can be performed + * Returns 0 if eviction processing should be skipped + */ +static int isSafeToPerformEvictions(void) { + /* - There must be no script in timeout condition. + * - Nor we are loading data right now. */ + if (isInsideYieldingLongCommand() || server.loading) return 0; + + /* By default replicas should ignore maxmemory + * and just be masters exact copies. */ + if (server.masterhost && server.repl_slave_ignore_maxmemory) return 0; + + /* When clients are paused the dataset should be static not just from the + * POV of clients not being able to write, but also from the POV of + * expires and evictions of keys not being performed. */ + if (checkClientPauseTimeoutAndReturnIfPaused()) return 0; + + return 1; +} + +/* Algorithm for converting tenacity (0-100) to a time limit. */ +static unsigned long evictionTimeLimitUs() { + serverAssert(server.maxmemory_eviction_tenacity >= 0); + serverAssert(server.maxmemory_eviction_tenacity <= 100); + + if (server.maxmemory_eviction_tenacity <= 10) { + /* A linear progression from 0..500us */ + return 50uL * server.maxmemory_eviction_tenacity; + } + + if (server.maxmemory_eviction_tenacity < 100) { + /* A 15% geometric progression, resulting in a limit of ~2 min at tenacity==99 */ + return (unsigned long)(500.0 * pow(1.15, server.maxmemory_eviction_tenacity - 10.0)); + } + + return ULONG_MAX; /* No limit to eviction time */ +} + +/* Check that memory usage is within the current "maxmemory" limit. If over + * "maxmemory", attempt to free memory by evicting data (if it's safe to do so). + * + * It's possible for Redis to suddenly be significantly over the "maxmemory" + * setting. This can happen if there is a large allocation (like a hash table + * resize) or even if the "maxmemory" setting is manually adjusted. Because of + * this, it's important to evict for a managed period of time - otherwise Redis + * would become unresponsive while evicting. + * + * The goal of this function is to improve the memory situation - not to + * immediately resolve it. In the case that some items have been evicted but + * the "maxmemory" limit has not been achieved, an aeTimeProc will be started + * which will continue to evict items until memory limits are achieved or + * nothing more is evictable. + * + * This should be called before execution of commands. If EVICT_FAIL is + * returned, commands which will result in increased memory usage should be + * rejected. + * + * Returns: + * EVICT_OK - memory is OK or it's not possible to perform evictions now + * EVICT_RUNNING - memory is over the limit, but eviction is still processing + * EVICT_FAIL - memory is over the limit, and there's nothing to evict + * */ +int performEvictions(void) { + /* Note, we don't goto update_metrics here because this check skips eviction + * as if it wasn't triggered. it's a fake EVICT_OK. */ + if (!isSafeToPerformEvictions()) return EVICT_OK; + + int keys_freed = 0; + size_t mem_reported, mem_tofree; + long long mem_freed; /* May be negative */ + mstime_t latency, eviction_latency; + long long delta; + int slaves = listLength(server.slaves); + int result = EVICT_FAIL; + + if (getMaxmemoryState(&mem_reported,NULL,&mem_tofree,NULL) == C_OK) { + result = EVICT_OK; + goto update_metrics; + } + + if (server.maxmemory_policy == MAXMEMORY_NO_EVICTION) { + result = EVICT_FAIL; /* We need to free memory, but policy forbids. */ + goto update_metrics; + } + + unsigned long eviction_time_limit_us = evictionTimeLimitUs(); + + mem_freed = 0; + + latencyStartMonitor(latency); + + monotime evictionTimer; + elapsedStart(&evictionTimer); + + /* Unlike active-expire and blocked client, we can reach here from 'CONFIG SET maxmemory' + * so we have to back-up and restore server.core_propagates. */ + int prev_core_propagates = server.core_propagates; + serverAssert(server.also_propagate.numops == 0); + server.core_propagates = 1; + server.propagate_no_multi = 1; + + while (mem_freed < (long long)mem_tofree) { + int j, k, i; + static unsigned int next_db = 0; + sds bestkey = NULL; + int bestdbid; + redisDb *db; + dict *dict; + dictEntry *de; + + if (server.maxmemory_policy & (MAXMEMORY_FLAG_LRU|MAXMEMORY_FLAG_LFU) || + server.maxmemory_policy == MAXMEMORY_VOLATILE_TTL) + { + struct evictionPoolEntry *pool = EvictionPoolLRU; + + while (bestkey == NULL) { + unsigned long total_keys = 0, keys; + + /* We don't want to make local-db choices when expiring keys, + * so to start populate the eviction pool sampling keys from + * every DB. */ + for (i = 0; i < server.dbnum; i++) { + db = server.db+i; + dict = (server.maxmemory_policy & MAXMEMORY_FLAG_ALLKEYS) ? + db->dict : db->expires; + if ((keys = dictSize(dict)) != 0) { + evictionPoolPopulate(i, dict, db->dict, pool); + total_keys += keys; + } + } + if (!total_keys) break; /* No keys to evict. */ + + /* Go backward from best to worst element to evict. */ + for (k = EVPOOL_SIZE-1; k >= 0; k--) { + if (pool[k].key == NULL) continue; + bestdbid = pool[k].dbid; + + if (server.maxmemory_policy & MAXMEMORY_FLAG_ALLKEYS) { + de = dictFind(server.db[bestdbid].dict, + pool[k].key); + } else { + de = dictFind(server.db[bestdbid].expires, + pool[k].key); + } + + /* Remove the entry from the pool. */ + if (pool[k].key != pool[k].cached) + sdsfree(pool[k].key); + pool[k].key = NULL; + pool[k].idle = 0; + + /* If the key exists, is our pick. Otherwise it is + * a ghost and we need to try the next element. */ + if (de) { + bestkey = dictGetKey(de); + break; + } else { + /* Ghost... Iterate again. */ + } + } + } + } + + /* volatile-random and allkeys-random policy */ + else if (server.maxmemory_policy == MAXMEMORY_ALLKEYS_RANDOM || + server.maxmemory_policy == MAXMEMORY_VOLATILE_RANDOM) + { + /* When evicting a random key, we try to evict a key for + * each DB, so we use the static 'next_db' variable to + * incrementally visit all DBs. */ + for (i = 0; i < server.dbnum; i++) { + j = (++next_db) % server.dbnum; + db = server.db+j; + dict = (server.maxmemory_policy == MAXMEMORY_ALLKEYS_RANDOM) ? + db->dict : db->expires; + if (dictSize(dict) != 0) { + de = dictGetRandomKey(dict); + bestkey = dictGetKey(de); + bestdbid = j; + break; + } + } + } + + /* Finally remove the selected key. */ + if (bestkey) { + db = server.db+bestdbid; + robj *keyobj = createStringObject(bestkey,sdslen(bestkey)); + /* We compute the amount of memory freed by db*Delete() alone. + * It is possible that actually the memory needed to propagate + * the DEL in AOF and replication link is greater than the one + * we are freeing removing the key, but we can't account for + * that otherwise we would never exit the loop. + * + * Same for CSC invalidation messages generated by signalModifiedKey. + * + * AOF and Output buffer memory will be freed eventually so + * we only care about memory used by the key space. */ + delta = (long long) zmalloc_used_memory(); + latencyStartMonitor(eviction_latency); + if (server.lazyfree_lazy_eviction) + dbAsyncDelete(db,keyobj); + else + dbSyncDelete(db,keyobj); + latencyEndMonitor(eviction_latency); + latencyAddSampleIfNeeded("eviction-del",eviction_latency); + delta -= (long long) zmalloc_used_memory(); + mem_freed += delta; + server.stat_evictedkeys++; + signalModifiedKey(NULL,db,keyobj); + notifyKeyspaceEvent(NOTIFY_EVICTED, "evicted", + keyobj, db->id); + propagateDeletion(db,keyobj,server.lazyfree_lazy_eviction); + decrRefCount(keyobj); + keys_freed++; + + if (keys_freed % 16 == 0) { + /* When the memory to free starts to be big enough, we may + * start spending so much time here that is impossible to + * deliver data to the replicas fast enough, so we force the + * transmission here inside the loop. */ + if (slaves) flushSlavesOutputBuffers(); + + /* Normally our stop condition is the ability to release + * a fixed, pre-computed amount of memory. However when we + * are deleting objects in another thread, it's better to + * check, from time to time, if we already reached our target + * memory, since the "mem_freed" amount is computed only + * across the dbAsyncDelete() call, while the thread can + * release the memory all the time. */ + if (server.lazyfree_lazy_eviction) { + if (getMaxmemoryState(NULL,NULL,NULL,NULL) == C_OK) { + break; + } + } + + /* After some time, exit the loop early - even if memory limit + * hasn't been reached. If we suddenly need to free a lot of + * memory, don't want to spend too much time here. */ + if (elapsedUs(evictionTimer) > eviction_time_limit_us) { + // We still need to free memory - start eviction timer proc + startEvictionTimeProc(); + break; + } + } + } else { + goto cant_free; /* nothing to free... */ + } + } + /* at this point, the memory is OK, or we have reached the time limit */ + result = (isEvictionProcRunning) ? EVICT_RUNNING : EVICT_OK; + +cant_free: + if (result == EVICT_FAIL) { + /* At this point, we have run out of evictable items. It's possible + * that some items are being freed in the lazyfree thread. Perform a + * short wait here if such jobs exist, but don't wait long. */ + mstime_t lazyfree_latency; + latencyStartMonitor(lazyfree_latency); + while (bioPendingJobsOfType(BIO_LAZY_FREE) && + elapsedUs(evictionTimer) < eviction_time_limit_us) { + if (getMaxmemoryState(NULL,NULL,NULL,NULL) == C_OK) { + result = EVICT_OK; + break; + } + usleep(eviction_time_limit_us < 1000 ? eviction_time_limit_us : 1000); + } + latencyEndMonitor(lazyfree_latency); + latencyAddSampleIfNeeded("eviction-lazyfree",lazyfree_latency); + } + + serverAssert(server.core_propagates); /* This function should not be re-entrant */ + + /* Propagate all DELs */ + propagatePendingCommands(); + + server.core_propagates = prev_core_propagates; + server.propagate_no_multi = 0; + + latencyEndMonitor(latency); + latencyAddSampleIfNeeded("eviction-cycle",latency); + +update_metrics: + if (result == EVICT_RUNNING || result == EVICT_FAIL) { + if (server.stat_last_eviction_exceeded_time == 0) + elapsedStart(&server.stat_last_eviction_exceeded_time); + } else if (result == EVICT_OK) { + if (server.stat_last_eviction_exceeded_time != 0) { + server.stat_total_eviction_exceeded_time += elapsedUs(server.stat_last_eviction_exceeded_time); + server.stat_last_eviction_exceeded_time = 0; + } + } + return result; +} |