Diffstat (limited to 'lib/dns/rbtdb.c')
-rw-r--r--	lib/dns/rbtdb.c	237
1 file changed, 169 insertions(+), 68 deletions(-)
diff --git a/lib/dns/rbtdb.c b/lib/dns/rbtdb.c
index 5d36466..b09d97f 100644
--- a/lib/dns/rbtdb.c
+++ b/lib/dns/rbtdb.c
@@ -82,7 +82,7 @@
typedef uint32_t rbtdb_serial_t;
typedef uint32_t rbtdb_rdatatype_t;
-#define RBTDB_RDATATYPE_BASE(type) ((dns_rdatatype_t)((type)&0xFFFF))
+#define RBTDB_RDATATYPE_BASE(type) ((dns_rdatatype_t)((type) & 0xFFFF))
#define RBTDB_RDATATYPE_EXT(type) ((dns_rdatatype_t)((type) >> 16))
#define RBTDB_RDATATYPE_VALUE(base, ext) \
((rbtdb_rdatatype_t)(((uint32_t)ext) << 16) | \
(((uint32_t)base) & 0xFFFF))
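For illustration, a minimal standalone sketch of the bit layout these macros encode: the base rdatatype sits in the low 16 bits and the extended (covered) type in the high 16 bits. The integer typedefs are stand-ins, and the type numbers (RRSIG = 46, A = 1) come from the IANA registry; the macros themselves are copied from the hunk above.

#include <assert.h>
#include <stdint.h>

typedef uint16_t dns_rdatatype_t; /* stand-in for the real typedef */
typedef uint32_t rbtdb_rdatatype_t;

#define RBTDB_RDATATYPE_BASE(type) ((dns_rdatatype_t)((type) & 0xFFFF))
#define RBTDB_RDATATYPE_EXT(type)  ((dns_rdatatype_t)((type) >> 16))
#define RBTDB_RDATATYPE_VALUE(base, ext)                \
	((rbtdb_rdatatype_t)(((uint32_t)ext) << 16) |   \
	 (((uint32_t)base) & 0xFFFF))

int main(void) {
	/* RRSIG(A): base type 46 (RRSIG) in the low 16 bits, covered
	 * type 1 (A) in the high 16 bits => 0x0001002E. */
	rbtdb_rdatatype_t sigtype = RBTDB_RDATATYPE_VALUE(46, 1);

	assert(sigtype == 0x0001002E);
	assert(RBTDB_RDATATYPE_BASE(sigtype) == 46);
	assert(RBTDB_RDATATYPE_EXT(sigtype) == 1);
	return 0;
}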
@@ -477,12 +477,27 @@ struct dns_rbtdb {
*/
rdatasetheaderlist_t *rdatasets;
+ /*
+ * Start point % node_lock_count for next LRU cleanup.
+ */
+ atomic_uint lru_sweep;
+
+ /*
+ * When performing LRU cleaning, limit cleaning to headers that
+ * were last used at or before this.
+ */
+ atomic_uint last_used;
+
/*%
* Temporary storage for stale cache nodes and dynamically deleted
* nodes that await being cleaned up.
*/
rbtnodelist_t *deadnodes;
+ /* List of nodes from which recursive tree pruning can be started.
+ * Locked by tree_lock. */
+ rbtnodelist_t prunenodes;
+
/*
* Heaps. These are used for TTL based expiry in a cache,
* or for zone resigning in a zone DB. hmctx is the memory
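A standalone sketch (with a hypothetical fixed bucket count; the real node_lock_count is sized when the database is created) of how an atomic sweep counter rotates the starting bucket across successive cleanups, mirroring how lru_sweep is consumed in overmem_purge() further down:

#include <stdatomic.h>
#include <stdio.h>

#define NODE_LOCK_COUNT 8 /* hypothetical; chosen at runtime in rbtdb */

static atomic_uint lru_sweep;

/* Each overmem cleanup starts one bucket further along than the last,
 * so no single LRU list bears the brunt of every purge. */
static unsigned int
next_sweep_start(void) {
	return atomic_fetch_add(&lru_sweep, 1) % NODE_LOCK_COUNT;
}

int main(void) {
	for (int i = 0; i < 10; i++) {
		printf("purge %d starts at bucket %u\n", i,
		       next_sweep_start());
	}
	return 0;
}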
@@ -561,8 +576,7 @@ static void
expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, bool tree_locked,
expire_t reason);
static void
-overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start, size_t purgesize,
- bool tree_locked);
+overmem_purge(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, bool tree_locked);
static void
resign_insert(dns_rbtdb_t *rbtdb, int idx, rdatasetheader_t *newheader);
static void
@@ -993,6 +1007,7 @@ free_rbtdb(dns_rbtdb_t *rbtdb, bool log, isc_event_t *event) {
unsigned int i;
isc_result_t result;
char buf[DNS_NAME_FORMATSIZE];
+ dns_rbtnode_t *node = NULL;
dns_rbt_t **treep;
isc_time_t start;
@@ -1018,8 +1033,6 @@ free_rbtdb(dns_rbtdb_t *rbtdb, bool log, isc_event_t *event) {
* the overhead of unlinking all nodes here should be negligible.
*/
for (i = 0; i < rbtdb->node_lock_count; i++) {
- dns_rbtnode_t *node;
-
node = ISC_LIST_HEAD(rbtdb->deadnodes[i]);
while (node != NULL) {
ISC_LIST_UNLINK(rbtdb->deadnodes[i], node, deadlink);
@@ -1027,6 +1040,12 @@ free_rbtdb(dns_rbtdb_t *rbtdb, bool log, isc_event_t *event) {
}
}
+ node = ISC_LIST_HEAD(rbtdb->prunenodes);
+ while (node != NULL) {
+ ISC_LIST_UNLINK(rbtdb->prunenodes, node, prunelink);
+ node = ISC_LIST_HEAD(rbtdb->prunenodes);
+ }
+
if (event == NULL) {
rbtdb->quantum = (rbtdb->task != NULL) ? 100 : 0;
}
@@ -1832,19 +1851,32 @@ is_leaf(dns_rbtnode_t *node) {
node->left == NULL && node->right == NULL);
}
+/*%
+ * The tree lock must be held when this function is called as it reads and
+ * updates rbtdb->prunenodes.
+ */
static void
send_to_prune_tree(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
isc_rwlocktype_t locktype) {
- isc_event_t *ev;
- dns_db_t *db;
+ bool pruning_queued = (ISC_LIST_HEAD(rbtdb->prunenodes) != NULL);
+
+ INSIST(locktype == isc_rwlocktype_write);
- ev = isc_event_allocate(rbtdb->common.mctx, NULL, DNS_EVENT_RBTPRUNE,
- prune_tree, node, sizeof(isc_event_t));
new_reference(rbtdb, node, locktype);
- db = NULL;
- attach((dns_db_t *)rbtdb, &db);
- ev->ev_sender = db;
- isc_task_send(rbtdb->task, &ev);
+ INSIST(!ISC_LINK_LINKED(node, prunelink));
+ ISC_LIST_APPEND(rbtdb->prunenodes, node, prunelink);
+
+ if (!pruning_queued) {
+ isc_event_t *ev = NULL;
+ dns_db_t *db = NULL;
+
+ attach((dns_db_t *)rbtdb, &db);
+
+ ev = isc_event_allocate(rbtdb->common.mctx, NULL,
+ DNS_EVENT_RBTPRUNE, prune_tree, db,
+ sizeof(isc_event_t));
+ isc_task_send(rbtdb->task, &ev);
+ }
}
/*%
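The added control flow follows a common coalescing pattern: append the work item under the lock, and only allocate and send the event when the queue goes from empty to non-empty, so one queued prune_tree() run drains any number of nodes. A simplified, self-contained sketch of that pattern (all names hypothetical):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Hypothetical work item; stands in for a dns_rbtnode_t linked
 * through prunelink. */
struct work {
	struct work *next;
};

static struct work *queue_head; /* guarded by the caller's lock */

/* Stand-in for allocating an isc_event_t and calling isc_task_send(). */
static void
schedule_worker(void) {
	printf("worker event scheduled\n");
}

static void
enqueue(struct work *item) {
	bool worker_queued = (queue_head != NULL);

	item->next = queue_head;
	queue_head = item;

	/* One scheduled worker drains the whole queue, so a new event
	 * is needed only when the queue was previously empty. */
	if (!worker_queued) {
		schedule_worker();
	}
}

int main(void) {
	struct work a, b, c;
	enqueue(&a); /* schedules the worker */
	enqueue(&b); /* coalesced: no new event */
	enqueue(&c); /* coalesced: no new event */
	return 0;
}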
@@ -2119,17 +2151,26 @@ restore_locks:
}
/*
- * Prune the tree by recursively cleaning-up single leaves. In the worst
- * case, the number of iteration is the number of tree levels, which is at
- * most the maximum number of domain name labels, i.e, 127. In practice, this
- * should be much smaller (only a few times), and even the worst case would be
- * acceptable for a single event.
+ * Prune the tree by recursively cleaning up single leaves. Go through all
+ * nodes stored in the rbtdb->prunenodes list; for each of them, in the worst
+ * case, it will be necessary to traverse a number of tree levels equal to the
+ * maximum legal number of domain name labels (127); in practice, the number of
+ * tree levels to traverse will virtually always be much smaller (a few levels
+ * at most). While holding the tree lock throughout this entire operation is
+ * less than ideal, so is splitting the latter up by queueing a separate
+ * prune_tree() run for each node to start pruning from (as queueing requires
+ * allocating memory and can therefore potentially be exploited to exhaust
+ * available memory). Also note that actually freeing up the memory used by
+ * RBTDB nodes (which is what this function does) is essential to keeping cache
+ * memory use in check, so since the tree lock needs to be acquired anyway,
+ * freeing as many nodes as possible before the tree lock gets released is
+ * prudent.
*/
static void
prune_tree(isc_task_t *task, isc_event_t *event) {
- dns_rbtdb_t *rbtdb = event->ev_sender;
- dns_rbtnode_t *node = event->ev_arg;
- dns_rbtnode_t *parent;
+ dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)event->ev_arg;
+ dns_rbtnode_t *node = NULL;
+ dns_rbtnode_t *parent = NULL;
unsigned int locknum;
UNUSED(task);
@@ -2137,44 +2178,60 @@ prune_tree(isc_task_t *task, isc_event_t *event) {
isc_event_free(&event);
RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
- locknum = node->locknum;
- NODE_LOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
- do {
- parent = node->parent;
- decrement_reference(rbtdb, node, 0, isc_rwlocktype_write,
- isc_rwlocktype_write, true);
- if (parent != NULL && parent->down == NULL) {
- /*
- * node was the only down child of the parent and has
- * just been removed. We'll then need to examine the
- * parent. Keep the lock if possible; otherwise,
- * release the old lock and acquire one for the parent.
- */
- if (parent->locknum != locknum) {
- NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
- isc_rwlocktype_write);
- locknum = parent->locknum;
- NODE_LOCK(&rbtdb->node_locks[locknum].lock,
- isc_rwlocktype_write);
+ while ((node = ISC_LIST_HEAD(rbtdb->prunenodes)) != NULL) {
+ locknum = node->locknum;
+ NODE_LOCK(&rbtdb->node_locks[locknum].lock,
+ isc_rwlocktype_write);
+ do {
+ if (ISC_LINK_LINKED(node, prunelink)) {
+ ISC_LIST_UNLINK(rbtdb->prunenodes, node,
+ prunelink);
}
- /*
- * We need to gain a reference to the node before
- * decrementing it in the next iteration.
- */
- if (ISC_LINK_LINKED(parent, deadlink)) {
- ISC_LIST_UNLINK(rbtdb->deadnodes[locknum],
+ parent = node->parent;
+ decrement_reference(rbtdb, node, 0,
+ isc_rwlocktype_write,
+ isc_rwlocktype_write, true);
+
+ if (parent != NULL && parent->down == NULL) {
+ /*
+ * node was the only down child of the parent
+ * and has just been removed. We'll then need
+ * to examine the parent. Keep the lock if
+ * possible; otherwise, release the old lock and
+ * acquire one for the parent.
+ */
+ if (parent->locknum != locknum) {
+ NODE_UNLOCK(
+ &rbtdb->node_locks[locknum].lock,
+ isc_rwlocktype_write);
+ locknum = parent->locknum;
+ NODE_LOCK(
+ &rbtdb->node_locks[locknum].lock,
+ isc_rwlocktype_write);
+ }
+
+ /*
+ * We need to gain a reference to the node
+ * before decrementing it in the next iteration.
+ */
+ if (ISC_LINK_LINKED(parent, deadlink)) {
+ ISC_LIST_UNLINK(
+ rbtdb->deadnodes[locknum],
parent, deadlink);
+ }
+ new_reference(rbtdb, parent,
+ isc_rwlocktype_write);
+ } else {
+ parent = NULL;
}
- new_reference(rbtdb, parent, isc_rwlocktype_write);
- } else {
- parent = NULL;
- }
- node = parent;
- } while (node != NULL);
- NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
+ node = parent;
+ } while (node != NULL);
+ NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
+ isc_rwlocktype_write);
+ }
RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
detach((dns_db_t **)&rbtdb);
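The inner do/while generalizes to any tree with parent links: release the leaf, then keep climbing while each parent has just lost its only child. A toy sketch of that upward walk (hypothetical node type, no locking or reference counting; the real code releases nodes via decrement_reference() and stops as soon as a parent retains other children):

#include <stdlib.h>

/* Hypothetical toy node; the real dns_rbtnode_t also carries
 * red-black links, a lock number, and reference counts. */
struct node {
	struct node *parent;
	struct node *down; /* subordinate level, at most one child here */
};

static struct node *
new_node(struct node *parent) {
	struct node *n = calloc(1, sizeof(*n));
	n->parent = parent;
	if (parent != NULL) {
		parent->down = n;
	}
	return n;
}

/* Sketch of the inner loop above: free the leaf, then keep climbing
 * while each parent has just lost its only down child. */
static void
prune_upward(struct node *leaf) {
	struct node *node = leaf;
	while (node != NULL) {
		struct node *parent = node->parent;
		if (parent != NULL) {
			parent->down = NULL; /* detach before release */
		}
		free(node);
		/* In this toy every node has at most one child, so the
		 * whole chain is freed in a single upward walk; the real
		 * code stops when parent->down != NULL. */
		node = parent;
	}
}

int main(void) {
	/* Build root -> mid -> leaf, then prune from the leaf. */
	struct node *root = new_node(NULL);
	struct node *mid = new_node(root);
	struct node *leaf = new_node(mid);
	prune_upward(leaf);
	return 0;
}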
@@ -6444,6 +6501,9 @@ find_header:
if (header->rdh_ttl > newheader->rdh_ttl) {
set_ttl(rbtdb, header, newheader->rdh_ttl);
}
+ if (header->last_used != now) {
+ update_header(rbtdb, header, now);
+ }
if (header->noqname == NULL &&
newheader->noqname != NULL)
{
@@ -6496,6 +6556,9 @@ find_header:
if (header->rdh_ttl > newheader->rdh_ttl) {
set_ttl(rbtdb, header, newheader->rdh_ttl);
}
+ if (header->last_used != now) {
+ update_header(rbtdb, header, now);
+ }
if (header->noqname == NULL &&
newheader->noqname != NULL)
{
@@ -6523,6 +6586,9 @@ find_header:
idx = newheader->node->locknum;
if (IS_CACHE(rbtdb)) {
if (ZEROTTL(newheader)) {
+ newheader->last_used =
+ atomic_load(&rbtdb->last_used) +
+ 1;
ISC_LIST_APPEND(rbtdb->rdatasets[idx],
newheader, link);
} else {
@@ -6564,6 +6630,9 @@ find_header:
INSIST(rbtdb->heaps != NULL);
isc_heap_insert(rbtdb->heaps[idx], newheader);
if (ZEROTTL(newheader)) {
+ newheader->last_used =
+ atomic_load(&rbtdb->last_used) +
+ 1;
ISC_LIST_APPEND(rbtdb->rdatasets[idx],
newheader, link);
} else {
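update_header() itself is not shown in this diff; the guarded calls added above imply promote-to-front semantics for a cache hit. A toy model of that behavior (hypothetical types, not the rbtdb implementation), including the last_used != now guard that avoids relinking an entry touched twice in the same second:

#include <stdio.h>
#include <time.h>

/* Hypothetical toy entry; stands in for an rdatasetheader_t on its
 * per-bucket rdatasets list. */
struct entry {
	struct entry *prev, *next;
	time_t last_used;
};

struct lru {
	struct entry *head, *tail;
};

/* Move a touched entry to the front of its LRU list and stamp it,
 * skipping the relink when the stamp is already current. */
static void
touch(struct lru *l, struct entry *e, time_t now) {
	if (e->last_used == now) {
		return; /* same guard as the diff's last_used != now */
	}
	/* unlink */
	if (e->prev != NULL) e->prev->next = e->next; else l->head = e->next;
	if (e->next != NULL) e->next->prev = e->prev; else l->tail = e->prev;
	/* prepend */
	e->prev = NULL;
	e->next = l->head;
	if (l->head != NULL) l->head->prev = e; else l->tail = e;
	l->head = e;
	e->last_used = now;
}

int main(void) {
	struct entry a = { 0 }, b = { 0 };
	struct lru l = { &a, &b };

	a.next = &b;
	b.prev = &a;
	touch(&l, &b, time(NULL)); /* b becomes most recently used */
	printf("head is %s\n", l.head == &b ? "b" : "a");
	return 0;
}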
@@ -6969,8 +7038,7 @@ addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
}
if (cache_is_overmem) {
- overmem_purge(rbtdb, rbtnode->locknum, rdataset_size(newheader),
- tree_locked);
+ overmem_purge(rbtdb, newheader, tree_locked);
}
NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
@@ -8305,6 +8373,8 @@ dns_rbtdb_create(isc_mem_t *mctx, const dns_name_t *origin, dns_dbtype_t type,
ISC_LIST_INIT(rbtdb->deadnodes[i]);
}
+ ISC_LIST_INIT(rbtdb->prunenodes);
+
rbtdb->active = rbtdb->node_lock_count;
for (i = 0; i < (int)(rbtdb->node_lock_count); i++) {
@@ -9640,7 +9710,7 @@ rehash_bits(rbtdb_version_t *version, size_t newcount) {
uint32_t newbits = oldbits;
while (newcount >= HASHSIZE(newbits) &&
- newbits <= RBTDB_GLUE_TABLE_MAX_BITS)
+ newbits < RBTDB_GLUE_TABLE_MAX_BITS)
{
newbits += 1;
}
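The <= to < change is an off-by-one fix: with <=, the loop body could still run when newbits already equalled the cap, pushing it one past RBTDB_GLUE_TABLE_MAX_BITS. A small standalone demonstration (the 32-bit cap and power-of-two HASHSIZE() here are assumptions for the demo):

#include <stdint.h>
#include <stdio.h>

#define HASHSIZE(bits) (UINT64_C(1) << (bits))
#define MAX_BITS       32 /* hypothetical cap for the demo */

static uint32_t
grow_bits(uint32_t oldbits, uint64_t newcount, int buggy) {
	uint32_t newbits = oldbits;
	while (newcount >= HASHSIZE(newbits) &&
	       (buggy ? newbits <= MAX_BITS : newbits < MAX_BITS))
	{
		newbits += 1;
	}
	return newbits;
}

int main(void) {
	/* With a count too large for any table, the old bound lets
	 * newbits step past the cap; the new bound stops at it. */
	printf("with <=: %u, with <: %u\n",
	       grow_bits(16, UINT64_MAX, 1), grow_bits(16, UINT64_MAX, 0));
	/* prints: with <=: 33, with <: 32 */
	return 0;
}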
@@ -10147,7 +10217,10 @@ expire_lru_headers(dns_rbtdb_t *rbtdb, unsigned int locknum, size_t purgesize,
size_t purged = 0;
for (header = ISC_LIST_TAIL(rbtdb->rdatasets[locknum]);
- header != NULL && purged <= purgesize; header = header_prev)
+ header != NULL &&
+ header->last_used <= atomic_load(&rbtdb->last_used) &&
+ purged <= purgesize;
+ header = header_prev)
{
header_prev = ISC_LIST_PREV(header, link);
/*
@@ -10171,30 +10244,58 @@ expire_lru_headers(dns_rbtdb_t *rbtdb, unsigned int locknum, size_t purgesize,
* entries under the overmem condition. To recover from this condition quickly,
* we cleanup entries up to the size of newly added rdata (passed as purgesize).
*
- * This process is triggered while adding a new entry, and we specifically avoid
- * purging entries in the same LRU bucket as the one to which the new entry will
- * belong. Otherwise, we might purge entries of the same name of different RR
- * types while adding RRsets from a single response (consider the case where
- * we're adding A and AAAA glue records of the same NS name).
+ * The tails of the LRU lists are processed in LRU order, to the nearest second.
+ *
+ * A write lock on the tree must be held.
*/
static void
-overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start, size_t purgesize,
+overmem_purge(dns_rbtdb_t *rbtdb, rdatasetheader_t *newheader,
bool tree_locked) {
- unsigned int locknum;
+ uint32_t locknum_start = atomic_fetch_add(&rbtdb->lru_sweep, 1) %
+ rbtdb->node_lock_count;
+ uint32_t locknum = locknum_start;
+ /* Size of the added data, plus a possible new node and ENT node. */
+ size_t purgesize = rdataset_size(newheader) +
+ 2 * dns__rbtnode_getsize(newheader->node);
size_t purged = 0;
+ isc_stdtime_t min_last_used = 0;
+ size_t max_passes = 8;
- for (locknum = (locknum_start + 1) % rbtdb->node_lock_count;
- locknum != locknum_start && purged <= purgesize;
- locknum = (locknum + 1) % rbtdb->node_lock_count)
- {
+again:
+ do {
NODE_LOCK(&rbtdb->node_locks[locknum].lock,
isc_rwlocktype_write);
purged += expire_lru_headers(rbtdb, locknum, purgesize - purged,
tree_locked);
+ /*
+ * Work out the oldest remaining last_used value of the list
+ * tails as we walk across the array of LRU lists.
+ */
+ rdatasetheader_t *header =
+ ISC_LIST_TAIL(rbtdb->rdatasets[locknum]);
+ if (header != NULL &&
+ (min_last_used == 0 || header->last_used < min_last_used))
+ {
+ min_last_used = header->last_used;
+ }
NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
isc_rwlocktype_write);
+ locknum = (locknum + 1) % rbtdb->node_lock_count;
+ } while (locknum != locknum_start && purged <= purgesize);
+
+ /*
+ * Update rbtdb->last_used if we have walked all the list tails and have
+ * not freed the required amount of memory.
+ */
+ if (purged < purgesize) {
+ if (min_last_used != 0) {
+ atomic_store(&rbtdb->last_used, min_last_used);
+ if (max_passes-- > 0) {
+ goto again;
+ }
+ }
}
}
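Read as pseudocode, the reworked overmem_purge() sweeps every bucket, expiring only tail entries whose last_used is at or below the shared watermark; if a full sweep frees too little, it advances the watermark to the oldest surviving tail and sweeps again, at most eight extra times. A self-contained toy simulation of that control flow (all names and numbers hypothetical):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define NBUCKETS 4 /* hypothetical bucket count */

/* One tail entry per bucket: a last-used stamp and a size in bytes. */
static uint32_t tail_age[NBUCKETS] = { 5, 3, 9, 7 };
static size_t tail_size[NBUCKETS] = { 100, 100, 100, 100 };

int main(void) {
	size_t purgesize = 250; /* bytes we need back */
	size_t purged = 0;
	uint32_t watermark = 4; /* plays the role of rbtdb->last_used */
	size_t max_passes = 8;

	while (purged < purgesize) {
		uint32_t min_tail = 0;

		for (int b = 0; b < NBUCKETS; b++) {
			if (tail_size[b] != 0 && tail_age[b] <= watermark) {
				/* old enough: expire this tail */
				purged += tail_size[b];
				tail_size[b] = 0;
			} else if (tail_size[b] != 0 &&
				   (min_tail == 0 || tail_age[b] < min_tail))
			{
				/* survivor: remember the oldest tail */
				min_tail = tail_age[b];
			}
		}
		if (purged >= purgesize || min_tail == 0 ||
		    max_passes-- == 0) {
			break;
		}
		/* Too little freed: raise the watermark and go again. */
		watermark = min_tail;
	}
	printf("purged %zu bytes, %zu extra passes left\n", purged,
	       max_passes);
	return 0;
}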