diff options
Diffstat (limited to 'lib/dns/badcache.c')
-rw-r--r-- | lib/dns/badcache.c | 525 |
1 files changed, 525 insertions, 0 deletions
diff --git a/lib/dns/badcache.c b/lib/dns/badcache.c new file mode 100644 index 0000000..92116c0 --- /dev/null +++ b/lib/dns/badcache.c @@ -0,0 +1,525 @@ +/* + * Copyright (C) Internet Systems Consortium, Inc. ("ISC") + * + * SPDX-License-Identifier: MPL-2.0 + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, you can obtain one at https://mozilla.org/MPL/2.0/. + * + * See the COPYRIGHT file distributed with this work for additional + * information regarding copyright ownership. + */ + +/*! \file */ + +#include <inttypes.h> +#include <stdbool.h> + +#include <isc/buffer.h> +#include <isc/hash.h> +#include <isc/log.h> +#include <isc/mem.h> +#include <isc/mutex.h> +#include <isc/platform.h> +#include <isc/print.h> +#include <isc/rwlock.h> +#include <isc/string.h> +#include <isc/time.h> +#include <isc/util.h> + +#include <dns/badcache.h> +#include <dns/name.h> +#include <dns/rdatatype.h> +#include <dns/types.h> + +typedef struct dns_bcentry dns_bcentry_t; + +struct dns_badcache { + unsigned int magic; + isc_rwlock_t lock; + isc_mem_t *mctx; + + isc_mutex_t *tlocks; + dns_bcentry_t **table; + + atomic_uint_fast32_t count; + atomic_uint_fast32_t sweep; + + unsigned int minsize; + unsigned int size; +}; + +#define BADCACHE_MAGIC ISC_MAGIC('B', 'd', 'C', 'a') +#define VALID_BADCACHE(m) ISC_MAGIC_VALID(m, BADCACHE_MAGIC) + +struct dns_bcentry { + dns_bcentry_t *next; + dns_rdatatype_t type; + isc_time_t expire; + uint32_t flags; + unsigned int hashval; + dns_name_t name; +}; + +static void +badcache_resize(dns_badcache_t *bc, isc_time_t *now); + +isc_result_t +dns_badcache_init(isc_mem_t *mctx, unsigned int size, dns_badcache_t **bcp) { + dns_badcache_t *bc = NULL; + unsigned int i; + + REQUIRE(bcp != NULL && *bcp == NULL); + REQUIRE(mctx != NULL); + + bc = isc_mem_get(mctx, sizeof(dns_badcache_t)); + memset(bc, 0, sizeof(dns_badcache_t)); + + isc_mem_attach(mctx, &bc->mctx); + isc_rwlock_init(&bc->lock, 0, 0); + + bc->table = isc_mem_get(bc->mctx, sizeof(*bc->table) * size); + bc->tlocks = isc_mem_get(bc->mctx, sizeof(isc_mutex_t) * size); + for (i = 0; i < size; i++) { + isc_mutex_init(&bc->tlocks[i]); + } + bc->size = bc->minsize = size; + memset(bc->table, 0, bc->size * sizeof(dns_bcentry_t *)); + + atomic_init(&bc->count, 0); + atomic_init(&bc->sweep, 0); + bc->magic = BADCACHE_MAGIC; + + *bcp = bc; + return (ISC_R_SUCCESS); +} + +void +dns_badcache_destroy(dns_badcache_t **bcp) { + dns_badcache_t *bc; + unsigned int i; + + REQUIRE(bcp != NULL && *bcp != NULL); + bc = *bcp; + *bcp = NULL; + + dns_badcache_flush(bc); + + bc->magic = 0; + isc_rwlock_destroy(&bc->lock); + for (i = 0; i < bc->size; i++) { + isc_mutex_destroy(&bc->tlocks[i]); + } + isc_mem_put(bc->mctx, bc->table, sizeof(dns_bcentry_t *) * bc->size); + isc_mem_put(bc->mctx, bc->tlocks, sizeof(isc_mutex_t) * bc->size); + isc_mem_putanddetach(&bc->mctx, bc, sizeof(dns_badcache_t)); +} + +static void +badcache_resize(dns_badcache_t *bc, isc_time_t *now) { + dns_bcentry_t **newtable, *bad, *next; + isc_mutex_t *newlocks; + unsigned int newsize, i; + bool grow; + + RWLOCK(&bc->lock, isc_rwlocktype_write); + + /* + * XXXWPK we will have a thundering herd problem here, + * as all threads will wait on the RWLOCK when there's + * a need to resize badcache. + * However, it happens so rarely it should not be a + * performance issue. This is because we double the + * size every time we grow it, and we don't shrink + * unless the number of entries really shrunk. In a + * high load situation, the number of badcache entries + * will eventually stabilize. + */ + if (atomic_load_relaxed(&bc->count) > bc->size * 8) { + grow = true; + } else if (atomic_load_relaxed(&bc->count) < bc->size * 2 && + bc->size > bc->minsize) + { + grow = false; + } else { + /* Someone resized it already, bail. */ + RWUNLOCK(&bc->lock, isc_rwlocktype_write); + return; + } + + if (grow) { + newsize = bc->size * 2 + 1; + } else { + newsize = (bc->size - 1) / 2; +#ifdef __clang_analyzer__ + /* + * XXXWPK there's a bug in clang static analyzer - + * `value % newsize` is considered undefined even though + * we check if newsize is larger than 0. This helps. + */ + newsize += 1; +#endif + } + RUNTIME_CHECK(newsize > 0); + + newtable = isc_mem_get(bc->mctx, sizeof(dns_bcentry_t *) * newsize); + memset(newtable, 0, sizeof(dns_bcentry_t *) * newsize); + + newlocks = isc_mem_get(bc->mctx, sizeof(isc_mutex_t) * newsize); + + /* Copy existing mutexes */ + for (i = 0; i < newsize && i < bc->size; i++) { + newlocks[i] = bc->tlocks[i]; + } + /* Initialize additional mutexes if we're growing */ + for (i = bc->size; i < newsize; i++) { + isc_mutex_init(&newlocks[i]); + } + /* Destroy extra mutexes if we're shrinking */ + for (i = newsize; i < bc->size; i++) { + isc_mutex_destroy(&bc->tlocks[i]); + } + + for (i = 0; atomic_load_relaxed(&bc->count) > 0 && i < bc->size; i++) { + for (bad = bc->table[i]; bad != NULL; bad = next) { + next = bad->next; + if (isc_time_compare(&bad->expire, now) < 0) { + isc_mem_put(bc->mctx, bad, + sizeof(*bad) + bad->name.length); + atomic_fetch_sub_relaxed(&bc->count, 1); + } else { + bad->next = newtable[bad->hashval % newsize]; + newtable[bad->hashval % newsize] = bad; + } + } + bc->table[i] = NULL; + } + + isc_mem_put(bc->mctx, bc->tlocks, sizeof(isc_mutex_t) * bc->size); + bc->tlocks = newlocks; + + isc_mem_put(bc->mctx, bc->table, sizeof(*bc->table) * bc->size); + bc->size = newsize; + bc->table = newtable; + + RWUNLOCK(&bc->lock, isc_rwlocktype_write); +} + +void +dns_badcache_add(dns_badcache_t *bc, const dns_name_t *name, + dns_rdatatype_t type, bool update, uint32_t flags, + isc_time_t *expire) { + isc_result_t result; + unsigned int hashval, hash; + dns_bcentry_t *bad, *prev, *next; + isc_time_t now; + bool resize = false; + + REQUIRE(VALID_BADCACHE(bc)); + REQUIRE(name != NULL); + REQUIRE(expire != NULL); + + RWLOCK(&bc->lock, isc_rwlocktype_read); + + result = isc_time_now(&now); + if (result != ISC_R_SUCCESS) { + isc_time_settoepoch(&now); + } + + hashval = dns_name_hash(name, false); + hash = hashval % bc->size; + LOCK(&bc->tlocks[hash]); + prev = NULL; + for (bad = bc->table[hash]; bad != NULL; bad = next) { + next = bad->next; + if (bad->type == type && dns_name_equal(name, &bad->name)) { + if (update) { + bad->expire = *expire; + bad->flags = flags; + } + break; + } + if (isc_time_compare(&bad->expire, &now) < 0) { + if (prev == NULL) { + bc->table[hash] = bad->next; + } else { + prev->next = bad->next; + } + isc_mem_put(bc->mctx, bad, + sizeof(*bad) + bad->name.length); + atomic_fetch_sub_relaxed(&bc->count, 1); + } else { + prev = bad; + } + } + + if (bad == NULL) { + isc_buffer_t buffer; + bad = isc_mem_get(bc->mctx, sizeof(*bad) + name->length); + bad->type = type; + bad->hashval = hashval; + bad->expire = *expire; + bad->flags = flags; + isc_buffer_init(&buffer, bad + 1, name->length); + dns_name_init(&bad->name, NULL); + dns_name_copy(name, &bad->name, &buffer); + bad->next = bc->table[hash]; + bc->table[hash] = bad; + unsigned count = atomic_fetch_add_relaxed(&bc->count, 1); + if ((count > bc->size * 8) || + (count < bc->size * 2 && bc->size > bc->minsize)) + { + resize = true; + } + } else { + bad->expire = *expire; + } + + UNLOCK(&bc->tlocks[hash]); + RWUNLOCK(&bc->lock, isc_rwlocktype_read); + if (resize) { + badcache_resize(bc, &now); + } +} + +bool +dns_badcache_find(dns_badcache_t *bc, const dns_name_t *name, + dns_rdatatype_t type, uint32_t *flagp, isc_time_t *now) { + dns_bcentry_t *bad, *prev, *next; + bool answer = false; + unsigned int i; + unsigned int hash; + + REQUIRE(VALID_BADCACHE(bc)); + REQUIRE(name != NULL); + REQUIRE(now != NULL); + + RWLOCK(&bc->lock, isc_rwlocktype_read); + + /* + * XXXMUKS: dns_name_equal() is expensive as it does a + * octet-by-octet comparison, and it can be made better in two + * ways here. First, lowercase the names (use + * dns_name_downcase() instead of dns_name_copy() in + * dns_badcache_add()) so that dns_name_caseequal() can be used + * which the compiler will emit as SIMD instructions. Second, + * don't put multiple copies of the same name in the chain (or + * multiple names will have to be matched for equality), but use + * name->link to store the type specific part. + */ + + if (atomic_load_relaxed(&bc->count) == 0) { + goto skip; + } + + hash = dns_name_hash(name, false) % bc->size; + prev = NULL; + LOCK(&bc->tlocks[hash]); + for (bad = bc->table[hash]; bad != NULL; bad = next) { + next = bad->next; + /* + * Search the hash list. Clean out expired records as we go. + */ + if (isc_time_compare(&bad->expire, now) < 0) { + if (prev != NULL) { + prev->next = bad->next; + } else { + bc->table[hash] = bad->next; + } + + isc_mem_put(bc->mctx, bad, + sizeof(*bad) + bad->name.length); + atomic_fetch_sub(&bc->count, 1); + continue; + } + if (bad->type == type && dns_name_equal(name, &bad->name)) { + if (flagp != NULL) { + *flagp = bad->flags; + } + answer = true; + break; + } + prev = bad; + } + UNLOCK(&bc->tlocks[hash]); +skip: + + /* + * Slow sweep to clean out stale records. + */ + i = atomic_fetch_add(&bc->sweep, 1) % bc->size; + if (isc_mutex_trylock(&bc->tlocks[i]) == ISC_R_SUCCESS) { + bad = bc->table[i]; + if (bad != NULL && isc_time_compare(&bad->expire, now) < 0) { + bc->table[i] = bad->next; + isc_mem_put(bc->mctx, bad, + sizeof(*bad) + bad->name.length); + atomic_fetch_sub_relaxed(&bc->count, 1); + } + UNLOCK(&bc->tlocks[i]); + } + + RWUNLOCK(&bc->lock, isc_rwlocktype_read); + return (answer); +} + +void +dns_badcache_flush(dns_badcache_t *bc) { + dns_bcentry_t *entry, *next; + unsigned int i; + + RWLOCK(&bc->lock, isc_rwlocktype_write); + REQUIRE(VALID_BADCACHE(bc)); + + for (i = 0; atomic_load_relaxed(&bc->count) > 0 && i < bc->size; i++) { + for (entry = bc->table[i]; entry != NULL; entry = next) { + next = entry->next; + isc_mem_put(bc->mctx, entry, + sizeof(*entry) + entry->name.length); + atomic_fetch_sub_relaxed(&bc->count, 1); + } + bc->table[i] = NULL; + } + RWUNLOCK(&bc->lock, isc_rwlocktype_write); +} + +void +dns_badcache_flushname(dns_badcache_t *bc, const dns_name_t *name) { + dns_bcentry_t *bad, *prev, *next; + isc_result_t result; + isc_time_t now; + unsigned int hash; + + REQUIRE(VALID_BADCACHE(bc)); + REQUIRE(name != NULL); + + RWLOCK(&bc->lock, isc_rwlocktype_read); + + result = isc_time_now(&now); + if (result != ISC_R_SUCCESS) { + isc_time_settoepoch(&now); + } + hash = dns_name_hash(name, false) % bc->size; + LOCK(&bc->tlocks[hash]); + prev = NULL; + for (bad = bc->table[hash]; bad != NULL; bad = next) { + int n; + next = bad->next; + n = isc_time_compare(&bad->expire, &now); + if (n < 0 || dns_name_equal(name, &bad->name)) { + if (prev == NULL) { + bc->table[hash] = bad->next; + } else { + prev->next = bad->next; + } + + isc_mem_put(bc->mctx, bad, + sizeof(*bad) + bad->name.length); + atomic_fetch_sub_relaxed(&bc->count, 1); + } else { + prev = bad; + } + } + UNLOCK(&bc->tlocks[hash]); + + RWUNLOCK(&bc->lock, isc_rwlocktype_read); +} + +void +dns_badcache_flushtree(dns_badcache_t *bc, const dns_name_t *name) { + dns_bcentry_t *bad, *prev, *next; + unsigned int i; + int n; + isc_time_t now; + isc_result_t result; + + REQUIRE(VALID_BADCACHE(bc)); + REQUIRE(name != NULL); + + /* + * We write lock the tree to avoid relocking every node + * individually. + */ + RWLOCK(&bc->lock, isc_rwlocktype_write); + + result = isc_time_now(&now); + if (result != ISC_R_SUCCESS) { + isc_time_settoepoch(&now); + } + + for (i = 0; atomic_load_relaxed(&bc->count) > 0 && i < bc->size; i++) { + prev = NULL; + for (bad = bc->table[i]; bad != NULL; bad = next) { + next = bad->next; + n = isc_time_compare(&bad->expire, &now); + if (n < 0 || dns_name_issubdomain(&bad->name, name)) { + if (prev == NULL) { + bc->table[i] = bad->next; + } else { + prev->next = bad->next; + } + + isc_mem_put(bc->mctx, bad, + sizeof(*bad) + bad->name.length); + atomic_fetch_sub_relaxed(&bc->count, 1); + } else { + prev = bad; + } + } + } + + RWUNLOCK(&bc->lock, isc_rwlocktype_write); +} + +void +dns_badcache_print(dns_badcache_t *bc, const char *cachename, FILE *fp) { + char namebuf[DNS_NAME_FORMATSIZE]; + char typebuf[DNS_RDATATYPE_FORMATSIZE]; + dns_bcentry_t *bad, *next, *prev; + isc_time_t now; + unsigned int i; + uint64_t t; + + REQUIRE(VALID_BADCACHE(bc)); + REQUIRE(cachename != NULL); + REQUIRE(fp != NULL); + + /* + * We write lock the tree to avoid relocking every node + * individually. + */ + RWLOCK(&bc->lock, isc_rwlocktype_write); + fprintf(fp, ";\n; %s\n;\n", cachename); + + TIME_NOW(&now); + for (i = 0; atomic_load_relaxed(&bc->count) > 0 && i < bc->size; i++) { + prev = NULL; + for (bad = bc->table[i]; bad != NULL; bad = next) { + next = bad->next; + if (isc_time_compare(&bad->expire, &now) < 0) { + if (prev != NULL) { + prev->next = bad->next; + } else { + bc->table[i] = bad->next; + } + + isc_mem_put(bc->mctx, bad, + sizeof(*bad) + bad->name.length); + atomic_fetch_sub_relaxed(&bc->count, 1); + continue; + } + prev = bad; + dns_name_format(&bad->name, namebuf, sizeof(namebuf)); + dns_rdatatype_format(bad->type, typebuf, + sizeof(typebuf)); + t = isc_time_microdiff(&bad->expire, &now); + t /= 1000; + fprintf(fp, + "; %s/%s [ttl " + "%" PRIu64 "]\n", + namebuf, typebuf, t); + } + } + RWUNLOCK(&bc->lock, isc_rwlocktype_write); +} |