diff options
Diffstat (limited to '')
-rw-r--r-- | lib/dns/rrl.c | 1320 |
1 files changed, 1320 insertions, 0 deletions
diff --git a/lib/dns/rrl.c b/lib/dns/rrl.c new file mode 100644 index 0000000..cb7e7b6 --- /dev/null +++ b/lib/dns/rrl.c @@ -0,0 +1,1320 @@ +/* + * Copyright (C) Internet Systems Consortium, Inc. ("ISC") + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * See the COPYRIGHT file distributed with this work for additional + * information regarding copyright ownership. + */ + +/*! \file */ + +/* + * Rate limit DNS responses. + */ + +/* #define ISC_LIST_CHECKINIT */ + +#include <config.h> + +#include <inttypes.h> +#include <stdbool.h> + +#include <isc/mem.h> +#include <isc/net.h> +#include <isc/netaddr.h> +#include <isc/print.h> +#include <isc/util.h> + +#include <dns/result.h> +#include <dns/rcode.h> +#include <dns/rdatatype.h> +#include <dns/rdataclass.h> +#include <dns/log.h> +#include <dns/rrl.h> +#include <dns/view.h> + +static void +log_end(dns_rrl_t *rrl, dns_rrl_entry_t *e, bool early, + char *log_buf, unsigned int log_buf_len); + +/* + * Get a modulus for a hash function that is tolerably likely to be + * relatively prime to most inputs. Of course, we get a prime for for initial + * values not larger than the square of the last prime. We often get a prime + * after that. + * This works well in practice for hash tables up to at least 100 + * times the square of the last prime and better than a multiplicative hash. + */ +static int +hash_divisor(unsigned int initial) { + static uint16_t primes[] = { + 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, + 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, +#if 0 + 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151, 157, + 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, + 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, + 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, 367, + 373, 379, 383, 389, 397, 401, 409, 419, 421, 431, 433, 439, + 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503, 509, + 521, 523, 541, 547, 557, 563, 569, 571, 577, 587, 593, 599, + 601, 607, 613, 617, 619, 631, 641, 643, 647, 653, 659, 661, + 673, 677, 683, 691, 701, 709, 719, 727, 733, 739, 743, 751, + 757, 761, 769, 773, 787, 797, 809, 811, 821, 823, 827, 829, + 839, 853, 857, 859, 863, 877, 881, 883, 887, 907, 911, 919, + 929, 937, 941, 947, 953, 967, 971, 977, 983, 991, 997,1009, +#endif + }; + int divisions, tries; + unsigned int result; + uint16_t *pp, p; + + result = initial; + + if (primes[sizeof(primes)/sizeof(primes[0])-1] >= result) { + pp = primes; + while (*pp < result) + ++pp; + return (*pp); + } + + if ((result & 1) == 0) + ++result; + + divisions = 0; + tries = 1; + pp = primes; + do { + p = *pp++; + ++divisions; + if ((result % p) == 0) { + ++tries; + result += 2; + pp = primes; + } + } while (pp < &primes[sizeof(primes) / sizeof(primes[0])]); + + if (isc_log_wouldlog(dns_lctx, DNS_RRL_LOG_DEBUG3)) + isc_log_write(dns_lctx, DNS_LOGCATEGORY_RRL, + DNS_LOGMODULE_REQUEST, DNS_RRL_LOG_DEBUG3, + "%d hash_divisor() divisions in %d tries" + " to get %d from %d", + divisions, tries, result, initial); + + return (result); +} + +/* + * Convert a timestamp to a number of seconds in the past. + */ +static inline int +delta_rrl_time(isc_stdtime_t ts, isc_stdtime_t now) { + int delta; + + delta = now - ts; + if (delta >= 0) + return (delta); + + /* + * The timestamp is in the future. That future might result from + * re-ordered requests, because we use timestamps on requests + * instead of consulting a clock. Timestamps in the distant future are + * assumed to result from clock changes. When the clock changes to + * the past, make existing timestamps appear to be in the past. + */ + if (delta < -DNS_RRL_MAX_TIME_TRAVEL) + return (DNS_RRL_FOREVER); + return (0); +} + +static inline int +get_age(const dns_rrl_t *rrl, const dns_rrl_entry_t *e, isc_stdtime_t now) { + if (!e->ts_valid) + return (DNS_RRL_FOREVER); + return (delta_rrl_time(e->ts + rrl->ts_bases[e->ts_gen], now)); +} + +static inline void +set_age(dns_rrl_t *rrl, dns_rrl_entry_t *e, isc_stdtime_t now) { + dns_rrl_entry_t *e_old; + unsigned int ts_gen; + int i, ts; + + ts_gen = rrl->ts_gen; + ts = now - rrl->ts_bases[ts_gen]; + if (ts < 0) { + if (ts < -DNS_RRL_MAX_TIME_TRAVEL) + ts = DNS_RRL_FOREVER; + else + ts = 0; + } + + /* + * Make a new timestamp base if the current base is too old. + * All entries older than DNS_RRL_MAX_WINDOW seconds are ancient, + * useless history. Their timestamps can be treated as if they are + * all the same. + * We only do arithmetic on more recent timestamps, so bases for + * older timestamps can be recycled provided the old timestamps are + * marked as ancient history. + * This loop is almost always very short because most entries are + * recycled after one second and any entries that need to be marked + * are older than (DNS_RRL_TS_BASES)*DNS_RRL_MAX_TS seconds. + */ + if (ts >= DNS_RRL_MAX_TS) { + ts_gen = (ts_gen + 1) % DNS_RRL_TS_BASES; + for (e_old = ISC_LIST_TAIL(rrl->lru), i = 0; + e_old != NULL && (e_old->ts_gen == ts_gen || + !ISC_LINK_LINKED(e_old, hlink)); + e_old = ISC_LIST_PREV(e_old, lru), ++i) + { + e_old->ts_valid = false; + } + if (i != 0) + isc_log_write(dns_lctx, DNS_LOGCATEGORY_RRL, + DNS_LOGMODULE_REQUEST, DNS_RRL_LOG_DEBUG1, + "rrl new time base scanned %d entries" + " at %d for %d %d %d %d", + i, now, rrl->ts_bases[ts_gen], + rrl->ts_bases[(ts_gen + 1) % + DNS_RRL_TS_BASES], + rrl->ts_bases[(ts_gen + 2) % + DNS_RRL_TS_BASES], + rrl->ts_bases[(ts_gen + 3) % + DNS_RRL_TS_BASES]); + rrl->ts_gen = ts_gen; + rrl->ts_bases[ts_gen] = now; + ts = 0; + } + + e->ts_gen = ts_gen; + e->ts = ts; + e->ts_valid = true; +} + +static isc_result_t +expand_entries(dns_rrl_t *rrl, int newsize) { + unsigned int bsize; + dns_rrl_block_t *b; + dns_rrl_entry_t *e; + double rate; + int i; + + if (rrl->num_entries + newsize >= rrl->max_entries && + rrl->max_entries != 0) + { + newsize = rrl->max_entries - rrl->num_entries; + if (newsize <= 0) + return (ISC_R_SUCCESS); + } + + /* + * Log expansions so that the user can tune max-table-size + * and min-table-size. + */ + if (isc_log_wouldlog(dns_lctx, DNS_RRL_LOG_DROP) && + rrl->hash != NULL) { + rate = rrl->probes; + if (rrl->searches != 0) + rate /= rrl->searches; + isc_log_write(dns_lctx, DNS_LOGCATEGORY_RRL, + DNS_LOGMODULE_REQUEST, DNS_RRL_LOG_DROP, + "increase from %d to %d RRL entries with" + " %d bins; average search length %.1f", + rrl->num_entries, rrl->num_entries+newsize, + rrl->hash->length, rate); + } + + bsize = sizeof(dns_rrl_block_t) + (newsize-1)*sizeof(dns_rrl_entry_t); + b = isc_mem_get(rrl->mctx, bsize); + if (b == NULL) { + isc_log_write(dns_lctx, DNS_LOGCATEGORY_RRL, + DNS_LOGMODULE_REQUEST, DNS_RRL_LOG_FAIL, + "isc_mem_get(%d) failed for RRL entries", + bsize); + return (ISC_R_NOMEMORY); + } + memset(b, 0, bsize); + b->size = bsize; + + e = b->entries; + for (i = 0; i < newsize; ++i, ++e) { + ISC_LINK_INIT(e, hlink); + ISC_LIST_INITANDAPPEND(rrl->lru, e, lru); + } + rrl->num_entries += newsize; + ISC_LIST_INITANDAPPEND(rrl->blocks, b, link); + + return (ISC_R_SUCCESS); +} + +static inline dns_rrl_bin_t * +get_bin(dns_rrl_hash_t *hash, unsigned int hval) { + INSIST(hash != NULL); + return (&hash->bins[hval % hash->length]); +} + +static void +free_old_hash(dns_rrl_t *rrl) { + dns_rrl_hash_t *old_hash; + dns_rrl_bin_t *old_bin; + dns_rrl_entry_t *e, *e_next; + + old_hash = rrl->old_hash; + for (old_bin = &old_hash->bins[0]; + old_bin < &old_hash->bins[old_hash->length]; + ++old_bin) + { + for (e = ISC_LIST_HEAD(*old_bin); e != NULL; e = e_next) { + e_next = ISC_LIST_NEXT(e, hlink); + ISC_LINK_INIT(e, hlink); + } + } + + isc_mem_put(rrl->mctx, old_hash, + sizeof(*old_hash) + + (old_hash->length - 1) * sizeof(old_hash->bins[0])); + rrl->old_hash = NULL; +} + +static isc_result_t +expand_rrl_hash(dns_rrl_t *rrl, isc_stdtime_t now) { + dns_rrl_hash_t *hash; + int old_bins, new_bins, hsize; + double rate; + + if (rrl->old_hash != NULL) + free_old_hash(rrl); + + /* + * Most searches fail and so go to the end of the chain. + * Use a small hash table load factor. + */ + old_bins = (rrl->hash == NULL) ? 0 : rrl->hash->length; + new_bins = old_bins/8 + old_bins; + if (new_bins < rrl->num_entries) + new_bins = rrl->num_entries; + new_bins = hash_divisor(new_bins); + + hsize = sizeof(dns_rrl_hash_t) + (new_bins-1)*sizeof(hash->bins[0]); + hash = isc_mem_get(rrl->mctx, hsize); + if (hash == NULL) { + isc_log_write(dns_lctx, DNS_LOGCATEGORY_RRL, + DNS_LOGMODULE_REQUEST, DNS_RRL_LOG_FAIL, + "isc_mem_get(%d) failed for" + " RRL hash table", + hsize); + return (ISC_R_NOMEMORY); + } + memset(hash, 0, hsize); + hash->length = new_bins; + rrl->hash_gen ^= 1; + hash->gen = rrl->hash_gen; + + if (isc_log_wouldlog(dns_lctx, DNS_RRL_LOG_DROP) && old_bins != 0) { + rate = rrl->probes; + if (rrl->searches != 0) + rate /= rrl->searches; + isc_log_write(dns_lctx, DNS_LOGCATEGORY_RRL, + DNS_LOGMODULE_REQUEST, DNS_RRL_LOG_DROP, + "increase from %d to %d RRL bins for" + " %d entries; average search length %.1f", + old_bins, new_bins, rrl->num_entries, rate); + } + + rrl->old_hash = rrl->hash; + if (rrl->old_hash != NULL) + rrl->old_hash->check_time = now; + rrl->hash = hash; + + return (ISC_R_SUCCESS); +} + +static void +ref_entry(dns_rrl_t *rrl, dns_rrl_entry_t *e, int probes, isc_stdtime_t now) { + /* + * Make the entry most recently used. + */ + if (ISC_LIST_HEAD(rrl->lru) != e) { + if (e == rrl->last_logged) + rrl->last_logged = ISC_LIST_PREV(e, lru); + ISC_LIST_UNLINK(rrl->lru, e, lru); + ISC_LIST_PREPEND(rrl->lru, e, lru); + } + + /* + * Expand the hash table if it is time and necessary. + * This will leave the newly referenced entry in a chain in the + * old hash table. It will migrate to the new hash table the next + * time it is used or be cut loose when the old hash table is destroyed. + */ + rrl->probes += probes; + ++rrl->searches; + if (rrl->searches > 100 && + delta_rrl_time(rrl->hash->check_time, now) > 1) { + if (rrl->probes/rrl->searches > 2) + expand_rrl_hash(rrl, now); + rrl->hash->check_time = now; + rrl->probes = 0; + rrl->searches = 0; + } +} + +static inline bool +key_cmp(const dns_rrl_key_t *a, const dns_rrl_key_t *b) { + if (memcmp(a, b, sizeof(dns_rrl_key_t)) == 0) + return (true); + return (false); +} + +static inline uint32_t +hash_key(const dns_rrl_key_t *key) { + uint32_t hval; + int i; + + hval = key->w[0]; + for (i = sizeof(key->w) / sizeof(key->w[0]) - 1; i >= 0; --i) { + hval = key->w[i] + (hval<<1); + } + return (hval); +} + +/* + * Construct the hash table key. + * Use a hash of the DNS query name to save space in the database. + * Collisions result in legitimate rate limiting responses for one + * query name also limiting responses for other names to the + * same client. This is rare and benign enough given the large + * space costs compared to keeping the entire name in the database + * entry or the time costs of dynamic allocation. + */ +static void +make_key(const dns_rrl_t *rrl, dns_rrl_key_t *key, + const isc_sockaddr_t *client_addr, + dns_rdatatype_t qtype, dns_name_t *qname, dns_rdataclass_t qclass, + dns_rrl_rtype_t rtype) +{ + dns_name_t base; + dns_offsets_t base_offsets; + int labels, i; + + memset(key, 0, sizeof(*key)); + + key->s.rtype = rtype; + if (rtype == DNS_RRL_RTYPE_QUERY) { + key->s.qtype = qtype; + key->s.qclass = qclass & 0xff; + } else if (rtype == DNS_RRL_RTYPE_REFERRAL || + rtype == DNS_RRL_RTYPE_NODATA) { + /* + * Because there is no qtype in the empty answer sections of + * referral and NODATA responses, count them as the same. + */ + key->s.qclass = qclass & 0xff; + } + + if (qname != NULL && qname->labels != 0) { + /* + * Ignore the first label of wildcards. + */ + if ((qname->attributes & DNS_NAMEATTR_WILDCARD) != 0 && + (labels = dns_name_countlabels(qname)) > 1) + { + dns_name_init(&base, base_offsets); + dns_name_getlabelsequence(qname, 1, labels-1, &base); + key->s.qname_hash = + dns_name_fullhash(&base, false); + } else { + key->s.qname_hash = + dns_name_fullhash(qname, false); + } + } + + switch (client_addr->type.sa.sa_family) { + case AF_INET: + key->s.ip[0] = (client_addr->type.sin.sin_addr.s_addr & + rrl->ipv4_mask); + break; + case AF_INET6: + key->s.ipv6 = true; + memmove(key->s.ip, &client_addr->type.sin6.sin6_addr, + sizeof(key->s.ip)); + for (i = 0; i < DNS_RRL_MAX_PREFIX/32; ++i) + key->s.ip[i] &= rrl->ipv6_mask[i]; + break; + } +} + +static inline dns_rrl_rate_t * +get_rate(dns_rrl_t *rrl, dns_rrl_rtype_t rtype) { + switch (rtype) { + case DNS_RRL_RTYPE_QUERY: + return (&rrl->responses_per_second); + case DNS_RRL_RTYPE_REFERRAL: + return (&rrl->referrals_per_second); + case DNS_RRL_RTYPE_NODATA: + return (&rrl->nodata_per_second); + case DNS_RRL_RTYPE_NXDOMAIN: + return (&rrl->nxdomains_per_second); + case DNS_RRL_RTYPE_ERROR: + return (&rrl->errors_per_second); + case DNS_RRL_RTYPE_ALL: + return (&rrl->all_per_second); + default: + INSIST(0); + } + return (NULL); +} + +static int +response_balance(dns_rrl_t *rrl, const dns_rrl_entry_t *e, int age) { + dns_rrl_rate_t *ratep; + int balance, rate; + + if (e->key.s.rtype == DNS_RRL_RTYPE_TCP) { + rate = 1; + } else { + ratep = get_rate(rrl, e->key.s.rtype); + rate = ratep->scaled; + } + + balance = e->responses + age * rate; + if (balance > rate) + balance = rate; + return (balance); +} + +/* + * Search for an entry for a response and optionally create it. + */ +static dns_rrl_entry_t * +get_entry(dns_rrl_t *rrl, const isc_sockaddr_t *client_addr, + dns_rdataclass_t qclass, dns_rdatatype_t qtype, dns_name_t *qname, + dns_rrl_rtype_t rtype, isc_stdtime_t now, bool create, + char *log_buf, unsigned int log_buf_len) +{ + dns_rrl_key_t key; + uint32_t hval; + dns_rrl_entry_t *e; + dns_rrl_hash_t *hash; + dns_rrl_bin_t *new_bin, *old_bin; + int probes, age; + + make_key(rrl, &key, client_addr, qtype, qname, qclass, rtype); + hval = hash_key(&key); + + /* + * Look for the entry in the current hash table. + */ + new_bin = get_bin(rrl->hash, hval); + probes = 1; + e = ISC_LIST_HEAD(*new_bin); + while (e != NULL) { + if (key_cmp(&e->key, &key)) { + ref_entry(rrl, e, probes, now); + return (e); + } + ++probes; + e = ISC_LIST_NEXT(e, hlink); + } + + /* + * Look in the old hash table. + */ + if (rrl->old_hash != NULL) { + old_bin = get_bin(rrl->old_hash, hval); + e = ISC_LIST_HEAD(*old_bin); + while (e != NULL) { + if (key_cmp(&e->key, &key)) { + ISC_LIST_UNLINK(*old_bin, e, hlink); + ISC_LIST_PREPEND(*new_bin, e, hlink); + e->hash_gen = rrl->hash_gen; + ref_entry(rrl, e, probes, now); + return (e); + } + e = ISC_LIST_NEXT(e, hlink); + } + + /* + * Discard prevous hash table when all of its entries are old. + */ + age = delta_rrl_time(rrl->old_hash->check_time, now); + if (age > rrl->window) + free_old_hash(rrl); + } + + if (!create) + return (NULL); + + /* + * The entry does not exist, so create it by finding a free entry. + * Keep currently penalized and logged entries. + * Try to make more entries if none are idle. + * Steal the oldest entry if we cannot create more. + */ + for (e = ISC_LIST_TAIL(rrl->lru); + e != NULL; + e = ISC_LIST_PREV(e, lru)) + { + if (!ISC_LINK_LINKED(e, hlink)) + break; + age = get_age(rrl, e, now); + if (age <= 1) { + e = NULL; + break; + } + if (!e->logged && response_balance(rrl, e, age) > 0) + break; + } + if (e == NULL) { + expand_entries(rrl, ISC_MIN((rrl->num_entries+1)/2, 1000)); + e = ISC_LIST_TAIL(rrl->lru); + } + if (e->logged) + log_end(rrl, e, true, log_buf, log_buf_len); + if (ISC_LINK_LINKED(e, hlink)) { + if (e->hash_gen == rrl->hash_gen) + hash = rrl->hash; + else + hash = rrl->old_hash; + old_bin = get_bin(hash, hash_key(&e->key)); + ISC_LIST_UNLINK(*old_bin, e, hlink); + } + ISC_LIST_PREPEND(*new_bin, e, hlink); + e->hash_gen = rrl->hash_gen; + e->key = key; + e->ts_valid = false; + ref_entry(rrl, e, probes, now); + return (e); +} + +static void +debit_log(const dns_rrl_entry_t *e, int age, const char *action) { + char buf[sizeof("age=12345678")]; + const char *age_str; + + if (age == DNS_RRL_FOREVER) { + age_str = ""; + } else { + snprintf(buf, sizeof(buf), "age=%d", age); + age_str = buf; + } + isc_log_write(dns_lctx, DNS_LOGCATEGORY_RRL, + DNS_LOGMODULE_REQUEST, DNS_RRL_LOG_DEBUG3, + "rrl %08x %6s responses=%-3d %s", + hash_key(&e->key), age_str, e->responses, action); +} + +static inline dns_rrl_result_t +debit_rrl_entry(dns_rrl_t *rrl, dns_rrl_entry_t *e, double qps, double scale, + const isc_sockaddr_t *client_addr, isc_stdtime_t now, + char *log_buf, unsigned int log_buf_len) +{ + int rate, new_rate, slip, new_slip, age, log_secs, min; + dns_rrl_rate_t *ratep; + dns_rrl_entry_t const *credit_e; + + /* + * Pick the rate counter. + * Optionally adjust the rate by the estimated query/second rate. + */ + ratep = get_rate(rrl, e->key.s.rtype); + rate = ratep->r; + if (rate == 0) + return (DNS_RRL_RESULT_OK); + + if (scale < 1.0) { + /* + * The limit for clients that have used TCP is not scaled. + */ + credit_e = get_entry(rrl, client_addr, + 0, dns_rdatatype_none, NULL, + DNS_RRL_RTYPE_TCP, now, false, + log_buf, log_buf_len); + if (credit_e != NULL) { + age = get_age(rrl, e, now); + if (age < rrl->window) + scale = 1.0; + } + } + if (scale < 1.0) { + new_rate = (int) (rate * scale); + if (new_rate < 1) + new_rate = 1; + if (ratep->scaled != new_rate) { + isc_log_write(dns_lctx, DNS_LOGCATEGORY_RRL, + DNS_LOGMODULE_REQUEST, + DNS_RRL_LOG_DEBUG1, + "%d qps scaled %s by %.2f" + " from %d to %d", + (int)qps, ratep->str, scale, + rate, new_rate); + rate = new_rate; + ratep->scaled = rate; + } + } + + min = -rrl->window * rate; + + /* + * Treat time jumps into the recent past as no time. + * Treat entries older than the window as if they were just created + * Credit other entries. + */ + age = get_age(rrl, e, now); + if (age > 0) { + /* + * Credit tokens earned during elapsed time. + */ + if (age > rrl->window) { + e->responses = rate; + e->slip_cnt = 0; + } else { + e->responses += rate*age; + if (e->responses > rate) { + e->responses = rate; + e->slip_cnt = 0; + } + } + /* + * Find the seconds since last log message without overflowing + * small counter. This counter is reset when an entry is + * created. It is not necessarily reset when some requests + * are answered provided other requests continue to be dropped + * or slipped. This can happen when the request rate is just + * at the limit. + */ + if (e->logged) { + log_secs = e->log_secs; + log_secs += age; + if (log_secs > DNS_RRL_MAX_LOG_SECS || log_secs < 0) + log_secs = DNS_RRL_MAX_LOG_SECS; + e->log_secs = log_secs; + } + } + set_age(rrl, e, now); + + /* + * Debit the entry for this response. + */ + if (--e->responses >= 0) { + if (isc_log_wouldlog(dns_lctx, DNS_RRL_LOG_DEBUG3)) + debit_log(e, age, ""); + return (DNS_RRL_RESULT_OK); + } + + if (e->responses < min) + e->responses = min; + + /* + * Drop this response unless it should slip or leak. + */ + slip = rrl->slip.r; + if (slip > 2 && scale < 1.0) { + new_slip = (int) (slip * scale); + if (new_slip < 2) + new_slip = 2; + if (rrl->slip.scaled != new_slip) { + isc_log_write(dns_lctx, DNS_LOGCATEGORY_RRL, + DNS_LOGMODULE_REQUEST, + DNS_RRL_LOG_DEBUG1, + "%d qps scaled slip" + " by %.2f from %d to %d", + (int)qps, scale, + slip, new_slip); + slip = new_slip; + rrl->slip.scaled = slip; + } + } + if (slip != 0 && e->key.s.rtype != DNS_RRL_RTYPE_ALL) { + if (e->slip_cnt++ == 0) { + if ((int) e->slip_cnt >= slip) + e->slip_cnt = 0; + if (isc_log_wouldlog(dns_lctx, DNS_RRL_LOG_DEBUG3)) + debit_log(e, age, "slip"); + return (DNS_RRL_RESULT_SLIP); + } else if ((int) e->slip_cnt >= slip) { + e->slip_cnt = 0; + } + } + + if (isc_log_wouldlog(dns_lctx, DNS_RRL_LOG_DEBUG3)) + debit_log(e, age, "drop"); + return (DNS_RRL_RESULT_DROP); +} + +static inline dns_rrl_qname_buf_t * +get_qname(dns_rrl_t *rrl, const dns_rrl_entry_t *e) { + dns_rrl_qname_buf_t *qbuf; + + qbuf = rrl->qnames[e->log_qname]; + if (qbuf == NULL || qbuf->e != e) + return (NULL); + return (qbuf); +} + +static inline void +free_qname(dns_rrl_t *rrl, dns_rrl_entry_t *e) { + dns_rrl_qname_buf_t *qbuf; + + qbuf = get_qname(rrl, e); + if (qbuf != NULL) { + qbuf->e = NULL; + ISC_LIST_APPEND(rrl->qname_free, qbuf, link); + } +} + +static void +add_log_str(isc_buffer_t *lb, const char *str, unsigned int str_len) { + isc_region_t region; + + isc_buffer_availableregion(lb, ®ion); + if (str_len >= region.length) { + if (region.length == 0U) + return; + str_len = region.length; + } + memmove(region.base, str, str_len); + isc_buffer_add(lb, str_len); +} + +#define ADD_LOG_CSTR(eb, s) add_log_str(eb, s, sizeof(s)-1) + +/* + * Build strings for the logs + */ +static void +make_log_buf(dns_rrl_t *rrl, dns_rrl_entry_t *e, + const char *str1, const char *str2, bool plural, + dns_name_t *qname, bool save_qname, + dns_rrl_result_t rrl_result, isc_result_t resp_result, + char *log_buf, unsigned int log_buf_len) +{ + isc_buffer_t lb; + dns_rrl_qname_buf_t *qbuf; + isc_netaddr_t cidr; + char strbuf[ISC_MAX(sizeof("/123"), sizeof(" (12345678)"))]; + const char *rstr; + isc_result_t msg_result; + + if (log_buf_len <= 1) { + if (log_buf_len == 1) + log_buf[0] = '\0'; + return; + } + isc_buffer_init(&lb, log_buf, log_buf_len-1); + + if (str1 != NULL) + add_log_str(&lb, str1, strlen(str1)); + if (str2 != NULL) + add_log_str(&lb, str2, strlen(str2)); + + switch (rrl_result) { + case DNS_RRL_RESULT_OK: + break; + case DNS_RRL_RESULT_DROP: + ADD_LOG_CSTR(&lb, "drop "); + break; + case DNS_RRL_RESULT_SLIP: + ADD_LOG_CSTR(&lb, "slip "); + break; + default: + INSIST(0); + break; + } + + switch (e->key.s.rtype) { + case DNS_RRL_RTYPE_QUERY: + break; + case DNS_RRL_RTYPE_REFERRAL: + ADD_LOG_CSTR(&lb, "referral "); + break; + case DNS_RRL_RTYPE_NODATA: + ADD_LOG_CSTR(&lb, "NODATA "); + break; + case DNS_RRL_RTYPE_NXDOMAIN: + ADD_LOG_CSTR(&lb, "NXDOMAIN "); + break; + case DNS_RRL_RTYPE_ERROR: + if (resp_result == ISC_R_SUCCESS) { + ADD_LOG_CSTR(&lb, "error "); + } else { + rstr = isc_result_totext(resp_result); + add_log_str(&lb, rstr, strlen(rstr)); + ADD_LOG_CSTR(&lb, " error "); + } + break; + case DNS_RRL_RTYPE_ALL: + ADD_LOG_CSTR(&lb, "all "); + break; + default: + INSIST(0); + } + + if (plural) + ADD_LOG_CSTR(&lb, "responses to "); + else + ADD_LOG_CSTR(&lb, "response to "); + + memset(&cidr, 0, sizeof(cidr)); + if (e->key.s.ipv6) { + snprintf(strbuf, sizeof(strbuf), "/%d", rrl->ipv6_prefixlen); + cidr.family = AF_INET6; + memset(&cidr.type.in6, 0, sizeof(cidr.type.in6)); + memmove(&cidr.type.in6, e->key.s.ip, sizeof(e->key.s.ip)); + } else { + snprintf(strbuf, sizeof(strbuf), "/%d", rrl->ipv4_prefixlen); + cidr.family = AF_INET; + cidr.type.in.s_addr = e->key.s.ip[0]; + } + msg_result = isc_netaddr_totext(&cidr, &lb); + if (msg_result != ISC_R_SUCCESS) + ADD_LOG_CSTR(&lb, "?"); + add_log_str(&lb, strbuf, strlen(strbuf)); + + if (e->key.s.rtype == DNS_RRL_RTYPE_QUERY || + e->key.s.rtype == DNS_RRL_RTYPE_REFERRAL || + e->key.s.rtype == DNS_RRL_RTYPE_NODATA || + e->key.s.rtype == DNS_RRL_RTYPE_NXDOMAIN) { + qbuf = get_qname(rrl, e); + if (save_qname && qbuf == NULL && + qname != NULL && dns_name_isabsolute(qname)) { + /* + * Capture the qname for the "stop limiting" message. + */ + qbuf = ISC_LIST_TAIL(rrl->qname_free); + if (qbuf != NULL) { + ISC_LIST_UNLINK(rrl->qname_free, qbuf, link); + } else if (rrl->num_qnames < DNS_RRL_QNAMES) { + qbuf = isc_mem_get(rrl->mctx, sizeof(*qbuf)); + if (qbuf != NULL) { + memset(qbuf, 0, sizeof(*qbuf)); + ISC_LINK_INIT(qbuf, link); + qbuf->index = rrl->num_qnames; + rrl->qnames[rrl->num_qnames++] = qbuf; + } else { + isc_log_write(dns_lctx, + DNS_LOGCATEGORY_RRL, + DNS_LOGMODULE_REQUEST, + DNS_RRL_LOG_FAIL, + "isc_mem_get(%d)" + " failed for RRL qname", + (int)sizeof(*qbuf)); + } + } + if (qbuf != NULL) { + e->log_qname = qbuf->index; + qbuf->e = e; + dns_fixedname_init(&qbuf->qname); + dns_name_copy(qname, + dns_fixedname_name(&qbuf->qname), + NULL); + } + } + if (qbuf != NULL) + qname = dns_fixedname_name(&qbuf->qname); + if (qname != NULL) { + ADD_LOG_CSTR(&lb, " for "); + (void)dns_name_totext(qname, true, &lb); + } else { + ADD_LOG_CSTR(&lb, " for (?)"); + } + if (e->key.s.rtype != DNS_RRL_RTYPE_NXDOMAIN) { + ADD_LOG_CSTR(&lb, " "); + (void)dns_rdataclass_totext(e->key.s.qclass, &lb); + if (e->key.s.rtype == DNS_RRL_RTYPE_QUERY) { + ADD_LOG_CSTR(&lb, " "); + (void)dns_rdatatype_totext(e->key.s.qtype, &lb); + } + } + snprintf(strbuf, sizeof(strbuf), " (%08x)", + e->key.s.qname_hash); + add_log_str(&lb, strbuf, strlen(strbuf)); + } + + /* + * We saved room for '\0'. + */ + log_buf[isc_buffer_usedlength(&lb)] = '\0'; +} + +static void +log_end(dns_rrl_t *rrl, dns_rrl_entry_t *e, bool early, + char *log_buf, unsigned int log_buf_len) +{ + if (e->logged) { + make_log_buf(rrl, e, + early ? "*" : NULL, + rrl->log_only ? "would stop limiting " + : "stop limiting ", + true, NULL, false, + DNS_RRL_RESULT_OK, ISC_R_SUCCESS, + log_buf, log_buf_len); + isc_log_write(dns_lctx, DNS_LOGCATEGORY_RRL, + DNS_LOGMODULE_REQUEST, DNS_RRL_LOG_DROP, + "%s", log_buf); + free_qname(rrl, e); + e->logged = false; + --rrl->num_logged; + } +} + +/* + * Log messages for streams that have stopped being rate limited. + */ +static void +log_stops(dns_rrl_t *rrl, isc_stdtime_t now, int limit, + char *log_buf, unsigned int log_buf_len) +{ + dns_rrl_entry_t *e; + int age; + + for (e = rrl->last_logged; e != NULL; e = ISC_LIST_PREV(e, lru)) { + if (!e->logged) + continue; + if (now != 0) { + age = get_age(rrl, e, now); + if (age < DNS_RRL_STOP_LOG_SECS || + response_balance(rrl, e, age) < 0) + break; + } + + log_end(rrl, e, now == 0, log_buf, log_buf_len); + if (rrl->num_logged <= 0) + break; + + /* + * Too many messages could stall real work. + */ + if (--limit < 0) { + rrl->last_logged = ISC_LIST_PREV(e, lru); + return; + } + } + if (e == NULL) { + INSIST(rrl->num_logged == 0); + rrl->log_stops_time = now; + } + rrl->last_logged = e; +} + +/* + * Main rate limit interface. + */ +dns_rrl_result_t +dns_rrl(dns_view_t *view, + const isc_sockaddr_t *client_addr, bool is_tcp, + dns_rdataclass_t qclass, dns_rdatatype_t qtype, + dns_name_t *qname, isc_result_t resp_result, isc_stdtime_t now, + bool wouldlog, char *log_buf, unsigned int log_buf_len) +{ + dns_rrl_t *rrl; + dns_rrl_rtype_t rtype; + dns_rrl_entry_t *e; + isc_netaddr_t netclient; + int secs; + double qps, scale; + int exempt_match; + isc_result_t result; + dns_rrl_result_t rrl_result; + + INSIST(log_buf != NULL && log_buf_len > 0); + + rrl = view->rrl; + if (rrl->exempt != NULL) { + isc_netaddr_fromsockaddr(&netclient, client_addr); + result = dns_acl_match(&netclient, NULL, rrl->exempt, + &view->aclenv, &exempt_match, NULL); + if (result == ISC_R_SUCCESS && exempt_match > 0) + return (DNS_RRL_RESULT_OK); + } + + LOCK(&rrl->lock); + + /* + * Estimate total query per second rate when scaling by qps. + */ + if (rrl->qps_scale == 0) { + qps = 0.0; + scale = 1.0; + } else { + ++rrl->qps_responses; + secs = delta_rrl_time(rrl->qps_time, now); + if (secs <= 0) { + qps = rrl->qps; + } else { + qps = (1.0*rrl->qps_responses) / secs; + if (secs >= rrl->window) { + if (isc_log_wouldlog(dns_lctx, + DNS_RRL_LOG_DEBUG3)) + isc_log_write(dns_lctx, + DNS_LOGCATEGORY_RRL, + DNS_LOGMODULE_REQUEST, + DNS_RRL_LOG_DEBUG3, + "%d responses/%d seconds" + " = %d qps", + rrl->qps_responses, secs, + (int)qps); + rrl->qps = qps; + rrl->qps_responses = 0; + rrl->qps_time = now; + } else if (qps < rrl->qps) { + qps = rrl->qps; + } + } + scale = rrl->qps_scale / qps; + } + + /* + * Do maintenance once per second. + */ + if (rrl->num_logged > 0 && rrl->log_stops_time != now) + log_stops(rrl, now, 8, log_buf, log_buf_len); + + /* + * Notice TCP responses when scaling limits by qps. + * Do not try to rate limit TCP responses. + */ + if (is_tcp) { + if (scale < 1.0) { + e = get_entry(rrl, client_addr, + 0, dns_rdatatype_none, NULL, + DNS_RRL_RTYPE_TCP, now, true, + log_buf, log_buf_len); + if (e != NULL) { + e->responses = -(rrl->window+1); + set_age(rrl, e, now); + } + } + UNLOCK(&rrl->lock); + return (ISC_R_SUCCESS); + } + + /* + * Find the right kind of entry, creating it if necessary. + * If that is impossible, then nothing more can be done + */ + switch (resp_result) { + case ISC_R_SUCCESS: + rtype = DNS_RRL_RTYPE_QUERY; + break; + case DNS_R_DELEGATION: + rtype = DNS_RRL_RTYPE_REFERRAL; + break; + case DNS_R_NXRRSET: + rtype = DNS_RRL_RTYPE_NODATA; + break; + case DNS_R_NXDOMAIN: + rtype = DNS_RRL_RTYPE_NXDOMAIN; + break; + default: + rtype = DNS_RRL_RTYPE_ERROR; + break; + } + e = get_entry(rrl, client_addr, qclass, qtype, qname, rtype, + now, true, log_buf, log_buf_len); + if (e == NULL) { + UNLOCK(&rrl->lock); + return (DNS_RRL_RESULT_OK); + } + + if (isc_log_wouldlog(dns_lctx, DNS_RRL_LOG_DEBUG1)) { + /* + * Do not worry about speed or releasing the lock. + * This message appears before messages from debit_rrl_entry(). + */ + make_log_buf(rrl, e, "consider limiting ", NULL, false, + qname, false, DNS_RRL_RESULT_OK, resp_result, + log_buf, log_buf_len); + isc_log_write(dns_lctx, DNS_LOGCATEGORY_RRL, + DNS_LOGMODULE_REQUEST, DNS_RRL_LOG_DEBUG1, + "%s", log_buf); + } + + rrl_result = debit_rrl_entry(rrl, e, qps, scale, client_addr, now, + log_buf, log_buf_len); + + if (rrl->all_per_second.r != 0) { + /* + * We must debit the all-per-second token bucket if we have + * an all-per-second limit for the IP address. + * The all-per-second limit determines the log message + * when both limits are hit. + * The response limiting must continue if the + * all-per-second limiting lapses. + */ + dns_rrl_entry_t *e_all; + dns_rrl_result_t rrl_all_result; + + e_all = get_entry(rrl, client_addr, + 0, dns_rdatatype_none, NULL, + DNS_RRL_RTYPE_ALL, now, true, + log_buf, log_buf_len); + if (e_all == NULL) { + UNLOCK(&rrl->lock); + return (DNS_RRL_RESULT_OK); + } + rrl_all_result = debit_rrl_entry(rrl, e_all, qps, scale, + client_addr, now, + log_buf, log_buf_len); + if (rrl_all_result != DNS_RRL_RESULT_OK) { + e = e_all; + rrl_result = rrl_all_result; + if (isc_log_wouldlog(dns_lctx, DNS_RRL_LOG_DEBUG1)) { + make_log_buf(rrl, e, + "prefer all-per-second limiting ", + NULL, true, qname, false, + DNS_RRL_RESULT_OK, resp_result, + log_buf, log_buf_len); + isc_log_write(dns_lctx, DNS_LOGCATEGORY_RRL, + DNS_LOGMODULE_REQUEST, + DNS_RRL_LOG_DEBUG1, + "%s", log_buf); + } + } + } + + if (rrl_result == DNS_RRL_RESULT_OK) { + UNLOCK(&rrl->lock); + return (DNS_RRL_RESULT_OK); + } + + /* + * Log occassionally in the rate-limit category. + */ + if ((!e->logged || e->log_secs >= DNS_RRL_MAX_LOG_SECS) && + isc_log_wouldlog(dns_lctx, DNS_RRL_LOG_DROP)) { + make_log_buf(rrl, e, rrl->log_only ? "would " : NULL, + e->logged ? "continue limiting " : "limit ", + true, qname, true, + DNS_RRL_RESULT_OK, resp_result, + log_buf, log_buf_len); + if (!e->logged) { + e->logged = true; + if (++rrl->num_logged <= 1) + rrl->last_logged = e; + } + e->log_secs = 0; + + /* + * Avoid holding the lock. + */ + if (!wouldlog) { + UNLOCK(&rrl->lock); + e = NULL; + } + isc_log_write(dns_lctx, DNS_LOGCATEGORY_RRL, + DNS_LOGMODULE_REQUEST, DNS_RRL_LOG_DROP, + "%s", log_buf); + } + + /* + * Make a log message for the caller. + */ + if (wouldlog) + make_log_buf(rrl, e, + rrl->log_only ? "would rate limit " : "rate limit ", + NULL, false, qname, false, + rrl_result, resp_result, log_buf, log_buf_len); + + if (e != NULL) { + /* + * Do not save the qname unless we might need it for + * the ending log message. + */ + if (!e->logged) + free_qname(rrl, e); + UNLOCK(&rrl->lock); + } + + return (rrl_result); +} + +void +dns_rrl_view_destroy(dns_view_t *view) { + dns_rrl_t *rrl; + dns_rrl_block_t *b; + dns_rrl_hash_t *h; + char log_buf[DNS_RRL_LOG_BUF_LEN]; + int i; + + rrl = view->rrl; + if (rrl == NULL) + return; + view->rrl = NULL; + + /* + * Assume the caller takes care of locking the view and anything else. + */ + + if (rrl->num_logged > 0) + log_stops(rrl, 0, INT32_MAX, log_buf, sizeof(log_buf)); + + for (i = 0; i < DNS_RRL_QNAMES; ++i) { + if (rrl->qnames[i] == NULL) + break; + isc_mem_put(rrl->mctx, rrl->qnames[i], sizeof(*rrl->qnames[i])); + } + + if (rrl->exempt != NULL) + dns_acl_detach(&rrl->exempt); + + DESTROYLOCK(&rrl->lock); + + while (!ISC_LIST_EMPTY(rrl->blocks)) { + b = ISC_LIST_HEAD(rrl->blocks); + ISC_LIST_UNLINK(rrl->blocks, b, link); + isc_mem_put(rrl->mctx, b, b->size); + } + + h = rrl->hash; + if (h != NULL) + isc_mem_put(rrl->mctx, h, + sizeof(*h) + (h->length - 1) * sizeof(h->bins[0])); + + h = rrl->old_hash; + if (h != NULL) + isc_mem_put(rrl->mctx, h, + sizeof(*h) + (h->length - 1) * sizeof(h->bins[0])); + + isc_mem_putanddetach(&rrl->mctx, rrl, sizeof(*rrl)); +} + +isc_result_t +dns_rrl_init(dns_rrl_t **rrlp, dns_view_t *view, int min_entries) { + dns_rrl_t *rrl; + isc_result_t result; + + *rrlp = NULL; + + rrl = isc_mem_get(view->mctx, sizeof(*rrl)); + if (rrl == NULL) + return (ISC_R_NOMEMORY); + memset(rrl, 0, sizeof(*rrl)); + isc_mem_attach(view->mctx, &rrl->mctx); + result = isc_mutex_init(&rrl->lock); + if (result != ISC_R_SUCCESS) { + isc_mem_putanddetach(&rrl->mctx, rrl, sizeof(*rrl)); + return (result); + } + isc_stdtime_get(&rrl->ts_bases[0]); + + view->rrl = rrl; + + result = expand_entries(rrl, min_entries); + if (result != ISC_R_SUCCESS) { + dns_rrl_view_destroy(view); + return (result); + } + result = expand_rrl_hash(rrl, 0); + if (result != ISC_R_SUCCESS) { + dns_rrl_view_destroy(view); + return (result); + } + + *rrlp = rrl; + return (ISC_R_SUCCESS); +} |