summaryrefslogtreecommitdiffstats
path: root/lib/nsrep.c
diff options
context:
space:
mode:
Diffstat (limited to 'lib/nsrep.c')
-rw-r--r--lib/nsrep.c563
1 files changed, 563 insertions, 0 deletions
diff --git a/lib/nsrep.c b/lib/nsrep.c
new file mode 100644
index 0000000..2af12f2
--- /dev/null
+++ b/lib/nsrep.c
@@ -0,0 +1,563 @@
+/* Copyright (C) 2014-2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <assert.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netdb.h>
+
+#include <arpa/inet.h>
+
+#include "lib/nsrep.h"
+#include "lib/rplan.h"
+#include "lib/resolve.h"
+#include "lib/defines.h"
+#include "lib/generic/pack.h"
+#include "contrib/ucw/lib.h"
+
+/** Some built-in unfairness ... */
+#ifndef FAVOUR_IPV6
+#define FAVOUR_IPV6 20 /* 20ms bonus for v6 */
+#endif
+
+/** @internal Macro to set address structure. */
+#define ADDR_SET(sa, family, addr, len, port) do {\
+ memcpy(&sa ## _addr, (addr), (len)); \
+ sa ## _family = (family); \
+ sa ## _port = htons(port); \
+} while (0)
+
+/** Update nameserver representation with current name/address pair. */
+static void update_nsrep(struct kr_nsrep *ns, size_t pos, uint8_t *addr, size_t addr_len, int port)
+{
+ if (addr == NULL) {
+ ns->addr[pos].ip.sa_family = AF_UNSPEC;
+ return;
+ }
+
+ /* Rotate previous addresses to the right. */
+ memmove(ns->addr + pos + 1, ns->addr + pos, (KR_NSREP_MAXADDR - pos - 1) * sizeof(ns->addr[0]));
+
+ switch(addr_len) {
+ case sizeof(struct in_addr):
+ ADDR_SET(ns->addr[pos].ip4.sin, AF_INET, addr, addr_len, port); break;
+ case sizeof(struct in6_addr):
+ ADDR_SET(ns->addr[pos].ip6.sin6, AF_INET6, addr, addr_len, port); break;
+ default: assert(0); break;
+ }
+}
+
+static void update_nsrep_set(struct kr_nsrep *ns, const knot_dname_t *name, uint8_t *addr[], unsigned score)
+{
+ /* NSLIST is not empty, empty NS cannot be a leader. */
+ if (!addr[0] && ns->addr[0].ip.sa_family != AF_UNSPEC) {
+ return;
+ }
+ /* Set new NS leader */
+ ns->name = name;
+ ns->score = score;
+ for (size_t i = 0; i < KR_NSREP_MAXADDR; ++i) {
+ if (addr[i]) {
+ void *addr_val = pack_obj_val(addr[i]);
+ size_t len = pack_obj_len(addr[i]);
+ update_nsrep(ns, i, addr_val, len, KR_DNS_PORT);
+ } else {
+ break;
+ }
+ }
+}
+
+#undef ADDR_SET
+
+/**
+ * \param addr_set pack with one IP address per element */
+static unsigned eval_addr_set(const pack_t *addr_set, struct kr_context *ctx,
+ struct kr_qflags opts, unsigned score, uint8_t *addr[])
+{
+ kr_nsrep_rtt_lru_t *rtt_cache = ctx->cache_rtt;
+ kr_nsrep_rtt_lru_entry_t *rtt_cache_entry_ptr[KR_NSREP_MAXADDR] = { NULL, };
+ assert (KR_NSREP_MAXADDR >= 2);
+ unsigned rtt_cache_entry_score[KR_NSREP_MAXADDR] = { score, KR_NS_MAX_SCORE + 1, };
+ uint64_t now = kr_now();
+
+ /* Name server is better candidate if it has address record. */
+ for (uint8_t *it = pack_head(*addr_set); it != pack_tail(*addr_set);
+ it = pack_obj_next(it)) {
+ void *val = pack_obj_val(it);
+ size_t len = pack_obj_len(it);
+ unsigned favour = 0;
+ bool is_valid = false;
+ /* Check if the address isn't disabled. */
+ if (len == sizeof(struct in6_addr)) {
+ is_valid = !(opts.NO_IPV6);
+ favour = FAVOUR_IPV6;
+ } else if (len == sizeof(struct in_addr)) {
+ is_valid = !(opts.NO_IPV4);
+ } else {
+ assert(!EINVAL);
+ is_valid = false;
+ }
+
+ if (!is_valid) {
+ continue;
+ }
+
+ /* Get score for the current address. */
+ kr_nsrep_rtt_lru_entry_t *cached = rtt_cache ?
+ lru_get_try(rtt_cache, val, len) :
+ NULL;
+ unsigned cur_addr_score = KR_NS_GLUED;
+ if (cached) {
+ cur_addr_score = cached->score;
+ if (cached->score >= KR_NS_TIMEOUT) {
+ /* If NS once was marked as "timeouted",
+ * it won't participate in NS elections
+ * at least ctx->cache_rtt_tout_retry_interval milliseconds. */
+ uint64_t elapsed = now - cached->tout_timestamp;
+ elapsed = elapsed > UINT_MAX ? UINT_MAX : elapsed;
+ if (elapsed > ctx->cache_rtt_tout_retry_interval) {
+ /* Select this NS for probing in this particular query,
+ * but don't change the cached score.
+ * For other queries this NS will remain "timeouted". */
+ cur_addr_score = KR_NS_LONG - 1;
+ }
+ }
+ }
+
+ for (size_t i = 0; i < KR_NSREP_MAXADDR; ++i) {
+ if (cur_addr_score >= KR_NS_TIMEOUT) {
+ /* We can't use favour here.
+ * If all of the conditions below are true
+ *
+ * rtt_cache_entry_score[i] < KR_NS_TIMEOUT
+ * rtt_cache_entry_score[i] + favour > KR_NS_TIMEOUT
+ * cur_addr_score < rtt_cache_entry_score[i] + favour
+ *
+ * we will prefer "certainly dead" cur_addr_score
+ * instead of "almost dead, but alive" rtt_cache_entry_score[i]
+ */
+ if (cur_addr_score >= rtt_cache_entry_score[i]) {
+ continue;
+ }
+ }
+ if (cur_addr_score >= KR_NS_TIMEOUT
+ || cur_addr_score < rtt_cache_entry_score[i] + favour) {
+ /* Shake down previous contenders */
+ for (size_t j = KR_NSREP_MAXADDR - 1; j > i; --j) {
+ addr[j] = addr[j - 1];
+ rtt_cache_entry_ptr[j] = rtt_cache_entry_ptr[j - 1];
+ rtt_cache_entry_score[j] = rtt_cache_entry_score[j - 1];
+ }
+ addr[i] = it;
+ rtt_cache_entry_score[i] = cur_addr_score;
+ rtt_cache_entry_ptr[i] = cached;
+ break;
+ }
+ }
+ }
+
+ /* At this point, rtt_cache_entry_ptr contains up to KR_NSREP_MAXADDR
+ * pointers to the rtt cache entries with the best scores for the given addr_set.
+ * Check if there are timeouted NS. */
+
+ for (size_t i = 0; i < KR_NSREP_MAXADDR; ++i) {
+ if (rtt_cache_entry_ptr[i] == NULL)
+ continue;
+ if (rtt_cache_entry_ptr[i]->score < KR_NS_TIMEOUT)
+ continue;
+
+ uint64_t elapsed = now - rtt_cache_entry_ptr[i]->tout_timestamp;
+ elapsed = elapsed > UINT_MAX ? UINT_MAX : elapsed;
+ if (elapsed <= ctx->cache_rtt_tout_retry_interval)
+ continue;
+
+ /* rtt_cache_entry_ptr[i] points to "timeouted" rtt cache entry.
+ * The period of the ban on participation in elections has expired. */
+
+ if (VERBOSE_STATUS) {
+ void *val = pack_obj_val(addr[i]);
+ size_t len = pack_obj_len(addr[i]);
+ char sa_str[INET6_ADDRSTRLEN];
+ int af = (len == sizeof(struct in6_addr)) ? AF_INET6 : AF_INET;
+ inet_ntop(af, val, sa_str, sizeof(sa_str));
+ kr_log_verbose("[ ][nsre] probing timeouted NS: %s, score %i\n",
+ sa_str, rtt_cache_entry_ptr[i]->score);
+ }
+
+ rtt_cache_entry_ptr[i]->tout_timestamp = now;
+ }
+
+ return rtt_cache_entry_score[0];
+}
+
+static int eval_nsrep(const knot_dname_t *owner, const pack_t *addr_set, struct kr_query *qry)
+{
+ struct kr_nsrep *ns = &qry->ns;
+ struct kr_context *ctx = ns->ctx;
+ unsigned score = KR_NS_MAX_SCORE;
+ unsigned reputation = 0;
+ uint8_t *addr_choice[KR_NSREP_MAXADDR] = { NULL, };
+
+ /* Fetch NS reputation */
+ if (ctx->cache_rep) {
+ unsigned *cached = lru_get_try(ctx->cache_rep, (const char *)owner,
+ knot_dname_size(owner));
+ if (cached) {
+ reputation = *cached;
+ }
+ }
+
+ /* Favour nameservers with unknown addresses to probe them,
+ * otherwise discover the current best address for the NS. */
+ if (addr_set->len == 0) {
+ score = KR_NS_UNKNOWN;
+ /* If the server doesn't have IPv6, give it disadvantage. */
+ if (reputation & KR_NS_NOIP6) {
+ score += FAVOUR_IPV6;
+ /* If the server is unknown but has rep record, treat it as timeouted */
+ if (reputation & KR_NS_NOIP4) {
+ score = KR_NS_UNKNOWN;
+ /* Try to start with clean slate */
+ if (!(qry->flags.NO_IPV6)) {
+ reputation &= ~KR_NS_NOIP6;
+ }
+ if (!(qry->flags.NO_IPV4)) {
+ reputation &= ~KR_NS_NOIP4;
+ }
+ }
+ }
+ } else {
+ score = eval_addr_set(addr_set, ctx, qry->flags, score, addr_choice);
+ }
+
+ /* Probabilistic bee foraging strategy (naive).
+ * The fastest NS is preferred by workers until it is depleted (timeouts or degrades),
+ * at the same time long distance scouts probe other sources (low probability).
+ * Servers on TIMEOUT will not have probed at all.
+ * Servers with score above KR_NS_LONG will have periodically removed from
+ * reputation cache, so that kresd can reprobe them. */
+ if (score >= KR_NS_TIMEOUT) {
+ return kr_ok();
+ } else if (score <= ns->score &&
+ (score < KR_NS_LONG || qry->flags.NO_THROTTLE)) {
+ update_nsrep_set(ns, owner, addr_choice, score);
+ ns->reputation = reputation;
+ } else if (kr_rand_coin(1, 10) &&
+ !kr_rand_coin(score, KR_NS_MAX_SCORE)) {
+ /* With 10% chance probe server with a probability
+ * given by its RTT / MAX_RTT. */
+ update_nsrep_set(ns, owner, addr_choice, score);
+ ns->reputation = reputation;
+ return 1; /* Stop evaluation */
+ } else if (ns->score > KR_NS_MAX_SCORE) {
+ /* Check if any server was already selected.
+ * If no, pick current server and continue evaluation. */
+ update_nsrep_set(ns, owner, addr_choice, score);
+ ns->reputation = reputation;
+ }
+
+ return kr_ok();
+}
+
+int kr_nsrep_set(struct kr_query *qry, size_t index, const struct sockaddr *sock)
+{
+ if (!qry) {
+ return kr_error(EINVAL);
+ }
+ if (index >= KR_NSREP_MAXADDR) {
+ return kr_error(ENOSPC);
+ }
+
+ if (!sock) {
+ qry->ns.name = (const uint8_t *)"";
+ qry->ns.addr[index].ip.sa_family = AF_UNSPEC;
+ return kr_ok();
+ }
+
+ switch (sock->sa_family) {
+ case AF_INET:
+ if (qry->flags.NO_IPV4) {
+ return kr_error(ENOENT);
+ }
+ qry->ns.addr[index].ip4 = *(const struct sockaddr_in *)sock;
+ break;
+ case AF_INET6:
+ if (qry->flags.NO_IPV6) {
+ return kr_error(ENOENT);
+ }
+ qry->ns.addr[index].ip6 = *(const struct sockaddr_in6 *)sock;
+ break;
+ default:
+ qry->ns.addr[index].ip.sa_family = AF_UNSPEC;
+ return kr_error(EINVAL);
+ }
+
+ qry->ns.name = (const uint8_t *)"";
+ /* Reset score on first entry */
+ if (index == 0) {
+ qry->ns.score = KR_NS_UNKNOWN;
+ qry->ns.reputation = 0;
+ }
+
+ /* Retrieve RTT from cache */
+ struct kr_context *ctx = qry->ns.ctx;
+ kr_nsrep_rtt_lru_entry_t *rtt_cache_entry = ctx
+ ? lru_get_try(ctx->cache_rtt, kr_inaddr(sock), kr_family_len(sock->sa_family))
+ : NULL;
+ if (rtt_cache_entry) {
+ qry->ns.score = MIN(qry->ns.score, rtt_cache_entry->score);
+ }
+
+ return kr_ok();
+}
+
+#define ELECT_INIT(ns, ctx_) do { \
+ (ns)->ctx = (ctx_); \
+ (ns)->addr[0].ip.sa_family = AF_UNSPEC; \
+ (ns)->reputation = 0; \
+ (ns)->score = KR_NS_MAX_SCORE + 1; \
+} while (0)
+
+int kr_nsrep_elect(struct kr_query *qry, struct kr_context *ctx)
+{
+ if (!qry || !ctx) {
+ //assert(!EINVAL);
+ return kr_error(EINVAL);
+ }
+
+ struct kr_nsrep *ns = &qry->ns;
+ ELECT_INIT(ns, ctx);
+
+ int ret = kr_ok();
+ trie_it_t *it;
+ for (it = trie_it_begin(qry->zone_cut.nsset); !trie_it_finished(it);
+ trie_it_next(it)) {
+ ret = eval_nsrep(/* we trust it's a correct dname */
+ (const knot_dname_t *)trie_it_key(it, NULL),
+ (const pack_t *)*trie_it_val(it), qry);
+ if (ret) break;
+ }
+ trie_it_free(it);
+
+ if (qry->ns.score <= KR_NS_MAX_SCORE && qry->ns.score >= KR_NS_LONG) {
+ /* This is a low-reliability probe,
+ * go with TCP to get ICMP reachability check. */
+ qry->flags.TCP = true;
+ }
+ return ret;
+}
+
+int kr_nsrep_elect_addr(struct kr_query *qry, struct kr_context *ctx)
+{
+ if (!qry || !ctx) {
+ //assert(!EINVAL);
+ return kr_error(EINVAL);
+ }
+
+ /* Get address list for this NS */
+ struct kr_nsrep *ns = &qry->ns;
+ ELECT_INIT(ns, ctx);
+ pack_t *addr_set = kr_zonecut_find(&qry->zone_cut, ns->name);
+ if (!addr_set) {
+ return kr_error(ENOENT);
+ }
+ /* Evaluate addr list */
+ uint8_t *addr_choice[KR_NSREP_MAXADDR] = { NULL, };
+ unsigned score = eval_addr_set(addr_set, ctx, qry->flags, ns->score, addr_choice);
+ update_nsrep_set(ns, ns->name, addr_choice, score);
+ return kr_ok();
+}
+
+#undef ELECT_INIT
+
+int kr_nsrep_update_rtt(struct kr_nsrep *ns, const struct sockaddr *addr,
+ unsigned score, kr_nsrep_rtt_lru_t *cache, int umode)
+{
+ if (!cache || umode > KR_NS_MAX || umode < 0) {
+ return kr_error(EINVAL);
+ }
+
+ /* Get `addr`, and later its raw string. */
+ if (addr) {
+ /* Caller provided specific address, OK. */
+ } else if (ns != NULL) {
+ addr = &ns->addr[0].ip;
+ } else {
+ assert(false && "kr_nsrep_update_rtt: don't know what address to update");
+ return kr_error(EINVAL);
+ }
+ const char *addr_in = kr_inaddr(addr);
+ size_t addr_len = kr_inaddr_len(addr);
+ if (!addr_in || addr_len <= 0) {
+ assert(false && "kr_nsrep_update_rtt: incorrect address");
+ return kr_error(EINVAL);
+ }
+
+ bool is_new_entry = false;
+ kr_nsrep_rtt_lru_entry_t *cur = lru_get_new(cache, addr_in, addr_len,
+ (&is_new_entry));
+ if (!cur) {
+ return kr_ok();
+ }
+ if (score <= KR_NS_GLUED) {
+ score = KR_NS_GLUED + 1;
+ }
+ /* If there's nothing to update, we reset it unless KR_NS_UPDATE_NORESET
+ * mode was requested. New items are zeroed by LRU automatically. */
+ if (is_new_entry && umode != KR_NS_UPDATE_NORESET) {
+ umode = KR_NS_RESET;
+ }
+ unsigned new_score = 0;
+ /* Update score, by default smooth over last two measurements. */
+ switch (umode) {
+ case KR_NS_UPDATE:
+ case KR_NS_UPDATE_NORESET:
+ new_score = (cur->score + score) / 2; break;
+ case KR_NS_RESET: new_score = score; break;
+ case KR_NS_ADD: new_score = MIN(KR_NS_MAX_SCORE - 1, cur->score + score); break;
+ case KR_NS_MAX: new_score = MAX(cur->score, score); break;
+ default: return kr_error(EINVAL);
+ }
+ /* Score limits */
+ if (new_score > KR_NS_MAX_SCORE) {
+ new_score = KR_NS_MAX_SCORE;
+ }
+ if (new_score >= KR_NS_TIMEOUT && cur->score < KR_NS_TIMEOUT) {
+ /* Set the timestamp only when NS became "timeouted" */
+ cur->tout_timestamp = kr_now();
+ }
+ cur->score = new_score;
+ return kr_ok();
+}
+
+int kr_nsrep_update_rep(struct kr_nsrep *ns, unsigned reputation, kr_nsrep_lru_t *cache)
+{
+ if (!ns || !cache ) {
+ return kr_error(EINVAL);
+ }
+
+ /* Store in the struct */
+ ns->reputation = reputation;
+ /* Store reputation in the LRU cache */
+ unsigned *cur = lru_get_new(cache, (const char *)ns->name,
+ knot_dname_size(ns->name), NULL);
+ if (cur) {
+ *cur = reputation;
+ }
+ return kr_ok();
+}
+
+int kr_nsrep_copy_set(struct kr_nsrep *dst, const struct kr_nsrep *src)
+{
+ if (!dst || !src ) {
+ return kr_error(EINVAL);
+ }
+
+ memcpy(dst, src, sizeof(struct kr_nsrep));
+ dst->name = (const uint8_t *)"";
+ dst->score = KR_NS_UNKNOWN;
+ dst->reputation = 0;
+
+ return kr_ok();
+}
+
+int kr_nsrep_sort(struct kr_nsrep *ns, struct kr_context *ctx)
+{
+ if (!ns || !ctx) {
+ assert(false);
+ return kr_error(EINVAL);
+ }
+
+ kr_nsrep_rtt_lru_t *rtt_cache = ctx->cache_rtt;
+
+ ns->reputation = 0;
+ ns->score = KR_NS_MAX_SCORE + 1;
+
+ if (ns->addr[0].ip.sa_family == AF_UNSPEC) {
+ return kr_error(EINVAL);
+ }
+
+ /* Compute the scores. Unfortunately there's no space for scores
+ * along the addresses. */
+ unsigned scores[KR_NSREP_MAXADDR];
+ int i;
+ bool timeouted_address_is_already_selected = false;
+ for (i = 0; i < KR_NSREP_MAXADDR; ++i) {
+ const struct sockaddr *sa = &ns->addr[i].ip;
+ if (sa->sa_family == AF_UNSPEC) {
+ break;
+ }
+ kr_nsrep_rtt_lru_entry_t *rtt_cache_entry = lru_get_try(rtt_cache,
+ kr_inaddr(sa),
+ kr_family_len(sa->sa_family));
+ if (!rtt_cache_entry) {
+ scores[i] = 1; /* prefer unknown to probe RTT */
+ } else if (rtt_cache_entry->score < KR_NS_FWD_TIMEOUT) {
+ /* some probability to bump bad ones up for re-probe */
+ scores[i] = rtt_cache_entry->score;
+ /* The lower the rtt, the more likely it will be selected. */
+ if (!kr_rand_coin(rtt_cache_entry->score, KR_NS_FWD_TIMEOUT)) {
+ scores[i] = 1;
+ }
+ } else {
+ uint64_t now = kr_now();
+ uint64_t elapsed = now - rtt_cache_entry->tout_timestamp;
+ scores[i] = KR_NS_MAX_SCORE + 1;
+ elapsed = elapsed > UINT_MAX ? UINT_MAX : elapsed;
+ if (elapsed > ctx->cache_rtt_tout_retry_interval &&
+ !timeouted_address_is_already_selected) {
+ scores[i] = 1;
+ rtt_cache_entry->tout_timestamp = now;
+ timeouted_address_is_already_selected = true;
+ }
+ }
+
+ /* Give advantage to IPv6. */
+ if (scores[i] <= KR_NS_MAX_SCORE && sa->sa_family == AF_INET) {
+ scores[i] += FAVOUR_IPV6;
+ }
+
+ if (VERBOSE_STATUS) {
+ kr_log_verbose("[ ][nsre] score %d for %s;\t cached RTT: %d\n",
+ scores[i], kr_straddr(sa),
+ rtt_cache_entry ? rtt_cache_entry->score : -1);
+ }
+ }
+
+ /* Select-sort the addresses. */
+ const int count = i;
+ for (i = 0; i < count - 1; ++i) {
+ /* find min from i onwards */
+ int min_i = i;
+ for (int j = i + 1; j < count; ++j) {
+ if (scores[j] < scores[min_i]) {
+ min_i = j;
+ }
+ }
+ /* swap the indices */
+ if (min_i != i) {
+ SWAP(scores[min_i], scores[i]);
+ SWAP(ns->addr[min_i], ns->addr[i]);
+ }
+ }
+
+ if (count > 0) {
+ ns->score = scores[0];
+ ns->reputation = 0;
+ }
+
+ return kr_ok();
+}