diff options
Diffstat (limited to 'lib/cache')
-rw-r--r-- | lib/cache/api.c | 889 | ||||
-rw-r--r-- | lib/cache/api.h | 201 | ||||
-rw-r--r-- | lib/cache/cdb_api.h | 68 | ||||
-rw-r--r-- | lib/cache/cdb_lmdb.c | 710 | ||||
-rw-r--r-- | lib/cache/cdb_lmdb.h | 23 | ||||
-rw-r--r-- | lib/cache/entry_list.c | 293 | ||||
-rw-r--r-- | lib/cache/entry_pkt.c | 224 | ||||
-rw-r--r-- | lib/cache/entry_rr.c | 146 | ||||
-rw-r--r-- | lib/cache/impl.h | 412 | ||||
-rw-r--r-- | lib/cache/knot_pkt.c | 106 | ||||
-rw-r--r-- | lib/cache/nsec1.c | 511 | ||||
-rw-r--r-- | lib/cache/nsec3.c | 501 | ||||
-rw-r--r-- | lib/cache/peek.c | 724 |
13 files changed, 4808 insertions, 0 deletions
diff --git a/lib/cache/api.c b/lib/cache/api.c new file mode 100644 index 0000000..c0591d6 --- /dev/null +++ b/lib/cache/api.c @@ -0,0 +1,889 @@ +/* Copyright (C) 2014-2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +#include <assert.h> +#include <errno.h> +#include <limits.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <time.h> +#include <unistd.h> + +#include <libknot/descriptor.h> +#include <libknot/dname.h> +#include <libknot/errcode.h> +#include <libknot/rrtype/rrsig.h> + +#include "contrib/cleanup.h" +#include "contrib/ucw/lib.h" +#include "lib/cache/api.h" +#include "lib/cache/cdb_lmdb.h" +#include "lib/defines.h" +#include "lib/generic/trie.h" +#include "lib/resolve.h" +#include "lib/rplan.h" +#include "lib/utils.h" + +#include "lib/cache/impl.h" + +/* TODO: + * - Reconsider when RRSIGs are put in and retrieved from the cache. + * Currently it's always done, which _might_ be spurious, depending + * on how kresd will use the returned result. + * There's also the "problem" that kresd ATM does _not_ ask upstream + * with DO bit in some cases. + */ + + +/** Cache version */ +static const uint16_t CACHE_VERSION = 5; +/** Key size */ +#define KEY_HSIZE (sizeof(uint8_t) + sizeof(uint16_t)) +#define KEY_SIZE (KEY_HSIZE + KNOT_DNAME_MAXLEN) + + +/** @internal Forward declarations of the implementation details + * \param optout[out] Set *optout = true; when encountering an opt-out NSEC3 (optional). */ +static ssize_t stash_rrset(struct kr_cache *cache, const struct kr_query *qry, + const knot_rrset_t *rr, const knot_rrset_t *rr_sigs, uint32_t timestamp, + uint8_t rank, trie_t *nsec_pmap, bool *has_optout); +/** Preliminary checks before stash_rrset(). Don't call if returns <= 0. */ +static int stash_rrset_precond(const knot_rrset_t *rr, const struct kr_query *qry/*logs*/); + +/** @internal Removes all records from cache. */ +static inline int cache_clear(struct kr_cache *cache) +{ + cache->stats.delete += 1; + return cache_op(cache, clear); +} + +/** @internal Open cache db transaction and check internal data version. */ +static int assert_right_version(struct kr_cache *cache) +{ + /* Check cache ABI version. */ + /* CACHE_KEY_DEF: to avoid collisions with kr_cache_match(). */ + uint8_t key_str[4] = "VERS"; + knot_db_val_t key = { .data = key_str, .len = sizeof(key_str) }; + knot_db_val_t val = { NULL, 0 }; + int ret = cache_op(cache, read, &key, &val, 1); + if (ret == 0 && val.len == sizeof(CACHE_VERSION) + && memcmp(val.data, &CACHE_VERSION, sizeof(CACHE_VERSION)) == 0) { + ret = kr_error(EEXIST); + } else { + int oldret = ret; + /* Version doesn't match. Recreate cache and write version key. */ + ret = cache_op(cache, count); + if (ret != 0) { /* Non-empty cache, purge it. */ + kr_log_info("[ ][cach] incompatible cache database detected, purging\n"); + if (oldret) { + kr_log_verbose("bad ret: %d\n", oldret); + } else if (val.len != sizeof(CACHE_VERSION)) { + kr_log_verbose("bad length: %d\n", (int)val.len); + } else { + uint16_t ver; + memcpy(&ver, val.data, sizeof(ver)); + kr_log_verbose("bad version: %d\n", (int)ver); + } + ret = cache_clear(cache); + } + /* Either purged or empty. */ + if (ret == 0) { + /* Key/Val is invalidated by cache purge, recreate it */ + val.data = /*const-cast*/(void *)&CACHE_VERSION; + val.len = sizeof(CACHE_VERSION); + ret = cache_op(cache, write, &key, &val, 1); + } + } + kr_cache_sync(cache); + return ret; +} + +int kr_cache_open(struct kr_cache *cache, const struct kr_cdb_api *api, struct kr_cdb_opts *opts, knot_mm_t *mm) +{ + if (!cache) { + return kr_error(EINVAL); + } + /* Open cache */ + if (!api) { + api = kr_cdb_lmdb(); + } + cache->api = api; + int ret = cache->api->open(&cache->db, opts, mm); + if (ret != 0) { + return ret; + } + memset(&cache->stats, 0, sizeof(cache->stats)); + cache->ttl_min = KR_CACHE_DEFAULT_TTL_MIN; + cache->ttl_max = KR_CACHE_DEFAULT_TTL_MAX; + /* Check cache ABI version */ + kr_cache_make_checkpoint(cache); + (void)assert_right_version(cache); + + char *fpath; + ret = asprintf(&fpath, "%s/data.mdb", opts->path); + if (ret > 0) { + kr_cache_emergency_file_to_remove = fpath; + } else { + assert(false); /* non-critical, but still */ + } + return 0; +} + +const char *kr_cache_emergency_file_to_remove = NULL; + + +#define cache_isvalid(cache) ((cache) && (cache)->api && (cache)->db) + +void kr_cache_close(struct kr_cache *cache) +{ + if (cache_isvalid(cache)) { + cache_op(cache, close); + cache->db = NULL; + } + free(/*const-cast*/(char*)kr_cache_emergency_file_to_remove); + kr_cache_emergency_file_to_remove = NULL; +} + +int kr_cache_sync(struct kr_cache *cache) +{ + if (!cache_isvalid(cache)) { + return kr_error(EINVAL); + } + if (cache->api->sync) { + return cache_op(cache, sync); + } + return kr_ok(); +} + +int kr_cache_insert_rr(struct kr_cache *cache, const knot_rrset_t *rr, const knot_rrset_t *rrsig, uint8_t rank, uint32_t timestamp) +{ + int err = stash_rrset_precond(rr, NULL); + if (err <= 0) { + return kr_ok(); + } + ssize_t written = stash_rrset(cache, NULL, rr, rrsig, timestamp, rank, NULL, NULL); + /* Zone's NSEC* parames aren't updated, but that's probably OK + * for kr_cache_insert_rr() */ + if (written >= 0) { + return kr_ok(); + } + + return (int) written; +} + +int kr_cache_clear(struct kr_cache *cache) +{ + if (!cache_isvalid(cache)) { + return kr_error(EINVAL); + } + int ret = cache_clear(cache); + if (ret == 0) { + kr_cache_make_checkpoint(cache); + ret = assert_right_version(cache); + } + return ret; +} + +/* When going stricter, BEWARE of breaking entry_h_consistent_NSEC() */ +struct entry_h * entry_h_consistent(knot_db_val_t data, uint16_t type) +{ + (void) type; /* unused, for now */ + if (!data.data) return NULL; + /* Length checks. */ + if (data.len < offsetof(struct entry_h, data)) + return NULL; + const struct entry_h *eh = data.data; + if (eh->is_packet) { + uint16_t pkt_len; + if (data.len < offsetof(struct entry_h, data) + sizeof(pkt_len)) { + return NULL; + } + memcpy(&pkt_len, eh->data, sizeof(pkt_len)); + if (data.len < offsetof(struct entry_h, data) + sizeof(pkt_len) + + pkt_len) { + return NULL; + } + } + + bool ok = true; + ok = ok && kr_rank_check(eh->rank); + ok = ok && (!kr_rank_test(eh->rank, KR_RANK_BOGUS) + || eh->is_packet); + ok = ok && (eh->is_packet || !eh->has_optout); + + return ok ? /*const-cast*/(struct entry_h *)eh : NULL; +} + +int32_t get_new_ttl(const struct entry_h *entry, const struct kr_query *qry, + const knot_dname_t *owner, uint16_t type, uint32_t now) +{ + int32_t diff = now - entry->time; + if (diff < 0) { + /* We may have obtained the record *after* the request started. */ + diff = 0; + } + int32_t res = entry->ttl - diff; + if (res < 0 && owner && qry && qry->stale_cb) { + /* Stale-serving decision, delegated to a callback. */ + int res_stale = qry->stale_cb(res, owner, type, qry); + if (res_stale >= 0) + return res_stale; + } + return res; +} + +int32_t kr_cache_ttl(const struct kr_cache_p *peek, const struct kr_query *qry, + const knot_dname_t *name, uint16_t type) +{ + const struct entry_h *eh = peek->raw_data; + return get_new_ttl(eh, qry, name, type, qry->timestamp.tv_sec); +} + +/** Check that no label contains a zero character, incl. a log trace. + * + * We refuse to work with those, as LF and our cache keys might become ambiguous. + * Assuming uncompressed name, as usual. + * CACHE_KEY_DEF + */ +static bool check_dname_for_lf(const knot_dname_t *n, const struct kr_query *qry/*logging*/) +{ + const bool ret = knot_dname_size(n) == strlen((const char *)n) + 1; + if (!ret) { WITH_VERBOSE(qry) { + auto_free char *n_str = kr_dname_text(n); + VERBOSE_MSG(qry, "=> skipping zero-containing name %s\n", n_str); + } } + return ret; +} + +/** Return false on types to be ignored. Meant both for sname and direct cache requests. */ +static bool check_rrtype(uint16_t type, const struct kr_query *qry/*logging*/) +{ + const bool ret = !knot_rrtype_is_metatype(type) + && type != KNOT_RRTYPE_RRSIG; + if (!ret) { WITH_VERBOSE(qry) { + auto_free char *type_str = kr_rrtype_text(type); + VERBOSE_MSG(qry, "=> skipping RR type %s\n", type_str); + } } + return ret; +} + +/** Like key_exact_type() but omits a couple checks not holding for pkt cache. */ +knot_db_val_t key_exact_type_maypkt(struct key *k, uint16_t type) +{ + assert(check_rrtype(type, NULL)); + switch (type) { + case KNOT_RRTYPE_RRSIG: /* no RRSIG query caching, at least for now */ + assert(false); + return (knot_db_val_t){ NULL, 0 }; + /* xNAME lumped into NS. */ + case KNOT_RRTYPE_CNAME: + case KNOT_RRTYPE_DNAME: + type = KNOT_RRTYPE_NS; + default: + break; + } + + int name_len = k->buf[0]; + k->buf[name_len + 1] = 0; /* make sure different names can never match */ + k->buf[name_len + 2] = 'E'; /* tag for exact name+type matches */ + memcpy(k->buf + name_len + 3, &type, 2); + k->type = type; + /* CACHE_KEY_DEF: key == dname_lf + '\0' + 'E' + RRTYPE */ + return (knot_db_val_t){ k->buf + 1, name_len + 4 }; +} + + +/** The inside for cache_peek(); implementation separated to ./peek.c */ +int peek_nosync(kr_layer_t *ctx, knot_pkt_t *pkt); +/** function for .produce phase */ +int cache_peek(kr_layer_t *ctx, knot_pkt_t *pkt) +{ + struct kr_request *req = ctx->req; + struct kr_query *qry = req->current_query; + /* We first check various exit-conditions and then call the _real function. */ + + if (!kr_cache_is_open(&req->ctx->cache) + || ctx->state & (KR_STATE_FAIL|KR_STATE_DONE) || qry->flags.NO_CACHE + || (qry->flags.CACHE_TRIED && !qry->stale_cb) + || !check_rrtype(qry->stype, qry) /* LATER: some other behavior for some of these? */ + || qry->sclass != KNOT_CLASS_IN) { + return ctx->state; /* Already resolved/failed or already tried, etc. */ + } + /* ATM cache only peeks for qry->sname and that would be useless + * to repeat on every iteration, so disable it from now on. + * LATER(optim.): assist with more precise QNAME minimization. */ + qry->flags.CACHE_TRIED = true; + + if (qry->stype == KNOT_RRTYPE_NSEC) { + VERBOSE_MSG(qry, "=> skipping stype NSEC\n"); + return ctx->state; + } + if (!check_dname_for_lf(qry->sname, qry)) { + return ctx->state; + } + + int ret = peek_nosync(ctx, pkt); + kr_cache_sync(&req->ctx->cache); + return ret; +} + + + +/** It's simply inside of cycle taken out to decrease indentation. \return error code. */ +static int stash_rrarray_entry(ranked_rr_array_t *arr, int arr_i, + const struct kr_query *qry, struct kr_cache *cache, + int *unauth_cnt, trie_t *nsec_pmap, bool *has_optout); +/** Stash a single nsec_p. \return 0 (errors are ignored). */ +static int stash_nsec_p(const knot_dname_t *dname, const char *nsec_p_v, + struct kr_request *req); + +/** The whole .consume phase for the cache module. */ +int cache_stash(kr_layer_t *ctx, knot_pkt_t *pkt) +{ + struct kr_request *req = ctx->req; + struct kr_query *qry = req->current_query; + struct kr_cache *cache = &req->ctx->cache; + + /* Note: we cache even in KR_STATE_FAIL. For example, + * BOGUS answer can go to +cd cache even without +cd request. */ + if (!kr_cache_is_open(cache) || !qry + || qry->flags.CACHED || !check_rrtype(knot_pkt_qtype(pkt), qry) + || qry->sclass != KNOT_CLASS_IN) { + return ctx->state; + } + /* Do not cache truncated answers, at least for now. LATER */ + if (knot_wire_get_tc(pkt->wire)) { + return ctx->state; + } + /* Stash individual records. */ + ranked_rr_array_t *selected[] = kr_request_selected(req); + int unauth_cnt = 0; + trie_t *nsec_pmap = trie_create(&req->pool); + if (!nsec_pmap) { + assert(!ENOMEM); + goto finally; + } + bool has_optout = false; + /* ^^ DNSSEC_OPTOUT is not fired in cases like `com. A`, + * but currently we don't stash separate NSEC3 proving that. */ + for (int psec = KNOT_ANSWER; psec <= KNOT_ADDITIONAL; ++psec) { + ranked_rr_array_t *arr = selected[psec]; + /* uncached entries are located at the end */ + for (ssize_t i = arr->len - 1; i >= 0; --i) { + ranked_rr_array_entry_t *entry = arr->at[i]; + if (entry->qry_uid != qry->uid) { + continue; + /* TODO: probably safe to break but maybe not worth it */ + } + int ret = stash_rrarray_entry(arr, i, qry, cache, &unauth_cnt, + nsec_pmap, &has_optout); + if (ret) { + VERBOSE_MSG(qry, "=> stashing RRs errored out\n"); + goto finally; + } + /* LATER(optim.): maybe filter out some type-rank combinations + * that won't be useful as separate RRsets. */ + } + } + + trie_it_t *it; + for (it = trie_it_begin(nsec_pmap); !trie_it_finished(it); trie_it_next(it)) { + stash_nsec_p((const knot_dname_t *)trie_it_key(it, NULL), + (const char *)*trie_it_val(it), req); + } + trie_it_free(it); + /* LATER(optim.): typically we also have corresponding NS record in the list, + * so we might save a cache operation. */ + + stash_pkt(pkt, qry, req, has_optout); + +finally: + if (unauth_cnt) { + VERBOSE_MSG(qry, "=> stashed also %d nonauth RRsets\n", unauth_cnt); + }; + kr_cache_sync(cache); + return ctx->state; /* we ignore cache-stashing errors */ +} + +/** Preliminary checks before stash_rrset(). Don't call if returns <= 0. */ +static int stash_rrset_precond(const knot_rrset_t *rr, const struct kr_query *qry/*logs*/) +{ + if (!rr || rr->rclass != KNOT_CLASS_IN) { + assert(!EINVAL); + return kr_error(EINVAL); + } + if (!check_rrtype(rr->type, qry)) { + return kr_ok(); + } + if (!check_dname_for_lf(rr->owner, qry)) { + return kr_ok(); + } + return 1/*proceed*/; +} + +static ssize_t stash_rrset(struct kr_cache *cache, const struct kr_query *qry, + const knot_rrset_t *rr, const knot_rrset_t *rr_sigs, uint32_t timestamp, + uint8_t rank, trie_t *nsec_pmap, bool *has_optout) +{ + assert(stash_rrset_precond(rr, qry) > 0); + if (!cache) { + assert(!EINVAL); + return kr_error(EINVAL); + } + + const int wild_labels = rr_sigs == NULL ? 0 : + knot_dname_labels(rr->owner, NULL) - knot_rrsig_labels(rr_sigs->rrs.rdata); + if (wild_labels < 0) { + return kr_ok(); + } + const knot_dname_t *encloser = rr->owner; /**< the closest encloser name */ + for (int i = 0; i < wild_labels; ++i) { + encloser = knot_wire_next_label(encloser, NULL); + } + int ret = 0; + + /* Construct the key under which RRs will be stored, + * and add corresponding nsec_pmap item (if necessary). */ + struct key k_storage, *k = &k_storage; + knot_db_val_t key; + switch (rr->type) { + case KNOT_RRTYPE_NSEC3: + /* Skip "suspicious" or opt-out NSEC3 sets. */ + if (rr->rrs.count != 1) return kr_ok(); + if (KNOT_NSEC3_FLAG_OPT_OUT & knot_nsec3_flags(rr->rrs.rdata)) { + if (has_optout) *has_optout = true; + return kr_ok(); + } + /* fall through */ + case KNOT_RRTYPE_NSEC: + if (!kr_rank_test(rank, KR_RANK_SECURE)) { + /* Skip any NSEC*s that aren't validated. */ + return kr_ok(); + } + if (!rr_sigs || !rr_sigs->rrs.count || !rr_sigs->rrs.rdata) { + assert(!EINVAL); + return kr_error(EINVAL); + } + const knot_dname_t *signer = knot_rrsig_signer_name(rr_sigs->rrs.rdata); + const int signer_size = knot_dname_size(signer); + k->zlf_len = signer_size - 1; + + void **npp = nsec_pmap == NULL ? NULL + : trie_get_ins(nsec_pmap, (const char *)signer, signer_size); + assert(!nsec_pmap || (npp && ENOMEM)); + if (rr->type == KNOT_RRTYPE_NSEC) { + key = key_NSEC1(k, encloser, wild_labels); + break; + } + + assert(rr->type == KNOT_RRTYPE_NSEC3); + const knot_rdata_t * const rdata = rr->rrs.rdata; + if (rdata->len <= 4) return kr_error(EILSEQ); /*< data from outside; less trust */ + const int np_dlen = nsec_p_rdlen(rdata->data); + if (np_dlen > rdata->len) return kr_error(EILSEQ); + key = key_NSEC3(k, encloser, nsec_p_mkHash(rdata->data)); + if (npp && !*npp) { + *npp = mm_alloc(&qry->request->pool, np_dlen); + if (!*npp) { + assert(!ENOMEM); + break; + } + memcpy(*npp, rdata->data, np_dlen); + } + break; + default: + ret = kr_dname_lf(k->buf, encloser, wild_labels); + if (ret) { + assert(!ret); + return kr_error(ret); + } + key = key_exact_type(k, rr->type); + } + + /* Compute materialized sizes of the new data. */ + const knot_rdataset_t *rds_sigs = rr_sigs ? &rr_sigs->rrs : NULL; + const int rr_ssize = rdataset_dematerialize_size(&rr->rrs); + assert(rr_ssize == to_even(rr_ssize)); + knot_db_val_t val_new_entry = { + .data = NULL, + .len = offsetof(struct entry_h, data) + rr_ssize + + rdataset_dematerialize_size(rds_sigs), + }; + + /* Prepare raw memory for the new entry. */ + ret = entry_h_splice(&val_new_entry, rank, key, k->type, rr->type, + rr->owner, qry, cache, timestamp); + if (ret) return kr_ok(); /* some aren't really errors */ + assert(val_new_entry.data); + + const uint32_t ttl = rr->ttl; + /* FIXME: consider TTLs and expirations of RRSIGs as well, just in case. */ + + /* Write the entry itself. */ + struct entry_h *eh = val_new_entry.data; + memset(eh, 0, offsetof(struct entry_h, data)); + eh->time = timestamp; + eh->ttl = MAX(MIN(ttl, cache->ttl_max), cache->ttl_min); + eh->rank = rank; + if (rdataset_dematerialize(&rr->rrs, eh->data) + || rdataset_dematerialize(rds_sigs, eh->data + rr_ssize)) { + /* minimize the damage from incomplete write; TODO: better */ + eh->time = 0; + eh->ttl = 0; + eh->rank = 0; + assert(false); + } + assert(entry_h_consistent(val_new_entry, rr->type)); + + #if 0 /* Occasionally useful when debugging some kinds of changes. */ + { + kr_cache_sync(cache); + knot_db_val_t val = { NULL, 0 }; + ret = cache_op(cache, read, &key, &val, 1); + if (ret != kr_error(ENOENT)) { // ENOENT might happen in some edge case, I guess + assert(!ret); + entry_list_t el; + entry_list_parse(val, el); + } + } + #endif + + /* Update metrics */ + cache->stats.insert += 1; + + /* Verbose-log some not-too-common cases. */ + WITH_VERBOSE(qry) { if (kr_rank_test(rank, KR_RANK_AUTH) + || rr->type == KNOT_RRTYPE_NS) { + auto_free char *type_str = kr_rrtype_text(rr->type), + *encl_str = kr_dname_text(encloser); + VERBOSE_MSG(qry, "=> stashed %s%s %s, rank 0%.2o, " + "%d B total, incl. %d RRSIGs\n", + (wild_labels ? "*." : ""), encl_str, type_str, rank, + (int)val_new_entry.len, (rr_sigs ? rr_sigs->rrs.count : 0) + ); + } } + + return (ssize_t) val_new_entry.len; +} + +static int stash_rrarray_entry(ranked_rr_array_t *arr, int arr_i, + const struct kr_query *qry, struct kr_cache *cache, + int *unauth_cnt, trie_t *nsec_pmap, bool *has_optout) +{ + ranked_rr_array_entry_t *entry = arr->at[arr_i]; + if (entry->cached) { + return kr_ok(); + } + const knot_rrset_t *rr = entry->rr; + if (rr->type == KNOT_RRTYPE_RRSIG) { + return kr_ok(); /* reduce verbose logging from the following call */ + } + int ret = stash_rrset_precond(rr, qry); + if (ret <= 0) { + return ret; + } + + /* Try to find corresponding signatures, always. LATER(optim.): speed. */ + ranked_rr_array_entry_t *entry_rrsigs = NULL; + const knot_rrset_t *rr_sigs = NULL; + for (ssize_t j = arr->len - 1; j >= 0; --j) { + /* TODO: ATM we assume that some properties are the same + * for all RRSIGs in the set (esp. label count). */ + ranked_rr_array_entry_t *e = arr->at[j]; + bool ok = e->qry_uid == qry->uid && !e->cached + && e->rr->type == KNOT_RRTYPE_RRSIG + && knot_rrsig_type_covered(e->rr->rrs.rdata) == rr->type + && knot_dname_is_equal(rr->owner, e->rr->owner); + if (!ok) continue; + entry_rrsigs = e; + rr_sigs = e->rr; + break; + } + + ssize_t written = stash_rrset(cache, qry, rr, rr_sigs, qry->timestamp.tv_sec, + entry->rank, nsec_pmap, has_optout); + if (written < 0) { + kr_log_error("[%05u.%02u][cach] stash failed, ret = %d\n", qry->request->uid, + qry->uid, ret); + return (int) written; + } + + if (written > 0) { + /* Mark entry as cached for the rest of the query processing */ + entry->cached = true; + if (entry_rrsigs) { + entry_rrsigs->cached = true; + } + if (!kr_rank_test(entry->rank, KR_RANK_AUTH) && rr->type != KNOT_RRTYPE_NS) { + *unauth_cnt += 1; + } + } + + return kr_ok(); +} + +static int stash_nsec_p(const knot_dname_t *dname, const char *nsec_p_v, + struct kr_request *req) +{ + const struct kr_query *qry = req->current_query; + struct kr_cache *cache = &req->ctx->cache; + uint32_t valid_until = qry->timestamp.tv_sec + cache->ttl_max; + /* LATER(optim.): be more precise here ^^ and reduce calls. */ + static const int32_t ttl_margin = 3600; + const uint8_t *nsec_p = (const uint8_t *)nsec_p_v; + int data_stride = sizeof(valid_until) + nsec_p_rdlen(nsec_p); + + unsigned int log_hash = 0xFeeeFeee; /* this type is simpler for printf args */ + auto_free char *log_dname = NULL; + WITH_VERBOSE(qry) { + log_hash = nsec_p_v ? nsec_p_mkHash((const uint8_t *)nsec_p_v) : 0; + log_dname = kr_dname_text(dname); + } + /* Find what's in the cache. */ + struct key k_storage, *k = &k_storage; + int ret = kr_dname_lf(k->buf, dname, false); + if (ret) return kr_error(ret); + knot_db_val_t key = key_exact_type(k, KNOT_RRTYPE_NS); + knot_db_val_t val_orig = { NULL, 0 }; + ret = cache_op(cache, read, &key, &val_orig, 1); + if (ret && ret != -ABS(ENOENT)) { + VERBOSE_MSG(qry, "=> EL read failed (ret: %d)\n", ret); + return kr_ok(); + } + /* Prepare new entry_list_t so we can just write at el[0]. */ + entry_list_t el; + int log_refresh_by = 0; + if (ret == -ABS(ENOENT)) { + memset(el, 0, sizeof(el)); + } else { + ret = entry_list_parse(val_orig, el); + if (ret) { + VERBOSE_MSG(qry, "=> EL parse failed (ret: %d)\n", ret); + return kr_error(0); + } + /* Find the index to replace. */ + int i_replace = ENTRY_APEX_NSECS_CNT - 1; + for (int i = 0; i < ENTRY_APEX_NSECS_CNT; ++i) { + if (el[i].len != data_stride) continue; + if (nsec_p && memcmp(nsec_p, (uint8_t *)el[i].data + sizeof(uint32_t), + data_stride - sizeof(uint32_t)) != 0) { + continue; + } + /* Save a cache operation if TTL extended only a little. */ + uint32_t valid_orig; + memcpy(&valid_orig, el[i].data, sizeof(valid_orig)); + const int32_t ttl_extended_by = valid_until - valid_orig; + if (ttl_extended_by < ttl_margin) { + VERBOSE_MSG(qry, + "=> nsec_p stash for %s skipped (extra TTL: %d, hash: %x)\n", + log_dname, ttl_extended_by, log_hash); + return kr_ok(); + } + i_replace = i; + log_refresh_by = ttl_extended_by; + break; + } + /* Shift the other indices: move the first `i_replace` blocks + * by one position. */ + if (i_replace) { + memmove(&el[1], &el[0], sizeof(el[0]) * i_replace); + } + } + /* Prepare old data into a buffer. See entry_h_splice() for why. LATER(optim.) */ + el[0].len = data_stride; + el[0].data = NULL; + knot_db_val_t val; + val.len = entry_list_serial_size(el), + val.data = mm_alloc(&req->pool, val.len), + entry_list_memcpy(val.data, el); + /* Prepare the new data chunk */ + memcpy(el[0].data, &valid_until, sizeof(valid_until)); + if (nsec_p) { + memcpy((uint8_t *)el[0].data + sizeof(valid_until), nsec_p, + data_stride - sizeof(valid_until)); + } + /* Write it all to the cache */ + ret = cache_op(cache, write, &key, &val, 1); + if (ret || !val.data) { + VERBOSE_MSG(qry, "=> EL write failed (ret: %d)\n", ret); + return kr_ok(); + } + if (log_refresh_by) { + VERBOSE_MSG(qry, "=> nsec_p stashed for %s (refresh by %d, hash: %x)\n", + log_dname, log_refresh_by, log_hash); + } else { + VERBOSE_MSG(qry, "=> nsec_p stashed for %s (new, hash: %x)\n", + log_dname, log_hash); + } + return kr_ok(); +} + + +static int peek_exact_real(struct kr_cache *cache, const knot_dname_t *name, uint16_t type, + struct kr_cache_p *peek) +{ + if (!check_rrtype(type, NULL) || !check_dname_for_lf(name, NULL)) { + return kr_error(ENOTSUP); + } + struct key k_storage, *k = &k_storage; + + int ret = kr_dname_lf(k->buf, name, false); + if (ret) return kr_error(ret); + + knot_db_val_t key = key_exact_type(k, type); + knot_db_val_t val = { NULL, 0 }; + ret = cache_op(cache, read, &key, &val, 1); + if (!ret) ret = entry_h_seek(&val, type); + if (ret) return kr_error(ret); + + const struct entry_h *eh = entry_h_consistent(val, type); + if (!eh || eh->is_packet) { + // TODO: no packets, but better get rid of whole kr_cache_peek_exact(). + return kr_error(ENOENT); + } + *peek = (struct kr_cache_p){ + .time = eh->time, + .ttl = eh->ttl, + .rank = eh->rank, + .raw_data = val.data, + .raw_bound = knot_db_val_bound(val), + }; + return kr_ok(); +} +int kr_cache_peek_exact(struct kr_cache *cache, const knot_dname_t *name, uint16_t type, + struct kr_cache_p *peek) +{ /* Just wrap with extra verbose logging. */ + const int ret = peek_exact_real(cache, name, type, peek); + if (false && VERBOSE_STATUS) { /* too noisy for usual --verbose */ + auto_free char *type_str = kr_rrtype_text(type), + *name_str = kr_dname_text(name); + const char *result_str = (ret == kr_ok() ? "hit" : + (ret == kr_error(ENOENT) ? "miss" : "error")); + VERBOSE_MSG(NULL, "_peek_exact: %s %s %s (ret: %d)", + type_str, name_str, result_str, ret); + } + return ret; +} + +int kr_cache_remove(struct kr_cache *cache, const knot_dname_t *name, uint16_t type) +{ + if (!cache_isvalid(cache)) { + return kr_error(EINVAL); + } + if (!cache->api->remove) { + return kr_error(ENOSYS); + } + struct key k_storage, *k = &k_storage; + int ret = kr_dname_lf(k->buf, name, false); + if (ret) return kr_error(ret); + + knot_db_val_t key = key_exact_type(k, type); + return cache_op(cache, remove, &key, 1); +} + +int kr_cache_match(struct kr_cache *cache, const knot_dname_t *name, + bool exact_name, knot_db_val_t keyval[][2], int maxcount) +{ + if (!cache_isvalid(cache)) { + return kr_error(EINVAL); + } + if (!cache->api->match) { + return kr_error(ENOSYS); + } + + struct key k_storage, *k = &k_storage; + + int ret = kr_dname_lf(k->buf, name, false); + if (ret) return kr_error(ret); + + // use a mock type + knot_db_val_t key = key_exact_type(k, KNOT_RRTYPE_A); + /* CACHE_KEY_DEF */ + key.len -= sizeof(uint16_t); /* the type */ + if (!exact_name) { + key.len -= 2; /* '\0' 'E' */ + if (name[0] == '\0') ++key.len; /* the root name is special ATM */ + } + return cache_op(cache, match, &key, keyval, maxcount); +} + +int kr_unpack_cache_key(knot_db_val_t key, knot_dname_t *buf, uint16_t *type) +{ + if (key.data == NULL || buf == NULL || type == NULL) { + return kr_error(EINVAL); + } + + int len = -1; + const char *tag, *key_data = key.data; + for (tag = key_data + 1; tag < key_data + key.len; ++tag) { + /* CACHE_KEY_DEF */ + if (tag[-1] == '\0' && (tag == key_data + 1 || tag[-2] == '\0')) { + if (tag[0] != 'E') return kr_error(EINVAL); + len = tag - 1 - key_data; + break; + } + } + + if (len == -1 || len > KNOT_DNAME_MAXLEN) { + return kr_error(EINVAL); + } + + int ret = knot_dname_lf2wire(buf, len, key.data); + if (ret < 0) { + return kr_error(ret); + } + + /* CACHE_KEY_DEF: jump over "\0 E/1" */ + memcpy(type, tag + 1, sizeof(uint16_t)); + + return kr_ok(); +} + + +int kr_cache_remove_subtree(struct kr_cache *cache, const knot_dname_t *name, + bool exact_name, int maxcount) +{ + if (!cache_isvalid(cache)) { + return kr_error(EINVAL); + } + + knot_db_val_t keyval[maxcount][2], keys[maxcount]; + int ret = kr_cache_match(cache, name, exact_name, keyval, maxcount); + if (ret <= 0) { /* ENOENT -> nothing to remove */ + return (ret == KNOT_ENOENT) ? 0 : ret; + } + const int count = ret; + /* Duplicate the key strings, as deletion may invalidate the pointers. */ + int i; + for (i = 0; i < count; ++i) { + keys[i].len = keyval[i][0].len; + keys[i].data = malloc(keys[i].len); + if (!keys[i].data) { + ret = kr_error(ENOMEM); + goto cleanup; + } + memcpy(keys[i].data, keyval[i][0].data, keys[i].len); + } + ret = cache->api->remove(cache->db, keys, count); +cleanup: + kr_cache_sync(cache); /* Sync even after just kr_cache_match(). */ + /* Free keys */ + while (--i >= 0) { + free(keys[i].data); + } + return ret; +} + diff --git a/lib/cache/api.h b/lib/cache/api.h new file mode 100644 index 0000000..61bf796 --- /dev/null +++ b/lib/cache/api.h @@ -0,0 +1,201 @@ +/* Copyright (C) 2014-2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +#pragma once + +#include <libknot/consts.h> +#include <libknot/rrset.h> +#include <sys/time.h> +#include "lib/cache/cdb_api.h" +#include "lib/defines.h" +#include "contrib/ucw/config.h" /*uint*/ + +/** When knot_pkt is passed from cache without ->wire, this is the ->size. */ +static const size_t PKT_SIZE_NOWIRE = -1; + + +#include "lib/module.h" +/* Prototypes for the 'cache' module implementation. */ +int cache_peek(kr_layer_t *ctx, knot_pkt_t *pkt); +int cache_stash(kr_layer_t *ctx, knot_pkt_t *pkt); + + +/** + * Cache structure, keeps API, instance and metadata. + */ +struct kr_cache +{ + knot_db_t *db; /**< Storage instance */ + const struct kr_cdb_api *api; /**< Storage engine */ + struct { + uint32_t hit; /**< Number of cache hits */ + uint32_t miss; /**< Number of cache misses */ + uint32_t insert; /**< Number of insertions */ + uint32_t delete; /**< Number of deletions */ + } stats; + + uint32_t ttl_min, ttl_max; /**< TTL limits */ + + /* A pair of stamps for detection of real-time shifts during runtime. */ + struct timeval checkpoint_walltime; /**< Wall time on the last check-point. */ + uint64_t checkpoint_monotime; /**< Monotonic milliseconds on the last check-point. */ +}; + +/** + * Open/create cache with provided storage options. + * @param cache cache structure to be initialized + * @param api storage engine API + * @param opts storage-specific options (may be NULL for default) + * @param mm memory context. + * @return 0 or an error code + */ +KR_EXPORT +int kr_cache_open(struct kr_cache *cache, const struct kr_cdb_api *api, struct kr_cdb_opts *opts, knot_mm_t *mm); + +/** + * Path to cache file to remove on critical out-of-space error. (do NOT modify it) + */ +KR_EXPORT extern +const char *kr_cache_emergency_file_to_remove; + +/** + * Close persistent cache. + * @note This doesn't clear the data, just closes the connection to the database. + * @param cache structure + */ +KR_EXPORT +void kr_cache_close(struct kr_cache *cache); + +/** Run after a row of operations to release transaction/lock if needed. */ +KR_EXPORT +int kr_cache_sync(struct kr_cache *cache); + +/** + * Return true if cache is open and enabled. + */ +static inline bool kr_cache_is_open(struct kr_cache *cache) +{ + return cache->db != NULL; +} + +/** (Re)set the time pair to the current values. */ +static inline void kr_cache_make_checkpoint(struct kr_cache *cache) +{ + cache->checkpoint_monotime = kr_now(); + gettimeofday(&cache->checkpoint_walltime, NULL); +} + +/** + * Insert RRSet into cache, replacing any existing data. + * @param cache cache structure + * @param rr inserted RRSet + * @param rrsig RRSIG for inserted RRSet (optional) + * @param rank rank of the data + * @param timestamp current time + * @return 0 or an errcode + */ +KR_EXPORT +int kr_cache_insert_rr(struct kr_cache *cache, const knot_rrset_t *rr, const knot_rrset_t *rrsig, uint8_t rank, uint32_t timestamp); + +/** + * Clear all items from the cache. + * @param cache cache structure + * @return 0 or an errcode + */ +KR_EXPORT +int kr_cache_clear(struct kr_cache *cache); + + +/* ** This interface is temporary. ** */ + +struct kr_cache_p { + uint32_t time; /**< The time of inception. */ + uint32_t ttl; /**< TTL at inception moment. Assuming it fits into int32_t ATM. */ + uint8_t rank; /**< See enum kr_rank */ + struct { + /* internal: pointer to eh struct */ + void *raw_data, *raw_bound; + }; +}; +KR_EXPORT +int kr_cache_peek_exact(struct kr_cache *cache, const knot_dname_t *name, uint16_t type, + struct kr_cache_p *peek); +/* Parameters (qry, name, type) are used for timestamp and stale-serving decisions. */ +KR_EXPORT +int32_t kr_cache_ttl(const struct kr_cache_p *peek, const struct kr_query *qry, + const knot_dname_t *name, uint16_t type); + +KR_EXPORT +int kr_cache_materialize(knot_rdataset_t *dst, const struct kr_cache_p *ref, + knot_mm_t *pool); + + +/** + * Remove an entry from cache. + * @param cache cache structure + * @param name dname + * @param type rr type + * @return number of deleted records, or negative error code + * @note only "exact hits" are considered ATM, and + * some other information may be removed alongside. + */ +KR_EXPORT +int kr_cache_remove(struct kr_cache *cache, const knot_dname_t *name, uint16_t type); + +/** + * Get keys matching a dname lf prefix + * @param cache cache structure + * @param name dname + * @param exact_name whether to only consider exact name matches + * @param keyval matched key-value pairs + * @param maxcount limit on the number of returned key-value pairs + * @return result count or an errcode + * @note the cache keys are matched by prefix, i.e. it very much depends + * on their structure; CACHE_KEY_DEF. + */ +KR_EXPORT +int kr_cache_match(struct kr_cache *cache, const knot_dname_t *name, + bool exact_name, knot_db_val_t keyval[][2], int maxcount); + +/** + * Remove a subtree in cache. It's like _match but removing them instead of returning. + * @return number of deleted entries or an errcode + */ +KR_EXPORT +int kr_cache_remove_subtree(struct kr_cache *cache, const knot_dname_t *name, + bool exact_name, int maxcount); + +/** + * Find the closest cached zone apex for a name (in cache). + * @param is_DS start searching one name higher + * @return the number of labels to remove from the name, or negative error code + * @note timestamp is found by a syscall, and stale-serving is not considered + */ +KR_EXPORT +int kr_cache_closest_apex(struct kr_cache *cache, const knot_dname_t *name, bool is_DS, + knot_dname_t **apex); + +/** + * Unpack dname and type from db key + * @param key db key representation + * @param buf output buffer of domain name in dname format + * @param type output for type + * @return length of dname or an errcode + * @note only "exact hits" are considered ATM, moreover xNAME records + * are "hidden" as NS. (see comments in struct entry_h) + */ +KR_EXPORT +int kr_unpack_cache_key(knot_db_val_t key, knot_dname_t *buf, uint16_t *type); diff --git a/lib/cache/cdb_api.h b/lib/cache/cdb_api.h new file mode 100644 index 0000000..814a5f5 --- /dev/null +++ b/lib/cache/cdb_api.h @@ -0,0 +1,68 @@ +/* Copyright (C) 2016-2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include <libknot/db/db.h> + +/* Cache options. */ +struct kr_cdb_opts { + const char *path; /*!< Cache URI path. */ + size_t maxsize; /*!< Suggested cache size in bytes. */ +}; + +/*! Cache database API. + * This is a simplified version of generic DB API from libknot, + * that is tailored to caching purposes. + */ +struct kr_cdb_api { + const char *name; + + /* Context operations */ + + int (*open)(knot_db_t **db, struct kr_cdb_opts *opts, knot_mm_t *mm); + void (*close)(knot_db_t *db); + int (*count)(knot_db_t *db); + int (*clear)(knot_db_t *db); + + /** Run after a row of operations to release transaction/lock if needed. */ + int (*sync)(knot_db_t *db); + + /* Data access */ + + int (*read)(knot_db_t *db, const knot_db_val_t *key, knot_db_val_t *val, + int maxcount); + int (*write)(knot_db_t *db, const knot_db_val_t *key, knot_db_val_t *val, + int maxcount); + + /** Remove maxcount keys. + * \returns the number of succesfully removed keys or the first error code + * It returns on first error, but ENOENT is not considered an error. */ + int (*remove)(knot_db_t *db, knot_db_val_t keys[], int maxcount); + + /* Specialised operations */ + + /** Find key-value pairs that are prefixed by the given key, limited by maxcount. + * \return the number of pairs or negative error. */ + int (*match)(knot_db_t *db, knot_db_val_t *key, knot_db_val_t keyval[][2], int maxcount); + /** Not implemented ATM. */ + int (*prune)(knot_db_t *db, int maxcount); + + /** Less-or-equal search (lexicographic ordering). + * On successful return, key->data and val->data point to DB-owned data. + * return: 0 for equality, > 0 for less, < 0 kr_error */ + int (*read_leq)(knot_db_t *db, knot_db_val_t *key, knot_db_val_t *val); +}; diff --git a/lib/cache/cdb_lmdb.c b/lib/cache/cdb_lmdb.c new file mode 100644 index 0000000..621d2e8 --- /dev/null +++ b/lib/cache/cdb_lmdb.c @@ -0,0 +1,710 @@ +/* Copyright (C) 2016-2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. +*/ + +#include <assert.h> +#include <fcntl.h> +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> +#include <lmdb.h> + +#include "contrib/cleanup.h" +#include "lib/cache/cdb_lmdb.h" +#include "lib/cache/api.h" +#include "lib/utils.h" + + +/* Defines */ +#define LMDB_DIR_MODE 0770 +#define LMDB_FILE_MODE 0660 + +struct lmdb_env +{ + size_t mapsize; + MDB_dbi dbi; + MDB_env *env; + + /** Cached transactions + * + * - only one of (ro,rw) may be active at once + * - non-NULL .ro may be active or reset + * - non-NULL .rw is always active + */ + struct { + bool ro_active, ro_curs_active; + MDB_txn *ro, *rw; + MDB_cursor *ro_curs; + } txn; +}; + +/** @brief Convert LMDB error code. */ +static int lmdb_error(int error) +{ + /* _BAD_TXN may happen with overfull DB, + * even during mdb_get with a single fork :-/ */ + if (error == MDB_BAD_TXN) { + kr_log_info("[cache] MDB_BAD_TXN, probably overfull\n"); + error = ENOSPC; + } + switch (error) { + case MDB_SUCCESS: + return kr_ok(); + case MDB_NOTFOUND: + return kr_error(ENOENT); + case ENOSPC: + case MDB_MAP_FULL: + case MDB_TXN_FULL: + return kr_error(ENOSPC); + default: + kr_log_error("[cache] LMDB error: %s\n", mdb_strerror(error)); + return kr_error(error); + } +} + +/** Conversion between knot and lmdb structs for values. */ +static inline knot_db_val_t val_mdb2knot(MDB_val v) +{ + return (knot_db_val_t){ .len = v.mv_size, .data = v.mv_data }; +} +static inline MDB_val val_knot2mdb(knot_db_val_t v) +{ + return (MDB_val){ .mv_size = v.len, .mv_data = v.data }; +} + + +/*! \brief Set the environment map size. + * \note This also sets the maximum database size, see \fn mdb_env_set_mapsize + */ +static int set_mapsize(MDB_env *env, size_t map_size) +{ + long page_size = sysconf(_SC_PAGESIZE); + if (page_size <= 0) { + return KNOT_ERROR; + } + + /* Round to page size. */ + map_size = (map_size / page_size) * page_size; + int ret = mdb_env_set_mapsize(env, map_size); + if (ret != MDB_SUCCESS) { + return lmdb_error(ret); + } + + return 0; +} + +#define FLAG_RENEW (2*MDB_RDONLY) +/** mdb_txn_begin or _renew + handle MDB_MAP_RESIZED. + * + * The retrying logic for MDB_MAP_RESIZED is so ugly that it has its own function. + * \note this assumes no transactions are active + * \return MDB_ errcode, not usual kr_error(...) + */ +static int txn_get_noresize(struct lmdb_env *env, unsigned int flag, MDB_txn **txn) +{ + assert(!env->txn.rw && (!env->txn.ro || !env->txn.ro_active)); + int ret; + if (flag == FLAG_RENEW) { + ret = mdb_txn_renew(*txn); + } else { + ret = mdb_txn_begin(env->env, NULL, flag, txn); + } + if (ret != MDB_MAP_RESIZED) { + return ret; + } + //:unlikely + /* Another process increased the size; let's try to recover. */ + kr_log_info("[cache] detected size increased by another process\n"); + ret = mdb_env_set_mapsize(env->env, 0); + if (ret != MDB_SUCCESS) { + return ret; + } + if (flag == FLAG_RENEW) { + ret = mdb_txn_renew(*txn); + } else { + ret = mdb_txn_begin(env->env, NULL, flag, txn); + } + return ret; +} + +/** Obtain a transaction. (they're cached in env->txn) */ +static int txn_get(struct lmdb_env *env, MDB_txn **txn, bool rdonly) +{ + assert(env && txn); + if (env->txn.rw) { + /* Reuse the *open* RW txn even if only reading is requested. + * We leave the management of this to the cdb_sync command. + * The user may e.g. want to do some reads between the writes. */ + *txn = env->txn.rw; + return kr_ok(); + } + + if (!rdonly) { + /* avoid two active transactions */ + if (env->txn.ro && env->txn.ro_active) { + mdb_txn_reset(env->txn.ro); + env->txn.ro_active = false; + env->txn.ro_curs_active = false; + } + int ret = txn_get_noresize(env, 0/*RW*/, &env->txn.rw); + if (ret == MDB_SUCCESS) { + *txn = env->txn.rw; + assert(*txn); + } + return lmdb_error(ret); + } + + /* Get an active RO txn and return it. */ + int ret = MDB_SUCCESS; + if (!env->txn.ro) { //:unlikely + ret = txn_get_noresize(env, MDB_RDONLY, &env->txn.ro); + } else if (!env->txn.ro_active) { + ret = txn_get_noresize(env, FLAG_RENEW, &env->txn.ro); + } + if (ret != MDB_SUCCESS) { + return lmdb_error(ret); + } + env->txn.ro_active = true; + *txn = env->txn.ro; + assert(*txn); + return kr_ok(); +} + +static int cdb_sync(knot_db_t *db) +{ + struct lmdb_env *env = db; + int ret = kr_ok(); + if (env->txn.rw) { + ret = lmdb_error(mdb_txn_commit(env->txn.rw)); + env->txn.rw = NULL; /* the transaction got freed even in case of errors */ + } else if (env->txn.ro && env->txn.ro_active) { + mdb_txn_reset(env->txn.ro); + env->txn.ro_active = false; + env->txn.ro_curs_active = false; + } + return ret; +} + +/** Obtain a read-only cursor (and a read-only transaction). */ +static int txn_curs_get(struct lmdb_env *env, MDB_cursor **curs) +{ + assert(env && curs); + if (env->txn.ro_curs_active) { + goto success; + } + /* Only in a read-only txn; TODO: it's a bit messy/coupled */ + if (env->txn.rw) { + int ret = cdb_sync(env); + if (ret) return ret; + } + MDB_txn *txn = NULL; + int ret = txn_get(env, &txn, true); + if (ret) return ret; + + if (env->txn.ro_curs) { + ret = mdb_cursor_renew(txn, env->txn.ro_curs); + } else { + ret = mdb_cursor_open(txn, env->dbi, &env->txn.ro_curs); + } + if (ret) return ret; + env->txn.ro_curs_active = true; +success: + assert(env->txn.ro_curs_active && env->txn.ro && env->txn.ro_active + && !env->txn.rw); + *curs = env->txn.ro_curs; + assert(*curs); + return kr_ok(); +} + +static void free_txn_ro(struct lmdb_env *env) +{ + if (env->txn.ro) { + mdb_txn_abort(env->txn.ro); + env->txn.ro = NULL; + } + if (env->txn.ro_curs) { + mdb_cursor_close(env->txn.ro_curs); + env->txn.ro_curs = NULL; + } +} + +/*! \brief Close the database. */ +static void cdb_close_env(struct lmdb_env *env) +{ + assert(env && env->env); + + /* Get rid of any transactions. */ + cdb_sync(env); + free_txn_ro(env); + + mdb_env_sync(env->env, 1); + mdb_dbi_close(env->env, env->dbi); + mdb_env_close(env->env); + memset(env, 0, sizeof(*env)); +} + +/*! \brief Open database environment. */ +static int cdb_open_env(struct lmdb_env *env, unsigned flags, const char *path, size_t mapsize) +{ + int ret = mkdir(path, LMDB_DIR_MODE); + if (ret == -1 && errno != EEXIST) { + return kr_error(errno); + } + + MDB_env *mdb_env = NULL; + ret = mdb_env_create(&mdb_env); + if (ret != MDB_SUCCESS) { + return lmdb_error(ret); + } + + ret = set_mapsize(mdb_env, mapsize); + if (ret != 0) { + mdb_env_close(mdb_env); + return ret; + } + + ret = mdb_env_open(mdb_env, path, flags, LMDB_FILE_MODE); + if (ret != MDB_SUCCESS) { + mdb_env_close(mdb_env); + return lmdb_error(ret); + } + + /* Keep the environment pointer. */ + env->env = mdb_env; + env->mapsize = mapsize; + return 0; +} + +static int cdb_open(struct lmdb_env *env, const char *path, size_t mapsize) +{ + /* Cache doesn't require durability, we can be + * loose with the requirements as a tradeoff for speed. */ + const unsigned flags = MDB_WRITEMAP | MDB_MAPASYNC | MDB_NOTLS; + int ret = cdb_open_env(env, flags, path, mapsize); + if (ret != 0) { + return ret; + } + + /* Open the database. */ + MDB_txn *txn = NULL; + ret = mdb_txn_begin(env->env, NULL, 0, &txn); + if (ret != MDB_SUCCESS) { + mdb_env_close(env->env); + return lmdb_error(ret); + } + + ret = mdb_dbi_open(txn, NULL, 0, &env->dbi); + if (ret != MDB_SUCCESS) { + mdb_txn_abort(txn); + mdb_env_close(env->env); + return lmdb_error(ret); + } + + ret = mdb_txn_commit(txn); + if (ret != MDB_SUCCESS) { + mdb_env_close(env->env); + return lmdb_error(ret); + } + + return 0; +} + +static int cdb_init(knot_db_t **db, struct kr_cdb_opts *opts, knot_mm_t *pool) +{ + if (!db || !opts) { + return kr_error(EINVAL); + } + + struct lmdb_env *env = malloc(sizeof(*env)); + if (!env) { + return kr_error(ENOMEM); + } + memset(env, 0, sizeof(struct lmdb_env)); + + /* Clear stale lockfiles. */ + auto_free char *lockfile = kr_strcatdup(2, opts->path, "/.cachelock"); + if (lockfile) { + if (unlink(lockfile) == 0) { + kr_log_info("[cache] cleared stale lockfile '%s'\n", lockfile); + } else if (errno != ENOENT) { + kr_log_info("[cache] failed to clear stale lockfile '%s': %s\n", lockfile, + strerror(errno)); + } + } + + /* Open the database. */ + int ret = cdb_open(env, opts->path, opts->maxsize); + if (ret != 0) { + free(env); + return ret; + } + + *db = env; + return 0; +} + +static void cdb_deinit(knot_db_t *db) +{ + struct lmdb_env *env = db; + cdb_close_env(env); + free(env); +} + +static int cdb_count(knot_db_t *db) +{ + struct lmdb_env *env = db; + MDB_txn *txn = NULL; + int ret = txn_get(env, &txn, true); + if (ret != 0) { + return ret; + } + + MDB_stat stat; + ret = mdb_stat(txn, env->dbi, &stat); + + return (ret == MDB_SUCCESS) ? stat.ms_entries : lmdb_error(ret); +} + +static int cdb_clear(knot_db_t *db) +{ + struct lmdb_env *env = db; + /* First try mdb_drop() to clear the DB; this may fail with ENOSPC. */ + /* If we didn't do this, explicit cache.clear() ran on an instance + * would lead to the instance detaching from the cache of others, + * until they reopened cache explicitly or cleared it for some reason. + */ + { + MDB_txn *txn = NULL; + int ret = txn_get(env, &txn, false); + if (ret == kr_ok()) { + ret = lmdb_error(mdb_drop(txn, env->dbi, 0)); + if (ret == kr_ok()) { + ret = cdb_sync(db); + } + if (ret == kr_ok()) { + return ret; + } + } + kr_log_info("[cache] clearing error, falling back\n"); + } + + /* We are about to switch to a different file, so end all txns, to be sure. */ + (void) cdb_sync(db); + free_txn_ro(db); + + /* Since there is no guarantee that there will be free + * pages to hold whole dirtied db for transaction-safe clear, + * we simply remove the database files and reopen. + * We can afford this since other readers will continue to read + * from removed file, but will reopen when encountering next + * error. */ + mdb_filehandle_t fd = -1; + int ret = mdb_env_get_fd(env->env, &fd); + if (ret != MDB_SUCCESS) { + return lmdb_error(ret); + } + const char *path = NULL; + ret = mdb_env_get_path(env->env, &path); + if (ret != MDB_SUCCESS) { + return lmdb_error(ret); + } + + auto_free char *mdb_datafile = kr_strcatdup(2, path, "/data.mdb"); + auto_free char *mdb_lockfile = kr_strcatdup(2, path, "/lock.mdb"); + auto_free char *lockfile = kr_strcatdup(2, path, "/.cachelock"); + if (!mdb_datafile || !mdb_lockfile || !lockfile) { + return kr_error(ENOMEM); + } + /* Find if we get a lock on lockfile. */ + ret = open(lockfile, O_CREAT|O_EXCL|O_RDONLY, S_IRUSR); + if (ret == -1) { + kr_log_error("[cache] clearing failed to get ./.cachelock; retry later\n"); + /* As we're out of space (almost certainly - mdb_drop didn't work), + * we will retry on the next failing write operation. */ + return kr_error(errno); + } + close(ret); + /* We acquired lockfile. Now find whether *.mdb are what we have open now. */ + struct stat old_stat, new_stat; + if (fstat(fd, &new_stat) || stat(mdb_datafile, &old_stat)) { + ret = errno; + unlink(lockfile); + return kr_error(ret); + } + /* Remove underlying files only if current open environment + * points to file on the disk. Otherwise just reopen as someone + * else has already removed the files. + */ + if (old_stat.st_dev == new_stat.st_dev && old_stat.st_ino == new_stat.st_ino) { + kr_log_verbose("[cache] clear: identical files, unlinking\n"); + // coverity[toctou] + unlink(mdb_datafile); + unlink(mdb_lockfile); + } else + kr_log_verbose("[cache] clear: not identical files, reopening\n"); + /* Keep copy as it points to current handle internals. */ + auto_free char *path_copy = strdup(path); + size_t mapsize = env->mapsize; + cdb_close_env(env); + ret = cdb_open(env, path_copy, mapsize); + /* Environment updated, release lockfile. */ + unlink(lockfile); + return ret; +} + +static int cdb_readv(knot_db_t *db, const knot_db_val_t *key, knot_db_val_t *val, + int maxcount) +{ + struct lmdb_env *env = db; + MDB_txn *txn = NULL; + int ret = txn_get(env, &txn, true); + if (ret) { + return ret; + } + + for (int i = 0; i < maxcount; ++i) { + /* Convert key structs */ + MDB_val _key = val_knot2mdb(key[i]); + MDB_val _val = val_knot2mdb(val[i]); + ret = mdb_get(txn, env->dbi, &_key, &_val); + if (ret != MDB_SUCCESS) { + ret = lmdb_error(ret); + if (ret == kr_error(ENOSPC)) { + /* we're likely to be forced to cache clear anyway */ + ret = kr_error(ENOENT); + } + return ret; + } + /* Update the result. */ + val[i] = val_mdb2knot(_val); + } + return kr_ok(); +} + +static int cdb_write(struct lmdb_env *env, MDB_txn **txn, const knot_db_val_t *key, + knot_db_val_t *val, unsigned flags) +{ + /* Convert key structs and write */ + MDB_val _key = val_knot2mdb(*key); + MDB_val _val = val_knot2mdb(*val); + int ret = mdb_put(*txn, env->dbi, &_key, &_val, flags); + + /* Try to recover from doing too much writing in a single transaction. */ + if (ret == MDB_TXN_FULL) { + ret = cdb_sync(env); + if (ret) { + ret = txn_get(env, txn, false); + } + if (ret) { + ret = mdb_put(*txn, env->dbi, &_key, &_val, flags); + } + } + if (ret != MDB_SUCCESS) { + return lmdb_error(ret); + } + + /* Update the result. */ + val->data = _val.mv_data; + val->len = _val.mv_size; + return kr_ok(); +} + +static int cdb_writev(knot_db_t *db, const knot_db_val_t *key, knot_db_val_t *val, + int maxcount) +{ + struct lmdb_env *env = db; + MDB_txn *txn = NULL; + int ret = txn_get(env, &txn, false); + + for (int i = 0; ret == kr_ok() && i < maxcount; ++i) { + /* This is LMDB specific optimisation, + * if caller specifies value with NULL data and non-zero length, + * LMDB will preallocate the entry for caller and leave write + * transaction open, caller is responsible for syncing thus committing transaction. + */ + unsigned mdb_flags = 0; + if (val[i].len > 0 && val[i].data == NULL) { + mdb_flags |= MDB_RESERVE; + } + ret = cdb_write(env, &txn, &key[i], &val[i], mdb_flags); + } + + return ret; +} + +static int cdb_remove(knot_db_t *db, knot_db_val_t keys[], int maxcount) +{ + struct lmdb_env *env = db; + MDB_txn *txn = NULL; + int ret = txn_get(env, &txn, false); + int deleted = 0; + + for (int i = 0; ret == kr_ok() && i < maxcount; ++i) { + MDB_val _key = val_knot2mdb(keys[i]); + MDB_val val = { 0, NULL }; + ret = lmdb_error(mdb_del(txn, env->dbi, &_key, &val)); + if (ret == kr_ok()) + deleted++; + else if (ret == KNOT_ENOENT) + ret = kr_ok(); /* skip over non-existing entries */ + } + + return ret < 0 ? ret : deleted; +} + +static int cdb_match(knot_db_t *db, knot_db_val_t *key, knot_db_val_t keyval[][2], int maxcount) +{ + struct lmdb_env *env = db; + MDB_txn *txn = NULL; + int ret = txn_get(env, &txn, true); + if (ret != 0) { + return ret; + } + + /* LATER(optim.): use txn_curs_get() instead, to save resources. */ + MDB_cursor *cur = NULL; + ret = mdb_cursor_open(txn, env->dbi, &cur); + if (ret != 0) { + return lmdb_error(ret); + } + + MDB_val cur_key = val_knot2mdb(*key); + MDB_val cur_val = { 0, NULL }; + ret = mdb_cursor_get(cur, &cur_key, &cur_val, MDB_SET_RANGE); + if (ret != MDB_SUCCESS) { + mdb_cursor_close(cur); + return lmdb_error(ret); + } + + int results = 0; + while (ret == MDB_SUCCESS) { + /* Retrieve current key and compare with prefix */ + if (cur_key.mv_size < key->len || memcmp(cur_key.mv_data, key->data, key->len) != 0) { + break; + } + /* Add to result set */ + if (results < maxcount) { + keyval[results][0] = val_mdb2knot(cur_key); + keyval[results][1] = val_mdb2knot(cur_val); + ++results; + } else { + break; + } + ret = mdb_cursor_get(cur, &cur_key, &cur_val, MDB_NEXT); + } + + mdb_cursor_close(cur); + return results; +} + + +static int cdb_prune(knot_db_t *db, int limit) +{ + return -1; +#if 0 + /* Sync in-flight transactions */ + cdb_sync(db); + + /* Prune old records */ + struct lmdb_env *env = db; + MDB_txn *txn = NULL; + int ret = txn_get(env, &txn, false); + if (ret != 0) { + return ret; + } + + MDB_cursor *cur = NULL; + ret = mdb_cursor_open(txn, env->dbi, &cur); + if (ret != 0) { + return lmdb_error(ret); + } + + MDB_val cur_key, cur_val; + ret = mdb_cursor_get(cur, &cur_key, &cur_val, MDB_FIRST); + if (ret != 0) { + mdb_cursor_close(cur); + return lmdb_error(ret); + } + + int results = 0; + struct timeval now; + gettimeofday(&now, NULL); + while (ret == 0 && results < limit) { + /* Ignore special namespaces. */ + if (cur_key.mv_size < 2 || ((const char *)cur_key.mv_data)[0] == 'V') { + ret = mdb_cursor_get(cur, &cur_key, &cur_val, MDB_NEXT); + continue; + } + /* Check entry age. */ + struct kr_cache_entry *entry = cur_val.mv_data; + if (entry->timestamp > now.tv_sec || + (now.tv_sec - entry->timestamp) < entry->ttl) { + ret = mdb_cursor_get(cur, &cur_key, &cur_val, MDB_NEXT); + continue; + } + /* Remove entry */ + mdb_cursor_del(cur, 0); + ++results; + ret = mdb_cursor_get(cur, &cur_key, &cur_val, MDB_NEXT); + } + mdb_cursor_close(cur); + return ret < 0 ? ret : results; +#endif +} + +static int cdb_read_leq(knot_db_t *env, knot_db_val_t *key, knot_db_val_t *val) +{ + assert(env && key && key->data && val); + MDB_cursor *curs = NULL; + int ret = txn_curs_get(env, &curs); + if (ret) return ret; + + MDB_val key2_m = val_knot2mdb(*key); + MDB_val val2_m = { 0, NULL }; + ret = mdb_cursor_get(curs, &key2_m, &val2_m, MDB_SET_RANGE); + if (ret) return lmdb_error(ret); + /* test for equality //:unlikely */ + if (key2_m.mv_size == key->len + && memcmp(key2_m.mv_data, key->data, key->len) == 0) { + ret = 0; /* equality */ + goto success; + } + /* we must be greater than key; do one step to smaller */ + ret = mdb_cursor_get(curs, &key2_m, &val2_m, MDB_PREV); + if (ret) return lmdb_error(ret); + ret = 1; +success: + /* finalize the output */ + *key = val_mdb2knot(key2_m); + *val = val_mdb2knot(val2_m); + return ret; +} + + +const struct kr_cdb_api *kr_cdb_lmdb(void) +{ + static const struct kr_cdb_api api = { + "lmdb", + cdb_init, cdb_deinit, cdb_count, cdb_clear, cdb_sync, + cdb_readv, cdb_writev, cdb_remove, + cdb_match, cdb_prune, + cdb_read_leq + }; + + return &api; +} diff --git a/lib/cache/cdb_lmdb.h b/lib/cache/cdb_lmdb.h new file mode 100644 index 0000000..bc2c7d6 --- /dev/null +++ b/lib/cache/cdb_lmdb.h @@ -0,0 +1,23 @@ +/* Copyright (C) 2016-2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include "lib/cache/cdb_api.h" +#include "lib/defines.h" + +KR_EXPORT KR_CONST +const struct kr_cdb_api *kr_cdb_lmdb(void); diff --git a/lib/cache/entry_list.c b/lib/cache/entry_list.c new file mode 100644 index 0000000..6a5001c --- /dev/null +++ b/lib/cache/entry_list.c @@ -0,0 +1,293 @@ +/* Copyright (C) 2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +/** @file + * Implementation of chaining in struct entry_h. Prototypes in ./impl.h + */ + +#include "lib/cache/impl.h" +#include "lib/utils.h" + + +static int entry_h_len(knot_db_val_t val); + + +void entry_list_memcpy(struct entry_apex *ea, entry_list_t list) +{ + assert(ea); + memset(ea, 0, offsetof(struct entry_apex, data)); + ea->has_ns = list[EL_NS ].len; + ea->has_cname = list[EL_CNAME ].len; + ea->has_dname = list[EL_DNAME ].len; + for (int i = 0; i < ENTRY_APEX_NSECS_CNT; ++i) { + ea->nsecs[i] = list[i].len == 0 ? 0 : + (list[i].len == 4 ? 1 : 3); + } + uint8_t *it = ea->data; + for (int i = 0; i < EL_LENGTH; ++i) { + if (list[i].data) { + memcpy(it, list[i].data, list[i].len); + /* LATER(optim.): coalesce consecutive writes? */ + } else { + list[i].data = it; + } + it += to_even(list[i].len); + } +} + +int entry_list_parse(const knot_db_val_t val, entry_list_t list) +{ + const bool ok = val.data && val.len && list; + if (!ok) { + assert(!EINVAL); + return kr_error(EINVAL); + } + /* Parse the apex itself (nsec parameters). */ + const struct entry_apex *ea = entry_apex_consistent(val); + if (!ea) { + return kr_error(EILSEQ); + } + const uint8_t *it = ea->data, + *it_bound = knot_db_val_bound(val); + for (int i = 0; i < ENTRY_APEX_NSECS_CNT; ++i) { + if (it > it_bound) { + return kr_error(EILSEQ); + } + list[i].data = (void *)it; + switch (ea->nsecs[i]) { + case 0: + list[i].len = 0; + break; + case 1: + list[i].len = sizeof(uint32_t); /* just timestamp */ + break; + case 3: { /* timestamp + NSEC3PARAM wire */ + if (it + sizeof(uint32_t) + 4 > it_bound) { + return kr_error(EILSEQ); + } + list[i].len = sizeof(uint32_t) + + nsec_p_rdlen(it + sizeof(uint32_t)); + break; + } + default: + return kr_error(EILSEQ); + }; + it += to_even(list[i].len); + } + /* Parse every entry_h. */ + for (int i = ENTRY_APEX_NSECS_CNT; i < EL_LENGTH; ++i) { + list[i].data = (void *)it; + bool has_type; + switch (i) { + case EL_NS: has_type = ea->has_ns; break; + case EL_CNAME: has_type = ea->has_cname; break; + case EL_DNAME: has_type = ea->has_dname; break; + default: assert(false); return kr_error(EINVAL); /* something very bad */ + } + if (!has_type) { + list[i].len = 0; + continue; + } + if (it >= it_bound) { + assert(!EILSEQ); + return kr_error(EILSEQ); + } + const int len = entry_h_len( + (knot_db_val_t){ .data = (void *)it, .len = it_bound - it }); + if (len < 0) { + assert(false); + return kr_error(len); + } + list[i].len = len; + it += to_even(len); + } + assert(it == it_bound); + return kr_ok(); +} + +/** Given a valid entry header, find its length (i.e. offset of the next entry). + * \param val The beginning of the data and the bound (read only). + */ +static int entry_h_len(const knot_db_val_t val) +{ + const bool ok = val.data && ((ssize_t)val.len) > 0; + if (!ok) return kr_error(EINVAL); + const struct entry_h *eh = val.data; + const uint8_t *d = eh->data; /* iterates over the data in entry */ + const uint8_t *data_bound = knot_db_val_bound(val); + if (d >= data_bound) return kr_error(EILSEQ); + if (!eh->is_packet) { /* Positive RRset + its RRsig set (may be empty). */ + int sets = 2; + while (sets-- > 0) { + d += rdataset_dematerialized_size(d); + if (d > data_bound) { + assert(!EILSEQ); + return kr_error(EILSEQ); + } + } + } else { /* A "packet" (opaque ATM). */ + uint16_t len; + if (d + sizeof(len) > data_bound) return kr_error(EILSEQ); + memcpy(&len, d, sizeof(len)); + d += 2 + to_even(len); + } + if (d > data_bound) { + assert(!EILSEQ); + return kr_error(EILSEQ); + } + return d - (uint8_t *)val.data; +} + +struct entry_apex * entry_apex_consistent(knot_db_val_t val) +{ + //XXX: check lengths, etc. + return val.data; +} + +/* See the header file. */ +int entry_h_seek(knot_db_val_t *val, uint16_t type) +{ + int i = -1; + switch (type) { + case KNOT_RRTYPE_NS: i = EL_NS; break; + case KNOT_RRTYPE_CNAME: i = EL_CNAME; break; + case KNOT_RRTYPE_DNAME: i = EL_DNAME; break; + default: return kr_ok(); + } + + entry_list_t el; + int ret = entry_list_parse(*val, el); + if (ret) return ret; + *val = el[i]; + return val->len ? kr_ok() : kr_error(ENOENT); +} + +static int cache_write_or_clear(struct kr_cache *cache, const knot_db_val_t *key, + knot_db_val_t *val, const struct kr_query *qry) +{ + int ret = cache_op(cache, write, key, val, 1); + if (!ret) return kr_ok(); + /* Clear cache if overfull. It's nontrivial to do better with LMDB. + * LATER: some garbage-collection mechanism. */ + if (ret == kr_error(ENOSPC)) { + ret = kr_cache_clear(cache); + const char *msg = "[cache] clearing because overfull, ret = %d\n"; + if (ret) { + kr_log_error(msg, ret); + } else { + kr_log_info(msg, ret); + ret = kr_error(ENOSPC); + } + return ret; + } + VERBOSE_MSG(qry, "=> failed backend write, ret = %d\n", ret); + return kr_error(ret ? ret : ENOSPC); +} + + +/* See the header file. */ +int entry_h_splice( + knot_db_val_t *val_new_entry, uint8_t rank, + const knot_db_val_t key, const uint16_t ktype, const uint16_t type, + const knot_dname_t *owner/*log only*/, + const struct kr_query *qry, struct kr_cache *cache, uint32_t timestamp) +{ + //TODO: another review, perhaps incuding the API + const bool ok = val_new_entry && val_new_entry->len > 0; + if (!ok) { + assert(!EINVAL); + return kr_error(EINVAL); + } + + int i_type; + switch (type) { + case KNOT_RRTYPE_NS: i_type = EL_NS; break; + case KNOT_RRTYPE_CNAME: i_type = EL_CNAME; break; + case KNOT_RRTYPE_DNAME: i_type = EL_DNAME; break; + default: i_type = 0; + } + + /* Get eh_orig (original entry), and also el list if multi-entry case. */ + const struct entry_h *eh_orig = NULL; + entry_list_t el; + int ret = -1; + if (!kr_rank_test(rank, KR_RANK_SECURE) || ktype == KNOT_RRTYPE_NS) { + knot_db_val_t val; + ret = cache_op(cache, read, &key, &val, 1); + if (i_type) { + if (!ret) ret = entry_list_parse(val, el); + if (ret) memset(el, 0, sizeof(el)); + val = el[i_type]; + } + /* val is on the entry, in either case (or error) */ + if (!ret) { + eh_orig = entry_h_consistent(val, type); + } + } else { + /* We want to fully overwrite the entry, so don't even read it. */ + memset(el, 0, sizeof(el)); + } + + if (!kr_rank_test(rank, KR_RANK_SECURE) && eh_orig) { + /* If equal rank was accepted, spoofing a *single* answer would be + * enough to e.g. override NS record in AUTHORITY section. + * This way they would have to hit the first answer + * (whenever TTL nears expiration). + * Stale-serving is NOT considered, but TTL 1 would be considered + * as expiring anyway, ... */ + int32_t old_ttl = get_new_ttl(eh_orig, qry, NULL, 0, timestamp); + if (old_ttl > 0 && !is_expiring(eh_orig->ttl, old_ttl) + && rank <= eh_orig->rank) { + WITH_VERBOSE(qry) { + auto_free char *type_str = kr_rrtype_text(type), + *owner_str = kr_dname_text(owner); + VERBOSE_MSG(qry, "=> not overwriting %s %s\n", + type_str, owner_str); + } + return kr_error(EEXIST); + } + } + + if (!i_type) { + /* The non-list types are trivial now. */ + return cache_write_or_clear(cache, &key, val_new_entry, qry); + } + /* Now we're in trouble. In some cases, parts of data to be written + * is an lmdb entry that may be invalidated by our write request. + * (lmdb does even in-place updates!) Therefore we copy all into a buffer. + * (We don't bother deallocating from the mempool.) + * LATER(optim.): do this only when neccessary, or perhaps another approach. + * This is also complicated by the fact that the val_new_entry part + * is to be written *afterwards* by the caller. + */ + el[i_type] = (knot_db_val_t){ + .len = val_new_entry->len, + .data = NULL, /* perhaps unclear in the entry_h_splice() API */ + }; + knot_db_val_t val = { + .len = entry_list_serial_size(el), + .data = NULL, + }; + void *buf = mm_alloc(&qry->request->pool, val.len); + entry_list_memcpy(buf, el); + ret = cache_write_or_clear(cache, &key, &val, qry); + if (ret) return kr_error(ret); + memcpy(val.data, buf, val.len); /* we also copy the "empty" space, but well... */ + val_new_entry->data = (uint8_t *)val.data + + ((uint8_t *)el[i_type].data - (uint8_t *)buf); + return kr_ok(); +} + diff --git a/lib/cache/entry_pkt.c b/lib/cache/entry_pkt.c new file mode 100644 index 0000000..00e73d4 --- /dev/null +++ b/lib/cache/entry_pkt.c @@ -0,0 +1,224 @@ +/* Copyright (C) 2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +/** @file + * Implementation of packet-caching. Prototypes in ./impl.h + * + * The packet is stashed in entry_h::data as uint16_t length + full packet wire format. + */ + +#include "lib/utils.h" +#include "lib/layer/iterate.h" /* kr_response_classify */ +#include "lib/cache/impl.h" + + +/** Compute TTL for a packet. Generally it's minimum TTL, with extra conditions. */ +static uint32_t packet_ttl(const knot_pkt_t *pkt, bool is_negative) +{ + bool has_ttl = false; + uint32_t ttl = UINT32_MAX; + /* Find minimum entry TTL in the packet or SOA minimum TTL. */ + for (knot_section_t i = KNOT_ANSWER; i <= KNOT_ADDITIONAL; ++i) { + const knot_pktsection_t *sec = knot_pkt_section(pkt, i); + for (unsigned k = 0; k < sec->count; ++k) { + const knot_rrset_t *rr = knot_pkt_rr(sec, k); + if (is_negative) { + /* Use SOA minimum TTL for negative answers. */ + if (rr->type == KNOT_RRTYPE_SOA) { + return MIN(rr->ttl, knot_soa_minimum(rr->rrs.rdata)); + } else { + continue; /* Use SOA only for negative answers. */ + } + } + if (knot_rrtype_is_metatype(rr->type)) { + continue; /* Skip metatypes. */ + } + ttl = MIN(ttl, rr->ttl); + } + } + /* If no valid TTL present, go with zero (will get clamped to minimum). */ + return has_ttl ? ttl : 0; +} + + +void stash_pkt(const knot_pkt_t *pkt, const struct kr_query *qry, + const struct kr_request *req, const bool has_optout) +{ + /* In some cases, stash also the packet. */ + const bool is_negative = kr_response_classify(pkt) + & (PKT_NODATA|PKT_NXDOMAIN); + const struct kr_qflags * const qf = &qry->flags; + const bool want_negative = qf->DNSSEC_INSECURE || !qf->DNSSEC_WANT || has_optout; + const bool want_pkt = qf->DNSSEC_BOGUS /*< useful for +cd answers */ + || (is_negative && want_negative); + + if (!want_pkt || !knot_wire_get_aa(pkt->wire) + || pkt->parsed != pkt->size /*< malformed packet; still can't detect KNOT_EFEWDATA */ + ) { + return; + } + + /* Compute rank. If cd bit is set or we got answer via non-validated + * forwarding, make the rank bad; otherwise it depends on flags. + * TODO: probably make validator attempt validation even with +cd. */ + uint8_t rank = KR_RANK_AUTH; + const bool risky_vldr = is_negative && qf->FORWARD && qf->CNAME; + /* ^^ CNAME'ed NXDOMAIN answer in forwarding mode can contain + * unvalidated records; original commit: d6e22f476. */ + if (knot_wire_get_cd(req->qsource.packet->wire) || qf->STUB || risky_vldr) { + kr_rank_set(&rank, KR_RANK_OMIT); + } else { + if (qf->DNSSEC_BOGUS) { + kr_rank_set(&rank, KR_RANK_BOGUS); + } else if (qf->DNSSEC_INSECURE) { + kr_rank_set(&rank, KR_RANK_INSECURE); + } else if (!qf->DNSSEC_WANT) { + /* no TAs at all, leave _RANK_AUTH */ + } else if (has_optout) { + /* All bad cases should be filtered above, + * at least the same way as pktcache in kresd 1.5.x. */ + kr_rank_set(&rank, KR_RANK_SECURE); + } else assert(false); + } + + const uint16_t pkt_type = knot_pkt_qtype(pkt); + const knot_dname_t *owner = knot_pkt_qname(pkt); /* qname can't be compressed */ + + // LATER: nothing exists under NXDOMAIN. Implement that (optionally)? +#if 0 + if (knot_wire_get_rcode(pkt->wire) == KNOT_RCODE_NXDOMAIN + /* && !qf->DNSSEC_INSECURE */ ) { + pkt_type = KNOT_RRTYPE_NS; + } +#endif + + /* Construct the key under which the pkt will be stored. */ + struct key k_storage, *k = &k_storage; + knot_db_val_t key; + int ret = kr_dname_lf(k->buf, owner, false); + if (ret) { + /* A server might (incorrectly) reply with QDCOUNT=0. */ + assert(owner == NULL); + return; + } + key = key_exact_type_maypkt(k, pkt_type); + + /* For now we stash the full packet byte-exactly as it came from upstream. */ + const uint16_t pkt_size = pkt->size; + knot_db_val_t val_new_entry = { + .data = NULL, + .len = offsetof(struct entry_h, data) + sizeof(pkt_size) + pkt->size, + }; + /* Prepare raw memory for the new entry and fill it. */ + struct kr_cache *cache = &req->ctx->cache; + ret = entry_h_splice(&val_new_entry, rank, key, k->type, pkt_type, + owner, qry, cache, qry->timestamp.tv_sec); + if (ret) return; /* some aren't really errors */ + assert(val_new_entry.data); + struct entry_h *eh = val_new_entry.data; + memset(eh, 0, offsetof(struct entry_h, data)); + eh->time = qry->timestamp.tv_sec; + eh->ttl = MAX(MIN(packet_ttl(pkt, is_negative), cache->ttl_max), cache->ttl_min); + eh->rank = rank; + eh->is_packet = true; + eh->has_optout = qf->DNSSEC_OPTOUT; + memcpy(eh->data, &pkt_size, sizeof(pkt_size)); + memcpy(eh->data + sizeof(pkt_size), pkt->wire, pkt_size); + + WITH_VERBOSE(qry) { + auto_free char *type_str = kr_rrtype_text(pkt_type), + *owner_str = kr_dname_text(owner); + VERBOSE_MSG(qry, "=> stashed packet: rank 0%.2o, TTL %d, " + "%s %s (%d B)\n", + eh->rank, eh->ttl, + type_str, owner_str, (int)val_new_entry.len); + } +} + + +int answer_from_pkt(kr_layer_t *ctx, knot_pkt_t *pkt, uint16_t type, + const struct entry_h *eh, const void *eh_bound, uint32_t new_ttl) +{ + struct kr_request *req = ctx->req; + struct kr_query *qry = req->current_query; + + uint16_t pkt_len; + memcpy(&pkt_len, eh->data, sizeof(pkt_len)); + if (pkt_len > pkt->max_size) { + return kr_error(ENOENT); + } + + /* Copy answer and reparse it, but keep the original message id. */ + uint16_t msgid = knot_wire_get_id(pkt->wire); + knot_pkt_clear(pkt); + memcpy(pkt->wire, eh->data + 2, pkt_len); + pkt->size = pkt_len; + int ret = knot_pkt_parse(pkt, 0); + if (ret == KNOT_EFEWDATA || ret == KNOT_EMALF) { + return kr_error(ENOENT); + /* LATER(opt): try harder to avoid stashing such packets */ + } + if (ret != KNOT_EOK) { + assert(!ret); + return kr_error(ret); + } + knot_wire_set_id(pkt->wire, msgid); + + /* Add rank into the additional field. */ + for (size_t i = 0; i < pkt->rrset_count; ++i) { + assert(!pkt->rr[i].additional); + uint8_t *rr_rank = mm_alloc(&pkt->mm, sizeof(*rr_rank)); + if (!rr_rank) { + return kr_error(ENOMEM); + } + *rr_rank = eh->rank; + pkt->rr[i].additional = rr_rank; + } + + /* Adjust TTL in each record. */ + const uint32_t drift = eh->ttl - new_ttl; + for (knot_section_t i = KNOT_ANSWER; i <= KNOT_ADDITIONAL; ++i) { + const knot_pktsection_t *sec = knot_pkt_section(pkt, i); + for (unsigned k = 0; k < sec->count; ++k) { + knot_rrset_t *rrs = // vv FIXME?? + /*const-cast*/(knot_rrset_t *)knot_pkt_rr(sec, k); + /* We need to be careful: due to enforcing minimum TTL + * on packet, some records may be below that value. + * We keep those records at TTL 0. */ + if (rrs->ttl >= drift) { + rrs->ttl -= drift; + } else { + rrs->ttl = 0; + } + } + } + + /* Finishing touches. TODO: perhaps factor out */ + struct kr_qflags * const qf = &qry->flags; + qf->EXPIRING = is_expiring(eh->ttl, new_ttl); + qf->CACHED = true; + qf->NO_MINIMIZE = true; + qf->DNSSEC_INSECURE = kr_rank_test(eh->rank, KR_RANK_INSECURE); + qf->DNSSEC_BOGUS = kr_rank_test(eh->rank, KR_RANK_BOGUS); + if (qf->DNSSEC_INSECURE || qf->DNSSEC_BOGUS) { + qf->DNSSEC_WANT = false; + } + qf->DNSSEC_OPTOUT = eh->has_optout; + VERBOSE_MSG(qry, "=> satisfied by exact packet: rank 0%.2o, new TTL %d\n", + eh->rank, new_ttl); + return kr_ok(); +} + diff --git a/lib/cache/entry_rr.c b/lib/cache/entry_rr.c new file mode 100644 index 0000000..94dd0e8 --- /dev/null +++ b/lib/cache/entry_rr.c @@ -0,0 +1,146 @@ +/* Copyright (C) 2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +/** @file + * Implementation of RRset (de)materialization, i.e. (de)serialization to storage + * format used in cache (some repeated fields are omitted). Prototypes in ./impl.h + */ + +#include "lib/cache/impl.h" + +int rdataset_dematerialize(const knot_rdataset_t *rds, uint8_t * restrict data) +{ + /* FIXME: either give up on even alignment and thus direct usability + * of rdatasets as they are in lmdb, or align inside cdb_* functions + * (request sizes one byte longer and shift iff on an odd address). */ + //if ((size_t)data & 1) VERBOSE_MSG(NULL, "dematerialize: odd address\n"); + //const uint8_t *data0 = data; + if (!data) { + assert(data); + return kr_error(EINVAL); + } + const uint16_t rr_count = rds ? rds->count : 0; + memcpy(data, &rr_count, sizeof(rr_count)); + data += sizeof(rr_count); + if (rr_count) { + size_t size = knot_rdataset_size(rds); + memcpy(data, rds->rdata, size); + data += size; + } + //VERBOSE_MSG(NULL, "dematerialized to %d B\n", (int)(data - data0)); + (void)data; + return kr_ok(); +} + +/** Materialize a knot_rdataset_t from cache with given TTL. + * Return the number of bytes consumed or an error code. + */ +static int rdataset_materialize(knot_rdataset_t * restrict rds, const uint8_t * const data, + const uint8_t *data_bound, knot_mm_t *pool) +{ + assert(rds && data && data_bound && data_bound > data && !rds->rdata + /*&& !((size_t)data & 1)*/); + assert(pool); /* not required, but that's our current usage; guard leaks */ + const uint8_t *d = data; /* iterates over the cache data */ + { + uint16_t rr_count; + memcpy(&rr_count, d, sizeof(rr_count)); + d += sizeof(rr_count); + rds->count = rr_count; + if (!rr_count) { /* avoid mm_alloc(pool, 0); etc. */ + return d - data; + } + } + /* First sum up the sizes for wire format length. */ + const knot_rdataset_t rds_tmp = { + .count = rds->count, + .rdata = (knot_rdata_t *)d, + }; + size_t rds_size = knot_rdataset_size(&rds_tmp); /* TODO: we might overrun here already, + but we need to trust cache anyway...*/ + if (d + rds_size > data_bound) { + VERBOSE_MSG(NULL, "materialize: EILSEQ!\n"); + return kr_error(EILSEQ); + } + rds->rdata = mm_alloc(pool, rds_size); + if (!rds->rdata) { + return kr_error(ENOMEM); + } + memcpy(rds->rdata, d, rds_size); + d += rds_size; + //VERBOSE_MSG(NULL, "materialized from %d B\n", (int)(d - data)); + return d - data; +} + +int kr_cache_materialize(knot_rdataset_t *dst, const struct kr_cache_p *ref, + knot_mm_t *pool) +{ + struct entry_h *eh = ref->raw_data; + return rdataset_materialize(dst, eh->data, ref->raw_bound, pool); +} + + +int entry2answer(struct answer *ans, int id, + const struct entry_h *eh, const uint8_t *eh_bound, + const knot_dname_t *owner, uint16_t type, uint32_t new_ttl) +{ + /* We assume it's zeroed. Do basic sanity check. */ + if (ans->rrsets[id].set.rr || ans->rrsets[id].sig_rds.rdata + || (type == KNOT_RRTYPE_NSEC && ans->nsec_p.raw) + || (type == KNOT_RRTYPE_NSEC3 && !ans->nsec_p.raw) + ) + { + assert(false); + return kr_error(EINVAL); + } + /* Materialize the base RRset. */ + knot_rrset_t *rr = ans->rrsets[id].set.rr + = knot_rrset_new(owner, type, KNOT_CLASS_IN, new_ttl, ans->mm); + if (!rr) { + assert(!ENOMEM); + return kr_error(ENOMEM); + } + int ret = rdataset_materialize(&rr->rrs, eh->data, eh_bound, ans->mm); + if (ret < 0) goto fail; + size_t data_off = ret; + ans->rrsets[id].set.rank = eh->rank; + ans->rrsets[id].set.expiring = is_expiring(eh->ttl, new_ttl); + /* Materialize the RRSIG RRset for the answer in (pseudo-)packet. */ + bool want_rrsigs = true; /* LATER(optim.): might be omitted in some cases. */ + if (want_rrsigs) { + ret = rdataset_materialize(&ans->rrsets[id].sig_rds, eh->data + data_off, + eh_bound, ans->mm); + if (ret < 0) goto fail; + /* Sanity check: we consumed exactly all data. */ + int unused_bytes = eh_bound - (uint8_t *)eh->data - data_off - ret; + if (unused_bytes) { + kr_log_error("[cach] entry2answer ERROR: unused bytes: %d\n", + unused_bytes); + assert(!EILSEQ); + ret = kr_error(EILSEQ); + goto fail; /* to be on the safe side */ + } + } + return kr_ok(); +fail: + assert(/*false*/!ret); + /* Cleanup the item that we might've (partially) written to. */ + knot_rrset_free(ans->rrsets[id].set.rr, ans->mm); + knot_rdataset_clear(&ans->rrsets[id].sig_rds, ans->mm); + memset(&ans->rrsets[id], 0, sizeof(ans->rrsets[id])); + return kr_error(ret); +} + diff --git a/lib/cache/impl.h b/lib/cache/impl.h new file mode 100644 index 0000000..a08f355 --- /dev/null +++ b/lib/cache/impl.h @@ -0,0 +1,412 @@ +/* Copyright (C) 2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +/** @file + * Header internal for cache implementation(s). + * Only LMDB works for now. + */ +#pragma once + +#include <stdbool.h> +#include <stdint.h> + +#include <libdnssec/error.h> +#include <libdnssec/nsec.h> +#include <libknot/consts.h> +#include <libknot/db/db.h> +#include <libknot/dname.h> + +#include "contrib/cleanup.h" +#include "contrib/murmurhash3/murmurhash3.h" /* hash() for nsec_p_hash() */ +#include "lib/cache/cdb_api.h" +#include "lib/resolve.h" + +/* Cache entry values - binary layout. + * + * It depends on type which is recognizable by the key. + * Code depending on the contents of the key is marked by CACHE_KEY_DEF. + * + * 'E' entry (exact hit): + * - ktype == NS: struct entry_apex - multiple types inside (NS and xNAME); + * - ktype != NS: struct entry_h + * * is_packet: uint16_t length, the rest is opaque and handled by ./entry_pkt.c + * * otherwise RRset + its RRSIG set (possibly empty). + * '1' or '3' entry (NSEC or NSEC3) + * - struct entry_h, contents is the same as for exact hit + * - flags don't make sense there + */ + +struct entry_h { + uint32_t time; /**< The time of inception. */ + uint32_t ttl; /**< TTL at inception moment. Assuming it fits into int32_t ATM. */ + uint8_t rank : 6; /**< See enum kr_rank */ + bool is_packet : 1; /**< Negative-answer packet for insecure/bogus name. */ + bool has_optout : 1; /**< Only for packets; persisted DNSSEC_OPTOUT. */ + uint8_t _pad; /**< We need even alignment for data now. */ + uint8_t data[]; +}; +struct entry_apex; + +/** Check basic consistency of entry_h for 'E' entries, not looking into ->data. + * (for is_packet the length of data is checked) + */ +struct entry_h * entry_h_consistent(knot_db_val_t data, uint16_t type); + +struct entry_apex * entry_apex_consistent(knot_db_val_t val); + +/** Consistency check, ATM common for NSEC and NSEC3. */ +static inline struct entry_h * entry_h_consistent_NSEC(knot_db_val_t data) +{ + /* ATM it's enough to just extend the checks for exact entries. */ + const struct entry_h *eh = entry_h_consistent(data, KNOT_RRTYPE_NSEC); + bool ok = eh != NULL; + ok = ok && !eh->is_packet && !eh->has_optout; + return ok ? /*const-cast*/(struct entry_h *)eh : NULL; +} + + +/* nsec_p* - NSEC* chain parameters */ + +static inline int nsec_p_rdlen(const uint8_t *rdata) +{ + //TODO: do we really need the zero case? + return rdata ? 5 + rdata[4] : 0; /* rfc5155 4.2 and 3.2. */ +} +static const int NSEC_P_MAXLEN = sizeof(uint32_t) + 5 + 255; // TODO: remove?? + +/** Hash of NSEC3 parameters, used as a tag to separate different chains for same zone. */ +typedef uint32_t nsec_p_hash_t; +static inline nsec_p_hash_t nsec_p_mkHash(const uint8_t *nsec_p) +{ + assert(nsec_p && !(KNOT_NSEC3_FLAG_OPT_OUT & nsec_p[1])); + return hash((const char *)nsec_p, nsec_p_rdlen(nsec_p)); +} + +/** NSEC* parameters for the chain. */ +struct nsec_p { + const uint8_t *raw; /**< Pointer to raw NSEC3 parameters; NULL for NSEC. */ + nsec_p_hash_t hash; /**< Hash of `raw`, used for cache keys. */ + dnssec_nsec3_params_t libknot; /**< Format for libknot; owns malloced memory! */ +}; + + + +/** LATER(optim.): this is overshot, but struct key usage should be cheap ATM. */ +#define KR_CACHE_KEY_MAXLEN (KNOT_DNAME_MAXLEN + 100) /* CACHE_KEY_DEF */ + +struct key { + const knot_dname_t *zname; /**< current zone name (points within qry->sname) */ + uint8_t zlf_len; /**< length of current zone's lookup format */ + + /** Corresponding key type; e.g. NS for CNAME. + * Note: NSEC type is ambiguous (exact and range key). */ + uint16_t type; + /** The key data start at buf+1, and buf[0] contains some length. + * For details see key_exact* and key_NSEC* functions. */ + uint8_t buf[KR_CACHE_KEY_MAXLEN]; + /* LATER(opt.): ^^ probably change the anchoring, so that kr_dname_lf() + * doesn't need to move data after knot_dname_lf(). */ +}; + +static inline size_t key_nwz_off(const struct key *k) +{ + /* CACHE_KEY_DEF: zone name lf + 0 ('1' or '3'). + * NSEC '1' case continues just with the name within zone. */ + return k->zlf_len + 2; +} +static inline size_t key_nsec3_hash_off(const struct key *k) +{ + /* CACHE_KEY_DEF NSEC3: tag (nsec_p_hash_t) + 20 bytes NSEC3 name hash) */ + return key_nwz_off(k) + sizeof(nsec_p_hash_t); +} +/** Hash is always SHA1; I see no plans to standardize anything else. + * https://www.iana.org/assignments/dnssec-nsec3-parameters/dnssec-nsec3-parameters.xhtml#dnssec-nsec3-parameters-3 + */ +static const int NSEC3_HASH_LEN = 20, + NSEC3_HASH_TXT_LEN = 32; + +/** Finish constructing string key for for exact search. + * It's assumed that kr_dname_lf(k->buf, owner, *) had been ran. + */ +knot_db_val_t key_exact_type_maypkt(struct key *k, uint16_t type); + +/** Like key_exact_type_maypkt but with extra checks if used for RRs only. */ +static inline knot_db_val_t key_exact_type(struct key *k, uint16_t type) +{ + switch (type) { + /* Sanity check: forbidden types represented in other way(s). */ + case KNOT_RRTYPE_NSEC: + case KNOT_RRTYPE_NSEC3: + assert(false); + return (knot_db_val_t){ NULL, 0 }; + } + return key_exact_type_maypkt(k, type); +} + + +/* entry_h chaining; implementation in ./entry_list.c */ + +enum { ENTRY_APEX_NSECS_CNT = 2 }; + +/** Header of 'E' entry with ktype == NS. Inside is private to ./entry_list.c + * + * We store xNAME at NS type to lower the number of searches in closest_NS(). + * CNAME is only considered for equal name, of course. + * We also store NSEC* parameters at NS type. + */ +struct entry_apex { + /* ENTRY_H_FLAGS */ + bool has_ns : 1; + bool has_cname : 1; + bool has_dname : 1; + + uint8_t pad_; /**< Weird: 1 byte + 2 bytes + x bytes; let's do 2+2+x. */ + int8_t nsecs[ENTRY_APEX_NSECS_CNT]; /**< values: 0: none, 1: NSEC, 3: NSEC3 */ + uint8_t data[]; + /* XXX: if not first, stamp of last being the first? + * Purpose: save cache operations if rolled the algo/params long ago. */ +}; + +/** Indices for decompressed entry_list_t. */ +enum EL { + EL_NS = ENTRY_APEX_NSECS_CNT, + EL_CNAME, + EL_DNAME, + EL_LENGTH +}; +/** Decompressed entry_apex. It's an array of unparsed entry_h references. + * Note: arrays are passed "by reference" to functions (in C99). */ +typedef knot_db_val_t entry_list_t[EL_LENGTH]; + +static inline uint16_t EL2RRTYPE(enum EL i) +{ + switch (i) { + case EL_NS: return KNOT_RRTYPE_NS; + case EL_CNAME: return KNOT_RRTYPE_CNAME; + case EL_DNAME: return KNOT_RRTYPE_DNAME; + default: assert(false); return 0; + } +} + +/** There may be multiple entries within, so rewind `val` to the one we want. + * + * ATM there are multiple types only for the NS ktype - it also accommodates xNAMEs. + * \note `val->len` represents the bound of the whole list, not of a single entry. + * \note in case of ENOENT, `val` is still rewound to the beginning of the next entry. + * \return error code + * TODO: maybe get rid of this API? + */ +int entry_h_seek(knot_db_val_t *val, uint16_t type); + +/** Prepare space to insert an entry. + * + * Some checks are performed (rank, TTL), the current entry in cache is copied + * with a hole ready for the new entry (old one of the same type is cut out). + * + * \param val_new_entry The only changing parameter; ->len is read, ->data written. + * \return error code + */ +int entry_h_splice( + knot_db_val_t *val_new_entry, uint8_t rank, + const knot_db_val_t key, const uint16_t ktype, const uint16_t type, + const knot_dname_t *owner/*log only*/, + const struct kr_query *qry, struct kr_cache *cache, uint32_t timestamp); + +/** Parse an entry_apex into individual items. @return error code. */ +int entry_list_parse(const knot_db_val_t val, entry_list_t list); + +static inline size_t to_even(size_t n) +{ + return n + (n & 1); +} + +static inline int entry_list_serial_size(const entry_list_t list) +{ + int size = offsetof(struct entry_apex, data); + for (int i = 0; i < EL_LENGTH; ++i) { + size += to_even(list[i].len); + } + return size; +} + +/** Fill contents of an entry_apex. + * + * @note NULL pointers are overwritten - caller may like to fill the space later. + */ +void entry_list_memcpy(struct entry_apex *ea, entry_list_t list); + + + +/* Packet caching; implementation in ./entry_pkt.c */ + +/** Stash the packet into cache (if suitable, etc.) + * \param has_optout whether the packet contains an opt-out NSEC3 */ +void stash_pkt(const knot_pkt_t *pkt, const struct kr_query *qry, + const struct kr_request *req, bool has_optout); + +/** Try answering from packet cache, given an entry_h. + * + * This assumes the TTL is OK and entry_h_consistent, but it may still return error. + * On success it handles all the rest, incl. qry->flags. + */ +int answer_from_pkt(kr_layer_t *ctx, knot_pkt_t *pkt, uint16_t type, + const struct entry_h *eh, const void *eh_bound, uint32_t new_ttl); + + +/** Record is expiring if it has less than 1% TTL (or less than 5s) */ +static inline bool is_expiring(uint32_t orig_ttl, uint32_t new_ttl) +{ + int64_t nttl = new_ttl; /* avoid potential over/under-flow */ + return 100 * (nttl - 5) < orig_ttl; +} + +/** Returns signed result so you can inspect how much stale the RR is. + * + * @param owner name for stale-serving decisions. You may pass NULL to disable stale. + * @note: NSEC* uses zone name ATM; for NSEC3 the owner may not even be knowable. + * @param type for stale-serving. + */ +int32_t get_new_ttl(const struct entry_h *entry, const struct kr_query *qry, + const knot_dname_t *owner, uint16_t type, uint32_t now); + + +/* RRset (de)materialization; implementation in ./entry_rr.c */ + +/** Size of the RR count field */ +#define KR_CACHE_RR_COUNT_SIZE sizeof(uint16_t) + +/** Compute size of serialized rdataset. NULL is accepted as empty set. */ +static inline int rdataset_dematerialize_size(const knot_rdataset_t *rds) +{ + return KR_CACHE_RR_COUNT_SIZE + (rds == NULL ? 0 : knot_rdataset_size(rds)); +} + +static inline int rdataset_dematerialized_size(const uint8_t *data) +{ + knot_rdataset_t rds; + memcpy(&rds.count, data, sizeof(rds.count)); + rds.rdata = (knot_rdata_t *)(data + sizeof(rds.count)); + return sizeof(rds.count) + knot_rdataset_size(&rds); +} + +/** Serialize an rdataset. */ +int rdataset_dematerialize(const knot_rdataset_t *rds, uint8_t * restrict data); + + +/** Partially constructed answer when gathering RRsets from cache. */ +struct answer { + int rcode; /**< PKT_NODATA, etc. */ + struct nsec_p nsec_p; /**< Don't mix different NSEC* parameters in one answer. */ + knot_mm_t *mm; /**< Allocator for rrsets */ + struct answer_rrset { + ranked_rr_array_entry_t set; /**< set+rank for the main data */ + knot_rdataset_t sig_rds; /**< RRSIG data, if any */ + } rrsets[1+1+3]; /**< see AR_ANSWER and friends; only required records are filled */ +}; +enum { + AR_ANSWER = 0, /**< Positive answer record. It might be wildcard-expanded. */ + AR_SOA, /**< SOA record. */ + AR_NSEC, /**< NSEC* covering or matching the SNAME (next closer name in NSEC3 case). */ + AR_WILD, /**< NSEC* covering or matching the source of synthesis. */ + AR_CPE, /**< NSEC3 matching the closest provable encloser. */ +}; + +/** Materialize RRset + RRSIGs into ans->rrsets[id]. + * LATER(optim.): it's slightly wasteful that we allocate knot_rrset_t for the packet + * + * \return error code. They are all bad conditions and "guarded" by assert. + */ +int entry2answer(struct answer *ans, int id, + const struct entry_h *eh, const uint8_t *eh_bound, + const knot_dname_t *owner, uint16_t type, uint32_t new_ttl); + + +/* Preparing knot_pkt_t for cache answer from RRs; implementation in ./knot_pkt.c */ + +/** Prepare answer packet to be filled by RRs (without RR data in wire). */ +int pkt_renew(knot_pkt_t *pkt, const knot_dname_t *name, uint16_t type); + +/** Append RRset + its RRSIGs into the current section (*shallow* copy), with given rank. + * \note it works with empty set as well (skipped) + * \note pkt->wire is not updated in any way + * \note KNOT_CLASS_IN is assumed + */ +int pkt_append(knot_pkt_t *pkt, const struct answer_rrset *rrset, uint8_t rank); + + + +/* NSEC (1) stuff. Implementation in ./nsec1.c */ + +/** Construct a string key for for NSEC (1) predecessor-search. + * \param add_wildcard Act as if the name was extended by "*." + * \note k->zlf_len is assumed to have been correctly set */ +knot_db_val_t key_NSEC1(struct key *k, const knot_dname_t *name, bool add_wildcard); + +/** Closest encloser check for NSEC (1). + * To understand the interface, see the call point. + * \param k space to store key + input: zname and zlf_len + * \return 0: success; >0: try other (NSEC3); <0: exit cache immediately. */ +int nsec1_encloser(struct key *k, struct answer *ans, + const int sname_labels, int *clencl_labels, + knot_db_val_t *cover_low_kwz, knot_db_val_t *cover_hi_kwz, + const struct kr_query *qry, struct kr_cache *cache); + +/** Source of synthesis (SS) check for NSEC (1). + * To understand the interface, see the call point. + * \return 0: continue; <0: exit cache immediately; + * AR_SOA: skip to adding SOA (SS was covered or matched for NODATA). */ +int nsec1_src_synth(struct key *k, struct answer *ans, const knot_dname_t *clencl_name, + knot_db_val_t cover_low_kwz, knot_db_val_t cover_hi_kwz, + const struct kr_query *qry, struct kr_cache *cache); + + +/* NSEC3 stuff. Implementation in ./nsec3.c */ + +/** Construct a string key for for NSEC3 predecessor-search, from an NSEC3 name. + * \note k->zlf_len is assumed to have been correctly set */ +knot_db_val_t key_NSEC3(struct key *k, const knot_dname_t *nsec3_name, + const nsec_p_hash_t nsec_p_hash); + +/** TODO. See nsec1_encloser(...) */ +int nsec3_encloser(struct key *k, struct answer *ans, + const int sname_labels, int *clencl_labels, + const struct kr_query *qry, struct kr_cache *cache); + +/** TODO. See nsec1_src_synth(...) */ +int nsec3_src_synth(struct key *k, struct answer *ans, const knot_dname_t *clencl_name, + const struct kr_query *qry, struct kr_cache *cache); + + + +#define VERBOSE_MSG(qry, ...) QRVERBOSE((qry), "cach", ## __VA_ARGS__) + +/** Shorthand for operations on cache backend */ +#define cache_op(cache, op, ...) (cache)->api->op((cache)->db, ## __VA_ARGS__) + + +static inline uint16_t get_uint16(const void *address) +{ + uint16_t tmp; + memcpy(&tmp, address, sizeof(tmp)); + return tmp; +} + +/** Useful pattern, especially as void-pointer arithmetic isn't standard-compliant. */ +static inline uint8_t * knot_db_val_bound(knot_db_val_t val) +{ + return (uint8_t *)val.data + val.len; +} + diff --git a/lib/cache/knot_pkt.c b/lib/cache/knot_pkt.c new file mode 100644 index 0000000..56836a6 --- /dev/null +++ b/lib/cache/knot_pkt.c @@ -0,0 +1,106 @@ +/* Copyright (C) 2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +/** @file + * Implementation of preparing knot_pkt_t for filling with RRs. + * Prototypes in ./impl.h + */ + +#include "lib/cache/impl.h" + +int pkt_renew(knot_pkt_t *pkt, const knot_dname_t *name, uint16_t type) +{ + /* Update packet question if needed. */ + if (!knot_dname_is_equal(knot_pkt_qname(pkt), name) + || knot_pkt_qtype(pkt) != type || knot_pkt_qclass(pkt) != KNOT_CLASS_IN) { + int ret = kr_pkt_recycle(pkt); + if (ret) return kr_error(ret); + ret = knot_pkt_put_question(pkt, name, KNOT_CLASS_IN, type); + if (ret) return kr_error(ret); + } + + pkt->parsed = pkt->size = PKT_SIZE_NOWIRE; + knot_wire_set_qr(pkt->wire); + knot_wire_set_aa(pkt->wire); + return kr_ok(); +} + +/** Reserve space for additional `count` RRsets. + * \note pkt->rr_info gets correct length but is always zeroed + */ +static int pkt_alloc_space(knot_pkt_t *pkt, int count) +{ + size_t allocd_orig = pkt->rrset_allocd; + if (pkt->rrset_count + count <= allocd_orig) { + return kr_ok(); + } + /* A simple growth strategy, amortized O(count). */ + pkt->rrset_allocd = MAX( + pkt->rrset_count + count, + pkt->rrset_count + allocd_orig); + + pkt->rr = mm_realloc(&pkt->mm, pkt->rr, + sizeof(pkt->rr[0]) * pkt->rrset_allocd, + sizeof(pkt->rr[0]) * allocd_orig); + if (!pkt->rr) { + return kr_error(ENOMEM); + } + /* Allocate pkt->rr_info to be certain, but just leave it zeroed. */ + mm_free(&pkt->mm, pkt->rr_info); + pkt->rr_info = mm_alloc(&pkt->mm, sizeof(pkt->rr_info[0]) * pkt->rrset_allocd); + if (!pkt->rr_info) { + return kr_error(ENOMEM); + } + memset(pkt->rr_info, 0, sizeof(pkt->rr_info[0]) * pkt->rrset_allocd); + return kr_ok(); +} + +int pkt_append(knot_pkt_t *pkt, const struct answer_rrset *rrset, uint8_t rank) +{ + /* allocate space, to be sure */ + int rrset_cnt = (rrset->set.rr->rrs.count > 0) + (rrset->sig_rds.count > 0); + int ret = pkt_alloc_space(pkt, rrset_cnt); + if (ret) return kr_error(ret); + /* write both sets */ + const knot_rdataset_t *rdss[2] = { &rrset->set.rr->rrs, &rrset->sig_rds }; + for (int i = 0; i < rrset_cnt; ++i) { + assert(rdss[i]->count); + /* allocate rank */ + uint8_t *rr_rank = mm_alloc(&pkt->mm, sizeof(*rr_rank)); + if (!rr_rank) return kr_error(ENOMEM); + *rr_rank = (i == 0) ? rank : (KR_RANK_OMIT | KR_RANK_AUTH); + /* rank for RRSIGs isn't really useful: ^^ */ + if (i == 0) { + pkt->rr[pkt->rrset_count] = *rrset->set.rr; + pkt->rr[pkt->rrset_count].additional = rr_rank; + } else { + /* append the RR array */ + pkt->rr[pkt->rrset_count] = (knot_rrset_t){ + .owner = knot_dname_copy(rrset->set.rr->owner, &pkt->mm), + /* ^^ well, another copy isn't really needed */ + .ttl = rrset->set.rr->ttl, + .type = KNOT_RRTYPE_RRSIG, + .rclass = KNOT_CLASS_IN, + .rrs = *rdss[i], + .additional = rr_rank, + }; + } + ++pkt->rrset_count; + ++(pkt->sections[pkt->current].count); + } + return kr_ok(); +} + diff --git a/lib/cache/nsec1.c b/lib/cache/nsec1.c new file mode 100644 index 0000000..74b3d34 --- /dev/null +++ b/lib/cache/nsec1.c @@ -0,0 +1,511 @@ +/* Copyright (C) 2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +/** @file + * Implementation of NSEC (1) handling. Prototypes in ./impl.h + */ + +#include "lib/cache/impl.h" +#include "lib/dnssec/nsec.h" +#include "lib/layer/iterate.h" + + +/** Reconstruct a name into a buffer (assuming length at least KNOT_DNAME_MAXLEN). + * \return kr_ok() or error code (<0). */ +static int dname_wire_reconstruct(knot_dname_t *buf, const struct key *k, + knot_db_val_t kwz) +{ + /* Reconstruct from key: first the ending, then zone name. */ + int ret = knot_dname_lf2wire(buf, kwz.len, kwz.data); + if (ret < 0) { + VERBOSE_MSG(NULL, "=> NSEC: LF2wire ret = %d\n", ret); + assert(false); + return ret; + } + /* The last written byte is the zero label for root -> overwrite. */ + knot_dname_t *zone_start = buf + ret - 1; + assert(*zone_start == '\0'); + ret = knot_dname_to_wire(zone_start, k->zname, KNOT_DNAME_MAXLEN - kwz.len); + if (ret != k->zlf_len + 1) { + assert(false); + return ret < 0 ? ret : kr_error(EILSEQ); + } + return kr_ok(); +} + + +knot_db_val_t key_NSEC1(struct key *k, const knot_dname_t *name, bool add_wildcard) +{ + /* we basically need dname_lf with two bytes added + * on a correct place within the name (the cut) */ + int ret; + const bool ok = k && name + && !(ret = kr_dname_lf(k->buf, name, add_wildcard)); + if (!ok) { + assert(false); + return (knot_db_val_t){ NULL, 0 }; + } + + uint8_t *begin = k->buf + 1 + k->zlf_len; /* one byte after zone's zero */ + uint8_t *end = k->buf + 1 + k->buf[0]; /* we don't use the final zero in key, + * but move it anyway */ + if (end < begin) { + assert(false); + return (knot_db_val_t){ NULL, 0 }; + } + int key_len; + if (end > begin) { + memmove(begin + 2, begin, end - begin); + key_len = k->buf[0] + 1; + } else { + key_len = k->buf[0] + 2; + } + /* CACHE_KEY_DEF: key == zone's dname_lf + '\0' + '1' + dname_lf + * of the name within the zone without the final 0. Iff the latter is empty, + * there's no zero to cut and thus the key_len difference. + */ + begin[0] = 0; + begin[1] = '1'; /* tag for NSEC1 */ + k->type = KNOT_RRTYPE_NSEC; + + /* + VERBOSE_MSG(NULL, "<> key_NSEC1; name: "); + kr_dname_print(name, add_wildcard ? "*." : "" , " "); + kr_log_verbose("(zone name LF length: %d; total key length: %d)\n", + k->zlf_len, key_len); + */ + + return (knot_db_val_t){ k->buf + 1, key_len }; +} + + +/** Assuming that k1 < k4, find where k2 is. (Considers DNS wrap-around.) + * + * \return Intuition: position of k2 among kX. + * 0: k2 < k1; 1: k1 == k2; 2: k1 is a prefix of k2 < k4; + * 3: k1 < k2 < k4 (and not 2); 4: k2 == k4; 5: k2 > k4 + * \note k1.data may be NULL, meaning assumption that k1 < k2 and not a prefix + * (i.e. return code will be > 2) + */ +static int kwz_between(knot_db_val_t k1, knot_db_val_t k2, knot_db_val_t k4) +{ + assert(k2.data && k4.data); + /* CACHE_KEY_DEF; we need to beware of one key being a prefix of another */ + int ret_maybe; /**< result, assuming we confirm k2 < k4 */ + if (k1.data) { + const int cmp12 = memcmp(k1.data, k2.data, MIN(k1.len, k2.len)); + if (cmp12 == 0 && k1.len == k2.len) /* iff k1 == k2 */ + return 1; + if (cmp12 > 0 || (cmp12 == 0 && k1.len > k2.len)) /* iff k1 > k2 */ + return 0; + ret_maybe = cmp12 == 0 ? 2 : 3; + } else { + ret_maybe = 3; + } + if (k4.len == 0) { /* wrap-around */ + return k2.len > 0 ? ret_maybe : 4; + } else { + const int cmp24 = memcmp(k2.data, k4.data, MIN(k2.len, k4.len)); + if (cmp24 == 0 && k2.len == k4.len) /* iff k2 == k4 */ + return 4; + if (cmp24 > 0 || (cmp24 == 0 && k2.len > k4.len)) /* iff k2 > k4 */ + return 5; + return ret_maybe; + } +} + + +/** NSEC1 range search. + * + * \param key Pass output of key_NSEC1(k, ...) + * \param value[out] The raw data of the NSEC cache record (optional; consistency checked). + * \param exact_match[out] Whether the key was matched exactly or just covered (optional). + * \param kwz_low[out] Output the low end of covering NSEC, pointing within DB (optional). + * \param kwz_high[in,out] Storage for the high end of covering NSEC (optional). + * It's only set if !exact_match. + * \param new_ttl[out] New TTL of the NSEC (optional). + * \return Error message or NULL. + * \note The function itself does *no* bitmap checks, e.g. RFC 6840 sec. 4. + */ +static const char * find_leq_NSEC1(struct kr_cache *cache, const struct kr_query *qry, + const knot_db_val_t key, const struct key *k, knot_db_val_t *value, + bool *exact_match, knot_db_val_t *kwz_low, knot_db_val_t *kwz_high, + uint32_t *new_ttl) +{ + /* Do the cache operation. */ + const size_t nwz_off = key_nwz_off(k); + if (!key.data || key.len < nwz_off) { + assert(false); + return "range search ERROR"; + } + knot_db_val_t key_nsec = key; + knot_db_val_t val = { NULL, 0 }; + int ret = cache_op(cache, read_leq, &key_nsec, &val); + if (ret < 0) { + if (ret == kr_error(ENOENT)) { + return "range search miss"; + } else { + assert(false); + return "range search ERROR"; + } + } + if (value) { + *value = val; + } + /* Check consistency, TTL, rank. */ + const bool is_exact = (ret == 0); + if (exact_match) { + *exact_match = is_exact; + } + const struct entry_h *eh = entry_h_consistent_NSEC(val); + if (!eh) { + /* This might be just finding something else than NSEC1 entry, + * in case we searched before the very first one in the zone. */ + return "range search found inconsistent entry"; + } + /* Passing just zone name instead of owner, as we don't + * have it reconstructed at this point. */ + int32_t new_ttl_ = get_new_ttl(eh, qry, k->zname, KNOT_RRTYPE_NSEC, + qry->timestamp.tv_sec); + if (new_ttl_ < 0 || !kr_rank_test(eh->rank, KR_RANK_SECURE)) { + return "range search found stale or insecure entry"; + /* TODO: remove the stale record *and* retry, + * in case we haven't run off. Perhaps start by in_zone check. */ + } + if (new_ttl) { + *new_ttl = new_ttl_; + } + if (kwz_low) { + *kwz_low = (knot_db_val_t){ + .data = (uint8_t *)key_nsec.data + nwz_off, + .len = key_nsec.len - nwz_off, + }; /* CACHE_KEY_DEF */ + } + if (is_exact) { + /* Nothing else to do. */ + return NULL; + } + /* The NSEC starts strictly before our target name; + * now check that it still belongs into that zone. */ + const bool nsec_in_zone = key_nsec.len >= nwz_off + /* CACHE_KEY_DEF */ + && memcmp(key.data, key_nsec.data, nwz_off) == 0; + if (!nsec_in_zone) { + return "range search miss (!nsec_in_zone)"; + } + /* We know it starts before sname, so let's check the other end. + * 1. construct the key for the next name - kwz_hi. */ + /* it's *full* name ATM */ + const knot_rdata_t *next = (const knot_rdata_t *) + (eh->data + KR_CACHE_RR_COUNT_SIZE); + if (KR_CACHE_RR_COUNT_SIZE != 2 || get_uint16(eh->data) == 0) { + assert(false); + return "ERROR"; + /* TODO: more checks? */ + } + /* + WITH_VERBOSE { + VERBOSE_MSG(qry, "=> NSEC: next name: "); + kr_dname_print(next, "", "\n"); + } + */ + knot_dname_t ch_buf[KNOT_DNAME_MAXLEN]; + knot_dname_t *chs = kwz_high ? kwz_high->data : ch_buf; + if (!chs) { + assert(false); + return "EINVAL"; + } + { + /* Lower-case chs; see also RFC 6840 5.1. + * LATER(optim.): we do lots of copying etc. */ + knot_dname_t lower_buf[KNOT_DNAME_MAXLEN]; + ret = knot_dname_to_wire(lower_buf, next->data, + MIN(next->len, KNOT_DNAME_MAXLEN)); + if (ret < 0) { /* _ESPACE */ + return "range search found record with incorrect contents"; + } + knot_dname_to_lower(lower_buf); + ret = kr_dname_lf(chs, lower_buf, false); + } + if (ret) { + assert(false); + return "ERROR"; + } + knot_db_val_t kwz_hi = { /* skip the zone name */ + .data = chs + 1 + k->zlf_len, + .len = chs[0] - k->zlf_len, + }; + assert((ssize_t)(kwz_hi.len) >= 0); + /* 2. do the actual range check. */ + const knot_db_val_t kwz_sname = { + .data = (void *)/*const-cast*/(k->buf + 1 + nwz_off), + .len = k->buf[0] - k->zlf_len, + }; + assert((ssize_t)(kwz_sname.len) >= 0); + bool covers = /* we know for sure that the low end is before kwz_sname */ + 3 == kwz_between((knot_db_val_t){ NULL, 0 }, kwz_sname, kwz_hi); + if (!covers) { + return "range search miss (!covers)"; + } + if (kwz_high) { + *kwz_high = kwz_hi; + } + return NULL; +} + + +int nsec1_encloser(struct key *k, struct answer *ans, + const int sname_labels, int *clencl_labels, + knot_db_val_t *cover_low_kwz, knot_db_val_t *cover_hi_kwz, + const struct kr_query *qry, struct kr_cache *cache) +{ + static const int ESKIP = ABS(ENOENT); + /* Basic sanity check. */ + const bool ok = k && ans && clencl_labels && cover_low_kwz && cover_hi_kwz + && qry && cache; + if (!ok) { + assert(!EINVAL); + return kr_error(EINVAL); + } + + /* Find a previous-or-equal name+NSEC in cache covering the QNAME, + * checking TTL etc. */ + knot_db_val_t key = key_NSEC1(k, qry->sname, false); + knot_db_val_t val = { NULL, 0 }; + bool exact_match; + uint32_t new_ttl; + const char *err = find_leq_NSEC1(cache, qry, key, k, &val, + &exact_match, cover_low_kwz, cover_hi_kwz, &new_ttl); + if (err) { + VERBOSE_MSG(qry, "=> NSEC sname: %s\n", err); + return ESKIP; + } + + /* Get owner name of the record. */ + const knot_dname_t *owner; + knot_dname_t owner_buf[KNOT_DNAME_MAXLEN]; + if (exact_match) { + owner = qry->sname; + } else { + int ret = dname_wire_reconstruct(owner_buf, k, *cover_low_kwz); + if (unlikely(ret)) return ESKIP; + owner = owner_buf; + } + /* Basic checks OK -> materialize data. */ + { + const struct entry_h *nsec_eh = val.data; + int ret = entry2answer(ans, AR_NSEC, nsec_eh, knot_db_val_bound(val), + owner, KNOT_RRTYPE_NSEC, new_ttl); + if (ret) return kr_error(ret); + } + + /* Final checks, split for matching vs. covering our sname. */ + const knot_rrset_t *nsec_rr = ans->rrsets[AR_NSEC].set.rr; + const uint8_t *bm = knot_nsec_bitmap(nsec_rr->rrs.rdata); + uint16_t bm_size = knot_nsec_bitmap_len(nsec_rr->rrs.rdata); + assert(bm); + + if (exact_match) { + if (kr_nsec_bitmap_nodata_check(bm, bm_size, qry->stype, nsec_rr->owner) != 0) { + VERBOSE_MSG(qry, + "=> NSEC sname: match but failed type check\n"); + return ESKIP; + } + /* NODATA proven; just need to add SOA+RRSIG later */ + VERBOSE_MSG(qry, "=> NSEC sname: match proved NODATA, new TTL %d\n", + new_ttl); + ans->rcode = PKT_NODATA; + return kr_ok(); + } /* else */ + + /* Inexact match. First check if sname is delegated by that NSEC. */ + const int nsec_matched = knot_dname_matched_labels(nsec_rr->owner, qry->sname); + const bool is_sub = nsec_matched == knot_dname_labels(nsec_rr->owner, NULL); + if (is_sub && kr_nsec_children_in_zone_check(bm, bm_size) != 0) { + VERBOSE_MSG(qry, "=> NSEC sname: covered but delegated (or error)\n"); + return ESKIP; + } + /* NXDOMAIN proven *except* for wildcards. */ + WITH_VERBOSE(qry) { + auto_free char *owner_str = kr_dname_text(nsec_rr->owner), + *next_str = kr_dname_text(knot_nsec_next(nsec_rr->rrs.rdata)); + VERBOSE_MSG(qry, "=> NSEC sname: covered by: %s -> %s, new TTL %d\n", + owner_str, next_str, new_ttl); + } + + /* Find label count of the closest encloser. + * Both endpoints in an NSEC do exist (though possibly in a child zone) + * and any prefixes of those names as well (empty non-terminals), + * but nothing else exists inside this "triangle". + * + * Note that we have to lower-case the next name for comparison, + * even though we have canonicalized NSEC already; see RFC 6840 5.1. + * LATER(optim.): it might be faster to use the LFs we already have. + */ + knot_dname_t next[KNOT_DNAME_MAXLEN]; + int ret = knot_dname_to_wire(next, knot_nsec_next(nsec_rr->rrs.rdata), sizeof(next)); + if (ret < 0) { + assert(!ret); + return kr_error(ret); + } + knot_dname_to_lower(next); + *clencl_labels = MAX( + nsec_matched, + knot_dname_matched_labels(qry->sname, next) + ); + + /* Empty non-terminals don't need to have + * a matching NSEC record. */ + if (sname_labels == *clencl_labels) { + ans->rcode = PKT_NODATA; + VERBOSE_MSG(qry, + "=> NSEC sname: empty non-terminal by the same RR\n"); + } else { + ans->rcode = PKT_NXDOMAIN; + } + return kr_ok(); +} + +/** Verify non-existence after kwz_between() call. */ +static bool nonexistence_ok(int cmp, const knot_rrset_t *rrs) +{ + if (cmp == 3) { + return true; + } + if (cmp != 2) { + return false; + } + const uint8_t *bm = knot_nsec_bitmap(rrs->rrs.rdata); + uint16_t bm_size = knot_nsec_bitmap_len(rrs->rrs.rdata); + return kr_nsec_children_in_zone_check(bm, bm_size) != 0; +} + +int nsec1_src_synth(struct key *k, struct answer *ans, const knot_dname_t *clencl_name, + knot_db_val_t cover_low_kwz, knot_db_val_t cover_hi_kwz, + const struct kr_query *qry, struct kr_cache *cache) +{ + /* Construct key for the source of synthesis. */ + knot_db_val_t key = key_NSEC1(k, clencl_name, true); + const size_t nwz_off = key_nwz_off(k); + if (!key.data || key.len < nwz_off) { + assert(false); + return kr_error(1); + } + /* Check if our sname-covering NSEC also covers/matches SS. */ + knot_db_val_t kwz = { + .data = (uint8_t *)key.data + nwz_off, + .len = key.len - nwz_off, + }; + assert((ssize_t)(kwz.len) >= 0); + const int cmp = kwz_between(cover_low_kwz, kwz, cover_hi_kwz); + if (nonexistence_ok(cmp, ans->rrsets[AR_NSEC].set.rr)) { + VERBOSE_MSG(qry, "=> NSEC wildcard: covered by the same RR\n"); + return AR_SOA; + } + const knot_rrset_t *nsec_rr = NULL; /**< the wildcard proof NSEC */ + bool exact_match; /**< whether it matches the source of synthesis */ + if (cmp == 1) { + exact_match = true; + nsec_rr = ans->rrsets[AR_NSEC].set.rr; + } else { + /* Try to find the NSEC for SS. */ + knot_db_val_t val = { NULL, 0 }; + knot_db_val_t wild_low_kwz = { NULL, 0 }; + uint32_t new_ttl; + const char *err = find_leq_NSEC1(cache, qry, key, k, &val, + &exact_match, &wild_low_kwz, NULL, &new_ttl); + if (err) { + VERBOSE_MSG(qry, "=> NSEC wildcard: %s\n", err); + return kr_ok(); + } + /* Materialize the record into answer (speculatively). */ + knot_dname_t owner[KNOT_DNAME_MAXLEN]; + int ret = dname_wire_reconstruct(owner, k, wild_low_kwz); + if (ret) return kr_error(ret); + const struct entry_h *nsec_eh = val.data; + ret = entry2answer(ans, AR_WILD, nsec_eh, knot_db_val_bound(val), + owner, KNOT_RRTYPE_NSEC, new_ttl); + if (ret) return kr_error(ret); + nsec_rr = ans->rrsets[AR_WILD].set.rr; + } + + assert(nsec_rr); + const uint32_t new_ttl_log = + kr_verbose_status ? nsec_rr->ttl : -1; + const uint8_t *bm = knot_nsec_bitmap(nsec_rr->rrs.rdata); + uint16_t bm_size = knot_nsec_bitmap_len(nsec_rr->rrs.rdata); + int ret; + struct answer_rrset * const arw = &ans->rrsets[AR_WILD]; + if (!bm) { + assert(false); + ret = kr_error(1); + goto clean_wild; + } + if (!exact_match) { + /* Finish verification that the source of synthesis doesn't exist. */ + const int nsec_matched = + knot_dname_matched_labels(nsec_rr->owner, clencl_name); + /* we don't need to use the full source of synthesis ^ */ + const bool is_sub = + nsec_matched == knot_dname_labels(nsec_rr->owner, NULL); + if (is_sub && kr_nsec_children_in_zone_check(bm, bm_size) != 0) { + VERBOSE_MSG(qry, + "=> NSEC wildcard: covered but delegated (or error)\n"); + ret = kr_ok(); + goto clean_wild; + } + /* We have a record proving wildcard non-existence. */ + WITH_VERBOSE(qry) { + auto_free char *owner_str = kr_dname_text(nsec_rr->owner), + *next_str = kr_dname_text(knot_nsec_next(nsec_rr->rrs.rdata)); + VERBOSE_MSG(qry, "=> NSEC wildcard: covered by: %s -> %s, new TTL %d\n", + owner_str, next_str, new_ttl_log); + } + return AR_SOA; + } + + /* The wildcard exists. Find if it's NODATA - check type bitmap. */ + if (kr_nsec_bitmap_nodata_check(bm, bm_size, qry->stype, nsec_rr->owner) == 0) { + /* NODATA proven; just need to add SOA+RRSIG later */ + WITH_VERBOSE(qry) { + const char *msg_start = "=> NSEC wildcard: match proved NODATA"; + if (arw->set.rr) { + auto_free char *owner_str = kr_dname_text(nsec_rr->owner); + VERBOSE_MSG(qry, "%s: %s, new TTL %d\n", + msg_start, owner_str, new_ttl_log); + } else { + /* don't repeat the RR if it's the same */ + VERBOSE_MSG(qry, "%s, by the same RR\n", msg_start); + } + } + ans->rcode = PKT_NODATA; + return AR_SOA; + + } /* else */ + /* The data probably exists -> don't add this NSEC + * and (later) try to find the real wildcard data */ + VERBOSE_MSG(qry, "=> NSEC wildcard: should exist (or error)\n"); + ans->rcode = PKT_NOERROR; + ret = kr_ok(); +clean_wild: + if (arw->set.rr) { /* we may have matched AR_NSEC */ + knot_rrset_free(arw->set.rr, ans->mm); + arw->set.rr = NULL; + knot_rdataset_clear(&arw->sig_rds, ans->mm); + } + return ret; +} + diff --git a/lib/cache/nsec3.c b/lib/cache/nsec3.c new file mode 100644 index 0000000..d2ae72a --- /dev/null +++ b/lib/cache/nsec3.c @@ -0,0 +1,501 @@ +/* Copyright (C) 2018 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +/** @file + * Implementation of NSEC3 handling. Prototypes in ./impl.h + */ + +#include "lib/cache/impl.h" + +#include "contrib/base32hex.h" +#include "lib/dnssec/nsec.h" +#include "lib/layer/iterate.h" + +#include <libknot/rrtype/nsec3.h> + +static const knot_db_val_t VAL_EMPTY = { NULL, 0 }; + +/** Common part: write all but the NSEC3 hash. */ +static knot_db_val_t key_NSEC3_common(struct key *k, const knot_dname_t *zname, + const nsec_p_hash_t nsec_p_hash) +{ + int ret; + const bool ok = k && zname + && !(ret = kr_dname_lf(k->buf, zname, false)); + if (!ok) { + assert(false); + return VAL_EMPTY; + } + + /* CACHE_KEY_DEF: key == zone's dname_lf + '\0' + '3' + nsec_p hash (4B) + * + NSEC3 hash (20B == NSEC3_HASH_LEN binary!) + * LATER(optim.) nsec_p hash: perhaps 2B would give a sufficient probability + * of avoiding collisions. + */ + uint8_t *begin = k->buf + 1 + k->zlf_len; /* one byte after zone's zero */ + begin[0] = 0; + begin[1] = '3'; /* tag for NSEC3 */ + k->type = KNOT_RRTYPE_NSEC3; + memcpy(begin + 2, &nsec_p_hash, sizeof(nsec_p_hash)); + return (knot_db_val_t){ + .data = k->buf + 1, + .len = begin + 2 + sizeof(nsec_p_hash) - (k->buf + 1), + }; +} + +knot_db_val_t key_NSEC3(struct key *k, const knot_dname_t *nsec3_name, + const nsec_p_hash_t nsec_p_hash) +{ + knot_db_val_t val = key_NSEC3_common(k, nsec3_name /*only zname required*/, + nsec_p_hash); + if (!val.data) return val; + int len = base32hex_decode(nsec3_name + 1, nsec3_name[0], + knot_db_val_bound(val), KR_CACHE_KEY_MAXLEN - val.len); + if (len != NSEC3_HASH_LEN) { + return VAL_EMPTY; + } + val.len += len; + return val; +} + +/** Construct a string key for for NSEC3 predecessor-search, from an non-NSEC3 name. + * \note k->zlf_len and k->zname are assumed to have been correctly set */ +static knot_db_val_t key_NSEC3_name(struct key *k, const knot_dname_t *name, + const bool add_wildcard, const struct nsec_p *nsec_p) +{ + bool ok = k && name && nsec_p && nsec_p->raw; + if (!ok) return VAL_EMPTY; + knot_db_val_t val = key_NSEC3_common(k, k->zname, nsec_p->hash); + if (!val.data) return val; + + /* Make `name` point to correctly wildcarded owner name. */ + uint8_t buf[KNOT_DNAME_MAXLEN]; + int name_len; + if (add_wildcard) { + buf[0] = '\1'; + buf[1] = '*'; + name_len = knot_dname_to_wire(buf + 2, name, sizeof(buf) - 2); + if (name_len < 0) return VAL_EMPTY; /* wants wildcard but doesn't fit */ + name = buf; + name_len += 2; + } else { + name_len = knot_dname_size(name); + } + /* Append the NSEC3 hash. */ + const dnssec_binary_t dname = { + .size = name_len, + .data = (uint8_t *)/*const-cast*/name, + }; + + #if 0 // LATER(optim.): this requires a patched libdnssec - tries to realloc() + dnssec_binary_t hash = { + .size = KR_CACHE_KEY_MAXLEN - val.len, + .data = val.data + val.len, + }; + int ret = dnssec_nsec3_hash(&dname, &nsec_p->libknot, &hash); + if (ret != DNSSEC_EOK) return VAL_EMPTY; + assert(hash.size == NSEC3_HASH_LEN); + + #else + dnssec_binary_t hash = { .size = 0, .data = NULL }; + int ret = dnssec_nsec3_hash(&dname, &nsec_p->libknot, &hash); + if (ret != DNSSEC_EOK) return VAL_EMPTY; + if (hash.size != NSEC3_HASH_LEN || !hash.data) { + assert(false); + return VAL_EMPTY; + } + memcpy(knot_db_val_bound(val), hash.data, NSEC3_HASH_LEN); + free(hash.data); + #endif + + val.len += hash.size; + return val; +} + +/** Return h1 < h2, semantically on NSEC3 hashes. */ +static inline bool nsec3_hash_ordered(const uint8_t *h1, const uint8_t *h2) +{ + return memcmp(h1, h2, NSEC3_HASH_LEN) < 0; +} + +/** NSEC3 range search. + * + * \param key Pass output of key_NSEC3(k, ...) + * \param nsec_p Restrict to this NSEC3 parameter-set. + * \param value[out] The raw data of the NSEC3 cache record (optional; consistency checked). + * \param exact_match[out] Whether the key was matched exactly or just covered (optional). + * \param hash_low[out] Output the low end hash of covering NSEC3, pointing within DB (optional). + * \param new_ttl[out] New TTL of the NSEC3 (optional). + * \return Error message or NULL. + * \note The function itself does *no* bitmap checks, e.g. RFC 6840 sec. 4. + */ +static const char * find_leq_NSEC3(struct kr_cache *cache, const struct kr_query *qry, + const knot_db_val_t key, const struct key *k, const struct nsec_p *nsec_p, + knot_db_val_t *value, bool *exact_match, const uint8_t **hash_low, + uint32_t *new_ttl) +{ + /* Do the cache operation. */ + const size_t hash_off = key_nsec3_hash_off(k); + if (!key.data || key.len < hash_off) { + assert(false); + return "range search ERROR"; + } + knot_db_val_t key_found = key; + knot_db_val_t val = { NULL, 0 }; + int ret = cache_op(cache, read_leq, &key_found, &val); + /* ^^ LATER(optim.): incrementing key and doing less-than search + * would probably be slightly more efficient with LMDB, + * but the code complexity would grow considerably. */ + if (ret < 0) { + if (ret == kr_error(ENOENT)) { + return "range search miss"; + } else { + assert(false); + return "range search ERROR"; + } + } + if (value) { + *value = val; + } + /* Check consistency, TTL, rank. */ + const bool is_exact = (ret == 0); + if (exact_match) { + *exact_match = is_exact; + } + const struct entry_h *eh = entry_h_consistent_NSEC(val); + if (!eh) { + /* This might be just finding something else than NSEC3 entry, + * in case we searched before the very first one in the zone. */ + return "range search found inconsistent entry"; + } + /* Passing just zone name instead of owner. */ + int32_t new_ttl_ = get_new_ttl(eh, qry, k->zname, KNOT_RRTYPE_NSEC3, + qry->timestamp.tv_sec); + if (new_ttl_ < 0 || !kr_rank_test(eh->rank, KR_RANK_SECURE)) { + return "range search found stale or insecure entry"; + /* TODO: remove the stale record *and* retry, + * in case we haven't run off. Perhaps start by in_zone check. */ + } + if (new_ttl) { + *new_ttl = new_ttl_; + } + if (hash_low) { + *hash_low = (uint8_t *)key_found.data + hash_off; + } + if (is_exact) { + /* Nothing else to do. */ + return NULL; + } + /* The NSEC3 starts strictly before our target name; + * now check that it still belongs into that zone and chain. */ + const uint8_t *nsec_p_raw = eh->data + KR_CACHE_RR_COUNT_SIZE + + 2 /* RDLENGTH from rfc1034 */; + const int nsec_p_len = nsec_p_rdlen(nsec_p_raw); + const bool same_chain = key_found.len == hash_off + NSEC3_HASH_LEN + /* CACHE_KEY_DEF */ + && memcmp(key.data, key_found.data, hash_off) == 0 + /* exact comparison of NSEC3 parameters */ + && nsec_p_len == nsec_p_rdlen(nsec_p->raw) + && memcmp(nsec_p_raw, nsec_p->raw, nsec_p_len) == 0; + if (!same_chain) { + return "range search miss (!same_chain)"; + } + /* We know it starts before sname, so let's check the other end. + * A. find the next hash and check its length. */ + if (KR_CACHE_RR_COUNT_SIZE != 2 || get_uint16(eh->data) == 0) { + assert(false); + return "ERROR"; + /* TODO: more checks? Also, `next` computation is kinda messy. */ + } + const uint8_t *hash_next = nsec_p_raw + nsec_p_len + + sizeof(uint8_t) /* hash length from rfc5155 */; + if (hash_next[-1] != NSEC3_HASH_LEN) { + return "unexpected next hash length"; + } + /* B. do the actual range check. */ + const uint8_t * const hash_searched = (uint8_t *)key.data + hash_off; + bool covers = /* we know for sure that the low end is before the searched name */ + nsec3_hash_ordered(hash_searched, hash_next) + /* and the wrap-around case */ + || nsec3_hash_ordered(hash_next, (const uint8_t *)key_found.data + hash_off); + if (!covers) { + return "range search miss (!covers)"; + } + return NULL; +} + +/** Extract textual representation of NSEC3 hash from a cache key. + * \param text must have length at least NSEC3_HASH_TXT_LEN+1 (will get 0-terminated). */ +static void key_NSEC3_hash2text(const knot_db_val_t key, char *text) +{ + assert(key.data && key.len > NSEC3_HASH_LEN); + const uint8_t *hash_raw = knot_db_val_bound(key) - NSEC3_HASH_LEN; + /* CACHE_KEY_DEF ^^ */ + int len = base32hex_encode(hash_raw, NSEC3_HASH_LEN, (uint8_t *)text, + NSEC3_HASH_TXT_LEN); + assert(len == NSEC3_HASH_TXT_LEN); (void)len; + text[NSEC3_HASH_TXT_LEN] = '\0'; +} + +/** Reconstruct a name into a buffer (assuming length at least KNOT_DNAME_MAXLEN). + * \return kr_ok() or error code (<0). */ +static int dname_wire_reconstruct(knot_dname_t *buf, const knot_dname_t *zname, + const uint8_t *hash_raw) +{ + int len = base32hex_encode(hash_raw, NSEC3_HASH_LEN, buf + 1, NSEC3_HASH_TXT_LEN); + if (len != NSEC3_HASH_TXT_LEN) { + assert(false); + return kr_error(EINVAL); + } + buf[0] = len; + int ret = knot_dname_to_wire(buf + 1 + len, zname, KNOT_DNAME_MAXLEN - 1 - len); + return ret < 0 ? kr_error(ret) : kr_ok(); +} + +static void nsec3_hash2text(const knot_dname_t *owner, char *text) +{ + assert(owner[0] == NSEC3_HASH_TXT_LEN); + memcpy(text, owner + 1, MIN(owner[0], NSEC3_HASH_TXT_LEN)); + text[NSEC3_HASH_TXT_LEN] = '\0'; +} + +int nsec3_encloser(struct key *k, struct answer *ans, + const int sname_labels, int *clencl_labels, + const struct kr_query *qry, struct kr_cache *cache) +{ + static const int ESKIP = ABS(ENOENT); + /* Basic sanity check. */ + const bool ok = k && k->zname && ans && clencl_labels + && qry && cache; + if (!ok) { + assert(!EINVAL); + return kr_error(EINVAL); + } + + /*** Find the closest encloser - cycle: name starting at sname, + * proceeding while longer than zname, shortening by one label on step. + * We need a pair where a name doesn't exist *and* its parent does. */ + /* LATER(optim.): perhaps iterate in the other order - that + * should help significantly against deep queries where we have + * a shallow proof in the cache. We can also optimize by using + * only exact search unless we had a match in the previous iteration. */ + const int zname_labels = knot_dname_labels(k->zname, NULL); + int last_nxproven_labels = -1; + const knot_dname_t *name = qry->sname; + for (int name_labels = sname_labels; name_labels >= zname_labels; + --name_labels, name += 1 + name[0]) { + /* Find a previous-or-equal NSEC3 in cache covering the name, + * checking TTL etc. */ + const knot_db_val_t key = key_NSEC3_name(k, name, false, &ans->nsec_p); + if (!key.data) continue; + WITH_VERBOSE(qry) { + char hash_txt[NSEC3_HASH_TXT_LEN + 1]; + key_NSEC3_hash2text(key, hash_txt); + VERBOSE_MSG(qry, "=> NSEC3 depth %d: hash %s\n", + name_labels - zname_labels, hash_txt); + } + knot_db_val_t val = { NULL, 0 }; + bool exact_match; + uint32_t new_ttl; + const uint8_t *hash_low; + const char *err = find_leq_NSEC3(cache, qry, key, k, &ans->nsec_p, &val, + &exact_match, &hash_low, &new_ttl); + if (err) { + WITH_VERBOSE(qry) { + auto_free char *name_str = kr_dname_text(name); + VERBOSE_MSG(qry, "=> NSEC3 encloser error for %s: %s\n", + name_str, err); + } + continue; + } + if (exact_match && name_labels != sname_labels + && name_labels + 1 != last_nxproven_labels) { + /* This name exists (checked rank and TTL), and it's + * neither of the two interesting cases, so we do not + * keep searching for non-existence above this name. */ + VERBOSE_MSG(qry, + "=> NSEC3 encloser: only found existence of an ancestor\n"); + return ESKIP; + } + /* Optimization: avoid the rest of the last iteration if pointless. */ + if (!exact_match && name_labels == zname_labels + && last_nxproven_labels != name_labels + 1) { + break; + } + + /* Basic checks OK -> materialize data, cleaning any previous + * records on that answer index (unsuccessful attempts). */ + knot_dname_t owner[KNOT_DNAME_MAXLEN]; + { + int ret = dname_wire_reconstruct(owner, k->zname, hash_low); + if (unlikely(ret)) continue; + } + const int ans_id = (exact_match && name_labels + 1 == last_nxproven_labels) + ? AR_CPE : AR_NSEC; + { + const struct entry_h *nsec_eh = val.data; + memset(&ans->rrsets[ans_id], 0, sizeof(ans->rrsets[ans_id])); + int ret = entry2answer(ans, ans_id, nsec_eh, knot_db_val_bound(val), + owner, KNOT_RRTYPE_NSEC3, new_ttl); + if (ret) return kr_error(ret); + } + + if (!exact_match) { + /* Non-existence proven, but we don't know if `name` + * is the next closer name. + * Note: we don't need to check for the sname being + * delegated away by this record, as with NSEC3 only + * *exact* match on an ancestor could do that. */ + last_nxproven_labels = name_labels; + WITH_VERBOSE(qry) { + char hash_low_txt[NSEC3_HASH_TXT_LEN + 1]; + nsec3_hash2text(owner, hash_low_txt); + VERBOSE_MSG(qry, + "=> NSEC3 depth %d: covered by %s -> TODO, new TTL %d\n", + name_labels - zname_labels, hash_low_txt, new_ttl); + } + continue; + } + + /* Exactly matched NSEC3: two cases, one after another. */ + const knot_rrset_t *nsec_rr = ans->rrsets[ans_id].set.rr; + const uint8_t *bm = knot_nsec3_bitmap(nsec_rr->rrs.rdata); + uint16_t bm_size = knot_nsec3_bitmap_len(nsec_rr->rrs.rdata); + assert(bm); + if (name_labels == sname_labels) { + if (kr_nsec_bitmap_nodata_check(bm, bm_size, qry->stype, + nsec_rr->owner) != 0) { + VERBOSE_MSG(qry, + "=> NSEC3 sname: match but failed type check\n"); + return ESKIP; + } + /* NODATA proven; just need to add SOA+RRSIG later */ + VERBOSE_MSG(qry, + "=> NSEC3 sname: match proved NODATA, new TTL %d\n", + new_ttl); + ans->rcode = PKT_NODATA; + return kr_ok(); + + } /* else */ + + assert(name_labels + 1 == last_nxproven_labels); + if (kr_nsec_children_in_zone_check(bm, bm_size) != 0) { + VERBOSE_MSG(qry, + "=> NSEC3 encloser: found but delegated (or error)\n"); + return ESKIP; + } + /* NXDOMAIN proven *except* for wildcards. */ + WITH_VERBOSE(qry) { + auto_free char *name_str = kr_dname_text(name); + VERBOSE_MSG(qry, + "=> NSEC3 encloser: confirmed as %s, new TTL %d\n", + name_str, new_ttl); + } + *clencl_labels = name_labels; + ans->rcode = PKT_NXDOMAIN; + /* Avoid repeated NSEC3 - remove either if the hashes match. + * This is very unlikely in larger zones: 1/size (per attempt). + * Well, deduplication would happen anyway when the answer + * from cache is read by kresd (internally). */ + if (unlikely(0 == memcmp(ans->rrsets[AR_NSEC].set.rr->owner + 1, + ans->rrsets[AR_CPE ].set.rr->owner + 1, + NSEC3_HASH_LEN))) { + memset(&ans->rrsets[AR_CPE], 0, sizeof(ans->rrsets[AR_CPE])); + /* LATER(optim.): perhaps check this earlier and avoid some work? */ + } + return kr_ok(); + } + + /* We've ran out of options. */ + if (last_nxproven_labels > 0) { + /* We didn't manage to prove existence of the closest encloser, + * meaning the only chance left is a *positive* wildcard record. */ + *clencl_labels = last_nxproven_labels - 1; + ans->rcode = PKT_NXDOMAIN; + /* FIXME: review */ + } + return ESKIP; +} + +int nsec3_src_synth(struct key *k, struct answer *ans, const knot_dname_t *clencl_name, + const struct kr_query *qry, struct kr_cache *cache) +{ + /* Find a previous-or-equal NSEC3 in cache covering or matching + * the source of synthesis, checking TTL etc. */ + const knot_db_val_t key = key_NSEC3_name(k, clencl_name, true, &ans->nsec_p); + if (!key.data) return kr_error(1); + WITH_VERBOSE(qry) { + char hash_txt[NSEC3_HASH_TXT_LEN + 1]; + key_NSEC3_hash2text(key, hash_txt); + VERBOSE_MSG(qry, "=> NSEC3 wildcard: hash %s\n", hash_txt); + } + knot_db_val_t val = { NULL, 0 }; + bool exact_match; + uint32_t new_ttl; + const uint8_t *hash_low; + const char *err = find_leq_NSEC3(cache, qry, key, k, &ans->nsec_p, &val, + &exact_match, &hash_low, &new_ttl); + if (err) { + VERBOSE_MSG(qry, "=> NSEC3 wildcard: %s\n", err); + return kr_ok(); + } + + /* LATER(optim.): avoid duplicities in answer. */ + + /* Basic checks OK -> materialize the data (speculatively). */ + knot_dname_t owner[KNOT_DNAME_MAXLEN]; + { + int ret = dname_wire_reconstruct(owner, k->zname, hash_low); + if (unlikely(ret)) return kr_ok(); + const struct entry_h *nsec_eh = val.data; + ret = entry2answer(ans, AR_WILD, nsec_eh, knot_db_val_bound(val), + owner, KNOT_RRTYPE_NSEC3, new_ttl); + if (ret) return kr_error(ret); + } + const knot_rrset_t *nsec_rr = ans->rrsets[AR_WILD].set.rr; + + if (!exact_match) { + /* The record proves wildcard non-existence. */ + WITH_VERBOSE(qry) { + char hash_low_txt[NSEC3_HASH_TXT_LEN + 1]; + nsec3_hash2text(owner, hash_low_txt); + VERBOSE_MSG(qry, + "=> NSEC3 wildcard: covered by %s -> TODO, new TTL %d\n", + hash_low_txt, new_ttl); + } + return AR_SOA; + } + + /* The wildcard exists. Find if it's NODATA - check type bitmap. */ + const uint8_t *bm = knot_nsec3_bitmap(nsec_rr->rrs.rdata); + uint16_t bm_size = knot_nsec3_bitmap_len(nsec_rr->rrs.rdata); + assert(bm); + if (kr_nsec_bitmap_nodata_check(bm, bm_size, qry->stype, nsec_rr->owner) == 0) { + /* NODATA proven; just need to add SOA+RRSIG later */ + VERBOSE_MSG(qry, "=> NSEC3 wildcard: match proved NODATA, new TTL %d\n", + new_ttl); + ans->rcode = PKT_NODATA; + return AR_SOA; + + } /* else */ + /* The data probably exists -> don't add this NSEC3 + * and (later) try to find the real wildcard data */ + VERBOSE_MSG(qry, "=> NSEC3 wildcard: should exist (or error)\n"); + ans->rcode = PKT_NOERROR; + memset(&ans->rrsets[AR_WILD], 0, sizeof(ans->rrsets[AR_WILD])); + return kr_ok(); +} + diff --git a/lib/cache/peek.c b/lib/cache/peek.c new file mode 100644 index 0000000..1af1627 --- /dev/null +++ b/lib/cache/peek.c @@ -0,0 +1,724 @@ +/* Copyright (C) 2018 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +#include "lib/cache/impl.h" + +#include "lib/dnssec/ta.h" +#include "lib/layer/iterate.h" + +/* The whole file only exports peek_nosync(). + * Forwards for larger chunks of code: */ + +static int found_exact_hit(kr_layer_t *ctx, knot_pkt_t *pkt, knot_db_val_t val, + uint8_t lowest_rank); +static int closest_NS(struct kr_cache *cache, struct key *k, entry_list_t el, + struct kr_query *qry, bool only_NS, bool is_DS); +static int answer_simple_hit(kr_layer_t *ctx, knot_pkt_t *pkt, uint16_t type, + const struct entry_h *eh, const void *eh_bound, uint32_t new_ttl); +static int try_wild(struct key *k, struct answer *ans, const knot_dname_t *clencl_name, + uint16_t type, uint8_t lowest_rank, + const struct kr_query *qry, struct kr_cache *cache); + +static int peek_encloser( + struct key *k, struct answer *ans, int sname_labels, + uint8_t lowest_rank, const struct kr_query *qry, struct kr_cache *cache); + + +static int nsec_p_init(struct nsec_p *nsec_p, knot_db_val_t nsec_p_entry, bool with_knot) +{ + const size_t stamp_len = sizeof(uint32_t); + if (nsec_p_entry.len <= stamp_len) { /* plain NSEC if equal */ + nsec_p->raw = NULL; + nsec_p->hash = 0; + return kr_ok(); + } + nsec_p->raw = (uint8_t *)nsec_p_entry.data + stamp_len; + nsec_p->hash = nsec_p_mkHash(nsec_p->raw); + if (!with_knot) return kr_ok(); + /* Convert NSEC3 params to another format. */ + const dnssec_binary_t rdata = { + .size = nsec_p_rdlen(nsec_p->raw), + .data = (uint8_t *)/*const-cast*/nsec_p->raw, + }; + int ret = dnssec_nsec3_params_from_rdata(&nsec_p->libknot, &rdata); + return ret == DNSSEC_EOK ? kr_ok() : kr_error(ret); +} + +static void nsec_p_cleanup(struct nsec_p *nsec_p) +{ + dnssec_binary_free(&nsec_p->libknot.salt); + /* We don't really need to clear it, but it's not large. (`salt` zeroed above) */ + memset(nsec_p, 0, sizeof(*nsec_p)); +} + +/** Compute new TTL for nsec_p entry, using SOA serial arith. + * \param new_ttl (optionally) write the new TTL (even if negative) + * \return error code, e.g. kr_error(ESTALE) */ +static int nsec_p_ttl(knot_db_val_t entry, const uint32_t timestamp, int32_t *new_ttl) +{ + if (!entry.data) { + assert(!EINVAL); + return kr_error(EINVAL); + } + uint32_t stamp; + if (!entry.len) { + return kr_error(ENOENT); + } + if (entry.len < sizeof(stamp)) { + assert(!EILSEQ); + return kr_error(EILSEQ); + } + memcpy(&stamp, entry.data, sizeof(stamp)); + int32_t newttl = stamp - timestamp; + if (new_ttl) *new_ttl = newttl; + return newttl < 0 ? kr_error(ESTALE) : kr_ok(); +} + +static uint8_t get_lowest_rank(const struct kr_request *req, const struct kr_query *qry) +{ + /* TODO: move rank handling into the iterator (DNSSEC_* flags)? */ + const bool allow_unverified = + knot_wire_get_cd(req->qsource.packet->wire) || qry->flags.STUB; + /* in stub mode we don't trust RRs anyway ^^ */ + if (qry->flags.NONAUTH) { + return KR_RANK_INITIAL; + /* Note: there's little sense in validation status for non-auth records. + * In case of using NONAUTH to get NS IPs, knowing that you ask correct + * IP doesn't matter much for security; it matters whether you can + * validate the answers from the NS. + */ + } else if (!allow_unverified) { + /* Records not present under any TA don't have their security + * verified at all, so we also accept low ranks in that case. */ + const bool ta_covers = kr_ta_covers_qry(req->ctx, qry->sname, qry->stype); + /* ^ TODO: performance? TODO: stype - call sites */ + if (ta_covers) { + return KR_RANK_INSECURE | KR_RANK_AUTH; + } /* else falltrhough */ + } + return KR_RANK_INITIAL | KR_RANK_AUTH; +} + + +/** Almost whole .produce phase for the cache module. + * \note we don't transition to KR_STATE_FAIL even in case of "unexpected errors". + */ +int peek_nosync(kr_layer_t *ctx, knot_pkt_t *pkt) +{ + struct kr_request *req = ctx->req; + struct kr_query *qry = req->current_query; + struct kr_cache *cache = &req->ctx->cache; + + struct key k_storage, *k = &k_storage; + int ret = kr_dname_lf(k->buf, qry->sname, false); + if (unlikely(ret)) { + assert(false); + return ctx->state; + } + + const uint8_t lowest_rank = get_lowest_rank(req, qry); + + /**** 1. find the name or the closest (available) zone, not considering wildcards + **** 1a. exact name+type match (can be negative answer in insecure zones) */ + { + knot_db_val_t key = key_exact_type_maypkt(k, qry->stype); + knot_db_val_t val = { NULL, 0 }; + ret = cache_op(cache, read, &key, &val, 1); + if (!ret) { + /* found an entry: test conditions, materialize into pkt, etc. */ + ret = found_exact_hit(ctx, pkt, val, lowest_rank); + } + } + if (ret && ret != -abs(ENOENT)) { + VERBOSE_MSG(qry, "=> exact hit error: %d %s\n", ret, kr_strerror(ret)); + assert(false); + return ctx->state; + } else if (!ret) { + return KR_STATE_DONE; + } + + /**** 1b. otherwise, find the longest prefix zone/xNAME (with OK time+rank). [...] */ + k->zname = qry->sname; + ret = kr_dname_lf(k->buf, k->zname, false); /* LATER(optim.): probably remove */ + if (unlikely(ret)) { + assert(false); + return ctx->state; + } + entry_list_t el; + ret = closest_NS(cache, k, el, qry, false, qry->stype == KNOT_RRTYPE_DS); + if (ret) { + assert(ret == kr_error(ENOENT)); + if (ret != kr_error(ENOENT) || !el[0].len) { + return ctx->state; + } + } + switch (k->type) { + case KNOT_RRTYPE_CNAME: { + const knot_db_val_t v = el[EL_CNAME]; + assert(v.data && v.len); + const int32_t new_ttl = get_new_ttl(v.data, qry, qry->sname, + KNOT_RRTYPE_CNAME, qry->timestamp.tv_sec); + ret = answer_simple_hit(ctx, pkt, KNOT_RRTYPE_CNAME, v.data, + knot_db_val_bound(v), new_ttl); + /* TODO: ^^ cumbersome code; we also recompute the TTL */ + return ret == kr_ok() ? KR_STATE_DONE : ctx->state; + } + case KNOT_RRTYPE_DNAME: + VERBOSE_MSG(qry, "=> DNAME not supported yet\n"); // LATER + return ctx->state; + } + + /* We have to try proving from NSEC*. */ + auto_free char *log_zname = NULL; + WITH_VERBOSE(qry) { + log_zname = kr_dname_text(k->zname); + if (!el[0].len) { + VERBOSE_MSG(qry, "=> no NSEC* cached for zone: %s\n", log_zname); + } + } + +#if 0 + if (!eh) { /* fall back to root hints? */ + ret = kr_zonecut_set_sbelt(req->ctx, &qry->zone_cut); + if (ret) return ctx->state; + assert(!qry->zone_cut.parent); + + //VERBOSE_MSG(qry, "=> using root hints\n"); + //qry->flags.AWAIT_CUT = false; + return ctx->state; + } + + /* Now `eh` points to the closest NS record that we've found, + * and that's the only place to start - we may either find + * a negative proof or we may query upstream from that point. */ + kr_zonecut_set(&qry->zone_cut, k->zname); + ret = kr_make_query(qry, pkt); // TODO: probably not yet - qname minimization + if (ret) return ctx->state; +#endif + + /** Structure for collecting multiple NSEC* + RRSIG records, + * in preparation for the answer, and for tracking the progress. */ + struct answer ans; + memset(&ans, 0, sizeof(ans)); + ans.mm = &pkt->mm; + const int sname_labels = knot_dname_labels(qry->sname, NULL); + + /* Try the NSEC* parameters in order, until success. + * Let's not mix different parameters for NSEC* RRs in a single proof. */ + for (int i = 0; ;) { + int32_t log_new_ttl = -123456789; /* visually recognizable value */ + ret = nsec_p_ttl(el[i], qry->timestamp.tv_sec, &log_new_ttl); + if (!ret || VERBOSE_STATUS) { + nsec_p_init(&ans.nsec_p, el[i], !ret); + } + if (ret) { + VERBOSE_MSG(qry, "=> skipping zone: %s, %s, hash %x;" + "new TTL %d, ret %d\n", + log_zname, (ans.nsec_p.raw ? "NSEC3" : "NSEC"), + (unsigned)ans.nsec_p.hash, (int)log_new_ttl, ret); + /* no need for nsec_p_cleanup() in this case */ + goto cont; + } + VERBOSE_MSG(qry, "=> trying zone: %s, %s, hash %x\n", + log_zname, (ans.nsec_p.raw ? "NSEC3" : "NSEC"), + (unsigned)ans.nsec_p.hash); + /**** 2. and 3. inside */ + ret = peek_encloser(k, &ans, sname_labels, + lowest_rank, qry, cache); + nsec_p_cleanup(&ans.nsec_p); + if (!ret) break; + if (ret < 0) return ctx->state; + cont: + /* Otherwise we try another nsec_p, if available. */ + if (++i == ENTRY_APEX_NSECS_CNT) return ctx->state; + /* clear possible partial answers in `ans` (no need to deallocate) */ + ans.rcode = 0; + memset(&ans.rrsets, 0, sizeof(ans.rrsets)); + } + + /**** 4. add SOA iff needed */ + if (ans.rcode != PKT_NOERROR) { + /* Assuming k->buf still starts with zone's prefix, + * look up the SOA in cache. */ + k->buf[0] = k->zlf_len; + knot_db_val_t key = key_exact_type(k, KNOT_RRTYPE_SOA); + knot_db_val_t val = { NULL, 0 }; + ret = cache_op(cache, read, &key, &val, 1); + const struct entry_h *eh; + if (ret || !(eh = entry_h_consistent(val, KNOT_RRTYPE_SOA))) { + assert(ret); /* only want to catch `eh` failures */ + VERBOSE_MSG(qry, "=> SOA missed\n"); + return ctx->state; + } + /* Check if the record is OK. */ + int32_t new_ttl = get_new_ttl(eh, qry, k->zname, KNOT_RRTYPE_SOA, + qry->timestamp.tv_sec); + if (new_ttl < 0 || eh->rank < lowest_rank || eh->is_packet) { + VERBOSE_MSG(qry, "=> SOA unfit %s: rank 0%.2o, new TTL %d\n", + (eh->is_packet ? "packet" : "RR"), + eh->rank, new_ttl); + return ctx->state; + } + /* Add the SOA into the answer. */ + ret = entry2answer(&ans, AR_SOA, eh, knot_db_val_bound(val), + k->zname, KNOT_RRTYPE_SOA, new_ttl); + if (ret) return ctx->state; + } + + /* Find our target RCODE. */ + int real_rcode; + switch (ans.rcode) { + case PKT_NODATA: + case PKT_NOERROR: /* positive wildcarded response */ + real_rcode = KNOT_RCODE_NOERROR; + break; + case PKT_NXDOMAIN: + real_rcode = KNOT_RCODE_NXDOMAIN; + break; + default: + assert(false); + case 0: /* i.e. nothing was found */ + /* LATER(optim.): zone cut? */ + VERBOSE_MSG(qry, "=> cache miss\n"); + return ctx->state; + } + + if (pkt_renew(pkt, qry->sname, qry->stype) + || knot_pkt_begin(pkt, KNOT_ANSWER) + ) { + assert(false); + return ctx->state; + } + knot_wire_set_rcode(pkt->wire, real_rcode); + + bool expiring = false; // TODO + VERBOSE_MSG(qry, "=> writing RRsets: "); + for (int i = 0; i < sizeof(ans.rrsets) / sizeof(ans.rrsets[0]); ++i) { + if (i == 1) knot_pkt_begin(pkt, KNOT_AUTHORITY); + if (!ans.rrsets[i].set.rr) continue; + expiring = expiring || ans.rrsets[i].set.expiring; + ret = pkt_append(pkt, &ans.rrsets[i], ans.rrsets[i].set.rank); + if (ret) { + assert(false); + return ctx->state; + } + kr_log_verbose(kr_rank_test(ans.rrsets[i].set.rank, KR_RANK_SECURE) + ? "+" : "-"); + } + kr_log_verbose("\n"); + + /* Finishing touches. */ + struct kr_qflags * const qf = &qry->flags; + qf->EXPIRING = expiring; + qf->CACHED = true; + qf->NO_MINIMIZE = true; + + return KR_STATE_DONE; +} + +/** + * This is where the high-level "business logic" of aggressive cache is. + * \return 0: success (may need SOA); >0: try other nsec_p; <0: exit cache immediately. + */ +static int peek_encloser( + struct key *k, struct answer *ans, const int sname_labels, + uint8_t lowest_rank, const struct kr_query *qry, struct kr_cache *cache) +{ + /** Start of NSEC* covering the sname; + * it's part of key - the one within zone (read only) */ + knot_db_val_t cover_low_kwz = { NULL, 0 }; + knot_dname_t cover_hi_storage[KNOT_DNAME_MAXLEN]; + /** End of NSEC* covering the sname. */ + knot_db_val_t cover_hi_kwz = { + .data = cover_hi_storage, + .len = sizeof(cover_hi_storage), + }; + + /**** 2. Find a closest (provable) encloser (of sname). */ + int clencl_labels = -1; + bool clencl_is_tentative = false; + if (!ans->nsec_p.raw) { /* NSEC */ + int ret = nsec1_encloser(k, ans, sname_labels, &clencl_labels, + &cover_low_kwz, &cover_hi_kwz, qry, cache); + if (ret) return ret; + } else { + int ret = nsec3_encloser(k, ans, sname_labels, &clencl_labels, + qry, cache); + clencl_is_tentative = ret == ABS(ENOENT) && clencl_labels >= 0; + /* ^^ Last chance: *positive* wildcard record under this clencl. */ + if (ret && !clencl_is_tentative) return ret; + } + + /* We should have either a match or a cover at this point. */ + if (ans->rcode != PKT_NODATA && ans->rcode != PKT_NXDOMAIN) { + assert(false); + return kr_error(EINVAL); + } + const bool ncloser_covered = ans->rcode == PKT_NXDOMAIN; + + /** Name of the closest (provable) encloser. */ + const knot_dname_t *clencl_name = qry->sname; + for (int l = sname_labels; l > clencl_labels; --l) + clencl_name = knot_wire_next_label(clencl_name, NULL); + + /**** 3. source of synthesis checks, in case the next closer name was covered. + **** 3a. We want to query for NSEC* of source of synthesis (SS) or its + * predecessor, providing us with a proof of its existence or non-existence. */ + if (ncloser_covered && !ans->nsec_p.raw) { + int ret = nsec1_src_synth(k, ans, clencl_name, + cover_low_kwz, cover_hi_kwz, qry, cache); + if (ret == AR_SOA) return 0; + assert(ret <= 0); + if (ret) return ret; + + } else if (ncloser_covered && ans->nsec_p.raw && !clencl_is_tentative) { + int ret = nsec3_src_synth(k, ans, clencl_name, qry, cache); + if (ret == AR_SOA) return 0; + assert(ret <= 0); + if (ret) return ret; + + } /* else (!ncloser_covered) so no wildcard checks needed, + * as we proved that sname exists. */ + + /**** 3b. find wildcarded answer, if next closer name was covered + * and we don't have a full proof yet. (common for NSEC*) */ + if (!ncloser_covered) + return kr_ok(); /* decrease indentation */ + /* Construct key for exact qry->stype + source of synthesis. */ + int ret = kr_dname_lf(k->buf, clencl_name, true); + if (ret) { + assert(!ret); + return kr_error(ret); + } + const uint16_t types[] = { qry->stype, KNOT_RRTYPE_CNAME }; + for (int i = 0; i < (2 - (qry->stype == KNOT_RRTYPE_CNAME)); ++i) { + ret = try_wild(k, ans, clencl_name, types[i], + lowest_rank, qry, cache); + if (ret == kr_ok()) { + return kr_ok(); + } else if (ret != -ABS(ENOENT) && ret != -ABS(ESTALE)) { + assert(false); + return kr_error(ret); + } + /* else continue */ + } + /* Neither attempt succeeded, but the NSEC* proofs were found, + * so skip trying other parameters, as it seems very unlikely + * to turn out differently than by the same wildcard search. */ + return -ABS(ENOENT); +} + + +static int answer_simple_hit(kr_layer_t *ctx, knot_pkt_t *pkt, uint16_t type, + const struct entry_h *eh, const void *eh_bound, uint32_t new_ttl) +#define CHECK_RET(ret) do { \ + if ((ret) < 0) { assert(false); return kr_error((ret)); } \ +} while (false) +{ + struct kr_request *req = ctx->req; + struct kr_query *qry = req->current_query; + + /* All OK, so start constructing the (pseudo-)packet. */ + int ret = pkt_renew(pkt, qry->sname, qry->stype); + CHECK_RET(ret); + + /* Materialize the sets for the answer in (pseudo-)packet. */ + struct answer ans; + memset(&ans, 0, sizeof(ans)); + ans.mm = &pkt->mm; + ret = entry2answer(&ans, AR_ANSWER, eh, eh_bound, + qry->sname, type, new_ttl); + CHECK_RET(ret); + /* Put links to the materialized data into the pkt. */ + ret = pkt_append(pkt, &ans.rrsets[AR_ANSWER], eh->rank); + CHECK_RET(ret); + + /* Finishing touches. */ + struct kr_qflags * const qf = &qry->flags; + qf->EXPIRING = is_expiring(eh->ttl, new_ttl); + qf->CACHED = true; + qf->NO_MINIMIZE = true; + qf->DNSSEC_INSECURE = kr_rank_test(eh->rank, KR_RANK_INSECURE); + if (qf->DNSSEC_INSECURE) { + qf->DNSSEC_WANT = false; + } + VERBOSE_MSG(qry, "=> satisfied by exact %s: rank 0%.2o, new TTL %d\n", + (type == KNOT_RRTYPE_CNAME ? "CNAME" : "RRset"), + eh->rank, new_ttl); + return kr_ok(); +} +#undef CHECK_RET + + +/** TODO: description; see the single call site for now. */ +static int found_exact_hit(kr_layer_t *ctx, knot_pkt_t *pkt, knot_db_val_t val, + uint8_t lowest_rank) +{ + struct kr_request *req = ctx->req; + struct kr_query *qry = req->current_query; + + int ret = entry_h_seek(&val, qry->stype); + if (ret) return ret; + const struct entry_h *eh = entry_h_consistent(val, qry->stype); + if (!eh) { + assert(false); + return kr_error(ENOENT); + // LATER: recovery in case of error, perhaps via removing the entry? + // LATER(optim): pehaps optimize the zone cut search + } + + int32_t new_ttl = get_new_ttl(eh, qry, qry->sname, qry->stype, + qry->timestamp.tv_sec); + if (new_ttl < 0 || eh->rank < lowest_rank) { + /* Positive record with stale TTL or bad rank. + * LATER(optim.): It's unlikely that we find a negative one, + * so we might theoretically skip all the cache code. */ + + VERBOSE_MSG(qry, "=> skipping exact %s: rank 0%.2o (min. 0%.2o), new TTL %d\n", + eh->is_packet ? "packet" : "RR", eh->rank, lowest_rank, new_ttl); + return kr_error(ENOENT); + } + + const uint8_t *eh_bound = knot_db_val_bound(val); + if (eh->is_packet) { + /* Note: we answer here immediately, even if it's (theoretically) + * possible that we could generate a higher-security negative proof. + * Rank is high-enough so we take it to save time searching. */ + return answer_from_pkt (ctx, pkt, qry->stype, eh, eh_bound, new_ttl); + } else { + return answer_simple_hit(ctx, pkt, qry->stype, eh, eh_bound, new_ttl); + } +} + + +/** Try to satisfy via wildcard (positively). See the single call site. */ +static int try_wild(struct key *k, struct answer *ans, const knot_dname_t *clencl_name, + const uint16_t type, const uint8_t lowest_rank, + const struct kr_query *qry, struct kr_cache *cache) +{ + knot_db_val_t key = key_exact_type(k, type); + /* Find the record. */ + knot_db_val_t val = { NULL, 0 }; + int ret = cache_op(cache, read, &key, &val, 1); + if (!ret) { + ret = entry_h_seek(&val, type); + } + if (ret) { + if (ret != -ABS(ENOENT)) { + VERBOSE_MSG(qry, "=> wildcard: hit error %d %s\n", + ret, strerror(abs(ret))); + assert(false); + } + WITH_VERBOSE(qry) { + auto_free char *clencl_str = kr_dname_text(clencl_name), + *type_str = kr_rrtype_text(type); + VERBOSE_MSG(qry, "=> wildcard: not found: *.%s %s\n", + clencl_str, type_str); + } + return ret; + } + /* Check if the record is OK. */ + const struct entry_h *eh = entry_h_consistent(val, type); + if (!eh) { + assert(false); + return kr_error(ret); + // LATER: recovery in case of error, perhaps via removing the entry? + } + int32_t new_ttl = get_new_ttl(eh, qry, qry->sname, type, qry->timestamp.tv_sec); + /* ^^ here we use the *expanded* wildcard name */ + if (new_ttl < 0 || eh->rank < lowest_rank || eh->is_packet) { + /* Wildcard record with stale TTL, bad rank or packet. */ + VERBOSE_MSG(qry, "=> wildcard: skipping %s, rank 0%.2o, new TTL %d\n", + eh->is_packet ? "packet" : "RR", eh->rank, new_ttl); + return -ABS(ESTALE); + } + /* Add the RR into the answer. */ + ret = entry2answer(ans, AR_ANSWER, eh, knot_db_val_bound(val), + qry->sname, type, new_ttl); + VERBOSE_MSG(qry, "=> wildcard: answer expanded, ret = %d, new TTL %d\n", + ret, (int)new_ttl); + if (ret) return kr_error(ret); + ans->rcode = PKT_NOERROR; + return kr_ok(); +} + +int kr_cache_closest_apex(struct kr_cache *cache, const knot_dname_t *name, bool is_DS, + knot_dname_t ** apex) +{ + if (!cache || !cache->db || !name || !apex || *apex) { + assert(!EINVAL); + return kr_error(EINVAL); + } + struct key k_storage, *k = &k_storage; + int ret = kr_dname_lf(k->buf, name, false); + if (ret) + return kr_error(ret); + entry_list_t el_; + k->zname = name; + ret = closest_NS(cache, k, el_, NULL, true, is_DS); + if (ret && ret != -abs(ENOENT)) + return ret; + *apex = knot_dname_copy(k->zname, NULL); + if (!*apex) + return kr_error(ENOMEM); + return kr_ok(); +} + +/** \internal for closest_NS. Check suitability of a single entry, setting k->type if OK. + * \return error code, negative iff whole list should be skipped. + */ +static int check_NS_entry(struct key *k, knot_db_val_t entry, int i, + bool exact_match, bool is_DS, + const struct kr_query *qry, uint32_t timestamp); + +/** + * Find the longest prefix zone/xNAME (with OK time+rank), starting at k->*. + * + * The found type is returned via k->type; the values are returned in el. + * \note we use k->type = KNOT_RRTYPE_NS also for the nsec_p result. + * \param qry can be NULL (-> gettimeofday(), but you lose the stale-serve hook) + * \param only_NS don't consider xNAMEs + * \return error code + */ +static int closest_NS(struct kr_cache *cache, struct key *k, entry_list_t el, + struct kr_query *qry, const bool only_NS, const bool is_DS) +{ + /* get the current timestamp */ + uint32_t timestamp; + if (qry) { + timestamp = qry->timestamp.tv_sec; + } else { + struct timeval tv; + if (gettimeofday(&tv, NULL)) return kr_error(errno); + timestamp = tv.tv_sec; + } + + int zlf_len = k->buf[0]; + + // LATER(optim): if stype is NS, we check the same value again + bool exact_match = true; + bool need_zero = true; + /* Inspect the NS/xNAME entries, shortening by a label on each iteration. */ + do { + k->buf[0] = zlf_len; + knot_db_val_t key = key_exact_type(k, KNOT_RRTYPE_NS); + knot_db_val_t val; + int ret = cache_op(cache, read, &key, &val, 1); + if (ret == -abs(ENOENT)) goto next_label; + if (ret) { + assert(!ret); + if (need_zero) memset(el, 0, sizeof(entry_list_t)); + return kr_error(ret); + } + + /* Check consistency, find any type; + * using `goto` for shortening by another label. */ + ret = entry_list_parse(val, el); + if (ret) { + assert(!ret); // do something about it? + goto next_label; + } + need_zero = false; + /* More types are possible; try in order. + * For non-fatal failures just "continue;" to try the next type. */ + const int el_count = only_NS ? EL_NS + 1 : EL_LENGTH; + for (int i = 0; i < el_count; ++i) { + ret = check_NS_entry(k, el[i], i, exact_match, is_DS, + qry, timestamp); + if (ret < 0) goto next_label; else + if (!ret) { + /* We found our match. */ + k->zlf_len = zlf_len; + return kr_ok(); + } + } + + next_label: + /* remove one more label */ + exact_match = false; + if (k->zname[0] == 0) { + /* We miss root NS in cache, but let's at least assume it exists. */ + k->type = KNOT_RRTYPE_NS; + k->zlf_len = zlf_len; + assert(zlf_len == 0); + if (need_zero) memset(el, 0, sizeof(entry_list_t)); + return kr_error(ENOENT); + } + zlf_len -= (k->zname[0] + 1); + k->zname += (k->zname[0] + 1); + k->buf[zlf_len + 1] = 0; + } while (true); +} + +static int check_NS_entry(struct key *k, const knot_db_val_t entry, const int i, + const bool exact_match, const bool is_DS, + const struct kr_query *qry, uint32_t timestamp) +{ + const int ESKIP = ABS(ENOENT); + if (!entry.len + /* On a zone cut we want DS from the parent zone. */ + || (i <= EL_NS && exact_match && is_DS) + /* CNAME is interesting only if we + * directly hit the name that was asked. + * Note that we want it even in the DS case. */ + || (i == EL_CNAME && !exact_match) + /* DNAME is interesting only if we did NOT + * directly hit the name that was asked. */ + || (i == EL_DNAME && exact_match) + ) { + return ESKIP; + } + + uint16_t type; + if (i < ENTRY_APEX_NSECS_CNT) { + type = KNOT_RRTYPE_NS; + int32_t log_new_ttl = -123456789; /* visually recognizable value */ + const int err = nsec_p_ttl(entry, timestamp, &log_new_ttl); + if (err) { + VERBOSE_MSG(qry, + "=> skipping unfit nsec_p: new TTL %d, error %d\n", + (int)log_new_ttl, err); + return ESKIP; + } + } else { + type = EL2RRTYPE(i); + /* Find the entry for the type, check positivity, TTL */ + const struct entry_h *eh = entry_h_consistent(entry, type); + if (!eh) { + VERBOSE_MSG(qry, "=> EH not consistent\n"); + assert(false); + return kr_error(EILSEQ); + } + const int32_t log_new_ttl = get_new_ttl(eh, qry, k->zname, type, timestamp); + const uint8_t rank_min = KR_RANK_INSECURE | KR_RANK_AUTH; + const bool ok = /* For NS any kr_rank is accepted, + * as insecure or even nonauth is OK */ + (type == KNOT_RRTYPE_NS || eh->rank >= rank_min) + /* Not interested in negative bogus or outdated RRs. */ + && !eh->is_packet && log_new_ttl >= 0; + WITH_VERBOSE(qry) { if (!ok) { + auto_free char *type_str = kr_rrtype_text(type); + const char *packet_str = eh->is_packet ? "packet" : "RR"; + VERBOSE_MSG(qry, + "=> skipping unfit %s %s: rank 0%.2o, new TTL %d\n", + type_str, packet_str, eh->rank, (int)log_new_ttl); + } } + if (!ok) return ESKIP; + } + k->type = type; + return kr_ok(); +} + |